# Visualize sequence probabilities
In this notebook, we will visualize the probabilities of the predictions across datasets.

In [1]:
import pandas as pd
import json
import plotly.express as px
import numpy as np
from collections import Counter

In [45]:
# balmy_violet_577_custom_final
# Maps a short dataset label to the path of the predictions.jsonl file for that
# dataset. All paths are relative to the notebook's working directory.
# NOTE(review): the "_OO" suffix presumably stands for "overlaps only" (compare the
# MONA_GCMS_overlaps_only directory used for MONA_OO); the meaning of "_NO" is not
# visible here -- TODO confirm.

predictions = {"NIST": "../clean_paper/predictions/balmy-violet-577_custom_final/NIST/1730720158_test_full_greedy/predictions.jsonl",
               "MONA_OO": "../clean_paper/predictions/balmy-violet-577_custom_final/MONA_GCMS_overlaps_only/1730907164_all_full_greedy/predictions.jsonl",
               "MONA_NO": "../clean_paper/predictions/balmy-violet-577_custom_final/MONA_GCMS/1730895666_all_full_greedy/predictions.jsonl",
               "Cayman": "../clean_paper/predictions/balmy-violet-577_custom_final/Cayman_library/1730893023_all_full_greedy/predictions.jsonl",
               "SWGDRUG": "../clean_paper/predictions/balmy-violet-577_custom_final/SWGDRUG/1730884288_all_full_greedy/predictions.jsonl",
               "RCX_OO": "../clean_paper/predictions/balmy-violet-577_custom_final/RCX_OO/1731678771_all_full_greedy/predictions.jsonl",
               "RCX_NO": "../clean_paper/predictions/balmy-violet-577_custom_final/RCX_NO/1731678847_all_full_greedy/predictions.jsonl"
}


In [50]:
import math

def extract_probs(predictions_file):
    """Return the highest value found on each line of a JSONL predictions file.

    Every line is parsed as JSON. A line whose parsed content is empty
    (e.g. ``{}``) contributes ``0``; otherwise the maximum of the object's
    values is taken.
    NOTE(review): the values are presumably per-candidate probabilities --
    confirm against the code that writes the file.

    Parameters
    ----------
    predictions_file : str
        Path to the JSONL file, one JSON object per line.

    Returns
    -------
    list
        One entry per line, in file order.
    """
    with open(predictions_file, "r") as handle:
        # The generator keeps parsing lazy, so lines are handled one at a time.
        parsed = (json.loads(raw.strip()) for raw in handle)
        return [max(candidates.values()) if candidates else 0 for candidates in parsed]

def extract_simils(best_preds_file):
    """Read the ``simil_best_simil_morgan_tanimoto`` field from each JSONL line.

    Every line is parsed as JSON. A line whose parsed content is empty
    (e.g. ``{}``) contributes ``0``; otherwise the value stored under
    ``"simil_best_simil_morgan_tanimoto"`` is used. A non-empty object without
    that key raises ``KeyError``.

    Parameters
    ----------
    best_preds_file : str
        Path to the JSONL file, one JSON object per line.

    Returns
    -------
    list
        One entry per line, in file order.
    """
    with open(best_preds_file, "r") as handle:
        # The generator keeps parsing lazy, so lines are handled one at a time.
        parsed = (json.loads(raw.strip()) for raw in handle)
        return [row["simil_best_simil_morgan_tanimoto"] if row else 0 for row in parsed]

def count_probs(prob_simil, num_bins=100):
    """Group probabilities into equal-width bins keyed by similarity.

    Parameters
    ----------
    prob_simil : iterable of (prob, simil) pairs
        ``simil`` is expected to lie in ``[0, 1]``.
    num_bins : int, default 100
        Number of equal-width intervals covering ``[0, 1)``. One extra bin is
        allocated for ``simil == 1``.

    Returns
    -------
    list of list
        ``num_bins + 1`` lists; list ``i`` holds the probabilities of the pairs
        whose similarity falls in ``[i/num_bins, (i+1)/num_bins)``.
    """
    bins = [[] for _ in range(num_bins + 1)]
    for prob, simil in prob_simil:
        # Plain truncation mis-bins values sitting on a bin edge because of
        # binary floating point (0.29 * 100 == 28.999999999999996 -> bin 28).
        # Rounding away that noise first puts 0.29 into bin 29 as intended.
        idx = int(round(simil * num_bins, 9))
        # Clamp so that a slightly out-of-range similarity neither raises
        # IndexError (simil > 1) nor wraps around to the last bins through
        # negative indexing (simil < 0).
        idx = min(max(idx, 0), num_bins)
        bins[idx].append(prob)
    return bins

def visualize_probs(predictions_dict, num_bins=100, pick_first_n=None):
    """Plot mean prediction probability per similarity bin for each dataset.

    For every ``dataset -> predictions.jsonl path`` entry, the companion file
    path is derived by replacing ``"predictions.jsonl"`` with
    ``"df_best_predictions.jsonl"`` in the path. Similarities are read from the
    companion file and probabilities from the predictions file, the two lists
    are paired by position, binned by similarity with ``count_probs``, and a
    plotly box figure of the per-bin mean probability is shown.

    Parameters
    ----------
    predictions_dict : dict[str, str]
        Dataset label -> path of its ``predictions.jsonl`` file.
    num_bins : int, default 100
        Number of similarity bins, forwarded to ``count_probs``.
    pick_first_n : int or None, default None
        If given, only the first ``pick_first_n`` entries of each file are used.

    Returns
    -------
    tuple of three dicts keyed by dataset label
        ``(total_mean_probs, avg_simils, dataset_size)``: the mean probability
        over all paired entries, the mean similarity, and the number of
        similarity entries. For a dataset with no entries both means are NaN,
        the size is 0 and no figure is drawn.
    """
    total_mean_probs = {}
    avg_simils = {}
    dataset_size = {}
    for dataset, prob_file in predictions_dict.items():
        simils = extract_simils(prob_file.replace("predictions.jsonl", "df_best_predictions.jsonl"))
        probs = extract_probs(prob_file)
        simils, probs = simils[:pick_first_n], probs[:pick_first_n]
        # zip pairs entries by position and stops at the shorter list, so a
        # length mismatch between the two files is silently truncated here.
        prob_simil = list(zip(probs, simils))
        all_probs = count_probs(prob_simil, num_bins=num_bins)
        # Bins without entries get None so they appear as gaps in the figure.
        mean_probs = [sum(bin_probs)/len(bin_probs) if bin_probs else None for bin_probs in all_probs]
        num_paired = sum(len(bin_probs) for bin_probs in all_probs)

        if num_paired > 0:
            total_mean_prob = sum(sum(bin_probs) for bin_probs in all_probs)/num_paired
            df = pd.DataFrame({"avg_probs": mean_probs, "simils": [i/num_bins for i in range(num_bins+1)]})
            fig = px.box(df, x="simils", y="avg_probs", title=f"Prediction probabilities {dataset}")
            fig.show()
        else:
            # An empty dataset (or pick_first_n=0) used to raise
            # ZeroDivisionError; report NaN and skip the empty figure instead.
            total_mean_prob = float("nan")

        total_mean_probs[dataset] = total_mean_prob
        avg_simils[dataset] = sum(simils)/len(simils) if simils else float("nan")
        dataset_size[dataset] = len(simils)
    return total_mean_probs, avg_simils, dataset_size

# Run the analysis over every configured dataset (all bins, no truncation).
total_mean_probs, avg_simils, dataset_size = visualize_probs(
    predictions,
    num_bins=100,
    pick_first_n=None,
)

# One summary line per dataset: mean probability, mean similarity, entry count.
for dataset in total_mean_probs:
    mean_prob = total_mean_probs[dataset]
    print(f"Mean probability and men similarity for {dataset}: {mean_prob:.4f}, {avg_simils[dataset]:.4f}, dataset size {dataset_size[dataset]}")

Mean probability and men similarity for NIST: 0.7869, 0.6464, dataset size 28267
Mean probability and men similarity for MONA_OO: 0.7902, 0.6823, dataset size 12758
Mean probability and men similarity for MONA_NO: 0.6209, 0.3958, dataset size 5015
Mean probability and men similarity for Cayman: 0.6483, 0.5733, dataset size 469
Mean probability and men similarity for SWGDRUG: 0.7801, 0.6634, dataset size 1640
Mean probability and men similarity for RCX_OO: 0.6557, 0.4582, dataset size 108
Mean probability and men similarity for RCX_NO: 0.5556, 0.3085, dataset size 111
