# Enrichment Analysis Notebook

Compares the results of SPINDOCTOR gene set summarization vs statistical ontological enrichment.

Draft: https://docs.google.com/document/d/1H103ux6Dd1_bPM0un4RwutBLcYJx-0ybil2AwlAvG_Q/edit#

## Initial setup

Here we take care of imports, defining the data dictionary for the pandas dataframes

In [1]:
!pip --version

pip 23.0.1 from /Users/marcin/Documents/VIMSS/ontology/LLMs/enrichgpt-results/venv/lib/python3.10/site-packages/pip (python 3.10)


In [2]:
!pip list

Package                    Version
-------------------------- ------------
adeft                      0.11.2
aiohttp                    3.8.5
aiosignal                  1.3.1
airium                     0.2.5
altair                     5.1.1
aniso8601                  9.0.1
annotated-types            0.5.0
antlr4-python3-runtime     4.9.3
anyio                      4.0.0
appdirs                    1.4.4
appnope                    0.1.3
argon2-cffi                23.1.0
argon2-cffi-bindings       21.2.0
arrow                      1.2.3
asttokens                  2.4.0
async-lru                  2.0.4
async-timeout              4.0.3
attrs                      23.1.0
Babel                      2.12.1
backcall                   0.2.0
bcp47                      0.0.4
beautifulsoup4             4.12.2
bioc                       2.1
bleach                     6.0.0
blinker                    1.6.2
boto3                      1.28.45
botocore                   1.31.45
cachetools                

In [3]:
#!pip install --force-reinstall --upgrade more-itertools


In [4]:
import yaml
from yaml import Loader
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import kstest, ttest_ind, wilcoxon
import math
from statsmodels.stats.multitest import multipletests

#import itertools as it
#import collections as ct
#import more_itertools as mit

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import colorsys
import seaborn as sns

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF
from ontogpt.evaluation.enrichment.eval_enrichment import EvalEnrichment
go = get_adapter("sqlite:obo:go")
hgnc = get_adapter("sqlite:obo:hgnc")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
TURBO = "gpt-3.5-turbo"
DAVINCI = "text-davinci-003"
GPT4 = "gpt-4"
MODELS = [TURBO, DAVINCI, GPT4]

In [6]:
# data dictionary
MODEL = "model"
METHOD = "method"
HAS_TOP_HIT = "has top term"
IN_TOP_5 = "in top 5"
IN_TOP_10 = "in top 10"
RANK = "rank"
SIZE_OVERLAP = "size overlap"
SIMILARITY = "similarity"
NR_SIZE_OVERLAP = "nr size overlap"
NR_SIMILARITY = "nr similarity"
GENESET = "gene_set"
PROMPT_VARIANT = "prompt_variant"
SOURCE_GENESET = "source geneset"
GENESET_DESCRIPTION = "description"
GENESET_SIZE = "gene_set_size"
TRUNCATION_FACTOR = "truncation factor"
NUM_TERMS = "num terms"
NUM_GO_TERMS = "num GO terms"
UNPARSED = "unparsed"
NUM_UNPARSED = "num unparsed"
TERM_IDS = "term ids"
GO_TERM_IDS = "go term ids"
GO_TERM_P_VALUES = "go term p values"
MAX_P_VALUE = "max p value"
MIN_P_VALUE = "min p value"
MEAN_P_VALUE = "mean p value"
PROPORTION_SIGNIFICANT = "proportion significant"
NOVEL = "unannotated"
NOVEL_LABELS = "unannotated labels"
NUM_NOVEL = "num unannotated"
GENE_RANDOMIZATION_FACTOR = "gene_randomization_factor"
SUMMARY = "summary"
TP_num = "number of true positives across a set of gene GO annotations"
FP_num = "number of false positives across a set of gene GO annotations"
TP = "true positive rate across a set of gene GO annotations"
FP = "false positive rate across a set of gene GO annotations"
PRECISION = "precision"
RECALL = "recall"
RECALL_GENERAL = "recall_general"
RECALL_SPECIFIC = "recall_specific"


In [7]:
from ontogpt.evaluation.enrichment.eval_enrichment import GeneSetComparison

In [8]:
df = pd.read_csv('results/processed.tsv', sep='\t', header=0, index_col=0)
df

Unnamed: 0_level_0,gene_set,cutoff,closure,top_n,source,model,method,method_desc,run,truncation_factor,...,all_predictions_closure,unparsed,true_positive_terms,false_positive_terms,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
glycolysis-gocam-0-0.005,glycolysis-gocam-0,0.005,False,5,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,...,23,2,GO:0006096,GO:0006006,energy production|atp generation,10,0.500000,0.500000,1.000000,0.200000
glycolysis-gocam-0-0.005,glycolysis-gocam-0,0.005,False,10,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,...,23,2,GO:0006096,GO:0006006,energy production|atp generation,10,0.500000,0.500000,0.200000,0.200000
glycolysis-gocam-0-0.005,glycolysis-gocam-0,0.005,False,25,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,...,23,2,GO:0006096|GO:0006006,,energy production|atp generation,10,1.000000,1.000000,0.250000,0.153846
glycolysis-gocam-0-0.005,glycolysis-gocam-0,0.005,False,5000,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,...,23,2,GO:0006096|GO:0006006,,energy production|atp generation,10,1.000000,1.000000,0.250000,0.142857
glycolysis-gocam-0-0.005,glycolysis-gocam-0,0.005,True,5,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,...,23,2,GO:0006096|GO:0019318|GO:0061621|GO:0006094,,energy production|atp generation,10,1.000000,1.000000,0.666667,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HALLMARK_MTORC1_SIGNALING-1-99,HALLMARK_MTORC1_SIGNALING-1,99.000,False,5000,,,,,closure,,...,4852,0,GO:0044283|GO:0044281|GO:0005737|GO:0019752|GO...,,,180,1.000000,1.000000,1.000000,1.000000
HALLMARK_MTORC1_SIGNALING-1-99,HALLMARK_MTORC1_SIGNALING-1,99.000,True,5,,,,,closure,,...,4852,0,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,180,0.000794,0.000794,1.000000,1.000000
HALLMARK_MTORC1_SIGNALING-1-99,HALLMARK_MTORC1_SIGNALING-1,99.000,True,10,,,,,closure,,...,4852,0,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,180,0.001720,0.001720,1.000000,1.000000
HALLMARK_MTORC1_SIGNALING-1-99,HALLMARK_MTORC1_SIGNALING-1,99.000,True,25,,,,,closure,,...,4852,0,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,180,0.004761,0.004761,1.000000,1.000000


In [9]:
print(MODEL)
df[[MODEL, METHOD]].drop_duplicates()

model


Unnamed: 0_level_0,model,method
name,Unnamed: 1_level_1,Unnamed: 2_level_1
glycolysis-gocam-0-0.005,4.0,gpt
glycolysis-gocam-0-0.005,3.5,gpt
glycolysis-gocam-0-0.005,3.0,gpt
glycolysis-gocam-0-0.005,,standard
glycolysis-gocam-0-0.005,,standard_no_ontology
glycolysis-gocam-0-0.005,,random
glycolysis-gocam-0-0.005,,rank_based
glycolysis-gocam-0-0.005,,


In [10]:
df[[GENESET]].drop_duplicates()

Unnamed: 0_level_0,gene_set
name,Unnamed: 1_level_1
glycolysis-gocam-0-0.005,glycolysis-gocam-0
glycolysis-gocam-1-0.005,glycolysis-gocam-1
bicluster_RNAseqDB_1001-0-0.005,bicluster_RNAseqDB_1001-0
bicluster_RNAseqDB_1001-1-0.005,bicluster_RNAseqDB_1001-1
HALLMARK_HYPOXIA-0-0.005,HALLMARK_HYPOXIA-0
...,...
T cell proliferation-1-0.005,T cell proliferation-1
mtorc1-0-0.005,mtorc1-0
mtorc1-1-0.005,mtorc1-1
HALLMARK_MTORC1_SIGNALING-0-0.005,HALLMARK_MTORC1_SIGNALING-0


## TABLE: All gene sets and their sizes

Copy this to [gene set](https://docs.google.com/spreadsheets/d/1gGO5IHEg-N0hivtHBO6-rdXtin8hPhw-zv6eYOBgXcE/edit#gid=1762479413) tab

In [11]:
df[[GENESET, GENESET_SIZE]].drop_duplicates().style.hide()

gene_set,gene_set_size
glycolysis-gocam-0,10
glycolysis-gocam-1,9
bicluster_RNAseqDB_1001-0,76
bicluster_RNAseqDB_1001-1,63
HALLMARK_HYPOXIA-0,200
HALLMARK_HYPOXIA-1,180
HALLMARK_DNA_REPAIR-0,150
HALLMARK_DNA_REPAIR-1,135
HALLMARK_G2M_CHECKPOINT-0,200
HALLMARK_G2M_CHECKPOINT-1,180


In [12]:
df[[MODEL, METHOD]].drop_duplicates().style.hide()

model,method
4.0,gpt
3.5,gpt
3.0,gpt
,standard
,standard_no_ontology
,random
,rank_based
,


In [13]:
df[[MODEL, METHOD, PROMPT_VARIANT]].drop_duplicates().style.hide()

model,method,prompt_variant
4.0,gpt,v1
4.0,gpt,v2
3.5,gpt,v1
3.5,gpt,v2
3.0,gpt,v1
3.0,gpt,v2
,standard,
,standard_no_ontology,
,random,
,rank_based,


In [14]:
df_orig = df.copy()

df = df.loc[(df["cutoff"] == 0.005) & (df["closure"] == True) & (df["top_n"] == 1)] 
df

Unnamed: 0_level_0,gene_set,cutoff,closure,top_n,source,model,method,method_desc,run,truncation_factor,...,all_predictions_closure,unparsed,true_positive_terms,false_positive_terms,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Subset Analysis

In [15]:

go = get_adapter("sqlite:obo:go")
subsets = list(go.subsets())
subsets_by_term = defaultdict(list)
for subset in subsets:
    members = [m for m in go.subset_members(subset) if m.startswith("GO:")]
    for t in members:
        subsets_by_term[t].append(subset)
    members_ancs = go.ancestors(members)
    anc_subset = f"anc_of_{subset}"
    for a in members_ancs:
        if a not in members:
            subsets_by_term[a].append(anc_subset)

In [16]:
go = get_adapter("sqlite:obo:go")
labels = list(go.labels(go.entities()))

In [17]:
lmap = {id: label for id, label in labels}

In [18]:
objs = []
n = 0
for row in df.to_dict(orient="records"):
    n += 1
    if n % 500 == 0:
        print(row[GENESET])
    for t in row[GO_TERM_IDS]:
        obj = {MODEL: row.get(MODEL), METHOD: row.get(METHOD) , "term": t, "label": lmap.get(t, t)}
        for s in subsets_by_term.get(t, []):
            obj[s] = 1
        objs.append(obj)
print(len(objs))
subsets_df=pd.DataFrame(objs) 
pd.set_option('display.max_rows', 10)
subsets_df

0


### All subsets

In [19]:
subsets_df.fillna(0).groupby([MODEL, METHOD]).mean(numeric_only=True).style.highlight_max(axis=1, props='font-weight:bold').format(precision=3)

KeyError: 'model'

In [None]:
def agg_table(this_df, cols, exclude=[None]):
    qcols = [MODEL, METHOD] + cols
    agg_df = this_df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
    for x in exclude:
        agg_df= agg_df.query(f"method != '{x}'")
    return agg_df.reset_index().style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

pd.options.display.precision = 2
pd.set_option("display.precision", 2)

## TABLE: MAIN RESULTS

In [None]:
CORE_METRICS = [PROPORTION_SIGNIFICANT, HAS_TOP_HIT, NUM_GO_TERMS, NUM_NOVEL, NUM_UNPARSED]
EXCLUDE = ["standard", "standard no ontology", "random", "rank based", "closure"]
agg_table(df, CORE_METRICS, EXCLUDE)
#means.query("method != 'standard'").style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)
#

In [None]:

qcols = [MODEL, METHOD] + CORE_METRICS

print(METHOD)
#agg_df_orig = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
#agg_df_plot = pd.DataFrame( columns=agg_df_orig.columns)#NULL#pd.DataFrame() 
#print(EXCLUDE)
#agg_df_plot = agg_df_plot.query(f"method in @METHOD")


print(qcols)
agg_df_plot = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
for x in EXCLUDE:
    #print(x)
    agg_df_plot= agg_df_plot.query(f"method != '{x}'")
    
print(agg_df_plot.shape)
print("cols")
print(agg_df_plot.columns)

print("index")
print(agg_df_plot.index)

newindex = agg_df_plot.index
#len = agg_df.index.to_flat_index().size()
#print(len)
#for i in range(1,len):
#    newindex[i] = agg_df.index[i]

#print(agg_df.loc[:, 'model'])

#print(agg_df['num GO terms'])

#rework for significant GO terms
xs = agg_df_plot['num GO terms']/agg_df_plot['num unannotated']
ys = agg_df_plot['num unparsed']/agg_df_plot['num unannotated']
#print(agg_df.index)

print(xs)
yells = ["yellow"]*5
reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3
#colors = [*yells, *reds, *purps, *orans]
colors = [*reds, *purps, *orans]
print(colors)

plt.figure(figsize=(6,6))
fig, ax = plt.subplots()

ax.scatter(xs,ys,
           c=colors)
#, label=agg_df.index

for i in range(0,len(xs)):
    ax.annotate(agg_df_plot.index[i], (xs[i], ys[i]))
    
ax.set_xlabel("True terms/Error rate")
ax.set_ylabel("Unparsed terms/Error rate")

#handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='upper left')

plt.xlim(0,75)
plt.ylim(0,75)

plt.savefig("true_vs_unparsed_vs_error.pdf", format="pdf", bbox_inches="tight")

plt.show()

### Core subsets

Copy this to [subsets tab](https://docs.google.com/spreadsheets/d/1gGO5IHEg-N0hivtHBO6-rdXtin8hPhw-zv6eYOBgXcE/edit#gid=669935942)

In [None]:
subsets_cols =  [MODEL, METHOD, "goslim_generic", "anc_of_goslim_generic", "goslim_agr", "anc_of_goslim_agr"]
subsets_grouped = subsets_df.fillna(0).groupby([MODEL, METHOD])[subsets_cols]
means = subsets_grouped.mean(numeric_only=True)
means["closure_of_goslim_generic"] = means["goslim_generic"] + means["anc_of_goslim_generic"]
means["closure_of_goslim_agr"] = means["goslim_agr"] + means["anc_of_goslim_agr"]
means.style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

### TABLE: Above the shoreline in key subsets

[anc subsets tab](https://docs.google.com/spreadsheets/d/1gGO5IHEg-N0hivtHBO6-rdXtin8hPhw-zv6eYOBgXcE/edit#gid=345667144)

In [None]:
sorted_means = means.sort_values("anc_of_goslim_generic", ascending=False)
filtered = sorted_means[["anc_of_goslim_generic", "anc_of_goslim_agr"]]
filtered.style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

In [None]:
means[["anc_of_goslim_generic", "anc_of_goslim_agr"]].sort_values("anc_of_goslim_agr", ascending=True).style.highlight_min(axis=0, props='font-weight:bold').format(precision=3)

In [None]:
# Reset the index of the pivoted DataFrame to have 'profession' as a column
#subsets_df = subsets_df.fillna(0).groupby([MODEL, METHOD])[subsets_cols].mean(numeric_only=True).reset_index()

# Melt the DataFrame to have 'category', 'profession', and 'percentage' columns
#melted_df = subsets_df.melt(id_vars=[MODEL, METHOD], var_name='subset', value_name='proportion')
#melted_df["mm"] = melted_df[MODEL] + melted_df[METHOD]
#melted_df
# Create a bar plot using Seaborn
#plt.figure(figsize=(10, 6))
#sns.barplot(x='subset', y='proportion', hue="mm", data=melted_df)
#plt.title('Subsets by method')
#plt.xlabel('Subset')
#plt.ylabel('Proportion')
#plt.legend(title='Subsets')
#plt.show()

## Evaluation

In [None]:
agg_table(df, [NUM_GO_TERMS, SIZE_OVERLAP, NR_SIZE_OVERLAP], ["standard"])

In [None]:
eval_summary_cols = [HAS_TOP_HIT, IN_TOP_5, IN_TOP_10, SIZE_OVERLAP, SIMILARITY, NUM_TERMS, NUM_GO_TERMS, NR_SIZE_OVERLAP, NR_SIMILARITY, MEAN_P_VALUE, MIN_P_VALUE, MAX_P_VALUE, PROPORTION_SIGNIFICANT, NUM_NOVEL]
agg_table(df, eval_summary_cols)

## as above, no perturbation

In [None]:
df_no_perturb = df.query(f"{GENE_RANDOMIZATION_FACTOR} == 0")
agg_table(df_no_perturb, CORE_METRICS, EXCLUDE)


## Maximums

In [None]:
df[[MODEL, METHOD] + eval_summary_cols].groupby([MODEL, METHOD]).max(numeric_only=True).style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

### Effect of truncation

Larger gene sets penalize annotation-based GPT methods due to the necessity to truncate to fit in the window size.



In [None]:

sns.set(color_codes=True)
np.random.seed(sum(map(ord, "regression")))
sns.lmplot(x=GENESET_SIZE, y=TRUNCATION_FACTOR, data=df.query("method=='ontological_synopsis'"))
plt.show()

In [None]:
dftups = [(method, df.query(f"method=='{method}'")) for method in ["ontological_synopsis", "narrative_synopsis"]]
for m, mdf in dftups:
    mdf.assign(dataset=m)
concatenated = pd.concat([mdf.assign(dataset=m) for m, mdf in dftups])
sns.scatterplot(x=GENESET_SIZE, y=TRUNCATION_FACTOR, data=concatenated, style='dataset', hue='model')

In [None]:
df.query(f"{GENESET_SIZE} < 50")[[SOURCE_GENESET]].drop_duplicates()

In [None]:
df_small = df.query(f"{GENESET_SIZE} < 50")
means = df_small[[MODEL, METHOD] + eval_summary_cols].groupby(['model', 'method']).mean(numeric_only=True)
means

In [None]:
def color_lightness(rgb, lightness_scale):
    h, l, s = colorsys.rgb_to_hls(*rgb)
    return colorsys.hls_to_rgb(h, min(1, l * lightness_scale), s = s)

In [None]:

agg_df_plot = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
for x in EXCLUDE:
    #print(x)
    agg_df_plot = agg_df_plot.query(f"method != '{x}'")    
    
agg_df_small_plot = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
for x in EXCLUDE:
    #print(x)
    agg_df_small_plot = agg_df_small_plot.query(f"method != '{x}'")
    
agg_df_plot_std = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).std(numeric_only=True)
for x in EXCLUDE:
    #print(x)
    agg_df_plot_std = agg_df_plot_std.query(f"method != '{x}'")    
    
agg_df_small_plot_std = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).std(numeric_only=True)
for x in EXCLUDE:
    #print(x)
    agg_df_small_plot_std = agg_df_small_plot_std.query(f"method != '{x}'")


print(agg_df_small_plot.shape)
print("cols")
print(agg_df_small_plot.columns)

print("index")
print(agg_df_small_plot.index)


In [None]:

qcols = [MODEL, METHOD] + CORE_METRICS

print(METHOD)
#agg_df_orig = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
#agg_df = pd.DataFrame( columns=agg_df_orig.columns)#NULL#pd.DataFrame() 
#print(EXCLUDE)
#agg_df = agg_df_orig.query(f"method in @METHOD")

print(qcols)

newindex = agg_df_plot.index
#len = agg_df.index.to_flat_index().size()
#print(len)
#for i in range(1,len):
#    newindex[i] = agg_df.index[i]

#print(agg_df.loc[:, 'model'])

#print(agg_df['num GO terms'])

#rework for significant GO terms
xs = agg_df_plot['num GO terms']/agg_df_plot['num unannotated']
ys = agg_df_plot['num unparsed']/agg_df_plot['num unannotated']

xs_small = agg_df_small_plot['num GO terms']/agg_df_small_plot['num unannotated']
ys_small = agg_df_small_plot['num unparsed']/agg_df_small_plot['num unannotated']

#print(agg_df.index)

print(xs)
yells = ["yellow"]*5
reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3


#colors = [*yells, *reds, *purps, *orans]
colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3
#triangles= ['^']*3
#squares= ['s']*3

markers = [*circles]#, *triangles, *squares]

SMALL_SIZE = 6
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(6,6))
fig, ax = plt.subplots()

#ax.scatter(xs,ys, c=colors, s=50, marker=marker)
#ax.scatter(xs_small,xs_small, c=colors, s=25, marker=marker)

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    #print(um)
    #print(markers.index(um))    
    #print([index for index, elem in enumerate(markers) if elem == um])
    mask = [index for index, elem in enumerate(markers) if elem == um]# markers == um
    maskar = np.array(mask)
    print(mask)
    print(type(mask))
    print(maskar)
    print(type(maskar))
    minv = mask[0]
    maxv = mask[2]
    print(str(minv)+"\t"+str(maxv))
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    print(nowcol)
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=150, alpha=0.5, edgecolors="black",  linewidth=0.5)
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c=nowcol, s=50, alpha=0.5, edgecolors="black",  linewidth=0.5)
    
#, label=agg_df.index
agg_df_plot_labels = agg_df_plot.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  
ax.set_xlabel("True terms/Error")
ax.set_ylabel("Unparsed terms/Error")

#handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='upper left')

plt.xlim(5,70)
plt.ylim(5,70)
plt.gca().set_aspect("equal")

plt.savefig("true_vs_unparsed_vs_error.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:

qcols = [MODEL, METHOD] + CORE_METRICS

print(METHOD)
#agg_df_orig = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
#agg_df = pd.DataFrame( columns=agg_df_orig.columns)#NULL#pd.DataFrame() 
#print(EXCLUDE)
#agg_df = agg_df_orig.query(f"method in @METHOD")

print(qcols)

newindex = agg_df_plot.index
#len = agg_df.index.to_flat_index().size()
#print(len)
#for i in range(1,len):
#    newindex[i] = agg_df.index[i]

#print(agg_df.loc[:, 'model'])

#print(agg_df['num GO terms'])

#rework for significant GO terms
xs = agg_df_plot['has top term']/agg_df_plot['num unannotated']
ys = agg_df_plot['proportion significant']/agg_df_plot['num unannotated']

xs_small = agg_df_small_plot['has top term']/agg_df_small_plot['num unannotated']
ys_small = agg_df_small_plot['proportion significant']/agg_df_small_plot['num unannotated']

#print(agg_df.index)

print(xs)
print(ys)
print(np.max(xs))
print(np.max(ys))
print(np.max(xs_small))
print(np.max(ys_small))


yells = ["yellow"]*5
reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3


#colors = [*yells, *reds, *purps, *orans]
colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3
#triangles= ['^']*3
#squares= ['s']*3

markers = [*circles]#, *triangles, *squares]

SMALL_SIZE = 6
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(6,6))
fig, ax = plt.subplots()

#ax.scatter(xs,ys, c=colors, s=50, marker=marker)
#ax.scatter(xs_small,xs_small, c=colors, s=25, marker=marker)

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    #print(um)
    #print(markers.index(um))    
    #print([index for index, elem in enumerate(markers) if elem == um])
    mask = [index for index, elem in enumerate(markers) if elem == um]# markers == um
    maskar = np.array(mask)
    print(mask)
    print(type(mask))
    print(maskar)
    print(type(maskar))
    minv = mask[0]
    maxv = mask[2]
    print(str(minv)+"\t"+str(maxv))
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    print(nowcol)
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=150, alpha=0.5, edgecolors="black",  linewidth=0.5)
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c=nowcol, s=50, alpha=0.5, edgecolors="black",  linewidth=0.5)
    
#, label=agg_df.index
agg_df_plot_labels = agg_df_plot.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  
ax.set_xlabel("Has top term/Error")
ax.set_ylabel("Proportion significant/Error")

#handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='upper left')

plt.xlim(0,7)
plt.ylim(0,7)
plt.gca().set_aspect("equal")

plt.savefig("top_vs_proportion_vs_error.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:

qcols = [MODEL, METHOD] + CORE_METRICS

print(METHOD)
#agg_df_orig = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
#agg_df = pd.DataFrame( columns=agg_df_orig.columns)#NULL#pd.DataFrame() 
#print(EXCLUDE)
#agg_df = agg_df_orig.query(f"method in @METHOD")

print(qcols)

agg_df_plot_all = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])

#pd.concat(map(lambda x: x[1], groups)).sort_index(
agg_df_plot_all = agg_df_plot_all.apply(lambda x: x)
#
#agg_df_plot_all = agg_df_plot_all.head(1000000)

print(agg_df_plot_all.shape)
type(agg_df_plot_all)
#print(agg_df_plot_all.iloc[:200])
for x in EXCLUDE:
    #print(x)
    agg_df_plot_all = agg_df_plot_all.query(f"method != '{x}'")    
    
agg_df_small_plot_all = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])
agg_df_small_plot_all = agg_df_small_plot_all.apply(lambda x: x)

for x in EXCLUDE:
    #print(x)
    agg_df_small_plot_all = agg_df_small_plot_all.query(f"method != '{x}'")
    
newindex = agg_df_plot_all.index
#len = agg_df.index.to_flat_index().size()
#print(len)
#for i in range(1,len):
#    newindex[i] = agg_df.index[i]

#print(agg_df.loc[:, 'model'])
#print(agg_df['num GO terms'])

#rework for significant GO terms
xs = agg_df_plot_all['has top term']
ys = agg_df_plot_all['proportion significant']

xs_small = agg_df_small_plot_all['has top term']
ys_small = agg_df_small_plot_all['proportion significant']

#print(agg_df.index)

#print(xs)
#print(ys)
#print(np.max(xs))
#print(np.max(ys))
#print(np.max(xs_small))
#print(np.max(ys_small))


#yells = ["yellow"]*5
reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3


#colors = [*yells, *reds, *purps, *orans]
colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3
#triangles= ['^']*3
#squares= ['s']*3

markers = [*circles]#, *triangles, *squares]

SMALL_SIZE = 6
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(6,6))
fig, ax = plt.subplots()

#ax.scatter(xs,ys, c=colors, s=50, marker=marker)
#ax.scatter(xs_small,xs_small, c=colors, s=25, marker=marker)

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    #print(um)
    #print(markers.index(um))    
    #print([index for index, elem in enumerate(markers) if elem == um])
    mask = [index for index, elem in enumerate(markers) if elem == um]# markers == um
    maskar = np.array(mask)
    #print(mask)
    #print(type(mask))
    #print(maskar)
    #print(type(maskar))
    minv = mask[0]
    maxv = mask[2]
    #print(str(minv)+"\t"+str(maxv))
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    #print(nowcol)
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=150, alpha=0.5, edgecolors="black",  linewidth=0.5)
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c=nowcol, s=50, alpha=0.5, edgecolors="black",  linewidth=0.5)
    
#, label=agg_df.index
agg_df_plot_labels = agg_df_plot_all.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
for i in range(0,len(xs_small)):
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  

ax.set_xlabel("Has top term/Error")
ax.set_ylabel("Proportion significant/Error")

#handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='upper left')

plt.xlim(0,1)
plt.ylim(0,1)
plt.gca().set_aspect("equal")

plt.savefig("top_vs_proportion__individual.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
df.columns
#num GO terms = TP
#num unannotated = FP
#num terms
#num unparsed

In [None]:
CORE_METRICS2 = [FP, TP]
qcols = [MODEL, METHOD] + CORE_METRICS2

print(METHOD)
print(qcols)

agg_df_plot_tpfp = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)

agg_df_plot_tpfp = agg_df_plot_tpfp.apply(lambda x: x)

print(agg_df_plot_tpfp.shape)
type(agg_df_plot_tpfp)
for x in EXCLUDE:
    agg_df_plot_tpfp = agg_df_plot_tpfp.query(f"method != '{x}'")    


agg_df_small_plot_tpfp = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
agg_df_small_plot_tpfp = agg_df_small_plot_tpfp.apply(lambda x: x)
print(agg_df_small_plot_tpfp.columns)

for x in EXCLUDE:
    agg_df_small_plot_tpfp = agg_df_small_plot_tpfp.query(f"method != '{x}'")
    
newindex = agg_df_plot_tpfp.index

xs = agg_df_plot_tpfp['true positive rate across a set of gene GO annotations']
ys = agg_df_plot_tpfp['false positive rate across a set of gene GO annotations']

xs_small = agg_df_small_plot_tpfp['true positive rate across a set of gene GO annotations']
ys_small = agg_df_small_plot_tpfp['false positive rate across a set of gene GO annotations']

reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3

colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3

markers = [*circles]

SMALL_SIZE = 6
plt.figure()
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(10,6))
fig, ax = plt.subplots()

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    mask = [index for index, elem in enumerate(markers) if elem == um]
    maskar = np.array(mask)
    minv = mask[0]
    maxv = mask[2]
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=150, alpha=0.5, edgecolors="black",  linewidth=0.5)#, marker=um#, c=nowcol
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c=nowcol, s=50, alpha=0.5, edgecolors="black",  linewidth=0.5)#, marker=um#, c=nowcol
    
agg_df_plot_labels = agg_df_plot_tpfp.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
for i in range(0,len(xs_small)):
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  

xmax = 0.3
ymax = 0.6
# Major ticks every 20, minor ticks every 5
xmajor_ticks = np.arange(0, xmax, 0.05)
xminor_ticks = np.arange(0, xmax, 0.01)
ymajor_ticks = np.arange(ymax, 1, 0.1)
yminor_ticks = np.arange(ymax, 1, 0.05)

ax.set_xticks(xmajor_ticks)
ax.set_xticks(xminor_ticks, minor=True)
ax.set_yticks(ymajor_ticks)
ax.set_yticks(yminor_ticks, minor=True)

# And a corresponding grid
ax.grid(which='both')

# Or if you want different settings for the grids:
ax.grid(which='minor', alpha=0.5)
ax.grid(which='major', alpha=0.5)
#ax.grid(color="blue")


ax.set_xlabel("FP")
ax.set_ylabel("TP")

plt.xlim(0,xmax)
plt.ylim(ymax,1)
#plt.gca().set_aspect("equal")

plt.savefig("TP_vs_FP.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
CORE_METRICS3 = [RECALL, PRECISION]
qcols = [MODEL, METHOD] + CORE_METRICS3

print(METHOD)
print(qcols)

agg_df_plot_pr = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
agg_df_plot_pr = agg_df_plot_pr.apply(lambda x: x)

print(agg_df_plot_pr.shape)
type(agg_df_plot_pr)
for x in EXCLUDE:
    agg_df_plot_pr = agg_df_plot_pr.query(f"method != '{x}'")    

print(agg_df_plot_pr.columns)

agg_df_small_plot_pr = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
agg_df_small_plot_pr = agg_df_small_plot_pr.apply(lambda x: x)

for x in EXCLUDE:
    agg_df_small_plot_pr = agg_df_small_plot_pr.query(f"method != '{x}'")
    
newindex = agg_df_plot_pr.index

xs = agg_df_plot_pr['recall']
ys = agg_df_plot_pr['precision']

xs_small = agg_df_small_plot_pr['recall']
ys_small = agg_df_small_plot_pr['precision']

reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3

colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3

markers = [*circles]

SMALL_SIZE = 6
plt.figure()
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(10,6))
fig, ax = plt.subplots()

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    mask = [index for index, elem in enumerate(markers) if elem == um]
    maskar = np.array(mask)
    minv = mask[0]
    maxv = mask[2]
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=200, alpha=0.5, edgecolors="black",  linewidth=0.5)#, marker=um#, c=nowcol
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c="white", s=100, alpha=0.5, edgecolors=nowcol,  linewidth=2)#, marker=um#, c=nowcol
    
agg_df_plot_labels = agg_df_plot_pr.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
for i in range(0,len(xs_small)):
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  

xmax = 0.3
ymax = 0.6
# Major ticks every 20, minor ticks every 5
xmajor_ticks = np.arange(0, 1, 0.05)
xminor_ticks = np.arange(0, 1, 0.01)
ymajor_ticks = np.arange(0, 1, 0.05)
yminor_ticks = np.arange(0, 1, 0.01)

ax.set_xticks(xmajor_ticks)
ax.set_xticks(xminor_ticks, minor=True)
ax.set_yticks(ymajor_ticks)
ax.set_yticks(yminor_ticks, minor=True)

# And a corresponding grid
ax.grid(which='both')

# Or if you want different settings for the grids:
ax.grid(which='minor', alpha=0.5)
ax.grid(which='major', alpha=0.5)
#ax.grid(color="blue")


ax.set_xlabel("Recall")
ax.set_ylabel("Precision")

plt.xlim(0,0.1)
plt.ylim(0.8,1)
#plt.gca().set_aspect("equal")

plt.savefig("precision_vs_recall.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
agg_df_plot_pr.index

In [None]:
CORE_METRICS3 = [RECALL, PRECISION]
qcols = [MODEL, METHOD] + CORE_METRICS3

agg_df_plot_pr_all = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])
agg_df_plot_pr_all = agg_df_plot_pr_all.apply(lambda x: x)

for x in EXCLUDE:
    agg_df_plot_pr_all = agg_df_plot_pr_all.query(f"method != '{x}'")    

agg_df_small_plot_pr_all = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])
agg_df_small_plot_pr_all = agg_df_small_plot_pr_all.apply(lambda x: x)

for x in EXCLUDE:
    agg_df_small_plot_pr_all = agg_df_small_plot_pr_all.query(f"method != '{x}'")

print(agg_df_plot_pr_all.columns)


dfpvals = pd.DataFrame(index=range(9),columns=range(9))
print(agg_df_plot_pr_all.loc[:, 'model'].unique())
print(agg_df_plot_pr_all.loc[:, 'method'].unique())
labels = []
labels2 = []
count = 0

for model in agg_df_plot_pr_all.loc[:, 'model'].unique():  
    print("count "+str(count))
    for method in agg_df_plot_pr_all.loc[:, 'method'].unique():
        #print("1 "+model+"\t"+method)
        curmethod = model+"__"+method 
        labels.append(curmethod)
        #print((agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method))
        vals1 = agg_df_plot_pr_all[(agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method)]
        print(vals1.shape)
        count2 = 0
        first = False
        for model2 in agg_df_plot_pr_all.loc[:, 'model'].unique():             
            for method2 in agg_df_plot_pr_all.loc[:, 'method'].unique():
                print("count2 "+str(count2))                
                curmethod2 = model2+"__"+method2 
                if(not first):
                    labels2.append(curmethod2)
                if(curmethod != curmethod2):
                    print("1 "+model+"\t"+method+"\t"+"2 "+model2+"\t"+method2)
                    #print((agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method))
                    vals2 = agg_df_plot_pr_all[(agg_df_plot_pr_all['model'] == model2) & (agg_df_plot_pr_all['method'] == method2)]
                    print(vals2.shape)
                    pval= ttest_ind(vals1['precision'], vals2['precision'], alternative='greater', equal_var = False)
                    print(pval)
                    print(pval[1])
                    #print(type(pval[1]))
                    dfpvals.iloc[count, count2] = pval[1]
                else:
                    dfpvals.iloc[count, count2] = 0
                count2 = count2 + 1
        first = True
        count = count + 1

dfpvals = dfpvals.apply(pd.to_numeric)
dfpvals.index = labels
dfpvals.columns = labels

dfpvals_correct = dfpvals.copy()
dfpvals_correct = multipletests(dfpvals.to_numpy().flatten(), method='fdr_bh')
print(type(dfpvals_correct))
dfpvals_correct = [*dfpvals_correct[1]]
print(type(dfpvals_correct))
print(dfpvals_correct)

dfpvals_correct_lists = [dfpvals_correct[x:x+9] for x in range(0, len(dfpvals_correct), 9)]#list(it.batched(iterable, n))
dfpvals_correct = pd.DataFrame(dfpvals_correct_lists, index =labels, columns =labels)

In [None]:

#for c in dfpvals.select_dtypes(include = [np.number]).columns:
#    dfpvals[c] = np.log10(dfpvals[c].values)
    
ax = plt.axes()
#dfpvals_plot = dfpvals
#dfpvals_plot[dfpvals_plot < 0.0001] = 1
#dfpvals_plot[dfpvals_plot >= 0.0001] = 0

#dfpvals[dfpvals < 0.0001] = 0

#numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']#
#dfpvalslog = dfpvals.copy()
#for c in [c for c in dfpvals.columns if dfpvals[c].dtype in numeric]:
#    dfpvalslog[c] = np.log10(dfpvals[c])
    
sns.heatmap(dfpvals_correct, cmap="YlGnBu", annot=True, annot_kws={"fontsize":8}, ax = ax)
ax.set_title('Precision t-test')

plt.savefig("methodpairs_precision_correct_ttest.pdf", format="pdf", bbox_inches="tight")


plt.show()

In [None]:
g = sns.clustermap(dfpvals_correct, cmap="YlGnBu", annot =True,annot_kws={"fontsize":8})

ax = g.ax_heatmap

rorder = g.dendrogram_row.reordered_ind
corder = g.dendrogram_col.reordered_ind

#countr = 0
#for r in dfpvals.index:
#    countc = 0
#    for c in dfpvals.columns:
#        #print(dfpvals.iloc[countr, countc])
#        if(pd.to_numeric(dfpvals.iloc[countr, countc]) < 0.001):
#            outf = "{r:}\t{c:}\t\t\t{nr:}\t{nc:}"
#            print(outf.format(r=countr, c=countc, nr=rorder[countr], nc=corder[countc]))
#            #len(rorder) - rorder[countr]-1
#            ax.add_patch(Rectangle((corder[countc], len(rorder) - rorder[countr]-1), 1, 1, fill=False, edgecolor='orange', lw=2))
#        countc=countc+1
#    countr=countr+1

ax.set_title('Precision t-test')
plt.savefig("methodpairs_precision_ttest_correct_cluster.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
CORE_METRICS3 = [RECALL, PRECISION]
qcols = [MODEL, METHOD] + CORE_METRICS3

agg_df_plot_pr_all = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])
agg_df_plot_pr_all = agg_df_plot_pr_all.apply(lambda x: x)

for x in EXCLUDE:
    agg_df_plot_pr_all = agg_df_plot_pr_all.query(f"method != '{x}'")    

agg_df_small_plot_pr_all = df_small.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD])
agg_df_small_plot_pr_all = agg_df_small_plot_pr_all.apply(lambda x: x)

for x in EXCLUDE:
    agg_df_small_plot_pr_all = agg_df_small_plot_pr_all.query(f"method != '{x}'")

print(agg_df_plot_pr_all.columns)


dfpvals = pd.DataFrame(index=range(9),columns=range(9))
print(agg_df_plot_pr_all.loc[:, 'model'].unique())
print(agg_df_plot_pr_all.loc[:, 'method'].unique())
labels = []
labels2 = []
count = 0

for model in agg_df_plot_pr_all.loc[:, 'model'].unique():  
    print("count "+str(count))
    for method in agg_df_plot_pr_all.loc[:, 'method'].unique():
        #print("1 "+model+"\t"+method)
        curmethod = model+"__"+method 
        labels.append(curmethod)
        #print((agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method))
        vals1 = agg_df_plot_pr_all[(agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method)]
        print(vals1.shape)
        count2 = 0
        first = False
        for model2 in agg_df_plot_pr_all.loc[:, 'model'].unique():             
            for method2 in agg_df_plot_pr_all.loc[:, 'method'].unique():
                print("count2 "+str(count2))                
                curmethod2 = model2+"__"+method2 
                if(not first):
                    labels2.append(curmethod2)
                if(curmethod != curmethod2):
                    print("1 "+model+"\t"+method+"\t"+"2 "+model2+"\t"+method2)
                    #print((agg_df_plot_pr_all['model'] == model) & (agg_df_plot_pr_all['method'] == method))
                    vals2 = agg_df_plot_pr_all[(agg_df_plot_pr_all['model'] == model2) & (agg_df_plot_pr_all['method'] == method2)]
                    print(vals2.shape)
                    pval= ttest_ind(vals1['recall'], vals2['recall'], alternative='greater', equal_var = False)
                    print(pval)
                    print(pval[1])
                    #print(type(pval[1]))
                    dfpvals.iloc[count, count2] = pval[1]
                else:
                    dfpvals.iloc[count, count2] = 0
                count2 = count2 + 1
        first = True
        count = count + 1

dfpvals = dfpvals.apply(pd.to_numeric)
dfpvals.index = labels
dfpvals.columns = labels

dfpvals_correct = dfpvals.copy()
dfpvals_correct = multipletests(dfpvals.to_numpy().flatten(), method='fdr_bh')
print(type(dfpvals_correct))
dfpvals_correct = [*dfpvals_correct[1]]
print(type(dfpvals_correct))
print(dfpvals_correct)

dfpvals_correct_lists = [dfpvals_correct[x:x+9] for x in range(0, len(dfpvals_correct), 9)]#list(it.batched(iterable, n))
dfpvals_correct = pd.DataFrame(dfpvals_correct_lists, index =labels, columns =labels)

dfpvals_correct

In [None]:

ax = plt.axes()
#dfpvals_plot = dfpvals
#dfpvals_plot[dfpvals_plot < 0.0001] = 1
#dfpvals_plot[dfpvals_plot >= 0.0001] = 0

#dfpvals[dfpvals < 0.0001] = 0

numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#dfpvalslog = dfpvals.copy()
#for c in [c for c in dfpvals.columns if dfpvals[c].dtype in numeric]:
#    dfpvalslog[c] = np.log10(dfpvals[c])
    
sns.heatmap(dfpvals_correct, cmap="YlGnBu", annot=True, annot_kws={"fontsize":8}, ax = ax)
ax.set_title('Recall t-test')

plt.savefig("methodpairs_recall_ttest_correct.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
g = sns.clustermap(dfpvals_correct, cmap="YlGnBu", annot =True, annot_kws={"fontsize":8})

ax = g.ax_heatmap

rorder = g.dendrogram_row.reordered_ind
corder = g.dendrogram_col.reordered_ind

#countr = 0
#for r in dfpvals.index:
#    countc = 0
#    for c in dfpvals.columns:
#        #print(dfpvals.iloc[countr, countc])
#        if(pd.to_numeric(dfpvals.iloc[countr, countc]) < 0.001):
#            outf = "{r:}\t{c:}\t\t\t{nr:}\t{nc:}"
#            #print(outf.format(r=countr, c=countc, nr=rorder[countr], nc=corder[countc]))
#            #len(rorder) - rorder[countr]-1
#            ax.add_patch(Rectangle((corder[countc], len(rorder) - rorder[countr]-1), 1, 1, fill=False, edgecolor='orange', lw=2))
#        countc=countc+1
#    countr=countr+1

ax.set_title('Recall t-test')
plt.savefig("methodpairs_recall_ttest_correct_cluster.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:

qcols = [MODEL, METHOD] + CORE_METRICS

print(METHOD)
#agg_df_orig = df.replace(r"_", " ", regex=True)[qcols].groupby([MODEL, METHOD]).mean(numeric_only=True)
#agg_df = pd.DataFrame( columns=agg_df_orig.columns)#NULL#pd.DataFrame() 
#print(EXCLUDE)
#agg_df = agg_df_orig.query(f"method in @METHOD")

print(qcols)

newindex = agg_df_plot.index
#len = agg_df.index.to_flat_index().size()
#print(len)
#for i in range(1,len):
#    newindex[i] = agg_df.index[i]

#print(agg_df.loc[:, 'model'])

#print(agg_df['num GO terms'])

#rework for significant GO terms
xs = agg_df_plot['has top term']
ys = agg_df_plot['proportion significant']
xs_er = agg_df_plot_std['has top term']
ys_er = agg_df_plot_std['proportion significant']

xs_small = agg_df_small_plot['has top term']
ys_small = agg_df_small_plot['proportion significant']
xs_small_er = agg_df_small_plot_std['has top term']
ys_small_er = agg_df_small_plot_std['proportion significant']


#print(agg_df.index)

print(xs)
print(ys)
print(np.max(xs))
print(np.max(ys))
print(np.max(xs_small))
print(np.max(ys_small))


yells = ["yellow"]*5
reds = ["red"] *3
purps = ["purple"] *3
orans = ["orange"]*3

colorr = mpl.colors.ColorConverter.to_rgb("red")
rgbr = [color_lightness(colorr, scale) for scale in [2]]
colorp = mpl.colors.ColorConverter.to_rgb("purple")
rgbp = [color_lightness(colorp, scale) for scale in [2]]
coloro = mpl.colors.ColorConverter.to_rgb("orange")
rgbo = [color_lightness(coloro, scale) for scale in [2]]

redss = rgbr *3
purpss = rgbp *3
oranss = rgbo *3


#colors = [*yells, *reds, *purps, *orans]
colors = [*reds, *purps, *orans]
print(colors)

colors_small = [*redss, *purpss, *oranss]
print(colors_small)

circles= ['o','^','s']*3
#triangles= ['^']*3
#squares= ['s']*3

markers = [*circles]#, *triangles, *squares]

SMALL_SIZE = 6
plt.rc('font', size=SMALL_SIZE)   
plt.figure(figsize=(6,6))
fig, ax = plt.subplots()

#ax.scatter(xs,ys, c=colors, s=50, marker=marker)
#ax.scatter(xs_small,xs_small, c=colors, s=25, marker=marker)

unique_markers = set(markers)
print(unique_markers)
print(markers)

for um in unique_markers:
    #print(um)
    #print(markers.index(um))    
    #print([index for index, elem in enumerate(markers) if elem == um])
    mask = [index for index, elem in enumerate(markers) if elem == um]# markers == um
    maskar = np.array(mask)
    print(mask)
    print(type(mask))
    print(maskar)
    print(type(maskar))
    minv = mask[0]
    maxv = mask[2]
    print(str(minv)+"\t"+str(maxv))
    nowcol = [colors[mask[0]], colors[mask[1]], colors[mask[2]]]
    print(nowcol)
    
    ax.scatter(xs[mask], ys[mask], marker=um, c=nowcol, s=150, alpha=0.5, edgecolors="black",  linewidth=0.5)
    ax.scatter(xs_small[mask], ys_small[mask], marker=um, c=nowcol, s=50, alpha=0.5, edgecolors="black",  linewidth=0.5)

    ax.errorbar(xs[mask], ys[mask], xerr= xs_er[mask], yerr=ys_er[mask], fmt="o", elinewidth=0.4, capsize=0.4)# ecolor=nowcol,
    ax.errorbar(xs_small[mask], ys_small[mask], xerr= xs_small_er[mask], yerr=ys_small_er[mask], fmt="o", elinewidth=0.4, capsize=0.4)
    
#, label=agg_df.index
agg_df_plot_labels = agg_df_plot.index.values
for i in range(0, len(xs)):
    agg_df_plot_labels[i] = (agg_df_plot_labels[i][0] + ": " + agg_df_plot_labels[i][1]).replace("(","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(")","")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace(" - "," : ")
    agg_df_plot_labels[i] = agg_df_plot_labels[i].replace("gpt","GPT")        
                
for i in range(0,len(xs)):
    ax.annotate(agg_df_plot_labels[i], (xs[i], ys[i]))
    ax.annotate(agg_df_plot_labels[i], (xs_small[i], ys_small[i]))  
ax.set_xlabel("Has top term")
ax.set_ylabel("Proportion significant")

#handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles, labels, loc='upper left')

plt.xlim(-0.2,1)
plt.ylim(-0.2,1)
plt.gca().set_aspect("equal")

plt.savefig("top_vs_proportion_std.pdf", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
means.query("method != 'standard'").style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

## TABLE: evaluation for gene sets < 75

In [None]:
agg_table(df_small, CORE_METRICS, EXCLUDE)

## Looking at individual gene sets

In [None]:

def terms_summary(df, variant="v1", max_rows=9999):
    term_dict = {}
    n = 0
    for _, row in df.iterrows():
        if row[PROMPT_VARIANT] and row[PROMPT_VARIANT] != variant:
            continue
        n += 1
        if n > max_rows:
            break
        model = row['model'] 
        if "turbo" in model:
            model = "turbo"
        elif "davinci" in model:
            model = "dav"
        else:
            model = ""
        method = str(row['method']).replace('_', ' ')
        if method in ["closure", "rank_based", "random"]:
            continue
        mm = f"{model} {method}"
        if method == "standard":
            nr_term_ids = list(filter_redundant(row[TERM_IDS]))
        else:
            nr_term_ids = None
        for ix, t_id in enumerate(row[TERM_IDS]):
            if t_id not in term_dict:
                t = {"id": t_id, "label": go.label(t_id), "redundant": False}
                term_dict[t_id] = t
            else:
                t = term_dict[t_id]
            t[mm] = ix
            if nr_term_ids and t_id not in nr_term_ids:
                t["redundant"] = True
    objs = list(term_dict.values())
    return pd.DataFrame(objs)

In [None]:
import oaklib.datamodels.obograph as og
from oaklib.utilities.obograph_utils import graph_to_image, default_stylemap_path
!mkdir -p output

MMAP = {"standard": None, 
        "gpt-3.5-turbo.no_synopsis": "NS",
        "gpt-3.5-turbo.ontological_synopsis": "ONT",
        "gpt-3.5-turbo.narrative_synopsis": "NAR",
       }

def viz(geneset, include_std=True, mmap = MMAP, variant="v1"):
    methods = mmap.keys()
    [gsobj] = [c for c in comps if c.name == geneset]
    std = gsobj.payloads["standard"]
    t2p = {e.class_id : e.p_value_adjusted for e in std.enrichment_results}
    terms = set()
    terms.add("GO:0008150")
    terms.add("GO:0003674")
    terms.add("GO:0005575")
    m2t = defaultdict(list)
    t2m = defaultdict(set)
    seeds = []
    for m in methods:
        if m == "standard":
            mv = m
        else:
            mv = f"{m}.{variant}"
        p = gsobj.payloads[mv]
        if include_std or m != "standard":
            terms.update(p.term_ids)
        for t in p.term_ids:
            if t.startswith("MONDO:"):
                continue
            m2t[m].append(t)
            t2m[t].add(m)
            if m == "standard":
                seeds.append(t)
    # rels = go.gap_fill_relationships(list(terms), predicates=[IS_A, PART_OF, "RO:0002211", "RO:0002212", RO:0002213"])
    rels = go.gap_fill_relationships(list(terms), predicates=[IS_A, PART_OF])
    g = go.relationships_to_graph(rels)
    for n in g.nodes:
        if not n.lbl:
            n.lbl = go.label(n.id)
        if not n.lbl:
            n.lbl = n.id
        if n.id in t2p:
            n.lbl += f" {t2p[n.id]:.2e}"
        #for m in t2m[n.id]:
        #    n.lbl += f" [<b>{m}</b>]"
    for m in m2t.keys():
        if m == "standard":
            continue
        for t in m2t[m]:
            n = og.Node(id=f"{mmap[m]}:{t}", lbl=mmap[m])
            g.nodes.append(n)
            g.edges.append(og.Edge(n.id, "has", t))
    outfile = f"output/{geneset.replace(' ', '_')}-{include_std}-{variant}.png"
    graph_to_image(g, seeds=seeds, imgfile=outfile, stylemap="conf/enr-style.json")
    graph_to_image(g, seeds=seeds, imgfile=outfile, stylemap="conf/enr-style.json")
    #return g
    
viz('peroxisome-0')
viz('peroxisome-0', variant="v2")

In [None]:
def geneset_summary(df, geneset):
    sdf = terms_summary(df.query(f"{GENESET} == '{geneset}'").sort_values("similarity", ascending=False))
    [gsobj] = [c for c in comps if c.name == geneset]
    std = gsobj.payloads["standard"]
    t2p = {e.class_id : e.p_value_adjusted for e in std.enrichment_results}
    sdf["p_label"] = sdf.apply(lambda row: str(row.label) + " " + str(t2p.get(row.id, "")), axis=1)
    return sdf

geneset_summary(df, 'peroxisome-0')

### Peroxisome

In [None]:
peroxisome = df.query(f"{GENESET} == 'peroxisome-0'").sort_values("similarity", ascending=False)
peroxisome[[MODEL, METHOD] + eval_summary_cols]

In [None]:
# terms_summary(peroxisome).style.highlight_min(axis=1, props='font-weight:bold', numeric_only=True)
terms_summary(peroxisome)

In [None]:
terms_summary(peroxisome, "v2")

## Sensory Ataxia

In [None]:
ataxia = df.query(f"{GENESET} == 'sensory ataxia-0'").sort_values("similarity", ascending=False)
ataxia[[MODEL, METHOD] + eval_summary_cols]                                                  

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)


In [None]:
ataxia[[MODEL, METHOD, PROMPT_VARIANT, GO_TERM_IDS, NOVEL_LABELS]]

In [None]:
viz('sensory ataxia-0')

![img](output/sensory_ataxia-0-True-v1.png)

In [None]:
viz('sensory ataxia-0', variant="v2")

![img](output/sensory_ataxia-0-True-v2.png)

In [None]:
terms_summary(ataxia)

In [None]:
def retrieve_payload(geneset, method):
    for comp in comps:
        if comp.name == geneset:
            return comp.payloads[method]

In [None]:
print(retrieve_payload("sensory ataxia-0", "gpt-3.5-turbo.ontological_synopsis.v1").response_text)

In [None]:
print(retrieve_payload("sensory ataxia-0", "gpt-3.5-turbo.narrative_synopsis.v1").response_text)

In [None]:
print(retrieve_payload("sensory ataxia-0", "gpt-3.5-turbo.no_synopsis.v1").response_text)

## T cell proliferation

In [None]:
tcp = df.query(f"{GENESET} == 'T cell proliferation-0'").sort_values("similarity", ascending=False)
tcp[[MODEL, METHOD] + eval_summary_cols]

In [None]:
viz('T cell proliferation-0')

![img](output/T_cell_proliferation-0-True-v1.png)

In [None]:
terms_summary(tcp)

## Endocytosis

In [None]:
endocytosis = df.query(f"{GENESET} == 'endocytosis-0'").sort_values("similarity", ascending=False)
terms_summary(endocytosis)

In [None]:
print(retrieve_payload("endocytosis-0", "gpt-3.5-turbo.narrative_synopsis.v1").prompt)

In [None]:
print(retrieve_payload("endocytosis-0", "gpt-3.5-turbo.narrative_synopsis.v1").response_text)

In [None]:
print(retrieve_payload("endocytosis-0", "gpt-3.5-turbo.ontological_synopsis.v1").prompt)

## Hydrolysis

In [None]:
hydrolysis = df.query(f"{GENESET} == 'hydrolase activity, hydrolyzing O-glycosyl compounds-0'").sort_values("similarity", ascending=False)
terms_summary(hydrolysis)

In [None]:
viz('hydrolase activity, hydrolyzing O-glycosyl compounds-0')

![img](output/hydrolase_activity,_hydrolyzing_O-glycosyl_compounds-0-True-v1.png)

## Variability

In [None]:
pv_pivot = df.pivot_table(index=[MODEL, METHOD, GENESET], columns=PROMPT_VARIANT, values=PROPORTION_SIGNIFICANT)
# Calculate differences between run "1" and run "2"
pv_pivot['diff'] = pv_pivot["v1"] - pv_pivot["v2"]
pv_pivot

In [None]:

# Now you can perform statistics on the 'diff' column
df_diff_stats = pv_pivot['diff'].agg(['mean', 'std', 'var', 'min', 'max'])
df_diff_stats['range'] = df_diff_stats['max'] - df_diff_stats['min']
df_diff_stats

In [None]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

# pivot your dataframe as before
df_pivot = df.pivot_table(index=[MODEL, METHOD, GENESET], columns=PROMPT_VARIANT, values=TERM_IDS, aggfunc=list)
df_pivot

In [None]:

# calculate Jaccard index for each group
df_pivot['jaccard_index'] = df_pivot.apply(lambda row: jaccard_similarity(row["v1"][0], row["v2"][0]), axis=1)

# reset index to get it back in the form of a DataFrame
result = df_pivot.reset_index()
result

In [None]:
result['jaccard_index'].describe()

In [None]:
result.groupby([MODEL])['jaccard_index'].describe()

In [None]:
result.groupby([MODEL, METHOD])['jaccard_index'].describe()[['count', 'mean', 'std', 'min', 'max']].style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

In [None]:
summaries = [s for s in list(df[SUMMARY]) if s]
len(summaries)
import random
def random_summary():
    return summaries[int(random.random() * len(summaries))]

random_summary()

In [None]:
from ontogpt.clients import OpenAIClient

simclient = OpenAIClient(model="text-embedding-ada-002")

def text_similarity(text1, text2):
    return simclient.similarity(text1, text2)

text_similarity("nucleus of cell", "nuclear membrane")

In [None]:
text_similarity(random_summary(), random_summary())

In [None]:
rs = random_summary()
text_similarity(rs, rs)

In [None]:
df_pivot = df.pivot_table(index=[MODEL, METHOD, GENESET], columns=PROMPT_VARIANT, values=SUMMARY, aggfunc=list)
df_pivot

df_pivot['sim'] = df_pivot.apply(lambda row: text_similarity(row["v1"][0], row["v2"][0]), axis=1)
df_pivot['length_diff'] = df_pivot.apply(lambda row: abs(len(row["v1"][0])-len(row["v2"][0])), axis=1)


# reset index to get it back in the form of a DataFrame
result = df_pivot.reset_index()
result

In [None]:
rnd = result.query("model=='text-davinci-003' and method=='no_synopsis'")
rnd[MODEL]=""
rnd[METHOD]="RANDOM"
rnd["sim"] = rnd.apply(lambda row: text_similarity(random_summary(), random_summary()), axis=1)
rnd

In [None]:
sim_summary = pd.concat([result, rnd]).groupby([MODEL, METHOD])['sim'].describe()[['count', 'mean', 'std', 'min', 'max']]
sim_summary.style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

In [None]:
stats_df = result.groupby(METHOD).agg({'sim': [np.mean, np.std]}).reset_index()
stats_df.columns = ['category', 'mean', 'stddev']

# Set the plot style
sns.set(style="whitegrid")

# Create the bar plot with error bars
plt.figure(figsize=(10, 6))
bar_plot = sns.barplot(x='category', y='mean', data=stats_df, yerr=stats_df['stddev'], capsize=.1)

# Add labels and title
plt.xlabel("Category")
plt.ylabel("Mean Score")
plt.title("Mean Score with Standard Deviation for Each Category")

# Show the plot
plt.show()


In [None]:
diff_summary = result.groupby([MODEL, METHOD])['length_diff'].describe()[['count', 'mean', 'std', 'min', 'max']]
diff_summary.style.highlight_max(axis=0, props='font-weight:bold').format(precision=3)

## Unparsed

In [None]:
def unparsed(df, model):
    novel_term_map = defaultdict(dict)
    for _, row in df.iterrows():
        gs = row[GENESET]
        #if not gs.endswith("-0"):
        #    continue
        gs = gs.replace("-0", "")
        for lbl in row[NOVEL_LABELS]:
            if row[MODEL] != model:
                continue
            m = row[METHOD]
            novel_term_map[lbl]["NAME"] = lbl
            if gs not in novel_term_map[lbl]:
                novel_term_map[lbl][m] = []
            novel_term_map[lbl][m].append(gs)
    novel_df = pd.DataFrame(novel_term_map.values())
    return novel_df

## Potential Hallucinations

Summarize all GO terms that are in all summaries that are not in the closure of annotated terms for any of the genes in the gene set

In [None]:
def hallucinatons(df, model):
    novel_term_map = defaultdict(dict)
    for _, row in df.iterrows():
        gs = row[GENESET]
        #if not gs.endswith("-0"):
        #    continue
        gs = gs.replace("-0", "")
        for lbl in row[NOVEL_LABELS]:
            if row[MODEL] != model:
                continue
            m = row[METHOD]
            novel_term_map[lbl]["NAME"] = lbl
            if gs not in novel_term_map[lbl]:
                novel_term_map[lbl][m] = []
            novel_term_map[lbl][m].append(gs)
    novel_df = pd.DataFrame(novel_term_map.values())
    return novel_df

In [None]:
novel_df_turbo = hallucinatons(df, TURBO).reset_index(drop=True)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
novel_df_turbo

In [None]:
hallucinatons(df, DAVINCI)

In [None]:
hallucinatons(df, GPT4)

In [None]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
enc.encode("negative regulation")

def tok_similarity(t1, t2):
    if t1 is None and t2 is None:
        return 1
    if t1 is None or t2 is None:
        return 0

    toks1 = set(enc.encode(t1))
    toks2 = set(enc.encode(t2))
    return len(toks1.intersection(toks2)) / len(toks1.union(toks2))


In [None]:
closure_by_gene_set = {}
for comp in comps:
    closure = comp.payloads["closure"].term_strings
    closure_by_gene_set[comp.name] = closure

def hallucinatons_with_best_match(df, model):
    novel_term_map = defaultdict(dict)
    for _, row in df.iterrows():
        gs = row[GENESET]
        # print(gs)
        closure = closure_by_gene_set[gs]
        #if not gs.endswith("-0"):
        #    continue
        gs = gs.replace("-0", "")
        for lbl in row[NOVEL_LABELS]:
            if row[MODEL] != model:
                continue
            m = row[METHOD]
            novel_term_map[lbl]["NAME"] = lbl
            if gs not in novel_term_map[lbl]:
                novel_term_map[lbl][m] = []
            # sims = [(ct, text_similarity(ct, lbl)) for ct in closure]
            sims = [(ct, tok_similarity(ct, lbl)) for ct in closure]
            sims = sorted(sims, key=lambda x: -x[1])
            best = sims[0][0]
            novel_term_map[lbl][m].append((gs, best))
    novel_df = pd.DataFrame(novel_term_map.values())
    return novel_df

novel_df_turbo = hallucinatons_with_best_match(df, TURBO).reset_index(drop=True)
novel_df_turbo

## New Annotations

In [None]:
endocytosis = df.query(f"{GENESET} == 'endocytosis-0'").sort_values("similarity", ascending=False)
terms_summary(endocytosis)

In [None]:
viz('endocytosis-0')

![img](output/endocytosis-0-True-v1.png)

### New Annotations

in 2022-03-24, GO `molecular sequesting` only had 6 genes annotated; this increased to 30 in 2023.
If these are past the LLM training date then we would not expect these to influence results. Additionally,


In [None]:
sequestering = df.query(f"{GENESET} == 'molecular sequestering-0'").sort_values("similarity", ascending=False)
sequestering[eval_summary_cols]                                                           

In [None]:
terms_summary(sequestering)

In [None]:
viz('molecular sequestering-0')

![img](output/molecular_sequestering-0-True-v1.png)

## IGRB

This gene set contains genes previously annotated to IGRB, many of which have been since removed

In [None]:
igrb = df.query(f"{GENESET} == 'ig-receptor-binding-2022-0'").sort_values("similarity", ascending=False)
terms_summary(igrb)

In [None]:
viz('ig-receptor-binding-2022-0')

![img](output/ig-receptor-binding-2022-0-True-v1.png)

In [None]:
viz('HALLMARK_GLYCOLYSIS-0')

![img](output/HALLMARK_GLYCOLYSIS-0-True-v1.png)

In [None]:
viz('HALLMARK_KRAS_SIGNALING_UP-0')

![img](output/HALLMARK_KRAS_SIGNALING_UP-0-True-v1.png)

## Summaries

In [None]:
objs = []
for c in comps:
    for m, payload in c.payloads.items():
        if payload.summary:
            objs.append({"model": c.model, "geneset": c.name, "method": m, "summary": payload.summary})

In [None]:
sdf = pd.DataFrame(objs)

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
sdf

In [None]:
sdf.query("geneset == 'sensory ataxia-0'")