In [1]:
import yaml
from yaml import Loader

In [2]:
from ontogpt.evaluation.enrichment.eval_enrichment import GeneSetComparison, Overlap

In [3]:
def load_comparisons():
    with open(f"../analysis/enrichment-summary.yaml") as f:
        obj = yaml.load(f, Loader)
        comps = [GeneSetComparison(**x) for x in obj]
        return comps

In [4]:
comps = load_comparisons()

In [5]:
comps[0].name

'EDS-0'

In [6]:
objs = []
for c in comps:
    for k, v in c.overlaps.items():
        obj = {"name": c.name, "model": c.model, "size": len(c.gene_symbols), "m1": k[0], "m2": k[1], **v.dict()}
        objs.append(obj)

In [7]:
objs[2]

{'name': 'EDS-0',
 'model': 'gpt-3.5-turbo',
 'size': 19,
 'm1': 'narrative_synopsis',
 'm2': 'standard',
 'jaccard': 0.0425531914893617,
 'common': ['GO:0006024', 'GO:0030198'],
 'overlap_score': 2,
 'left_jaccard': 0.3333333333333333,
 'right_jaccard': 0.046511627906976744,
 'summary_jaccard': None}

In [8]:
import pandas as pd

In [9]:
df = pd.DataFrame(objs).fillna(0)

In [10]:
df

Unnamed: 0,name,model,size,m1,m2,jaccard,common,overlap_score,left_jaccard,right_jaccard,summary_jaccard
0,EDS-0,gpt-3.5-turbo,19,narrative_synopsis,ontological_synopsis,0.250000,"[GO:0030198, GO:0006024]",2.0,0.333333,0.500000,0.09375
1,EDS-0,gpt-3.5-turbo,19,narrative_synopsis,no_synopsis,0.111111,[GO:0030198],1.0,0.166667,0.250000,0.08000
2,EDS-0,gpt-3.5-turbo,19,narrative_synopsis,standard,0.042553,"[GO:0006024, GO:0030198]",2.0,0.333333,0.046512,0.00000
3,EDS-0,gpt-3.5-turbo,19,narrative_synopsis,standard_no_ontology,0.000000,0,0.0,0.000000,0.000000,0.00000
4,EDS-0,gpt-3.5-turbo,19,narrative_synopsis,rank_based,0.000000,0,0.0,0.000000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
787,peroxisome-1,text-davinci-003,8,no_synopsis,random,0.000000,0,0.0,0.000000,0.000000,0.00000
788,peroxisome-1,text-davinci-003,8,standard,standard_no_ontology,0.000000,0,0.0,0.000000,0.000000,0.00000
789,peroxisome-1,text-davinci-003,8,standard,random,0.000000,0,0.0,0.000000,0.000000,0.00000
790,peroxisome-1,text-davinci-003,8,standard_no_ontology,standard,0.000000,0,0.0,0.000000,0.000000,0.00000


In [11]:
df.query('model=="text-davinci-003"')

Unnamed: 0,name,model,size,m1,m2,jaccard,common,overlap_score,left_jaccard,right_jaccard,summary_jaccard
44,EDS-0,text-davinci-003,19,narrative_synopsis,ontological_synopsis,0.000000,0,0.0,0.000,0.000000,0.200000
45,EDS-0,text-davinci-003,19,narrative_synopsis,no_synopsis,0.083333,[MESH:D003094],1.0,0.125,0.200000,0.352941
46,EDS-0,text-davinci-003,19,narrative_synopsis,standard,0.020000,[GO:0032963],1.0,0.125,0.023256,0.000000
47,EDS-0,text-davinci-003,19,narrative_synopsis,standard_no_ontology,0.000000,0,0.0,0.000,0.000000,0.000000
48,EDS-0,text-davinci-003,19,narrative_synopsis,rank_based,0.000000,0,0.0,0.000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
787,peroxisome-1,text-davinci-003,8,no_synopsis,random,0.000000,0,0.0,0.000,0.000000,0.000000
788,peroxisome-1,text-davinci-003,8,standard,standard_no_ontology,0.000000,0,0.0,0.000,0.000000,0.000000
789,peroxisome-1,text-davinci-003,8,standard,random,0.000000,0,0.0,0.000,0.000000,0.000000
790,peroxisome-1,text-davinci-003,8,standard_no_ontology,standard,0.000000,0,0.0,0.000,0.000000,0.000000


In [12]:
df.groupby(['m1', 'm2']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,jaccard,overlap_score,left_jaccard,right_jaccard,summary_jaccard
m1,m2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
narrative_synopsis,no_synopsis,84.888889,0.111275,1.25,0.187456,0.227569,0.261806
narrative_synopsis,ontological_synopsis,84.888889,0.111202,1.5,0.192269,0.213459,0.248297
narrative_synopsis,random,84.888889,0.002447,0.333333,0.02154,0.003034,0.0
narrative_synopsis,rank_based,84.888889,0.0068,0.888889,0.092476,0.007849,0.0
narrative_synopsis,standard,84.888889,0.015222,2.333333,0.272993,0.016256,0.0
narrative_synopsis,standard_no_ontology,84.888889,0.0,0.0,0.0,0.0,0.0
no_synopsis,narrative_synopsis,84.888889,0.111275,1.25,0.227569,0.187456,0.261806
no_synopsis,ontological_synopsis,84.888889,0.09739,0.805556,0.165224,0.156169,0.26763
no_synopsis,random,84.888889,0.002717,0.194444,0.023926,0.003266,0.0
no_synopsis,rank_based,84.888889,0.005765,0.75,0.089802,0.00631,0.0


In [13]:
df.groupby(['model', 'm1', 'm2']).max(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,size,jaccard,overlap_score,left_jaccard,right_jaccard,summary_jaccard
model,m1,m2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-3.5-turbo,narrative_synopsis,no_synopsis,200,0.375,3.0,1.0,0.666667,0.545455
gpt-3.5-turbo,narrative_synopsis,ontological_synopsis,200,0.5,3.0,1.0,1.0,0.541667
gpt-3.5-turbo,narrative_synopsis,random,200,0.016949,1.0,0.125,0.019231,0.0
gpt-3.5-turbo,narrative_synopsis,rank_based,200,0.03125,5.0,0.666667,0.035714,0.0
gpt-3.5-turbo,narrative_synopsis,standard,200,0.051282,8.0,0.8,0.055556,0.0
gpt-3.5-turbo,narrative_synopsis,standard_no_ontology,200,0.0,0.0,0.0,0.0,0.0
gpt-3.5-turbo,no_synopsis,narrative_synopsis,200,0.375,3.0,0.666667,1.0,0.545455
gpt-3.5-turbo,no_synopsis,ontological_synopsis,200,0.6,3.0,0.75,1.0,0.652174
gpt-3.5-turbo,no_synopsis,random,200,0.028571,1.0,0.2,0.035714,0.0
gpt-3.5-turbo,no_synopsis,rank_based,200,0.033333,3.0,0.333333,0.035714,0.0


## Summaries

In [16]:
objs = []
for c in comps:
    for m, payload in c.payloads.items():
        if payload.summary:
            objs.append({"model": c.model, "geneset": c.name, "method": m, "summary": payload.summary})

In [17]:
sdf = pd.DataFrame(objs)

In [18]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
sdf

Unnamed: 0,model,geneset,method,summary
0,gpt-3.5-turbo,EDS-0,narrative_synopsis,Summary: The
1,gpt-3.5-turbo,EDS-0,ontological_synopsis,"Summary: The genes are primarily involved in collagen biosynthesis and extracellular matrix organization, with many of them implicated in various forms of Ehlers-Danlos syndrome.\n\n"
2,gpt-3.5-turbo,EDS-0,no_synopsis,"Summary: Genes are primarily involved in extracellular matrix formation and organization, as well as post-translational modifications.\n"
3,gpt-3.5-turbo,EDS-1,narrative_synopsis,"Summary: Several of these genes are involved in collagen synthesis and modification, and mutations in many of them result in various forms of Ehlers-Danlos syndrome, a connective tissue disorder.\n"
4,gpt-3.5-turbo,EDS-1,ontological_synopsis,Summary: Genes involved in extracellular matrix organization and collagen biosynthesis are over-represented.\n\n
5,gpt-3.5-turbo,EDS-1,no_synopsis,Summary: Genes are mostly involved in extracellular matrix organization and collagen biosynthesis. \n\n
6,text-davinci-003,EDS-0,narrative_synopsis,"Summary: The genes described all encode proteins that participate in the biosynthesis, metabolism and regulation of collagen molecules, which are a type of connective tissue protein involved in various cellular activities. \n"
7,text-davinci-003,EDS-0,ontological_synopsis,"Summary: The genes involved appear to be related to the extracellular matrix, endoplasmic reticulum and Golgi apparatus, and involved in activities such as collagen fibril organization, endodermal cell differentiation, proteoglycan biosynthetic process, negative regulation of transcription by RNA polymerase II and dermatan sulfate biosynthetic process.\n"
8,text-davinci-003,EDS-0,no_synopsis,"Summary: The majority of the genes found are involved in connective tissue development and bone morphogenesis, specifically collagen, glycosaminoglycan and glycoprotein production. \n"
9,text-davinci-003,EDS-1,narrative_synopsis,Summary: This gene list includes proteins that are involved in glycosaminoglycan synthesis and connective-tissue disorder Ehlers-Danlos Syndrome.\n
