# Preprocess

Compares the results of SPINDOCTOR gene set summarization vs statistical ontological enrichment.

Draft: https://docs.google.com/document/d/1H103ux6Dd1_bPM0un4RwutBLcYJx-0ybil2AwlAvG_Q/edit#

## Initial setup

Here we take care of imports, defining the data dictionary for the pandas dataframes

In [1]:
# note the gpt4 dir includes combined results from davinci, 3.5 and 4
results_dir = "results/human/gpt4"

In [2]:
import yaml
from yaml import Loader
from collections import defaultdict
import pandas as pd
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF
from ontogpt.evaluation.enrichment.eval_enrichment import EvalEnrichment
go = get_adapter("sqlite:obo:go")
hgnc = get_adapter("sqlite:obo:hgnc")

Downloading hgnc.db.gz: 0.00B [00:00, ?B/s]

In [81]:
closure_map = defaultdict(set)
for s, _, o in go.relationships(predicates=[IS_A, PART_OF], include_entailed=True):
    closure_map[s].add(o)

In [82]:
print(len(closure_map))

84220


In [3]:
# ruamel is faster than pyyaml
from ruamel.yaml import YAML
ryaml = YAML()

In [7]:
from ontogpt.evaluation.enrichment.eval_enrichment import GeneSetComparison

In [12]:
# assumes comparisons have been run and concatenated (see Makefile) 
import glob
def load_gene_set_results():
    results = []
    for fn in glob.glob(f"{results_dir}/*.yaml"):
        print(fn)
        with open(fn) as f:
            #obj = yaml.load(f, Loader)
            #obj = yaml.safe_load(f)
            obj = ryaml.load(f)
            results.extend(obj)
    return results

In [13]:
comps = load_gene_set_results()

results/human/gpt4/canonical-glycolysis-gocam-results-2.yaml
results/human/gpt4/bicluster_RNAseqDB_1001-results-2.yaml
results/human/gpt4/HALLMARK_HYPOXIA-results-2.yaml
results/human/gpt4/HALLMARK_DNA_REPAIR-results-2.yaml
results/human/gpt4/HALLMARK_G2M_CHECKPOINT-results-2.yaml
results/human/gpt4/EDS-results-2.yaml
results/human/gpt4/HALLMARK_IL2_STAT5_SIGNALING-results-2.yaml
results/human/gpt4/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION-results-2.yaml
results/human/gpt4/HALLMARK_PI3K_AKT_MTOR_SIGNALING-results-2.yaml
results/human/gpt4/HALLMARK_COAGULATION-results-2.yaml
results/human/gpt4/peroxisome-results-2.yaml
results/human/gpt4/HALLMARK_APICAL_JUNCTION-results-2.yaml
results/human/gpt4/HALLMARK_ANGIOGENESIS-results-2.yaml
results/human/gpt4/go-postsynapse-calcium-results-2.yaml
results/human/gpt4/HALLMARK_BILE_ACID_METABOLISM-results-2.yaml
results/human/gpt4/HALLMARK_CHOLESTEROL_HOMEOSTASIS-results-2.yaml
results/human/gpt4/bicluster_RNAseqDB_1002-results-2.yaml
results/huma

In [15]:
len(comps)

144

In [16]:
results_by_gene_sets = comps

In [88]:
def eval_gene_set_result(gs):
    rows = []
    std = gs["payloads"]["standard"]
    if "enrichment_results" not in std:
        print(f"NO GOLD STANDARD: {gs['name']}")
    expected_results = std.get("enrichment_results", [])
    for run, method_result in gs["payloads"].items():
        
        predicted_term_ids = method_result.get("term_ids", [])
        #closure_map = defaultdict(list)
        predicted_term_closure_ids = set()
        for t in predicted_term_ids:
            predicted_term_closure_ids.update(closure_map.get(t, set()))
        #for s, _, o in go.relationships(subjects=predicted_term_ids, predicates=[IS_A, PART_OF], include_entailed=True):
            #closure_map[s].append(o)
        #    predicted_term_closure_ids.add(o)
        for cutoff in [0, 0.005, 0.05, 99]:
            row = {}
            row["name"] = f"{gs['name']}-{cutoff}"
            row["cutoff"] = cutoff
            row["gene_set"] = gs["name"]
            method = method_result.get("method", "")
            approach = "gpt"
            if method == "no_synopsis":
                src = "NONE"
            elif method == "ontological_synopsis":
                src = "GO"
            elif method == "narrative_synopsis":
                src = "RefSeq"
            else:
                approach = method
                src = ""
            row["source"] = src
            model = method_result.get("model", "")
            if model == "gpt-4":
                model = "4"
            elif model == "gpt-3.5-turbo":
                model = "3.5"
            elif model == "text-davinci-003":
                model = "3"
            row["model"] = model
            row["method"] = approach
            if model:
                row["method_desc"] = f"{method}-{model}"
            else:
                row["method_desc"] = method_result.get("truncation_factor", "")
            row["run"] = run
            for k in ["truncation_factor", "prompt_variant", "response_token_length"]:
                row[k] = method_result.get(k, "")
            row["prompt_length"] = len(method_result.get("prompt", ""))
            true_positive_terms = []
            false_negative_terms = []
            more_specific_false_negative_terms = []   # predicted a descendant
            more_general_false_negative_terms = []   # predicted a descendant
            unparsed_terms = []
            standard_enrichment_results = [(r["p_value_adjusted"], r["class_id"]) for r in expected_results]
            if cutoff == 99:
                for x in gs["payloads"]["closure"]["term_ids"]:
                    standard_enrichment_results.append((cutoff, x))
            for p_val, true_term_id in standard_enrichment_results:
                if cutoff > 0 and p_val > cutoff:
                    break
                if true_term_id in predicted_term_ids:
                    true_positive_terms.append(true_term_id)
                elif true_term_id in predicted_term_closure_ids:
                    # predicted a more specific term
                    more_specific_false_negative_terms.append(true_term_id)
                elif closure_map.get(true_term_id, set()).intersection(predicted_term_ids):
                    # predicted a more general term
                    more_general_false_negative_terms.append(true_term_id)
                else:
                    false_negative_terms.append(true_term_id)
                if cutoff == 0:
                    break
            false_positive_terms = []
            if cutoff > 0:
                for t in predicted_term_ids:
                    if t not in true_positive_terms:
                        if t.startswith("GO:"):
                            false_positive_terms.append(t)
                        elif ":" in t:
                            # MONDO, UBERON, etc
                            pass
                        else:
                            unparsed_terms.append(t)
            else:
                if not true_positive_terms:
                    if predicted_term_ids:
                        false_positive_terms.append(predicted_term_ids[0])
            row["true_positives"] = len(true_positive_terms)
            row["false_positives"] = len(false_positive_terms)
            row["false_negatives"] = len(false_negative_terms)
            row["more_general_false_negatives"] = len(more_general_false_negative_terms)
            row["more_specific_false_negatives"] = len(more_specific_false_negative_terms)
            row["all_predictions_closure"] = len(predicted_term_closure_ids)
            row["unparsed"] = len(unparsed_terms)
            row["true_positive_terms"] = "|".join(true_positive_terms)
            row["false_positive_terms"] = "|".join(false_positive_terms)
            row["unparsed_terms"] = "|".join(unparsed_terms)
            row["gene_symbols"] = "|".join(gs.get("gene_symbols", []))
            row["gene_ids"] = "|".join(gs.get("gene_ids", []))
            rows.append(row)
    return rows

rows = eval_gene_set_result(results_by_gene_sets[4])
df = pd.DataFrame(rows)
df.to_csv("results/TEST-processed.tsv", sep="\t", index=False)
df

Unnamed: 0,name,cutoff,gene_set,source,model,method,method_desc,run,truncation_factor,prompt_variant,...,false_negatives,more_general_false_negatives,more_specific_false_negatives,all_predictions_closure,unparsed,true_positive_terms,false_positive_terms,unparsed_terms,gene_symbols,gene_ids
0,HALLMARK_HYPOXIA-0-0,0.0,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,0,0,0,53,0,GO:0005975,,,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
1,HALLMARK_HYPOXIA-0-0.005,0.005,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,171,21,28,53,4,GO:0005975|GO:0006096|GO:0001525,GO:1904659|GO:0051726|GO:0030198,oxidative stress response|apoptosis regulation...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
2,HALLMARK_HYPOXIA-0-0.05,0.05,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,261,25,31,53,4,GO:0005975|GO:0006096|GO:0001525|GO:0030198,GO:1904659|GO:0051726,oxidative stress response|apoptosis regulation...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
3,HALLMARK_HYPOXIA-0-99,99.0,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,8296,244,88,53,4,GO:0005975|GO:0006096|GO:0001525|GO:0030198|GO...,,oxidative stress response|apoptosis regulation...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
4,HALLMARK_HYPOXIA-0-0,0.0,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v2,1.0,v2,...,0,0,1,42,0,,GO:0006096,,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
5,HALLMARK_HYPOXIA-0-0.005,0.005,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v2,1.0,v2,...,176,14,30,42,4,GO:0006096|GO:0044237|GO:0001525,GO:0007165,cellular stress response|cell growth and proli...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
6,HALLMARK_HYPOXIA-0-0.05,0.05,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v2,1.0,v2,...,263,22,32,42,4,GO:0006096|GO:0044237|GO:0001525|GO:0007165,,cellular stress response|cell growth and proli...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
7,HALLMARK_HYPOXIA-0-99,99.0,HALLMARK_HYPOXIA-0,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v2,1.0,v2,...,7636,917,80,42,4,GO:0006096|GO:0044237|GO:0001525|GO:0007165|GO...,,cellular stress response|cell growth and proli...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
8,HALLMARK_HYPOXIA-0-0,0.0,HALLMARK_HYPOXIA-0,GO,4.0,gpt,ontological_synopsis-4,gpt-4.ontological_synopsis.v1,0.4096,v1,...,0,1,0,7,0,,GO:0008152,,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...
9,HALLMARK_HYPOXIA-0-0.005,0.005,HALLMARK_HYPOXIA-0,GO,4.0,gpt,ontological_synopsis-4,gpt-4.ontological_synopsis.v1,0.4096,v1,...,169,51,2,7,4,GO:0008152,GO:0038023,cell proliferation regulation|protein binding ...,ACKR3|ADM|ADORA2B|AK4|AKAP12|ALDOA|ALDOB|ALDOC...,HGNC:23692|HGNC:259|HGNC:264|HGNC:363|HGNC:370...


In [89]:
rows = []
for gs in results_by_gene_sets:
    this_rows = eval_gene_set_result(gs)
    rows.extend(this_rows)
    print(len(rows))
    df = pd.DataFrame(rows)
    df.to_csv("results/TEMP.tsv", sep="\t", index=False)
df = pd.DataFrame(rows)
pd.set_option('display.max_rows', 10)
df.to_csv("results/processed.tsv", sep="\t", index=False)
df

92
184
NO GOLD STANDARD: bicluster_RNAseqDB_1001-0
276
NO GOLD STANDARD: bicluster_RNAseqDB_1001-1
368
460
552
644
736
828
920
1012
1104
1196
1288
1380
1472
1564
1656
1748
1840
1932
2024
2116
2208
2300
2392
2484
2576
2668
2760
2852
2944
3036
3128
3220
3312
3404
3496
3588
3680
3772
3864
3956
4048
4140
4232
4324
4416
4508
4600
4692
4784
4876
4968
5060
5152
5244
5336
5428
5520
5612
5704
5796
5888
5980
6072
6164
6256
6348
6440
6532
6624
6716
6808
6900
6992
7084
7176
7268
7360
7452
7544
7636
7728
7820
7912
8004
8096
8188
8280
8372
8464
8556
8648
8740
8832
8924
9016
9108
9200
9292
9384
9476
9568
9660
9752
9844
9936
10028
10120
10212
10304
10396
10488
10580
10672
10764
10856
10948
11040
11132
11224
11316
11408
11500
11592
11684
11776
11868
11960
12052
12144
12236
12328
12420
12512
12604
12696
12788
12880
12972
13064
13156
13248


Unnamed: 0,name,cutoff,gene_set,source,model,method,method_desc,run,truncation_factor,prompt_variant,...,false_negatives,more_general_false_negatives,more_specific_false_negatives,all_predictions_closure,unparsed,true_positive_terms,false_positive_terms,unparsed_terms,gene_symbols,gene_ids
0,glycolysis-gocam-0-0,0.000,glycolysis-gocam-0,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,0,0,0,23,0,GO:0006096,,,HK1|GPI|PFKM|ALDOA|TPI1|GAPDH|PGK1|PGAM2|ENO3|PKM,HGNC:4922|HGNC:4458|HGNC:8877|HGNC:414|HGNC:12...
1,glycolysis-gocam-0-0.005,0.005,glycolysis-gocam-0,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,19,6,12,23,2,GO:0006096|GO:0006006,,energy production|atp generation,HK1|GPI|PFKM|ALDOA|TPI1|GAPDH|PGK1|PGAM2|ENO3|PKM,HGNC:4922|HGNC:4458|HGNC:8877|HGNC:414|HGNC:12...
2,glycolysis-gocam-0-0.05,0.050,glycolysis-gocam-0,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,30,6,13,23,2,GO:0006096|GO:0006006,,energy production|atp generation,HK1|GPI|PFKM|ALDOA|TPI1|GAPDH|PGK1|PGAM2|ENO3|PKM,HGNC:4922|HGNC:4458|HGNC:8877|HGNC:414|HGNC:12...
3,glycolysis-gocam-0-99,99.000,glycolysis-gocam-0,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,1.0,v1,...,908,22,35,23,2,GO:0006096|GO:0006006|GO:0006006|GO:0006096|GO...,,energy production|atp generation,HK1|GPI|PFKM|ALDOA|TPI1|GAPDH|PGK1|PGAM2|ENO3|PKM,HGNC:4922|HGNC:4458|HGNC:8877|HGNC:414|HGNC:12...
4,glycolysis-gocam-0-0,0.000,glycolysis-gocam-0,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v2,1.0,v2,...,0,0,0,23,0,GO:0006096,,,HK1|GPI|PFKM|ALDOA|TPI1|GAPDH|PGK1|PGAM2|ENO3|PKM,HGNC:4922|HGNC:4458|HGNC:8877|HGNC:414|HGNC:12...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,HALLMARK_MTORC1_SIGNALING-1-99,99.000,HALLMARK_MTORC1_SIGNALING-1,,,rank_based,,rank_based,,,...,101,6234,484,542,0,GO:0005737|GO:0070062|GO:0005829|GO:0043231|GO...,GO:0070374|GO:0005096|GO:0005085|GO:0005912|GO...,,TXNRD1|MLLT11|SLC2A1|AK4|ACTR3|TUBG1|HMGCS1|AT...,HGNC:12437|HGNC:16997|HGNC:11005|HGNC:363|HGNC...
13244,HALLMARK_MTORC1_SIGNALING-1-0,0.000,HALLMARK_MTORC1_SIGNALING-1,,,,,closure,,,...,0,0,0,4852,0,GO:0044283,,,TXNRD1|MLLT11|SLC2A1|AK4|ACTR3|TUBG1|HMGCS1|AT...,HGNC:12437|HGNC:16997|HGNC:11005|HGNC:363|HGNC...
13245,HALLMARK_MTORC1_SIGNALING-1-0.005,0.005,HALLMARK_MTORC1_SIGNALING-1,,,,,closure,,,...,0,0,0,4852,0,GO:0044283|GO:0044281|GO:0005737|GO:0019752|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,TXNRD1|MLLT11|SLC2A1|AK4|ACTR3|TUBG1|HMGCS1|AT...,HGNC:12437|HGNC:16997|HGNC:11005|HGNC:363|HGNC...
13246,HALLMARK_MTORC1_SIGNALING-1-0.05,0.050,HALLMARK_MTORC1_SIGNALING-1,,,,,closure,,,...,0,0,0,4852,0,GO:0044283|GO:0044281|GO:0005737|GO:0019752|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,TXNRD1|MLLT11|SLC2A1|AK4|ACTR3|TUBG1|HMGCS1|AT...,HGNC:12437|HGNC:16997|HGNC:11005|HGNC:363|HGNC...


In [47]:
df["method"].unique()

array(['no_synopsis-gpt-4', 'ontological_synopsis-gpt-4',
       'narrative_synopsis-gpt-4', 'no_synopsis-gpt-3.5-turbo',
       'ontological_synopsis-gpt-3.5-turbo',
       'narrative_synopsis-gpt-3.5-turbo', 'no_synopsis-text-davinci-003',
       'ontological_synopsis-text-davinci-003',
       'narrative_synopsis-text-davinci-003', 'standard',
       'standard_no_ontology', 'random', 'rank_based', ''], dtype=object)