# Enrichment Analysis Notebook

Compares the results of SPINDOCTOR gene set summarization vs statistical ontological enrichment.

Draft: https://docs.google.com/document/d/1H103ux6Dd1_bPM0un4RwutBLcYJx-0ybil2AwlAvG_Q/edit#

## Initial setup

Here we take care of imports, defining the data dictionary for the pandas dataframes

In [None]:
import yaml
from yaml import Loader
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import kstest, ttest_ind, wilcoxon, mannwhitneyu
import math
from statsmodels.stats.multitest import multipletests

#import itertools as it
#import collections as ct
#import more_itertools as mit

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import colorsys
import seaborn as sns

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF
from ontogpt.evaluation.enrichment.eval_enrichment import EvalEnrichment
go = get_adapter("sqlite:obo:go")
hgnc = get_adapter("sqlite:obo:hgnc")

In [None]:
TURBO = "gpt-3.5-turbo"
DAVINCI = "text-davinci-003"
GPT4 = "gpt-4"
MODELS = [TURBO, DAVINCI, GPT4]

In [99]:
df_bad = pd.read_csv('results/processed.tsv', sep='\t', header=0, low_memory=False)
df_bad

Unnamed: 0,go_term_ids,name,cutoff,closure,top_n,source,model,method,method_desc,run,...,true_positive_terms,false_positive_terms,false_negative_terms_example20,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific,f1_score
0,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,1,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,,,,10,1.000000,1.000000,1.00,1.000000,1.000000
1,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,5,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.200000,1.00,0.200000,0.285714
2,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,10,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.100000,0.20,0.200000,0.166667
3,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,25,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.080000,0.25,0.153846,0.148148
4,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,5000,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.051282,0.25,0.142857,0.097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99355,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,1,,,,,closure,...,GO:0044283,,,,180,1.000000,1.000000,1.00,1.000000,1.000000
99356,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,5,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.000794,1.000000,1.00,1.000000,0.001587
99357,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,10,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.001720,1.000000,1.00,1.000000,0.003434
99358,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,25,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,,180,0.004761,1.000000,1.00,1.000000,0.009477


In [100]:
import csv

with open('results/processed.tsv', mode='r', encoding='utf-8') as file:
    tsv_reader = csv.DictReader(file, delimiter='\t')

    data = [row for row in tsv_reader]

df2 = pd.DataFrame(data)

df2

Unnamed: 0,go_term_ids,name,cutoff,closure,top_n,source,model,method,method_desc,run,...,true_positive_terms,false_positive_terms,false_negative_terms_example20,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific,f1_score
0,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,1,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,,,,10,1.0,1.0,1.0,1.0,1.0
1,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,5,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.5,0.2,1.0,0.2,0.2857142857142857
2,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,10,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.5,0.1,0.2,0.2,0.16666666666666666
3,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,25,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.0,0.08,0.25,0.15384615384615385,0.14814814814814814
4,"{'BFO:0000003', 'GO:0016052', 'GO:0006090', 'G...",glycolysis-gocam-0-0.005,0.005,False,5000,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.0,0.05128205128205128,0.25,0.14285714285714285,0.0975609756097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99355,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,1,,,,,closure,...,GO:0044283,,,,180,1.0,1.0,1.0,1.0,1.0
99356,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,5,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.0007942811755361397,1.0,1.0,1.0,0.0015873015873015873
99357,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,10,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.0017199862401100791,1.0,1.0,1.0,0.003434065934065934
99358,"{'GO:0006997', 'GO:0023051', 'GO:0048762', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,25,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,,180,0.004760997905160921,1.0,1.0,1.0,0.009476876421531463


In [101]:
import csv
import pandas as pd

def compare_tsv(file_path):
    df = pd.read_csv(file_path, sep='\t')

    discrepancies = []
    line_number = 0

    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        tsv_reader = csv.DictReader(file, delimiter='\t')

        for dict_row in tsv_reader:
            line_number += 1

            if line_number <= len(df):
                pandas_row = df.iloc[line_number - 1].to_dict()
                
                for key in dict_row:
                    if str(dict_row[key]) != str(pandas_row[key]):
                        
                        if dict_row[key] != "4" and dict_row[key] != "67":
                        #    continue
        
                            #print(type(dict_row[key]))
                            discrepancies.append(f"Discrepancy found in line {line_number}: CSV - {dict_row[key]}, Pandas - {pandas_row[key]}")
                            break
            else:
                discrepancies.append(f"Extra line in CSV at line {line_number}")

    if line_number < len(df):
        discrepancies.append(f"Pandas has extra rows from line {line_number + 1} onwards")

    return discrepancies

file_path = 'results/processed.tsv'
discrepancies = compare_tsv(file_path)
countd = 0
for discrepancy in discrepancies:
    if countd < 100:
        print(discrepancy)
        countd = countd + 1
    else:
        break

Discrepancy found in line 1: CSV - , Pandas - nan
Discrepancy found in line 3: CSV - 0.16666666666666666, Pandas - 0.1666666666666666
Discrepancy found in line 4: CSV - , Pandas - nan
Discrepancy found in line 5: CSV - , Pandas - nan
Discrepancy found in line 6: CSV - , Pandas - nan
Discrepancy found in line 7: CSV - , Pandas - nan
Discrepancy found in line 8: CSV - , Pandas - nan
Discrepancy found in line 9: CSV - , Pandas - nan
Discrepancy found in line 10: CSV - , Pandas - nan
Discrepancy found in line 11: CSV - , Pandas - nan
Discrepancy found in line 13: CSV - 0.16666666666666666, Pandas - 0.1666666666666666
Discrepancy found in line 14: CSV - , Pandas - nan
Discrepancy found in line 15: CSV - , Pandas - nan
Discrepancy found in line 16: CSV - , Pandas - nan
Discrepancy found in line 17: CSV - , Pandas - nan
Discrepancy found in line 18: CSV - , Pandas - nan
Discrepancy found in line 19: CSV - , Pandas - nan
Discrepancy found in line 20: CSV - , Pandas - nan
Discrepancy found in l

In [None]:
import csv

file_path = 'results/processed.tsv'


standard_characters = set('\t\n\rabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,;:"\'!?-()[]{}')


def detect_problems(file_path):
    problems = []
    expected_delimiter_count = None

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            # Count the number of tab characters
            delimiter_count = line.count('\t')

            if expected_delimiter_count is None:
                expected_delimiter_count = delimiter_count
            elif delimiter_count != expected_delimiter_count:
                problems.append(f"Inconsistent number of tabs in line {line_number}")

            # Check for non-ASCII characters
            if any(ord(char) > 127 for char in line):
                problems.append(f"Non-ASCII character found in line {line_number}")
            
            #for char in line:    
            #    if char not in standard_characters:
            #        problems.append(f"Non-standard character '{char}' found in line {line_number}")
            #        break

    return problems

#Use
file_problems = detect_problems(file_path)
for problem in file_problems:
    print(problem)

Non-ASCII character found in line 5343
Non-ASCII character found in line 5344
Non-ASCII character found in line 5345
Non-ASCII character found in line 5346
Non-ASCII character found in line 5348
Non-ASCII character found in line 5349
Non-ASCII character found in line 5350
Non-ASCII character found in line 5351
Non-ASCII character found in line 5353
Non-ASCII character found in line 5354
Non-ASCII character found in line 5355
Non-ASCII character found in line 5356
Non-ASCII character found in line 5358
Non-ASCII character found in line 5359
Non-ASCII character found in line 5360
Non-ASCII character found in line 5361
Non-ASCII character found in line 5363
Non-ASCII character found in line 5364
Non-ASCII character found in line 5365
Non-ASCII character found in line 5366
Non-ASCII character found in line 5368
Non-ASCII character found in line 5369
Non-ASCII character found in line 5370
Non-ASCII character found in line 5371
Non-ASCII character found in line 17043
Non-ASCII character foun

In [None]:
df_bad.columns[23]

'false_negative_terms_example20'

In [None]:
df = pd.read_csv('results/processed_1234567891011121314151617181920212223.tsv', sep='\t', header=0, low_memory=False)
df

ParserError: Error tokenizing data. C error: Expected 23 fields in line 302, saw 31


In [None]:
df = pd.read_csv('results/processed.tsv', sep='\t', header=0, low_memory=False)
df

In [None]:
print(df.iloc[45561,0])

In [None]:
print(df.tail(2))

In [None]:
def replace_non_ascii(input_file_path, output_file_path):
    with open(input_file_path, 'r', encoding='utf-8') as file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in file:
            modified_line = ''.join(char if ord(char) < 128 else '*' for char in line)
            output_file.write(modified_line)

# Specify your file paths
input_file_path = 'results/processed.tsv'
output_file_path = 'results/processed_clean.tsv'

# Replace non-ASCII characters and write to a new file
replace_non_ascii(input_file_path, output_file_path)

# Load the modified file into a DataFrame
df = pd.read_csv(output_file_path, sep='\t')

# Display the DataFrame (or perform further operations)
print(df)

In [None]:
def process_line(line, max_field_length, list_delimiter):
    # Replace non-ASCII characters with '*'
    cleaned_line = ''.join(char if ord(char) < 128 else '*' for char in line)

    # Truncate long fields
    fields = cleaned_line.split('\t')
    truncated_fields = []
    for field in fields:
        if len(field) > max_field_length:
            # Find the nearest list delimiter before the max length
            truncation_index = field.rfind(list_delimiter, 0, max_field_length)
            if truncation_index == -1:
                # If no list delimiter found, truncate to max length
                truncation_index = max_field_length
            truncated_field = field[:truncation_index] + '}'
            truncated_fields.append(truncated_field)
        else:
            truncated_fields.append(field)
    
    return '\t'.join(truncated_fields)

def parse_and_load_tsv(input_file_path, output_file_path, max_field_length=1000, list_delimiter=','):
    with open(input_file_path, 'r', encoding='utf-8', newline='') as file, \
         open(output_file_path, 'w', encoding='utf-8', newline='') as output_file:

        line_number = 0
        for line in file:
            line_number += 1

            # Handle empty lines
            if not line.strip():
                output_file.write('\n')
                continue

            try:
                processed_line = process_line(line, max_field_length, list_delimiter)
                output_file.write(processed_line + '\n')
            except Exception as e:
                print(f"Error processing line {line_number}: {e}")

            
# Specify your file paths
input_file_path = 'results/processed.tsv'
output_file_path = 'results/processed_cleaned.tsv'

# Process the input file and write the modified data to a new file
parse_and_load_tsv(input_file_path, output_file_path)

# Load the modified file into a DataFrame
df = pd.read_csv(output_file_path, sep='\t', lineterminator='\n')

df

In [None]:
df.shape