# Enrichment Analysis Notebook

Compares the results of SPINDOCTOR gene set summarization vs statistical ontological enrichment.

Draft: https://docs.google.com/document/d/1H103ux6Dd1_bPM0un4RwutBLcYJx-0ybil2AwlAvG_Q/edit#

## Initial setup

Here we take care of the imports and define the data dictionary for the pandas dataframes.

In [1]:
import yaml
from yaml import Loader
from collections import defaultdict
import pandas as pd
import numpy as np
from scipy.stats import kstest, ttest_ind, wilcoxon, mannwhitneyu
import math
from statsmodels.stats.multitest import multipletests

#import itertools as it
#import collections as ct
#import more_itertools as mit

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import colorsys
import seaborn as sns

from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A, PART_OF
from ontogpt.evaluation.enrichment.eval_enrichment import EvalEnrichment
# Ontology adapters created through oaklib's get_adapter from selector strings.
# NOTE(review): presumably these resolve to pre-built SQLite builds of GO and
# HGNC that are fetched on first use — confirm before running offline.
go = get_adapter("sqlite:obo:go")
hgnc = get_adapter("sqlite:obo:hgnc")

In [3]:
# Model-name strings, collected in MODELS.
# NOTE(review): presumably the OpenAI model identifiers behind the `model`
# values in the results table — confirm.
TURBO = "gpt-3.5-turbo"
DAVINCI = "text-davinci-003"
GPT4 = "gpt-4"
MODELS = [TURBO, DAVINCI, GPT4]

In [13]:
import csv

# Parse the TSV with the csv module (no dtype inference is applied to the
# values) and build a DataFrame from the resulting row dictionaries.
with open('results/processed.tsv', mode='r', encoding='utf-8') as file:
    tsv_reader = csv.DictReader(file, delimiter='\t')
    data = list(tsv_reader)

df2 = pd.DataFrame(data)
df2

Unnamed: 0,go_term_ids,name,cutoff,closure,top_n,source,model,method,method_desc,run,...,true_positive_terms,false_positive_terms,false_negative_terms_example20,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific,f1_score
0,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,1,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,,,,10,1.0,1.0,1.0,1.0,1.0
1,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.5,0.2,1.0,0.2,0.2857142857142857
2,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,10,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.5,0.1,0.2,0.2,0.16666666666666666
3,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,25,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.0,0.08,0.25,0.15384615384615385,0.14814814814814814
4,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5000,NONE,4,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.0,0.05128205128205128,0.25,0.14285714285714285,0.0975609756097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99355,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,1,,,,,closure,...,GO:0044283,,,,180,1.0,1.0,1.0,1.0,1.0
99356,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,5,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.0007942811755361397,1.0,1.0,1.0,0.0015873015873015873
99357,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,10,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.0017199862401100791,1.0,1.0,1.0,0.003434065934065934
99358,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.0,True,25,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,,180,0.004760997905160921,1.0,1.0,1.0,0.009476876421531463


In [30]:
import csv
import pandas as pd

def compare_tsv(file_path, ignored_values=("4", "67")):
    """Compare how ``csv.DictReader`` and ``pandas.read_csv`` parse one TSV file.

    The file is read twice, once by each parser, and the data rows are
    compared position by position. Within a row, cells are compared through
    their ``str()`` representation; at most one discrepancy (the first
    reportable one) is recorded per row.

    Args:
        file_path: Path to a tab-separated file with a header row.
        ignored_values: Raw CSV cell values whose mismatches are skipped.
            Any container supporting ``in`` works. The default keeps the
            exclusions that used to be hard-coded.
            NOTE(review): presumably these silence integer-like cells that
            pandas reads as floats ("4" vs 4.0) — confirm.

    Returns:
        A list of human-readable messages: one per mismatching row, one per
        row that only the csv reader produced, and a final one if pandas
        produced more rows than the csv reader.
    """
    df = pd.read_csv(file_path, sep='\t')
    pandas_row_count = len(df)

    discrepancies = []
    line_number = 0  # stays 0 when the csv reader yields no data rows

    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        tsv_reader = csv.DictReader(file, delimiter='\t')

        for line_number, dict_row in enumerate(tsv_reader, start=1):
            if line_number > pandas_row_count:
                discrepancies.append(f"Extra line in CSV at line {line_number}")
                continue

            pandas_row = df.iloc[line_number - 1].to_dict()

            for key, csv_value in dict_row.items():
                if str(csv_value) == str(pandas_row[key]):
                    continue
                if csv_value in ignored_values:
                    # Known, uninteresting mismatch: keep checking this row.
                    continue
                discrepancies.append(f"Discrepancy found in line {line_number}: CSV - {csv_value}, Pandas - {pandas_row[key]}")
                break

    if line_number < pandas_row_count:
        discrepancies.append(f"Pandas has extra rows from line {line_number + 1} onwards")

    return discrepancies

# Run the parser comparison and print at most the first 100 messages;
# countd ends up holding the number of messages printed.
file_path = 'results/processed.tsv'
discrepancies = compare_tsv(file_path)
countd = 0
for countd, discrepancy in enumerate(discrepancies[:100], start=1):
    print(discrepancy)

Discrepancy found in line 1: CSV - , Pandas - nan
Discrepancy found in line 3: CSV - 0.16666666666666666, Pandas - 0.1666666666666666
Discrepancy found in line 4: CSV - , Pandas - nan
Discrepancy found in line 5: CSV - , Pandas - nan
Discrepancy found in line 6: CSV - , Pandas - nan
Discrepancy found in line 7: CSV - , Pandas - nan
Discrepancy found in line 8: CSV - , Pandas - nan
Discrepancy found in line 9: CSV - , Pandas - nan
Discrepancy found in line 10: CSV - , Pandas - nan
Discrepancy found in line 11: CSV - , Pandas - nan
Discrepancy found in line 13: CSV - 0.16666666666666666, Pandas - 0.1666666666666666
Discrepancy found in line 14: CSV - , Pandas - nan
Discrepancy found in line 15: CSV - , Pandas - nan
Discrepancy found in line 16: CSV - , Pandas - nan
Discrepancy found in line 17: CSV - , Pandas - nan
Discrepancy found in line 18: CSV - , Pandas - nan
Discrepancy found in line 19: CSV - , Pandas - nan
Discrepancy found in line 20: CSV - , Pandas - nan
Discrepancy found in l

In [34]:
import csv

# Path of the TSV file inspected by the checks in this cell.
file_path = 'results/processed.tsv'


# Set of characters treated as "standard". Within this cell it is only
# referenced by the commented-out per-character check in detect_problems.
standard_characters = set('\t\n\rabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,;:"\'!?-()[]{}')


def detect_problems(file_path):
    """Scan a UTF-8 text file line by line and describe structural anomalies.

    The number of tab characters on the first line is the reference; every
    later line with a different count is reported. Independently, every line
    containing a character with a code point above 127 is reported.

    Args:
        file_path: Path of the file to scan.

    Returns:
        A list of messages in file order. For a line with both problems the
        tab message comes before the non-ASCII message.
    """
    findings = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        reference_tabs = None
        for number, text in enumerate(handle, start=1):
            tabs = text.count('\t')
            if reference_tabs is None:
                # The first line defines the expected column layout.
                reference_tabs = tabs
            if tabs != reference_tabs:
                findings.append(f"Inconsistent number of tabs in line {number}")

            if any(ord(symbol) >= 128 for symbol in text):
                findings.append(f"Non-ASCII character found in line {number}")

    return findings

# Report every problem found in the file, one message per line.
file_problems = detect_problems(file_path)
if file_problems:
    print('\n'.join(file_problems))

Inconsistent number of tabs in line 302
Inconsistent number of tabs in line 303
Inconsistent number of tabs in line 304
Inconsistent number of tabs in line 309
Inconsistent number of tabs in line 310
Inconsistent number of tabs in line 311
Inconsistent number of tabs in line 316
Inconsistent number of tabs in line 317
Inconsistent number of tabs in line 318
Inconsistent number of tabs in line 323
Inconsistent number of tabs in line 324
Inconsistent number of tabs in line 325
Inconsistent number of tabs in line 330
Inconsistent number of tabs in line 331
Inconsistent number of tabs in line 332
Inconsistent number of tabs in line 337
Inconsistent number of tabs in line 338
Inconsistent number of tabs in line 339
Inconsistent number of tabs in line 2264
Inconsistent number of tabs in line 2265
Inconsistent number of tabs in line 2266
Inconsistent number of tabs in line 2267
Inconsistent number of tabs in line 2268
Inconsistent number of tabs in line 2269
Inconsistent number of tabs in lin

In [49]:
# Load the results table with pandas; the first line of the file is the header.
df = pd.read_csv(
    'results/processed.tsv',
    sep='\t',
    header=0,
    low_memory=False,
)
df

Unnamed: 0,go_term_ids,name,cutoff,closure,top_n,source,model,method,method_desc,run,...,true_positive_terms,false_positive_terms,false_negative_terms_example20,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific,f1_score
0,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,1,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,,,,10,1.000000,1.000000,1.00,1.000000,1.000000
1,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.200000,1.00,0.200000,0.285714
2,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,10,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.100000,0.20,0.200000,0.166667
3,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,25,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.080000,0.25,0.153846,0.148148
4,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5000,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.051282,0.25,0.142857,0.097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99355,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,1,,,,,closure,...,GO:0044283,,,,180,1.000000,1.000000,1.00,1.000000,1.000000
99356,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,5,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.000794,1.000000,1.00,1.000000,0.001587
99357,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,10,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.001720,1.000000,1.00,1.000000,0.003434
99358,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,25,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,,180,0.004761,1.000000,1.00,1.000000,0.009477


In [41]:
# Spot-check a single cell: positional row 45561, first column.
print(df.iloc[45561,0])

{'GO:0016569', 'BFO:0000015', 'GO:0003677', 'GO:0003676', 'GO:0032502', 'BFO:0000003', 'GO:0008150', 'GO:0097159', 'GO:0042592', 'GO:0003674', 'GO:0005488', 'GO:1901363'}


In [9]:
# Show the last two rows; a bare expression uses the notebook's rich table
# display instead of the wrapped plain-text dump produced by print().
df.tail(2)

                                             go_term_ids  \
99358  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   
99359  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   

                                 name  cutoff  closure  top_n source  model  \
99358  HALLMARK_MTORC1_SIGNALING-1-99    99.0     True     25    NaN    NaN   
99359  HALLMARK_MTORC1_SIGNALING-1-99    99.0     True   5000    NaN    NaN   

      method method_desc      run  ...  \
99358    NaN         NaN  closure  ...   
99359    NaN         NaN  closure  ...   

                                     true_positive_terms  \
99358  GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...   
99359  GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...   

                                    false_positive_terms  \
99358  GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...   
99359                                                NaN   

       false_negative_terms_example20  unparsed_terms  gene_set_size  \
99358               

In [42]:
def replace_non_ascii(input_file_path, output_file_path):
    """Copy a UTF-8 text file, writing ``*`` in place of each non-ASCII character.

    Args:
        input_file_path: Path of the file to read.
        output_file_path: Path of the file to write; it is opened in ``'w'`` mode.

    Returns:
        None. The result is the file written at ``output_file_path``.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        with open(output_file_path, 'w', encoding='utf-8') as target:
            for text in source:
                pieces = []
                for symbol in text:
                    # Code points 0-127 pass through untouched.
                    pieces.append('*' if ord(symbol) >= 128 else symbol)
                target.write(''.join(pieces))

# Source file and destination for the ASCII-only copy.
# These two names are module-level and are assigned again by a later cell.
input_file_path = 'results/processed.tsv'
output_file_path = 'results/processed_clean.tsv'

# Replace non-ASCII characters and write to a new file
replace_non_ascii(input_file_path, output_file_path)

# Load the modified file into a DataFrame.
# This rebinds the notebook-wide `df`, replacing whatever an earlier cell loaded.
df = pd.read_csv(output_file_path, sep='\t')

# Display the DataFrame as plain text
print(df)

                                             go_term_ids  \
0      {'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...   
1      {'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...   
2      {'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...   
3      {'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...   
4      {'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...   
...                                                  ...   
99355  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   
99356  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   
99357  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   
99358  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   
99359  {'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...   

                                 name  cutoff  closure  top_n source  model  \
0            glycolysis-gocam-0-0.005   0.005    False      1   NONE    4.0   
1            glycolysis-gocam-0-0.005   0.005    False      5   NONE    4.0   
2            glycolysis-gocam-0-0.005   0.

In [47]:
def process_line(line, max_field_length, list_delimiter):
    """Sanitise one TSV line: mask non-ASCII characters and shorten long fields.

    Every character with a code point of 128 or above becomes ``*``. The line
    is then split on tabs and each field longer than ``max_field_length`` is
    cut at the last ``list_delimiter`` found before that length, or at exactly
    ``max_field_length`` characters when no delimiter is found.

    A closing ``}`` is appended to a shortened field only when the field
    starts with ``{``, so brace-delimited literals stay balanced and other
    fields are not given a stray brace. A trailing line terminator is set
    aside before splitting and re-attached unchanged, so it is never counted
    towards the last field's length and never cut off.

    Args:
        line: One line of tab-separated text, with or without its terminator.
        max_field_length: Maximum number of characters kept per field.
        list_delimiter: Separator at which long fields are preferably cut.

    Returns:
        The processed line, ending with the same terminator as ``line``.
    """
    # Replace non-ASCII characters with '*'
    cleaned_line = ''.join(char if ord(char) < 128 else '*' for char in line)

    # Keep the terminator out of the field logic.
    content = cleaned_line.rstrip('\r\n')
    terminator = cleaned_line[len(content):]

    truncated_fields = []
    for field in content.split('\t'):
        if len(field) <= max_field_length:
            truncated_fields.append(field)
            continue

        # Find the nearest list delimiter before the max length
        truncation_index = field.rfind(list_delimiter, 0, max_field_length)
        if truncation_index == -1:
            # If no list delimiter found, truncate to max length
            truncation_index = max_field_length

        truncated_field = field[:truncation_index]
        if field.startswith('{'):
            # Re-close the brace-delimited literal that was just cut short.
            truncated_field += '}'
        truncated_fields.append(truncated_field)

    return '\t'.join(truncated_fields) + terminator

def parse_and_load_tsv(input_file_path, output_file_path, max_field_length=1000, list_delimiter=','):
    """Write a cleaned copy of a TSV file, one output line per input line.

    Each non-blank input line is passed through ``process_line`` and written
    followed by a single ``'\\n'``. The input line's own terminator is removed
    first; otherwise it would be kept and followed by the extra ``'\\n'``,
    putting a blank line after every row. Blank (whitespace-only) input lines
    are written as an empty line.

    Despite its name, this function does not load anything: it only writes
    ``output_file_path`` and returns None.

    Args:
        input_file_path: Path of the UTF-8 TSV file to read.
        output_file_path: Path of the cleaned file to write (overwritten).
        max_field_length: Forwarded to ``process_line``.
        list_delimiter: Forwarded to ``process_line``.
    """
    with open(input_file_path, 'r', encoding='utf-8', newline='') as file, \
         open(output_file_path, 'w', encoding='utf-8', newline='') as output_file:

        for line_number, line in enumerate(file, start=1):
            # Handle empty lines
            if not line.strip():
                output_file.write('\n')
                continue

            try:
                # newline='' leaves '\n', '\r\n' or '\r' on the line; drop it
                # so exactly one '\n' terminates the written line.
                processed_line = process_line(line.rstrip('\r\n'), max_field_length, list_delimiter)
                output_file.write(processed_line + '\n')
            except Exception as e:
                # Best effort: report the failing line and carry on; that
                # line is then missing from the output.
                print(f"Error processing line {line_number}: {e}")

            
# Source file and destination for the cleaned copy.
# Note the destination differs from the 'processed_clean.tsv' written by the
# replace_non_ascii cell.
input_file_path = 'results/processed.tsv'
output_file_path = 'results/processed_cleaned.tsv'

# Process the input file and write the modified data to a new file
parse_and_load_tsv(input_file_path, output_file_path)

# Load the modified file into a DataFrame, treating only '\n' as a line break.
# This rebinds the notebook-wide `df` used by the analysis cells below.
df = pd.read_csv(output_file_path, sep='\t', lineterminator='\n')

df

Unnamed: 0,go_term_ids,name,cutoff,closure,top_n,source,model,method,method_desc,run,...,true_positive_terms,false_positive_terms,false_negative_terms_example20,unparsed_terms,gene_set_size,precision,recall,recall_general,recall_specific,f1_score
0,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,1,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,,,,10,1.000000,1.000000,1.00,1.000000,1.000000
1,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.200000,1.00,0.200000,0.285714
2,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,10,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096,GO:0006006,GO:0006090|GO:001605,energy production|atp generation,10,0.500000,0.100000,0.20,0.200000,0.166667
3,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,25,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.080000,0.25,0.153846,0.148148
4,"{'BFO:0000015', 'GO:0032787', 'GO:0043436', 'G...",glycolysis-gocam-0-0.005,0.005,False,5000,NONE,4.0,gpt,no_synopsis-4,gpt-4.no_synopsis.v1,...,GO:0006096|GO:0006006,,GO:0006090|GO:001605,energy production|atp generation,10,1.000000,0.051282,0.25,0.142857,0.097561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99343,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,1,,,,,closure,...,GO:0044283,,,,180,1.000000,1.000000,1.00,1.000000,1.000000
99344,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,5,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.000794,1.000000,1.00,1.000000,0.001587
99345,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,10,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0009749|GO:0019899|GO:0050709|GO...,,,180,0.001720,1.000000,1.00,1.000000,0.003434
99346,"{'GO:0051348', 'GO:0005976', 'GO:0035524', 'GO...",HALLMARK_MTORC1_SIGNALING-1-99,99.000,True,25,,,,,closure,...,GO:0044283|GO:0005737|GO:0019752|GO:0036094|GO...,GO:0008525|GO:0019899|GO:0050709|GO:0035094|GO...,,,180,0.004761,1.000000,1.00,1.000000,0.009477


In [43]:
# (rows, columns) of whichever frame `df` is currently bound to; several
# cells rebind `df`, so the value depends on execution order.
df.shape

(99360, 31)

In [48]:
def top_line_lengths(file_path, num_lines=10):
    """Return the lengths of the longest lines in a UTF-8 text file.

    Args:
        file_path: Path of the file to measure.
        num_lines: How many of the largest lengths to return.

    Returns:
        A list of line lengths in descending order, at most ``num_lines``
        long. A trailing ``'\\n'`` is not counted as part of a line.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lengths = [len(text.rstrip('\n')) for text in handle]

    # Longest first, then keep only the requested number of entries.
    lengths.sort(reverse=True)
    return lengths[:num_lines]

# Usage: measure the raw results file with the default num_lines (10), which
# is what the "Top 10" wording in the message below relies on.
file_path = 'results/processed.tsv'
top_lengths = top_line_lengths(file_path)
print(f"Top 10 line lengths: {top_lengths}")

Top 10 line lengths: [184334, 184332, 184330, 184303, 184301, 184299, 184259, 184257, 184255, 181807]


In [None]:
# Sum the confusion counts per (source, model, method) and derive precision,
# recall and F1-score from those sums (not a mean of per-row scores).
# NOTE(review): groupby drops rows whose key is NaN by default, so rows
# without source/model/method are presumably excluded here — confirm intended.
grouped_data = (
    df.groupby(['source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']]
    .sum()
    .reset_index()
    .assign(
        precision=lambda t: t['true_positives'] / (t['true_positives'] + t['false_positives']),
        recall=lambda t: t['true_positives'] / (t['true_positives'] + t['false_negatives']),
        f1_score=lambda t: 2 * ((t['precision'] * t['recall']) / (t['precision'] + t['recall'])),
    )
)
grouped_data

In [None]:
# Same aggregation as above, additionally split by cutoff: sum the confusion
# counts per (cutoff, source, model, method), then derive the metrics.
grouped_data_cutoff_combination = (
    df.groupby(['cutoff', 'source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']]
    .sum()
    .reset_index()
    .assign(
        precision=lambda t: t['true_positives'] / (t['true_positives'] + t['false_positives']),
        recall=lambda t: t['true_positives'] / (t['true_positives'] + t['false_negatives']),
        f1_score=lambda t: 2 * ((t['precision'] * t['recall']) / (t['precision'] + t['recall'])),
    )
)
grouped_data_cutoff_combination

In [None]:
def custom_order(label):
    """Return a ``(prefix_weight, suffix_weight)`` sort key for a label.

    The label is split at its first ``'-'``: the part before it is looked up
    in ``prefix_order`` and everything after it in ``suffix_order``. The two
    lookups are independent, so a known prefix keeps its weight even when the
    suffix is missing or unknown. This matters because the notebook calls this
    function on the bare ``source`` column, which has no suffix; with a single
    combined lookup every such value collapsed to ``(999, 999)``.

    Args:
        label: A string such as ``"NONE"`` or ``"GO-4.0-GPT"``.

    Returns:
        ``(999, 999)`` when the prefix is unknown, otherwise the prefix index
        paired with the suffix index (``999`` for an unknown or empty suffix).
    """
    prefix_order = ["NONE", "RefSeq", "GO", "NA"]
    # NOTE(review): "3.5GPT" has no hyphen, unlike its neighbours — confirm
    # whether "3.5-GPT" was intended.
    suffix_order = ["3.0-GPT", "3.5GPT", "4.0-GPT"]
    unknown = 999  # weight that sorts unexpected values to the end

    prefix, _, suffix = label.partition('-')
    if prefix not in prefix_order:
        return (unknown, unknown)

    prefix_weight = prefix_order.index(prefix)
    suffix_weight = suffix_order.index(suffix) if suffix in suffix_order else unknown
    return prefix_weight, suffix_weight

def label_order(label):
    """Rank a label by the first known prefix it starts with.

    Args:
        label: The label string to rank.

    Returns:
        The index of the first entry of ("NONE", "RefSeq", "GO", "NA") that
        ``label`` starts with, or 4 when none matches, which places such
        labels after all known ones.
    """
    prefixes = ("NONE", "RefSeq", "GO", "NA")
    matching_ranks = (rank for rank, prefix in enumerate(prefixes) if label.startswith(prefix))
    return next(matching_ranks, len(prefixes))


# Grid of bar charts: one row per (closure, top_n) combination, one column per
# cutoff, each panel showing precision / recall / F1 per source-model-method.
cutoff_values = sorted([0.05, 0.005, 99])

# Missing values become the string "NA".
# NOTE(review): presumably so that rows without source/model/method keep a
# group of their own in the groupby below — confirm.
data_filled = df.fillna("NA")

grouped_data_filled = data_filled.groupby(['cutoff', 'closure', 'top_n', 'source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']].sum().reset_index()

# Metrics are computed from the summed counts of each group.
grouped_data_filled['precision'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_positives'])
grouped_data_filled['recall'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_negatives'])
grouped_data_filled['f1_score'] = 2 * ((grouped_data_filled['precision'] * grouped_data_filled['recall']) / (grouped_data_filled['precision'] + grouped_data_filled['recall']))

grouped_data_filled['sort_order'] = grouped_data_filled['source'].apply(custom_order)
grouped_data_filled = grouped_data_filled.sort_values(['sort_order', 'top_n', 'closure'])

# Distinct (closure, top_n) pairs, ordered by top_n first and closure second.
unique_combinations_closure_topn = sorted(grouped_data_filled[['closure', 'top_n']].drop_duplicates().values, key=lambda x: (x[1], x[0]))

num_rows = len(unique_combinations_closure_topn)
num_cols = len(cutoff_values)

# squeeze=False keeps `ax` two-dimensional even with a single row or column,
# so the ax[row, col] indexing below always works.
fig, ax = plt.subplots(num_rows, num_cols, figsize=(16, 11), squeeze=False, gridspec_kw={"hspace": 0.4, "wspace": 0.5})

for row_idx, (closure, top_n) in enumerate(unique_combinations_closure_topn):
    for col_idx, cutoff in enumerate(cutoff_values):
        panel = ax[row_idx, col_idx]

        # The first and last columns of the first two rows are left blank.
        if (col_idx < 1 or col_idx >= num_cols - 1) and (row_idx == 0 or row_idx == 1):
            panel.axis('off')
            continue

        # Take a copy: columns are added below, and assigning into a filtered
        # slice of grouped_data_filled triggers SettingWithCopyWarning.
        data_combination = grouped_data_filled[
            (grouped_data_filled['cutoff'] == cutoff)
            & (grouped_data_filled['closure'] == closure)
            & (grouped_data_filled['top_n'] == top_n)
        ].copy()
        data_combination['label'] = data_combination['source'] + '-' + data_combination['model'].astype(str) + '-' + data_combination['method']

        # Sort data_combination based on the label_order
        data_combination['label_order'] = data_combination['label'].apply(label_order)
        data_combination = data_combination.sort_values('label_order')

        x_newest = np.arange(len(data_combination['label']))
        width_newest = 0.2

        # Three bars per label, centred on the tick position.
        panel.bar(x_newest - width_newest, data_combination['precision'], width_newest, label='Precision')
        panel.bar(x_newest, data_combination['recall'], width_newest, label='Recall')
        panel.bar(x_newest + width_newest, data_combination['f1_score'], width_newest, label='F1-score')

        panel.set_title(f'cutoff = {cutoff}, closure  = {closure}, top_n = {top_n}', fontsize=10)

        panel.set_ylim(0, 1)

        # Change y-axis tick marks
        panel.set_yticks(np.arange(0, 1.1, 0.25))
        if col_idx != 0:  # Hide y-axis labels for non-left columns
            panel.set_yticklabels([])

        # Set x-axis ticks; only the bottom row shows the tick labels
        panel.set_xticks(x_newest)
        if row_idx == num_rows - 1:
            panel.set_xticklabels(data_combination['label'], rotation=90)
        else:
            panel.set_xticklabels([])

        # Add horizontal gridlines
        panel.grid(axis='y', which='major', color='black', alpha=0.2)

        # A single legend, drawn on the second-to-last panel of the top row.
        if row_idx == 0 and col_idx == num_cols - 2:
            panel.legend()

plt.tight_layout()
plt.savefig("process_precision_recall_allGPT.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
def custom_order(label):
    """Return a ``(prefix_weight, suffix_weight)`` sort key for a label.

    The label is split at its first ``'-'``: the part before it is looked up
    in ``prefix_order`` and everything after it in ``suffix_order``. The two
    lookups are independent, so a known prefix keeps its weight even when the
    suffix is missing or unknown. This matters because the notebook calls this
    function on the bare ``source`` column, which has no suffix; with a single
    combined lookup every such value collapsed to ``(999, 999)``.

    Args:
        label: A string such as ``"NONE"`` or ``"GO-4.0-GPT"``.

    Returns:
        ``(999, 999)`` when the prefix is unknown, otherwise the prefix index
        paired with the suffix index (``999`` for an unknown or empty suffix).
    """
    prefix_order = ["NONE", "RefSeq", "GO", "NA"]
    # NOTE(review): "3.5GPT" has no hyphen, unlike its neighbours — confirm
    # whether "3.5-GPT" was intended.
    suffix_order = ["3.0-GPT", "3.5GPT", "4.0-GPT"]
    unknown = 999  # weight that sorts unexpected values to the end

    prefix, _, suffix = label.partition('-')
    if prefix not in prefix_order:
        return (unknown, unknown)

    prefix_weight = prefix_order.index(prefix)
    suffix_weight = suffix_order.index(suffix) if suffix in suffix_order else unknown
    return prefix_weight, suffix_weight

def label_order(label):
    """Rank a label by the first known prefix it starts with.

    Args:
        label: The label string to rank.

    Returns:
        The index of the first entry of ("NONE", "RefSeq", "GO", "NA") that
        ``label`` starts with, or 4 when none matches, which places such
        labels after all known ones.
    """
    prefixes = ("NONE", "RefSeq", "GO", "NA")
    matching_ranks = (rank for rank, prefix in enumerate(prefixes) if label.startswith(prefix))
    return next(matching_ranks, len(prefixes))


# Single bar chart for one configuration: cutoff 0.005, closure True, top_n 10.
data_filled = df.fillna("NA")
condition = (
    (data_filled['cutoff'] == 0.005) &
    (data_filled['closure'] == True) &
    (data_filled['top_n'] == 10)
)
# Build a fresh, re-indexed frame rather than resetting the index of the
# filtered slice in place; later column assignments on filtered_data then
# operate on an independent frame.
filtered_data = data_filled[condition].reset_index(drop=True)

grouped_data_filled = filtered_data.groupby(['cutoff', 'closure', 'top_n', 'source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']].sum().reset_index()

# Metrics are computed from the summed counts of each group.
grouped_data_filled['precision'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_positives'])
grouped_data_filled['recall'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_negatives'])
grouped_data_filled['f1_score'] = 2 * ((grouped_data_filled['precision'] * grouped_data_filled['recall']) / (grouped_data_filled['precision'] + grouped_data_filled['recall']))

grouped_data_filled['sort_order'] = grouped_data_filled['source'].apply(custom_order)
grouped_data_filled = grouped_data_filled.sort_values(['sort_order', 'top_n', 'closure'])


fig, ax = plt.subplots(1, 1, figsize=(8, 6))  # Create a single subplot

# The rows were already restricted to this configuration before grouping, so
# the grouped frame is plotted as is. data_combination is the same object as
# grouped_data_filled, so the two columns added below appear on both names.
data_combination = grouped_data_filled
data_combination['label'] = data_combination['source'] + '-' + data_combination['model'].astype(str) + '-' + data_combination['method']

# Sort data_combination based on the label_order
data_combination['label_order'] = data_combination['label'].apply(label_order)
data_combination = data_combination.sort_values('label_order')

x_newest = np.arange(len(data_combination['label']))
width_newest = 0.2

# Three bars per label, centred on the tick position.
ax.bar(x_newest - width_newest, data_combination['precision'], width_newest, label='Precision')
ax.bar(x_newest, data_combination['recall'], width_newest, label='Recall')
ax.bar(x_newest + width_newest, data_combination['f1_score'], width_newest, label='F1-score')

ax.set_title('cutoff = 0.005, closure = True, top_n = 10', fontsize=10)

ax.set_ylim(0, 1)

# Change y-axis tick marks
ax.set_yticks(np.arange(0, 1.1, 0.25))
ax.set_yticklabels([])  # Hide y-axis labels

# Set x-axis ticks
ax.set_xticks(x_newest)
ax.set_xticklabels(data_combination['label'], rotation=90)

# Add horizontal gridlines
ax.grid(axis='y', which='major', color='black', alpha=0.2)
ax.legend()

plt.tight_layout()
plt.savefig("process_precision_recall_specific_0.005_true_10.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Upper-case the method names. assign() returns a new frame, so this does not
# write into a filtered slice and the cell can be re-run with the same result.
filtered_data = filtered_data.assign(method=filtered_data['method'].str.upper())

In [None]:
# Display the `method` column of the current filtered_data frame.
filtered_data['method']

In [None]:
def custom_order(label):
    """Return a ``(prefix_weight, suffix_weight)`` sort key for a label.

    The label is split at its first ``'-'``: the part before it is looked up
    in ``prefix_order`` and everything after it in ``suffix_order``. The two
    lookups are independent, so a known prefix keeps its weight even when the
    suffix is missing or unknown. This matters because the notebook calls this
    function on the bare ``source`` column, which has no suffix; with a single
    combined lookup every such value collapsed to ``(999, 999)``.

    Args:
        label: A string such as ``"NONE"`` or ``"GO-4.0-GPT"``.

    Returns:
        ``(999, 999)`` when the prefix is unknown, otherwise the prefix index
        paired with the suffix index (``999`` for an unknown or empty suffix).
    """
    prefix_order = ["NONE", "RefSeq", "GO", "NA"]
    # This variant of the function recognises a single suffix only.
    suffix_order = ["4.0-GPT"]
    unknown = 999  # weight that sorts unexpected values to the end

    prefix, _, suffix = label.partition('-')
    if prefix not in prefix_order:
        return (unknown, unknown)

    prefix_weight = prefix_order.index(prefix)
    suffix_weight = suffix_order.index(suffix) if suffix in suffix_order else unknown
    return prefix_weight, suffix_weight

def label_order(label):
    """Rank a label by the first known prefix it starts with.

    Args:
        label: The label string to rank.

    Returns:
        The index of the first entry of ("NONE", "RefSeq", "GO", "NA") that
        ``label`` starts with, or 4 when none matches, which places such
        labels after all known ones.
    """
    prefixes = ("NONE", "RefSeq", "GO", "NA")
    matching_ranks = (rank for rank, prefix in enumerate(prefixes) if label.startswith(prefix))
    return next(matching_ranks, len(prefixes))

# Single bar chart restricted to model 4.0 at cutoff 0.005, closure True,
# top_n 10.
data_filled = df.fillna("NA")
condition = (
    (data_filled['cutoff'] == 0.005) &
    (data_filled['closure'] == True) &
    (data_filled['top_n'] == 10) &
    (data_filled['model'] == 4.0)
)
# Build a fresh, re-indexed frame rather than resetting the index of the
# filtered slice in place.
filtered_data = data_filled[condition].reset_index(drop=True)

grouped_data_filled = filtered_data.groupby(['cutoff', 'closure', 'top_n', 'source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']].sum().reset_index()

# Metrics are computed from the summed counts of each group.
grouped_data_filled['precision'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_positives'])
grouped_data_filled['recall'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_negatives'])
grouped_data_filled['f1_score'] = 2 * ((grouped_data_filled['precision'] * grouped_data_filled['recall']) / (grouped_data_filled['precision'] + grouped_data_filled['recall']))

grouped_data_filled['sort_order'] = grouped_data_filled['source'].apply(custom_order)
grouped_data_filled = grouped_data_filled.sort_values(['sort_order', 'top_n', 'closure'])

# Drop groups whose source starts with 'NA' (the fill value for a missing
# source). Copy the result: columns are assigned below, and writing into a
# filtered slice triggers SettingWithCopyWarning.
filtered_data_combination = grouped_data_filled[~grouped_data_filled['source'].str.startswith('NA')].copy()

filtered_data_combination['method'] = filtered_data_combination['method'].str.upper()

fig, ax = plt.subplots(1, 1, figsize=(8, 6))  # Create a single subplot

# Here the label is method-model-source, the reverse of the other plots.
filtered_data_combination['label'] = filtered_data_combination['method'] + '-' + filtered_data_combination['model'].astype(str) + '-' + filtered_data_combination['source']


# Sort data_combination based on the label_order
# NOTE(review): label_order matches source prefixes at the START of the label,
# but these labels start with the method, so this sort presumably leaves the
# order unchanged — confirm whether sorting by source was intended.
filtered_data_combination['label_order'] = filtered_data_combination['label'].apply(label_order)
filtered_data_combination = filtered_data_combination.sort_values('label_order')

x_newest = np.arange(len(filtered_data_combination['label']))
width_newest = 0.2

# Three bars per label, centred on the tick position.
ax.bar(x_newest - width_newest, filtered_data_combination['precision'], width_newest, label='Precision')
ax.bar(x_newest, filtered_data_combination['recall'], width_newest, label='Recall')
ax.bar(x_newest + width_newest, filtered_data_combination['f1_score'], width_newest, label='F1-score')

ax.set_title('cutoff = 0.005, closure = True, top_n = 10', fontsize=10)

# Change y-axis tick marks
ax.set_yticks(np.arange(0, 1.1, 0.25))

# Set the limits AFTER the ticks: set_yticks expands the view to include every
# tick, so a 0.75 limit applied before it would be widened back to 1.0.
ax.set_ylim(0, 0.75)

# Set x-axis ticks
ax.set_xticks(x_newest)
ax.set_xticklabels(filtered_data_combination['label'], rotation=90)

# Add horizontal gridlines
ax.grid(axis='y', which='major', color='black', alpha=0.2)
ax.legend()

plt.tight_layout()
plt.savefig("process_precision_recall_specific_0.005_true_10__gpt4.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
def custom_order(label):
    """Return a ``(prefix_weight, suffix_weight)`` sort key for a label.

    The label is split at its first ``-``: the part before it is ranked
    against the known sources (NONE, RefSeq, GO, NA) and the remainder
    against the known model/method suffixes.

    A label with a recognised prefix but a missing or unknown suffix (for
    example a bare source value such as ``"GO"``) keeps its prefix rank and
    gets 999 for the suffix. Previously such labels fell through to
    ``(999, 999)``, which made every bare source value sort identically.

    Parameters
    ----------
    label : str
        Label of the form ``"<source>"`` or ``"<source>-<suffix>"``.

    Returns
    -------
    tuple of int
        ``(prefix_weight, suffix_weight)``; ``(999, 999)`` for a non-string
        label or an unrecognised prefix so that unexpected values sort last.
    """
    unknown = 999  # weight that puts unexpected values at the end
    prefix_order = ["NONE", "RefSeq", "GO", "NA"]
    # "3.5-GPT" is accepted next to the original "3.5GPT" spelling, which looks
    # like a typo given the other two entries; both map to the same rank.
    suffix_rank = {"3.0-GPT": 0, "3.5GPT": 1, "3.5-GPT": 1, "4.0-GPT": 2}

    # Missing values (e.g. NaN) have no .split/.partition; rank them last instead of raising.
    if not isinstance(label, str):
        return (unknown, unknown)

    prefix, _, suffix = label.partition('-')
    if prefix not in prefix_order:
        return (unknown, unknown)

    return prefix_order.index(prefix), suffix_rank.get(suffix, unknown)

def label_order(label):
    """Return the rank of a label's source for ordering plot bars.

    Sources are ranked NONE < RefSeq < GO < NA. The source is looked for
    first as a prefix of the label (``"<source>-..."``) and, failing that,
    as the final ``-``-separated token (``"<method>-<model>-<source>"``).
    Without the second check, labels of the latter form all receive the
    same rank and sorting on it has no effect.

    Parameters
    ----------
    label : str
        Bar label containing the source as its first or last component.

    Returns
    -------
    int
        Index of the source in the ranking, or ``len(prefixes)`` (4) when
        no known source is found or the label is not a string.
    """
    prefixes = ["NONE", "RefSeq", "GO", "NA"]
    if not isinstance(label, str):
        return len(prefixes)

    # Original behaviour: the source leads the label.
    for i, prefix in enumerate(prefixes):
        if label.startswith(prefix):
            return i

    # Fallback: the source is the trailing component of the label.
    trailing = label.rsplit('-', 1)[-1]
    if trailing in prefixes:
        return prefixes.index(trailing)

    return len(prefixes)  # Return a value that puts any other labels at the end

# Restrict the evaluation table to a single configuration: cutoff 0.005, closure on, top 10 terms.
# Missing values are replaced by the literal string "NA" so they can be filtered on below.
data_filled = df.fillna("NA")
condition = (
    (data_filled['cutoff'] == 0.005) &
    (data_filled['closure'] == True) &
    (data_filled['top_n'] == 10)
)
# reset_index returns a new frame; no in-place mutation of a filtered selection.
filtered_data = data_filled[condition].reset_index(drop=True)

# Sum the confusion counts per (cutoff, closure, top_n, source, model, method) combination.
grouped_data_filled = filtered_data.groupby(['cutoff', 'closure', 'top_n', 'source', 'model', 'method'])[['true_positives', 'false_positives', 'false_negatives']].sum().reset_index()

# Precision, recall and F1 are derived from the summed counts of each group.
grouped_data_filled['precision'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_positives'])
grouped_data_filled['recall'] = grouped_data_filled['true_positives'] / (grouped_data_filled['true_positives'] + grouped_data_filled['false_negatives'])
grouped_data_filled['f1_score'] = 2 * ((grouped_data_filled['precision'] * grouped_data_filled['recall']) / (grouped_data_filled['precision'] + grouped_data_filled['recall']))

grouped_data_filled['sort_order'] = grouped_data_filled['source'].apply(custom_order)
grouped_data_filled = grouped_data_filled.sort_values(['sort_order', 'top_n', 'closure'])

# Drop the rows whose source is the "NA" placeholder. The explicit .copy() makes the
# column assignments below act on an independent frame instead of a selection of
# grouped_data_filled (which triggers SettingWithCopyWarning).
filtered_data_combination = grouped_data_filled[~grouped_data_filled['source'].str.startswith('NA')].copy()

filtered_data_combination['method'] = filtered_data_combination['method'].str.upper()

fig, ax = plt.subplots(1, 1, figsize=(8, 6))  # Create a single subplot

# Bar labels have the form METHOD-model-source.
filtered_data_combination['label'] = filtered_data_combination['method'] + '-' + filtered_data_combination['model'].astype(str) + '-' + filtered_data_combination['source']


# Sort the bars by label_order. A stable sort keeps the ordering established above
# for labels that share the same rank.
filtered_data_combination['label_order'] = filtered_data_combination['label'].apply(label_order)
filtered_data_combination = filtered_data_combination.sort_values('label_order', kind='mergesort')

# One x position per label; the three metric bars are drawn side by side around it.
x_newest = np.arange(len(filtered_data_combination['label']))
width_newest = 0.2

ax.bar(x_newest - width_newest, filtered_data_combination['precision'], width_newest, label='Precision')
ax.bar(x_newest, filtered_data_combination['recall'], width_newest, label='Recall')
ax.bar(x_newest + width_newest, filtered_data_combination['f1_score'], width_newest, label='F1-score')

ax.set_title('cutoff = 0.005, closure = True, top_n = 10', fontsize=10)

ax.set_ylim(0, 1)

# Change y-axis tick marks
ax.set_yticks(np.arange(0, 1.1, 0.25))

# Set x-axis ticks
ax.set_xticks(x_newest)
ax.set_xticklabels(filtered_data_combination['label'], rotation=90)

# Add horizontal gridlines
ax.grid(axis='y', which='major', color='black', alpha=0.2)
ax.legend()

plt.tight_layout()
# Write the figure to a PDF in the working directory, then display it.
plt.savefig("process_precision_recall_specific_0.005_true_10.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Quick look at the filtered frame: print its (rows, columns) shape and display its column index.
print(filtered_data.shape)
filtered_data.columns

In [None]:
# Keep only the rows whose model is not the "NA" placeholder. The explicit .copy() gives
# later cells an independent frame to add/overwrite columns on, instead of a selection
# of filtered_data (which triggers SettingWithCopyWarning).
filtered_data_dropna = filtered_data[filtered_data['model'] != 'NA'].copy()

In [None]:
# Upper-case the method names. .assign builds a new frame rather than writing into the
# existing one, so the cell can be re-run safely and raises no chained-assignment warning.
filtered_data_dropna = filtered_data_dropna.assign(method=filtered_data_dropna['method'].str.upper())

In [None]:
# Group the rows by (method, model, source); the later test cells iterate over these groups.
method_model_combinations = filtered_data_dropna.groupby(['method', 'model', 'source'])
# Display the leading rows of every group as a sanity check.
method_model_combinations.head()

In [None]:
# List the distinct (method, model, source) rows, one group after another.
method_model_combinations_columns = method_model_combinations[['method', 'model', 'source']]

# Collect the per-group frames first and concatenate once: growing a DataFrame with
# pd.concat inside the loop is quadratic, and seeding it with an empty frame triggers
# pandas' deprecation warning for concatenating empty entries.
unique_row_frames = [
    group_data.drop_duplicates()
    for _, group_data in method_model_combinations_columns
]
if unique_row_frames:
    unique_rows_df = pd.concat(unique_row_frames)
else:
    # No groups at all: fall back to an empty frame with the expected columns.
    unique_rows_df = pd.DataFrame(columns=['method', 'model', 'source'])

print(unique_rows_df.shape)
unique_rows_df

In [None]:
import pandas as pd
from itertools import product

# Values expected along each grouping dimension.
all_methods = ['GPT']
all_models = [3.0, 3.5, 4.0]
all_sources = ['NONE', 'RefSeq', 'GO']

# Every (method, model, source) triple, with the source varying fastest.
all_possible_combinations_list = [
    (method, model, source)
    for method in all_methods
    for model in all_models
    for source in all_sources
]
all_possible_combinations = pd.DataFrame(
    all_possible_combinations_list,
    columns=['method', 'model', 'source'],
)

# Group keys actually present in the data.
existing_combinations = set(method_model_combinations.groups)
print(f'len existing: {len(existing_combinations)}')
print(existing_combinations)

# The expected triples as a set of tuples, for comparison with the group keys.
all_possible_combinations_set = {tuple(row) for row in all_possible_combinations.values}
print(f'len possible: {len(all_possible_combinations_set)}')
print(all_possible_combinations_set)


In [None]:
# Expected combinations that have no group in the data ...
missing_combinations1 = all_possible_combinations_set.difference(existing_combinations)
print(missing_combinations1)
# ... and groups in the data that were not expected.
missing_combinations2 = existing_combinations.difference(all_possible_combinations_set)
print(missing_combinations2)

# Tabulate the expected-but-absent combinations.
missing_combinations_df = pd.DataFrame(
    [*missing_combinations1],
    columns=['method', 'model', 'source'],
)

print("Missing combinations")
print(missing_combinations_df)

In [None]:
# Number of missing (NaN) F1 values in the filtered data.
filtered_data_dropna['f1_score'].isna().sum()

In [None]:
# Distinct F1 values present in the filtered data.
filtered_data_dropna['f1_score'].unique()

In [None]:
# Display the groupby object that the pairwise tests below iterate over.
method_model_combinations

In [None]:
metrics = ['f1_score']

t_test_results = []
pairs_with_na = []
# Every ordered pair of (key, group) items, including each group paired with itself.
all_possible_pairs = list(product(method_model_combinations, repeat=2))

# One seeded generator shared by all tests so that a full re-run of the cell
# reproduces the same permutation p-values.
permutation_rng = np.random.default_rng(0)

# Perform pairwise one-sided permutation t-tests (group 1 > group 2) for each metric
for metric in metrics:
    filtered_data_dropna[metric] = pd.to_numeric(filtered_data_dropna[metric], errors='coerce')

    for (key1, group1), (key2, group2) in all_possible_pairs:
        print(f'1: {key1}\t2: {key2}')

        # Coerce to numbers and drop anything that is missing or not numeric.
        group1_metric = pd.to_numeric(group1[metric], errors='coerce').dropna()
        group2_metric = pd.to_numeric(group2[metric], errors='coerce').dropna()

        # equal_var must be the boolean False to request Welch's test: the string
        # 'false' is truthy and silently selected the pooled-variance test instead.
        t_stat, p_value = ttest_ind(group1_metric, group2_metric,
                                    nan_policy='propagate', permutations=100000, alternative='greater',
                                    equal_var=False, random_state=permutation_rng)

        # Store the results
        t_test_results.append({
            'Method1': key1[0],
            'Model1': key1[1],
            'Source1': key1[2],
            'Method2': key2[0],
            'Model2': key2[1],
            'Source2': key2[2],
            'Metric': metric,
            'T-Statistic': t_stat,
            'P-Value': p_value,
        })

# One row per ordered pair of groups.
t_test_results_df = pd.DataFrame(t_test_results)

In [None]:
# Save the pairwise F1 test results as a tab-separated file, without the index column.
t_test_results_df.to_csv('t_test_results_f1.tsv', sep='\t', index=False)
# Uncomment to reload the saved results instead of recomputing the tests.
#t_test_results_df = pd.read_csv('t_test_results_f1.tsv', sep='\t')

In [None]:
# Sanity check of the pair structure: number of pairs, then the sizes of one
# sample pair and of its two (key, group) members, then the type and value of a key.
print(f'len: {len(all_possible_pairs)}')
_sample_pair = all_possible_pairs[5]
print(f'len: {len(_sample_pair)}')
_first_member, _second_member = _sample_pair[0], _sample_pair[1]
print(f'len: {len(_first_member)}')
print(f'len: {len(_second_member)}')
print(type(_first_member[0]))
_first_member[0]

In [None]:
# You can filter the results based on a significance level (e.g., 0.05) if needed
#significance_level = 0.05
#significant_results = t_test_results_df[t_test_results_df['P-Value'] < significance_level]

In [None]:
# Label both sides of every comparison as METHOD-model-source.
t_test_results_df['Method-Model-Source-1 >'] = (
    t_test_results_df['Method1'].str.upper()
    + '-'
    + t_test_results_df['Model1'].astype(str)
    + '-'
    + t_test_results_df['Source1']
)
t_test_results_df['< Method-Model-Source-2'] = (
    t_test_results_df['Method2'].str.upper()
    + '-'
    + t_test_results_df['Model2'].astype(str)
    + '-'
    + t_test_results_df['Source2']
)

# Square table of p-values: first group down the rows, second group across the columns.
heatmap_data = t_test_results_df.pivot(
    index='Method-Model-Source-1 >',
    columns='< Method-Model-Source-2',
    values='P-Value',
)
# Put the columns in the same order as the rows, then flip the rows top to bottom.
heatmap_data = heatmap_data[heatmap_data.index].iloc[::-1]
print(heatmap_data.shape)
heatmap_data

In [None]:
# Draw the p-value table as a heatmap on a -log10 scale.
plt.figure(figsize=(10, 8))

# log10 of a p-value below 1 is negative; flipping the sign of the negative entries gives -log10(p).
heatmap_data_log = np.log10(heatmap_data)
heatmap_data_log = heatmap_data_log.mask(heatmap_data_log < 0, -heatmap_data_log)

# Blank the cells running from the top-right to the bottom-left corner.
shape = heatmap_data_log.shape
diag_value = np.nan
last_col = max(shape) - 1
for i in range(min(shape)):
    heatmap_data_log.iloc[i, last_col - i] = diag_value

sns.heatmap(
    heatmap_data_log,
    annot=True,
    cmap='RdPu',
    cbar_kws={'label': '-log10 P-value'},
    linewidths=1,
    linecolor='black',
    vmin=1.3,  # lower end of the colour scale; 1.3 is about -log10(0.05)
)

plt.title("Pairwise F1-score tests")
plt.grid(False)
plt.savefig("TALISMAN_ttest_f1.pdf", dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Sanity check of the pair structure: number of pairs, then the sizes of one
# sample pair and of its two (key, group) members, then the type and value of a key.
print(f'len: {len(all_possible_pairs)}')
_sample_pair = all_possible_pairs[5]
print(f'len: {len(_sample_pair)}')
_first_member, _second_member = _sample_pair[0], _sample_pair[1]
print(f'len: {len(_first_member)}')
print(f'len: {len(_second_member)}')
print(type(_first_member[0]))
_first_member[0]

In [None]:
metrics = ['precision']

t_test_results_precision = []
pairs_with_na = []
# Every ordered pair of (key, group) items, including each group paired with itself.
all_possible_pairs = list(product(method_model_combinations, repeat=2))

# One seeded generator shared by all tests so that a full re-run of the cell
# reproduces the same permutation p-values.
permutation_rng = np.random.default_rng(0)

# Perform pairwise one-sided permutation t-tests (group 1 > group 2) for each metric
for metric in metrics:
    filtered_data_dropna[metric] = pd.to_numeric(filtered_data_dropna[metric], errors='coerce')

    for (key1, group1), (key2, group2) in all_possible_pairs:
        print(f'1: {key1}\t2: {key2}')

        # Coerce to numbers and drop anything that is missing or not numeric.
        group1_metric = pd.to_numeric(group1[metric], errors='coerce').dropna()
        group2_metric = pd.to_numeric(group2[metric], errors='coerce').dropna()

        # equal_var must be the boolean False to request Welch's test: the string
        # 'false' is truthy and silently selected the pooled-variance test instead.
        t_stat, p_value = ttest_ind(group1_metric, group2_metric,
                                    nan_policy='propagate', permutations=100000, alternative='greater',
                                    equal_var=False, random_state=permutation_rng)

        # Store the results
        t_test_results_precision.append({
            'Method1': key1[0],
            'Model1': key1[1],
            'Source1': key1[2],
            'Method2': key2[0],
            'Model2': key2[1],
            'Source2': key2[2],
            'Metric': metric,
            'T-Statistic': t_stat,
            'P-Value': p_value,
        })

        # Detailed debug output for one reference comparison (GPT 3.5 vs GPT 3.0, no source).
        if key1 == ('GPT', 3.5, 'NONE') and key2 == ('GPT', 3.0, 'NONE'):
            print(f'1: {key1}\t2: {key2}')
            # Re-run with fewer permutations on the already cleaned, numeric values.
            testres = ttest_ind(group1_metric, group2_metric,
                                nan_policy='propagate', permutations=10000, alternative='greater',
                                equal_var=False, random_state=permutation_rng)
            print(str(testres))
            print(type(group1[metric]))
            print(group1_metric)
            print(group2_metric)
            print(group1_metric[group1_metric.isna()])
            print(group2_metric[group2_metric.isna()])
            print(f'1: {group1[metric].isna().sum()}\t2: {group2_metric.isna().sum()}')
            print(f'group1: {len(group1_metric)}, group2: {len(group2_metric)}')
            print(t_test_results_precision[-1])

# One row per ordered pair of groups.
t_test_results_precision_df = pd.DataFrame(t_test_results_precision)


In [None]:
# Label both sides of every comparison as METHOD-model-source. All three parts are read
# from the precision results themselves; the source part was previously taken from
# t_test_results_df (the F1 results), which only lined up by coincidence of row order.
t_test_results_precision_df['Method-Model-Source-1 >'] = (
    t_test_results_precision_df['Method1'].str.upper()
    + '-' + t_test_results_precision_df['Model1'].astype(str)
    + '-' + t_test_results_precision_df['Source1']
)
t_test_results_precision_df['< Method-Model-Source-2'] = (
    t_test_results_precision_df['Method2'].str.upper()
    + '-' + t_test_results_precision_df['Model2'].astype(str)
    + '-' + t_test_results_precision_df['Source2']
)

# Square table of p-values: first group down the rows, second group across the columns.
heatmap_data = t_test_results_precision_df.pivot(index='Method-Model-Source-1 >', columns='< Method-Model-Source-2', values='P-Value')
# Put the columns in the same order as the rows, then flip the rows top to bottom.
heatmap_data = heatmap_data[heatmap_data.index]
heatmap_data = heatmap_data.iloc[::-1]
print(heatmap_data.shape)
heatmap_data

In [None]:
# Draw the p-value table as a heatmap on a -log10 scale.
plt.figure(figsize=(10, 8))

# log10 of a p-value below 1 is negative; flipping the sign of the negative entries gives -log10(p).
heatmap_data_log = np.log10(heatmap_data)
heatmap_data_log = heatmap_data_log.mask(heatmap_data_log < 0, -heatmap_data_log)

# Blank the cells running from the top-right to the bottom-left corner.
shape = heatmap_data_log.shape
diag_value = np.nan
last_col = max(shape) - 1
for i in range(min(shape)):
    heatmap_data_log.iloc[i, last_col - i] = diag_value

sns.heatmap(
    heatmap_data_log,
    annot=True,
    cmap='RdPu',
    cbar_kws={'label': '-log10 P-value'},
    linewidths=1,
    linecolor='black',
    vmin=1.3,  # lower end of the colour scale; 1.3 is about -log10(0.05)
)

plt.title("Pairwise precision tests")
plt.grid(False)
plt.savefig("TALISMAN_ttest_precision.pdf", dpi=300, bbox_inches='tight')

plt.show()

In [None]:
metrics = ['recall']

t_test_results_recall = []
pairs_with_na = []
# Every ordered pair of (key, group) items, including each group paired with itself.
all_possible_pairs = list(product(method_model_combinations, repeat=2))

# One seeded generator shared by all tests so that a full re-run of the cell
# reproduces the same permutation p-values.
permutation_rng = np.random.default_rng(0)

# Perform pairwise one-sided permutation t-tests (group 1 > group 2) for each metric
for metric in metrics:
    filtered_data_dropna[metric] = pd.to_numeric(filtered_data_dropna[metric], errors='coerce')

    for (key1, group1), (key2, group2) in all_possible_pairs:
        print(f'1: {key1}\t2: {key2}')

        # Coerce to numbers and drop anything that is missing or not numeric.
        group1_metric = pd.to_numeric(group1[metric], errors='coerce').dropna()
        group2_metric = pd.to_numeric(group2[metric], errors='coerce').dropna()

        # equal_var must be the boolean False to request Welch's test: the string
        # 'false' is truthy and silently selected the pooled-variance test instead.
        t_stat, p_value = ttest_ind(group1_metric, group2_metric,
                                    nan_policy='propagate', permutations=100000, alternative='greater',
                                    equal_var=False, random_state=permutation_rng)

        # Store the results
        t_test_results_recall.append({
            'Method1': key1[0],
            'Model1': key1[1],
            'Source1': key1[2],
            'Method2': key2[0],
            'Model2': key2[1],
            'Source2': key2[2],
            'Metric': metric,
            'T-Statistic': t_stat,
            'P-Value': p_value,
        })

        # Detailed debug output for one reference comparison (GPT 3.5 vs GPT 3.0, no source).
        if key1 == ('GPT', 3.5, 'NONE') and key2 == ('GPT', 3.0, 'NONE'):
            print(f'1: {key1}\t2: {key2}')
            # Re-run with fewer permutations on the already cleaned, numeric values.
            testres = ttest_ind(group1_metric, group2_metric,
                                nan_policy='propagate', permutations=10000, alternative='greater',
                                equal_var=False, random_state=permutation_rng)
            print(str(testres))
            print(type(group1[metric]))
            print(group1_metric)
            print(group2_metric)
            print(group1_metric[group1_metric.isna()])
            print(group2_metric[group2_metric.isna()])
            print(f'1: {group1[metric].isna().sum()}\t2: {group2_metric.isna().sum()}')
            print(f'group1: {len(group1_metric)}, group2: {len(group2_metric)}')
            print(t_test_results_recall[-1])

# One row per ordered pair of groups.
t_test_results_recall_df = pd.DataFrame(t_test_results_recall)


In [None]:
# Label both sides of every comparison as METHOD-model-source. All three parts are read
# from the recall results themselves; the source part was previously taken from
# t_test_results_df (the F1 results), which only lined up by coincidence of row order.
t_test_results_recall_df['Method-Model-Source-1 >'] = (
    t_test_results_recall_df['Method1'].str.upper()
    + '-' + t_test_results_recall_df['Model1'].astype(str)
    + '-' + t_test_results_recall_df['Source1']
)
t_test_results_recall_df['< Method-Model-Source-2'] = (
    t_test_results_recall_df['Method2'].str.upper()
    + '-' + t_test_results_recall_df['Model2'].astype(str)
    + '-' + t_test_results_recall_df['Source2']
)

# Square table of p-values: first group down the rows, second group across the columns.
heatmap_data = t_test_results_recall_df.pivot(index='Method-Model-Source-1 >', columns='< Method-Model-Source-2', values='P-Value')
# Put the columns in the same order as the rows, then flip the rows top to bottom.
heatmap_data = heatmap_data[heatmap_data.index]
heatmap_data = heatmap_data.iloc[::-1]
print(heatmap_data.shape)
heatmap_data

In [None]:
# Draw the p-value table as a heatmap on a -log10 scale.
plt.figure(figsize=(10, 8))

# log10 of a p-value below 1 is negative; flipping the sign of the negative entries gives -log10(p).
heatmap_data_log = np.log10(heatmap_data)
heatmap_data_log = heatmap_data_log.mask(heatmap_data_log < 0, -heatmap_data_log)

# Blank the cells running from the top-right to the bottom-left corner.
shape = heatmap_data_log.shape
diag_value = np.nan
last_col = max(shape) - 1
for i in range(min(shape)):
    heatmap_data_log.iloc[i, last_col - i] = diag_value

sns.heatmap(
    heatmap_data_log,
    annot=True,
    cmap='RdPu',
    cbar_kws={'label': '-log10 P-value'},
    linewidths=1,
    linecolor='black',
    vmin=1.3,  # lower end of the colour scale; 1.3 is about -log10(0.05)
)

plt.title("Pairwise recall tests")
plt.grid(False)
plt.savefig("TALISMAN_ttest_recall.pdf", dpi=300, bbox_inches='tight')

plt.show()

In [None]:
metrics = ['f1_score']

mu_test_results = []
pairs_with_na = []
# Every ordered pair of (key, group) items, including each group paired with itself.
all_possible_pairs = list(product(method_model_combinations, repeat=2))

# Perform pairwise one-sided Mann-Whitney U tests (group 1 > group 2) for each metric
for metric in metrics:
    filtered_data_dropna[metric] = pd.to_numeric(filtered_data_dropna[metric], errors='coerce')

    for (key1, group1), (key2, group2) in all_possible_pairs:
        print(f'1: {key1}\t2: {key2}')

        # Coerce to numbers and drop anything that is missing or not numeric.
        group1_metric = pd.to_numeric(group1[metric], errors='coerce').dropna()
        group2_metric = pd.to_numeric(group2[metric], errors='coerce').dropna()

        if group1_metric.empty or group2_metric.empty:
            # Nothing to compare; record NaN rather than letting the test raise on empty input.
            mu_stat, p_value = np.nan, np.nan
        else:
            # method='auto' lets SciPy use the exact distribution only for small samples
            # without ties and the tie-corrected asymptotic p-value otherwise. The forced
            # 'exact' method applies no tie correction, and score columns such as F1
            # typically contain many repeated values.
            mu_stat, p_value = mannwhitneyu(group1_metric, group2_metric, alternative='greater', method='auto')

        # Store the results
        mu_test_results.append({
            'Method1': key1[0],
            'Model1': key1[1],
            'Source1': key1[2],
            'Method2': key2[0],
            'Model2': key2[1],
            'Source2': key2[2],
            'Metric': metric,
            'MU-Statistic': mu_stat,
            'P-Value': p_value,
        })

# One row per ordered pair of groups.
mu_test_results_df = pd.DataFrame(mu_test_results)