In [37]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from scipy.stats import chi2_contingency, fisher_exact
from statsmodels.stats.multitest import multipletests
import types

In [3]:
from dataManager import DataManager, LogManager

In [4]:
# Prepare the logger used for the progress messages in the cells below.
# NOTE(review): is_active=True presumably switches message output on — confirm in dataManager.LogManager.
logger = LogManager(is_active=True)

# Input locations. All paths are relative to the notebook's working directory,
# so the notebook must be launched from its own folder for them to resolve.
DATA_PATH = "../../preparation/codice/"
HPO2GENES_PATH = f"{DATA_PATH}phenotype_to_genes.txt"
# NOTE(review): "prova" (Italian: trial) suggests a truncated copy for quick tests;
# this path is not used by any cell visible in this notebook section.
HPO2GENES_PROVA_PATH = f"{DATA_PATH}HPO2Genes_head.csv"
GO_ONTOLOGY_PATH = f"{DATA_PATH}go-basic.obo" 
GENE2GO_PATH = f"{DATA_PATH}gene2go"  # Path to the downloaded gene2go file

# Create the DataManager; the actual data is loaded by the import cells that follow.
data_manager = DataManager()

# Echo the data directory so the resolved location is visible in the output.
print(DATA_PATH)

../../preparation/codice/


In [5]:
# Load the HPO -> gene associations into the DataManager.
logger.log("Importing HPO2Genes file...")
# NOTE(review): L_bound / R_bound presumably keep only HPO terms whose number of
# associated genes lies between 50 and 75 — confirm in DataManager.importHPO2GeneFile.
data_manager.importHPO2GeneFile(HPO2GENES_PATH, L_bound = 50, R_bound = 75)
logger.log("HPO2Genes file imported.")
# Report the size of the loaded matrix, then preview its first rows
# (rows are NCBI gene ids, columns are HPO ids, as the output below shows).
print(f"Shape:{data_manager.hpo_shape()}")
data_manager.hpo_head()

Importing HPO2Genes file...
HPO2Genes file imported.
Shape:(4388, 534)


hpo_id,HP:0000011,HP:0000012,HP:0000013,HP:0000046,HP:0000066,HP:0000097,HP:0000110,HP:0000113,HP:0000114,HP:0000155,...,HP:0410005,HP:0410241,HP:0430046,HP:5200005,HP:5200010,HP:5200017,HP:5200018,HP:5200029,HP:5200123,HP:6000531
ncbi_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# NCBI Taxonomy id 9606 = human (the gene2go reader output below reports the same).
humanTaxID = 9606
# Taxonomy ids whose GO annotations should be loaded; only human here.
GO_taxonomies = [humanTaxID]
logger.log("\nImporting GO2Genes file...")
# NOTE(review): reading gene2go took about two minutes in the recorded run
# (see the HMS timing in the output) — consider caching the resulting matrix.
data_manager.importGO2GeneFile(go_ontology_path=GO_ONTOLOGY_PATH, gene2go_path=GENE2GO_PATH, taxids = GO_taxonomies)
logger.log("GO2Genes file imported.")
# Report the size of the loaded matrix, then preview its first rows
# (rows are gene ids, columns are GO ids, as the output below shows).
print(f"Shape:{data_manager.go_shape()}")
# NOTE(review): columns=[] presumably means "no specific column subset" — confirm in DataManager.go_head.
data_manager.go_head(columns = [])


Importing GO2Genes file...
../../preparation/codice/go-basic.obo: fmt(1.2) rel(2024-10-27) 44,017 Terms
**NOTE: DEFAULT TAXID STORED FROM gene2go IS 9606 (human)

HMS:0:01:59.055239 362,883 annotations, 20,819 genes, 18,767 GOs, 1 taxids READ: ../../preparation/codice/gene2go 
20785 IDs in loaded association branch, biological_process
GO2Genes file imported.
Shape:(20785, 18720)


GO,GO:0000002,GO:0000009,GO:0000012,GO:0000014,GO:0000015,GO:0000016,GO:0000017,GO:0000018,GO:0000019,GO:0000022,...,GO:2001288,GO:2001294,GO:2001295,GO:2001299,GO:2001301,GO:2001302,GO:2001303,GO:2001304,GO:2001306,GO:2001311
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Preview the HPO matrix again (same call as at the end of the HPO import cell).
data_manager.hpo_head()

hpo_id,HP:0000011,HP:0000012,HP:0000013,HP:0000046,HP:0000066,HP:0000097,HP:0000110,HP:0000113,HP:0000114,HP:0000155,...,HP:0410005,HP:0410241,HP:0430046,HP:5200005,HP:5200010,HP:5200017,HP:5200018,HP:5200029,HP:5200123,HP:6000531
ncbi_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Statistical Test (Chi2 or Fisher)

### I choose one HPO randomly among the ones selected

In [10]:
# Draw the HPO term with a dedicated, seeded generator so that the same term is
# selected on every "Restart & Run All" and the downstream results are reproducible.
# Change the seed to analyse a different term.
HPO_CHOICE_SEED = 42
random_hpo_column = random.Random(HPO_CHOICE_SEED).choice(list(data_manager.hpo_gene_data.columns))
print(random_hpo_column)

HP:0012736


### I choose some GO terms randomly

In [9]:
# Draw the GO terms with a dedicated, seeded generator so that the same sample is
# produced on every "Restart & Run All".
GO_SAMPLE_SEED = 42
# Upper bound on the number of GO terms to test; capped below by what is available
# so that a smaller GO matrix does not make random.sample raise ValueError.
GO_SAMPLE_SIZE = 1000
go_term_pool = list(data_manager.go_gene_data.columns)
sample_go_columns = random.Random(GO_SAMPLE_SEED).sample(
    go_term_pool, min(GO_SAMPLE_SIZE, len(go_term_pool))
)
# Show only the first few sampled terms.
sample_go_columns[:10]

['GO:0004707',
 'GO:0045112',
 'GO:0000038',
 'GO:0008466',
 'GO:0015203',
 'GO:0072219',
 'GO:0097165',
 'GO:0004074',
 'GO:0060292',
 'GO:0042662']

### Now I compute the significance of the sampled GO terms with respect to the randomly chosen HPO term

I don't apply any multiple-testing correction here, so each p-value is compared directly against the 0.05 threshold.

In [12]:
# Test every sampled GO term against the chosen HPO term with a chi-square test,
# without multiple-testing correction, keeping only the rows flagged as significant.
# Requires data_manager, random_hpo_column and sample_go_columns from the cells above.
sample_results = data_manager.compute_significance(hpo_column=random_hpo_column, go_columns = sample_go_columns,
                              method = "chi2", only_significant=True,
                              correction = None)

NameError: name 'data_manager' is not defined

In [None]:
# Preview the first rows of the significance results computed in the previous cell.
sample_results.head()

In [53]:
# Development helper: pick up edits made to dataManager.py without restarting the kernel.
import importlib
import dataManager  # Import the module normally
# Re-execute the module source so its definitions are refreshed.
importlib.reload(dataManager)
# Re-bind the name to the freshly reloaded class. Objects created before the
# reload (such as an existing data_manager) keep referring to the old class.
from dataManager import DataManager

In [56]:
def compute_p_value(self, go_column, hpo_column, method = "chi2", smoothing = 1):
    """Return the p-value for the association between one GO term and one HPO term.

    A 2x2 contingency table is built over the genes (index labels) that appear
    in both ``self.go_gene_data`` and ``self.hpo_gene_data`` and is passed to
    the requested independence test.

    Parameters
    ----------
    go_column : label of the GO term column in ``self.go_gene_data``.
    hpo_column : label of the HPO term column in ``self.hpo_gene_data``.
    method : str
        ``"chi2"`` (``scipy.stats.chi2_contingency``) or ``"fisher"``
        (``scipy.stats.fisher_exact``); matched case-insensitively.
    smoothing : int
        Pseudo-count added to every cell of the table. The default of 1 is the
        Laplace smoothing this function has always applied; pass 0 to test the
        raw counts.

    Returns
    -------
    float
        The p-value of the selected test.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"chi2"`` nor ``"fisher"``. Also propagated
        from ``chi2_contingency`` when an expected frequency is zero, which can
        happen once ``smoothing`` is 0.
    KeyError
        If either column label is missing from its DataFrame.
    """
    # Validate up front: an unrecognised method used to fall through and
    # silently report p = 1, which is indistinguishable from a real result.
    test_name = method.lower()
    if test_name not in ("chi2", "fisher"):
        raise ValueError(f"Unknown method {method!r}: expected 'chi2' or 'fisher'")

    hpo_series = self.hpo_gene_data[hpo_column]
    go_series = self.go_gene_data[go_column]

    # Keep only the genes present in both matrices so the masks line up.
    hpo_series, go_series = hpo_series.align(go_series, join='inner')

    in_go, in_hpo = go_series == 1, hpo_series == 1
    not_go, not_hpo = go_series == 0, hpo_series == 0

    # Cell counts of the 2x2 table (values other than 0/1 are not counted).
    both = (in_go & in_hpo).sum()
    only_go = (in_go & not_hpo).sum()
    only_hpo = (not_go & in_hpo).sum()
    neither = (not_go & not_hpo).sum()

    # NOTE(review): the pseudo-count keeps every cell non-zero but also inflates
    # small cells; for sparse terms this may affect significance — consider
    # smoothing=0 together with method="fisher".
    contingency_table = [[both + smoothing, only_hpo + smoothing],
                         [only_go + smoothing, neither + smoothing]]

    if test_name == "chi2":
        _, p_value, _, _ = chi2_contingency(contingency_table)
    else:
        _, p_value = fisher_exact(contingency_table)
    return p_value

def compute_significance(self, hpo_column:str, go_columns:list = None,
                              method:str = "chi2", only_significant:bool=True,
                              correction:str = None, alpha:float = 0.05):
    """Test a set of GO terms for association with one HPO term.

    Parameters
    ----------
    hpo_column : str
        Label of the HPO term, forwarded to ``self.compute_p_value``.
    go_columns : iterable of labels, optional
        GO terms to test. ``None`` (default) tests every column of
        ``self.go_gene_data``. Any iterable is accepted, including a generator.
    method : str
        Test name forwarded to ``self.compute_p_value`` (default ``"chi2"``).
    only_significant : bool
        When true, return only the rows flagged as significant.
    correction : str, optional
        Method name passed to ``statsmodels`` ``multipletests``. ``None``
        (default) compares the raw p-values against ``alpha``.
    alpha : float
        Significance level used with and without correction. The default 0.05
        matches the threshold that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``GO_Term``, ``P_Value``, ``Significant`` and, when a
        correction is requested, ``Adjusted_P_Value``. Filtered rows keep
        their original positional index.
    """
    if go_columns is None:
        go_columns = self.go_gene_data.columns
    # Materialise once: the labels are iterated for the tests and again when
    # building the frame, which would exhaust a generator.
    go_columns = list(go_columns)

    p_values = [self.compute_p_value(go_col, hpo_column, method=method) for go_col in go_columns]

    results_df = pd.DataFrame({'GO_Term': go_columns, 'P_Value': p_values})
    # Keep the p-values numeric even when there are no rows at all.
    results_df['P_Value'] = results_df['P_Value'].astype(float)

    if correction is None:
        results_df['Significant'] = results_df['P_Value'] < alpha
    elif results_df.empty:
        # multipletests cannot process an empty array of p-values, so produce
        # the same columns directly for the no-tests case.
        results_df['Adjusted_P_Value'] = results_df['P_Value']
        results_df['Significant'] = results_df['P_Value'] < alpha
    else:
        corrected_results = multipletests(results_df['P_Value'], alpha=alpha, method=correction)
        results_df['Adjusted_P_Value'] = corrected_results[1]  # Corrected p-values
        results_df['Significant'] = corrected_results[0]       # True/False for significance

    # Filter significant GO terms
    if only_significant:
        return results_df[results_df['Significant']]
    return results_df

In [57]:
# Attach the two functions defined in the previous cell to this data_manager
# instance as bound methods, so they can be tried out without editing dataManager.py.
# Only this instance is affected; the DataManager class itself is left unchanged.
data_manager.compute_p_value = types.MethodType(compute_p_value, data_manager)
data_manager.compute_significance = types.MethodType(compute_significance, data_manager)

In [58]:
# Run the notebook-defined implementation on the sampled GO terms: chi-square test,
# no multiple-testing correction, only the rows flagged as significant are shown.
data_manager.compute_significance(hpo_column=random_hpo_column, go_columns = sample_go_columns,
                              method = "chi2", only_significant=True,
                              correction = None)

Unnamed: 0,GO_Term,P_Value,Significant
1,GO:0045112,4.476381e-02,True
2,GO:0000038,1.356866e-09,True
3,GO:0008466,4.476381e-02,True
5,GO:0072219,4.476381e-02,True
7,GO:0004074,4.476381e-02,True
...,...,...,...
993,GO:0008613,1.075391e-02,True
994,GO:0034346,4.476381e-02,True
996,GO:0035306,4.476381e-02,True
997,GO:0006880,1.075391e-02,True
