In [18]:
import pandas as pd

# Load the provided TSV files for a list of genes and a summary for each one
file_path_summary = '../gene_desease_associations/C0497327_disease_gda_summary.tsv'
file_path_evidences = '../gene_desease_associations/C0497327_disease_gda_evidences.tsv'

# Reading the TSV files
genes_summary = pd.read_csv(file_path_summary, sep='\t')
genes_evidences = pd.read_csv(file_path_evidences, sep='\t')

genes_summary = pd.DataFrame(genes_summary)
genes_evidences = pd.DataFrame(genes_evidences)

In [19]:
genes_evidences.head()

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,Score_gda,Association_Type,Type,Original_DB,Sentence,PMID,PMID_Year
0,Dementia,C0497327,IGFALS,3483,0.1,Biomarker,,BEFREE,Attention is drawn to the similarities between...,7431026.0,1980.0
1,Dementia,C0497327,PRNP,5621,0.5,GeneticVariation,,BEFREE,"In contrast, a recent case with proven <span c...",8520719.0,1995.0
2,Dementia,C0497327,APOE,348,0.2,GeneticVariation,,BEFREE,"In multivariate models adjusted for age, educa...",20625087.0,2010.0
3,Dementia,C0497327,ITM2B,9445,0.2,Biomarker,,BEFREE,"In a British family, mutation of the terminati...",16246057.0,2005.0
4,Dementia,C0497327,APOE,348,0.2,GeneticVariation,,BEFREE,Association of the epsilon 4 allele of <span c...,9086316.0,1996.0


In [20]:
unique_gene_counts = genes_evidences['Association_Type'].value_counts()
print(unique_gene_counts)

Association_Type
Biomarker                        1545
GeneticVariation                 1224
AlteredExpression                 242
PosttranslationalModification       6
CausalMutation                      4
SusceptibilityMutation              1
Name: count, dtype: int64


In [21]:
def find_max_score_gda(group):
    return group.loc[group['Score_gda'].idxmax()]

# Group by 'Association_type' and apply the function to each group
max_score_per_association = genes_evidences.groupby('Association_Type').apply(find_max_score_gda)

# Select only the required columns
max_score_per_association = max_score_per_association[['Gene', 'Association_Type', 'Score_gda', 'Sentence']]

# Create Data Frame
genes = pd.DataFrame(max_score_per_association)
genes

Unnamed: 0_level_0,Gene,Association_Type,Score_gda,Sentence
Association_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AlteredExpression,TREM2,AlteredExpression,0.7,"The selective expression of <span class=""gene""..."
Biomarker,APP,Biomarker,0.7,Also problematic is the alternative hypothesis...
CausalMutation,MAPT,CausalMutation,0.5,
GeneticVariation,APP,GeneticVariation,0.7,The most common familial early onset <span cla...
PosttranslationalModification,BDNF,PosttranslationalModification,0.1,Multivariable logistic regression analyses det...
SusceptibilityMutation,GBA,SusceptibilityMutation,0.16,


In [52]:
# Filter rows
filtered_genes = genes_evidences[genes_evidences['Score_gda'] == 0.7]

# Sort the DataFrame by 'Score_gda' in descending order
sorted_genes = genes_evidences.sort_values(by='Score_gda', ascending=False)

# Drop duplicates in 'Gene' column to ensure different genes
unique_genes = sorted_genes.drop_duplicates(subset=['Gene'])

# Get the first 5 rows
unique_genes.head(5)

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,Score_gda,Association_Type,Type,Original_DB,Sentence,PMID,PMID_Year
1895,Dementia,C0497327,APP,351,0.7,GeneticVariation,,BEFREE,"We found a novel <span class=""gene"" id=""227279...",22727994.0,2012.0
625,Dementia,C0497327,TREM2,54209,0.7,Biomarker,,HPO,,,
1,Dementia,C0497327,PRNP,5621,0.5,GeneticVariation,,BEFREE,"In contrast, a recent case with proven <span c...",8520719.0,1995.0
1516,Dementia,C0497327,MAPT,4137,0.5,Biomarker,,BEFREE,Frontotemporal dementia and parkinsonism linke...,20178834.0,2010.0
1616,Dementia,C0497327,GRN,2896,0.5,GeneticVariation,,BEFREE,"Heterozygous mutations in <span class=""gene"" i...",22608501.0,2012.0


In [53]:
gene_names = ['TREM2', 'APP', 'PRNP', 'MAPT', 'GRN']

sentences = genes_evidences[genes_evidences['Gene'].isin(gene_names)]

sentences = pd.DataFrame(sentences)
sentences

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,Score_gda,Association_Type,Type,Original_DB,Sentence,PMID,PMID_Year
1,Dementia,C0497327,PRNP,5621,0.5,GeneticVariation,,BEFREE,"In contrast, a recent case with proven <span c...",8520719.0,1995.0
17,Dementia,C0497327,MAPT,4137,0.5,Biomarker,,BEFREE,"<span class=""disease"" id=""27792010-1-0-8"">Deme...",27792010.0,2017.0
18,Dementia,C0497327,MAPT,4137,0.5,Biomarker,,BEFREE,"The transition of <span class=""gene"" id=""31456...",31456657.0,2019.0
20,Dementia,C0497327,APP,351,0.7,GeneticVariation,,BEFREE,The most common familial early onset <span cla...,15258222.0,2004.0
22,Dementia,C0497327,TREM2,54209,0.7,AlteredExpression,,BEFREE,"The selective expression of <span class=""gene""...",26694609.0,2016.0
...,...,...,...,...,...,...,...,...,...,...,...
2985,Dementia,C0497327,APP,351,0.7,Biomarker,,BEFREE,"These results suggest that s<span class=""gene""...",22170863.0,2012.0
2987,Dementia,C0497327,APP,351,0.7,GeneticVariation,,BEFREE,Overexpression of DSCAM in Down syndrome (DS) ...,21241773.0,2011.0
3002,Dementia,C0497327,MAPT,4137,0.5,GeneticVariation,,LHGDN,Novel G335V mutation in the tau gene associate...,15765246.0,2005.0
3006,Dementia,C0497327,APP,351,0.7,Biomarker,,LHGDN,Association between progranulin and beta-amylo...,18955727.0,2009.0


In [56]:
# Extract rows where the 'Gene' column matches any of the genes in genes_of_interest
summaries = genes_summary[genes_summary['Gene'].isin(gene_names)]
summaries

Unnamed: 0,Disease,Disease_id,Gene,Gene_id,UniProt,Gene_Full_Name,Protein_Class,N_diseases_g,DSI_g,DPI_g,pLI,Score_gda,EL_gda,EI_gda,N_PMIDs,N_SNPs_gda,First_Ref,Last_Ref
0,Dementia,C0497327,TREM2,54209,Q9NZC2,triggering receptor expressed on myeloid cells 2,,239,0.519,0.769,3.2892e-09,0.7,strong,1.0,19,5,2005.0,2020.0
1,Dementia,C0497327,APP,351,P05067,amyloid beta precursor protein,Enzyme modulator,485,0.422,0.846,0.046544,0.7,strong,0.964,83,10,1993.0,2020.0
3,Dementia,C0497327,GRN,2896,P28799,granulin precursor,,412,0.435,0.846,0.069666,0.5,,1.0,46,4,2006.0,2019.0
4,Dementia,C0497327,PRNP,5621,F7VJQ1;P04156,prion protein,,426,0.445,0.923,0.00063182,0.5,strong,0.958,48,4,1991.0,2019.0
5,Dementia,C0497327,MAPT,4137,P10636,microtubule associated protein tau,,469,0.446,0.923,0.0060258,0.5,,0.992,127,13,1995.0,2020.0
