In [1]:
import pandas as pd

# Import and clean annotation data
We downloaded the data from KEGG and Reactome (using the API from KEGG and the download option in Reactome) to obtain the :
-  Kegg genes ids to gene names
-  Kegg pathway ids to kegg genes ids
-  Uniprot ids to kegg ids
-  Uniprot ids to reactome pathway ids

In [2]:
kegg = pd.read_table("data/kegg.txt", names = ['Pathway','KeggID'])
genes_to_kegg = pd.read_table("data/genes_to_kegg_ID.txt", names = ['KeggID','Genes'])
uniprot_to_kegg = pd.read_table("data/uniprot_to_kegg_ID.txt",names = ['UniprotID','KeggID'])
uniprot_to_reactome = pd.read_table("data/UniProt2Reactome_All_Levels.txt.gz", names = ['UniprotID','Pathway_Reactome','url',
                                                                         'Name','Code','Species'])
kegg['Pathway'] = kegg.Pathway.str.slice(5) # delete 'path:'
uniprot_to_kegg['UniprotID'] = uniprot_to_kegg.UniprotID.str.slice(3) # delete 'up:'
uniprot_to_reactome = uniprot_to_reactome[uniprot_to_reactome.Species == 'Homo sapiens']

We merge all the annotations together to create one table with all pathways informations.

In [3]:
all_genes_to_kegg = pd.merge(genes_to_kegg,kegg, how='left', on = ['KeggID','KeggID'])
all_genes_to_kegg = pd.merge(all_genes_to_kegg,uniprot_to_kegg,how = 'left', on=['KeggID','KeggID'])
all_genes_to_kegg = pd.merge(all_genes_to_kegg,uniprot_to_reactome, how = 'left', on=['UniprotID','UniprotID'])
all_genes_to_kegg = all_genes_to_kegg[['KeggID','Genes','Pathway','Pathway_Reactome','Name']]
all_genes_to_kegg = all_genes_to_kegg.fillna(0) #to check if the pathway is avalaible

We create a dictionary to obtain one gene - all pathways information.

In [4]:
pathways = {}
for index, row in all_genes_to_kegg.iterrows():
    keys = row['Genes'].split(';')[0].split(',')
    if row['Pathway'] != 0:
        for key in keys:
            if key not in pathways:
                pathways[key] = [row['Pathway']]
            else:
                pathways[key].append(row['Pathway'])
    if row['Pathway_Reactome'] != 0:
        for key in keys:
            if key not in pathways:
                pathways[key] = [row['Pathway_Reactome']]
            else:
                pathways[key].append(row['Pathway_Reactome'])
pathways

{'DEFB103A': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157'],
 ' BD-3': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157',
  'R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157'],
 ' DEFB-3': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157',
  'R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157'],
 ' DEFB103': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157',
  'R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157'],
 ' DEFB3': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157',
  'R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157'],
 ' HBD3': ['R-HSA-1461957',
  'R-HSA-1461973',
  'R-HSA-168249',
  'R-HSA-168256',
  'R-HSA-6803157',
  

# Annotate the combinations
With the file, we just check if we have an intersection of the pathways for each gene pair.

In [5]:
file = pd.read_csv("data/posey_to_predict.csv", header = 0)
res = []
for index, row in file.iterrows():
    gene1,gene2 = row['GenePair'].split('/')
    try:
        if len(set(pathways[gene1]) & set(pathways[gene2])) > 0:
            res.append(1)
        else:
            res.append(0)
    # if not present in the dictionary
    except Exception as e:
        print(e)
        res.append(0)

'ENG'
'MYO1F'
'SOX10'
'SCN1A'
'ANKRD11'
'NF1'
'SMC1A'
'SETBP1'
'ACTG1'
'KIF5C'
'NF1'
'TGFB2'
'ECEL1'
'BRWD3'
'SMC1A'
'SCN1A'
'ATRX'
'AGL'
'MCCC2'
'TPO'
'AGL'
'MECP2'


In [6]:
file['Path'] = res
file

Unnamed: 0,CADD1,CADD2,RecA,EssA,CADD3,CADD4,RecB,EssB,Path,GenePair,Type
0,2.819812,0.0,0.766452,1.0,11.366191,0.0,,,0,ENG/ASXL3,Distinct
1,3.705321,0.0,0.173316,0.0,7.08488,0.0,0.559057,1.0,1,SLC26A1/CACNA1A,Overlapping
2,3.713686,0.0,0.200792,0.0,4.532255,0.0,0.113767,1.0,0,MYO1F/FBN1,Distinct
3,5.658066,0.0,0.450519,1.0,1.320816,0.0,0.213879,1.0,0,SOX10/GDF6,Overlapping
4,4.297717,0.0,0.214085,1.0,6.597932,0.0,0.516366,1.0,0,SCN1A/SMARCA2,Overlapping
5,7.134717,0.0,0.239843,0.0,8.776734,0.0,0.107361,1.0,0,SLC6A1/ANKRD11,Overlapping
6,3.04257,0.0,0.835765,1.0,3.989005,0.0,0.908457,1.0,0,SOX9/NF1,Distinct
7,10.77899,0.0,0.250815,0.0,2.540059,0.0,0.231156,,0,SMC1A/MYH2,Distinct
8,10.137813,7.155611,,1.0,6.078869,0.0,0.299625,,0,CLCN1/SETBP1,Distinct
9,0.782101,0.0,,1.0,4.908936,-0.032748,0.109796,1.0,0,ACTG1/WFS1,Overlapping


In [7]:
file.to_csv('posey_to_predict.csv',index=False, na_rep='N/A')