In [1]:
import sys
import os
from collections import Counter
from pathlib import Path
import pickle
import pandas as pd
import pybiopax
import csv
import networkx as nx
import mygene

In [2]:
# Read the PathwayCommons SIF export. Every line is stripped of surrounding
# whitespace and split on tabs, so each entry of `ptc_sif` is a list of fields.
with open("../data/raw/PathwayCommons12.All.hgnc.sif",'r') as ptc_file:
    ptc_sif = []
    for line in ptc_file:
        ptc_sif.append(line.strip().split('\t'))

In [3]:
# Gather the names found in fields 0 and 2 of every SIF row, leaving out any
# name that contains 'CHEBI'. Duplicates are kept at this stage.
ptc_genes = [
    row[column]
    for row in ptc_sif
    for column in (0, 2)
    if 'CHEBI' not in row[column]
]

In [4]:
# Collapse repeated names so each one appears once.
ptc_genes = {name for name in ptc_genes}

In [5]:
# Display the collected names. This renders the whole set; `len(ptc_genes)`
# gives a compact size check instead.
ptc_genes

{'NEXMIF',
 'AOX1',
 'PTPN5',
 'LLGL1',
 'IGHV4-34',
 'SPAM1',
 'CLUH',
 'CEACAM20',
 'NLGN4X',
 'CACNG3',
 'PKHD1L1',
 'GSK3B',
 'PDK1',
 'HOOK1',
 'PRUNE2',
 'MSLN',
 'BICRA',
 'OR2T29',
 'REV3L',
 'ACTR8',
 'CKAP4',
 'NBR2',
 'HSP90AB3P',
 'ABCC2',
 'SELENOO',
 'RPP21',
 'PRKRIP1',
 'PIMREG',
 'POLR2E',
 'PBX4',
 'ZNF629',
 'GLA',
 'MT-ATP6',
 'MTBP',
 'FAM3D',
 'JAKMIP3',
 'ZNF330',
 'MYORG',
 'DALRD3',
 'ACTL9',
 'H2AFY',
 'FAU',
 'FGD1',
 'C3orf18',
 'AKR1C1',
 'OR4D9',
 'B4GALT6',
 'EYA2',
 'KIAA0408',
 'CPSF4L',
 'NACA2',
 'ZNF625',
 'RILPL1',
 'PLPP2',
 'SLIRP',
 'TRAM1L1',
 'ZBTB4',
 'PSIP1',
 'FAM71D',
 'FAM83E',
 'OR4K13',
 'CYP20A1',
 'DEFB109B',
 'ATP23',
 'CHUK',
 'GPR87',
 'ADAM8',
 'PPP1R16A',
 'SLC25A41',
 'GAS2',
 'NSUN5',
 'ZSCAN5DP',
 'RTL8A',
 'ERI1',
 'BRI3BP',
 'MUC15',
 'RPS26',
 'IL3RA',
 'PROCR',
 'RALGAPA1',
 'CCDC84',
 'SSC5D',
 'VEZF1',
 'MYT1',
 'STX4',
 'ZCCHC2',
 'CEP85L',
 'SUMO1',
 'PRR23A',
 'LIPT2',
 'DCAF13',
 'KCNJ5',
 'TRPV6',
 'HUNK',
 'OSM',
 '

In [6]:
# Query MyGene.info for the Entrez gene ID of every collected name, matching
# against both the `symbol` and `alias` scopes and restricting to human.
# With returnall=True the result is a dict; later cells read the hits from
# mg_hgnc['out'].
mg = mygene.MyGeneInfo()
mg_hgnc = mg.querymany(
    list(ptc_genes),
    scopes='symbol,alias',
    fields='entrezgene',
    species='human',
    returnall=True,
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-19572...done.
Finished.
1443 input query terms found dup hits:
	[('IGHV4-34', 3), ('PDK1', 2), ('HSP90AB3P', 2), ('GLA', 2), ('ZSCAN5DP', 2), ('RPS26', 3), ('MYT1',
15 input query terms found no hit:
	['MT-ATP6', 'MT-ND3', 'MT-ND5', 'MT-CYB', 'MT-ND6', 'MT-ND4', 'SPHAR', 'MT-ND2', 'MT-CO1', 'ZNF705F'


In [27]:
# Count, per query term, how many returned hits carry an `entrezgene` value.
entrez_counts = Counter(
    hit.get('query')
    for hit in mg_hgnc['out']
    if hit.get('entrezgene')
)

In [21]:
# Map every queried name to a single Entrez gene ID.
#
# One query term can produce several hits in mg_hgnc['out']. For each query
# term we keep the hit with the highest `_score`; when scores tie, the hit that
# appears first in the response wins. Hits without an `entrezgene` value are
# ignored, and a hit that lacks `_score` is ranked below any scored hit.
#
# The response is scanned exactly once, so the work is linear in the number of
# hits instead of rescanning the full hit list for every query term. The
# mapping is built directly from mg_hgnc['out'] and needs no other helper
# variable.
symbol2entrez = {}
best_score = {}
for result in mg_hgnc['out']:
    entrez_id = result.get('entrezgene')
    if not entrez_id:
        continue
    gene = result.get('query')
    score = result.get('_score', float('-inf'))
    # Strict '>' keeps the earliest hit when two hits share the top score.
    if gene not in symbol2entrez or score > best_score[gene]:
        symbol2entrez[gene] = entrez_id
        best_score[gene] = score
        

In [41]:
# Spot-check the mapping for a single symbol.
symbol2entrez['GIF']

'2694'

In [30]:
# Build an undirected multigraph from the SIF rows.
#   * Fields 0 and 2 of a row are the two endpoints; field 1 becomes the edge key.
#   * An endpoint containing 'CHEBI' is added under its own name with label 'chebi'.
#   * Any other endpoint is added under its integer Entrez ID with label 'gene',
#     but only when it has an entry in `symbol2entrez`.
#   * An edge is added only when both endpoints were resolved; a resolved
#     endpoint is still added as a node when its partner is not.
ptc_graph = nx.MultiGraph()
for row in ptc_sif:
    endpoints = []
    for position in (0, 2):
        name = row[position]
        if 'CHEBI' in name:
            node_id, kind = name, 'chebi'
        elif name in symbol2entrez:
            node_id, kind = int(symbol2entrez[name]), 'gene'
        else:
            # No Entrez mapping for this name: leave the endpoint out.
            continue
        ptc_graph.add_node(node_id, label=kind)
        endpoints.append(node_id)

    if len(endpoints) == 2:
        first, second = endpoints
        ptc_graph.add_edge(first, second, key=row[1])

    

In [31]:
# Node and edge counts of the PathwayCommons graph, shown as a pair.
ptc_graph.number_of_nodes(), ptc_graph.number_of_edges()

(30872, 1829702)

In [32]:
# Materialise the connected components of the graph as a list of node sets.
components = list(nx.connected_components(ptc_graph))

In [33]:
# Display every connected component. This renders all node IDs;
# `len(components)` gives a compact summary instead.
components

[{1,
  2,
  131076,
  'CHEBI:44897',
  'CHEBI:7865',
  'CHEBI:8687',
  9,
  10,
  'CHEBI:92624',
  12,
  13,
  14,
  15,
  16,
  'CHEBI:71253',
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  'CHEBI:57880',
  39,
  40,
  38,
  41,
  43,
  'CHEBI:57733',
  131118,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  58,
  59,
  60,
  'CHEBI:41808',
  'CHEBI:5812',
  'CHEBI:59087',
  70,
  71,
  72,
  'CHEBI:6359',
  131149,
  'CHEBI:27832',
  81,
  'CHEBI:57688',
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  97,
  98,
  100,
  101,
  102,
  103,
  104,
  131177,
  105,
  107,
  108,
  109,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  'CHEBI:80144',
  123,
  124,
  125,
  126,
  127,
  128,
  'CHEBI:28842',
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  140,
  141,
  142,
  143,
  146,
  147,
  148,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158

In [34]:
# Load the previously pickled disease ontology graph.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
with Path('../data/processed/disease_ontograph.pkl').open('rb') as handle:
    disease_ontograph = pickle.load(handle)

In [35]:
# Merge the disease ontology graph and the PathwayCommons graph into one
# graph. Nodes present in both inputs appear once in the result.
disease_with_ptc = nx.compose_all([disease_ontograph,ptc_graph])

In [36]:
# Node count of the merged graph.
disease_with_ptc.number_of_nodes()

103875

In [37]:
# Combined node count of the two input graphs, for comparison with the merged
# graph (the gap is the number of nodes the inputs share).
sum(graph.number_of_nodes() for graph in (ptc_graph, disease_ontograph))

107000

In [38]:
# Collect, for each graph, the nodes whose 'label' attribute equals 'gene'
# (nodes without that attribute are skipped), in node-iteration order.
disease_genes = [
    node
    for node, label in disease_ontograph.nodes(data='label')
    if label == 'gene'
]
ptc_graph_genes = [
    node
    for node, label in ptc_graph.nodes(data='label')
    if label == 'gene'
]

In [40]:
# Number of gene nodes in the disease graph that are absent from the
# PathwayCommons graph.
len(set(disease_genes) - set(ptc_graph_genes))

55