# Code to convert Gene Lists from PathVisio into files ready for CyTargetLinker Linkset creation

## Required imports

In [1]:
import os
from os import listdir
import requests
import json

## Set directory location of pathway files

In [2]:
location = 'pathwaysforlinkset/'

## Step 1: find and list files

It is expected that all files are stored with format [WP-ID].txt e.g. `WP5096.txt`

In [3]:
files = listdir(location)
print("There are "+str(len(files))+ " Pathways in the folder: \n"+str(files)) 

There are 8 Pathways in the folder: 
['WP5090.txt', 'WP5089.txt', 'WP5095.txt', 'WP5096.txt', 'WP5088.txt', 'WP5083.txt', 'WP5036.txt', 'WP5087.txt']


## Step 2: read all files and GeneProduct nodes in dictionaries

### Create dictionary

In [4]:
d = {}
dbs = set()

### Fill dictionary

In [5]:
for file in files:
    d[file[:-4]] = set()
    f = open(location+file, 'r')
    for line in f:
        d[file[:-4]].add(line[:-1])
        dbs.add(line.split('\t')[1][:-1])
    print(str(file) + " has " + str(len(d[file[:-4]])) + " DataNodes")
    f.close()

WP5090.txt has 138 DataNodes
WP5089.txt has 10 DataNodes
WP5095.txt has 134 DataNodes
WP5096.txt has 29 DataNodes
WP5088.txt has 41 DataNodes
WP5083.txt has 178 DataNodes
WP5036.txt has 37 DataNodes
WP5087.txt has 113 DataNodes


In [6]:
print("All databases in the set: \n" + str(dbs))

All databases in the set: 
{'Entrez Gene', 'Wikidata', 'Database', 'WikiPathways', 'Uniprot-TrEMBL', 'NCBI Protein', 'Reactome', 'HMDB', 'Enzyme Nomenclature', 'Ensembl', 'CAS', 'HGNC', 'KEGG Genes', 'ChEBI'}


## Step 3: ID mapping

In [7]:
bridgedb = 'https://webservice.bridgedb.org/Human/'

In [8]:
syscodes = {'Ensembl':'En',
            'Entrez Gene':'L',
            'HGNC':'H',
            'KEGG Genes':'Kg',
            'NCBI Protein':'Np',
            'Reactome':'Re',
            'Uniprot-TrEMBL':'S',
            'Wikidata':'Wd'}

### Map all IDs to Entrez Gene IDs

In [9]:
def idtoentrez (id,db):
    a = requests.get(bridgedb + 'xrefs/' + db + '/' + str(id) + '?dataSource=L').text
    if 'html' not in a:
        return a.split('\t')[0]

In [10]:
identrez = {}
for wp in d:
    identrez[wp] = {}
    nomappings = 0
    mappings = 0
    for item in d[wp]:
        entrez = idtoentrez(item.split('\t')[0],item.split('\t')[1])
        if not entrez == '':
            identrez[wp][item] = entrez
            mappings += 1
        else:
            nomappings += 1
    print("For " + str(wp) + " there are " + str(mappings) + " mappings and " + str(nomappings) + " IDs were not mapped")

For WP5090 there are 107 mappings and 31 IDs were not mapped
For WP5089 there are 8 mappings and 2 IDs were not mapped
For WP5095 there are 130 mappings and 4 IDs were not mapped
For WP5096 there are 29 mappings and 0 IDs were not mapped
For WP5088 there are 35 mappings and 6 IDs were not mapped
For WP5083 there are 141 mappings and 37 IDs were not mapped
For WP5036 there are 29 mappings and 8 IDs were not mapped
For WP5087 there are 110 mappings and 3 IDs were not mapped


### Add all Gene names

In [11]:
def entreztohgnc (entrez):
    a = requests.get(bridgedb + 'xrefs/L/' + str(entrez) + '?dataSource=H').text
    if 'html' not in a:
        return a.split('\t')[0]

In [12]:
def wpidtotitle (wp):
    a = requests.get('https://webservice.wikipathways.org/getPathwayInfo?pwId='+wp+'&format=json').text
    title = a.split(',')[2].split(':')[1][1:-1]
    if 'html' not in a:
        return title

In [16]:
g = open('idfile.txt','w')
g.write('PathwayName\tPathwayID\tGeneName\tGeneID\n')

38

In [17]:
for wp in identrez:
    title = wpidtotitle(wp)
    for item in identrez[wp]:
        name = entreztohgnc(identrez[wp][item])
        if identrez[wp][item] is not None:
            g.write(str(title)
                    + '\t' + str(wp)
                    +'\t'+str(name)+'\t'+str(identrez[wp][item])
                    +'\n')

In [18]:
g.close()