# PhosphoELM Data Formating

This file takes data regarding kinase-protein interactions from the PhosphoELM database and converts the data into the .gmt format. The data was retrieved from the PhosphoELM database on Wed, Jun 7 2017 16:27:31. This data will be added to enhance the KEA2 database and will be suitably formatted for use by ENRICHR and X2K.

In [2]:
import numpy as np
import pandas as pd
import xmltodict
import json
import requests

In [3]:
#read data from excel file into dataframe 'phospho_df'
phospho_df = pd.read_excel('~/Desktop/phosphoELM_all_2015-04.xlsm')

Unnamed: 0,acc,sequence,position,code,pmids,kinases,source,species,entry_date
0,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,304,S,17114649,,HTP,Mus musculus,2005-03-14 12:16:11.108314+01
1,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,304,S,17242355,,HTP,Mus musculus,2005-03-14 12:16:11.108314+01
2,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,304,S,15345747,,HTP,Mus musculus,2005-03-14 12:16:11.108314+01
3,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,296,S,17114649,,HTP,Mus musculus,2007-07-13 15:17:45.666219+02
4,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,296,S,17242355,,HTP,Mus musculus,2007-07-13 15:17:45.666219+02
5,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,298,S,17114649,,HTP,Mus musculus,2005-03-14 12:16:10.997726+01
6,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,298,S,17242355,,HTP,Mus musculus,2005-03-14 12:16:10.997726+01
7,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,298,S,15345747,,HTP,Mus musculus,2005-03-14 12:16:10.997726+01
8,O08539,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,308,T,17242355,,HTP,Mus musculus,2007-07-13 15:17:45.732447+02
9,O08605,MVSSQKLEKPIEMGSSEPLPIVDSDKRRKKKRKTRATDSLPGKFED...,39,S,15234964,PAK2,LTP,Mus musculus,2005-11-21 09:43:56.407579+01


# This is a title
## This is a subtitle

hello you can write in **bold**, *italic*

lists:
- asd
- asd
- asd


numbered lists:
1. asd
2. 234
3. 46

In [4]:
#select columns necessary for .gmt format and filter into new dataframe 'df'
df = phospho_df[['acc', 'kinases']]

#drop all columns with an 'NaN' value for the kinases
df.dropna(axis = 0, inplace = True)

#drop duplicate rows in the dataframe
df.drop_duplicates(inplace = True)

#set index of protein values 'acc' as kinases
#creates new dataframe 'kin'
kin = df.set_index('kinases')

#Create dictionary 'PhosphoELM' with kinases as keys
PhosphoELM = dict([(key, []) for key in kin.index])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
#Define url to obtain gene symbol from API
ENRICHR_URL = 'https://www.ebi.ac.uk/proteins/api/proteins/%s'

#Define function uniprot_to_gene which converts uniprot_id into the gene symbol
def uniprot_to_gene(protein_id):
    response = requests.get(ENRICHR_URL % protein_id)
    if not response.ok:
        name = np.NaN
    else:
        data = xmltodict.parse(response.text)
        entry = data['entry']
        # check if entry contains 'gene'
        if 'gene' in entry.keys():
            data = data['entry']['gene']
            if type(data) == list:
                name = str(protein_id)
            else:
                names = data['name']
                if type(names) == list:
                    name = list(names[0].values())[1]
                else:
                    name = list(names.values())[1]
        else:
            name = data['entry']['name']         
    return name

acc = pd.Series(kin.acc[:])
    
for key, row in acc.iteritems():
    
    protein_id = '%s' %row
    geneS = uniprot_to_gene(protein_id)
    PhosphoELM[key] = PhosphoELM[key] + [geneS]
    print(key)
    

In [9]:
#Using dictionary, re-create dataframe 'kin' with gene symbols rather than gene accession numbers
kin = pd.DataFrame.from_dict(PhosphoELM, orient = 'index')

#Look at format of newly created kin dataframe
kin.head()

#Group kinases in dataframe 'kin'
#Aggregate data in 'kin' according to kinase groups
kin = kin.groupby('kinases').agg(lambda x: tuple(x))

#Create a new column 'PhosphoELM' as description of data
kin.insert(0, 'Description', 'PhosphoELM')

2543


In [None]:
### NOTES to self

#look into plotly

In [None]:
# fix the dataframe in order to have three columns:
# kinases, description, acc_merged (acc, but all elements are joined by a \t symbol)
# with a reset index

#reset index of the datframe to integers, restores column 'kinases'
kin.reset_index(inplace = True)

#create column 'acc_merged' in which all 'acc' elements are joined by a \t symbol
kin['acc_merged'] = ['\t'.join(x) for x in kin['acc']]

acc = pd.Series(kin['acc'])

acc[4]

#drop the now-unneccesary column 'acc'
kin.drop('acc', axis=1, inplace = True)

#Create dictionary 'PhosphoELM' with kinases as keys
PhosphoELM = dict([(key, []) for key in kin.index])

# loop through rows with iterrows()
for index, rowData in kin.iterrows():
    line = ['\t'.join(rowData)]
    PhosphoELM[index] = line

In [None]:
#Transfer tsv info into a new txt file
with open('PhosphoELM.txt', 'w') as openfile:
    for index in PhosphoELM:
        openfile.write(str(PhosphoELM[index]) + '\n')