# PhosphoSite Data Formatting

This file takes data regarding kinase-protein interactions from the PhosphoSite database and converts the data into the .gmt format. The data was retrieved from the PhosphoSite database on Wed, Jun 14 2017 12:10:56. This data will be added to enhance the KEA2 database and will be suitably formatted for use by ENRICHR and X2K.

## Import the packages required by the rest of this notebook

In [21]:
%run /home/maayanlab/Projects/Scripts/init.ipy

## Create a dataframe from a file containing PhosphoSite data

In [22]:
# Define the column names for the dataframe, in the same order as the columns
# of the original dataset. The spreadsheet is read with header=None, so these
# names are assigned to its columns by position.
colnames = ['Gene', 'Kinase', 'Kin_Acc_ID', 'Kin_organism', 'Substrate',
            'Sub_gene_ID', 'sub_Acc_ID', 'Sub_gene', 'Sub_organism',
            'sub_mod_rsd', 'site_grp_id', 'site_+/-7_AA', 'Domain',
            'in_vivo_rxn', 'in_vitro_rxn', 'CST_Cat']

# Read the data from the Excel file into the dataframe 'site_df'
site_df = pd.read_excel('~/Desktop/Kinase_Substrate_Dataset.xlsm',
                        header=None, names=colnames)

# View the first 50 rows of the dataframe
site_df.head(50)

Unnamed: 0,Gene,Kinase,Kin_Acc_ID,Kin_organism,Substrate,Sub_gene_ID,sub_Acc_ID,Sub_gene,Sub_organism,sub_mod_rsd,site_grp_id,site_+/-7_AA,Domain,in_vivo_rxn,in_vitro_rxn,CST_Cat
0,PRKCD,PKCD,Q05655,human,S6,6194.0,P62753,RPS6,human,S236,448093,AKRRRLssLRAstsK,,X,,5006; 4856; 4857; 4858; 4851; 4803; 2211; 3945...
1,PRKCD,PKCD,Q05655,human,RPS3,6188.0,P23396,RPS3,human,T221,452652,kDEILPttPIsEQkG,,X,X,
2,PRKCD,PKCD,Q05655,human,CTNNB1,1499.0,P35222,CTNNB1,human,S715,25288326,GYRQDDPsyRsFHsG,,X,X,
3,PRKCD,PKCD,Q05655,human,PRKD1,5587.0,Q15139,PRKD1,human,S412,20615903,PLMRVVQsVKHTKRK,,X,,
4,PRKCD,PKCD,Q05655,human,IRS1,3667.0,P35568,IRS1,human,S307,450190,TRRsRtEsItAtsPA,,,X,2384; 2491
5,PRKCD,PKCD,Q05655,human,TNNT2,286816.0,P13789,TNNT2,cow,S195,451755,KAQtERKsGKRQtER,Troponin,,X,
6,PRKCD,PKCD,Q05655,human,CXCR4,7852.0,P61073,CXCR4,human,S324,6140803,LtsVsRGssLkILsk,,X,,
7,PRKCD,PKCD,Q05655,human,GSK3A,2931.0,P49840,GSK3A,human,S21,448584,sGrARtssFAEPGGG,,,X,9337; 9316; 9331; 5090; 9327; 8452; 8566
8,PRKCD,PKCD,Q05655,human,TNNI3,7137.0,P19429,TNNI3,human,S24,448424,APIRRRssNyRAyAt,Troponin-I_N,,X,4004
9,PRKCD,PKCD,Q05655,human,PPP1R14B,26472.0,Q96C90,PPP1R14B,human,T57,1904503,VRRQGKVtVkYDRKE,PP1_inhibitor,,X,


## Filter by columns necessary for .GMT file format

In [45]:
# Keep only the columns needed downstream: kinase accession ID,
# kinase organism, and substrate accession ID
gmt_columns = ['Kin_Acc_ID', 'Kin_organism', 'sub_Acc_ID']
df = site_df[gmt_columns]

# Preview the first rows of the reduced dataframe
df.head()

Unnamed: 0,Kin_Acc_ID,Kin_organism,sub_Acc_ID
0,Q05655,human,P62753
1,Q05655,human,P23396
2,Q05655,human,P35222
3,Q05655,human,Q15139
4,Q05655,human,P35568


## Filter dataframe by Organism
Keeps only the rows in which the kinase and its substrate come from the same organism, and restricts the data to human and mouse entries.

In [46]:
# Select rows where kinase and substrate are of the same organism and that
# organism is human or mouse. The mask is built from site_df because df does
# not carry the 'Sub_organism' column; df was sliced from site_df, so both
# frames share the same index labels.
same_organism = site_df['Kin_organism'] == site_df['Sub_organism']
human_or_mouse = site_df['Kin_organism'].isin(['human', 'mouse'])
indices = site_df.index[same_organism & human_or_mouse].tolist()

# Filter the dataframe by organism. The explicit .copy() makes the result an
# independent frame, so the column assignment below is not a chained write
# into a temporary object.
site_df_filtered = df.loc[indices].copy()

# Replace 'human' and 'mouse' with 'Homo sapiens' and 'Mus musculus' for
# the 'Kin_organism' column. The result is assigned back instead of using
# inplace=True on an attribute-accessed column, which does not reliably
# update the parent dataframe.
site_df_filtered['Kin_organism'] = site_df_filtered['Kin_organism'].replace(
    {'human': 'Homo sapiens', 'mouse': 'Mus musculus'})

# View dataframe
site_df_filtered

Unnamed: 0,Kin_Acc_ID,Kin_organism,sub_Acc_ID
0,Q05655,Homo sapiens,P62753
1,Q05655,Homo sapiens,P23396
2,Q05655,Homo sapiens,P35222
3,Q05655,Homo sapiens,Q15139
4,Q05655,Homo sapiens,P35568
6,Q05655,Homo sapiens,P61073
7,Q05655,Homo sapiens,P49840
8,Q05655,Homo sapiens,P19429
9,Q05655,Homo sapiens,Q96C90
10,Q05655,Homo sapiens,P30273


In [24]:
#site_df.groupby(['Kin_organism', 'Sub_organism']).size().to_frame()