# Getting protein features from UniProt

In [1]:
import pandas as pd
import os

In [2]:
# Folder holding the input CSV files, relative to the notebook's working directory.
DATA_DIR = './dataset'

# Announce the step, then read the protein table and preview its first rows.
print("\n... Loading Train Proteins Data From CSV File ...\n\n")
train_proteins = pd.read_csv(
    os.path.join(DATA_DIR, "train_proteins.csv")
)
display(train_proteins.head())


... Loading Train Proteins Data From CSV File ...




Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [3]:
# Distinct UniProt accessions in the training table, in order of first appearance.
protein_list = train_proteins["UniProt"].unique()
protein_list

array(['O00391', 'O00533', 'O00584', 'O14498', 'O14773', 'O14791',
       'O15240', 'O15394', 'O43505', 'O60888', 'O75144', 'O75326',
       'O94919', 'P00441', 'P00450', 'P00734', 'P00736', 'P00738',
       'P00746', 'P00747', 'P00748', 'P00751', 'P01008', 'P01009',
       'P01011', 'P01019', 'P01023', 'P01024', 'P01031', 'P01033',
       'P01034', 'P01042', 'P01344', 'P01591', 'P01608', 'P01621',
       'P01717', 'P01780', 'P01833', 'P01834', 'P01857', 'P01859',
       'P01860', 'P01861', 'P01876', 'P01877', 'P02452', 'P02647',
       'P02649', 'P02652', 'P02655', 'P02656', 'P02671', 'P02675',
       'P02679', 'P02747', 'P02748', 'P02749', 'P02750', 'P02751',
       'P02753', 'P02760', 'P02763', 'P02765', 'P02766', 'P02768',
       'P02774', 'P02787', 'P02790', 'P04004', 'P04075', 'P04156',
       'P04180', 'P04196', 'P04207', 'P04211', 'P04216', 'P04217',
       'P04275', 'P04406', 'P04433', 'P05060', 'P05067', 'P05090',
       'P05155', 'P05156', 'P05408', 'P05452', 'P05546', 'P063

In [4]:
import requests
from pandas import json_normalize

# Seconds to wait on a single UniProt request before giving up, so that one
# stalled connection cannot hang the whole cell.
REQUEST_TIMEOUT_S = 30

# Columns kept in the final table, in output order.
FEATURE_COLUMNS = [
    'proteinDescription.recommendedName.fullName.value',
    'proteinDescription.flag',
    'sequence.value',
    'sequence.length',
    'sequence.molWeight',
    'type',
    'description',
    'location.start.value',
    'location.end.value',
    'geneName.value',
    'UniProt',
]


def fetch_protein_frame(protein, session):
    """Download one UniProt entry and flatten it into a DataFrame.

    The entry-level fields, the entry's ``features`` list and its ``genes``
    list are normalised separately and placed side by side, then entry-level
    values are forward-filled down the rows.

    Parameters
    ----------
    protein : str
        UniProt accession to look up, e.g. ``'O00391'``.
    session : requests.Session
        Session used for the HTTP request (reused across proteins).

    Returns
    -------
    pandas.DataFrame or None
        The flattened entry, or ``None`` when the search returned no results.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``REQUEST_TIMEOUT_S`` seconds.
    """
    url = f"https://rest.uniprot.org/uniprotkb/search?query=accession:{protein}&format=json"
    response = session.get(url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on an error status instead of trying to parse an error payload.
    response.raise_for_status()

    results = response.json().get('results', [])
    if not results:
        print('No UniProt entry for ', protein)
        return None

    df = json_normalize(results)
    df['UniProt'] = protein

    # Only the first result's nested lists are expanded; the new rows are
    # aligned to the entry row by index position.
    if 'features' in df.columns:
        df = pd.concat([df, json_normalize(df.loc[0, 'features'])], axis=1)
    else:
        print('No features for ', protein)

    if 'genes' in df.columns:
        df = pd.concat([df, json_normalize(df.loc[0, 'genes'])], axis=1)
    else:
        print('No genes for ', protein)

    # Trim whitespace in text columns. Cells that are not strings (e.g. the
    # raw nested lists) come back as NaN from `.str.strip()`.
    df_obj = df.select_dtypes(['object'])
    df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

    # Prefix the feature description with the ligand name when one is given.
    if 'ligand.name' in df.columns:
        df['description'] = df[['ligand.name', 'description']].apply(
            lambda x: ' '.join(x.dropna().astype(str)),
            axis=1
        )

    # Entry-level values exist only on the first row; copy them down.
    # `ffill()` replaces the deprecated `fillna(method='ffill')`.
    # NOTE(review): this fills gaps in every column, including feature-level
    # ones - confirm that is intended.
    return df.ffill()


# Collect per-protein frames and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated table on every iteration.
protein_frames = []
with requests.Session() as session:
    for protein in protein_list:
        protein_frame = fetch_protein_frame(protein, session)
        if protein_frame is not None:
            protein_frames.append(protein_frame)
        print(protein)

df_proteins = pd.concat(protein_frames)[FEATURE_COLUMNS]

print(df_proteins.shape)
display(df_proteins.head(20))
# `info()` prints its report and returns None, so it is not wrapped in display().
df_proteins.info()

O00391
O00533
O00584
O14498
O14773
O14791
O15240
O15394
O43505
O60888
O75144
O75326
O94919
P00441
P00450
P00734
P00736
P00738
P00746
P00747
P00748
P00751
P01008
P01009
P01011
P01019
P01023
P01024
P01031
P01033
P01034
P01042
P01344
P01591
No features for  P01608
No genes for  P01608
P01608
No features for  P01621
No genes for  P01621
P01621
P01717
P01780
P01833
P01834
P01857
P01859
P01860
P01861
P01876
P01877
P02452
P02647
P02649
P02652
P02655
P02656
P02671
P02675
P02679
P02747
P02748
P02749
P02750
P02751
P02753
P02760
P02763
P02765
P02766
P02768
P02774
P02787
P02790
P04004
P04075
P04156
P04180
P04196
No features for  P04207
No genes for  P04207
P04207
P04211
P04216
P04217
P04275
P04406
P04433
P05060
P05067
P05090
P05155
P05156
P05408
P05452
P05546
P06310
P06396
P06454
P06681
P06727
P07195
P07225
P07333
P07339
P07602
P07711
P07858
P07998
P08123
P08133
P08253
P08294
P08493
P08571
P08603
P08637
P08697
P09104
P09486
P09871
P10451
P10643
P10645
P10909
P11142
P11277
P12109
P13473
P13521
P135

Unnamed: 0,proteinDescription.recommendedName.fullName.value,proteinDescription.flag,sequence.value,sequence.length,sequence.molWeight,type,description,location.start.value,location.end.value,geneName.value,UniProt
0,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Signal,,1.0,29.0,QSOX1,O00391
1,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Chain,Sulfhydryl oxidase 1,30.0,747.0,QSOX1,O00391
2,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Transmembrane,Helical,710.0,730.0,QSOX1,O00391
3,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Domain,Thioredoxin,36.0,156.0,QSOX1,O00391
4,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Domain,ERV/ALR sulfhydryl oxidase,396.0,503.0,QSOX1,O00391
5,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Region,Disordered,573.0,633.0,QSOX1,O00391
6,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Active site,Nucleophile,70.0,70.0,QSOX1,O00391
7,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Active site,Nucleophile,73.0,73.0,QSOX1,O00391
8,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Binding site,FAD,401.0,401.0,QSOX1,O00391
9,Sulfhydryl oxidase 1,Precursor,MRRCNSGSGPPPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQ...,747.0,82578.0,Binding site,FAD,408.0,408.0,QSOX1,O00391


<class 'pandas.core.frame.DataFrame'>
Int64Index: 15354 entries, 0 to 142
Data columns (total 11 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   proteinDescription.recommendedName.fullName.value  15351 non-null  object 
 1   proteinDescription.flag                            12899 non-null  object 
 2   sequence.value                                     15351 non-null  object 
 3   sequence.length                                    15351 non-null  float64
 4   sequence.molWeight                                 15351 non-null  float64
 5   type                                               15351 non-null  object 
 6   description                                        15351 non-null  object 
 7   location.start.value                               15351 non-null  float64
 8   location.end.value                                 15351 non-null  float64
 9   geneName

None

In [5]:
# Save the protein feature table as a pickle in the current working directory.
df_proteins.to_pickle('./protein_features.pkl')

In [4]:
# Also export the feature table as CSV in the current working directory.
# No `index` argument is passed, so pandas' default applies and the frame's
# index is written as the first, unnamed column.
df_proteins.to_csv('./protein_features.csv')