In [1]:
import itertools
import os
from collections import Counter

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdmolfiles

In [4]:
# Load the peptide table: peptide sequences, protein information, and
# experimental values for the 15 different enzymes analysed in the investigation.
# The first CSV column is used as the row index.
peptides = pd.read_csv(
    './Full_Tyr_Proteome_MSD_Scores_13_mer.csv',
    index_col=0,
)

In [5]:
# Display the loaded table (pandas abbreviates the middle rows of large frames)
peptides

Unnamed: 0,uid_pos,Uniprot_ID,Central_AA,Position,Netsurfp_RSA,Peptide,Gene_ID,Padding,SECONDARY_ML_SCORE
0,Q99871_15,Q99871,Y,15,0.565898,GRGGDDYSEDEGD,HAUS7_HUMAN,,0.867
1,Q99871_47,Q99871,Y,47,0.570332,PFLEGLYITEPKT,HAUS7_HUMAN,,0.079
2,Q99871_64,Q99871,Y,64,0.322800,LCSPSEYRLEILE,HAUS7_HUMAN,,0.051
3,Q99873_52,Q99873,Y,52,0.309666,DMTSKDYYFDSYA,ANM1_HUMAN,,0.092
4,Q99873_125,Q99873,Y,125,0.434846,CSSISDYAVKIVK,ANM1_HUMAN,,0.061
...,...,...,...,...,...,...,...,...,...
177255,P0DJX6_117,P0DJX6,Y,117,0.374022,IDGRRDYKPDKSA,ALT2B_EMCVR,A to end,0.344
177256,P0DJX7_117,P0DJX7,Y,117,0.374022,IDGRRDYKPDKSA,ALT2B_EMCV,A to end,0.344
177257,P0DSS1_68,P0DSS1,Y,68,0.310163,PGTIILYATYIKA,PG081_VAR67,A to end,0.065
177258,P0DSS2_68,P0DSS2,Y,68,0.310163,PGTIILYATYIKA,PG081_VARV,A to end,0.065


In [6]:
# Load the protdcal descriptor table (consumed by the feature generation below).
# The first CSV column is used as the row index.
protdcal = pd.read_csv(
    '../protdcal_features.csv',
    index_col=0,
)

In [7]:
## NOW WE CAN GO ON TO GENERATING OUR FEATURES ##

In [8]:
# METHOD FOR GENERATING FEATURES USING ONE SEQUENCE AT A TIME
def FeatureGen(sequence, protdcal):
    """Build the feature vector (and matching headers) for one peptide sequence.

    Three feature groups are concatenated, in this order:

    1. protdcal descriptors: for every column of ``protdcal``, the sum of the
       per-residue values over all residues of ``sequence``.
    2. One-hot encoding: one 0/1 flag per (amino acid, position) pair, ordered
       by amino acid first and by position second.
    3. MACCS keys: every bit of the RDKit MACCS fingerprint of the molecule
       built from ``sequence``, stored as integers 0/1 so that they have the
       same type as the one-hot flags.

    Parameters
    ----------
    sequence : str
        Peptide sequence, one character per residue. Every character must be
        a row label of ``protdcal``.
    protdcal : pandas.DataFrame
        Descriptor table whose row labels are single residue characters and
        whose columns are the descriptors to sum.

    Returns
    -------
    values : list
        Feature values in the order described above.
    headers : list of str
        One header per entry of ``values``.

    Raises
    ------
    ValueError
        If ``sequence`` is empty, or RDKit cannot build a molecule from it.
    KeyError
        If a residue of ``sequence`` is not a row label of ``protdcal``.
    """
    if len(sequence) == 0:
        raise ValueError('sequence must contain at least one residue')

    residues = list(sequence)   # one entry per residue

    # FIRST: GENERATE PROTDCAL VALUES
    # One row of descriptor values per residue, then a column-wise sum.
    # (The accumulator is deliberately not called `pd`, which is the pandas alias.)
    per_residue = [protdcal.loc[residue].tolist() for residue in residues]
    values = [sum(column) for column in zip(*per_residue)]
    headers = protdcal.columns.tolist()   # include headers

    # SECOND: GENERATE ONE-HOT ENCODING
    aa = ['K', 'R', 'H', 'A', 'I', 'L', 'M', 'V', 'F', 'W', 'Y', 'N', 'C', 'Q', 'S', 'T', 'D', 'E', 'G', 'P']   # possible amino acids
    # Amino acid is the outer loop, so the flags are grouped per amino acid
    for amino_acid in aa:
        for position, residue in enumerate(residues):
            headers.append('ONE-HOT_' + str(position) + '-' + amino_acid)  # make header
            values.append(1 if residue == amino_acid else 0)

    # THIRD: GENERATE MACCS KEYS
    mol = rdmolfiles.MolFromFASTA(sequence)
    if mol is None:
        # Fail here with a readable message rather than inside GenMACCSKeys
        raise ValueError('RDKit could not build a molecule from sequence ' + repr(sequence))
    fp = MACCSkeys.GenMACCSKeys(mol)
    # ToBitString() yields a string of '0'/'1' characters; convert each to int
    bits = [int(bit) for bit in fp.ToBitString()]
    values.extend(bits)
    # Headers are numbered by bit position in the fingerprint
    headers.extend(str(bit_index) + '_maccs' for bit_index in range(len(bits)))

    return values, headers

In [None]:
# Use the 'uid_pos' column as the row index.
# Guarded so that re-running this cell is a no-op: after the first run
# 'uid_pos' is the index rather than a column, and an unguarded set_index
# would raise a KeyError.
if 'uid_pos' in peptides.columns:
    peptides = peptides.set_index('uid_pos', drop=True)

In [11]:
# Show the rows whose peptide sequence contains an 'X' character
peptides.loc[peptides['Peptide'].str.contains('X')]

Unnamed: 0,uid_pos,Uniprot_ID,Central_AA,Position,Netsurfp_RSA,Peptide,Gene_ID,Padding,SECONDARY_ML_SCORE
22771,Q9WSV7_330,Q9WSV7,Y,330,0.306006,GSNTXDYMSPXIS,CAPSD_TTVV5,,0.625
56584,P13890_205,P13890,Y,205,0.28854,HHGAVQYSXGRFT,POLS_RRVN,,0.048
56586,P13890_444,P13890,Y,444,0.260461,HCPPGDYLKXSFE,POLS_RRVN,,0.097
56607,P13896_1855,P13896,Y,1855,0.253373,QHSNXRYEAGAYI,POLN_WEEV,,0.106
69785,Q9YLR1_1575,Q9YLR1,Y,1575,0.363802,VVLCSDYRQXRNA,POLN_HEVUS,,0.06
72195,P03441_351,P03441,Y,351,0.27537,GMXXGWYGFRHQN,HEMA_I79A0,,0.06
72201,P03441_511,P03441,Y,511,0.40948,VELKXGYKDWILW,HEMA_I79A0,,0.052
87316,Q9WC70_133,Q9WC70,Y,133,0.252874,PGPGTXYPLTFGW,NEF_HV1S9,,0.037
100227,P17517_15,P17517,Y,15,0.302336,YKATRPYXXXCAD,POLS_RRV2,,0.032
100985,P26028_36,P26028,Y,36,0.59696,XXXXRPYVLLAVL,HEMA_MEASI,,0.036


In [19]:
# Extract the peptide sequences, swapping every 'X' for 'A' ahead of feature
# generation (A was chosen as the least disruptive amino acid)
sequences = peptides.loc[:, 'Peptide'].str.replace(pat='X', repl='A')

In [21]:
## GENERATE FEATURES FOR OUR EXPERIMENTAL DATA
# Collect one feature row per sequence in a plain list and build the DataFrame
# once at the end. Appending rows to a DataFrame one at a time is very slow
# for a set of this size.
feature_rows = []
h = None   # feature headers, taken from the first sequence
n_sequences = len(sequences)

# Iterate over the values directly rather than using sequences[0] / sequences[i]:
# once `peptides` is indexed by uid_pos the Series has string labels, and
# integer keys on such a Series rely on a deprecated positional fallback.
for n_done, ts in enumerate(sequences, start=1):
    value, header = FeatureGen(ts, protdcal)
    if h is None:
        h = header
    elif len(value) != len(h):
        # Rows of different lengths would otherwise be silently padded
        raise ValueError(
            'Sequence ' + repr(ts) + ' produced ' + str(len(value))
            + ' features, expected ' + str(len(h))
        )
    feature_rows.append(value)
    if n_done % 500 == 0:
        print(n_done, 'of', n_sequences, 'completed')

# Create df for results to go into
features = pd.DataFrame(feature_rows, columns=h)

# Make the index the same as our initial dataframe
feat_x = features.set_index(peptides.index)

# Isolate the experimental label as our y value
# (disabled: the loaded peptide table shows no 'Experimental_Phosphorylation' column)
#feat_y = peptides['Experimental_Phosphorylation']

500 of 177260 completed
1000 of 177260 completed
1500 of 177260 completed
2000 of 177260 completed
2500 of 177260 completed
3000 of 177260 completed
3500 of 177260 completed
4000 of 177260 completed
4500 of 177260 completed
5000 of 177260 completed
5500 of 177260 completed
6000 of 177260 completed
6500 of 177260 completed
7000 of 177260 completed
7500 of 177260 completed
8000 of 177260 completed
8500 of 177260 completed
9000 of 177260 completed
9500 of 177260 completed
10000 of 177260 completed
10500 of 177260 completed
11000 of 177260 completed
11500 of 177260 completed
12000 of 177260 completed
12500 of 177260 completed
13000 of 177260 completed
13500 of 177260 completed
14000 of 177260 completed
14500 of 177260 completed
15000 of 177260 completed
15500 of 177260 completed
16000 of 177260 completed
16500 of 177260 completed
17000 of 177260 completed
17500 of 177260 completed
18000 of 177260 completed
18500 of 177260 completed
19000 of 177260 completed
19500 of 177260 completed
20000 

156500 of 177260 completed
157000 of 177260 completed
157500 of 177260 completed
158000 of 177260 completed
158500 of 177260 completed
159000 of 177260 completed
159500 of 177260 completed
160000 of 177260 completed
160500 of 177260 completed
161000 of 177260 completed
161500 of 177260 completed
162000 of 177260 completed
162500 of 177260 completed
163000 of 177260 completed
163500 of 177260 completed
164000 of 177260 completed
164500 of 177260 completed
165000 of 177260 completed
165500 of 177260 completed
166000 of 177260 completed
166500 of 177260 completed
167000 of 177260 completed
167500 of 177260 completed
168000 of 177260 completed
168500 of 177260 completed
169000 of 177260 completed
169500 of 177260 completed
170000 of 177260 completed
170500 of 177260 completed
171000 of 177260 completed
171500 of 177260 completed
172000 of 177260 completed
172500 of 177260 completed
173000 of 177260 completed
173500 of 177260 completed
174000 of 177260 completed
174500 of 177260 completed
1

In [22]:
# Create the output folder first: to_csv does not create missing directories,
# and failing here would lose the result of the long feature-generation run.
os.makedirs('./Full_Proteome_Exp_Set', exist_ok=True)

# Save the feature matrix and the matching peptide details side by side
feat_x.to_csv('./Full_Proteome_Exp_Set/full_proteome_tyr_peps_13mer_features.csv')
#feat_y.to_csv('./features/palma_2017_y_features.csv')
peptides.to_csv('./Full_Proteome_Exp_Set/full_proteome_tyr_peps_13mer_details.csv')

In [25]:
# Preview the first five rows of the peptide details
peptides.head(n=5)

Unnamed: 0_level_0,Uniprot_ID,Central_AA,Position,Netsurfp_RSA,Peptide,Gene_ID,Padding,SECONDARY_ML_SCORE
uid_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q99871_15,Q99871,Y,15,0.565898,GRGGDDYSEDEGD,HAUS7_HUMAN,,0.867
Q99871_47,Q99871,Y,47,0.570332,PFLEGLYITEPKT,HAUS7_HUMAN,,0.079
Q99871_64,Q99871,Y,64,0.3228,LCSPSEYRLEILE,HAUS7_HUMAN,,0.051
Q99873_52,Q99873,Y,52,0.309666,DMTSKDYYFDSYA,ANM1_HUMAN,,0.092
Q99873_125,Q99873,Y,125,0.434846,CSSISDYAVKIVK,ANM1_HUMAN,,0.061


In [26]:
# Preview the first five rows of the generated feature matrix
feat_x.head(n=5)

Unnamed: 0_level_0,Gs(U)_NO,Mw_NO,HP_NO,IP_NO,ECI_NO,L1-9_NO,DHf_NO,Z1_NO,Z2_NO,Z3_NO,...,157_maccs,158_maccs,159_maccs,160_maccs,161_maccs,162_maccs,163_maccs,164_maccs,165_maccs,166_maccs
uid_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q99871_15,-4603.39,1352.0,-29.2,68.36,10.77,234.67,-6148.5,31.29,-12.65,5.5,...,1,1,1,0,1,1,1,1,1,0
Q99871_47,-207.748,1488.0,-2.3,80.83,5.86,232.28,-5911.19,-10.3,-5.43,-5.99,...,1,1,1,1,1,1,1,1,1,0
Q99871_64,-342.718,1532.0,-1.1,80.24,8.04,235.5,-6046.04,-5.57,-1.69,-3.32,...,1,1,1,1,1,1,1,1,1,0
Q99873_52,-2192.715,1586.0,-14.1,69.49,8.74,227.91,-6028.87,7.09,5.71,3.84,...,1,1,1,1,1,1,1,1,1,0
Q99873_125,631.851,1393.0,6.7,80.01,5.23,240.14,-5924.1,0.33,-9.74,-2.62,...,1,1,1,1,1,1,1,1,1,0
