In [8]:
import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join

In [5]:
## IMPORT FILE OF TYROSINE SITES
sites = pd.read_csv('./Full_Proteome_Exp_Set/Tyrosine_25_AA_Peptides_Surface_Exposed_Proteome.csv', index_col=0)

In [10]:
## IMPORT MUSITE DEEP RESULTS (5 SEPARATE RUNS FROM MSD SYSTEM)
msd_file_dir = './Full_Proteome_Exp_Set/MSD_Run/out_files/'
msd_files = [f for f in listdir(msd_file_dir) if isfile(join(msd_file_dir, f))]

m = 0
all_files = []

while m < len(msd_files):
    all_files.append(pd.read_csv(msd_file_dir+msd_files[m], sep='\t'))
    m+=1

In [14]:
# Clean MSD data
tyr_msd = pd.concat(all_files)
tyr_msd = tyr_msd[tyr_msd['Position'].isna() == False]
tyr_msd['Position'] = tyr_msd['Position'].astype(int)
tyr_msd['Uniprot_ID'] = tyr_msd['ID'].str.split('|', expand=True)[1]
tyr_msd['uid_pos'] = tyr_msd['Uniprot_ID'] + '_' + tyr_msd['Position'].astype(str)
tyr_msd['PTMscores'] = tyr_msd['PTMscores'].str[16:].astype(float)
tyr_msd = tyr_msd[['uid_pos', 'Position', 'Residue', 'PTMscores']]
tyr_msd = tyr_msd.rename(columns={'PTMscores':'SECONDARY_ML_SCORE'})

In [41]:
tyr_msd

Unnamed: 0,uid_pos,Position,Residue,SECONDARY_ML_SCORE
1,Q463W5_34,34,Y,0.092
2,Q463W5_55,55,Y,0.096
3,Q463W5_111,111,Y,0.075
4,Q463W5_115,115,Y,0.050
5,Q463W5_118,118,Y,0.043
...,...,...,...,...
90272,Q45UG0_1106,1106,Y,0.040
90273,Q45UG0_1107,1107,Y,0.033
90274,Q45UG0_1116,1116,Y,0.029
90275,Q45UG0_1118,1118,Y,0.035


In [43]:
sites = sites.rename(columns={'UID_Pos':'uid_pos'})

In [47]:
sites_with_msd = pd.merge(sites, tyr_msd, on='uid_pos', indicator=True, how='left')

In [49]:
missing_msd_scores = sites_with_msd[sites_with_msd['_merge'] != 'both']
found_msd_scores = sites_with_msd.drop(missing_msd_scores.index)

In [52]:
print(len(missing_msd_scores), 'missing from MSD scored data.')

718 missing from MSD scored data.


In [55]:
# Only 718 missing out of a 177K long dataset, difference is negligible. Good to proceed!

In [59]:
# Lastly -- the Peptide length needs to be reduced to 13AA to reflect those in the 
#  training data (from Palma et. al., 2017)
found_msd_scores['Peptide'] = found_msd_scores['Peptide'].str[6:-6]

In [116]:
length_less = found_msd_scores[found_msd_scores['Peptide'].str.len() != 13]
good_to_go = found_msd_scores.drop(length_less.index)

In [117]:
# Double check they all have the 6AAs before the central Y
length_less[length_less['Peptide'].str[6] != 'Y']

Unnamed: 0,uid_pos,Uniprot_ID,Central_AA,Position_x,Netsurfp_RSA,Peptide,Gene_ID,Padding,Position_y,Residue,SECONDARY_ML_SCORE,_merge
43726,A0A0C5B5G6_11,A0A0C5B5G6,Y,11,0.411926,EMGYIF,MOTSC_HUMAN,ala_front,11.0,Y,0.299,both


In [118]:
# One without - correct for this
length_less.loc[length_less['Peptide'].str[6] != 'Y','Peptide'] = 'AAA' + length_less.loc[length_less['Peptide'].str[6] != 'Y']['Peptide'].item()

In [119]:
length_less[length_less['Peptide'].str[6] != 'Y']

Unnamed: 0,uid_pos,Uniprot_ID,Central_AA,Position_x,Netsurfp_RSA,Peptide,Gene_ID,Padding,Position_y,Residue,SECONDARY_ML_SCORE,_merge


In [120]:
# Fixed! Now let's address the issue of the AAs that come after the central Y 
length_less['missing_end_AA'] = ((length_less['Peptide'].str.len() - 13)*-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_less['missing_end_AA'] = ((length_less['Peptide'].str.len() - 13)*-1)


In [121]:
new_pep = []
pad_applied = []

for h,r in length_less.iterrows():
    pep = r['Peptide']
    pad = r['missing_end_AA']
    
    new_pep.append(pep + pad*'A')
    pad_applied.append(pad*'A' + ' to end')

In [122]:
length_less['Peptide'] = new_pep
length_less['Padding'] = pad_applied

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  length_less['Peptide'] = new_pep


In [124]:
length_less

Unnamed: 0,uid_pos,Uniprot_ID,Central_AA,Position_x,Netsurfp_RSA,Peptide,Gene_ID,Padding,Position_y,Residue,SECONDARY_ML_SCORE,_merge,missing_end_AA
53,Q99909_177,Q99909,Y,177,0.530418,RKQLVIYEEISDA,SSX3_HUMAN,A to end,177.0,Y,0.873,both,1
899,Q9BS26_395,Q9BS26,Y,395,0.524777,KLAPSEYRYTLLA,ERP44_HUMAN,A to end,395.0,Y,0.239,both,1
966,Q9BSF0_84,Q9BSF0,Y,84,0.392764,ACNNIKYHDIPYA,SMAKA_HUMAN,A to end,84.0,Y,0.207,both,1
1227,Q9BTL3_107,Q9BTL3,Y,107,0.612650,PQQYGHYGYNQRA,RAMAC_HUMAN,A to end,107.0,Y,0.181,both,1
1410,Q9BUJ2_845,Q9BUJ2,Y,845,0.662077,NYDYGSYSGNTQA,HNRL1_HUMAN,A to end,845.0,Y,0.053,both,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
176923,P0DJX6_117,P0DJX6,Y,117,0.374022,IDGRRDYKPDKSA,ALT2B_EMCVR,A to end,117.0,Y,0.344,both,1
176926,P0DJX7_117,P0DJX7,Y,117,0.374022,IDGRRDYKPDKSA,ALT2B_EMCV,A to end,117.0,Y,0.344,both,1
177605,P0DSS1_68,P0DSS1,Y,68,0.310163,PGTIILYATYIKA,PG081_VAR67,A to end,68.0,Y,0.065,both,1
177608,P0DSS2_68,P0DSS2,Y,68,0.310163,PGTIILYATYIKA,PG081_VARV,A to end,68.0,Y,0.065,both,1


In [125]:
# Now that they're all fixed we can add our peptides back to our full dataframe
length_less = length_less.drop(columns=['missing_end_AA'])
final_y_proteome = pd.concat([good_to_go, length_less])

In [126]:
final_y_proteome = final_y_proteome.drop(columns=['_merge', 'Residue', 'Position_y'])
final_y_proteome = final_y_proteome.rename(columns={'Position_x':'Position'})
final_y_proteome = final_y_proteome.reset_index(drop=True)

In [130]:
final_y_proteome.to_csv('Full_Tyr_Proteome_MSD_Scores_13_mer.csv')