In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import .csv of peptides
peps = pd.read_csv('./features/palma_2017_feature_details.csv')

In [7]:
# Import MuSiteDeep Results
msd = pd.read_csv('./MuSiteDeep_Full_Length_Results_Palma_2017.txt', sep='\t')

In [12]:
# Format MuSiteDeep Results
temp = msd['ID'].str.split('|', expand=True)
msd['Gene_ID'] = temp[0]
msd['Uniprot_ID'] = temp[1]

In [43]:
# Some peptides did not exist as part of proteins -- did not generate MSD scores for these
#  we'll need to pull them out
no_fullseq = peps[pd.to_numeric(peps['SITE_LOC'], errors='coerce').notnull() == False]
with_fullseq = peps.drop(no_fullseq.index)

In [47]:
# Now we can fix the SITE_LOC column -- currently it lists where the peptide starts
#  within the full sequence, but we need to correspond it to the Y residue
#  in order to pull the correct score out of the MSD data
with_fullseq['SITE_LOC'] = with_fullseq['SITE_LOC'].astype(int) + 7

In [49]:
with_fullseq['uid_pos'] = with_fullseq['ACC_ID'] + '_' + with_fullseq['SITE_LOC'].astype(str)

In [67]:
msd = msd[pd.to_numeric(msd['Position'], errors='coerce').notnull()]
msd['Position'] = msd['Position'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msd['Position'] = msd['Position'].astype(int)


In [69]:
msd['uid_pos'] = msd['Uniprot_ID'] + '_' + msd['Position'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msd['uid_pos'] = msd['Uniprot_ID'] + '_' + msd['Position'].astype(str)


In [95]:
full = pd.merge(with_fullseq, msd, on='uid_pos', how='left', indicator=True)

In [96]:
# 3 didn't merge successfully -- due to peptide length padding done in the previous notebook
incomplete = full[full['_merge'] != 'both']
full[full['_merge'] != 'both']

Unnamed: 0,uid_pos,Spot Index,Spot Flag,Peptide,Interactor Protein,Measure Flag,PTP_HD-PTP_11-07-2008.seam,PTP_LAR_12-10-2007.seam,PTP_LyP_12-10-2007.seam,PTP_MEG-1_12-10-2007.seam,...,SITE_LOC,Fullseq_Length,ID,Position,Residue,PTMscores,Cutoff=0.5,Gene_ID,Uniprot_ID,_merge
826,Q9BTX7_70,2383,GOOD,DMVRKEYPNLSTS,Q9BTX7 (C20orf121) --> 64-75,GOOD,-0.0467,-0.201,-0.727,-0.474,...,70,342.0,,,,,,,,left_only
1216,Q53H88_7,4702,GOOD,AMADPKYADLPGI,Q53H88 --> 1-12,GOOD,1.37,-1.18,-0.277,-0.414,...,7,406.0,,,,,,,,left_only
2666,P31268_7,3585,GOOD,AMSSSYYVNALFS,P31268 (HOXA7) --> 1-12,GOOD,1.34,0.65,0.696,0.38,...,7,230.0,,,,,,,,left_only


In [97]:
full = full.drop(incomplete.index)

In [111]:
incomplete = incomplete.drop(columns=msd.columns.drop('uid_pos'))
incomplete = incomplete.drop(columns=['_merge'])

In [118]:
incomplete['SITE_LOC'] = incomplete['SITE_LOC'] - 1
incomplete['uid_pos'] = incomplete['ACC_ID'] + '_' + incomplete['SITE_LOC'].astype(str)

In [213]:
full = pd.concat([full, pd.merge(incomplete, msd, on='uid_pos', how='left', indicator=True)]).set_index('uid_pos', drop=True)

In [214]:
no_fullseq.to_csv('./missing_fullseqs_from_uniprot.csv')

In [215]:
print(len(no_fullseq)/len(with_msd)*100, "% of our data didn't have an active Uniprot Listing")

11.629192775525249 % of our data didn't have an active Uniprot Listing


In [216]:
# Every little bit of data counts, especially when it's 10% of our dataset. So we'll
#  arrange to get MSD scores for this data. A portion of it has Uniprot IDs that
#  represent protein sequences that have been adjusted/deleted -- will need to manually
#  correct for those. The remaining ones are peptides which can be run through MSD
#  on their own.

In [217]:
# First pull the peptides for their MSD run
solo_peps = no_fullseq[no_fullseq['Interactor Protein'].isna()]

In [218]:
# Format for MSD run
f = open('./missing_fullseq_MSD_run/missing_seqs_msd_run.fasta', 'w')

for h,r in solo_peps.iterrows():
    f.write('>' + r['uid_pos'] + '|' + str(r['Spot Index']) + '\n')
    f.write(str(r['Peptide']) + '\n')
f.close()

In [219]:
# Now let's handle the remaining sequences that weren't found within the Uniprot DB
uid_not_found = no_fullseq[no_fullseq['Interactor Protein'].isna() == False]

In [220]:
uid_not_found['ACC_ID']

14             Q15811
17      IPI00455894.2
24             Q16827
72             O43240
156            Q15154
            ...      
5955           Q8WXU2
5982    IPI00645382.1
5983           Q9P1Z9
6000           Q8IZF2
6042         Q07955-3
Name: ACC_ID, Length: 257, dtype: object

In [221]:
# The IDs similar to "IPI00455894.2" aren't recognisable protein IDs - take these
#  as peptides too
ipi_peptides = uid_not_found[uid_not_found['ACC_ID'].str.contains('IPI')]
uid_not_found = uid_not_found.drop(ipi_peptides.index)

In [222]:
# Format for MSD run
f = open('./missing_fullseq_MSD_run/missing_seqs_msd_run.fasta', 'a')

for h,r in ipi_peptides.iterrows():
    f.write('>' + r['uid_pos'] + '|' + str(r['Spot Index']) + '\n')
    f.write(str(r['Peptide']) + '\n')
f.close()

In [223]:
# Now onto the final uid_not_found sequences
uid_not_found['ACC_ID'].drop_duplicates().to_csv('./missing_fullseq_MSD_run/uids_not_found.txt', index=None)

In [224]:
# Note: some of these include peptides that couldn't be found within the full protein 
# sequences. These will be run as peptides as well.
pep_miss_peptides = uid_not_found[uid_not_found['uid_pos'].str.contains('Peptide_Not_Found')]
true_uid_not_found = uid_not_found.drop(pep_miss_peptides.index)

In [225]:
# Format for MSD run
f = open('./missing_fullseq_MSD_run/missing_seqs_msd_run.fasta', 'a')

for h,r in pep_miss_peptides.iterrows():
    f.write('>' + r['uid_pos'] + '|' + str(r['Spot Index']) + '\n')
    f.write(str(r['Peptide']) + '\n')
f.close()

In [226]:
true_uid_not_found['ACC_ID'].drop_duplicates().to_csv('./missing_fullseq_MSD_run/uids_not_found.txt', index=None)

In [227]:
# All of the uid_not_found peptides were deleted from the uniprot database
# As they are not reflective of true proteins, only the peptide sequence will be run.
f = open('./missing_fullseq_MSD_run/missing_seqs_msd_run.fasta', 'a')

for h,r in true_uid_not_found.iterrows():
    f.write('>' + r['uid_pos'] + '|' + str(r['Spot Index']) + '\n')
    f.write(str(r['Peptide']) + '\n')
f.close()

In [228]:
# NOW READ IN THE MSD SCORES FOR THE PEPTIDES #
#no_fullseq
no_fs_msd = pd.read_csv('./missing_fullseq_MSD_run/prediction_results.txt', sep='\t')
no_fs_msd = no_fs_msd[no_fs_msd['Position'].notna()]
no_fs_msd = no_fs_msd[no_fs_msd['Position'] == 7]

In [229]:
no_fs_msd['uid_pos'] = no_fs_msd['ID'].str.split('|', expand=True)[0]

In [230]:
msd_scored_peptides = pd.merge(no_fs_msd, no_fullseq, on='uid_pos', indicator=True)

In [241]:
msd_scored_peptides = msd_scored_peptides.set_index('uid_pos', drop=True)

In [242]:
final_peptide_set = pd.concat([full, msd_scored_peptides])

In [245]:
final_peptide_set = final_peptide_set.reset_index(drop=False)

In [246]:
final_peptide_set.columns

Index(['uid_pos', 'Spot Index', 'Spot Flag', 'Peptide', 'Interactor Protein',
       'Measure Flag', 'PTP_HD-PTP_11-07-2008.seam', 'PTP_LAR_12-10-2007.seam',
       'PTP_LyP_12-10-2007.seam', 'PTP_MEG-1_12-10-2007.seam',
       'PTP_MEG-2_12-10-2007.seam', 'PTP_PTP-PEST_12-10-2007.seam',
       'PTP_PTPH1_12-10-2007.seam', 'PTP_rPTP-alpha_12-10-2007.seam',
       'PTP_rPTP-beta_12-10-2007.seam', 'PTP_SAP-1_12-10-2007.seam',
       'PTP_SHP-1_12-10-2007.seam', 'PTP_SHP-2_12-10-2007.seam',
       'PTP_DEP-1_12-10-2007.seam.txt', 'PTP_TC-PTP_12-10-2007.seam',
       'PTP_PTP1B_averaged', 'Length', 'ACC_ID', 'Gene_Name',
       'Peptide_Location', 'SITE_LOC', 'Fullseq_Length', 'ID', 'Position',
       'Residue', 'PTMscores', 'Cutoff=0.5', 'Gene_ID', 'Uniprot_ID',
       '_merge'],
      dtype='object')

In [247]:
final_peptide_set = final_peptide_set[['uid_pos', 'Peptide', 'PTP_HD-PTP_11-07-2008.seam', 'PTP_LAR_12-10-2007.seam',
       'PTP_LyP_12-10-2007.seam', 'PTP_MEG-1_12-10-2007.seam',
       'PTP_MEG-2_12-10-2007.seam', 'PTP_PTP-PEST_12-10-2007.seam',
       'PTP_PTPH1_12-10-2007.seam', 'PTP_rPTP-alpha_12-10-2007.seam',
       'PTP_rPTP-beta_12-10-2007.seam', 'PTP_SAP-1_12-10-2007.seam',
       'PTP_SHP-1_12-10-2007.seam', 'PTP_SHP-2_12-10-2007.seam',
       'PTP_DEP-1_12-10-2007.seam.txt', 'PTP_TC-PTP_12-10-2007.seam',
       'PTP_PTP1B_averaged', 'ACC_ID', 'SITE_LOC', 'Gene_Name', 'PTMscores', 'Spot Index', 
                   'Spot Flag']]

final_peptide_set['PTMscores'] = final_peptide_set['PTMscores'].str[16:].astype(float)

In [248]:
final_peptide_set = final_peptide_set.rename(columns={'PTMscores':'SECONDARY_ML_SCORE'})

In [249]:
final_peptide_set.to_csv('./features/palma_2017_experimental_results_and_sec_scores.csv')

In [256]:
final_peptide_set[final_peptide_set['uid_pos'].str.contains('Q53H88_')]

Unnamed: 0,uid_pos,Peptide,PTP_HD-PTP_11-07-2008.seam,PTP_LAR_12-10-2007.seam,PTP_LyP_12-10-2007.seam,PTP_MEG-1_12-10-2007.seam,PTP_MEG-2_12-10-2007.seam,PTP_PTP-PEST_12-10-2007.seam,PTP_PTPH1_12-10-2007.seam,PTP_rPTP-alpha_12-10-2007.seam,...,PTP_SHP-2_12-10-2007.seam,PTP_DEP-1_12-10-2007.seam.txt,PTP_TC-PTP_12-10-2007.seam,PTP_PTP1B_averaged,ACC_ID,SITE_LOC,Gene_Name,SECONDARY_ML_SCORE,Spot Index,Spot Flag
5424,Q53H88_6,AMADPKYADLPGI,1.37,-1.18,-0.277,-0.414,0.147,-0.574,1.01,-0.608,...,0.253,0.14,2.6,1.84,Q53H88,6,None_Listed,0.896,4702,GOOD
