Reformat published datasets for CysteineomeDB.
Categories: Dataset Found, Reactive, Ligandable, Identified but not Ligandale, and Conditional.
Note: Annotations from authors were used to determine "ligandabiliy."
Note: Yang DIA paper did not provide cysteine residue numbers. Peptide sequences were manually mapped to the uniprot fasta (2201). 

# Setup Environment

In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import matplotlib
import numpy as np
import math
from matplotlib.pyplot import figure
import Bio
from Bio import SeqIO
from functools import reduce

In [2]:
cd = os.getcwd()
cd

'/Users/lisamarieboatner/Dropbox/Backus/Cysteineome/Data'

In [3]:
path_data = os.path.join(os.getcwd(), 'results')
if not os.path.exists(path_data):
    os.makedirs(path_data)

In [4]:
def get_new_df(dfs, dataset, col1, col2, cys):
    new_df = pd.concat(dfs)
    
    new_df = new_df[new_df[col1].str.contains("contaminant") == False]
    
    if dataset == 'kuljanin_gygi':
        new_df['proteinid'] = new_df[col1].map(lambda x: str(x).split('|')[1])
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'weerapana_cravatt':
        new_df['proteinid'] = new_df[col1].map(lambda x: str(x))
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'backus_cravatt':
        new_df['proteinid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[0])
        new_df['resid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[-1]) 
    elif dataset == 'yan_backus':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df['identifier'].map(lambda x: 'C' + str(x).split('_')[-1]) 
    elif dataset == 'yang_wang':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))         
    else:
        new_df = new_df.rename(columns = {col1: 'proteinid', col2: 'resid'})
        
    new_df['cysteineid'] = new_df['proteinid'] + '_' + new_df['resid'].astype(str)
    new_df['dataset'] = dataset
    new_df['identified'] = 1
    new_df['identified_datasets'] = dataset
    
    if cys == True:
        new_df['level'] = 'cysteine'
        new_df = new_df[['level', 'cysteineid', 'proteinid', 'dataset', 'identified', 'identified_datasets']]
    else:
        new_df['level'] = 'protein'
        new_df = new_df[['level', 'proteinid', 'dataset', 'identified', 'identified_datasets']]
    new_df = new_df.drop_duplicates()
    
    return new_df

In [5]:
def get_cys_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    if dataset == 'weerapana_cravatt':
        df['proteinid'] = df[col1].map(lambda x: str(x))
        df['resid'] = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'kuljanin_gygi':
        df = df[[col1, col2]]
        df = df.drop_duplicates()
        df['proteinid'] = df[col1].map(lambda x: str(x).split('|')[1])
        df['resid'] = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'backus_cravatt':
        df['proteinid'] = df['Identifier'].map(lambda x: str(x).split('_')[0])
        df['resid'] = df['Identifier'].map(lambda x: str(x).split('_')[-1])  
        df = df[['proteinid', 'resid']]
        df = df.drop_duplicates()
    elif dataset == 'yang_wang':
        df['proteinid'] = df[col1]
        df['resid'] = df[col2].map(lambda x: 'C' + str(x)) 
    else:
        df = df[[col1, col2]]
        df = df.drop_duplicates()
        df = df.rename(columns = {col1: 'proteinid', col2: 'resid'})
        
    df['cysteineid'] = df['proteinid'] + '_' + df['resid'].astype(str)
    df_ids = list(df['cysteineid'].unique())
    
    master[category] = np.where(master['cysteineid'].isin(df_ids), 1, 0)
    category_df = master[master[category] == 1]
    category_df[category  + '_datasets'] = [category_datasets] * category_df.shape[0]
    non_category_df = master[master[category] == 0]
    
    new_df = pd.concat([category_df, non_category_df])
#     master[category  + '_datasets'] = np.where(master['cysteineid'].isin(df_ids), category_datasets, np.nan)
    
    return new_df

In [6]:
def get_pro_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    if dataset == 'weerapana_cravatt':
        df['proteinid'] = df[col1].map(lambda x: str(x))
    elif dataset == 'kuljanin_gygi':
        df = df[[col1, col2]]
        df = df.drop_duplicates()
        df['proteinid'] = df[col1].map(lambda x: str(x).split('|')[1])
    elif dataset == 'backus_cravatt':
        df['proteinid'] = df['Identifier'].map(lambda x: str(x).split('_')[0])
        df = df[['proteinid']]
        df = df.drop_duplicates()
    elif dataset == 'yang_wang':
        df['proteinid'] = df[col1]
    else:
        df = df[[col1]]
        df = df.drop_duplicates()
        df = df.rename(columns = {col1: 'proteinid'})

    df_ids = list(df['proteinid'].unique())
    
    master[category] = np.where(master['proteinid'].isin(df_ids), 1, 0)
    category_df = master[master[category] == 1]
    category_df[category  + '_datasets'] = [category_datasets] * category_df.shape[0]
    non_category_df = master[master[category] == 0]
    
    new_df = pd.concat([category_df, non_category_df])
#     master[category  + '_datasets'] = np.where(master['proteinid'].isin(df_ids), category_datasets, np.nan)
    
    return new_df

In [7]:
def get_ligandability(df, compound_list, cutoff):
    ligandable = []
    
    for index, row in df.iterrows():
        
        count = 0

        for i in range(len(compound_list)):
            current_ratio = row[compound_list[i]]
            
            if type(current_ratio) != int and type(current_ratio) != float:
                continue
            else:    
                if float(current_ratio) >= cutoff:
                    count += 1
    
        if count >= cutoff:
            ligandable.append('yes')
        else:
            ligandable.append(np.nan)
            
    df['ligandable'] = ligandable
    
    df_identified = df[df['ligandable'].isna() == True]
    df_ligandable = df[df['ligandable'].isna() == False]
    return df_identified, df_ligandable

In [10]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    
    lig_labels = []
    
    if tcell == True:
        df = df.replace('--', 0)
    
    for index, row in df.iterrows():
        liganded = False
        for i in range(len(probe_cols)):
            if row[probe_cols[i]] >= cutoff:
                liganded = True
                
        if liganded == True:
            lig_labels.append('yes')
        else:
            lig_labels.append(None)
            
    return lig_labels

In [11]:
def list_to_string(lst, symbol):
    return (symbol.join([str(elem) for elem in lst]))

In [12]:
def get_dup_ids(df):
    nodup_ids = []
    dup_ids = []
    for index, row in df.iterrows():
        if row['cysteineid'] not in nodup_ids:
            nodup_ids.append(row['cysteineid'])
        else:
            dup_ids.append(row['cysteineid'])
            
    return dup_ids

# Extract Experimental Data

# Read Proteome-wide Discovery Data

R values≥ 4 in two or more data sets and met additional criteria for data quality

In [11]:
os.chdir(cd)
os.chdir('Discovery')

In [12]:
df_discovery_ligandable = pd.read_excel('41586_2016_BFnature18002_MOESM54_ESM.xlsx', sheet_name='Probe Targets In vitro')
df_discovery_not_ligandable = pd.read_excel('41586_2016_BFnature18002_MOESM54_ESM.xlsx', sheet_name='In vitro (not probe targets)')

In [13]:
df_discovery_ligandable['ligandable'] = 'yes'

In [14]:
df_discovery_ligandablity = pd.concat([df_discovery_ligandable, df_discovery_not_ligandable])

In [15]:
df_discovery_ligandable_subset = df_discovery_ligandablity.drop(columns = ['Protein Master_pep Exp_pep', 'Drug Bank Target', 'Protein Class'
                                                                          ])

In [16]:
df_discovery_ligandable_subset = df_discovery_ligandable_subset.rename(columns = {'Identifier': 'cysteineid'})

# cysteineid, chloroacetamide, bromoacetamide, acrylamide, acetamide

In [17]:
discovery_other_cols = [
 '23_500uM_invitro_231',
 '23_500uM_invitro_ramos',
 '42_500uM_invitro_ramos'
]

In [18]:
discovery_chloro_cols = [
 '2_500uM_invitro_231',
 '2_500uM_invitro_ramos',
 '3_500uM_invitro_231',
 '3_500uM_invitro_ramos',
 '4_250uM_invitro_231',
 '4_250uM_invitro_ramos',
 '7_500uM_invitro_231',
 '7_500uM_invitro_ramos',
 '8_500uM_invitro_231',
 '8_500uM_invitro_ramos',
 '9_500uM_invitro_231',
 '9_500uM_invitro_ramos',
 '10_500uM_invitro_231',
 '10_500uM_invitro_ramos',
 '11_500uM_invitro_231',
 '11_500uM_invitro_ramos',
 '12_500uM_invitro_231',
 '12_500uM_invitro_ramos',
 '13_500uM_invitro_231',
 '13_500uM_invitro_ramos',
 '27_500uM_invitro_231',
 '20_500uM_invitro_231',
 '20_500uM_invitro_ramos',
 '21_500uM_invitro_231',
 '21_500uM_invitro_ramos',
 '22_500uM_invitro_231',
 '22_500uM_invitro_ramos',
 '25_500uM_invitro_231',
 '27_500uM_invitro_231.1',
 '27_500uM_invitro_ramos',
 '28_500uM_invitro_231',
 '28_500uM_invitro_ramos',
 '29_500uM_invitro_231',
 '29_500uM_invitro_ramos',
 '30_500uM_invitro_231',
 '30_500uM_invitro_ramos',
 '32_500uM_invitro_231',
 '32_500uM_invitro_ramos',
 '33_500uM_invitro_231',
 '33_500uM_invitro_ramos',
 '34_500uM_invitro_231',
 '34_500uM_invitro_ramos',
 '35_500uM_invitro_231',
 '35_500uM_invitro_ramos',
 '36_500uM_invitro_231',
 '39_500uM_invitro_231',
 '43_500uM_invitro_231',
 '43_500uM_invitro_ramos',
 '44_500uM_invitro_231',
 '45_500uM_invitro_231',
 '49_500uM_invitro_231',
 '50_500uM_invitro_231',
 '51_500uM_invitro_231',
 '51_500uM_invitro_ramos',
 '52_500uM_invitro_231',
 '52_500uM_invitro_ramos',
 '54_500uM_invitro_231',
 '55_500uM_invitro_231'
]

In [19]:
discovery_acryl_cols = [
 '5_500uM_invitro_231',
 '5_500uM_invitro_ramos',
 '6_500uM_invitro_231',
 '14_500uM_invitro_231',
 '14_500uM_invitro_ramos',
 '15_500uM_invitro_231',
 '15_500uM_invitro_ramos',
 '26_500uM_invitro_ramos',
 '31_500uM_invitro_231',
 '31_500uM_invitro_ramos',
 '37_500uM_invitro_ramos',
 '38_500uM_invitro_231',
 '38_500uM_invitro_ramos',
 '40_500uM_invitro_231',
 '40_500uM_invitro_ramos',
 '41_500uM_invitro_231',
 '41_500uM_invitro_ramos',
 '46_500uM_invitro_231',
 '47_500uM_invitro_231',
 '48_500uM_invitro_231',
 '53_500uM_invitro_231',
 '53_500uM_invitro_ramos',
 '56_500uM_invitro_231',
 '56_500uM_invitro_ramos'
]

In [20]:
discovery_bromo_cols = [
 '24_500uM_invitro_ramos',
]

In [21]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    
    lig_labels = []
    
    if tcell == True:
        df = df.replace('--', 0)
    
    for index, row in df.iterrows():
        liganded = False
        for i in range(len(probe_cols)):
            if row[probe_cols[i]] >= cutoff:
                liganded = True
                
        if liganded == True:
            lig_labels.append('yes')
        else:
            lig_labels.append(np.nan)
            
    return lig_labels

In [186]:
def get_ligandable_greater_category(df, probe_cols, tcell, cutoff):
    
    lig_labels = []
    
    if tcell == True:
        df = df.replace('--', 0)
    
    for index, row in df.iterrows():
        liganded = False
        for i in range(len(probe_cols)):
            if row[probe_cols[i]] > cutoff:
                liganded = True
                
        if liganded == True:
            lig_labels.append('yes')
        else:
            lig_labels.append(np.nan)
            
    return lig_labels

In [22]:
other_labels = get_ligandable_category(df_discovery_ligandable_subset, discovery_other_cols, True, 4)

In [23]:
chloro_labels = get_ligandable_category(df_discovery_ligandable_subset, discovery_chloro_cols, True, 4)

In [24]:
acryl_labels = get_ligandable_category(df_discovery_ligandable_subset, discovery_acryl_cols, True, 4)

In [25]:
bromo_labels = get_ligandable_category(df_discovery_ligandable_subset, discovery_bromo_cols, True, 4)

In [26]:
new_df_discovery_ligandability = df_discovery_ligandable_subset.copy()

In [27]:
new_df_discovery_ligandability['other'] = other_labels
new_df_discovery_ligandability['chloroacetamide'] = chloro_labels
new_df_discovery_ligandability['acrylamide'] = acryl_labels
new_df_discovery_ligandability['bromoacetamide'] = bromo_labels

In [28]:
backus_ligandable_df = new_df_discovery_ligandability.copy()

In [29]:
backus_ligandable_df.shape, len(backus_ligandable_df['cysteineid'].unique())

((6157, 97), 6157)

In [30]:
os.chdir('../')
os.chdir('results')
backus_ligandable_df.to_csv('backus_ligandable_dataset.csv', index = False)

# Read Suzuki Data

Competitive m-CSCP data, cysteine- and lysine- containing peptides labeled by two or more compounds with an MS1 ratio >3, and lysine-containing peptides labeled only by bifunctional compounds 15−18, 21, or 22 with an MS1 ratio >3

In [374]:
os.chdir(cd)
os.chdir('Suzuki')
os.chdir('5275142')

In [375]:
df_suzuki_ligandability = pd.read_excel('ac0c04726_si_003.xlsx', sheet_name='Liganded Cysteines')

In [377]:
df_suzuki_ligandability_subset = df_suzuki_ligandability[df_suzuki_ligandability['Proteins'].str.count(',') == 0]

In [379]:
df_suzuki_ligandability_subset.shape

(876, 14)

In [380]:
df_suzuki_ligandability_subset = df_suzuki_ligandability_subset[df_suzuki_ligandability_subset['Cysteine Labeled'].str.count(';') == 0]

In [381]:
df_suzuki_ligandability_subset.shape

(876, 14)

In [383]:
df_suzuki_ligandability_subset['cysteineid'] = df_suzuki_ligandability_subset['Proteins'] + '_' + df_suzuki_ligandability_subset['Cysteine Labeled']
df_suzuki_ligandability_subset = df_suzuki_ligandability_subset.rename(columns = {'Proteins': 'proteinid'})

In [386]:
df_suzuki_ligandability_subset

Unnamed: 0,Sequence,Cysteine Labeled,proteinid,Gene Name,15 Mean Ratio,16 Mean Ratio,17 Mean Ratio,18 Mean Ratio,19 Mean Ratio,20 Mean Ratio,21 Mean Ratio,22 Mean Ratio,KB3 Mean Ratio,KB14 Mean Ratio,cysteineid
0,R.NADC*SSGPGQR.V,C101,Q9BSD7,NTPCR,7.291606,11.653879,12.518661,11.962014,2.083666,9.586003,11.162382,4.156949,9.380445,15.738400,Q9BSD7_C101
1,R.LVVPATQC*GSLIGK.G,C109,Q15365,PCBP1,7.152586,4.007957,10.572970,7.172616,1.538675,11.547644,10.516507,11.687082,11.383731,6.238604,Q15365_C109
2,K.LNISFPATGC*QK.L,C12,P62753,RPS6,5.514663,4.632175,7.581202,5.240268,1.540174,9.495018,9.565758,9.083388,19.455253,5.389722,P62753_C12
3,R.HGFC*GIPITDTGR.M,C140,P12268,IMPDH2,8.943307,9.903237,9.241183,9.260288,1.601289,4.480943,5.317926,4.190588,3.530440,8.011889,P12268_C140
4,K.VC*NFLASQVPFPSR.L,C214,Q99714,HSD17B10,14.517019,6.688661,16.602403,3.115911,0.970330,20.000000,14.086372,18.099789,5.607376,6.411274,Q99714_C214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,K.C*PNLTYLNLSGNK.I,C87,Q9BTT0,ANP32E,2.562271,1.266308,2.304519,1.171669,1.216497,3.422084,2.125810,2.368503,0.974845,3.886560,Q9BTT0_C87
886,K.NLETPLC*K.N,C92,P54819,AK2,2.389350,1.673570,2.127241,1.505026,1.097580,4.868312,1.459994,2.095577,2.406276,4.187669,P54819_C92
887,R.C*QHLQAEK.K,C931,P35579,MYH9,2.615336,2.273709,2.447052,3.277421,1.526949,3.002799,2.019259,2.809188,1.505132,2.460907,P35579_C931
888,R.QVC*PLDNR.E,C94,P62877,RBX1,1.428531,1.894191,1.725322,2.084840,5.323070,1.465588,1.523959,1.204224,2.760544,7.258142,P62877_C94


In [390]:
df_suzuki_ligandability_merged = df_suzuki_ligandability_subset.copy()

## cysteineid, chloroacetamide, acrylamide, acetamide

In [396]:
suzuki_cols = df_suzuki_ligandability_merged.columns.to_list()[4:-1]

In [399]:
for i in range(len(cols)):
    df_suzuki_ligandability_merged = df_suzuki_ligandability_merged.rename(columns = {cols[i] : cols[i] + ' ' + 'hek293t'})

In [401]:
suzuki_other_cols = ['19 Mean Ratio hek293t', '22 Mean Ratio hek293t']
suzuki_chloro_cols = ['16 Mean Ratio hek293t', '18 Mean Ratio hek293t', '21 Mean Ratio hek293t', 'KB3 Mean Ratio hek293t']
suzuki_acryl_cols = ['15 Mean Ratio hek293t', '17 Mean Ratio hek293t', '20 Mean Ratio hek293t', 'KB14 Mean Ratio hek293t']

In [403]:
other_labels = get_ligandable_category(df_suzuki_ligandability_merged, suzuki_other_cols, False, 3)

In [404]:
chloro_labels = get_ligandable_category(df_suzuki_ligandability_merged, suzuki_chloro_cols, False, 3)

In [405]:
acryl_labels = get_ligandable_category(df_suzuki_ligandability_merged, suzuki_acryl_cols, False, 3)

In [406]:
new_df_suzuki_ligandability = df_suzuki_ligandability_merged.copy()

In [407]:
new_df_suzuki_ligandability['other'] = other_labels
new_df_suzuki_ligandability['chloroacetamide'] = chloro_labels
new_df_suzuki_ligandability['acrylamide'] = acryl_labels

In [408]:
df_suzuki_compound_list = new_df_suzuki_ligandability.columns.to_list()[2:-3]

In [817]:
df_suzuki_not_ligandable, df_suzuki_ligandable = get_ligandability(new_df_suzuki_ligandability, df_suzuki_compound_list, 2)

In [813]:
cao_ligandable_df = pd.concat([df_suzuki_not_ligandable, df_suzuki_ligandable])

In [814]:
cao_ligandable_df = cao_ligandable_df.drop(columns = ['Sequence', 'Cysteine Labeled', 'Gene Name'])

In [417]:
os.chdir('../../')
os.chdir('results')
cao_ligandable_df.to_csv('cao_ligandable_dataset.csv', index = False)

# Read T-cell Data

active compounds in
T cells by ABPP. In these experiments, we set a slightly lower
threshold for liganded cysteines (R >= 4 versus R >= 5 for scout
fragments), because treatments with elaborated electrophilic
compounds were performed in situ at much lower concentrations

In [57]:
os.chdir(cd)
os.chdir('Tcell')

## Read ligandability data

In [58]:
df_tcell_ligandablility = pd.read_excel('NIHMS1616434-supplement-mmc4.xlsx', sheet_name='Table S6_Master Table', header = [5])

  warn(msg)


In [258]:
df_tcell_ligandablility_merged_split = df_tcell_ligandablility[df_tcell_ligandablility['Residues'].str.count(',') == 0]

In [259]:
df_tcell_ligandablility_merged_split['cysteineid'] = df_tcell_ligandablility_merged_split['Uniprot'] + '_' + df_tcell_ligandablility_merged_split['Residues'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tcell_ligandablility_merged_split['cysteineid'] = df_tcell_ligandablility_merged_split['Uniprot'] + '_' + df_tcell_ligandablility_merged_split['Residues'].str.strip()


In [260]:
df_tcell_ligandablility_merged_split = df_tcell_ligandablility_merged_split.rename(columns = {
    'control.1': 'control_1_KB02_TMT',
    'control.2': 'control_2_KB05_TMT',
    'control.3': 'control_3_KB02_TMT',
    'control.4': 'control_4_KB05_TMT',
    'control.5': 'control_5_KB02_isoTOP',
    'control.6': 'control_6_KB05_isoTOP',
    'control.7': 'control_7_KB02_isoTOP',
    'control.8': 'control_8_KB05_isoTOP'
})

In [261]:
tcell_cols = df_tcell_ligandablility_merged_split.columns.to_list()[10:-1]

In [262]:
for i in range(len(tcell_cols)):
    df_tcell_ligandablility_merged_split = df_tcell_ligandablility_merged_split.rename(columns = {tcell_cols[i]: tcell_cols[i].replace(' μM', 'uM').replace(' ', '_') + '_' + 'tcell' })

## cysteineid, chloroacetamide, acrylamide, dmf

In [265]:
tcell_chloro_active_cols = [
 'EV-3_tcell',
 'EV-93_tcell'
]

In [266]:
tcell_chloro_scout_cols = [
 'control_1_KB02_TMT_tcell',
 'control_3_KB02_TMT_tcell',
 'control_5_KB02_isoTOP_tcell',
 'control_7_KB02_isoTOP_tcell'
]

In [267]:
tcell_chloro_cols = tcell_chloro_active_cols + tcell_chloro_scout_cols

In [268]:
tcell_acryl_active_cols = [
 'BPK-21_tcell',
 'BPK-25_tcell',
 'EV-98_5uM_tcell',
 'EV-98_20uM_tcell',
 'EV-99_5uM_tcell',
 'EV-99_20uM_tcell',
 'EV-96_5uM_tcell',
 'EV-96_20uM_tcell',
 'EV-97_5uM_tcell',
 'ev-97_20uM_tcell'
]

In [269]:
tcell_acryl_scout_cols = [
 'control_2_KB05_TMT_tcell',
 'control_4_KB05_TMT_tcell',
 'control_6_KB05_isoTOP_tcell',
 'control_8_KB05_isoTOP_tcell'

In [270]:
tcell_acryl_cols = tcell_acryl_active_cols + tcell_acryl_scout_cols

In [271]:
tcell_dmf_cols = [
 'DMF_tcell'
]

In [272]:
chloro_active_labels = get_ligandable_category(df_tcell_ligandablility_merged_split, tcell_chloro_active_cols, False, 4)

In [273]:
chloro_scout_labels = get_ligandable_category(df_tcell_ligandablility_merged_split, tcell_chloro_scout_cols, False, 5)

In [274]:
acryl_active_labels = get_ligandable_category(df_tcell_ligandablility_merged_split, tcell_acryl_active_cols, False, 4)

In [275]:
acryl_scout_labels = get_ligandable_category(df_tcell_ligandablility_merged_split, tcell_acryl_scout_cols, False, 5)

In [276]:
dmf_labels = get_ligandable_category(df_tcell_ligandablility_merged_split, tcell_dmf_cols, False, 4)

In [277]:
new_df_tcell_ligandability = df_tcell_ligandablility_merged_split.copy()

In [278]:
new_df_tcell_ligandability['chloroacetamide_active'] = chloro_active_labels
new_df_tcell_ligandability['acrylamide_active'] = acryl_active_labels
new_df_tcell_ligandability['chloroacetamide_scout'] = chloro_scout_labels
new_df_tcell_ligandability['acrylamide_scout'] = acryl_scout_labels
new_df_tcell_ligandability['dmf'] = dmf_labels

In [279]:
new_df_tcell_ligandability['chloroacetamide'] = np.where((new_df_tcell_ligandability['chloroacetamide_active'] == 'yes') | (new_df_tcell_ligandability['chloroacetamide_scout'] == 'yes'), 'yes', None)

In [280]:
new_df_tcell_ligandability['acrylamide'] = np.where((new_df_tcell_ligandability['acrylamide_active'] == 'yes') | (new_df_tcell_ligandability['acrylamide_scout'] == 'yes'), 'yes', None)

In [281]:
new_df_tcell_ligandability['dmf'] = dmf_labels

In [284]:
new_df_tcell_ligandability['ligandable'] = np.where((new_df_tcell_ligandability['Max'] >= 4) | (new_df_tcell_ligandability['Max.1_tcell'] >= 5), 'yes', np.nan)

In [285]:
df_tcell_ligandability_merged = new_df_tcell_ligandability.copy()

In [286]:
df_tcell_ligandability_merged.columns.to_list()

['Identifier',
 'Uniprot',
 'Description',
 'Residues',
 'OMIM?',
 'immune-enriched?',
 'target?',
 'activated',
 'control',
 'Max',
 'DMF_tcell',
 'EV-3_tcell',
 'BPK-21_tcell',
 'BPK-25_tcell',
 'EV-93_tcell',
 'EV-98_5uM_tcell',
 'EV-98_20uM_tcell',
 'EV-99_5uM_tcell',
 'EV-99_20uM_tcell',
 'EV-96_5uM_tcell',
 'EV-96_20uM_tcell',
 'EV-97_5uM_tcell',
 'ev-97_20uM_tcell',
 'Max.1_tcell',
 'control_1_KB02_TMT_tcell',
 'activated.1_tcell',
 'control_2_KB05_TMT_tcell',
 'activated.2_tcell',
 'control_3_KB02_TMT_tcell',
 'activated.3_tcell',
 'control_4_KB05_TMT_tcell',
 'activated.4_tcell',
 'control_5_KB02_isoTOP_tcell',
 'activated.5_tcell',
 'control_6_KB05_isoTOP_tcell',
 'activated.6_tcell',
 'control_7_KB02_isoTOP_tcell',
 'activated.7_tcell',
 'control_8_KB05_isoTOP_tcell',
 'activated.8_tcell',
 'cysteineid',
 'chloroacetamide_active',
 'acrylamide_active',
 'chloroacetamide_scout',
 'acrylamide_scout',
 'dmf',
 'chloroacetamide',
 'acrylamide',
 'ligandable']

In [287]:
df_tcell_ligandability_merged = df_tcell_ligandability_merged.drop(columns =  ['Description',
 'Residues',
 'OMIM?',
 'immune-enriched?',
 'target?',
 'activated',
 'control',
 'Max', 
 'Residues'])

In [288]:
df_tcell_ligandability_merged = df_tcell_ligandability_merged.rename(columns = {'Uniprot': 'proteinid'})

In [289]:
df_tcell_ligandability_merged.columns.to_list()

['Identifier',
 'proteinid',
 'DMF_tcell',
 'EV-3_tcell',
 'BPK-21_tcell',
 'BPK-25_tcell',
 'EV-93_tcell',
 'EV-98_5uM_tcell',
 'EV-98_20uM_tcell',
 'EV-99_5uM_tcell',
 'EV-99_20uM_tcell',
 'EV-96_5uM_tcell',
 'EV-96_20uM_tcell',
 'EV-97_5uM_tcell',
 'ev-97_20uM_tcell',
 'Max.1_tcell',
 'control_1_KB02_TMT_tcell',
 'activated.1_tcell',
 'control_2_KB05_TMT_tcell',
 'activated.2_tcell',
 'control_3_KB02_TMT_tcell',
 'activated.3_tcell',
 'control_4_KB05_TMT_tcell',
 'activated.4_tcell',
 'control_5_KB02_isoTOP_tcell',
 'activated.5_tcell',
 'control_6_KB05_isoTOP_tcell',
 'activated.6_tcell',
 'control_7_KB02_isoTOP_tcell',
 'activated.7_tcell',
 'control_8_KB05_isoTOP_tcell',
 'activated.8_tcell',
 'cysteineid',
 'chloroacetamide_active',
 'acrylamide_active',
 'chloroacetamide_scout',
 'acrylamide_scout',
 'dmf',
 'chloroacetamide',
 'acrylamide',
 'ligandable']

In [290]:
vinogradova_ligandable_df = df_tcell_ligandability_merged[[
 'proteinid',
 'cysteineid',
 'DMF_tcell',
 'EV-3_tcell',
 'BPK-21_tcell',
 'BPK-25_tcell',
 'EV-93_tcell',
 'EV-98_5uM_tcell',
 'EV-98_20uM_tcell',
 'EV-99_5uM_tcell',
 'EV-99_20uM_tcell',
 'EV-96_5uM_tcell',
 'EV-96_20uM_tcell',
 'EV-97_5uM_tcell',
 'ev-97_20uM_tcell',
#  'Max.1_tcell',
 'control_1_KB02_TMT_tcell',
#  'activated.1_tcell',
 'control_2_KB05_TMT_tcell',
#  'activated.2_tcell',
 'control_3_KB02_TMT_tcell',
#  'activated.3_tcell',
 'control_4_KB05_TMT_tcell',
#  'activated.4_tcell',
 'control_5_KB02_isoTOP_tcell',
#  'activated.5_tcell',
 'control_6_KB05_isoTOP_tcell',
#  'activated.6_tcell',
 'control_7_KB02_isoTOP_tcell',
#  'activated.7_tcell',
 'control_8_KB05_isoTOP_tcell',
#  'activated.8_tcell',
 'chloroacetamide',
 'acrylamide',
 'dmf',
 'ligandable'
]]

In [291]:
tcell_dup_ids = get_dup_ids(vinogradova_ligandable_df)

In [292]:
vinogradova_ligandable_df['duplicate'] = np.where(vinogradova_ligandable_df['cysteineid'].isin(tcell_dup_ids), 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vinogradova_ligandable_df['duplicate'] = np.where(vinogradova_ligandable_df['cysteineid'].isin(tcell_dup_ids), 1, 0)


In [293]:
nodups_tcell = vinogradova_ligandable_df[vinogradova_ligandable_df['duplicate'] == 0]

In [294]:
nodups_tcell.shape, len(nodups_tcell['cysteineid'].unique())

((19586, 28), 19586)

In [295]:
os.chdir('../')
os.chdir('results')
nodups_tcell.to_csv('vinogradova_ligandable_dataset.csv', index = False)

24081 -23920 = 161

Number of duplicate cysteines.

# Read SP3 Data

MS1 chromatographic area
ratios (R>4), indicative of labeling

In [296]:
os.chdir('../')

In [297]:
os.chdir('SP3')

In [298]:
df_sp3 = pd.read_excel("cbic202000870-sup-0001-table_s6_modified.xlsx")

In [299]:
df_sp3 = df_sp3[df_sp3["Protein"].str.contains('contaminant') == False]

In [300]:
df_sp3.shape

(7154, 12)

In [301]:
df_sp3 = df_sp3[df_sp3['Protein'].str.count(',') == 0]

In [302]:
df_sp3 = df_sp3[df_sp3['PTM'].str.count(',') == 0]

In [303]:
df_sp3.shape

(6873, 12)

In [305]:
df_sp3 = df_sp3.drop_duplicates()

In [306]:
df_sp3 = df_sp3.rename(columns = {'Protein': 'proteinid'})
df_sp3 = df_sp3.rename(columns = {'PTM': 'resid'})

In [307]:
df_sp3['cysteineid'] = df_sp3['proteinid'] + '_' + df_sp3['resid']

In [308]:
df_sp3 = df_sp3.rename(columns = {'Mean Ratio': 'Mean Ratio jurkat'})

In [310]:
sp3_chloro_cols = ['Mean Ratio jurkat']

In [311]:
chloro_labels = get_ligandable_greater_category(df_sp3, sp3_chloro_cols, False, 4)

In [312]:
new_df_sp3_ligandability = df_sp3.copy()

In [313]:
new_df_sp3_ligandability['chloroacetamide'] = chloro_labels

In [316]:
new_df_sp3_ligandability['ligandable'] = np.where(new_df_sp3_ligandability['Mean Ratio jurkat'] > 4, 'yes', np.nan)

In [317]:
os.chdir('../')
os.chdir('results')
new_df_sp3_ligandability.to_csv('yan_ligandable_dataset.csv', index = False)

# Read SLCABPP Data

Scout Ligandable cysteine sites were defined as 
those showing ≥75% reduction in abundance (CR≥4)

ligandable (CR ≥ 4) 
cysteines per fragment

In [13]:
os.chdir(cd)
os.chdir('SLCABPP')
os.chdir('NIHMS1660575-supplement-9_supplementary_tables')

In [15]:
# HCT116
df_scout = pd.read_excel('41587_2020_778_S4_ESM.xlsx', sheet_name='Scout Profiling')

In [16]:
df_hct = pd.read_excel('41587_2020_778_S6_ESM.xlsx', sheet_name='SLC-ABPP HCT116 Screen')
df_hek = pd.read_excel('41587_2020_778_S7_ESM.xlsx', sheet_name='SLC-ABPP HEK293T SCreen')
df_patu = pd.read_excel('41587_2020_778_S8_ESM.xlsx', sheet_name='SLC-ABPP PaTu-8988T Screen')

In [17]:
df_scout['proteinid'] = df_scout['Uniprot ID'].map(lambda x: str(x).split('|')[1].strip())

In [20]:
df_scout['cysteineid'] = df_scout['proteinid'] + '_C' + df_scout['Site Position'].astype(str)

In [22]:
df_scout['cysteineid'] = df_scout['cysteineid'].map(lambda x: str(x).replace(' ', '').strip())

In [23]:
df_scout = df_scout.rename(columns = {'Comp ratio (KBO2)': 'Comp ratio (KBO2) hct116',
                                      'Comp ratio (KBO3)': 'Comp ratio (KBO3) hct116',
                                      'Comp ratio (KBO5)': 'Comp ratio (KBO5) hct116'})

In [24]:
slc_chloro_cols = ['Comp ratio (KBO2) hct116', 'Comp ratio (KBO3) hct116']

In [25]:
slc_acryl_cols = ['Comp ratio (KBO5) hct116']

In [35]:
chloro_labels = get_ligandable_category(df_scout, slc_chloro_cols, False, 4)

In [36]:
acryl_labels = get_ligandable_category(df_scout, slc_acryl_cols, False, 4)

In [37]:
new_df_scout_ligandability = df_scout[['proteinid', 'cysteineid', 
                                       'Comp ratio (KBO2) hct116', 
                                       'Comp ratio (KBO3) hct116', 
                                       'Comp ratio (KBO5) hct116']]

In [38]:
new_df_scout_ligandability['chloroacetamide'] = chloro_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_scout_ligandability['chloroacetamide'] = chloro_labels


In [39]:
new_df_scout_ligandability['acrylamide'] = acryl_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_scout_ligandability['acrylamide'] = acryl_labels


In [40]:
scout_compound_list = new_df_scout_ligandability.columns.to_list()[2:-2]

In [41]:
df_scout_not_ligandable, df_scout_ligandable = get_ligandability(new_df_scout_ligandability, scout_compound_list, 4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ligandable'] = ligandable


In [42]:
new_kuljanin_scout_ligandability_df = pd.concat([df_scout_ligandable, df_scout_not_ligandable])

In [43]:
new_kuljanin_scout_ligandability_df.to_csv('kuljanin_scout_ligandable_dataset.csv', index = False)

In [44]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    
    lig_labels = []
    
    if tcell == True:
        df = df.replace('--', 0)
    else:
        df = df.replace(np.nan, 0)
    
    for index, row in df.iterrows():
        liganded = False
        for i in range(len(probe_cols)):
            if (type(row[probe_cols[i]]) == str):
                print(row[probe_cols[i]])
                print(probe_cols[i])
                
            if row[probe_cols[i]] >= cutoff:
                liganded = True
                
        if liganded == True:
            lig_labels.append('yes')
        else:
            lig_labels.append(None)
            
    return lig_labels

In [45]:
def get_full_ligandability(df):
    df['proteinid'] = df['Uniprot ID'].map(lambda x: str(x).split('|')[1])
    df['cysteineid'] = df['proteinid'] + '_C' + df['Site Position'].astype(str)
    
    chloro_cols = df.columns.to_list()[5:133]
    acryl_cols = df.columns.to_list()[134:-2]
    
    chloro_labels = get_ligandable_category(df, chloro_cols, False, 4)
    acryl_labels = get_ligandable_category(df, acryl_cols, False, 4)
    
    new_df_ligandability = df.copy()
    new_df_ligandability = new_df_ligandability.drop(columns = ['Uniprot ID',
                             'Gene Symbol',
                             'Peptide Sequence',
                             'Site Position',
                             'Gene + Site'])
    
    new_df_ligandability['chloroacetamide'] = chloro_labels
    new_df_ligandability['acrylamide'] = acryl_labels
    
    return new_df_ligandability

In [46]:
new_hct_df_ligandability = get_full_ligandability(df_hct)

In [47]:
new_hek_df_ligandability = get_full_ligandability(df_hek)

In [48]:
new_patu_df_ligandability = get_full_ligandability(df_patu)

In [49]:
new_patu_df_ligandability

Unnamed: 0,CL1,CL2,CL3,CL4,CL5,CL6,CL7,CL8,CL9,CL10,...,AC152,AC153,AC154,AC155,AC156,AC157,proteinid,cysteineid,chloroacetamide,acrylamide
0,20.000,20.00,8.400,20.000,20.000,19.400,12.600,7.670,20.000,20.000,...,1.130,0.999,1.050,0.829,1.060,0.475,O95881,O95881_C66,yes,yes
1,15.400,2.84,20.000,20.000,1.420,20.000,8.960,20.000,9.390,20.000,...,1.300,1.800,1.950,1.690,1.650,1.580,P16455,P16455_C145,yes,
2,9.850,6.89,18.800,12.100,1.890,1.560,12.200,4.040,9.780,20.000,...,1.710,2.640,1.680,1.380,1.790,1.750,P78417,P78417_C32,yes,yes
3,11.300,2.49,1.790,18.600,1.350,1.970,2.070,20.000,8.520,20.000,...,1.440,1.910,3.270,2.070,2.020,2.140,Q13490,Q13490_C45,yes,yes
4,9.170,1.42,12.300,8.450,1.500,11.000,3.460,11.600,9.760,15.500,...,0.800,0.923,1.190,0.678,1.040,1.390,O95197-6,O95197-6_C42,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20264,1.160,1.04,1.050,1.060,1.250,1.020,1.290,1.150,1.000,3.180,...,1.430,1.690,1.660,1.350,1.400,1.590,Q14202,Q14202_C636,,
20265,0.946,1.19,0.821,0.989,0.947,0.971,0.993,0.723,0.869,0.770,...,0.672,0.810,0.781,0.860,0.699,0.858,Q9Y4E5,Q9Y4E5_C940,,
20266,0.961,0.79,1.000,1.080,0.811,0.816,0.884,0.894,0.981,0.912,...,1.020,0.868,0.810,0.944,0.925,0.948,Q6ZN55-2,Q6ZN55-2_C483,,
20267,0.961,0.79,1.000,1.080,0.811,0.816,0.884,0.894,0.981,0.912,...,1.020,0.868,0.810,0.944,0.925,0.948,Q6ZN55-2,Q6ZN55-2_C485,,


In [50]:
cell_line_compound_list = new_patu_df_ligandability.columns.to_list()
cell_line_compound_list = cell_line_compound_list[:-4]

In [52]:
def rename_slcabpp_cols(df, cell_line_compound_list, cell_line):
    for i in range(len(cell_line_compound_list)):
        df = df.rename(columns = {cell_line_compound_list[i]: cell_line_compound_list[i] + '_' + cell_line})
        
    return df

In [53]:
new_hct_df_ligandability = rename_slcabpp_cols(new_hct_df_ligandability, cell_line_compound_list, 'hct116')

In [54]:
new_hek_df_ligandability = rename_slcabpp_cols(new_hek_df_ligandability, cell_line_compound_list, 'hek293t')

In [55]:
new_patu_df_ligandability = rename_slcabpp_cols(new_patu_df_ligandability, cell_line_compound_list, 'patu-8988t')

In [56]:
hct_compound_list = [
 'CL1_hct116',
 'CL2_hct116',
 'CL3_hct116',
 'CL4_hct116',
 'CL5_hct116',
 'CL6_hct116',
 'CL7_hct116',
 'CL8_hct116',
 'CL9_hct116',
 'CL10_hct116',
 'CL11_hct116',
 'CL12_hct116',
 'CL13_hct116',
 'CL14_hct116',
 'CL15_hct116',
 'CL16_hct116',
 'CL17_hct116',
 'CL18_hct116',
 'CL19_hct116',
 'CL20_hct116',
 'CL21_hct116',
 'CL22_hct116',
 'CL23_hct116',
 'CL24_hct116',
 'CL25_hct116',
 'CL26_hct116',
 'CL27_hct116',
 'CL28_hct116',
 'CL29_hct116',
 'CL30_hct116',
 'CL31_hct116',
 'CL32_hct116',
 'CL33_hct116',
 'CL34_hct116',
 'CL35_hct116',
 'CL36_hct116',
 'CL37_hct116',
 'CL38_hct116',
 'CL39_hct116',
 'CL40_hct116',
 'CL41_hct116',
 'CL42_hct116',
 'CL43_hct116',
 'CL44_hct116',
 'CL45_hct116',
 'CL46_hct116',
 'CL47_hct116',
 'CL48_hct116',
 'CL49_hct116',
 'CL50_hct116',
 'CL51_hct116',
 'CL52_hct116',
 'CL53_hct116',
 'CL54_hct116',
 'CL55_hct116',
 'CL56_hct116',
 'CL57_hct116',
 'CL58_hct116',
 'CL59_hct116',
 'CL60_hct116',
 'CL61_hct116',
 'CL62_hct116',
 'CL63_hct116',
 'CL64_hct116',
 'CL65_hct116',
 'CL66_hct116',
 'CL67_hct116',
 'CL68_hct116',
 'CL69_hct116',
 'CL70_hct116',
 'CL71_hct116',
 'CL72_hct116',
 'CL73_hct116',
 'CL74_hct116',
 'CL75_hct116',
 'CL76_hct116',
 'CL77_hct116',
 'CL78_hct116',
 'CL79_hct116',
 'CL80_hct116',
 'CL81_hct116',
 'CL82_hct116',
 'CL83_hct116',
 'CL84_hct116',
 'CL85_hct116',
 'CL86_hct116',
 'CL87_hct116',
 'CL88_hct116',
 'CL89_hct116',
 'CL90_hct116',
 'CL91_hct116',
 'CL92_hct116',
 'CL93_hct116',
 'CL94_hct116',
 'CL95_hct116',
 'CL96_hct116',
 'CL97_hct116',
 'CL98_hct116',
 'CL99_hct116',
 'CL100_hct116',
 'CL101_hct116',
 'CL102_hct116',
 'CL103_hct116',
 'CL104_hct116',
 'CL105_hct116',
 'CL106_hct116',
 'CL107_hct116',
 'CL108_hct116',
 'CL109_hct116',
 'CL110_hct116',
 'CL111_hct116',
 'CL112_hct116',
 'CL113_hct116',
 'CL114_hct116',
 'CL115_hct116',
 'CL116_hct116',
 'CL117_hct116',
 'CL118_hct116',
 'CL119_hct116',
 'CL120_hct116',
 'CL121_hct116',
 'CL122_hct116',
 'CL123_hct116',
 'CL124_hct116',
 'CL125_hct116',
 'CL126_hct116',
 'CL127_hct116',
 'CL128_hct116',
 'AC1_hct116',
 'AC2_hct116',
 'AC3_hct116',
 'AC4_hct116',
 'AC5_hct116',
 'AC6_hct116',
 'AC7_hct116',
 'AC8_hct116',
 'AC9_hct116',
 'AC10_hct116',
 'AC11_hct116',
 'AC12_hct116',
 'AC13_hct116',
 'AC14_hct116',
 'AC15_hct116',
 'AC16_hct116',
 'AC17_hct116',
 'AC18_hct116',
 'AC19_hct116',
 'AC20_hct116',
 'AC21_hct116',
 'AC22_hct116',
 'AC23_hct116',
 'AC24_hct116',
 'AC25_hct116',
 'AC26_hct116',
 'AC27_hct116',
 'AC28_hct116',
 'AC29_hct116',
 'AC30_hct116',
 'AC31_hct116',
 'AC32_hct116',
 'AC33_hct116',
 'AC34_hct116',
 'AC35_hct116',
 'AC36_hct116',
 'AC37_hct116',
 'AC38_hct116',
 'AC39_hct116',
 'AC40_hct116',
 'AC41_hct116',
 'AC42_hct116',
 'AC43_hct116',
 'AC44_hct116',
 'AC45_hct116',
 'AC46_hct116',
 'AC47_hct116',
 'AC48_hct116',
 'AC49_hct116',
 'AC50_hct116',
 'AC51_hct116',
 'AC52_hct116',
 'AC53_hct116',
 'AC54_hct116',
 'AC55_hct116',
 'AC56_hct116',
 'AC57_hct116',
 'AC58_hct116',
 'AC59_hct116',
 'AC60_hct116',
 'AC61_hct116',
 'AC62_hct116',
 'AC63_hct116',
 'AC64_hct116',
 'AC65_hct116',
 'AC66_hct116',
 'AC67_hct116',
 'AC68_hct116',
 'AC69_hct116',
 'AC70_hct116',
 'AC71_hct116',
 'AC72_hct116',
 'AC73_hct116',
 'AC74_hct116',
 'AC75_hct116',
 'AC76_hct116',
 'AC77_hct116',
 'AC78_hct116',
 'AC79_hct116',
 'AC80_hct116',
 'AC81_hct116',
 'AC82_hct116',
 'AC83_hct116',
 'AC84_hct116',
 'AC85_hct116',
 'AC86_hct116',
 'AC87_hct116',
 'AC88_hct116',
 'AC89_hct116',
 'AC90_hct116',
 'AC91_hct116',
 'AC92_hct116',
 'AC93_hct116',
 'AC94_hct116',
 'AC95_hct116',
 'AC96_hct116',
 'AC97_hct116',
 'AC98_hct116',
 'AC99_hct116',
 'AC100_hct116',
 'AC101_hct116',
 'AC102_hct116',
 'AC103_hct116',
 'AC104_hct116',
 'AC105_hct116',
 'AC106_hct116',
 'AC107_hct116',
 'AC108_hct116',
 'AC109_hct116',
 'AC110_hct116',
 'AC111_hct116',
 'AC112_hct116',
 'AC113_hct116',
 'AC114_hct116',
 'AC115_hct116',
 'AC116_hct116',
 'AC117_hct116',
 'AC118_hct116',
 'AC119_hct116',
 'AC120_hct116',
 'AC121_hct116',
 'AC122_hct116',
 'AC123_hct116',
 'AC124_hct116',
 'AC125_hct116',
 'AC126_hct116',
 'AC127_hct116',
 'AC128_hct116',
 'AC129_hct116',
 'AC130_hct116',
 'AC131_hct116',
 'AC132_hct116',
 'AC133_hct116',
 'AC134_hct116',
 'AC135_hct116',
 'AC136_hct116',
 'AC137_hct116',
 'AC138_hct116',
 'AC139_hct116',
 'AC140_hct116',
 'AC141_hct116',
 'AC142_hct116',
 'AC143_hct116',
 'AC144_hct116',
 'AC145_hct116',
 'AC146_hct116',
 'AC147_hct116',
 'AC148_hct116',
 'AC149_hct116',
 'AC150_hct116',
 'AC151_hct116',
 'AC152_hct116',
 'AC153_hct116',
 'AC154_hct116',
 'AC155_hct116',
 'AC156_hct116',
 'AC157_hct116'
]

In [57]:
new_hek_df_ligandability.columns.to_list()

['CL1_hek293t',
 'CL2_hek293t',
 'CL3_hek293t',
 'CL4_hek293t',
 'CL5_hek293t',
 'CL6_hek293t',
 'CL7_hek293t',
 'CL8_hek293t',
 'CL9_hek293t',
 'CL10_hek293t',
 'CL11_hek293t',
 'CL12_hek293t',
 'CL13_hek293t',
 'CL14_hek293t',
 'CL15_hek293t',
 'CL16_hek293t',
 'CL17_hek293t',
 'CL18_hek293t',
 'CL19_hek293t',
 'CL20_hek293t',
 'CL21_hek293t',
 'CL22_hek293t',
 'CL23_hek293t',
 'CL24_hek293t',
 'CL25_hek293t',
 'CL26_hek293t',
 'CL27_hek293t',
 'CL28_hek293t',
 'CL29_hek293t',
 'CL30_hek293t',
 'CL31_hek293t',
 'CL32_hek293t',
 'CL33_hek293t',
 'CL34_hek293t',
 'CL35_hek293t',
 'CL36_hek293t',
 'CL37_hek293t',
 'CL38_hek293t',
 'CL39_hek293t',
 'CL40_hek293t',
 'CL41_hek293t',
 'CL42_hek293t',
 'CL43_hek293t',
 'CL44_hek293t',
 'CL45_hek293t',
 'CL46_hek293t',
 'CL47_hek293t',
 'CL48_hek293t',
 'CL49_hek293t',
 'CL50_hek293t',
 'CL51_hek293t',
 'CL52_hek293t',
 'CL53_hek293t',
 'CL54_hek293t',
 'CL55_hek293t',
 'CL56_hek293t',
 'CL57_hek293t',
 'CL58_hek293t',
 'CL59_hek293t',
 'CL60

In [58]:
hek_compound_list = [
'CL1_hek293t',
 'CL2_hek293t',
 'CL3_hek293t',
 'CL4_hek293t',
 'CL5_hek293t',
 'CL6_hek293t',
 'CL7_hek293t',
 'CL8_hek293t',
 'CL9_hek293t',
 'CL10_hek293t',
 'CL11_hek293t',
 'CL12_hek293t',
 'CL13_hek293t',
 'CL14_hek293t',
 'CL15_hek293t',
 'CL16_hek293t',
 'CL17_hek293t',
 'CL18_hek293t',
 'CL19_hek293t',
 'CL20_hek293t',
 'CL21_hek293t',
 'CL22_hek293t',
 'CL23_hek293t',
 'CL24_hek293t',
 'CL25_hek293t',
 'CL26_hek293t',
 'CL27_hek293t',
 'CL28_hek293t',
 'CL29_hek293t',
 'CL30_hek293t',
 'CL31_hek293t',
 'CL32_hek293t',
 'CL33_hek293t',
 'CL34_hek293t',
 'CL35_hek293t',
 'CL36_hek293t',
 'CL37_hek293t',
 'CL38_hek293t',
 'CL39_hek293t',
 'CL40_hek293t',
 'CL41_hek293t',
 'CL42_hek293t',
 'CL43_hek293t',
 'CL44_hek293t',
 'CL45_hek293t',
 'CL46_hek293t',
 'CL47_hek293t',
 'CL48_hek293t',
 'CL49_hek293t',
 'CL50_hek293t',
 'CL51_hek293t',
 'CL52_hek293t',
 'CL53_hek293t',
 'CL54_hek293t',
 'CL55_hek293t',
 'CL56_hek293t',
 'CL57_hek293t',
 'CL58_hek293t',
 'CL59_hek293t',
 'CL60_hek293t',
 'CL61_hek293t',
 'CL62_hek293t',
 'CL63_hek293t',
 'CL64_hek293t',
 'CL65_hek293t',
 'CL66_hek293t',
 'CL67_hek293t',
 'CL68_hek293t',
 'CL69_hek293t',
 'CL70_hek293t',
 'CL71_hek293t',
 'CL72_hek293t',
 'CL73_hek293t',
 'CL74_hek293t',
 'CL75_hek293t',
 'CL76_hek293t',
 'CL77_hek293t',
 'CL78_hek293t',
 'CL79_hek293t',
 'CL80_hek293t',
 'CL81_hek293t',
 'CL82_hek293t',
 'CL83_hek293t',
 'CL84_hek293t',
 'CL85_hek293t',
 'CL86_hek293t',
 'CL87_hek293t',
 'CL88_hek293t',
 'CL89_hek293t',
 'CL90_hek293t',
 'CL91_hek293t',
 'CL92_hek293t',
 'CL93_hek293t',
 'CL94_hek293t',
 'CL95_hek293t',
 'CL96_hek293t',
 'CL97_hek293t',
 'CL98_hek293t',
 'CL99_hek293t',
 'CL100_hek293t',
 'CL101_hek293t',
 'CL102_hek293t',
 'CL103_hek293t',
 'CL104_hek293t',
 'CL105_hek293t',
 'CL106_hek293t',
 'CL107_hek293t',
 'CL108_hek293t',
 'CL109_hek293t',
 'CL110_hek293t',
 'CL111_hek293t',
 'CL112_hek293t',
 'CL113_hek293t',
 'CL114_hek293t',
 'CL115_hek293t',
 'CL116_hek293t',
 'CL117_hek293t',
 'CL118_hek293t',
 'CL119_hek293t',
 'CL120_hek293t',
 'CL121_hek293t',
 'CL122_hek293t',
 'CL123_hek293t',
 'CL124_hek293t',
 'CL125_hek293t',
 'CL126_hek293t',
 'CL127_hek293t',
 'CL128_hek293t',
 'AC1_hek293t',
 'AC2_hek293t',
 'AC3_hek293t',
 'AC4_hek293t',
 'AC5_hek293t',
 'AC6_hek293t',
 'AC7_hek293t',
 'AC8_hek293t',
 'AC9_hek293t',
 'AC10_hek293t',
 'AC11_hek293t',
 'AC12_hek293t',
 'AC13_hek293t',
 'AC14_hek293t',
 'AC15_hek293t',
 'AC16_hek293t',
 'AC17_hek293t',
 'AC18_hek293t',
 'AC19_hek293t',
 'AC20_hek293t',
 'AC21_hek293t',
 'AC22_hek293t',
 'AC23_hek293t',
 'AC24_hek293t',
 'AC25_hek293t',
 'AC26_hek293t',
 'AC27_hek293t',
 'AC28_hek293t',
 'AC29_hek293t',
 'AC30_hek293t',
 'AC31_hek293t',
 'AC32_hek293t',
 'AC33_hek293t',
 'AC34_hek293t',
 'AC35_hek293t',
 'AC36_hek293t',
 'AC37_hek293t',
 'AC38_hek293t',
 'AC39_hek293t',
 'AC40_hek293t',
 'AC41_hek293t',
 'AC42_hek293t',
 'AC43_hek293t',
 'AC44_hek293t',
 'AC45_hek293t',
 'AC46_hek293t',
 'AC47_hek293t',
 'AC48_hek293t',
 'AC49_hek293t',
 'AC50_hek293t',
 'AC51_hek293t',
 'AC52_hek293t',
 'AC53_hek293t',
 'AC54_hek293t',
 'AC55_hek293t',
 'AC56_hek293t',
 'AC57_hek293t',
 'AC58_hek293t',
 'AC59_hek293t',
 'AC60_hek293t',
 'AC61_hek293t',
 'AC62_hek293t',
 'AC63_hek293t',
 'AC64_hek293t',
 'AC65_hek293t',
 'AC66_hek293t',
 'AC67_hek293t',
 'AC68_hek293t',
 'AC69_hek293t',
 'AC70_hek293t',
 'AC71_hek293t',
 'AC72_hek293t',
 'AC73_hek293t',
 'AC74_hek293t',
 'AC75_hek293t',
 'AC76_hek293t',
 'AC77_hek293t',
 'AC78_hek293t',
 'AC79_hek293t',
 'AC80_hek293t',
 'AC81_hek293t',
 'AC82_hek293t',
 'AC83_hek293t',
 'AC84_hek293t',
 'AC85_hek293t',
 'AC86_hek293t',
 'AC87_hek293t',
 'AC88_hek293t',
 'AC89_hek293t',
 'AC90_hek293t',
 'AC91_hek293t',
 'AC92_hek293t',
 'AC93_hek293t',
 'AC94_hek293t',
 'AC95_hek293t',
 'AC96_hek293t',
 'AC97_hek293t',
 'AC98_hek293t',
 'AC99_hek293t',
 'AC100_hek293t',
 'AC101_hek293t',
 'AC102_hek293t',
 'AC103_hek293t',
 'AC104_hek293t',
 'AC105_hek293t',
 'AC106_hek293t',
 'AC107_hek293t',
 'AC108_hek293t',
 'AC109_hek293t',
 'AC110_hek293t',
 'AC111_hek293t',
 'AC112_hek293t',
 'AC113_hek293t',
 'AC114_hek293t',
 'AC115_hek293t',
 'AC116_hek293t',
 'AC117_hek293t',
 'AC118_hek293t',
 'AC119_hek293t',
 'AC120_hek293t',
 'AC121_hek293t',
 'AC122_hek293t',
 'AC123_hek293t',
 'AC124_hek293t',
 'AC125_hek293t',
 'AC126_hek293t',
 'AC127_hek293t',
 'AC128_hek293t',
 'AC129_hek293t',
 'AC130_hek293t',
 'AC131_hek293t',
 'AC132_hek293t',
 'AC133_hek293t',
 'AC134_hek293t',
 'AC135_hek293t',
 'AC136_hek293t',
 'AC137_hek293t',
 'AC138_hek293t',
 'AC139_hek293t',
 'AC140_hek293t',
 'AC141_hek293t',
 'AC142_hek293t',
 'AC143_hek293t',
 'AC144_hek293t',
 'AC145_hek293t',
 'AC146_hek293t',
 'AC147_hek293t',
 'AC148_hek293t',
 'AC149_hek293t',
 'AC150_hek293t',
 'AC151_hek293t',
 'AC152_hek293t',
 'AC153_hek293t',
 'AC154_hek293t',
 'AC155_hek293t',
 'AC156_hek293t',
 'AC157_hek293t'
]

In [59]:
patu_compound_list = [
    'CL1_patu-8988t',
 'CL2_patu-8988t',
 'CL3_patu-8988t',
 'CL4_patu-8988t',
 'CL5_patu-8988t',
 'CL6_patu-8988t',
 'CL7_patu-8988t',
 'CL8_patu-8988t',
 'CL9_patu-8988t',
 'CL10_patu-8988t',
 'CL11_patu-8988t',
 'CL12_patu-8988t',
 'CL13_patu-8988t',
 'CL14_patu-8988t',
 'CL15_patu-8988t',
 'CL16_patu-8988t',
 'CL17_patu-8988t',
 'CL18_patu-8988t',
 'CL19_patu-8988t',
 'CL20_patu-8988t',
 'CL21_patu-8988t',
 'CL22_patu-8988t',
 'CL23_patu-8988t',
 'CL24_patu-8988t',
 'CL25_patu-8988t',
 'CL26_patu-8988t',
 'CL27_patu-8988t',
 'CL28_patu-8988t',
 'CL29_patu-8988t',
 'CL30_patu-8988t',
 'CL31_patu-8988t',
 'CL32_patu-8988t',
 'CL33_patu-8988t',
 'CL34_patu-8988t',
 'CL35_patu-8988t',
 'CL36_patu-8988t',
 'CL37_patu-8988t',
 'CL38_patu-8988t',
 'CL39_patu-8988t',
 'CL40_patu-8988t',
 'CL41_patu-8988t',
 'CL42_patu-8988t',
 'CL43_patu-8988t',
 'CL44_patu-8988t',
 'CL45_patu-8988t',
 'CL46_patu-8988t',
 'CL47_patu-8988t',
 'CL48_patu-8988t',
 'CL49_patu-8988t',
 'CL50_patu-8988t',
 'CL51_patu-8988t',
 'CL52_patu-8988t',
 'CL53_patu-8988t',
 'CL54_patu-8988t',
 'CL55_patu-8988t',
 'CL56_patu-8988t',
 'CL57_patu-8988t',
 'CL58_patu-8988t',
 'CL59_patu-8988t',
 'CL60_patu-8988t',
 'CL61_patu-8988t',
 'CL62_patu-8988t',
 'CL63_patu-8988t',
 'CL64_patu-8988t',
 'CL65_patu-8988t',
 'CL66_patu-8988t',
 'CL67_patu-8988t',
 'CL68_patu-8988t',
 'CL69_patu-8988t',
 'CL70_patu-8988t',
 'CL71_patu-8988t',
 'CL72_patu-8988t',
 'CL73_patu-8988t',
 'CL74_patu-8988t',
 'CL75_patu-8988t',
 'CL76_patu-8988t',
 'CL77_patu-8988t',
 'CL78_patu-8988t',
 'CL79_patu-8988t',
 'CL80_patu-8988t',
 'CL81_patu-8988t',
 'CL82_patu-8988t',
 'CL83_patu-8988t',
 'CL84_patu-8988t',
 'CL85_patu-8988t',
 'CL86_patu-8988t',
 'CL87_patu-8988t',
 'CL88_patu-8988t',
 'CL89_patu-8988t',
 'CL90_patu-8988t',
 'CL91_patu-8988t',
 'CL92_patu-8988t',
 'CL93_patu-8988t',
 'CL94_patu-8988t',
 'CL95_patu-8988t',
 'CL96_patu-8988t',
 'CL97_patu-8988t',
 'CL98_patu-8988t',
 'CL99_patu-8988t',
 'CL100_patu-8988t',
 'CL101_patu-8988t',
 'CL102_patu-8988t',
 'CL103_patu-8988t',
 'CL104_patu-8988t',
 'CL105_patu-8988t',
 'CL106_patu-8988t',
 'CL107_patu-8988t',
 'CL108_patu-8988t',
 'CL109_patu-8988t',
 'CL110_patu-8988t',
 'CL111_patu-8988t',
 'CL112_patu-8988t',
 'CL113_patu-8988t',
 'CL114_patu-8988t',
 'CL115_patu-8988t',
 'CL116_patu-8988t',
 'CL117_patu-8988t',
 'CL118_patu-8988t',
 'CL119_patu-8988t',
 'CL120_patu-8988t',
 'CL121_patu-8988t',
 'CL122_patu-8988t',
 'CL123_patu-8988t',
 'CL124_patu-8988t',
 'CL125_patu-8988t',
 'CL126_patu-8988t',
 'CL127_patu-8988t',
 'CL128_patu-8988t',
 'AC1_patu-8988t',
 'AC2_patu-8988t',
 'AC3_patu-8988t',
 'AC4_patu-8988t',
 'AC5_patu-8988t',
 'AC6_patu-8988t',
 'AC7_patu-8988t',
 'AC8_patu-8988t',
 'AC9_patu-8988t',
 'AC10_patu-8988t',
 'AC11_patu-8988t',
 'AC12_patu-8988t',
 'AC13_patu-8988t',
 'AC14_patu-8988t',
 'AC15_patu-8988t',
 'AC16_patu-8988t',
 'AC17_patu-8988t',
 'AC18_patu-8988t',
 'AC19_patu-8988t',
 'AC20_patu-8988t',
 'AC21_patu-8988t',
 'AC22_patu-8988t',
 'AC23_patu-8988t',
 'AC24_patu-8988t',
 'AC25_patu-8988t',
 'AC26_patu-8988t',
 'AC27_patu-8988t',
 'AC28_patu-8988t',
 'AC29_patu-8988t',
 'AC30_patu-8988t',
 'AC31_patu-8988t',
 'AC32_patu-8988t',
 'AC33_patu-8988t',
 'AC34_patu-8988t',
 'AC35_patu-8988t',
 'AC36_patu-8988t',
 'AC37_patu-8988t',
 'AC38_patu-8988t',
 'AC39_patu-8988t',
 'AC40_patu-8988t',
 'AC41_patu-8988t',
 'AC42_patu-8988t',
 'AC43_patu-8988t',
 'AC44_patu-8988t',
 'AC45_patu-8988t',
 'AC46_patu-8988t',
 'AC47_patu-8988t',
 'AC48_patu-8988t',
 'AC49_patu-8988t',
 'AC50_patu-8988t',
 'AC51_patu-8988t',
 'AC52_patu-8988t',
 'AC53_patu-8988t',
 'AC54_patu-8988t',
 'AC55_patu-8988t',
 'AC56_patu-8988t',
 'AC57_patu-8988t',
 'AC58_patu-8988t',
 'AC59_patu-8988t',
 'AC60_patu-8988t',
 'AC61_patu-8988t',
 'AC62_patu-8988t',
 'AC63_patu-8988t',
 'AC64_patu-8988t',
 'AC65_patu-8988t',
 'AC66_patu-8988t',
 'AC67_patu-8988t',
 'AC68_patu-8988t',
 'AC69_patu-8988t',
 'AC70_patu-8988t',
 'AC71_patu-8988t',
 'AC72_patu-8988t',
 'AC73_patu-8988t',
 'AC74_patu-8988t',
 'AC75_patu-8988t',
 'AC76_patu-8988t',
 'AC77_patu-8988t',
 'AC78_patu-8988t',
 'AC79_patu-8988t',
 'AC80_patu-8988t',
 'AC81_patu-8988t',
 'AC82_patu-8988t',
 'AC83_patu-8988t',
 'AC84_patu-8988t',
 'AC85_patu-8988t',
 'AC86_patu-8988t',
 'AC87_patu-8988t',
 'AC88_patu-8988t',
 'AC89_patu-8988t',
 'AC90_patu-8988t',
 'AC91_patu-8988t',
 'AC92_patu-8988t',
 'AC93_patu-8988t',
 'AC94_patu-8988t',
 'AC95_patu-8988t',
 'AC96_patu-8988t',
 'AC97_patu-8988t',
 'AC98_patu-8988t',
 'AC99_patu-8988t',
 'AC100_patu-8988t',
 'AC101_patu-8988t',
 'AC102_patu-8988t',
 'AC103_patu-8988t',
 'AC104_patu-8988t',
 'AC105_patu-8988t',
 'AC106_patu-8988t',
 'AC107_patu-8988t',
 'AC108_patu-8988t',
 'AC109_patu-8988t',
 'AC110_patu-8988t',
 'AC111_patu-8988t',
 'AC112_patu-8988t',
 'AC113_patu-8988t',
 'AC114_patu-8988t',
 'AC115_patu-8988t',
 'AC116_patu-8988t',
 'AC117_patu-8988t',
 'AC118_patu-8988t',
 'AC119_patu-8988t',
 'AC120_patu-8988t',
 'AC121_patu-8988t',
 'AC122_patu-8988t',
 'AC123_patu-8988t',
 'AC124_patu-8988t',
 'AC125_patu-8988t',
 'AC126_patu-8988t',
 'AC127_patu-8988t',
 'AC128_patu-8988t',
 'AC129_patu-8988t',
 'AC130_patu-8988t',
 'AC131_patu-8988t',
 'AC132_patu-8988t',
 'AC133_patu-8988t',
 'AC134_patu-8988t',
 'AC135_patu-8988t',
 'AC136_patu-8988t',
 'AC137_patu-8988t',
 'AC138_patu-8988t',
 'AC139_patu-8988t',
 'AC140_patu-8988t',
 'AC141_patu-8988t',
 'AC142_patu-8988t',
 'AC143_patu-8988t',
 'AC144_patu-8988t',
 'AC145_patu-8988t',
 'AC146_patu-8988t',
 'AC147_patu-8988t',
 'AC148_patu-8988t',
 'AC149_patu-8988t',
 'AC150_patu-8988t',
 'AC151_patu-8988t',
 'AC152_patu-8988t',
 'AC153_patu-8988t',
 'AC154_patu-8988t',
 'AC155_patu-8988t',
 'AC156_patu-8988t',
 'AC157_patu-8988t'
]

In [60]:
def get_ligandability(df, compound_list, cutoff, dataset_cutoff):
    ligandable = []
    
    df = df.replace(np.nan, 0)
    
    for index, row in df.iterrows():
        
        count = 0

        for i in range(len(compound_list)):
            current_ratio = row[compound_list[i]]
            

            if float(current_ratio) >= cutoff:
                count += 1
    
        if count >= dataset_cutoff:
            ligandable.append('yes')
        else:
            ligandable.append(None)
            
    df['ligandable'] = ligandable
    
    df_identified = df[df['ligandable'].isna() == True]
    df_ligandable = df[df['ligandable'].isna() == False]
    return df_identified, df_ligandable

In [61]:
df_hct_not_ligandable, df_hct_ligandable = get_ligandability(new_hct_df_ligandability, hct_compound_list, 4, 1)

In [62]:
df_hek_not_ligandable, df_hek_ligandable = get_ligandability(new_hek_df_ligandability, hek_compound_list, 4, 1)

In [63]:
df_patu_not_ligandable, df_patu_ligandable = get_ligandability(new_patu_df_ligandability, patu_compound_list, 4, 1)

In [64]:
new_df_hct_ligandability = pd.concat([df_hct_not_ligandable, df_hct_ligandable])

In [65]:
new_df_hek_ligandability = pd.concat([df_hek_not_ligandable, df_hek_ligandable])

In [66]:
new_df_patu_ligandability = pd.concat([df_patu_not_ligandable, df_patu_ligandable])

In [67]:
new_df_hct_ligandability.shape, new_df_hek_ligandability.shape

((23363, 290), (20247, 290))

In [68]:
new_df_hct_ligandability.head()

Unnamed: 0,CL1_hct116,CL2_hct116,CL3_hct116,CL4_hct116,CL5_hct116,CL6_hct116,CL7_hct116,CL8_hct116,CL9_hct116,CL10_hct116,...,AC153_hct116,AC154_hct116,AC155_hct116,AC156_hct116,AC157_hct116,proteinid,cysteineid,chloroacetamide,acrylamide,ligandable
1742,0.995696,0.848423,0.848423,0.953304,0.851343,0.90292,0.914327,1.00196,1.01131,1.0162,...,1.17627,1.21357,1.27608,1.2715,1.2635,Q5VT66-2,Q5VT66-2_C290,0,0,
1743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.5261,1.23996,1.13581,1.21896,1.34317,Q5VT66-2,Q5VT66-2_C317,0,0,
1744,1.05323,1.00722,1.00722,0.970048,0.907063,0.969666,1.02997,1.39239,0.97733,1.15318,...,0.67126,0.861047,0.849078,0.87272,0.80942,Q969Z3,Q969Z3_C272,0,0,
1745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.07007,1.18896,1.40296,1.00444,0.889291,Q969Z3,Q969Z3_C299,0,0,
1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Q9NX47,Q9NX47_C46,0,0,


In [71]:
new_df_hek_ligandability['proteinid'] = new_df_hek_ligandability['proteinid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_hek_ligandability['cysteineid'] = new_df_hek_ligandability['cysteineid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_hek_ligandability.shape

(20247, 290)

In [73]:
new_df_hct_ligandability['proteinid'] = new_df_hct_ligandability['proteinid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_hct_ligandability['cysteineid'] = new_df_hct_ligandability['cysteineid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_hct_ligandability.shape


(23363, 290)

In [74]:
new_df_patu_ligandability['proteinid'] = new_df_patu_ligandability['proteinid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_patu_ligandability['cysteineid'] = new_df_patu_ligandability['cysteineid'].map(lambda x: str(x).replace(' ', '').strip())
new_df_patu_ligandability.shape


(20269, 290)

In [75]:
new_df_hek_ligandability = new_df_hek_ligandability.drop_duplicates()
new_df_hek_ligandability.shape

(20247, 290)

In [76]:
new_df_hct_ligandability = new_df_hct_ligandability.drop_duplicates()
new_df_hct_ligandability.shape

(23363, 290)

In [77]:
new_df_patu_ligandability = new_df_patu_ligandability.drop_duplicates()
new_df_patu_ligandability.shape

(20269, 290)

In [78]:
os.chdir(cd)
os.chdir('results')
new_df_hct_ligandability.to_csv('kuljanin_hct_ligandable_dataset.csv', index = False)
new_df_hek_ligandability.to_csv('kuljanin_hek_ligandable_dataset.csv', index = False)
new_df_patu_ligandability.to_csv('kuljanin_patu_ligandable_dataset.csv', index = False)

# Merge

## cysteineid, chloroacetamide, acrylamide

In [79]:
scout_merge = df_scout[['cysteineid', 'Comp ratio (KBO2) hct116', 'Comp ratio (KBO3) hct116', 'Comp ratio (KBO5) hct116']]

In [80]:
hct_merge = new_df_hct_ligandability.copy()
hct_merge = hct_merge.drop(columns = ['chloroacetamide', 'acrylamide', 'ligandable', 'proteinid'])

In [81]:
hek_merge = new_df_hek_ligandability.copy()
hek_merge = hek_merge.drop(columns = ['chloroacetamide', 'acrylamide', 'ligandable', 'proteinid'])

In [82]:
patu_merge = new_df_patu_ligandability.copy()
patu_merge = patu_merge.drop(columns = ['chloroacetamide', 'acrylamide', 'ligandable', 'proteinid'])

In [83]:
merged = pd.merge(hct_merge, scout_merge, on = 'cysteineid', how = 'outer')

In [84]:
merged = pd.merge(merged, hek_merge, on = 'cysteineid', how = 'outer')

In [85]:
merged = pd.merge(merged, patu_merge, on = 'cysteineid', how = 'outer')

In [65]:
# merged = merged.drop(columns = ['proteinid_x', 'proteinid_y'])

In [93]:
def get_colnames(df):
    cols = df.columns.to_list()
    
    cl_cols = []
    ac_cols = []
    
    for i in range(len(cols)):
        if "CL" in cols[i]:
            cl_cols.append(cols[i])
            
        if "AC" in cols[i]:
            ac_cols.append(cols[i])
            
    return cl_cols, ac_cols

In [94]:
cl_cols, ac_cols = get_colnames(merged)

In [95]:
slcabpp_chloro_cols = cl_cols + ['Comp ratio (KBO2) hct116', 'Comp ratio (KBO3) hct116']
slcabpp_acryl_cols = ac_cols + ['Comp ratio (KBO5) hct116']

In [98]:
chloro_labels = get_ligandable_category(merged, slcabpp_chloro_cols, False, 4)
acryl_labels = get_ligandable_category(merged, slcabpp_acryl_cols, False, 4)

In [99]:
new_merged = merged.copy()

In [100]:
new_merged['chloroacetamide'] = chloro_labels
new_merged['acrylamide'] = acryl_labels

In [102]:
new_merged['ligandable'] = np.where((new_merged['chloroacetamide'] == 'yes') | (new_merged['acrylamide'] == 'yes'), 'yes', None)

In [108]:
new_merged.shape, len(new_merged['cysteineid'].unique())

((30719, 862), 30719)

In [104]:
new_merged['ligandable'].value_counts()

yes    5457
Name: ligandable, dtype: int64

In [106]:
new_merged.columns.to_list()

['CL1_hct116',
 'CL2_hct116',
 'CL3_hct116',
 'CL4_hct116',
 'CL5_hct116',
 'CL6_hct116',
 'CL7_hct116',
 'CL8_hct116',
 'CL9_hct116',
 'CL10_hct116',
 'CL11_hct116',
 'CL12_hct116',
 'CL13_hct116',
 'CL14_hct116',
 'CL15_hct116',
 'CL16_hct116',
 'CL17_hct116',
 'CL18_hct116',
 'CL19_hct116',
 'CL20_hct116',
 'CL21_hct116',
 'CL22_hct116',
 'CL23_hct116',
 'CL24_hct116',
 'CL25_hct116',
 'CL26_hct116',
 'CL27_hct116',
 'CL28_hct116',
 'CL29_hct116',
 'CL30_hct116',
 'CL31_hct116',
 'CL32_hct116',
 'CL33_hct116',
 'CL34_hct116',
 'CL35_hct116',
 'CL36_hct116',
 'CL37_hct116',
 'CL38_hct116',
 'CL39_hct116',
 'CL40_hct116',
 'CL41_hct116',
 'CL42_hct116',
 'CL43_hct116',
 'CL44_hct116',
 'CL45_hct116',
 'CL46_hct116',
 'CL47_hct116',
 'CL48_hct116',
 'CL49_hct116',
 'CL50_hct116',
 'CL51_hct116',
 'CL52_hct116',
 'CL53_hct116',
 'CL54_hct116',
 'CL55_hct116',
 'CL56_hct116',
 'CL57_hct116',
 'CL58_hct116',
 'CL59_hct116',
 'CL60_hct116',
 'CL61_hct116',
 'CL62_hct116',
 'CL63_hct116',
 

In [109]:
new_merged['proteinid'] = new_merged['cysteineid'].map(lambda x: str(x).split('_C')[0].strip())

In [111]:
len(new_merged['cysteineid'].unique()), new_merged.shape

(30719, (30719, 863))

In [112]:
new_merged.to_csv('kuljanin_ligandable_dataset.csv', index = False)

# Read DIA Data

Applying the same criteria of defining “ligandable cysteines”
(e.g., individual cysteine with >75% competition in IA-alkyne
probe labeling by two or more ligand fragments) 

In [221]:
def get_uniprot_dict(directory, file):
    os.chdir(directory)
    
    uniprot_dict = {}
    for record in SeqIO.parse(file, "fasta"):
        protein = str(record.id).split('|')[1]
        uniprot_dict[protein] = record.seq
    
    os.chdir(cd)
    return uniprot_dict

In [223]:
# uniprot_dict = get_uniprot_dict('/Users/lisamarieboatner/Dropbox/Backus/master/uniprot/data/220505', '2201_uniprot.fasta')

In [224]:
uniprot_dict = get_uniprot_dict('C:\\Users\\Onee-sama\\Dropbox\\Backus\\master\\uniprot\\data\\220724', '2207_uniprot.fasta')

In [225]:
def get_residue_aa(identifier, peptide, uniprot_dict, uniprot_aa):
    if (peptide in str(uniprot_dict[identifier])):
        correct_aa = str(uniprot_dict[identifier]).index(peptide) + (int(uniprot_aa))
        return int(correct_aa)
    else:
        return '--'

In [465]:
def get_cysetine_ids(df, uniprot_dict):
    cysteines = []
    missing_protein_ids = []
    
    for index, row in df.iterrows():
        protein = row['Proteins']
        modified_peptide = row['Peptides']
        modified_aa = modified_peptide.index('*')
        unmodified_peptide = modified_peptide.replace('*', '')

        if (protein not in uniprot_dict.keys()):
            correct_aa = '--'
            missing_protein_ids.append(protein)
        else:
            correct_aa = get_residue_aa(protein, unmodified_peptide, uniprot_dict, modified_aa)
    
        cysteines.append(correct_aa)
        
    return cysteines, missing_protein_ids

## Read ligandability data

In [614]:
os.chdir(cd)
os.chdir('DIA')

## Read All Cysteines

In [754]:
df_dia_ligandability = pd.read_excel('ja1c11053_si_003.xlsx', sheet_name='all cysteines')

In [755]:
df_dia_ligandability = df_dia_ligandability[df_dia_ligandability['Proteins'].str.count(',') == 0]

In [756]:
all_stars = count_star(df_dia_ligandability)
df_dia_ligandability['modified_cysteines'] = all_stars

In [757]:
df_dia_ligandability['modified_cysteines'].value_counts()

1    7631
2     361
3       8
Name: modified_cysteines, dtype: int64

In [758]:
# df_dia_ligandability = df_dia_ligandability[df_dia_ligandability['Peptides'].str.count('*') == 1]
df_dia_ligandability = df_dia_ligandability[df_dia_ligandability['modified_cysteines'] == 1]

In [759]:
dia_ligandability_cysteine_ids, dia_ligandability_missing_protein_ids = get_cysetine_ids(df_dia_ligandability, uniprot_dict)

In [760]:
len(dia_ligandability_missing_protein_ids)

45

There are 113 proteins identified in DIA ligandability experiment that were not found in 2207 fasta. 48 were not found in 2207 fasta out of the peptides mapped to single protein entries. 48 were not found in 2207 fasta out of the peptides mapped to single protein entries and single modified peptides.

In [761]:
df_dia_ligandability['Cysteine'] = dia_ligandability_cysteine_ids

In [762]:
df_dia_ligandability = df_dia_ligandability[df_dia_ligandability['Cysteine'] != '--']

In [763]:
df_dia_ligandability['proteinid'] = df_dia_ligandability['Proteins']
df_dia_ligandability['cysteineid'] = df_dia_ligandability['proteinid'] + '_C' + df_dia_ligandability['Cysteine'].astype(str)

## cysteineid, chloroacetamide, acrylamide

In [765]:
dia_cols = df_dia_ligandability.columns.to_list()[2:26]

In [767]:
for i in range(len(dia_cols)):
    df_dia_ligandability = df_dia_ligandability.rename(columns = {dia_cols[i]: dia_cols[i] + '_' + 'ramos'})

In [885]:
dia_chloro_cols = [
 'F2_ramos',
 'F3_ramos',
 'F4_ramos',
 'F7_ramos',
 'F8_ramos',
 'F9_ramos',
 'F10_ramos',
 'F11_ramos',
 'F12_ramos',
 'F13_ramos',
 'F20_ramos',
 'F21_ramos',
 'F27_ramos',
 'F28_ramos',
 'F30_ramos',
 'F32_ramos',
 'F33_ramos',
 'F52_ramos'
]

In [886]:
dia_acryl_cols = [
 'F5_ramos',
 'F14_ramos',
 'F23_ramos',
 'F31_ramos',
 'F38_ramos',
 'F56_ramos'
]

In [887]:
chloro_labels = get_ligandable_category(df_dia_ligandability, dia_chloro_cols, True, 4)

In [888]:
acryl_labels = get_ligandable_category(df_dia_ligandability, dia_acryl_cols, True, 4)

In [889]:
new_dia_ligandability_df = df_dia_ligandability.copy()
new_dia_ligandability_df = new_dia_ligandability_df.drop(columns = [
    'Peptides',
                                                                     'Proteins',
                                                                     'Unnamed: 26',
                                                                     'Unnamed: 27',
                                                                     'Unnamed: 28',
                                                                     'Unnamed: 29',
                                                                     'Unnamed: 30',
                                                                     'Unnamed: 31',
                                                                     'Unnamed: 32',
                                                                     'Unnamed: 33',
                                                                     'Unnamed: 34',
                                                                     'Proteins',
                                                                     'Cysteine'
                                                                    ])

In [890]:
new_dia_ligandability_df['chloroacetamide'] = chloro_labels
new_dia_ligandability_df['acrylamide'] = acryl_labels

In [891]:
new_df_dia_cols = new_dia_ligandability_df.columns.to_list()[:24]

In [892]:
new_dia_ligandability_df.shape

(7584, 29)

In [893]:
def get_dia_ligandable_category(df, cols, cutoff):
    lig_count = 0
    lig = []
    
    df = df.replace('--', 0)
    
    for index, row in df.iterrows():
        current_count = 0
        current_ratios = []
        current_cols = []
        for j in range(len(cols)):
            if row[cols[j]] != None:
                current_ratio = float(row[cols[j]])
                if current_ratio > cutoff:
                    current_count += 1
                    
        if current_count >=2:
            lig.append('yes')
            lig_count += 1
        else:
            lig.append(None)
            
    print(lig_count)
    return lig
            

In [894]:
ligandable_dia = get_dia_ligandable_category(new_dia_ligandability_df, new_df_dia_cols, 4)

523


In [895]:
new_dia_ligandability_df['ligandable'] = ligandable_dia

In [897]:
new_dia_ligandability_df['cysteineid'] = new_dia_ligandability_df['cysteineid'].str.strip()

In [898]:
new_dia_ligandability_df.shape

(7584, 30)

In [899]:
dia_dup_ids = get_dup_ids(new_dia_ligandability_df)
len(dia_dup_ids)

3

In [900]:
new_dia_ligandability_df['duplicate'] = np.where(new_dia_ligandability_df['cysteineid'].isin(dia_dup_ids), 1, 0)
dia_no_dups = new_dia_ligandability_df[new_dia_ligandability_df['duplicate'] == 0]

In [901]:
dia_no_dups.shape, len(dia_no_dups['cysteineid'].unique())

((7578, 31), 7578)

In [902]:
dia_no_dups = dia_no_dups.replace('--', None)

In [906]:
os.chdir('../')
os.chdir('results')
dia_no_dups.to_csv('yang_ligandable_dataset.csv', index = False)