Reformat published datasets for CysteineomeDB.
Categories: Dataset Found, Reactive, Ligandable, Identified but not Ligandable, and Conditional.
Note: Annotations from the authors were used to determine "ligandability."
Note: The Yang DIA paper did not provide cysteine residue numbers; peptide sequences were manually mapped to the UniProt FASTA (2201).

# Setup Environment

In [None]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import matplotlib
import numpy as np
import math
from matplotlib.pyplot import figure
import Bio
from Bio import SeqIO
from functools import reduce

In [None]:
# Capture the current working directory; the bare name on the last line makes
# the notebook echo it as the cell output.
cd = os.getcwd()
cd

In [None]:
# Output directory: a 'results' folder under the current working directory.
path_data = os.path.join(os.getcwd(), 'results')
# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of testing os.path.exists() first.
os.makedirs(path_data, exist_ok=True)

In [None]:
def get_new_df(dfs, dataset, col1, col2, cys):
    """Stack raw dataset tables and reduce them to a de-duplicated ID table.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Tables that are concatenated row-wise with ``pd.concat``.
    dataset : str
        Dataset key. 'kuljanin_gygi', 'weerapana_cravatt', 'backus_cravatt',
        'yan_backus' and 'yang_wang' get dataset-specific identifier parsing;
        for any other value ``col1``/``col2`` are simply renamed to
        'proteinid'/'resid'.
    col1 : str
        Column with the protein identifier. It is also the column screened
        for the substring "contaminant".
    col2 : str
        Column with the residue number/label. Not read for 'backus_cravatt'
        and 'yan_backus', which parse the 'Identifier' / 'identifier' column.
    cys : bool
        Truthy for a cysteine-level table, falsy for a protein-level table.

    Returns
    -------
    pandas.DataFrame
        Unique rows with columns 'level', ('cysteineid',) 'proteinid',
        'dataset', 'identified' (always 1) and 'identified_datasets'.
    """
    new_df = pd.concat(dfs)

    # Keep `== False` rather than `~`: rows whose col1 is missing yield NaN
    # from str.contains and are therefore dropped as well, which `~` would
    # not reproduce.
    keep = new_df[col1].str.contains("contaminant") == False  # noqa: E712
    # Explicit copy so that the column assignments below write to an
    # independent frame instead of a filtered slice (SettingWithCopyWarning).
    new_df = new_df[keep].copy()

    if dataset == 'kuljanin_gygi':
        # Identifier looks like 'xx|ACCESSION|yy'; keep the middle field.
        new_df['proteinid'] = new_df[col1].map(lambda x: str(x).split('|')[1])
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'weerapana_cravatt':
        new_df['proteinid'] = new_df[col1].map(str)
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'backus_cravatt':
        # 'Identifier' is '<protein>_..._<residue>'; the residue label is
        # used as-is (no 'C' prefix is added here).
        new_df['proteinid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[0])
        new_df['resid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[-1])
    elif dataset == 'yan_backus':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df['identifier'].map(lambda x: 'C' + str(x).split('_')[-1])
    elif dataset == 'yang_wang':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    else:
        new_df = new_df.rename(columns={col1: 'proteinid', col2: 'resid'})

    new_df['cysteineid'] = new_df['proteinid'] + '_' + new_df['resid'].astype(str)
    new_df['dataset'] = dataset
    new_df['identified'] = 1
    new_df['identified_datasets'] = dataset

    if cys:
        new_df['level'] = 'cysteine'
        columns = ['level', 'cysteineid', 'proteinid', 'dataset',
                   'identified', 'identified_datasets']
    else:
        new_df['level'] = 'protein'
        columns = ['level', 'proteinid', 'dataset',
                   'identified', 'identified_datasets']

    return new_df[columns].drop_duplicates()

In [None]:
def get_cys_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    """Flag the cysteines of ``master`` that are present in ``df``.

    Parameters
    ----------
    master : pandas.DataFrame
        Table with a 'cysteineid' column. NOTE: it is modified in place, a
        0/1 column named ``category`` is written to it.
    df : pandas.DataFrame
        Dataset table to look identifiers up in. NOTE: for the
        'weerapana_cravatt', 'backus_cravatt' and 'yang_wang' branches helper
        columns are added to the caller's frame in place.
    dataset : str
        Dataset key selecting how protein/residue identifiers are parsed.
    category : str
        Name of the flag column to create (e.g. a category label).
    category_datasets : object
        Value stored in ``<category>_datasets`` for every flagged row.
    col1, col2 : str
        Protein-identifier and residue columns of ``df`` ('backus_cravatt'
        reads the 'Identifier' column instead).

    Returns
    -------
    pandas.DataFrame
        Flagged rows first (with ``<category>_datasets`` filled in), followed
        by the unflagged rows.
    """
    if dataset == 'weerapana_cravatt':
        df['proteinid'] = df[col1].map(str)
        df['resid'] = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'kuljanin_gygi':
        df = df[[col1, col2]].drop_duplicates()
        # Identifier looks like 'xx|ACCESSION|yy'; keep the middle field.
        df['proteinid'] = df[col1].map(lambda x: str(x).split('|')[1])
        df['resid'] = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'backus_cravatt':
        df['proteinid'] = df['Identifier'].map(lambda x: str(x).split('_')[0])
        df['resid'] = df['Identifier'].map(lambda x: str(x).split('_')[-1])
        df = df[['proteinid', 'resid']].drop_duplicates()
    elif dataset == 'yang_wang':
        df['proteinid'] = df[col1]
        df['resid'] = df[col2].map(lambda x: 'C' + str(x))
    else:
        df = df[[col1, col2]].drop_duplicates()
        df = df.rename(columns={col1: 'proteinid', col2: 'resid'})

    df['cysteineid'] = df['proteinid'] + '_' + df['resid'].astype(str)
    df_ids = list(df['cysteineid'].unique())

    master[category] = np.where(master['cysteineid'].isin(df_ids), 1, 0)
    flagged = master[category] == 1

    # Copy before adding a column: assigning onto the boolean-mask slice of
    # `master` triggers SettingWithCopyWarning.
    category_df = master[flagged].copy()
    # One entry per row (instead of a scalar assignment) so that a
    # list-valued `category_datasets` is stored as a cell value.
    category_df[category + '_datasets'] = [category_datasets] * len(category_df)
    non_category_df = master[~flagged]

    return pd.concat([category_df, non_category_df])

In [None]:
def get_pro_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    """Flag the proteins of ``master`` that are present in ``df``.

    Parameters
    ----------
    master : pandas.DataFrame
        Table with a 'proteinid' column. NOTE: it is modified in place, a
        0/1 column named ``category`` is written to it.
    df : pandas.DataFrame
        Dataset table to look identifiers up in. NOTE: for the
        'weerapana_cravatt', 'backus_cravatt' and 'yang_wang' branches a
        'proteinid' column is added to the caller's frame in place.
    dataset : str
        Dataset key selecting how the protein identifier is parsed.
    category : str
        Name of the flag column to create.
    category_datasets : object
        Value stored in ``<category>_datasets`` for every flagged row.
    col1 : str
        Protein-identifier column of ``df`` ('backus_cravatt' reads the
        'Identifier' column instead).
    col2 : str
        Only read by the 'kuljanin_gygi' branch, where it must exist in
        ``df`` because both columns are selected before de-duplication.

    Returns
    -------
    pandas.DataFrame
        Flagged rows first (with ``<category>_datasets`` filled in), followed
        by the unflagged rows.
    """
    if dataset == 'weerapana_cravatt':
        df['proteinid'] = df[col1].map(str)
    elif dataset == 'kuljanin_gygi':
        df = df[[col1, col2]].drop_duplicates()
        # Identifier looks like 'xx|ACCESSION|yy'; keep the middle field.
        df['proteinid'] = df[col1].map(lambda x: str(x).split('|')[1])
    elif dataset == 'backus_cravatt':
        df['proteinid'] = df['Identifier'].map(lambda x: str(x).split('_')[0])
        df = df[['proteinid']].drop_duplicates()
    elif dataset == 'yang_wang':
        df['proteinid'] = df[col1]
    else:
        df = df[[col1]].drop_duplicates()
        df = df.rename(columns={col1: 'proteinid'})

    df_ids = list(df['proteinid'].unique())

    master[category] = np.where(master['proteinid'].isin(df_ids), 1, 0)
    flagged = master[category] == 1

    # Copy before adding a column: assigning onto the boolean-mask slice of
    # `master` triggers SettingWithCopyWarning.
    category_df = master[flagged].copy()
    # One entry per row (instead of a scalar assignment) so that a
    # list-valued `category_datasets` is stored as a cell value.
    category_df[category + '_datasets'] = [category_datasets] * len(category_df)
    non_category_df = master[~flagged]

    return pd.concat([category_df, non_category_df])

In [None]:
def get_ligandability(df, compound_list, cutoff, min_hits=None):
    """Split ``df`` into identified-only and ligandable rows.

    A compound column counts as a hit for a row when its value is a real
    number (Python or numpy int/float, booleans excluded) that is
    ``>= cutoff``. Non-numeric cells (e.g. placeholder strings) are ignored;
    NaN never compares ``>=`` and therefore never counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per entry of ``compound_list``. NOTE: a
        'ligandable' column ('yes' / None) is written to it in place.
    compound_list : list of str
        Column names holding the per-compound ratios.
    cutoff : int or float
        Minimum ratio for a compound to count as a hit.
    min_hits : int or float, optional
        Minimum number of hits for a row to be labelled ligandable. Defaults
        to ``cutoff``, which reproduces the historical behaviour of comparing
        the hit count against the ratio cutoff itself.
        NOTE(review): confirm that coupling is intended; pass ``min_hits``
        explicitly to decouple the two thresholds.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_identified, df_ligandable)``: rows without and with the
        'yes' label, respectively.
    """
    if min_hits is None:
        min_hits = cutoff

    def _is_hit(value):
        # Booleans are ints in Python but are not ratios; keep them excluded.
        if isinstance(value, (bool, np.bool_)):
            return False
        # isinstance (not `type(...) ==`) so numpy scalars are accepted too:
        # rows of an all-numeric frame hold np.float64/np.int64 values, which
        # an exact type comparison against int/float rejects.
        if not isinstance(value, (int, float, np.integer, np.floating)):
            return False
        return float(value) >= cutoff

    ligandable = []
    for _, row in df.iterrows():
        count = sum(1 for compound in compound_list if _is_hit(row[compound]))
        ligandable.append('yes' if count >= min_hits else None)

    df['ligandable'] = ligandable

    unlabelled = df['ligandable'].isna()
    df_identified = df[unlabelled]
    df_ligandable = df[~unlabelled]
    return df_identified, df_ligandable

In [None]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    """Return one label per row: 'yes' if any probe column is >= ``cutoff``.

    When ``tcell`` is True, every '--' entry of ``df`` is replaced by 0
    before the comparison. Rows without a qualifying column get ``None``.
    """
    frame = df.replace('--', 0) if tcell == True else df

    labels = []
    for _, record in frame.iterrows():
        # Every probe column is evaluated (no short-circuit), so a value that
        # cannot be compared raises regardless of earlier hits.
        outcomes = [bool(record[column] >= cutoff) for column in probe_cols]
        labels.append('yes' if any(outcomes) else None)

    return labels

In [None]:
def list_to_string(lst, symbol):
    """Join the ``str()`` form of every element of ``lst`` with ``symbol``."""
    pieces = map(str, lst)
    return symbol.join(pieces)

# Extract Experimental Data

# Read Proteome-wide Discovery Data

# cysteineid, chloroacetamide, bromoacetamide, acrylamide, acetamide

In [None]:
# Discovery-dataset ratio columns kept apart from the chloro-/acryl-/bromo-
# lists (presumably the remaining compound class; confirm against the data).
discovery_other_cols = [
    "23_500uM_invitro_231",
    "23_500uM_invitro_ramos",
    "42_500uM_invitro_ramos",
]

In [None]:
discovery_chloro_cols = [
 '2_500uM_invitro_231',
 '2_500uM_invitro_ramos',
 '3_500uM_invitro_231',
 '3_500uM_invitro_ramos',
 '4_250uM_invitro_231',
 '4_250uM_invitro_ramos',
 '7_500uM_invitro_231',
 '7_500uM_invitro_ramos',
 '8_500uM_invitro_231',
 '8_500uM_invitro_ramos',
 '9_500uM_invitro_231',
 '9_500uM_invitro_ramos',
 '10_500uM_invitro_231',
 '10_500uM_invitro_ramos',
 '11_500uM_invitro_231',
 '11_500uM_invitro_ramos',
 '12_500uM_invitro_231',
 '12_500uM_invitro_ramos',
 '13_500uM_invitro_231',
 '13_500uM_invitro_ramos',
 '27_500uM_invitro_231',
 '20_500uM_invitro_231',
 '20_500uM_invitro_ramos',
 '21_500uM_invitro_231',
 '21_500uM_invitro_ramos',
 '22_500uM_invitro_231',
 '22_500uM_invitro_ramos',
 '25_500uM_invitro_231',
 '27_500uM_invitro_231.1',
 '27_500uM_invitro_ramos',
 '28_500uM_invitro_231',
 '28_500uM_invitro_ramos',
 '29_500uM_invitro_231',
 '29_500uM_invitro_ramos',
 '30_500uM_invitro_231',
 '30_500uM_invitro_ramos',
 '32_500uM_invitro_231',
 '32_500uM_invitro_ramos',
 '33_500uM_invitro_231',
 '33_500uM_invitro_ramos',
 '34_500uM_invitro_231',
 '34_500uM_invitro_ramos',
 '35_500uM_invitro_231',
 '35_500uM_invitro_ramos',
 '36_500uM_invitro_231',
 '39_500uM_invitro_231',
 '43_500uM_invitro_231',
 '43_500uM_invitro_ramos',
 '44_500uM_invitro_231',
 '45_500uM_invitro_231',
 '49_500uM_invitro_231',
 '50_500uM_invitro_231',
 '51_500uM_invitro_231',
 '51_500uM_invitro_ramos',
 '52_500uM_invitro_231',
 '52_500uM_invitro_ramos',
 '54_500uM_invitro_231',
 '55_500uM_invitro_231'
]

In [None]:
# Discovery-dataset ratio columns grouped under "acryl". Order is preserved
# exactly as listed originally.
discovery_acryl_cols = [
    "5_500uM_invitro_231",
    "5_500uM_invitro_ramos",
    "6_500uM_invitro_231",
    "14_500uM_invitro_231",
    "14_500uM_invitro_ramos",
    "15_500uM_invitro_231",
    "15_500uM_invitro_ramos",
    "26_500uM_invitro_ramos",
    "31_500uM_invitro_231",
    "31_500uM_invitro_ramos",
    "37_500uM_invitro_ramos",
    "38_500uM_invitro_231",
    "38_500uM_invitro_ramos",
    "40_500uM_invitro_231",
    "40_500uM_invitro_ramos",
    "41_500uM_invitro_231",
    "41_500uM_invitro_ramos",
    "46_500uM_invitro_231",
    "47_500uM_invitro_231",
    "48_500uM_invitro_231",
    "53_500uM_invitro_231",
    "53_500uM_invitro_ramos",
    "56_500uM_invitro_231",
    "56_500uM_invitro_ramos",
]

In [None]:
# Discovery-dataset ratio columns grouped under "bromo" (a single column).
discovery_bromo_cols = ["24_500uM_invitro_ramos"]

In [None]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    """Label every row of ``df`` by whether any probe column reaches ``cutoff``.

    Returns a list with one entry per row: 'yes' when at least one column of
    ``probe_cols`` holds a value >= ``cutoff``, otherwise ``None``. With
    ``tcell`` set to True, '--' entries are first replaced by 0.
    """
    if tcell == True:
        df = df.replace('--', 0)

    def _label(row):
        # Count over all probe columns; every column is compared.
        hits = 0
        for column in probe_cols:
            if row[column] >= cutoff:
                hits += 1
        return 'yes' if hits > 0 else None

    return [_label(row) for _, row in df.iterrows()]

# Read Suzuki Data

## cysteineid, chloroacetamide, acrylamide, acetamide

In [None]:
# Suzuki dataset: 'Mean Ratio' columns (hek293t), split into three groups.
suzuki_other_cols = [
    "19 Mean Ratio hek293t",
    "22 Mean Ratio hek293t",
]
suzuki_chloro_cols = [
    "16 Mean Ratio hek293t",
    "18 Mean Ratio hek293t",
    "21 Mean Ratio hek293t",
    "KB3 Mean Ratio hek293t",
]
suzuki_acryl_cols = [
    "15 Mean Ratio hek293t",
    "17 Mean Ratio hek293t",
    "20 Mean Ratio hek293t",
    "KB14 Mean Ratio hek293t",
]

# Read T-cell Data

## cysteineid, chloroacetamide, acrylamide, dmf

In [None]:
# T-cell dataset: "active" columns of the chloro group.
tcell_chloro_active_cols = ["EV-3_tcell", "EV-93_tcell"]

In [None]:
# T-cell dataset: "scout" (KB02 control) columns of the chloro group.
tcell_chloro_scout_cols = [
    "control_1_KB02_TMT_tcell",
    "control_3_KB02_TMT_tcell",
    "control_5_KB02_isoTOP_tcell",
    "control_7_KB02_isoTOP_tcell",
]

In [None]:
# All T-cell chloro columns: active columns first, then the scout columns.
tcell_chloro_cols = [*tcell_chloro_active_cols, *tcell_chloro_scout_cols]

In [None]:
# T-cell dataset: "active" columns of the acryl group.
# NOTE(review): the last label starts with lower-case 'ev-' unlike its
# siblings; kept verbatim, presumably it matches the source header. Confirm.
tcell_acryl_active_cols = [
    "BPK-21_tcell",
    "BPK-25_tcell",
    "EV-98_5uM_tcell",
    "EV-98_20uM_tcell",
    "EV-99_5uM_tcell",
    "EV-99_20uM_tcell",
    "EV-96_5uM_tcell",
    "EV-96_20uM_tcell",
    "EV-97_5uM_tcell",
    "ev-97_20uM_tcell",
]

In [None]:
# T-cell dataset: "scout" (KB05 control) columns of the acryl group.
tcell_acryl_scout_cols = [
    "control_2_KB05_TMT_tcell",
    "control_4_KB05_TMT_tcell",
    "control_6_KB05_isoTOP_tcell",
    "control_8_KB05_isoTOP_tcell",
]

In [None]:
# All T-cell acryl columns: active columns first, then the scout columns.
tcell_acryl_cols = [*tcell_acryl_active_cols, *tcell_acryl_scout_cols]

In [None]:
# T-cell dataset: the single DMF column.
tcell_dmf_cols = ["DMF_tcell"]

# Read SP3 Data

In [None]:
# SP3 dataset: the single ratio column (jurkat) used for the chloro group.
sp3_chloro_cols = [
    "Mean Ratio jurkat",
]

# Read SLCABPP Data

In [None]:
# Compound column labels with the '_hct116' suffix: CL1..CL128 followed by
# AC1..AC157, in that order (285 labels in total).
hct_compound_list = (
    [f"CL{number}_hct116" for number in range(1, 129)]
    + [f"AC{number}_hct116" for number in range(1, 158)]
)

In [None]:
# Compound column labels with the '_hek293t' suffix: CL1..CL128 followed by
# AC1..AC157, in that order (285 labels in total).
hek_compound_list = (
    [f"CL{number}_hek293t" for number in range(1, 129)]
    + [f"AC{number}_hek293t" for number in range(1, 158)]
)

In [None]:
# Compound column labels with the '_patu-8988t' suffix: CL1..CL128 followed
# by AC1..AC157, in that order (285 labels in total).
patu_compound_list = (
    [f"CL{number}_patu-8988t" for number in range(1, 129)]
    + [f"AC{number}_patu-8988t" for number in range(1, 158)]
)

In [None]:
# Competition-ratio columns assigned to the chloro group.
slc_chloro_cols = [
    "Comp ratio (KBO2)",
    "Comp ratio (KBO3)",
]

In [None]:
# Competition-ratio column assigned to the acryl group.
slc_acryl_cols = [
    "Comp ratio (KBO5)",
]

In [None]:
def get_ligandable_category(df, probe_cols, tcell, cutoff):
    """Label each row by whether any probe column reaches ``cutoff``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in ``probe_cols``. It is not
        modified.
    probe_cols : list of str
        Columns whose values are compared against ``cutoff``.
    tcell : bool
        If truthy, '--' entries are replaced by 0 before comparing;
        otherwise NaN entries are replaced by 0.
    cutoff : int or float
        A row is labelled when at least one probe value is ``>= cutoff``.

    Returns
    -------
    list
        One entry per row of ``df``: 'yes' for labelled rows, ``np.nan``
        otherwise.

    Raises
    ------
    TypeError
        If a probe column holds values that cannot be compared with
        ``cutoff`` (e.g. leftover strings). The message names the columns
        that contain strings, replacing the former debug ``print`` calls.
    """
    probes = df[list(probe_cols)]
    probes = probes.replace('--', 0) if tcell else probes.replace(np.nan, 0)

    try:
        # One vectorized comparison instead of a Python loop over every
        # row/column pair.
        liganded = probes.ge(cutoff).any(axis=1)
    except TypeError as err:
        offenders = [
            name
            for position, name in enumerate(probes.columns)
            if probes.iloc[:, position].map(lambda v: isinstance(v, str)).any()
        ]
        raise TypeError(
            f"cannot compare probe columns with cutoff {cutoff!r}; "
            f"columns containing strings: {offenders}"
        ) from err

    return ['yes' if hit else np.nan for hit in liganded]

In [None]:
def get_full_ligandability(df):
    """Add chloroacetamide/acrylamide ligandability labels to a compound table.

    'proteinid' and 'cysteineid' columns are written to ``df`` in place
    (parsed from 'Uniprot ID' and 'Site Position'). The returned frame is a
    copy of ``df`` without the five metadata columns and with two label
    columns, 'chloroacetamide' and 'acrylamide', computed by
    ``get_ligandable_category`` with a cutoff of 4.
    """
    # 'Uniprot ID' looks like 'xx|ACCESSION|yy'; keep the middle field.
    accessions = df['Uniprot ID'].map(lambda x: str(x).split('|')[1])
    df['proteinid'] = accessions
    df['cysteineid'] = df['proteinid'] + '_C' + df['Site Position'].astype(str)

    # Probe columns are selected purely by position; the trailing two
    # columns are excluded by the -2 bound.
    # NOTE(review): position 133 falls in neither slice. If the second
    # compound group starts right after the 128 columns of the first one,
    # its first column is never evaluated. Confirm against the input layout.
    all_columns = df.columns.to_list()
    chloro_cols = all_columns[5:133]
    acryl_cols = all_columns[134:-2]

    chloro_labels = get_ligandable_category(df, chloro_cols, False, 4)
    acryl_labels = get_ligandable_category(df, acryl_cols, False, 4)

    metadata_columns = [
        'Uniprot ID',
        'Gene Symbol',
        'Peptide Sequence',
        'Site Position',
        'Gene + Site',
    ]
    new_df_ligandability = df.copy().drop(columns=metadata_columns)

    new_df_ligandability['chloroacetamide'] = chloro_labels
    new_df_ligandability['acrylamide'] = acryl_labels

    return new_df_ligandability

In [None]:
# Compound labels without a cell-line suffix: CL1..CL128 followed by
# AC1..AC157, in that order (285 labels in total).
cell_line_compound_list = (
    [f"CL{number}" for number in range(1, 129)]
    + [f"AC{number}" for number in range(1, 158)]
)

## cysteineid, chloroacetamide, acrylamide

In [None]:
# Chloroacetamide columns of the slcabpp table: CL1..CL128 for each cell line
# (hct116, then hek293t, then patu-8988t), followed by two extra hct116
# 'Comp ratio' columns. The order matters: columns are renumbered by position
# later on.
slcabpp_chloro_cols = [
    'CL' + str(compound) + '_' + cell_line
    for cell_line in ('hct116', 'hek293t', 'patu-8988t')
    for compound in range(1, 129)
] + ['Comp ratio (KBO2) hct116', 'Comp ratio (KBO3) hct116']

In [None]:
# Acrylamide columns of the slcabpp table: AC1..AC157 for each cell line
# (hct116, then hek293t, then patu-8988t), followed by one extra hct116
# 'Comp ratio' column. The order matters: columns are renumbered by position
# later on.
slcabpp_acryl_cols = [
    'AC' + str(compound) + '_' + cell_line
    for cell_line in ('hct116', 'hek293t', 'patu-8988t')
    for compound in range(1, 158)
] + ['Comp ratio (KBO5) hct116']

# Read DIA Data

## cysteineid, chloroacetamide, acrylamide

In [None]:
# Chloroacetamide columns of the DIA table, named F<number>_ramos.
# The numbering is not contiguous, so the numbers are listed explicitly.
dia_chloro_cols = [
    'F' + str(number) + '_ramos'
    for number in (2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 20, 21, 27, 28, 30, 32, 33, 52)
]

In [None]:
# Acrylamide columns of the DIA table, named F<number>_ramos.
dia_acryl_cols = ['F' + str(number) + '_ramos' for number in (5, 14, 23, 31, 38, 56)]

In [None]:
def get_dup_ids(df):
    """Collect the repeated entries of the ``cysteineid`` column.

    The column is scanned top to bottom. The first time an id is met it is
    only remembered; every later occurrence is appended to the result, so an
    id present three times appears twice in the output.

    Args:
        df: DataFrame with a ``cysteineid`` column.

    Returns:
        List of ids in the order the repeats were met (empty when every id
        is unique).
    """
    # A set keeps the scan linear; membership tests on a growing list made
    # the original quadratic in the number of rows.
    seen = set()
    dup_ids = []
    for cysteineid in df['cysteineid']:
        if cysteineid in seen:
            dup_ids.append(cysteineid)
        else:
            seen.add(cysteineid)

    return dup_ids

# Warhead Dataset Counts

In [None]:
# Return to the directory stored in `cd` so the relative chdir in the next
# cell always starts from the same place.
os.chdir(cd)

In [None]:
# The CSV files below are read and written with bare file names, i.e.
# relative to this 'results' folder.
os.chdir('results')

In [None]:
# `discovery_*` variables hold the study labelled 'backus_cravatt' below.
discovery_df = pd.read_csv('backus_ligandable_dataset.csv')

In [None]:
discovery_df.shape, len(discovery_df['cysteineid'].unique())

In [None]:
# Keep only the cysteine id and the warhead flag columns. The explicit copy
# makes the subset an independent frame, so adding the 'dataset' column to it
# in the next cell does not raise pandas' SettingWithCopyWarning.
subset_discovery_df = discovery_df[['cysteineid', 'other', 'chloroacetamide', 'acrylamide', 'bromoacetamide']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
subset_discovery_df['dataset'] = 'backus_cravatt'

In [None]:
# `suzuki_*` variables hold the study labelled 'cao_backus' below.
suzuki_df = pd.read_csv('cao_ligandable_dataset.csv')

In [None]:
suzuki_df.shape, len(suzuki_df['cysteineid'].unique())

In [None]:
# Keep only the cysteine id and the warhead flag columns. The explicit copy
# makes the subset an independent frame, so adding the 'dataset' column to it
# in the next cell does not raise pandas' SettingWithCopyWarning.
subset_suzuki_df = suzuki_df[['cysteineid', 'other', 'chloroacetamide', 'acrylamide']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
subset_suzuki_df['dataset'] = 'cao_backus'

In [None]:
# `tcell_*` variables hold the study labelled 'vinogradova_cravatt' below.
tcell_df = pd.read_csv('vinogradova_ligandable_dataset.csv')

In [None]:
# Keep only the cysteine id and the warhead flag columns (this is the only
# study with a 'dmf' flag). The explicit copy makes the subset an independent
# frame, so adding the 'dataset' column to it in the next cell does not raise
# pandas' SettingWithCopyWarning.
subset_tcell_df = tcell_df[['cysteineid', 'chloroacetamide', 'acrylamide', 'dmf']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
subset_tcell_df['dataset'] = 'vinogradova_cravatt'

In [None]:
# `sp3_*` variables hold the study labelled 'yan_backus' below.
sp3_df = pd.read_csv('yan_ligandable_dataset.csv')

In [None]:
sp3_df.shape, len(sp3_df['cysteineid'].unique())

In [None]:
# Keep only the cysteine id and the single warhead flag used from this study.
# The explicit copy makes the subset an independent frame, so adding the
# 'dataset' column to it in the next cell does not raise pandas'
# SettingWithCopyWarning.
subset_sp3_df = sp3_df[['cysteineid', 'chloroacetamide']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
subset_sp3_df['dataset'] = 'yan_backus'

In [None]:
# `slcabpp_*` variables hold the study labelled 'kuljanin_gygi' below.
slcabpp_df = pd.read_csv('kuljanin_ligandable_dataset.csv')

In [None]:
slcabpp_df.shape, len(slcabpp_df['cysteineid'].unique())

In [None]:
# Keep only the cysteine id and the warhead flag columns. The explicit copy
# makes the subset an independent frame, so adding the 'dataset' column to it
# in the next cell does not raise pandas' SettingWithCopyWarning.
subset_slcabpp_df = slcabpp_df[['cysteineid', 'chloroacetamide', 'acrylamide']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
subset_slcabpp_df['dataset'] = 'kuljanin_gygi'

In [None]:
# `dia_*` variables hold the DIA study read from the 'yang' CSV.
dia_df = pd.read_csv('yang_ligandable_dataset.csv')

In [None]:
dia_df.shape, len(dia_df['cysteineid'].unique())

In [None]:
# Keep only the cysteine id and the warhead flag columns. The explicit copy
# makes the subset an independent frame, so adding the 'dataset' column to it
# in the next cell does not raise pandas' SettingWithCopyWarning.
subset_dia_df = dia_df[['cysteineid', 'chloroacetamide', 'acrylamide']].copy()

In [None]:
# Tag every row with its source study; get_warhead_count filters on this column.
# NOTE(review): the same study is spelled 'yang_wang' elsewhere in this
# notebook (e.g. the 'yang_wang_ligandable' column). The label here has to
# stay in sync with the name passed to get_warhead_count, so change both
# together if the spelling is unified.
subset_dia_df['dataset'] = 'wang_yang'

## dataset, cysteineid, acryl, acet, chloro, dmf, bromo

In [None]:
# Stack the six per-study subsets. The frames do not share all warhead
# columns; a flag column a study does not have is filled with NaN for its rows.
concat_df = pd.concat([subset_discovery_df, subset_suzuki_df, subset_tcell_df, subset_sp3_df, subset_slcabpp_df, subset_dia_df])

In [None]:
# Cells whose whole value is a single space become empty strings
# (this is a whole-value match, not a substring replacement).
concat_df = concat_df.replace(' ', '')

In [None]:
# Drop rows that are identical in every column, 'dataset' included, so the
# same cysteine reported by two studies is still kept once per study.
concat_df = concat_df.drop_duplicates()

In [None]:
len(concat_df['cysteineid'].unique())

In [None]:
def get_warhead_count(df, search_datasets):
    """Count the rows flagged 'yes' for each warhead class in each dataset.

    Args:
        df: DataFrame with a 'dataset' column and warhead flag columns
            ('acrylamide', 'bromoacetamide', 'chloroacetamide', 'dmf',
            'other') whose positive value is the string 'yes'. A warhead
            column that is absent from ``df`` is reported with a count of 0.
        search_datasets: Dataset labels to report, in output order.

    Returns:
        Long-format DataFrame with the columns 'dataset', 'category' and
        'count': five rows per requested dataset, one per warhead class in
        the order listed above.
    """
    warhead_cols = ('acrylamide', 'bromoacetamide', 'chloroacetamide', 'dmf', 'other')

    datasets = []
    warheads = []
    counts = []

    for dataset in search_datasets:
        current_dataset = df[df['dataset'] == dataset]

        for warhead in warhead_cols:
            datasets.append(dataset)
            warheads.append(warhead)
            if warhead in current_dataset.columns:
                counts.append(int((current_dataset[warhead] == 'yes').sum()))
            else:
                # None of the frames in ``df`` carries this warhead class.
                counts.append(0)

    return pd.DataFrame({'dataset': datasets, 'category': warheads, 'count': counts})

In [None]:
# The labels must match the 'dataset' values assigned to the subsets above;
# a label with no matching rows simply yields zero counts.
new_df = get_warhead_count(concat_df, ['backus_cravatt', 'cao_backus', 'vinogradova_cravatt', 'yan_backus', 'kuljanin_gygi', 'wang_yang'])

In [None]:
# One row per (dataset, warhead category) pair; written to the current
# working directory without the index column.
new_df.to_csv('cysteineomedb_warhead_dataset_counts.csv', index = False)

# Merge ligandability 

In [None]:
# use ligandability labeling from individual csvs due to differences in criteria

In [None]:
# merge competition ratios

In [None]:
# Reset to the base directory first so the relative 'results' path resolves
# correctly no matter where an earlier cell left the working directory.
os.chdir(cd)
os.chdir('results')

In [None]:
# Reload the full backus_cravatt table (all columns) for the merge below.
discovery_df = pd.read_csv('backus_ligandable_dataset.csv')

In [None]:
# Give the study's ligandability label a study-specific name so the labels of
# all studies can sit side by side after the merge.
# NOTE(review): an earlier cell already reads a 'cysteineid' column from this
# same CSV, so renaming 'Identifier' to 'cysteineid' is either a no-op or
# creates a second column with that name - confirm which.
discovery_df = discovery_df.rename(columns = {'Identifier': 'cysteineid', 'ligandable': 'backus_cravatt_ligandable'})
# Drop the per-study warhead flags; pooled flags are recomputed after the merge.
subset_discovery_df = discovery_df.drop(columns = ['other', 'chloroacetamide', 'acrylamide', 'bromoacetamide'])

In [None]:
# Reload the full cao_backus table (all columns) for the merge below.
suzuki_df = pd.read_csv('cao_ligandable_dataset.csv')

In [None]:
# Study-specific name for the ligandability label, then drop the protein id
# and the per-study warhead flags (both are recomputed after the merge).
suzuki_df = suzuki_df.rename(columns = {'ligandable': 'cao_backus_ligandable'})
subset_suzuki_df = suzuki_df.drop(columns = ['proteinid', 'other', 'chloroacetamide', 'acrylamide'])

In [None]:
# Reload the full vinogradova_cravatt table (all columns) for the merge below.
tcell_df = pd.read_csv('vinogradova_ligandable_dataset.csv')

In [None]:
# Study-specific name for the ligandability label, then drop the protein id
# and the chloroacetamide/acrylamide flags.
# NOTE(review): the 'dmf' flag column is not dropped here, so it is carried
# into the merged table; it is overwritten later when the pooled 'dmf' flag
# is assigned.
tcell_df = tcell_df.rename(columns = {'ligandable': 'vinogradova_cravatt_ligandable'})
subset_tcell_df = tcell_df.drop(columns = ['proteinid', 'chloroacetamide', 'acrylamide'])

In [None]:
# Reload the full yan_backus table (all columns) for the merge below.
sp3_df = pd.read_csv('yan_ligandable_dataset.csv')

In [None]:
# Study-specific name for the ligandability label.
sp3_df = sp3_df.rename(columns = {'ligandable': 'yan_backus_ligandable'})
# Same pattern as the other studies: keep every column except the protein id
# and the warhead flags. A narrower three-column selection that used to be
# assigned to subset_sp3_df first was overwritten immediately by this line
# and never used, so it has been removed.
subset_sp3_df = sp3_df.drop(columns = ['proteinid', 'chloroacetamide', 'acrylamide'])

In [None]:
# NOTE(review): this is the second read of the same CSV in this section. It
# replaces the suzuki_df whose 'ligandable' column was renamed above, so from
# here on suzuki_df has the original column names again. subset_suzuki_df
# was built before this line and is unaffected. Confirm this is intended.
suzuki_df = pd.read_csv('cao_ligandable_dataset.csv')

In [None]:
# Reload the full kuljanin_gygi table (all columns) for the merge below.
slcabpp_df = pd.read_csv('kuljanin_ligandable_dataset.csv')

In [None]:
# The protein id is the part of the cysteine id before the first underscore.
# NOTE(review): presumably added because this CSV has no 'proteinid' column
# and the drop in the next cell would fail without it - confirm.
slcabpp_df['proteinid'] = slcabpp_df['cysteineid'].map(lambda x: str(x).split('_')[0])

In [None]:
# Study-specific name for the ligandability label, then drop the protein id
# and the per-study warhead flags (both are recomputed after the merge).
slcabpp_df = slcabpp_df.rename(columns = {'ligandable': 'kuljanin_gygi_ligandable'})
subset_slcabpp_df = slcabpp_df.drop(columns = ['proteinid', 'chloroacetamide', 'acrylamide'])

In [None]:
# Reload the full DIA table (all columns) for the merge below.
dia_df = pd.read_csv('yang_ligandable_dataset.csv')

In [None]:
# Study-specific name for the ligandability label, then drop the protein id
# and the per-study warhead flags (both are recomputed after the merge).
dia_df = dia_df.rename(columns = {'ligandable': 'yang_wang_ligandable'})
subset_dia_df = dia_df.drop(columns = ['proteinid', 'chloroacetamide', 'acrylamide'])

In [None]:
def get_exp_df(dfs):
    """Return a one-column frame with the distinct ``cysteineid`` values in ``dfs``.

    The frames are stacked in list order, cells equal to a single space are
    turned into empty strings, and only the first row of each id is kept
    (together with the index label that row had in its source frame).
    """
    stacked = pd.concat(dfs).replace(' ', '')
    return stacked[['cysteineid']].drop_duplicates()

In [None]:
# Every cysteine id seen in any study, one row per id. This is the left-hand
# table the per-study frames are merged onto.
ligandable_df = get_exp_df([subset_discovery_df, 
                            subset_suzuki_df, 
                            subset_tcell_df,
                            subset_sp3_df,
                            subset_slcabpp_df,
                            subset_dia_df])

In [None]:
def merge_ligandable(dfs, id_df):
    """Left-join each frame in ``dfs`` onto ``id_df`` using ``cysteineid``.

    The joins are applied one after another in list order. Every row of
    ``id_df`` is kept, and ids missing from a frame get NaN in that frame's
    columns. With an empty ``dfs`` the input ``id_df`` is returned as is.
    """
    merged = id_df
    for frame in dfs:
        merged = merged.merge(frame, on='cysteineid', how='left')

    return merged

In [None]:
# Attach every study's columns to the id list. A study that lists the same
# cysteine id more than once multiplies that id's rows in the result.
id_df = merge_ligandable([subset_discovery_df, 
                            subset_suzuki_df, 
                            subset_tcell_df,
                            subset_sp3_df,
                            subset_slcabpp_df,
                            subset_dia_df], 
                         ligandable_df)

In [None]:
id_df.shape

In [None]:
# Ratio column names of all studies, pooled per warhead class.
# NOTE(review): the new_*_cols lists defined further down hold the same
# groups (the chloro list there puts suzuki before tcell); those lists, not
# these, are the ones passed to rename_cols in the cells that follow.
chloro_cols = discovery_chloro_cols + tcell_chloro_cols + suzuki_chloro_cols + sp3_chloro_cols + slcabpp_chloro_cols + dia_chloro_cols
bromo_cols = discovery_bromo_cols
other_cols = discovery_other_cols + suzuki_other_cols
acryl_cols = discovery_acryl_cols + tcell_acryl_cols + suzuki_acryl_cols + slcabpp_acryl_cols+ dia_acryl_cols
dmf_cols = tcell_dmf_cols

In [None]:
def get_warhead_label(dfs, warhead):
    """List the distinct cysteine ids flagged 'yes' for ``warhead``.

    Frames without a ``warhead`` column are ignored. The ids of all other
    frames are pooled and de-duplicated. The order of the returned list is
    not defined.
    """
    pooled_ids = set()
    for frame in dfs:
        if warhead not in frame.columns:
            continue
        flagged = frame.loc[frame[warhead] == 'yes', 'cysteineid']
        pooled_ids.update(flagged.unique())

    return list(pooled_ids)

In [None]:
# Ids flagged 'yes' for chloroacetamide in any of the six study tables.
chloro_warheads = get_warhead_label([discovery_df, 
                            suzuki_df, 
                            tcell_df,
                            sp3_df,
                            slcabpp_df,
                            dia_df], 
                            'chloroacetamide')

In [None]:
# Ids flagged 'yes' for bromoacetamide; tables without that column are skipped.
bromo_warheads = get_warhead_label([discovery_df, 
                            suzuki_df, 
                            tcell_df,
                            sp3_df,
                            slcabpp_df,
                            dia_df], 
                            'bromoacetamide')

In [None]:
# Ids flagged 'yes' in the 'other' warhead column; tables without it are skipped.
other_warheads = get_warhead_label([discovery_df, 
                            suzuki_df, 
                            tcell_df,
                            sp3_df,
                            slcabpp_df,
                            dia_df], 
                            'other')

In [None]:
# Ids flagged 'yes' for acrylamide; tables without that column are skipped.
acryl_warheads = get_warhead_label([discovery_df, 
                            suzuki_df, 
                            tcell_df,
                            sp3_df,
                            slcabpp_df,
                            dia_df], 
                            'acrylamide')

In [None]:
# Ids flagged 'yes' in the 'dmf' column; tables without it are skipped.
dmf_warheads = get_warhead_label([discovery_df, 
                            suzuki_df, 
                            tcell_df,
                            sp3_df,
                            slcabpp_df,
                            dia_df], 
                            'dmf')

In [None]:
# Work on a copy so the merged table in id_df stays available unchanged.
lig_id_df = id_df.copy()

In [None]:
# Ratio column names pooled per warhead class. The position of a name in its
# list determines the number it receives from rename_cols, so the order in
# which the studies are concatenated here fixes the final column numbering.
new_chloro_cols = discovery_chloro_cols + suzuki_chloro_cols + tcell_chloro_cols + sp3_chloro_cols + slcabpp_chloro_cols + dia_chloro_cols
new_bromo_cols = discovery_bromo_cols
new_other_cols = discovery_other_cols + suzuki_other_cols
new_acryl_cols = discovery_acryl_cols + tcell_acryl_cols + suzuki_acryl_cols + slcabpp_acryl_cols + dia_acryl_cols
new_dmf_cols = tcell_dmf_cols

In [None]:
def rename_cols(df, start_num, abbrev, cols):
    """Rename ``cols`` to ``<abbrev>_<n>``, numbering them by list position.

    Args:
        df: DataFrame whose columns are renamed. It is not modified.
        start_num: Number given to the first entry of ``cols``.
        abbrev: Prefix of the new names.
        cols: Current column names, in numbering order. Names that are not
            columns of ``df`` still consume a number and appear in the
            returned mapping, but rename nothing.

    Returns:
        Tuple ``(renamed_df, mapping)``, where ``mapping`` maps each old
        name to its new name.
    """
    new_cols_dict = {
        old_name: abbrev + '_' + str(start_num + offset)
        for offset, old_name in enumerate(cols)
    }

    # Apply the whole mapping in a single rename. Renaming one column at a
    # time copied the frame once per column, and a column that had just been
    # renamed could be renamed again when its new name also occurred later in
    # ``cols``.
    return df.rename(columns = new_cols_dict), new_cols_dict

In [None]:
# Chloroacetamide ratio columns become CL_1, CL_2, ... in list order.
updated_col_df, updated_chloro_cols_dict = rename_cols(lig_id_df, 1, 'CL', new_chloro_cols)

In [None]:
# Bromoacetamide ratio columns become BR_1, BR_2, ... in list order.
updated_col_df, updated_bromo_cols_dict = rename_cols(updated_col_df, 1, 'BR', new_bromo_cols)


In [None]:
# 'Other' warhead ratio columns become OTHER_1, OTHER_2, ... in list order.
updated_col_df, updated_other_cols_dict = rename_cols(updated_col_df, 1, 'OTHER', new_other_cols)


In [None]:
# Acrylamide ratio columns become ACRYL_1, ACRYL_2, ... in list order.
updated_col_df, updated_acryl_cols_dict = rename_cols(updated_col_df, 1, 'ACRYL', new_acryl_cols)


In [None]:
# DMF ratio columns become DMF_1, DMF_2, ... in list order.
updated_col_df, updated_dmf_cols_dict = rename_cols(updated_col_df, 1, 'DMF', new_dmf_cols)

In [None]:
# Pooled warhead flags: 'yes' when any study flagged the cysteine for that
# warhead class, otherwise None. An assignment replaces a same-named column
# that came in through the merge (the 'dmf' flag of the vinogradova table).
updated_col_df['chloroacetamide'] = np.where(updated_col_df['cysteineid'].isin(chloro_warheads), 'yes', None)
updated_col_df['bromoacetamide'] = np.where(updated_col_df['cysteineid'].isin(bromo_warheads), 'yes', None)
updated_col_df['other'] = np.where(updated_col_df['cysteineid'].isin(other_warheads), 'yes', None)
updated_col_df['acrylamide'] = np.where(updated_col_df['cysteineid'].isin(acryl_warheads), 'yes', None)
updated_col_df['dmf'] = np.where(updated_col_df['cysteineid'].isin(dmf_warheads), 'yes', None)

In [None]:
def get_ligandable(df):
    """Combine the per-study ligandability labels into one label per row.

    A row is labelled 'yes' when at least one of the six
    ``<study>_ligandable`` columns holds the string 'yes'; otherwise its
    label is None.

    Args:
        df: DataFrame containing all six ``<study>_ligandable`` columns.

    Returns:
        List with one entry ('yes' or None) per row of ``df``, in row order.

    Raises:
        KeyError: If ``df`` has rows but lacks one of the study columns.
    """
    study_cols = [
        'backus_cravatt_ligandable',
        'cao_backus_ligandable',
        'vinogradova_cravatt_ligandable',
        'yan_backus_ligandable',
        'kuljanin_gygi_ligandable',
        'yang_wang_ligandable',
    ]

    if len(df) == 0:
        return []

    # Compare all six columns at once rather than building a Series for
    # every row of a table that is hundreds of columns wide.
    is_ligandable = (df[study_cols] == 'yes').any(axis=1)

    return ['yes' if flag else None for flag in is_ligandable]

In [None]:
# One pooled label per row: 'yes' if any study calls the cysteine ligandable.
lig_labels = get_ligandable(updated_col_df)

In [None]:
# The labels are in row order, so they can be attached positionally.
updated_col_df['ligandable'] = lig_labels

In [None]:
updated_col_df['ligandable'].value_counts()

In [None]:
# The protein id is the part of the cysteine id before the first underscore.
updated_col_df['proteinid'] = updated_col_df['cysteineid'].map(lambda x: str(x).split('_')[0])

In [None]:
len(updated_col_df['cysteineid'].unique())

In [None]:
updated_col_df.shape

In [None]:
# Ids that occupy more than one row of the merged table. A non-empty result
# would mean some study table listed an id repeatedly and the left merges
# multiplied its rows.
dup_tot_ids = get_dup_ids(updated_col_df)

In [None]:
dup_tot_ids

In [None]:
# Print every column name on its own line, so the full list can be inspected
# without the truncation applied to a displayed Index.
updated_cols = updated_col_df.columns.to_list()
for col_name in updated_cols:
    print(col_name)

In [None]:
# The residue token is the second underscore-separated field of the cysteine
# id. Removing the letter 'C' leaves the residue number, which stays a string
# (no numeric conversion is done here).
updated_col_df['resid'] = updated_col_df['cysteineid'].map((lambda x: str(x).split('_')[1]))
updated_col_df['resid'] = updated_col_df['resid'].str.replace('C', '')

In [None]:
# Reorder the columns by name. Names are compared as strings, so numbered
# columns sort lexicographically (e.g. ACRYL_1, ACRYL_10, ACRYL_100, ...).
subset = updated_col_df.reindex(sorted(updated_col_df.columns), axis=1)
subset_cols = subset.columns.to_list()

In [None]:
subset_df = subset[[
 'proteinid',
 'cysteineid',
 'resid',
 'ligandable',  
 'backus_cravatt_ligandable',
 'cao_backus_ligandable',
 'kuljanin_gygi_ligandable',
 'vinogradova_cravatt_ligandable',
 'yan_backus_ligandable',
 'yang_wang_ligandable',
 'acrylamide',
 'bromoacetamide',
 'chloroacetamide',
 'dmf',
 'other',
 'ACRYL_1',
 'ACRYL_10',
 'ACRYL_100',
 'ACRYL_101',
 'ACRYL_102',
 'ACRYL_103',
 'ACRYL_104',
 'ACRYL_105',
 'ACRYL_106',
 'ACRYL_107',
 'ACRYL_108',
 'ACRYL_109',
 'ACRYL_11',
 'ACRYL_110',
 'ACRYL_111',
 'ACRYL_112',
 'ACRYL_113',
 'ACRYL_114',
 'ACRYL_115',
 'ACRYL_116',
 'ACRYL_117',
 'ACRYL_118',
 'ACRYL_119',
 'ACRYL_12',
 'ACRYL_120',
 'ACRYL_121',
 'ACRYL_122',
 'ACRYL_123',
 'ACRYL_124',
 'ACRYL_125',
 'ACRYL_126',
 'ACRYL_127',
 'ACRYL_128',
 'ACRYL_129',
 'ACRYL_13',
 'ACRYL_130',
 'ACRYL_131',
 'ACRYL_132',
 'ACRYL_133',
 'ACRYL_134',
 'ACRYL_135',
 'ACRYL_136',
 'ACRYL_137',
 'ACRYL_138',
 'ACRYL_139',
 'ACRYL_14',
 'ACRYL_140',
 'ACRYL_141',
 'ACRYL_142',
 'ACRYL_143',
 'ACRYL_144',
 'ACRYL_145',
 'ACRYL_146',
 'ACRYL_147',
 'ACRYL_148',
 'ACRYL_149',
 'ACRYL_15',
 'ACRYL_150',
 'ACRYL_151',
 'ACRYL_152',
 'ACRYL_153',
 'ACRYL_154',
 'ACRYL_155',
 'ACRYL_156',
 'ACRYL_157',
 'ACRYL_158',
 'ACRYL_159',
 'ACRYL_16',
 'ACRYL_160',
 'ACRYL_161',
 'ACRYL_162',
 'ACRYL_163',
 'ACRYL_164',
 'ACRYL_165',
 'ACRYL_166',
 'ACRYL_167',
 'ACRYL_168',
 'ACRYL_169',
 'ACRYL_17',
 'ACRYL_170',
 'ACRYL_171',
 'ACRYL_172',
 'ACRYL_173',
 'ACRYL_174',
 'ACRYL_175',
 'ACRYL_176',
 'ACRYL_177',
 'ACRYL_178',
 'ACRYL_179',
 'ACRYL_18',
 'ACRYL_180',
 'ACRYL_181',
 'ACRYL_182',
 'ACRYL_183',
 'ACRYL_184',
 'ACRYL_185',
 'ACRYL_186',
 'ACRYL_187',
 'ACRYL_188',
 'ACRYL_189',
 'ACRYL_19',
 'ACRYL_190',
 'ACRYL_191',
 'ACRYL_192',
 'ACRYL_193',
 'ACRYL_194',
 'ACRYL_195',
 'ACRYL_196',
 'ACRYL_197',
 'ACRYL_198',
 'ACRYL_199',
 'ACRYL_2',
 'ACRYL_20',
 'ACRYL_200',
 'ACRYL_201',
 'ACRYL_202',
 'ACRYL_203',
 'ACRYL_204',
 'ACRYL_205',
 'ACRYL_206',
 'ACRYL_207',
 'ACRYL_208',
 'ACRYL_209',
 'ACRYL_21',
 'ACRYL_210',
 'ACRYL_211',
 'ACRYL_212',
 'ACRYL_213',
 'ACRYL_214',
 'ACRYL_215',
 'ACRYL_216',
 'ACRYL_217',
 'ACRYL_218',
 'ACRYL_219',
 'ACRYL_22',
 'ACRYL_220',
 'ACRYL_221',
 'ACRYL_222',
 'ACRYL_223',
 'ACRYL_224',
 'ACRYL_225',
 'ACRYL_226',
 'ACRYL_227',
 'ACRYL_228',
 'ACRYL_229',
 'ACRYL_23',
 'ACRYL_230',
 'ACRYL_231',
 'ACRYL_232',
 'ACRYL_233',
 'ACRYL_234',
 'ACRYL_235',
 'ACRYL_236',
 'ACRYL_237',
 'ACRYL_238',
 'ACRYL_239',
 'ACRYL_24',
 'ACRYL_240',
 'ACRYL_241',
 'ACRYL_242',
 'ACRYL_243',
 'ACRYL_244',
 'ACRYL_245',
 'ACRYL_246',
 'ACRYL_247',
 'ACRYL_248',
 'ACRYL_249',
 'ACRYL_25',
 'ACRYL_250',
 'ACRYL_251',
 'ACRYL_252',
 'ACRYL_253',
 'ACRYL_254',
 'ACRYL_255',
 'ACRYL_256',
 'ACRYL_257',
 'ACRYL_258',
 'ACRYL_259',
 'ACRYL_26',
 'ACRYL_260',
 'ACRYL_261',
 'ACRYL_262',
 'ACRYL_263',
 'ACRYL_264',
 'ACRYL_265',
 'ACRYL_266',
 'ACRYL_267',
 'ACRYL_268',
 'ACRYL_269',
 'ACRYL_27',
 'ACRYL_270',
 'ACRYL_271',
 'ACRYL_272',
 'ACRYL_273',
 'ACRYL_274',
 'ACRYL_275',
 'ACRYL_276',
 'ACRYL_277',
 'ACRYL_278',
 'ACRYL_279',
 'ACRYL_28',
 'ACRYL_280',
 'ACRYL_281',
 'ACRYL_282',
 'ACRYL_283',
 'ACRYL_284',
 'ACRYL_285',
 'ACRYL_286',
 'ACRYL_287',
 'ACRYL_288',
 'ACRYL_289',
 'ACRYL_29',
 'ACRYL_290',
 'ACRYL_291',
 'ACRYL_292',
 'ACRYL_293',
 'ACRYL_294',
 'ACRYL_295',
 'ACRYL_296',
 'ACRYL_297',
 'ACRYL_298',
 'ACRYL_299',
 'ACRYL_3',
 'ACRYL_30',
 'ACRYL_300',
 'ACRYL_301',
 'ACRYL_302',
 'ACRYL_303',
 'ACRYL_304',
 'ACRYL_305',
 'ACRYL_306',
 'ACRYL_307',
 'ACRYL_308',
 'ACRYL_309',
 'ACRYL_31',
 'ACRYL_310',
 'ACRYL_311',
 'ACRYL_312',
 'ACRYL_313',
 'ACRYL_314',
 'ACRYL_315',
 'ACRYL_316',
 'ACRYL_317',
 'ACRYL_318',
 'ACRYL_319',
 'ACRYL_32',
 'ACRYL_320',
 'ACRYL_321',
 'ACRYL_322',
 'ACRYL_323',
 'ACRYL_324',
 'ACRYL_325',
 'ACRYL_326',
 'ACRYL_327',
 'ACRYL_328',
 'ACRYL_329',
 'ACRYL_33',
 'ACRYL_330',
 'ACRYL_331',
 'ACRYL_332',
 'ACRYL_333',
 'ACRYL_334',
 'ACRYL_335',
 'ACRYL_336',
 'ACRYL_337',
 'ACRYL_338',
 'ACRYL_339',
 'ACRYL_34',
 'ACRYL_340',
 'ACRYL_341',
 'ACRYL_342',
 'ACRYL_343',
 'ACRYL_344',
 'ACRYL_345',
 'ACRYL_346',
 'ACRYL_347',
 'ACRYL_348',
 'ACRYL_349',
 'ACRYL_35',
 'ACRYL_350',
 'ACRYL_351',
 'ACRYL_352',
 'ACRYL_353',
 'ACRYL_354',
 'ACRYL_355',
 'ACRYL_356',
 'ACRYL_357',
 'ACRYL_358',
 'ACRYL_359',
 'ACRYL_36',
 'ACRYL_360',
 'ACRYL_361',
 'ACRYL_362',
 'ACRYL_363',
 'ACRYL_364',
 'ACRYL_365',
 'ACRYL_366',
 'ACRYL_367',
 'ACRYL_368',
 'ACRYL_369',
 'ACRYL_37',
 'ACRYL_370',
 'ACRYL_371',
 'ACRYL_372',
 'ACRYL_373',
 'ACRYL_374',
 'ACRYL_375',
 'ACRYL_376',
 'ACRYL_377',
 'ACRYL_378',
 'ACRYL_379',
 'ACRYL_38',
 'ACRYL_380',
 'ACRYL_381',
 'ACRYL_382',
 'ACRYL_383',
 'ACRYL_384',
 'ACRYL_385',
 'ACRYL_386',
 'ACRYL_387',
 'ACRYL_388',
 'ACRYL_389',
 'ACRYL_39',
 'ACRYL_390',
 'ACRYL_391',
 'ACRYL_392',
 'ACRYL_393',
 'ACRYL_394',
 'ACRYL_395',
 'ACRYL_396',
 'ACRYL_397',
 'ACRYL_398',
 'ACRYL_399',
 'ACRYL_4',
 'ACRYL_40',
 'ACRYL_400',
 'ACRYL_401',
 'ACRYL_402',
 'ACRYL_403',
 'ACRYL_404',
 'ACRYL_405',
 'ACRYL_406',
 'ACRYL_407',
 'ACRYL_408',
 'ACRYL_409',
 'ACRYL_41',
 'ACRYL_410',
 'ACRYL_411',
 'ACRYL_412',
 'ACRYL_413',
 'ACRYL_414',
 'ACRYL_415',
 'ACRYL_416',
 'ACRYL_417',
 'ACRYL_418',
 'ACRYL_419',
 'ACRYL_42',
 'ACRYL_420',
 'ACRYL_421',
 'ACRYL_422',
 'ACRYL_423',
 'ACRYL_424',
 'ACRYL_425',
 'ACRYL_426',
 'ACRYL_427',
 'ACRYL_428',
 'ACRYL_429',
 'ACRYL_43',
 'ACRYL_430',
 'ACRYL_431',
 'ACRYL_432',
 'ACRYL_433',
 'ACRYL_434',
 'ACRYL_435',
 'ACRYL_436',
 'ACRYL_437',
 'ACRYL_438',
 'ACRYL_439',
 'ACRYL_44',
 'ACRYL_440',
 'ACRYL_441',
 'ACRYL_442',
 'ACRYL_443',
 'ACRYL_444',
 'ACRYL_445',
 'ACRYL_446',
 'ACRYL_447',
 'ACRYL_448',
 'ACRYL_449',
 'ACRYL_45',
 'ACRYL_450',
 'ACRYL_451',
 'ACRYL_452',
 'ACRYL_453',
 'ACRYL_454',
 'ACRYL_455',
 'ACRYL_456',
 'ACRYL_457',
 'ACRYL_458',
 'ACRYL_459',
 'ACRYL_46',
 'ACRYL_460',
 'ACRYL_461',
 'ACRYL_462',
 'ACRYL_463',
 'ACRYL_464',
 'ACRYL_465',
 'ACRYL_466',
 'ACRYL_467',
 'ACRYL_468',
 'ACRYL_469',
 'ACRYL_47',
 'ACRYL_470',
 'ACRYL_471',
 'ACRYL_472',
 'ACRYL_473',
 'ACRYL_474',
 'ACRYL_475',
 'ACRYL_476',
 'ACRYL_477',
 'ACRYL_478',
 'ACRYL_479',
 'ACRYL_48',
 'ACRYL_480',
 'ACRYL_481',
 'ACRYL_482',
 'ACRYL_483',
 'ACRYL_484',
 'ACRYL_485',
 'ACRYL_486',
 'ACRYL_487',
 'ACRYL_488',
 'ACRYL_489',
 'ACRYL_49',
 'ACRYL_490',
 'ACRYL_491',
 'ACRYL_492',
 'ACRYL_493',
 'ACRYL_494',
 'ACRYL_495',
 'ACRYL_496',
 'ACRYL_497',
 'ACRYL_498',
 'ACRYL_499',
 'ACRYL_5',
 'ACRYL_50',
 'ACRYL_500',
 'ACRYL_501',
 'ACRYL_502',
 'ACRYL_503',
 'ACRYL_504',
 'ACRYL_505',
 'ACRYL_506',
 'ACRYL_507',
 'ACRYL_508',
 'ACRYL_509',
 'ACRYL_51',
 'ACRYL_510',
 'ACRYL_511',
 'ACRYL_512',
 'ACRYL_513',
 'ACRYL_514',
 'ACRYL_515',
 'ACRYL_516',
 'ACRYL_517',
 'ACRYL_518',
 'ACRYL_519',
 'ACRYL_52',
 'ACRYL_520',
 'ACRYL_53',
 'ACRYL_54',
 'ACRYL_55',
 'ACRYL_56',
 'ACRYL_57',
 'ACRYL_58',
 'ACRYL_59',
 'ACRYL_6',
 'ACRYL_60',
 'ACRYL_61',
 'ACRYL_62',
 'ACRYL_63',
 'ACRYL_64',
 'ACRYL_65',
 'ACRYL_66',
 'ACRYL_67',
 'ACRYL_68',
 'ACRYL_69',
 'ACRYL_7',
 'ACRYL_70',
 'ACRYL_71',
 'ACRYL_72',
 'ACRYL_73',
 'ACRYL_74',
 'ACRYL_75',
 'ACRYL_76',
 'ACRYL_77',
 'ACRYL_78',
 'ACRYL_79',
 'ACRYL_8',
 'ACRYL_80',
 'ACRYL_81',
 'ACRYL_82',
 'ACRYL_83',
 'ACRYL_84',
 'ACRYL_85',
 'ACRYL_86',
 'ACRYL_87',
 'ACRYL_88',
 'ACRYL_89',
 'ACRYL_9',
 'ACRYL_90',
 'ACRYL_91',
 'ACRYL_92',
 'ACRYL_93',
 'ACRYL_94',
 'ACRYL_95',
 'ACRYL_96',
 'ACRYL_97',
 'ACRYL_98',
 'ACRYL_99',
 'BR_1',
 'CL_1',
 'CL_10',
 'CL_100',
 'CL_101',
 'CL_102',
 'CL_103',
 'CL_104',
 'CL_105',
 'CL_106',
 'CL_107',
 'CL_108',
 'CL_109',
 'CL_11',
 'CL_110',
 'CL_111',
 'CL_112',
 'CL_113',
 'CL_114',
 'CL_115',
 'CL_116',
 'CL_117',
 'CL_118',
 'CL_119',
 'CL_12',
 'CL_120',
 'CL_121',
 'CL_122',
 'CL_123',
 'CL_124',
 'CL_125',
 'CL_126',
 'CL_127',
 'CL_128',
 'CL_129',
 'CL_13',
 'CL_130',
 'CL_131',
 'CL_132',
 'CL_133',
 'CL_134',
 'CL_135',
 'CL_136',
 'CL_137',
 'CL_138',
 'CL_139',
 'CL_14',
 'CL_140',
 'CL_141',
 'CL_142',
 'CL_143',
 'CL_144',
 'CL_145',
 'CL_146',
 'CL_147',
 'CL_148',
 'CL_149',
 'CL_15',
 'CL_150',
 'CL_151',
 'CL_152',
 'CL_153',
 'CL_154',
 'CL_155',
 'CL_156',
 'CL_157',
 'CL_158',
 'CL_159',
 'CL_16',
 'CL_160',
 'CL_161',
 'CL_162',
 'CL_163',
 'CL_164',
 'CL_165',
 'CL_166',
 'CL_167',
 'CL_168',
 'CL_169',
 'CL_17',
 'CL_170',
 'CL_171',
 'CL_172',
 'CL_173',
 'CL_174',
 'CL_175',
 'CL_176',
 'CL_177',
 'CL_178',
 'CL_179',
 'CL_18',
 'CL_180',
 'CL_181',
 'CL_182',
 'CL_183',
 'CL_184',
 'CL_185',
 'CL_186',
 'CL_187',
 'CL_188',
 'CL_189',
 'CL_19',
 'CL_190',
 'CL_191',
 'CL_192',
 'CL_193',
 'CL_194',
 'CL_195',
 'CL_196',
 'CL_197',
 'CL_198',
 'CL_199',
 'CL_2',
 'CL_20',
 'CL_200',
 'CL_201',
 'CL_202',
 'CL_203',
 'CL_204',
 'CL_205',
 'CL_206',
 'CL_207',
 'CL_208',
 'CL_209',
 'CL_21',
 'CL_210',
 'CL_211',
 'CL_212',
 'CL_213',
 'CL_214',
 'CL_215',
 'CL_216',
 'CL_217',
 'CL_218',
 'CL_219',
 'CL_22',
 'CL_220',
 'CL_221',
 'CL_222',
 'CL_223',
 'CL_224',
 'CL_225',
 'CL_226',
 'CL_227',
 'CL_228',
 'CL_229',
 'CL_23',
 'CL_230',
 'CL_231',
 'CL_232',
 'CL_233',
 'CL_234',
 'CL_235',
 'CL_236',
 'CL_237',
 'CL_238',
 'CL_239',
 'CL_24',
 'CL_240',
 'CL_241',
 'CL_242',
 'CL_243',
 'CL_244',
 'CL_245',
 'CL_246',
 'CL_247',
 'CL_248',
 'CL_249',
 'CL_25',
 'CL_250',
 'CL_251',
 'CL_252',
 'CL_253',
 'CL_254',
 'CL_255',
 'CL_256',
 'CL_257',
 'CL_258',
 'CL_259',
 'CL_26',
 'CL_260',
 'CL_261',
 'CL_262',
 'CL_263',
 'CL_264',
 'CL_265',
 'CL_266',
 'CL_267',
 'CL_268',
 'CL_269',
 'CL_27',
 'CL_270',
 'CL_271',
 'CL_272',
 'CL_273',
 'CL_274',
 'CL_275',
 'CL_276',
 'CL_277',
 'CL_278',
 'CL_279',
 'CL_28',
 'CL_280',
 'CL_281',
 'CL_282',
 'CL_283',
 'CL_284',
 'CL_285',
 'CL_286',
 'CL_287',
 'CL_288',
 'CL_289',
 'CL_29',
 'CL_290',
 'CL_291',
 'CL_292',
 'CL_293',
 'CL_294',
 'CL_295',
 'CL_296',
 'CL_297',
 'CL_298',
 'CL_299',
 'CL_3',
 'CL_30',
 'CL_300',
 'CL_301',
 'CL_302',
 'CL_303',
 'CL_304',
 'CL_305',
 'CL_306',
 'CL_307',
 'CL_308',
 'CL_309',
 'CL_31',
 'CL_310',
 'CL_311',
 'CL_312',
 'CL_313',
 'CL_314',
 'CL_315',
 'CL_316',
 'CL_317',
 'CL_318',
 'CL_319',
 'CL_32',
 'CL_320',
 'CL_321',
 'CL_322',
 'CL_323',
 'CL_324',
 'CL_325',
 'CL_326',
 'CL_327',
 'CL_328',
 'CL_329',
 'CL_33',
 'CL_330',
 'CL_331',
 'CL_332',
 'CL_333',
 'CL_334',
 'CL_335',
 'CL_336',
 'CL_337',
 'CL_338',
 'CL_339',
 'CL_34',
 'CL_340',
 'CL_341',
 'CL_342',
 'CL_343',
 'CL_344',
 'CL_345',
 'CL_346',
 'CL_347',
 'CL_348',
 'CL_349',
 'CL_35',
 'CL_350',
 'CL_351',
 'CL_352',
 'CL_353',
 'CL_354',
 'CL_355',
 'CL_356',
 'CL_357',
 'CL_358',
 'CL_359',
 'CL_36',
 'CL_360',
 'CL_361',
 'CL_362',
 'CL_363',
 'CL_364',
 'CL_365',
 'CL_366',
 'CL_367',
 'CL_368',
 'CL_369',
 'CL_37',
 'CL_370',
 'CL_371',
 'CL_372',
 'CL_373',
 'CL_374',
 'CL_375',
 'CL_376',
 'CL_377',
 'CL_378',
 'CL_379',
 'CL_38',
 'CL_380',
 'CL_381',
 'CL_382',
 'CL_383',
 'CL_384',
 'CL_385',
 'CL_386',
 'CL_387',
 'CL_388',
 'CL_389',
 'CL_39',
 'CL_390',
 'CL_391',
 'CL_392',
 'CL_393',
 'CL_394',
 'CL_395',
 'CL_396',
 'CL_397',
 'CL_398',
 'CL_399',
 'CL_4',
 'CL_40',
 'CL_400',
 'CL_401',
 'CL_402',
 'CL_403',
 'CL_404',
 'CL_405',
 'CL_406',
 'CL_407',
 'CL_408',
 'CL_409',
 'CL_41',
 'CL_410',
 'CL_411',
 'CL_412',
 'CL_413',
 'CL_414',
 'CL_415',
 'CL_416',
 'CL_417',
 'CL_418',
 'CL_419',
 'CL_42',
 'CL_420',
 'CL_421',
 'CL_422',
 'CL_423',
 'CL_424',
 'CL_425',
 'CL_426',
 'CL_427',
 'CL_428',
 'CL_429',
 'CL_43',
 'CL_430',
 'CL_431',
 'CL_432',
 'CL_433',
 'CL_434',
 'CL_435',
 'CL_436',
 'CL_437',
 'CL_438',
 'CL_439',
 'CL_44',
 'CL_440',
 'CL_441',
 'CL_442',
 'CL_443',
 'CL_444',
 'CL_445',
 'CL_446',
 'CL_447',
 'CL_448',
 'CL_449',
 'CL_45',
 'CL_450',
 'CL_451',
 'CL_452',
 'CL_453',
 'CL_454',
 'CL_455',
 'CL_456',
 'CL_457',
 'CL_458',
 'CL_459',
 'CL_46',
 'CL_460',
 'CL_461',
 'CL_462',
 'CL_463',
 'CL_464',
 'CL_465',
 'CL_466',
 'CL_467',
 'CL_468',
 'CL_469',
 'CL_47',
 'CL_470',
 'CL_471',
 'CL_472',
 'CL_473',
 'CL_48',
 'CL_49',
 'CL_5',
 'CL_50',
 'CL_51',
 'CL_52',
 'CL_53',
 'CL_54',
 'CL_55',
 'CL_56',
 'CL_57',
 'CL_58',
 'CL_59',
 'CL_6',
 'CL_60',
 'CL_61',
 'CL_62',
 'CL_63',
 'CL_64',
 'CL_65',
 'CL_66',
 'CL_67',
 'CL_68',
 'CL_69',
 'CL_7',
 'CL_70',
 'CL_71',
 'CL_72',
 'CL_73',
 'CL_74',
 'CL_75',
 'CL_76',
 'CL_77',
 'CL_78',
 'CL_79',
 'CL_8',
 'CL_80',
 'CL_81',
 'CL_82',
 'CL_83',
 'CL_84',
 'CL_85',
 'CL_86',
 'CL_87',
 'CL_88',
 'CL_89',
 'CL_9',
 'CL_90',
 'CL_91',
 'CL_92',
 'CL_93',
 'CL_94',
 'CL_95',
 'CL_96',
 'CL_97',
 'CL_98',
 'CL_99',
 'DMF_1',
 'OTHER_1',
 'OTHER_2',
 'OTHER_3',
 'OTHER_4',
 'OTHER_5'
]]

In [None]:
# Expand the abbreviated 'dmf' column label to the full warhead name.
subset_df = subset_df.rename({'dmf': 'dimethylfumarate'}, axis='columns')

In [None]:
# Tally rows per 'ligandable' label, most frequent first; missing values are not counted.
subset_df['ligandable'].value_counts(sort=True, dropna=True)

In [None]:
# Export the ligandability table to the current working directory (row index omitted).
subset_df.to_csv(path_or_buf='cysteineomedb_ligandable_dataset.csv', index=False)

# Create Warheads and Compounds

In [None]:
def merge_dict(dict1, dict2):
    """Return a new dict holding the entries of both mappings.

    Neither input is modified. When a key appears in both, the value from
    ``dict2`` is kept; key order is ``dict1``'s keys followed by any keys
    only present in ``dict2``.
    """
    return {**dict1, **dict2}

In [None]:
# Start the combined compound-name mapping from the 'other' and acrylamide
# mappings; on duplicate keys the acrylamide entry wins.
merged_dict = merge_dict(
    dict1=updated_other_cols_dict,
    dict2=updated_acryl_cols_dict,
)

In [None]:
# Fold the bromoacetamide mapping into the running result; on duplicate keys
# the bromoacetamide entry wins.
merged_dict = merge_dict(
    dict1=merged_dict,
    dict2=updated_bromo_cols_dict,
)

In [None]:
# Fold the chloroacetamide mapping into the running result; on duplicate keys
# the chloroacetamide entry wins.
merged_dict = merge_dict(
    dict1=merged_dict,
    dict2=updated_chloro_cols_dict,
)

In [None]:
# Fold the dimethylfumarate mapping into the running result; on duplicate keys
# the dimethylfumarate entry wins.
merged_dict = merge_dict(
    dict1=merged_dict,
    dict2=updated_dmf_cols_dict,
)

In [None]:
# Two-column lookup table: each key of merged_dict paired with its value,
# one row per entry, in the dict's insertion order.
new_cols_df = pd.DataFrame(
    {
        'publication_compound': list(merged_dict.keys()),
        'cysdb_compound': list(merged_dict.values()),
    }
)

In [None]:
# Export the compound-name lookup table to the current working directory (row index omitted).
new_cols_df.to_csv(path_or_buf='cysteineomedb_compound_keys.csv', index=False)

In [None]:
# One frame per warhead class: the rows of updated_col_df whose flag column
# for that class equals 'yes'. A row can land in more than one frame.
other_df = updated_col_df.loc[updated_col_df['other'].eq('yes')]
acryl_df = updated_col_df.loc[updated_col_df['acrylamide'].eq('yes')]
chloro_df = updated_col_df.loc[updated_col_df['chloroacetamide'].eq('yes')]
bromo_df = updated_col_df.loc[updated_col_df['bromoacetamide'].eq('yes')]
dmf_df = updated_col_df.loc[updated_col_df['dmf'].eq('yes')]

In [None]:
# Keep the cysteine identifier plus the compound columns listed in
# updated_other_cols_dict, and tag every row with its warhead class.
# assign() returns a new frame, so no column is set on a slice of other_df
# (the original two-step form triggers pandas' SettingWithCopyWarning).
subset_other_df = other_df[
    ['cysteineid'] + list(updated_other_cols_dict.values())
].assign(warhead='other')

In [None]:
# Keep the cysteine identifier plus the compound columns listed in
# updated_acryl_cols_dict, and tag every row with its warhead class.
# assign() returns a new frame, so no column is set on a slice of acryl_df
# (the original two-step form triggers pandas' SettingWithCopyWarning).
subset_acryl_df = acryl_df[
    ['cysteineid'] + list(updated_acryl_cols_dict.values())
].assign(warhead='acrylamide')

In [None]:
# Keep the cysteine identifier plus the compound columns listed in
# updated_chloro_cols_dict, and tag every row with its warhead class.
# assign() returns a new frame, so no column is set on a slice of chloro_df
# (the original two-step form triggers pandas' SettingWithCopyWarning).
subset_chloro_df = chloro_df[
    ['cysteineid'] + list(updated_chloro_cols_dict.values())
].assign(warhead='chloroacetamide')

In [None]:
# Keep the cysteine identifier plus the compound columns listed in
# updated_bromo_cols_dict, and tag every row with its warhead class.
# assign() returns a new frame, so no column is set on a slice of bromo_df
# (the original two-step form triggers pandas' SettingWithCopyWarning).
subset_bromo_df = bromo_df[
    ['cysteineid'] + list(updated_bromo_cols_dict.values())
].assign(warhead='bromoacetamide')

In [None]:
# Keep the cysteine identifier plus the compound columns listed in
# updated_dmf_cols_dict, and tag every row with its warhead class.
# assign() returns a new frame, so no column is set on a slice of dmf_df
# (the original two-step form triggers pandas' SettingWithCopyWarning).
subset_dmf_df = dmf_df[
    ['cysteineid'] + list(updated_dmf_cols_dict.values())
].assign(warhead='dimethylfumarate')

In [None]:
# Stack the per-warhead tables row-wise. Columns that a given frame lacks are
# filled with NaN by concat's default outer join; original row labels are kept.
warhead_category_df = pd.concat(
    objs=[
        subset_other_df,
        subset_acryl_df,
        subset_chloro_df,
        subset_bromo_df,
        subset_dmf_df,
    ]
)

In [None]:
# Remove rows that are exact duplicates across every column, keeping the first occurrence.
warhead_category_df = warhead_category_df.drop_duplicates(keep='first')

In [None]:
# Select compound columns by position: positions 1-5 and 8 onward are kept;
# position 0 and positions 6-7 are left out.
# NOTE(review): position 0 is presumably 'cysteineid' and 6-7 presumably include
# 'warhead' -- this depends on the column order produced by the concat; confirm
# the skipped positions are still the intended ones if the inputs change.
warhead_cols = [
    label
    for position, label in enumerate(warhead_category_df.columns)
    if 1 <= position < 6 or position >= 8
]

In [None]:
# Reorder so the identifier and warhead label lead, followed by the selected compound columns.
warheads = warhead_category_df[['cysteineid', 'warhead', *warhead_cols]]

In [None]:
# Export the per-cysteine warhead table to the current working directory (row index omitted).
warheads.to_csv(path_or_buf='cysteineomedb_warheadid_dataset.csv', index=False)

In [None]:
# Start from an empty frame; the 'warhead' and 'warhead_count' columns are
# filled in by the following cells.
warhead_count_df = pd.DataFrame()

In [None]:
# Distinct warhead labels, in the order value_counts() returns them
# (most frequent first by default).
warhead_count_df['warhead'] = warheads['warhead'].value_counts().index

In [None]:
# Row counts per warhead label, as a plain list so they are assigned by
# position; value_counts() yields the same order as the label column.
warhead_count_df['warhead_count'] = warheads['warhead'].value_counts().tolist()

In [None]:
# Export the per-warhead row counts to the current working directory (row index omitted).
warhead_count_df.to_csv(path_or_buf='cysteineomedb_warhead_counts.csv', index=False)