Reformat published datasets for CysteineomeDB.
Categories: Dataset Found, Reactive, Ligandable, Identified but not Ligandable, and Conditional.
Note: Annotations from the authors were used to determine "ligandability."
Note: The Yang DIA paper did not provide cysteine residue numbers. Peptide sequences were manually mapped to the UniProt FASTA (2201).

# Setup Environment

In [None]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import matplotlib
import numpy as np
import math
from matplotlib.pyplot import figure
import Bio
from Bio import SeqIO
from functools import reduce
import seaborn as sns
from statistics import mean

In [None]:
# Remember the starting working directory so later cells can os.chdir() back to it.
cd = os.getcwd()
# Bare expression: shown as the cell output when run in a notebook.
cd

In [None]:
# Date tag string. NOTE(review): not referenced by any other cell visible here.
date = '220919'

In [None]:
# Make sure a 'results' folder exists under the current working directory.
path_data = os.path.join(os.getcwd(), 'results')
if not os.path.exists(path_data):
    os.makedirs(path_data)

In [None]:
def get_new_df(dfs, dataset, col1, col2, cys):
    """Stack raw tables of one dataset and reshape them into 'identified' rows.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Tables to concatenate with ``pd.concat``.
    dataset : str
        Dataset name. It selects how the protein/residue columns are parsed
        and is written to the ``dataset`` and ``identified_datasets`` columns.
    col1 : str
        Column used for the "contaminant" filter and, for most datasets, as
        the source of the protein identifier.
    col2 : str
        Column holding the residue number/label (not read for
        'backus_cravatt' and 'yan_backus', which parse an identifier column).
    cys : bool
        ``True`` returns cysteine-level rows, anything else protein-level rows.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with columns ``level``, (``cysteineid``,)
        ``proteinid``, ``dataset``, ``identified`` and ``identified_datasets``.
    """
    combined = pd.concat(dfs)

    # `== False` (instead of `~`) is intentional: a missing result from
    # str.contains compares unequal to False, so such rows are dropped too.
    keep = combined[col1].str.contains("contaminant") == False  # noqa: E712
    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of `combined`
    # (this is what triggered SettingWithCopyWarning before).
    new_df = combined[keep].copy()

    if dataset == 'kuljanin_gygi':
        # Protein id is the second '|'-separated field of col1.
        new_df['proteinid'] = new_df[col1].map(lambda x: str(x).split('|')[1])
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'weerapana_cravatt':
        new_df['proteinid'] = new_df[col1].map(lambda x: str(x))
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'backus_cravatt':
        # 'Identifier' is split on '_': first piece -> protein, last -> residue.
        new_df['proteinid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[0])
        new_df['resid'] = new_df['Identifier'].map(lambda x: str(x).split('_')[-1])
    elif dataset == 'yan_backus':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df['identifier'].map(lambda x: 'C' + str(x).split('_')[-1])
    elif dataset == 'yang_wang':
        new_df['proteinid'] = new_df[col1]
        new_df['resid'] = new_df[col2].map(lambda x: 'C' + str(x))
    else:
        # Any other dataset: the two columns are used as-is.
        new_df = new_df.rename(columns={col1: 'proteinid', col2: 'resid'})

    new_df['cysteineid'] = new_df['proteinid'] + '_' + new_df['resid'].astype(str)
    new_df['dataset'] = dataset
    new_df['identified'] = 1
    new_df['identified_datasets'] = dataset

    # Equality (not truthiness) is kept so non-bool flags behave as before.
    if cys == True:  # noqa: E712
        new_df['level'] = 'cysteine'
        columns = ['level', 'cysteineid', 'proteinid', 'dataset',
                   'identified', 'identified_datasets']
    else:
        new_df['level'] = 'protein'
        columns = ['level', 'proteinid', 'dataset',
                   'identified', 'identified_datasets']

    return new_df[columns].drop_duplicates()

In [None]:
def _cysteine_ids_for_dataset(df, dataset, col1, col2):
    """Return the unique '<proteinid>_<resid>' strings found in `df`."""
    if dataset == 'backus_cravatt':
        # 'Identifier' is split on '_': first piece -> protein, last -> residue.
        proteinid = df['Identifier'].map(lambda x: str(x).split('_')[0])
        resid = df['Identifier'].map(lambda x: str(x).split('_')[-1])
    elif dataset == 'kuljanin_gygi':
        # Protein id is the second '|'-separated field of col1.
        proteinid = df[col1].map(lambda x: str(x).split('|')[1])
        resid = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'weerapana_cravatt':
        proteinid = df[col1].map(lambda x: str(x))
        resid = df[col2].map(lambda x: 'C' + str(x))
    elif dataset == 'yang_wang':
        proteinid = df[col1]
        resid = df[col2].map(lambda x: 'C' + str(x))
    else:
        # Any other dataset: both columns are used as-is.
        proteinid = df[col1]
        resid = df[col2]

    return list((proteinid + '_' + resid.astype(str)).unique())


def get_cys_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    """Flag the cysteines of `master` that occur in `df` for one category.

    Parameters
    ----------
    master : pandas.DataFrame
        Table with a ``cysteineid`` column. The 0/1 flag column named
        `category` is written onto this frame in place (unchanged behavior).
    df : pandas.DataFrame
        Dataset table the cysteine IDs are derived from. It is only read;
        previously some dataset branches added helper columns to it while
        others did not.
    dataset : str
        Dataset name; selects how protein and residue are parsed.
    category : str
        Name of the flag column; ``category + '_datasets'`` is added as well.
    category_datasets : object
        Value stored in ``category + '_datasets'`` for every flagged row.
    col1, col2 : str
        Protein and residue column names (ignored for 'backus_cravatt',
        which parses the 'Identifier' column).

    Returns
    -------
    pandas.DataFrame
        Flagged rows first, then the unflagged rows, which have a missing
        value in ``category + '_datasets'``.
    """
    df_ids = _cysteine_ids_for_dataset(df, dataset, col1, col2)

    master[category] = np.where(master['cysteineid'].isin(df_ids), 1, 0)

    # Copy before adding a column so we never assign onto a slice of `master`.
    category_df = master[master[category] == 1].copy()
    category_df[category + '_datasets'] = [category_datasets] * category_df.shape[0]
    non_category_df = master[master[category] == 0]

    return pd.concat([category_df, non_category_df])

In [None]:
def _protein_ids_for_dataset(df, dataset, col1):
    """Return the unique protein identifiers found in `df`."""
    if dataset == 'backus_cravatt':
        # Protein id is the text before the first '_' of 'Identifier'.
        proteinid = df['Identifier'].map(lambda x: str(x).split('_')[0])
    elif dataset == 'kuljanin_gygi':
        # Protein id is the second '|'-separated field of col1.
        proteinid = df[col1].map(lambda x: str(x).split('|')[1])
    elif dataset == 'weerapana_cravatt':
        proteinid = df[col1].map(lambda x: str(x))
    else:
        # 'yang_wang' and every other dataset use col1 as-is.
        proteinid = df[col1]

    return list(proteinid.unique())


def get_pro_uniprot_identifier(master, df, dataset, category, category_datasets, col1, col2):
    """Flag the proteins of `master` that occur in `df` for one category.

    Parameters
    ----------
    master : pandas.DataFrame
        Table with a ``proteinid`` column. The 0/1 flag column named
        `category` is written onto this frame in place (unchanged behavior).
    df : pandas.DataFrame
        Dataset table the protein IDs are derived from. It is only read;
        previously some dataset branches added a helper column to it while
        others did not.
    dataset : str
        Dataset name; selects how the protein identifier is parsed.
    category : str
        Name of the flag column; ``category + '_datasets'`` is added as well.
    category_datasets : object
        Value stored in ``category + '_datasets'`` for every flagged row.
    col1 : str
        Protein column name (ignored for 'backus_cravatt').
    col2 : str
        Kept for signature parity with the cysteine-level function; the
        protein IDs do not depend on it.

    Returns
    -------
    pandas.DataFrame
        Flagged rows first, then the unflagged rows, which have a missing
        value in ``category + '_datasets'``.
    """
    df_ids = _protein_ids_for_dataset(df, dataset, col1)

    master[category] = np.where(master['proteinid'].isin(df_ids), 1, 0)

    # Copy before adding a column so we never assign onto a slice of `master`.
    category_df = master[master[category] == 1].copy()
    category_df[category + '_datasets'] = [category_datasets] * category_df.shape[0]
    non_category_df = master[master[category] == 0]

    return pd.concat([category_df, non_category_df])

In [None]:
def get_ligandability(df, compound_list, cutoff, ratio_cutoff=4):
    """Split `df` into ligandable and merely identified rows.

    A row is ligandable when at least `cutoff` of its `compound_list` columns
    hold a numeric ratio >= `ratio_cutoff`. Non-numeric cells (e.g. text
    placeholders) and booleans are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per compound. A ``ligandable`` column
        ('yes' or NaN) is added to it in place, as before.
    compound_list : sequence of str
        Names of the ratio columns to inspect.
    cutoff : int
        Minimum number of compounds that must reach `ratio_cutoff`.
    ratio_cutoff : float, optional
        Ratio threshold; defaults to 4, the previously hard-coded value.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_identified, df_ligandable)``: rows without and with the
        'yes' label, respectively.
    """
    def _is_hit(value):
        # bool is a subclass of int; the original exact-type check skipped it.
        if isinstance(value, (bool, np.bool_)):
            return False
        # isinstance (not `type(x) == float`) so numpy scalars are accepted:
        # iterrows() yields np.float64/np.int64 when every column is numeric,
        # and the old exact-type test silently skipped all of them.
        if not isinstance(value, (int, float, np.integer, np.floating)):
            return False
        return float(value) >= ratio_cutoff

    ligandable = []
    for _, row in df.iterrows():
        hits = sum(1 for compound in compound_list if _is_hit(row[compound]))
        ligandable.append('yes' if hits >= cutoff else np.nan)

    df['ligandable'] = ligandable

    unlabeled = df['ligandable'].isna()
    df_identified = df[unlabeled]
    df_ligandable = df[~unlabeled]
    return df_identified, df_ligandable

In [None]:
def list_to_string(lst, symbol):
    """Join the string form of every element of `lst` using `symbol`."""
    pieces = map(str, lst)
    return symbol.join(pieces)

In [None]:
def get_reactivity_bin(df, mean):
    """Label each row's ratio as 'High', 'Medium' or 'Low' reactivity.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the ratio column.
    mean : str
        Name of the ratio column to bin.

    Returns
    -------
    list of str
        One label per row: 'High' for ratio < 2, 'Medium' for
        2 <= ratio <= 5, 'Low' otherwise. Missing (NaN) ratios fail both
        comparisons and therefore stay 'Low', as before.
    """
    reactivity_labels = []

    for ratio in df[mean]:
        if ratio < 2:
            reactivity_labels.append('High')
        # The previous test was `ratio > 2`, which sent a ratio of exactly 2
        # to 'Low'; the Medium bin now starts where the High bin ends.
        elif ratio <= 5:
            reactivity_labels.append('Medium')
        else:
            reactivity_labels.append('Low')
    return reactivity_labels

# Create final files

## cysteineid, hyperreactive, hyperreactive_datasets, cysdb_mean, cysdb_stdev, weerapana_mean, palafox_mean, vinogradova_mean 

In [None]:
# cysteineid	identified	identified_datasets	reactive	reactive_datasets	ligandable	ligandable_datasets	cellline	cellline_datasets	condition	condition_datasets	level	proteinid

In [None]:
# Go back to the directory stored in `cd` so the relative CSV paths used by
# the following cells resolve against it.
os.chdir(cd)
# os.chdir('results')
# os.chdir('../')

In [None]:
# Load the reactivity table (relative path: read from the current working directory).
new_reactivity_df = pd.read_csv('cysteineomedb_reactive_dataset.csv')

In [None]:
# Work on a copy so the columns added below do not touch new_reactivity_df.
reactivity_id_df = new_reactivity_df.copy()

In [None]:
# NOTE(review): this copy is replaced in the "Protein Level" section, where
# pro_reactivity_id_df is rebuilt from reactivity_id_df.copy().
pro_reactivity_id_df = new_reactivity_df.copy()

## Cysteine Level

In [None]:
def get_hyperreactive_datasets(df, cutoff=2):
    """List, per row, the datasets in which the cysteine is hyperreactive.

    A dataset counts when its mean-ratio column is present (not NaN and not
    0) and below `cutoff`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``weerapana_mean``, ``palafox_mean`` and
        ``vinogradova_mean`` columns. It is not modified.
    cutoff : float, optional
        Hyperreactivity threshold; defaults to 2, the previously hard-coded
        value.

    Returns
    -------
    (list, list)
        ``(datasets, means)``: for each row a ';'-joined string of dataset
        names and a ';'-joined string of the matching means, or ``None`` in
        both lists when no dataset qualifies.
    """
    sources = (
        ('weerapana_mean', 'weerapana_cravatt'),
        ('palafox_mean', 'palafox_backus'),
        ('vinogradova_mean', 'vinogradova_cravatt'),
    )

    # Missing means become 0 so a single `!= 0` test covers "no measurement".
    df = df.replace(np.nan, 0)

    datasets = []
    means = []
    for _, row in df.iterrows():
        hits = [
            (name, row[column])
            for column, name in sources
            if row[column] != 0 and row[column] < cutoff
        ]
        if not hits:
            datasets.append(None)
            means.append(None)
            continue
        datasets.append(';'.join(name for name, _ in hits))
        # Previously the means accumulator was never filled, so this list
        # only ever held empty strings; it now mirrors get_reactive_datasets.
        means.append(';'.join(str(value) for _, value in hits))
    return datasets, means

In [None]:
# Per-row hyperreactive dataset labels. NOTE(review): hyperreactive_means is
# not used by any later cell visible here.
hyperreactive_datasets, hyperreactive_means = get_hyperreactive_datasets(reactivity_id_df)

In [None]:
# Attach the per-row label list as a new column.
reactivity_id_df['hyperreactive_datasets'] = hyperreactive_datasets

In [None]:
def get_reactive_datasets(df):
    """List, per row, every dataset that reports a mean ratio.

    NaN values are replaced by 0 on a copy of `df`; a dataset is reported
    for a row when its mean column is non-zero after that replacement.

    Returns ``(datasets, means)``: two lists with one entry per row, each a
    ';'-joined string (dataset names / stringified means), or ``None`` when
    no dataset has a value for that row.
    """
    column_to_dataset = (
        ('weerapana_mean', 'weerapana_cravatt'),
        ('palafox_mean', 'palafox_backus'),
        ('vinogradova_mean', 'vinogradova_cravatt'),
    )

    filled = df.replace(np.nan, 0)

    datasets, means = [], []
    for _, record in filled.iterrows():
        names = []
        values = []
        for column, dataset_name in column_to_dataset:
            if record[column] != 0:
                names.append(dataset_name)
                values.append(str(record[column]))

        if names:
            datasets.append(';'.join(names))
            means.append(';'.join(values))
        else:
            datasets.append(None)
            means.append(None)

    return datasets, means

In [None]:
# Per-row reactive dataset labels and their means. NOTE(review): `datasets`
# is reassigned later by the get_hyperreactive_proteins call, and `means` is
# not used by any later cell visible here.
datasets, means = get_reactive_datasets(reactivity_id_df)

In [None]:
# Attach the per-row reactive dataset labels as a new column.
reactivity_id_df['reactive_datasets'] = datasets

In [None]:
# Every row of this table is tagged as cysteine-level.
reactivity_id_df['level'] = 'cysteine'

## Protein Level

In [None]:
# Start the protein-level table from a copy of the cysteine-level table.
pro_reactivity_id_df = reactivity_id_df.copy()

In [None]:
# Keep only the protein identifier column.
pro_reactivity_id_df = pro_reactivity_id_df[['proteinid']]

In [None]:
# One row per distinct proteinid.
pro_reactivity_id_df = pro_reactivity_id_df.drop_duplicates()

In [None]:
def get_h_pro_ids(df, dataset, cutoff):
    """Collect the protein and cysteine IDs whose `dataset` value is below `cutoff`.

    Returns ``(h_pro_ids, h_cys_ids)``: lists of the distinct ``proteinid``
    and ``cysteineid`` values among the rows where ``df[dataset] < cutoff``.
    """
    below_cutoff = df[dataset] < cutoff
    selected = df[below_cutoff]

    protein_ids = list(selected['proteinid'].unique())
    cysteine_ids = list(selected['cysteineid'].unique())
    return protein_ids, cysteine_ids

In [None]:
# IDs with weerapana_mean < 2. NOTE(review): w_cys_ids is not used by any
# later cell visible here.
w_pro_ids, w_cys_ids = get_h_pro_ids(reactivity_id_df, 'weerapana_mean', 2)

In [None]:
# IDs with palafox_mean < 2. NOTE(review): p_cys_ids is not used by any
# later cell visible here.
p_pro_ids, p_cys_ids = get_h_pro_ids(reactivity_id_df, 'palafox_mean', 2)

In [None]:
# IDs with vinogradova_mean < 2. NOTE(review): v_cys_ids is not used by any
# later cell visible here.
v_pro_ids, v_cys_ids = get_h_pro_ids(reactivity_id_df, 'vinogradova_mean', 2)

In [None]:
def get_hyperreactive_proteins(df, w_pro_ids, p_pro_ids, v_pro_ids):
    """Label each protein row with the datasets that call it hyperreactive.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``proteinid`` column.
    w_pro_ids, p_pro_ids, v_pro_ids : iterable
        Protein IDs reported for 'weerapana_cravatt', 'palafox_backus' and
        'vinogradova_cravatt', respectively.

    Returns
    -------
    (list, list)
        ``(hyperreactive, datasets)``: per row, 'yes' or NaN, and the
        ';'-joined names of the matching datasets ('' when none match).
    """
    # Sets make each membership test O(1); the ID lists were previously
    # scanned once per row. The output is unchanged.
    sources = (
        ('weerapana_cravatt', set(w_pro_ids)),
        ('palafox_backus', set(p_pro_ids)),
        ('vinogradova_cravatt', set(v_pro_ids)),
    )

    datasets = []
    hyperreactive = []
    for proteinid in df['proteinid']:
        hits = [name for name, ids in sources if proteinid in ids]
        datasets.append(';'.join(hits))
        hyperreactive.append('yes' if hits else np.nan)

    return hyperreactive, datasets

In [None]:
# Protein-level labels; this reassigns `datasets`, which previously held the
# cysteine-level list returned by get_reactive_datasets.
hyper, datasets = get_hyperreactive_proteins(pro_reactivity_id_df, w_pro_ids, p_pro_ids, v_pro_ids)

In [None]:
# Attach the protein-level flag ('yes'/NaN) and the dataset names behind it.
pro_reactivity_id_df['hyperreactive'] = hyper
pro_reactivity_id_df['hyperreactive_datasets'] = datasets

In [None]:
# Every row of this table is tagged as protein-level.
pro_reactivity_id_df['level'] = 'protein'

In [None]:
# Notebook display: value_counts() omits NaN by default, so this shows only
# how many proteins are flagged 'yes'.
pro_reactivity_id_df['hyperreactive'].value_counts()

In [None]:
# In-memory snapshot of the cysteine-level table; later cells read this copy
# after reactivity_id_df has been reloaded from CSV.
c_reactivity_id_df = reactivity_id_df.copy()

In [None]:
# In-memory snapshot of the protein-level table, used by the later merge on proteinid.
p_reactivity_id_df = pro_reactivity_id_df.copy()

# Merge Cysteine and Protein

In [None]:
# Columns shared with the protein-level table, plus cysteineid.
# NOTE(review): no cell visible here creates a 'hyperreactive' column on
# reactivity_id_df; presumably it is already in the input CSV. Confirm,
# otherwise this selection raises KeyError.
cys_reactivity_id_df = reactivity_id_df[['cysteineid', 'proteinid', 'hyperreactive', 'hyperreactive_datasets', 'level']]

In [None]:
# Stack cysteine rows on top of protein rows; the protein table has no
# cysteineid column, so that field is missing for its rows.
reactive_df = pd.concat([cys_reactivity_id_df, pro_reactivity_id_df])

In [None]:
# Written to the current working directory, without the index column.
reactive_df.to_csv('cysteineomedb_id_reactive.csv', index = False)

In [None]:
# Reload the file just written; this rebinds reactivity_id_df to the combined
# table. The earlier in-memory table lives on in c_reactivity_id_df.
reactivity_id_df = pd.read_csv('cysteineomedb_id_reactive.csv')

In [None]:
# Cysteine-level rows of the reloaded table. NOTE(review): the merges below
# read c_reactivity_id_df instead; this variable is not used again in the
# cells visible here.
cys_reactivity_id_df = reactivity_id_df[reactivity_id_df['level'] == 'cysteine']

In [None]:
# Protein-level rows of the reloaded table. NOTE(review): the merges below
# read p_reactivity_id_df instead; this variable is not used again in the
# cells visible here.
pro_reactivity_id_df = reactivity_id_df[reactivity_id_df['level'] == 'protein']

# Merge with CysteineomeDB ID

In [None]:
# Relative chdir: switches into 'results' under the current working directory,
# so the relative reads and writes below resolve there. Re-running this cell
# without first returning to `cd` would look for results/results.
os.chdir('results')

In [None]:
# Load the ID table from the current (results) directory.
cysdb_id_df = pd.read_csv('cysteineomedb_id.csv')

In [None]:
# Cysteine-level rows of the ID table.
cys_cysdb_id_df = cysdb_id_df[cysdb_id_df['level'] == 'cysteine']

In [None]:
# Merge key plus the two hyperreactivity columns, taken from the in-memory
# snapshot. NOTE(review): 'hyperreactive' is presumably a column of the input
# CSV; no cell visible here creates it at cysteine level. Confirm.
subset_reactivity_id_df = c_reactivity_id_df[['cysteineid', 'hyperreactive', 'hyperreactive_datasets']]

In [None]:
# Left merge: every cysteine row of the ID table is kept; rows without a
# match get missing hyperreactivity values.
cys_id_df = pd.merge(cys_cysdb_id_df, subset_reactivity_id_df, on = 'cysteineid', how = 'left')

In [None]:
# Protein-level rows of the ID table.
pro_cysdb_id_df = cysdb_id_df[cysdb_id_df['level'] == 'protein']

In [None]:
# Merge key plus the two protein-level hyperreactivity columns.
subset_pro_reactivity_id_df = p_reactivity_id_df[['proteinid', 'hyperreactive', 'hyperreactive_datasets']]

In [None]:
# Left merge: every protein row of the ID table is kept; rows without a
# match get missing hyperreactivity values.
pro_id_df = pd.merge(pro_cysdb_id_df, subset_pro_reactivity_id_df, on = 'proteinid', how = 'left')

In [None]:
# Recombine the annotated cysteine-level and protein-level rows.
new_cysdb_id_df = pd.concat([cys_id_df, pro_id_df])

In [None]:
# Written to the current (results) directory, without the index column.
new_cysdb_id_df.to_csv('cysteineomedb_id_rxt.csv', index = False)

# Concatenated Reactivity Data

In [None]:
def get_new_concat_df(df, dataset, name, cutoff=2):
    """Build per-dataset cysteine and protein hyperreactivity tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``cysteineid``, ``proteinid``, ``level`` and the
        mean-ratio column `dataset`. It is not modified.
    dataset : str
        Name of the mean-ratio column; rows where it is missing are ignored.
    name : str
        Dataset label written to ``dataset`` and, for hyperreactive rows,
        to ``hyperreactive_datasets``.
    cutoff : float, optional
        A cysteine is hyperreactive when its ratio is below this value;
        defaults to 2, the previously hard-coded threshold.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(cys_df, pro_df)``. ``cys_df`` has columns ``cysteineid``,
        ``hyperreactive``, ``hyperreactive_datasets``, ``dataset``;
        ``pro_df`` has ``proteinid`` plus the same three columns. A protein
        is flagged when any of its cysteine-level rows is hyperreactive.
    """
    current_df = df[df[dataset].isna() == False]  # noqa: E712

    # Copies, so the column assignments below write to independent frames
    # instead of slices of `df` (previously a SettingWithCopyWarning).
    cys_df = current_df[current_df['level'] == 'cysteine'].copy()
    pro_df = current_df[['proteinid']].drop_duplicates().copy()

    is_hyperreactive = cys_df[dataset] < cutoff
    hyperreactive_pro_ids = list(cys_df.loc[is_hyperreactive, 'proteinid'].unique())

    cys_df['hyperreactive'] = np.where(is_hyperreactive, 'yes', None)
    cys_df['hyperreactive_datasets'] = np.where(is_hyperreactive, name, None)
    cys_df = cys_df[['cysteineid', 'hyperreactive', 'hyperreactive_datasets']].copy()
    cys_df['dataset'] = name

    pro_is_hyperreactive = pro_df['proteinid'].isin(hyperreactive_pro_ids)
    pro_df['hyperreactive'] = np.where(pro_is_hyperreactive, 'yes', None)
    pro_df['hyperreactive_datasets'] = np.where(pro_is_hyperreactive, name, None)
    pro_df['dataset'] = name

    return cys_df, pro_df

In [None]:
# Per-dataset cysteine and protein tables for weerapana_cravatt, built from the in-memory snapshot.
new_w_cys_df, new_w_pro_df = get_new_concat_df(c_reactivity_id_df, 'weerapana_mean', 'weerapana_cravatt')

In [None]:
# Per-dataset cysteine and protein tables for palafox_backus.
new_p_cys_df, new_p_pro_df = get_new_concat_df(c_reactivity_id_df, 'palafox_mean', 'palafox_backus')

In [None]:
# Per-dataset cysteine and protein tables for vinogradova_cravatt.
new_v_cys_df, new_v_pro_df = get_new_concat_df(c_reactivity_id_df, 'vinogradova_mean', 'vinogradova_cravatt')

In [None]:
# Stack the three cysteine-level tables; the 'dataset' column tells them apart.
c_datasetid_df = pd.concat([new_w_cys_df, new_p_cys_df, new_v_cys_df])

In [None]:
# Stack the three protein-level tables; the 'dataset' column tells them apart.
p_datasetid_df = pd.concat([new_w_pro_df, new_p_pro_df, new_v_pro_df])

# Merge with CysteineomeDB Dataset

In [None]:
# Load the per-dataset ID table from the current working directory.
cysdb_datasetid_df = pd.read_csv('cysteineomedb_datasetid.csv')

In [None]:
# Cysteine-level rows of the per-dataset ID table.
c_cysdb_datasetid_df = cysdb_datasetid_df[cysdb_datasetid_df['level'] == 'cysteine']

In [None]:
# Protein-level rows of the per-dataset ID table.
p_cysdb_datasetid_df = cysdb_datasetid_df[cysdb_datasetid_df['level'] == 'protein']

In [None]:
# Left merge on (cysteineid, dataset); left_on and right_on name the same
# columns, so this is equivalent to on=['cysteineid', 'dataset'].
new_c_datasetid_df = pd.merge(c_cysdb_datasetid_df, c_datasetid_df, left_on = ['cysteineid', 'dataset'], right_on = ['cysteineid', 'dataset'], how = 'left')

In [None]:
# Left merge on (proteinid, dataset); left_on and right_on name the same
# columns, so this is equivalent to on=['proteinid', 'dataset'].
new_p_datasetid_df = pd.merge(p_cysdb_datasetid_df, p_datasetid_df, left_on = ['proteinid', 'dataset'], right_on = ['proteinid', 'dataset'], how =  'left')

In [None]:
# Recombine the annotated cysteine-level and protein-level rows.
new_datasetid_df = pd.concat([new_c_datasetid_df, new_p_datasetid_df])

In [None]:
# Written to the current working directory, without the index column.
new_datasetid_df.to_csv('cysteineomedb_datasetid_rxt.csv', index = False)