Find which cysteines are near active sites using primary sequences and UniProt annotations

In [None]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import matplotlib
import math
from matplotlib.pyplot import figure
import seaborn as sns
import string
from matplotlib import pyplot as plt
import plotly.io as pio
from matplotlib_venn import venn3
from matplotlib_venn import venn2, venn2_circles
from matplotlib import cm
from matplotlib.colors import ListedColormap
from venn import venn
import plotly.express as px

In [None]:
# Date stamp used as the prefix of the output CSV file name.
date = '220920'

In [None]:
# Show the current working directory; the input files below are read by
# relative path.
cd = os.getcwd()
cd

# Read UniProt Active Site Annotations

In [None]:
# Load the UniProt active-site table. Columns used further down:
# 'Entry', 'as_identifier' and 'as_id'.
as_df = pd.read_csv('220908_uniprot_as_identifiers.csv')

In [None]:
# Residue letter = first character of 'as_id'
# (assumes IDs look like 'C123' -- TODO confirm the format).
as_df['AA'] = as_df['as_id'].map(lambda x: str(x)[0])

In [None]:
# Position = everything after the first character of 'as_id'. It is kept as a
# string here and converted with int() inside get_range.
as_df['Site'] = as_df['as_id'].map(lambda x: str(x)[1:])

In [None]:
# Keep only the annotations whose residue letter is 'C'.
c_as_df = as_df.loc[as_df['AA'].eq('C')]

In [None]:
# Narrow the cysteine annotations down to the identifier columns.
c_as_df = c_as_df.loc[:, ['Entry', 'as_identifier', 'as_id', 'AA', 'Site']]

In [None]:
# Distinct identifiers of the 'C' active-site annotations; they are matched
# against the CysDB 'cysteineid' values below.
c_as_ids = list(c_as_df['as_identifier'].unique())

# Read CysDB IDs

In [None]:
# Load the CysDB table from sheet 'Fig1D' of Table_S1.xlsx. Columns used
# further down: 'proteinid', 'cysteineid', 'ligandable', 'hyperreactive'.
c_df = pd.read_excel('Table_S1.xlsx', sheet_name = 'Fig1D')

In [None]:
# is_as = 1 when the cysteine ID is itself one of the annotated 'C' active
# sites, otherwise 0.
c_df['is_as'] = np.where(c_df['cysteineid'].isin(c_as_ids), 1, 0)

In [None]:
# Rows that are not themselves annotated active sites, and their distinct IDs.
non_as_c_df = c_df.loc[c_df['is_as'].eq(0)]
non_as_c_ids = list(non_as_c_df['cysteineid'].unique())

In [None]:
# check_as = 1 for rows whose ID is in the non-active-site set; these are the
# rows that get screened for proximity to an active site below.
c_df['check_as'] = np.where(c_df['cysteineid'].isin(non_as_c_ids), 1, 0)

In [None]:
# Take an independent copy of the rows to screen: several columns are assigned
# on this subset below, and assigning to a filtered slice of c_df triggers
# pandas' SettingWithCopyWarning.
cysdb_w_u_as = c_df[c_df['check_as'] == 1].copy()

In [None]:
# Remove every space from the cysteine IDs of the subset.
# NOTE(review): c_df['cysteineid'] is not normalized the same way, yet the
# later merge joins on 'cysteineid'; IDs that contained spaces would not
# match -- confirm that none do.
cysdb_w_u_as['cysteineid'] = cysdb_w_u_as['cysteineid'].map(lambda x: str(x).replace(' ', ''))

In [None]:
# Residue number = the text after the last '_C' in the ID, stripped of
# whitespace (presumably IDs look like '<protein>_C<number>' -- TODO confirm).
# It stays a string here; get_range converts it with int().
cysdb_w_u_as['resid'] = cysdb_w_u_as['cysteineid'].map(lambda x: str(x).split('_C')[-1].strip())

In [None]:
# find which cysteines are within +/-10 amino acids from an annotated UniProt active site

def get_range(df, u_df, window=10):
    """Find, for each row of ``df``, the annotated sites close in sequence.

    For every row of ``df`` the annotations in ``u_df`` whose ``Entry`` equals
    the row's ``proteinid`` are looked up. An annotation counts as a neighbor
    when its ``Site`` position differs from the row's ``resid`` by at most
    ``window`` residues (both limits inclusive).

    Args:
        df: DataFrame with ``proteinid`` and ``resid`` columns; every
            ``resid`` must be convertible with ``int()``.
        u_df: DataFrame with ``Entry``, ``Site`` and ``as_id`` columns;
            ``Site`` must be convertible with ``int()`` for every protein
            that also occurs in ``df``.
        window: Largest allowed distance in residues. Defaults to 10.

    Returns:
        A tuple ``(near, all_sites)`` of two lists holding one entry per row
        of ``df``, in row order. ``near[i]`` lists the ``as_id`` values of the
        annotations within the window; ``all_sites[i]`` lists every ``Site``
        value annotated for the row's protein. Both are empty lists for
        proteins without annotations.

    Raises:
        ValueError: If a ``resid`` or a looked-up ``Site`` is not an integer
            literal.
    """
    # Index the annotations by protein once instead of re-filtering u_df for
    # every row. sort=False keeps the original row order inside each group.
    sites_by_entry = {
        entry: (list(group['Site']), list(group['as_id']))
        for entry, group in u_df.groupby('Entry', sort=False)
    }

    near = []
    all_sites = []

    for protein, resid in zip(df['proteinid'], df['resid']):
        position = int(resid)
        sites, identifiers = sites_by_entry.get(protein, ([], []))

        # Hand out a fresh list per row so that rows never share one object.
        all_sites.append(list(sites))
        near.append([
            identifier
            for site, identifier in zip(sites, identifiers)
            if abs(position - int(site)) <= window
        ])

    return near, all_sites

In [None]:
# near_as_neighbors: per-row list of 'as_id' values within +/-10 residues.
# as_neighbor_list: per-row list of every 'Site' value of the row's protein.
# Note that the full annotation table as_df (all residue letters) is passed
# here, not only the cysteine subset c_as_df.
near_as_neighbors, as_neighbor_list = get_range(cysdb_w_u_as, as_df)

In [None]:
# Per-row list of nearby active-site IDs; it is joined into a ';'-separated
# string a few cells further down.
cysdb_w_u_as['near_as_neighbors'] = near_as_neighbors

In [None]:
# Despite the column name, this holds every annotated site position of the
# row's protein (second return value of get_range), not only the nearby ones.
cysdb_w_u_as['near_as_neighbors_list'] = as_neighbor_list

In [None]:
# Number of nearby active sites per row (length of each per-row list).
cysdb_w_u_as['near_as_ps'] = cysdb_w_u_as['near_as_neighbors'].str.len()

In [None]:
# 1 when at least one active site lies within the window, otherwise 0.
cysdb_w_u_as['near_as_1D'] = (cysdb_w_u_as['near_as_ps'] >= 1).astype(int)

In [None]:
# Flatten each per-row list of nearby site IDs into one ';'-separated string
# (an empty list becomes an empty string).
cysdb_w_u_as['near_as_neighbors'] = [
    ';'.join(str(site_id) for site_id in site_ids)
    for site_ids in cysdb_w_u_as['near_as_neighbors']
]

In [None]:
# Rows with at least one active site inside the window.
c_near_as_df = cysdb_w_u_as.loc[cysdb_w_u_as['near_as_1D'].eq(1)]

In [None]:
# Distinct IDs of the cysteines that lie near an active site.
# NOTE(review): not referenced again in the visible part of the file.
c_near_as = list(c_near_as_df['cysteineid'].unique())

In [None]:
# Attach the proximity columns to the full CysDB table. The right-hand side is
# derived from c_df itself, so a 'cysteineid' that occurs several times in c_df
# occurs several times on both sides and a plain left merge would multiply
# those rows; keep a single row per key on the right to preserve c_df's row
# count.
# NOTE(review): the right-hand keys had spaces stripped earlier while
# c_df['cysteineid'] did not; IDs containing spaces will not match -- confirm
# that none do.
merged_c_df = pd.merge(
    c_df,
    c_near_as_df[['cysteineid', 'near_as_1D', 'near_as_neighbors']].drop_duplicates(subset = 'cysteineid'),
    on = 'cysteineid',
    how = 'left',
)

In [None]:
# yes if cysteine is an annotated active site or proximal to an active site

def get_as_label(df):
    """Label each row 'yes' when it is, or lies near, an active site.

    Args:
        df: DataFrame with ``is_as`` and ``near_as_1D`` columns. Missing
            values in either column are treated as 0.

    Returns:
        A list with one entry per row of ``df``, in row order: ``'yes'`` when
        ``int(is_as) + int(near_as_1D) >= 1`` and ``None`` otherwise.
    """
    # Fill only the two flag columns that are read. Filling the whole frame is
    # unnecessary work and fails on columns that cannot hold 0 (for example a
    # categorical column with missing values).
    is_as_flags = df['is_as'].fillna(0)
    near_as_flags = df['near_as_1D'].fillna(0)

    return [
        'yes' if int(is_as) + int(near_as) >= 1 else None
        for is_as, near_as in zip(is_as_flags, near_as_flags)
    ]

In [None]:
# One label per row of merged_c_df: 'yes' (active site or near one) or None.
as_labels = get_as_label(merged_c_df)

In [None]:
# Store the labels as a column; rows labelled None carry no category.
merged_c_df['as_category'] = as_labels

In [None]:
# Rows flagged as ligandable.
# NOTE(review): lig_df is not used again in the visible part of the file.
lig_df = merged_c_df.loc[merged_c_df['ligandable'].eq('yes')]

In [None]:
# Rows flagged as hyperreactive.
# NOTE(review): rxt_df is not used again in the visible part of the file.
rxt_df = merged_c_df.loc[merged_c_df['hyperreactive'].eq('yes')]

In [None]:
# 'check_as' was only a helper flag for selecting rows; remove it before export.
merged_c_df = merged_c_df.drop(columns = ['check_as'])

In [None]:
# Write the annotated table to '<date>_cysdb_active_sites.csv' in the working
# directory, without the index column.
merged_c_df.to_csv(date + '_cysdb_active_sites.csv', index = False)