# Stratify residues into disulfide bonds and redox active based on UniProtKB annotations

In [1]:
# Author: Lisa Boatner
# Date Created: 221128
# Date Modified: 221206
# Updates: 

# Import Modules 

In [2]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import string
from matplotlib import pyplot as plt

In [3]:
# assuming current directory is main folder
os.chdir('data')
cd = os.getcwd()
cd

'C:\\Users\\Onee-sama\\Documents\\GitHub\\residue_function_annotations\\residue_site_annotations\\data'

In [4]:
# set the date for naming files
date = '2401'

# 

# Part 1: Generate the main identifier files 

# 

# UniProtKB 

## Download UniProt File with columns: Entry, Active Site, Binding Site, Disulfide Bond, Redox Potential, PDB, Sequence
## https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cgene_names%2Cgene_primary%2Cgene_synonym%2Cprotein_name%2Cft_act_site%2Cft_binding%2Cft_dna_bind%2Ccc_catalytic_activity%2Ccc_cofactor%2Cft_disulfid%2Credox_potential%2Cft_site%2Cstructure_3d%2Ccc_function%2Ckeyword%2Csequence&format=xlsx&query=%28Human%29+AND+%28model_organism%3A9606%29+AND+%28reviewed%3Atrue%29&size=500

In [5]:
u_df = pd.read_excel('uniprotkb_Human_AND_model_organism_9606_2024_06_25.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


In [6]:
u_df.shape

(20435, 18)

In [7]:
u_df.head()

Unnamed: 0,Entry,Entry Name,Gene Names,Gene Names (primary),Gene Names (synonym),Protein names,Active site,Binding site,DNA binding,Catalytic activity,Cofactor,Disulfide bond,Redox potential,Site,3D,Function [CC],Keywords,Sequence
0,A0A087X1C5,CP2D7_HUMAN,CYP2D7,CYP2D7,,Putative cytochrome P450 2D7 (EC 1.14.14.1),,"BINDING 461; /ligand=""heme""; /ligand_id=""ChEBI...",,CATALYTIC ACTIVITY: Reaction=an organic molecu...,COFACTOR: Name=heme; Xref=ChEBI:CHEBI:30413;,,,,,FUNCTION: May be responsible for the metabolis...,Cytoplasm;Glycoprotein;Heme;Iron;Membrane;Meta...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
1,A0A0B4J2F0,PIOS1_HUMAN,PIGBOS1,PIGBOS1,,Protein PIGBOS1 (PIGB opposite strand protein 1),,,,,,,,,,FUNCTION: Plays a role in regulation of the un...,Direct protein sequencing;Membrane;Mitochondri...,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
2,A0A0B4J2F2,SIK1B_HUMAN,SIK1B,SIK1B,,Putative serine/threonine-protein kinase SIK1B...,"ACT_SITE 149; /note=""Proton acceptor""; /eviden...","BINDING 33..41; /ligand=""ATP""; /ligand_id=""ChE...",,CATALYTIC ACTIVITY: Reaction=ATP + L-seryl-[pr...,COFACTOR: Name=Mg(2+); Xref=ChEBI:CHEBI:18420;...,,,,,FUNCTION: Probable serine/threonine-protein ki...,ATP-binding;Kinase;Magnesium;Metal-binding;Nuc...,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...
3,A0A0C5B5G6,MOTSC_HUMAN,MT-RNR1,MT-RNR1,,Mitochondrial-derived peptide MOTS-c (Mitochon...,,,,,,,,,,FUNCTION: Regulates insulin sensitivity and me...,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...,MRWQEMGYIFYPRKLR
4,A0A0K2S4Q6,CD3CH_HUMAN,CD300H,CD300H,,Protein CD300H (CD300 antigen-like family memb...,,,,,,"DISULFID 43..111; /evidence=""ECO:0000255|PROSI...",,,,FUNCTION: May play an important role in innate...,Alternative splicing;Disulfide bond;Glycoprote...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...


In [8]:
u_df.columns.to_list()

['Entry',
 'Entry Name',
 'Gene Names',
 'Gene Names (primary)',
 'Gene Names (synonym)',
 'Protein names',
 'Active site',
 'Binding site',
 'DNA binding',
 'Catalytic activity',
 'Cofactor',
 'Disulfide bond',
 'Redox potential',
 'Site',
 '3D',
 'Function [CC]',
 'Keywords',
 'Sequence']

# 

# Get Domain Range

In [19]:
def separate_sites(df, col_name, col_term):
    site_vals = list(df[col_name])
    
    new_site_vals = []
    
    for i in range(len(site_vals)):
        current = site_vals[i]
        if ';' in current:
            current = current.split(';')
            
            new_sites = []
            
            for j in range(len(current)):
                if col_term in current[j]:
                    reformatted = current[j].replace(col_term, '')
                    reformatted = reformatted.strip()
                    if '..' in reformatted:
                        separate = reformatted.split('..')
                        new_sites += separate
                    elif ':' in reformatted:
                        separate = reformatted.split(':')
                        new_sites += [separate[-1]]
                    else:
                        new_sites += [reformatted]
            new_site_vals.append(new_sites)
        else:
            new_sites = []
            
            for j in range(len(current)):
                if col_term in current[j]:
                    reformatted = current[j].replace(col_term, '')
                    reformatted = reformatted.strip()
                    if '..' in reformatted:
                        separate = reformatted.split('..')
                        new_sites += separate
                    elif ':' in reformatted:
                        separate = reformatted.split(':')
                        new_sites += [separate[-1]]
                    else:
                        new_sites += [reformatted]
            new_site_vals.append(new_sites)
            
    return new_site_vals

In [20]:
def get_selected_site(df, col_name, col_term):
    sites = []
    sites_resnames = []
    for index, row in df.iterrows():
        sequence = row['Sequence']
        
        separate_sites = row[col_name]

        current_site_str = ''
        current_site_resname_str = ''
        
        for i in range(len(separate_sites)):
            site_resid = separate_sites[i].strip()
            
            if ("?" in site_resid) | ("-" in site_resid):
                print(site_resid)
                continue
            
            if (int(site_resid)-1) < len(sequence):
                    res_name = sequence[int(site_resid)-1]
                    current_site_str += (res_name + str(site_resid)) + ';'
                    current_site_resname_str += (res_name + ';')
                    
        current_site_str = current_site_str[:-1]
        current_site_resname_str = current_site_resname_str[:-1]
        
        sites.append(current_site_str)
        sites_resnames.append(current_site_resname_str)
                    
    df[col_term + ' resid'] = sites
    df[col_term + ' resname'] = sites_resnames
    
    return df

In [53]:
def get_domains(df, domain_ty, domain_kw, domain_nm, col_name, new_col_name):
    domain_df = df.copy()
    
    new_disulfide_vals = separate_sites(domain_df, domain_ty, domain_kw)
    domain_df[col_name] = new_disulfide_vals
    
    reformatted_site_df = get_selected_site(domain_df, col_name, new_col_name)
    
    reformatted_site_df['identifier'] = reformatted_site_df[new_col_name + ' resid'].str.split(';')
    reformatted_site_df = reformatted_site_df.explode('identifier')
    reformatted_site_df['identifier'] = reformatted_site_df['identifier'].str.strip()
    reformatted_site_df['identifier'] = reformatted_site_df['Entry'] + '_' + reformatted_site_df['identifier']
    
    subset_updated_site_df = reformatted_site_df[['Entry', 'identifier']].drop_duplicates()
    subset_updated_site_df = subset_updated_site_df.rename(columns = {'Entry': 'proteinid', 'identifier': 'residueid'})
    
    return subset_updated_site_df

# 

# Disulfide Bonds

In [57]:
disulfide_df = u_df[u_df['Disulfide bond'].isna() == False]

In [58]:
disulfide_domain_df = get_domains(disulfide_df, 'Disulfide bond', 'DISULFID', 'disulfide_region', 'New disulfide site', 'Disulfide site')
disulfide_domain_df.shape

?274
?
?
?
?
Q13324-2:51
?
?
Q9Y6R1-2:583
Q9Y6R1-2:630


(36358, 2)

In [59]:
disulfide_domain_df.head()

Unnamed: 0,proteinid,residueid
4,A0A0K2S4Q6,A0A0K2S4Q6_C43
4,A0A0K2S4Q6,A0A0K2S4Q6_C111
7,A0A5B9,A0A5B9_C30
7,A0A5B9,A0A5B9_C95
7,A0A5B9,A0A5B9_C130


In [60]:
disulfide_domain_df.to_csv(date + '_uniprot_disulfide_region_identifiers.csv', index = False)

# 

# Redox Active

In [61]:
redox_df = u_df.copy()
redox_df['Redox-active'] = np.where(u_df['Disulfide bond'].str.contains('Redox-active') == True, True, False)
redox_df = redox_df[redox_df['Redox-active'] == True]
redox_ids = list(redox_df['Entry'].unique())
len(redox_ids)

56

In [63]:
redox_domain_df = get_domains(redox_df, 'Disulfide bond', 'DISULFID', 'redox_region', 'New redox site', 'Redox site')
redox_domain_df.shape

(242, 2)

In [64]:
redox_domain_df.head()

Unnamed: 0,proteinid,residueid
268,O00391,O00391_C70
268,O00391,O00391_C73
268,O00391,O00391_C101
268,O00391,O00391_C110
268,O00391,O00391_C393


In [65]:
redox_domain_df.to_csv(date + '_uniprot_redox_region_identifiers.csv', index = False)

# 

# Part 2: Identify which experimentally identified residues are disulfide bonds or redox active 

# 

# Read Disulfide Bond Identifiers 

In [100]:
disulfide_domain_df = pd.read_csv(date + '_uniprot_disulfide_region_identifiers.csv')

In [86]:
disulfide_ids = list(set(disulfide_domain_df['residueid'].unique()))

# 

# Read Redox Active Identifiers 

In [99]:
redox_domain_df = pd.read_csv(date + '_uniprot_redox_region_identifiers.csv')

In [87]:
redox_ids = list(set(redox_domain_df['residueid'].unique()))

# 

# Read Experimental File 

In [90]:
df = pd.read_csv('compiled_identifiers.csv')

In [91]:
df.shape

(2961, 33)

In [92]:
df.head()

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-1_avg_of_medians,Experiment-2_avg_of_medians,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32
0,Q86X55_C26,Q86X55,Histone-arginine methyltransferase CARM1,CARM1,AAAAAAVGPGAGGAGSAVPGGAGPC*ATVSVFPGAR,1,3,25_C[527.3213];25_C[521.3074],2,C,...,1.291182,1.2952,,,1.2911815216979126;1.295200001608753,1.291182,,1.2952,,
1,Q99615_C7,Q99615,DnaJ homolog subfamily C member 7,DNAJC7,AAAAEC*DVVMAATEPELLDDQEAKR,1,3,6_C[521.3074];6_C[527.3213],2,C,...,1.347838,4.039893,,,1.3478380214036192;4.039893401046718,1.347838,,4.039893,,
2,Q5SRE5_C9,Q5SRE5,Nucleoporin NUP188 homolog,NUP188,AAAAGGPC*VR,1,3,8_C[527.3213];8_C[521.3074],2,C,...,1.020498,0.810254,,,1.0204981401893118;0.8102536010867071,1.020498,,0.810254,,
3,Q8IYU8_C8,Q8IYU8,Calcium uptake protein 2; mitochondrial,MICU2,AAAAGSC*AR,1,2,7_C[527.3213];7_C[521.3074],2,C,...,1.050002,,,,1.0500022223666556,1.050002,,,,
4,Q9NRL3_C17,Q9NRL3,Striatin-4,STRN4,AAAAVAAAASSC*RPLGSGAGPGPTGAAPVSAPAPGPGPAGK,1,2,12_C[527.3213];12_C[521.3074],2,C,...,1.432112,,,,1.432112392544983,1.432112,,,,


In [93]:
df.columns.to_list()

['identifier',
 'protein',
 'description',
 'gene',
 'peptides',
 'peptide_count',
 'spectral_count',
 'modifications',
 'modification_count',
 'amino_acids',
 'amino_acid_count',
 'modification_masses',
 'no_of_experiments_count',
 'no_of_replicates_count',
 'Experiment-1_experiment_count',
 'Experiment-2_experiment_count',
 'Experiment-1_Exp1_Replicate-2_replicate_count',
 'Experiment-1_Exp1_Replicate-1_replicate_count',
 'Experiment-2_Exp2-Replicate-1_replicate_count',
 'Experiment-2_Exp2-Replicate-2_replicate_count',
 'lh_correlations',
 'aggregate_avg_of_avg_of_medians',
 'aggregate_stdev_of_avg_of_medians',
 'Experiment-1_avg_of_medians',
 'Experiment-2_avg_of_medians',
 'Experiment-1_stdev_of_medians',
 'Experiment-2_stdev_of_medians',
 'total_quant_list',
 'Experiment-1_Exp1_Replicate-2_median',
 'Experiment-1_Exp1_Replicate-1_median',
 'Experiment-2_Exp2-Replicate-1_median',
 'Experiment-2_Exp2-Replicate-2_median',
 'Unnamed: 32']

# 

# Which experimentally labeled residues are annotated as active or binding sites? 

In [94]:
df['disulfide_bond'] = np.where(df['identifier'].isin(disulfide_ids), 'yes', None)
df['disulfide_bond'].value_counts()

yes    76
Name: disulfide_bond, dtype: int64

In [95]:
df[df['disulfide_bond'] == 'yes']

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-2_avg_of_medians,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32,disulfide_bond
31,P00390_C102,P00390,Glutathione reductase; mitochondrial,GSR,AAVVESHKLGGTC*VNVGCVPK;LGGTC*VNVGCVPK,2,3,13_C[527.3213];13_C[521.3074];5_C[521.3074];5_...,4,C,...,,,,1.8504976206068715;0.9692316589943264,1.409865,,,,,yes
68,P30048_C229,P30048,Thioredoxin-dependent peroxide reductase; mito...,PRDX3,AFQYVETHGEVC*PANWTPDSPTIKPSPAASK;AFQYVETHGEVC*...,2,4,12_C[527.3213];12_C[521.3074],2,C,...,1.111072,,,1.150348652374722;2.9262978952500163;1.1110716...,2.038323,,1.111072,,,yes
103,P07339_C329,P07339,Cathepsin D,CTSD,AIGAVPLIQGEYMIPC*EK,1,3,16_C[521.3074];16_C[527.3213],2,C,...,,,,0.4486990974800883;0.6058337296936092,0.527266,,,,,yes
120,P16422_C99,P16422,Epithelial cell adhesion molecule,EPCAM,AKPEGALQNNDGLYDPDC*DESGLFK;RAKPEGALQNNDGLYDPDC...,2,3,18_C[527.3213];18_C[521.3074];19_C[527.3213];1...,4,C,...,,,,6.181908377060021;0.7543532633645165,3.468131,,,,,yes
176,O95833_C22,O95833,Chloride intracellular channel protein 3,CLIC3,ASEDGESVGHC*PSCQR,1,2,11_C[527.3213];11_C[521.3074],2,C,...,,,,-0.5284825514649648,-0.528483,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,Q96HE7_C241,Q96HE7,ERO1-like protein alpha,ERO1A,RPLNPLASGQGTSEENTFYSWLEGLC*VEKR,1,2,26_C[527.3213];26_C[521.3074],2,C,...,0.051041,,,0.05104145218573698,,,0.051041,,,yes
2648,P51858_C108,P51858,Hepatoma-derived growth factor,HDGF,SC*VEEPEPEPEAAEGDGDKK,1,2,2_C[521.3074];2_C[527.3213],2,C,...,1.263236,,,1.2632362885104023,,,1.263236,,,yes
2716,Q5JRX3_C556,Q5JRX3,Presequence protease; mitochondrial,PITRM1,SQQSKPQDASC*LPALK,1,2,11_C[527.3213];11_C[521.3074],2,C,...,0.791430,,,0.7914304081923407,,,0.791430,,,yes
2824,P35914_C323,P35914,Hydroxymethylglutaryl-CoA lyase; mitochondrial,HMGCL,VAQATC*KL,1,2,6_C[521.3074];6_C[527.3213],2,C,...,0.729846,,,0.7298459977158773,,,0.729846,,,yes


In [96]:
df['redox_active'] = np.where(df['identifier'].isin(redox_ids), 'yes', None)
df['redox_active'].value_counts()

yes    21
Name: redox_active, dtype: int64

In [97]:
df[df['redox_active'] == 'yes']

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32,disulfide_bond,redox_active
31,P00390_C102,P00390,Glutathione reductase; mitochondrial,GSR,AAVVESHKLGGTC*VNVGCVPK;LGGTC*VNVGCVPK,2,3,13_C[527.3213];13_C[521.3074];5_C[521.3074];5_...,4,C,...,,,1.8504976206068715;0.9692316589943264,1.409865,,,,,yes,yes
304,P10599_C73,P10599,Thioredoxin,TXN,C*MPTFQFFK;C*MPTFQFFKKGQK,2,6,1_C[521.3074];1_C[527.3213],2,C,...,,,1.292654855549643;1.2731739583461454;-0.951781...,1.273174,,1.337465,,,yes,yes
374,Q9BRA2_C43,Q9BRA2,Thioredoxin domain-containing protein 17,TXNDC17,DAGGKSWC*PDCVQAEPVVR;SWC*PDCVQAEPVVR;TIFAYFTGS...,3,6,8_C[527.3213];8_C[521.3074];3_C[527.3213];3_C[...,6,C,...,,,-0.33974181641759416;-1.4766161990587863;-0.23...,-0.285832,,2.680632,,,yes,yes
428,P13667_C555,P13667,Protein disulfide-isomerase A4,PDIA4,DVLIEFYAPWC*GHCK;KDVLIEFYAPWC*GHCK;TFDSIVMDPKK...,4,12,11_C[527.3213];11_C[521.3074];12_C[527.3213];1...,6,C,...,0.134885,,-0.3975532603588039;-0.2257377478415421;-0.070...,-0.07039,0.120366,-0.025688,,,yes,yes
482,P13667_C209,P13667,Protein disulfide-isomerase A4,PDIA4,ENFDEVVNDADIILVEFYAPWCGHC*KK,1,2,25_C[527.3213];25_C[521.3074],2,C,...,,,-0.043083615982805225,-0.043084,,,,,yes,yes
515,Q15084_C55,Q15084,Protein disulfide-isomerase A6,PDIA6,EVIQSDSLWLVEFYAPWC*GHCQR,1,3,18_C[527.3213];18_C[521.3074],2,C,...,,,0.1776033926583784;-0.10707805421887205,0.177603,,-0.107078,,,yes,yes
579,P13667_C91,P13667,Protein disulfide-isomerase A4,PDIA4,FYAPWC*GHCK,1,4,6_C[521.3074];6_C[527.3213],2,C,...,,,-0.15203198873604584;-0.37871931585293506;-0.2...,-0.265376,,-0.221209,,,yes,yes
688,P30044_C100,P30044,Peroxiredoxin-5; mitochondrial,PRDX5,GVLFGVPGAFTPGC*SK;KGVLFGVPGAFTPGC*SK,2,4,14_C[527.3213];14_C[521.3074];15_C[521.3074];1...,4,C,...,,,5.399054525627556;1.3278437169377546;0.5821161...,3.363449,,0.582116,,,yes,yes
815,P30101_C57,P30101,Protein disulfide-isomerase A3,PDIA3,ISDTGSAGLMLVEFFAPWC*GHCK;ISDTGSAGLMLVEFFAPWC*G...,3,4,19_C[527.3213];19_C[521.3074];6_C[521.3074];6_...,4,C,...,,,-0.284917467781557;-1.6625631148956137;-0.0875...,-0.97374,,-0.087506,,,yes,yes
849,Q15084_C190,Q15084,Protein disulfide-isomerase A6,PDIA6,KDVIELTDDSFDKNVLDSEDVWMVEFYAPWC*GHCK,1,4,31_C[527.3213];31_C[521.3074],2,C,...,,,0.3061305460521659;0.8494855229769096;0.232773...,0.306131,,0.54113,,,yes,yes


In [98]:
df.to_csv(date + '_compiled_identifiers_disulfide_redox_sites.csv', index = False)