# Stratify residues into active sites and binding sites based on UniProtKB annotations

In [5]:
# Author: Lisa Boatner
# Date Created: 221128
# Date Modified: 221206
# Updates: 

# Import Modules 

In [6]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import string
from matplotlib import pyplot as plt

In [12]:
# assuming current directory is main folder
# Only enter 'data' when not already there, so re-running this cell
# does not try to change into data/data and fail.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')
cd = os.getcwd()
cd

'C:\\Users\\Onee-sama\\Documents\\GitHub\\residue_function_annotations\\residue_site_annotations\\data'

In [13]:
# set the date for naming files
date = '2401'

# 

# Part 1: Generate the main identifier files 

# 

# UniProtKB 

## Download UniProt File with columns: Entry, Active Site, Binding Site, Disulfide Bond, Redox Potential, PDB, Sequence
## https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cgene_names%2Cgene_primary%2Cgene_synonym%2Cprotein_name%2Cft_act_site%2Cft_binding%2Cft_dna_bind%2Ccc_catalytic_activity%2Ccc_cofactor%2Cft_disulfid%2Credox_potential%2Cft_site%2Cstructure_3d%2Ccc_function%2Ckeyword%2Csequence&format=xlsx&query=%28Human%29+AND+%28model_organism%3A9606%29+AND+%28reviewed%3Atrue%29&size=500

In [14]:
u_df = pd.read_excel('uniprotkb_Human_AND_model_organism_9606_2024_06_25.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


In [15]:
u_df.shape

(20435, 18)

In [16]:
u_df.head()

Unnamed: 0,Entry,Entry Name,Gene Names,Gene Names (primary),Gene Names (synonym),Protein names,Active site,Binding site,DNA binding,Catalytic activity,Cofactor,Disulfide bond,Redox potential,Site,3D,Function [CC],Keywords,Sequence
0,A0A087X1C5,CP2D7_HUMAN,CYP2D7,CYP2D7,,Putative cytochrome P450 2D7 (EC 1.14.14.1),,"BINDING 461; /ligand=""heme""; /ligand_id=""ChEBI...",,CATALYTIC ACTIVITY: Reaction=an organic molecu...,COFACTOR: Name=heme; Xref=ChEBI:CHEBI:30413;,,,,,FUNCTION: May be responsible for the metabolis...,Cytoplasm;Glycoprotein;Heme;Iron;Membrane;Meta...,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
1,A0A0B4J2F0,PIOS1_HUMAN,PIGBOS1,PIGBOS1,,Protein PIGBOS1 (PIGB opposite strand protein 1),,,,,,,,,,FUNCTION: Plays a role in regulation of the un...,Direct protein sequencing;Membrane;Mitochondri...,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
2,A0A0B4J2F2,SIK1B_HUMAN,SIK1B,SIK1B,,Putative serine/threonine-protein kinase SIK1B...,"ACT_SITE 149; /note=""Proton acceptor""; /eviden...","BINDING 33..41; /ligand=""ATP""; /ligand_id=""ChE...",,CATALYTIC ACTIVITY: Reaction=ATP + L-seryl-[pr...,COFACTOR: Name=Mg(2+); Xref=ChEBI:CHEBI:18420;...,,,,,FUNCTION: Probable serine/threonine-protein ki...,ATP-binding;Kinase;Magnesium;Metal-binding;Nuc...,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...
3,A0A0C5B5G6,MOTSC_HUMAN,MT-RNR1,MT-RNR1,,Mitochondrial-derived peptide MOTS-c (Mitochon...,,,,,,,,,,FUNCTION: Regulates insulin sensitivity and me...,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...,MRWQEMGYIFYPRKLR
4,A0A0K2S4Q6,CD3CH_HUMAN,CD300H,CD300H,,Protein CD300H (CD300 antigen-like family memb...,,,,,,"DISULFID 43..111; /evidence=""ECO:0000255|PROSI...",,,,FUNCTION: May play an important role in innate...,Alternative splicing;Disulfide bond;Glycoprote...,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...


In [17]:
u_df.columns.to_list()

['Entry',
 'Entry Name',
 'Gene Names',
 'Gene Names (primary)',
 'Gene Names (synonym)',
 'Protein names',
 'Active site',
 'Binding site',
 'DNA binding',
 'Catalytic activity',
 'Cofactor',
 'Disulfide bond',
 'Redox potential',
 'Site',
 '3D',
 'Function [CC]',
 'Keywords',
 'Sequence']

# 

# Get Domain Range

In [43]:
# ACT_SITE 149; /note="Proton acceptor";
# TOPO_DOM 1..24; /note="Extracellular"; /evidence="ECO:0000255"; TOPO_DOM 46..101; /note="Cytoplasmic"; /evidence="ECO:0000255"; TOPO_DOM 123..138; 
def find_domains(df, col, kw, extra):
    """Parse UniProtKB feature strings into residue-range identifiers.

    Each value of ``df[col]`` is a ';'-separated feature string such as
    ``ACT_SITE 149; /note="Proton acceptor"; /evidence="..."``. Every field
    containing ``kw`` is read as a single position (``N``) or a range
    (``N..M``) and turned into ``<Entry>_<aa><start>_<aa><end>``, where the
    residue letters are looked up in the row's ``Sequence``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``col``, 'Sequence' and 'Entry'. Values of
        ``col`` must be strings (rows without an annotation are expected to
        be dropped by the caller).
    col : str
        Name of the feature column to parse.
    kw : str
        Feature key that introduces a position, e.g. 'ACT_SITE' or 'BINDING'.
    extra : str
        Qualifier prefix whose quoted value labels the feature, e.g. 'note='
        or 'ligand='.

    Returns
    -------
    tuple of (list of str, list of str)
        ``domains`` and ``names``, always of equal length; ``names[k]`` is the
        first ``extra`` value found for ``domains[k]`` or '' when there is none.

    Raises
    ------
    ValueError
        If a position is not an integer after '<' / '>' markers are removed.

    Notes
    -----
    Features whose position contains '?' or '-' are skipped together with
    their qualifiers. A start outside the sequence is written with '-' as the
    residue letter; an end outside the sequence uses the last residue letter.
    """
    domains = []
    names = []

    rows = zip(df[col].to_list(), df['Sequence'].to_list(), df['Entry'].to_list())

    for region, sequence, protein in rows:
        # True while the most recent feature was accepted. Qualifiers that
        # follow a skipped feature must not be attached to an earlier one.
        accepted = False

        for field in region.replace('/', '').split(';'):
            if kw in field:
                nums = field.replace(kw + ' ', '').strip()

                if '?' in nums or '-' in nums:
                    accepted = False
                    continue

                # Drop open-boundary markers so the number itself can be parsed
                # (previously '>' was stripped but start/end were never set).
                nums = nums.replace('>', '').replace('<', '')

                if '..' in nums:
                    bounds = nums.split('..')
                    start, end = int(bounds[0]), int(bounds[1])
                else:
                    start = end = int(nums)

                # Positions are 1-based; index == len(sequence) is already
                # out of range, so compare against the valid closed interval.
                if 1 <= start <= len(sequence):
                    start_aa = sequence[start - 1]
                else:
                    start_aa = '-'

                if 1 <= end <= len(sequence):
                    end_aa = sequence[end - 1]
                else:
                    end_aa = sequence[-1] if sequence else '-'

                identifier = protein + '_' + start_aa + str(start) + '_' + end_aa + str(end)

                domains.append(identifier)
                # Reserve the label slot now so both lists stay aligned even
                # when this feature never gets an `extra` qualifier.
                names.append('')
                accepted = True

            elif extra in field and accepted and names[-1] == '':
                note = field.replace(extra + '"', '').strip()
                if note.endswith('"'):
                    note = note[:-1]
                names[-1] = note.strip()

    return domains, names

In [44]:
def get_domains(df, domain_ty, domain_kw, domain_nm, extra):
    """Build a table of annotated site ranges for one UniProtKB feature column.

    Rows of ``df`` with a missing ``domain_ty`` value are dropped, the rest
    are parsed with ``find_domains``, and each resulting identifier is split
    on '_' into its protein accession, start label and end label.

    Parameters
    ----------
    df : pandas.DataFrame
        UniProtKB table; passed on to ``find_domains``.
    domain_ty : str
        Column holding the feature strings, e.g. 'Active site'.
    domain_kw : str
        Feature key forwarded to ``find_domains``, e.g. 'ACT_SITE'.
    domain_nm : str
        Prefix for the output columns ``<domain_nm>``, ``<domain_nm>_start``
        and ``<domain_nm>_end``.
    extra : str
        Qualifier prefix forwarded to ``find_domains``.

    Returns
    -------
    pandas.DataFrame
        Columns: identifier, <domain_nm>, proteinid, <domain_nm>_start,
        <domain_nm>_end.
    """
    annotated = df[df[domain_ty].notna()]
    identifiers, labels = find_domains(annotated, domain_ty, domain_kw, extra)

    domain_id_df = pd.DataFrame()
    domain_id_df['identifier'] = identifiers
    domain_id_df[domain_nm] = labels

    # Split each identifier once and reuse the pieces for all three columns.
    pieces = domain_id_df['identifier'].map(lambda ident: str(ident).split('_'))
    domain_id_df['proteinid'] = pieces.map(lambda parts: parts[0])
    domain_id_df[domain_nm + '_start'] = pieces.map(lambda parts: parts[1])
    domain_id_df[domain_nm + '_end'] = pieces.map(lambda parts: parts[-1])

    return domain_id_df

# 

# Active Sites

In [45]:
active_domain_df = get_domains(u_df, 'Active site', 'ACT_SITE', 'active_region', "note=")
active_domain_df.shape

(3582, 5)

In [46]:
active_domain_df.head()

Unnamed: 0,identifier,active_region,proteinid,active_region_start,active_region_end
0,A0A0B4J2F2_D149_D149,Proton acceptor,A0A0B4J2F2,D149,D149
1,A0A1B0GTW7_E306_E306,,A0A1B0GTW7,E306,E306
2,A0AVT1_C625_C625,Glycyl thioester intermediate,A0AVT1,C625,C625
3,A1KZ92_H812_H812,Proton acceptor,A1KZ92,H812,H812
4,A1L167_C88_C88,Glycyl thioester intermediate,A1L167,C88,C88


In [47]:
# Saved under its own name: the '_uniprot_active_region_identifiers.csv' file is
# written again later with the matched residues, which would overwrite this table.
active_domain_df.to_csv(date + '_uniprot_active_region_ranges.csv', index = False)

# 

# Binding Sites 

In [49]:
binding_domain_df = get_domains(u_df, 'Binding site', 'BINDING', 'binding_region', "ligand=")
binding_domain_df.shape

(28192, 5)

In [50]:
binding_domain_df.head()

Unnamed: 0,identifier,binding_region,proteinid,binding_region_start,binding_region_end
0,A0A087X1C5_C461_C461,heme,A0A087X1C5,C461,C461
1,A0A0B4J2F2_L33_V41,ATP,A0A0B4J2F2,L33,V41
2,A0A0B4J2F2_K56_K56,ATP,A0A0B4J2F2,K56,K56
3,A0A1B0GTW7_H305_H305,Zn(2+),A0A1B0GTW7,H305,H305
4,A0A1B0GTW7_H309_H309,Zn(2+),A0A1B0GTW7,H309,H309


In [51]:
# Saved under its own name: the '_uniprot_binding_region_identifiers.csv' file is
# written again later with the matched residues, which would overwrite this table.
binding_domain_df.to_csv(date + '_uniprot_binding_region_ranges.csv', index = False)

# 

# Read Residue IDs 

In [52]:
residue_df = pd.read_csv('aa_ids/2401_uniprot_cysteineids.csv')

In [53]:
residue_df.shape

(261951, 2)

In [54]:
residue_df.head()

Unnamed: 0,proteinid,residueid
0,A0A087X1C5,A0A087X1C5_C57
1,A0A087X1C5,A0A087X1C5_C159
2,A0A087X1C5,A0A087X1C5_C161
3,A0A087X1C5,A0A087X1C5_C191
4,A0A087X1C5,A0A087X1C5_C337


# 

# Which amino acids are in active or binding sites? 

In [61]:
def get_resid(df, col):
    """Return a copy of ``df`` with a 'resid' column taken from ``col``.

    Each value of ``df[col]`` is split on '_' and its second piece is kept
    (e.g. 'A0AVT1_C625' gives 'C625'). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Column of '_'-separated identifier strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added 'resid' column.
    """
    resids = [value.split('_')[1] for value in df[col]]

    new_df = df.copy()
    new_df['resid'] = resids
    return new_df

In [62]:
def in_between(df, col, start, end, site_name):
    """Flag rows whose residue number lies inside the row's site range.

    The first character of each label in ``col``, ``start`` and ``end`` is
    dropped and the remainder is read as an integer. A row is marked 'yes'
    when start <= residue <= end (inclusive) and None otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three label columns.
    col : str
        Column with the residue label (e.g. 'C461').
    start, end : str
        Columns with the range boundary labels.
    site_name : str
        Suffix of the output column 'resid_in_<site_name>'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added flag column.
    """
    def position(label):
        # Everything after the leading character is the residue number.
        return int(label[1:])

    flags = []
    for resid, lower, upper in zip(df[col], df[start], df[end]):
        current, first, last = position(resid), position(lower), position(upper)
        flags.append('yes' if first <= current <= last else None)

    new_df = df.copy()
    new_df['resid_in_' + site_name] = flags
    return new_df

In [88]:
def find_inbetween(input_df, site_df, site_name, site_type):
    """Return the residues of ``input_df`` that fall inside a site range.

    Unique (proteinid, residueid) pairs are left-merged with ``site_df`` on
    'proteinid', a 'resid' column is derived with ``get_resid``, rows without
    a site range are dropped, and ``in_between`` flags residues lying within
    ``<site_name>_start`` .. ``<site_name>_end``. Progress messages and the
    flag's value counts are printed along the way.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain 'proteinid' and 'residueid'.
    site_df : pandas.DataFrame
        Site table with 'proteinid', ``<site_name>_start`` and
        ``<site_name>_end`` columns.
    site_name : str
        Prefix of the start/end columns, e.g. 'active_region'.
    site_type : str
        Suffix of the flag column 'resid_in_<site_type>', e.g. 'ar'.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows flagged 'yes', without the start/end columns.
    """
    start_col = site_name + '_start'
    end_col = site_name + '_end'
    flag_col = 'resid_in_' + site_type

    # one row per unique protein/residue pair
    residues = input_df[['proteinid', 'residueid']].drop_duplicates()

    # attach every annotated site of the same protein to each residue
    merged = pd.merge(residues, site_df, on = 'proteinid', how = 'left')
    merged = merged[merged['proteinid'].notna()]
    print("Merged Sites and Input")

    # derive the residue label from the residue identifier
    with_resid = get_resid(merged, 'residueid')
    print("Created Resid")

    # keep only residues whose protein has a site range, then test membership
    with_range = with_resid[with_resid[start_col].notna()]
    flagged = in_between(with_range, 'resid', start_col, end_col, site_type)
    print("Found Resids in UniProt Site Regions")

    # the range columns are no longer needed once the flag is computed
    deduped = flagged.drop(columns = [start_col, end_col]).drop_duplicates()

    print(deduped[flag_col].value_counts())

    return deduped[deduped[flag_col] == 'yes']

# 

## Active Sites 

In [89]:
found_active_region_df = find_inbetween(residue_df, active_domain_df, 'active_region', 'ar')

Merged Sites and Input
Created Resid
Found Resids in UniProt Site Regions
yes    595
Name: resid_in_ar, dtype: int64


In [90]:
found_active_region_df.head()

Unnamed: 0,proteinid,residueid,identifier,active_region,resid,resid_in_ar
146,A0AVT1,A0AVT1_C625,A0AVT1_C625_C625,Glycyl thioester intermediate,C625,yes
408,A1L167,A1L167_C88,A1L167_C88_C88,Glycyl thioester intermediate,C88,yes
1551,A4D256,A4D256_C284,A4D256_C284_C284,Phosphocysteine intermediate,C284,yes
2289,A6NK58,A6NK58_C185,A6NK58_C185_C185,Acyl-thioester intermediate,C185,yes
2431,A6NNY8,A6NNY8_C87,A6NNY8_C87_C87,Nucleophile,C87,yes


In [91]:
found_active_region_df.to_csv(date + '_uniprot_active_region_identifiers.csv', index = False)

In [99]:
as_ids = list(found_active_region_df['residueid'].unique())

# 

## Binding Sites 

In [92]:
found_binding_region_df = find_inbetween(residue_df, binding_domain_df, 'binding_region', 'br')

Merged Sites and Input
Created Resid
Found Resids in UniProt Site Regions
yes    4928
Name: resid_in_br, dtype: int64


In [93]:
found_binding_region_df.head()

Unnamed: 0,proteinid,residueid,identifier,binding_region,resid,resid_in_br
7,A0A087X1C5,A0A087X1C5_C461,A0A087X1C5_C461_C461,heme,C461,yes
2214,A3KMH1,A3KMH1_C451,A3KMH1_G447_T454,ATP,C451,yes
2832,A5YM72,A5YM72_C571,A5YM72_V542_D611,ATP,C571,yes
4268,A6PVC2,A6PVC2_C449,A6PVC2_C449_N450,ATP,C449,yes
4585,A7MCY6,A7MCY6_C586,A7MCY6_C586_C586,Zn(2+),C586,yes


In [94]:
found_binding_region_df.to_csv(date + '_uniprot_binding_region_identifiers.csv', index = False)

# 

# Part 2: Identify which experimentally identified residues are active sites or binding sites

# 

# Read Active Sites 

In [122]:
found_active_region_df = pd.read_csv(date + '_uniprot_active_region_identifiers.csv')

In [123]:
as_ids = list(found_active_region_df['residueid'].unique())

# 

# Read Binding Sites 

In [124]:
found_binding_region_df = pd.read_csv(date + '_uniprot_binding_region_identifiers.csv')

In [125]:
bs_ids = list(found_binding_region_df['residueid'].unique())

# 

# Read Experimental File 

In [105]:
df = pd.read_csv('compiled_identifiers.csv')

In [106]:
df.shape

(2961, 33)

In [107]:
df.head()

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-1_avg_of_medians,Experiment-2_avg_of_medians,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32
0,Q86X55_C26,Q86X55,Histone-arginine methyltransferase CARM1,CARM1,AAAAAAVGPGAGGAGSAVPGGAGPC*ATVSVFPGAR,1,3,25_C[527.3213];25_C[521.3074],2,C,...,1.291182,1.2952,,,1.2911815216979126;1.295200001608753,1.291182,,1.2952,,
1,Q99615_C7,Q99615,DnaJ homolog subfamily C member 7,DNAJC7,AAAAEC*DVVMAATEPELLDDQEAKR,1,3,6_C[521.3074];6_C[527.3213],2,C,...,1.347838,4.039893,,,1.3478380214036192;4.039893401046718,1.347838,,4.039893,,
2,Q5SRE5_C9,Q5SRE5,Nucleoporin NUP188 homolog,NUP188,AAAAGGPC*VR,1,3,8_C[527.3213];8_C[521.3074],2,C,...,1.020498,0.810254,,,1.0204981401893118;0.8102536010867071,1.020498,,0.810254,,
3,Q8IYU8_C8,Q8IYU8,Calcium uptake protein 2; mitochondrial,MICU2,AAAAGSC*AR,1,2,7_C[527.3213];7_C[521.3074],2,C,...,1.050002,,,,1.0500022223666556,1.050002,,,,
4,Q9NRL3_C17,Q9NRL3,Striatin-4,STRN4,AAAAVAAAASSC*RPLGSGAGPGPTGAAPVSAPAPGPGPAGK,1,2,12_C[527.3213];12_C[521.3074],2,C,...,1.432112,,,,1.432112392544983,1.432112,,,,


In [108]:
df.columns.to_list()

['identifier',
 'protein',
 'description',
 'gene',
 'peptides',
 'peptide_count',
 'spectral_count',
 'modifications',
 'modification_count',
 'amino_acids',
 'amino_acid_count',
 'modification_masses',
 'no_of_experiments_count',
 'no_of_replicates_count',
 'Experiment-1_experiment_count',
 'Experiment-2_experiment_count',
 'Experiment-1_Exp1_Replicate-2_replicate_count',
 'Experiment-1_Exp1_Replicate-1_replicate_count',
 'Experiment-2_Exp2-Replicate-1_replicate_count',
 'Experiment-2_Exp2-Replicate-2_replicate_count',
 'lh_correlations',
 'aggregate_avg_of_avg_of_medians',
 'aggregate_stdev_of_avg_of_medians',
 'Experiment-1_avg_of_medians',
 'Experiment-2_avg_of_medians',
 'Experiment-1_stdev_of_medians',
 'Experiment-2_stdev_of_medians',
 'total_quant_list',
 'Experiment-1_Exp1_Replicate-2_median',
 'Experiment-1_Exp1_Replicate-1_median',
 'Experiment-2_Exp2-Replicate-1_median',
 'Experiment-2_Exp2-Replicate-2_median',
 'Unnamed: 32']

# 

# Which experimentally labeled residues are annotated as active or binding sites? 

In [110]:
df['active_site'] = np.where(df['identifier'].isin(as_ids), 'yes', None)
df['active_site'].value_counts()

yes    42
Name: active_site, dtype: int64

In [113]:
df[df['active_site'] == 'yes']

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32,active_site,binding_site
96,Q96PU5_C942,Q96PU5,E3 ubiquitin-protein ligase NEDD4-like,NEDD4L,AHTC*FNRLDLPPYETFEDLREK,1,2,4_C[527.3213];4_C[521.3074],2,C,...,,,0.3801311344509913,0.380131,,,,,yes,
224,O75891_C707,O75891,Cytosolic 10-formyltetrahydrofolate dehydrogenase,ALDH1L1,AVQMGMSSVFFNKGENC*IAAGR,1,2,17_C[527.3213];17_C[521.3074],2,C,...,,,2.354254494379964,2.354254,,,,,yes,
302,Q02252_C317,Q02252,Methylmalonate-semialdehyde dehydrogenase [acy...,ALDH6A1,C*MALSTAVLVGEAK;C*MALSTAVLVGEAKK,2,7,1_C[521.3074];1_C[527.3213],2,C,...,,,-0.7682774642068448;-0.20074387614475114;-0.36...,-0.501052,,1.701043,,,yes,
374,Q9BRA2_C43,Q9BRA2,Thioredoxin domain-containing protein 17,TXNDC17,DAGGKSWC*PDCVQAEPVVR;SWC*PDCVQAEPVVR;TIFAYFTGS...,3,6,8_C[527.3213];8_C[521.3074];3_C[527.3213];3_C[...,6,C,...,,,-0.33974181641759416;-1.4766161990587863;-0.23...,-0.285832,,2.680632,,,yes,
456,P55084_C458,P55084,Trifunctional enzyme subunit beta; mitochondrial,HADHB,EGGQYGLVAAC*AAGGQGHAMIVEAYPK,1,4,11_C[527.3213];11_C[521.3074],2,C,...,,,1.2723032963242593;1.2063911562092395;0.824777...,1.239347,,0.824777,,,yes,
470,Q15040_C36,Q15040,Josephin-1,JOSD1,ELC*ALHALNNVFQDSNAFTR,1,3,3_C[527.3213];3_C[521.3074],2,C,...,,,-0.3994997891326636;0.4630236189999799,-0.3995,,0.463024,,,yes,
515,Q15084_C55,Q15084,Protein disulfide-isomerase A6,PDIA6,EVIQSDSLWLVEFYAPWC*GHCQR,1,3,18_C[527.3213];18_C[521.3074],2,C,...,,,0.1776033926583784;-0.10707805421887205,0.177603,,-0.107078,,,yes,
546,P30838_C244,P30838,Aldehyde dehydrogenase; dimeric NADP-preferring,ALDH3A1,FMNSGQTC*VAPDYILCDPSIQNQIVEK,1,3,8_C[527.3213];8_C[521.3074],2,C,...,,,0.4797625049045535;1.0178426494202601,0.748803,,,,,yes,
557,P51649_C340,P51649,Succinate-semialdehyde dehydrogenase; mitochon...,ALDH5A1,FRNTGQTC*VCSNQFLVQR;NTGQTC*VCSNQFLVQR,2,5,8_C[527.3213];8_C[521.3074];6_C[521.3074];6_C[...,4,C,...,,,-1.2525741505075974;-0.039838579123378376;-0.2...,-0.646206,,-0.265599,,,yes,
569,P49903_C31,P49903,Selenide; water dikinase 1,SEPHS1,FTELKGTGC*KVPQDVLQK,1,3,9_C[527.3213];9_C[521.3074],2,C,...,,,2.335872941417477;1.3930973907242317,2.335873,,1.393097,,,yes,


In [111]:
df['binding_site'] = np.where(df['identifier'].isin(bs_ids), 'yes', None)
df['binding_site'].value_counts()

yes    55
Name: binding_site, dtype: int64

In [114]:
df[df['binding_site'] == 'yes']

Unnamed: 0,identifier,protein,description,gene,peptides,peptide_count,spectral_count,modifications,modification_count,amino_acids,...,Experiment-1_stdev_of_medians,Experiment-2_stdev_of_medians,total_quant_list,Experiment-1_Exp1_Replicate-2_median,Experiment-1_Exp1_Replicate-1_median,Experiment-2_Exp2-Replicate-1_median,Experiment-2_Exp2-Replicate-2_median,Unnamed: 32,active_site,binding_site
31,P00390_C102,P00390,Glutathione reductase; mitochondrial,GSR,AAVVESHKLGGTC*VNVGCVPK;LGGTC*VNVGCVPK,2,3,13_C[527.3213];13_C[521.3074];5_C[521.3074];5_...,4,C,...,,,1.8504976206068715;0.9692316589943264,1.409865,,,,,,yes
383,Q13263_C232,Q13263,Transcription intermediary factor 1-beta,TRIM28,DC*QLNAHKDHQYQFLEDAVR,1,3,2_C[521.3074];2_C[527.3213],2,C,...,,,1.1892424333144254;1.0089252210150188,1.189242,,1.008925,,,,yes
408,Q99798_C451,Q99798,Aconitate hydratase; mitochondrial,ACO2,DLGGIVLANACGPC*IGQWDRK,1,3,14_C[527.3213];14_C[521.3074],2,C,...,,,0.6344025051958484;0.8752029723349202,0.634403,,0.875203,,,,yes
508,Q15418_C432,Q15418,Ribosomal protein S6 kinase alpha-1,RPS6KA1,ETIGVGSYSEC*KR,1,2,11_C[527.3213];11_C[521.3074],2,C,...,,,-0.4807911297016589,-0.480791,,,,,,yes
527,Q96FX7_C209,Q96FX7,tRNA (adenine(58)-N(1))-methyltransferase cata...,TRMT61A,FCSFSPC*IEQVQR,1,3,7_C[527.3213];7_C[521.3074],2,C,...,,,-1.335103387093659;0.0910679454993483,-1.335103,,0.091068,,,,yes
529,Q9NYL2_C22,Q9NYL2,Mitogen-activated protein kinase kinase kinase 20,MAP3K20,FDDLQFFENC*GGGSFGSVYR,1,2,10_C[521.3074];10_C[527.3213],2,C,...,,,2.070844698835487,2.070845,,,,,,yes
586,P53384_C22,P53384,Cytosolic Fe-S cluster assembly factor NUBP1,NUBP1,GASC*QGCPNQR,1,2,4_C[527.3213];4_C[521.3074],2,C,...,,,1.2968750563513096,1.296875,,,,,,yes
682,Q86SX6_C67,Q86SX6,Glutaredoxin-related protein 5; mitochondrial,GLRX5,GTPEQPQC*GFSNAVVQILR,1,3,8_C[527.3213];8_C[521.3074],2,C,...,,,1.2468532176794604;2.7535263661110387,1.246853,,2.753526,,,,yes
689,P55072_C522,P55072,Transitional endoplasmic reticulum ATPase,VCP,GVLFYGPPGC*GK;FGMTPSKGVLFYGPPGC*GK,2,4,10_C[521.3074];10_C[527.3213];17_C[527.3213];1...,4,C,...,,,1.3342914282304348;3.5402285681213286;1.610803...,1.334291,,2.575516,,,,yes
690,Q8NB90_C672,Q8NB90,Spermatogenesis-associated protein 5,SPATA5,GVLLYGPPGC*SK,1,3,10_C[521.3074];10_C[527.3213],2,C,...,,,-0.23940287815676167;1.3662983324330211,-0.239403,,1.366298,,,,yes


In [121]:
df.to_csv(date + '_compiled_identifiers_active_binding_sites.csv', index = False)