# Create a Cell Surface Database for Cell Surface Cysteines (CS-Cys)

## import modules

In [1]:
import os, sys
import numpy as np
import matplotlib as plt
import pandas as pd
import csv

In [2]:
# Capture the kernel's current working directory; the relative file names used
# by the read/write calls in this notebook are resolved against it.
cd = os.getcwd()
# Bare expression so the notebook displays the path as the cell output.
cd

'/Users/lisamarieboatner/Dropbox/Backus/Scripts/Jupyter'

# Read CellWhere

## Download CellWhere annotations from https://www.sys-myo.com/cellwhere/

In [91]:
# Load the CellWhere annotation export (see the download link above); the
# relative path is resolved against the notebook's working directory.
cw_df = pd.read_csv('220902_cellwhere_uniprot_go.csv')

In [93]:
# Distinct protein identifiers present in the CellWhere table, in first-seen order.
cw_protein_ids = cw_df['protein'].unique().tolist()

In [94]:
# Prefix the CellWhere columns with `cw_` and rename the protein identifier to
# 'Entry' so it lines up with the key used in the merges further down.
cw_df = cw_df.rename(
    columns = dict(location = 'cw_location', score = 'cw_score', protein = 'Entry')
)

# Read Human Protein Atlas

## Download the Human Protein Atlas table from https://www.proteinatlas.org/about/download

In [13]:
# Load the tab-separated Human Protein Atlas export (see the download link above).
hpa_df = pd.read_csv('proteinatlas.tsv', sep = '\t')

In [95]:
# Keep only the identifier columns and the main subcellular location annotation.
subset_hpa_df = hpa_df.loc[:, ['Gene', 'Uniprot', 'Subcellular main location']]

In [96]:
# Drop rows that exactly repeat an earlier row (the first occurrence is kept).
subset_hpa_df = subset_hpa_df[~subset_hpa_df.duplicated()]

In [97]:
def group_hpa(df):
    """Collapse HPA location annotations to one ';'-joined string per UniProt ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Uniprot' and 'Subcellular main location'.

    Returns
    -------
    (names, vals) : tuple of two lists of equal length
        names -- the 'Uniprot' group keys, in the order ``groupby`` yields them.
        vals  -- for each key, its distinct non-empty locations (first-seen
                 order) joined with ';'.
    """
    # Rows without a location annotation contribute nothing to the result.
    annotated = df[df['Subcellular main location'].notna()]

    names = []
    vals = []

    for uniprot_id, rows in annotated.groupby('Uniprot'):
        distinct_locations = [
            location
            for location in rows['Subcellular main location'].unique()
            if location != ''
        ]
        names.append(uniprot_id)
        vals.append(';'.join(distinct_locations))

    return names, vals

In [98]:
# pros: UniProt IDs; vals: the matching ';'-joined HPA locations (parallel lists).
pros, vals = group_hpa(subset_hpa_df)

In [99]:
# One row per UniProt ID with its combined HPA location string.
new_hpa_df = pd.DataFrame({'Entry': pros, 'hpa_location': vals})

# Read UniProt

## Download UniProt subcellular location annotations from https://www.uniprot.org/

In [7]:
# Load the annotated UniProt export (see the download link above).
# NOTE(review): the stray `exec(code_obj, ...)` output below this cell looks like
# the tail of a warning raised during this read — confirm what it was and, if it
# is a mixed-dtype warning, consider passing explicit dtypes.
u_df = pd.read_csv('2301_uniprot_annotated.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [100]:
# Restrict the UniProt table to the identifier, naming, mass, keyword and
# localization columns that are carried into the merged output.
subset_u_df = u_df.loc[:, [
    'Entry',
    'Entry Name',
    'Protein names',
    'Gene Names',
    'Gene Names (primary)',
    'Gene Names (synonym)',
    'Mass',
    'Keywords',
    'Gene Ontology (cellular component)',
    'Subcellular location [CC]',
]]

In [101]:
# Give the UniProt location column a source-prefixed name, matching the
# `hpa_location` / `cw_location` columns from the other two sources.
subset_u_df = subset_u_df.rename(
    {'Subcellular location [CC]': 'uniprot_location'}, axis = 'columns'
)

# Merge CellWhere, HPA and UniProt

In [103]:
# Left join: every UniProt row is kept; HPA locations are attached where the
# 'Entry' key matches.
merged_df = subset_u_df.merge(new_hpa_df, how = 'left', on = 'Entry')

In [104]:
# Left join the CellWhere columns onto the UniProt + HPA table by 'Entry'.
merged_df = merged_df.merge(cw_df, how = 'left', on = 'Entry')

# Add Labels

In [84]:
# Keyword stems matched (case-insensitively) for the organelle labels below:
# mitochon, endoplasmic, golgi, nucleus, cytosol, cytoplasm

In [112]:
def get_category_labels(df, kws, columns=None):
    """Flag rows whose location text mentions any of the given keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the location columns to search.
    kws : str
        One or more keywords separated by ';'. Matching is a case-insensitive
        substring test. Empty fragments (e.g. from a trailing ';') are ignored;
        previously an empty fragment matched every row.
    columns : list of str, optional
        Columns to search. When omitted, the notebook-level ``cat_cols`` list
        is used (looked up at call time), which keeps existing two-argument
        calls working unchanged.

    Returns
    -------
    list
        One entry per row of ``df``: 'yes' if any keyword occurs in any of the
        searched columns, otherwise None.
    """
    if columns is None:
        # Backward-compatible default: the notebook global defined alongside the calls.
        columns = cat_cols

    keywords = [fragment.lower() for fragment in kws.split(';') if fragment]

    # Only the searched columns need their missing values blanked out.
    searchable = df[list(columns)].fillna('')

    label = []
    for _, row in searchable.iterrows():
        # str() guards against non-text cells; any() stops at the first hit.
        matched = any(
            keyword in str(value).lower()
            for value in row
            for keyword in keywords
        )
        label.append('yes' if matched else None)

    return label
        

In [105]:
# Location columns (UniProt, HPA, CellWhere) that get_category_labels reads as a
# global when searching for category keywords.
cat_cols = ['uniprot_location', 'hpa_location', 'cw_location']

In [119]:
# Output column -> ';'-separated keyword stems searched for in the location columns.
category_keywords = {
    'membrane': 'membrane',
    'golgi': 'golgi',
    'endo': 'endoplasmic',
    'mito': 'mitochon',
    'nucleus': 'nucleus',
    'cyto': 'cytosol;cytoplasm',
}

# One pass per category instead of six copy-pasted cells; the dict's insertion
# order fixes the order in which the label columns are appended.
for column, keywords in category_keywords.items():
    labels = get_category_labels(merged_df, keywords)
    merged_df[column] = labels

# Cell Surface Labels

In [None]:
def get_category_surface_labels(df, kws, cat_cols):
    """Flag rows whose text in ``cat_cols`` contains any of the ';'-separated keywords.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scan; missing values are treated as empty strings.
    kws : str
        Keywords separated by ';', compared case-insensitively as substrings.
    cat_cols : list of str
        Names of the columns to search.

    Returns
    -------
    list
        'yes' for each row with at least one match, None for every other row.
    """
    filled = df.fillna('')
    needles = [part.lower() for part in kws.split(';')]

    label = []
    for _, record in filled.iterrows():
        # Lower-case every searched cell up front, then test all keyword/cell pairs.
        haystacks = [record[column].lower() for column in cat_cols]
        found = any(needle in text for text in haystacks for needle in needles)
        label.append('yes' if found else None)

    return label
        

In [None]:
# UniProt-based cell-surface flag: 'cell surface' or 'cell membrane' in the
# UniProt location text.
labels = get_category_surface_labels(
    df = merged_df,
    kws = 'cell surface;cell membrane',
    cat_cols = ['uniprot_location'],
)
merged_df['cellsurface_uniprot'] = labels

In [None]:
# HPA-based cell-surface flag: 'plasma membrane' in the HPA main location.
labels = get_category_surface_labels(
    df = merged_df,
    kws = 'plasma membrane',
    cat_cols = ['hpa_location'],
)
merged_df['cellsurface_hpa'] = labels

In [None]:
# CellWhere-based cell-surface flag.
# NOTE(review): the bare 'membrane' keyword is a substring match, so it flags any
# cw_location that mentions a membrane, not only plasma-membrane entries —
# confirm this breadth is intended.
labels = get_category_surface_labels(
    df = merged_df,
    kws = 'cell surface;membrane',
    cat_cols = ['cw_location'],
)
merged_df['cellsurface_cw'] = labels

In [None]:
# Consensus flag: 'yes' when at least one of the three sources marks the
# protein as cell surface, otherwise None.
merged_df['cellsurface'] = np.where(
    merged_df[['cellsurface_uniprot', 'cellsurface_hpa', 'cellsurface_cw']]
    .eq('yes')
    .any(axis = 1),
    'yes',
    None,
)

In [126]:
# Preview the first rows of the merged, labelled table.
merged_df.head()

Unnamed: 0,Entry,Entry Name,Protein names,Gene Names,Gene Names (primary),Gene Names (synonym),Mass,Keywords,Gene Ontology (cellular component),uniprot_location,hpa_location,cw_location,cw_score,membrane,golgi,endo,mito,cyto,nucleus,cellsurface
0,A0A087X1C5,CP2D7_HUMAN,Putative cytochrome P450 2D7 (EC 1.14.14.1),CYP2D7,CYP2D7,,57489,Cytoplasm;Glycoprotein;Heme;Iron;Membrane;Meta...,cytoplasm [GO:0005737]; intracellular membrane...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,,Membrane;Cytoplasm;Mitochondrion,0.5;0.25;0.25,yes,,,yes,yes,,yes
1,A0A0B4J2F0,PIOS1_HUMAN,Protein PIGBOS1 (PIGB opposite strand protein 1),PIGBOS1,PIGBOS1,,6313,Direct protein sequencing;Membrane;Mitochondri...,mitochondrial outer membrane [GO:0005741],SUBCELLULAR LOCATION: Mitochondrion outer memb...,,Mitochondrion;Membrane,0.67;0.33,yes,,,yes,,,yes
2,A0A0B4J2F2,SIK1B_HUMAN,Putative serine/threonine-protein kinase SIK1B...,SIK1B,SIK1B,,84930,ATP-binding;Kinase;Magnesium;Metal-binding;Nuc...,cytoplasm [GO:0005737],,Nucleoplasm,Cytoplasm,1.0,,,,,yes,,
3,A0A0C5B5G6,MOTSC_HUMAN,Mitochondrial-derived peptide MOTS-c (Mitochon...,MT-RNR1,MT-RNR1,,2175,DNA-binding;Mitochondrion;Nucleus;Osteogenesis...,extracellular space [GO:0005615]; mitochondrio...,SUBCELLULAR LOCATION: Secreted {ECO:0000269|Pu...,,Nucleus;Mitochondrion;Extracellular,0.33;0.33;0.33,,,,yes,,yes,yes
4,A0A0K2S4Q6,CD3CH_HUMAN,Protein CD300H (CD300 antigen-like family memb...,CD300H,CD300H,,21806,Alternative splicing;Disulfide bond;Glycoprote...,extracellular region [GO:0005576]; plasma memb...,SUBCELLULAR LOCATION: [Isoform 1]: Membrane {E...,,Membrane;Extracellular;Unknown,0.5;0.25;0.25,yes,,,,,,yes


In [127]:
# Write the merged localization table to CSV in the working directory, without
# the DataFrame index column.
merged_df.to_csv('230215_localization.csv', index = False)