# Convert ProteinAtlas CSV export to one-hot-encoded feature columns

Creates 3 matrices:
 - Molecular function
 - Biological process
 - Protein class

In [3]:
import pandas as pd
import numpy as np
import data_utils as du
import scipy.sparse as sparse
data_dir = du.find_data_dir('app')
proteins_file = du.get_file_path(data_dir, 'ProteinAtlas proteins', 'unprocessed', 'proteinatlas_622791f5.tsv')

In [4]:
df = pd.read_csv(proteins_file, delimiter='\t').set_index('Gene')
df = df.drop(df.index[df.index.duplicated()]) #Drop 19 duplicate rows (with identical Gene ID)

In [5]:
df

Unnamed: 0_level_0,Gene synonym,Ensembl,Gene description,Uniprot,Chromosome,Position,Protein class,Biological process,Molecular function,Disease involvement,...,Single Cell Type RNA - Rod photoreceptor cells [NX],Single Cell Type RNA - Sertoli cells [NX],Single Cell Type RNA - Smooth muscle cells [NX],Single Cell Type RNA - Spermatocytes [NX],Single Cell Type RNA - Spermatogonia [NX],Single Cell Type RNA - Suprabasal keratinocytes [NX],Single Cell Type RNA - Syncytiotrophoblasts [NX],Single Cell Type RNA - T-cells [NX],Single Cell Type RNA - Undifferentiated cells [NX],Single Cell Type RNA - Urothelial cells [NX]
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,,ENSG00000121410,Alpha-1-B glycoprotein,P04217,19,58345178-58353499,"Plasma proteins, Predicted intracellular prote...",,,,...,5.2,25.8,52.4,1.4,5.1,3.1,1.6,34.8,0.1,0.6
A1CF,"ACF, ACF64, ACF65, APOBEC1CF, ASP",ENSG00000148584,APOBEC1 complementation factor,Q9NQ94,10,50799409-50885675,Predicted intracellular proteins,mRNA processing,RNA-binding,,...,1.1,0.0,0.0,0.0,0.0,0.0,0.0,0.9,25.9,0.1
A2M,"CPAMD5, FWP007, S863-7",ENSG00000175899,Alpha-2-macroglobulin,P01023,12,9067664-9116229,"Cancer-related genes, Candidate cardiovascular...",,"Protease inhibitor, Serine protease inhibitor",Cancer-related genes,...,18.6,329.8,207.9,1.8,1.1,12.3,3.4,16.7,0.0,2.2
A2ML1,"CPAMD9, FLJ25179, p170",ENSG00000166535,Alpha-2-macroglobulin like 1,A8K2U0,12,8822472-8887001,"Predicted intracellular proteins, Predicted se...",,"Protease inhibitor, Serine protease inhibitor",,...,0.2,0.0,0.3,1.4,0.0,11.0,0.1,0.0,0.0,4.9
A3GALT2,"A3GALT2P, IGB3S, IGBS3S",ENSG00000184389,"Alpha 1,3-galactosyltransferase 2",U3KPV4,1,33306766-33321098,"Enzymes, Predicted membrane proteins",,"Glycosyltransferase, Transferase",,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,"FLJ13861, MGC11349",ENSG00000070476,ZXD family zinc finger C,Q2QGD7,3,126437601-126475919,"Predicted intracellular proteins, Transcriptio...","Transcription, Transcription regulation",Activator,,...,27.2,0.0,4.7,3.2,7.7,3.8,5.3,5.0,6.3,5.3
ZYG11A,ZYG11,ENSG00000203995,"Zyg-11 family member A, cell cycle regulator",Q6WRX3,1,52842511-52894998,Predicted intracellular proteins,Ubl conjugation pathway,,,...,1.3,0.0,0.0,3.4,6.9,0.1,0.2,0.0,0.0,0.3
ZYG11B,"FLJ13456, ZYG11",ENSG00000162378,"Zyg-11 family member B, cell cycle regulator",Q9C0D3,1,52726467-52827342,Predicted intracellular proteins,Ubl conjugation pathway,,,...,23.0,25.8,19.4,9.4,26.9,12.8,29.1,5.7,13.6,13.9
ZYX,,ENSG00000159840,Zyxin,Q15942,7,143381080-143391111,"Plasma proteins, Predicted intracellular proteins","Cell adhesion, Host-virus interaction",,,...,2.6,84.1,82.1,7.9,9.5,17.7,23.5,54.9,19.6,87.0


In [5]:
def get_trimmed_list(string_list):
    if pd.isnull(string_list):
        return ['NaN']
    else:
        splitted_list = string_list.split(',')
        trimmed_splitted_list = [term.strip() for term in splitted_list]
        return trimmed_splitted_list

def to_categorical(y, num_classes=None, dtype='float32'):
    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype=dtype)
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

def get_cat(values, feature_vals_cat, df_feature_vals):
    return sparse.csr_matrix(feature_vals_cat[df_feature_vals.loc[values]['ID']].sum(axis=0))

def get_feature_categories(feature, single_or_list='single', feature_name='{feature_name}'):
    if single_or_list=='single':
        feature_vals_str = feature.unique()
    elif single_or_list=='list':
        feature_vals_str = list({x for l in feature for x in l})
    feature_vals_int = np.arange(len(feature_vals_str))
    df_feature_vals = pd.DataFrame(list(zip(feature_vals_int, feature_vals_str)), columns=['ID', 'Value']).set_index('Value')
       
    feature_vals_cat = sparse.csr_matrix(to_categorical(feature_vals_int).astype(int))
    if single_or_list=='single':
        feature_cat = feature_vals_cat[df_feature_vals.loc[feature]['ID']].astype(int)
        column_names = [feature_name+'=="' +str(feature_val_str)+'"' for feature_val_str in feature_vals_str]
        feature_cat_df = pd.DataFrame.sparse.from_spmatrix(feature_cat, index=feature.index, columns=column_names)
    elif single_or_list=='list':
        feature_cat = sparse.vstack(feature.apply(get_cat, args=(feature_vals_cat, df_feature_vals))).astype(int)
        column_names = ['"'+str(feature_val_str)+'" in '+feature_name for feature_val_str in feature_vals_str]    
        feature_cat_df = pd.DataFrame.sparse.from_spmatrix(feature_cat, index=feature.index, columns=column_names)

    return feature_cat_df

In [8]:
df['Subcellular location']

Gene
A1BG                                                     NaN
A1CF                                             Nucleoplasm
A2M                                                      NaN
A2ML1                                                    NaN
A3GALT2                                                  NaN
                                 ...                        
ZXDC                                                Nucleoli
ZYG11A                                           Nucleoplasm
ZYG11B                Golgi apparatus,Intermediate filaments
ZYX        Plasma membrane,Actin filaments,Focal adhesion...
ZZEF1                               Nucleoplasm,Mitochondria
Name: Subcellular location, Length: 19632, dtype: object

In [12]:
for colname in df.columns:
    if not 'RNA ' in colname and not 'Pathology ' in colname:
        print(colname)

Gene synonym
Ensembl
Gene description
Uniprot
Chromosome
Position
Protein class
Biological process
Molecular function
Disease involvement
Evidence
HPA evidence
UniProt evidence
NeXtProt evidence
MS evidence
Antibody
Reliability (IH)
Reliability (Mouse Brain)
Reliability (IF)
Subcellular location
Secretome location
CCD Protein
CCD Transcript
Blood concentration - Conc. blood IM [pg/L]
Blood concentration - Conc. blood MS [pg/L]
Subcellular main location
Subcellular additional location
Antibody RRID


In [14]:
df['Subcellular main location']

Gene
A1BG                                           NaN
A1CF                                   Nucleoplasm
A2M                                            NaN
A2ML1                                          NaN
A3GALT2                                        NaN
                            ...                   
ZXDC                                      Nucleoli
ZYG11A                                 Nucleoplasm
ZYG11B     Golgi apparatus, Intermediate filaments
ZYX                           Focal adhesion sites
ZZEF1                                  Nucleoplasm
Name: Subcellular main location, Length: 19632, dtype: object

In [15]:
df['Secretome location']

Gene
A1BG               Secreted to blood
A1CF                             NaN
A2M                Secreted to blood
A2ML1      Secreted in other tissues
A3GALT2                          NaN
                     ...            
ZXDC                             NaN
ZYG11A                           NaN
ZYG11B                           NaN
ZYX                              NaN
ZZEF1                            NaN
Name: Secretome location, Length: 19632, dtype: object

### Molecular function

In [15]:
%%time
molecular_function_vals = df['Molecular function'].apply(get_trimmed_list)
molecular_function_columns = get_feature_categories(molecular_function_vals, 'list', feature_name='Molecular function')

CPU times: user 23.3 s, sys: 0 ns, total: 23.3 s
Wall time: 23.3 s


In [19]:
molecular_function_columns.sum().sort_values(ascending=False)

"NaN" in Molecular function                            8878
"DNA-binding" in Molecular function                    1984
"Transferase" in Molecular function                    1818
"Hydrolase" in Molecular function                      1650
"Receptor" in Molecular function                       1421
                                                       ... 
"IgA-binding protein" in Molecular function               1
"Aspartic protease inhibitor" in Molecular function       1
"Excision nuclease" in Molecular function                 1
"RNA-directed DNA polymerase" in Molecular function       1
"Antiviral protein" in Molecular function                 1
Length: 130, dtype: int64

### Biological process

In [20]:
%%time
biological_process_vals = df['Biological process'].apply(get_trimmed_list)
biological_process_columns = get_feature_categories(biological_process_vals, 'list', feature_name='Biological process')

CPU times: user 19.7 s, sys: 147 ms, total: 19.9 s
Wall time: 19.9 s


In [21]:
biological_process_columns.sum().sort_values(ascending=False)

"NaN" in Biological process                                      9480
"Transcription" in Biological process                            2366
"Transcription regulation" in Biological process                 2304
"Transport" in Biological process                                1983
"Differentiation" in Biological process                           721
                                                                 ... 
"Sulfate transport" in Biological process                           1
"Menaquinone biosynthesis" in Biological process                    1
"Inositol biosynthesis" in Biological process                       1
"Activation of host autophagy by virus" in Biological process       1
"Viral immunoevasion" in Biological process                         1
Length: 233, dtype: int64

### Protein class

In [8]:
%%time
protein_class_vals = df['Protein class'].apply(get_trimmed_list)
protein_class_columns = get_feature_categories(protein_class_vals, 'list', feature_name='Protein class')

CPU times: user 24.6 s, sys: 0 ns, total: 24.6 s
Wall time: 24.6 s


In [9]:
protein_class_columns.sum().sort_values(ascending=False)

"Predicted intracellular proteins" in Protein class          15543
"Predicted membrane proteins" in Protein class                5514
"Disease related genes" in Protein class                      3995
"Plasma proteins" in Protein class                            3731
"Enzymes" in Protein class                                    3514
"Predicted secreted proteins" in Protein class                1707
"Cancer-related genes" in Protein class                       1671
"Transcription factors" in Protein class                      1496
"Transporters" in Protein class                               1471
"Potential drug targets" in Protein class                     1323
"G-protein coupled receptors" in Protein class                 775
"FDA approved drug targets" in Protein class                   753
"CD markers" in Protein class                                  373
"RAS pathway related proteins" in Protein class                230
"Ribosomal proteins" in Protein class                         

In [24]:
molecular_function_file = du.get_file_path(data_dir, 'ProteinAtlas proteins', 'Feature dfs pickle', 'molecular_function.p')
biological_process_file = du.get_file_path(data_dir, 'ProteinAtlas proteins', 'Feature dfs pickle', 'biological_process.p')
protein_class_file = du.get_file_path(data_dir, 'ProteinAtlas proteins', 'Feature dfs pickle', 'protein_class.p')

### Save in pickle files

In [25]:
du.dump_in_pickle(molecular_function_file, molecular_function_columns)
du.dump_in_pickle(biological_process_file, biological_process_columns)
du.dump_in_pickle(protein_class_file, protein_class_columns)

### Read from pickle files

In [26]:
molecular_function_columns = du.read_from_pickle(molecular_function_file)
biological_process_columns = du.read_from_pickle(biological_process_file)
protein_class_columns = du.read_from_pickle(protein_class_file)

### Gene synonyms

In [27]:
def equals_nan_list(value):
    return value == ['NaN']

def print_vals(gene_synonyms, gene_synonym_dict):
    gene = gene_synonyms['Gene']
    gene_synonyms = gene_synonyms['Gene synonym']
    
    for synonym in gene_synonyms:
        gene_synonym_dict[synonym] = gene

def get_gene_synonym_dict(df):
    gene_synonym_vals = df['Gene synonym'].apply(get_trimmed_list)
    non_nan_synonymns = gene_synonym_vals[~gene_synonym_vals.apply(equals_nan_list)]
    gene_synonym_dict = {}
    pd.DataFrame(non_nan_synonymns).reset_index().apply(print_vals, axis=1, args=(gene_synonym_dict,))
    return gene_synonym_dict

In [28]:
gene_synonym_dict_file = du.get_file_path(data_dir, 'ProteinAtlas proteins', 'protein matching', 'gene_synonym_dict.p')
gene_synonym_dict = get_gene_synonym_dict(df)
du.dump_in_pickle(gene_synonym_dict_file, gene_synonym_dict)