## Load Libraries and import modules 

In [3]:
# Load all the vanilla libraries
import numpy as np
import pandas as pd
import os
import gc
from functools import reduce


# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Import tqdm for progress bar
from tqdm.auto import tqdm

# for timing functions
from timeit import default_timer as timer 

### Configure Project Parameters

In [4]:
# Root folder for everything this notebook reads and writes: "<current working dir>/data/".
# Built with os.path.join so the separator matches the platform; the trailing ""
# keeps the trailing separator that later `savepath + ...` concatenations rely on.
savepath = os.path.join(os.getcwd(), "data", "")

**Important:** Run the configuration file `configs.py` first. Importing this script and setting the seed and device parameters before importing any of the other modules ensures that everything is in sync.

**Important:** If you want to *change the configuration parameters*, change them before importing and running the pipeline. 

Now that all the configurations values are assigned globally, we can import the modules. If this is working, we expect each module to access the **same** **seed** and **device** we set. We are also expecting generated numbers **inside the modules** to be reproducible.

In [5]:
from models_util import ml_helper as mlh 

# Load Data 

## Dataframe with Features

Since there are differences between R and Python, I tried to combine the advantages of both. In R, thanks to vectorization, it is faster to compute Euclidean, Manhattan and cosine distances of matrices (including matrices with missing values).<br> 
In Python, correlation analysis is way faster than in R. So...<br>
- To speed up Python, I created custom functions where the distances are calculated via tensor algebra (it speeds things up).
- I will be using another matrix from a previous analysis in R to validate that the computations are correct. <br>

So for all the feature engineering practices where we have Protein A - Protein B pairs, I used the following naming scheme:<br>

```
features_dataframe:
Var1 - Var2 - [proteomics_type]_[feature_type]_[feature_analysis] - db

```
**Explanations**:<br>
The **Var1** and **Var2** are the protein or sims pairs (unique pairs), where Var1 > Var2 (as a way to remove duplicates and self combinations) <br>
The **proteomics type** could be **SCBC** or **ABMS** (subcellular or total cell MS proteomics)<br>
The **feature_type** could be the **raw** MS signals, or the **VAE** embeddings, or the **umap** coordinates, <br> 
The **feature_analysis** could be correlations, distances, angles, or any other feature. If there is time I will perform a **feature importance analysis** in more detail. <br>
The **db** column holds the class labels (0 or 1) for the classification, based on ground-truth databases of protein-protein interactions. It is only present when the features are created for the purpose of training a classifier. 

**Example** of some feature column names: 'SCBC_raw_pearson', 'ABMS_vae_umap', 'ABMS_vae_euclidean' <br>  
**The features dataframe** containing these columns could have protein pairs, dominant proteoform pairs (sims), or all the possible proteoform combinations out there.<br>

**NOTE**<br>
For 34 million unique pairs, computing 14 features takes around 32 minutes and does not exhaust the RAM; the resulting CSV file is around 20 GB in size.  

In [6]:
# an old file written in R for validation of engineering the features correctly 
# feat = pd.read_csv(os.getcwd() + "\\data\\processed\\merged_features_v1.txt", delimiter="\t")

In [7]:
# feat["db"] = np.where(feat["db"]=="F",0,1)

## Data from Embeddings (VAE, umap) and Raw signals for gene or peptide centric Feature Eng.

Here I will construct some features for **protein-protein interactions** or for the **dominant (sim) proteoforms**, which have a corresponding **gene symbol**.<br>
- It is the same approach for different protein tables.
- The difference lies in whether I will use the ground-truth data during feature engineering.
- For a subset of protein-protein pairs, which will be used to train the classifier, I will use the ground-truth pairs.
- For the whole protein matrices, the purpose is to just engineer the features for all possible protein/sims pairs.
- This will have an impact on the memory usage of the computer. <br>

For the **feature engineering** function we want Dataframes:<br>
- where **index** is either gene symbol or any other annotations used. Each row corresponds to a protein, proteoform, etc..
- where the **columns** are counts or MS signals (preferably normalized and harmonized) or VAE latent variables or UMAP coordinates. 
- index is important since we generate pairs and we need to name them or follow them somehow. 

In [8]:
# ---------------- Notebook-wide settings used by the cells below ----------------

# Seed handed to NumPy and to the pandas row sampling.
seed = 456

# Which family of tables to load: "protein", "sims" (the dominant proteoforms)
# or "proteoform".
ANALYSIS_LEVEL = "proteoform"

# When True, the ground-truth pairs are handed to the feature functions; per the
# author's note this subsets the tables to gene symbols found in the PPI databases.
USE_GROUND_TRUTH = False

# Number of blocks passed as `n_chunks` to the blockwise distance computations.
# Author's rule of thumb: 3 when ground truth is used, otherwise about 10 is enough;
# aim for roughly 1000 rows/proteins per block (e.g. 10 blocks for 10000 rows).
BLOCKS = 13


In [None]:
# Load the raw, VAE-latent and UMAP tables (SCBC = subcellular, ABMS = total cell)
# for the selected ANALYSIS_LEVEL, report duplicated index labels and drop the
# duplicates from the ABMS tables.
#
# Paths are assembled with os.path.join so they resolve to "<cwd>/data/..." with the
# platform's separator.
data_dir = os.path.join(os.getcwd(), "data")

if ANALYSIS_LEVEL == "protein":
    feat_dir = os.path.join(data_dir, "features_protein")

    # for subcell
    raw_scbc = pd.read_csv(os.path.join(feat_dir, "protein_quant_merged.txt"), delimiter="\t", index_col=0)
    vae_scbc = pd.read_csv(os.path.join(feat_dir, "proteinscbc_latent.csv"), index_col=0)
    umap_scbc = pd.read_csv(os.path.join(feat_dir, "protein_scbc_umap.csv"), index_col=0)

    # for total cell
    # NOTE(review): this is the only table read without index_col=0, so it gets a
    # default integer index - confirm whether the identifier column should be the index.
    raw_abms = pd.read_csv(os.path.join(feat_dir, "prot_abms_norm.txt"), delimiter="\t")
    vae_abms = pd.read_csv(os.path.join(feat_dir, "protein_abms_latent.csv"), index_col=0)
    umap_abms = pd.read_csv(os.path.join(feat_dir, "protein_abms_umap.csv"), index_col=0)

    # for ground truth, the combined CORUM and Compleat datasets in gene symbol format
    pairs_df = pd.read_csv(os.path.join(data_dir, "processed", "merged_pairs.txt"), delimiter="\t")

elif ANALYSIS_LEVEL == "sims":
    feat_dir = os.path.join(data_dir, "features_sim")

    # for subcell
    raw_scbc = pd.read_csv(os.path.join(feat_dir, "scbc_quant_sims.csv"), index_col=0)
    vae_scbc = pd.read_csv(os.path.join(feat_dir, "simscbc_latent.csv"), index_col=0)
    umap_scbc = pd.read_csv(os.path.join(feat_dir, "sim_scbc_umap.csv"), index_col=0)

    # for total cell
    raw_abms = pd.read_csv(os.path.join(feat_dir, "abms_quant_sims.csv"), index_col=0)
    vae_abms = pd.read_csv(os.path.join(feat_dir, "sim_abms_latent.csv"), index_col=0)
    umap_abms = pd.read_csv(os.path.join(feat_dir, "sim_abms_umap.csv"), index_col=0)

    # for ground truth, the combined CORUM and Compleat datasets in gene symbol format
    pairs_df = pd.read_csv(os.path.join(data_dir, "processed", "merged_pairs.txt"), delimiter="\t")

elif ANALYSIS_LEVEL == "proteoform":
    feat_dir = os.path.join(data_dir, "features_proteoforms")

    # overwrite global parameters for proteoform analysis
    BLOCKS = 13  # many proteoforms in the dataframes (BLOCKS is the name read downstream)
    USE_GROUND_TRUTH = False  # There is no ground truth for proteoform analysis

    # for subcell
    raw_scbc = pd.read_csv(os.path.join(feat_dir, "SCBC2_proteoform_table 2.txt"), index_col=0, delimiter="\t")
    vae_scbc = pd.read_csv(os.path.join(feat_dir, "whole_proteoformscbc_latent.csv"), index_col=0)
    umap_scbc = pd.read_csv(os.path.join(feat_dir, "whole_proteoform_scbc_umap.csv"), index_col=0)

    # keep only the proteoforms with missing values in fewer than 30% of the columns
    raw_scbc = raw_scbc.loc[raw_scbc.isna().sum(axis=1) < 0.3*raw_scbc.shape[1]]

    # for total cell
    # The table is read as tab-delimited, like the SCBC proteoform table; the tab
    # belongs in `delimiter`, not in `decimal` (which only sets the decimal mark).
    raw_abms = pd.read_csv(os.path.join(feat_dir, "ABMS_proteoform_table.txt"), delimiter="\t", index_col=0)
    vae_abms = pd.read_csv(os.path.join(feat_dir, "whole_proteoform_abms_latent.csv"), index_col=0)
    umap_abms = pd.read_csv(os.path.join(feat_dir, "whole_proteoform_abms_umap.csv"), index_col=0)

else:
    raise ValueError("Choose either protein tables or dominant proteoform tables (sims) for analysis")

# report duplicated index labels per table, labelled by table name so that a
# non-zero count can be traced back to its table
for table_name, table in {
    "raw_scbc": raw_scbc, "vae_scbc": vae_scbc, "umap_scbc": umap_scbc,
    "raw_abms": raw_abms, "vae_abms": vae_abms, "umap_abms": umap_abms,
}.items():
    print(f"Duplicates in {table_name} before clean-up: {table.index.duplicated().sum()}")

# drop the loop names so they do not keep the pre-clean-up frame alive
del table_name, table
gc.collect()

# i found some in abms: keep the first row of every duplicated index label
raw_abms = raw_abms.loc[~raw_abms.index.duplicated(keep='first')]
vae_abms = vae_abms.loc[~vae_abms.index.duplicated(keep='first')]
umap_abms = umap_abms.loc[~umap_abms.index.duplicated(keep='first')]

Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0


In [10]:
# Re-check the index of every table after the clean-up. Each line is labelled with
# the table name, and the message says "after clean-up" because this cell runs once
# the duplicated labels have been dropped.
for table_name, table in {
    "raw_scbc": raw_scbc, "vae_scbc": vae_scbc, "umap_scbc": umap_scbc,
    "raw_abms": raw_abms, "vae_abms": vae_abms, "umap_abms": umap_abms,
}.items():
    print(f"Duplicates in {table_name} after clean-up: {table.index.duplicated().sum()}")

# do not leave the loop names bound in the notebook namespace
del table_name, table

Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0
Duplicates in <class 'pandas.core.frame.DataFrame'> before checking: 0


In [13]:
# Draw one random set of row labels from the raw SCBC table and subset the three SCBC
# tables to exactly those labels, in the same order. The ABMS tables are not touched here.

# number of rows kept for this run
n_sampled_rows = 4000

# set seed for reproducibility
np.random.seed(seed)

# sample the labels once, from the raw table, so all tables share them
rand_index = raw_scbc.sample(n_sampled_rows, random_state=seed).index

# subset the SCBC tables to the same indices
raw_scbc = raw_scbc.loc[rand_index]
vae_scbc = vae_scbc.loc[rand_index]
umap_scbc = umap_scbc.loc[rand_index]

# check that the indices (and their order) agree across all three tables
assert np.array_equal(raw_scbc.index, vae_scbc.index), "Indices of raw and VAE tables do not match"
assert np.array_equal(raw_scbc.index, umap_scbc.index), "Indices of raw and UMAP tables do not match"

In [14]:
# (rows, columns) of the raw, VAE and UMAP SCBC tables before the feature run
tuple(table.shape for table in (raw_scbc, vae_scbc, umap_scbc))

((4000, 130), (4000, 45), (4000, 3))

# Feature Engineering for SubCell (SCBC features dataframe)

In [15]:
# Build the SCBC feature table: compute every pairwise feature, inner-join the
# results on the pair columns, then put the columns in a stable order.

# The ground-truth pairs (and the extra "db" join key) are only used on request.
ground = pairs_df if USE_GROUND_TRUTH else None
merge_cols = ["Var1", "Var2", "db"] if USE_GROUND_TRUTH else ["Var1", "Var2"]

# Results are collected in computation order; that is also the merge order.
feature_tables = []

# correlation coefficients for all the pairs in the raw and VAE tables
for table, label in (
    (raw_scbc, "SCBC_raw"),
    (vae_scbc, "SCBC_vae"),
):
    feature_tables.append(
        mlh.correlation_blockwise(table, ground_truth=ground, data_name=label)
    )
    gc.collect()

# blockwise distance-type features: (input table, name prefix, requested distance)
for table, label, metric in (
    (raw_scbc, "SCBC_raw", "manhattan"),
    (raw_scbc, "SCBC_raw", "std_difference"),
    (umap_scbc, "SCBC_umap", "cosine"),
    (umap_scbc, "SCBC_umap", "euclidean"),
    (vae_scbc, "SCBC_vae", "cosine"),
):
    feature_tables.append(
        mlh.compute_distances_blockwise(
            table,
            ground_truth=ground,
            data_name=label,
            n_chunks=BLOCKS,
            return_dist=metric,
        )
    )
    gc.collect()

# inner-join left to right, so only pairs present in every table are kept
scbc_features = feature_tables[0]
for next_table in feature_tables[1:]:
    scbc_features = pd.merge(
        scbc_features, next_table, on=merge_cols, how="inner", validate="one_to_one"
    )

# release the per-feature tables and the loop names
del feature_tables, next_table, table, label, metric
gc.collect()

# sanity check
print(f"The feature dataframe has {len(scbc_features)} rows and the following columns:\n {list(scbc_features.columns)}")
print(f"Missing Values in total in the features dataframe: {scbc_features.isna().sum().sum()}\n")

# for consistency: join keys first (in their current order), then the feature
# columns sorted alphabetically
key_cols = [name for name in scbc_features.columns if name in merge_cols]
feature_cols = sorted(name for name in scbc_features.columns if name not in merge_cols)
scbc_features = scbc_features[key_cols + feature_cols]
del key_cols, feature_cols


Analysis is starting for SCBC_raw to get pearson and spearman coefficients
------------------------------------------------------------------------------------------------------
Perform pairwise correlation analysis for the whole matrix - no ground truth will be used.
The number of pairs generated is 7998000
Analysis is completed

Analysis is starting for SCBC_vae to get pearson and spearman coefficients
------------------------------------------------------------------------------------------------------
Perform pairwise correlation analysis for the whole matrix - no ground truth will be used.
The number of pairs generated is 7998000
Analysis is completed

Analysis is starting for SCBC_raw for feature manhattan
------------------------------------------------------------------------------------------------------
Perform pairwise calculation for the whole matrix - no ground truth will be used.
The size of each block is 307 rows, and the number of blocks is 13
For total number of rows 4

In [17]:
# Sanity check: display the first rows of the merged SCBC feature table
scbc_features.head()

Unnamed: 0,Var1,Var2,SCBC_raw_cor_pears,SCBC_raw_cor_spear,SCBC_raw_man,SCBC_raw_std_dif,SCBC_umap_cos,SCBC_umap_euc,SCBC_vae_cor_pears,SCBC_vae_cor_spear,SCBC_vae_cos
0,P_sim_ENSG00000180817,P_sim_ENSG00000130511,0.113648,0.218177,129.881822,1.474601,0.077247,6.122372,0.348947,0.452964,0.684295
1,P_sim_ENSG00000132424,P_sim_ENSG00000130511,0.776654,0.576658,45.990217,0.540763,0.000762,0.532088,0.88818,0.888669,0.11064
2,P_sim_ENSG00000182628,P_sim_ENSG00000130511,0.14541,0.232676,86.683385,1.076647,0.051563,4.406766,0.253127,0.274835,0.759528
3,P_sim_ENSG00000163050,P_sim_ENSG00000130511,-0.212615,-0.223659,93.6714,1.046167,0.106859,6.206619,-0.391733,-0.458103,1.359753
4,P_sim_ENSG00000164151,P_sim_ENSG00000130511,0.040685,0.046469,89.952042,1.093471,0.06107,3.985075,0.121424,0.073123,0.875991


In [18]:
# Number of missing values in each column of the SCBC feature table
scbc_features.isna().sum()

Var1                  0
Var2                  0
SCBC_raw_cor_pears    0
SCBC_raw_cor_spear    0
SCBC_raw_man          0
SCBC_raw_std_dif      0
SCBC_umap_cos         0
SCBC_umap_euc         0
SCBC_vae_cor_pears    0
SCBC_vae_cor_spear    0
SCBC_vae_cos          0
dtype: int64

In [19]:
# Write the SCBC feature table to the CSV that matches the analysis level and, for
# the protein / sims levels, whether ground truth was used ("db2" vs "full").
# The target is assembled with os.path.join so it uses the platform's separator.
if ANALYSIS_LEVEL == "protein":
    out_dir = "features_protein"
    out_name = "scbc_protein_features_db2.csv" if USE_GROUND_TRUTH else "scbc_protein_features_full.csv"

elif ANALYSIS_LEVEL == "sims":
    out_dir = "features_sim"
    out_name = "scbc_sim_features_db2.csv" if USE_GROUND_TRUTH else "scbc_sim_features_full.csv"

elif ANALYSIS_LEVEL == "proteoform":
    # a single output file at this level, regardless of USE_GROUND_TRUTH
    out_dir = "features_proteoforms"
    out_name = "scbc_proteoform_features_full.csv"

else:
    raise RuntimeError("An ANALYSIS_LEVEL has not been specified")

# the dataframe index is written as the first column (to_csv default), as before
scbc_features.to_csv(os.path.join(savepath, out_dir, out_name), header=True)