# Transfer learning

Various imports

In [4]:
# Silence Tensorflow warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd
import numpy as np
from tqdm import tqdm
from tensorflow import keras
from sklearn.model_selection import train_test_split
from typing import List, Tuple

We start by reading the expression matrices and cluster labels for all male datasets, keeping only the genes that are present in every dataset.

In [3]:
# Directory containing the expression matrices and the cluster assignments
EXPR_DIR = "expr_matrices"

# Male datasets: expression matrices end in "M.csv.gz", cluster labels in "M_clusters.csv".
# os.listdir() returns entries in arbitrary order, so both lists are sorted: this makes the
# dataset indices reproducible and keeps expr[i] / clusters[i] paired
# (assumes the two files of a dataset share the same name prefix - TODO confirm).
filenames_expr = sorted(f for f in os.listdir(EXPR_DIR) if f.endswith("M.csv.gz"))
filenames_clusters = sorted(f for f in os.listdir(EXPR_DIR) if f.endswith("M_clusters.csv"))

# The two lists are matched by position, so a count mismatch would silently misalign them
if len(filenames_expr) != len(filenames_clusters):
    raise ValueError(
        f"Found {len(filenames_expr)} expression matrices but "
        f"{len(filenames_clusters)} cluster files in {EXPR_DIR!r}"
    )

print("Reading expression matrices...")
# The first column of each file is treated as the gene id and becomes the index
expr = []
for f in tqdm(filenames_expr):
    matrix = pd.read_csv(f"{EXPR_DIR}/{f}")
    expr.append(matrix.rename(columns={matrix.columns[0]: "gene_id"}).set_index("gene_id"))

# Now intersect the gene ids.
# Start from None rather than an empty list: an intermediate intersection that is
# empty must stay empty instead of being mistaken for "not initialised yet".
common_genes = None
for item in expr:
    if common_genes is None:
        common_genes = item.index
    else:
        common_genes = common_genes.intersection(item.index)
if common_genes is None:
    # No expression matrix was found
    common_genes = pd.Index([], name="gene_id")

# Restrict every matrix to the shared genes, in the same row order
expr = [item.loc[common_genes] for item in expr]

print("Reading clusters...")
clusters = [pd.read_csv(f"{EXPR_DIR}/{f}") for f in tqdm(filenames_clusters)]

Reading expression matrices...


100%|██████████| 6/6 [00:05<00:00,  1.13it/s]


Reading clusters...


100%|██████████| 6/6 [00:00<00:00, 1182.88it/s]


This defines our MLP classifier: three ReLU hidden layers followed by a softmax output layer.

TODO: 

- hyperparameter tuning
- regularization / dropout

In [49]:
def build_model(n_clusters: int) -> keras.Model:
    """
    Build and compile a multi-class MLP classifier.

    The network takes one input value per gene in the module-level
    `common_genes` index, passes it through three ReLU hidden layers
    (512, 256 and 32 units) and ends in a softmax output layer with
    `n_clusters + 1` units; the extra unit is the "other" class.

    param: n_clusters (int) - the number of clusters (possible classes)
    return: model (keras.Model) - the compiled model (Adam optimizer,
        categorical cross-entropy loss, accuracy metric)
    """

    # Now build our MLP
    model = keras.Sequential()
    # input_shape must be a shape tuple: `(n)` is just the int n, so the
    # trailing comma is required to get a 1-tuple.
    model.add(keras.layers.InputLayer(input_shape=(len(common_genes),)))
    model.add(keras.layers.Dense(512, activation="relu", name="hidden_1"))
    model.add(keras.layers.Dense(256, activation="relu", name="hidden_2"))
    model.add(keras.layers.Dense(32, activation="relu", name="hidden_3"))
    # Output layer
    # Note the +1 to take into account the "other" class
    model.add(keras.layers.Dense(n_clusters + 1, activation="softmax", name="output"))

    # categorical_crossentropy expects one-hot encoded labels
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    return model


This function splits the reference dataset into training and test sets and one-hot encodes the cluster labels.

In [10]:
def prepare_training_data(expr: List, clusters: List, dataset_id: int, test_size: float = 0.1, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Prepare the training and test data for the MLP from one reference dataset.

    The expression matrix of the reference dataset is transposed so that each
    row is one sample and each column one gene, then split together with the
    cluster labels into a training and a test set. Labels are one-hot encoded
    with one class per distinct cluster plus one extra "other" class.

    NOTE(review): assumes the cluster table has one row per column of the
    expression matrix, in the same order, and that the "Cluster" labels are
    integers in [0, number of clusters] - confirm against the input files.

    param: expr (list) - the expression matrices (genes as rows)
    param: clusters (list) - the list of cluster identities (tables with a "Cluster" column)
    param: dataset_id (int) - the id of the dataset we are using as reference
    param: test_size (float) - fraction of the samples held out for testing
    param: random_state (int) - seed for the train/test split
    return: x_train (np.array) - the training data
    return: y_train (np.array) - the one-hot encoded training labels
    return: x_test (np.array) - the test data
    return: y_test (np.array) - the one-hot encoded test labels
    raises: ValueError - if the number of samples and the number of labels differ
    """

    # The expression matrices are indexed by gene, so transpose to get one
    # row per sample, which is what train_test_split and the MLP expect.
    samples = expr[dataset_id].T.to_numpy()
    # Only the cluster column is a label; the table may hold other columns.
    labels = clusters[dataset_id]["Cluster"]

    if len(samples) != len(labels):
        raise ValueError(
            f"Dataset {dataset_id} has {len(samples)} samples but {len(labels)} cluster labels"
        )

    # One class per distinct cluster, +1 for the "other" class
    # (the number of rows of the cluster table is NOT the number of classes).
    n_classes = len(labels.unique()) + 1

    # Split into training and test data
    x_train, x_test, y_train, y_test = train_test_split(
        samples, labels.to_numpy(), test_size=test_size, random_state=random_state
    )

    # One-hot encode labels
    y_train = keras.utils.to_categorical(y_train, num_classes=n_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes=n_classes)

    return (x_train, y_train, x_test, y_test)


TypeError: tuple expected at most 1 argument, got 4

In [51]:
# Index of the dataset (in `expr` / `clusters`) used as the reference
ref_dataset = 3

# One class per distinct cluster label of the reference dataset
# (build_model adds the extra "other" class itself)
n_ref_clusters = len(clusters[ref_dataset]["Cluster"].unique())
model = build_model(n_ref_clusters)

# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 hidden_1 (Dense)            (None, 512)               6248960   
                                                                 
 hidden_2 (Dense)            (None, 256)               131328    
                                                                 
 hidden_3 (Dense)            (None, 32)                8224      
                                                                 
 output (Dense)              (None, 7)                 231       
                                                                 
Total params: 6,388,743
Trainable params: 6,388,743
Non-trainable params: 0
_________________________________________________________________
None
