In [None]:
import os
import sys
import hashlib

!pip install tensorflow_addons
!pip install rdkit
!pip install keras-swa

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
import keras.backend as K
from tensorflow.keras import layers
from swa.tfkeras import SWA

from custom_loss import rwrmse, alpha_1point75, alpha_1point5, alpha_1point25, alpha_adaptive, cauchy

import math as m
import numpy as np
import pandas as pd
import warnings

from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem import RDKFingerprint

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/612.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/612.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.3/612.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.22.0 typeguard-2.13.3
Collecting rdkit
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Installing 


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Set random seeds
# Seeds NumPy's legacy global RNG (np.random.permutation in the shuffle cell draws from it)
np.random.seed(8)
# Seeds TensorFlow's global random seed
tf.random.set_seed(8)

In [None]:
# Load training data
# Path is hard-coded to a mounted Google Drive location, so the notebook only runs where that mount exists
train_path = '/content/drive/MyDrive/Colab Notebooks/Input/singlecell/de_train.parquet'
# Later cells read the "cell_type", "SMILES" and "control" columns and treat columns 5 onward as targets
df_train = pd.read_parquet(train_path)

In [None]:
# Get Morgan fingerprints (radius 2, 2048 bits), one row per training sample
# Iterate over the values positionally: label lookups such as df_train["SMILES"][i]
# only work while df_train still has its default 0..n-1 index.
train_smiles = df_train["SMILES"].to_numpy()
df_X = np.zeros([len(train_smiles), 2048])
for row, smiles in enumerate(train_smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None for a SMILES it cannot parse; fail with the offending row
        # rather than with an opaque error inside the fingerprint call.
        raise ValueError(f"Could not parse SMILES at row {row}: {smiles!r}")
    df_X[row, :] = np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048))

In [None]:
# Clipping helper
def truncator(max, min, targets):
    """Clip `targets` into the closed interval [min, max].

    Note the argument order: the upper bound comes first. `targets` must expose a
    `.clip(lower, upper)` method (e.g. a NumPy array or pandas object); the clipped
    result of that call is returned unchanged.
    """
    lower_bound, upper_bound = min, max
    clipped = targets.clip(lower_bound, upper_bound)
    return clipped

In [None]:
# Jaccard similarity
def jaccard_binary(x, y):
    """Return the Jaccard similarity |x AND y| / |x OR y| of two binary vectors.

    Parameters
    ----------
    x, y : array-like
        Binary (0/1 or boolean) vectors; any non-zero entry counts as set.

    Returns
    -------
    float
        Similarity in [0, 1]. When neither vector has any bit set the union is
        empty; the two (empty) sets are identical, so 1.0 is returned instead of
        the nan (plus RuntimeWarning) that a 0/0 division would produce.
    """
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    union_size = union.sum()
    if union_size == 0:
        return 1.0
    similarity = intersection.sum() / float(union_size)
    return similarity

In [None]:
# One hot encode the cells
# One hot ordering: [B cells, Myeloid cells, NK cells, T cells CD4+, T cells CD8+, T regulatory cells]
# NOTE(review): the ordering above is whatever pd.get_dummies emits for this column (presumably sorted labels) - confirm
# Multiplying by 1.0 turns the indicator columns into floats before conversion to an array
one_hot = np.array(pd.get_dummies(df_train["cell_type"]) * 1.0)

In [None]:
# Control indicator column for the training rows
# 1.0 marks a control observation, 0.0 anything else
# (12 control observations, 2 per cell type)
# The explicit "== True" comparison is kept so non-boolean entries are treated exactly as before
is_control = df_train["control"].to_numpy() == True
train_control = is_control.astype(float).reshape(-1, 1)

In [None]:
# Load per-sample chemical descriptors (log_P, MR, ...) for the train set
df_chem = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Input/singlecell/chemical_properties.csv')
# Put atom count, molecular weight and molar refractivity on a log10 scale
for log_col in ("n_atoms", "mol_weight", "MR"):
  df_chem[log_col] = df_chem[log_col].map(np.log10)

In [None]:
# Get dose values
# Controls: Dabrafenib - 1 uM dose, Belinostat - 0.1 uM dose
# All non-control doses are 1 uM
df_logfc = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Input/singlecell/logFC.parquet')
# Column vector of doses; the reshape fails unless df_logfc has exactly as many rows as df_train
# NOTE(review): train_dose is not concatenated into x_train in the cells shown here
train_dose = np.array(df_logfc["dose_uM"]).reshape((df_train.shape[0], 1))

In [None]:
# Scale data
# Standardise log_P and MR (MR was log10-transformed when df_chem was loaded).
# The fitted scaler is reused unchanged on the submission set later on.
scaler = StandardScaler()
scaler.fit(df_chem[["log_P", "MR"]])

In [None]:
# Join to form training matrix
# Column layout: cell-type one-hot | scaled log_P, MR | control flag | fingerprint bits
# Everything is concatenated in a single step and `one_hot` is left untouched, so
# re-running this cell no longer appends the chemistry/control columns a second time.
chem_scaled = scaler.transform(df_chem[["log_P", "MR"]])
x_train = np.concatenate((one_hot, chem_scaled, train_control, df_X), axis=1)

In [None]:
x_train.shape

(614, 2057)

In [None]:
# Make train data and targets
# Number of target genes; not referenced again in the cells shown here
n_genes = 18211
# Columns 0-4 of df_train are treated as metadata, everything from column 5 onward as targets
y_train = np.array(df_train.iloc[:, 5:])

In [None]:
# Get log FC
# Columns from position 7 onward of df_logfc are taken as the log fold-change values
# NOTE(review): assumes df_logfc rows are in the same order as df_train - confirm
y_logfc = np.array(df_logfc.iloc[:, 7:])

In [None]:
# Remove control samples. Takes out rows with cell type and compound exposure used as control
# Removing control also reduced the outliers
remove_control = False
if remove_control:
    # Boolean mask over rows instead of index labels used as positions: this stays
    # correct for any index and makes the cell safe to re-run (second run keeps all rows).
    keep_rows = (df_train["control"] == False).to_numpy()
    # x_train and y_train
    x_train = x_train[keep_rows, :]
    y_train = y_train[keep_rows, :]
    # df_train: reset the index so later positional/label lookups and pd.concat(axis=1)
    # against freshly built frames stay aligned with the filtered arrays.
    df_train = df_train[keep_rows].reset_index(drop=True)
    # NOTE(review): other row-aligned objects (df_X, y_logfc, df_logfc, chem_weights)
    # are NOT filtered here; filter them too before enabling features that use them.

In [None]:
# Load in NN derived chemical weights (probabilities)
chem_weights = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Input/singlecell/train_chem_prob.csv')
# Converted to a plain array; the weighting cell reads columns 0-1 (summed) and column 2 (divisor) per row
chem_weights = np.array(chem_weights)

In [None]:
# Get weights for each cell type, calculated in separate file
# Calculated with log base of 1.5
cell_sample_weights = True
cell_sample_weights_scaling = False

# Chemical sample weighting
chem_sample_weights = False

# Start from uniform weights so `sample_weights` always exists (the shuffle cell and the
# display below need it) even when both weighting schemes are switched off.
sample_weights = np.ones(df_train.shape[0])

if cell_sample_weights:
  # Change weight dict as necessary
  # Original Frobenius norm weight dictionary
  # weight_dict = {'B cells': 1., 'Myeloid cells': 1., 'T regulatory cells': 0.6518986684331918, 'T cells CD8+': 0.7203918421401994, 'NK cells': 0.7519447057731612, 'T cells CD4+': 0.6866328017001024}
  # Using cosine similarity weights
  weight_dict = {'B cells': 1., 'Myeloid cells': 1., 'NK cells': 0.3198791742324829, 'T cells CD8+': 0.05635642260313034, 'T cells CD4+': 0.24452143907546997, 'T regulatory cells': 0.21414318680763245}
  scaling_factor = {'T regulatory cells': 1.0718517636591396, 'T cells CD8+': 1.0950687072395096, 'NK cells': 1.1118585488618329, 'T cells CD4+': 1.0843393196219417}
  cell_types_train = df_train["cell_type"]
  # Fail loudly on a cell type without a weight instead of propagating NaN weights
  unknown_types = sorted(set(cell_types_train) - set(weight_dict))
  if unknown_types:
    raise KeyError(f"No sample weight defined for cell types: {unknown_types}")
  # Per-row weight looked up from the cell type (B cells and Myeloid cells map to 1.0)
  sample_weights = cell_types_train.map(weight_dict).to_numpy(dtype=float)
  # Divide by minimum weight so the least-weighted cell type gets weight 1
  sample_weights = sample_weights / np.min(sample_weights)
  if cell_sample_weights_scaling:
    # Cell types without a scaling factor (B cells, Myeloid cells) are left unscaled
    scale = cell_types_train.map(scaling_factor).fillna(1.0).to_numpy(dtype=float)
    sample_weights = sample_weights * scale
  # 2 decimals
  sample_weights = np.around(sample_weights, 2)

if chem_sample_weights:
  # Sum first two columns for each row of probabilities (p(private) + p(public))
  # Predicted cells are not paired with compounds in test set, add score of 1.0 to still boost importance of focus on cell type
  # Cells which are not in test and where compounds are not in test have no gained scores
  # chem_weights rows are taken positionally, one per training row
  chem_probs = np.asarray(chem_weights[:len(sample_weights)], dtype=float)
  # NOTE(review): a zero in column 2 yields inf/nan here, exactly as in the per-row loop this replaces
  chem_weights_norm = chem_probs[:, :2].sum(axis=1) / chem_probs[:, 2]
  # Normalise to 1.0
  chem_weights_norm = chem_weights_norm / np.max(chem_weights_norm)
  # Add the chemical score to every row's weight
  # (alternative considered for B cells / Myeloid cells: chem_weights_norm + 1.)
  # 2 decimals
  sample_weights = np.around(sample_weights + chem_weights_norm, 2)

sample_weights

array([ 5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 , 17.74,
       17.74,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,
        5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,
        4.34,  1.  ,  3.8 , 17.74, 17.74,  5.68,  4.34,  1.  ,  3.8 ,
        5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,
        4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,
        1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,
        3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,
       17.74, 17.74,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,
        3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,
        5.68,  4.34,  1.  ,  3.8 , 17.74, 17.74,  5.68,  4.34,  1.  ,
        3.8 ,  5.68,  4.34,  1.  ,  3.8 , 17.74, 17.74,  5.68,  4.34,
        1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,
        3.8 ,  5.68,  4.34,  1.  ,  3.8 ,  5.68,  4.34,  1.  ,  3.8 ,
        5.68,  4.34,

In [None]:
np.min(y_train)

-180.5192016034818

In [None]:
np.max(y_train)

179.32417689610105

In [None]:
# Get y_pv and express y_train as -log_1000(y_pv) * sign(LFC)
# NOTE(review): treats y_train as sign(LFC) * -log10(p-value), so p = 10 ** -|y_train| - confirm
y_pv = 10 ** (-np.abs(y_train))
# -log_1000(p) * sign = (|y_train| / 3) * sign(y_train) = y_train / 3.
# Computing it directly avoids the round trip through y_pv, which underflows to 0
# (and then to inf after log10) once |y_train| exceeds roughly 323, and avoids
# the rounding error that 10 ** x followed by log10 introduces.
y_train_log1000 = y_train / 3.0

In [None]:
np.max(y_train_log1000), np.min(y_train_log1000)

(59.774725632033686, -60.1730672011606)

In [None]:
# Gavish-Donoho SVD dimension reduction, returns q, U, S, VT
def gd_svd(data_mat, cutoff="w_B_high"):
  """Truncate the SVD of `data_mat` with a Gavish-Donoho style hard threshold.

  The threshold is tau = (w(beta) + offset) * median(singular values), where
  w(beta) is the cubic approximation of the optimal coefficient for unknown
  noise level and beta is the matrix aspect ratio in (0, 1].

  Parameters
  ----------
  data_mat : 2-D array-like
      Matrix to decompose.
  cutoff : {"w_B", "w_B_low", "w_B_high"}
      Which coefficient to use: w(beta), w(beta) - 0.02 or w(beta) + 0.02.

  Returns
  -------
  q : int
      Zero-based index of the last retained singular value (q + 1 modes are kept).
  U : ndarray, shape (n_rows, q + 1)
  S : ndarray, shape (q + 1, q + 1)
      Diagonal matrix of the retained singular values.
  VT : ndarray, shape (q + 1, n_cols)

  Raises
  ------
  ValueError
      If `cutoff` is not one of the supported names, or if no singular value
      exceeds the threshold (nothing would be retained).
  """
  cutoff_offsets = {"w_B": 0.0, "w_B_low": -0.02, "w_B_high": 0.02}
  if cutoff not in cutoff_offsets:
    raise ValueError(f"Unknown cutoff {cutoff!r}; expected one of {sorted(cutoff_offsets)}")
  data_mat = np.asarray(data_mat)
  U, S, VT = np.linalg.svd(data_mat, full_matrices=False)
  # Aspect ratio must lie in (0, 1]: use short side / long side so tall matrices
  # get the same threshold as their transpose (the cubic fit is not valid for beta > 1).
  n_rows, n_cols = data_mat.shape
  Beta = min(n_rows, n_cols) / max(n_rows, n_cols)
  # Approximate w(B)
  w_B = 0.56 * Beta ** 3 - 0.95 * Beta ** 2 + 1.82 * Beta + 1.43
  med_S = np.median(S)
  tau = (w_B + cutoff_offsets[cutoff]) * med_S
  # Get optimal modes: singular values come back in descending order, so the
  # largest index above tau marks the last mode to keep.
  above_tau = np.flatnonzero(S > tau)
  if above_tau.size == 0:
    raise ValueError("No singular value exceeds the threshold; nothing to retain")
  q = int(above_tau.max())
  U, S, VT = U[:, :(q+1)], np.diag(S[:(q+1)]), VT[:(q+1), :]
  return q, U, S, VT

In [None]:
# Perform dimension reduction on y_train
# Two alternative reductions; downstream cells apply run_svd first and run_autoencoder second,
# so if both are True the autoencoder result overwrites the SVD one.
run_svd = True
run_autoencoder = False

if run_svd:
  # q is the zero-based index of the last retained mode, so q + 1 modes are kept
  q, U, S, VT = gd_svd(data_mat=y_train, cutoff="w_B_high")

if run_autoencoder:
  encoding_dim = 115
  # Load in AE model
  ae_target = keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/AE Models/sign_logpv_autoencoder_linear_embed_600e_115.h5')
  # Encoder
  # NOTE(review): layer index 7 is presumably the bottleneck of the saved model - confirm against its summary
  encoder = keras.Model(inputs=ae_target.input, outputs=ae_target.layers[7].output)
  # Decoder
  # NOTE(review): assumes layers[8:] form the decoder and that layers[8].input is the bottleneck output - confirm
  decoder = keras.Model(inputs=ae_target.layers[8].input, outputs=ae_target.layers[-1].output)

# NOTE(review): q is only defined when run_svd is True; this display raises NameError otherwise
q

113

In [None]:
# Get embeddings for y_train
if run_svd:
  # Calculate denoised y_train
  # Rank-(q+1) reconstruction; not referenced again in the cells shown here
  y_train_tilde = U @ S @ VT
  # Get y_embed
  # Coordinates of each row in the retained right-singular basis; multiplying by VT maps them back
  y_embed = U @ S

if run_autoencoder:
  # Get y_embed
  # Overwrites the SVD embedding when both flags are True
  y_embed = encoder.predict(y_train)

y_embed.shape

(614, 114)

In [None]:
# Perform SVD on logfc
if run_svd:
  # Same thresholded SVD as for y_train; k is the zero-based index of the last retained mode
  k, U_lfc, S_lfc, VT_lfc = gd_svd(data_mat=y_logfc, cutoff="w_B_high")

# NOTE(review): k is only defined when run_svd is True
k

63

In [None]:
# Get embeddings for log FC
if run_svd:
  # Calculate embeddings as features
  # Only consumed by the optional log-FC feature augmentation (embedding_features=True)
  y_embed_logfc = U_lfc @ S_lfc

In [None]:
np.max(y_embed), np.min(y_embed), np.max(y_embed_logfc), np.min(y_embed_logfc)

(788.468073666263, -2693.3554989721106, 268.0056428553899, -415.64380997423666)

In [None]:
x_train.shape, y_embed.shape, y_embed_logfc.shape

((614, 2057), (614, 114), (614, 64))

In [None]:
# Scale the target embedding column-wise before training
# NOTE: this cell rebinds y_embed to its scaled version, so re-running it without
# re-running the embedding cell refits the scaler on already-scaled data and makes
# the later inverse_transform wrong.
y_train_scale = True
scaler_type = "minmax" # or use "standard_scaler"

if y_train_scale:
  if scaler_type == "standard_scaler":
    scaler_y_embed = StandardScaler()
  elif scaler_type == "minmax":
    # Map every embedding column onto [-15, 15]; try later -20, 20
    scaler_y_embed = MinMaxScaler(feature_range=(-15, 15))
  else:
    # Previously an unknown name silently skipped scaling and only failed later,
    # when the prediction cells called scaler_y_embed.inverse_transform.
    raise ValueError(f"Unknown scaler_type {scaler_type!r}; expected 'standard_scaler' or 'minmax'")
  y_embed = scaler_y_embed.fit_transform(y_embed)

In [None]:
np.max(y_embed), np.min(y_embed)

(15.000000000000005, -15.0)

In [None]:
# Augment features with average drug response and average cell type response to drugs
# Each enabled branch below (1) averages targets per group (compound SMILES or cell type),
# (2) left-merges the group means back onto the rows of df_train, (3) standardises them and
# (4) appends them to x_train. The group-mean frames and fitted scalers are kept as globals
# because the test-set augmentation cell merges/transforms with the same objects.
# With every flag False (as configured here) this cell leaves x_train unchanged.
# NOTE(review): each group mean includes the row's own target, so these features leak the label - confirm intended
# P-val responses
# Embedding features
# True: average the reduced embeddings (y_embed / y_embed_logfc); False: average the full-width targets
embedding_features = False

# Drug response, cell type response
drug_response = False
cell_type_response = False
# Log FC response
drug_lfc_response = False
cell_type_lfc_response = False

if drug_response:
  scaler_drug = StandardScaler()
  if embedding_features:
    # Column 3 must be 'SMILES' for the groupby below to work
    df_smiles_name = df_train.iloc[:, [3]]
    # NOTE(review): concat aligns on index; pd.DataFrame(y_embed) has a fresh 0..n-1 index, so df_train must too
    # y_embed has already been rescaled by the scaling cell at this point
    df_smiles_name = pd.concat((df_smiles_name, pd.DataFrame(y_embed)), axis=1) # Use SVD embeddings to average over
  else:
    # Normal dimension features
    df_smiles_name = df_train.iloc[:, [3] + list(range(5, df_train.shape[1]))]
  mean_smiles_name = df_smiles_name.groupby('SMILES').mean().reset_index()
  # First five columns are kept as the merge frame, so merged response columns start at position 5
  df_train_alt = df_train.iloc[:, 0:5]
  df_train_alt = df_train_alt.merge(mean_smiles_name, on='SMILES', how='left')
  mean_smiles_train = np.array(df_train_alt.iloc[:, 5:])
  # Fit and transform
  scaler_drug.fit(mean_smiles_train)
  # Concat with x_train
  x_train = np.concatenate((x_train, scaler_drug.transform(mean_smiles_train)), axis=1)

if cell_type_response:
  scaler_cell = StandardScaler()
  if embedding_features:
    # Column 0 must be 'cell_type' for the groupby below to work
    df_cell_type = df_train.iloc[:, [0]]
    df_cell_type = pd.concat((df_cell_type, pd.DataFrame(y_embed)), axis=1)
  else:
    # Normal dimension features
    df_cell_type = df_train.iloc[:, [0] + list(range(5, df_train.shape[1]))]
  mean_cell_type = df_cell_type.groupby('cell_type').mean().reset_index()
  df_train_alt = df_train.iloc[:, 0:5]
  df_train_alt = df_train_alt.merge(mean_cell_type, on='cell_type', how='left')
  mean_cell_train = np.array(df_train_alt.iloc[:, 5:])
  # Fit and transform
  scaler_cell.fit(mean_cell_train)
  # Concat with x_train
  x_train = np.concatenate((x_train, scaler_cell.transform(mean_cell_train)), axis=1)

if drug_lfc_response:
  scaler_drug_lfc = StandardScaler()
  if embedding_features:
    df_smiles_name = df_train.iloc[:, [3]]
    df_smiles_name = pd.concat((df_smiles_name, pd.DataFrame(y_embed_logfc)), axis=1)
  else:
    # Normal dimension features
    df_smiles_name = df_train.iloc[:, [3]]
    # Log-FC values come from df_logfc (columns 7 onward), aligned to df_train by index
    df_smiles_name = pd.concat((df_smiles_name, df_logfc.iloc[:, 7:]), axis=1)
  mean_smiles_lfc_name = df_smiles_name.groupby('SMILES').mean().reset_index()
  df_train_alt = df_train.iloc[:, 0:5]
  df_train_alt = df_train_alt.merge(mean_smiles_lfc_name, on='SMILES', how='left')
  mean_smiles_lfc_train = np.array(df_train_alt.iloc[:, 5:])
  # Fit and transform
  scaler_drug_lfc.fit(mean_smiles_lfc_train)
  # Concat with x_train
  x_train = np.concatenate((x_train, scaler_drug_lfc.transform(mean_smiles_lfc_train)), axis=1)

if cell_type_lfc_response:
  scaler_cell_lfc = StandardScaler()
  if embedding_features:
    df_cell_type = df_train.iloc[:, [0]]
    df_cell_type = pd.concat((df_cell_type, pd.DataFrame(y_embed_logfc)), axis=1)
  else:
    # Normal dimension features
    df_cell_type = df_train.iloc[:, [0]]
    df_cell_type = pd.concat((df_cell_type, df_logfc.iloc[:, 7:]), axis=1)
  mean_cell_lfc_type = df_cell_type.groupby('cell_type').mean().reset_index()
  df_train_alt = df_train.iloc[:, 0:5]
  df_train_alt = df_train_alt.merge(mean_cell_lfc_type, on='cell_type', how='left')
  mean_cell_lfc_train = np.array(df_train_alt.iloc[:, 5:])
  # Fit and transform
  scaler_cell_lfc.fit(mean_cell_lfc_train)
  # Concat with x_train
  x_train = np.concatenate((x_train, scaler_cell_lfc.transform(mean_cell_lfc_train)), axis=1)

In [None]:
# Keep copies of the arrays in original row order wrt df_train
# Real copies rather than aliases: the train-set evaluation at the end relies on these
# staying in df_train order, which a plain alias would lose on any in-place change
# (e.g. an in-place shuffle) to x_train / y_train / y_embed.
# Note y_embed has already been scaled at this point, so y_embed_copy is the scaled embedding.
x_train_copy = x_train.copy()
y_train_copy = y_train.copy()
y_embed_copy = y_embed.copy()

In [None]:
# Optional upsampling of the training rows according to the cell-type weights
# x_train columns 0-5 hold the cell-type one-hot in the order listed below; column 8 is the control flag
upsample = False

if upsample:
  # Same order as the one-hot columns of x_train
  cell_types = ['B cells', 'Myeloid cells', 'NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells']
  min_weight = min(list(weight_dict.values()))
  for col_idx, cell_name in enumerate(cell_types):
    # Rows of this cell type, excluding control observations (those are never resampled)
    is_cell = x_train[:, col_idx] == 1
    not_control = x_train[:, 8] == 0
    idx_list = np.where(is_cell & not_control)[0]
    base_train = x_train[idx_list, :]
    print(base_train.shape)
    base_target = y_embed[idx_list, :]
    # Extra rows to draw: (weight relative to the smallest weight, 2 decimals, minus 1) x group size
    relative_weight = np.around(weight_dict[cell_name]/min_weight, 2)
    n_sample = int((relative_weight - 1) * base_train.shape[0])
    print(n_sample)
    # Nothing to add for this cell type
    if n_sample <= 0:
      continue
    # Draw with replacement and append to x_train, y_embed
    sample_train, sample_target = resample(base_train, base_target, n_samples=n_sample, replace=True, random_state=8)
    x_train = np.concatenate((x_train, sample_train), axis=0)
    y_embed = np.concatenate((y_embed, sample_target), axis=0)

In [None]:
x_train.shape

(614, 2057)

In [None]:
# Shuffle the training rows with one shared permutation
permuted_indices = np.random.permutation(np.arange(x_train.shape[0]))
# Features and embedded targets are permuted in every configuration
x_train = x_train[permuted_indices, :]
y_embed = y_embed[permuted_indices, :]
if not upsample:
  # Without upsampling the raw targets and the sample weights still have one entry
  # per training row, so they are permuted as well to stay aligned
  y_train = y_train[permuted_indices, :]
  sample_weights = sample_weights[permuted_indices]

In [None]:
sample_weights

array([ 3.8 ,  1.  ,  3.8 ,  5.68,  4.34,  3.8 ,  4.34,  3.8 ,  3.8 ,
        3.8 ,  4.34,  1.  ,  1.  ,  4.34,  4.34,  3.8 ,  1.  ,  3.8 ,
        5.68,  4.34,  1.  ,  4.34,  3.8 ,  1.  ,  3.8 ,  5.68, 17.74,
        4.34,  5.68,  3.8 ,  4.34,  5.68,  1.  ,  3.8 ,  3.8 ,  3.8 ,
        3.8 ,  1.  ,  5.68,  4.34,  1.  ,  3.8 ,  3.8 ,  1.  ,  5.68,
        5.68,  4.34,  1.  ,  3.8 ,  1.  ,  4.34,  4.34,  3.8 ,  4.34,
        4.34,  4.34,  4.34,  3.8 ,  4.34,  3.8 ,  4.34,  5.68,  5.68,
        3.8 ,  1.  ,  1.  , 17.74,  5.68,  1.  ,  3.8 , 17.74,  1.  ,
        5.68,  3.8 , 17.74,  1.  ,  1.  ,  3.8 ,  5.68,  5.68,  3.8 ,
        3.8 ,  3.8 ,  5.68,  4.34,  3.8 ,  5.68,  4.34,  5.68,  3.8 ,
        1.  ,  1.  ,  1.  ,  3.8 ,  1.  ,  4.34,  4.34,  3.8 ,  5.68,
        1.  , 17.74,  3.8 ,  1.  ,  3.8 ,  3.8 ,  5.68,  1.  , 17.74,
        3.8 ,  1.  ,  5.68,  5.68,  3.8 ,  3.8 ,  1.  ,  4.34,  3.8 ,
        4.34,  5.68,  1.  ,  5.68,  1.  ,  4.34, 17.74, 17.74,  1.  ,
        1.  ,  3.8 ,

In [None]:
x_train.shape, y_train.shape, y_embed.shape

((614, 2057), (614, 18211), (614, 114))

In [None]:
# Model
# Architecture selection: enable one. If both are True the SELU stack, built last, is the one used.
skip_connections = False
selu_stack = True

if not (skip_connections or selu_stack):
  raise ValueError("No architecture selected: set skip_connections or selu_stack to True")
if not (run_svd or run_autoencoder):
  raise ValueError("No target embedding selected: set run_svd or run_autoencoder to True")

input_size = x_train.shape[1]


def _trunk_dense(units, activation="selu"):
  """Dense layer with the LeCun-normal kernel / zero bias initialisation used in the trunk."""
  return layers.Dense(units, activation=activation, kernel_initializer="lecun_normal", bias_initializer="zeros")


def _embedding_head(features):
  """Linear output layer sized to the target embedding (q + 1 SVD modes or the autoencoder code).

  When both run_svd and run_autoencoder are True the autoencoder-sized head is returned.
  """
  if run_svd:
    head = layers.Dense((q+1), kernel_initializer="glorot_normal")(features)
  if run_autoencoder:
    head = layers.Dense(encoding_dim, kernel_initializer="glorot_normal")(features)
  return head


# Skip connections model
if skip_connections:
  inputs_reg = layers.Input((input_size,))
  x_1 = _trunk_dense(3072)(inputs_reg)
  x_2 = _trunk_dense(3072, activation=None)(x_1)
  # Concatenating the raw input with the 3072-wide projection fixes the residual stream width
  concat_1 = layers.Concatenate(axis=1)([inputs_reg, x_2])
  residual_width = input_size + 3072
  selu_1 = layers.Activation(keras.activations.selu)(concat_1)
  # Each residual block: SELU dense, then a linear dense added back onto the stream.
  # The linear layer must emit residual_width units: Add() needs identical shapes, and a
  # hard-coded 5128 only matches when the input has exactly 2056 features (x_train has 2057).
  x_3 = _trunk_dense(5128)(selu_1)
  x_4 = _trunk_dense(residual_width, activation=None)(x_3)
  add_1 = layers.Add()([selu_1, x_4])
  selu_2 = layers.Activation(keras.activations.selu)(add_1)
  x_5 = _trunk_dense(5128)(selu_2)
  x_6 = _trunk_dense(residual_width, activation=None)(x_5)
  add_2 = layers.Add()([selu_2, x_6])
  selu_3 = layers.Activation(keras.activations.selu)(add_2)
  x_7 = _trunk_dense(5128)(selu_3)
  x_8 = _trunk_dense(residual_width, activation=None)(x_7)
  add_3 = layers.Add()([selu_3, x_8])
  selu_4 = layers.Activation(keras.activations.selu)(add_3)
  x_9 = _trunk_dense(5128)(selu_4)
  output_reg = _embedding_head(x_9)

# SELU stack: eight 5128-unit SELU layers followed by the linear embedding head
if selu_stack:
  inputs_reg = layers.Input((input_size,))
  x_1 = _trunk_dense(5128)(inputs_reg)
  x_2 = _trunk_dense(5128)(x_1)
  x_3 = _trunk_dense(5128)(x_2)
  x_4 = _trunk_dense(5128)(x_3)
  x_5 = _trunk_dense(5128)(x_4)
  x_6 = _trunk_dense(5128)(x_5)
  x_7 = _trunk_dense(5128)(x_6)
  x_8 = _trunk_dense(5128)(x_7)
  output_reg = _embedding_head(x_8)

# Define model
model = keras.Model(inputs=inputs_reg, outputs=output_reg)

In [None]:
# Define losses
# Candidate Keras losses; the compile cell picks one of them (currently mae, which is also logged as a metric)
mae = keras.losses.MeanAbsoluteError()
mse = keras.losses.MeanSquaredError()
huber = keras.losses.Huber()

In [None]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2057)]            0         
                                                                 
 dense (Dense)               (None, 5128)              10553424  
                                                                 
 dense_1 (Dense)             (None, 5128)              26301512  
                                                                 
 dense_2 (Dense)             (None, 5128)              26301512  
                                                                 
 dense_3 (Dense)             (None, 5128)              26301512  
                                                                 
 dense_4 (Dense)             (None, 5128)              26301512  
                                                                 
 dense_5 (Dense)             (None, 5128)              263015

In [None]:
# Stochastic weight averaging
# SWA lr 5e-5 for constant
# SWA callback from the keras-swa package (swa.tfkeras)
# NOTE(review): presumably averaging begins at start_epoch and 'manual' leaves the learning rate to the optimizer's own schedule - confirm in keras-swa docs
start_epoch = 601
swa = SWA(start_epoch=start_epoch,
          lr_schedule='manual',
          verbose=1)

# Optimizer
# Best scores used learning rate 7e-5, 5e-5, 3.5e-5
learning_rate = 7e-5

# Cosine rate scheduler
# Every 7800 steps, 200 epochs. Cosine Decay changes LR each batch (step)
# Every 3900 steps, 100 epochs
# The step counts above assume 39 steps per epoch (614 rows at batch size 16); they no longer
# correspond to 200/100 epochs if the row count (e.g. upsampling) or batch size changes.
# t_mul=1.0 keeps every restart period the same length; m_mul=0.9 scales the initial rate at each restart
cos_sched = keras.optimizers.schedules.CosineDecayRestarts(initial_learning_rate=learning_rate, first_decay_steps=7800, t_mul=1.0, m_mul=0.9, alpha=0.01)

# Optimisers
# Try higher weight decay for larger models
# lambda = lambda(norm) * sqrt(batch_size / (n * t)), where n is number of training points and t is number of epochs
# Where cosine warm restarts is used t, is the number of epochs in a restart period. For long trains use t as the epochs in the last restart
# Try lambda(norm) between 0.025 to 0.05
weight_decay = 5e-4 # Try 2.8e-4 for lambda_norm = 0.025, and 5.6e-4 for lambda_norm = 0.05 and 4.2e-4 for lambda_norm = 0.0375

# All three optimizers share the same schedule object; only opt_adamW is passed to model.compile
opt_adam = keras.optimizers.Adam(learning_rate=cos_sched)

# weight_decay above is NOT passed here, so AdamW runs with the library's default decay
opt_adamW = keras.optimizers.AdamW(learning_rate=cos_sched) # Weight decay not set

opt_adaMax = keras.optimizers.Adamax(learning_rate=cos_sched, weight_decay=weight_decay)

In [None]:
# Compile model
# MAE on the (scaled) embedding is the training loss; rwrmse comes from the local custom_loss module
model.compile(loss=mae, optimizer=opt_adamW, metrics=[mae, rwrmse])

# Train model # no sample weights
# The sample_weights computed earlier are deliberately not passed to fit.
# Targets are the reduced embedding y_embed, not the full-width y_train.
model.fit(x=x_train, y=y_embed, epochs=800, batch_size=16, callbacks=[swa], shuffle=True)

Epoch 1/800
 6/39 [===>..........................] - ETA: 2s - loss: 2.4398 - mean_absolute_error: 2.4398 - rwrmse: 3.1394



Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78/800
Epoch 7

<keras.src.callbacks.History at 0x7bad23fcfac0>

In [None]:
# Load in submission set
df_id = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Input/singlecell/id_map_submission.csv')

# Convert to Morgan fingerprints (same radius / bit length as the training features)
# Iterate over the values positionally rather than by index label.
# NOTE: this rebinds df_X, which held the training fingerprints until now.
test_smiles = df_id["SMILES"].to_numpy()
df_X = np.zeros([df_id.shape[0], 2048])
for row, smiles in enumerate(test_smiles):
  mol = Chem.MolFromSmiles(smiles)
  if mol is None:
    # RDKit returns None for a SMILES it cannot parse
    raise ValueError(f"Could not parse SMILES at row {row}: {smiles!r}")
  df_X[row, :] = np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048))

# One hot encode the cell types
# `cells` fixes the column order to match the training one-hot.
n_cell_types = 6
cells = ["B cells", "Myeloid cells", "NK cells", "T cells CD4+", "T cells CD8+", "T regulatory cells"]

# Fill in one_hot
# Reindexing onto `cells` fills every cell type present in the submission set and zero-fills
# the absent ones. Copying only the "B cells" / "Myeloid cells" columns raised KeyError when
# one was missing and silently encoded any other cell type as all zeros.
one_hot_test = pd.get_dummies(df_id["cell_type"]) * 1.0
unexpected_cells = sorted(set(one_hot_test.columns) - set(cells))
if unexpected_cells:
  raise ValueError(f"Cell types not seen in training: {unexpected_cells}")
one_hot = one_hot_test.reindex(columns=cells, fill_value=0.0).to_numpy(dtype=float)

# Control and dose for test
# Control (1 - control, 0 - non-control)
# No control compounds in test, all values should be 0
test_control = np.zeros([df_id.shape[0], 1])

# Dose (1 uM for all compounds). All compounds in train were 1 uM except for Belinostat
test_dose = np.ones([df_id.shape[0], 1])

# Log10 transform n_atoms, molecular weight, and molar refractivity
# (df_id is re-read at the top of this cell, so the transform is not applied twice on re-run)
df_id["n_atoms"] = df_id["n_atoms"].map(np.log10)
df_id["mol_weight"] = df_id["mol_weight"].map(np.log10)
df_id["MR"] = df_id["MR"].map(np.log10)

# Merge to construct test matrix
# Same column layout as x_train; `scaler` is the one fitted on the training chemistry
one_hot = np.concatenate((one_hot, scaler.transform(df_id[["log_P", "MR"]]), test_control), axis=1)
x_test = np.concatenate((one_hot, df_X), axis=1)

del one_hot, one_hot_test

In [None]:
# Augment features with average drug response and average cell type response to drugs for test
# Mirrors the training augmentation: the group means and scalers fitted on the training data are
# reused here, and the same flags decide which blocks are appended (none with the current flags).
# NOTE(review): assumes the first five columns of df_id contain the merge key ('SMILES' / 'cell_type') - confirm
# NOTE(review): a left merge yields NaN features for any group not present in training

if drug_response:
  df_id_alt = df_id.iloc[:, 0:5]
  df_id_alt = df_id_alt.merge(mean_smiles_name, on='SMILES', how='left')
  mean_smiles_test = np.array(df_id_alt.iloc[:, 5:])
  # Concat with x_test
  x_test = np.concatenate((x_test, scaler_drug.transform(mean_smiles_test)), axis=1)

if cell_type_response:
  df_id_alt = df_id.iloc[:, 0:5]
  df_id_alt = df_id_alt.merge(mean_cell_type, on='cell_type', how='left')
  mean_cell_test = np.array(df_id_alt.iloc[:, 5:])
  # Concat with x_test
  x_test = np.concatenate((x_test, scaler_cell.transform(mean_cell_test)), axis=1)

if drug_lfc_response:
  df_id_alt = df_id.iloc[:, 0:5]
  df_id_alt = df_id_alt.merge(mean_smiles_lfc_name, on='SMILES', how='left')
  mean_smiles_lfc_test = np.array(df_id_alt.iloc[:, 5:])
  # Concat with x_test
  x_test = np.concatenate((x_test, scaler_drug_lfc.transform(mean_smiles_lfc_test)), axis=1)

if cell_type_lfc_response:
  df_id_alt = df_id.iloc[:, 0:5]
  df_id_alt = df_id_alt.merge(mean_cell_lfc_type, on='cell_type', how='left')
  mean_cell_lfc_test = np.array(df_id_alt.iloc[:, 5:])
  # Concat with x_test
  x_test = np.concatenate((x_test, scaler_cell_lfc.transform(mean_cell_lfc_test)), axis=1)

In [None]:
x_test.shape

(255, 2057)

In [None]:
# Predict the (possibly scaled) target embedding for the submission rows
output_emb = model.predict(x_test)

# Decode back to the full target space. Per decoder: undo the embedding scaling first
# when it was applied, then map through the SVD basis / autoencoder decoder.
# If both decoders are enabled the autoencoder result, computed last, is kept.
if run_svd:
  output_high = (scaler_y_embed.inverse_transform(output_emb) if y_train_scale else output_emb) @ VT
if run_autoencoder:
  output_high = decoder.predict(scaler_y_embed.inverse_transform(output_emb) if y_train_scale else output_emb)

# Read in submission file and overwrite every column after the first with the predictions
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Input/singlecell/sample_submission.csv')
submission.iloc[:, 1:] = output_high



In [None]:
np.max(output_emb), np.min(output_emb)

(15.223473, -13.507002)

In [None]:
np.max(output_high), np.min(output_high)

(65.74404806279857, -42.696321795331656)

In [None]:
# Save as submission
# Generate truncated SHA256 hash
# The hash is taken over 23 random bytes, so it is a random run identifier (new on every execution),
# not a digest of the submission content. The same value tags the saved train predictions and model.
truncated_hash = hashlib.sha256(os.urandom(23)).hexdigest()[:23]

submission_path = '/content/drive/MyDrive/Colab Notebooks/Output/ecfp_reg_8lselu_stack_v289_' + truncated_hash + '.csv'
submission.to_csv(submission_path, index=False)

# Print hash
print(truncated_hash)

52f7b670061d957ae6c6cee


In [None]:
# Calculate rwrmse on the original (full-width) y_train
# x_train_copy / y_train_copy are the arrays captured before shuffling, so rows line up.

if run_svd:
  svd_emb = model.predict(x_train_copy)
  if y_train_scale:
    # Undo the embedding scaling before projecting back through the SVD basis
    svd_emb = scaler_y_embed.inverse_transform(svd_emb)
  y_train_pred = svd_emb @ VT
if run_autoencoder:
  ae_emb = model.predict(x_train_copy)
  if y_train_scale:
    ae_emb = scaler_y_embed.inverse_transform(ae_emb)
  # Overrides the SVD reconstruction when both decoders are enabled
  y_train_pred = decoder.predict(ae_emb)

train_loss = K.eval(rwrmse(y_train_copy, y_train_pred))
train_loss



0.5746691873989737

In [None]:
# Save predictions on train
# File name carries the same run identifier (truncated_hash) as the submission file
filepath = '/content/drive/MyDrive/Colab Notebooks/Train Predictions/train_predictions_' + 'v289_' + truncated_hash + '.csv'
# Rebinds y_train_pred from an array to a DataFrame (default integer column names)
y_train_pred = pd.DataFrame(y_train_pred)

# Save as csv
y_train_pred.to_csv(filepath, index=False)

In [None]:
# Save model
# Use .keras extension to save whole model
# `filepath` is reused from the previous cell and now points at the model file
filepath = '/content/drive/MyDrive/Colab Notebooks/Models/ecfp_reg_8lselu_stack_v289_' + truncated_hash + '.keras'
model.save(filepath)