In [8]:
import sys
sys.path.append("../utils")
from utils import load_data,load_encoded_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

# Load the data non encoded
non_encoded_train, test, targets = load_data(raw=False)
ae, vae, vae2 = load_encoded_data()

# Split the data

#print nb of samples in ae_train and targets

nb_cell_lines = 742

ae_train = ae[:nb_cell_lines]
ae_test = ae[nb_cell_lines:]

vae_train = vae[:nb_cell_lines]
vae_test = vae[nb_cell_lines:]

vae2_train = vae2[:nb_cell_lines]
vae2_test = vae2[nb_cell_lines:]

X_train, X_val, y_train, y_val = train_test_split(non_encoded_train, targets, test_size=0.2, random_state=42)
X_train_ae, X_val_ae, y_train_ae, y_val_ae = train_test_split(ae_train, targets, test_size=0.2, random_state=42)
X_train_vae, X_val_vae, y_train_vae, y_val_vae = train_test_split(vae_train, targets, test_size=0.2, random_state=42)
X_train_vae2, X_val_vae2, y_train_vae2, y_val_vae2 = train_test_split(vae2_train, targets, test_size=0.2, random_state=42)


# Run random forest for each dataset and print the results in the function

from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

def run_random_forest(X_train, X_val, y_train, y_val):
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    spearman = spearmanr(y_val, y_pred)
    print(f'MSE: {mse}')
    print(f'R2: {r2}')
    print(f'Spearman: {spearman.correlation}')

def run_mlp(X_train, X_val, y_train, y_val):
    mlp = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000)
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    spearman = spearmanr(y_val, y_pred)
    print(f'MSE: {mse}')
    print(f'R2: {r2}')
    print(f'Spearman: {spearman.correlation}')


print("MLP")

print("Non encoded data")
run_mlp(X_train, X_val, y_train, y_val)
print("Autoencoder data")
run_mlp(X_train_ae, X_val_ae, y_train_ae, y_val_ae)
print("VAE data")
run_mlp(X_train_vae, X_val_vae, y_train_vae, y_val_vae)
print("VAE2 data")
run_mlp(X_train_vae2, X_val_vae2, y_train_vae2, y_val_vae2)



print("Random Forest")

print("Non encoded data")
run_random_forest(X_train, X_val, y_train, y_val)
print("Autoencoder data")
run_random_forest(X_train_ae, X_val_ae, y_train_ae, y_val_ae)
print("VAE data")
run_random_forest(X_train_vae, X_val_vae, y_train_vae, y_val_vae)
print("VAE2 data")
run_random_forest(X_train_vae2, X_val_vae2, y_train_vae2, y_val_vae2)


MLP
Non encoded data


  y = column_or_1d(y, warn=True)


MSE: 0.9147519440552313
R2: -121.19372137777923
Spearman: -0.012169428776922089
Autoencoder data


  y = column_or_1d(y, warn=True)


MSE: 0.09869412060268658
R2: -12.183685427424981
Spearman: 0.07818254868727022
VAE data
MSE: 0.006270682837870606
R2: 0.16235425732758157
Spearman: 0.4247509747425788
VAE2 data


  y = column_or_1d(y, warn=True)


ValueError: could not convert string to float: 'example 482\t-0.82308227\t-0.88209003\t1.6382465\t0.9128771\t0.13723393\t-1.3976676\t0.19273871\t1.2797318\t0.79047215\t-0.5997713\t2.4254954\t1.5838013\t-2.7735686\t-0.20764787\t1.8245893\t-0.10428563\t0.7883158\t-0.3139167\t-0.17539942\t-1.1478906\t-0.6052814\t-0.6064377\t2.1537273\t0.74347013\t0.92024684\t-0.39627057\t-0.7390394\t1.1788504\t-0.4108428\t0.6120739\t0.52332366\t0.2947685\t-1.036254\t0.47882405\t-0.3193612\t0.14832313\t-1.8681147\t1.4814999\t-1.0328437\t2.4115186\t2.6454704\t0.1822631\t0.34603995\t-1.9276468\t1.4746034\t0.83310556\t-0.11274527\t0.2263906\t-0.914847\t0.4762783\t-0.5343865\t-0.06768807\t-2.0514452\t0.37138045\t-3.2663403\t1.4479642\t-0.8592391\t0.16163881\t-0.8466633\t-1.8196385\t-0.41543257\t0.67449903\t-0.49611935\t0.5833167\t-1.1633252\t-0.57620376\t-1.6166193\t0.69094247\t-1.0430403\t0.35437146\t0.5263888\t-1.7746055\t-0.7220408\t-0.7268254\t1.5354068\t-1.4442505\t-1.0967299\t-1.6390498\t0.9482245\t-0.010463476\t-0.52168274\t0.39829358\t-1.2169354\t-0.406551\t-0.35719872\t0.8952993\t-0.44348773\t-1.5884229\t0.24521112\t2.560321\t1.6097265\t0.19613402\t0.5289663\t-0.5410305\t0.3367491\t-0.46257773\t0.66598976\t-1.9277067\t-1.6931771\t1.4915867'

In [None]:
# Train the model with MLP

# Train the model with the non encoded data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
spearman = spearmanr(y_test, y_pred)
print("Non encoded data")
print("MSE: ", mse)
print("R2: ", r2)
print("Spearman: ", spearman)
