In [None]:
import sys
sys.path.append("../utils")
from utils import load_data,load_encoded_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

# Load the non-encoded features/targets and the three encoded feature sets.
# NOTE(review): the exact return types of load_data / load_encoded_data are
# defined in ../utils and are not visible here — presumably pandas objects,
# since the model helpers below call `.values` on the targets; confirm.
non_encoded_train, test, targets = load_data(raw=False)
ae, vae, vae2 = load_encoded_data()

# Number of leading rows of each encoded matrix that belong to the training
# set. Derived from the targets instead of a hard-coded 742: the splits below
# already require the training slice and `targets` to have the same length,
# so this stays correct when the dataset changes.
nb_cell_lines = len(targets)

# Encoded matrices hold the training rows first, followed by the test rows.
ae_train, ae_test = ae[:nb_cell_lines], ae[nb_cell_lines:]
vae_train, vae_test = vae[:nb_cell_lines], vae[nb_cell_lines:]
vae2_train, vae2_test = vae2[:nb_cell_lines], vae2[nb_cell_lines:]

# Same test_size and random_state for every representation so that all four
# train/validation splits select the same rows and results are comparable.
X_train, X_val, y_train, y_val = train_test_split(non_encoded_train, targets, test_size=0.2, random_state=42)
X_train_ae, X_val_ae, y_train_ae, y_val_ae = train_test_split(ae_train, targets, test_size=0.2, random_state=42)
X_train_vae, X_val_vae, y_train_vae, y_val_vae = train_test_split(vae_train, targets, test_size=0.2, random_state=42)
X_train_vae2, X_val_vae2, y_train_vae2, y_val_vae2 = train_test_split(vae2_train, targets, test_size=0.2, random_state=42)


from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

def run_random_forest(X_train, X_val, y_train, y_val, n_jobs=None):
    """Tune a random forest regressor by grid search and report validation metrics.

    A 3-fold cross-validated grid search (scored by negative MSE) is run on
    the training split; the refitted best estimator is then evaluated on the
    validation split. The best parameters, MSE, R2 and Spearman correlation
    are printed and also returned.

    Args:
        X_train: Training features.
        X_val: Validation features.
        y_train: Training targets (single column; DataFrame, Series or array).
        y_val: Validation targets, same layout as ``y_train``.
        n_jobs: Forwarded to ``GridSearchCV`` to parallelise the search.
            ``None`` (default) keeps the original serial behaviour.

    Returns:
        dict with keys ``'best_params'``, ``'mse'``, ``'r2'`` and ``'spearman'``.
    """
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    # np.asarray(...).ravel() flattens DataFrames, Series and plain arrays
    # alike, whereas `.values` only exists on pandas objects.
    y_train_flat = np.asarray(y_train).ravel()
    y_val_flat = np.asarray(y_val).ravel()

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(
        rf, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=n_jobs
    )
    grid_search.fit(X_train, y_train_flat)

    y_pred = grid_search.best_estimator_.predict(X_val)
    mse = mean_squared_error(y_val_flat, y_pred)
    r2 = r2_score(y_val_flat, y_pred)
    spearman = spearmanr(y_val_flat, y_pred).correlation

    print(f'Best Params: {grid_search.best_params_}')
    print(f'MSE: {mse}')
    print(f'R2: {r2}')
    print(f'Spearman: {spearman}')

    # Returned as well as printed so callers can compare runs programmatically.
    return {
        'best_params': grid_search.best_params_,
        'mse': mse,
        'r2': r2,
        'spearman': spearman,
    }

def run_mlp(X_train, X_val, y_train, y_val, n_jobs=None, scale_features=True):
    """Tune an MLP regressor by grid search and report validation metrics.

    A 3-fold cross-validated grid search (scored by negative MSE) is run on
    the training split; the refitted best model is then evaluated on the
    validation split. The best parameters, MSE, R2 and Spearman correlation
    are printed and also returned.

    Args:
        X_train: Training features.
        X_val: Validation features.
        y_train: Training targets (single column; DataFrame, Series or array).
        y_val: Validation targets, same layout as ``y_train``.
        n_jobs: Forwarded to ``GridSearchCV`` to parallelise the search.
            ``None`` (default) keeps the search serial.
        scale_features: When True (default) the features are standardised
            before reaching the network. Pass False to train on the raw
            features.

    Returns:
        dict with keys ``'best_params'``, ``'mse'``, ``'r2'`` and ``'spearman'``.
    """
    # Imported locally so this function does not rely on changes to the
    # file-level import section.
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # MLPs are sensitive to feature scale. Putting the scaler inside the
    # pipeline means it is re-fitted on each CV training fold, so no
    # statistics leak from the held-out fold or the validation split.
    steps = [('mlp', MLPRegressor(random_state=42))]
    if scale_features:
        steps.insert(0, ('scaler', StandardScaler()))
    model = Pipeline(steps)

    param_grid = {
        'mlp__hidden_layer_sizes': [(50, 50), (100, 100), (100, 100, 100)],
        'mlp__max_iter': [500, 1000],
        'mlp__alpha': [0.0001, 0.001, 0.01]
    }
    # np.asarray(...).ravel() flattens DataFrames, Series and plain arrays
    # alike, whereas `.values` only exists on pandas objects.
    y_train_flat = np.asarray(y_train).ravel()
    y_val_flat = np.asarray(y_val).ravel()

    grid_search = GridSearchCV(
        model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=n_jobs
    )
    grid_search.fit(X_train, y_train_flat)

    y_pred = grid_search.best_estimator_.predict(X_val)
    mse = mean_squared_error(y_val_flat, y_pred)
    r2 = r2_score(y_val_flat, y_pred)
    spearman = spearmanr(y_val_flat, y_pred).correlation

    # Drop the 'mlp__' pipeline prefix so the report shows plain MLP
    # parameter names.
    best_params = {
        name.split('__', 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    print(f'Best Params: {best_params}')
    print(f'MSE: {mse}')
    print(f'R2: {r2}')
    print(f'Spearman: {spearman}')

    # Returned as well as printed so callers can compare runs programmatically.
    return {
        'best_params': best_params,
        'mse': mse,
        'r2': r2,
        'spearman': spearman,
    }

def run_linear_regression(X_train, X_val, y_train, y_val, n_jobs=None):
    """Fit a linear regression (with/without intercept) and report validation metrics.

    A 3-fold cross-validated grid search (scored by negative MSE) chooses
    between fitting and not fitting an intercept on the training split; the
    refitted best estimator is then evaluated on the validation split. The
    best parameters, MSE, R2 and Spearman correlation are printed and also
    returned.

    Args:
        X_train: Training features.
        X_val: Validation features.
        y_train: Training targets (single column; DataFrame, Series or array).
        y_val: Validation targets, same layout as ``y_train``.
        n_jobs: Forwarded to ``GridSearchCV`` to parallelise the search.
            ``None`` (default) keeps the original serial behaviour.

    Returns:
        dict with keys ``'best_params'``, ``'mse'``, ``'r2'`` and ``'spearman'``.
    """
    param_grid = {
        'fit_intercept': [True, False]
    }
    # np.asarray(...).ravel() flattens DataFrames, Series and plain arrays
    # alike, whereas `.values` only exists on pandas objects.
    y_train_flat = np.asarray(y_train).ravel()
    y_val_flat = np.asarray(y_val).ravel()

    lr = LinearRegression()
    grid_search = GridSearchCV(
        lr, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=n_jobs
    )
    grid_search.fit(X_train, y_train_flat)

    y_pred = grid_search.best_estimator_.predict(X_val)
    mse = mean_squared_error(y_val_flat, y_pred)
    r2 = r2_score(y_val_flat, y_pred)
    spearman = spearmanr(y_val_flat, y_pred).correlation

    print(f'Best Params: {grid_search.best_params_}')
    print(f'MSE: {mse}')
    print(f'R2: {r2}')
    print(f'Spearman: {spearman}')

    # Returned as well as printed so callers can compare runs programmatically.
    return {
        'best_params': grid_search.best_params_,
        'mse': mse,
        'r2': r2,
        'spearman': spearman,
    }


# Each feature representation, labelled as it appears in the report and
# paired with its (X_train, X_val, y_train, y_val) split.
feature_splits = (
    ("Non encoded data", (X_train, X_val, y_train, y_val)),
    ("Autoencoder data", (X_train_ae, X_val_ae, y_train_ae, y_val_ae)),
    ("VAE data", (X_train_vae, X_val_vae, y_train_vae, y_val_vae)),
    ("VAE2 data", (X_train_vae2, X_val_vae2, y_train_vae2, y_val_vae2)),
)

# Model families in benchmark order; every model is evaluated on every
# feature representation.
model_runners = (
    ("Linear Regression", run_linear_regression),
    ("MLP", run_mlp),
    ("Random Forest", run_random_forest),
)

for model_title, evaluate in model_runners:
    print(model_title)
    for data_title, split in feature_splits:
        print(data_title)
        evaluate(*split)



Linear Regression
Non encoded data
Best Params: {'fit_intercept': True}
MSE: 0.006525344875489468
R2: 0.1283361803259132
Spearman: 0.40826265940762246
Autoencoder data
Best Params: {'fit_intercept': True}
MSE: 0.005851105634172096
R2: 0.21840191074706194
Spearman: 0.42051734130013785
VAE data
Best Params: {'fit_intercept': True}
MSE: 0.006534009840111856
R2: 0.12717870339492943
Spearman: 0.38609140766513944
VAE2 data
Best Params: {'fit_intercept': False}
MSE: 0.007579821792573711
R2: -0.012522179629331553
Spearman: 0.2907185658613257
MLP
Non encoded data
Best Params: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 100, 100), 'max_iter': 500}
MSE: 0.1328313406997955
R2: -16.74377845402826
Spearman: 0.10747878421806001
Autoencoder data
Best Params: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 100, 100), 'max_iter': 500}
MSE: 0.038212102822983776
R2: -4.104421013757153
Spearman: 0.09213503730734544
VAE data
Best Params: {'alpha': 0.01, 'hidden_layer_sizes': (100, 100, 100), 'max_iter': 500

KeyboardInterrupt: 