In [3]:
import sys
sys.path.append("../utils")
from utils import load_data
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
# Load the data
data_train, data_test , targets_train= load_data(raw=False)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data_train, targets_train, test_size=0.2, random_state=42)

# Train a Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on validation data
y_val_pred = random_forest.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Bureau\\ml-project-2-lol-ml\\data\\preprocessed\\train.csv'

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import sys
sys.path.append("../utils")
from utils import load_data

# Load the data
data_train, data_test , targets_train= load_data(raw=False)

# Split the data into training and validation sets
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data_train, targets_train, test_size=0.2, random_state=42)

# Define the parameter grid with more options for a thorough search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  # Wider range for number of trees
    'max_depth': [5, 10, 15, 20, 30, None],     # Include None for unrestricted depth
    'min_samples_split': [2, 5, 10],            # Minimum samples for a split
    'min_samples_leaf': [1, 2, 4, 8],           # Leaf node regularization
    'max_features': ['sqrt', 'log2']            # Avoid 'auto' (deprecated in sklearn)
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,                              # Increase cross-validation folds
    scoring='neg_mean_squared_error',  # Try alternative scoring metrics
    verbose=2,
    n_jobs=-1
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Extract best parameters and evaluate
best_model = grid_search.best_estimator_

# Validation set evaluation
y_val_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits


KeyboardInterrupt: 

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

import sys
sys.path.append("../utils")
from utils import load_data

# Load the data
data_train, data_test , targets_train= load_data(raw= False)


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an initial Random Forest model to get feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances and select the top 500 features
importances = rf.feature_importances_
important_indices = np.argsort(importances)[::-1][:500]  # Top 500 important features
X_train_reduced = X_train.iloc[:, important_indices]
X_val_reduced = X_val.iloc[:, important_indices]

# Train a new Random Forest model on the reduced feature set
rf_reduced = RandomForestRegressor(n_estimators=300, random_state=42)
rf_reduced.fit(X_train_reduced, y_train)

# Predict and evaluate
y_val_pred = rf_reduced.predict(X_val_reduced)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Spearman's Correlation (ρ):", spearman_non_encoded)
print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)

Spearman's Correlation (ρ): 0.38886848255818873
Validation Mean Squared Error (MSE): 0.005764102991137482
Validation R² Score: 0.2300238338172358


In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import sys
sys.path.append("../utils")
from utils import load_data

# Load the data
data_train, data_test , targets_train= load_data(True)

# Preprocess the data
# Drop the "Unnamed: 0" column (cell line identifiers) as it's not a feature
X = data_train.drop(columns=["Unnamed: 0"])
y = targets_train["AAC"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost model
xgb_model = XGBRegressor(
    n_estimators=100,       # Number of trees
    learning_rate=0.05,     # Learning rate
    max_depth=6,            # Maximum depth of each tree
    random_state=42,
    n_jobs=-1               # Use all available CPU cores
)
xgb_model.fit(X_train, y_train)

# Predict on validation data
y_val_pred = xgb_model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)





KeyboardInterrupt: 

In [None]:
#use MLP to predict the AAC
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import sys

sys.path.append("../utils")

from utils import load_data

# Load the data
data_train, data_test , targets_train= load_data(False)

# Preprocess the data
# Drop the "Unnamed: 0" column (cell line identifiers) as it's not a feature

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an MLP model
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # Two hidden layers with 100 and 50 neurons
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predict on validation data

y_val_pred = mlp_model.predict(X_val)

# Evaluate the model

mse = mean_squared_error(y_val, y_val_pred)

r2 = r2_score(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)


NameError: name 'X' is not defined

In [None]:
##use VAE to predict the AAC

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.dataset import random_split
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import default_collate
from torch.utils.data.sampler import BatchSampler
import sys
sys.path.append("../utils")
from utils import load_data

# Load the data
data_train, data_test , targets_train= load_data(False)

#split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(data_train, targets_train, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

# Create a DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)