In [3]:
import sys
sys.path.append("../utils")
from utils import load_data
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# --- Baseline: Random Forest regression on the AAC target ---------------------
# Load the train/test feature tables and the training targets.
# NOTE(review): the meaning of the positional `True` flag is defined in the
# project's `utils` module and is not visible here — confirm before changing it.
data_train, data_test, targets_train = load_data(True)

# Preprocess the data
# Drop the "Unnamed: 0" column (cell line identifiers) as it's not a feature
X = data_train.drop(columns=["Unnamed: 0"])
y = targets_train["AAC"]

# Hold out 20% of the labelled rows for validation (fixed seed => reproducible split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on validation data
y_val_pred = random_forest.predict(X_val)

# Evaluate the model on the held-out validation rows
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)

# Predict on test data (same identifier column removed as for training)
X_test = data_test.drop(columns=["Unnamed: 0"])
y_test_pred = random_forest.predict(X_test)

# Save predictions.
# The recorded output of this cell reports that the predictions were written to
# 'test_predictions.csv'; previously `y_test_pred` was computed and then dropped.
# The layout mirrors the XGBoost cell (single "AAC_Predicted" column, no index).
test_predictions = pd.DataFrame({"AAC_Predicted": y_test_pred})
test_predictions.to_csv("test_predictions.csv", index=False)

print("Test predictions saved to 'test_predictions.csv'.")


Validation Mean Squared Error (MSE): 0.0055733822286191115
Validation R² Score: 0.25550055443814357
Test predictions saved to 'test_predictions.csv'.


In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import sys
sys.path.append("../utils")
from utils import load_data

# --- Random Forest: exhaustive hyper-parameter search -------------------------
# Fetch the train/test feature tables and the training targets.
data_train, data_test, targets_train = load_data(True)

# Features: every column except the "Unnamed: 0" identifier column. Target: AAC.
X = data_train.drop(columns="Unnamed: 0")
y = targets_train["AAC"]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Candidate values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],   # number of trees
    max_depth=[5, 10, 15, 20, 30, None],      # None leaves tree depth unrestricted
    min_samples_split=[2, 5, 10],             # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4, 8],            # minimum samples required in a leaf
    max_features=['sqrt', 'log2'],            # features considered per split
)

# 5-fold cross-validated search, scored by negated MSE, using every CPU core.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only; validation rows stay unseen.
grid_search.fit(X_train, y_train)

# Report the winning configuration and keep the refitted estimator.
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Score the selected model on the held-out validation split.
y_val_pred = best_model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)

# Predictions for the unlabelled test table (identifier column removed first).
X_test = data_test.drop(columns="Unnamed: 0")
y_test_pred = best_model.predict(X_test)

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

import sys
sys.path.append("../utils")
from utils import load_data

# --- Random Forest on the 500 most important features -------------------------
# Fetch the train/test feature tables and the training targets.
data_train, data_test, targets_train = load_data(True)

# Separate the features (all columns but "Unnamed: 0") from the AAC target.
X = data_train.drop(columns="Unnamed: 0")
y = targets_train["AAC"]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Stage 1: fit a forest on all features purely to rank them by importance.
# The ranking is computed from the training split only.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Column positions sorted by descending importance, truncated to the top 500.
importances = rf.feature_importances_
important_indices = np.argsort(importances)[::-1][:500]

# Apply the same column selection to both splits.
X_train_reduced, X_val_reduced = (
    frame.iloc[:, important_indices] for frame in (X_train, X_val)
)

# Stage 2: fit a larger forest on the reduced feature set.
rf_reduced = RandomForestRegressor(random_state=42, n_estimators=300).fit(
    X_train_reduced, y_train
)

# Score the reduced model on the held-out validation split.
y_val_pred = rf_reduced.predict(X_val_reduced)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)

Validation Mean Squared Error (MSE): 0.005764102991137482
Validation R² Score: 0.2300238338172358


In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import sys
sys.path.append("../utils")
from utils import load_data

# --- Gradient-boosted trees (XGBoost) on the AAC target -----------------------
# Fetch the train/test feature tables and the training targets.
data_train, data_test, targets_train = load_data(True)

# Features: every column except the "Unnamed: 0" identifier column. Target: AAC.
X = data_train.drop(columns="Unnamed: 0")
y = targets_train["AAC"]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Configure the booster: 100 trees of depth <= 6, learning rate 0.05,
# fixed seed, all CPU cores.
xgb_model = XGBRegressor(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the model on the held-out validation split.
y_val_pred = xgb_model.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)

# Predictions for the unlabelled test table (identifier column removed first).
X_test = data_test.drop(columns="Unnamed: 0")
y_test_pred = xgb_model.predict(X_test)

# Write the predictions as a single-column CSV without the row index.
test_predictions = pd.Series(y_test_pred, name="AAC_Predicted").to_frame()
test_predictions.to_csv("test_predictions_xgb.csv", index=False)

print("Test predictions saved to 'test_predictions_xgb.csv'.")


Validation Mean Squared Error (MSE): 0.006829186175894043
Validation R² Score: 0.08774867521484764
Test predictions saved to 'test_predictions_xgb.csv'.


In [None]:
# Use a multi-layer perceptron (MLP) regressor to predict the AAC
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import sys
sys.path.append("../utils")

from utils import load_data

# --- Multi-layer perceptron regression on the AAC target ----------------------
# Load the train/test feature tables and the training targets.
data_train, data_test, targets_train = load_data(True)

# Preprocess the data
# Drop the "Unnamed: 0" column (cell line identifiers) as it's not a feature
X = data_train.drop(columns=["Unnamed: 0"])
y = targets_train["AAC"]

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an MLP model
# NOTE(review): MLPRegressor is sensitive to feature scale. Whether
# `load_data(True)` already standardizes the features is not visible here —
# if it does not, fit a scaler on X_train only and apply it to X_val/X_test.
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # Two hidden layers with 100 and 50 neurons
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predict on validation data
y_val_pred = mlp_model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
# R² was computed but never reported; print it like every other model cell does
# so the models can be compared on the same two metrics.
print("Validation R² Score:", r2)


ModuleNotFoundError: No module named 'tensorflow'