In [1]:
import os
import sys

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Make the parent of the working directory importable so that the project-local
# `utils` package (imported right below) can be resolved from this notebook.
current_directory = os.getcwd()
# abspath() collapses the trailing ".." into a clean absolute path.
parent_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
# Guard the append: without it, every re-run of this cell would stack another
# duplicate entry onto sys.path.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils.prepare_datasets import create_datasets_for_xgboost
import utils.plots as plots

%load_ext autoreload
%autoreload 2

In [2]:
# Load the preprocessed dataset stored under <parent_directory>/data.
# os.path.join builds the path portably instead of hand-concatenating separators.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source — confirm it is produced by this project.
processed_data_path = os.path.join(parent_directory, 'data', 'processed_data.pkl')
df = pd.read_pickle(processed_data_path)
df

Unnamed: 0,species_name,upstream200,stress_condition_name,tpm,species,stress_condition
0,ACHX,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",as,0.124923,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,ACHX,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",bs,-0.269196,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,ACHX,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",li,0.092039,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,ACHX,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",mig,1.205134,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
5,ACHX,"[[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [...",nd,1.060358,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...
1035322,Vibrio,"[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",oss,-0.155647,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1035323,Vibrio,"[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",oxs,-0.743433,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
1035324,Vibrio,"[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",sp,-0.482135,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
1035325,Vibrio,"[[0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [...",tm,-0.011241,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"


In [3]:
# Split the frame into three (features, target) pairs via the project helper.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = create_datasets_for_xgboost(df)

# Hyperparameter grid explored by the search; only tree depth is tuned for now.
param_grid = {
    'max_depth': [3, 6, 9],
    # Add more hyperparameters if needed
    # 'eta': [0.01, 0.1, 0.3],
    # 'subsample': [0.5, 0.7, 1.0],
    # 'colsample_bytree': [0.5, 0.7, 1.0],
    # 'n_estimators': [100, 200, 300]
}

# Base estimator that the grid search clones for every candidate/fold.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Pool the training and validation samples for cross-validated tuning.
# Concatenating along axis 0 appends samples for both 1-D and 2-D targets;
# np.vstack would instead stack two 1-D target vectors as *rows* (or raise when
# their lengths differ). For 2-D inputs the result is identical to vstack.
X_train_val = np.concatenate((X_train, X_val), axis=0)
y_train_val = np.concatenate((y_train, y_val), axis=0)

# 3-fold cross-validation scored by negated MSE (scikit-learn maximizes scores).
# NOTE(review): an integer `cv` does not shuffle rows for a regressor — confirm
# the helper above already returns samples in a shuffled order.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)

# Run the search over every candidate in the grid.
grid_search.fit(X_train_val, y_train_val)

# Report the winning configuration; flip the sign back to get a plain MSE.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {-grid_search.best_score_}")

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters: {'max_depth': 9}
Best score: 0.8835242787996928


In [4]:
# Rebuild a regressor from the winning hyperparameters. The objective is passed
# explicitly so the final model is configured like the estimator that was tuned
# in the grid search; any key present in best_params takes precedence.
best_params = grid_search.best_params_
best_model = xgb.XGBRegressor(**{'objective': 'reg:squarederror', **best_params})
# NOTE(review): the grid search was fitted on train+val, while this fit uses the
# training split only — confirm that leaving out the validation rows is intended.
best_model.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.6936472654342651


In [5]:
# import shap

# # SHAP analysis
# explainer = shap.Explainer(best_model)
# shap_values = explainer(X_test)

# # Plot SHAP summary
# shap.summary_plot(shap_values, X_test)

# # Plot SHAP dependence plot for a specific feature (e.g., feature index 0)
# shap.dependence_plot(0, shap_values, X_test)

: 

In [6]:
# Compare the test-set predictions with the true test targets using the project's
# plotting helper (presumably a hexbin density plot, judging by the helper's
# name — see utils/plots.py to confirm).
plots.plot_hexbin_predictions_vs_labels(y_pred, y_test)