In [2]:
import joblib
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Linux

In [None]:
# Load the five AF one-step prediction CSVs in a single pass. The first path
# lives under `analysis_old`; the remaining four are the PARSE_EPC variants
# (5 / 16 bits and 1-hot 6 / 20 bits, going by the file names).
(
    data_af_pd_v1_b2b,
    data_af_5_bits,
    data_af_16_bits,
    data_af_1H_6_bits,
    data_af_1H_20_bits,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/max/stayahead/analysis_old/AF_1step_pred_wuhan_v1_b2b_parse.csv',
        '/home/max/stayahead/analysis/datasets/complexity/AF/PARSE_EPC/AF_1step_pred_wuhan_v1_b2b_5_Bits.csv',
        '/home/max/stayahead/analysis/datasets/complexity/AF/PARSE_EPC/AF_1step_pred_wuhan_v1_b2b_16_Bits.csv',
        '/home/max/stayahead/analysis/datasets/complexity/AF/PARSE_EPC/AF_1step_pred_wuhan_v1_b2b_1_Hot_6_Bits.csv',
        '/home/max/stayahead/analysis/datasets/complexity/AF/PARSE_EPC/AF_1step_pred_wuhan_v1_b2b_1_Hot_20_Bits.csv',
    )
]

MAC

XGBoost Regressor

Hyperparameters

In [None]:
# Hyperparameter search space for XGBRegressor. Each key is an XGBRegressor
# keyword argument and each value is the list of candidates to try.
# Contiguous integer runs are spelled with range(); the rest are listed out.
param_dist = {
    'n_estimators': [*range(200, 600, 100), *range(600, 1001, 50)],
    'learning_rate': [0.01, 0.02, 0.03, 0.035, 0.04],
    'max_depth': list(range(7, 12)),
    'min_child_weight': list(range(1, 11)),
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2],
}

In [None]:
def gboost_regressor(df, param_dist, random_search, model_save_path):
    """Tune, train and evaluate a multi-output XGBoost regressor on ``df``.

    Pipeline:
      1. Drop rows with a missing target or confidence value and coerce
         ``pi_score`` to numeric (unparseable entries become NaN).
      2. Split 80/20 into train/test, then standardize features and targets
         with scalers fitted on the training split only.
      3. Search hyperparameters with 5-fold CV, fitting on the FIRST target
         (``bind``) only; the winning parameters are reused for all targets.
      4. Cross-validate and fit a ``MultiOutputRegressor`` on all four targets.
      5. Print per-target test MSE / R^2 (original target scale) and the CV
         MSE (standardized target scale), then plot split-count feature
         importance of the first target's model.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the target columns ``bind``, ``delta_bind``,
        ``expr``, ``delta_expr``, plus ``confidence_bind``, ``confidence_expr``,
        ``pi_score`` and the identifier/excluded columns dropped below. The
        caller's frame is not modified.
        NOTE(review): every column left in the feature matrix is assumed to be
        numeric -- confirm against the dataset schema.
    param_dist : dict
        Mapping of XGBRegressor parameter name -> list of candidate values.
        Used as ``param_distributions`` for the randomized search or as
        ``param_grid`` for the exhaustive grid search.
    random_search : bool
        Truthy -> ``RandomizedSearchCV`` (600 sampled candidates);
        falsy -> ``GridSearchCV`` over the full grid.
    model_save_path : str
        Currently unused: model persistence is disabled in this function.

    Returns
    -------
    None
        Results are reported via ``print`` and a matplotlib figure.
    """
    target_cols = ['bind', 'delta_bind', 'expr', 'delta_expr']
    required_cols = target_cols + ['confidence_bind', 'confidence_expr']
    non_feature_cols = ['seq_id', 'wildtype', 'mutation', *required_cols,
                        'global_net_energy', 'total_hydro']

    # Preprocessing. Work on the frame that was passed in (not a notebook
    # global) so the function is reusable across datasets; `.assign` returns a
    # new frame, which avoids both mutating the caller's data and pandas'
    # SettingWithCopyWarning.
    clean = df.dropna(subset=required_cols)
    clean = clean.assign(pi_score=pd.to_numeric(clean['pi_score'], errors='coerce'))

    # Separate features and target variables
    X = clean.drop(columns=non_feature_cols)
    y = clean[target_cols]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Standardize targets the same way so all four outputs share one scale
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)

    # Base estimator handed to the hyperparameter search
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Keep the search object in its own name instead of rebinding the
    # `random_search` flag.
    if random_search:
        search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                    n_iter=600, scoring='neg_mean_squared_error',
                                    cv=5, verbose=2, random_state=42, n_jobs=-1)
    else:
        search = GridSearchCV(estimator=model, param_grid=param_dist,
                              scoring='neg_mean_squared_error',
                              cv=5, verbose=2, n_jobs=-1)

    # Tune on the first target only; the best params are shared by all targets
    search.fit(X_train_scaled, y_train_scaled[:, 0])
    best_params = search.best_params_

    # Print the best parameters and best score (negated MSE, scaled target)
    print(f'Best Parameters: {best_params}')
    print(f'Best Score: {search.best_score_}')

    # One independent XGBRegressor per target, all with the tuned parameters
    base_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
    multi_output_model = MultiOutputRegressor(base_model)

    # Cross-validation on the training split (scores are negated MSE on the
    # standardized targets)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(multi_output_model, X_train_scaled, y_train_scaled,
                                 cv=kf, scoring='neg_mean_squared_error')

    # Fit the multi-output model on the full training split
    multi_output_model.fit(X_train_scaled, y_train_scaled)

    # Persistence is intentionally disabled; `model_save_path` is accepted but
    # not written to.

    # Predict on the held-out split and map predictions back to original units
    y_pred_scaled = multi_output_model.predict(X_test_scaled)
    y_pred_rescaled = y_scaler.inverse_transform(y_pred_scaled)

    # Evaluate the model for each target variable (original scale)
    for i, col in enumerate(y.columns):
        mse = mean_squared_error(y_test.iloc[:, i], y_pred_rescaled[:, i])
        r2 = r2_score(y_test.iloc[:, i], y_pred_rescaled[:, i])
        print(f'{col} - Mean Squared Error: {mse}')
        print(f'{col} - R-squared: {r2}')

    # Print cross-validation results (sign flipped back to positive MSE)
    print(f'Cross-Validation MSE Scores: {-cv_results}')
    print(f'Cross-Validation Mean MSE: {-cv_results.mean()}')
    print(f'Cross-Validation Standard Deviation of MSE: {cv_results.std()}')

    # Feature importance of the first target's model. The booster was trained
    # on a plain array, so its feature keys look like 'f<column index>'; map
    # them back to the DataFrame column names.
    importance = multi_output_model.estimators_[0].get_booster().get_score(importance_type='weight')
    importance = {X.columns[int(k[1:])]: v for k, v in importance.items()}
    sorted_importance = dict(sorted(importance.items(), key=lambda item: item[1], reverse=True))

    plt.figure(figsize=(10, 8))
    plt.barh(range(len(sorted_importance)), list(sorted_importance.values()), align='center')
    plt.yticks(range(len(sorted_importance)), list(sorted_importance.keys()))
    plt.xlabel('F score')
    plt.ylabel('Features')
    plt.title('Feature importance')
    plt.show()