In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, MultiTaskLassoCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.linear_model import MultiTaskLassoCV


Matplotlib is building the font cache; this may take a moment.


In [None]:
# Load Data
data = pd.read_csv("./.../PCA_imputed_Linear_data_protein_only.csv")

# Raw (unscaled) features: every column from index 6 onward
x_raw = data.iloc[:, 6:].values

# Extract target variables (columns 1-5: the five severity metrics)
y_data = data.iloc[:, 1:6].values

# Split into training (70%) and testing (30%) BEFORE scaling, so that the
# scaler never sees the held-out rows. The same random_state on the same
# number of rows yields the same train/test row assignment as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_raw, y_data, test_size=0.3, random_state=42
)

# Normalize features (Z-score standardization) using training statistics only;
# fitting on the full dataset would leak test-set mean/variance into training.
scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Full scaled feature matrix (train-fitted scaler), kept for any later cell
# that still refers to `x_data`.
x_data = scaler.transform(x_raw)



In [8]:
# Define function to evaluate models
def evaluate_model(y_true, y_pred, metric_names):
    """Build a per-target table of regression error metrics.

    Parameters
    ----------
    y_true : array-like
        Observed target values, one column per target.
    y_pred : array-like
        Predicted target values, columns in the same order as ``y_true``.
    metric_names : sequence
        One label per target column; becomes the "Metric" column.

    Returns
    -------
    pandas.DataFrame
        One row per target with columns "Metric", "MSE", "RMSE", "MAE"
        and "R_Squared".
    """
    # Compute the per-target MSE once and reuse it for the RMSE column.
    mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return pd.DataFrame({
        "Metric": metric_names,
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y_true, y_pred, multioutput='raw_values'),
        # 'raw_values' returns one R^2 per target, so no positional column
        # indexing is needed (which would fail for DataFrame inputs).
        "R_Squared": r2_score(y_true, y_pred, multioutput='raw_values'),
    })

# Column labels for data columns 1-5 (the same slice used for the targets);
# passed to evaluate_model to label each row of a metrics table.
metric_names = data.columns[1:6]



In [9]:
# (1) Multivariate Linear Regression (MLR)
# Ordinary least squares fitted jointly on all target columns.
mlr_model = LinearRegression()
mlr_model.fit(x_train, y_train)

# Score the held-out split.
mlr_preds = mlr_model.predict(x_test)
mlr_metrics = evaluate_model(y_test, mlr_preds, metric_names)



In [10]:
# (2) Partial Least Squares Regression (PLSR)
# Five latent components, fitted on the training split.
pls_model = PLSRegression(n_components=5)
pls_model.fit(x_train, y_train)

# Score the held-out split.
pls_preds = pls_model.predict(x_test)
pls_metrics = evaluate_model(y_test, pls_preds, metric_names)


In [11]:
# (3) Random Forest (RF)
# 500 trees; the fixed random_state keeps the fitted forest reproducible.
rf_model = RandomForestRegressor(n_estimators=500, random_state=42)
rf_model.fit(x_train, y_train)

# Score the held-out split.
rf_preds = rf_model.predict(x_test)
rf_metrics = evaluate_model(y_test, rf_preds, metric_names)


In [12]:
# (4) XGBoost Multi-Output Regression
# One independent regressor is trained per target column, then the
# per-target predictions are stacked back into a single 2-D array.
xgb_models = []
for i in range(y_train.shape[1]):
    regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=3, eta=0.1)
    regressor.fit(x_train, y_train[:, i])
    xgb_models.append(regressor)

per_target_preds = [fitted.predict(x_test) for fitted in xgb_models]
xgb_preds = np.column_stack(per_target_preds)
xgb_metrics = evaluate_model(y_test, xgb_preds, metric_names)


In [15]:
# Combine results
# Each metrics table gets a leading "Model" column naming its model, then all
# tables are stacked vertically (row indices repeat per model).
# NOTE(review): lasso_metrics and nn_metrics are not defined in any cell visible
# here (execution counts jump from 12 to 15) -- confirm the LASSO and neural
# network cells still exist, otherwise this cell fails on Restart & Run All.
results = pd.concat([
    pd.concat([pd.Series(label, index=frame.index, name='Model'), frame], axis=1)
    for label, frame in (
        ('MLR', mlr_metrics),
        ('PLSR', pls_metrics),
        ('RF', rf_metrics),
        ('XGBoost', xgb_metrics),
        ('LASSO', lasso_metrics),
        ('Neural Network', nn_metrics),
    )
])

