In [1]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import scipy.stats as stats
from itertools import combinations

# Load and preprocess data
file_path = r"C:\\Users\\SANDY\\Downloads\\carbon dots.csv"
df = pd.read_csv(file_path, encoding='latin1')

df.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]

num_cols = ['ParticleSize', 'ZetaPotential', 'Dose', 'Viability', 'Time']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

predictors = ['ZetaPotential', 'Dose', 'Time']
df[predictors] = df[predictors].apply(lambda x: x.fillna(x.median()))

def regression_impute(df, target_column, predictors):
    train_data = df.dropna(subset=[target_column] + predictors)
    test_data = df[df[target_column].isna()]
    model = LinearRegression()
    model.fit(train_data[predictors], train_data[target_column])
    df.loc[df[target_column].isna(), target_column] = model.predict(test_data[predictors])
    return df

df = regression_impute(df, 'ParticleSize', predictors)
df.dropna(inplace=True)

categorical_cols = ['Family', 'Solvent', 'CellType', 'CellOrigin']
df = pd.get_dummies(df, columns=categorical_cols)

features_basic = [col for col in df.columns if col not in ['PlantName', 'Part', 'CellLine', 'Assay', 'Viability', 'Dose']]
X_family = df[features_basic]
X_no_family = X_family.drop(columns=[col for col in X_family.columns if "Family_" in col])
y = df['Dose']

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

models = {
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

param_grids = {
    'XGBoost': {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [3, 5, 10],
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__subsample': [0.8, 1.0],
        'clf__colsample_bytree': [0.8, 1.0]
    },
    'AdaBoost': {
        'clf__n_estimators': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.1, 0.2]
    },
    'LightGBM': {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [3, 5, 10],
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__subsample': [0.8, 1.0],
        'clf__colsample_bytree': [0.8, 1.0]
    }
}

cv = KFold(n_splits=10, shuffle=True, random_state=42)

for dataset_name, X in [("With Family", X_family), ("Without Family", X_no_family)]:
    for name, model in models.items():
        pipeline = Pipeline([('scaler', StandardScaler()), ('clf', model)])

        random_search = RandomizedSearchCV(
            pipeline, param_distributions=param_grids[name],
            n_iter=20, scoring='neg_mean_squared_error', cv=cv,
            verbose=1, n_jobs=-1, random_state=42
        )
        random_search.fit(X, y)
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        print(f"\n{name} ({dataset_name})")
        print(f"MSE: {mse:.3f}")
        print(f"R2 Score: {r2:.3f}")

        X_numeric = X.astype(float)
        model_only = best_model.named_steps['clf']

        if isinstance(model_only, (xgb.XGBRegressor, lgb.LGBMRegressor)):
            explainer = shap.Explainer(model_only, X_numeric)
            shap_values = explainer(X_numeric)
        else:
            background = shap.sample(X_numeric, 50, random_state=42)
            explainer = shap.KernelExplainer(model_only.predict, background)
            shap_values = explainer(X_numeric)

        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values, X_numeric, feature_names=X.columns, show=False)
        plt.savefig(f"shap_summary_regression_{name}_{dataset_name}.png")
        plt.close()

        shap_df = pd.DataFrame(shap_values.values, columns=X.columns)
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=shap_df)
        plt.xticks(rotation=90)
        plt.title(f"SHAP Boxplot - {name} ({dataset_name})")
        plt.savefig(f"shap_boxplot_regression_{name}_{dataset_name}.png")
        plt.close()

# IC50-Like Boxplot

df_raw = pd.read_csv(file_path, encoding='latin1')
df_raw.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]
df_raw['Dose'] = pd.to_numeric(df_raw['Dose'], errors='coerce')
df_raw.dropna(subset=['Dose', 'Family'], inplace=True)

plt.figure(figsize=(12, 6))
sns.boxplot(x='Family', y='Dose', data=df_raw)
plt.xticks(rotation=45)
plt.title("IC50-like Boxplot (Dose vs Family)")
plt.ylabel("Dose (mg/L)")
plt.xlabel("Family")
plt.tight_layout()
plt.savefig("IC50_boxplot_regression.png")
plt.close()

# P-VALUE MATRIX (Welch's t-test)

families = sorted(df_raw['Family'].unique())
p_matrix = pd.DataFrame(index=families, columns=families, dtype=float)

for fam1, fam2 in combinations(families, 2):
    group1 = df_raw[df_raw['Family'] == fam1]['Dose']
    group2 = df_raw[df_raw['Family'] == fam2]['Dose']
    stat, p = stats.ttest_ind(group1, group2, equal_var=False)
    p_matrix.loc[fam1, fam2] = p
    p_matrix.loc[fam2, fam1] = p

np.fill_diagonal(p_matrix.values, np.nan)

def categorize_pvalue(p):
    if pd.isna(p): return np.nan
    elif p < 0.01: return 0.01
    elif p < 0.05: return 0.05
    else: return 1

color_matrix = p_matrix.applymap(categorize_pvalue)

plt.figure(figsize=(12, 10))
sns.heatmap(color_matrix, annot=False, cmap=sns.color_palette(["red", "orange", "white"]), 
            cbar_kws={'label': 'p value'}, linewidths=0.5, square=True)

for i in range(len(families)):
    for j in range(len(families)):
        pval = p_matrix.iloc[i, j]
        if pd.notna(pval):
            if pval < 0.01:
                plt.text(j+0.5, i+0.5, "**", ha='center', va='center', color='black', fontsize=10)
            elif pval < 0.05:
                plt.text(j+0.5, i+0.5, "*", ha='center', va='center', color='black', fontsize=10)

plt.xticks(np.arange(len(families))+0.5, families, rotation=90)
plt.yticks(np.arange(len(families))+0.5, families, rotation=0)
plt.title("P-value Similarity Matrix (Paired T-test, Dose vs Family)")
plt.tight_layout()
plt.savefig("pvalue_similarity_ttest.png")
plt.close()

print("\n✅ Full Regression with LightGBM completed!")


Fitting 10 folds for each of 20 candidates, totalling 200 fits

XGBoost (With Family)
MSE: 45549.662
R2 Score: 0.708




Fitting 10 folds for each of 9 candidates, totalling 90 fits

AdaBoost (With Family)
MSE: 58333.989
R2 Score: 0.626


  0%|          | 0/121 [00:00<?, ?it/s]

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 121, number of used features: 5
[LightGBM] [Info] Start training from score 270.537190

LightGBM (With Family)
MSE: 86724.752
R2 Score: 0.444




Fitting 10 folds for each of 20 candidates, totalling 200 fits

XGBoost (Without Family)
MSE: 84124.865
R2 Score: 0.461




Fitting 10 folds for each of 9 candidates, totalling 90 fits

AdaBoost (Without Family)
MSE: 83085.300
R2 Score: 0.468


  0%|          | 0/121 [00:00<?, ?it/s]

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31
[LightGBM] [Info] Number of data points in the train set: 121, number of used features: 4
[LightGBM] [Info] Start training from score 270.537190

LightGBM (Without Family)
MSE: 87658.304
R2 Score: 0.438


  color_matrix = p_matrix.applymap(categorize_pvalue)



✅ Full Regression with LightGBM completed!
