In [3]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, make_scorer, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from itertools import combinations
from scipy.stats import ttest_ind
from sklearn.inspection import permutation_importance
from sklearn import tree

# Load and preprocess data
file_path = r"C:\\Users\\SANDY\\Downloads\\carbon dots.csv"
df = pd.read_csv(file_path, encoding='latin1')

df.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]

num_cols = ['ParticleSize', 'ZetaPotential', 'Dose', 'Viability', 'Time']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

predictors = ['ZetaPotential', 'Dose', 'Time']
df[predictors] = df[predictors].apply(lambda x: x.fillna(x.median()))

def regression_impute(df, target_column, predictors):
    train_data = df.dropna(subset=[target_column] + predictors)
    test_data = df[df[target_column].isna()]
    model = LinearRegression()
    model.fit(train_data[predictors], train_data[target_column])
    df.loc[df[target_column].isna(), target_column] = model.predict(test_data[predictors])
    return df

df = regression_impute(df, 'ParticleSize', predictors)
df.dropna(inplace=True)

df['Toxicity'] = (df['Viability'] < 50).astype(int)

categorical_cols = ['Family', 'Solvent', 'CellType', 'CellOrigin']
df = pd.get_dummies(df, columns=categorical_cols)

features_basic = [col for col in df.columns if col not in ['PlantName', 'Part', 'CellLine', 'Assay', 'Viability', 'Toxicity']]
X_family = df[features_basic]
X_no_family = X_family.drop(columns=[col for col in X_family.columns if "Family_" in col])
y = df['Toxicity']

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)

# Hyperparameters
param_grids = {
    'XGBoost': {'clf__n_estimators': [50, 100, 200], 'clf__max_depth': [3, 5, 10], 
                'clf__learning_rate': [0.01, 0.1, 0.2], 'clf__subsample': [0.8, 1.0], 'clf__colsample_bytree': [0.8, 1.0]},
    'AdaBoost': {'clf__n_estimators': [50, 100, 200], 'clf__learning_rate': [0.01, 0.1, 0.2]},
    'LightGBM': {'clf__n_estimators': [50, 100, 200], 'clf__max_depth': [-1, 5, 10],
                 'clf__learning_rate': [0.01, 0.1, 0.2], 'clf__subsample': [0.8, 1.0], 'clf__colsample_bytree': [0.8, 1.0]}
}

models = {
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42)
}

conf_matrices = {}

# Evaluation function
def evaluate_model(name, model, param_grid, X, y, dataset_name):
    pipeline = ImbPipeline([('smote', SMOTE(sampling_strategy=0.6, random_state=42)), ('clf', model)])
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=20,
                                       scoring=f1_scorer, cv=cv, verbose=1, n_jobs=-1, random_state=42)
    random_search.fit(X, y)
    best_model = random_search.best_estimator_
    y_pred = cross_val_predict(best_model, X, y, cv=cv)
    cm = confusion_matrix(y, y_pred)
    conf_matrices[f"{name} ({dataset_name})"] = cm

    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1_val = f1_score(y, y_pred)
    auc_pr = average_precision_score(y, y_pred)

    with open("model_text_results.txt", "a") as f:
        f.write(f"\n=== {name} ({dataset_name}) ===\n")
        f.write(f"Accuracy: {acc:.3f}\n")
        f.write(f"Recall: {rec:.3f}\n")
        f.write(f"Precision: {prec:.3f}\n")
        f.write(f"F1 Score: {f1_val:.3f}\n")
        f.write(f"AUC-PR: {auc_pr:.3f}\n")
        f.write(f"Classification Report:\n{classification_report(y, y_pred)}\n")

    return best_model

print("\n### TRAINING MODELS ###")
best_models = {}
for dataset_name, X in [("With Family", X_family), ("Without Family", X_no_family)]:
    for name in models:
        best_models[f"{name} ({dataset_name})"] = evaluate_model(name, models[name], param_grids[name], X, y, dataset_name)

# Confusion Matrices
for name, cm in conf_matrices.items():
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{name.replace(' ', '_')}.png")
    plt.close()

# SHAP + Boxplots
for dataset_name, X in [("With Family", X_family), ("Without Family", X_no_family)]:
    for name, model in best_models.items():
        if dataset_name in name:
            if "AdaBoost" in name:
                explainer = shap.KernelExplainer(model.named_steps["clf"].predict, shap.sample(X, 50))
                shap_values = explainer.shap_values(X)
            else:
                explainer = shap.TreeExplainer(model.named_steps["clf"])
                shap_values = explainer.shap_values(X)

            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values, X, feature_names=X.columns, show=False)
            plt.savefig(f"shap_summary_{name.replace(' ', '_')}.png")
            plt.close()

            shap_df = pd.DataFrame(shap_values, columns=X.columns)
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=shap_df)
            plt.xticks(rotation=90)
            plt.title(f"Boxplot of Feature Importance - {name}")
            plt.ylabel("SHAP Value")
            plt.tight_layout()
            plt.savefig(f"boxplot_shap_{name.replace(' ', '_')}.png")
            plt.close()

# IC50-like Boxplot
df_raw = pd.read_csv(file_path, encoding='latin1')
df_raw.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]
df_raw['Dose'] = pd.to_numeric(df_raw['Dose'], errors='coerce')
df_raw.dropna(subset=['Dose', 'Family'], inplace=True)

plt.figure(figsize=(12, 6))
sns.boxplot(x='Family', y='Dose', data=df_raw)
plt.xticks(rotation=45)
plt.title("Approximate IC50-like Boxplot (based on Dose vs Family)")
plt.ylabel("Dose (mg/L) -- approximate IC50 representation")
plt.xlabel("Family")
plt.tight_layout()
plt.savefig("approximate_ic50_boxplot.png")
plt.close()

# P-VALUE Heatmap
families = df_raw['Family'].unique()
p_matrix = pd.DataFrame(np.ones((len(families), len(families))), index=families, columns=families)

for fam1, fam2 in combinations(families, 2):
    doses1 = df_raw[df_raw['Family'] == fam1]['Dose']
    doses2 = df_raw[df_raw['Family'] == fam2]['Dose']
    stat, p_value = ttest_ind(doses1, doses2)
    p_matrix.loc[fam1, fam2] = p_value
    p_matrix.loc[fam2, fam1] = p_value

cmap = sns.color_palette("Reds", as_cmap=True)
plt.figure(figsize=(12, 8))
sns.heatmap(p_matrix, annot=False, cmap=cmap, cbar_kws={'label': 'P value'}, vmin=0, vmax=1)
plt.title("Pairwise T-Test P-Value Heatmap (Dose between Families)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("pvalue_heatmap_families.png")
plt.close()

# RANDOM FOREST Importance
rf_with_family = RandomForestClassifier(random_state=42)
rf_with_family.fit(X_family, y)
importances_gini = rf_with_family.feature_importances_
perm_importance = permutation_importance(rf_with_family, X_family, y, n_repeats=10, random_state=42, scoring='accuracy')
importances_accuracy = perm_importance.importances_mean

feat_importance = pd.DataFrame({
    'Feature': X_family.columns,
    'MeanDecreaseGini': importances_gini,
    'MeanDecreaseAccuracy': importances_accuracy
}).sort_values('MeanDecreaseAccuracy', ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x='MeanDecreaseAccuracy', y='Feature', data=feat_importance)
plt.title("Mean Decrease Accuracy (Random Forest)")
plt.tight_layout()
plt.savefig("rf_mean_decrease_accuracy.png")
plt.close()

plt.figure(figsize=(8, 6))
sns.barplot(x='MeanDecreaseGini', y='Feature', data=feat_importance)
plt.title("Mean Decrease Gini (Random Forest)")
plt.tight_layout()
plt.savefig("rf_mean_decrease_gini.png")
plt.close()

# DT and RF Plots
tree_models = {
    "DecisionTree (With Family)": DecisionTreeClassifier(random_state=42),
    "DecisionTree (Without Family)": DecisionTreeClassifier(random_state=42),
    "RandomForest (With Family)": RandomForestClassifier(random_state=42),
    "RandomForest (Without Family)": RandomForestClassifier(random_state=42)
}

for model_name, model in tree_models.items():
    dataset_name = "With Family" if "With" in model_name else "Without Family"
    X = X_family if dataset_name == "With Family" else X_no_family

    model.fit(X, y)
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {model_name}")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name.replace(' ', '_')}.png")
    plt.close()

    if "DecisionTree" in model_name:
        plt.figure(figsize=(20, 10))
        tree.plot_tree(model, feature_names=X.columns, filled=True)
        plt.title(f"Decision Tree - {model_name}")
        plt.savefig(f"decision_tree_{model_name.replace(' ', '_')}.png")
        plt.close()
    elif "RandomForest" in model_name:
        plt.figure(figsize=(20, 10))
        tree.plot_tree(model.estimators_[0], feature_names=X.columns, filled=True)
        plt.title(f"Random Forest (First Tree) - {model_name}")
        plt.savefig(f"random_forest_{model_name.replace(' ', '_')}.png")
        plt.close()



### TRAINING MODELS ###
Fitting 10 folds for each of 20 candidates, totalling 200 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Number of positive: 57, number of negative: 96
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 80
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372549 -> initscore=-0.521297
[LightGBM] [Info] Start training from score -0.521297
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[Light

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Number of positive: 57, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372549 -> initscore=-0.521297
[LightGBM] [Info] Start training from score -0.521297
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: p

  0%|          | 0/121 [00:00<?, ?it/s]



  0%|          | 0/121 [00:00<?, ?it/s]

  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
