In [4]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, make_scorer, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from itertools import combinations
from scipy.stats import ttest_ind
from sklearn.inspection import permutation_importance
from sklearn import tree
from sklearn.experimental import enable_iterative_imputer  # Required to enable
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier



# Load and preprocess data
file_path = r"C:\\Users\\SANDY\\Downloads\\carbon dots.csv"
df = pd.read_csv(file_path, encoding='latin1')

df.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]

num_cols = ['ParticleSize', 'ZetaPotential', 'Dose', 'Viability', 'Time']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

from sklearn.experimental import enable_iterative_imputer  # Required to enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Improved RandomForest for better imputation accuracy
rf_estimator = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    min_samples_split=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

imputer = IterativeImputer(
    estimator=rf_estimator,
    max_iter=10,
    random_state=42
)

# Apply imputation
# Apply imputation
df[num_cols] = imputer.fit_transform(df[num_cols])

# Create exposure feature
df['Exposure'] = df['Dose'] * df['Time']
num_cols.append('Exposure')

df.dropna(inplace=True)
df['Toxicity'] = (df['Viability'] < 50).astype(int)


categorical_cols = ['Family', 'Solvent', 'CellType', 'CellOrigin']
df = pd.get_dummies(df, columns=categorical_cols)

features_basic = [col for col in df.columns if col not in ['PlantName', 'Part', 'CellLine', 'Assay', 'Viability', 'Toxicity']]
X_family = df[features_basic]
X_no_family = X_family.drop(columns=[col for col in X_family.columns if "Family_" in col])
y = df['Toxicity']

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score)

# Hyperparameters
param_grids = {
    'XGBoost': {'clf__n_estimators': [50, 100, 200], 'clf__max_depth': [3, 5, 10], 
                'clf__learning_rate': [0.01, 0.1, 0.2], 'clf__subsample': [0.8, 1.0], 'clf__colsample_bytree': [0.8, 1.0]},
    'AdaBoost': {'clf__n_estimators': [50, 100, 200], 'clf__learning_rate': [0.01, 0.1, 0.2]},
    'LightGBM': {'clf__n_estimators': [50, 100, 200], 'clf__max_depth': [-1, 5, 10],
                 'clf__learning_rate': [0.01, 0.1, 0.2], 'clf__subsample': [0.8, 1.0], 'clf__colsample_bytree': [0.8, 1.0]},
    'DecisionTree': {
    'clf__max_depth': [3, 5, 10, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
    },
    'RandomForest': {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [3, 5, 10, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__max_features': ['sqrt', 'log2']
    },
    'GradientBoosting': {
    'clf__n_estimators': [50, 100, 200],
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__max_depth': [3, 5, 10]
    }

}

models = {
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)

    
}

conf_matrices = {}

# Evaluation function
def evaluate_model(name, model, param_grid, X, y, dataset_name):
    pipeline = ImbPipeline([('smote', SMOTE(sampling_strategy=0.6, random_state=42)), ('clf', model)])
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=20,
                                       scoring=f1_scorer, cv=cv, verbose=1, n_jobs=-1, random_state=42)
    random_search.fit(X, y)
    best_model = random_search.best_estimator_
    y_pred = cross_val_predict(best_model, X, y, cv=cv)
    cm = confusion_matrix(y, y_pred)
    conf_matrices[f"{name} ({dataset_name})"] = cm

    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1_val = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred)

    with open("model_text_results.txt", "a") as f:
        f.write(f"\n=== {name} ({dataset_name}) ===\n")
        f.write(f"Accuracy: {acc:.3f}\n")
        f.write(f"Precision: {prec:.3f}\n")
        f.write(f"Recall: {rec:.3f}\n")
        f.write(f"F1 Score: {f1_val:.3f}\n")
        f.write(f"ROC-AUC: {roc_auc:.3f}\n")
        f.write(f"Classification Report:\n{classification_report(y, y_pred)}\n")

    return best_model

print("\n### TRAINING MODELS ###")
best_models = {}
for dataset_name, X in [("With Family", X_family), ("Without Family", X_no_family)]:
    for name in models:
        best_models[f"{name} ({dataset_name})"] = evaluate_model(name, models[name], param_grids[name], X, y, dataset_name)

# Confusion Matrices
for name, cm in conf_matrices.items():
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix - {name}")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{name.replace(' ', '_')}.png")
    plt.close()

# --- SHAP + Boxplots + Confusion Matrix (updated for GradientBoosting too) ---
for dataset_name, X in [("With Family", X_family), ("Without Family", X_no_family)]:
    for name, model in best_models.items():
        if dataset_name in name:
            clf = model.named_steps["clf"]
            y_pred = cross_val_predict(model, X, y, cv=cv)
            cm = confusion_matrix(y, y_pred)

            # Save confusion matrix
            plt.figure(figsize=(6, 5))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
            plt.title(f"Confusion Matrix - {name}")
            plt.ylabel("True Label")
            plt.xlabel("Predicted Label")
            plt.tight_layout()
            plt.savefig(f"confusion_matrix_{name.replace(' ', '_')}.png")
            plt.close()

            # # SHAP Explainer
            # try:
            #     if "AdaBoost" in name:
            #         explainer = shap.KernelExplainer(clf.predict, shap.sample(X, 50))
            #         shap_values = explainer.shap_values(X)
            #     else:
            #         explainer = shap.TreeExplainer(clf)
            #         shap_values = explainer.shap_values(X)
            # except Exception as e:
            #     print(f"SHAP failed for {name}: {e}")
            #     continue

            ## SHAP summary plot
            #plt.figure(figsize=(10, 6))
            #shap.summary_plot(shap_values, X, feature_names=X.columns, show=False)
            #plt.savefig(f"shap_summary_{name.replace(' ', '_')}.png")
            #plt.close()

            # SHAP boxplot
            # SHAP boxplot (safe for binary classifiers including DecisionTree and RandomForest)
            # try:
            #     if isinstance(shap_values, list):
            #         shap_array = shap_values[1]  # multiclass shap output from KernelExplainer
            #     elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
            #         shap_array = shap_values[:, :, 1]  # pick class 1 for binary output
            #     else:
            #         shap_array = shap_values

            #     shap_df = pd.DataFrame(shap_array, columns=X.columns)
            #     plt.figure(figsize=(10, 6))
            #     sns.boxplot(data=shap_df)
            #     plt.xticks(rotation=90)
            #     plt.title(f"Boxplot of Feature Importance - {name}")
            #     plt.ylabel("SHAP Value")
            #     plt.tight_layout()
            #     plt.savefig(f"boxplot_shap_{name.replace(' ', '_')}.png")
            #     plt.close()
            # except Exception as e:
            #     print(f"SHAP Boxplot failed for {name}: {e}")



# IC50-like Boxplot
df_raw = pd.read_csv(file_path, encoding='latin1')
df_raw.columns = [
    'PlantName', 'Part', 'Family', 'Solvent', 'ParticleSize', 'ZetaPotential',
    'CellType', 'CellOrigin', 'CellLine', 'Assay', 'Dose', 'Viability', 'Time'
]
df_raw['Dose'] = pd.to_numeric(df_raw['Dose'], errors='coerce')
df_raw.dropna(subset=['Dose', 'Family'], inplace=True)

plt.figure(figsize=(12, 6))
sns.boxplot(x='Family', y='Dose', data=df_raw)
plt.xticks(rotation=45)
plt.title("Approximate IC50-like Boxplot (based on Dose vs Family)")
plt.ylabel("Dose (mg/L) -- approximate IC50 representation")
plt.xlabel("Family")
plt.tight_layout()
plt.savefig("approximate_ic50_boxplot.png")
plt.close()

# P-VALUE Heatmap
families = df_raw['Family'].unique()
p_matrix = pd.DataFrame(np.ones((len(families), len(families))), index=families, columns=families)

for fam1, fam2 in combinations(families, 2):
    doses1 = df_raw[df_raw['Family'] == fam1]['Dose']
    doses2 = df_raw[df_raw['Family'] == fam2]['Dose']
    stat, p_value = ttest_ind(doses1, doses2)
    p_matrix.loc[fam1, fam2] = p_value
    p_matrix.loc[fam2, fam1] = p_value

cmap = sns.color_palette("Reds", as_cmap=True)
plt.figure(figsize=(12, 8))
sns.heatmap(p_matrix, annot=False, cmap=cmap, cbar_kws={'label': 'P value'}, vmin=0, vmax=1)
plt.title("Pairwise T-Test P-Value Heatmap (Dose between Families)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("pvalue_heatmap_families.png")
plt.close()

# RANDOM FOREST Importance
rf_with_family = RandomForestClassifier(random_state=42)
rf_with_family.fit(X_family, y)
importances_gini = rf_with_family.feature_importances_
perm_importance = permutation_importance(rf_with_family, X_family, y, n_repeats=10, random_state=42, scoring='accuracy')
importances_accuracy = perm_importance.importances_mean

feat_importance = pd.DataFrame({
    'Feature': X_family.columns,
    'MeanDecreaseGini': importances_gini,
    'MeanDecreaseAccuracy': importances_accuracy
}).sort_values('MeanDecreaseAccuracy', ascending=False)

plt.figure(figsize=(8, 6))
sns.barplot(x='MeanDecreaseAccuracy', y='Feature', data=feat_importance)
plt.title("Mean Decrease Accuracy (Random Forest)")
plt.tight_layout()
plt.savefig("rf_mean_decrease_accuracy.png")
plt.close()

plt.figure(figsize=(8, 6))
sns.barplot(x='MeanDecreaseGini', y='Feature', data=feat_importance)
plt.title("Mean Decrease Gini (Random Forest)")
plt.tight_layout()
plt.savefig("rf_mean_decrease_gini.png")
plt.close()



# Plot decision tree or random forest from already trained best models
from sklearn import tree

# Plot tree-based models from best_models
# for name, model in best_models.items():
#     clf = model.named_steps['clf']
#     dataset_name = "With Family" if "With Family" in name else "Without Family"
#     X = X_family if "With Family" in name else X_no_family

#     if isinstance(clf, DecisionTreeClassifier):
#         # Plot single Decision Tree
#         plt.figure(figsize=(20, 10))
#         tree.plot_tree(clf, feature_names=X.columns, filled=True)
#         plt.title(f"Decision Tree - {name}")
#         plt.savefig(f"decision_tree_{name.replace(' ', '_')}.png")
#         plt.close()

#     elif isinstance(clf, RandomForestClassifier):
#         # Plot first tree of Random Forest
#         plt.figure(figsize=(20, 10))
#         tree.plot_tree(clf.estimators_[0], feature_names=X.columns, filled=True)
#         plt.title(f"Random Forest (First Tree) - {name}")
#         plt.savefig(f"random_forest_{name.replace(' ', '_')}.png")
#         plt.close()

#     elif isinstance(clf, AdaBoostClassifier):
#         # Plot first weak learner of AdaBoost (usually a stump)
#         plt.figure(figsize=(12, 8))
#         tree.plot_tree(clf.estimators_[0], feature_names=X.columns, filled=True)
#         plt.title(f"AdaBoost (First Tree) - {name}")
#         plt.savefig(f"adaboost_tree_{name.replace(' ', '_')}.png")
#         plt.close()

#     elif isinstance(clf, GradientBoostingClassifier):
#         # Plot first estimator from Gradient Boosting
#         plt.figure(figsize=(20, 10))
#         tree.plot_tree(clf.estimators_[0, 0], feature_names=X.columns, filled=True)
#         plt.title(f"Gradient Boosting (First Tree) - {name}")
#         plt.savefig(f"gradientboosting_tree_{name.replace(' ', '_')}.png")
#         plt.close()

#     elif hasattr(clf, "get_booster"):  # XGBoost
#         # Use XGBoost built-in plotting (needs matplotlib backend)
#         try:
#             booster = clf.get_booster()
#             for i in range(1):  # Plot only the first tree
#                 plt.figure(figsize=(20, 10))
#                 xgb.plot_tree(booster, num_trees=i)
#                 plt.title(f"XGBoost Tree {i} - {name}")
#                 plt.savefig(f"xgboost_tree_{name.replace(' ', '_')}_tree{i}.png")
#                 plt.close()
#         except Exception as e:
#             print(f"XGBoost tree plot failed for {name}: {e}")

#     elif isinstance(clf, lgb.LGBMClassifier):
#         # Plot first tree from LightGBM
#         try:
#             ax = lgb.plot_tree(clf, tree_index=0, figsize=(20, 10), show_info=['split_gain', 'internal_value', 'leaf_count'])
#             plt.title(f"LightGBM Tree 0 - {name}")
#             plt.savefig(f"lightgbm_tree_{name.replace(' ', '_')}.png")
#             plt.close()
#         except Exception as e:
#             print(f"LightGBM tree plot failed for {name}: {e}")

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import ttest_ind

# Assume df_raw with columns: ['Family', 'Dose'] is already prepared

# Step 1: Compute p-value matrix
families = sorted(df_raw['Family'].unique())
p_matrix = pd.DataFrame(index=families, columns=families, dtype=float)

for fam1, fam2 in combinations(families, 2):
    group1 = df_raw[df_raw['Family'] == fam1]['Dose']
    group2 = df_raw[df_raw['Family'] == fam2]['Dose']
    stat, p = ttest_ind(group1, group2, equal_var=False)
    p_matrix.loc[fam1, fam2] = p
    p_matrix.loc[fam2, fam1] = p

np.fill_diagonal(p_matrix.values, np.nan)

# Step 2: Categorize p-values for coloring
def categorize_pvalue(p):
    if pd.isna(p):
        return np.nan
    elif p < 0.01:
        return 0.01
    elif p < 0.05:
        return 0.05
    else:
        return 1

color_matrix = p_matrix.map(categorize_pvalue)

# Optional: Abbreviate family names
abbrev_families = {fam: fam[:4] + '.' if len(fam) > 4 else fam for fam in families}
color_matrix.rename(index=abbrev_families, columns=abbrev_families, inplace=True)
p_matrix.rename(index=abbrev_families, columns=abbrev_families, inplace=True)

# Step 3: Plotting
plt.figure(figsize=(8, 6))
sns.heatmap(color_matrix, cmap=["#d73027", "#fc8d59", "#ffffbf"],  # deep red, orange, white
            cbar_kws={'label': 'p value'}, linewidths=0.5, square=True, linecolor='gray')

# Add star annotations
for i in range(len(color_matrix)):
    for j in range(len(color_matrix)):
        pval = p_matrix.iloc[i, j]
        if pd.notna(pval):
            if pval < 0.01:
                plt.text(j + 0.5, i + 0.5, "**", ha='center', va='center', color='black', fontsize=10)
            elif pval < 0.05:
                plt.text(j + 0.5, i + 0.5, "*", ha='center', va='center', color='black', fontsize=10)

plt.xticks(rotation=45, ha='right')
plt.title("P-value Matrix (Dose vs Family)", fontsize=13)
plt.tight_layout()
plt.savefig("pvalue_discrete_star_heatmap.png")
plt.close()
from sklearn.tree import export_graphviz
from graphviz import Source
import os
from sklearn.tree import _tree
from graphviz import Digraph

def custom_tree_to_graphviz(clf, feature_names, filename='custom_tree'):
    tree_ = clf.tree_
    dot = Digraph(name="CustomTree", format='png')
    dot.attr('node', shape='ellipse', style='filled', fontname='Helvetica', fontsize='12')

    def add_node(node_id):
        if tree_.feature[node_id] != _tree.TREE_UNDEFINED:
            name = feature_names[tree_.feature[node_id]]
            threshold = round(tree_.threshold[node_id], 3)
            label = f"{name} ≤ {threshold}"
            dot.node(str(node_id), label=label, fillcolor="white")
        else:
            value = tree_.value[node_id][0]
            pred_class = int(value.argmax())
            label = "Toxic" if pred_class == 1 else "Non-Toxic"
            color = "red" if label == "Toxic" else "green"
            dot.node(str(node_id), label=label, fillcolor=color, shape='box', fontcolor="white")

    def recurse(node_id):
        add_node(node_id)
        if tree_.feature[node_id] != _tree.TREE_UNDEFINED:
            left_child = tree_.children_left[node_id]
            right_child = tree_.children_right[node_id]
            recurse(left_child)
            recurse(right_child)
            dot.edge(str(node_id), str(left_child), label="yes")
            dot.edge(str(node_id), str(right_child), label="no")

    recurse(0)
    dot.render(filename, cleanup=True)
    print(f"Custom tree saved to: {filename}.png")


# Ensure output folder exists
os.makedirs("graphviz_trees", exist_ok=True)

for name, model in best_models.items():
    clf = model.named_steps['clf']
    dataset_name = "With Family" if "With Family" in name else "Without Family"
    X = X_family if "With Family" in name else X_no_family

    try:
        # Handle DecisionTreeClassifier
        if isinstance(clf, DecisionTreeClassifier):
            dt = clf

        # Handle RandomForestClassifier (first tree)
        elif isinstance(clf, RandomForestClassifier):
            dt = clf.estimators_[0]

        # Handle AdaBoostClassifier (first weak learner)
        elif isinstance(clf, AdaBoostClassifier):
            dt = clf.estimators_[0]

        # Handle GradientBoostingClassifier (first estimator)
        elif isinstance(clf, GradientBoostingClassifier):
            dt = clf.estimators_[0, 0]

        else:
            dt = None  # Not a tree-based model (e.g., XGBoost, LGBM)

        if dt is not None:
            # --- Standard Graphviz PNG ---
            dot_data = export_graphviz(
                dt,
                out_file=None,
                feature_names=X.columns,
                class_names=['Non-Toxic', 'Toxic'],
                filled=True,
                rounded=True,
                special_characters=True
            )
            graph = Source(dot_data)
            graph.render(f"graphviz_trees/standard_tree_{name.replace(' ', '_')}", format="png", cleanup=True)

            # --- Custom stylized Graphviz PNG ---
            custom_tree_to_graphviz(
                dt,
                feature_names=X.columns,
                filename=f"graphviz_trees/custom_tree_{name.replace(' ', '_')}"
            )

        # Handle XGBoost
        elif hasattr(clf, "get_booster"):
            booster = clf.get_booster()
            xgb.plot_tree(booster, num_trees=0)
            plt.title(f"XGBoost Tree 0 - {name}")
            plt.savefig(f"graphviz_trees/xgboost_tree_{name.replace(' ', '_')}_tree0.png")
            plt.close()

        # Handle LightGBM
        elif isinstance(clf, lgb.LGBMClassifier):
            ax = lgb.plot_tree(clf, tree_index=0, figsize=(20, 10),
                               show_info=['split_gain', 'internal_value', 'leaf_count'])
            plt.title(f"LightGBM Tree 0 - {name}")
            plt.savefig(f"graphviz_trees/lightgbm_tree_{name.replace(' ', '_')}.png")
            plt.close()

    except Exception as e:
        print(f"Custom Graphviz export failed for {name}: {e}")






### TRAINING MODELS ###
Fitting 10 folds for each of 20 candidates, totalling 200 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Number of positive: 57, number of negative: 96
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372549 -> initscore=-0.521297
[LightGBM] [Info] Start training from score -0.521297
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000074 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125
[LightGBM] [Info] Number of data points in the train set: 137, numb

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[LightGBM] [Info] Number of positive: 57, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 132
[LightGBM] [Info] Number of data points in the train set: 153, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372549 -> initscore=-0.521297
[LightGBM] [Info] Start training from score -0.521297
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]:

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372263 -> initscore=-0.522522
[LightGBM] [Info] Start training from score -0.522522
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372263 -> initscore=-0.522522
[LightGBM] [Info] Sta

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372263 -> initscore=-0.522522
[LightGBM] [Info] Start training from score -0.522522
[LightGBM] [Info] Number of positive: 51, number of negative: 86
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 137, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.372263 -> initscore=-0.522522
[LightGBM] [Info] Start training from score -0.522522
[LightGBM] [Info] Number of posi

  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df


Custom tree saved to: graphviz_trees/custom_tree_AdaBoost_(With_Family).png
Custom tree saved to: graphviz_trees/custom_tree_DecisionTree_(With_Family).png
Custom tree saved to: graphviz_trees/custom_tree_RandomForest_(With_Family).png
Custom tree saved to: graphviz_trees/custom_tree_GradientBoosting_(With_Family).png




Custom tree saved to: graphviz_trees/custom_tree_AdaBoost_(Without_Family).png
Custom tree saved to: graphviz_trees/custom_tree_DecisionTree_(Without_Family).png
Custom tree saved to: graphviz_trees/custom_tree_RandomForest_(Without_Family).png
Custom tree saved to: graphviz_trees/custom_tree_GradientBoosting_(Without_Family).png
