In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
!pip install xgboost



In [6]:
!pip install lightgbm



In [7]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, StackingClassifier,
    AdaBoostClassifier, HistGradientBoostingClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

In [9]:

# Load data
data_path = r"/content/drive/MyDrive/Datasets/wpbcA.csv"
data = pd.read_csv(data_path)

# Encode target column
data['outcome'] = data['outcome'].map({'R': 1, 'N': 0})

# Drop unwanted columns
columns_to_drop = ['id', 'time']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Separate features and target
X = data.drop('outcome', axis=1)
y = data['outcome']

# Drop columns with all NaNs and impute missing values
X = X.dropna(axis=1, how='all')
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Feature selection: Select top 25 features
selector = SelectKBest(score_func=mutual_info_classif, k=25)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
X = pd.DataFrame(X_selected, columns=selected_features)

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=selected_features)

# Balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Fine-tuned classifiers
models = {
    "GradientBoosting": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=3),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.8),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, max_depth=4),
    "LightGBM": LGBMClassifier(n_estimators=100, learning_rate=0.1),
    "CatBoost": CatBoostClassifier(verbose=0, iterations=200, learning_rate=0.1, depth=4),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(probability=True, kernel='rbf', C=1.0, gamma='scale'),
    "LogisticRegression": LogisticRegression(max_iter=1000, C=1.0),
    "DecisionTree": DecisionTreeClassifier(max_depth=5),
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=5),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=100)
}

# Add stacking classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5)),
    ('svc', SVC(probability=True, C=1.0, gamma='scale')),
    ('xgb', XGBClassifier(eval_metric='logloss', n_estimators=100, max_depth=4))
]
models['Stacking'] = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Evaluation setup
results = {}
metrics_list = []
output_dir = r"/content/drive/MyDrive/Datasets/WPBC results/With FS"
os.makedirs(output_dir, exist_ok=True)

# Training and evaluation loop
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)

    TN, FP, FN, TP = cm.ravel()
    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0
    f1 = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

    metrics_list.append({
        "Model": name,
        "Accuracy": acc,
        "Sensitivity (Recall)": sensitivity,
        "Specificity": specificity,
        "Precision (PPV)": precision,
        "NPV": npv,
        "F1-Score": f1
    })

    results[name] = {
        "accuracy": acc,
        "classification_report": cr,
        "confusion_matrix": cm
    }

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{name}_confusion_matrix.png"))
    plt.close()

# Comparison plot
model_names = [m["Model"] for m in metrics_list]
accuracies = [m["Accuracy"] for m in metrics_list]
plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies)
plt.title("Model Accuracy Comparison (Scaled + Tuned)")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "model_comparison.png"))
plt.close()

# Save metrics
metrics_df = pd.DataFrame(metrics_list).sort_values(by="Accuracy", ascending=False)
metrics_df.to_csv(os.path.join(output_dir, "detailed_model_metrics.csv"), index=False)

print(f"Results saved in: {output_dir}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 117, number of negative: 123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1945
[LightGBM] [Info] Number of data points in the train set: 240, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487500 -> initscore=-0.050010
[LightGBM] [Info] Start training from score -0.050010
Results saved in: /content/drive/MyDrive/Datasets/WPBC results/With FS


In [10]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Load data
data_path = r"/content/drive/MyDrive/Datasets/wdbcA.csv"
data = pd.read_csv(data_path)

# Drop ID column if exists
data = data.drop(columns=['id'], errors='ignore')

# Encode diagnosis column
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Separate features and target
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Drop columns with all missing values
X = X.dropna(axis=1, how='all')
X_columns_cleaned = X.columns

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X_columns_cleaned)

# Feature selection: top 25 features using ANOVA F-test
selector = SelectKBest(score_func=f_classif, k=25)
X = selector.fit_transform(X, y)

# Normalize data using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define models with default and tuned parameters (examples for a few)
models = {
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=150, learning_rate=0.1),
    "LightGBM": LGBMClassifier(n_estimators=150, learning_rate=0.1),
    "CatBoost": CatBoostClassifier(verbose=0),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "SVM": SVC(probability=True, C=1.0, kernel='rbf'),
    "LogisticRegression": LogisticRegression(max_iter=1000, C=1.0),
    "DecisionTree": DecisionTreeClassifier(max_depth=5),
    "RandomForest": RandomForestClassifier(n_estimators=150),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=150)
}

# Add stacking classifier
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('svc', SVC(probability=True)),
    ('xgb', XGBClassifier(eval_metric='logloss'))
]
models['Stacking'] = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Prepare results
results = {}
metrics_list = []
output_dir = r"/content/drive/MyDrive/Datasets/WDBC results/With FS and Scaled"
os.makedirs(output_dir, exist_ok=True)

# Training loop
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)
    TN, FP, FN, TP = cm.ravel()

    sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    npv = TN / (TN + FN) if (TN + FN) > 0 else 0
    f1 = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

    metrics_list.append({
        "Model": name,
        "Accuracy": acc,
        "Sensitivity (Recall)": sensitivity,
        "Specificity": specificity,
        "Precision (PPV)": precision,
        "NPV": npv,
        "F1-Score": f1
    })

    results[name] = {"accuracy": acc, "classification_report": cr}

    # Save confusion matrix plot
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{name}_confusion_matrix.png"))
    plt.close()

# Accuracy comparison plot
model_names = [m["Model"] for m in metrics_list]
accuracies = [m["Accuracy"] for m in metrics_list]
plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "model_comparison.png"))
plt.close()

# Export metrics
metrics_df = pd.DataFrame(metrics_list).sort_values(by="Accuracy", ascending=False)
metrics_df.to_csv(os.path.join(output_dir, "detailed_model_metrics.csv"), index=False)

print(f"Results saved in: {output_dir}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 283, number of negative: 288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4768
[LightGBM] [Info] Number of data points in the train set: 571, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495622 -> initscore=-0.017514
[LightGBM] [Info] Start training from score -0.017514




Results saved in: /content/drive/MyDrive/Datasets/WDBC results/With FS and Scaled
