<a href="https://colab.research.google.com/github/mersalas/MLBS-2025_workshop/blob/main/Lab_1a_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Import packages**

In [None]:
# Install packages
!pip install pca
!pip install scikit-optimize

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pca import pca
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.plots import plot_objective
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, auc, roc_auc_score, matthews_corrcoef, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, precision_recall_curve

## **Load dataset**

In [None]:
# Fetch dataset: the breast cancer dataset bundled with scikit-learn.
# load_breast_cancer is imported together with the other libraries in the
# import cell, so every dependency of the notebook is declared in one place.
cancer = load_breast_cancer()

## **Exploratory data analysis**

In [None]:
#cancer
#features = cancer.data
#target = cancer.target

In [None]:
# Transform dataset into dataframe: one column per feature plus a final
# 'target' column holding the class label.
# The label is attached with assign() instead of being stacked into the float
# feature matrix, so it keeps its integer dtype rather than being upcast to
# float64 (which would turn the classes into 0.0 / 1.0).
cancer_df = pd.DataFrame(cancer['data'], columns=cancer['feature_names']).assign(
    target=cancer['target'])

In [None]:
# Preview the first five rows of the dataframe
cancer_df.head(n=5)

In [None]:
# Print the dataset description to learn what each feature means
print(f"cancer.DESCR:\n{cancer.DESCR}")

In [None]:
# dimensions (rows, columns) or (samples, features+target)
# (the column count includes the 'target' label appended when cancer_df was built)
cancer_df.shape

In [None]:
# data types of every column (features and target)
cancer_df.dtypes

In [None]:
# Count the missing entries in each column
cancer_df.isna().sum()

In [None]:
# Count malignant & benign samples (number of rows per class label;
# cancer.target_names gives the meaning of each label)
cancer_df.loc[:, 'target'].value_counts()

In [None]:
# Look at 4 features at a time: keep the first four "mean" measurements
# together with the class label so the plots can be coloured by class
select_columns = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'target',
]
cancer_dataset_group1 = cancer_df[select_columns]
cancer_dataset_group1.head()

In [None]:
# Plot pairwise relationships between the selected features, coloured by class
df = cancer_dataset_group1
sns.set(style="ticks")
sns.pairplot(data=df, hue="target")

In [None]:
# Separating the features from the target:
# the first 30 columns are the measurements, 'target' is the class label
X = cancer_df.iloc[:, :30].to_numpy()
y = cancer_df.loc[:, 'target'].to_numpy()

### PCA

In [None]:
# Initialize pca
# NOTE(review): with a float n_components the pca package is expected to keep
# as many components as needed to reach that share of explained variance, and
# normalize=True to standardise the features first — confirm against its docs.
model = pca(n_components=0.95, normalize=True)

# Fit transform
# X is a bare NumPy array, so no feature names are handed to the PCA here.
results = model.fit_transform(X)

In [None]:
# Scree plot (explained variance of the principal components)
model.plot(figsize=(10,8))
plt.show()

In [None]:
# Print the top features: keep only the rows of the PCA feature table
# that are flagged as 'best'
top = model.results['topfeat']
is_best = top['type'] == 'best'
best = top[is_best]
best

In [None]:
# 2D scatter plot of the PCA result, with the class labels (y) passed as point labels
# NOTE(review): SPE=True presumably overlays the SPE-based outlier marking — confirm in the pca docs
model.scatter(labels=y, SPE=True, legend=True, cmap='Set1', figsize=(15,10), dpi=300)

### **Data pre-processing**

In [None]:
# Normalize the data (zero mean, unit variance per feature)
# NOTE(review): the scaler is fitted on every sample in X, so rows that later
# serve as held-out test data also contribute to the mean/std used for scaling.
norm = StandardScaler()
X_norm = norm.fit_transform(X)

In [None]:
# Split dataset into training & test set (80/20, stratified on the class label).
# The split is made on the unscaled features and the scaler is then fitted on
# the training rows only, so no statistics of the test set leak into training.
# Stratification and random_state are unchanged, so the same rows end up in
# each subset as when splitting pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y,
                                                            random_state=42)

norm = StandardScaler()
X_train = norm.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = norm.transform(X_test_raw)        # reuse the training statistics

print("Size of training set: {}   Size of test set:"
      " {}\n".format(X_train.shape[0], X_test.shape[0]))

## **Train model**

### **Logit**

In [None]:
# Perform grid search to find optimum parameters for logit
# (10 log-spaced candidates for C between 1e-3 and 1e3, scored with 10-fold CV)
logit = LogisticRegression(max_iter=5000, random_state=42)
param = {'C': np.logspace(-3,3,10)}
logit_cv = GridSearchCV(logit, param, cv=10, n_jobs=-1)
logit_cv.fit(X_train, y_train)

print('best parameters: ', logit_cv.best_params_)
print('best score after grid search cv: ', logit_cv.best_score_)

In [None]:
# Train logit with the hyperparameters selected by the grid search
logit_tuned = LogisticRegression(
    max_iter=5000,
    n_jobs=-1,
    random_state=42,
    **logit_cv.best_params_,
)

logit_tuned.fit(X_train, y_train)

In [None]:
# Print intercept & coefficients of the fitted logit model
print(f'intercept: {logit_tuned.intercept_}')
print(f'coef: {logit_tuned.coef_}')

In [None]:
# Evaluate the logit model on the training set (10-fold cross-validation)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'matthews_corrcoef']
cv_results = cross_validate(
    logit_tuned,
    X_train,
    y_train,
    cv=10,
    scoring=scoring,
    n_jobs=-1,
)

# Fold-averaged scores, listed in the order in which the report prints them
report_order = ['accuracy', 'precision', 'recall', 'f1', 'matthews_corrcoef', 'roc_auc']
fold_means = [np.mean(cv_results['test_' + metric]) for metric in report_order]

print("Performance of the logit model on the training set:\nAcc: {:.4f}\nPr: {:.4f}\nSn: {:.4f}\nF1: {:.4f}\nMCC: {:.4f}\nAUROC: {:.4f}".format(*fold_means))

### **KNN**

In [None]:
# Perform random search to find the optimum parameters for KNN
# Candidates: k from 5 to 19, two weighting schemes and three distance metrics;
# the search samples combinations from this space and scores each with 10-fold CV.
knn = KNeighborsClassifier()
param = {'n_neighbors': range(5, 20), 'weights': ['uniform', 'distance'],
         'metric': ['minkowski', 'manhattan', 'euclidean']}
# random_state fixes which combinations get sampled
knn_cv = RandomizedSearchCV(knn, param, cv=10, n_jobs=-1, random_state=42)
knn_cv.fit(X_train, y_train)

print('best parameters: ', knn_cv.best_params_)
print('best score after random search cv:', knn_cv.best_score_)

In [None]:
# Find the optimum k by comparing training and test accuracy for each value
train_acc = []
test_acc = []

# try n_neighbors from 1 to 30 (range() excludes its stop value, hence 31)
neighbors_settings = range(1, 31)

for n_neighbors in neighbors_settings:
  # build the model
  knn = KNeighborsClassifier(n_neighbors=n_neighbors)
  knn.fit(X_train, y_train)

  # record training set accuracy
  train_acc.append(knn.score(X_train, y_train))

  # record generalization accuracy
  test_acc.append(knn.score(X_test, y_test))

# One curve per data subset, with k on the x-axis
plt.plot(neighbors_settings, train_acc, label="training accuracy")
plt.plot(neighbors_settings, test_acc, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

In [None]:
# Train KNN with fixed hyperparameters
# NOTE(review): these values are hard-coded rather than read from
# knn_cv.best_params_ — confirm they are the intended final choice.
knn_tuned = KNeighborsClassifier(
    n_neighbors=7,
    weights='distance',
    metric='euclidean',
    p=2,
    n_jobs=-1,
)

knn_tuned.fit(X_train, y_train)

In [None]:
# Evaluate the KNN model on the training set (10-fold cross-validation)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'matthews_corrcoef']
cv_results = cross_validate(
    knn_tuned,
    X_train,
    y_train,
    cv=10,
    scoring=scoring,
    n_jobs=-1,
)

# Fold-averaged scores, listed in the order in which the report prints them
report_order = ['accuracy', 'precision', 'recall', 'f1', 'matthews_corrcoef', 'roc_auc']
fold_means = [np.mean(cv_results['test_' + metric]) for metric in report_order]

print("Performance of the KNN model on the training set:\nAcc: {:.4f}\nPr: {:.4f}\nSn: {:.4f}\nF1: {:.4f}\nMCC: {:.4f}\nAUROC: {:.4f}".format(*fold_means))

### **SVC**

In [None]:
# Perform Bayesian search to find optimum parameters for SVC
svc = SVC(random_state=42, kernel='rbf')

# Search space: each entry is (lower bound, upper bound, prior);
# both C and gamma are searched on a log-uniform scale
param = {'C': (1e-3, 1e1, 'log-uniform'),
         'gamma': (1e-3,1, 'log-uniform')}

# Shuffled, stratified 10-fold CV; candidates are ranked by ROC AUC
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
svc_cv = BayesSearchCV(estimator=svc, search_spaces=param, cv=cv, scoring='roc_auc',
                       n_jobs=-1, random_state=42)
svc_cv.fit(X_train, y_train)

print('best parameters: ', svc_cv.best_params_)
print('best score after search cv:', svc_cv.best_score_)

In [None]:
# Plot the Bayesian objective function
# plot_objective() builds its own grid of axes and does not accept an `ax`
# argument, so the figure it creates is resized afterwards instead of passing
# in a pre-made Axes.
plot_objective(svc_cv.optimizer_results_[0])
fig = plt.gcf()  # the figure plot_objective has just created
fig.set_size_inches(10, 8)
plt.show()

In [None]:
# Train SVC with the hyperparameters found by the Bayesian search;
# probability=True makes predict_proba available on the fitted model
svc_tuned = SVC(
    probability=True,
    random_state=42,
    kernel='rbf',
    **svc_cv.best_params_,
)

svc_tuned.fit(X_train, y_train)

In [None]:
# Evaluate the SVC model on the training set (10-fold cross-validation)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'matthews_corrcoef']
cv_results = cross_validate(
    svc_tuned,
    X_train,
    y_train,
    cv=10,
    scoring=scoring,
    n_jobs=-1,
)

# Fold-averaged scores, listed in the order in which the report prints them
report_order = ['accuracy', 'precision', 'recall', 'f1', 'matthews_corrcoef', 'roc_auc']
fold_means = [np.mean(cv_results['test_' + metric]) for metric in report_order]

print("Performance of the SVC model on the training set:\nAcc: {:.4f}\nPr: {:.4f}\nSn: {:.4f}\nF1: {:.4f}\nMCC: {:.4f}\nAUROC: {:.4f}".format(*fold_means))

## **Evaluation**

### **PR curves**

In [None]:
# Continuous test-set scores for each model: decision-function values for
# logit and SVC, positive-class probabilities for KNN
y_thres_logit = logit_tuned.decision_function(X_test)
y_proba_knn = knn_tuned.predict_proba(X_test)[:, 1]
y_thres_svc = svc_tuned.decision_function(X_test)

# Precision & recall for each model (the thresholds array is not needed)
precision_logit, recall_logit = precision_recall_curve(y_test, y_thres_logit)[:2]
precision_knn, recall_knn = precision_recall_curve(y_test, y_proba_knn)[:2]
precision_svc, recall_svc = precision_recall_curve(y_test, y_thres_svc)[:2]

# Helper that draws a single precision-recall curve
def plot_PR_curve(precision, recall, label=None):
    """Draw one precision-recall curve via pyplot.

    Parameters
    ----------
    precision : sequence of float
        Precision values, plotted on the y-axis.
    recall : sequence of float
        Recall values, plotted on the x-axis.
    label : str, optional
        Legend entry for the curve.
    """
    tick_style = {'fontsize': 10}
    axis_style = {'fontsize': 12}

    plt.plot(recall, precision, label=label, linewidth=2)
    plt.xlabel('Recall', **axis_style)
    plt.ylabel('Precision', **axis_style)
    plt.xticks(**tick_style)
    plt.yticks(**tick_style)
    plt.grid(False)

# Plot PR curves for all models on one shared figure
plt.figure(figsize=(6,5))
pr_curves = [
    (precision_logit, recall_logit, "logit"),
    (precision_knn, recall_knn, "KNN"),
    (precision_svc, recall_svc, "SVC"),
]
for model_precision, model_recall, model_name in pr_curves:
    plot_PR_curve(model_precision, model_recall, model_name)

plt.legend(loc="best", fontsize=12)
plt.show()

### **ROC curves**

In [None]:
# ROC curve points for each model
# (the names without a suffix belong to the logit model)
fpr, tpr, thresholds = roc_curve(y_test, y_thres_logit)
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, y_proba_knn)
fpr_svc, tpr_svc, thresholds_svc = roc_curve(y_test, y_thres_svc)

# Area under each ROC curve
auroc = auc(fpr, tpr)
auroc_knn = auc(fpr_knn, tpr_knn)
auroc_svc = auc(fpr_svc, tpr_svc)

# Define a function to plot the ROC curve
def plot_roc_curve(fpr, tpr, auroc, label=None):
  """Draw one ROC curve plus the dashed chance diagonal via pyplot.

  Parameters
  ----------
  fpr : sequence of float
      False positive rates, plotted on the x-axis.
  tpr : sequence of float
      True positive rates, plotted on the y-axis.
  auroc : float
      Area under the curve, shown in the legend with four decimals.
  label : str, optional
      Model name for the legend. When omitted, the legend shows only the
      AUROC value instead of the literal text "None".
  """
  auroc_text = f'AUROC = {auroc:.4f}'
  legend_label = auroc_text if label is None else f'{label} ({auroc_text})'

  plt.plot(fpr, tpr, linewidth=2, label=legend_label)
  plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
  plt.xticks(fontsize=10)
  plt.yticks(fontsize=10)
  plt.ylabel('True Positive Rate', fontsize=12)
  plt.xlabel('False Positive Rate', fontsize=12)
  plt.grid(False)

# Plot all ROC curves on one shared figure and save it as a JPEG
plt.figure(figsize=(6,5))
roc_curves = [
    (fpr, tpr, auroc, "logit"),
    (fpr_knn, tpr_knn, auroc_knn, "KNN"),
    (fpr_svc, tpr_svc, auroc_svc, "SVC"),
]
for model_fpr, model_tpr, model_auroc, model_name in roc_curves:
    plot_roc_curve(model_fpr, model_tpr, model_auroc, model_name)

plt.legend(loc="best", fontsize=12)
plt.savefig('ROC.jpg', format='jpg', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Evaluate the SVC model on the test set
y_pred_svc = svc_tuned.predict(X_test)                 # hard class predictions
y_proba_svc = svc_tuned.predict_proba(X_test)[:, 1]    # probability of the positive class

print("Performance of the SVC model on the test set:")
# (output template, value) pairs in the order they are reported;
# AUROC is the only metric computed from probabilities rather than labels
test_metrics = [
    ("Acc: {:.4f}", accuracy_score(y_test, y_pred_svc)),
    ("Pr: {:.4f}", precision_score(y_test, y_pred_svc)),
    ("Sn: {:.4f}", recall_score(y_test, y_pred_svc)),
    ("F1: {:.4f}", f1_score(y_test, y_pred_svc)),
    ("MCC: {:.4f}", matthews_corrcoef(y_test, y_pred_svc)),
    ("AUROC: {:.4f}", roc_auc_score(y_test, y_proba_svc)),
]
for template, value in test_metrics:
    print(template.format(value))

In [None]:
# Print the per-class precision / recall / F1 report for the SVC model
svc_report = classification_report(y_test, y_pred_svc)
print(svc_report)

In [None]:
# Plot confusion matrix for svc model, with rows/columns in the
# class order the fitted model reports
class_labels = svc_tuned.classes_
cm = confusion_matrix(y_test, y_pred_svc, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

### **Extract best features**

In [None]:
# Map each "best" feature reported by the PCA back to its column position.
# The PCA was fitted on a bare array, so its feature labels are auto-generated.
# Looking each label up in the loadings table (whose columns follow the order
# of the input features) gives the right position whether those labels start
# counting at 0 or at 1, and can never point at the 'target' column.
# dict.fromkeys() drops repeats (one feature can be "best" for several
# components) while keeping first-seen order.
pca_feature_labels = list(model.results['loadings'].columns)
best_indices = list(dict.fromkeys(
    pca_feature_labels.index(feature) for feature in best['feature']))
print("Best feature positions:", best_indices)

# Map those indices to the real column names in cancer_df
best_feature_names = cancer_df.columns[best_indices].tolist()
print("Best feature names:", best_feature_names)

In [None]:
# Keep only the columns named in best_feature_names (all rows)
X_select = cancer_df.loc[:, best_feature_names]
X_select

In [None]:
# Save the selected features as a CSV file (without the index column)
# NOTE(review): '/content' looks like the Colab runtime's local working
# directory rather than Google Drive, which would need mounting first —
# confirm where this file is meant to end up.
csv_path = '/content/X_select.csv'
X_select.to_csv(csv_path, index=False)

In [None]:
# Read the saved subset back from the path it was written to
# (reusing csv_path keeps the save and load locations in sync)
X_select = pd.read_csv(csv_path)
X_select

### **Exercise 1a**



*   Perform EDA on X_select dataset
*   Build a random forest classifier for the dataset





Submit your outputs here:
https://drive.google.com/drive/folders/1RCn-8HnAnIbNp2sAwtfptaMoDAUXSeQM?usp=sharing