# **Week 4: Colab Experiment**

# I. Introduction
In this exercise, we load the Breast Cancer Wisconsin dataset for classification.

# II. Methods
We train 3 models:
1. logistic regression
2. support vector machine
3. decision tree.

...

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from collections import Counter
from datetime import datetime
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import zero_one_loss
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset, then separate the feature matrix (independent variables)
# from the class labels (dependent variable).
data = load_breast_cancer()
X, Y = data.data, data.target


In [None]:
# Build the outer cross-validation splits once so that every model is
# evaluated on exactly the same train/test partitions. Shuffling with a fixed
# seed keeps the partitions reproducible between runs.
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

# Maps "fold_<k>" -> {'train': row indices, 'test': row indices}.
kfold_indices = {
  f"fold_{fold_no}": {'train': train_rows, 'test': test_rows}
  for fold_no, (train_rows, test_rows) in enumerate(kf.split(X))
}

In [None]:
# Train models and apply them to the test set
#
# For every outer CV fold: standardise the features, tune each model with an
# inner grid search on the training split, and record the misclassification
# rate of the tuned model on the held-out test split.

# Per-model list of test error rates, one entry per outer fold.
Error_rate = {'logreg': [], 'svm': [], 'decision_tree': []}

param_grid_logreg = {
  # With penalty=None the model is unregularised and scikit-learn ignores C,
  # so sweeping C would only refit the same model over and over (the warning
  # about this is hidden by the global warnings filter). C is therefore not
  # searched here; to tune the regularisation strength, switch the penalty to
  # 'l2' and add a 'C' list.
  'penalty': [None],
  'solver': ['sag', 'saga']
}
param_grid_svm = {
  'C': [0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2],
  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
param_grid_dt = {
  'criterion': ['gini', 'entropy', 'log_loss'],
  'min_samples_split': [4],
  'min_samples_leaf': [2, 3, 4],
  'max_depth': [2, 3, 4, 5],
  'max_features': [3, 4, 5, 6, 7, 8],
}

for fold_id in range(num_folds):
  # Look the fold up once instead of rebuilding the key for every array.
  fold = kfold_indices[f"fold_{fold_id}"]
  X_train, Y_train = X[fold['train']], Y[fold['train']]
  X_test, Y_test = X[fold['test']], Y[fold['test']]

  # Fit the scaler on the training split only, then apply the same
  # transformation to the test split so no test statistics leak into training.
  # NOTE(review): the inner grid-search folds below still share these scaling
  # statistics; wrapping scaler + estimator in a Pipeline would remove that.
  scaler = StandardScaler().fit(X_train)
  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  # Inner model selection: each grid search picks the hyperparameters with the
  # best cross-validated accuracy on the training split. The chosen settings
  # are available afterwards as grid_search_<model>.best_params_.
  grid_search_logreg = GridSearchCV(LogisticRegression(random_state=87), param_grid_logreg, scoring='accuracy')
  grid_search_logreg.fit(X_train, Y_train)
  grid_search_svm = GridSearchCV(SVC(random_state=87), param_grid_svm, scoring='accuracy')
  grid_search_svm.fit(X_train, Y_train)
  grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=87), param_grid_dt, scoring='accuracy')
  grid_search_dt.fit(X_train, Y_train)

  # Outer evaluation: zero_one_loss is the fraction of misclassified test
  # samples for the best estimator found by each search.

  # Logistic regression
  best_model_logreg = grid_search_logreg.best_estimator_
  Error_rate['logreg'].append(zero_one_loss(
      Y_test, best_model_logreg.predict(X_test)))

  # SVM
  best_model_svm = grid_search_svm.best_estimator_
  Error_rate['svm'].append(zero_one_loss(
      Y_test, best_model_svm.predict(X_test)))

  # Decision tree
  best_model_dt = grid_search_dt.best_estimator_
  Error_rate['decision_tree'].append(zero_one_loss(
      Y_test, best_model_dt.predict(X_test)))

# III. Results

Show the results.

In [None]:
# Summarise each model's per-fold test error as mean and standard deviation
# (np.std uses its default ddof=0), rounded to 4 decimals. The fold count is
# read from num_folds so the header cannot drift from the CV setup.
print(f"The error rate over {num_folds} folds in CV:")
for label, key in (
  ("Logistic Regression", "logreg"),
  ("SVM", "svm"),
  ("Decision Tree", "decision_tree"),
):
  fold_errors = Error_rate[key]
  print(f"{label}: mean = {round(np.mean(fold_errors), 4)}, std = {round(np.std(fold_errors), 4)}")


The error rate over 5 folds in CV:
Logistic Regression: mean = 0.0246, std = 0.017
SVM: mean = 0.0246, std = 0.0129
Decision Tree: mean = 0.0527, std = 0.0199


# IV. Conclusion and Discussion
As you can see, Logistic Regression has almost the same performance as SVM.

However, Decision Tree performs worse than these two classifiers.

Because a Decision Tree is sensitive to its hyperparameters, such as the tree
depth, the minimum number of samples per leaf, and pruning, it may perform well
only for some specific hyperparameter configurations.

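Besides, a Decision Tree splits on a single feature at a time, so it can only
approximate a smooth (for example, linear) decision boundary with axis-aligned
steps.

Hence, when the classes are close to linearly separable, as the strong Logistic
Regression result suggests here, the Decision Tree tends to perform worse than
Logistic Regression.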