In [1]:
# Split the data into training and test datasets. Make sure your split
# is reproducible and that it maintains roughly the proportion of each
# class of dependent variable. (1 point)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('project1_cleaned.csv')
data

Unnamed: 0,deg-malig,is_recurring,lymph_node_capsular_invasion,is_left_breast,taken_radiation_therapy,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,...,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,3,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,3,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
371,2,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
372,3,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
373,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [2]:
# Split the data into training and test datasets
X = data.drop('is_recurring', axis=1)
y = data['is_recurring']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [3]:
# Perform classification using: (6 points)
#   K-Nearest Neighbor Classifier
#   K-Nearest Neighbor Classifier using Grid search CV
#   Linear classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# K-Nearest Neighbor Classifier
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

In [4]:
# K-Nearest Neighbor Classifier using Grid search CV
knn_search_accuracy = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 100)}
knn_gscv = GridSearchCV(knn_search_accuracy, param_grid, cv=5, scoring='recall')
knn_gscv.fit(X_train, y_train)
print('Best n_neighbors:', knn_gscv.best_params_)
knn_gscv.best_estimator_

Best n_neighbors: {'n_neighbors': np.int64(1)}


In [5]:
# Linear classification
clf = SGDClassifier(loss='perceptron', alpha=0.01, class_weight={0: 254, 1: 121})
clf.fit(X_train, y_train)
clf

In [None]:
# Print report showing accuracy, recall, precision and f1-score
# for each classification model. Which metric is most important
# for this problem? (You will explain your answer in the report
# in Part 3). (2 points)

# For this problem, the most important metric is recall as we
# want to minimize the number of false negatives. False negatives
# in this case would be a recurring breast cancer case that was
# predicted to not recur. This would be a very serious mistake
# as the patient would not receive the necessary treatment to
# prevent the recurrence of breast cancer. Therefore, we want to
# minimize the number of false negatives and maximize the number
# of true positives. This is why recall is the most important
# metric for this problem, and why we use recall as the metric
# for scoring the classification models during training.
from sklearn.metrics import classification_report

# K-Nearest Neighbor Classifier
print("K-NEAREST NEIGHBOR CLASSIFIER:")
print(f"Performance on TEST\n*******************\n{classification_report(y_test, knn.predict(X_test))}")
print(f"Performance on TRAIN\n********************\n{classification_report(y_train, knn.predict(X_train))}")

K-NEAREST NEIGHBOR CLASSIFIER:
Performance on TEST
*******************
              precision    recall  f1-score   support

           0       0.70      0.99      0.82        77
           1       0.75      0.08      0.15        36

    accuracy                           0.70       113
   macro avg       0.72      0.54      0.48       113
weighted avg       0.71      0.70      0.60       113

Performance on TRAIN
********************
              precision    recall  f1-score   support

           0       0.69      0.98      0.81       177
           1       0.70      0.08      0.15        85

    accuracy                           0.69       262
   macro avg       0.70      0.53      0.48       262
weighted avg       0.69      0.69      0.60       262



In [7]:
# K-Nearest Neighbor Classifier using Grid search CV
print("K-NEAREST NEIGHBOR CLASSIFIER USING GRID SEARCH CV:")
print(f"Performance on TEST\n*******************\n{classification_report(y_test, knn_gscv.predict(X_test))}")
print(f"Performance on TRAIN\n********************\n{classification_report(y_train, knn_gscv.predict(X_train))}")

K-NEAREST NEIGHBOR CLASSIFIER USING GRID SEARCH CV:
Performance on TEST
*******************
              precision    recall  f1-score   support

           0       0.69      0.66      0.68        77
           1       0.33      0.36      0.35        36

    accuracy                           0.57       113
   macro avg       0.51      0.51      0.51       113
weighted avg       0.58      0.57      0.57       113

Performance on TRAIN
********************
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       177
           1       0.98      0.98      0.98        85

    accuracy                           0.98       262
   macro avg       0.98      0.98      0.98       262
weighted avg       0.98      0.98      0.98       262



In [8]:
# Linear classification
print("LINEAR CLASSIFICATION (SGD):")
print(f"Performance on TEST\n*******************\n{classification_report(y_test, clf.predict(X_test), zero_division=0)}")
print(f"Performance on TRAIN\n********************\n{classification_report(y_train, clf.predict(X_train), zero_division=0)}")

LINEAR CLASSIFICATION (SGD):
Performance on TEST
*******************
              precision    recall  f1-score   support

           0       0.68      0.99      0.81        77
           1       0.50      0.03      0.05        36

    accuracy                           0.68       113
   macro avg       0.59      0.51      0.43       113
weighted avg       0.63      0.68      0.57       113

Performance on TRAIN
********************
              precision    recall  f1-score   support

           0       0.69      1.00      0.82       177
           1       1.00      0.07      0.13        85

    accuracy                           0.70       262
   macro avg       0.85      0.54      0.47       262
weighted avg       0.79      0.70      0.60       262

