# COMP5318 Assignment 1: Classification

### Group number: 81 

In [None]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the Breast Cancer Wisconsin data set from the working directory.
# Both names are bound to the same DataFrame object.
data_here = data = pd.read_csv("breast-cancer-wisconsin.csv")

In [None]:
# Pre-processing
#   1. Convert every feature column to numbers; anything unparsable becomes NaN.
#   2. Fill the missing attribute values with the mean of their column.
#   3. Change the class values ("class1" -> 0, "class2" -> 1).
#   4. Min-max normalise to [0, 1] and round to 4 decimal places.

# Work on a copy so the raw frame loaded above is left untouched and this cell
# can be re-run without operating on already-processed data.
data_here2 = data_here.copy()
keys = data_here2.keys()
feature_keys = [key for key in keys if key != "class"]

# pd.to_numeric accepts decimals, negative numbers and columns that pandas has
# already parsed as numeric; a str.isnumeric() test would reject all of those
# and wrongly mark them as missing.
data_here2[feature_keys] = data_here2[feature_keys].apply(pd.to_numeric, errors="coerce")

# Filling in the missing attribute values (the mean is computed per column).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data_here2[feature_keys] = imputer.fit_transform(data_here2[feature_keys])

# Changing the class values; fail loudly on a label that is not recognised
# instead of letting it reach the scaler.
class_codes = data_here2["class"].map({"class1": 0, "class2": 1})
if class_codes.isna().any():
    raise ValueError('Unexpected label in "class" column; expected "class1" or "class2"')
data_here2["class"] = class_codes

#Normalisation
scaler = MinMaxScaler(feature_range=(0, 1))
x = scaler.fit_transform(data_here2.values)

data_here2 = pd.DataFrame(x, columns=keys, dtype=float).round(4)
# The scaler returns floats for every column; restore integer class labels.
data_here2["class"] = data_here2["class"].astype("int")

In [None]:
# Print the first ten rows of the pre-processed dataset:
# attributes to 4 decimal places, the class (last column) as an integer.
temp_printout = data_here2
temp = temp_printout.head(10).values
for row in temp:
    cells = ["%.4f" % value for value in row[:-1]]
    cells.append("%d" % row[-1])
    print(",".join(cells))

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0


In [None]:
# Separate the pre-processed table into attribute rows (all but the last
# column) and class labels (the last column), both as plain Python lists.
_values = data_here2.values
data_part1 = _values[:, :-1].tolist()
result_part1 = _values[:, -1].tolist()

### Part 1: Cross validation without parameter tuning

In [None]:
## 10-fold stratified cross-validation, shuffled with a fixed seed
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The stratified folds from cvKFold should be provided to the classifiers

In [None]:
# K-Nearest Neighbour
def kNNClassifier(X, y, k):
    """Return the mean cross-validation score of a k-nearest-neighbour classifier.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.
        k: number of neighbours (passed as ``n_neighbors``).

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so fitting on the full data beforehand would only waste time.
    return cross_val_score(knn, X, y, cv=cvKFold).mean()

In [None]:
# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validation score of a logistic regression classifier.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    logC = LogisticRegression(random_state=0)
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so fitting on the full data beforehand would only waste time.
    return cross_val_score(logC, X, y, cv=cvKFold).mean()

In [None]:
#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validation score of a Gaussian naive Bayes classifier.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    NB = GaussianNB()
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so fitting on the full data beforehand would only waste time.
    return cross_val_score(NB, X, y, cv=cvKFold).mean()

In [None]:
# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validation score of a decision tree classifier.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    DT = DecisionTreeClassifier(random_state=0)
    # cross_val_score clones the estimator and fits the clone on each training
    # fold, so fitting on the full data beforehand would only waste time.
    return cross_val_score(DT, X, y, cv=cvKFold).mean()

In [None]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validation score of bagged decision trees.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.
        n_estimators: number of trees in the ensemble.
        max_samples: value passed to ``BaggingClassifier(max_samples=...)``.
        max_depth: maximum depth of each base decision tree.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    base_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    # max_depth configures the base tree only. It must not be forwarded as
    # max_features: that would silently subsample the attributes per tree and
    # would fail when max_depth is None or larger than the attribute count.
    bag = BaggingClassifier(base_tree, max_samples=max_samples,
                            n_estimators=n_estimators, random_state=0)
    # cross_val_score clones and fits the ensemble per fold; no separate fit
    # on the full data is needed.
    return cross_val_score(bag, X, y, cv=cvKFold).mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validation score of AdaBoost over decision trees.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.
        n_estimators: number of boosting rounds.
        learning_rate: value passed to ``AdaBoostClassifier(learning_rate=...)``.
        max_depth: maximum depth of each base decision tree.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    base_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    ada = AdaBoostClassifier(base_tree, n_estimators=n_estimators,
                             learning_rate=learning_rate, random_state=0)
    # cross_val_score clones and fits the ensemble per fold; no separate fit
    # on the full data is needed.
    return cross_val_score(ada, X, y, cv=cvKFold).mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validation score of a gradient boosting classifier.

    Parameters:
        X: attribute rows.
        y: class labels, one per row of X.
        n_estimators: number of boosting stages.
        learning_rate: value passed to
            ``GradientBoostingClassifier(learning_rate=...)``.

    Returns:
        The mean of the per-fold scores obtained with the module-level
        ``cvKFold`` splitter.
    """
    gb = GradientBoostingClassifier(n_estimators=n_estimators,
                                    learning_rate=learning_rate, random_state=0)
    # cross_val_score clones and fits the model per fold; no separate fit on
    # the full data is needed.
    return cross_val_score(gb, X, y, cv=cvKFold).mean()

### Part 1 Results

In [None]:
# Parameters for Part 1:
# KNN
k = 3

# Bagging
bag_n_estimators, bag_max_samples, bag_max_depth = 50, 100, 5

# AdaBoost
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 50, 0.5, 5

# GB
gb_n_estimators, gb_learning_rate = 50, 0.5

# One entry per classifier: (printed label, evaluation function, extra arguments).
part1_runs = [
    ("kNN average cross-validation accuracy: ", kNNClassifier, (k,)),
    ("LR average cross-validation accuracy: ", logregClassifier, ()),
    ("NB average cross-validation accuracy: ", nbClassifier, ()),
    ("DT average cross-validation accuracy: ", dtClassifier, ()),
    ("Bagging average cross-validation accuracy: ", bagDTClassifier,
     (bag_n_estimators, bag_max_samples, bag_max_depth)),
    ("AdaBoost average cross-validation accuracy: ", adaDTClassifier,
     (ada_n_estimators, ada_learning_rate, ada_bag_max_depth)),
    ("GB average cross-validation accuracy: ", gbClassifier,
     (gb_n_estimators, gb_learning_rate)),
]

# Evaluate and print each classifier in turn, rounded to 4 decimal places.
for label, evaluate, extra_args in part1_runs:
    mean_score = evaluate(data_part1, result_part1, *extra_args)
    print(label, round(mean_score, 4))

kNN average cross-validation accuracy:  0.9642
LR average cross-validation accuracy:  0.9642
NB average cross-validation accuracy:  0.9585
DT average cross-validation accuracy:  0.9471
Bagging average cross-validation accuracy:  0.9671
AdaBoost average cross-validation accuracy:  0.9642
GB average cross-validation accuracy:  0.9642


### Part 2: Cross validation with parameter tuning

In [None]:
# Linear SVM
# You should use SVC from sklearn.svm
# Candidate values explored by the SVM grid search.
C = [0.001, 0.01, 0.1, 1, 10, 100]
gamma = [0.001, 0.01, 0.1, 1, 10, 100]
# The grid holds its own copies of the candidate lists.
grid = {
    'gamma': gamma[:],
    'C': C[:],
}
def bestLinClassifier(X,y):
    """Tune a linear-kernel SVC with a grid search and report its scores.

    The data is split into stratified train and test sets (random_state=0).
    The module-level ``grid`` is searched on the training set using the
    ``cvKFold`` folds, and the refitted best model is scored on the test set.

    Returns:
        A tuple ``(best C, best gamma, best cross-validation score,
        test-set score)`` with both scores rounded to 4 decimal places.
    """
    # NOTE(review): per the scikit-learn docs gamma applies to the rbf, poly
    # and sigmoid kernels, so it presumably has no effect with
    # kernel="linear" -- confirm that searching over it is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)
    search = GridSearchCV(SVC(kernel="linear", random_state=0), grid,
                          cv=cvKFold, return_train_score=True)
    search.fit(X_train, y_train)

    chosen = search.best_params_
    cv_score = round(search.best_score_, 4)
    test_score = round(search.score(X_test, y_test), 4)
    return chosen['C'], chosen['gamma'], cv_score, test_score

In [None]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
# Candidate values explored by the random forest grid search.
n_estimators = [10, 20, 30, 50, 100]
max_leaf_nodes = [4, 10, 16, 20, 30]
# The grid holds its own copies of the candidate lists.
rf_grid = {
    'n_estimators': n_estimators[:],
    'max_leaf_nodes': max_leaf_nodes[:],
}
def bestRFClassifier(X,y):
    """Tune a random forest with a grid search and report its scores.

    The data is split into stratified train and test sets (random_state=0).
    The module-level ``rf_grid`` is searched on the training set using the
    ``cvKFold`` folds, and the refitted best model is scored on the test set.

    Returns:
        A tuple ``(best n_estimators, best max_leaf_nodes, best
        cross-validation score, test-set score)`` with both scores rounded to
        4 decimal places.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)  # split the data into train and test sets
    # Information gain (entropy) with max_features="sqrt", stated explicitly
    # so the model does not depend on the library's default for max_features.
    rf = RandomForestClassifier(criterion="entropy", max_features="sqrt",
                                random_state=0)
    grid_search = GridSearchCV(rf, rf_grid, cv=cvKFold,
                               return_train_score=True)
    grid_search.fit(X_train, y_train)

    # The following values are returned for printing.
    best_par = grid_search.best_params_
    cv_score = round(grid_search.best_score_, 4)
    test_score = round(grid_search.score(X_test, y_test), 4)
    return best_par['n_estimators'], best_par['max_leaf_nodes'], cv_score, test_score

### Part 2 Results

In [None]:
# Perform Grid Search with 10-fold Stratified Cross Validation (GridSearchCV in sklearn).
# The stratified folds from cvKFold should be provided to GridSearchCV.

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "n_estimators" and "max_leaf_nodes" which should be printed as integers.
svm_outcome = bestLinClassifier(data_part1, result_part1)
rf_outcome = bestRFClassifier(data_part1, result_part1)
b_c, b_g, svc_sv_score, svc_test_score = svm_outcome
b_n_e, max_nodes, rf_sv_score, rf_test_score = rf_outcome

# SVM: tuned parameters, then both accuracies fixed to 4 decimal places.
print("SVM best C: ", b_c)
print("SVM best gamma: ", b_g)
print("SVM cross-validation accuracy: ", f"{svc_sv_score:.4f}")
print("SVM test set accuracy: ", f"{svc_test_score:.4f}")

# Random forest: tuned parameters, then both accuracies fixed to 4 decimal places.
print("RF best n_estimators: ", b_n_e)
print("RF best max_leaf_nodes: ", max_nodes)
print("RF cross-validation accuracy: ", f"{rf_sv_score:.4f}")
print("RF test set accuracy: ", f"{rf_test_score:.4f}")

SVM best C:  1
SVM best gamma:  0.001
SVM cross-validation accuracy:  0.9657
SVM test set accuracy:  0.9714
RF best n_estimators:  30
RF best max_leaf_nodes:  4
RF cross-validation accuracy:  0.9675
RF test set accuracy:  0.9600
