# Program Assignment #2 - 
# Decision Tree & Random Forest & KNN & PCA
---

## Name: 李勝維
## Student ID: 0711239

---

In [500]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(0)

# Load data

In [501]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Load the semicolon-separated student table as a raw (object-dtype) array.
raw_data = pd.read_csv("data/student-mat.csv", sep=";").values

# Shuffling is currently disabled, so rows keep their file order.
# np.random.shuffle(raw_data)

# The last column is the target grade; every other column is a feature.
# The target is cast to int so the binning below works on a numeric array
# (assumes the grade column holds whole numbers - confirm against the CSV).
x, y = raw_data[..., :-1], raw_data[..., -1].astype(int)
Categorical_Features = [0, 1, 3, 4, 5, 8, 9, 10, 11, 15, 16, 17, 18, 19, 20, 21, 22]
Numerical_Features = [2, 6, 7, 12, 13, 14, 23, 24, 25, 26, 27, 28, 29, 30, 31]

# One-hot encode the categorical columns as a dense array.
# scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed the
# old keyword, so try the new spelling first and fall back for older releases.
try:
    x_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    x_encoder = OneHotEncoder(sparse=False)
x_cat = x_encoder.fit_transform(x[..., Categorical_Features])
# Cast the numeric columns so the stacked matrix is float, not object dtype.
x_num = x[..., Numerical_Features].astype(float)
x = np.hstack((x_cat, x_num))

# Binary target: 0 if G3 < 10, 1 otherwise.
y_binary = np.digitize(y, [10])
# 5-level target. With decreasing bins, np.digitize returns i when
# bins[i-1] > y >= bins[i], giving:
#   1 -> 16..20, 2 -> 14..15, 3 -> 12..13, 4 -> 10..11, 5 -> below 10
# (a grade of 21 or more would map to 0; assumes grades never reach 21).
y_5class = np.digitize(y, [21, 16, 14, 12, 10])
del y

# Principal components analysis
Fit PCA on the training set only, then apply the same projection to both the training set and the test set.

In [502]:
from sklearn.decomposition import PCA
def pca_transform(train, test, k=5):
    """Project both splits onto ``k`` principal components.

    The PCA model is fitted on ``train`` only; ``test`` is then mapped with
    that same fitted projection.

    Args:
        train: Training feature matrix used to fit the PCA.
        test: Test feature matrix transformed with the fitted PCA.
        k: Number of principal components to keep.

    Returns:
        A ``(train, test)`` tuple of the transformed matrices.
    """
    reducer = PCA(n_components=k)
    train_projected = reducer.fit_transform(train)
    return train_projected, reducer.transform(test)

# Model Construction
### Algorithm behind decision tree:  
`DecisionTreeClassifier` from `sklearn.tree` uses the CART (Classification and Regression Trees) algorithm. CART is similar to C4.5, but it also supports numerical target values (regression), and it builds binary trees by choosing, at each node, the feature and threshold that yield the largest information gain.

### Difference between K-fold cross-validation and Random Forest:  


In [503]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Single decision tree with a fixed seed.
DTclf = DecisionTreeClassifier(random_state=0)

# Random forests keyed by their number of trees.
RFclfs = {
    n_trees: RandomForestClassifier(n_estimators=n_trees, random_state=0)
    for n_trees in (4, 8, 16, 200)
}

# K-nearest-neighbour classifiers keyed by their neighbour count.
KNNclfs = {
    n_neighbors: KNeighborsClassifier(n_neighbors=n_neighbors)
    for n_neighbors in (3, 5, 7)
}

# Validation

In [504]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from tabulate import tabulate

def show_multiclass_score(y_pred, y_test, y_label):
    """Print a per-class metrics table and return the aggregate scores.

    Note the argument order: predictions come first, ground truth second.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels.
        y_label: Display names for the classes, paired positionally with the
            per-class recall/precision arrays (``zip`` stops at the shortest,
            so extra names or extra classes are silently dropped).

    Returns:
        A tuple ``(accuracy, macro-averaged recall, macro-averaged precision)``.
    """
    # Accuracy is a single overall number; it is repeated on every row.
    accuracy = accuracy_score(y_test, y_pred)
    recalls = recall_score(y_test, y_pred, average=None)
    precisions = precision_score(y_test, y_pred, average=None)

    table_header = ['Metrics', 'Accuracy', 'Recall', 'Precision']
    table_data = [
        [name, accuracy, recall, precision]
        for name, recall, precision in zip(y_label, recalls, precisions)
    ]
    print(tabulate(table_data, table_header, tablefmt='grid'))

    return (
        accuracy,
        recall_score(y_test, y_pred, average='macro'),
        precision_score(y_test, y_pred, average='macro'),
    )



def compare_PCA(clf, class_type='binary', n_splits=3):
    """Cross-validate ``clf`` with and without PCA and print both results.

    Reads the module-level feature matrix ``x`` and the targets ``y_binary``
    / ``y_5class``. ``clf`` is refitted in place on every fold.

    Args:
        clf: Classifier exposing ``fit`` and ``predict``.
        class_type: ``'binary'`` selects ``y_binary``; any other value
            selects ``y_5class``.
        n_splits: Number of K-fold splits. It is also the divisor applied to
            the pooled confusion matrix, so the printed matrix is a per-fold
            average.

    Returns:
        A two-element list ``[with_pca, without_pca]``; each element is the
        ``(accuracy, macro recall, macro precision)`` tuple returned by
        ``show_multiclass_score``.
    """
    if class_type == 'binary':
        y = y_binary
        table_header = ['Confusion matrix', 'G3 < 10', 'G3 >= 10']
    else:
        y = y_5class
        table_header = ['Confusion matrix', 'I', 'II', 'III', 'IV', 'V']

    def _evaluate(title, use_pca):
        # Run one K-fold pass, print its tables and return its scores.
        fold_preds, fold_truth = [], []
        for train_idx, test_idx in KFold(n_splits=n_splits).split(x):
            x_train, x_test = x[train_idx], x[test_idx]
            if use_pca:
                # PCA is fitted inside the fold, on the training part only.
                x_train, x_test = pca_transform(x_train, x_test)
            clf.fit(x_train, y[train_idx])
            fold_preds.append(clf.predict(x_test))
            fold_truth.append(y[test_idx])
        # Pool the out-of-fold predictions so every sample is scored once.
        y_pred = np.concatenate(fold_preds)
        y_test = np.concatenate(fold_truth)

        c_m = confusion_matrix(y_test, y_pred) / n_splits
        table_data = [
            [table_header[idx + 1], *row] for idx, row in enumerate(c_m)
        ]
        print(title)
        print(tabulate(table_data, table_header, tablefmt='grid'))
        return show_multiclass_score(y_pred, y_test, table_header[1:])

    ret = [
        _evaluate("1. With PCA:", True),
        _evaluate("2. Without PCA:", False),
    ]
    print()

    return ret

def compare_K(clf_dict, class_type='else', n_splits=3):
    """Cross-validate each classifier in ``clf_dict`` and print its results.

    Reads the module-level feature matrix ``x`` and the targets ``y_binary``
    / ``y_5class``. Each classifier is refitted in place on every fold.

    Args:
        clf_dict: Mapping from a setting value ``K`` (shown in the output) to
            a classifier exposing ``fit`` and ``predict``.
        class_type: ``'binary'`` selects ``y_binary``; any other value,
            including the default, selects ``y_5class``.
        n_splits: Number of K-fold splits. It is also the divisor applied to
            the pooled confusion matrix, so the printed matrix is a per-fold
            average.

    Returns:
        A list with one ``(accuracy, macro recall, macro precision)`` tuple
        per classifier, in the iteration order of ``clf_dict``.
    """
    # The target and header do not depend on the classifier: pick them once.
    if class_type == 'binary':
        y = y_binary
        table_header = ['Confusion matrix', 'G3 < 10', 'G3 >= 10']
    else:
        y = y_5class
        table_header = ['Confusion matrix', 'I', 'II', 'III', 'IV', 'V']

    ret = []
    for K, clf in clf_dict.items():
        fold_preds, fold_truth = [], []
        for train_idx, test_idx in KFold(n_splits=n_splits).split(x):
            clf.fit(x[train_idx], y[train_idx])
            fold_preds.append(clf.predict(x[test_idx]))
            fold_truth.append(y[test_idx])
        # Pool the out-of-fold predictions so every sample is scored once.
        y_pred = np.concatenate(fold_preds)
        y_test = np.concatenate(fold_truth)

        c_m = confusion_matrix(y_test, y_pred) / n_splits
        table_data = [
            [table_header[idx + 1], *row] for idx, row in enumerate(c_m)
        ]
        print(f"for K={K}:")
        print(tabulate(table_data, table_header, tablefmt='grid'))
        ret.append(show_multiclass_score(y_pred, y_test, table_header[1:]))
        print()
    return ret

# Evaluate every random-forest size; compare_K's default class_type is not
# 'binary', so this run uses the 5-level labels (y_5class).
compare_K(RFclfs)
    

for K=4:
+--------------------+----------+----------+----------+-----------+-----------+
| Confusion matrix   |        I |       II |      III |        IV |         V |
| I                  | 7.66667  | 4.66667  | 0.666667 |  0.333333 |  0        |
+--------------------+----------+----------+----------+-----------+-----------+
| II                 | 4.33333  | 9        | 4.66667  |  1.66667  |  0.333333 |
+--------------------+----------+----------+----------+-----------+-----------+
| III                | 2.66667  | 3.33333  | 8.33333  |  5.66667  |  0.666667 |
+--------------------+----------+----------+----------+-----------+-----------+
| IV                 | 0.333333 | 2.66667  | 4.33333  | 18.6667   |  8.33333  |
+--------------------+----------+----------+----------+-----------+-----------+
| V                  | 0.333333 | 0.333333 | 0.666667 | 13        | 29        |
+--------------------+----------+----------+----------+-----------+-----------+
+-----------+------------+-----

[(0.5518987341772152, 0.5282291792141464, 0.5255053163490894),
 (0.5721518987341773, 0.5371101849398123, 0.5550141336341593),
 (0.6, 0.5450073879560255, 0.5667028459645358),
 (0.6632911392405063, 0.6218769423498519, 0.6550097522618475)]

# Results Part 1: comparison with and without PCA

In [505]:
# Compare each model with / without PCA, first on the binary task and then
# on the 5-level task, printing one summary table per task.
table_header = ['Model setting', 'Accuracy', 'Recall (average)', 'Precision (average)']

# (row label prefix, name used in the progress message, classifier)
pca_models = (
    ('DT', 'DT', DTclf),
    ('RF', 'Random Forest', RFclfs[4]),
    ('KNN', 'KNN', KNNclfs[5]),
)
# (banner title, class_type passed to compare_PCA)
pca_tasks = (
    ('Binary Classification', 'binary'),
    ('5-Level Classification', '5-class'),
)

for task_title, task_type in pca_tasks:
    print(f"\n{'#'*40} {task_title} {'#'*40}\n")
    table_data = []
    for row_prefix, display_name, model in pca_models:
        print(f"Result for {display_name} comparing with and without PCA:")
        rst = compare_PCA(model, task_type)
        table_data.append([f'{row_prefix} with PCA', *rst[0]])
        table_data.append([f'{row_prefix} without PCA', *rst[1]])
    print()
    print("Final results:")
    print(tabulate(table_data, table_header, tablefmt='grid'))


######################################## Binary Classification ########################################

Result for DT comparing with and without PCA:
1. With PCA:
+--------------------+-----------+------------+
| Confusion matrix   |   G3 < 10 |   G3 >= 10 |
| G3 < 10            |   35.6667 |    7.66667 |
+--------------------+-----------+------------+
| G3 >= 10           |   10.3333 |   78       |
+--------------------+-----------+------------+
+-----------+------------+----------+-------------+
| Metrics   |   Accuracy |   Recall |   Precision |
| G3 < 10   |   0.863291 | 0.823077 |    0.775362 |
+-----------+------------+----------+-------------+
| G3 >= 10  |   0.863291 | 0.883019 |    0.910506 |
+-----------+------------+----------+-------------+
2. Without PCA:
+--------------------+-----------+------------+
| Confusion matrix   |   G3 < 10 |   G3 >= 10 |
| G3 < 10            |  34.6667  |    8.66667 |
+--------------------+-----------+------------+
| G3 >= 10           |   7.

# Results Part 2: comparison of different numbers of trees for random forest