# The Hong Kong University of Science and Technology
# MSBD5002: Data Mining and Knowledge Discovery
# Fall 2021 Assignment 2

### Student name: Mak Chun Wai, Michael
### HKUST account: cwmakah
### Student ID: 20801333

#### 1. Comparison of Classifiers

#### Decision Tree

In [1]:
# import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time

In [2]:
# Read the semicolon-separated training CSV, then preview its first rows
train_csv_path = './dataset/winequality_train.csv'
df_train = pd.read_csv(train_csv_path, sep=';')
df_train.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.4,0.27,0.49,7.3,0.046,53.0,206.0,0.9956,3.24,0.43,9.2,6
2,8.0,0.45,0.28,10.8,0.051,25.0,157.0,0.9957,3.06,0.47,11.4,7
3,7.5,0.38,0.29,4.9,0.021,38.0,113.0,0.99026,3.08,0.48,13.0,7
4,6.9,0.35,0.55,11.95,0.038,22.0,111.0,0.99687,3.11,0.29,9.7,5


In [3]:
# Every column except the last is a feature; the last column is the target.
# The target is kept as a one-column DataFrame (not a Series).
n_train_features = df_train.shape[1] - 1
X_train = df_train.iloc[:, :n_train_features]
y_train = df_train.iloc[:, n_train_features:]

In [4]:
# Read the semicolon-separated test CSV, then preview its first rows
test_csv_path = './dataset/winequality_test.csv'
df_test = pd.read_csv(test_csv_path, sep=';')
df_test.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,7.4,0.21,0.8,12.3,0.038,77.0,183.0,0.99778,2.95,0.48,9.0,5
2,6.8,0.18,0.28,8.7,0.047,52.0,242.0,0.9952,3.22,0.53,10.5,6
3,5.5,0.15,0.32,14.0,0.031,16.0,99.0,0.99437,3.26,0.38,11.5,8
4,7.1,0.26,0.3,2.0,0.031,13.0,128.0,0.9917,3.19,0.49,11.4,5


In [5]:
# Every column except the last is a feature; the last column is the target.
# The target is kept as a one-column DataFrame (not a Series).
n_test_features = df_test.shape[1] - 1
X_test = df_test.iloc[:, :n_test_features]
y_test = df_test.iloc[:, n_test_features:]

In [6]:
# Hyper-parameter grid for the decision trees: split criteria x maximum depths
criterion = [
    'entropy',
    'gini',
]
depth = list(range(5, 21, 5))  # 5, 10, 15, 20

# Column layout shared by the result tables, and an empty table to start from
colnames = [
    'Classifier',
    'Accuracy',
    'Precision',
    'Recall',
    'F1',
    'Training Time',
]
summary = pd.DataFrame(columns=colnames)

In [7]:
# Train one decision tree per (criterion, max_depth) pair and record its
# test-set metrics and fit time in the summary table.
#
# Rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# rebuilding the table from scratch keeps this cell idempotent (re-running it
# no longer stacks duplicate rows onto an existing `summary`).
tree_records = []
for c in criterion:
    for d in depth:
        tree = DecisionTreeClassifier(criterion=c, max_depth=d, random_state=0)
        # perf_counter is a monotonic clock, so the measured duration cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        tree.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time
        y_pred = tree.predict(X_test)
        # Macro averages are computed only over the labels the model actually
        # predicted. NOTE(review): classes that occur in y_test but are never
        # predicted are therefore left out of the average - confirm this is
        # the intended definition for the reported scores.
        predicted_labels = np.unique(y_pred)
        # Classifier name: criterion_tree depth
        tree_records.append({
            'Classifier': '{}_{}'.format(c, d),
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, average='macro', labels=predicted_labels),
            'Recall': recall_score(y_test, y_pred, average='macro', labels=predicted_labels),
            'F1': f1_score(y_test, y_pred, average='macro', labels=predicted_labels),
            'Training Time': training_time,
        })
summary = pd.DataFrame(tree_records, columns=colnames)

  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Display the decision-tree results table (one row per criterion/depth setting)
summary

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1,Training Time
0,entropy_5,0.514781,0.319455,0.255394,0.25857,0.039497
1,entropy_10,0.528033,0.350273,0.318319,0.329816,0.052035
2,entropy_15,0.56371,0.341654,0.323363,0.330073,0.067042
3,entropy_20,0.577982,0.394065,0.387627,0.390202,0.069043
4,gini_5,0.528033,0.551574,0.405376,0.41789,0.016009
5,gini_10,0.552497,0.430434,0.355679,0.373582,0.029019
6,gini_15,0.576962,0.398792,0.379772,0.387373,0.037025
7,gini_20,0.587156,0.422511,0.404007,0.411709,0.040028


#### KNN, Random Forest

In [9]:
# Fit the KNN and random-forest classifiers, timing each fit and scoring the
# predictions on the test set; results go into `summary2` with the same columns
# as the decision-tree table.
KNN = KNeighborsClassifier(n_neighbors=10)
Random_Forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=3, n_jobs=2)

# Rows are collected in a list and the DataFrame is built once afterwards:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
baseline_records = []
for classifier in [KNN, Random_Forest]:
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    # The target is passed as a 1-D array taken from the 'quality' column.
    classifier.fit(X_train, y_train['quality'].values)
    training_time = time.perf_counter() - start_time
    y_pred = classifier.predict(X_test)
    # Macro averages are computed only over the labels the model actually
    # predicted. NOTE(review): classes that occur in y_test but are never
    # predicted are therefore left out of the average - confirm this is the
    # intended definition for the reported scores.
    predicted_labels = np.unique(y_pred)
    baseline_records.append({
        # Text of the estimator's repr that precedes "Classifier"
        'Classifier': str(classifier).split("Classifier")[0],
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro', labels=predicted_labels),
        'Recall': recall_score(y_test, y_pred, average='macro', labels=predicted_labels),
        'F1': f1_score(y_test, y_pred, average='macro', labels=predicted_labels),
        'Training Time': training_time,
    })
summary2 = pd.DataFrame(baseline_records, columns=colnames)
summary2

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1,Training Time
0,KNeighbors,0.469929,0.399707,0.286781,0.290635,0.024022
1,RandomForest,0.653415,0.661161,0.495355,0.534972,0.115612
