###  <div align="right"> Name: Vipada Siripatanadilok No. 6020422001,Prasit Chulanutrakul No. 6120412007 </div> 
###  <div align="right"> ชื่อ:วิภาดา สิริพัฒนดิลก No. 6020422001, ประสิทธิ์ จุฬานุตรกุล No. 6120412007 </div> 

# Import Required Packages

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plotting theme.
sns.set()
# Shared seed handed to the estimators' random_state/seed arguments below.
random_state = 77

import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# PART I: Prepare and Visualize the Data

## 1. Load Data

# PART II: Classification
    - k-nearest neighbors
    - Naive Bayes
    - Decision Tree
    - Support Vector Machine
    - Logistic Regression
    - Random Forest
    - Ada Boost
    - Neural Network

## 1. Load Train and Test Set
แบ่งข้อมูลเป็น Train / Test Set

In [2]:
# Pickle file holding the pre-built train/test split.
filename = 'drug_infection_disease_train_test_set_50.pkl'
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this file if it comes from a trusted source.
with open(filename,'rb') as f:
    train_test_set = pickle.load(f)
# The unpickled object is unpacked as a 4-item sequence in this exact order.
X_train, X_test, y_train, y_test = train_test_set

## 2. Create Model Pipeline
สร้าง Function สำหรับสร้าง Pipeline เพื่อให้ทำการ Scale ข้อมูลก่อนเข้าโมเดลทั้ง Train และ Test โดยอัตโนมัติ เนื่องจากบางโมเดลเป็น Distance-Based จำเป็นต้องทำการ Scale Data เพื่อให้แต่ละตัวแปรส่งผลอย่างเท่าเทียม และยังทำให้ ANN Converge ได้เร็วกว่า (หมายเหตุ: `create_pipeline` ด้านล่างใส่เฉพาะขั้นตอน Model โดยยังไม่ได้เพิ่ม `MinMaxScaler` เข้าไปใน Pipeline)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
def create_pipeline(model_object, scaler=None):
    """Wrap an estimator in a scikit-learn ``Pipeline``.

    Parameters
    ----------
    model_object : estimator
        Final step of the pipeline, registered under the step name ``'Model'``.
    scaler : transformer, optional
        Preprocessing step (for example ``MinMaxScaler()``) inserted before the
        model under the step name ``'Scaler'``. With the default ``None`` the
        pipeline contains only the model step, which is the behaviour this
        function has always had.

    Returns
    -------
    Pipeline
        Pipeline whose last step is ``model_object``.
    """
    steps = []
    if scaler is not None:
        # Scaling lives inside the pipeline so it is fitted on the data passed
        # to fit() and then re-applied unchanged by predict().
        steps.append(('Scaler', scaler))
    steps.append(('Model', model_object))
    return Pipeline(steps)

## 3. Train Model (Train with Default Hyperparameters) 
ทำการ Train Model ด้วยโมเดลในรายการข้างล่าง โดยไม่ปรับ Hyperparameters
    - k-nearest neighbors
    - Naive Bayes
    - Decision Tree
    - Support Vector Machine
    - Logistic Regression
    - Random Forest
    - Ada Boost
    - Neural Network

In [4]:
# Estimators to compare, keyed by the display name used in printed reports and
# in the 'MODEL' column of the summary tables. Only the seed and n_jobs
# arguments shown here are set explicitly.
# The insertion order of this dict is the order in which the pipelines are
# created, trained and reported by the loops that iterate over it.
model_list = {
    'k-nearest neighbors':KNeighborsClassifier(n_jobs=-1),
    'Naive Bayes':GaussianNB(),
    'Logistic Regression':LogisticRegression(random_state=random_state, n_jobs=-1),
    'LightGBM':LGBMClassifier(random_state=random_state, n_jobs=-1),
    # NOTE(review): this entry passes `seed` while the others pass
    # `random_state` -- confirm the keyword is honoured by the installed xgboost.
    "XGBoost" : XGBClassifier(seed = random_state, n_jobs=-1),
    'Random Forest':RandomForestClassifier(random_state=random_state, n_jobs=-1),
    'Ada Boost':AdaBoostClassifier(random_state=random_state),
    'Decision Tree':DecisionTreeClassifier(random_state=random_state),
    'Support Vector Machine':svm.SVC(random_state=random_state)
}

In [5]:
# Build one pipeline per entry of model_list, keyed by the same display name.
pipeline_model_list = {}
for model, estimator in model_list.items():
    print(f'Created pipeline for {model}')
    pipeline_model_list[model] = create_pipeline(estimator)

Created pipeline for k-nearest neighbors
Created pipeline for Naive Bayes
Created pipeline for Logistic Regression
Created pipeline for LightGBM
Created pipeline for XGBoost
Created pipeline for Random Forest
Created pipeline for Ada Boost
Created pipeline for Decision Tree
Created pipeline for Support Vector Machine


In [None]:
# Fit every pipeline on the training split, reporting each one as it finishes.
for model in model_list:
    current_pipeline = pipeline_model_list[model]
    current_pipeline.fit(X_train, y_train)
    print(f'Trained model for {model}')

## 4. Model Validation with Test Set
ทำการทดสอบโมเดลที่ Train ในข้อ 3 ด้วย Test Set

In [15]:
def evaluation(model_name, y_test, y_pred, average=None):
    """Collect accuracy and per-class precision/recall/F1 for one model.

    Parameters
    ----------
    model_name : str
        Label stored under the ``'MODEL'`` key of the result.
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    average : ignored
        Accepted for backward compatibility only. The scores are always
        computed with ``average=None`` because the result is indexed per
        class, which an averaged scalar score would not support.

    Returns
    -------
    dict
        Keys ``'MODEL'`` and ``'ACCURACY'``, followed by ``'PRECISION_i'``,
        ``'RECALL_i'`` and ``'F1_i'`` for ``i`` in
        ``range(len(set(y_test)))``, where ``i`` is the position in the
        per-class score arrays returned by scikit-learn.
    """
    result_dict = {
        # Use the argument rather than a notebook-level global so the
        # function reports the name it was actually called with.
        'MODEL': model_name,
        'ACCURACY': accuracy_score(y_test, y_pred),
    }

    # Compute each per-class score array once instead of once per class.
    precision_per_class = precision_score(y_test, y_pred, average=None)
    recall_per_class = recall_score(y_test, y_pred, average=None)
    f1_per_class = f1_score(y_test, y_pred, average=None)

    no_of_class = len(set(y_test))
    for i in range(no_of_class):
        result_dict[f'PRECISION_{i}'] = precision_per_class[i]
        result_dict[f'RECALL_{i}'] = recall_per_class[i]
        result_dict[f'F1_{i}'] = f1_per_class[i]

    return result_dict

def evaluation_cv(model_name, y_test, y_pred, average=None):
    """Collect accuracy and per-class precision/recall/F1 under ``CV_AVG_*`` keys.

    The scores are computed directly from the ``y_test`` / ``y_pred`` pair
    passed in; this function does no per-fold averaging of its own.

    Parameters
    ----------
    model_name : str
        Label stored under the ``'MODEL'`` key of the result.
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    average : ignored
        Accepted for backward compatibility only. The scores are always
        computed with ``average=None`` because the result is indexed per
        class, which an averaged scalar score would not support.

    Returns
    -------
    dict
        Keys ``'MODEL'`` and ``'CV_AVG_ACC'``, followed by
        ``'CV_AVG_PRECISION_i'``, ``'CV_AVG_RECALL_i'`` and ``'CV_AVG_F1_i'``
        for ``i`` in ``range(len(set(y_test)))``, where ``i`` is the position
        in the per-class score arrays returned by scikit-learn.
    """
    result_dict = {
        # Use the argument rather than a notebook-level global so the
        # function reports the name it was actually called with.
        'MODEL': model_name,
        'CV_AVG_ACC': accuracy_score(y_test, y_pred),
    }

    # Compute each per-class score array once instead of once per class.
    precision_per_class = precision_score(y_test, y_pred, average=None)
    recall_per_class = recall_score(y_test, y_pred, average=None)
    f1_per_class = f1_score(y_test, y_pred, average=None)

    no_of_class = len(set(y_test))
    for i in range(no_of_class):
        result_dict[f'CV_AVG_PRECISION_{i}'] = precision_per_class[i]
        result_dict[f'CV_AVG_RECALL_{i}'] = recall_per_class[i]
        result_dict[f'CV_AVG_F1_{i}'] = f1_per_class[i]

    return result_dict

In [16]:
# Score every fitted pipeline on the held-out test split, printing a report
# per model and gathering one summary row per model.
summary_result = []
for model, fitted_pipeline in pipeline_model_list.items():
    y_pred = fitted_pipeline.predict(X_test)
    summary_result.append(evaluation(model, y_test, y_pred))

    # One print call with a newline separator emits the same text as the
    # equivalent sequence of individual print calls.
    print(
        f'Validation Result for {model}',
        '\n-- Confusion Martix --',
        confusion_matrix(y_test, y_pred),
        '\n-- Classification Report --',
        classification_report(y_test, y_pred),
        '\n',
        sep='\n',
    )

summary_result_default_df = pd.DataFrame(summary_result)

Validation Result for k-nearest neighbors

-- Confusion Martix --
[[603909   1552]
 [  1862   2663]]

-- Classification Report --
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    605461
           1       0.63      0.59      0.61      4525

    accuracy                           0.99    609986
   macro avg       0.81      0.79      0.80    609986
weighted avg       0.99      0.99      0.99    609986



Validation Result for Naive Bayes

-- Confusion Martix --
[[545156  60305]
 [  1789   2736]]

-- Classification Report --
              precision    recall  f1-score   support

           0       1.00      0.90      0.95    605461
           1       0.04      0.60      0.08      4525

    accuracy                           0.90    609986
   macro avg       0.52      0.75      0.51    609986
weighted avg       0.99      0.90      0.94    609986



Validation Result for Logistic Regression

-- Confusion Martix --
[[604754    707]
 [  449

### Summary Result of Predicting Test Set (Using Default Hyperparameters)
จะได้ผลลัพธ์ของโมเดลที่ Train ในข้อ 3 ดังตารางสรุปด้านล่าง

In [17]:
# Display columns: identifiers first, then each metric for class 0 and class 1.
col = ['MODEL', 'ACCURACY'] + [
    f'{metric}_{label}'
    for metric in ('PRECISION', 'RECALL', 'F1')
    for label in (0, 1)
]
summary_result_default_df[col]

Unnamed: 0,MODEL,ACCURACY,PRECISION_0,PRECISION_1,RECALL_0,RECALL_1,F1_0,F1_1
0,k-nearest neighbors,0.994403,0.996926,0.631791,0.997437,0.588508,0.997181,0.609382
1,Naive Bayes,0.898204,0.996729,0.0434,0.900398,0.604641,0.946118,0.080987
2,Logistic Regression,0.991469,0.992619,0.038095,0.998832,0.006188,0.995716,0.010646
3,LightGBM,0.994008,0.995503,0.660044,0.998474,0.396464,0.996986,0.495375
4,XGBoost,0.995603,0.996161,0.862367,0.999422,0.484641,0.997789,0.620543
5,Random Forest,0.996436,0.996481,0.985142,0.999941,0.527514,0.998208,0.687104
6,Ada Boost,0.990723,0.992892,0.130378,0.997797,0.044199,0.995338,0.066017
7,Decision Tree,0.995303,0.996967,0.723599,0.998305,0.593591,0.997636,0.652179
8,Support Vector Machine,0.995843,0.996291,0.889237,0.999533,0.502099,0.997909,0.641808


### สรุป


## 5. Testing Model Generalization with K-Fold Cross Validation
ทำการทดสอบโมเดลด้วย 10-Fold Cross Validation ด้วยโมเดลในรายการข้างล่าง
    - k-nearest neighbors
    - Naive Bayes
    - Decision Tree
    - Support Vector Machine
    - Logistic Regression
    - Random Forest
    - Ada Boost
    - Neural Network

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# Run 10-fold cross-validation on the training split for every pipeline,
# printing a report per model and gathering one summary row per model.
summary_result = []
for model, candidate_pipeline in pipeline_model_list.items():
    # Predictions from cross_val_predict feed the report and the summary row.
    y_pred = cross_val_predict(candidate_pipeline, X_train, y_train, cv=10)
    # Per-fold accuracies are only used for their standard deviation.
    cv_score = cross_val_score(
        candidate_pipeline, X_train, y_train, cv=10, scoring='accuracy'
    )

    result_dict = evaluation_cv(model, y_train, y_pred)
    result_dict['CV_SD_ACC'] = np.std(cv_score)
    summary_result.append(result_dict)

    # One print call with a newline separator emits the same text as the
    # equivalent sequence of individual print calls.
    print(
        f'Validation Result for {model}',
        '\n-- Confusion Martix --',
        confusion_matrix(y_train, y_pred),
        '\n-- Classification Report --',
        classification_report(y_train, y_pred),
        '\n',
        sep='\n',
    )

summary_result_cv_df = pd.DataFrame(summary_result)

## Summary Result of K-Fold Cross Validation
ผลลัพธ์ของการทดสอบโมเดลด้วย 10-Fold Cross Validation แสดงดังตารางสรุปด้านล่าง

In [None]:
# Column names must match the keys actually present in summary_result_cv_df:
# evaluation_cv emits 'CV_AVG_*' keys and the cross-validation loop adds
# 'CV_SD_ACC'. The unprefixed names used before raised KeyError.
col = ['MODEL','CV_AVG_ACC','CV_SD_ACC','CV_AVG_PRECISION_0','CV_AVG_PRECISION_1',
       'CV_AVG_RECALL_0','CV_AVG_RECALL_1','CV_AVG_F1_0','CV_AVG_F1_1']
summary_result_cv_df[col]