---
#      MODEL BUILDING PROCESSING
---

In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sn
import matplotlib.pyplot as plt
import pickle


dataset = pd.read_csv(r'./data/coords.csv')

## Spliting Data
---


In [2]:
X = dataset.drop(columns = ["class"])
y = dataset["class"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

## Feature Scaling

In [3]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
# reassingn index and columns
X_train2.columns = X_train.columns.values
X_train2.index = X_train.index.values
X_train_scaled = X_train2

X_test2 = pd.DataFrame(sc_X.transform(X_test))
X_test2.columns = X_test.columns.values
X_test2.index = X_test.index.values
X_test_scaled = X_test2

## Function to check perfomance

In [4]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

def perfomance_check(name: str):
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label='positive',
                                           average='micro')
    rec = recall_score(y_test, y_pred, pos_label='positive',
                                           average='micro')
    f1 = f1_score(y_test, y_pred, pos_label='positive',
                                           average='micro')

    model_results = pd.DataFrame([[name, acc, prec, rec, f1]],
                   columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    return results.append(model_results, ignore_index = True)

results = pd.DataFrame(columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

---
# CLASSIFICATIONS
---

## Logical Regression

In [5]:
from sklearn.linear_model import LogisticRegression
LR_classifier = LogisticRegression(random_state = 0, penalty = 'l1', solver='saga')
LR_classifier.fit(X_train, y_train.values.ravel())

y_pred = LR_classifier.predict(X_test)

results = perfomance_check('Linear Regression (Lasso)')



## KNN (K-Nearest Neighbours)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNN_classifier.fit(X_train_scaled, y_train.values.ravel())

y_pred = KNN_classifier.predict(X_test_scaled)

results = perfomance_check('K-Nearest Neighbours')



## SVM

In [7]:
from sklearn.svm import SVC
SVML_classifier = SVC(random_state = 0, kernel = 'linear')
SVML_classifier.fit(X_train, y_train.values.ravel())

y_pred = SVML_classifier.predict(X_test)

results = perfomance_check('SVM (Linear)')



## Kernal SVM

In [8]:
from sklearn.svm import SVC
K_SVM_classifier = SVC(random_state = 0, kernel = 'rbf')
K_SVM_classifier.fit(X_train, y_train.values.ravel())

y_pred = K_SVM_classifier.predict(X_test)

results = perfomance_check('SVM (RBF)')



## Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
NB_classifier = GaussianNB()
NB_classifier.fit(X_train, y_train.values.ravel())

y_pred = NB_classifier.predict(X_test)

results = perfomance_check('Naive Bayes')



## Decision Tree Classification

In [10]:
from sklearn.tree import DecisionTreeClassifier
DTC_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DTC_classifier.fit(X_train, y_train.values.ravel())

y_pred = DTC_classifier.predict(X_test)

results = perfomance_check('Decision Tree Classification')



## Random Forest Classification

In [11]:
from sklearn.ensemble import RandomForestClassifier
RF_classifier = RandomForestClassifier(random_state = 0, n_estimators = 100,
                                    criterion = 'entropy')
RF_classifier.fit(X_train, y_train.values.ravel())

y_pred = RF_classifier.predict(X_test)

results = perfomance_check('Random Forest (n=100)')



## XGBoost Classifier

In [12]:
from xgboost import XGBClassifier
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train.values.ravel())

y_pred = xgb_classifier.predict(X_test)

results = perfomance_check('XGBoost ')



## CatBoost Classifier

from catboost import CatBoostClassifier
CB_classifier = CatBoostClassifier()
CB_classifier.fit(X_train, y_train.values.ravel())

y_pred = CB_classifier.predict(X_test)

results = perfomance_check('CatBoost')

In [15]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.96124,0.96124,0.96124,0.96124
1,K-Nearest Neighbours,1.0,1.0,1.0,1.0
2,SVM (Linear),1.0,1.0,1.0,1.0
3,SVM (RBF),0.922481,0.922481,0.922481,0.922481
4,Naive Bayes,1.0,1.0,1.0,1.0
5,Decision Tree Classification,1.0,1.0,1.0,1.0
6,Random Forest (n=100),1.0,1.0,1.0,1.0
7,XGBoost,1.0,1.0,1.0,1.0
8,K-Nearest Neighbours,1.0,1.0,1.0,1.0


## Save Model

In [21]:
with open('./saved_model/body_language.pkl', 'wb') as f:
    pickle.dump(RF_classifier, f)

with open('./saved_model/body_language.pkl', 'rb') as f:
    model = pickle.load(f)

---
# MODEL SELECTION
---

### K-fold Cross Validation
---
* Conside which model perform the best

In [18]:
from sklearn.model_selection import cross_val_score
model_lst = [LR_classifier,    # 0
             KNN_classifier,   # 1
             SVML_classifier,  # 2
             K_SVM_classifier, # 3
             NB_classifier,    # 4
             DTC_classifier,   # 5
             RF_classifier,    # 6
             xgb_classifier]   # 7
            #  CB_classifier]    # 8
msg = []
for i in range(len(model_lst)):
    accuracies = cross_val_score(estimator =model_lst[i] , X = X_train, y = y_train, cv = 10)
    msg.append(f"Model Accuracy {i}: %0.3f (+/- %0.3f)" % (accuracies.mean(), accuracies.std() * 2))
    
for i in msg:
    print(i)    

Model Accuracy 0: 0.932 (+/- 0.064)
Model Accuracy 1: 0.982 (+/- 0.041)
Model Accuracy 2: 1.000 (+/- 0.000)
Model Accuracy 3: 0.854 (+/- 0.069)
Model Accuracy 4: 0.982 (+/- 0.034)
Model Accuracy 5: 0.987 (+/- 0.035)
Model Accuracy 6: 1.000 (+/- 0.000)
Model Accuracy 7: 0.990 (+/- 0.025)


In [19]:
for i in msg:
    print(i)

Model Accuracy 0: 0.932 (+/- 0.064)
Model Accuracy 1: 0.982 (+/- 0.041)
Model Accuracy 2: 1.000 (+/- 0.000)
Model Accuracy 3: 0.854 (+/- 0.069)
Model Accuracy 4: 0.982 (+/- 0.034)
Model Accuracy 5: 0.987 (+/- 0.035)
Model Accuracy 6: 1.000 (+/- 0.000)
Model Accuracy 7: 0.990 (+/- 0.025)


In [None]:
# This is the script to compare result with prediction
comparision = pd.DataFrame(columns = ['Result', 'Prediction'])
comparision.Prediction = pd.Series(y_pred)
comparision.Result = y_test.values

## Parameter Tuning
---
Base on the critera we can choose what type of parameter tuning algorithm to use.
#### Grid Search: Entropy 
* Meant to maximize the information content (in random forest we maximize the info. at every split)
* pip install joblib
* update joblib if there are some problem in GridSearch

#### Grid Search: Gini
* Meant to minimize the probability of mislabelling

### Input parameters
---
* Input different parameter based on the "Model" that you tring to tune
* Look into documentaion of specific algorithm for avaiable parameters
* Based on the best found parametes, slim down the range of the setting and test again
. 
         ***1st setting***
         {"max_depth": [3, None], 
         "max_features": [1, 5, 10], 
         'min_samples_split': [2, 5, 10],
         'min_samples_leaf': [1, 5, 10], 
         "bootstrap": [True, False],  
         "criterion": ["entropy"]}
.        
          ***Slimed down version***
          {"max_depth": [None],
              "max_features": [3, 5, 7],
              'min_samples_split': [8, 10, 12],
              'min_samples_leaf': [1, 2, 3],
              "bootstrap": [True],
              "criterion": ["entropy"]}
      

In [None]:
parameters = {"max_depth": [3, None],
              "max_features": [1, 5, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10],
              "bootstrap": [True, False],
              "criterion": ["entropy"]}

### Grid Search: Entropy 

In [None]:
from sklearn.model_selection import GridSearchCV
grid_search_entropy = GridSearchCV(estimator = RF_classifier, # Make sure classifier points to the RF model
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

t0 = time.time()    # RECORD THE DURATION ALGORITHM TOOK
grid_search = grid_search.fit(X_train, y_train) 
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

### Grid Search: Gini 

In [None]:
parameters = {"max_depth": [3, None],
              "max_features": [1, 5, 10],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 5, 10],
              "bootstrap": [True, False],
              "criterion": ["gini"]}

from sklearn.model_selection import GridSearchCV
grid_search_gini = GridSearchCV(estimator = classifier, # Make sure classifier points to the RF model
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)

t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

## Save Model

In [None]:
with open('body_language.pkl', 'wb') as f:
    pickle.dump(grid_search_entropy, f)

with open('body_language.pkl', 'rb') as f:
    grid_search_entropy = pickle.load(f)

### Predicting Test Set
---
* Base on the best grid result test model, using its specific grid_serach parameter

In [None]:
# Use Correct grid_search 'entropy' or 'gini'
y_pred = grid_search_entropy.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,y_pred))

cm = confusion_matrix(y_test, y_pred)
sn.set(font_scale=2.8)
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize = (50,40))
sn.heatmap(corr, annot=True,mask=mask);

### Formatting Final Results

In [None]:
final_results = pd.concat([y_test, users], axis = 1).dropna()
final_results['predictions'] = y_pred
final_results = final_results[['entry_id', 'e_signed', 'predictions']]