In [62]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression,LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score, f1_score

In [45]:
# Load the raw eye-tracking table; only these four columns are read.
# NOTE(review): absolute local path - consider a configurable data directory.
df_raw = pd.read_csv('/Users/apple/Desktop/deep_eye/dataset/input_data.csv',
                     usecols=['participant_ID', 'Q_ID', 'text', 'fixation1'])

# Build one row per (participant, question) holding that question's
# `fixation1` values in file order. The rows are gathered in a plain list and
# converted to a DataFrame once, instead of calling pd.concat inside the loop,
# which re-copies the accumulated frame on every iteration.
fixation_rows = []
for PARTICIPANT_ID in range(1, 61):  # participant IDs 1..60
    df_csv = df_raw[df_raw['participant_ID'] == PARTICIPANT_ID].drop(['participant_ID'], axis=1)

    # Questions are emitted in ascending Q_ID order within each participant.
    q_index_list = sorted(set(df_csv['Q_ID']))
    for q in q_index_list:
        df_q = df_csv[df_csv['Q_ID'] == q]
        fixation_rows.append(list(df_q['fixation1']))

# Sequences differ in length; the constructor pads the shorter rows with NaN
# and the result already carries a fresh 0..n-1 row index.
total_fixation_df = pd.DataFrame(fixation_rows)
total_fixation_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,0,0,1,1,0,1,0,0,0,1,...,,,,,,,,,,
1,0,0,1,1,1,0,0,0,1,0,...,,,,,,,,,,
2,0,0,0,0,1,1,0,1,0,1,...,,,,,,,,,,
3,0,0,1,0,0,1,1,1,1,0,...,,,,,,,,,,
4,0,0,0,0,0,1,1,0,1,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2815,1,1,1,0,0,1,1,0,0,0,...,,,,,,,,,,
2816,0,1,1,1,1,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2817,0,1,0,0,0,1,0,0,0,0,...,,,,,,,,,,
2818,0,0,0,1,1,0,1,0,0,0,...,,,,,,,,,,


In [48]:
# Load both answer sheets; `.iloc[:, 1:]` discards the first column of each
# (presumably an index/ID column - TODO confirm against the CSV files).
# NOTE(review): absolute local paths - consider a configurable data directory.
A_answer = pd.read_csv("/Users/apple/Desktop/deep_eye/dataset/answer/A_P1_answer.csv").iloc[:, 1:]
C_answer = pd.read_csv("/Users/apple/Desktop/deep_eye/dataset/answer/C_P1_answer.csv").iloc[:, 1:]
AC_answer = pd.concat([A_answer, C_answer], axis=0).reset_index(drop=True)

# Flatten the answer table row by row into one long 'ans' column
# (row 0's values first, then row 1's, ...). A single row-major ravel replaces
# the previous per-row pd.concat loop, which re-copied the growing frame on
# every iteration. Missing values are kept as NaN, not dropped.
total_ans_df = pd.DataFrame({'ans': AC_answer.to_numpy().ravel()})
total_ans_df.shape

(2820, 1)

In [50]:
# Place the answer column to the right of the fixation sequences; rows are
# matched on their index labels.
total_df = pd.concat(objs=[total_fixation_df, total_ans_df], axis='columns')
total_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,ans
0,0,0,1,1,0,1,0,0,0,1,...,,,,,,,,,,0.0
1,0,0,1,1,1,0,0,0,1,0,...,,,,,,,,,,1.0
2,0,0,0,0,1,1,0,1,0,1,...,,,,,,,,,,0.0
3,0,0,1,0,0,1,1,1,1,0,...,,,,,,,,,,1.0
4,0,0,0,0,0,1,1,0,1,0,...,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2815,1,1,1,0,0,1,1,0,0,0,...,,,,,,,,,,1.0
2816,0,1,1,1,1,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
2817,0,1,0,0,0,1,0,0,0,0,...,,,,,,,,,,1.0
2818,0,0,0,1,1,0,1,0,0,0,...,,,,,,,,,,1.0


---
# Machine learning

In [58]:
# Features are the first 20 columns of total_df; the target is its last column.
use_x_df = total_df.iloc[:, :20]
y_df = total_df.iloc[:, -1]
use_df = pd.concat([use_x_df, y_df], axis='columns')

# Keep only rows where every selected feature and the target are present,
# then renumber the rows.
df = use_df.dropna().reset_index(drop=True)

# Split the cleaned frame back into features (all but last column) and target.
x_df, y_df = df.iloc[:, :-1], df.iloc[:, -1]
print(df.shape)

# Hold-out split with a fixed seed (default test fraction).
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, random_state=404)
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(2160, 21)
(1620, 20)
(540, 20)
(1620,)
(540,)


In [59]:
# Baseline - comparing model accuracy using all features across classifiers.
# The tree-based models are seeded: without a random_state they are
# re-randomised on every fit, so the score table changed between runs.
classifiers = [
    DecisionTreeClassifier(random_state=404),
    RandomForestClassifier(random_state=404),
    KNeighborsClassifier(),
    SVC(kernel="linear"),
    GaussianNB(),
    LogisticRegression()
    ]


# Cross-validated accuracy: mean of a 5-fold CV over the full cleaned data set
# (x_df / y_df, i.e. train and test rows together). The `*_train` names are
# kept for the plotting code below, but this is not a training-set accuracy.
algo_train = []
scores_train = []
for clf in classifiers:
    algo_train.append(clf.__class__.__name__)
    # cross_val_score fits clones, so the instances in `classifiers` stay unfitted here.
    scores_train.append(cross_val_score(clf, x_df, y_df.astype('int'), cv=5).mean())
score_df_Train = pd.DataFrame({'Algorithm': algo_train, 'Score': scores_train}).set_index('Algorithm')


# Test accuracy: fit on the training split, evaluate on the hold-out split.
algo_test = []
scores_test = []

for clf in classifiers:
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    algo_test.append(clf.__class__.__name__)
    scores_test.append(accuracy_score(y_test, y_pred))
score_df_Test  = pd.DataFrame({'Algorithm': algo_test, 'Score': scores_test}).set_index('Algorithm')

# Disabled bar plot comparing the two score sets (kept for reference).
# # Bar plot between Train and Test Accuracy
# fig = plt.figure(figsize=(10,7)) # Create matplotlib figure
# ax = fig.add_subplot(111) # Create matplotlib axes
# ax2 = ax.twinx() # Create another axes that shares the same x-axis as a
# width = .4

# score_df_Train.Score.plot(kind='bar',color='green',ax=ax,width=width, position=0)
# score_df_Test.Score.plot(kind='bar',color='red', ax=ax2,width = width,position=1)
# ax.grid(None, axis='both')
# ax2.grid(None)
# ax.set_ylabel('Train')
# ax2.set_ylabel('Test')
# ax.set_xlim(-1,7)

# scores_train_round = np.round(scores_train,3)
# scores_test_round = np.round(scores_test,3)
# for index,data in enumerate(scores_train_round):
#     plt.text(x=index-.4 , y =data+.02 , s=f"{data}" , fontdict=dict(fontsize=10), color='red', fontweight='bold')
# for index,data in enumerate(scores_test_round):
#     plt.text(x=index , y =data+.03 , s=f"{data}" , fontdict=dict(fontsize=10), color='green', fontweight='bold')

# plt.show()

# Table of hold-out accuracies, best model first. The table is indexed by
# score (not by algorithm name), so equal scores produce duplicate index labels.
score_df_Test_table  = pd.DataFrame({'Algorithm': algo_test, 'Score': scores_test})
result_df = score_df_Test_table.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df


Unnamed: 0_level_0,Algorithm
Score,Unnamed: 1_level_1
0.587037,GaussianNB
0.574074,SVC
0.574074,LogisticRegression
0.559259,RandomForestClassifier
0.527778,KNeighborsClassifier
0.507407,DecisionTreeClassifier


# XGBoost
https://ithelp.ithome.com.tw/articles/10268984

In [63]:
#from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def score(m, x_train, y_train, x_test, y_test, train=True):
    """Print classification metrics for a fitted model on one data split.

    Prints accuracy, precision, recall and F1 (as percentages with two
    decimals) followed by the confusion matrix. The metric functions are
    called with their default settings.

    Parameters
    ----------
    m : fitted estimator exposing ``predict``.
    x_train, y_train : training features and labels.
    x_test, y_test : hold-out features and labels.
    train : if truthy, report on the training split; otherwise report on the
        test split. (Previously a falsy value other than ``False``/``0``,
        e.g. ``None``, silently printed nothing.)

    Returns
    -------
    None. The report is written to stdout.
    """
    # Pick the split once so the metric code is not duplicated per branch.
    if train:
        split_name, features, labels = 'Train', x_train, y_train
    else:
        split_name, features, labels = 'Test', x_test, y_test

    pred = m.predict(features)
    print(f'{split_name} Result:\n')
    print(f"Accuracy Score: {accuracy_score(labels, pred)*100:.2f}%")
    print(f"Precision Score: {precision_score(labels, pred)*100:.2f}%")
    print(f"Recall Score: {recall_score(labels, pred)*100:.2f}%")
    print(f"F1 score: {f1_score(labels, pred)*100:.2f}%")
    print(f"Confusion Matrix:\n {confusion_matrix(labels, pred)}")


In [64]:
from xgboost import XGBClassifier
# Baseline XGBoost classifier with the library's default hyper-parameters,
# fitted on the training split (fit returns the estimator itself).
xg1 = XGBClassifier().fit(x_train, y_train)

# Report metrics on the hold-out split only.
score(xg1, x_train, y_train, x_test, y_test, train=False)



Test Result:

Accuracy Score: 55.56%
Precision Score: 56.04%
Recall Score: 56.04%
F1 score: 56.04%
Confusion Matrix:
 [[147 120]
 [120 153]]


## Tuning with RandomizedSearchCV

In [65]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyper-parameter sampled by the randomised search.

# 200, 400, ..., 2000 boosting rounds.
n_estimators = list(range(200, 2001, 200))

# Tree depths 10, 20, ..., 110, plus None as an extra candidate.
max_depth = [*range(10, 111, 10), None]

# Ten evenly spaced learning rates between 0.01 and 0.2, rounded to 2 decimals.
learning_rate = [round(float(rate), 2) for rate in np.linspace(0.01, 0.2, 10)]

# Column-subsampling ratios 0.1, 0.2, ..., 1.0.
colsample_bytree = [round(tenth / 10, 2) for tenth in range(1, 11)]

random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    colsample_bytree=colsample_bytree,
)
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.12, 0.14, 0.16, 0.18, 0.2],
 'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

In [66]:
# Seeded base estimator whose hyper-parameters are searched.
xg4 = XGBClassifier(random_state=42)

# Randomised search: 100 combinations sampled from random_grid, each scored
# with 3-fold cross validation, running on all available cores.
search_settings = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
xg_random = RandomizedSearchCV(estimator=xg4,
                               param_distributions=random_grid,
                               **search_settings)

xg_random.fit(x_train, y_train)
xg_random.best_params_
# Author's timing note (presumably the wall time of this search): 16min 51sec

Fitting 3 folds for each of 100 candidates, totalling 300 fits






{'n_estimators': 400,
 'max_depth': 50,
 'learning_rate': 0.03,
 'colsample_bytree': 0.1}

In [67]:
# Refit XGBoost with a fixed set of hyper-parameters (the values shown in the
# randomised-search output above) and report hold-out metrics.
xg5_params = {
    'colsample_bytree': 0.1,
    'learning_rate': 0.03,
    'max_depth': 50,
    'n_estimators': 400,
}
xg5 = XGBClassifier(**xg5_params).fit(x_train, y_train)
score(xg5, x_train, y_train, x_test, y_test, train=False)

Test Result:

Accuracy Score: 60.74%
Precision Score: 62.25%
Recall Score: 56.78%
F1 score: 59.39%
Confusion Matrix:
 [[173  94]
 [118 155]]


## Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive grid: 3 * 3 * 3 * 2 = 54 hyper-parameter combinations.
params = {'max_depth': [3, 6, 10],
          'learning_rate': [0.01, 0.05, 0.1],
          'n_estimators': [100, 500, 1000],
          'colsample_bytree': [0.3, 0.7]}

xg2 = XGBClassifier(random_state=1)

# Score candidates with classification accuracy. The previous
# 'neg_mean_squared_error' is a regression metric; accuracy is the matching
# metric for a classifier and is what the rest of the notebook reports.
clf = GridSearchCV(estimator=xg2,
                   param_grid=params,
                   scoring='accuracy',
                   verbose=1)
clf.fit(x_train, y_train)

print("Best parameters:", clf.best_params_)


In [None]:
# Refit XGBoost with a fixed set of hyper-parameters and report hold-out metrics.
# NOTE(review): these values are presumably the grid-search result - confirm
# against clf.best_params_, since that cell has no saved output.
xg3_params = {
    'colsample_bytree': 0.3,
    'learning_rate': 0.01,
    'max_depth': 3,
    'n_estimators': 1000,
}
xg3 = XGBClassifier(**xg3_params).fit(x_train, y_train)
score(xg3, x_train, y_train, x_test, y_test, train=False)
