In [1]:
import pandas as pd
import numpy as np
import re
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import nltk
from sklearn.model_selection import KFold, LeaveOneOut, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
data_train_full = pd.read_csv("../Data/data_train_full.csv")
data_train_fiturtambahan = pd.read_csv("../Data/data_train_fiturtambahan.csv")
data_train_tfidf = pd.read_csv("../Data/data_train_tfidf.csv")

In [62]:
X_train_full = data_train_full.drop(["K", "Temp", "MBTI"],axis=1)
y_train_full = data_train_full["K"]

X_train_fiturtambahan = data_train_fiturtambahan.drop(["K", "Temp"],axis=1)
y_train_fiturtambahan = data_train_fiturtambahan["K"]

X_train_tfidf = data_train_tfidf.drop(["K", "Temp", "MBTI"],axis=1)
y_train_tfidf = data_train_tfidf["K"]

In [13]:
knn = KNeighborsClassifier()
mnb = MultinomialNB()
rf = RandomForestClassifier(criterion="gini",n_jobs=1)
gb = GradientBoostingClassifier()
svm = SVC()

# Full Features

- **KNN**

In [14]:
grid_params = {
    'n_neighbors' : [i for i in range(1,42) if i % 2 == 1]
}

In [15]:
grid_knn = GridSearchCV(
    estimator = knn,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_knn.fit(X_train_full,y_train_full)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    4.6s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29, 31, 33, 35, 37, 39,
                                         41]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [16]:
print("Best Params : ",grid_knn.best_params_)

Best Params :  {'n_neighbors': 25}


In [17]:
knn_full = grid_knn.best_estimator_
knn_full.fit(X_train_full,y_train_full)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                     weights='uniform')

In [18]:
filename = "../Model/KNN_full.pkl"
pickle.dump(knn_full, open(filename, 'wb'))

- **Multinomial Naive Bayes**

In [19]:
mnb_full = mnb.fit(X_train_full,y_train_full)
mnb_full.fit(X_train_full,y_train_full)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
filename = "../Model/MNB_full.pkl"
pickle.dump(mnb_full, open(filename, 'wb'))

- **Random Forest**

In [21]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [28]:
grid_rf = GridSearchCV(
    estimator = rf,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_rf.fit(X_train_full,y_train_full)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.1s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=1,
                                              oob_score=False,
                                              random_s

In [29]:
print("Best Params : ",grid_rf.best_params_)

Best Params :  {'n_estimators': 400}


In [30]:
rf_full = grid_rf.best_estimator_
rf_full.fit(X_train_full,y_train_full)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [31]:
filename = "../Model/RF_full.pkl"
pickle.dump(rf_full, open(filename, 'wb'))

- **Gradien Boosting**

In [32]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [33]:
grid_GB = GridSearchCV(
    estimator = gb,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_GB.fit(X_train_full,y_train_full)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [35]:
print("Best Params : ",grid_GB.best_params_)

Best Params :  {'n_estimators': 400}


In [36]:
GB_full = grid_GB.best_estimator_
GB_full.fit(X_train_full,y_train_full)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=400,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [37]:
filename = "../Model/GB_full.pkl"
pickle.dump(GB_full, open(filename, 'wb'))

- **SVM**

grid_params = {
    'kernel' : ["linear","poly","sigmoid","rbf"],
    'C' : [0.1, 1, 10],
}

grid_svm = GridSearchCV(
    estimator = svm,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_svm.fit(X_train_full,y_train_full)

print("Best Params : ",grid_svm.best_params_)

svm_full = grid_svm.best_estimator_
svm_full.fit(X_train_full,y_train_full)

filename = "../Model/SVM_full.pkl"
pickle.dump(svm_full, open(filename, 'wb'))

# Fitur Tambahan

- **KNN**

In [38]:
grid_params = {
    'n_neighbors' : [i for i in range(1,42) if i % 2 == 1]
}

In [39]:
grid_knn = GridSearchCV(
    estimator = knn,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_knn.fit(X_train_fiturtambahan,y_train_fiturtambahan)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    2.2s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29, 31, 33, 35, 37, 39,
                                         41]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [40]:
print("Best Params : ",grid_knn.best_params_)

Best Params :  {'n_neighbors': 25}


In [41]:
knn_fiturtambahan = grid_knn.best_estimator_
knn_fiturtambahan.fit(X_train_fiturtambahan,y_train_fiturtambahan)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                     weights='uniform')

In [42]:
filename = "../Model/KNN_fiturtambahan.pkl"
pickle.dump(knn_fiturtambahan, open(filename, 'wb'))

- **Multinomial Nive Bayes**

In [43]:
mnb_fiturtambahan = mnb.fit(X_train_fiturtambahan,y_train_fiturtambahan)
mnb_fiturtambahan.fit(X_train_fiturtambahan,y_train_fiturtambahan)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
filename = "../Model/MNB_fiturtambahan.pkl"
pickle.dump(mnb_fiturtambahan, open(filename, 'wb'))

- **Random Forest**

In [45]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [46]:
grid_rf = GridSearchCV(
    estimator = rf,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_rf.fit(X_train_fiturtambahan,y_train_fiturtambahan)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.8s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=1,
                                              oob_score=False,
                                              random_s

In [47]:
print("Best Params : ",grid_rf.best_params_)

Best Params :  {'n_estimators': 400}


In [48]:
rf_fiturtambahan = grid_rf.best_estimator_
rf_fiturtambahan.fit(X_train_fiturtambahan,y_train_fiturtambahan)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [49]:
filename = "../Model/RF_fiturtambahan.pkl"
pickle.dump(rf_fiturtambahan, open(filename, 'wb'))

- **Gradient Boosting**

In [50]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [55]:
grid_GB = GridSearchCV(
    estimator = gb,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_GB.fit(X_train_fiturtambahan,y_train_fiturtambahan)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [56]:
print("Best Params : ",grid_GB.best_params_)

Best Params :  {'n_estimators': 400}


In [57]:
GB_fiturtambahan = grid_GB.best_estimator_
GB_fiturtambahan.fit(X_train_fiturtambahan,y_train_fiturtambahan)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=400,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [58]:
filename = "../Model/GB_fiturtambahan.pkl"
pickle.dump(GB_fiturtambahan, open(filename, 'wb'))

- **SVM**

grid_params = {
    'kernel' : ["linear","poly","sigmoid","rbf"],
    'C' : [0.1, 1, 10],
}

grid_svm = GridSearchCV(
    estimator = svm,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_svm.fit(X_train_fiturtambahan,y_train_fiturtambahan)

# TF-IDF

- **KNN**

In [59]:
grid_params = {
    'n_neighbors' : [i for i in range(1,42) if i % 2 == 1]
}

In [63]:
grid_knn = GridSearchCV(
    estimator = knn,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_knn.fit(X_train_tfidf,y_train_tfidf)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    4.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29, 31, 33, 35, 37, 39,
                                         41]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [64]:
print("Best Params : ",grid_knn.best_params_)

Best Params :  {'n_neighbors': 3}


In [65]:
knn_tfidf = grid_knn.best_estimator_
knn_tfidf.fit(X_train_tfidf,y_train_tfidf)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [66]:
filename = "../Model/KNN_tfidf.pkl"
pickle.dump(knn_tfidf, open(filename, 'wb'))

- **Multinomial Naive Bayes**

In [67]:
mnb_tfidf = mnb.fit(X_train_tfidf,y_train_tfidf)
mnb_tfidf.fit(X_train_tfidf,y_train_tfidf)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [68]:
filename = "../Model/MNB_tfidf.pkl"
pickle.dump(mnb_tfidf, open(filename, 'wb'))

- **Random Forest**

In [69]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [70]:
grid_rf = GridSearchCV(
    estimator = rf,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_rf.fit(X_train_tfidf,y_train_tfidf)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.3s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=1,
                                              oob_score=False,
                                              random_s

In [71]:
print("Best Params : ",grid_rf.best_params_)

Best Params :  {'n_estimators': 1000}


In [72]:
rf_tfidf = grid_rf.best_estimator_
rf_tfidf.fit(X_train_tfidf,y_train_tfidf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [73]:
filename = "../Model/RF_tfidf.pkl"
pickle.dump(rf_tfidf, open(filename, 'wb'))

- **Gradient Boosting**

In [74]:
grid_params = {
    'n_estimators' : [i for i in range(100,1100) if i % 100 == 0]
}

In [75]:
grid_GB = GridSearchCV(
    estimator = gb,
    param_grid = grid_params,
    cv = 3,
    n_jobs = -1,
    verbose = 1
)

grid_GB.fit(X_train_tfidf,y_train_tfidf)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
         

In [76]:
print("Best Params : ",grid_GB.best_params_)

Best Params :  {'n_estimators': 100}


In [77]:
GB_tfidf = grid_GB.best_estimator_
GB_tfidf.fit(X_train_tfidf,y_train_tfidf)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [78]:
filename = "../Model/GB_tfidf.pkl"
pickle.dump(GB_tfidf, open(filename, 'wb'))