<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [32]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import sklearn.cluster as cluster

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

import pickle as pkl
from scipy import sparse
from numpy import asarray
from numpy import savetxt

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score

#Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, classification_report
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

## 3. Model building and evaluation

### Truncated SVD 

#### original sample

In [10]:
result_svd_vec_types  = pd.read_csv("data/output_csv/result_svd_vec_types.csv")
result_svd_vec_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [11]:
result_svd_vec_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,90,91,92,93,94,95,96,97,98,99
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,0.343928,0.360159,0.35868,0.351273,0.402498,0.357498,0.376758,0.379303,0.367843,0.375516
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0.351512,0.375683,0.352008,0.356475,0.332874,0.386907,0.356917,0.354533,0.377601,0.337164
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0.332374,0.354798,0.362468,0.352909,0.367732,0.3426,0.336408,0.357758,0.344202,0.391195
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0.37072,0.335693,0.393478,0.349815,0.373543,0.380157,0.38104,0.335247,0.360196,0.377249
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0.337362,0.363822,0.328088,0.33672,0.373329,0.376424,0.356934,0.367272,0.333998,0.381967


In [12]:
result_svd_vec_types.shape

(8675, 119)

In [13]:
X = result_svd_vec_types.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = result_svd_vec_types["type"].values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 102) (6940,) (1735, 102) (1735,)


In [15]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity'  : [specificity]
                            })   
    return df_model

In [30]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [0]:
raise SystemExit("Here it comes a very consumming memory process. You should better not start it till everything else has itereated propperly")

SystemExit: his is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step

In [66]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_svd.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.555331,0.595202,0.557781,0.554236,0.850117,0.858449,0.96728
0,logit,0.212248,0.087325,0.20951,0.105265,0.532143,0.535477,0.937515
0,knn,0.143804,0.137235,0.143516,0.134732,0.520831,0.51707,0.938348
0,decisiontree,0.421182,0.417194,0.425504,0.418183,0.67451,0.634793,0.959201
0,randomforest,0.61585,0.636571,0.612824,0.583978,0.896918,0.864073,0.970577
0,xgboost,0.627666,0.629024,0.629539,0.626362,0.896847,0.870529,0.974389
0,MLPC,0.557637,0.53984,0.5683,0.510389,0.902499,0.861966,0.97182


#### resampled

In [154]:
result_svd_vec_types  = pd.read_csv("data/output_csv/result_svd_vec_types.csv")
result_svd_vec_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [155]:
result_svd_vec_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,90,91,92,93,94,95,96,97,98,99
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,0.343928,0.360159,0.35868,0.351273,0.402498,0.357498,0.376758,0.379303,0.367843,0.375516
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0.351512,0.375683,0.352008,0.356475,0.332874,0.386907,0.356917,0.354533,0.377601,0.337164
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0.332374,0.354798,0.362468,0.352909,0.367732,0.3426,0.336408,0.357758,0.344202,0.391195
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0.37072,0.335693,0.393478,0.349815,0.373543,0.380157,0.38104,0.335247,0.360196,0.377249
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0.337362,0.363822,0.328088,0.33672,0.373329,0.376424,0.356934,0.367272,0.333998,0.381967


In [156]:
result_svd_vec_types.shape

(8675, 119)

In [230]:
def sampling_k_elements(group, k=39):
    if len(group) < k:
        return group
    return group.sample(k)

balanced_svd = result_svd_vec_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [231]:
X = balanced_svd.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = balanced_svd["type"].values

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(499, 102) (499,) (125, 102) (125,)


In [233]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity': [specificity]
                            })   
    return df_model

In [234]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

In [235]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_svd_resampled.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.394889,0.474383,0.420869,0.414898,0.794022,0.805177,0.961004
0,logit,0.166444,0.187662,0.158465,0.1068,0.622457,0.619038,0.950622
0,knn,0.060081,0.056842,0.078162,0.040041,0.495538,0.501879,0.938124
0,decisiontree,0.242465,0.254147,0.248485,0.223025,0.579716,0.592617,0.956176
0,randomforest,0.456949,0.498677,0.477071,0.473628,0.824057,0.819632,0.970218
0,xgboost,0.356707,0.373399,0.362667,0.357443,0.790459,0.779409,0.961701
0,NN,0.086081,0.076485,0.094162,0.074812,0.587777,0.571033,0.9402


### UMAP

#### original sample

In [244]:
result_umap_types  = pd.read_csv("data/output_csv/result_umap_types.csv")
result_umap_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [245]:
result_umap_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,infj,infp,intj,intp,isfj,isfp,istj,istp,0,1
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,3.910143,7.477874
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,3.93804,5.939636
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,3.740153,5.486389
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,5.415134,7.452929
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2.083198,7.512875


In [246]:
result_umap_types.shape

(8675, 21)

In [247]:
X = result_umap_types.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = result_umap_types["type"].values

In [248]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 4) (6940,) (1735, 4) (1735,)


In [249]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity': [specificity]
                            })   
    return df_model

In [250]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

In [251]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_umap.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.244669,0.147495,0.246542,0.170315,0.623117,0.617077,0.942453
0,logit,0.240058,0.13641,0.24366,0.162189,0.608578,0.585442,0.941679
0,knn,0.167579,0.161514,0.166138,0.154306,0.534462,0.524895,0.939635
0,decisiontree,0.162392,0.161085,0.160086,0.157405,0.523344,0.516075,0.940613
0,randomforest,0.207205,0.18449,0.216859,0.19035,0.595132,0.586595,0.942602
0,xgboost,0.228818,0.197117,0.228674,0.20021,0.6242,0.616643,0.942652
0,NN,0.222478,0.160691,0.230403,0.151829,0.618633,0.605634,0.941472


#### resampled

In [252]:
result_umap_types  = pd.read_csv("data/output_csv/result_umap_types.csv")
result_umap_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [253]:
result_umap_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,infj,infp,intj,intp,isfj,isfp,istj,istp,0,1
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,3.910143,7.477874
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,3.93804,5.939636
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,3.740153,5.486389
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,5.415134,7.452929
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2.083198,7.512875


In [254]:
result_umap_types.shape

(8675, 21)

In [255]:
def sampling_k_elements(group, k=39):
    if len(group) < k:
        return group
    return group.sample(k)

balanced_umap = result_umap_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [256]:
X = balanced_umap.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = balanced_umap["type"].values

In [257]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(499, 4) (499,) (125, 4) (125,)


In [258]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity': [specificity]
                            })   
    return df_model

In [259]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

In [260]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_umap_resampled.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.118182,0.091955,0.116323,0.100632,0.570687,0.580601,0.93964
0,logit,0.136263,0.060058,0.120242,0.084161,0.557188,0.558777,0.93996
0,knn,0.058101,0.063381,0.064061,0.057647,0.493351,0.500408,0.938729
0,decisiontree,0.094121,0.101026,0.104202,0.085648,0.51884,0.516969,0.938369
0,randomforest,0.102263,0.109782,0.108162,0.105846,0.567952,0.564668,0.939001
0,xgboost,0.094121,0.097848,0.108182,0.094431,0.551444,0.534935,0.937814
0,NN,0.100202,0.076031,0.104182,0.058876,0.535706,0.53553,0.939457


### UMAP on TSVD

#### original sample

In [261]:
result_umap_svd_types  = pd.read_csv("data/output_csv/result_umap_svd_types.csv")
result_umap_svd_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [262]:
result_umap_svd_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,infj,infp,intj,intp,isfj,isfp,istj,istp,0,1
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,12.761392,3.539108
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,9.733282,0.578183
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,12.002575,4.193234
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,9.521256,1.823716
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,11.170195,0.726455


In [263]:
result_umap_svd_types.shape

(8675, 21)

In [264]:
X = result_umap_svd_types.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = result_umap_svd_types["type"].values

In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 4) (6940,) (1735, 4) (1735,)


In [266]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity': [specificity]
                            })   
    return df_model

In [267]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

In [268]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_umap_svd.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.341787,0.267891,0.339625,0.286579,0.700431,0.689555,0.950001
0,logit,0.296542,0.213108,0.296398,0.22806,0.655654,0.639411,0.946228
0,knn,0.235159,0.233746,0.241643,0.22796,0.604789,0.589243,0.945984
0,decisiontree,0.321902,0.31853,0.309078,0.315941,0.607882,0.596917,0.952735
0,randomforest,0.423775,0.422431,0.420893,0.412365,0.749102,0.72591,0.957512
0,xgboost,0.414265,0.42443,0.410519,0.40472,0.753866,0.732565,0.957784
0,NN,0.283573,0.230036,0.291066,0.235785,0.682388,0.676132,0.950176


#### resampled

In [269]:
result_umap_svd_types  = pd.read_csv("data/output_csv/result_umap_svd_types.csv")
result_umap_svd_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [270]:
result_umap_svd_types.head()

Unnamed: 0,type,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,...,infj,infp,intj,intp,isfj,isfp,istj,istp,0,1
0,infj,11.12,135.29,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,12.761392,3.539108
1,entp,23.4,187.4756,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,9.733282,0.578183
2,intp,16.72,180.69,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,12.002575,4.193234
3,intj,21.28,181.8324,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,9.521256,1.823716
4,entj,19.34,196.4576,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,11.170195,0.726455


In [271]:
result_umap_svd_types.shape

(8675, 21)

In [272]:
def sampling_k_elements(group, k=39):
    if len(group) < k:
        return group
    return group.sample(k)

balanced_umap_svd = result_umap_svd_types.groupby("type").apply(sampling_k_elements).reset_index(drop=True)

In [273]:
X = balanced_umap_svd.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = balanced_umap_svd["type"].values

In [274]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(499, 4) (499,) (125, 4) (125,)


In [275]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    rocauc_ovr   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovr_weighted'))
    rocauc_ovo   = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc_ovo_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc_ovr'   : [rocauc_ovr],
                             'rocauc_ovo'   : [rocauc_ovo],
                             'specificity': [specificity]
                            })   
    return df_model

In [276]:
models = {'gnb': GaussianNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'xgboost': GradientBoostingClassifier(),
          'MLPC': MLPClassifier()
         }

In [277]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df.to_csv("data/output_csv/models_umap_svd_resampled.csv")
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,rocauc_ovr,rocauc_ovo,specificity
0,gnb,0.154404,0.118007,0.146303,0.133287,0.670548,0.668227,0.944934
0,logit,0.122323,0.102271,0.142242,0.0957,0.639281,0.64063,0.943013
0,knn,0.066162,0.062883,0.05602,0.053131,0.505077,0.493911,0.940776
0,decisiontree,0.232545,0.280748,0.266465,0.233748,0.593971,0.589949,0.949278
0,randomforest,0.300707,0.312663,0.282626,0.283425,0.704815,0.70609,0.95135
0,xgboost,0.244545,0.255359,0.250505,0.240886,0.664545,0.674436,0.953061
0,NN,0.102182,0.114488,0.110141,0.097363,0.602849,0.620728,0.942461
