<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [35]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import sklearn.cluster as cluster

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

import pickle
from scipy import sparse
from numpy import asarray
from numpy import savetxt

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier

pd.set_option("display.max_column", None)

## 3. Model building and evaluation

### Try directly with tfidf without embedding 

First I'll try using types

In [54]:
train_array_types = sparse.load_npz("data/output_sparse/train_array_types.npz")

In [55]:
# Split dataset
X = train_array_types.T[16:]
y = train_array_types.T[:16]
X = X.T
y = y.T

In [56]:
print(X.shape)
print(y.shape)

(8675, 88008)
(8675, 16)


In [57]:
wine_data_df = pd.DataFrame(data=np.hstack([X, y.reshape(-1,1)]),
                            index=[ind for ind in range(X.shape[0])],
                            columns=["target"])
wine_data_df.head()

ValueError: Shape of passed values is (2, 1), indices imply (8675, 1)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [40]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [41]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [42]:
raise SystemExit("This is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step")

SystemExit: This is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step

In [47]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: bad input shape ()

I will try now using dimensions

In [4]:
train_array_dimensions = sparse.load_npz("data/output_sparse/train_array_dimensions.npz")

### Try with embedding 

#### Truncated SVD 

#### UMAP

### Undersampling of the dataset

In [0]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = mbti_numeric_clean.drop(["ENFJ",	"ENFP",	"ENTJ", "ENTP",	"ESFJ",	"ESFP",	"ESTJ",	"ESTP",	"INFJ",	"INFP",	"INTJ",	"INTP",	"ISFJ",	"ISFP",	"ISTJ",	"ISTP",	"I-E",	"N-S",	"T-F",	"J-P"], axis = 1)
y = mbti_numeric_clean[["ENFJ",	"ENFP",	"ENTJ", "ENTP",	"ESFJ",	"ESFP",	"ESTJ",	"ESTP",	"INFJ",	"INFP",	"INTJ",	"INTP",	"ISFJ",	"ISFP",	"ISTJ",	"ISTP",	"I-E",	"N-S",	"T-F",	"J-P"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

AttributeError: ignored

#### Try directly with tfidf without embedding 

#### Try with embedding 

##### Truncated SVD 

##### UMAP

Due to big differences in the number examples among personality types I will resample the data to fix imbalance.

In [0]:
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, random_state=42, test_size=0.2)

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [0]:
raise SystemExit("Here it comes a very consumming memory process. You should better not start it till everything else has itereated propperly")

SystemExit: his is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step