<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [0]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import sklearn.cluster as cluster

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

# Export data
import pickle
from scipy import sparse

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier

pd.set_option("display.max_column", None)

## 3. Model building and evaluation

In [37]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
mbti_numeric_clean = pd.read_csv("/content/drive/My Drive/Programación/Ironhack/project-final/your-project/data/output_csv/result.csv")
mbti_numeric_clean.drop(["Unnamed: 0"], axis=1, inplace=True)

In [62]:
mbti_numeric_dimensions = mbti_numeric_clean.copy()
mbti_numeric_dimensions.drop(["ENFJ", "ENFP", "ENTJ","ENTP", "ESFJ", "ESFP", "ESTJ", "ESTP", "INFJ", "INFP", "INTJ", "INTP","ISFJ", "ISFP", "ISTJ", "ISTP"], axis=1, inplace=True)
mbti_numeric_dimensions.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,I-E,N-S,T-F,J-P,0,1
0,11.12,135.29,0,0,1,0,3.910143,7.477874
1,23.4,187.4756,1,0,0,1,3.93804,5.939636
2,16.72,180.69,0,0,0,1,3.740153,5.486389
3,21.28,181.8324,0,0,0,0,5.415134,7.452929
4,19.34,196.4576,1,0,0,0,2.083198,7.512875


In [63]:
mbti_numeric_types = mbti_numeric_clean.copy()
mbti_numeric_types.drop(["I-E", "N-S", "T-F", "J-P"], axis=1, inplace=True)
mbti_numeric_types.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP,0,1
0,11.12,135.29,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3.910143,7.477874
1,23.4,187.4756,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3.93804,5.939636
2,16.72,180.69,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3.740153,5.486389
3,21.28,181.8324,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5.415134,7.452929
4,19.34,196.4576,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2.083198,7.512875


In [64]:
for column in mbti_numeric_clean.columns[2:21]:
    x = mbti_numeric_clean[column].value_counts()
    print("Column name is:",column,"and it value is:",x)
    print()

Column name is: ENFJ and it value is: 0    8485
1     190
Name: ENFJ, dtype: int64

Column name is: ENFP and it value is: 0    8000
1     675
Name: ENFP, dtype: int64

Column name is: ENTJ and it value is: 0    8444
1     231
Name: ENTJ, dtype: int64

Column name is: ENTP and it value is: 0    7990
1     685
Name: ENTP, dtype: int64

Column name is: ESFJ and it value is: 0    8633
1      42
Name: ESFJ, dtype: int64

Column name is: ESFP and it value is: 0    8627
1      48
Name: ESFP, dtype: int64

Column name is: ESTJ and it value is: 0    8636
1      39
Name: ESTJ, dtype: int64

Column name is: ESTP and it value is: 0    8586
1      89
Name: ESTP, dtype: int64

Column name is: INFJ and it value is: 0    7205
1    1470
Name: INFJ, dtype: int64

Column name is: INFP and it value is: 0    6843
1    1832
Name: INFP, dtype: int64

Column name is: INTJ and it value is: 0    7584
1    1091
Name: INTJ, dtype: int64

Column name is: INTP and it value is: 0    7371
1    1304
Name: INTP, dtype:

### Try directly with tdfidf without embedding 

In [0]:
tfidf_df = pd.read_csv("/content/drive/My Drive/Programación/Ironhack/project-final/your-project/data/output_csv/tfidf_df.csv")

In [53]:
tfidf_df.shape

(8675, 2)

In [0]:
tfidf = sparse.load_npz("/content/drive/My Drive/Programación/Ironhack/project-final/your-project/data/output_pickles/tfidf.npz")

In [70]:
tfidf.shape

(8675, 88023)

In [50]:
mbti_numeric_dimensions_tfidf = mbti_numeric_clean.copy()
mbti_numeric_dimensions_tfidf.drop(["ENFJ", "ENFP", "ENTJ","ENTP", "ESFJ", "ESFP", "ESTJ", "ESTP", "INFJ", "INFP", "INTJ", "INTP","ISFJ", "ISFP", "ISTJ", "ISTP","0", "1"], axis=1, inplace=True)
mbti_numeric_dimensions_tfidf
mbti_numeric_dimensions_tfidf.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,I-E,N-S,T-F,J-P
0,11.12,135.29,0,0,1,0
1,23.4,187.4756,1,0,0,1
2,16.72,180.69,0,0,0,1
3,21.28,181.8324,0,0,0,0
4,19.34,196.4576,1,0,0,0


In [51]:
mbti_numeric_types_tfidf = mbti_numeric_clean.copy()
mbti_numeric_types_tfidf.drop(["I-E", "N-S", "T-F", "J-P","0", "1"], axis=1, inplace=True)
mbti_numeric_types_tfidf.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,11.12,135.29,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,23.4,187.4756,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,16.72,180.69,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,21.28,181.8324,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,19.34,196.4576,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Try with embedding 

### Undersampling of the dataset

Due to big differences in the number examples among personality types I will resample the data to fix imbalance.

In [0]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = mbti_numeric_clean.drop(["ENFJ",	"ENFP",	"ENTJ", "ENTP",	"ESFJ",	"ESFP",	"ESTJ",	"ESTP",	"INFJ",	"INFP",	"INTJ",	"INTP",	"ISFJ",	"ISFP",	"ISTJ",	"ISTP",	"I-E",	"N-S",	"T-F",	"J-P"], axis = 1)
y = mbti_numeric_clean[["ENFJ",	"ENFP",	"ENTJ", "ENTP",	"ESFJ",	"ESFP",	"ESTJ",	"ESTP",	"INFJ",	"INFP",	"INTJ",	"INTP",	"ISFJ",	"ISFP",	"ISTJ",	"ISTP",	"I-E",	"N-S",	"T-F",	"J-P"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

AttributeError: ignored

In [0]:
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, random_state=42, test_size=0.2)

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [0]:
raise SystemExit("This is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step")

SystemExit: his is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [0]:
raise SystemExit("Here it comes a very consumming memory process. You should better not start it till everything else has itereated propperly")

SystemExit: his is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step