<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

### Imports

In [2]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Ignore noise warning
import warnings
warnings.filterwarnings('ignore')

# Work with pickles
import pickle

#Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

  import pandas.util.testing as tm


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## 5. Hyperparameter Tuning of the Models (Types)

Althought the metrics of the different models are really good, we can still improve the performance of the models. Therefore, a fine tunning of the different parameters of each models has to be done.

In [0]:
result_svd_vec_types  = pd.read_csv("/content/drive/My Drive/Programación/Ironhack/project-final/your-project/data/output_csv/result_svd_vec_types.csv")
result_svd_vec_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [6]:
X = result_svd_vec_types.drop(["type","enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1).values
y = result_svd_vec_types["type"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 102) (6940,) (1735, 102) (1735,)


<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000">

In [0]:
raise SystemExit("Stop right there! The following cells takes some time to complete.")

SystemExit: Stop right there! The following cells takes some time to complete.

### RandomForest Tuning

As there's quite a few parameters I've been tuning them one by one and now I will compare the performance of each parameter "best" vs default config altogether with the other parameters.

##### RandomSearchCV

##### GridSearchCV

In [0]:
random_forest = RandomForestClassifier(random_state = 42)

param_grid =  {'class_weight': ['balanced'],
               'criterion': ['gini','entropy'],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 
               'max_features': ['sqrt'],
               'n_estimators' : [185, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 
               'min_samples_leaf': [5],
               'min_samples_split': [12],
               'bootstrap': [False],
               'oob_score': [False] 
            }

grid = GridSearchCV(random_forest, param_grid, cv=3, scoring='f1_weighted', verbose=1, n_jobs=-1)

grid.fit(X_train, y_train)

grid.best_estimator_

print(grid.best_params_)

Fitting 3 folds for each of 264 candidates, totalling 792 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


### GradientBooster Tuning

### Final results

In [0]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision_weighted'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall_weighted'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1_weighted'))
    y_pred = model.predict(X_test)
    mcm = multilabel_confusion_matrix(y_test, y_pred)
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    specificities = tn / (tn+fp)
    specificity = (specificities.sum())/ 16

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'specificity'  : [specificity]
                            })   
    return df_model

In [0]:
models = {'randomforest': RandomForestClassifier(random_state = 42, bootstrap=False, class_weight = 'balanced', criterion = 'gini', max_features = 'sqrt',
                                                 min_samples_leaf = 5, min_samples_split = 12, max_depth = None, n_estimators = 185, oob_score = False),
          'xgboost': GradientBoostingClassifier(random_state = 42, loss = 'deviance', max_depth = 3, n_estimators = 99, max_features = 'sqrt', learning_rate = 0.075, criterion = "friedman_mse", subsample = 0.9,
                                                min_samples_leaf = 6, min_samples_split = 15)
         }

In [38]:
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()]) 
models_df

Unnamed: 0,model,accuracy,precision,recall,f1score,specificity
0,randomforest,0.637608,0.642026,0.633718,0.632585,0.974213
0,xgboost,0.649424,0.646581,0.646974,0.635541,0.974009


In [0]:
models_df.to_csv("/content/drive/My Drive/Programación/Ironhack/project-final/your-project/data/output_csv/models_tuned_types.csv")