<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Final project: NLP to predict Myers-Briggs Personality Type

## Imports

In [2]:
# Data Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Text Processing
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from collections import Counter

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import sklearn.cluster as cluster

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

import pickle as pkl
from scipy import sparse
from numpy import asarray
from numpy import savetxt

# Fix imbalance
from imblearn.under_sampling import InstanceHardnessThreshold

# Model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier

Using TensorFlow backend.


## 3. Model building and evaluation

### Try directly with tfidf without embedding 

#### Using types

In [184]:
train_array_types = sparse.load_npz("data/output_sparse/train_array_types.npz")
list_personality = pd.read_csv("data/output_csv/personality.csv")

In [185]:
list_personality.loc[-1] = list_personality.columns.astype("float64")
list_personality.index = list_personality.index + 1  # shifting index
list_personality = list_personality.sort_index()  # sorting by index
list_personality.columns = ["types"]

In [186]:
print ((train_array_types.shape),(list_personality.shape))

(8675, 88024) (8675, 1)


In [187]:
train_df = pd.DataFrame.sparse.from_spmatrix(train_array_types)

In [188]:
train_df.shape

(8675, 88024)

In [189]:
train_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88014,88015,88016,88017,88018,88019,88020,88021,88022,88023
0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [190]:
train_df.drop([0], axis=1, inplace=True)

In [191]:
train_df.shape

(8675, 88023)

In [192]:
list_personality.types.value_counts()

9.0     1832
8.0     1470
11.0    1304
10.0    1091
3.0      685
1.0      675
15.0     337
13.0     271
2.0      231
14.0     205
0.0      190
12.0     166
7.0       89
5.0       48
4.0       42
6.0       39
Name: types, dtype: int64

In [193]:
# Split dataset
X = train_df
y = list_personality["types"]

X = X
y = y

In [194]:
X = X.iloc[:100]
y = y.iloc[:100]

print(X.shape)
print(y.shape)

(100, 88023)
(100,)


In [195]:
print(X.shape)
print(y.shape)

(100, 88023)
(100,)


In [94]:
y = y.astype("int64")

0      8
1      3
2     11
3     10
4      2
      ..
95     9
96     8
97    10
98     8
99     1
Name: types, Length: 100, dtype: int64

##### Undersampling of the dataset

Due to big differences in the number examples among personality types I will resample the data to fix imbalance.

In [95]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

IndexError: arrays used as indices must be of integer (or boolean) type

In [0]:
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, random_state=42, test_size=0.2)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(80, 88023) (80,) (20, 88023) (20,)


In [97]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [98]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [99]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

#### Dimensions

In [196]:
train_array_dimensions = sparse.load_npz("data/output_sparse/train_array_dimensions.npz")
dimensions = pd.read_csv("data/output_csv/dimensions.csv")

In [197]:
dimensions.head()

Unnamed: 0,0.000000000000000000e+00,0.000000000000000000e+00.1,1.000000000000000000e+00,0.000000000000000000e+00.2
0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [199]:
dimensions.loc[-1] = dimensions.columns.astype("float")
dimensions.index = dimensions.index + 1  # shifting index
dimensions = dimensions.sort_index()  # sorting by index
dimensions.columns = ["i-e", "n-s", "t-f", "j-p"]
dimensions.head()

TypeError: Cannot cast Index to dtype float

In [183]:
dimensions = dimensions.astype("float64")

ValueError: could not convert string to float: '0.000000000000000000e+00.1'

In [4]:
train_array_dimensions = sparse.load_npz("data/output_sparse/train_array_dimensions.npz")

In [37]:
print ((train_array_types.shape),(list_personality.shape))

(8675, 88024) (8675, 1)


In [41]:
train_df = pd.DataFrame.sparse.from_spmatrix(train_array_types)

In [42]:
train_df.shape

(8675, 88024)

In [44]:
train_df.drop([0], axis=1, inplace=True)

In [45]:
train_df.shape

(8675, 88023)

In [91]:
# Split dataset
X = train_df
y = list_personality["types"]

X = X
y = y

In [92]:
X = X.iloc[:100]
y = y.iloc[:100]

print(X.shape)
print(y.shape)

In [93]:
print(X.shape)
print(y.shape)

(100, 88023)
(100,)


In [180]:
y = y.astype("int64")

##### Undersampling of the dataset

Due to big differences in the number examples among personality types I will resample the data to fix imbalance.

In [95]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

IndexError: arrays used as indices must be of integer (or boolean) type

In [0]:
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample, random_state=42, test_size=0.2)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(80, 88023) (80,) (20, 88023) (20,)


In [97]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [98]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [99]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Try with embedding 

#### Truncated SVD 

##### types

In [139]:
result_svd_vec_types  = pd.read_csv("data/output_csv/result_svd_vec_types.csv")
result_svd_vec_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [140]:
result_svd_vec_types.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,estp,...,90,91,92,93,94,95,96,97,98,99
0,11.12,135.29,0,0,0,0,0,0,0,0,...,0.343928,0.360159,0.35868,0.351273,0.402498,0.357498,0.376758,0.379303,0.367843,0.375516
1,23.4,187.4756,0,0,0,1,0,0,0,0,...,0.351512,0.375683,0.352008,0.356475,0.332874,0.386907,0.356917,0.354533,0.377601,0.337164
2,16.72,180.69,0,0,0,0,0,0,0,0,...,0.332374,0.354798,0.362468,0.352909,0.367732,0.3426,0.336408,0.357758,0.344202,0.391195
3,21.28,181.8324,0,0,0,0,0,0,0,0,...,0.37072,0.335693,0.393478,0.349815,0.373543,0.380157,0.38104,0.335247,0.360196,0.377249
4,19.34,196.4576,0,0,1,0,0,0,0,0,...,0.337362,0.363822,0.328088,0.33672,0.373329,0.376424,0.356934,0.367272,0.333998,0.381967


In [141]:
result_svd_vec_types.shape

(8675, 118)

In [119]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = result_svd_vec_types.drop(["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1)
y = result_svd_vec_types[["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                          "intp", "isfj", "isfp", "istj", "istp"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

AttributeError: 'DataFrame' object has no attribute 'argmax'

In [142]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 4) (6940, 16) (1735, 4) (1735, 16)


In [143]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [144]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [145]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: bad input shape (6940, 16)

##### dimensions

In [146]:
result_svd_vec_dimensions = pd.read_csv("data/output_csv/result_svd_vec_dimensions.csv")
result_svd_vec_dimensions.drop(["Unnamed: 0"], axis=1, inplace=True)

In [147]:
result_svd_vec_dimensions.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,i-e,n-s,t-f,j-p,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,11.12,135.29,0,0,1,0,0.575864,0.3419,0.330206,0.33337,...,0.343928,0.360159,0.35868,0.351273,0.402498,0.357498,0.376758,0.379303,0.367843,0.375516
1,23.4,187.4756,1,0,0,1,0.668875,0.39152,0.354378,0.349749,...,0.351512,0.375683,0.352008,0.356475,0.332874,0.386907,0.356917,0.354533,0.377601,0.337164
2,16.72,180.69,0,0,0,1,0.632933,0.323963,0.318553,0.318146,...,0.332374,0.354798,0.362468,0.352909,0.367732,0.3426,0.336408,0.357758,0.344202,0.391195
3,21.28,181.8324,0,0,0,0,0.67878,0.478042,0.341578,0.293118,...,0.37072,0.335693,0.393478,0.349815,0.373543,0.380157,0.38104,0.335247,0.360196,0.377249
4,19.34,196.4576,1,0,0,0,0.632917,0.335554,0.338666,0.279699,...,0.337362,0.363822,0.328088,0.33672,0.373329,0.376424,0.356934,0.367272,0.333998,0.381967


In [148]:
result_svd_vec_dimensions.shape

(8675, 106)

In [149]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = result_svd_vec_dimensions.drop(["i-e", "n-s", "t-f", "j-p"], axis=1)
y = result_svd_vec_dimensions[["i-e", "n-s", "t-f", "j-p"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

ValueError: Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported.

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 102) (6940, 4) (1735, 102) (1735, 4)


In [151]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [152]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [153]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: bad input shape (6940, 4)

#### UMAP

##### types

In [154]:
result_umap_types  = pd.read_csv("data/output_csv/result_umap_types.csv")
result_umap_types.drop(["Unnamed: 0"], axis=1, inplace=True)

In [155]:
result_umap_types.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,enfj,enfp,entj,entp,esfj,esfp,estj,estp,infj,infp,intj,intp,isfj,isfp,istj,istp,0,1
0,11.12,135.29,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3.910143,7.477874
1,23.4,187.4756,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3.93804,5.939636
2,16.72,180.69,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3.740153,5.486389
3,21.28,181.8324,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,5.415134,7.452929
4,19.34,196.4576,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2.083198,7.512875


In [156]:
result_umap_types.shape

(8675, 20)

In [157]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = result_umap_types.drop(["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                               "intp", "isfj", "isfp", "istj", "istp"], axis=1)
y = result_umap_types[["enfj", "enfp", "entj", "entp", "esfj", "esfp", "estj", "estp","infj", "infp", "intj",
                          "intp", "isfj", "isfp", "istj", "istp"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

AttributeError: 'DataFrame' object has no attribute 'argmax'

In [158]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 4) (6940, 16) (1735, 4) (1735, 16)


In [159]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [160]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [161]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: bad input shape (6940, 16)

##### dimensions

In [162]:
result_umap_dimensions = pd.read_csv("data/output_csv/result_umap_dimensions.csv")
result_umap_dimensions.drop(["Unnamed: 0"], axis=1, inplace=True)

In [163]:
result_umap_dimensions.head()

Unnamed: 0,words_per_comment,variance_of_word_counts,i-e,n-s,t-f,j-p,0,1
0,11.12,135.29,0,0,1,0,3.910143,7.477874
1,23.4,187.4756,1,0,0,1,3.93804,5.939636
2,16.72,180.69,0,0,0,1,3.740153,5.486389
3,21.28,181.8324,0,0,0,0,5.415134,7.452929
4,19.34,196.4576,1,0,0,0,2.083198,7.512875


In [164]:
result_umap_dimensions.shape

(8675, 8)

In [165]:
Ramdom_sample = InstanceHardnessThreshold(random_state =42)

X = result_umap_dimensions.drop(["i-e", "n-s", "t-f", "j-p"], axis=1)
y = result_umap_dimensions[["i-e", "n-s", "t-f", "j-p"]]

X_undersample, y_undersample = Ramdom_sample.fit_resample(X, y)

ValueError: Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclasss targets. Multilabel and multioutput targets are not supported.

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 4) (6940, 4) (1735, 4) (1735, 4)


In [167]:
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True)
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='accuracy'))
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=strat_k_fold, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
    specificity = tn / (tn+fp)

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'specificity': [specificity]
                            })   
    return df_model

In [168]:
models = {'gnb': GaussianNB(),
          'bnb': BernoulliNB(),
          'mnb': MultinomialNB(),
          'logit': LogisticRegression(),
          'knn': KNeighborsClassifier(),
          'decisiontree': DecisionTreeClassifier(),
          'randomforest': RandomForestClassifier(),
          'svc': SVC(probability=True),
          'linearsvc': LinearSVC(),
          'xgboost': GradientBoostingClassifier(),
          'NN': MLPClassifier()
         }

In [169]:
# Evaluation of models
models_df = pd.concat([baseline_report(model, X_train, X_test, y_train, y_test, name) for (name, model) in models.items()])
models_df

ValueError: bad input shape (6940, 4)

<img src="https://www.nicepng.com/png/detail/148-1486992_discover-the-most-powerful-ways-to-automate-your.png" width="1000"> 

In [0]:
raise SystemExit("Here it comes a very consumming memory process. You should better not start it till everything else has itereated propperly")

SystemExit: his is a very consumming memory process, with average wall time: ~ 20 min. If you don't want to wait please go to the next step