In [18]:
# Install packages
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
import pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [19]:
# load data
df = pd.read_csv('mbti_preprocessed_features.csv', index_col= 0)

In [16]:
df.head()

Unnamed: 0,type,posts,encodedType,preprocessed_posts,extro_intro,intu_obs,feel_think,prosp_judg,avg_word_count,avg_exclam_count,avg_fullstop_count,avg_emoji_count,avg_count_of_hello,avg_count_of_hi,avg_count_of_extroverted_bigrams,avg_count_of_extroverted_stylistic_impressions,avg_count_of_interoverted_quantifiers,avg_count_of_introverted_first_person_singular_pronoun,avg_count_of_introverted_negations
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,8,"['youtube', 'tumblr', 'enfp', 'intj', 'moment'...",0,1,1,0,12.1,0.0,0.16,0.08,0.0,0.0,0.14,0.0,0.06,3.54,0.02
1,ENTP,'I'm finding the lack of me in these posts ver...,3,"['im', 'finding', 'the', 'lack', 'of', 'post',...",1,1,0,1,24.38,0.0,0.04,0.08,0.0,0.0,0.28,0.0,0.14,5.58,0.1
2,INTP,'Good one _____ https://www.youtube.com/wat...,11,"['good', 'one', 'youtube', 'of', 'course', 'i'...",0,1,0,1,17.7,0.0,0.08,0.0,0.0,0.0,0.26,0.0,0.0,5.18,0.04
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",10,"['dear', 'intp', 'i', 'enjoyed', 'conversation...",0,1,0,0,22.26,0.0,0.14,0.02,0.0,0.0,0.2,0.0,0.1,6.12,0.1
4,ENTJ,'You're fired.|||That's another silly misconce...,2,"['youre', 'fired', 'thats', 'another', 'silly'...",1,1,0,0,20.32,0.0,0.08,0.06,0.0,0.0,0.14,0.0,0.0,6.1,0.1


In [20]:
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [21]:
features = ['avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations']

In [22]:
# Multiclass + Feautres - Unbalanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 2].values #EncodedType

#tf-idf
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.44      0.07      0.12        55
        ENFP       0.68      0.42      0.52       212
        ENTJ       0.79      0.15      0.25        74
        ENTP       0.55      0.61      0.57       196
        ESFJ       0.00      0.00      0.00        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.00      0.00      0.00        28
        INFJ       0.51      0.58      0.54       436
        INFP       0.45      0.79      0.57       545
        INTJ       0.65      0.39      0.49       365
        INTP       0.49      0.60      0.54       378
        ISFJ       0.82      0.16      0.26        58
        ISFP       0.32      0.11      0.16        73
        ISTJ       1.00      0.07      0.12        61
        ISTP       0.56      0.36      0.44        86

    accuracy                           0.51      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Multiclass + Feautres - Balanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 2].values #EncodedType

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# balance data
sampler = RandomOverSampler()
X_train_balanced, Y_train_balanced = sampler.fit_resample(X_train, Y_train)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train_balanced, Y_train_balanced)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.51      0.45      0.48        55
        ENFP       0.57      0.48      0.52       212
        ENTJ       0.60      0.41      0.48        74
        ENTP       0.53      0.68      0.60       196
        ESFJ       0.50      0.36      0.42        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       1.00      0.18      0.31        11
        ESTP       0.56      0.18      0.27        28
        INFJ       0.50      0.54      0.52       436
        INFP       0.51      0.66      0.57       545
        INTJ       0.65      0.41      0.50       365
        INTP       0.52      0.57      0.54       378
        ISFJ       0.52      0.40      0.45        58
        ISFP       0.39      0.34      0.36        73
        ISTJ       0.48      0.36      0.41        61
        ISTP       0.46      0.51      0.48        86

    accuracy                           0.53      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
# Hyperparameter Tuning
# CV Grid Search
random_grid = {
    'max_depth': [ 70,  80, 90],
    'max_features': ['log2', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 4],
    'n_estimators': [1400, 1500, 1600],
    'criterion' :['gini', 'entropy'],
    'random_state': [42]
 }

In [14]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [15]:
rf_random.fit(X_train_balanced, Y_train_balanced)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [16]:
print(rf_random.best_params_)

{'n_estimators': 1500,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 70}

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.60      0.11      0.18        55
        ENFP       0.73      0.36      0.48       212
        ENTJ       0.59      0.22      0.32        74
        ENTP       0.54      0.65      0.59       196
        ESFJ       0.00      0.00      0.00        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.00      0.00      0.00        28
        INFJ       0.55      0.56      0.55       436
        INFP       0.42      0.82      0.55       545
        INTJ       0.73      0.35      0.47       365
        INTP       0.54      0.61      0.58       378
        ISFJ       0.74      0.24      0.36        58
        ISFP       0.38      0.19      0.25        73
        ISTJ       0.82      0.15      0.25        61
        ISTP       0.61      0.41      0.49        86

    accuracy                           0.52      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.57      0.44      0.49        55
        ENFP       0.63      0.49      0.55       212
        ENTJ       0.62      0.43      0.51        74
        ENTP       0.51      0.68      0.58       196
        ESFJ       0.40      0.36      0.38        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       1.00      0.18      0.31        11
        ESTP       0.67      0.21      0.32        28
        INFJ       0.57      0.52      0.54       436
        INFP       0.49      0.72      0.59       545
        INTJ       0.72      0.41      0.52       365
        INTP       0.58      0.61      0.59       378
        ISFJ       0.53      0.47      0.50        58
        ISFP       0.42      0.41      0.42        73
        ISTJ       0.51      0.38      0.43        61
        ISTP       0.48      0.56      0.51        86

    accuracy                           0.55      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
rf_cv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=random_grid, cv= 3, n_jobs = -1)
rf_cv.fit(X_train_balanced, Y_train_balanced)

In [23]:
print(rf_cv.best_params_)

{'max_depth': 80,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 1600,
 'random_state': 42}

In [None]:
# Best Parameters - max_features - log2
rf_t_1 = RandomForestClassifier(n_estimators = 1600, min_samples_split = 4, min_samples_leaf = 1, max_features = 'log2', max_depth = 80, random_state = 42, criterion = 'gini')
rf_t_1 = rf_t_1.fit(X_train_balanced, Y_train_balanced)

# Prediction & Evaluation
prediction = rf_t_1.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

In [None]:
# Best Parameters - max_features - sqrt
rf_t_2 = RandomForestClassifier(n_estimators = 1600, min_samples_split = 4, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 80, random_state = 42, criterion = 'gini')
rf_t_2 = rf_t_2.fit(X_train_balanced, Y_train_balanced)

# Prediction & Evaluation
prediction = rf_t_2.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))