In [1]:
# Import packages
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from sklearn.model_selection import GridSearchCV
from sklearn import feature_selection
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the preprocessed MBTI feature table; the CSV's first column becomes the row index.
df = pd.read_csv(
    'mbti_preprocessed_features.csv',
    index_col=0,
)

In [3]:
def create_train_test_split(X, Y, test_size=0.3, random_state=42069, stratify=None):
    """Split features and labels into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split`` that
    pins the notebook's default split so every experiment sees the same rows.

    Args:
        X: Feature table/array, one row per sample.
        Y: Labels aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.3, the
            value previously hard-coded here.
        random_state: Seed for the shuffle; defaults to 42069, the value
            previously hard-coded here.
        stratify: Optional array-like of class labels to preserve class
            proportions in both partitions. ``None`` (default) keeps the
            original unstratified behaviour.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, Y_train, Y_test

In [4]:
# Names of the hand-crafted numeric columns that are appended to the TF-IDF matrix.
feature = [
    'avg_word_count',
    'avg_exclam_count',
    'avg_fullstop_count',
    'avg_count_of_hello',
    'avg_count_of_hi',
    'avg_count_of_extroverted_bigrams',
    'avg_count_of_extroverted_stylistic_impressions',
    'avg_count_of_interoverted_quantifiers',
    'avg_count_of_introverted_first_person_singular_pronoun',
    'avg_count_of_introverted_negations',
]

In [5]:
# Multiclass + Features - Unbalanced
#
# Baseline multiclass model: TF-IDF terms filtered with a per-class
# chi-squared test, concatenated with the numeric columns listed in
# `feature`, then fed to a random forest without class weighting.

# Determine X and Y
X = df[['preprocessed_posts'] + feature]  # Preprocessed_Posts + Features
Y = df.iloc[:, 2].values  # EncodedType (assumed to be the third column -- TODO confirm)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

## TF-IDF
# Float min_df/max_df are document-frequency proportions: keep terms that
# appear in at least 10% and at most 30% of the training documents.
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3, max_features=10000)

## Extract vocabulary (fitted on the training split only)
corpus = X_train["preprocessed_posts"].values.astype(str)
vectorizer.fit(corpus)
X_training = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

# Feature Selection

## Reduce dimensionality of the sparse matrix with a chi-squared test,
## run one-vs-rest for every class; a term is kept if (1 - p) exceeds the
## limit for at least one class.
X_names = vectorizer.get_feature_names_out()
p_value_limit = 0.95
# Collect the per-class frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
class_scores = []
for cat in np.unique(Y_train):
    _, p = feature_selection.chi2(X_training, Y_train == cat)
    class_scores.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p, "y": cat}))
features = pd.concat(class_scores)
features = features[features["score"] > p_value_limit]
features = features.sort_values(["y", "score"], ascending=[True, False])
X_names = features["feature"].unique().tolist()

## Re-fit the vectorizer on the training corpus restricted to the selected terms
vectorizer = TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)

# Training matrix: TF-IDF columns followed by the numeric feature columns.
df_tfidfvect = vectorizer.transform(corpus)
df_tfidfvect = hstack([df_tfidfvect, X_train[feature]])
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, so convert to a list before concatenating with `feature`
# (ndarray + list would attempt element-wise addition instead).
tfidf_tokens = list(vectorizer.get_feature_names_out())
df_tfidfvect = pd.DataFrame(data=df_tfidfvect.toarray(), columns=tfidf_tokens + feature)

# Normalise Features (scaler is fitted on the training split only)
scaler = MinMaxScaler()
df_tfidfvect[feature] = scaler.fit_transform(df_tfidfvect[feature])

# Test matrix: same vocabulary and same column order as the training matrix.
corpus = X_test["preprocessed_posts"].values.astype(str)
df_tfidfvect_test = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

df_tfidfvect_test = hstack([df_tfidfvect_test, X_test[feature]])
tfidf_tokens_test = list(vectorizer.get_feature_names_out())
df_tfidfvect_test = pd.DataFrame(data=df_tfidfvect_test.toarray(), columns=tfidf_tokens_test + feature)

# Normalise Features (re-use the scaler fitted on the training split)
df_tfidfvect_test[feature] = scaler.transform(df_tfidfvect_test[feature])

# Train Model
# Fixed seed so that Restart & Run All reproduces the reported scores.
rf = RandomForestClassifier(random_state=42)
rf = rf.fit(df_tfidfvect, Y_train)

# Prediction & Evaluation
prediction = rf.predict(df_tfidfvect_test)

# Class display names taken from the first column of df.
# NOTE(review): assumes the alphabetical order of these names matches the
# numeric order of the encoded labels in Y -- confirm against the encoder.
types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
# zero_division=0 reports 0.0 for classes that are never predicted (the same
# numbers as the default) without emitting undefined-metric warnings.
print(classification_report(Y_test, prediction, target_names=types, zero_division=0))
print(sklearn.metrics.f1_score(Y_test, prediction, average='micro'))



Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.62      0.38      0.47        55
        ENFP       0.58      0.49      0.53       212
        ENTJ       0.72      0.35      0.47        74
        ENTP       0.54      0.62      0.58       196
        ESFJ       0.00      0.00      0.00        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.67      0.18      0.29        11
        ESTP       0.50      0.18      0.26        28
        INFJ       0.55      0.58      0.56       436
        INFP       0.52      0.69      0.60       545
        INTJ       0.62      0.47      0.54       365
        INTP       0.54      0.63      0.59       378
        ISFJ       0.61      0.33      0.43        58
        ISFP       0.50      0.33      0.40        73
        ISTJ       0.60      0.41      0.49        61
        ISTP       0.51      0.55      0.53        86

    accuracy                           0.55      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Multiclass + Features - Balanced
#
# Same pipeline as the unbalanced baseline (chi-squared-filtered TF-IDF terms
# plus the numeric columns listed in `feature`), but the random forest is
# trained with class_weight='balanced'.

# Determine X and Y
X = df[['preprocessed_posts'] + feature]  # Preprocessed_Posts + Features
Y = df.iloc[:, 2].values  # EncodedType (assumed to be the third column -- TODO confirm)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

## TF-IDF
# Float min_df/max_df are document-frequency proportions: keep terms that
# appear in at least 10% and at most 30% of the training documents.
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3, max_features=10000)

## Extract vocabulary (fitted on the training split only)
corpus = X_train["preprocessed_posts"].values.astype(str)
vectorizer.fit(corpus)
X_training = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

# Feature Selection

## Reduce dimensionality of the sparse matrix with a chi-squared test,
## run one-vs-rest for every class; a term is kept if (1 - p) exceeds the
## limit for at least one class.
X_names = vectorizer.get_feature_names_out()
p_value_limit = 0.95
# Collect the per-class frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
class_scores = []
for cat in np.unique(Y_train):
    _, p = feature_selection.chi2(X_training, Y_train == cat)
    class_scores.append(pd.DataFrame(
        {"feature": X_names, "score": 1 - p, "y": cat}))
features = pd.concat(class_scores)
features = features[features["score"] > p_value_limit]
features = features.sort_values(["y", "score"], ascending=[True, False])
X_names = features["feature"].unique().tolist()

## Re-fit the vectorizer on the training corpus restricted to the selected terms
vectorizer = TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)

# Training matrix: TF-IDF columns followed by the numeric feature columns.
df_tfidfvect = vectorizer.transform(corpus)
df_tfidfvect = hstack([df_tfidfvect, X_train[feature]])
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, so convert to a list before concatenating with `feature`
# (ndarray + list would attempt element-wise addition instead).
tfidf_tokens = list(vectorizer.get_feature_names_out())
df_tfidfvect = pd.DataFrame(data=df_tfidfvect.toarray(), columns=tfidf_tokens + feature)

# Normalise Features (scaler is fitted on the training split only)
scaler = MinMaxScaler()
df_tfidfvect[feature] = scaler.fit_transform(df_tfidfvect[feature])

# Test matrix: same vocabulary and same column order as the training matrix.
corpus = X_test["preprocessed_posts"].values.astype(str)
df_tfidfvect_test = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

df_tfidfvect_test = hstack([df_tfidfvect_test, X_test[feature]])
tfidf_tokens_test = list(vectorizer.get_feature_names_out())
df_tfidfvect_test = pd.DataFrame(data=df_tfidfvect_test.toarray(), columns=tfidf_tokens_test + feature)

# Normalise Features (re-use the scaler fitted on the training split)
df_tfidfvect_test[feature] = scaler.transform(df_tfidfvect_test[feature])

# Train Model
# class_weight='balanced' re-weights classes inversely to their frequency;
# the fixed seed makes Restart & Run All reproduce the reported scores.
rf_b = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_b = rf_b.fit(df_tfidfvect, Y_train)

# Prediction & Evaluation
prediction = rf_b.predict(df_tfidfvect_test)

# Class display names taken from the first column of df.
# NOTE(review): assumes the alphabetical order of these names matches the
# numeric order of the encoded labels in Y -- confirm against the encoder.
types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
# zero_division=0 reports 0.0 for classes that are never predicted (the same
# numbers as the default) without emitting undefined-metric warnings.
print(classification_report(Y_test, prediction, target_names=types, zero_division=0))
print(sklearn.metrics.f1_score(Y_test, prediction, average='micro'))



Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.49      0.44      0.46        55
        ENFP       0.56      0.45      0.50       212
        ENTJ       0.56      0.46      0.50        74
        ENTP       0.54      0.62      0.58       196
        ESFJ       0.57      0.36      0.44        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.62      0.45      0.53        11
        ESTP       0.50      0.32      0.39        28
        INFJ       0.53      0.57      0.55       436
        INFP       0.53      0.67      0.59       545
        INTJ       0.67      0.42      0.52       365
        INTP       0.54      0.59      0.56       378
        ISFJ       0.51      0.48      0.50        58
        ISFP       0.38      0.32      0.35        73
        ISTJ       0.56      0.54      0.55        61
        ISTP       0.54      0.59      0.57        86

    accuracy                           0.54      2603
   

In [7]:
# Hyperparameter tuning
# Candidate values for the cross-validated grid search over the random forest.
random_grid = dict(
    max_depth=[30, 60, 80],
    max_features=['log2', 'sqrt'],
    min_samples_split=[2, 5, 10],
    n_estimators=[500, 1000, 1500],
    criterion=['gini', 'entropy'],
    random_state=[42],  # single value: pins the forest's seed in every candidate
)

In [10]:
# Exhaustive 3-fold cross-validated search over `random_grid` for an
# unweighted random forest; n_jobs=-1 uses all available cores.
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=random_grid,
    cv=3,
    n_jobs=-1,
)

In [11]:
# Run the search on the training matrix; every candidate in the grid is
# fitted once per CV fold, so this cell is slow.
rf_cv.fit(X=df_tfidfvect, y=Y_train)

In [12]:
# Show the parameter combination that scored best in the cross-validated search.
print(rf_cv.best_params_)

{'criterion': 'gini', 'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 1500, 'random_state': 42}


In [None]:
# Grid search for the class-weighted forest, mirroring the unweighted search.
# GridSearchCV takes `param_grid`; `param_distributions`, `n_iter` and
# `random_state` are RandomizedSearchCV arguments and raise a TypeError here.
# The forest's own seed is already pinned through random_grid['random_state'].
# Note: this rebinds `rf_cv`, replacing the unweighted search object.
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced'),
    param_grid=random_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)

In [13]:
# Best Parameters - unbalanced
# Refit an unweighted forest on the training matrix with the tuned
# hyperparameters, then score it on the held-out test matrix.
rf_t_1 = RandomForestClassifier(n_estimators = 1500, min_samples_split = 5, max_features = 'log2', max_depth = 30, random_state = 42, criterion = 'gini')
rf_t_1 = rf_t_1.fit(df_tfidfvect, Y_train)

# Prediction & Evaluation
prediction = rf_t_1.predict(df_tfidfvect_test)

# Class display names taken from the first column of df.
# NOTE(review): assumes the alphabetical order of these names matches the
# numeric order of the encoded labels -- confirm against the encoder.
types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
# zero_division=0 reports 0.0 for classes that are never predicted (the same
# numbers as the default) without emitting undefined-metric warnings.
print(classification_report(Y_test, prediction, target_names=types, zero_division=0))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.57      0.29      0.39        55
        ENFP       0.63      0.49      0.55       212
        ENTJ       0.71      0.36      0.48        74
        ENTP       0.55      0.64      0.59       196
        ESFJ       0.00      0.00      0.00        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       1.00      0.09      0.17        11
        ESTP       0.67      0.21      0.32        28
        INFJ       0.56      0.57      0.56       436
        INFP       0.51      0.72      0.60       545
        INTJ       0.65      0.45      0.53       365
        INTP       0.54      0.65      0.59       378
        ISFJ       0.59      0.33      0.42        58
        ISFP       0.53      0.33      0.41        73
        ISTJ       0.63      0.39      0.48        61
        ISTP       0.54      0.55      0.54        86

    accuracy                           0.56      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Best Parameters - balanced
# Class-weighted forest with the tuned hyperparameters, trained in one step.
# Note: re-uses the name `rf_t_1`, so any earlier model bound to it is replaced.
rf_t_1 = RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=30,
    max_features='log2',
    min_samples_split=5,
    n_estimators=1500,
    random_state=42,
).fit(df_tfidfvect, Y_train)

# Prediction & Evaluation on the held-out test matrix
prediction = rf_t_1.predict(df_tfidfvect_test)

# Unique values of df's first column, sorted, used as class display names.
types = sorted(set(df.iloc[:, 0].values))

print("Classification Report:")
report = classification_report(Y_test, prediction, target_names=types)
print(report)
micro_f1 = sklearn.metrics.f1_score(Y_test, prediction, average='micro')
print(micro_f1)

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.41      0.51      0.45        55
        ENFP       0.57      0.54      0.56       212
        ENTJ       0.51      0.53      0.52        74
        ENTP       0.52      0.68      0.59       196
        ESFJ       0.60      0.55      0.57        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.60      0.55      0.57        11
        ESTP       0.52      0.43      0.47        28
        INFJ       0.59      0.53      0.56       436
        INFP       0.58      0.63      0.60       545
        INTJ       0.69      0.43      0.53       365
        INTP       0.57      0.63      0.60       378
        ISFJ       0.52      0.60      0.56        58
        ISFP       0.41      0.44      0.42        73
        ISTJ       0.53      0.57      0.55        61
        ISTP       0.52      0.64      0.58        86

    accuracy                           0.56      2603
   