In [2]:
# Install packages
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load data
df = pd.read_csv("mbti_preprocessed_features.csv", index_col= 0)

In [4]:
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [8]:
# Multiclass

# Determine X and Y
X = df.iloc[:, 3].values #Preprocessed_posts
Y = df.iloc[:, 2].values #EncodedType

# Vectorize
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.60      0.11      0.18        55
        ENFP       0.69      0.39      0.50       212
        ENTJ       0.72      0.18      0.28        74
        ENTP       0.57      0.61      0.59       196
        ESFJ       0.00      0.00      0.00        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.00      0.00      0.00        28
        INFJ       0.50      0.59      0.54       436
        INFP       0.47      0.78      0.59       545
        INTJ       0.66      0.41      0.50       365
        INTP       0.50      0.63      0.56       378
        ISFJ       0.68      0.22      0.34        58
        ISFP       0.45      0.12      0.19        73
        ISTJ       0.86      0.20      0.32        61
        ISTP       0.54      0.40      0.46        86

    accuracy                           0.52      2603
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Binary Extroverted-Introverted

# Determine X and Y
X = df.iloc[:, 3].values #Preprocessed_posts
Y = df.iloc[:, 4].values #extro_intro

# Vectorize
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Introverted', 'Extroverted']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

 Introverted       0.79      0.99      0.88      2002
 Extroverted       0.84      0.13      0.23       601

    accuracy                           0.79      2603
   macro avg       0.81      0.56      0.56      2603
weighted avg       0.80      0.79      0.73      2603

0.7940837495197849


In [10]:
# Binary Sensing-Intuition

# Determine X and Y 
X = df.iloc[:, 3].values #Preprocessed_posts
Y = df.iloc[:, 5].values #intu-obs

# Vectorize
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Sensing', 'Intuition']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

     Sensing       0.80      0.04      0.07       342
   Intuition       0.87      1.00      0.93      2261

    accuracy                           0.87      2603
   macro avg       0.84      0.52      0.50      2603
weighted avg       0.86      0.87      0.82      2603

0.8720706876680753


In [12]:
# Binary Thinking-Feeling

# Determine X and Y 
X = df.iloc[:, 3].values #Preprocessed_posts
Y = df.iloc[:, 6].values # think - feel

# Vectorize
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Thinking', 'Feeling']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

    Thinking       0.78      0.71      0.74      1199
     Feeling       0.77      0.83      0.80      1404

    accuracy                           0.77      2603
   macro avg       0.77      0.77      0.77      2603
weighted avg       0.77      0.77      0.77      2603

0.7737226277372263


In [13]:
# Binary Judging-Perceiving

# Determine X and Y 
X = df.iloc[:, 3].values #Preprocessed_posts
Y = df.iloc[:, 7].values # think - feel

# Vectorize
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.3)
matrix = vectorizer.fit_transform(X)
X = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Judging', 'Perceiving']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

     Judging       0.81      0.39      0.52      1071
  Perceiving       0.69      0.93      0.79      1532

    accuracy                           0.71      2603
   macro avg       0.75      0.66      0.66      2603
weighted avg       0.74      0.71      0.68      2603

0.7099500576258164
