In [6]:
# Install packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from scipy.sparse import hstack
from sklearn.metrics import classification_report
import sklearn
import pickle
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler

In [7]:
# Load data
df = pd.read_csv("mbti_preprocessed_features.csv", index_col= 0)

In [8]:
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [9]:
features = ['avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations']

In [10]:
# Binary + Features - Unbalanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 4].values #Extro-Intro

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
dt = DecisionTreeClassifier()
dt = dt.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = dt.predict(X_test)

types = ['Introverted', 'Extroverted']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

 Introverted       0.83      0.83      0.83      2002
 Extroverted       0.45      0.45      0.45       601

    accuracy                           0.74      2603
   macro avg       0.64      0.64      0.64      2603
weighted avg       0.74      0.74      0.74      2603

0.7433730311179407


In [6]:
# Binary + Features - Balanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 4].values #Extro-Intro

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names_out()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# balance data
sampler = RandomOverSampler()
X_train_balanced, Y_train_balanced = sampler.fit_resample(X_train, Y_train)

# Train Model
dt = DecisionTreeClassifier()
dt = dt.fit(X_train_balanced, Y_train_balanced)

# Prediction & Evaluation
prediction = dt.predict(X_test)

types = ['Introverted', 'Extroverted']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

 Introverted       0.84      0.81      0.82      2002
 Extroverted       0.43      0.47      0.45       601

    accuracy                           0.73      2603
   macro avg       0.63      0.64      0.63      2603
weighted avg       0.74      0.73      0.74      2603

0.7310795236265847
