In [65]:
# Install packages
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from imblearn.over_sampling import RandomOverSampler
import pickle
from sklearn.model_selection import GridSearchCV

In [66]:
# load data
df = pd.read_csv('mbti_preprocessed_features.csv', index_col= 0)

In [67]:
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [68]:
features = ['avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations']

In [70]:
# Binary + Feautres - Unbalanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 4].values #EncodedType

#tf-idf
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Introverted', 'Extroverted']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

 Introverted       0.80      0.99      0.88      2002
 Extroverted       0.77      0.16      0.26       601

    accuracy                           0.79      2603
   macro avg       0.78      0.57      0.57      2603
weighted avg       0.79      0.79      0.74      2603

0.7944679216288898


In [71]:
# Binary + Features - Balanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df['extro_intro'].values #Extro-Intro

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Balance data
sampler = RandomOverSampler()
X_train_balanced, Y_train_balanced = sampler.fit_resample(X_train, Y_train)

# Train Model
rf = RandomForestClassifier()
rf = rf.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = rf.predict(X_test)

types = ['Introverted', 'Extroverted']

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

 Introverted       0.80      0.99      0.88      2002
 Extroverted       0.82      0.16      0.27       601

    accuracy                           0.80      2603
   macro avg       0.81      0.58      0.58      2603
weighted avg       0.80      0.80      0.74      2603

0.7990779869381481
