In [83]:
import pandas as pd
import os

df = pd.read_csv("data" + os.sep + "mbit_preprocessed_features_scaled.csv", index_col=0)

In [84]:
# Install packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.metrics import classification_report
import sklearn
import pickle
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler

In [85]:
def create_train_test_split(X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42069)
    return X_train, X_test, Y_train, Y_test

In [86]:
features = ['avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations']

In [87]:
# Multiclass + Features - Unbalanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 2].values #EncodedType

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
df_tfidfvect = hstack([df_tfidfvect, X[features]])
tfidf_tokens = vectorizer.get_feature_names()
X = pd.DataFrame(data = df_tfidfvect.toarray(),columns = tfidf_tokens + features)

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
lr = LogisticRegression()
lr = lr.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = lr.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))



Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.62      0.29      0.40        55
        ENFP       0.61      0.41      0.49       212
        ENTJ       0.74      0.42      0.53        74
        ENTP       0.61      0.57      0.59       196
        ESFJ       0.33      0.09      0.14        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.57      0.14      0.23        28
        INFJ       0.56      0.61      0.58       436
        INFP       0.50      0.73      0.59       545
        INTJ       0.61      0.52      0.56       365
        INTP       0.52      0.67      0.59       378
        ISFJ       0.76      0.28      0.41        58
        ISFP       0.50      0.29      0.37        73
        ISTJ       0.73      0.26      0.39        61
        ISTP       0.72      0.44      0.55        86

    accuracy                           0.56      2603
   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [89]:
# Multiclass + Features - Unbalanced

# Determine X and Y
X = df[['preprocessed_posts', 'avg_word_count', 'avg_exclam_count', 'avg_fullstop_count', 'avg_count_of_hello', 'avg_count_of_hi', 'avg_count_of_extroverted_bigrams', 'avg_count_of_extroverted_stylistic_impressions',	'avg_count_of_interoverted_quantifiers',	'avg_count_of_introverted_first_person_singular_pronoun',	'avg_count_of_introverted_negations' ]] #Preprocessed_posts #Preprocessed_Posts + Features
Y = df.iloc[:, 2].values #EncodedType

# Vectorize
vectorizer = TfidfVectorizer(min_df = 0.1, max_df=0.3)
df_tfidfvect = vectorizer.fit_transform(X['preprocessed_posts'].values)
X = df_tfidfvect

# Create splits
X_train, X_test, Y_train, Y_test = create_train_test_split(X, Y)

# Train Model
lr = LogisticRegression()
lr = lr.fit(X_train, Y_train)

# Prediction & Evaluation
prediction = lr.predict(X_test)

types = df.iloc[:, 0].values
types = sorted(list(set(types)))

print("Classification Report:")
print(classification_report(Y_test, prediction, target_names=types))
print(sklearn.metrics.f1_score(Y_test, prediction, average ='micro'))

Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.60      0.27      0.37        55
        ENFP       0.59      0.39      0.47       212
        ENTJ       0.69      0.39      0.50        74
        ENTP       0.62      0.59      0.61       196
        ESFJ       0.33      0.09      0.14        11
        ESFP       0.00      0.00      0.00        14
        ESTJ       0.00      0.00      0.00        11
        ESTP       0.57      0.14      0.23        28
        INFJ       0.55      0.61      0.58       436
        INFP       0.50      0.73      0.60       545
        INTJ       0.61      0.50      0.55       365
        INTP       0.52      0.68      0.59       378
        ISFJ       0.81      0.29      0.43        58
        ISFP       0.47      0.26      0.34        73
        ISTJ       0.74      0.28      0.40        61
        ISTP       0.70      0.43      0.53        86

    accuracy                           0.55      2603
   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Features don't increase the performance