In [12]:
import pandas as pd
import numpy as np
import re
import time

from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import nltk
from nltk.corpus import stopwords
from collections import Counter

In [4]:
# Initial data load; the last expression displays (number of rows, number of columns).
# NOTE(review): machine-specific location - point DATA_PATH at your copy of the CSV.
DATA_PATH = '~/Desktop/mbti_1.csv'
data = pd.read_csv(DATA_PATH)
data.shape

In [5]:
def clean_mbti_text(data):
    """Return a copy of ``data`` whose ``posts`` column has been normalised.

    Steps applied to ``posts``, in order:

    1. lowercase everything;
    2. replace ``|`` with a space and drop ``http://`` / ``https://`` links;
    3. delete every occurrence of the 16 four-letter MBTI codes;
    4. delete every character that is not an ASCII letter or whitespace;
    5. collapse runs of two or more whitespace characters into one space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a text column named ``posts``. All other columns are
        carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``posts`` column. ``data`` itself is
        left untouched, so the function can be re-run on the raw frame.
    """
    personalities_list = ['enfj', 'enfp', 'entj', 'entp', 'esfj', 'esfp', 'estj', 'estp',
                          'infj', 'infp', 'intj', 'intp', 'isfj', 'isfp', 'istj', 'istp']

    # lowercase
    posts = data['posts'].str.lower()

    # remove links: \S* swallows the whole URL up to the next whitespace,
    # including any '+' characters inside it
    posts = posts.str.replace('|', ' ', regex=False)
    posts = posts.str.replace(r'https?://\S*', ' ', regex=True)

    # remove MBTI personality labels - iterate the list itself so the last
    # entry ('istp') is stripped as well
    for personality in personalities_list:
        posts = posts.str.replace(personality, '', regex=False)

    # remove nonwords (digits, punctuation, symbols); after this only letters
    # and whitespace remain, so no separate punctuation pass is needed
    posts = posts.str.replace(r'[^a-zA-Z\s]', '', regex=True)

    # remove extra spaces; regex=True is spelled out because the pattern is a
    # regular expression and must not be treated as a literal string
    posts = posts.str.replace(r'\s\s+', ' ', regex=True)

    clean = data.copy()
    clean['posts'] = posts

    return clean

In [6]:
# Run the cleaning pipeline over the loaded frame; `clean_text` is what text_split consumes below.
clean_text = clean_mbti_text(data)

  data['posts'] = data['posts'].str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')


In [7]:
def text_split(text):
    """Build one binary-labelled frame per MBTI axis.

    For each of the four letter positions of the upper-case ``type`` code a
    copy of ``text`` is produced in which ``type`` is replaced by an integer:

    ========  =========  =========
    frame     encoded 0  encoded 1
    ========  =========  =========
    ``EI``    E          I
    ``NS``    N          S
    ``TF``    T          F
    ``JP``    J          P
    ========  =========  =========

    Rows whose ``type`` is not one of the 16 upper-case MBTI codes are
    dropped. Inside every returned frame all 0-rows come first, then all
    1-rows, each group in its original order and with its original index.
    Only the ``type`` column is rewritten; every other column is passed
    through untouched.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame with a ``type`` column holding four-letter MBTI codes.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(EI, NS, TF, JP)``, each with an int64 ``type`` column.
    """
    valid_types = [e + n + t + j
                   for e in 'EI' for n in 'NS' for t in 'TF' for j in 'JP']
    known = text[text['type'].isin(valid_types)]

    splits = []
    # position of the letter inside the code -> letter that is encoded as 1
    for position, one_letter in enumerate('ISFP'):
        labels = (known['type'].str[position] == one_letter).astype('int64')
        encoded = known.assign(type=labels)
        # mergesort is stable, so rows keep their original relative order
        # within the 0-group and within the 1-group
        splits.append(encoded.sort_values('type', kind='mergesort'))

    EI, NS, TF, JP = splits

    return EI, NS, TF, JP

In [8]:
# Build one binary-labelled frame per MBTI axis (E/I, N/S, T/F, J/P) from the cleaned text.
EI, NS, TF, JP = text_split(clean_text)

In [9]:
# Features (post text) and binary target for the E/I axis.
EI_x, EI_y = EI['posts'], EI['type']

In [10]:
# Turn the post text into TF-IDF features: English stop words removed,
# unigrams and bigrams, terms appearing in fewer than 3 documents dropped.
# NOTE(review): the vocabulary and IDF weights are learned from every row of
# EI_x, i.e. before the train/test split cells below - confirm that letting
# the held-out rows influence the features is acceptable.
vectorizer = TfidfVectorizer(
    min_df=3,
    stop_words='english',
    ngram_range=(1, 2),
    lowercase=True,
)

# learn the vocabulary and vectorise the same texts in a single call
X = vectorizer.fit_transform(EI_x)

In [None]:
# Stratified 70/15/15 train/validation/test split of the E/I features.
# Step 1: hold out 15% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    EI_y,
    test_size=0.15,
    shuffle=True,
    stratify=EI_y,
    random_state=42,
)

# Step 2: 0.15/0.85 of the remaining rows is 15% of the full data set,
# which becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.15/0.85,
    shuffle=True,
    stratify=y_train,
    random_state=42,
)

In [11]:
# Stratified 90/10 train/test split of the E/I features (no validation set).
# NOTE(review): this rebinds X_train/X_test/y_train/y_test. Any X_val/y_val
# left over from the 70/15/15 cell above comes from a different split of the
# same X and may overlap this X_train - confirm which split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, EI_y, 
                                   test_size=0.1, shuffle=True, stratify=EI_y, random_state=42)

In [None]:
# 12 evenly spaced candidate values for C between 50 and 80; the grid-search
# cells below hard-code this output (rounded to 8 decimals) as `c_values`.
np.linspace(50,80, num = 12)

In [14]:
# Hyper-parameter search for logistic regression on the E/I training split.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt -
# the search space times the repeated CV scheme is presumably too expensive
# to finish; consider shrinking one of them.
model = LogisticRegression(random_state=0, max_iter=1000)

# search space: every solver crossed with 12 evenly spaced C values in [50, 80]
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag']
penalty = ['l2']
c_values = [
    50.0, 52.72727273, 55.45454545, 58.18181818, 60.90909091, 63.63636364,
    66.36363636, 69.09090909, 71.81818182, 74.54545455, 77.27272727, 80.0,
]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# define grid search
cv = RepeatedStratifiedKFold(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

# summarize results: best candidate first, then one line per candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

KeyboardInterrupt: 

In [None]:
# Same logistic-regression search as the previous cell, but fitted on the
# validation split and with GridSearchCV's default cross-validation.
# NOTE(review): X_val / y_val are only created by the 70/15/15 split cell -
# confirm that cell has been run before this one.
model = LogisticRegression(random_state=0, max_iter=1000)

# search space: every solver crossed with 12 evenly spaced C values in [50, 80]
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag']
penalty = ['l2']
c_values = [
    50.0, 52.72727273, 55.45454545, 58.18181818, 60.90909091, 63.63636364,
    66.36363636, 69.09090909, 71.81818182, 74.54545455, 77.27272727, 80.0,
]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# define grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    n_jobs=-1,
    scoring='accuracy',
    error_score=0,
)
grid_result = grid_search.fit(X_val, y_val)

# summarize results: best candidate first, then one line per candidate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Final logistic-regression model for the E/I axis, with fit/predict timings.
# NOTE(review): C=63 and solver='newton-cg' are presumably taken from a
# grid-search result - confirm against the search output.
lg = LogisticRegression(random_state=0, C=63, penalty='l2', solver='newton-cg', max_iter=1000)

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals (time.time() can jump with system clock changes)
t0 = time.perf_counter()
lg.fit(X_train, y_train)
t1 = time.perf_counter()
lg_train_time = t1 - t0  # seconds spent fitting

t0 = time.perf_counter()
y_true, y_pred_lg = y_test, lg.predict(X_test)
t1 = time.perf_counter()
lg_pred_time = t1 - t0  # seconds spent predicting on the test split

# classification report as a dict, then as a DataFrame for display
lg_report = classification_report(y_true, y_pred_lg, output_dict=True)
df_lg = pd.DataFrame(lg_report)

In [None]:
# Display the classification-report table built in the previous cell.
df_lg