# Myers-Briggs Personality Type Prediction


# 1 - Packages #

Let's first import all the packages that you will need.

In [1]:
# Package imports
import numpy as np
import tensorflow as tf
import re
import random
from nltk.stem.snowball import SnowballStemmer
import nltk
# nltk.download()

# 2 - Preprocessing helper functions #

In [2]:
import string
import json
from nltk.corpus import words

# English lexicon: every entry of the NLTK `words` corpus mapped to True, so
# that membership tests are plain dictionary lookups.
lexicon = dict.fromkeys(words.words(), True)

# Data store: collects the processed examples.
data = []

# Vocabulary of stemmed words: root word -> integer index.
# `word_val` is the next index to hand out.
words_dict = {}
word_val = 0

# Personality types: type label -> integer index.
# `personality_type_val` is the next index to hand out.
personality_type_dict = {}
personality_type_val = 0

# Snowball stemmer for English
stemmer = SnowballStemmer("english")


def post_clean_up(post):
    """Strip URLs, punctuation and digits from a post.

    The steps run in a fixed order: URLs are removed first (while their
    punctuation is still present for the pattern to match), then every
    character of ``string.punctuation``, then runs of digits.  Leading and
    trailing whitespace is trimmed at the end; whitespace inside the text is
    left as it is, so removed pieces may leave double spaces behind.

    Args:
        post: Raw post text.

    Returns:
        The cleaned text.
    """
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned = re.sub(url_pattern, '', post)
    cleaned = cleaned.translate(punctuation_table)
    cleaned = re.sub(r'\d+', '', cleaned)
    return cleaned.strip()

def apply_snow_ball_stemmer(post):
    """Stem the lexicon words of a post and register new stems.

    Whitespace-separated tokens that are not keys of the module-level
    ``lexicon`` are dropped (the lookup is case-sensitive).  Every remaining
    token is reduced to its root with the module-level ``stemmer``.  A root
    seen for the first time is added to ``words_dict`` with the next free
    index, taken from the global counter ``word_val``.

    Args:
        post: Cleaned post text.

    Returns:
        The roots joined by single spaces, or "" if no token survives.
    """
    global word_val
    root_words = []
    for word in post.split():
        if word not in lexicon:
            continue
        root_word = stemmer.stem(word)

        if root_word not in words_dict:
            words_dict[root_word] = word_val
            word_val += 1

        root_words.append(root_word)

    # Join once instead of growing a string with += for every token.
    return " ".join(root_words).strip()

def read_file(location):
    """Return the lines of a text file, without its first line.

    The first line is always skipped, whether or not it is a header.  The
    remaining lines keep their trailing newline characters.

    Args:
        location: Path of the file to read.

    Returns:
        A list of lines (empty if the file has at most one line).
    """
    # The context manager closes the handle even if reading fails.
    with open(location, 'r') as file:
        return file.readlines()[1:]

def read_json(location):
    """Load a JSON file and return the decoded object.

    Args:
        location: Path of the JSON file.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(location) as source:
        return json.load(source)
    
def save_csv(data, resource_location):
    """Write two-column rows to a text file with ``np.savetxt``.

    Each row is written as ``<first>, <second>``: the separator is a comma
    followed by a space, and no header line is written.

    Args:
        data: Two-column array-like of values.
        resource_location: Path of the file to write.
    """
    row_format = '%s, %s'
    np.savetxt(resource_location, data, fmt=row_format)

def save_json(data, resource_location):
    """Serialize ``data`` as JSON into a file, overwriting it if it exists.

    Args:
        data: JSON-serializable object.
        resource_location: Path of the file to write.
    """
    # The with-block closes the file; no explicit close() is needed.
    with open(resource_location, 'w') as fp:
        json.dump(data, fp)

# 3 - Preprocessing mbti dataset #

In [6]:


# Open mbti file
mbti_resource_location = "../data/mbti_1.csv"

# read_file() skips the first line (presumably the CSV header - confirm).
lines = read_file(mbti_resource_location)

# Running total of the lengths of all kept posts; divided by the number of
# examples at the end to report the mean post length.
mean_length = 0

for line in lines:

    # Everything before the first comma is the personality type, the rest of
    # the line holds that author's posts separated by "|||".
    personality_type, _, posts = line.partition(",")

    # Give every personality type a stable integer id on first sight.
    if personality_type not in personality_type_dict:
        personality_type_dict[personality_type] = personality_type_val
        personality_type_val += 1

    for post in posts.split("|||"):
        # Removing URLs, punctuation, and digits
        post = post_clean_up(post)

        # Skip posts of 10 characters or fewer
        if len(post) <= 10:
            continue

        # Apply Snowball stemmer (also drops words missing from the lexicon)
        post = apply_snow_ball_stemmer(post)

        # Stemming may have shortened the post: apply the same cut-off again
        if len(post) <= 10:
            continue

        mean_length += len(post)
        example = [post, personality_type]
        data.append(example)


data = np.asarray(data)
# Report 0.0 instead of raising ZeroDivisionError when no post was kept.
print(data.shape, (mean_length / data.shape[0]) if data.shape[0] else 0.0)

(381453, 2) 106.30440447446999


In [13]:
import json
# Output locations for the artifacts produced by the preprocessing step
processed_mbti_resource_location = "../data/processed_mbti.csv"
words_dict_resource_location = '../data/words_dict.json'
personality_type_dict_resource_location = '../data/personality_dict.json'


# Save processed mbti data
save_csv(data, processed_mbti_resource_location)
# Save words_dict
save_json(words_dict, words_dict_resource_location)
# Save personality_type_dict
save_json(personality_type_dict, personality_type_dict_resource_location)


# 4 - Training a deep neural network #

In [None]:
import random
from numpy import sqrt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder

processed_mbti_resource_location = "../data/processed_mbti.csv"
words_dict_resource_location = '../data/words_dict.json'
personality_type_dict_resource_location = '../data/personality_dict.json'


# NOTE: read_file() always drops the first line.  save_csv() writes no header,
# so one example is lost here.
processed_mbti = read_file(processed_mbti_resource_location)
# Everything except the last 10000 lines is used for training.
processed_mbti = processed_mbti[:len(processed_mbti)-10000]
lexicon_dict = read_json(words_dict_resource_location)

# One fixed label -> class index mapping for the whole run.  Fitting a fresh
# LabelEncoder on every chunk gives shifted indices whenever a chunk happens
# to miss a class, so the network would be trained on inconsistent targets.
# Alphabetical order is what LabelEncoder yields when all classes are present,
# which keeps the indices compatible with code that still relies on it.
label_index = {
    label: index
    for index, label in enumerate(
        sorted(read_json(personality_type_dict_resource_location))
    )
}

random.shuffle(processed_mbti)
# Feature vector size: one slot per vocabulary index (bag-of-words, 0/1)
n_features = max(lexicon_dict.values()) + 1
n_classes = len(label_index)

model = Sequential()
model.add(Dense(20, activation='relu', kernel_initializer="he_normal", input_shape=(n_features,)))
for _ in range(5):
    model.add(Dense(20, activation='relu', kernel_initializer="he_normal"))
model.add(Dense(n_classes, activation='softmax'))

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# The data is vectorized and fitted chunk by chunk so that the dense feature
# matrix never has to hold more than `chunk_size` rows at once.
# NOTE(review): each chunk is fitted for 15 epochs before the next one is
# seen; consider a generator with a few passes over the full set instead.
chunk_size = 500
for i, start in enumerate(range(0, len(processed_mbti), chunk_size)):
    chunk = processed_mbti[start:start + chunk_size]
    X = np.zeros((len(chunk), n_features), dtype=np.float32)
    Y = np.empty(len(chunk), dtype=np.int64)

    for row, line in enumerate(chunk):
        # Lines look like "<post>, <type>\n" (see save_csv's format string).
        post, _, personality_type = line.partition(",")

        for word in post.split(" "):
            column = lexicon_dict.get(word)
            if column is not None:
                X[row, column] = 1

        # strip() removes the space after the comma and the trailing newline.
        Y[row] = label_index[personality_type.strip()]

    # Print progress for every fifth chunk only.
    if i % 5 == 0:
        print("############################################")
        model.fit(X, Y, epochs=15, batch_size=64, verbose=2)
    else:
        model.fit(X, Y, epochs=15, batch_size=64, verbose=0)
    

# 5 - Saving the trained model #

In [9]:
# Persist the trained network to 'model.h5' (path relative to the working
# directory) so that it can be reloaded later with load_model('model.h5').
model.save('model.h5')

# 6 - Testing the trained model on dev set #

In [15]:
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

processed_mbti_resource_location = "../data/processed_mbti.csv"
words_dict_resource_location = '../data/words_dict.json'
personality_type_dict_resource_location = '../data/personality_dict.json'


processed_mbti = read_file(processed_mbti_resource_location)
# The last 10000 lines are the held-out part of the data.
processed_mbti = processed_mbti[len(processed_mbti)-10000:]
lexicon_dict = read_json(words_dict_resource_location)

# Fixed label -> class index mapping (alphabetical over all known types).
# Fitting a LabelEncoder on the dev sample makes the indices depend on which
# classes the sample happens to contain; alphabetical order over the full
# class list is what LabelEncoder yields when every class is present.
label_index = {
    label: index
    for index, label in enumerate(
        sorted(read_json(personality_type_dict_resource_location))
    )
}

# Evaluate on a random sample of 2000 held-out examples.
random.shuffle(processed_mbti)
processed_mbti = processed_mbti[:2000]

model = load_model('model.h5')

# Feature vector size: taken from the loaded model so that the dev features
# always have the width the network was built with.
size = model.input_shape[-1]

X_dev = np.zeros((len(processed_mbti), size), dtype=np.float32)
Y_dev = np.empty(len(processed_mbti), dtype=np.int64)

for row, line in enumerate(processed_mbti):
    # Lines look like "<post>, <type>\n" (see save_csv's format string).
    post, _, personality_type = line.partition(",")

    for word in post.split(" "):
        column = lexicon_dict.get(word)
        if column is not None:
            X_dev[row, column] = 1

    # strip() removes the space after the comma and the trailing newline.
    Y_dev[row] = label_index[personality_type.strip()]

loss, acc = model.evaluate(X_dev, Y_dev, verbose=2)
print("Test accuracy: %.3f" % acc)

63/63 - 0s - loss: 13.7735 - accuracy: 0.0905
Test accuracy: 0.090
