In [1]:
# Importing necessary packages (kept together so dependencies are visible at a glance)
import numpy as np
import pandas as pd
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Shared spaCy English pipeline object; later cells call nlp(text) to tokenize posts
nlp = English()

In [2]:
# Load the raw dataset from a CSV file in the working directory
mbti_csv_path = 'mbti_1.csv'
mbti_df = pd.read_csv(mbti_csv_path)

In [3]:
# Each user's posts arrive as one string delimited by '|||'; turn it into a list of posts.
# NOTE: this overwrites the 'posts' column, so the cell cannot be re-run without reloading the data.
mbti_df['posts'] = mbti_df['posts'].map(lambda raw_posts: raw_posts.split('|||'))

In [4]:
# Preview the first five rows
mbti_df.head(n=5)

Unnamed: 0,type,posts
0,INFJ,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ..."
1,ENTP,['I'm finding the lack of me in these posts ve...
2,INTP,['Good one _____ https://www.youtube.com/wa...
3,INTJ,"['Dear INTP, I enjoyed our conversation the ..."
4,ENTJ,"['You're fired., That's another silly misconce..."


In [5]:
# Users do not all have the same number of posts, but the majority have exactly 50,
# so only those users are kept.
# Idea: treat each post (rather than each user) as an observation to greatly increase sample size.
posts_required = 50
mbti_df['num_posts'] = mbti_df['posts'].map(len)
has_required_posts = mbti_df['num_posts'] == posts_required
mbti_df = mbti_df[has_required_posts].reset_index(drop=True)

In [6]:
# One row per post: repeat each user's type once for every post they wrote,
# and flatten the per-user post lists into a single column.
posts_per_user = mbti_df['posts'].str.len()
split_df = pd.DataFrame({
    'type': np.repeat(mbti_df['type'].values, posts_per_user),
    'post': np.concatenate(mbti_df['posts'].values),
})

In [7]:
# Number of post-level observations
split_df.shape[0]

379350

In [27]:
# Inspect class balance: number of posts per personality type
# NOTE(review): the saved output of this cell comes from an out-of-order run (its execution
# count is 27) and its counts sum to more than len(split_df) -- re-run after Restart & Run All.
type_counts = split_df['type'].value_counts()
pd.DataFrame(type_counts)

Unnamed: 0,type
INFP,89796
INFJ,72105
INTP,63359
INTJ,52471
ENTP,33761
ENFP,32769
ISTP,16498
ISFP,13000
ENTJ,11273
ISTJ,9913


In [7]:
# Column labels 'Post 1' ... 'Post 50' -- one label per post kept for each user
post = 'Post {}'
nums = range(1, 51)  # 1-based numbering; range's upper bound is exclusive
# str.format already stringifies its arguments, so no explicit str() is needed
posts = [post.format(num) for num in nums]

In [8]:
# Wide layout: one row per user, one column per post, plus the user's type.
# The 'type' assignment aligns on index; mbti_df's index was reset when it was filtered.
df_update = pd.DataFrame(data=mbti_df['posts'].tolist(), columns=posts)
df_update['type'] = mbti_df['type']

In [9]:
# Preview the first user's row in the wide layout
df_update.head(n=1)

Unnamed: 0,Post 1,Post 2,Post 3,Post 4,Post 5,Post 6,Post 7,Post 8,Post 9,Post 10,...,Post 42,Post 43,Post 44,Post 45,Post 46,Post 47,Post 48,Post 49,Post 50,type
0,'http://www.youtube.com/watch?v=qsXHcwe3krw,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,enfp and intj moments https://www.youtube.com...,What has been the most life-changing experienc...,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,May the PerC Experience immerse you.,The last thing my INFJ friend posted on his fa...,Hello ENFJ7. Sorry to hear of your distress. I...,84389 84390 http://wallpaperpassion.com/uplo...,Welcome and stuff.,...,Not all artists are artists because they draw....,"Welcome to the robot ranks, person who downed ...",Banned for taking all the room under my bed. Y...,http://www.youtube.com/watch?v=w8IgImn57aQ,"Banned for being too much of a thundering, gru...",Ahh... old high school music I haven't heard i...,I failed a public speaking class a few years a...,I like this person's mentality. He's a confirm...,Move to the Denver area and start a new life f...,INFJ


In [10]:
# Look at a single post (row label 0, column 'Post 4')
df_update.loc[0, 'Post 4']

'What has been the most life-changing experience in your life?'

In [11]:
def token_rem_stop(doc):
    """Return the text of every token in ``doc`` that is not a stop word.

    Parameters
    ----------
    doc : iterable of spaCy tokens
        Typically the ``Doc`` produced by ``nlp(text)``. Each item must
        expose ``.text`` and ``.is_stop``.

    Returns
    -------
    list of str
        Token texts in their original order with stop words removed.
        Punctuation tokens are not filtered out (e.g. ``'-'`` and ``'?'``
        are kept).
    """
    # Read the stop-word flag straight from each token instead of collecting
    # the strings first and looking each one up again in the global
    # ``nlp.vocab``: a single pass, and no hidden dependency on a
    # module-level pipeline object.
    return [token.text for token in doc if not token.is_stop]


In [13]:
# Sanity-check token_rem_stop on a single post
sample_post = df_update.loc[0, 'Post 4']
token_rem_stop(nlp(sample_post))

['life', '-', 'changing', 'experience', 'life', '?']