This notebook contains the preprocessing steps for the MBTI dataset. 

The Myers–Briggs Type Indicator (MBTI for short) is a personality type system that divides everyone into 16 distinct personality types across four axes:
* Introversion (I) – Extroversion (E)
* Intuition (N) – Sensing (S)
* Thinking (T) – Feeling (F)
* Judging (J) – Perceiving (P)

The dataset contains 8675 rows. Each row holds a person's MBTI personality type and the last 50 things they posted on the PersonalityCafe forum. 

In [116]:
# Import libraries 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import requests
from lxml.html import fromstring
import re 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import json 

In [None]:
import nltk

# Fetch the NLTK resources this notebook relies on:
# the stop-word corpus and WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

In [12]:
# Paths and string constants shared by the cells below.

# Location of the raw dataset (directory, then the csv inside it)
datadir = "../dataset/mbti-type/"
datafile = datadir + "mbti_1.csv"

# Substrings whose presence marks a post as containing a link
HTTP = [
    "http://",
    "https://",
    ".com",
    "www.",
]
# Image file extensions
IMAGE = [".jpg", ".png", ".gif"]
# Text emoticons
EMOJI = [":D", ":)", ":(", "D:", ":o"]
# Regex: "http" followed by everything up to the next whitespace
LINK = r'http\S+'

In [13]:
# Read the raw MBTI csv into a pandas DataFrame
df = pd.read_csv(datafile)
print("There are %d number of data " % df.shape[0])
# Preview the first five rows
df.head()

There are 8675 number of data 


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [14]:
# Work on a copy so the raw frame `df` stays available for later cells.
df_copy = df.copy()
# Each cell holds one "|||"-delimited string; turn it into a list of posts.
df_copy['posts'] = df_copy['posts'].map(lambda raw: raw.split("|||"))
print("The shape (%d,%d)" % df_copy.shape)

The shape (8675,2)


In [15]:
# First five posts of the first row
df_copy.loc[0, "posts"][:5]

["'http://www.youtube.com/watch?v=qsXHcwe3krw",
 'http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg',
 'enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks',
 'What has been the most life-changing experience in your life?',
 'http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.']

# Preprocess 

###  What to do with links? -> Scrape each link and use the title of the page instead 


In [16]:
def replace_links_title(df_copy, timeout=10, backup_every=50, backup_path="backup_df.csv"):
    """Replace each URL found in ``df_copy.posts`` with the title of the page it points to.

    Every row of ``df_copy.posts`` is expected to be a list of strings. The
    lists are modified in place, so ``df_copy`` itself is updated. A URL is
    only replaced when the page could be downloaded, parsed, and has a
    non-empty ``<title>``; otherwise the URL is left in the text.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Frame with a ``posts`` column holding lists of post strings.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.
    backup_every : int, optional
        Write a checkpoint every this many rows; a falsy value disables it.
    backup_path : str, optional
        csv file the checkpoint is written to (overwritten on each checkpoint).

    Returns
    -------
    dict
        ``{row_index: {post_index: post_text}}`` for each post in which at
        least one link was fetched and parsed, holding that post's text
        after processing.
    """
    # Posts containing one of these exact URLs are left untouched.
    skipped_links = (
        "http://-alexxxandra-.tumblr.com/",
        "http://memearchive.net/memerial.net/fullsize/1370.jpg",
    )
    nbr_link = 0
    link_dict = dict()
    for i, post in enumerate(df_copy.posts):
        link_dict[i] = dict()
        for j, p in enumerate(post):
            if not any(f in p for f in HTTP):
                continue
            links = re.findall(LINK, p)
            if any(s in links for s in skipped_links):
                continue
            for l in links:  # a post may hold several links
                # `except Exception` (not a bare except) keeps the scrape
                # best-effort while still letting Ctrl-C interrupt it.
                try:
                    r = requests.get(l, timeout=timeout)
                    r.raise_for_status()
                except Exception:
                    print("Error for i: %d j:%d" % (i, j))
                    continue
                try:
                    title = fromstring(r.content).findtext('.//title')
                except Exception:
                    print("Error in from String for i: %d j: %d" % (i, j))
                    continue
                # NOTE(review): titles containing "Bilgi" are ignored --
                # presumably a placeholder/error page; confirm.
                if title and "Bilgi" not in title:
                    # Replace only this link, and literally: a regex
                    # substitution would swap every link in the post for this
                    # one title and interpret backslashes in the title.
                    p = p.replace(l, title)
                link_dict[i][j] = p
                nbr_link += 1
            # `post` is the very list stored in the frame, so this in-place
            # write is what updates df_copy; no further assignment is needed.
            post[j] = p
        if backup_every and i % backup_every == 0:
            # checkpoint the progress made so far
            print("Saving result for %d " % (i))
            df_copy.to_csv(backup_path)
    print("Number of links in the whole data %d " % (nbr_link))
    return link_dict

### Lemmatization and Removing StopWords 

In [53]:
# Encoder that maps each distinct MBTI type string to an integer label
lab_encoder = LabelEncoder().fit(df["type"].unique().tolist())

In [110]:
# Clean the posts: mask links and type names, keep letters only,
# drop stop words and lemmatize.
def preprocess_data(df_copy):
    """Turn the posts of every row into one cleaned string and encode its type.

    For each post the text is lower-cased, remaining links become ``LNK``,
    MBTI type names become ``TYP``, every non-letter character becomes a
    space, English stop words are dropped and the remaining words are
    lemmatized. The cleaned posts of a row are joined with single spaces.

    Parameters
    ----------
    df_copy : pandas.DataFrame
        Frame with a ``posts`` column (iterables of post strings) and a
        ``type`` column whose values are known to ``lab_encoder``.

    Returns
    -------
    tuple of (list of str, list)
        The cleaned text per row and the label produced by
        ``lab_encoder.transform`` per row, in row order.
    """
    # A set makes the per-word stop-word test constant time.
    stopWordsEng = set(stopwords.words("english"))
    # Created here so the function does not rely on a notebook-level variable.
    lemmatiser = WordNetLemmatizer()
    link_pattern = re.compile(LINK)
    # Text is lower-cased before matching, so match lower-case type names.
    # The encoder's classes are the type names it was fitted on.
    type_pattern = re.compile('|'.join(t.lower() for t in lab_encoder.classes_))
    non_letters = re.compile("[^a-zA-Z]")

    post_list = []
    for posts in tqdm(df_copy['posts'], total=len(df_copy)):
        cleaned_posts = []
        for p in posts:
            # change remaining links to LNK
            temp = link_pattern.sub('LNK', p.lower())
            # change type names to TYP
            temp = type_pattern.sub('TYP', temp)
            # keep letters only
            temp = non_letters.sub(" ", temp)
            # split() collapses runs of spaces and yields no empty tokens
            words = [lemmatiser.lemmatize(w) for w in temp.split() if w not in stopWordsEng]
            cleaned_posts.append(" ".join(words))
        post_list.append(" ".join(cleaned_posts))
    # Encode all labels in one call instead of once per row.
    label_list = list(lab_encoder.transform(df_copy['type']))
    return post_list, label_list

In [111]:
# Step 1: replace links with page titles. This issues HTTP requests and
# rewrites the post lists inside df_copy in place, so it must run first.
replace_links_title(df_copy)
# Step 2: apply the remaining text cleaning to the link-replaced posts;
# returns the cleaned text per row and the encoded type label per row.
posts, label = preprocess_data(df_copy)


0it [00:00, ?it/s][A
13it [00:00, 127.21it/s][A
26it [00:00, 127.79it/s][A
39it [00:00, 128.34it/s][A
51it [00:00, 124.95it/s][A
63it [00:00, 122.24it/s][A
75it [00:00, 120.46it/s][A
86it [00:00, 115.31it/s][A
97it [00:00, 112.18it/s][A
108it [00:00, 107.70it/s][A
119it [00:01, 107.87it/s][A
130it [00:01, 105.67it/s][A
141it [00:01, 102.32it/s][A
153it [00:01, 105.08it/s][A
166it [00:01, 110.30it/s][A
180it [00:01, 116.32it/s][A
194it [00:01, 122.40it/s][A
208it [00:01, 126.69it/s][A
222it [00:01, 129.64it/s][A
236it [00:01, 129.45it/s][A
250it [00:02, 127.38it/s][A
263it [00:02, 127.01it/s][A
276it [00:02, 115.68it/s][A
288it [00:02, 110.63it/s][A
300it [00:02, 109.65it/s][A
312it [00:02, 110.81it/s][A
324it [00:02, 112.69it/s][A
337it [00:02, 115.57it/s][A
349it [00:02, 116.37it/s][A
362it [00:03, 119.36it/s][A
374it [00:03, 114.74it/s][A
389it [00:03, 121.18it/s][A
402it [00:03, 121.59it/s][A
415it [00:03, 121.48it/s][A
429it [00:03, 125.96it/s][A

3637it [00:30, 89.86it/s][A
3648it [00:30, 94.40it/s][A
3658it [00:30, 89.43it/s][A
3668it [00:30, 87.49it/s][A
3677it [00:30, 84.43it/s][A
3687it [00:30, 87.50it/s][A
3699it [00:30, 94.03it/s][A
3713it [00:31, 103.15it/s][A
3727it [00:31, 110.09it/s][A
3741it [00:31, 116.48it/s][A
3754it [00:31, 119.75it/s][A
3767it [00:31, 119.09it/s][A
3782it [00:31, 124.87it/s][A
3796it [00:31, 126.85it/s][A
3809it [00:31, 124.14it/s][A
3822it [00:31, 119.52it/s][A
3835it [00:32, 121.44it/s][A
3848it [00:32, 121.97it/s][A
3862it [00:32, 124.80it/s][A
3875it [00:32, 122.86it/s][A
3888it [00:32, 123.24it/s][A
3902it [00:32, 126.93it/s][A
3915it [00:32, 117.43it/s][A
3927it [00:32, 116.97it/s][A
3939it [00:32, 116.26it/s][A
3952it [00:32, 118.52it/s][A
3964it [00:33, 117.40it/s][A
3979it [00:33, 123.92it/s][A
3992it [00:33, 124.89it/s][A
4006it [00:33, 127.58it/s][A
4020it [00:33, 130.39it/s][A
4034it [00:33, 123.08it/s][A
4047it [00:33, 123.75it/s][A
4060it [00:33, 12

7200it [00:59, 102.55it/s][A
7211it [00:59, 100.69it/s][A
7222it [00:59, 100.34it/s][A
7233it [00:59, 100.52it/s][A
7244it [00:59, 101.64it/s][A
7256it [01:00, 105.66it/s][A
7267it [01:00, 106.24it/s][A
7279it [01:00, 108.03it/s][A
7291it [01:00, 108.69it/s][A
7303it [01:00, 110.67it/s][A
7316it [01:00, 115.56it/s][A
7328it [01:00, 114.31it/s][A
7340it [01:00, 106.46it/s][A
7352it [01:00, 107.36it/s][A
7363it [01:01, 107.15it/s][A
7378it [01:01, 115.39it/s][A
7390it [01:01, 112.58it/s][A
7402it [01:01, 113.34it/s][A
7414it [01:01, 114.98it/s][A
7426it [01:01, 115.61it/s][A
7438it [01:01, 109.39it/s][A
7450it [01:01, 111.14it/s][A
7462it [01:01, 107.62it/s][A
7475it [01:02, 112.85it/s][A
7487it [01:02, 114.37it/s][A
7499it [01:02, 111.32it/s][A
7511it [01:02, 110.99it/s][A
7523it [01:02, 110.59it/s][A
7535it [01:02, 112.43it/s][A
7548it [01:02, 115.91it/s][A
7560it [01:02, 112.33it/s][A
7572it [01:02, 107.86it/s][A
7584it [01:03, 109.73it/s][A
7596it [01

In [128]:
# Persist the preprocessed posts together with their labels as JSON
data = {
    'posts': posts,
    # cast every label to a built-in int before dumping
    'types': list(map(int, label)),
}
with open("preprocessed_data.json", "w+") as fp:
    json.dump(data, fp)