In [16]:
import numpy as np
import pandas as pd
import re, string, unicodedata
import nltk
import inflect
import gc
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [17]:
df_data = pd.read_csv('mbti_1.csv')
df_data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [18]:
df_data.posts = df_data.posts.replace(r'[|||]', r' ', regex=True)
df_types = pd.DataFrame(df_data.drop('posts', 1))

In [19]:
df_types['is_E'] = df_data['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
df_types['is_S'] = df_data['type'].apply(lambda x: 1 if x[1] == 'S' else 0)
df_types['is_T'] = df_data['type'].apply(lambda x: 1 if x[2] == 'T' else 0)
df_types['is_J'] = df_data['type'].apply(lambda x: 1 if x[3] == 'J' else 0)
df_types.columns = ['type','is_E','is_S','is_T','is_J']

In [20]:
#Cleaning the data Round 1
def clean_text_round1(text):
    '''Make text lowercase, remove punctuation, remove words containing numbers, remove URL, remove @, '''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[^a-zA-Z\s]', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub("\s+", '', text)
    text = text.lstrip()
    text = text.rstrip()
    return text

round1 = lambda x: clean_text_round1(x)

data_posts_clean = pd.DataFrame(df_data.posts.apply(round1))
data_posts_clean

Unnamed: 0,posts
0,enfpandintjmomentssportscenternottoptenplayspr...
1,imfindingthelackofmeinthesepostsveryalarmingse...
2,goodoneofcoursetowhichisayiknowthatsmyblessing...
3,dearintpienjoyedourconversationtheotherdayesot...
4,yourefiredthatsanothersillymisconceptionthatap...
...,...
8670,ixfpjustbecauseialwaysthinkofcatsasfidomsforso...
8671,soifthisthreadalreadyexistssomeplaceelsewhichi...
8672,somanyquestionswhenidothesethingsiwouldtakethe...
8673,iamveryconflictedrightnowwhenitcomestowantingc...


In [21]:
# Tokenization - this is taken from SpaCy

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
stemmer = LancasterStemmer()

def tokenize(s): 
    tokens = re_tok.sub(r' \1 ', s).split()
    return stemmer.stemWords(tokens)

In [22]:
#Remove stopword and also added the MBTI types to it
stop = set(stopwords.words('english'))
stop.update(['INFP','INFJ','INTP','INTJ','ENTP','ENFP','ISTP','ISFP','ENTJ','ISTJ','ENFJ','ISFJ','ESTP','ESFP','ESFJ','ESTJ'])

def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [23]:

# load the FastText vectors in a dictionary:
from tqdm import tqdm

embeddings_index_FastText = {}
f = open('crawl-300d-2M.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        continue
    embeddings_index_FastText[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_FastText))

1999996it [01:55, 17245.20it/s]

Found 1999996 word vectors.





In [24]:
# Function to create a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower()
    words = tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())