### Importing Python data-processing and natural-language-processing libraries

In [1]:
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
import nltk
import string
import multiprocessing
import time
# CPU count reported by the OS; passed to Doc2Vec as its `workers` argument further down.
cores = multiprocessing.cpu_count()

In [2]:
from nltk.stem import WordNetLemmatizer
from sklearn import metrics 
from sklearn.metrics import classification_report
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import gensim.models.doc2vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TreebankWordTokenizer



In [3]:
# Fail fast with an explanatory message instead of a bare AssertionError.
# NOTE(review): gensim sets FAST_VERSION to -1 when its compiled training
# routines could not be imported — confirm against the installed version.
assert gensim.models.doc2vec.FAST_VERSION > -1, (
    "gensim's optimised (compiled) Doc2Vec routines are unavailable; "
    "training would fall back to the slow pure-Python path"
)

In [4]:
## Defining some helper functions

In [5]:
def remove_punc(post):
    '''Return `post` with all ASCII punctuation and decimal digits removed.

    Every character of `post` that appears in `string.punctuation` or in
    '0123456789' is dropped; all other characters (including whitespace)
    are kept in their original order.
    '''
    unwanted = string.punctuation + '0123456789'
    return ''.join(filter(lambda ch: ch not in unwanted, post))

In [6]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    '''Return NLTK's English stop words as a frozenset, loading them only once.'''
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(tokens, stop_words=None):
    '''Return the tokens from `tokens` that are not stop words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; order and duplicates are preserved.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used;
        it is loaded once and cached rather than being rebuilt on every call.

    Returns
    -------
    list
        The tokens that are not in the stop-word collection.
    '''
    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Normalise lists/tuples so each membership test is a hash lookup.
        stop_words = frozenset(stop_words)
    return [t for t in tokens if t not in stop_words]

In [7]:
# Load the raw training data from the working directory; the following cell
# reads its 'type' and 'posts' columns.
mbti = pd.read_csv('train.csv')

In [8]:
# Expand each row's '|||'-delimited 'posts' string into one row per post,
# repeating that row's 'type' label. Iterating the two columns directly avoids
# the per-row Series that iterrows() builds, and `all_mbti` is only ever a
# DataFrame instead of first being a list.
all_mbti = pd.DataFrame(
    [
        (mbti_type, comment)
        for mbti_type, posts in zip(mbti['type'], mbti['posts'])
        for comment in posts.split('|||')
    ],
    columns=['type', 'post'],
)

In [9]:
# Regex for http:// or https:// URLs: the scheme followed by one or more
# letters, digits, listed symbols or %XX hex escapes.
# NOTE(review): inside `[$-_@.&+]` the `$-_` part is a character *range*
# (from '$' through '_'), not three literal characters, so that class accepts
# more characters than it appears to — confirm this is intended.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
# Placeholder substituted for every matched URL. The posts are passed through
# remove_punc afterwards, which strips the '-', so the cleaned text contains
# 'urlweb'.
subs_url = r'url-web'

In [10]:
# Clean the raw posts in one pass: swap URLs for the placeholder, lower-case
# the text, then strip punctuation and digits.
all_mbti['post'] = (
    all_mbti['post']
    .replace(to_replace=pattern_url, value=subs_url, regex=True)
    .str.lower()
    .apply(remove_punc)
)

In [11]:
# Tokenise every cleaned post with NLTK's TreebankWordTokenizer and keep the
# resulting token lists in a new 'tokens' column.
tokeniser = TreebankWordTokenizer()
all_mbti['tokens'] = all_mbti['post'].apply(tokeniser.tokenize)

In [12]:
# Keep only the posts whose cleaned text is something other than the bare
# URL placeholder, then preview the result.
all_mbti = all_mbti.loc[all_mbti['post'].ne('urlweb')]
all_mbti.head()

Unnamed: 0,type,post,tokens
2,INFJ,enfp and intj moments urlweb sportscenter no...,"[enfp, and, intj, moments, urlweb, sportscente..."
3,INFJ,what has been the most lifechanging experience...,"[what, has, been, the, most, lifechanging, exp..."
4,INFJ,urlweb urlweb on repeat for most of today,"[urlweb, urlweb, on, repeat, for, most, of, to..."
5,INFJ,may the perc experience immerse you,"[may, the, perc, experience, immerse, you]"
6,INFJ,the last thing my infj friend posted on his fa...,"[the, last, thing, my, infj, friend, posted, o..."


In [13]:
# Wrap each token list in a TaggedDocument whose single tag is the post's
# running position (as a string) in the filtered frame.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(all_mbti['tokens'])
]

In [14]:
## Defining the Doc2Vec model. 

Setting the model parameters and initialising the model. The parameters were mostly found by trial and error.

In [15]:
# Hyper-parameters shared with the training cell.
max_epochs = 50   # number of passes made by the manual training loop
vec_size = 50     # passed to Doc2Vec as vector_size
alpha = 0.040     # initial learning rate

# Untrained Doc2Vec model; the vocabulary is built and training is run in the
# cells that follow.
model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    window=4,
    min_alpha=0.033,
    min_count=3,
    dm=1,
    max_vocab_size=20000,
    workers=cores,
    negative=5,
)

In [16]:
# Build the model's vocabulary from the tagged corpus. This must run before
# model.train(); otherwise training will fail.
model.build_vocab(tagged_data)

In [17]:
## Training the model

In [18]:
# Manual training loop: every pass calls train() for `model.epochs` internal
# epochs and then lowers the learning rate by a fixed step.
# NOTE(review): gensim's documentation describes a single train() call with an
# explicit `epochs` as the recommended usage; the multi-call schedule is kept
# here so the trained vectors are unchanged — revisit separately.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                # `model.iter` is a deprecated alias (removed in gensim 4.0);
                # `model.epochs` is the supported attribute.
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # pin min_alpha to alpha so the next train() call runs at a fixed rate
    # (no decay inside the call)
    model.min_alpha = model.alpha
model.save("first.model")
print("Model Saved")

iteration 0


  """


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
Model Saved


In [19]:
# Reload the model persisted by the training cell, so the message below is
# true and the following cells work from the saved copy.
model = Doc2Vec.load("first.model")
print("Model Loaded")

Model Loaded


In [20]:
# Query the word vectors through `model.wv`; calling most_similar() directly
# on the Doc2Vec model is deprecated and removed in gensim 4.0.
model.wv.most_similar('king')

  """Entry point for launching an IPython kernel.


[('athletic', 0.5511487126350403),
 ('serious', 0.5334610939025879),
 ('prescribed', 0.5171880125999451),
 ('volumes', 0.4798150062561035),
 ('cliffs', 0.47889411449432373),
 ('sudden', 0.47873201966285706),
 ('spiderman', 0.4672902226448059),
 ('kind', 0.46360328793525696),
 ('kant', 0.45119261741638184),
 ('laidback', 0.44028735160827637)]