### Importing Python data-processing and natural-language-processing libraries

In [1]:
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
import nltk
import string
import multiprocessing
import time
cores = multiprocessing.cpu_count()

In [2]:
from nltk.stem import WordNetLemmatizer
from sklearn import metrics 
from sklearn.metrics import classification_report
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import gensim.models.doc2vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TreebankWordTokenizer



In [3]:
# Stop early unless gensim's Doc2Vec module reports FAST_VERSION > -1.
# NOTE(review): in gensim, FAST_VERSION == -1 is understood to mean that only the
# slow non-compiled training path is available - confirm against the installed version.
assert gensim.models.doc2vec.FAST_VERSION > -1

In [4]:
## defining some helper functions

In [5]:
def remove_punc(post):
    '''Return post with punctuation characters and ASCII digits stripped out.

    Every character found in string.punctuation or in '0123456789' is dropped;
    all remaining characters (letters, whitespace, ...) keep their original order.
    '''
    unwanted = string.punctuation + '0123456789'
    kept = []
    for char in post:
        if char in unwanted:
            continue
        kept.append(char)
    return ''.join(kept)

In [6]:
def remove_stop_words(tokens, stop_words=None):
    '''function for removing stopwords from the list tokens

    Parameters
    ----------
    tokens : iterable of str
        The tokens to filter.
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is loaded,
        which is what this function always did before the parameter existed.
        Build the collection once and pass it in when calling this function
        many times (e.g. through DataFrame.apply) so the list is not reloaded
        and converted to a set on every single call.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    '''
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Set membership keeps the per-token lookup cheap whatever the caller passed.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)
    return [t for t in tokens if t not in stop_words]

In [7]:
# Load the raw training data; the following cells read its 'type' and 'posts' columns.
mbti = pd.read_csv('train.csv')

In [8]:
# Each row's 'posts' field packs several comments separated by '|||'.
# Unpack it into one (type, post) record per individual comment.
all_mbti = pd.DataFrame(
    [
        [personality, comment]
        for personality, packed_posts in zip(mbti['type'], mbti['posts'])
        for comment in packed_posts.split('|||')
    ],
    columns=['type', 'post'],
)

In [9]:
# Regex for http/https URLs: the scheme, then one or more URL characters
# (letters, digits, selected punctuation, or %XX percent-escapes).
# Note: inside the character class, `$-_` is a range running from '$' to '_',
# not three literal characters, so it matches more symbols than it appears to.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
# Placeholder substituted for every matched URL. Punctuation is stripped afterwards
# by remove_punc, so in the cleaned text it appears as 'urlweb'.
subs_url = r'url-web'

In [10]:
# Normalise the post text in a single chain: swap URLs for the placeholder,
# lower-case everything, then drop punctuation and digits.
all_mbti['post'] = (
    all_mbti['post']
    .replace(to_replace=pattern_url, value=subs_url, regex=True)
    .str.lower()
    .apply(remove_punc)
)

In [11]:
# Split every cleaned post into a list of word tokens with NLTK's Treebank tokenizer.
tokeniser = TreebankWordTokenizer()
all_mbti['tokens'] = all_mbti['post'].map(tokeniser.tokenize)

KeyboardInterrupt: 

In [None]:
# Keep only the posts whose cleaned text is something other than the bare
# URL placeholder ('url-web' with its hyphen stripped), then preview the result.
all_mbti = all_mbti.loc[all_mbti['post'].ne('urlweb')]
all_mbti.head()

In [None]:
# (rows, columns) of the per-post frame after the placeholder-only posts were removed.
all_mbti.shape

In [None]:
# Wrap each token list in a TaggedDocument whose single tag is the post's
# running position in all_mbti['tokens'], rendered as a string.
tagged_data = [
    TaggedDocument(words=token_list, tags=[str(position)])
    for position, token_list in enumerate(all_mbti['tokens'])
]

## The Doc2Vec model. 

Setting the model parameters and initialising the model. The parameters were mostly found by trial and error.

In [15]:
max_epochs = 50   # number of passes made by the outer training loop
vec_size = 50     # passed to Doc2Vec as vector_size
alpha = 0.040     # initial learning rate handed to the model

model = Doc2Vec(
    # architecture
    dm=1,
    vector_size=vec_size,
    window=4,
    negative=5,
    # vocabulary limits
    min_count=3,
    max_vocab_size=20000,
    # learning-rate range
    alpha=alpha,
    min_alpha=0.033,
    # use every CPU core reported by multiprocessing
    workers=cores,
)

In [16]:
# Build the vocabulary from the tagged documents before training;
# the model cannot be trained until its vocabulary exists.
model.build_vocab(tagged_data)

## Training the model

In [None]:
# Training loop: call train() max_epochs times, lowering the learning rate by
# hand after every call, then persist the trained model to disk.
# NOTE(review): the gensim documentation advises against calling train() in a
# loop with manual alpha management and recommends a single
# train(..., epochs=N) call instead. The manual schedule is kept here so that
# results stay comparable with the previously saved model.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    # `model.iter` was only a deprecated alias and no longer exists in gensim 4
    # (AttributeError); `model.epochs` is the supported attribute holding the same value.
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay within the next train() call
    model.min_alpha = model.alpha
model.save("first.model")
print("Model Saved")

In [12]:
# Reload the model written by model.save("first.model") so the evaluation
# cells below can run without repeating the training loop.
model= Doc2Vec.load("first.model")
print("Model Loaded")

Model Loaded


In [16]:
# Sanity check: list the words the model ranks as most similar to 'children'.
model.wv.most_similar('children')

[('kids', 0.7476826310157776),
 ('families', 0.6871461868286133),
 ('students', 0.6676808595657349),
 ('tension', 0.6578578948974609),
 ('legs', 0.6477729678153992),
 ('customers', 0.6355262994766235),
 ('colours', 0.6316623687744141),
 ('answers', 0.6242372989654541),
 ('girls', 0.6158963441848755),
 ('friends', 0.6112593412399292)]

The model seems well trained. Most of the top 10 words make logical sense as to why they would be similar to the word 'children'.

Words used in very different contexts should have very low similarity scores.

In [30]:
# Similarity score between the word vectors for 'dirty' and 'clean'.
model.wv.similarity(w1='dirty',w2='clean')

0.1291517880655949

Words used in similar contexts should have high similarity scores.

In [42]:
# Similarity score between the word vectors for 'husband' and 'wife'.
model.wv.similarity(w1='husband',w2='wife')

0.6842354163303089

In [14]:
# Analogy query: words close to 'woman' and 'king' but far from 'man'.
# NOTE(review): most_similar_cosmul is understood to rank candidates with gensim's
# multiplicative combination objective, which would explain scores above 1 - confirm in the gensim docs.
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('dark', 1.1452734470367432),
 ('roughly', 1.104595422744751),
 ('similar', 1.0799188613891602),
 ('large', 1.0694929361343384),
 ('confident', 1.0383901596069336),
 ('elephant', 1.02937650680542),
 ('k', 1.0130221843719482),
 ('bright', 1.0066474676132202),
 ('popular', 1.003062129020691),
 ('flip', 0.9994767904281616)]

OK, the famous "woman + king - man = queen" example didn't work. I suspect this is because our corpus is small: it contains only 300k tweets.