In [31]:
import gensim
import smart_open
import pprint
from tqdm.notebook import tqdm
import random

In [7]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [12]:
train_file = "../Data/train_pos.txt"
test_file = "../Data/test_data.txt"

In [13]:
train_corpus = list(read_corpus(train_file))
test_corpus = list(read_corpus(test_file, tokens_only=True))

In [16]:
print(train_corpus[:2])

[TaggedDocument(words=['user', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', 'but', 'hope', 'you', 'will', 'follow', 'me', 'believe'], tags=[0]), TaggedDocument(words=['because', 'your', 'logic', 'is', 'so', 'dumb', 'won', 'even', 'crop', 'out', 'your', 'name', 'or', 'your', 'photo', 'tsk', 'url'], tags=[1])]


In [17]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=20)

In [18]:
model.build_vocab(train_corpus)

In [20]:
len(model.wv.vocab)

19326

In [21]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [22]:
model.infer_vector(["hello", "i'm", "fine"])

array([ 0.20487756,  0.08899808, -0.0681494 ,  0.12627402,  0.04539736,
        0.00123082, -0.1307659 ,  0.16447656, -0.01045008,  0.14504078,
       -0.0656509 , -0.20671836,  0.09884079, -0.12041245,  0.05194834,
        0.08882205, -0.05043878,  0.1711762 , -0.16186517,  0.3403733 ,
        0.06447627,  0.13376349, -0.1303575 ,  0.1683445 ,  0.19058312,
        0.07654331,  0.03617232, -0.10046463, -0.06874552,  0.01534981,
        0.20827037,  0.09971488, -0.00719216, -0.3770539 ,  0.01683249,
        0.05633703,  0.05866466,  0.04396249,  0.07770579,  0.01865072,
        0.15823819, -0.07933328, -0.07400562,  0.16164783,  0.03261593,
        0.18935075,  0.03030867, -0.08947752,  0.07227089,  0.15786079],
      dtype=float32)

In [28]:
ranks = []
second_ranks = []
for doc_id in tqdm(range(len(train_corpus))):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=10)
    try:
        rank = [docid for docid, sim in sims].index(doc_id)
    except ValueError:
        rank = 11 
    ranks.append(rank)

    second_ranks.append(sims[1])


HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))


  3%|▎         | 2586/100000 [00:26<06:33, 247.76it/s][A




In [29]:
import collections

# See how well are we fitting the input data.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 77658, 11: 13292, 1: 3895, 2: 1611, 3: 937, 4: 687, 5: 519, 6: 413, 7: 387, 8: 334, 9: 267})


In [37]:
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=1)

print(test_corpus[doc_id])
print(sims[0])
print(train_corpus[sims[0][0]].words)

['user', 'aw', 'ino', 'need', 'to', 'be', 'back', 'soon', 'haha', 'but', 'still', 'love', 'it', 'despite', 'the', 'dodgy', 'acting', 'and', 'gabriella', 'uh', 'ha', 'aw', 'wish', 'could', 'sing', 'xx']
(54394, 0.6134401559829712)
['user', 'lol', 'must', 'admit', 'do', 'use', 'that', 'sometimes', 'but', 'not', 'that', 'often', 'hope']
