# Training

## Imports

In [1]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric
import random

## Data
Data comes from [this academic source](http://fakenews.research.sfu.ca/).

In [2]:
df = pd.read_csv("data/snopes_phase2_clean_2018_7_3.csv")

In [3]:
raw_texts = list(df['original_article_text_phase2'])
labels = list(df['fact_rating_phase1'])

In [4]:
print('We have '+str(len(raw_texts))+' total texts in our dataset.')

We have 15804 total texts in our dataset.


In [5]:
def clean(doc):
    return strip_punctuation(doc).lower().split()

In [6]:
texts = [clean(doc) for doc in raw_texts]

The following creates TaggedDocument objects for each of the texts in the dataset, where each text is tagged by the fact rating (label),e.g. "true" or "false."

In [7]:
documents = [TaggedDocument(doc, [label]) for doc,label in zip(texts,labels)]
random.shuffle(documents)
n = len(documents)
split = n*7//10
train_corpus = documents[:split]
test_corpus = documents[split:]

## Model
The model is trained on the documents, with vector size of 100 (for each word), with a window of 10 (each word is predicted by the 10 words surrounding it). min_count = 2 means that every word will be used if it appears more than once.

In [8]:
model = Doc2Vec(vector_size=100, window=10, min_count=2, epochs=100)
model.build_vocab(train_corpus)

Train

In [9]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Save the model

In [20]:
model.save("models/my_doc2vec_model")
model = Doc2Vec.load("models/my_doc2vec_model")

# Evaluating

## Example

In [19]:
new_doc = 'hillary clinton won the presidential election'.split()
vector = model.infer_vector(new_doc)
print(vector)

[ 0.55356336  0.34289587  0.5039152   0.7559751   0.25213003  1.0299613
 -0.7082775  -0.5057964   0.11715787 -0.64546925 -0.16096687 -0.28979084
 -0.06682199 -0.19200891  0.03027644  0.5119883   0.5079666   0.4487274
  0.18696629  0.5818896   0.19151989 -0.0216381  -0.50858855 -0.27248672
  0.21002984  0.2447957  -0.36019418 -1.2572     -0.08455919 -0.04736549
 -1.3155012  -0.34274608  0.14596397  0.30664265 -0.6106777   0.75890523
 -0.39150646 -0.80904526  0.24119864 -0.6517878  -0.05164708 -0.66930896
 -0.05648576  0.727746   -0.7741949  -0.44230926  0.48399165 -0.0768225
  0.7821966   0.52545375 -0.06728771 -0.7721223   0.6785759  -0.3941588
  0.3337826  -0.00497984 -0.8542931   0.31506607  0.04707924 -0.3627375
  0.00781691 -0.36983645 -0.24101378  0.12170418  0.7448678  -0.26418227
  0.3660997   0.69943255 -0.15209723  0.4801735   0.01532328 -0.62834305
  0.16523278  0.29566127 -0.10229503 -0.7899027   0.5496639   0.0200149
 -0.60574204  0.6114056  -0.5372891   0.49396458  0.09204

100

## Assessment
We do the following to make sure the model is behaving in a useful way. For each document in the train corpus, we infer a new vector from the model, calculate the most similar document vectors in the model, and determine if the inferred vectors are closest to themselves in the model. ***rank*** will store the index of the correct document in the similarity list. We should see most of the documents ranked as the number one most similar document to themselves.

In [12]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(train_corpus[doc_id].tags[0])
    ranks.append(rank)
    second_ranks.append(sims[1])

In [13]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 7318, 1: 1382, 2: 696, 3: 425, 4: 288, 5: 224, 6: 189, 7: 146, 8: 127, 11: 94, 9: 90, 10: 83})


# Testing Stuff

In [14]:
a_vec_labels = []
for i in range(0,100):
    a_vec_labels.append('a_vec_'+str(i))

In [15]:
test = pd.DataFrame(columns=a_vec_labels)
test

Unnamed: 0,a_vec_0,a_vec_1,a_vec_2,a_vec_3,a_vec_4,a_vec_5,a_vec_6,a_vec_7,a_vec_8,a_vec_9,...,a_vec_90,a_vec_91,a_vec_92,a_vec_93,a_vec_94,a_vec_95,a_vec_96,a_vec_97,a_vec_98,a_vec_99


In [16]:
# DELETE THIS
vecs = []
for i,text in enumerate(test_corpus):
    e = list(model.infer_vector(text.words))
    vecs.append(e)
    print(e)
    break

[-0.1784278, 1.3330603, 2.6526241, 3.105145, 3.4260335, 3.037864, -1.2165773, 1.3921784, 0.49113676, -0.40299338, 0.17370708, 0.8171446, 1.4808908, -0.3434352, -0.3222706, 7.640425, 6.7384973, 2.4144955, -0.35059428, 3.1217365, -1.2044243, -0.19923551, -3.369364, 1.3040717, -2.2644765, 2.3929367, -0.33051476, -5.0254636, 2.792715, -2.4706166, -1.7206733, 1.4587978, 2.058685, 0.45824018, 1.0071696, 2.3725657, -0.44208062, -1.1700135, -2.0797873, 0.84374565, -1.3710631, 0.104454786, -0.36281177, 0.26372114, -0.07936885, 0.26037624, 1.960281, -2.0275617, 1.5468478, 2.1120186, 3.9751973, -0.40382567, 3.3880253, -3.4590626, 3.9543805, -1.0642099, -1.9151137, 6.028958, 2.7899141, 2.1637254, 0.5943705, -0.37828916, -2.6014645, 0.7266521, 3.2402802, -1.3518803, -2.5701818, 0.51912344, -0.93017673, 3.194452, -3.4080641, 1.992742, 4.116685, 0.71092325, 0.7308884, -2.843813, 0.3158367, -3.0270953, -4.4702854, 1.2037218, -3.7543416, -0.37810338, -2.4403706, -0.92183477, -2.6385128, 1.1209885, 2.44

In [17]:
test = pd.DataFrame(vecs,columns=a_vec_labels)

In [18]:
test

Unnamed: 0,a_vec_0,a_vec_1,a_vec_2,a_vec_3,a_vec_4,a_vec_5,a_vec_6,a_vec_7,a_vec_8,a_vec_9,...,a_vec_90,a_vec_91,a_vec_92,a_vec_93,a_vec_94,a_vec_95,a_vec_96,a_vec_97,a_vec_98,a_vec_99
0,-0.178428,1.33306,2.652624,3.105145,3.426033,3.037864,-1.216577,1.392178,0.491137,-0.402993,...,4.253487,0.351254,-0.451361,1.689859,-0.804004,1.771108,1.721184,-0.337564,2.360753,-1.226336
