# Training

## Imports

In [1]:
import collections
import random
from pathlib import Path

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric

## Data
Data comes from [this academic source](http://fakenews.research.sfu.ca/).

In [2]:
# Read the cleaned Snopes export; the path is relative to the notebook's
# working directory.
snopes_csv_path = "data/snopes_phase2_clean_2018_7_3.csv"
df = pd.read_csv(snopes_csv_path)

In [3]:
# Pull the article bodies and their fact ratings out of the frame as plain
# Python lists (same row order, so position i in both refers to one article).
raw_texts = df['original_article_text_phase2'].tolist()
labels = df['fact_rating_phase1'].tolist()

In [4]:
# Report the dataset size as a quick sanity check on the load.
n_texts = len(raw_texts)
print('We have ' + str(n_texts) + ' total texts in our dataset.')

We have 15804 total texts in our dataset.


In [5]:
def clean(doc):
    """Tokenize a raw article into lowercase words with punctuation removed.

    Parameters
    ----------
    doc : str
        Raw article text. ``None`` and float NaN (the value pandas produces
        for an empty CSV cell) are treated as an empty document.

    Returns
    -------
    list of str
        Lowercased, whitespace-separated tokens; ``[]`` for a missing document.
    """
    # `doc != doc` is true only for NaN. Missing values are mapped to an empty
    # token list instead of being handed to strip_punctuation, which expects text.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return []
    return strip_punctuation(doc).lower().split()

In [6]:
# Tokenize every article with `clean`, preserving the original order.
texts = list(map(clean, raw_texts))

The following creates a TaggedDocument object for each text in the dataset, where each text is tagged with its fact rating (label), e.g. "true" or "false".

In [7]:
# Wrap each tokenized article in a TaggedDocument whose single tag is the
# article's fact rating, so articles with the same rating share a tag.
documents = [TaggedDocument(doc, [label]) for doc, label in zip(texts, labels)]

# Shuffle with a dedicated fixed-seed generator so the train/test split is
# identical on every Restart & Run All, without touching the global `random`
# state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(documents)

n = len(documents)
split = n*7//10  # integer cut point: first 70% -> train, remainder -> test
train_corpus = documents[:split]
test_corpus = documents[split:]

## Model
The model is trained on the training documents for 100 epochs, with a vector size of 100 (the dimensionality of the learned vectors) and a window of 10 (the maximum distance between the current word and a predicted word). min_count = 2 means that words appearing fewer than two times in the training corpus are ignored.

In [8]:
# Hyperparameters collected in one place so they are easy to read and tweak.
doc2vec_params = dict(vector_size=100, window=10, min_count=2, epochs=100)
model = Doc2Vec(**doc2vec_params)

# Build the vocabulary from the training split only; the test split stays unseen.
model.build_vocab(train_corpus)

Train

In [9]:
# Fit the model on the training split, reusing the document count recorded by
# build_vocab and the epoch count configured on the model.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

Save the trained model to disk, or load a previously saved copy

In [9]:
# Persist the trained model the first time the notebook runs; on later runs
# reuse the copy on disk so downstream cells always see the same vectors.
model_path = Path("models/my_doc2vec_model")
if model_path.exists():
    # NOTE: as in the original cell, this replaces the in-memory model trained
    # above with the saved one. Delete the file to force a fresh save.
    model = Doc2Vec.load(str(model_path))
else:
    # Create the target directory if needed so save() cannot fail on a fresh checkout.
    model_path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(model_path))

# Evaluating

## Example

In [13]:
# Tokenize the query with the same `clean` step applied to the training texts,
# so the example stays consistent with training even if the sentence is later
# edited to contain capitals or punctuation.
new_doc = clean('hillary clinton won the presidential election')

# Infer a document vector for the unseen sentence and show it.
vector = model.infer_vector(new_doc)
print(vector)

[ 0.9404052   0.47744063  0.53421813  0.66232985  0.28171626  1.017354
 -0.84455884 -0.39268035  0.07428476 -0.79790145 -0.12265771 -0.4231448
 -0.07662332 -0.22598988  0.01171068  0.58670026  0.62161267  0.3769406
  0.14629868  0.6340019   0.2909536   0.11042444 -0.5684845  -0.33001187
  0.224797    0.21839188 -0.34626693 -1.1993206  -0.04392567 -0.07876993
 -1.3451405  -0.28440332  0.17690015  0.34891653 -0.5191822   0.7707534
 -0.27425253 -0.84567475  0.04765292 -0.73873806 -0.3549434  -0.8706137
 -0.01893645  0.75383514 -0.6347843  -0.5896135   0.45323756 -0.22950776
  0.72223383  0.6458022  -0.0560001  -0.8117853   0.7797085  -0.4714741
  0.42436132 -0.18374477 -0.8277907   0.38557872  0.22068419 -0.14774607
 -0.02490484 -0.37784976 -0.41929457  0.09866143  0.63371545 -0.1872806
  0.4446957   0.6320312  -0.16217197  0.5129655   0.01808596 -0.6853428
  0.33292666  0.08106186 -0.11431783 -0.8593764   0.6780839   0.03162329
 -0.5577183   0.6695166  -0.47466102  0.3819133  -0.04767067

## Assessment
We do the following to make sure the model is behaving in a useful way. For each document in the train corpus, we infer a new vector from the model, rank every tag vector stored in the model by its similarity to that inferred vector, and look up where the document's own tag falls in that ranking. Because each document is tagged with its fact rating, ***rank*** stores the position of the correct label in the similarity list (0 means most similar), and ***ranks*** collects these positions for the whole train corpus. We should see most of the documents ranked as number one, i.e. most similar to their own label.

In [12]:
# For every training document: re-infer its vector, rank all stored tags by
# similarity to that vector, and record where the document's own tag lands.
ranks = []
second_ranks = []
n_tags = len(model.docvecs)
for train_doc in train_corpus:
    inferred = model.infer_vector(train_doc.words)
    # (tag, similarity) pairs for every stored tag, most similar first.
    neighbours = model.docvecs.most_similar([inferred], topn=n_tags)
    ordered_tags = [tag for tag, _ in neighbours]
    # Position 0 means the document's own tag was the closest match.
    ranks.append(ordered_tags.index(train_doc.tags[0]))
    # Keep the runner-up (tag, similarity) pair for later inspection.
    second_ranks.append(neighbours[1])

In [13]:
# Tally how many training documents landed at each rank position
# (0 = the document's own tag was the most similar one).
# `collections` is imported in the notebook's top import cell.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 8662, 1: 1245, 2: 554, 3: 259, 4: 154, 5: 77, 6: 42, 7: 25, 8: 22, 9: 18, 10: 4})


# Testing Stuff

In [15]:
# Column names a_vec_0 ... a_vec_99, one per component of a 100-dimensional vector.
a_vec_labels = ['a_vec_' + str(idx) for idx in range(100)]

In [16]:
# Create an empty frame with one column per vector component and display it
# to confirm the column layout.
test = pd.DataFrame(columns=a_vec_labels)
test

Unnamed: 0,a_vec_0,a_vec_1,a_vec_2,a_vec_3,a_vec_4,a_vec_5,a_vec_6,a_vec_7,a_vec_8,a_vec_9,...,a_vec_90,a_vec_91,a_vec_92,a_vec_93,a_vec_94,a_vec_95,a_vec_96,a_vec_97,a_vec_98,a_vec_99


In [28]:
# Scratch cell (flagged for deletion by its author): infers and prints the
# vector for the FIRST test document only, as a smoke test of the pipeline.
vecs = []
for text in test_corpus[:1]:
    e = list(model.infer_vector(text.words))
    vecs.append(e)
    print(e)

[5.110512, -0.65592986, 0.20836602, 0.70076805, 0.4907212, 2.3698783, -0.6267344, -1.0828326, -2.7821813, -1.268279, 0.41016048, -0.079039365, -0.028961234, 2.9258268, -0.14932479, 0.7232984, -3.1125193, 1.4003577, -0.07166648, -0.743049, 1.0612937, -1.5159053, -2.7612288, -2.3630278, 0.70225316, -2.5888743, -3.5680034, -2.2784598, 0.262927, -4.4516673, -2.6072607, -0.79051894, 1.1706922, 0.83756584, -4.775812, 7.1229606, -0.22308028, -1.8293043, -0.4676226, -1.7573675, 0.23633519, -2.2035124, 0.8874312, -0.13073745, -0.8329144, -0.5429555, -1.6758429, -0.52289486, 3.0283427, 2.1907792, -1.7132385, -3.6814685, 4.7607384, -4.0084343, 6.727845, -0.7473339, -1.6317383, 5.0585017, 2.2816267, -1.1080445, 0.32366565, -2.5909634, 1.0620666, -1.0166601, 1.029357, -1.5898114, -1.7993814, 4.5236235, -4.0189924, 3.0700517, -0.054084707, 0.86749595, -1.7806597, 0.50445664, 2.2609172, -3.2808983, 0.86614966, 2.4901724, 5.4935184, -0.2522444, -3.9711046, -1.9295852, -3.0441294, -1.9789611, -0.804959

In [29]:
# Assemble the inferred vectors into a frame: one row per vector, one column
# per component.
test = pd.DataFrame(data=vecs, columns=a_vec_labels)

In [30]:
# Display the frame as the cell's output.
test

Unnamed: 0,a_vec_0,a_vec_1,a_vec_2,a_vec_3,a_vec_4,a_vec_5,a_vec_6,a_vec_7,a_vec_8,a_vec_9,...,a_vec_90,a_vec_91,a_vec_92,a_vec_93,a_vec_94,a_vec_95,a_vec_96,a_vec_97,a_vec_98,a_vec_99
0,5.110512,-0.65593,0.208366,0.700768,0.490721,2.369878,-0.626734,-1.082833,-2.782181,-1.268279,...,0.684166,-1.87054,-0.382607,-0.05427,2.426356,-0.993193,-1.128066,-0.274844,0.920782,-3.091875
