In [1]:
import pandas as pd 
import numpy as np
from tqdm import tqdm
import re
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.test.utils import common_texts
from numpy import asarray
from numpy import save

  from pandas import Panel


In [2]:
# Import the datasets for train and test 
train_data = pd.read_csv('../data/fnc-1/preprocess_train.csv')
test_data = pd.read_csv('../data/fnc-1/preprocess_test.csv')

In [3]:
train_data

Unnamed: 0,bodyId,articleHeading,articleBody,articleStance
0,0,soldier shot parliament lock gunfir erupt war ...,small meteorit crash wood area nicaragua capit...,unrelated
1,0,tourist dub spider man spider burrow skin day,small meteorit crash wood area nicaragua capit...,unrelated
2,0,luke somer kill fail rescu attempt yemen,small meteorit crash wood area nicaragua capit...,unrelated
3,0,break soldier shot war memori ottawa,small meteorit crash wood area nicaragua capit...,unrelated
4,0,giant 8ft 9in catfish weigh 19 stone caught it...,small meteorit crash wood area nicaragua capit...,unrelated
...,...,...,...,...
49967,2532,pizza deliveri man get tip 2000 singl deliveri,ann arbor mich pizza deliveri man michigan got...,agree
49968,2532,pizza deliveri man get 2000 tip,ann arbor mich pizza deliveri man michigan got...,agree
49969,2532,luckiest pizza deliveri guy ever get 2000 tip,ann arbor mich pizza deliveri man michigan got...,agree
49970,2532,ann arbor pizza deliveri driver surpris 2000 tip,ann arbor mich pizza deliveri man michigan got...,agree


In [4]:
# Get Labeled Sentences 
def get_labeled_sentences(data, label_type):
    labeled_data = list()
    for i, val in enumerate(data):
        label = label_type + '_' + str(i)
        labeled_data.append(TaggedDocument(val.split(), [label]))
    return labeled_data

In [5]:
train_articleHeading = get_labeled_sentences(train_data['articleHeading'], 'trainArticleHeading')
train_articleBody = get_labeled_sentences(train_data['articleBody'], 'trainArticleBody')
test_articleHeading = get_labeled_sentences(test_data['articleHeading'], 'testArticleHeading')
test_articleBody = get_labeled_sentences(test_data['articleBody'], 'testArticleBody')

In [6]:
train_articleHeading[0]

TaggedDocument(words=['soldier', 'shot', 'parliament', 'lock', 'gunfir', 'erupt', 'war', 'memori'], tags=['trainArticleHeading_0'])

In [7]:
train_articleBody[0]

TaggedDocument(words=['small', 'meteorit', 'crash', 'wood', 'area', 'nicaragua', 'capit', 'managua', 'overnight', 'govern', 'said', 'sunday', 'resid', 'report', 'hear', 'mysteri', 'boom', 'left', '16foot', 'deep', 'crater', 'near', 'citi', 'airport', 'associ', 'press', 'report', 'govern', 'spokeswoman', 'rosario', 'murillo', 'said', 'committe', 'form', 'govern', 'studi', 'event', 'determin', 'rel', 'small', 'meteorit', 'appear', 'come', 'asteroid', 'pas', 'close', 'earth', 'house', 'asteroid', '2014', 'rc', 'measur', '60', 'foot', 'diamet', 'skim', 'earth', 'weekend', 'abc', 'news', 'report', 'murillo', 'said', 'nicaragua', 'ask', 'intern', 'expert', 'help', 'local', 'scientist', 'understand', 'happen', 'crater', 'left', 'meteorit', 'radiu', '39', 'foot', 'depth', '16', 'foot', 'said', 'humberto', 'saballo', 'volcanologist', 'nicaraguan', 'institut', 'territori', 'studi', 'committe', 'said', 'still', 'clear', 'meteorit', 'disintegr', 'buri', 'humberto', 'garcia', 'astronomi', 'center',

In [8]:
test_articleHeading[0]

TaggedDocument(words=['appl', 'instal', 'safe', 'instor', 'protect', 'gold', 'watch', 'edit'], tags=['testArticleHeading_0'])

In [9]:
test_articleBody[0]

TaggedDocument(words=['alsisi', 'deni', 'isra', 'report', 'state', 'offer', 'extend', 'gaza', 'strip'], tags=['testArticleBody_0'])

In [10]:
# Creating the doc2vec model 
vector_dimension = 300 
# window: The maximum distance between the current and predicted word within a sentence.
# alpha: initial learning rate
text_model_dbow = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension, sample=1e-4, negative=5, epochs=10, seed=1, alpha=0.065, min_alpha=0.065)
# Creating the vocabulary on the train data 
vocabulary_train_data = train_articleHeading + train_articleBody + test_articleHeading + test_articleBody
# Build the vocabulary, number of unique words in heading and body columns of the training dataset
text_model_dbow.build_vocab([x for x in tqdm(vocabulary_train_data)])

100%|██████████| 150770/150770 [00:00<00:00, 1267602.53it/s]


In [11]:
# Train the doc2vec model for 20 epochs 
for epoch in range(20):
    # Using the utils method to shuffle the entire dataset 
    text_model_dbow.train(utils.shuffle([i for i in tqdm(vocabulary_train_data)]), total_examples=len(vocabulary_train_data), epochs=1)
    # Initialise the learning rate to decrease by 0.002 on each epoch 
    text_model_dbow.alpha -= 0.002
    text_model_dbow.min_alpha = text_model_dbow.alpha

100%|██████████| 150770/150770 [00:00<00:00, 2253059.47it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2333608.43it/s]
100%|██████████| 150770/150770 [00:00<00:00, 1826433.53it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2751946.17it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2717415.58it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2343302.07it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2670999.74it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2804697.80it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2838716.58it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2827761.87it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2714615.95it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2843989.18it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2767288.42it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2795720.55it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2769263.71it/s]
100%|██████████| 150770/150770 [00:00<00:00, 2723606.87it/s]
100%|██████████| 150770/

In [12]:
# Use the trained doc2vec model to get the vectors for the train and test data 
train_size = len(train_articleHeading)
test_size = len(test_articleHeading)
# Initialise the training and test vectors as 0 
train_articleHeading_arrays = np.zeros((train_size, vector_dimension))
train_articleBody_arrays = np.zeros((train_size, vector_dimension))
test_articleHeading_arrays = np.zeros((test_size, vector_dimension))
test_articleBody_arrays = np.zeros((test_size, vector_dimension))
# Method to generate vectors for train and test data 
def generate_doc2vec_vectors(vectors, text_model, data_size, vector_type):
    for i in range(data_size):
        title = vector_type + '_' + str(i)
        vectors[i] = text_model.docvecs[title]
    return vectors

In [13]:
# Get the vectors for train article headings 
train_articleHeading_vectors = generate_doc2vec_vectors(train_articleHeading_arrays, text_model_dbow, train_size, 'trainArticleHeading')
# Get the vectors for train article body
train_articleBody_vectors = generate_doc2vec_vectors(train_articleBody_arrays, text_model_dbow, train_size, 'trainArticleBody')
# Get the vectors for test article headings 
test_articleHeading_vectors = generate_doc2vec_vectors(test_articleHeading_arrays, text_model_dbow, test_size, 'testArticleHeading')
# Get the vectors for test article body
test_articleBody_vectors = generate_doc2vec_vectors(test_articleBody_arrays, text_model_dbow, test_size, 'testArticleBody')

In [14]:
# Create doc2vec vector for all the documents in a defined order and append them together.
train_featured_vectors = np.squeeze(np.c_[train_articleHeading_vectors, train_articleBody_vectors])
test_featured_vectors = np.squeeze(np.c_[test_articleHeading_vectors, test_articleBody_vectors])

In [15]:
# Convert the final featured vectors into numpy array
train_featured_vectors = np.array(train_featured_vectors)
test_featured_vectors = np.array(test_featured_vectors)

In [108]:
# Save the X_train and Y_train for doc2vec models 
save('../data/fnc-1/x_train_doc2vec.npy', train_featured_vectors)
# Save the X_test numpy array 
save('../data/fnc-1/x_test_doc2vec.npy', test_featured_vectors)