## Getting the Articles
### Takes each article and converts them into a list of words to allow gensim to read it as a document

In [25]:
import gensim
import os, os.path, glob

forDir = 'articles/full_test'
forNumOfArticles = len([name for name in os.listdir(forDir) if os.path.isfile(os.path.join(forDir, name))])

againstDir = 'articles/against_test'
againstNumOfArticles = len([name for name in os.listdir(againstDir) if os.path.isfile(os.path.join(againstDir, name))])

# creates dataset A, B, and dataset including both A and B articles

articleCount = []
forArticleCount = []
againstArticleCount = []
path = 'articles/all_articles'
for filename in glob.glob(os.path.join(path, '*.txt')):
    with open(os.path.join(os.getcwd(), filename), 'r', encoding="utf8") as f:
        myList= f.read().split()
        articleCount.append(myList)

for filename in glob.glob(os.path.join(forDir, '*.txt')):
    with open(os.path.join(os.getcwd(), filename), 'r', encoding="utf8") as f:
        newList= f.read().split()
        forArticleCount.append(newList)
        
for filename in glob.glob(os.path.join(againstDir, '*.txt')):
    with open(os.path.join(os.getcwd(), filename), 'r', encoding="utf8") as f:
        testList= f.read().split()
        againstArticleCount.append(testList)

## Tags each article to allow the gensim library to use the doc2vec model

In [26]:
# tags each article for vectorization

def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

data_for_training = list(tagged_document(forArticleCount))
data_against_training = list(tagged_document(againstArticleCount))

## Trains the datasets using the doc2vec model, assigning a vector to each article/document

In [27]:
#  trains the datasets using the doc2vec model

forModel = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=40)
forModel.build_vocab(data_for_training)
forModel.train(data_for_training, total_examples=forModel.corpus_count, epochs=forModel.epochs)

againstModel = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=40)
againstModel.build_vocab(data_against_training)
againstModel.train(data_against_training, total_examples=againstModel.corpus_count, epochs=againstModel.epochs)

## Measures cosine similarity using gensim's most_similar() function

In [28]:
# using the corpus article, finds the cosine similarity of each article using most_similar() function that uses cosine similarity

for_scores = forModel.dv.most_similar(0,topn=len(forArticleCount))
against_scores = againstModel.dv.most_similar(0,topn=len(againstArticleCount))

## Converts cosine similarity ratio into percentage and display scores

In [29]:
# display the scores of each article using cosine similarity

for_article_number = []
for_score_value = []

for a, b in for_scores:
    for_article_number.append(a)
    for_score_value.append(b)

# convert the cosine similarity to percentage
for_score_final = [f'{i*100:.1f}%' for i in for_score_value]
    
for i in range(len(for_article_number)):
    print("Article #" + str(for_article_number[i]) + ", Score: " + str(for_score_final[i]))

Article #9, Score: 42.2%
Article #8, Score: 41.5%
Article #3, Score: 36.7%
Article #4, Score: 31.7%
Article #2, Score: 27.5%
Article #6, Score: 25.9%
Article #7, Score: 20.8%
Article #5, Score: 19.2%
Article #10, Score: 16.8%
Article #1, Score: 14.0%


In [30]:
against_article_number = []
against_score_value = []

for a, b in against_scores:
    against_article_number.append(a)
    against_score_value.append(b)

against_score_final = [f'{i*100:.1f}%' for i in against_score_value]
for i in range(len(against_article_number)):
    print("Article #" + str(against_article_number[i]) + ", Score: " + str(against_score_final[i]))

Article #9, Score: 61.4%
Article #6, Score: 51.1%
Article #8, Score: 45.9%
Article #10, Score: 33.3%
Article #7, Score: 29.9%
Article #4, Score: 21.7%
Article #3, Score: 10.3%
Article #5, Score: 9.2%
Article #1, Score: 6.1%
Article #2, Score: 5.5%


# Classify which dataset an article is from depending on if it scores higher than 50%

In [31]:
# classify each article according to how high they score using dataset A

for_articles = []
against_articles = []

for i in range(len(for_article_number)):
    if for_score_value[i] >= 0.5:
        for_articles.append(for_article_number[i])
    else:
        against_articles.append(for_article_number[i])
    


In [16]:
# classify each article according to how high they score using dataset B

new_for_articles = []
new_against_articles = []

for i in range(len(against_article_number)):
    if against_score_value[i] >= 0.5:
        new_against_articles.append(against_article_number[i])
    else:
        new_for_articles.append(against_article_number[i])

In [17]:
print("Articles that are for vaccines: " + ', '.join(map(str, for_articles)))
print("Articles that are against vaccines: " + ', '.join(map(str, against_articles)))

Articles that are for vaccines: 3
Articles that are against vaccines: 2, 4, 8, 5, 9, 6, 1, 7, 10


In [18]:
print("Articles that are for vaccines: " + ', '.join(map(str, new_for_articles)))
print("Articles that are against vaccines: " + ', '.join(map(str, new_against_articles)))

Articles that are for vaccines: 8, 10, 7, 4, 5, 3, 1, 2
Articles that are against vaccines: 9, 6


# Formal display of results

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = {'Article': for_article_number,
       'Score': for_score_final}

df = pd.DataFrame(data)

df


Unnamed: 0,Article,Score
0,3,58.5%
1,2,39.7%
2,4,35.7%
3,8,30.9%
4,5,22.4%
5,9,22.3%
6,6,16.5%
7,1,15.1%
8,7,12.5%
9,10,8.8%


In [20]:
new_data = {'Article': against_article_number,
           'Score': against_score_final}

new_df = pd.DataFrame(new_data)

new_df

Unnamed: 0,Article,Score
0,9,58.7%
1,6,53.0%
2,8,42.8%
3,10,35.3%
4,7,28.8%
5,4,20.0%
6,5,9.9%
7,3,8.4%
8,1,4.6%
9,2,4.0%


In [21]:
for_article_data = {'For_Article': for_articles}

for_article_class = pd.DataFrame(for_article_data)

for_article_class

Unnamed: 0,For_Article
0,3


In [22]:
against_article_data = {'Against_Article': against_articles}

against_article_class = pd.DataFrame(against_article_data)

against_article_class

Unnamed: 0,Against_Article
0,2
1,4
2,8
3,5
4,9
5,6
6,1
7,7
8,10


In [23]:
new_for_article_data = {'For_Article': new_for_articles}

new_for_article_class = pd.DataFrame(new_for_article_data)

new_for_article_class

Unnamed: 0,For_Article
0,8
1,10
2,7
3,4
4,5
5,3
6,1
7,2


In [24]:
new_against_article_data = {'Against_Article': new_against_articles}

new_against_article_class = pd.DataFrame(new_against_article_data)

new_against_article_class

Unnamed: 0,Against_Article
0,9
1,6
