# 1. Explorative analysis

#### 1.1 Exploring the dataset, inspecting which relevant variables are present, and selecting variables of interest

In [1]:
#importing all the necessary packages for the explorative analysis of the news dataset
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import spacy
import csv

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

!pip install pyLDAvis
import gensim
from gensim import corpora
from gensim import models

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import WordEmbeddingSimilarityIndex

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import numpy as np
import random
from glob import glob
from string import punctuation

import random
from dateutil import parser
from tqdm import tqdm

random.seed(2022)
np.random.seed(2022)



In [2]:
#reading the csv file news into a dataset using pandas
newsarticles = pd.read_csv('/Users/luxvoorzanger/Library/CloudStorage/OneDrive-UvA/MINOR/CCS2/group project/articles_data.csv', sep=',', encoding='utf-8')

In [3]:
#checking the shape of the dataset
newsarticles.shape

(10437, 15)

In [4]:
#looking at the top five rows of the news dataset
newsarticles.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,WASHINGTON (Reuters) - The National Transporta...,0.0,0.0,0.0,2528.0,0.0
1,1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,The States jobless rate fell to 5.2 per cent l...,0.0,6.0,10.0,2.0,0.0
2,2,the-irish-times,The Irish Times,Deirdre McQuillan,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,https://www.irishtimes.com/\t\t\t\t\t\t\t/life...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T14:40:00Z,Louise Kennedy is showing off her autumn-winte...,1.0,,,,
3,3,al-jazeera-english,Al Jazeera English,Al Jazeera,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,https://www.aljazeera.com/news/2019/09/north-k...,https://www.aljazeera.com/mritems/Images/2019/...,2019-09-03T17:25:39Z,"Han Kwang Song, the first North Korean footbal...",0.0,0.0,0.0,7.0,0.0
4,4,bbc-news,BBC News,BBC News,UK government lawyer says proroguing parliamen...,"The UK government's lawyer, David Johnston arg...",https://www.bbc.co.uk/news/av/uk-scotland-4956...,https://ichef.bbci.co.uk/news/1024/branded_new...,2019-09-03T14:39:21Z,,0.0,0.0,0.0,0.0,0.0


In [5]:
#checking the columns of the news dataset
newsarticles.columns

Index(['Unnamed: 0', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'top_article', 'engagement_reaction_count', 'engagement_comment_count',
       'engagement_share_count', 'engagement_comment_plugin_count'],
      dtype='object')

#### We have decided upon building a content-based recommender system for this dataset of news articles.
#### The columns which might be of interest for this are:
* "source_id" (column value indicates publisher unique identifier usually presented as lowercase source_name with spaces replaced with underscore symbol)
* "source_name" (column value indicates publisher name)
* "title" (column value indicates headline of an article)
* "description" (column value indicates short article description usually visible in popups or recommendation boxes on the publisher's website. This field is shortened to a few sentences content column)
* "content" (column value indicates the unformatted content of the article. This field is truncated to 260 characters)

In [6]:
#making a new column called "NewsID" to index the news articles
newsarticles.rename(columns={'Unnamed: 0' : 'NewsID'}, inplace=True)

In [7]:
#checking whether the new column has been added into the dataframe
newsarticles.head()

Unnamed: 0,NewsID,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,WASHINGTON (Reuters) - The National Transporta...,0.0,0.0,0.0,2528.0,0.0
1,1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,The States jobless rate fell to 5.2 per cent l...,0.0,6.0,10.0,2.0,0.0
2,2,the-irish-times,The Irish Times,Deirdre McQuillan,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,https://www.irishtimes.com/\t\t\t\t\t\t\t/life...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T14:40:00Z,Louise Kennedy is showing off her autumn-winte...,1.0,,,,
3,3,al-jazeera-english,Al Jazeera English,Al Jazeera,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,https://www.aljazeera.com/news/2019/09/north-k...,https://www.aljazeera.com/mritems/Images/2019/...,2019-09-03T17:25:39Z,"Han Kwang Song, the first North Korean footbal...",0.0,0.0,0.0,7.0,0.0
4,4,bbc-news,BBC News,BBC News,UK government lawyer says proroguing parliamen...,"The UK government's lawyer, David Johnston arg...",https://www.bbc.co.uk/news/av/uk-scotland-4956...,https://ichef.bbci.co.uk/news/1024/branded_new...,2019-09-03T14:39:21Z,,0.0,0.0,0.0,0.0,0.0


In [8]:
#checking the datatypes of the columns
newsarticles.dtypes

NewsID                               int64
source_id                           object
source_name                         object
author                              object
title                               object
description                         object
url                                 object
url_to_image                        object
published_at                        object
content                             object
top_article                        float64
engagement_reaction_count          float64
engagement_comment_count           float64
engagement_share_count             float64
engagement_comment_plugin_count    float64
dtype: object

In [9]:
#checking for missing values in the dataframe
newsarticles.isna().sum()

NewsID                                0
source_id                             0
source_name                           0
author                             1020
title                                 2
description                          24
url                                   1
url_to_image                        656
published_at                          1
content                            1292
top_article                           2
engagement_reaction_count           118
engagement_comment_count            118
engagement_share_count              118
engagement_comment_plugin_count     118
dtype: int64

In [10]:
#removing missing values from rows that appear in either the description or title columns, as these must have no
#missing values for the analyses we will execute later
newsarticles = newsarticles.dropna(subset=['description', 'title'])

In [11]:
#checking whether the previous command worked, which it did. the other columns can have missing values as we
#will not use these later on in the analysis
newsarticles.isna().sum()

NewsID                                0
source_id                             0
source_name                           0
author                             1005
title                                 0
description                           0
url                                   1
url_to_image                        643
published_at                          1
content                            1274
top_article                           2
engagement_reaction_count           118
engagement_comment_count            118
engagement_share_count              118
engagement_comment_plugin_count     118
dtype: int64

In [12]:
#looking into the column 'description'
newsarticles['description']

0        The National Transportation Safety Board said ...
1        Latest monthly figures reflect continued growt...
2        Autumn-winter collection features designer’s g...
3        Han is the first North Korean player in the Se...
4        The UK government's lawyer, David Johnston arg...
                               ...                        
10432    Get breaking national and world news, broadcas...
10433    The announcement by Julius Baer this week that...
10434    Weston Newswanger is just a normal 5-year-old ...
10435    A detective is haunted by the case of two wome...
10436    Who wanted one-time millionaire Lanny Horwitz ...
Name: description, Length: 10411, dtype: object

#### 1.2 In this step, we are preprocessing the data using feature engineering

In [13]:
#converting the description column in to a list
descriptionlist = newsarticles['description'].tolist()

In [14]:
#lowercasing everything in the list
desclistlower = [c.lower() for c in descriptionlist]

In [15]:
#checking whether the list conversion has worked and also checking whether everything is lowercase
print(desclistlower[:10])

["the national transportation safety board said tuesday a tesla model s was in autopilot mode when it struck a fire truck in culver city, california -- one of a series of crashes the board is investigating involving tesla's driver assistance system.", 'latest monthly figures reflect continued growth in headline employment', 'autumn-winter collection features designer’s glittering take on black watch tartan', 'han is the first north korean player in the serie a and was praised during his appearances during youth world cups.', "the uk government's lawyer, david johnston argued that proroguing parliament was a political decision for the government, rather than a legal matter for the court to decide.", '"this tender land" by william kent krueger is an affecting story about growing up and overcoming a childhood filled with neglect, abuse and racism during the depression.', 'the european union is waiting to see if british lawmakers block brexit before giving britain concessions to strike a n

In [16]:
#removing all stopwords from our list
mystopwords = stopwords.words("english")
descnostopwords = [" ".join([w for w in c.split() if w not in mystopwords]) for c in desclistlower]

In [17]:
#checking whether stopword removal worked, which can be confirmed when looking at the output below
descnostopwords[:10]

["national transportation safety board said tuesday tesla model autopilot mode struck fire truck culver city, california -- one series crashes board investigating involving tesla's driver assistance system.",
 'latest monthly figures reflect continued growth headline employment',
 'autumn-winter collection features designer’s glittering take black watch tartan',
 'han first north korean player serie praised appearances youth world cups.',
 "uk government's lawyer, david johnston argued proroguing parliament political decision government, rather legal matter court decide.",
 '"this tender land" william kent krueger affecting story growing overcoming childhood filled neglect, abuse racism depression.',
 'european union waiting see british lawmakers block brexit giving britain concessions strike new withdrawal agreement, prime minister boris johnson said.',
 "earnings revenue expectations european companies third quarter improved slightly, although region still expected corporate recessio

In [18]:
#lemmatizing the list after stopword removal
nlp = spacy.load("en_core_web_sm")
lemmatizeddescription = [" ".join([w.lemma_ for w in nlp(c)]) for c in descnostopwords]

In [19]:
#now we are removing the punctuation from the list and tokenizing it 
tokenizer = RegexpTokenizer(r'\w+')
descnopunctuations = [tokenizer.tokenize(c) for c in lemmatizeddescription]

In [20]:
#checking whether pruning has worked
descnopunctuations[:5]

[['national',
  'transportation',
  'safety',
  'board',
  'say',
  'tuesday',
  'tesla',
  'model',
  'autopilot',
  'mode',
  'strike',
  'fire',
  'truck',
  'culver',
  'city',
  'california',
  'one',
  'series',
  'crash',
  'board',
  'investigate',
  'involve',
  'tesla',
  's',
  'driver',
  'assistance',
  'system'],
 ['late',
  'monthly',
  'figure',
  'reflect',
  'continued',
  'growth',
  'headline',
  'employment'],
 ['autumn',
  'winter',
  'collection',
  'feature',
  'designer',
  's',
  'glitter',
  'take',
  'black',
  'watch',
  'tartan'],
 ['han',
  'first',
  'north',
  'korean',
  'player',
  'serie',
  'praise',
  'appearance',
  'youth',
  'world',
  'cup'],
 ['uk',
  'government',
  's',
  'lawyer',
  'david',
  'johnston',
  'argue',
  'prorogue',
  'parliament',
  'political',
  'decision',
  'government',
  'rather',
  'legal',
  'matter',
  'court',
  'decide']]

In [21]:
#we have decided to remove the character s from our list as well, as you can see in the previous output that it appears randomly
#and is not informative at all, it might only inflate as a token and hereby distort our results
#we believe that it appeared as a result of the stopword removal, so we have decided to take it out manually before we move on to the next step
tokenizedlist = [([w for w in c if w != "s"]) for c in descnopunctuations]

# 1.3 Inductive analysis
We decided to use a simple counter to explore which tokens are most common in our dataset, although this was not very informative when looking at the output. Hence, we decided to opt for a more advanced TfidfVectorizer approach to make sense of our data, which we will do in the following steps.

In [22]:
#inductive analysis using a simple counter, which was not very informative
from collections import Counter

bottomupapproach = []
for c in tokenizedlist:
    bottomupapproach.append(Counter(c).most_common(1))
print(bottomupapproach)



In [23]:
#using a TfidfVectorizer in a sparse format, as it is more computationally efficient and less memory is required
tokens2 = [x for c in tokenizedlist for x in c] 
Vec = TfidfVectorizer(max_df = .75, min_df = 2) 
Vec_fit = Vec.fit_transform(tokens2)

In [24]:
#checking the shape of this sparse matrix 
Vec_fit.shape

(179199, 9693)

In [25]:
#sparse matrix descriptions
print("Number of non-zero elements:", Vec_fit.sum())
print("Total number of elements:", Vec_fit.shape[0] * Vec_fit.shape[1])

# compute the sparsity of the matrix: w the proportion of zero elements in the matrix
print("Sparsity:", 1 - Vec_fit.sum() / (Vec_fit.shape[0] * Vec_fit.shape[1]))

Number of non-zero elements: 168708.0
Total number of elements: 1736975907
Sparsity: 0.99990287257335


In [26]:
#applying soft-cosine similarity
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

news1 = ' '.join(tokenizedlist[0])
news2 = ' '.join(tokenizedlist[1])
news3 = ' '.join(tokenizedlist[2])
dictionary = corpora.Dictionary([simple_preprocess(news) for news in [news1, news2, news3]])
bag_of_words_vectors = [ dictionary.doc2bow(simple_preprocess(news)) for news in [news1, news2, news3]] 
similarity_index = WordEmbeddingSimilarityIndex(fasttext_model300)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary)

#between news1 and news2
scm_news1_news2 = similarity_matrix.inner_product(bag_of_words_vectors[0], bag_of_words_vectors[1], normalized=(True, True))

#between news1 and news3
scm_news1_news3 = similarity_matrix.inner_product(bag_of_words_vectors[0], bag_of_words_vectors[2], normalized=(True, True))

#between news2 and news3
scm_news2_news3 = similarity_matrix.inner_product(bag_of_words_vectors[1], bag_of_words_vectors[2], normalized=(True, True))

print(f"SCM between:\nnews1 <-> news2: {scm_news1_news2:.2f}\nnews1 <-> news3: {scm_news1_news3:.2f}\nnews2 <-> news3: {scm_news2_news3:.2f}")

100%|███████████████████████████████████████████| 42/42 [00:07<00:00,  5.64it/s]

SCM between:
news1 <-> news2: 0.00
news1 <-> news3: 0.00
news2 <-> news3: 0.04





# Interpretation:
* There is a large variety of words in the news articles, based on the SCM results
* Thus we also expect there to be many different topics present in the data
* The matrix is very sparse because the sparsity is very high, ≈ 0.99

#### 1.4 Data description
#### We would like to provide a clear description of the variables we end up working with.
#### Initially, we had selected these columns for our content-based recommender system:
* Source_id
* Source_name
* title
* description
* content

Upon reflection, we believe that 'description' and 'title' are the most relevant for additional insights and our recommender system. We are aiming to recommend news article titles based on their description, using cosine similarity. In the following step, we are providing some extra descriptives for both relevant variables, as well as for the news source.

In [27]:
#descriptives of sources
newsarticles['source_name'].mode()

0    Reuters
Name: source_name, dtype: object

In [28]:
#descriptives of sources
newsarticles['source_name'].value_counts()

source_name
Reuters                    1252
BBC News                   1242
The Irish Times            1230
ABC News                   1139
CNN                        1114
Business Insider           1048
The New York Times          984
CBS News                    952
Newsweek                    539
Al Jazeera English          495
The Wall Street Journal     333
ESPN                         82
460.0                         1
Name: count, dtype: int64

In [29]:
#descriptives of sources in percentages
newsarticles['source_name'].value_counts(normalize=True)

source_name
Reuters                    0.120257
BBC News                   0.119297
The Irish Times            0.118144
ABC News                   0.109404
CNN                        0.107002
Business Insider           0.100663
The New York Times         0.094515
CBS News                   0.091442
Newsweek                   0.051772
Al Jazeera English         0.047546
The Wall Street Journal    0.031985
ESPN                       0.007876
460.0                      0.000096
Name: proportion, dtype: float64

In [30]:
#average length of an article description
newsarticles['description'].str.len().mean()

157.89069253673998

In [31]:
#average length of an article title
newsarticles['title'].str.len().mean()

66.79483238881951

In [32]:
#datatypes of the relevant variables, both contain strings
newsarticles[['title', 'source_name', 'description']].dtypes

title          object
source_name    object
description    object
dtype: object

In [33]:
#info overview of both variables, but not very informative
newsarticles[['title', 'source_name', 'description']].info

<bound method DataFrame.info of                                                    title         source_name   
0      NTSB says Autopilot engaged in 2018 California...             Reuters  \
1           Unemployment falls to post-crash low of 5.2%     The Irish Times   
2      Louise Kennedy AW2019: Long coats, sparkling t...     The Irish Times   
3      North Korean footballer Han joins Italian gian...  Al Jazeera English   
4      UK government lawyer says proroguing parliamen...            BBC News   
...                                                  ...                 ...   
10432  Drop in US service sector activity raises econ...            ABC News   
10433  Banker defections pose challenge for Credit Su...             Reuters   
10434  A 5-year-old cancer survivor donates 3,000 toy...                 CNN   
10435                                 Fateful Connection            CBS News   
10436                             Love, Hate & Obsession            CBS News   

       

# 1.5 Topic modelling
We have decided to select the 10 most prominent topics in our dataset, otherwise there would be too much variety for a simple content-based algorithm. We believe that the 10 most prominent topics will give us the ability to provide accurate recommendations and keep a clear overview of the data. We will execute three LDA codes using a CountVectorizer, TfidfVectorizer and one using uni- and bigrams to see which fits our data best

In [34]:
# LDA implementation with CountVectorizer
raw_m1 = tokenizedlist
id2word_m1 = corpora.Dictionary(raw_m1)   
ldacorpus_m1 = [id2word_m1.doc2bow(c) for c in raw_m1] 

lda_m1 = models.LdaModel(ldacorpus_m1, id2word=id2word_m1, num_topics=10) 
lda_m1.print_topics()

[(0,
  '0.015*"say" + 0.011*"president" + 0.009*"business" + 0.009*"deal" + 0.009*"story" + 0.008*"democratic" + 0.007*"brexit" + 0.006*"house" + 0.006*"minister" + 0.006*"insider"'),
 (1,
  '0.017*"say" + 0.011*"one" + 0.008*"company" + 0.007*"we" + 0.007*"new" + 0.006*"year" + 0.006*"trump" + 0.005*"big" + 0.005*"president" + 0.005*"time"'),
 (2,
  '0.021*"new" + 0.009*"year" + 0.008*"say" + 0.008*"not" + 0.007*"com" + 0.007*"www" + 0.006*"make" + 0.006*"the" + 0.005*"one" + 0.005*"do"'),
 (3,
  '0.028*"trump" + 0.024*"president" + 0.015*"say" + 0.014*"ukraine" + 0.010*"donald" + 0.009*"officer" + 0.008*"inquiry" + 0.007*"house" + 0.007*"state" + 0.007*"call"'),
 (4,
  '0.019*"year" + 0.017*"police" + 0.013*"old" + 0.012*"murder" + 0.011*"say" + 0.008*"report" + 0.008*"man" + 0.008*"woman" + 0.007*"kill" + 0.007*"two"'),
 (5,
  '0.011*"saudi" + 0.010*"disney" + 0.008*"oil" + 0.006*"wife" + 0.006*"say" + 0.006*"attack" + 0.006*"two" + 0.006*"u" + 0.005*"cold" + 0.005*"infrastructure"'

In [35]:
# LDA implementation with TfidfVectorizer
raw_m2 = tokenizedlist
id2word_m2 = corpora.Dictionary(raw_m2)  
ldacorpus_m2 = [id2word_m2.doc2bow(c) for c in tokenizedlist]

tfidfcorpus_m = models.TfidfModel(ldacorpus_m2) 

lda_m2 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m[ldacorpus_m2],id2word=id2word_m2,num_topics=10) 
lda_m2.print_topics(num_words=10)

[(0,
  '0.006*"say" + 0.005*"murder" + 0.005*"thursday" + 0.005*"new" + 0.004*"police" + 0.004*"deal" + 0.004*"minister" + 0.004*"prime" + 0.003*"year" + 0.003*"one"'),
 (1,
  '0.007*"president" + 0.006*"trump" + 0.005*"biden" + 0.005*"thursday" + 0.005*"say" + 0.004*"donald" + 0.004*"abortion" + 0.004*"trade" + 0.004*"federal" + 0.004*"u"'),
 (2,
  '0.006*"wife" + 0.004*"killer" + 0.004*"48" + 0.003*"hour" + 0.003*"investigate" + 0.003*"connecticut" + 0.003*"year" + 0.003*"say" + 0.003*"case" + 0.003*"report"'),
 (3,
  '0.003*"quarter" + 0.003*"story" + 0.003*"new" + 0.003*"animal" + 0.003*"hire" + 0.002*"seriously" + 0.002*"light" + 0.002*"year" + 0.002*"miss" + 0.002*"hear"'),
 (4,
  '0.010*"ukraine" + 0.006*"committee" + 0.006*"impeachment" + 0.004*"saudi" + 0.004*"martin" + 0.004*"infrastructure" + 0.004*"knife" + 0.003*"trump" + 0.003*"hide" + 0.003*"reward"'),
 (5,
  '0.008*"inquiry" + 0.004*"impeachment" + 0.004*"poor" + 0.004*"intelligence" + 0.004*"probe" + 0.003*"training" +

In [36]:
# LDA implementation with N-grams 
cleaned = [' '.join(tokens) for tokens in tokenizedlist] 
ngramnews = [["_".join(tup) for tup in nltk.ngrams(c.split(),2)] for c in cleaned] 

newsgramsuniandbi = []
for a,b in zip([c.split() for c in cleaned],ngramnews):
    newsgramsuniandbi.append(a + b)

id2word_m3 = corpora.Dictionary(newsgramsuniandbi)
id2word_m3.filter_extremes(no_below=5, no_above=0.5)

ldacorpus_m3 = [id2word_m3.doc2bow(item) for item in newsgramsuniandbi]
tfidfcorpus_m3 = models.TfidfModel(ldacorpus_m3)

lda_m3 = models.ldamodel.LdaModel(corpus=tfidfcorpus_m3[ldacorpus_m3],id2word=id2word_m3,num_topics=10)
lda_m3.print_topics(num_words=10)

[(0,
  '0.004*"training" + 0.004*"ease" + 0.004*"complaint" + 0.004*"like" + 0.003*"army" + 0.003*"source" + 0.003*"seriously" + 0.003*"first" + 0.003*"cold" + 0.003*"era"'),
 (1,
  '0.007*"staff" + 0.006*"inquiry" + 0.005*"northern_ireland" + 0.005*"northern" + 0.004*"hotel" + 0.004*"shortage" + 0.004*"court_rule" + 0.004*"rule" + 0.004*"belfast" + 0.004*"miami"'),
 (2,
  '0.005*"disney" + 0.005*"officer" + 0.004*"paris" + 0.004*"french" + 0.004*"home" + 0.003*"injure" + 0.003*"knife" + 0.003*"sunday" + 0.003*"kill" + 0.003*"year"'),
 (3,
  '0.008*"president" + 0.007*"trump" + 0.006*"say" + 0.005*"impeachment" + 0.005*"donald" + 0.005*"donald_trump" + 0.005*"proposal" + 0.004*"prime_minister" + 0.004*"minister" + 0.004*"prime"'),
 (4,
  '0.005*"new" + 0.004*"fight" + 0.004*"chief" + 0.004*"executive" + 0.004*"carbon" + 0.004*"say" + 0.003*"state" + 0.003*"finance" + 0.003*"miss" + 0.003*"15"'),
 (5,
  '0.007*"police" + 0.006*"thursday" + 0.005*"year" + 0.005*"say" + 0.005*"year_old" +

In [37]:
# Model evaluation

cm1 = models.CoherenceModel(model=lda_m1, corpus=ldacorpus_m1 , dictionary=id2word_m1, coherence='u_mass')  
ch1 = cm1.get_coherence()
cm2 = models.CoherenceModel(model=lda_m2, corpus=ldacorpus_m2, dictionary= id2word_m2, coherence='u_mass')  
ch2 = cm2.get_coherence()

cm3 = models.CoherenceModel(model=lda_m3, corpus=tfidfcorpus_m3[ldacorpus_m3], dictionary= id2word_m3, coherence='u_mass')
ch3 = cm3.get_coherence()

print(f"Coherence of naive model = {ch1}\nCoherence of tfidf model = {ch2}\nCoherence of bigram and unigram model = {ch3}")

Coherence of naive model = -6.389039419340122
Coherence of tfidf model = -12.187215688807273
Coherence of bigram and unigram model = -10.079072738887199


#### Model fit:
* The higher the coherence score, the better the model fit
* The coherence of the tfidf model fits best, when looking at the numeric value
* Hence, we are visualizing the topics using the tfidf model in the next step

In [38]:
vis_data = gensimvis.prepare(lda_m2,ldacorpus_m2,id2word_m2)
pyLDAvis.display(vis_data)

# 2. Recommender system

In [39]:
#defining get data function
PATH = '/Users/luxvoorzanger/Library/CloudStorage/OneDrive-UvA/MINOR/CCS2/group project/articles_data.csv'
def get_data(path_to_data):
    data1 = pd.read_csv(path_to_data)
    samplenewsarticles = data1.sample(n=2000, random_state=42)
    samplenewsarticles.index = [i for i in range(0,len(samplenewsarticles))]
    return samplenewsarticles

In [40]:
#now my dataset of interest is called data
data = get_data(PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,8005,cbs-news,CBS News,Emily Tillett,"300 former officials call out Trump for ""uncon...",Group of bipartisan national security experts ...,https://www.cbsnews.com/news/donald-trump-ukra...,https://cbsnews2.cbsistatic.com/hub/i/r/2017/0...,2019-09-27T12:17:01Z,More than 300 former national security officia...,0.0,14330.0,6414.0,4197.0,0.0
1,9495,cnn,CNN,"Jason Hanna and Aaron Cooper, CNN",WWII-era bomber crashes at an airport near Har...,A World War II-era aircraft crashed Wednesday ...,https://www.cnn.com/2019/10/02/us/connecticut-...,https://cdn.cnn.com/cnnnext/dam/assets/1910021...,2019-10-02T14:35:10Z,,0.0,3373.0,988.0,1265.0,0.0
2,7428,business-insider,Business Insider,"lramsey@businessinsider.com (Lydia Ramsey), Ly...",Dispensed: Amazon and Best Buy's expanding hea...,"REUTERS/Joshua Roberts Hello, There must be so...",https://www.businessinsider.com/dispensed-week...,https://image.businessinsider.com/5c2f7f05bd77...,2019-09-27T14:13:56Z,"Hello,\r\nThere must be something in the water...",0.0,0.0,0.0,1627.0,0.0
3,8727,the-new-york-times,The New York Times,Michelle Goldberg,Trump’s Claims About Biden Aren’t ‘Unsupported...,The president’s accusations turn reality on it...,https://www.nytimes.com/2019/09/30/opinion/tru...,https://static01.nyt.com/images/2019/09/30/opi...,2019-10-01T00:16:20Z,Joe Biden appears to have been uncomfortable w...,0.0,4988.0,881.0,3127.0,0.0
4,9063,al-jazeera-english,Al Jazeera English,Anthony Langat,"For rural Kenyans, treating snakebites is an u...",Insufficient anti-venom distribution and poor ...,https://www.aljazeera.com/indepth/features/rur...,https://www.aljazeera.com/mritems/Images/2019/...,2019-10-02T05:29:09Z,"Baringo, Kenya - Simotwo village in Baringo, a...",0.0,3.0,2.0,28.0,0.0


In [41]:
#checking whether the random sample worked
len(data)

2000

In [42]:
data['combined'] = data[['title', 'description']].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [43]:
#checking whether creating the new column worked
data.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count,combined
0,8005,cbs-news,CBS News,Emily Tillett,"300 former officials call out Trump for ""uncon...",Group of bipartisan national security experts ...,https://www.cbsnews.com/news/donald-trump-ukra...,https://cbsnews2.cbsistatic.com/hub/i/r/2017/0...,2019-09-27T12:17:01Z,More than 300 former national security officia...,0.0,14330.0,6414.0,4197.0,0.0,"300 former officials call out Trump for ""uncon..."
1,9495,cnn,CNN,"Jason Hanna and Aaron Cooper, CNN",WWII-era bomber crashes at an airport near Har...,A World War II-era aircraft crashed Wednesday ...,https://www.cnn.com/2019/10/02/us/connecticut-...,https://cdn.cnn.com/cnnnext/dam/assets/1910021...,2019-10-02T14:35:10Z,,0.0,3373.0,988.0,1265.0,0.0,WWII-era bomber crashes at an airport near Har...
2,7428,business-insider,Business Insider,"lramsey@businessinsider.com (Lydia Ramsey), Ly...",Dispensed: Amazon and Best Buy's expanding hea...,"REUTERS/Joshua Roberts Hello, There must be so...",https://www.businessinsider.com/dispensed-week...,https://image.businessinsider.com/5c2f7f05bd77...,2019-09-27T14:13:56Z,"Hello,\r\nThere must be something in the water...",0.0,0.0,0.0,1627.0,0.0,Dispensed: Amazon and Best Buy's expanding hea...
3,8727,the-new-york-times,The New York Times,Michelle Goldberg,Trump’s Claims About Biden Aren’t ‘Unsupported...,The president’s accusations turn reality on it...,https://www.nytimes.com/2019/09/30/opinion/tru...,https://static01.nyt.com/images/2019/09/30/opi...,2019-10-01T00:16:20Z,Joe Biden appears to have been uncomfortable w...,0.0,4988.0,881.0,3127.0,0.0,Trump’s Claims About Biden Aren’t ‘Unsupported...
4,9063,al-jazeera-english,Al Jazeera English,Anthony Langat,"For rural Kenyans, treating snakebites is an u...",Insufficient anti-venom distribution and poor ...,https://www.aljazeera.com/indepth/features/rur...,https://www.aljazeera.com/mritems/Images/2019/...,2019-10-02T05:29:09Z,"Baringo, Kenya - Simotwo village in Baringo, a...",0.0,3.0,2.0,28.0,0.0,"For rural Kenyans, treating snakebites is an u..."


In [44]:
#using a TdIdfVectorizer on the new column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined'])

In [45]:
#creating a new column called combined features
def combine_features(data): 
    data['combined_features'] = data[['description', 'title']].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
    return data

In [46]:
#checking whether creating the new column worked
data.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count,combined
0,8005,cbs-news,CBS News,Emily Tillett,"300 former officials call out Trump for ""uncon...",Group of bipartisan national security experts ...,https://www.cbsnews.com/news/donald-trump-ukra...,https://cbsnews2.cbsistatic.com/hub/i/r/2017/0...,2019-09-27T12:17:01Z,More than 300 former national security officia...,0.0,14330.0,6414.0,4197.0,0.0,"300 former officials call out Trump for ""uncon..."
1,9495,cnn,CNN,"Jason Hanna and Aaron Cooper, CNN",WWII-era bomber crashes at an airport near Har...,A World War II-era aircraft crashed Wednesday ...,https://www.cnn.com/2019/10/02/us/connecticut-...,https://cdn.cnn.com/cnnnext/dam/assets/1910021...,2019-10-02T14:35:10Z,,0.0,3373.0,988.0,1265.0,0.0,WWII-era bomber crashes at an airport near Har...
2,7428,business-insider,Business Insider,"lramsey@businessinsider.com (Lydia Ramsey), Ly...",Dispensed: Amazon and Best Buy's expanding hea...,"REUTERS/Joshua Roberts Hello, There must be so...",https://www.businessinsider.com/dispensed-week...,https://image.businessinsider.com/5c2f7f05bd77...,2019-09-27T14:13:56Z,"Hello,\r\nThere must be something in the water...",0.0,0.0,0.0,1627.0,0.0,Dispensed: Amazon and Best Buy's expanding hea...
3,8727,the-new-york-times,The New York Times,Michelle Goldberg,Trump’s Claims About Biden Aren’t ‘Unsupported...,The president’s accusations turn reality on it...,https://www.nytimes.com/2019/09/30/opinion/tru...,https://static01.nyt.com/images/2019/09/30/opi...,2019-10-01T00:16:20Z,Joe Biden appears to have been uncomfortable w...,0.0,4988.0,881.0,3127.0,0.0,Trump’s Claims About Biden Aren’t ‘Unsupported...
4,9063,al-jazeera-english,Al Jazeera English,Anthony Langat,"For rural Kenyans, treating snakebites is an u...",Insufficient anti-venom distribution and poor ...,https://www.aljazeera.com/indepth/features/rur...,https://www.aljazeera.com/mritems/Images/2019/...,2019-10-02T05:29:09Z,"Baringo, Kenya - Simotwo village in Baringo, a...",0.0,3.0,2.0,28.0,0.0,"For rural Kenyans, treating snakebites is an u..."


In [47]:
#importing sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [48]:
#calculating cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

In [49]:
# printing cosine similarity scores for the 1st article
cosine_sim[0]

array([1.        , 0.        , 0.        , ..., 0.01688088, 0.        ,
       0.        ])

In [50]:
#printing the second title
print(data['title'][1])

WWII-era bomber crashes at an airport near Hartford, Connecticut


In [51]:
#checking whether this title indeed matches the second index
indices = pd.Series(data.index, index = data['title'])
index = indices["WWII-era bomber crashes at an airport near Hartford, Connecticut"]
print(index)

1


In [52]:
sim_scores = list(enumerate(cosine_sim[index])) 
sim_scores

[(0, 0.0),
 (1, 0.9999999999999999),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.03644026974630519),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.015349192635091146),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0365824987362373),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0643328837990694),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.012468378502122584),
 (25, 0.0),
 (26, 0.03008136672901245),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.12923154185871533),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.022635929716165033),
 (49, 0.1953342704506539),
 (50, 0.0),
 (51, 0.023819787386803508),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.047411051014454836),
 (56, 0.0),
 (57, 0.0),
 (58, 0.008324298372831976),
 (59, 0.012912185137173616),
 (60, 0.0),
 (61, 0.0),
 (62, 0.015509059431905696),
 (63, 0.01

In [53]:
#sorting the list based on cosine scores while still keeping the track of indexes

sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores

[(1, 0.9999999999999999),
 (116, 0.47918781521764914),
 (1249, 0.44366215434257616),
 (1189, 0.26507450829663964),
 (227, 0.25412439929086106),
 (1060, 0.2022385091009911),
 (49, 0.1953342704506539),
 (1161, 0.17134090308503547),
 (1103, 0.16105980902685613),
 (33, 0.12923154185871533),
 (1360, 0.1264936352967295),
 (1480, 0.11048658403180922),
 (1524, 0.10664986775871706),
 (1823, 0.10052235339419627),
 (748, 0.09529974683836168),
 (1137, 0.09489842916237508),
 (1535, 0.08922135063942176),
 (1547, 0.08406802755578768),
 (813, 0.08321476508144017),
 (321, 0.0709728311148241),
 (1287, 0.06882740107482839),
 (1999, 0.06774702709926668),
 (19, 0.0643328837990694),
 (1261, 0.06408340307390177),
 (941, 0.06237127392890341),
 (1111, 0.06067373897273845),
 (558, 0.057560511796705495),
 (1053, 0.05689272884618431),
 (147, 0.05549950850025293),
 (501, 0.05549950850025293),
 (1662, 0.05513672093794474),
 (296, 0.0531303418917515),
 (55, 0.047411051014454836),
 (1050, 0.04507288423009831),
 (1940

In [54]:
#similarity scores are really low in our dataset, therefore we only keep first 10 (most similar ones)
sim_scores = sim_scores[1:11]
sim_scores

[(116, 0.47918781521764914),
 (1249, 0.44366215434257616),
 (1189, 0.26507450829663964),
 (227, 0.25412439929086106),
 (1060, 0.2022385091009911),
 (49, 0.1953342704506539),
 (1161, 0.17134090308503547),
 (1103, 0.16105980902685613),
 (33, 0.12923154185871533),
 (1360, 0.1264936352967295)]

In [55]:
#checking to which newsarticles these indices belong
news_indices = [i[0] for i in sim_scores]
news_indices

[116, 1249, 1189, 227, 1060, 49, 1161, 1103, 33, 1360]

In [56]:
#looking up the titles of the newsarticles
data.iloc[news_indices]['title']

116     The WWII-era plane that crashed belonged to th...
1249    Antique Plane Crashes at Bradley International...
1189    3 dead, 3 hurt in small plane crash near Michi...
227     Connecticut official says there are fatalities...
1060    Former police officer, insurance analyst among...
49         Catering truck spins out of control at airport
1161    The Priority Pass program gets you access to m...
1103    The Latest: Airman aboard B-17 opened hatch, a...
33      US woman arrested at Manila airport with baby ...
1360    Hurricane Dorian Flight Cancellations Reach Ne...
Name: title, dtype: object

In [59]:
#final code for the recommender system
def transform_data(data):

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['combined_features'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    return cosine_sim

def recommender(title, data, transformed_data):

    indices = pd.Series(data.index, index = data['title'])
    index = indices[title]

    sim_scores = list(enumerate(transformed_data[index]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]

    news_indices = [i[0] for i in sim_scores]


    news_title = data['title'].iloc[news_indices]
    news_description = data['description'].iloc[news_indices]

    recommendation = pd.DataFrame(columns=['title', 'description'])

    recommendation['title'] = news_title
    recommendation['description'] = news_description

    return recommendation

def results(insert_news_name):
    data = get_data(PATH)
    data_with_combined_features = combine_features(data)
    transformed_data = transform_data(data_with_combined_features)

    if insert_news_name not in data['title'].unique():
        return 'Title not in Database'

    else:
        recommendations = recommender(insert_news_name, data_with_combined_features, transformed_data)
        return recommendations.to_dict('records')

In [60]:
#it is working!!
results("WWII-era bomber crashes at an airport near Hartford, Connecticut")

[{'title': 'The WWII-era plane that crashed belonged to the Collings Foundation, which showcases historic aircraft',
  'description': 'A World War II-era Boeing B-17 bomber aircraft crashed this morning while trying to land at Bradley International Airport in Windsor Locks, Connecticut. The vintage aircraft was a civilian registered aircraft with the Collings Foundation.'},
 {'title': 'Antique Plane Crashes at Bradley International Airport Outside Hartford',
  'description': 'The crash at the airport in Connecticut involved an antique military aircraft. There were no immediate reports of casualties.'},
 {'title': '3 dead, 3 hurt in small plane crash near Michigan airport',
  'description': 'Authorities say three people have died and three others were injured after a single-engine plane crashed near Capital Regional International Airport in mid-Michigan'},
 {'title': 'Connecticut official says there are fatalities in crash of World War II-era plane with 13 people aboard',
  'description

In [None]:
results("WWII-era bomber crashes at an airport near Hartford, Connecticut")

In [61]:
print(data['title'][7])

US mass shooters exploited gaps, errors in background checks


In [62]:
#testing another title, this one talks about the USA
results("US mass shooters exploited gaps, errors in background checks")

[{'title': 'Texas governor defies NRA, says he supports background checks for private gun sales',
  'description': 'Texas. Lt. Gov. Dan Patrick said he supports stronger gun background checks for sales between strangers.'},
 {'title': 'After Mass Shootings: Republicans Boost Access to Guns, Democrats Do Little, Analysis Finds',
  'description': 'The findings also reveal a relationship between media coverage of mass shootings and resulting legislative action.'},
 {'title': 'After String of Mass Shootings, Democrats Begin New Push for Gun Control',
  'description': 'The fate of the Democrats’ proposals, and of the broader effort to enact gun safety measures, remains murky amid uncertainty about where President Trump stands.'},
 {'title': 'Andrew McCabe: 2 simple things Congress can do to stop gun violence',
  'description': 'Following a wave of mass shootings in August, Andrew McCabe offers two solutions that Congress can take when it is back in session -- clearly define what a fugitive 

In [67]:
print(data['title'][11])

Callum Shinkwin takes one-shot lead at KLM Open


In [68]:
#testing another title, this one recommends sports articles
results("Callum Shinkwin takes one-shot lead at KLM Open")

[{'title': 'Costly bird: Golfer gets 3-year ban for gesture',
  'description': 'Korean golfer Bio Kim was suspended for three years and fined about $8,350 for making an obscene gesture toward fans after a cellphone camera went off during his downswing.'},
 {'title': 'Robbie Henshaw set to miss Scotland game with hamstring injury',
  'description': 'Ireland suffer serious injury blow a week out from World Cup opener against Scotland'},
 {'title': 'Hong Kong Student Shot by Police Is ‘The Bravest Type,’ Cousin Says - Wall Street Journal',
  'description': "Hong Kong Student Shot by Police Is ‘The Bravest Type,’ Cousin Says Wall Street Journal Hong Kong protester shot in the chest by police during clashes Al Jazeera English Protester Reported Shot As Hong Kong Marks China's National Day With Widespread Unrest | …"},
 {'title': "U.S. Open Tennis 2019: Where to watch Serena Williams' Semifinal Match, Start Time, Live Stream",
  'description': 'Serena Williams looks for a second consecutive 