# Async Exercise: News Framing Dataset
#### MAPA, Jamie Shekinah
#### SANCHEZ, Martin Christopher

### Getting data from an API

#### Import package
This wrapper package allows searching news articles and news sources through the News API.

In [1]:
import json
import os
import re  # regular expressions

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from newsapi import NewsApiClient
from nltk.stem import WordNetLemmatizer
from pandas.io.json import json_normalize

### Creating the API client

In [2]:
# Read the News API key from the environment instead of embedding it in the
# notebook, so the credential is not leaked whenever the notebook is shared.
# Set it before starting Jupyter, e.g. `export NEWSAPI_KEY=<your key>`.
# NOTE(review): the key that used to be hardcoded here should be treated as
# compromised and rotated.
NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY')
if not NEWSAPI_KEY:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable to your News API key.")

newsapi = NewsApiClient(api_key=NEWSAPI_KEY)

### Getting a list of news sources from newsapi

In [3]:
# Ask the API for its source catalogue and flatten the records under the
# 'sources' key into a DataFrame (one row per source).
sources = pd.json_normalize(newsapi.get_sources()['sources'])

In [4]:
# Display the `id` column of the sources table.
sources.id

0                abc-news
1             abc-news-au
2             aftenposten
3      al-jazeera-english
4                    ansa
              ...        
123                 wired
124              wired-de
125     wirtschafts-woche
126            xinhua-net
127                  ynet
Name: id, Length: 128, dtype: object

### Saving the source IDs to a JSON file

In [5]:
# Round-trip the `id` column through pandas' JSON encoder to obtain a plain
# Python object, then write it to sources.json.
sources_json = json.loads(sources['id'].to_json())

with open('sources.json', 'w') as json_file:
    json_file.write(json.dumps(sources_json))

### Creating a function to retrieve articles from news sites

In [6]:
def getArticles(source, q='covid OR vaccine OR health', from_param='2021-03-11',
                to='2021-03-12', language='en', sort_by='relevancy'):
    """Fetch articles for one news source and return them as a DataFrame.

    Parameters
    ----------
    source : str
        Source identifier passed to the API as ``sources`` (e.g. 'vice-news').
    q : str
        Search query. Defaults to the query this notebook was written for.
    from_param, to : str
        Start and end of the publication window passed to the API.
    language : str
        Language filter passed to the API.
    sort_by : str
        Sort order passed to the API.

    Returns
    -------
    pandas.DataFrame
        The response's 'articles' records flattened with ``pd.json_normalize``.

    Notes
    -----
    NOTE(review): no page/page_size is passed, so only the API's first page
    of results is returned for each source — confirm that is intended.
    """
    response = newsapi.get_everything(q=q,
                                      sources=source,
                                      from_param=from_param,
                                      to=to,
                                      language=language,
                                      sort_by=sort_by)
    return pd.json_normalize(response['articles'])

### Getting articles from news sites

In [7]:
# News sources to collect articles from.
# NOTE: this rebinds `sources`, which until now held the DataFrame of all
# available News API sources.
sources = ['vice-news','breitbart-news','time']

# Fetch each source once, then concatenate in a single step.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies every previously collected row on each iteration.
frames = []
for source in sources:
    frames.append(getArticles(source))
articles = pd.concat(frames, ignore_index=True)

# `df_articles` is a second name for the same object, not a copy.
df_articles = articles

### Exporting to a JSON file named "articles_consolidated.json"

In [None]:
# Convert the consolidated articles to a list of records (one per row)
# and save them to articles_consolidated.json.
articles_consolidated_json = json.loads(articles.to_json(orient="records"))

with open('articles_consolidated.json', 'w') as json_file:
    json_file.write(json.dumps(articles_consolidated_json))

In [8]:
# Preview the first rows of the collected articles.
df_articles.head()

Unnamed: 0,author,title,description,url,urlToImage,publishedAt,content,source.id,source.name
0,"Gita Jackson, Jason Koebler",The Unofficial Guide to Getting a COVID Vaccine,The vaccine rollout has been confusing at best...,https://www.vice.com/en/article/k7av4a/the-uno...,https://video-images.vice.com/articles/604a82b...,2021-03-12T14:00:00Z,The rollout for the COVID-19 vaccine is in ful...,vice-news,Vice News
1,"Josh Terry, Leslie Horn",It’s Probably OK to Be Optimistic About Bands ...,"With Dinosaur Jr. and more planning shows, vac...",https://www.vice.com/en/article/7k9ygy/dinosau...,https://video-images.vice.com/articles/604a790...,2021-03-11T20:48:28Z,"On March 11, indie rock mainstays Dinosaur Jr....",vice-news,Vice News
2,"Anthony Esguerra, Joe Freeman",Kissing and Hugging in Public Is Now Officiall...,Public displays of affection are no longer all...,https://www.vice.com/en/article/3an9xw/kissing...,https://video-images.vice.com/articles/604b0d6...,2021-03-12T07:28:12Z,Police in the Philippines have officially disc...,vice-news,Vice News
3,"Hyeong Yun, Alan Wong",This Nose Ring Is Said to Protect You From COV...,South Korean police have charged a man for bul...,https://www.vice.com/en/article/88apwb/covid-c...,https://video-images.vice.com/articles/604afac...,2021-03-12T07:09:14Z,Imagine a mask that doesnt cover up half your ...,vice-news,Vice News
4,"Simon Childs, Phoebe Hurst",'Business as Usual' – How the Government Downp...,"“Our jobs are not worth our lives"", says the e...",https://www.vice.com/en/article/pkd5nk/institu...,https://video-images.vice.com/articles/604a1cd...,2021-03-11T14:25:18Z,The government has dangerously downplayed the ...,vice-news,Vice News


In [9]:
# Count the non-null values in each column.
df_articles.count()

author         60
title          60
description    60
url            60
urlToImage     60
publishedAt    60
content        60
source.id      60
source.name    60
dtype: int64

#### Dropping unnecessary columns

In [10]:
# Drop the columns that are not needed for the text analysis.
# drop() returns a new frame, so the `articles` frame that `df_articles`
# aliases is left intact, and errors='ignore' keeps the cell safe to re-run
# (the original `del` statements raise KeyError on a second run).
df_articles = df_articles.drop(
    columns=['description', 'url', 'urlToImage', 'source.id'],
    errors='ignore',
)

#### Reordering the columns

In [11]:
# Desired column order, named explicitly. Slicing columns.tolist() by
# position would silently select the wrong columns if the API returned the
# fields in a different order or an earlier cell kept or dropped another one.
cols = ['source.name', 'publishedAt', 'title', 'content', 'author']
cols

['source.name', 'publishedAt', 'title', 'content', 'author']

In [12]:
# Keep every row, with the columns arranged in the order given by `cols`.
df_articles = df_articles.loc[:, cols]
df_articles.head()

Unnamed: 0,source.name,publishedAt,title,content,author
0,Vice News,2021-03-12T14:00:00Z,The Unofficial Guide to Getting a COVID Vaccine,The rollout for the COVID-19 vaccine is in ful...,"Gita Jackson, Jason Koebler"
1,Vice News,2021-03-11T20:48:28Z,It’s Probably OK to Be Optimistic About Bands ...,"On March 11, indie rock mainstays Dinosaur Jr....","Josh Terry, Leslie Horn"
2,Vice News,2021-03-12T07:28:12Z,Kissing and Hugging in Public Is Now Officiall...,Police in the Philippines have officially disc...,"Anthony Esguerra, Joe Freeman"
3,Vice News,2021-03-12T07:09:14Z,This Nose Ring Is Said to Protect You From COV...,Imagine a mask that doesnt cover up half your ...,"Hyeong Yun, Alan Wong"
4,Vice News,2021-03-11T14:25:18Z,'Business as Usual' – How the Government Downp...,The government has dangerously downplayed the ...,"Simon Childs, Phoebe Hurst"


#### Cleaning column names

In [13]:
# Map the API's field names to the shorter names used in the rest of the
# notebook; columns not listed here keep their names.
column_renames = {
    'source.name': 'source',
    'publishedAt': 'date',
    'content': 'article_body',
}
df_articles = df_articles.rename(columns=column_renames)
df_articles.head()

Unnamed: 0,source,date,title,article_body,author
0,Vice News,2021-03-12T14:00:00Z,The Unofficial Guide to Getting a COVID Vaccine,The rollout for the COVID-19 vaccine is in ful...,"Gita Jackson, Jason Koebler"
1,Vice News,2021-03-11T20:48:28Z,It’s Probably OK to Be Optimistic About Bands ...,"On March 11, indie rock mainstays Dinosaur Jr....","Josh Terry, Leslie Horn"
2,Vice News,2021-03-12T07:28:12Z,Kissing and Hugging in Public Is Now Officiall...,Police in the Philippines have officially disc...,"Anthony Esguerra, Joe Freeman"
3,Vice News,2021-03-12T07:09:14Z,This Nose Ring Is Said to Protect You From COV...,Imagine a mask that doesnt cover up half your ...,"Hyeong Yun, Alan Wong"
4,Vice News,2021-03-11T14:25:18Z,'Business as Usual' – How the Government Downp...,The government has dangerously downplayed the ...,"Simon Childs, Phoebe Hurst"


##  Pre-processing the text in the article_body column.

### Removing markup and truncation markers

In [14]:
def removeOtherText(text):
    """Strip non-article residue from a text value and trim whitespace.

    Each of the following is replaced by a single space:
      * an angle-bracketed tag with 2-3 characters between the brackets
        (e.g. ``<ul>``, ``</li>``);
      * a trailing word fragment followed by an ellipsis and bracketed text
        (presumably the API's ``… [+N chars]`` truncation marker);
      * Windows line breaks (``\\r\\n``).

    Parameters
    ----------
    text : object
        The value to clean; non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned text, or an empty string when ``text`` is missing
        (``None`` or NaN).
    """
    # A missing value would otherwise be stringified to the literal word
    # "None" / "nan" and end up in the corpus as if it were article text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    result = re.sub(r"<.{2,3}>|\s+\w+…\s\[.+]|\r\n", " ", str(text))
    return result.strip()

In [15]:
# Display the raw article text column before any cleaning is applied.
df_articles["article_body"]

0     The rollout for the COVID-19 vaccine is in ful...
1     On March 11, indie rock mainstays Dinosaur Jr....
2     Police in the Philippines have officially disc...
3     Imagine a mask that doesnt cover up half your ...
4     The government has dangerously downplayed the ...
5     Earlier this week, the CDC released its first ...
6     Welcome to Fantasy Week, where we indulge all ...
7     Welcome to Fantasy Week, where we indulge all ...
8     Mississippi Sen. Roger Wicker sure seems to be...
9     On the Clock is Motherboard's reporting on the...
10    Welcome to Fantasy Week, where we indulge all ...
11    Times Up, the group founded by Hollywood power...
12    Former President Donald Trump just wants a tha...
13    On March 2, Texas Independence Day, Governor G...
14    The racial slur-spewing Uber rider who coughed...
15    A Chinese artist who managed to put Trump and ...
16    The author of this op-ed is a postdoctoral fel...
17    Oklahoma State Representative Justin Humph

In [16]:
# Run the cleaner over every article body; the result is a new Series and
# the DataFrame column itself is left unchanged.
processed_docs = df_articles['article_body'].apply(removeOtherText)
processed_docs

0     The rollout for the COVID-19 vaccine is in ful...
1     On March 11, indie rock mainstays Dinosaur Jr....
2     Police in the Philippines have officially disc...
3     Imagine a mask that doesnt cover up half your ...
4     The government has dangerously downplayed the ...
5     Earlier this week, the CDC released its first ...
6     Welcome to Fantasy Week, where we indulge all ...
7     Welcome to Fantasy Week, where we indulge all ...
8     Mississippi Sen. Roger Wicker sure seems to be...
9     On the Clock is Motherboard's reporting on the...
10    Welcome to Fantasy Week, where we indulge all ...
11    Times Up, the group founded by Hollywood power...
12    Former President Donald Trump just wants a tha...
13    On March 2, Texas Independence Day, Governor G...
14    The racial slur-spewing Uber rider who coughed...
15    A Chinese artist who managed to put Trump and ...
16    The author of this op-ed is a postdoctoral fel...
17    Oklahoma State Representative Justin Humph

### Creating a preprocessing and lemmatizing function

In [17]:
def preprocess(text):
    """Tokenize `text` and return the lemmatized tokens worth keeping.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only if
    it is not in gensim's STOPWORDS set and is longer than three characters.
    Each kept token is passed through ``lemmatize``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize(word)
        for word in gensim.utils.simple_preprocess(text)
        if word not in stopwords and len(word) > 3
    ]

In [18]:
# Download the 'wordnet' package through NLTK before defining the lemmatizer helper.
nltk.download('wordnet')

def lemmatize(text):
    """Return the WordNet lemma of `text`, treating it as a verb (pos='v')."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text, pos='v')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thesm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# Build the token lists straight from the source column so this cell gives
# the same result on every run. Mapping `processed_docs` onto itself works
# only once: on a re-run `preprocess` would receive token lists, not text.
processed_docs = df_articles['article_body'].map(removeOtherText).map(preprocess)
processed_docs.head()

0    [rollout, covid, vaccine, swing, unite, state,...
1    [march, indie, rock, mainstays, dinosaur, anno...
2    [police, philippines, officially, discourage, ...
3    [imagine, mask, doesnt, cover, half, face, don...
4    [government, dangerously, downplay, risk, catc...
Name: article_body, dtype: object

In [20]:
# Display the token lists produced for every article.
processed_docs

0     [rollout, covid, vaccine, swing, unite, state,...
1     [march, indie, rock, mainstays, dinosaur, anno...
2     [police, philippines, officially, discourage, ...
3     [imagine, mask, doesnt, cover, half, face, don...
4     [government, dangerously, downplay, risk, catc...
5     [earlier, week, release, guidelines, small, st...
6     [welcome, fantasy, week, indulge, grandest, da...
7     [welcome, fantasy, week, indulge, grandest, da...
8     [mississippi, roger, wicker, sure, take, credi...
9     [clock, motherboard, report, organize, labor, ...
10    [welcome, fantasy, week, indulge, grandest, da...
11    [time, group, found, hollywood, power, players...
12    [president, donald, trump, want, thank, congre...
13    [march, texas, independence, governor, greg, a...
14    [racial, slur, spew, uber, rider, cough, asian...
15    [chinese, artist, manage, trump, form, sculptu...
16    [author, postdoctoral, fellow, yale, school, p...
17    [oklahoma, state, representative, justin, 

### Creating a dictionary and tokenizing

In [21]:
# Build the token <-> integer id mapping from the tokenized articles.
dictionary = corpora.Dictionary(processed_docs)

In [22]:
# Print the tokens stored under ids 0 through 19.
for token_id in range(20):
    print(token_id, ":", dictionary[token_id])

0 : access
1 : clinics
2 : counties
3 : covid
4 : figure
5 : rollout
6 : state
7 : swing
8 : unite
9 : vaccine
10 : vary
11 : widely
12 : american
13 : announce
14 : coast
15 : dinosaur
16 : indie
17 : kick
18 : long
19 : mainstays


#### Filtering tokens

In [23]:
# Prune the dictionary: keep tokens found in at least 2 documents
# (no_below=2) and at most 100000 tokens overall (keep_n). `no_above` is not
# passed, so gensim's default for it applies.
dictionary.filter_extremes(no_below=2, keep_n=100000)

#### Evaluate the worth of features

In [24]:
# Collection frequencies: token id -> total number of occurrences of that
# token across the documents.
dictionary.cfs

{0: 10,
 4: 12,
 3: 2,
 2: 9,
 1: 2,
 9: 6,
 6: 4,
 8: 2,
 5: 3,
 7: 2,
 16: 2,
 14: 2,
 17: 3,
 10: 2,
 13: 20,
 11: 3,
 15: 4,
 12: 10,
 21: 3,
 18: 2,
 19: 3,
 20: 2,
 22: 2,
 24: 3,
 28: 8,
 25: 4,
 23: 2,
 26: 4,
 27: 2,
 29: 2,
 33: 8,
 31: 3,
 30: 2,
 32: 3,
 43: 3,
 36: 3,
 39: 3,
 38: 3,
 34: 3,
 44: 3,
 45: 8,
 41: 9,
 40: 4,
 35: 3,
 37: 3,
 42: 3,
 49: 3,
 51: 2,
 52: 6,
 46: 2,
 47: 3,
 48: 2,
 50: 4,
 53: 5,
 54: 2,
 55: 2,
 59: 4,
 56: 4,
 57: 2,
 58: 2,
 63: 17,
 61: 5,
 65: 6,
 64: 3,
 62: 2,
 60: 9,
 66: 3,
 68: 2,
 67: 3,
 69: 2,
 70: 4,
 71: 2,
 72: 7,
 73: 2,
 74: 3,
 75: 2,
 76: 2,
 78: 7,
 80: 9,
 79: 2,
 77: 3,
 83: 2,
 81: 2,
 82: 3,
 84: 2,
 85: 2,
 86: 2,
 87: 2,
 88: 2,
 89: 2,
 93: 2,
 92: 3,
 94: 2,
 90: 2,
 91: 3,
 96: 2,
 95: 2,
 97: 2,
 98: 3,
 100: 2,
 99: 3,
 101: 2,
 102: 4,
 103: 6,
 105: 2,
 104: 2,
 106: 2,
 108: 3,
 107: 3,
 109: 2,
 110: 3,
 113: 2,
 111: 2,
 112: 3,
 114: 2,
 115: 2,
 116: 2,
 117: 2,
 120: 2,
 119: 2,
 124: 2,
 122: 2,
 126: 3

In [25]:
# Document frequencies: token id -> number of documents containing the token.
dictionary.dfs

{0: 10,
 4: 10,
 3: 2,
 2: 7,
 1: 2,
 9: 5,
 6: 4,
 8: 2,
 5: 3,
 7: 2,
 16: 2,
 14: 2,
 17: 3,
 10: 2,
 13: 20,
 11: 3,
 15: 4,
 12: 10,
 21: 2,
 18: 2,
 19: 3,
 20: 2,
 22: 2,
 24: 3,
 28: 5,
 25: 4,
 23: 2,
 26: 4,
 27: 2,
 29: 2,
 33: 8,
 31: 3,
 30: 2,
 32: 3,
 43: 3,
 36: 3,
 39: 3,
 38: 3,
 34: 3,
 44: 3,
 45: 8,
 41: 9,
 40: 4,
 35: 3,
 37: 3,
 42: 3,
 49: 2,
 51: 2,
 52: 4,
 46: 2,
 47: 3,
 48: 2,
 50: 4,
 53: 5,
 54: 2,
 55: 2,
 59: 4,
 56: 3,
 57: 2,
 58: 2,
 63: 13,
 61: 5,
 65: 6,
 64: 3,
 62: 2,
 60: 8,
 66: 3,
 68: 2,
 67: 2,
 69: 2,
 70: 4,
 71: 2,
 72: 7,
 73: 2,
 74: 3,
 75: 2,
 76: 2,
 78: 6,
 80: 9,
 79: 2,
 77: 3,
 83: 2,
 81: 2,
 82: 3,
 84: 2,
 85: 2,
 86: 2,
 87: 2,
 88: 2,
 89: 2,
 93: 2,
 92: 3,
 94: 2,
 90: 2,
 91: 3,
 96: 2,
 95: 2,
 97: 2,
 98: 3,
 100: 2,
 99: 3,
 101: 2,
 102: 2,
 103: 5,
 105: 2,
 104: 2,
 106: 2,
 108: 3,
 107: 3,
 109: 2,
 110: 3,
 113: 2,
 111: 2,
 112: 3,
 114: 2,
 115: 2,
 116: 2,
 117: 2,
 120: 2,
 119: 2,
 124: 2,
 122: 2,
 126: 3

In [26]:
# Total number of corpus positions (processed words) recorded by the dictionary.
dictionary.num_pos

1013

In [27]:
# Number of documents the dictionary has processed.
dictionary.num_docs

60

#### Forming bow corpus on the article_body

In [28]:
# Convert each token list to its bag-of-words form via the dictionary
# (the displayed result is a list of (id, count) pairs per article).
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [29]:
# Display the bag-of-words vector of every article.
bow_corpus

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(18, 1), (19, 1), (20, 1), (21, 2), (22, 1)],
 [(13, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)],
 [(0, 1), (20, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)],
 [(33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(0, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1)],
 [(26, 1), (28, 2), (54, 1), (55, 1)],
 [(33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1)],
 [(12, 1), (19, 1), (33, 1), (56, 2), (57, 1), (58, 1), (59, 1)],
 [(0, 1),
  (6, 1),
  (2

### Creating the TF-IDF for each article

In [30]:
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=bow_corpus)

In [31]:
# Apply the fitted model to the whole corpus, then print the weighted
# vector of each article on its own line.
corpus_tfidf = tfidf[bow_corpus]
for doc_weights in corpus_tfidf:
    print(doc_weights)

[(0, 0.2585638753582436), (1, 0.490817428881658), (2, 0.6200693088126926), (3, 0.490817428881658), (4, 0.2585638753582436)]
[(5, 0.4435372877427583), (6, 0.4009441203646525), (7, 0.5035689854722979), (8, 0.5035689854722979), (9, 0.3679062930524757)]
[(10, 0.42381364838254454), (11, 0.37328978061596246), (12, 0.2232661126260219), (13, 0.136894990196302), (14, 0.42381364838254454), (15, 0.33744252595282465), (16, 0.42381364838254454), (17, 0.37328978061596246)]
[(18, 0.35861448785610156), (19, 0.31586317243062956), (20, 0.35861448785610156), (21, 0.7172289757122031), (22, 0.35861448785610156)]
[(13, 0.1492819178665797), (23, 0.4621623783153431), (24, 0.4070668641009991), (25, 0.36797597466310744), (26, 0.36797597466310744), (27, 0.4621623783153431), (28, 0.3376547251710511)]
[(0, 0.23146277521358505), (20, 0.43937291725204236), (29, 0.43937291725204236), (30, 0.43937291725204236), (31, 0.3869941907618034), (32, 0.3869941907618034), (33, 0.26028886852869626)]
[(33, 0.2016698932447795), (3

#### Adding the TF-IDF vectors to the dataset as the `article_body_bow` column

In [32]:
# Store one TF-IDF vector (a list of (token id, weight) pairs) per article.
# The object array is filled element by element: np.array(..., dtype="object")
# only yields a 1-D array when the documents differ in length. If every
# document had the same number of tokens (or there were a single document) it
# would build a 3-D array and the column assignment below would fail.
TDIDF_objects = np.empty(len(df_articles), dtype="object")
for row, doc_weights in enumerate(corpus_tfidf):
    TDIDF_objects[row] = doc_weights

df_articles['article_body_bow'] = TDIDF_objects

In [33]:
# Display the dataset including the newly added `article_body_bow` column.
df_articles

Unnamed: 0,source,date,title,article_body,author,article_body_bow
0,Vice News,2021-03-12T14:00:00Z,The Unofficial Guide to Getting a COVID Vaccine,The rollout for the COVID-19 vaccine is in ful...,"Gita Jackson, Jason Koebler","[(0, 0.2585638753582436), (1, 0.49081742888165..."
1,Vice News,2021-03-11T20:48:28Z,It’s Probably OK to Be Optimistic About Bands ...,"On March 11, indie rock mainstays Dinosaur Jr....","Josh Terry, Leslie Horn","[(5, 0.4435372877427583), (6, 0.40094412036465..."
2,Vice News,2021-03-12T07:28:12Z,Kissing and Hugging in Public Is Now Officiall...,Police in the Philippines have officially disc...,"Anthony Esguerra, Joe Freeman","[(10, 0.42381364838254454), (11, 0.37328978061..."
3,Vice News,2021-03-12T07:09:14Z,This Nose Ring Is Said to Protect You From COV...,Imagine a mask that doesnt cover up half your ...,"Hyeong Yun, Alan Wong","[(18, 0.35861448785610156), (19, 0.31586317243..."
4,Vice News,2021-03-11T14:25:18Z,'Business as Usual' – How the Government Downp...,The government has dangerously downplayed the ...,"Simon Childs, Phoebe Hurst","[(13, 0.1492819178665797), (23, 0.462162378315..."
5,Vice News,2021-03-11T17:29:06Z,How New CDC Rules About Post-Vax Hangouts Affe...,"Earlier this week, the CDC released its first ...","Hannah Smothers, Casey Johnston","[(0, 0.23146277521358505), (20, 0.439372917252..."
6,Vice News,2021-03-12T20:34:44Z,"When COVID Ends, I'm Going to Embrace My Inner...","Welcome to Fantasy Week, where we indulge all ...","Hilary Pollack, Mary Frances Knapp","[(33, 0.2016698932447795), (34, 0.299840241261..."
7,Vice News,2021-03-12T21:06:04Z,Oversized Clothing Is Getting Ridiculously Hug...,"Welcome to Fantasy Week, where we indulge all ...","Mary Frances Knapp, Hilary Pollack","[(33, 0.2016698932447795), (34, 0.299840241261..."
8,Vice News,2021-03-11T18:33:18Z,GOP Senator Says You’re Welcome for the COVID ...,Mississippi Sen. Roger Wicker sure seems to be...,"Paul Blest, Anne Torpey","[(0, 0.2012080014120717), (46, 0.3819419622585..."
9,Vice News,2021-03-11T14:00:00Z,Uber Is Expanding Its War on Labor to Canada W...,On the Clock is Motherboard's reporting on the...,"Edward Ongweso Jr, Jordan Pearson","[(26, 0.36459413397897755), (28, 0.66910309685..."


### Exporting to a JSON file named "articles_processed.json"

In [34]:
# Convert the processed dataset to a list of records (one per article)
# and save it to articles_processed.json.
articles_json = json.loads(df_articles.to_json(orient="records"))

with open('articles_processed.json', 'w') as json_file:
    json_file.write(json.dumps(articles_json))