# Text Processing

News sources (date collected: 04-05-2021):

- Inquirer
- Manila Bulletin
- The Guardian

## Imports

In [7]:
import os
import json
import pandas as pd
import numpy as np
import re
import gensim
import nltk

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer

## Combining all news articles into one json

The files were produced in the previous homework (scraping).

In [8]:
# One JSON file per news source. The files are read in this fixed order and
# their contents concatenated into a single `articles` list.
news_files = [
    ("Inquirer", 'inquirer_news.json'),
    ("Manila Bulletin", 'mb_news.json'),
    ("The Guardian", 'guardian_news.json'),
]

articles = []

for source_name, filename in news_files:
    with open(filename) as f:
        read = json.load(f)
    # Report the per-source count before merging
    print("News articles from " + source_name + ": ", len(read))
    articles.extend(read)

print("Total number of news articles: ", len(articles))

News articles from Inquirer:  53
News articles from Manila Bulletin:  27
News articles from The Guardian:  49
Total number of news articles:  129


In [9]:
# sanity check: print the fields of a single article from the merged list

article_no = 0

print('Index: ', article_no)

# (label, key) pairs, printed in this order
fields_to_show = (
    ('\nSource: ', 'source'),
    ("\nTitle: ", 'title'),
    ("\nAuthor: ", 'author'),
    ("\nDate: ", 'date'),
    ("\nArticle body: \n", 'article_body'),
)
for label, key in fields_to_show:
    print(label, articles[article_no][key])

Index:  0

Source:  https://newsinfo.inquirer.net/1415144/healthcare-utilization-should-drop-to-60-before-easing-of-ncr-plus-restrictions-doh

Title:  To ease ‘NCR Plus’ curbs, healthcare demand must drop to 60% – DOH

Author:  Christia Marie Ramos

Date:  7:49 PM April 05, 2021

Article body: 
 
MANILA, Philippines — The current restrictions enforced on the “NCR Plus” area can be eased once healthcare demand has been lowered to at least 60 percent, the Department of Health (DOH) said Monday.
“For healthcare utilization, we need to see that the utilization will be down to at least 60% before we can say that we are at that safe level,” DOH Undersecretary Ma. Rosario Vergeire said in a Palace briefing.
“The health system should be able to manage and should be able to breathe and should have this decongestion before we can say that we can easily lift the restrictions for this community quarantine,” she added.
According to a DOH official, the National Capital Region’s total healthcare dema

## Pre-processing

In [12]:
# Stem / Lemmatize
# Fetch the 'wordnet' package via nltk before the lemmatizer below is used.
nltk.download('wordnet')
def lemmatize(text):
    """Return the lemma of `text` from WordNetLemmatizer, using pos='v' (verb)."""
    verb_lemmatizer = WordNetLemmatizer()
    lemma = verb_lemmatizer.lemmatize(text, pos='v')
    return lemma

# Tokenizing
def preprocess(text):
    """Tokenize `text` and return a list of the lemmatized tokens that are kept.

    Tokens come from gensim's `simple_preprocess`. A token is kept only when
    it is longer than 3 characters and is not in gensim's `STOPWORDS`; every
    kept token is passed through `lemmatize`.
    """
    return [
        lemmatize(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

# Pre-process every article body; the result keeps the order of `articles`
article_bodies = [preprocess(article['article_body']) for article in articles]

print(len(article_bodies))
print(article_bodies[0])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\niiick\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


129
['manila', 'philippines', 'current', 'restrictions', 'enforce', 'plus', 'area', 'ease', 'healthcare', 'demand', 'lower', 'percent', 'department', 'health', 'say', 'monday', 'healthcare', 'utilization', 'need', 'utilization', 'safe', 'level', 'undersecretary', 'rosario', 'vergeire', 'say', 'palace', 'brief', 'health', 'able', 'manage', 'able', 'breathe', 'decongestion', 'easily', 'lift', 'restrictions', 'community', 'quarantine', 'add', 'accord', 'official', 'national', 'capital', 'region', 'total', 'healthcare', 'demand', 'reportedly', 'percent', 'intensive', 'care', 'unit', 'bed', 'cities', 'need', 'bring', 'able', 'healthcare', 'manage', 'say', 'important', 'look', 'healthcare', 'utilization', 'add', 'initially', 'government', 'impose', 'enhance', 'community', 'quarantine', 'government', 'stringent', 'quarantine', 'classification', 'plus', 'bubble', 'encompass', 'metro', 'manila', 'bulacan', 'cavite', 'laguna', 'rizal', 'schedule', 'expire', 'april', 'extend', 'additional', 'week

## Getting TF-IDF

In [13]:
# Creating gensim dictionary

dictionary = corpora.Dictionary(article_bodies)

# Show the first 20 id -> token entries
for token_id in range(20):
    print(token_id, ":", dictionary[token_id])

0 : able
1 : accord
2 : active
3 : add
4 : additional
5 : affect
6 : april
7 : area
8 : bed
9 : break
10 : breathe
11 : brief
12 : bring
13 : bubble
14 : bulacan
15 : capital
16 : care
17 : case
18 : cavite
19 : cities


In [20]:
# Filter out words
# NOTE: filter_extremes modifies `dictionary` in place.
extreme_limits = dict(no_below=15, no_above=0.5, keep_n=100000)
dictionary.filter_extremes(**extreme_limits)

# Map bag of words: one doc2bow result per pre-processed article
bow_corpus = list(map(dictionary.doc2bow, article_bodies))

In [25]:
# Build the TF-IDF model from the bag-of-words corpus, then apply it to that
# same corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Displayed as the cell result: number of documents in the TF-IDF corpus
len(corpus_tfidf)

129

In [27]:
# Pair every article's metadata with its TF-IDF vector.
articles_tfidf = []

for article, doc_tfidf in zip(articles, corpus_tfidf):
    # Each entry of a TF-IDF document is a (token_id, weight) pair; keep them
    # in that form. (If the word itself is wanted instead of the id, it can be
    # looked up with dictionary[token_id].)
    article_body_bow = [(token_id, weight) for token_id, weight in doc_tfidf]
    articles_tfidf.append({
        'source': article['source'],
        'date': article['date'],
        'title': article['title'],
        'author': article['author'],
        'article_body': article['article_body'],
        'article_body_bow': article_body_bow
    })
    
# Sanity check: print one enriched article, including its TF-IDF pairs

article_no = 0

print('Index: ', article_no)

# (label, key) pairs, printed in this order
tfidf_fields_to_show = (
    ('\nSource: ', 'source'),
    ("\nTitle: ", 'title'),
    ("\nAuthor: ", 'author'),
    ("\nDate: ", 'date'),
    ("\nArticle body TFIDF: \n", 'article_body_bow'),
)
for label, key in tfidf_fields_to_show:
    print(label, articles_tfidf[article_no][key])

Index:  0

Source:  https://newsinfo.inquirer.net/1415144/healthcare-utilization-should-drop-to-60-before-easing-of-ncr-plus-restrictions-doh

Title:  To ease ‘NCR Plus’ curbs, healthcare demand must drop to 60% – DOH

Author:  Christia Marie Ramos

Date:  7:49 PM April 05, 2021

Article body TFIDF: 
 [(0, 0.3375325509797615), (1, 0.08892257853724608), (2, 0.1300135428740754), (3, 0.186894252823476), (4, 0.11945954181625008), (5, 0.11945954181625008), (6, 0.10933758242648152), (7, 0.10933758242648152), (8, 0.10976962409453825), (9, 0.21867516485296304), (10, 0.058467801911536645), (11, 0.11945954181625008), (12, 0.07915764336286867), (13, 0.11945954181625008), (14, 0.0946543040590526), (15, 0.1144951946344838), (16, 0.10933758242648152), (17, 0.3583786254487503), (18, 0.11945954181625008), (19, 0.11945954181625008), (20, 0.1736546918994867), (21, 0.11587655195198257), (22, 0.07782709019206399), (23, 0.08097803838827482), (24, 0.17784515707449217), (25, 0.23891908363250017), (26, 0.1513