Having imputed the csv in df_impute.ipynb, I now perform text preprocessing.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imputed_fmt_8-12-21.csv")
df.head()

Unnamed: 0,title,url,date
0,Early polls to ‘protect’ GPS from youth voters...,https://www.freemalaysiatoday.com/category/nat...,2021-11-05
1,Protecting Malaysian interests: What about KVDT2?,https://www.freemalaysiatoday.com/category/nat...,2021-11-05
2,Man arrested after attempting to ram into cops,https://www.freemalaysiatoday.com/category/nat...,2021-11-05
3,"Smart Glove banned by US, sixth Malaysian firm...",https://www.freemalaysiatoday.com/category/nat...,2021-11-05
4,Feb 23 decision on Anwar’s bid to reinstate So...,https://www.freemalaysiatoday.com/category/nat...,2021-11-05


First I remove all commas, because figures such as 1,000 are common in news headlines. Then, I tokenize the headlines and remove stop words.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.replace(',', '')
    text = " ".join(re.split('\W+', text))
    word_tokens = word_tokenize(text)


    filtered_sentence = []

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = [item for item in filtered_sentence if not item.isdigit()]
            
    return (", ".join(filtered_sentence))

print(clean_text("This is a sample sentence, showing off the stop words filtration."))

sample, sentence, showing, stop, words, filtration


In [4]:
df['cleaned'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,title,url,date,cleaned
0,Early polls to ‘protect’ GPS from youth voters...,https://www.freemalaysiatoday.com/category/nat...,2021-11-05,"Early, polls, protect, GPS, youth, voters, say..."
1,Protecting Malaysian interests: What about KVDT2?,https://www.freemalaysiatoday.com/category/nat...,2021-11-05,"Protecting, Malaysian, interests, KVDT2"
2,Man arrested after attempting to ram into cops,https://www.freemalaysiatoday.com/category/nat...,2021-11-05,"Man, arrested, attempting, ram, cops"
3,"Smart Glove banned by US, sixth Malaysian firm...",https://www.freemalaysiatoday.com/category/nat...,2021-11-05,"Smart, Glove, banned, US, sixth, Malaysian, fi..."
4,Feb 23 decision on Anwar’s bid to reinstate So...,https://www.freemalaysiatoday.com/category/nat...,2021-11-05,"Feb, decision, Anwar, bid, reinstate, Sodomy, ..."


In [5]:
df = df.drop(['title', 'url'], axis = 1)
df.head()

Unnamed: 0,date,cleaned
0,2021-11-05,"Early, polls, protect, GPS, youth, voters, say..."
1,2021-11-05,"Protecting, Malaysian, interests, KVDT2"
2,2021-11-05,"Man, arrested, attempting, ram, cops"
3,2021-11-05,"Smart, Glove, banned, US, sixth, Malaysian, fi..."
4,2021-11-05,"Feb, decision, Anwar, bid, reinstate, Sodomy, ..."


In [6]:
df = df.groupby(['date'])['cleaned'].apply(lambda x: ','.join(x)).reset_index()
df['cleaned'] = df['cleaned'].apply(lambda x: x.split(","))
df.head()

Unnamed: 0,date,cleaned
0,2019-12-31,"[PIA, special, flight, brings, home, hund..."
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."


In [7]:
df.shape

(710, 2)

In [8]:
df.drop(index=df.index[0], 
        axis=0, 
        inplace=True)
df.head()

Unnamed: 0,date,cleaned
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say..."


I want to create a word cloud, so I count the number of tokens. Then, I save to json.

In [9]:
from collections import Counter

df['counter'] = df['cleaned'].apply(lambda x: dict(Counter(x)))
df.head()

Unnamed: 0,date,cleaned,counter
1,2020-01-01,"[caught, smoking, eateries, first, day, e...","{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu...","{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,...","{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ...","{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say...","{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [10]:
df = df.drop(['cleaned'], axis = 1)
df.head()

Unnamed: 0,date,counter
1,2020-01-01,"{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [11]:
df.tail()

Unnamed: 0,date,counter
705,2021-12-05,"{'old': 1, ' old': 1, ' economists': 1, ' impr..."
706,2021-12-06,"{'new': 2, ' Covid': 3, ' deaths': 1, ' report..."
707,2021-12-07,"{'offer': 1, ' RM37000': 1, ' month': 2, ' top..."
708,2021-12-08,"{'Bumi': 1, ' equity': 1, ' instrument': 1, ' ..."
709,2021-12-09,"{'Mambong': 3, ' tale': 3, ' two': 3, ' halves..."


In [12]:
df.to_json("preprocessed_fmt.json", orient='records')