Having imputed the csv in df_impute.ipynb, I now perform text preprocessing.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imputed_fmt_16-01-22.csv")
df.head()

Unnamed: 0,title,url,date
0,Pro-BN rule set before merger with Kuasa Rakya...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26
1,Elected rep denied entry to PM’s event in own ...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26
2,One-off 100% power bill rebate for flood victims,https://www.freemalaysiatoday.com/category/nat...,2021-12-26
3,"DID issues flood risk warning for Kelantan, Te...",https://www.freemalaysiatoday.com/category/nat...,2021-12-26
4,‘Two or three’ ministers have returned after m...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26


First I remove all commas, because figures such as 1,000 are common in news headlines. Then, I tokenize the headlines and remove stop words.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.replace(',', '')
    text = " ".join(re.split('\W+', text))
    word_tokens = word_tokenize(text)


    filtered_sentence = []

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = [item for item in filtered_sentence if not item.isdigit()]
            
    return (", ".join(filtered_sentence))

print(clean_text("This is a sample sentence, showing off the stop words filtration."))

sample, sentence, showing, stop, words, filtration


In [4]:
df['cleaned'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,title,url,date,cleaned
0,Pro-BN rule set before merger with Kuasa Rakya...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26,"Pro, BN, rule, set, merger, Kuasa, Rakyat, say..."
1,Elected rep denied entry to PM’s event in own ...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26,"Elected, rep, denied, entry, PM, event, consti..."
2,One-off 100% power bill rebate for flood victims,https://www.freemalaysiatoday.com/category/nat...,2021-12-26,"One, power, bill, rebate, flood, victims"
3,"DID issues flood risk warning for Kelantan, Te...",https://www.freemalaysiatoday.com/category/nat...,2021-12-26,"issues, flood, risk, warning, Kelantan, Tereng..."
4,‘Two or three’ ministers have returned after m...,https://www.freemalaysiatoday.com/category/nat...,2021-12-26,"Two, three, ministers, returned, order, says, PM"


In [5]:
df = df.drop(['title', 'url'], axis = 1)
df.head()

Unnamed: 0,date,cleaned
0,2021-12-26,"Pro, BN, rule, set, merger, Kuasa, Rakyat, say..."
1,2021-12-26,"Elected, rep, denied, entry, PM, event, consti..."
2,2021-12-26,"One, power, bill, rebate, flood, victims"
3,2021-12-26,"issues, flood, risk, warning, Kelantan, Tereng..."
4,2021-12-26,"Two, three, ministers, returned, order, says, PM"


In [6]:
df = df.groupby(['date'])['cleaned'].apply(lambda x: ','.join(x)).reset_index()
df['cleaned'] = df['cleaned'].apply(lambda x: x.split(","))
df.head()

Unnamed: 0,date,cleaned
0,2019-12-31,"[PIA, special, flight, brings, home, hund..."
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."


In [7]:
df.shape

(749, 2)

In [8]:
df.drop(index=df.index[0], 
        axis=0, 
        inplace=True)
df.head()

Unnamed: 0,date,cleaned
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say..."


I want to create a word cloud, so I count the number of tokens. Then, I save to json.

In [9]:
from collections import Counter

df['counter'] = df['cleaned'].apply(lambda x: dict(Counter(x)))
df.head()

Unnamed: 0,date,cleaned,counter
1,2020-01-01,"[caught, smoking, eateries, first, day, e...","{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu...","{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,...","{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ...","{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say...","{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [10]:
df = df.drop(['cleaned'], axis = 1)
df.head()

Unnamed: 0,date,counter
1,2020-01-01,"{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [11]:
df.tail()

Unnamed: 0,date,counter
744,2022-01-13,"{'Umno': 1, ' hold': 1, ' general': 1, ' assem..."
745,2022-01-14,"{'Covid': 1, ' deaths': 1, ' dip': 1, ' brough..."
746,2022-01-15,"{'Advisory': 1, ' board': 1, ' must': 2, ' tel..."
747,2022-01-16,"{'Solve': 1, ' structural': 1, ' problems': 1,..."
748,2022-01-17,"{'new': 2, ' Covid': 3, ' deaths': 1, ' brough..."


In [12]:
df.to_json("preprocessed_fmt.json", orient='records')