Having imputed the csv in df_impute.ipynb, I now perform text preprocessing.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imputed_fmt_7-11-21.csv")
df.head()

Unnamed: 0,title,url,date
0,"100,000 visit Langkawi within one month",https://www.freemalaysiatoday.com/category/nat...,2021-11-03
1,Appeals court greenlights new dates for Najib’...,https://www.freemalaysiatoday.com/category/nat...,2021-11-03
2,"BKM only for those earning below RM5,000 a mon...",https://www.freemalaysiatoday.com/category/nat...,2021-11-03
3,"Small budget allocations for Sabah, Sarawak wi...",https://www.freemalaysiatoday.com/category/nat...,2021-11-03
4,"LTAT sells 2,000 apartments to army for RM560mil",https://www.freemalaysiatoday.com/category/nat...,2021-11-03


First I remove all commas, because figures such as 1,000 are common in news headlines. Then, I tokenize the headlines and remove stop words.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.replace(',', '')
    text = " ".join(re.split('\W+', text))
    word_tokens = word_tokenize(text)


    filtered_sentence = []

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = [item for item in filtered_sentence if not item.isdigit()]
            
    return (", ".join(filtered_sentence))

print(clean_text("This is a sample sentence, showing off the stop words filtration."))

sample, sentence, showing, stop, words, filtration


In [4]:
df['cleaned'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,title,url,date,cleaned
0,"100,000 visit Langkawi within one month",https://www.freemalaysiatoday.com/category/nat...,2021-11-03,"visit, Langkawi, within, one, month"
1,Appeals court greenlights new dates for Najib’...,https://www.freemalaysiatoday.com/category/nat...,2021-11-03,"Appeals, court, greenlights, new, dates, Najib..."
2,"BKM only for those earning below RM5,000 a mon...",https://www.freemalaysiatoday.com/category/nat...,2021-11-03,"BKM, earning, RM5000, month, says, deputy, min..."
3,"Small budget allocations for Sabah, Sarawak wi...",https://www.freemalaysiatoday.com/category/nat...,2021-11-03,"Small, budget, allocations, Sabah, Sarawak, hu..."
4,"LTAT sells 2,000 apartments to army for RM560mil",https://www.freemalaysiatoday.com/category/nat...,2021-11-03,"LTAT, sells, apartments, army, RM560mil"


In [5]:
df = df.drop(['title', 'url'], axis = 1)
df.head()

Unnamed: 0,date,cleaned
0,2021-11-03,"visit, Langkawi, within, one, month"
1,2021-11-03,"Appeals, court, greenlights, new, dates, Najib..."
2,2021-11-03,"BKM, earning, RM5000, month, says, deputy, min..."
3,2021-11-03,"Small, budget, allocations, Sabah, Sarawak, hu..."
4,2021-11-03,"LTAT, sells, apartments, army, RM560mil"


In [6]:
df = df.groupby(['date'])['cleaned'].apply(lambda x: ','.join(x)).reset_index()
df['cleaned'] = df['cleaned'].apply(lambda x: x.split(","))
df.head()

Unnamed: 0,date,cleaned
0,2019-12-31,"[PIA, special, flight, brings, home, hund..."
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."


In [7]:
df.shape

(679, 2)

In [8]:
df.drop(index=df.index[0], 
        axis=0, 
        inplace=True)
df.head()

Unnamed: 0,date,cleaned
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say..."


I want to create a word cloud, so I count the number of tokens. Then, I save to json.

In [9]:
from collections import Counter

df['counter'] = df['cleaned'].apply(lambda x: dict(Counter(x)))
df.head()

Unnamed: 0,date,cleaned,counter
1,2020-01-01,"[caught, smoking, eateries, first, day, e...","{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu...","{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,...","{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ...","{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say...","{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [10]:
df = df.drop(['cleaned'], axis = 1)
df.head()

Unnamed: 0,date,counter
1,2020-01-01,"{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [11]:
df.tail()

Unnamed: 0,date,counter
674,2021-11-04,"{'art': 1, ' kolam': 1, ' changed': 1, ' young..."
675,2021-11-05,"{'new': 2, ' Covid': 2, ' deaths': 1, ' report..."
676,2021-11-06,"{'Poaching': 1, ' foreign': 1, ' workers': 1, ..."
677,2021-11-07,"{'new': 2, ' Covid': 4, ' deaths': 1, ' toll':..."
678,2021-11-08,"{'Helping': 3, ' elderly': 3, ' comes': 3, ' f..."


In [12]:
df.to_json("preprocessed_fmt.json", orient='records')