Having imputed the csv in df_impute.ipynb, I now perform text preprocessing.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imputed_fmt_17-02-22.csv")
df.head()

Unnamed: 0,title,url,date
0,Azam turned up for PAC hearing with just an in...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13
1,Khairuddin keeps mum over sacking from PAS Syu...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13
2,22 undocumented Indonesian migrants nabbed aft...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13
3,"We’ve proven Umno-led govt is a stable one, sa...",https://www.freemalaysiatoday.com/category/nat...,2022-02-13
4,"Malaysia, Brunei VTL on the cards",https://www.freemalaysiatoday.com/category/nat...,2022-02-13


First I remove all commas, because figures such as 1,000 are common in news headlines. Then, I tokenize the headlines and remove stop words.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.replace(',', '')
    text = " ".join(re.split('\W+', text))
    word_tokens = word_tokenize(text)


    filtered_sentence = []

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = [item for item in filtered_sentence if not item.isdigit()]
            
    return (", ".join(filtered_sentence))

print(clean_text("This is a sample sentence, showing off the stop words filtration."))

sample, sentence, showing, stop, words, filtration


In [4]:
df['cleaned'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,title,url,date,cleaned
0,Azam turned up for PAC hearing with just an in...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13,"Azam, turned, PAC, hearing, invite, recalls, MP"
1,Khairuddin keeps mum over sacking from PAS Syu...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13,"Khairuddin, keeps, mum, sacking, PAS, Syura, C..."
2,22 undocumented Indonesian migrants nabbed aft...,https://www.freemalaysiatoday.com/category/nat...,2022-02-13,"undocumented, Indonesian, migrants, nabbed, bo..."
3,"We’ve proven Umno-led govt is a stable one, sa...",https://www.freemalaysiatoday.com/category/nat...,2022-02-13,"proven, Umno, led, govt, stable, one, says, PM"
4,"Malaysia, Brunei VTL on the cards",https://www.freemalaysiatoday.com/category/nat...,2022-02-13,"Malaysia, Brunei, VTL, cards"


In [5]:
df = df.drop(['title', 'url'], axis = 1)
df.head()

Unnamed: 0,date,cleaned
0,2022-02-13,"Azam, turned, PAC, hearing, invite, recalls, MP"
1,2022-02-13,"Khairuddin, keeps, mum, sacking, PAS, Syura, C..."
2,2022-02-13,"undocumented, Indonesian, migrants, nabbed, bo..."
3,2022-02-13,"proven, Umno, led, govt, stable, one, says, PM"
4,2022-02-13,"Malaysia, Brunei, VTL, cards"


In [6]:
df = df.groupby(['date'])['cleaned'].apply(lambda x: ','.join(x)).reset_index()
df['cleaned'] = df['cleaned'].apply(lambda x: x.split(","))
df.head()

Unnamed: 0,date,cleaned
0,2019-12-31,"[PIA, special, flight, brings, home, hund..."
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."


In [7]:
df.shape

(781, 2)

In [8]:
df.drop(index=df.index[0], 
        axis=0, 
        inplace=True)
df.head()

Unnamed: 0,date,cleaned
1,2020-01-01,"[caught, smoking, eateries, first, day, e..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say..."


I want to create a word cloud, so I count the number of tokens. Then, I save to json.

In [9]:
from collections import Counter

df['counter'] = df['cleaned'].apply(lambda x: dict(Counter(x)))
df.head()

Unnamed: 0,date,cleaned,counter
1,2020-01-01,"[caught, smoking, eateries, first, day, e...","{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"[Petition, support, Maszlee, gets, signatu...","{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"[one, recognises, Mavcom, civil, aviation,...","{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ...","{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say...","{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [10]:
df = df.drop(['cleaned'], axis = 1)
df.head()

Unnamed: 0,date,counter
1,2020-01-01,"{'caught': 1, ' smoking': 1, ' eateries': 1, '..."
2,2020-01-02,"{'Petition': 1, ' support': 1, ' Maszlee': 3, ..."
3,2020-01-03,"{'one': 1, ' recognises': 1, ' Mavcom': 1, ' c..."
4,2020-01-04,"{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [11]:
df.tail()

Unnamed: 0,date,counter
776,2022-02-14,"{'DAP': 1, ' PH': 2, ' facing': 1, ' tougher':..."
777,2022-02-15,"{'day': 1, ' Karpal': 1, ' chased': 1, ' Anwar..."
778,2022-02-16,"{'Covid': 1, ' deaths': 3, ' toll': 1, ' stand..."
779,2022-02-17,"{'Ex': 1, ' Goldman': 1, ' banker': 1, ' Tim':..."
780,2022-02-18,"{'Rights': 3, ' group': 3, ' welcomes': 3, ' r..."


In [12]:
df.to_json("preprocessed_fmt.json", orient='records')