Having imputed the csv in df_impute.ipynb, I now perform text preprocessing.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imputed_fmt_9-10-21.csv")
df.head()

Unnamed: 0,title,url,date
0,Minister says she had ‘no role’ in approving M...,https://www.freemalaysiatoday.com/category/nat...,2021-10-09
1,"Return your cars, CM tells exco members who quit",https://www.freemalaysiatoday.com/category/nat...,2021-10-09
2,"Question mark over ‘toothless panels’ on Adib,...",https://www.freemalaysiatoday.com/category/nat...,2021-10-09
3,"Melaka PN is strong enough without Umno, says ...",https://www.freemalaysiatoday.com/category/nat...,2021-10-09
4,"8,743 Covid-19 cases, 14,422 recoveries",https://www.freemalaysiatoday.com/category/nat...,2021-10-09


First I remove all commas, because figures such as 1,000 are common in news headlines. Then, I tokenize the headlines and remove stop words.

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.replace(',', '')
    text = " ".join(re.split('\W+', text))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = []

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
            
    return (", ".join(filtered_sentence))

print(clean_text("This is a sample sentence, showing off the stop words filtration."))

This, sample, sentence, showing, stop, words, filtration


In [4]:
df['cleaned'] = df['title'].apply(clean_text)
df.head()

Unnamed: 0,title,url,date,cleaned
0,Minister says she had ‘no role’ in approving M...,https://www.freemalaysiatoday.com/category/nat...,2021-10-09,"Minister, says, role, approving, Mitra, grants"
1,"Return your cars, CM tells exco members who quit",https://www.freemalaysiatoday.com/category/nat...,2021-10-09,"Return, cars, CM, tells, exco, members, quit"
2,"Question mark over ‘toothless panels’ on Adib,...",https://www.freemalaysiatoday.com/category/nat...,2021-10-09,"Question, mark, toothless, panels, Adib, Thomas"
3,"Melaka PN is strong enough without Umno, says ...",https://www.freemalaysiatoday.com/category/nat...,2021-10-09,"Melaka, PN, strong, enough, without, Umno, say..."
4,"8,743 Covid-19 cases, 14,422 recoveries",https://www.freemalaysiatoday.com/category/nat...,2021-10-09,"8743, Covid, 19, cases, 14422, recoveries"


In [5]:
df = df.drop(['title', 'url'], axis = 1)
df.head()

Unnamed: 0,date,cleaned
0,2021-10-09,"Minister, says, role, approving, Mitra, grants"
1,2021-10-09,"Return, cars, CM, tells, exco, members, quit"
2,2021-10-09,"Question, mark, toothless, panels, Adib, Thomas"
3,2021-10-09,"Melaka, PN, strong, enough, without, Umno, say..."
4,2021-10-09,"8743, Covid, 19, cases, 14422, recoveries"


In [6]:
df = df.groupby(['date'])['cleaned'].apply(lambda x: ','.join(x)).reset_index()
df['cleaned'] = df['cleaned'].apply(lambda x: x.split(","))
df.head()

Unnamed: 0,date,cleaned
0,2019-12-31,"[PIA, special, flight, brings, home, hund..."
1,2020-01-01,"[160, caught, smoking, eateries, first, d..."
2,2020-01-02,"[Petition, support, Maszlee, gets, 100000,..."
3,2020-01-03,"[No, one, recognises, Mavcom, civil, avia..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."


In [7]:
df.shape

(649, 2)

In [8]:
df.drop(index=df.index[0], 
        axis=0, 
        inplace=True)
df.head()

Unnamed: 0,date,cleaned
1,2020-01-01,"[160, caught, smoking, eateries, first, d..."
2,2020-01-02,"[Petition, support, Maszlee, gets, 100000,..."
3,2020-01-03,"[No, one, recognises, Mavcom, civil, avia..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say..."


I want to create a word cloud, so I count the number of tokens. Then, I save to csv.

In [9]:
from collections import Counter

df['counter'] = df['cleaned'].apply(lambda x: Counter(x))
df.head()

Unnamed: 0,date,cleaned,counter
1,2020-01-01,"[160, caught, smoking, eateries, first, d...","{'160': 1, ' caught': 1, ' smoking': 1, ' eate..."
2,2020-01-02,"[Petition, support, Maszlee, gets, 100000,...","{'Petition': 1, ' support': 1, ' Maszlee': 4, ..."
3,2020-01-03,"[No, one, recognises, Mavcom, civil, avia...","{'No': 1, ' one': 1, ' recognises': 1, ' Mavco..."
4,2020-01-04,"[PH, discuss, PM, handover, says, Syed, ...","{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"[Umno, make, constitutional, proposal, say...","{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [10]:
df = df.drop(['cleaned'], axis = 1)
df.head()

Unnamed: 0,date,counter
1,2020-01-01,"{'160': 1, ' caught': 1, ' smoking': 1, ' eate..."
2,2020-01-02,"{'Petition': 1, ' support': 1, ' Maszlee': 4, ..."
3,2020-01-03,"{'No': 1, ' one': 1, ' recognises': 1, ' Mavco..."
4,2020-01-04,"{'PH': 1, ' discuss': 1, ' PM': 2, ' handover'..."
5,2020-01-05,"{'Umno': 1, ' make': 1, ' constitutional': 1, ..."


In [11]:
df.to_csv("preprocessed_fmt_9-10-21.csv", index=False)