# 2. Script for obtaining word frequencies in comment corpora

## Set Up

In [39]:
import pandas as pd

In [40]:
path = "C:\\Users\\nicol\\Documents\\2021-2022\\QMSS\\Spring'22\\QMSS 5999 Thesis\\data\\"

# Import comment data
reddit_raw = pd.read_csv(path+'comments_reddit_all.csv', index_col=0, dtype={'comment_id': 'str', 'text': 'str'})
fb_raw = pd.read_csv(path+'comments_fb_all.csv', index_col=0, dtype={'comment_id': 'str', 'text': 'str'})
fb_raw = fb_raw[['comment_id', 'text']]

In [41]:
# Concatenate both FB and Reddit data into one DataFrame
all_raw = pd.concat([reddit_raw, fb_raw], ignore_index=True)

In [42]:
# Inspect DataFrame and get dimensions
display(all_raw.head())
display(all_raw.tail())
display(all_raw.shape)

Unnamed: 0,comment_id,text
0,gww5drd,ALL MY PLANS ARE GONE NOOOOOOOO
1,gww5jfz,&gt; This means people will no longer be allow...
2,gww68ne,phase 2 lai liao wah shag
3,gww6icr,Government fucks up and takes it out on the po...
4,gww6luf,Gyms also closed till 30th. PepeHands. I just ...


Unnamed: 0,comment_id,text
15943,10158807617032934,Top fan\nTimothy Tan
15944,10158807627202934,👀👀👀
15945,10158807627087934,🙄🙄🙄🙄🙄
15946,10158807433227934,Hmm… THOT AH ONG SAID WE BEGINNING TO SEE LIGH...
15947,10158806329392934,"Eh, i thought GKY said the numbers are stable ..."


(15948, 2)

## Preprocessing text

### Clean text

In [32]:
# Function for cleaning text
def clean_text_freq(text):
    import re
    import html
    text1 = html.unescape(text) # remove HTML encoded characters
    text2 = re.sub(r'â€™', '\'', text1) # replce â€™ with apostrophe
    text3 = re.sub(r'\n', '', text2) # remove excess line breaks between links
    text4 = re.sub(r'\n', ' ', text3) # replace remaining line breaks with spaces
    text5 = re.sub(r'#\S+', '', text4) # remove hashtags
    text6 = re.sub(r'http\S+', '', text5) # remove links starting with http
    text7 = re.sub(r'\S+\.com\S+', '', text6) # remove links not starting with http
    text8 = re.sub(r'[^A-Za-z]+', ' ', text7).strip().lower() # remove punctuations and digits 
    return(text8)

In [33]:
# Apply function to DataFrame
all_raw['text_clean_freq'] = all_raw.text.apply(clean_text_freq)

In [7]:
# Inspect DataFrame
all_raw.head()

Unnamed: 0,comment_id,text,text_clean_freq
0,gww5drd,ALL MY PLANS ARE GONE NOOOOOOOO,all my plans are gone noooooooo
1,gww5jfz,&gt; This means people will no longer be allow...,this means people will no longer be allowed to...
2,gww68ne,phase 2 lai liao wah shag,phase lai liao wah shag
3,gww6icr,Government fucks up and takes it out on the po...,government fucks up and takes it out on the po...
4,gww6luf,Gyms also closed till 30th. PepeHands. I just ...,gyms also closed till th pepehands i just want...


### Remove stop words

In [8]:
# Function for removing stop words
def remove_sw(text):
    from nltk.corpus import stopwords
    sw = set(stopwords.words('English'))
    text_filtered = [word for word in text.split() if not word in sw] 
    text_filtered = ' '.join(text_filtered) 
    return(text_filtered)

In [9]:
# Apply function to DataFrame
all_raw['text_no_sw'] = all_raw.text_clean_freq.apply(remove_sw)

In [10]:
# Inspect DataFrame
all_raw.head()

Unnamed: 0,comment_id,text,text_clean_freq,text_no_sw
0,gww5drd,ALL MY PLANS ARE GONE NOOOOOOOO,all my plans are gone noooooooo,plans gone noooooooo
1,gww5jfz,&gt; This means people will no longer be allow...,this means people will no longer be allowed to...,means people longer allowed digitally check sc...
2,gww68ne,phase 2 lai liao wah shag,phase lai liao wah shag,phase lai liao wah shag
3,gww6icr,Government fucks up and takes it out on the po...,government fucks up and takes it out on the po...,government fucks takes population changes work...
4,gww6luf,Gyms also closed till 30th. PepeHands. I just ...,gyms also closed till th pepehands i just want...,gyms also closed till th pepehands want rock c...


### Tokenize comment corpus

In [11]:
# Transform text in DataFrame to a single string
all_raw_string = ' '.join(all_raw['text_no_sw']) 

In [12]:
# Tokenise the whole corpus
from nltk.tokenize import word_tokenize
tokens_all = word_tokenize(all_raw_string)

In [13]:
# Obtain tokens not in NLTK's English dictionary (to identify non-English words that need to be parsed/replaced)
import nltk
from nltk.corpus import words
from nltk.corpus import wordnet

nltk_words = words.words()
tokens_no_eng = [tok for tok in tokens_all if not tok in nltk_words if not wordnet.synsets(tok)]

## Obtain top 1000 tokens by frequency

### Whole corpus

In [14]:
# Obtain top 1000 tokens from the whole corpus
from nltk.probability import FreqDist
fdist_all = FreqDist(tokens_all)
tokens_all_top1000 = fdist_all.most_common(1000)

In [15]:
# Inspect top 20 
tokens_all_top1000[0:20]

[('people', 2310),
 ('covid', 1640),
 ('like', 1446),
 ('still', 1269),
 ('cases', 1268),
 ('vaccinated', 1212),
 ('get', 1171),
 ('go', 1129),
 ('one', 1022),
 ('even', 973),
 ('time', 920),
 ('also', 900),
 ('singapore', 894),
 ('need', 883),
 ('us', 870),
 ('back', 838),
 ('think', 834),
 ('going', 784),
 ('open', 752),
 ('government', 715)]

In [16]:
# Convert to DataFrame and inspect
tokens_all_top1000_df = pd.DataFrame(tokens_all_top1000, columns=['token', 'freq'])
tokens_all_top1000_df.head()

Unnamed: 0,token,freq
0,people,2310
1,covid,1640
2,like,1446
3,still,1269
4,cases,1268


In [17]:
# Export as CSV
tokens_all_top1000_df.to_csv(path+'comments_all_tokens_top1000.csv', encoding='utf-8-sig')

### Non-English corpus

In [18]:
# Obtain top 1000 non-English tokens
from nltk.probability import FreqDist
fdist_no_eng = FreqDist(tokens_no_eng)
tokens_no_eng_top1000 = fdist_no_eng.most_common(1000)

In [19]:
# Inspect top 20 
tokens_no_eng_top1000[0:20]

[('cb', 456),
 ('ktv', 384),
 ('govt', 375),
 ('lol', 298),
 ('ppl', 244),
 ('etc', 224),
 ('gov', 224),
 ('moh', 222),
 ('wfh', 168),
 ('hbl', 159),
 ('others', 148),
 ('pls', 141),
 ('mrt', 136),
 ('pre', 128),
 ('mtf', 125),
 ('vax', 105),
 ('mmtf', 104),
 ('oyk', 103),
 ('ndp', 103),
 ('hari', 102)]

These non-English tokens will guide me in constructing a dictionary to replace these words with their meaning, to allow the sentiment analysis algorithms to better parse the text and to improve the results. 

Additionally, this list helps me to decide which tokens should be kept:
- For instance, 'cb' is a top token but it has multiple meanings in the corpora as it refers to either a swear word ('cb' short for 'chee bye' which is equivalent in vulgarity to the F word), or the Circuit Breaker lockdown restrictions in April 2020 (which the government and citizens have commonly abbreviated into 'CB'). Given the confusion, I will eliminate this specifc token from the list. I will only retain 'cb' if it is part of another token like 'ccb' which definitely refers to the swear phrase ('chao chee bye') and not the restrictions. 
- I will also not replace tokens which do not have a concise explanation/replacement in the English language, or tokens that are proper nouns.
- Internet slang abbreviations e.g. lol, lmao, etc. will not be replaced as they can be captured by VADER.
- Tokens with frequency <= 50 will not be replaced.

In [20]:
# Convert to DataFrame and export as CSV
tokens_no_eng_top1000_df = pd.DataFrame(tokens_no_eng_top1000, columns=['token', 'freq'])
tokens_no_eng_top1000_df.to_csv(path+'comments_all_tokens_no_eng_top1000.csv', encoding='utf-8-sig')