# **TEXT PREPROCESSING FUNCTION**

In [133]:
import pandas as pd
import re #regular expressions library for text manipulation
import string
import numpy as np
import unicodedata

from bs4 import BeautifulSoup
import html2text

from glob import glob

#NLP libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from spacy.lang.en import English
import spacymoji
import emoji
import contractions

import itertools
from autocorrect import Speller

#for wordclouds
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

In [52]:
def clean_comments(filepath):
    """Load the scraped comments CSV and reduce it to the shared corpus schema.

    Rows are discarded when the body is the literal placeholder
    ``'[deleted]'``, when ``isSubmitter`` is missing, or when the ``ID``
    repeats an earlier row (the first occurrence is kept).

    Parameters
    ----------
    filepath : str or path-like
        CSV file providing at least the columns ``ID``, ``Body``,
        ``Date_Created`` and ``isSubmitter``. Leading/trailing whitespace in
        the header labels is tolerated.

    Returns
    -------
    pandas.DataFrame
        Freshly indexed frame with the columns ``text_type`` (always
        ``'comment'``), ``ID``, ``year`` (taken from ``Date_Created``) and
        ``long_text`` (the comment body).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    raw = pd.read_csv(filepath, low_memory = False)

    # Normalise the labels before any label-based access; doing it later
    # cannot rescue a lookup that has already failed on a padded header.
    raw.columns = raw.columns.str.strip()

    # Keep rows that still have a body and whose isSubmitter flag is present
    keep = (raw['Body'] != '[deleted]') & raw['isSubmitter'].notna()

    # ignore_index=True already yields a clean 0..n-1 index
    comments = raw.loc[keep].drop_duplicates(subset = ['ID'], ignore_index = True)

    # Only ID, Date_Created and Body feed the result, so the remaining
    # columns are simply not selected instead of being converted and dropped.
    return pd.DataFrame({
        'text_type': 'comment',
        'ID': comments['ID'],
        'year': pd.to_datetime(comments['Date_Created']).dt.year,
        'long_text': comments['Body'],
    })



In [53]:
def clean_submissions(filepath):
    """Load the scraped submissions CSV and reduce it to the shared corpus schema.

    Duplicate posts (same ``ID``) are collapsed to their first occurrence and
    the title and post text are combined into a single ``long_text`` field.

    Parameters
    ----------
    filepath : str or path-like
        CSV file providing at least the columns ``ID``, ``Title``,
        ``Post Text`` and ``year``.

    Returns
    -------
    pandas.DataFrame
        Freshly indexed frame with the columns ``text_type`` (always
        ``'submission'``), ``ID``, ``year`` (as integer) and ``long_text``
        (``Title + " " + Post Text``).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    raw = pd.read_csv(filepath, low_memory = False)

    # ignore_index=True already yields a clean 0..n-1 index
    posts = raw.drop_duplicates(subset = ['ID'], ignore_index = True)

    # Either half may be missing. Both are filled before concatenating,
    # because NaN + str is NaN and a NaN long_text breaks the string
    # operations applied to this column afterwards.
    title = posts['Title'].fillna('')
    body = posts['Post Text'].fillna('')

    # Only ID, year and the two text columns feed the result, so the
    # remaining columns are simply not selected.
    return pd.DataFrame({
        'text_type': 'submission',
        'ID': posts['ID'],
        'year': posts['year'].astype('int'),
        'long_text': title + " " + body,
    })
    

In [156]:
# Input files, given relative to the notebook's working directory
comments_filepath = 'Data/comments.csv'
submissions_filepath = 'Data/full_posts.csv'

# Stack the cleaned comments on top of the cleaned submissions and give the
# combined frame one fresh 0..n-1 index
data = pd.concat(
    objs = [
        clean_comments(comments_filepath),
        clean_submissions(submissions_filepath),
    ],
    ignore_index = True,
)

data

Unnamed: 0,text_type,ID,year,long_text
0,comment,gtfo2hl,2021,"*Cuntry roads, take me hoem*"
1,comment,gtfqkbv,2021,"That’s been there for several years, sent a pi..."
2,comment,gtfou07,2021,I am single and I have not traveled to any cun...
3,comment,gtfrgpe,2021,What happens when you shop at dragon mart...
4,comment,gtg5mwv,2021,I am cunting on them to do so 😅
...,...,...,...,...
130050,submission,14f46ji,2023,"Best beauty saloons in Dubai? Hello fellas, I ..."
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...
130052,submission,14f4ri3,2023,Scam ? Healthy.line My sister has a CBD debit ...
130053,submission,14f4k3r,2023,Thoughts on Expo City properties? Anyone else ...


In [157]:
# Print a structural summary of the combined frame: index range, per-column
# non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130055 entries, 0 to 130054
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text_type  130055 non-null  object
 1   ID         130055 non-null  object
 2   year       130055 non-null  int64 
 3   long_text  130055 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.0+ MB


In [158]:
# Number of text entries per calendar year, printed as "<year> <row count>"
year_group = data.groupby(by='year')
for year, n_rows in year_group.size().items():
    print (year, n_rows)

2012 11
2013 60
2014 143
2015 206
2016 787
2017 1131
2018 1686
2019 4238
2020 29987
2021 28188
2022 36776
2023 26842


## **TEXT PREPROCESSING**

### **Convert all Text to Lowercase**

In [159]:
# Start the working column `clean_text` as a lower-cased copy of the raw text;
# `long_text` itself is left untouched for side-by-side comparison
data['clean_text'] = data['long_text'].map(str.lower)

data.sample(n=5)

Unnamed: 0,text_type,ID,year,long_text,clean_text
65964,comment,fz0zciz,2020,The change part is so true \n\n\nI went to the...,the change part is so true \n\n\ni went to the...
84509,comment,j7yb4ek,2023,So pleased to hear you and your family are saf...,so pleased to hear you and your family are saf...
87781,comment,fxa15xz,2020,ohh now i understand,ohh now i understand
127870,submission,148e8sb,2023,Home internet Which home internet connection i...,home internet which home internet connection i...
88199,comment,jk7ntm7,2023,"Go to Rasasi in Deira, they have this amazing ...","go to rasasi in deira, they have this amazing ..."


## **Insert Word Count**

In [160]:
# Words are whitespace-separated chunks: split each text, then count the pieces
data['word_count'] = data['clean_text'].str.split().str.len()

# Longest entries first
data.sort_values('word_count', ascending = False)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...","psa: immigration to canada, australia, nz hell...",3188
126556,submission,jh5hjq,2020,List of discounts and working promo codes in D...,list of discounts and working promo codes in d...,1945
126510,submission,tce5mm,2022,Unknown accident and the plight with Dubai Pol...,unknown accident and the plight with dubai pol...,1926
127807,submission,10kwnuo,2023,A complete list of LIFE HACK / TIPS / FACTS ab...,a complete list of life hack / tips / facts ab...,1593
110203,comment,fpg1e6j,2020,Here is the text for those that can’t get by t...,here is the text for those that can’t get by t...,1352
...,...,...,...,...,...,...
119467,comment,hsr1vcj,2022,Ok,ok,1
119468,comment,hsr2kg6,2022,Ok,ok,1
119469,comment,hsr3ak1,2022,Ok,ok,1
11417,comment,gectffr,2020,Superb.,superb.,1


## **Translate Emoticons**

In [154]:
# NOTE: this whole cell is one triple-quoted string literal, so nothing in it is
# executed; the cell merely evaluates to (and echoes) the string. The text is the
# disabled emoji step: take the first emoji found in `long_text` and store it plus
# its demojized name in the columns `emoticons` / `translated_emojis`.
# NOTE(review): those two columns appear in one later recorded output, presumably
# from an earlier run with this code enabled - confirm before relying on them.
"""def find_emoji(text):
    try:
        emoticon_details = next(emoji.analyze(text, join_emoji=True))
        emoticon = emoticon_details.chars
        #translated_emoji = emoji.demojize(emoticon.chars)
    except StopIteration:
        emoticon = ''  # Handle the case when no emoji is found
    return  emoticon

data['emoticons'] = data['long_text'].apply (lambda text: find_emoji(text) )
data['translated_emojis'] = data['emoticons'].apply(lambda text: emoji.demojize(text))

emoji_index = [3709,33734,129114,100878]

data.loc[emoji_index]"""

"def find_emoji(text):\n    try:\n        emoticon_details = next(emoji.analyze(text, join_emoji=True))\n        emoticon = emoticon_details.chars\n        #translated_emoji = emoji.demojize(emoticon.chars)\n    except StopIteration:\n        emoticon = ''  # Handle the case when no emoji is found\n    return  emoticon\n\ndata['emoticons'] = data['long_text'].apply (lambda text: find_emoji(text) )\ndata['translated_emojis'] = data['emoticons'].apply(lambda text: emoji.demojize(text))\n\nemoji_index = [3709,33734,129114,100878]\n\ndata.loc[emoji_index]"

## **Expand Word Contractions**

In [161]:
# Rewrite contracted forms in full via the `contractions` package
# (the recorded sample shows e.g. "can’t" -> "cannot")
data['clean_text'] = data['clean_text'].apply(contractions.fix)

data.sample(n=5)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
109237,comment,frut51l,2020,why not tag the original article link though? ...,why not tag the original article link though? ...,23
23953,comment,fyjkcja,2020,You just can’t get in or out of the place,you just cannot get in or out of the place,10
84540,comment,j7zokjv,2023,Yep! been using Nexen on all our family cars f...,yep! been using nexen on all our family cars f...,21
75532,comment,jo3rgwg,2023,So you're gonna remove the comment and ban the...,so you are going to remove the comment and ban...,10
67114,comment,iq2hceh,2022,Antiquated and archaic rules that make zero se...,antiquated and archaic rules that make zero se...,29


## **Remove URLs**

In [140]:
# Put pandas' `display.max_colwidth` option back to its default value
pd.reset_option('display.max_colwidth')

In [150]:
#index of rows whose original text contains a link
# The probe uses the same scheme prefix as the URL-removal regex (http:// as well
# as https://), ignores letter case because `long_text` is not lower-cased, and
# treats a missing text as "no link" so the boolean mask stays valid.
# The name `html_index` is kept because later cells refer to it.
html_index = data[data['long_text'].str.contains(r'https?://', case = False, na = False)].index
data.loc[html_index]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,emoticons,translated_emojis
17,comment,gtgyqzz,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music https://youtu....,5,,
83,comment,gpqwzwc,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...,21,,
94,comment,gpqwrek,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,https://www.instagram.com/p/ckhzbwph0c4/?igshi...,7,,
174,comment,gq3tah4,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo...",96,,
293,comment,jb5kpz4,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\nhttps://maps.app.go...,9,,
...,...,...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help‼️uni qualification equivalency letter for...,81,‼️,:double_exclamation_mark:
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa: villas are at high risk of fire during th...,106,,
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...","daily random discussion thread hello,\n\nthis ...",69,,
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency - confus...,135,,


In [162]:
#regex pattern for urls: an http:// or https:// prefix, or a bare "www." host,
#followed by everything up to the next whitespace character
url_pattern = r'(?:https?://|\bwww\.)\S+'

#replace each url with a single space (not an empty string) so the words on
#either side of it are not fused together
data['clean_text'] = data['clean_text'].str.replace(url_pattern, ' ', regex = True)

data.loc[html_index]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
17,comment,gtgyqzz,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music,5
83,comment,gpqwzwc,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...,21
94,comment,gpqwrek,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,\n\nhere is a video explaining the same.,7
174,comment,gq3tah4,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo...",96
293,comment,jb5kpz4,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\n \n\nthen follow th...,9
...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help‼️uni qualification equivalency letter for...,81
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa: villas are at high risk of fire during th...,106
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...","daily random discussion thread hello,\n\nthis ...",69
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency - confus...,135


## **Remove Accents from Characters**

In [163]:
# NFKD decomposes characters (an accented letter becomes base letter + combining
# mark); encoding to ASCII with errors='ignore' then drops every character that
# has no ASCII form, and the bytes are decoded back to text
data['clean_text'] = (
    data['clean_text']
    .str.normalize('NFKD')
    .str.encode('ASCII', errors = 'ignore')
    .str.decode('utf-8')
)

data.sample(n=5)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
97379,comment,h2o3ya0,2021,"In 2020, when I was looking for jobs right out...","in 2020, when i was looking for jobs right out...",105
55725,comment,gjbxxt3,2021,That's just beautiful.,that is just beautiful.,3
117506,comment,fb56toh,2019,"Great news! Next up, Deliveroo and Uber Eats n...","great news! next up, deliveroo and uber eats n...",20
41482,comment,hxh4gzs,2022,And corporate tax fee,and corporate tax fee,4
46821,comment,ir1h7cs,2022,"Hi! I'm from the Philippines! \n\n\nAlso, I'm...","hi! i am from the philippines! \n\n\nalso, i ...",19


## **Remove Punctuation**

In [169]:
# Hand-picked entries used to eyeball the following cleaning steps:
# some posts selected by ID, a few fixed row numbers, and every row with a link
checker_list = [
    'ifquow', 'gzl2ec', '147gsfl', 'vtelex', '12pqx6m',
    'fuxrd2', '2ui6wu', 'l4gz0u', '14f4uyi', '14f8d30',
]

rows_to_check = [
    *data[data['ID'].isin(checker_list)].index.tolist(),
    32003, 116022, 18460, 5786, 30109,
    *html_index,
]

print(rows_to_check[:5])

[125800, 126269, 127899, 128041, 128388]


In [170]:
#regex pattern for punctuation: any character that is neither a word character
#(letter, digit, underscore) nor whitespace
punctuation_pattern = r'[^\w\s]'

#replace each punctuation character with a space, using the pattern defined
#above rather than a second copy of the literal
data['clean_text'] = data['clean_text'].str.replace(punctuation_pattern, ' ', regex = True)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys we made it yay,5
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...,3188
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...,72
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...,58
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...,26
...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter for...,81
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa villas are at high risk of fire during th...,106
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello \n\nthis ...,69
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency confus...,135


## **Remove New Line & Tab**

In [171]:
#swap every newline (`\n`) and tab (`\t`) character for a space in one pass
data['clean_text'] = data['clean_text'].str.replace(r'[\n\t]', ' ', regex = True)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys we made it yay,5
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...,3188
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...,72
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...,58
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...,26
...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter for...,81
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa villas are at high risk of fire during th...,106
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello this is...,69
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency confus...,135


## **Remove Stop Words - SpaCy**

In [172]:
# Load the medium English pipeline; `nlp` is reused by the lemmatization step
nlp = spacy.load('en_core_web_md')

# Drop stop words. `is_stop` is a vocabulary-level flag, so only the tokenizer
# (`nlp.make_doc`) is needed here. Calling `nlp(text)` would also run the
# remaining pipeline components on every row without changing the tokens or
# their stop-word flag.
data['clean_text'] = data['clean_text'].apply(
    lambda text: ' '.join(token.text for token in nlp.make_doc(text) if not token.is_stop)
)

data.sample(n=10)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
90525,comment,jl2gmh4,2023,Correction: every person that drives a car in ...,correction person drives car country world e...,58
78881,comment,gi337o6,2021,I don't think any of our actions are carbon ne...,think actions carbon neutral firewo...,39
118143,comment,gmqht9m,2021,That's so cool! I have a handful of collection...,cool handful collection different emirati di...,34
128015,submission,136m9fd,2023,T3 arrivals parking charges Hi guys I found th...,t3 arrivals parking charges hi guys found link...,26
105561,comment,ig3o52r,2022,"Hahaha use jaldi jaldi instead of fast fast, i...",hahaha use jaldi jaldi instead fast fast mak...,22
101938,comment,joxfkir,2023,"Flights cost an arm and a leg, i did check che...",flights cost arm leg check cheaper ones fl...,20
80174,comment,jg9c0uc,2023,Not sure why this was aimed at specifically Pa...,sure aimed specifically pakistanis south asi...,65
38222,comment,hr85ocd,2022,Yes. Let’s waste their resources on a missing ...,yes let waste resources missing burger may...,20
113866,comment,isap4t3,2022,What restaurant,restaurant,2
103391,comment,d5buby5,2016,That's very kind of you. :),kind,6


## **Remove Extra Whitespaces**

In [175]:
# Trim both ends, then collapse every run of whitespace into one space.
# The pattern is a raw string: in a plain literal '\s' is an invalid escape
# sequence that Python warns about.
data['clean_text'] = data['clean_text'].str.strip().str.replace(r'\s+', ' ', regex = True)

sample_rows = [5786,18460, 103391, 129297]

#redo word_count now that stop words and punctuation have been removed
data['word_count'] = data['clean_text'].apply (lambda text: len(text.split()))

data.loc[sample_rows].sort_values(by='word_count', ascending = False)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
5786,comment,ji1smg1,2023,"FULLTEXT\nApr 27, 2023\n\nStand in the middle ...",fulltext apr 27 2023 stand middle teeming meen...,699
18460,comment,jj2t0ih,2023,"FULL TEXT:\n\nBy Archana Narayanan, Abeer Abu ...",text archana narayanan abeer abu omar zainab f...,617
129297,submission,l4gz0u,2021,BEST Karak/ Zafrani/ Morrocan Tea in Dubai? He...,best karak zafrani morrocan tea dubai heya bes...,19
103391,comment,d5buby5,2016,That's very kind of you. :),kind,1


## **Word Tokenization - NLTK**

In [176]:
# Split every cleaned text into a list of word tokens with NLTK
data['tokens'] = data['clean_text'].apply(word_tokenize)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys yay,2,"[guys, yay]"
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia nz hello duba...,1399,"[psa, immigration, canada, australia, nz, hell..."
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,reach people asking money grocery experience t...,26,"[reach, people, asking, money, grocery, experi..."
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,plan spend eid holiday fun activities trips li...,28,"[plan, spend, eid, holiday, fun, activities, t..."
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi currently looking jobs temporary permanent ...,12,"[hi, currently, looking, jobs, temporary, perm..."
...,...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter mast...,38,"[help, uni, qualification, equivalency, letter..."
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa villas high risk fire summer months friend...,49,"[psa, villas, high, risk, fire, summer, months..."
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello daily ran...,48,"[daily, random, discussion, thread, hello, dai..."
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help applying residency confused order st...,74,"[need, help, applying, residency, confused, or..."


## **LEMMATIZATION**

In [177]:
# Each token list is glued back into one string, parsed by spaCy, and replaced
# by the lemma of every token spaCy finds in that string
data['tokens'] = data['tokens'].apply(
    lambda token_list: [word.lemma_ for word in nlp(' '.join(token_list))]
)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys yay,2,"[guy, yay]"
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia nz hello duba...,1399,"[psa, immigration, canada, australia, nz, hell..."
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,reach people asking money grocery experience t...,26,"[reach, people, ask, money, grocery, experienc..."
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,plan spend eid holiday fun activities trips li...,28,"[plan, spend, eid, holiday, fun, activity, tri..."
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi currently looking jobs temporary permanent ...,12,"[hi, currently, look, job, temporary, permanen..."
...,...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter mast...,38,"[help, uni, qualification, equivalency, letter..."
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa villas high risk fire summer months friend...,49,"[psa, villas, high, risk, fire, summer, month,..."
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello daily ran...,48,"[daily, random, discussion, thread, hello, dai..."
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help applying residency confused order st...,74,"[need, help, apply, residency, confused, order..."


## **Remove Short Text Entries**

In [192]:
#redo word_count so the filter below reflects the current `clean_text`
data['word_count'] = data['clean_text'].apply (lambda text: len(text.split()))

#select only rows with more than 2 words (i.e. at least 3 words remain after cleaning)
corpus = data[data['word_count'] > 2]

corpus.shape

(101462, 7)

In [193]:
filename = 'Data/corpus_v4.csv'

def export_csv():
    '''
    Export the pre-processed `corpus` frame to the CSV file named by `filename`.

    The header row is written, every field is quoted (csv.QUOTE_ALL) and the
    index is stored in a column labelled 'index'.
    '''
    # `csv` is not among the imports at the top of this notebook, so it is
    # imported here; without it `csv.QUOTE_ALL` raises a NameError
    import csv

    corpus.to_csv(filename, index_label = 'index', quoting = csv.QUOTE_ALL, header = True)

export_csv()