# **TEXT PREPROCESSING FUNCTION**

In [8]:
import pandas as pd
import re #regular expressions library for text manipulation
import string
import numpy as np
import unicodedata

from bs4 import BeautifulSoup
import html2text

from glob import glob

#NLP libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from spacy.lang.en import English

import itertools
from autocorrect import Speller

#for wordclouds
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

In [39]:
def clean_comments(filepath):
    """Load the raw comments CSV and return it in the shared corpus layout.

    Column labels are stripped of surrounding whitespace, rows whose ``Body``
    is the literal ``'[deleted]'`` are removed, rows with a missing
    ``isSubmitter`` value are removed, the first row per ``ID`` is kept, and
    ``year`` is derived from ``Date_Created``.

    Parameters
    ----------
    filepath : str or path-like
        CSV file containing at least the columns ``ID``, ``Body``,
        ``Date_Created`` and ``isSubmitter``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text_type', 'ID', 'year', 'long_text']`` on a fresh
        0..n-1 index; ``text_type`` is ``'comment'`` on every row.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the file.
    """
    raw = pd.read_csv(filepath, low_memory=False)

    # Normalise the labels BEFORE any column is looked up by name; stripping
    # them afterwards cannot help a lookup that has already failed.
    raw.columns = raw.columns.str.strip()

    # Keep rows that are not deleted and that carry an isSubmitter value.
    # NOTE(review): the earlier comment called this "missing id" although the
    # test is on isSubmitter - confirm which column is meant.
    keep = (raw['Body'] != '[deleted]') & raw['isSubmitter'].notna()

    comments = (
        raw.loc[keep]
        .drop_duplicates(subset=['ID'], ignore_index=True)
        .assign(
            year=lambda frame: pd.to_datetime(frame['Date_Created']).dt.year,
            text_type='comment',
        )
        .rename(columns={'Body': 'long_text'})
    )

    # Score, Author_ID and the remaining raw columns never reach the output,
    # so they are neither cast nor renamed here; selecting the four output
    # columns discards them.
    return comments[['text_type', 'ID', 'year', 'long_text']]



In [40]:
def clean_submissions(filepath):
    """Load the raw submissions CSV and return it in the shared corpus layout.

    The first row per ``ID`` is kept and the post title and post text are
    joined with a single space into ``long_text``. A missing title or a
    missing post text is treated as an empty string, so one absent field no
    longer turns the whole ``long_text`` into NaN.

    Parameters
    ----------
    filepath : str or path-like
        CSV file containing at least the columns ``ID``, ``Title``,
        ``Post Text`` and ``year``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['text_type', 'ID', 'year', 'long_text']`` on a fresh
        0..n-1 index; ``text_type`` is ``'submission'`` on every row.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the file.
    ValueError
        If ``year`` contains values that cannot be cast to int.
    """
    raw = pd.read_csv(filepath, low_memory=False)

    #drop duplicate posts, keeping the first occurrence of each ID
    posts = raw.drop_duplicates(subset=['ID'], ignore_index=True)

    #post title + post text; both sides are filled because string + NaN is NaN
    long_text = posts['Title'].fillna('') + " " + posts['Post Text'].fillna('')

    submissions = posts.assign(
        long_text=long_text,
        year=posts['year'].astype('int'),
        #marks the row as a submission entry
        text_type='submission',
    )

    # 'Date Created' and the other raw columns are not part of the output,
    # so they are not parsed; selecting the four output columns discards them.
    return submissions[['text_type', 'ID', 'year', 'long_text']]
    

In [42]:
# NOTE(review): both locations are absolute paths on one machine; they have to
# be edited before the notebook can run anywhere else.
comments_filepath = '/Users/amenaghawon/Documents/GitHub/Final_Project/Data/comments.csv'
submissions_filepath = '/Users/amenaghawon/Documents/GitHub/Final_Project/Data/full_posts.csv'

# comments first, submissions second, stacked under one fresh 0..n-1 index
cleaned_frames = [
    clean_comments(comments_filepath),
    clean_submissions(submissions_filepath),
]
data = pd.concat(cleaned_frames, ignore_index=True)

data

Unnamed: 0,text_type,ID,year,long_text
0,comment,gtfo2hl,2021,"*Cuntry roads, take me hoem*"
1,comment,gtfqkbv,2021,"That’s been there for several years, sent a pi..."
2,comment,gtfou07,2021,I am single and I have not traveled to any cun...
3,comment,gtfrgpe,2021,What happens when you shop at dragon mart...
4,comment,gtg5mwv,2021,I am cunting on them to do so 😅
...,...,...,...,...
130050,submission,14f46ji,2023,"Best beauty saloons in Dubai? Hello fellas, I ..."
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...
130052,submission,14f4ri3,2023,Scam ? Healthy.line My sister has a CBD debit ...
130053,submission,14f4k3r,2023,Thoughts on Expo City properties? Anyone else ...


In [43]:
# Summarise the combined frame: column dtypes, non-null counts and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130055 entries, 0 to 130054
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text_type  130055 non-null  object
 1   ID         130055 non-null  object
 2   year       130055 non-null  int64 
 3   long_text  130055 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.0+ MB


In [44]:
# number of corpus rows (comments and submissions together) per year
year_group = data.groupby(by='year')
for year, n_rows in year_group.size().items():
    print(year, n_rows)

2012 11
2013 60
2014 143
2015 206
2016 787
2017 1131
2018 1686
2019 4238
2020 29987
2021 28188
2022 36776
2023 26842


## **TEXT PREPROCESSING**

### **Convert all Text to Lowercase**

In [45]:
# Vectorised lower-casing through the string accessor instead of a per-row
# Python lambda. The untouched text stays in `long_text`; every later cleaning
# step works on `clean_text`. Missing values propagate as NaN rather than
# raising.
data['clean_text'] = data['long_text'].str.lower()

data.sample(n=5)

Unnamed: 0,text_type,ID,year,long_text,clean_text
87623,comment,fsu2v2a,2020,just insert a probe to your PP. you know.. the...,just insert a probe to your pp. you know.. the...
89650,comment,jd1jwm8,2023,these kids think like a MACHINE !!! some kind ...,these kids think like a machine !!! some kind ...
2273,comment,ft3qv9g,2020,How much is the cost ?,how much is the cost ?
80046,comment,gzb0k0t,2021,GN would have spelt loose incorrectly in their...,gn would have spelt loose incorrectly in their...
31242,comment,fm71jnd,2020,Feels. My face needs landscaping rn. Going bac...,feels. my face needs landscaping rn. going bac...


## **Insert Word Count**

In [46]:
# Whitespace-separated word count of `clean_text` as it stands at this point,
# computed with the vectorised string accessor instead of a per-row lambda.
data['word_count'] = data['clean_text'].str.split().str.len()

data.sort_values(by='word_count', ascending = False)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...","psa: immigration to canada, australia, nz hell...",3188
126556,submission,jh5hjq,2020,List of discounts and working promo codes in D...,list of discounts and working promo codes in d...,1945
126510,submission,tce5mm,2022,Unknown accident and the plight with Dubai Pol...,unknown accident and the plight with dubai pol...,1926
127807,submission,10kwnuo,2023,A complete list of LIFE HACK / TIPS / FACTS ab...,a complete list of life hack / tips / facts ab...,1593
110203,comment,fpg1e6j,2020,Here is the text for those that can’t get by t...,here is the text for those that can’t get by t...,1352
...,...,...,...,...,...,...
119467,comment,hsr1vcj,2022,Ok,ok,1
119468,comment,hsr2kg6,2022,Ok,ok,1
119469,comment,hsr3ak1,2022,Ok,ok,1
11417,comment,gectffr,2020,Superb.,superb.,1


## **Remove URLs**

In [69]:
# Put the column-width display option back to the pandas default.
# NOTE(review): this cell's execution count (69) is higher than its
# neighbours', so it was run out of order - confirm it belongs here.
pd.reset_option('display.max_colwidth')

In [47]:
#index of rows with urls
html_index = data[data['long_text'].str.contains("https")].index
data.loc[html_index]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
17,comment,gtgyqzz,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music https://youtu....,5
83,comment,gpqwzwc,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...,21
94,comment,gpqwrek,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,https://www.instagram.com/p/ckhzbwph0c4/?igshi...,7
174,comment,gq3tah4,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo...",96
293,comment,jb5kpz4,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\nhttps://maps.app.go...,9
...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help‼️uni qualification equivalency letter for...,81
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa: villas are at high risk of fire during th...,106
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...","daily random discussion thread hello,\n\nthis ...",69
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency - confus...,135


In [48]:
#regex pattern for urls
url_pattern = r'https?://\S+'
#replace url with empty string
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(url_pattern, ' ', text, flags=re.MULTILINE))

data.loc[html_index]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
17,comment,gtgyqzz,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music,5
83,comment,gpqwzwc,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...,21
94,comment,gpqwrek,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,\n\nhere’s a video explaining the same.,7
174,comment,gq3tah4,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo...",96
293,comment,jb5kpz4,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\n \n\nthen follow th...,9
...,...,...,...,...,...,...
129949,submission,14djkvs,2023,Help‼️Uni qualification equivalency letter for...,help‼️uni qualification equivalency letter for...,81
130007,submission,14d2uk9,2023,PSA: Villas are at high risk of fire during th...,psa: villas are at high risk of fire during th...,106
130009,submission,14d0c29,2023,"Daily Random Discussion Thread Hello,\n\nThis ...","daily random discussion thread hello,\n\nthis ...",69
130040,submission,14cieby,2023,Need help with applying for residency - confus...,need help with applying for residency - confus...,135


## **Remove Accents from Characters**

In [49]:
def _to_ascii(text):
    """Decompose `text` with NFKD and keep only the characters that encode as ASCII."""
    decomposed = unicodedata.normalize('NFKD', text)
    # errors='ignore' silently drops every character that has no ASCII encoding,
    # which covers combining accent marks left behind by the decomposition.
    # NOTE(review): it also drops emoji, curly quotes and non-Latin script -
    # confirm that removing more than accents is intended here.
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')


data['clean_text'] = data['clean_text'].apply(_to_ascii)

data.sample(n=5)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
116465,comment,faqgfdh,2019,"That is certainly true, but the view - or bein...","that is certainly true, but the view - or bein...",24
97940,comment,h3zcta8,2021,Glad to hear. Looking forward to the full post.,glad to hear. looking forward to the full post.,9
42920,comment,fs6j1o2,2020,I approve of this message,i approve of this message,5
1806,comment,eabf23e,2018,The Antichrist might be in Dubai. We should lo...,the antichrist might be in dubai. we should lo...,13
71590,comment,h55fn8p,2021,Try Dubai Community Healthcare Center. Take ca...,try dubai community healthcare center. take ca...,8


## **Remove Punctuation**

In [50]:
#index of some rows with punctuations
checker_list = ['ifquow','gzl2ec','147gsfl','vtelex',
 '12pqx6m','fuxrd2','2ui6wu','l4gz0u','14f4uyi','14f8d30']

rows_to_check = data[data['ID'].isin(checker_list)].index.tolist()

rows_to_check.extend([32003, 116022,18460,5786,30109])
print(rows_to_check)

[125800, 126269, 127899, 128041, 128388, 129114, 129235, 129297, 130051, 130054, 32003, 116022, 18460, 5786, 30109]


In [51]:
#regex pattern for punctuations
punctuation_pattern = r'[^\w\s]'

#remove punctuations using `re.sub() method
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(r'[^\w\s]', ' ', text))

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys we made it yay,5
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...,3188
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...,72
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...,58
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i m currently looking for jobs te...,26
129114,submission,fuxrd2,2020,FlyDubai Cancelled My Flight 😒😩 At the airport...,flydubai cancelled my flight at the airport i...,63
129235,submission,2ui6wu,2015,ELI5: Why do Mosques have to be so obnoxiously...,eli5 why do mosques have to be so obnoxiously...,92
129297,submission,l4gz0u,2021,BEST Karak/ Zafrani/ Morrocan Tea in Dubai? He...,best karak zafrani morrocan tea in dubai he...,35
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...,found the r dubai redditors who kept telling m...,19
130054,submission,14f8d30,2023,What to do when the neighbour parks like this?...,what to do when the neighbour parks like this ...,42


## **Remove New Line & Tab**

In [52]:
#remove `\n` from text
data['clean_text'] = data['clean_text'].str.replace('\n', ' ')
#remove `\t` from text
data['clean_text'] = data['clean_text'].str.replace('\t', ' ')

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys we made it yay,5
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...,3188
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...,72
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...,58
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i m currently looking for jobs te...,26
129114,submission,fuxrd2,2020,FlyDubai Cancelled My Flight 😒😩 At the airport...,flydubai cancelled my flight at the airport i...,63
129235,submission,2ui6wu,2015,ELI5: Why do Mosques have to be so obnoxiously...,eli5 why do mosques have to be so obnoxiously...,92
129297,submission,l4gz0u,2021,BEST Karak/ Zafrani/ Morrocan Tea in Dubai? He...,best karak zafrani morrocan tea in dubai he...,35
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...,found the r dubai redditors who kept telling m...,19
130054,submission,14f8d30,2023,What to do when the neighbour parks like this?...,what to do when the neighbour parks like this ...,42


## **Remove Stop Words - SpaCy**

In [53]:
# The full pipeline is loaded because the lemmatization step further down
# reuses `nlp`.
nlp = spacy.load('en_core_web_md')


def _drop_stop_words(text):
    """Return `text` re-joined with single spaces after removing spaCy stop words.

    `is_stop` is a lexical flag that is already set by the tokenizer, so only
    `nlp.make_doc` (tokenizer only) is run. This yields the same tokens and
    flags as `nlp(text)` without running the statistical components on every
    row.
    """
    return ' '.join(token.text for token in nlp.make_doc(text) if not token.is_stop)


data['clean_text'] = data['clean_text'].apply(_drop_stop_words)

data.sample(n=10)

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count
90134,comment,jl09bgw,2023,Indeed😍😍😍,,1
16553,comment,g347wa5,2020,Exactly. It was horrifying seeing those other ...,exactly horrifying seeing comments,8
81291,comment,j6ro77h,2023,Go visit any exchange stores and they'll happi...,visit exchange stores ll happily change,12
2979,comment,iswu8wf,2022,Yessir we are a small team.,yessir small team,6
4998,comment,iiowsiz,2022,in regards to grad school? maybe. undergrad? n...,regards grad school maybe undergrad,9
30939,comment,h1a0yvm,2021,I try not to judge because I think there are p...,try judge think people handle situation t si...,95
127735,submission,14awhgz,2023,Issues with RAKEZ freezone Has anyone recently...,issues rakez freezone recently faced issues ra...,60
95347,comment,ggod7jg,2020,Now I'm craving chips oman paratta...thanks,m craving chips oman paratta thanks,6
76585,comment,h3to1df,2021,Can you please add the address! Took an Uber h...,add address took uber home instead drunk dri...,23
77603,comment,f5kdgxr,2019,Why'd you have such a bad time bro?,d bad time bro,8


## **Word Tokenization - NLTK**

In [54]:
# one list of NLTK word tokens per row of cleaned text
data['tokens'] = data['clean_text'].map(word_tokenize)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys yay,5,"[guys, yay]"
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia nz hell...,3188,"[psa, immigration, canada, australia, nz, hell..."
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,reach people asking money grocery experience...,72,"[reach, people, asking, money, grocery, experi..."
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,plan spend eid holiday fun activities trips ...,58,"[plan, spend, eid, holiday, fun, activities, t..."
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi m currently looking jobs temporary perman...,26,"[hi, m, currently, looking, jobs, temporary, p..."
129114,submission,fuxrd2,2020,FlyDubai Cancelled My Flight 😒😩 At the airport...,flydubai cancelled flight airport tears flig...,63,"[flydubai, cancelled, flight, airport, tears, ..."
129235,submission,2ui6wu,2015,ELI5: Why do Mosques have to be so obnoxiously...,eli5 mosques obnoxiously loud don t mean o...,92,"[eli5, mosques, obnoxiously, loud, don, t, mea..."
129297,submission,l4gz0u,2021,BEST Karak/ Zafrani/ Morrocan Tea in Dubai? He...,best karak zafrani morrocan tea dubai he...,35,"[best, karak, zafrani, morrocan, tea, dubai, h..."
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...,found r dubai redditors kept telling don t kno...,19,"[found, r, dubai, redditors, kept, telling, do..."
130054,submission,14f8d30,2023,What to do when the neighbour parks like this?...,neighbour parks like hello dubai community ...,42,"[neighbour, parks, like, hello, dubai, communi..."


## **LEMMATIZATION**

In [55]:
def _lemmatize(token_list):
    """Join the tokens with spaces, run spaCy on the result and return each token's lemma."""
    doc = nlp(' '.join(token_list))
    # NOTE(review): spaCy re-tokenizes the joined string, so the output list is
    # not guaranteed to line up one-to-one with the NLTK tokens - confirm this
    # is acceptable.
    return [tok.lemma_ for tok in doc]


data['tokens'] = data['tokens'].apply(_lemmatize)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
125800,submission,ifquow,2020,GUYS WE MADE IT!!! YAY,guys yay,5,"[guy, yay]"
126269,submission,gzl2ec,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia nz hell...,3188,"[psa, immigration, canada, australia, nz, hell..."
127899,submission,147gsfl,2023,How to reach people who are asking for money/g...,reach people asking money grocery experience...,72,"[reach, people, ask, money, grocery, experienc..."
128041,submission,12pqx6m,2023,How do you plan to spend your Eid holiday? Any...,plan spend eid holiday fun activities trips ...,58,"[plan, spend, eid, holiday, fun, activity, tri..."
128388,submission,vtelex,2022,"Hi everyone, I'm currently looking for jobs te...",hi m currently looking jobs temporary perman...,26,"[hi, m, currently, look, job, temporary, perma..."
129114,submission,fuxrd2,2020,FlyDubai Cancelled My Flight 😒😩 At the airport...,flydubai cancelled flight airport tears flig...,63,"[flydubai, cancel, flight, airport, tear, flig..."
129235,submission,2ui6wu,2015,ELI5: Why do Mosques have to be so obnoxiously...,eli5 mosques obnoxiously loud don t mean o...,92,"[eli5, mosques, obnoxiously, loud, don, t, mea..."
129297,submission,l4gz0u,2021,BEST Karak/ Zafrani/ Morrocan Tea in Dubai? He...,best karak zafrani morrocan tea dubai he...,35,"[good, karak, zafrani, morrocan, tea, dubai, h..."
130051,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...,found r dubai redditors kept telling don t kno...,19,"[find, r, dubai, redditor, keep, tell, don, t,..."
130054,submission,14f8d30,2023,What to do when the neighbour parks like this?...,neighbour parks like hello dubai community ...,42,"[neighbour, park, like, hello, dubai, communit..."


## **Remove Short Text Entries**

In [56]:
# Keep only entries of two or more words. `word_count` was computed on the
# lower-cased text before URL, punctuation and stop-word removal. `> 1`
# (rather than `!= 1`) also removes entries that counted zero words.
corpus = data[data['word_count'] > 1]

corpus.shape

(122060, 7)

In [59]:
# NOTE(review): the row index is written as an extra unnamed first column and
# the `tokens` lists are stored as their text representation - confirm that
# whatever reads this file expects both.
corpus_path = '/Users/amenaghawon/Documents/GitHub/Final_Project/Data/corpus.csv'
corpus.to_csv(corpus_path)
print ('csv file created')

csv file created
