In [1]:
#pip freeze > modelenv_requirements.txt

In [None]:
#TODO: remove common stop words; add a word-count column; drop rows whose clean_text is empty; drop rows containing Arabic text

<span style="color: red; font-family: Calibri Light;">
  <h1><b>Topic Modelling with BERTopic: with Preprocessing</b></h1>
</span>

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>I. Setting Up Environment</b></h2>
</span>

In [3]:
import pandas as pd
import numpy as np

import ast
import csv
import os
import random
import time

from glob import glob

#NLP libraries
from bertopic import BERTopic

#for vis
import matplotlib.pyplot as plt

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>II. Import Data into DataFrame</b></h2>
</span>

In [4]:
def clean_comments(filepath):
    """
    Load Reddit comments from a CSV file and return a tidy DataFrame.

    Cleaning steps, in order:
      1. strip surrounding whitespace from the column labels (done first so
         that every later column lookup also works on padded headers),
      2. drop rows whose `Body` is missing, '[deleted]' or '[removed]',
      3. drop rows whose `isSubmitter` value is missing,
      4. keep only the first row for each `ID`,
      5. parse `Date_Created` as datetime and derive `year` from it,
      6. tag every row with text_type = 'comment'.

    Columns that are not part of the returned frame (e.g. `Score`, `Author_ID`)
    are simply discarded, so they are neither required nor converted.

    Parameters
    ----------
    filepath : str or path-like
        CSV file with at least the columns `ID`, `Body`, `Date_Created`
        and `isSubmitter`.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: `text_type`, `ID`, `date_created`, `year`,
        `long_text`. The index runs from 0 to n-1.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the file.
    """
    #import data
    df = pd.read_csv(filepath, low_memory = False)

    #normalise the labels before any column is looked up by name
    df.columns = df.columns.str.strip()

    #rows that still carry real text (not missing, not a deletion placeholder)
    has_text = df['Body'].notna() & ~df['Body'].isin(['[deleted]', '[removed]'])

    #rows whose `isSubmitter` flag is present
    has_submitter_flag = df['isSubmitter'].notna()

    df = df[has_text & has_submitter_flag]

    #remove duplicates if any; ignore_index already yields a fresh 0..n-1 index
    df = df.drop_duplicates(subset = ['ID'], ignore_index = True)

    #correct the datetime type once and reuse it for the derived year
    date_created = pd.to_datetime(df['Date_Created'])

    df = df.assign(
        text_type = 'comment',
        date_created = date_created,
        year = date_created.dt.year,
        long_text = df['Body'],
    )

    #keep only the columns shared with the submissions frame, in a fixed order
    return df[['text_type', 'ID', 'date_created', 'year', 'long_text']]

def clean_submissions(filepath):
    """
    Load Reddit submissions/posts from a CSV file and return a tidy DataFrame.

    Cleaning steps, in order:
      1. strip surrounding whitespace from the column labels,
      2. keep only the first row for each `ID`,
      3. build `long_text` as "<Title> <Post Text>"; a missing title or a
         missing post text is treated as an empty string so `long_text` is
         always a string (a NaN here would break later text processing),
      4. parse `Date Created` as datetime and cast `year` to int,
      5. tag every row with text_type = 'submission'.

    Columns that are not part of the returned frame (e.g. `Score`,
    `Post URL`) are simply discarded, so they are not required to exist.

    Parameters
    ----------
    filepath : str or path-like
        CSV file with at least the columns `ID`, `Title`, `Post Text`,
        `Date Created` and `year`.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: `text_type`, `ID`, `date_created`, `year`,
        `long_text`. The index runs from 0 to n-1.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the file.
    """
    df = pd.read_csv(filepath, low_memory = False)

    #normalise the labels before any column is looked up by name
    df.columns = df.columns.str.strip()

    #drop duplicate posts; ignore_index already yields a fresh 0..n-1 index
    df = df.drop_duplicates(subset = ['ID'], ignore_index = True)

    #post title + post text, with missing parts replaced by ''
    long_text = df['Title'].fillna('') + " " + df['Post Text'].fillna('')

    df = df.assign(
        text_type = 'submission',
        date_created = pd.to_datetime(df['Date Created']),
        year = df['year'].astype('int'),
        long_text = long_text,
    )

    #keep only the columns shared with the comments frame, in a fixed order
    return df[['text_type', 'ID', 'date_created', 'year', 'long_text']]
    

In [5]:
#where the dataset files live and which files to pick up
folder_path, file_type = os.path.join("..", "Data"), "*.csv"

#every path inside the data folder that matches the pattern
document_path = glob(
    os.path.join(folder_path, file_type)
)

document_path

['../Data/corpus_v5.csv',
 '../Data/corpus_v4.csv',
 '../Data/filtered_corpus.csv',
 '../Data/corpus_v3.csv',
 '../Data/corpus_v2.csv',
 '../Data/full_posts.csv',
 '../Data/bert_train_data.csv',
 '../Data/training_data.csv',
 '../Data/vocabulary.csv',
 '../Data/emoji_subset.csv',
 '../Data/sampled_subset.csv',
 '../Data/training_corpus.csv',
 '../Data/comments.csv',
 '../Data/corpus.csv']

In [6]:
#source files for the two kinds of Reddit text
comments_filepath = '../Data/comments.csv'
submissions_filepath = '../Data/full_posts.csv'

#clean each source, then stack comments above submissions under one fresh 0..n-1 index
data = pd.concat(
    [
        clean_comments(comments_filepath),
        clean_submissions(submissions_filepath),
    ],
    ignore_index = True,
)

data

Unnamed: 0,text_type,ID,date_created,year,long_text
0,comment,gtfo2hl,2021-04-05 13:00:32,2021,"*Cuntry roads, take me hoem*"
1,comment,gtfqkbv,2021-04-05 13:41:40,2021,"That’s been there for several years, sent a pi..."
2,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...
3,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...
4,comment,gtg5mwv,2021-04-05 16:51:54,2021,I am cunting on them to do so 😅
...,...,...,...,...,...
128878,submission,14f46ji,2023-06-21 14:40:54,2023,"Best beauty saloons in Dubai? Hello fellas, I ..."
128879,submission,14f4uyi,2023-06-21 15:15:27,2023,Found the r/dubai redditors who kept telling m...
128880,submission,14f4ri3,2023-06-21 15:10:25,2023,Scam ? Healthy.line My sister has a CBD debit ...
128881,submission,14f4k3r,2023-06-21 15:00:34,2023,Thoughts on Expo City properties? Anyone else ...


<span style="color: red; font-family: Calibri Light;">
  <h2><b>III. Data Preprocessing</b></h2>
</span>

---

<span style="color: red; font-family: Calibri Light;">
  <h3><b>a. Import preprocessing packages</b></h3>
</span>

In [7]:
import re #regular expressions library for text manipulation
import string
import unicodedata

#from prettytable import PrettyTable

#NLP libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import contractions

#import itertools
#from autocorrect import Speller

#for wordclouds
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

import matplotlib.pyplot as plt

In [8]:
#!python -m spacy download en_core_web_md

In [9]:
#print the number of rows (comments + submissions) for each year
year_group = data.groupby(by='year')
for year, n_rows in year_group.size().items():
    print (year, n_rows)

2012 11
2013 60
2014 143
2015 203
2016 786
2017 1127
2018 1676
2019 4217
2020 29760
2021 28034
2022 36342
2023 26524


<span style="color: red; font-family: Calibri Light;">
  <h3><b>b. convert text to lowercase</b></h3>
</span>

In [10]:
#lowercase with the vectorised string accessor rather than a per-row lambda;
#a missing value stays NaN here instead of raising AttributeError
data['clean_text'] = data['long_text'].str.lower()

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
77608,comment,fln0mv1,2020-03-27 14:23:11,2020,FFS just stay the fuck indoors like everyone e...,ffs just stay the fuck indoors like everyone e...
13119,comment,gfkqk97,2020-12-12 23:56:02,2020,Don’t leave them...get someone to follow up fo...,don’t leave them...get someone to follow up fo...
57904,comment,i7ssgsh,2022-05-08 18:18:21,2022,"Fair enough. Just make informed decisions, as ...","fair enough. just make informed decisions, as ..."
30071,comment,jn4e7kc,2023-06-06 18:11:21,2023,I love this new rule but it does make wonder… ...,i love this new rule but it does make wonder… ...
45355,comment,coi0lcw,2015-02-11 14:29:01,2015,Hope you are bringing along your Italian sport...,hope you are bringing along your italian sport...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>c. expand word contractions</b></h3>
</span>

In [11]:
#expand contractions row by row (e.g. "that's" becomes "that is")
data['clean_text'] = data['clean_text'].apply(contractions.fix)

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
105127,comment,jijjn3j,2023-05-02 13:28:08,2023,I will note that drivers have been getting... ...,i will note that drivers have been getting... ...
33027,comment,jervree,2023-04-03 15:34:33,2023,The difference between a muslim and non-muslim...,the difference between a muslim and non-muslim...
15078,comment,ir6z3x8,2022-10-06 00:00:02,2022,Yes that's one big bonus,yes that is one big bonus
125237,submission,nziruv,2021-06-14 13:20:26,2021,Eva from Emirates NBD is horrible to whoever i...,eva from emirates nbd is horrible to whoever i...
124191,comment,h7k6vuz,2021-08-03 19:13:09,2021,"This isn’t UAE-bashing, as the UAE has done an...","this is not uae-bashing, as the uae has done a..."


<span style="color: red; font-family: Calibri Light;">
  <h3><b>d. remove URLs</b></h3>
</span>

In [12]:
#regex pattern for urls: "http://" or "https://" followed by everything up to the next whitespace
url_pattern = r'https?://\S+'

#index of rows with urls; detection uses the same pattern as the removal below
#(case-insensitive because `long_text` is not lowercased), so the rows shown
#for inspection are exactly the rows the substitution can touch
html_index = data[data['long_text'].str.contains(url_pattern, case = False, regex = True, na = False)].index

#replace each url with a single space
data['clean_text'] = data['clean_text'].str.replace(url_pattern, ' ', regex = True)

data.loc[html_index]

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
17,comment,gtgyqzz,2021-04-05 20:49:12,2021,Meditation And Relaxation Music https://youtu....,meditation and relaxation music
83,comment,gpqwzwc,2021-03-05 10:34:53,2021,I am curious about this Apollo Fintech GSX coi...,i am curious about this apollo fintech gsx coi...
93,comment,gpqwrek,2021-03-05 10:31:45,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,\n\nhere is a video explaining the same.
171,comment,gq3tah4,2021-03-07 19:49:41,2021,"With all due respect, do you expect a law enfo...","with all due respect, do you expect a law enfo..."
290,comment,jb5kpz4,2023-03-06 20:35:11,2023,Here: Dubai Festival City\nhttps://maps.app.go...,here: dubai festival city\n \n\nthen follow th...
...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,help‼️uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,psa: villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...","daily random discussion thread hello,\n\nthis ..."
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,need help with applying for residency - confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>e. remove punctuations</b></h3>
</span>

In [13]:
#IDs of some rows known to contain punctuation, used to eyeball the result
checker_list = ['ifquow','gzl2ec','147gsfl','vtelex',
 '12pqx6m','fuxrd2','2ui6wu','l4gz0u','14f4uyi','14f8d30']

rows_to_check = data[data['ID'].isin(checker_list)].index.tolist()

#a few extra hand-picked rows
rows_to_check.extend([32003, 116022,18460,5786,30109])

#plus every row that originally contained a url
rows_to_check.extend(html_index)

#regex pattern for punctuations: anything that is neither a word character
#(letter, digit, underscore) nor whitespace
punctuation_pattern = r'[^\w\s]'

#replace each punctuation character with a space, using the pattern defined above
data['clean_text'] = data['clean_text'].str.replace(punctuation_pattern, ' ', regex = True)

#NOTE: rows_to_check holds index labels; `iloc` works here only while `data`
#keeps its default 0..n-1 index, where labels and positions coincide
data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...
...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,psa villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello \n\nthis ...
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,need help with applying for residency confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>f. remove new line and tabs</b></h3>
</span>

In [14]:
#swap every newline and every tab for a plain space
data['clean_text'] = (
    data['clean_text']
    .str.replace('\n', ' ')
    .str.replace('\t', ' ')
)

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",hi everyone i am currently looking for jobs t...
...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,help uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,psa villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",daily random discussion thread hello this is...
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,need help with applying for residency confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>f. remove digits (drop non-alphabetic tokens)</b></h3>
</span>

In [15]:
#keep only the whitespace-separated tokens made up entirely of letters;
#a token containing a digit or an underscore is dropped as a whole
data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join(filter(str.isalpha, text.split())))

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
2930,comment,isx9qt6,2022-10-19 15:33:24,2022,Deliveroo is a UK based company,deliveroo is a uk based company
24848,comment,ii0yaak,2022-07-28 22:54:04,2022,ياريال كسر سياييرهم وكسر محلاتهم بالماي وهو مسرع,ياريال كسر سياييرهم وكسر محلاتهم بالماي وهو مسرع
25853,comment,h99qyf6,2021-08-17 13:42:37,2021,Im confused is this a nightclub or some restau...,i am confused is this a nightclub or some rest...
20434,comment,j8svl61,2023-02-16 22:43:35,2023,You can’t reduce the headway indefinitely (tim...,you cannot reduce the headway indefinitely tim...
48765,comment,iinyb0r,2022-08-02 20:56:02,2022,I've read a number of posts on this sub and it...,i have read a number of posts on this sub and ...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>g. lemmatization</b></h3>
</span>

In [16]:
#load the spaCy pipeline 'en_core_web_md' (the commented-out command in the previous cell downloads it);
#`nlp` is used in the next cell to lemmatize `clean_text`
nlp = spacy.load('en_core_web_md')

In [None]:
start_time = time.time()

#replace every token by its lemma; `nlp.pipe` streams the texts through the
#pipeline in batches, which avoids the overhead of one `nlp(text)` call per row
data['clean_text'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(data['clean_text'].tolist())
]

print (f"This program took {(time.time()-start_time)/60:.2f} minutes to complete")

data.iloc[rows_to_check]


In [None]:
#NOTE(review): this cell contained only a bare `if`, which is a SyntaxError and
#stops "Restart Kernel -> Run All"; write the intended condition here or delete the cell

<span style="color: red; font-family: Calibri Light;">
  <h3><b>h. remove custom stopwords</b></h3>
</span>

In [None]:
#folder holding the preprocessing artefacts and which files to pick up
folder_path, file_type = os.path.join("..", "preprocessing_exploration"), "*.txt"

#every path inside that folder that matches the pattern
document_path = glob(
    os.path.join(folder_path, file_type)
)

document_path

In [None]:
#load custom_stop_words list (one entry per line); blank lines are skipped so
#they are not counted as stop words
with open('../preprocessing_exploration/custom_stop_words.txt', 'r') as file:
    custom_sw = [line.strip() for line in file if line.strip()]
print (f'There are {len(custom_sw)} words in the custom stop words list')
print ('\n')
#show a few random entries as a sanity check; the sample size is capped so a
#list with fewer than 5 entries does not raise ValueError
print (random.sample(custom_sw, min(5, len(custom_sw))))

In [None]:
start_time = time.time()
#remove custom words

#a set gives constant-time membership tests; testing against the list would
#scan the whole stop-word list once for every token of every document
custom_sw_lookup = set(custom_sw)

data['clean_text'] = data['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in custom_sw_lookup)
)

print (f"This program took {(time.time()-start_time)/60:.2f} minutes to complete")

data.sample(n=5)

<span style="color: red; font-family: Calibri Light;">
  <h3><b>i. remove extra whitespaces</b></h3>
</span>

In [None]:
#trim both ends and collapse every run of whitespace into a single space;
#the pattern is a raw string because '\s' inside a normal string literal is an
#invalid escape sequence (reported as a warning by recent Python versions)
data['clean_text'] = data['clean_text'].str.strip().str.replace(r'\s+', ' ', regex = True)

#spot-check a few rows by index label
sample_rows = [5786,18460, 103391]

data.loc[sample_rows]

<span style="color: red; font-family: Calibri Light;">
  <h3><b>j. remove sampled subset</b></h3>
</span>

In [None]:
#summary of `data` (columns, non-null counts, dtypes) before the sampled subset is removed
data.info()

In [None]:
#read the identifiers of the held-out sample, one per line
#(the sample was drawn earlier by randomly selecting entries from each year);
#they are matched against the `ID` column in the next cell

with open('../preprocessing_exploration/sample_subset_index.txt', 'r') as file:
    subset_id = list(map(str.strip, file))

#peek at the first few identifiers and report how many there are
print(subset_id[:5])
print (len(subset_id))

In [None]:
#index labels of the rows whose ID belongs to the previously sampled subset
subset_index = data.loc[data['ID'].isin(subset_id)].index.to_list()

#the training data is everything except those rows
bert_train_data = data.drop(index = subset_index)

bert_train_data

In [None]:
#destination of the pre-processed training corpus
filename = '../Data/bert_train_data.csv'

def export_csv():
    '''
    write `bert_train_data` to `filename` as CSV: header row included,
    every field quoted, index stored under the column label 'index'
    '''
    bert_train_data.to_csv(
        filename,
        header = True,
        index_label = 'index',
        quoting = csv.QUOTE_ALL,
    )

export_csv()

print ('file saved')

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>IV. Train BERTopic Model</b></h2>
</span>

In [None]:
#tutorial used as a reference for this section:
#https://hackernoon.com/nlp-tutorial-topic-modeling-in-python-with-bertopic-372w35l9

#instantiate model with BERTopic's defaults; verbose = True asks it to report progress
#NOTE(review): no random seed is configured here, so topics may differ between runs - confirm whether reproducibility is needed
model = BERTopic(verbose = True)


In [None]:
#convert the cleaned text column to a plain Python list, which is what is passed to `model.fit_transform` below
#NOTE(review): rows whose clean_text ended up empty after preprocessing are still included - confirm this is intended
docs = bert_train_data['clean_text'].to_list()

In [None]:
start_time = time.time()

#fit the model on the documents; returns the topic assigned to each document
#together with the second value the call yields, stored as `probabilities`
topics, probabilities = model.fit_transform(docs)

#report the wall-clock training time in minutes
elapsed_minutes = (time.time() - start_time)/60
print (f"This project took {elapsed_minutes:.2f} minutes to complete")

In [None]:
#show only the first few topic assignments; displaying the whole list would
#write one entry per training document into the notebook output
topics[:10]