In [3]:
#pip freeze > modelenv_requirements.txt

In [2]:
#remove common stop words, include word count/remove empty clean_text rows, remove rows with arabic

<span style="color: red; font-family: Calibri Light;">
  <h1><b>Topic Modelling with BerTopic: w Preprocessing</b></h1>
</span>

---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>I. Setting Up Environment</b></h2>
</span>

In [1]:
import pandas as pd
import numpy as np

import ast
import csv
import os
import random
import time

from glob import glob

#NLP libraries
from bertopic import BERTopic

#for vis
import matplotlib.pyplot as plt

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>II. Import Data into DataFrame</b></h2>
</span>

In [4]:
def clean_comments (filepath):
    """
    import comments from csv file into pandas dataframe,
    
    and carry out initial cleaning including removing deleted comments,
    
    correcting datetime data type, remove unwanted columns like `isSubmitter`,
    
    remove duplicates, etc
    
    """
    #import data
    df = pd.read_csv(filepath, low_memory = False)
    
    #remove deleted comments if any
    df = df[df.Body != '[deleted]']
    df = df[df.Body != '[removed]']
    df = df.dropna(subset=['Body'])
    
    #remove comments with missing id
    drop_index = df[df.isSubmitter.isnull()].index
    df.drop(drop_index, inplace = True)
    
    #remove duplicates if any
    df = df.drop_duplicates(subset =['ID'], ignore_index = True)
    df.reset_index (drop = True, inplace = True)
    
    #correct data types and column label
    df['Date_Created'] = pd.to_datetime(df['Date_Created'])
    df['year'] = df['Date_Created'].dt.year
    df['Score'] = df['Score'].astype('int') 
    df.rename(columns = {'Author_ID': "Author"}, inplace = True) 
    
    #rename 'Body' column to text
    df.rename(columns = {'Body': 'long_text',
                        'Date_Created': 'date_created'}, inplace = True)
    
    #remove unnecessary columns
    df.drop(columns = ['Unnamed: 0', 'Author', 'Score',
       'Parent_ID', 'Submission_ID', 'Subreddit', 'isParent', 'isSubmitter'], inplace = True)
    
    #remove any extra whitespace in column labels
    df.columns = df.columns.str.strip()
    
    #include column to denote row is comment entry
    df['text_type'] = 'comment'
    
    #rearrange column order
    df = df[['text_type','ID','date_created', 'year', 'long_text']]
    
    
        
    return df

def clean_submissions(filepath):
    """
    import submissions/posts from csv file into pandas dataframe,
    
    and carry out initial cleaning including removing deleted comments,
    
    correcting datetime data type, remove unwanted columns like `Unnamed`,
    
    remove duplicates, etc
    
    """
    df = pd.read_csv(filepath, low_memory = False)
    
    #drop duplicate posts
    df = df.drop_duplicates(subset =['ID'], ignore_index = True)
    df.reset_index (drop = True, inplace = True)
    
    #create column for post title + post text
    df['long_text'] = df['Title']+ " " +df['Post Text'].fillna('')
    
    #adjust data types
    df['Date Created'] = pd.to_datetime(df['Date Created'])
    df['year'] = df['year'].astype('int')
    
    #rename columns
    df.rename(columns = {'Date Created': 'date_created'}, inplace = True)
    
    #remove unwanted columns
    df = df.drop(columns = ['Unnamed: 0', 'Title','Post Text', 'Score',
       'Total Comments', 'Post URL', 'SubReddit','Unnamed: 0.1'])
    
    #include column to denote row is comment entry
    df['text_type'] = 'submission'
    
    #reorder columns - 'ID', 'Post Text'
    df  = df[['text_type','ID', 'date_created','year', 'long_text']]
    
    return df
    

In [5]:
#access dataset files
folder_path = os.path.join("..", "Data")
file_type = "*.csv"

#list of dataset file paths
document_path = glob(os.path.join(folder_path, file_type))

document_path

['../Data/corpus_v5.csv',
 '../Data/corpus_v4.csv',
 '../Data/filtered_corpus.csv',
 '../Data/corpus_v3.csv',
 '../Data/corpus_v2.csv',
 '../Data/full_posts.csv',
 '../Data/bert_train_data.csv',
 '../Data/training_data.csv',
 '../Data/vocabulary.csv',
 '../Data/emoji_subset.csv',
 '../Data/sampled_subset.csv',
 '../Data/training_corpus.csv',
 '../Data/comments.csv',
 '../Data/corpus.csv']

In [6]:
comments_filepath = '../Data/comments.csv'
submissions_filepath = '../Data/full_posts.csv'


data = pd.concat([clean_comments(comments_filepath), clean_submissions(submissions_filepath)], ignore_index = True)

data

Unnamed: 0,text_type,ID,date_created,year,long_text
0,comment,gtfo2hl,2021-04-05 13:00:32,2021,"*Cuntry roads, take me hoem*"
1,comment,gtfqkbv,2021-04-05 13:41:40,2021,"That’s been there for several years, sent a pi..."
2,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...
3,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...
4,comment,gtg5mwv,2021-04-05 16:51:54,2021,I am cunting on them to do so 😅
...,...,...,...,...,...
128878,submission,14f46ji,2023-06-21 14:40:54,2023,"Best beauty saloons in Dubai? Hello fellas, I ..."
128879,submission,14f4uyi,2023-06-21 15:15:27,2023,Found the r/dubai redditors who kept telling m...
128880,submission,14f4ri3,2023-06-21 15:10:25,2023,Scam ? Healthy.line My sister has a CBD debit ...
128881,submission,14f4k3r,2023-06-21 15:00:34,2023,Thoughts on Expo City properties? Anyone else ...


<span style="color: red; font-family: Calibri Light;">
  <h2><b>III. Data Preprocessing</b></h2>
</span>

---

<span style="color: red; font-family: Calibri Light;">
  <h3><b>a. Import preprocessing packages</b></h3>
</span>

In [7]:
import re #regular expressions library for text manipulation
import string
import unicodedata

#from prettytable import PrettyTable

#NLP libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import contractions

#import itertools
#from autocorrect import Speller

#for wordclouds
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

import matplotlib.pyplot as plt

In [8]:
#!python -m spacy download en_core_web_md

In [9]:
year_group = data.groupby(by='year')
for year, group in year_group:
    print (year,len(group))

2012 11
2013 60
2014 143
2015 203
2016 786
2017 1127
2018 1676
2019 4217
2020 29760
2021 28034
2022 36342
2023 26524


<span style="color: red; font-family: Calibri Light;">
  <h3><b>b. convert text to lowercase</b></h3>
</span>

In [18]:
data['clean_text'] = data['long_text'].apply(lambda text: text.lower())

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
114426,comment,dm9vtv0,2017-08-29 13:29:37,2017,>Maximusbeatdown: Important enough for you to ...,en,>maximusbeatdown: important enough for you to ...
45664,comment,gm88xbb,2021-02-06 17:25:46,2021,It's actually illegal. If they are reported to...,en,it's actually illegal. if they are reported to...
80836,comment,jjaiayr,2023-05-08 06:55:31,2023,The definition of 'bare necessities' differs g...,en,the definition of 'bare necessities' differs g...
100729,comment,i8gve4r,2022-05-13 20:19:58,2022,May he rest in peace.\nCondolences to family 💐,en,may he rest in peace.\ncondolences to family 💐
64191,comment,ipuu5gz,2022-09-25 20:30:25,2022,"Hi, I'd like some hot chocolate. Thank you.",en,"hi, i'd like some hot chocolate. thank you."


<span style="color: red; font-family: Calibri Light;">
  <h3><b>c. expand word contractions</b></h3>
</span>

In [19]:
data['clean_text'] = data['clean_text'].apply(lambda text: contractions.fix(text)) 

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
114118,comment,jgghrml,2023-04-16 11:46:32,2023,"Speak to the security first, if that doesn't w...",en,"speak to the security first, if that does not ..."
3024,comment,i38g4lj,2022-04-03 19:43:28,2022,"enable facetime, discord, whatsapp etc",en,"enable facetime, discord, whatsapp etc"
47041,comment,ilbaf1d,2022-08-22 16:30:21,2022,It just means dont bring your horny ass to a f...,en,it just means do not bring your horny ass to a...
110998,comment,eg5eybl,2019-02-10 15:24:18,2019,And its taxi guy,en,and its taxi guy
19876,comment,j70chu9,2023-02-03 07:52:08,2023,Inspiration for many new and old folks out her...,en,inspiration for many new and old folks out her...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>d. remove URLs</b></h3>
</span>

In [20]:
#index of rows with urls
html_index = data[data['long_text'].str.contains("https")].index

#regex pattern for urls
url_pattern = r'https?://\S+'
#replace url with empty string
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(url_pattern, ' ', text, flags=re.MULTILINE))

data.loc[html_index]

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
17,comment,gtgyqzz,2021-04-05 20:49:12,2021,Meditation And Relaxation Music https://youtu....,en,meditation and relaxation music
83,comment,gpqwzwc,2021-03-05 10:34:53,2021,I am curious about this Apollo Fintech GSX coi...,en,i am curious about this apollo fintech gsx coi...
93,comment,gpqwrek,2021-03-05 10:31:45,2021,https://www.instagram.com/p/CKhzbwpH0c4/?igshi...,en,\n\nhere is a video explaining the same.
171,comment,gq3tah4,2021-03-07 19:49:41,2021,"With all due respect, do you expect a law enfo...",en,"with all due respect, do you expect a law enfo..."
290,comment,jb5kpz4,2023-03-06 20:35:11,2023,Here: Dubai Festival City\nhttps://maps.app.go...,en,here: dubai festival city\n \n\nthen follow th...
...,...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,en,help‼️uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,en,psa: villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",en,"daily random discussion thread hello,\n\nthis ..."
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,en,need help with applying for residency - confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>e. remove punctuations</b></h3>
</span>

In [21]:
start_time = time.time()
#index of some rows with punctuations
checker_list = ['ifquow','gzl2ec','147gsfl','vtelex',
 '12pqx6m','fuxrd2','2ui6wu','l4gz0u','14f4uyi','14f8d30']

rows_to_check = data[data['ID'].isin(checker_list)].index.tolist()

rows_to_check.extend([32003, 116022,18460,5786,30109])

rows_to_check.extend(html_index)

#regex pattern for punctuations
punctuation_pattern = r'[^\w\s]'

#remove punctuations using `re.sub() method
data['clean_text'] = data['clean_text'].apply(lambda text: re.sub(r'[^\w\s]', ' ', text))

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,so,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",en,psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,en,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,en,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",en,hi everyone i am currently looking for jobs t...
...,...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,en,help uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,en,psa villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",en,daily random discussion thread hello \n\nthis ...
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,en,need help with applying for residency confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>f. remove new line and tabs</b></h3>
</span>

In [22]:
#remove `\n` from text
data['clean_text'] = data['clean_text'].str.replace('\n', ' ')
#remove `\t` from text
data['clean_text'] = data['clean_text'].str.replace('\t', ' ')

data.iloc[rows_to_check]

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,so,guys we made it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",en,psa immigration to canada australia nz hell...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,en,how to reach people who are asking for money g...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,en,how do you plan to spend your eid holiday any...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",en,hi everyone i am currently looking for jobs t...
...,...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,en,help uni qualification equivalency letter for...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,en,psa villas are at high risk of fire during th...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",en,daily random discussion thread hello this is...
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,en,need help with applying for residency confus...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>f. remove digits</b></h3>
</span>

In [23]:
data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join (word for word in text.split() if word.isalpha()))

data.sample(n=5)

Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
85062,comment,gvxjwol,2021-04-26 18:16:36,2021,A casual browsing through r/india or r/worldne...,en,a casual browsing through r india or r worldne...
64476,comment,iquzyaa,2022-10-03 12:33:52,2022,"Bro, trucks aren’t the worst ones. Buses act l...",en,bro trucks are not the worst ones buses act li...
113989,comment,gnrky6m,2021-02-17 18:16:09,2021,"It's always the Americans who say this, while ...",en,it is always the americans who say this while ...
25168,comment,geztmo0,2020-12-08 03:01:28,2020,You’re a bit too old to make jokes that would ...,en,you are a bit too old to make jokes that would...
111856,comment,ib606am,2022-06-04 21:50:03,2022,It happens for the best of us. \n\nEspecially ...,en,it happens for the best of us especially faint...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>g. lemmatization</b></h3>
</span>

In [24]:
nlp = spacy.load('en_core_web_md')

In [25]:
start_time = time.time()
data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join(token.lemma_ for token in nlp(text)))

print (f"This program took {(time.time()-start_time)/60:.2f} minutes to complete")

data.iloc[rows_to_check]


This program took 21.61 minutes to complete


Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
124628,submission,ifquow,2020-08-24 19:13:26,2020,GUYS WE MADE IT!!! YAY,so,guy we make it yay
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",en,psa immigration to canada australia nz hello d...
126727,submission,147gsfl,2023-06-12 10:22:19,2023,How to reach people who are asking for money/g...,en,how to reach people who be ask for money groce...
126869,submission,12pqx6m,2023-04-17 22:00:27,2023,How do you plan to spend your Eid holiday? Any...,en,how do you plan to spend your eid holiday any ...
127216,submission,vtelex,2022-07-07 13:32:10,2022,"Hi everyone, I'm currently looking for jobs te...",en,hi everyone I be currently look for job tempor...
...,...,...,...,...,...,...,...
128777,submission,14djkvs,2023-06-19 20:39:46,2023,Help‼️Uni qualification equivalency letter for...,en,help uni qualification equivalency letter for ...
128835,submission,14d2uk9,2023-06-19 07:06:28,2023,PSA: Villas are at high risk of fire during th...,en,psa villas be at high risk of fire during the ...
128837,submission,14d0c29,2023-06-19 05:00:13,2023,"Daily Random Discussion Thread Hello,\n\nThis ...",en,daily random discussion thread hello this be t...
128868,submission,14cieby,2023-06-18 15:43:08,2023,Need help with applying for residency - confus...,en,need help with apply for residency confuse abo...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>h. remove custom stopwords</b></h3>
</span>

In [34]:
#spacy stopwords
stop_words = list(nlp.Defaults.stop_words)
type(stop_words)

list

In [29]:
#access custom stop words file
folder_path = os.path.join("..", "preprocessing_exploration")
file_type = "*.txt"

#list of dataset file paths
document_path = glob(os.path.join(folder_path, file_type))

document_path

['../preprocessing_exploration/requirements.txt',
 '../preprocessing_exploration/custom_stop_words.txt',
 '../preprocessing_exploration/sample_subset_index.txt']

In [35]:
#load custom_stop_words list
with open('../preprocessing_exploration/custom_stop_words.txt', 'r') as file:
    custom_sw = [line.strip() for line in file]
print (f'There are {len(custom_sw)} in the custom stop words list')
print ('\n')
print (random.sample(custom_sw, 5))

There are 32792 in the custom stop words list


['sanitation', 'swelter', 'pooper', 'sinan', 'foreignaffairs']


In [37]:
#full stop words list - spacy stop words + custom stopwords
full_sw = stop_words + custom_sw
full_sw = list(set(full_sw))

print (f'There are {len(custom_sw)} in the full stop words list')
print (random.sample(full_sw, 5))

There are 32792 in the full stop words list
['excruciating', 'hary', 'shoeprint', 'islash', 'kdha']


In [38]:
start_time = time.time()
#remove custom words

data['clean_text'] = data['clean_text'].apply(lambda text: ' '.join([word for word in text.split() if word not in full_sw]))

print (f"This program took {(time.time()-start_time)/60:.2f} minutes to complete")

data.sample(n=5)                                    

This program took 31.01 minutes to complete


Unnamed: 0,text_type,ID,date_created,year,long_text,detected_language,clean_text
43566,comment,im923mj,2022-08-29 17:03:37,2022,https://www.khaleejtimes.com/emergencies/dubai...,en,police rta remind fast lane overtake vehicle r...
69054,comment,hqnxopq,2021-12-31 13:36:31,2021,I live in the next building lol. I could hear ...,en,I building I scream
78391,comment,ghcb3et,2020-12-29 04:21:19,2020,iPhone X,en,iphone
103266,comment,j4cgceh,2023-01-14 22:58:07,2023,san ka punta?,tl,san
52963,comment,fn7bpvf,2020-04-12 22:46:22,2020,1. Not the normal ones. Her blood pressure was...,en,blood pressure anyways cough slight cold cold ...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>i. remove extra whitespaces</b></h3>
</span>

In [40]:
#delete detected_language colum
data = data.drop(columns=['detected_language'],axis = 1)

In [41]:
data['clean_text'] = data['clean_text'].str.strip().str.replace('\s+', ' ', regex = True)

sample_rows = [5786,18460, 103391]

data.loc[sample_rows]

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
5786,comment,j8mroan,2023-02-15 17:37:07,2023,Confirmed; it’s true. Just need to have the ac...,plate
18460,comment,hzk2ggz,2022-03-06 12:55:36,2022,Encourage pointless posts on Reddit,encourage pointless
103391,comment,j7zocx9,2023-02-10 19:39:10,2023,I really want this to happen. Especially repla...,I replace shitty spelling lovindubai


<span style="color: red; font-family: Calibri Light;">
  <h3><b>j. remove sampled subset</b></h3>
</span>

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128883 entries, 0 to 128882
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   text_type     128883 non-null  object        
 1   ID            128883 non-null  object        
 2   date_created  128883 non-null  datetime64[ns]
 3   year          128883 non-null  int64         
 4   long_text     128883 non-null  object        
 5   clean_text    128883 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 5.9+ MB


In [43]:
#load list of sample_subset indices

#sampling done previously by randomly selecting entries from each year

with open('../preprocessing_exploration/sample_subset_index.txt', 'r') as file:
    subset_id = [line.strip() for line in file]

print(subset_id[:5])
print (len(subset_id))

['hbbcq6z', 'hjjhl9r', 'h7ntxws', 'h02sz1n', 'hnuysoa']
1200


In [44]:
#remove randomly sampled subset

subset_index = data[data['ID'].isin(subset_id)].index.to_list()

bert_train_data = data.drop(subset_index, axis = 0)

bert_train_data

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text
0,comment,gtfo2hl,2021-04-05 13:00:32,2021,"*Cuntry roads, take me hoem*",road I
1,comment,gtfqkbv,2021-04-05 13:41:40,2021,"That’s been there for several years, sent a pi...",
2,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...,I single I travel past
3,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...,shop dragon mart
4,comment,gtg5mwv,2021-04-05 16:51:54,2021,I am cunting on them to do so 😅,I
...,...,...,...,...,...,...
128878,submission,14f46ji,2023-06-21 14:40:54,2023,"Best beauty saloons in Dubai? Hello fellas, I ...",beauty saloon fella I wife real saloon beauty ...
128879,submission,14f4uyi,2023-06-21 15:15:27,2023,Found the r/dubai redditors who kept telling m...,I I navigate roundabout
128880,submission,14f4ri3,2023-06-21 15:10:25,2023,Scam ? Healthy.line My sister has a CBD debit ...,scam healthy line sister cbd debit card april ...
128881,submission,14f4k3r,2023-06-21 15:00:34,2023,Thoughts on Expo City properties? Anyone else ...,expo property expo sale pleasant price locate ...


<span style="color: red; font-family: Calibri Light;">
  <h3><b>k. remove empty rows</b></h3>
</span>

In [54]:
#include column counting words in clean_text
bert_train_data['word_count'] = bert_train_data['clean_text'].apply(lambda text: len(text.split()))

#get rows with no text
empty_rows = bert_train_data[bert_train_data['word_count'] == 0]

#get index of empty rows
empty_rows_index = empty_rows.index.to_list()

#adjust bert_train_data to not include empty rows
bert_train_data = bert_train_data[bert_train_data['word_count']>0]
bert_train_data.sort_values(by='word_count', ascending = False)

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text,word_count
125097,submission,gzl2ec,2020-06-09 15:11:27,2020,"PSA: Immigration to Canada, Australia, NZ Hell...",psa immigration canada australia I inquiry imm...,787
125384,submission,jh5hjq,2020-10-24 12:29:05,2020,List of discounts and working promo codes in D...,discount promo code discount promo code tip I ...,641
125338,submission,tce5mm,2022-03-12 15:04:02,2022,Unknown accident and the plight with Dubai Pol...,unknown accident plight police incident I I po...,482
5756,comment,ji1smg1,2023-04-28 17:10:21,2023,"FULLTEXT\nApr 27, 2023\n\nStand in the middle ...",apr middle meena bazaar mile arabian sea mumba...,412
109128,comment,fpg1e6j,2020-05-04 15:19:38,2020,Here is the text for those that can’t get by t...,text paywall recent bye bye broker certainly c...,377
...,...,...,...,...,...,...,...
59846,comment,f1zsyqu,2019-09-30 17:15:28,2019,Choice is an illusion.,illusion,1
59847,comment,f1zjyjw,2019-09-30 15:59:38,2019,Not *That* safely (:,safely,1
98415,comment,iidgo6i,2022-07-31 16:17:15,2022,And this is the right answer that has been end...,dodge,1
81586,comment,izjgz4z,2022-12-09 19:45:56,2022,It has to be hard for you to be this cynical o...,cynical,1


In [55]:
#save corpus 
filename = '../Data/bert_train_data2.csv'

def export_csv():
    '''
    export pre-processed data to CSV
    '''
    bert_train_data.to_csv(filename, index_label = 'index', quoting = csv.QUOTE_ALL, header = True)

export_csv()

print ('file saved')

file saved


---

<span style="color: red; font-family: Calibri Light;">
  <h2><b>III. Train BerTopic Model</b></h2>
</span>

In [5]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


bert_train_data = pd.read_csv('../Data/bert_train_data2.csv', converters ={'tokens':list_converter})
bert_train_data = bert_train_data.drop(columns = ['index'])
bert_train_data.head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text,word_count
0,comment,gtfo2hl,2021-04-05 13:00:32,2021,"*Cuntry roads, take me hoem*",road I,2
1,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...,I single I travel past,5
2,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...,shop dragon mart,3
3,comment,gtg5mwv,2021-04-05 16:51:54,2021,I am cunting on them to do so 😅,I,1
4,comment,gthiiwi,2021-04-05 23:18:56,2021,"That’s just absolutely hilarious, is this in t...",hilarious spring souk,3


In [7]:
#pip install --upgrade numba tbb
from numba import config
config.THREADING_LAYER = 'threadsafe'

In [9]:
#https://hackernoon.com/nlp-tutorial-topic-modeling-in-python-with-bertopic-372w35l9

#instantiate model
model = BERTopic(verbose = False)


In [10]:
#convert text data to list
docs = bert_train_data['clean_text'].to_list()

In [None]:
start_time = time.time()

#train model
topics, probabilities = model.fit_transform(docs)

print (f"This project took {(time.time() - start_time)/60 :.2f} minutes to complete")

2023-08-12 21:15:26,092 - BERTopic - Transformed documents to Embeddings
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
topics