# Topic Modeling on Reddit Dataset using LDA

#### Importing Libraries

##### Uncomment this section while running the notebook for the first time

In [1]:
# ! pip install pandas
# ! pip install spacy
# ! pip install gensim
# ! pip install pickle
# ! pip install pyLDAvis
# ! python -m spacy download en_core_web_lg

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim import corpora
import pickle
import gensim
import pyLDAvis
from pyLDAvis import gensim_models

import warnings
warnings.filterwarnings('ignore')

In [3]:
nlp = spacy.load("en_core_web_lg")

#### Reading Reddit dataset

In [4]:
data = pd.read_csv('../data/all/reddit_data_all.csv', index_col=0)
data.drop(0, axis=0, inplace=True)
data.head()

Unnamed: 0,id,title,score,comms_num,comments,search_topic,body,date
1,imkf0n,UBIT is safe,518,7,"['This made my day, thank you', 'COVID-19: *”g...",UBreddit,,2020-09-04 17:34:30
2,ijhhd5,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...,472,16,['It’s not a new school week until Mr Krabs sa...,UBreddit,,2020-08-30 18:17:02
3,f8019t,What the last 2 minutes of lecture looks like ...,472,21,"['my dude how long did this take lol', 'Is tha...",UBreddit,,2020-02-22 22:30:28
4,dzmrbc,The SU Bull today,444,45,['FYI: Do not malign Chinese students holding ...,UBreddit,,2019-11-21 17:35:56
5,ex5bsz,Logging into MyUB be like.,442,27,"[""Since this semester started, it's been havin...",UBreddit,,2020-02-01 12:15:53


#### Titles Dataframe

In [5]:
title = [line for line in data.title]
title[:5]

['UBIT is safe',
 'WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND POST SPEEDRUNS OF YOUR DAILY HEALTH CHECKS!!!! GIVE IT UP FOR WEEK ONE!!!!!!!!',
 'What the last 2 minutes of lecture looks like to your professor',
 'The SU Bull today',
 'Logging into MyUB be like.']

In [6]:
len(title)

316

In [7]:
title_df = pd.DataFrame(title)
title_df.head()

Unnamed: 0,0
0,UBIT is safe
1,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...
2,What the last 2 minutes of lecture looks like ...
3,The SU Bull today
4,Logging into MyUB be like.


#### Body Dataframe

In [8]:
body=[line for line in data.body if str(line)!='nan']
body[1]

'Hey everyone, I’m a grad student and a TA at UB, and if none of your professors or TAs have told you this yet this semester - you should be really proud of yourselves. Even if your grade isn’t what you want it to be, even if you felt like you were drowning, and struggling all semester, you should be PROUD. Nothing is okay right now. Nothing is normal. But you still tried, and you still came out on the other side. \n\nI’ve gotten so many emails from students panicking this semester, and telling me all of the problems they’re having, and it breaks my heart to see how heartless and ignorant some professors and TAs are being during this time. I’m truly sorry if you had an experience like this, but please don’t let it stop you from reaching out to your teachers and TAs in the future. We are not all cold assholes. We want you all to do well, and we want to help you do that. We WILL work with you in any way we can. \n\nAnd a final note; if you wouldn’t blame someone else for something, don’t

In [9]:
len(body)

138

In [10]:
body_df = pd.DataFrame(body)
body_df.head()

Unnamed: 0,0
0,CCP trolls have infiltrated UBReddit and are d...
1,"Hey everyone, I’m a grad student and a TA at U..."
2,That was an absolute blowout! Let's go Bulls!
3,The current one is archived for the time being...
4,"As we near the second half of the semester, it..."


#### Comments Dataframe

In [11]:
rows = [row[1:-1].split(', ') for row in data.comments]
comments = [line for row in rows for line in row]
comments[:2]

["'This made my day", "thank you'"]

In [12]:
len(comments)

3453

In [13]:
comments_df = pd.DataFrame(comments)
comments_df.head()

Unnamed: 0,0
0,'This made my day
1,thank you'
2,'COVID-19: *”goddamn it I just verified an hou...
3,'*Set to remind me in seven days.*'
4,'Alright this is some actual hilarious content


#### Data Processing using Spacy

- Tokenize
- Filter Stopwords
- Extract Alphanumeric
- Lemmatize

#### Cleaning data

1. Removing special characters, hyperlinks, punctuations, numbers, spaces
2. Removing stop words
3. Taking words with length greater than specified in 'min_length_word'
4. Taking Lemmatized version of the cleaned words
5. Taking Lowercase

### --------------------------------------------------------------------------------------------------------------------------------
### Add custom stop words here

In [14]:
stop_words = ['fuck','shit', 'dick', 'guy','time','people','year','month','week','girl','tomorrow','good','bad','fucking']

### --------------------------------------------------------------------------------------------------------------------------------

In [15]:
for stop_word in stop_words:
        STOP_WORDS.add(stop_word)

for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [16]:
min_word_length = 3

### --------------------------------------------------------------------------------------------------------------------------------

In [17]:
def data_cleaner(row):
    words = row.split(' ')
    cleaning = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'htt'))==False]
    row = ' '.join(cleaning)
    doc = nlp(row)
    output = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct==False and token.is_space==False and token.is_digit==False and len(token)>min_word_length]
    return output

#### Cleaning dataframes

In [18]:
body_df['cleaned'] = body_df[0].apply(lambda x: data_cleaner(x))
body_df.head()

Unnamed: 0,0,cleaned
0,CCP trolls have infiltrated UBReddit and are d...,"[troll, infiltrate, ubreddit, downvote, post, ..."
1,"Hey everyone, I’m a grad student and a TA at U...","[grad, student, professor, tell, semester, pro..."
2,That was an absolute blowout! Let's go Bulls!,"[absolute, blowout, bulls]"
3,The current one is archived for the time being...,"[current, archive, post, listing, thing, like,..."
4,"As we near the second half of the semester, it...","[near, second, half, semester, survey, relate,..."


In [19]:
title_df['cleaned'] = title_df[0].apply(lambda x: data_cleaner(x))
title_df.head()

Unnamed: 0,0,cleaned
0,UBIT is safe,"[ubit, safe]"
1,WEEK ONE!!! REMEMBER TO WEAR YOUR MASKS AND PO...,"[remember, wear, mask, post, speedruns, daily,..."
2,What the last 2 minutes of lecture looks like ...,"[minute, lecture, look, like, professor]"
3,The SU Bull today,"[bull, today]"
4,Logging into MyUB be like.,"[log, myub, like]"


In [20]:
comments_df['cleaned'] = comments_df[0].apply(lambda x: data_cleaner(x))
comments_df.head()

Unnamed: 0,0,cleaned
0,'This made my day,[]
1,thank you',[thank]
2,'COVID-19: *”goddamn it I just verified an hou...,"[covid-19, goddamn, verify, hour]"
3,'*Set to remind me in seven days.*',"[remind, seven, day]"
4,'Alright this is some actual hilarious content,"[alright, actual, hilarious, content]"


#### Combining dataframes of Title, Body and Comments in one

In [21]:
total_words = pd.concat([comments_df,
                       title_df,
                       body_df])
total_words.reset_index(inplace=True)
total_words = total_words.drop(columns='index')
total_words.head()

Unnamed: 0,0,cleaned
0,'This made my day,[]
1,thank you',[thank]
2,'COVID-19: *”goddamn it I just verified an hou...,"[covid-19, goddamn, verify, hour]"
3,'*Set to remind me in seven days.*',"[remind, seven, day]"
4,'Alright this is some actual hilarious content,"[alright, actual, hilarious, content]"


#### Making sentences from cleaned column words

In [22]:
total_words['cleaned_sentences'] = total_words['cleaned'].apply(lambda x : ' '.join(x))
total_words.head()

Unnamed: 0,0,cleaned,cleaned_sentences
0,'This made my day,[],
1,thank you',[thank],thank
2,'COVID-19: *”goddamn it I just verified an hou...,"[covid-19, goddamn, verify, hour]",covid-19 goddamn verify hour
3,'*Set to remind me in seven days.*',"[remind, seven, day]",remind seven day
4,'Alright this is some actual hilarious content,"[alright, actual, hilarious, content]",alright actual hilarious content


#### Saving this file into csv format

In [23]:
total_words.to_csv('../data/cleaned_combined_data_all.csv')

## Topic Modeling

### I. Using complete sentences

LDA with Gensim

#### Building text corpus from cleaned data and saving it

In [24]:
dictionary = corpora.Dictionary(total_words['cleaned'])
corpus = [dictionary.doc2bow(text) for text in total_words['cleaned']]
pickle.dump(corpus, open('../topic_modeling/corpus.pkl', 'wb'))
dictionary.save('../topic_modeling/dictionary')

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [25]:
num_topics = 5
passes = 10                 
random_state = 100
num_words = 5
corpus = corpus
dictionary = dictionary

### --------------------------------------------------------------------------------------------------------------------------------

#### Model Building

In [26]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                          num_topics = num_topics,
                                          id2word = dictionary,
                                          passes = passes,
                                          random_state= random_state)

ldamodel.save('../topic_modeling/model5.gensim') # saving the model

#### Checking top topics

In [27]:
topics = ldamodel.print_topics(num_words=num_words)
for topic in topics:
    print(topic)

(0, '0.016*"go" + 0.012*"think" + 0.010*"want" + 0.010*"delete" + 0.008*"need"')
(1, '0.015*"like" + 0.014*"work" + 0.013*"post" + 0.013*"semester" + 0.009*"school"')
(2, '0.019*"like" + 0.018*"campus" + 0.015*"class" + 0.010*"know" + 0.007*"look"')
(3, '0.017*"mask" + 0.009*"buffalo" + 0.009*"wear" + 0.007*"room" + 0.006*"need"')
(4, '0.024*"look" + 0.016*"student" + 0.014*"campus" + 0.009*"nice" + 0.008*"love"')


#### Visualising Top Topics

In [28]:
lda_display = gensim_models.prepare(ldamodel, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

### II. Using Part of Speech - Nouns

In [29]:
total_words.head()

Unnamed: 0,0,cleaned,cleaned_sentences
0,'This made my day,[],
1,thank you',[thank],thank
2,'COVID-19: *”goddamn it I just verified an hou...,"[covid-19, goddamn, verify, hour]",covid-19 goddamn verify hour
3,'*Set to remind me in seven days.*',"[remind, seven, day]",remind seven day
4,'Alright this is some actual hilarious content,"[alright, actual, hilarious, content]",alright actual hilarious content


#### Example of how Part of Speech works (notice the last columns)

In [30]:
doc = nlp(total_words[0][20])

print(f'Sentence: {total_words[0][20]}')
result = {'Word':[], 'Is Stopword':[], 'Is Alphabet':[], 'Part-of-Speech':[]}

for token in nlp(doc):
    result['Word'].append(token), 
    result['Is Stopword'].append(token.is_stop),
    result['Is Alphabet'].append(token.is_alpha),
    result['Part-of-Speech'].append(token.pos_)
result_df = pd.DataFrame(result)
result_df

Sentence: 'What happened to CyanideSandwhich 😔'


Unnamed: 0,Word,Is Stopword,Is Alphabet,Part-of-Speech
0,',False,False,PUNCT
1,What,True,True,PRON
2,happened,False,True,VERB
3,to,True,True,ADP
4,CyanideSandwhich,False,True,PROPN
5,😔,False,False,PROPN
6,',False,False,PUNCT


### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [31]:
pos = ['NOUN', 'PROPN']
min_length_word = 3

### --------------------------------------------------------------------------------------------------------------------------------

#### Extracting Part of Speech decided above from the text

In [32]:
def pos_extracter(df, part:list, column_name):
    def extracter(row, part):
        words = row.split(' ')
        cleaning = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'htt'))==False]
        row = ' '.join(cleaning)
        doc = nlp(row)
        output = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct == False and token.pos_ in part and token.is_digit==False and len(token)>min_length_word]
        return output
    df[column_name] = df[0].apply(lambda x: extracter(x, part))
    return df

In [33]:
df = pos_extracter(total_words, pos, 'pos')
df.head()

Unnamed: 0,0,cleaned,cleaned_sentences,pos
0,'This made my day,[],,[]
1,thank you',[thank],thank,[]
2,'COVID-19: *”goddamn it I just verified an hou...,"[covid-19, goddamn, verify, hour]",covid-19 goddamn verify hour,"[covid-19, hour]"
3,'*Set to remind me in seven days.*',"[remind, seven, day]",remind seven day,[day]
4,'Alright this is some actual hilarious content,"[alright, actual, hilarious, content]",alright actual hilarious content,[content]


#### Building and Saving POS Text Corpus

In [34]:
dictionary_nouns = corpora.Dictionary(df['pos'])
corpus_nouns = [dictionary_nouns.doc2bow(text) for text in df['pos']]
pickle.dump(corpus_nouns, open('../topic_modeling/corpus_nouns.pkl', 'wb'))
dictionary_nouns.save('../topic_modeling/dictionary_nouns')

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [35]:
num_topics = 5
passes = 10
random_state = 100
num_words = 5
corpus = corpus_nouns
dictionary = dictionary_nouns

### --------------------------------------------------------------------------------------------------------------------------------

#### Modeling using new POS data

In [36]:
ldamodel_nouns = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                          num_topics = num_topics,
                                          id2word = dictionary,
                                          passes = passes,
                                        random_state=random_state)
ldamodel_nouns.save('../topic_modeling/model5_nouns.gensim')

#### Checking top topics

In [37]:
topics_nouns = ldamodel_nouns.print_topics(num_words=num_words)
for topic in topics_nouns:
    print(topic)

(0, '0.020*"semester" + 0.019*"buffalo" + 0.019*"campus" + 0.012*"thing" + 0.012*"university"')
(1, '0.048*"class" + 0.024*"school" + 0.019*"campus" + 0.014*"student" + 0.013*"room"')
(2, '0.045*"student" + 0.026*"post" + 0.012*"course" + 0.011*"person" + 0.010*"semester"')
(3, '0.016*"exam" + 0.012*"opinion" + 0.009*"life" + 0.009*"case" + 0.008*"building"')
(4, '0.022*"campus" + 0.016*"place" + 0.011*"reddit" + 0.010*"professor" + 0.010*"dorm"')


#### Visualizing topics

In [38]:
lda_nouns_display = gensim_models.prepare(ldamodel_nouns, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_nouns_display)

### III. Testing for optimal number of topics based on Coherence and Perplexity scores

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [39]:
max_topics = 7

passes = 10
random_state = 100
num_words = 5
corpus = corpus_nouns
dictionary = dictionary_nouns

### --------------------------------------------------------------------------------------------------------------------------------

In [40]:
# THIS SECTION IS TAKEN FROM JASMINE'S CODE

from gensim.models import CoherenceModel

result={'number_topics': [], 'coherence_scores': [], 'perplexity_scores':[]}

print('Calculating Coherence and Perplexity Scores')
print('')

for t in range(2,max_topics+1):
    print(f'Topic number: {t}')


    # Build LDA model elbow chart
    lda_model_f = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=t, 
                                           random_state=random_state,
                                           passes=passes)


    # Compute Perplexity
    perp_score = lda_model_f.log_perplexity(corpus)
    print('\nPerplexity: ', lda_model_f.log_perplexity(corpus))  # a measure of how good the model is. The lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model_f, texts=df['pos'], dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    print('\nCoherence Score: ', coherence_lda)
    print('-'*50)

    result["number_topics"].append(t)
    result["coherence_scores"].append(coherence_lda)
    result["perplexity_scores"].append(perp_score)

topics = pd.DataFrame(result)

opt=max(topics.coherence_scores)
perp=min(topics.perplexity_scores)
topicindex_c=topics.number_topics[topics.coherence_scores==opt].values[0]
topicindex_p=topics.number_topics[topics.perplexity_scores==perp].values[0]
print('')
print('')
print('*'*50)
print(f'Best Perplexity topic:{topicindex_p}')
print(f'Best Coherence topic:{topicindex_c}')
print('*'*50)
topics

Calculating Coherence and Perplexity Scores

Topic number: 2

Perplexity:  -7.673320635676418

Coherence Score:  0.46307033296505407
--------------------------------------------------
Topic number: 3

Perplexity:  -7.776570928209765

Coherence Score:  0.48066707998571845
--------------------------------------------------
Topic number: 4

Perplexity:  -7.847535300977112

Coherence Score:  0.5107026356571038
--------------------------------------------------
Topic number: 5

Perplexity:  -7.922060471411436

Coherence Score:  0.5256607958315689
--------------------------------------------------
Topic number: 6

Perplexity:  -7.957917468544753

Coherence Score:  0.5260929931418173
--------------------------------------------------
Topic number: 7

Perplexity:  -8.05589901548066

Coherence Score:  0.55516336766688
--------------------------------------------------


**************************************************
Best Perplexity topic:7
Best Coherence topic:7
****************************

Unnamed: 0,number_topics,coherence_scores,perplexity_scores
0,2,0.46307,-7.673321
1,3,0.480667,-7.776569
2,4,0.510703,-7.847574
3,5,0.525661,-7.922088
4,6,0.526093,-7.957882
5,7,0.555163,-8.055803


### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

#### Choose num_topics with lowest Perplexity score and highest Coherence score

In [41]:
num_topics_opt = 7

### --------------------------------------------------------------------------------------------------------------------------------

#### Modeling using Optimal Topics

In [42]:
ldamodel_nouns_f = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics_opt, 
                                           random_state=random_state,
                                           passes=passes)

ldamodel_nouns_f.save('../topic_modeling/model5_nouns_f.gensim')

topics_nouns_f = ldamodel_nouns_f.print_topics(num_words=num_words)
for topic in topics_nouns_f:
    print(topic)

lda_nouns_display_f = gensim_models.prepare(ldamodel_nouns_f, corpus, dictionary, sort_topics=True)
vis = pyLDAvis.display(lda_nouns_display_f)
vis

(0, '0.022*"university" + 0.021*"life" + 0.019*"semester" + 0.019*"buffalo" + 0.016*"room"')
(1, '0.065*"class" + 0.031*"school" + 0.029*"mask" + 0.016*"work" + 0.015*"question"')
(2, '0.063*"student" + 0.035*"post" + 0.016*"college" + 0.015*"problem" + 0.014*"thank"')
(3, '0.018*"opinion" + 0.017*"break" + 0.012*"case" + 0.012*"campus" + 0.010*"semester"')
(4, '0.032*"professor" + 0.017*"reddit" + 0.015*"place" + 0.013*"exam" + 0.010*"friend"')
(5, '0.042*"campus" + 0.016*"house" + 0.014*"north" + 0.013*"community" + 0.012*"bedroom"')
(6, '0.017*"person" + 0.012*"snow" + 0.011*"thing" + 0.011*"housing" + 0.011*"money"')


#### Save the visualisation

In [43]:
pyLDAvis.save_html(lda_nouns_display_f, '../topic_modeling/reddit_topic_modeling.html')