# Topic Modeling on Twitter Dataset using LDA

#### Importing libraries

##### Uncomment this section while running the notebook for the first time

In [1]:
# ! pip install pandas
# ! pip install spacy
# ! pip install gensim
# ! pip install pickle
# ! pip install pyLDAvis
# ! python -m spacy download en_core_web_lg

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en import English
import pyLDAvis
from pyLDAvis import gensim_models
from gensim import corpora
import pickle
import gensim

import warnings
warnings.filterwarnings('ignore')

  from scipy.linalg.special_matrices import triu


In [3]:
nlp = spacy.load("en_core_web_lg")

#### Reading twitter dataset (combined_excel_files)

In [4]:
df_1 = pd.read_csv('../data/excel_files/combined_twitter_df.csv', index_col=0)
df_1.head()

Unnamed: 0,Tweet,Hashtag
0,@TheBuffaloNews Great to see the osteointegrat...,
1,After an @AANMember study documented the pay g...,ubuffalo
2,Join #UBGSE for Black History Nerds Saturday S...,ubgse blackhistorymonth ubuffalo
3,Michael Rembis is the director of the Center f...,
4,"Thank you, @NeurologyToday, for giving me the ...",paygaps genderinequity neurology ubuffalo wome...


In [5]:
hashtags_1 = []

for data in df_1['Hashtag'].dropna():
    for word in data.split(' '):
        hashtags_1.append(word.strip("'#@*&,:;/").lower())

#### Reading twitter data (combined_new_files)

In [6]:
df_2 = pd.read_csv('../data/combined_files/combined_new_files.csv', index_col=0)
df_2.rename(columns={'Tweets':'Tweet'}, inplace=True)
df_2.head()

Unnamed: 0,Tweet
0,Nice endorsement of #BuildBackBetter and...
1,She will be missed. https://t.co/U2eVs49mHg'
2,#UBBulls #GoBulls #SelfEfficacy 05/24/202...
3,Lies are shame. The one you retweeted p...
4,I believe that he is the third @ubalumn...


In [7]:
hashtags_2 = []
for data in df_2['Tweet']:
    for word in data.split(' '):
        if word.startswith('#'):
            hashtags_2.append(word.strip("'#@*&,:;/").lower())

#### Making hashtags list

In [8]:
hashtags=hashtags_1+hashtags_2

unique_hashtags = set(hashtags)

#### Datascraping Hashtags to use as stopwords

In [9]:
stopwords = ['ubuffalo', 'ubtrueblue', 'ubhornsup', 'ubalumni', 'ubgse', 'ubbulls', 'ubmgt']

#### Reading dataset with all tweets

In [10]:
df = pd.read_csv('../data/combined_files/all_tweets.csv', index_col=0)
df.head()

Unnamed: 0,Tweet
0,@TheBuffaloNews Great to see the osteointegrat...
1,After an @AANMember study documented the pay g...
2,Join #UBGSE for Black History Nerds Saturday S...
3,Michael Rembis is the director of the Center f...
4,"Thank you, @NeurologyToday, for giving me the ..."


#### Extracting twitter account names to exclude from analysis

In [11]:
ats = []
for data in df['Tweet']:
    for word in data.split(' '):
        if word.startswith('@'):
            ats.append(word.strip('@').lower())

In [12]:
custom_stopwords = ats+stopwords
len(custom_stopwords)

10198

#### adding custom stopwords

In [13]:
nlp.Defaults.stop_words |= set(custom_stopwords)
nlp.Defaults.stop_words

{'',
 'another',
 'full',
 'since',
 'deltasonicwash',
 'scottwilsonbuf',
 "ubgse'",
 'yalemed',
 'ballstatefb',
 'ssn_ballstate',
 'aera_edresearch',
 'john_majoreason',
 'kristjan90',
 'nyuniversity’s',
 'same',
 'bsucoachneu',
 'thartman2u',
 'nsbe',
 'coachbatts11',
 'rnwora',
 'nine',
 'box_band_chainz',
 'jobqualityindex',
 'yamphoto’s',
 'wrestlingtexas',
 'barrosolab',
 'ubsphhp,',
 'ashlynoshea',
 "profeaustin!'",
 'djjohnson232',
 'jonatthebar',
 'aare_rural_ed',
 "ub_president'",
 'might',
 'drabrashear',
 "82bulls50?'",
 'yaf',
 'wolfblitzer,',
 'rittigers',
 'ed2litt,',
 'goingdeeppod',
 'baldwinwallace',
 'healthleaders',
 'wsucougarfb',
 'dramandanick:',
 'ufpel',
 'mssociety',
 'without',
 'kimdianaconnoll,',
 'ubbullsfans',
 'ubchemistry',
 'patshellyssw',
 'sunychancellor',
 'latterly',
 'nehgov',
 'mattwal17',
 'nysdec',
 'jarvisrgivens',
 'although',
 'mcsdflorida',
 'itong_hiu',
 'whenever',
 'healthequitywks',
 'ubalumni!',
 'prevnet',
 'neaminzeleke',
 'ourselves

#### Cleaning data

1. Removing special characters, hyperlinks, punctuations, numbers, spaces
2. Removing stop words
6. Taking words with length greater than specified in 'min_length_word'
7. Taking Lemmatized version of the cleaned words
8. Taking Lowercase

In [14]:
min_length_word = 3

In [15]:
def data_cleaner(row):
    words = row.split(' ')
    cleaned = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'htt'))==False]
    row = ' '.join(cleaned)
    doc = nlp(row)
    result = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct==False and token.is_space==False and token.is_digit==False and len(token.lemma_)>min_length_word]
    return result

In [16]:
df['Cleaned'] = df['Tweet'].apply(lambda x: data_cleaner(x))
df

Unnamed: 0,Tweet,Cleaned
0,@TheBuffaloNews Great to see the osteointegrat...,"[great, osteointegrated, program, grow]"
1,After an @AANMember study documented the pay g...,"[study, document, male, female, neurologist, t..."
2,Join #UBGSE for Black History Nerds Saturday S...,"[join, black, history, nerds, saturday, school..."
3,Michael Rembis is the director of the Center f...,"[michael, rembis, director, center, disability..."
4,"Thank you, @NeurologyToday, for giving me the ...","[thank, give, opportunity, speak, paygaps, gen..."
...,...,...
6945,Join an intellectual community where leading r...,"[join, intellectual, community, lead, research..."
6946,"Alex Adema, EMBA ’12, grew up as a competitive...","[alex, adema, emba, grow, competitive, skier, ..."
6947,Happy New Year! Cheers to a year full of renew...,"[happy, year, cheer, year, renew, beginning, p..."
6948,"Happy New Year! Another year gone by, but anot...","[happy, year, year, arrive, opportunity, seize..."


#### Building text corpus from cleaned data and saving it

In [17]:
dictionary = corpora.Dictionary(df['Cleaned'])
corpus = [dictionary.doc2bow(text) for text in df['Cleaned']]
pickle.dump(corpus, open('../topic_modeling/corpus.pkl', 'wb'))
dictionary.save('../topic_modeling/dictionary')

### I. Using complete sentences

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [18]:
num_topics = 5
passes = 10
random_state = 100
num_words = 5
corpus = corpus
dictionary = dictionary

### --------------------------------------------------------------------------------------------------------------------------------

#### Model Building

In [19]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                          num_topics = num_topics,
                                          id2word = dictionary,
                                          passes = passes,
                                          random_state=random_state)

ldamodel.save('../topic_modeling/model5.gensim')  #saving the model

#### Checking top topics

In [20]:
topics = ldamodel.print_topics(num_words=num_words)
for topic in topics:
    print(topic)

(0, '0.013*"tplf" + 0.011*"need" + 0.010*"tigray" + 0.009*"force" + 0.009*"right"')
(1, '0.017*"buffalo" + 0.011*"admission" + 0.008*"game" + 0.008*"family" + 0.008*"deadline"')
(2, '0.015*"year" + 0.014*"student" + 0.012*"happy" + 0.011*"share" + 0.010*"thank"')
(3, '0.023*"student" + 0.018*"register" + 0.016*"join" + 0.012*"program" + 0.012*"learn"')
(4, '0.023*"thank" + 0.012*"black" + 0.010*"game" + 0.010*"bulls" + 0.009*"history"')


#### Visualising Topics

In [21]:
lda_display = gensim_models.prepare(ldamodel, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

### II. Using Part of Speech - Nouns

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [22]:
pos = ['NOUN', 'PROPN']
min_length_word = 3

### --------------------------------------------------------------------------------------------------------------------------------

#### Extracting Part of Speech decided above from the tweets

In [23]:
def pos_extracter(row, part:list):
    words = row.split(' ')
    cleaned = [word.strip('@#"*%^();><?][{}]:.&,\'') for word in words if word.startswith(('\nhttps', 'http'))==False]
    row = ' '.join(cleaned)
    doc = nlp(row)
    result = [(token.lemma_).lower() for token in doc if token.is_stop == False and token.is_punct==False and token.pos_ in part and token.is_digit==False and len(token.lemma_)>min_length_word]
    return result

In [24]:
df['POS'] = df['Tweet'].apply(lambda x: pos_extracter(x, pos))
df.head()

Unnamed: 0,Tweet,Cleaned,POS
0,@TheBuffaloNews Great to see the osteointegrat...,"[great, osteointegrated, program, grow]",[program]
1,After an @AANMember study documented the pay g...,"[study, document, male, female, neurologist, t...","[study, neurologist, neurologist, health, scie..."
2,Join #UBGSE for Black History Nerds Saturday S...,"[join, black, history, nerds, saturday, school...","[black, history, nerds, saturday, school, hill..."
3,Michael Rembis is the director of the Center f...,"[michael, rembis, director, center, disability...","[michael, rembis, director, center, disability..."
4,"Thank you, @NeurologyToday, for giving me the ...","[thank, give, opportunity, speak, paygaps, gen...","[opportunity, paygaps, genderinequity, neurolo..."


#### Building and Saving POS Text Corpus

In [25]:
dictionary_nouns = corpora.Dictionary(df['POS'])
corpus_nouns = [dictionary_nouns.doc2bow(text) for text in df['POS']]
pickle.dump(corpus_nouns, open('../topic_modeling/corpus_nouns.pkl', 'wb'))
dictionary_nouns.save('../topic_modeling/dictionary_nouns')

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [26]:
num_topics = 5
passes = 10
random_state = 100
num_words = 5
corpus = corpus_nouns
dictionary = dictionary_nouns

### --------------------------------------------------------------------------------------------------------------------------------

#### Modeling using new POS data

In [27]:
ldamodel_nouns = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                          num_topics = num_topics,
                                          id2word = dictionary,
                                          passes = passes,
                                        random_state=random_state)

ldamodel_nouns.save('../topic_modeling/model5_nouns.gensim')   #saving the model

#### Checking top topics

In [28]:
topics_nouns = ldamodel_nouns.print_topics(num_words=num_words)
for topic in topics_nouns:
    print(topic)

(0, '0.019*"year" + 0.018*"ubmba" + 0.017*"game" + 0.017*"congratulation" + 0.016*"bulls"')
(1, '0.018*"tigray" + 0.016*"thank" + 0.016*"buffalo" + 0.014*"nomore" + 0.014*"force"')
(2, '0.017*"student" + 0.012*"happy" + 0.011*"deadline" + 0.010*"school" + 0.010*"program"')
(3, '0.032*"student" + 0.029*"program" + 0.025*"school" + 0.019*"today" + 0.017*"education"')
(4, '0.014*"diversity" + 0.013*"student" + 0.010*"point" + 0.009*"inclusion" + 0.008*"equity"')


#### Visualising Topics

In [29]:
lda_display_nouns = gensim_models.prepare(ldamodel_nouns, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display_nouns)

### III. Testing for optimal number of topics based on Coherence and Perplexity scores

### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

In [30]:
max_topics = 10

passes = 10
random_state = 100
num_words = 5
corpus = corpus_nouns
dictionary = dictionary_nouns

### --------------------------------------------------------------------------------------------------------------------------------

In [31]:
# THIS SECTION IS TAKEN FROM JASMINE'S CODE

from gensim.models import CoherenceModel

result={'number_topics': [], 'coherence_scores': [], 'perplexity_scores':[]}

print('Calculating Coherence and Perplexity Scores')
print('')
for t in range(2,max_topics+1):
    print(f'Topic number: {t}')


    # Build LDA model elbow chart
    lda_model_f = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=t, 
                                           random_state=random_state,
                                           passes=passes)


    # Compute Perplexity
    perp_score = lda_model_f.log_perplexity(corpus)
    print('\nPerplexity: ', lda_model_f.log_perplexity(corpus))  # a measure of how good the model is. The lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model_f, texts=df['POS'], dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    print('\nCoherence Score: ', coherence_lda)
    print('-'*50)

    result["number_topics"].append(t)
    result["coherence_scores"].append(coherence_lda)
    result["perplexity_scores"].append(perp_score)

topics = pd.DataFrame(result)

opt=max(topics.coherence_scores)
perp=min(topics.perplexity_scores)
topicindex_c=topics.number_topics[topics.coherence_scores==opt].values[0]
topicindex_p=topics.number_topics[topics.perplexity_scores==perp].values[0]
print('')
print('')
print('*'*50)
print(f'Best Perplexity topic:{topicindex_p}')
print(f'Best Coherence topic:{topicindex_c}')
print('*'*50)
topics

Calculating Coherence and Perplexity Scores

Topic number: 2

Perplexity:  -7.9823324193482055

Coherence Score:  0.35233979462066556
--------------------------------------------------
Topic number: 3

Perplexity:  -8.023052403058067

Coherence Score:  0.2898134000426386
--------------------------------------------------
Topic number: 4

Perplexity:  -8.03105735855205

Coherence Score:  0.35331365041598883
--------------------------------------------------
Topic number: 5

Perplexity:  -8.08949084790647

Coherence Score:  0.3898553571166121
--------------------------------------------------
Topic number: 6

Perplexity:  -8.147842735358472

Coherence Score:  0.35097980229008036
--------------------------------------------------
Topic number: 7

Perplexity:  -8.189923995835622

Coherence Score:  0.4232424101205131
--------------------------------------------------
Topic number: 8

Perplexity:  -8.217163484431989

Coherence Score:  0.4360399764708613
--------------------------------------

Unnamed: 0,number_topics,coherence_scores,perplexity_scores
0,2,0.35234,-7.982332
1,3,0.289813,-8.023053
2,4,0.353314,-8.031056
3,5,0.389855,-8.089487
4,6,0.35098,-8.14783
5,7,0.423242,-8.189861
6,8,0.43604,-8.217179
7,9,0.422877,-8.233353
8,10,0.394795,-8.258506


### --------------------------------------------------------------------------------------------------------------------------------
#### CHANGE THIS SECTION ONLY

#### Choose num_topics with lowest Perplexity score and highest Coherence score

In [37]:
num_topics_opt = 7

### --------------------------------------------------------------------------------------------------------------------------------

#### Modeling using Optimal Topics

In [38]:
ldamodel_nouns_f = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics_opt, 
                                           random_state=random_state,
                                           passes=passes)

ldamodel_nouns_f.save('../topic_modeling/model5_nouns_f.gensim') #saving the model

topics_nouns_f = ldamodel_nouns_f.print_topics(num_words=num_words)
for topic in topics_nouns_f:
    print(topic)

lda_nouns_display_f = gensim_models.prepare(ldamodel_nouns_f, corpus, dictionary, sort_topics=True)
vis = pyLDAvis.display(lda_nouns_display_f)
vis

(0, '0.022*"game" + 0.021*"ubmba" + 0.019*"leadership" + 0.019*"team" + 0.015*"diversity"')
(1, '0.030*"buffalo" + 0.016*"leader" + 0.012*"nomore" + 0.011*"alumni" + 0.011*"family"')
(2, '0.019*"spring" + 0.018*"student" + 0.014*"health" + 0.012*"school" + 0.011*"program"')
(3, '0.039*"student" + 0.032*"program" + 0.026*"school" + 0.021*"education" + 0.020*"today"')
(4, '0.017*"student" + 0.011*"scholarship" + 0.009*"work" + 0.009*"research" + 0.009*"people"')
(5, '0.032*"year" + 0.022*"happy" + 0.017*"book" + 0.014*"care" + 0.013*"child"')
(6, '0.023*"thank" + 0.018*"deadline" + 0.016*"emba" + 0.015*"research" + 0.013*"force"')


#### Save the visualisation

In [39]:
pyLDAvis.save_html(lda_nouns_display_f, '../topic_modeling/twitter_topic_modeling.html')