In [1]:
#Separate notebook to work on Topic Modelling with LDA
%pylab inline
from sys import platform
import pandas as pd

if platform == "win32":
    path = 'C:/Users/olive/'
elif platform == "darwin":
    path = '~/'

Populating the interactive namespace from numpy and matplotlib


In [2]:
c_df = pd.read_pickle(path + 'OneDrive - Cardiff University/Individual Project/processed data/comments.csv')
s_df = pd.read_pickle(path + 'OneDrive - Cardiff University/Individual Project/processed data/submissions.csv')

s_df = s_df.sort_values(by='datetime')
c_df = c_df.sort_values(by='datetime')

In [3]:
comments = c_df.reset_index().drop(columns=['id', 'datetime', 'sentiment', 'compound_score', 'submission_id', 'parent_id'], axis=1)

comments = comments.sample(100)

comments.head()

Unnamed: 0,body
538420,Exactly. I don't mean to be mean but I don't u...
289030,[deleted]
235804,I am not supporting or defending or suggesting...
315874,Finally someone who speaks my language. That's...
378895,They can leave it on your doorstep stand back ...


In [4]:
# Remove punctuation
import re

comments['processed_text'] = comments['body'].map(lambda x: re.sub('[,\.!?]', '', x))

comments['processed_text'] = comments['processed_text'].map(lambda x: x.lower())

comments['processed_text'].head()

538420    exactly i don't mean to be mean but i don't un...
289030                                            [deleted]
235804    i am not supporting or defending or suggesting...
315874    finally someone who speaks my language that's ...
378895    they can leave it on your doorstep stand back ...
Name: processed_text, dtype: object

In [32]:
# EXPLORATORY ANALYSIS

from wordcloud import WordCloud

long_string = ','.join(list(comments['processed_text'].values))

wordcloud = WordCloud(background_color="white",
                      max_words=5000,
                      contour_width=3,
                      contour_coliur='steelblue'
                     )

wordcloud.generate(long_string)

wordcloud.to_image()


ModuleNotFoundError: No module named 'wordcloud'

In [5]:
# Tokenize words and more text clean-up

import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence),
                                            deacc=True))
data = comments.processed_text.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['exactly', 'don', 'mean', 'to', 'be', 'mean', 'but', 'don', 'understand', 'these', 'people', 'who', 'have', 'stayed', 'in', 'for', 'nearly', 'year', 'surely', 'at', 'some', 'point', 'you', 'could', 'have', 'made', 'the', 'trip', 'to', 'see']


In [6]:
#Bigram and Trigram Phrase Modelling

bigram = gensim.models.Phrases(
    data_words, min_count=5, threshold=100
)
trigram = gensim.models.Phrases(
    bigram[data_words], threshold=100
)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
#Remove Stopwords, Make Bigrams and Lemmantize

import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [8]:
#Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
import spacy

data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load("en_core_web_sm", disable=['parser',
                                            'ner'])

data_lemmatized = lemmatization(data_words_bigrams, 
                                allowed_postags=['NOUN', 
                                                 'ADJ', 
                                                 'VERB', 
                                                 'ADV'])

print(data_lemmatized[:1])

[['exactly', 'mean', 'mean', 'understand', 'people', 'stay', 'nearly', 'year', 'surely', 'point', 'make', 'trip', 'see', 'want']]


In [10]:
#Data Tranformation: Corpus and Dictionary

import gensim.corpora as corpora

id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]


In [11]:
from pprint import pprint

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.015*"say" + 0.015*"week" + 0.014*"take" + 0.014*"delete" + 0.014*"show" + '
  '0.008*"people" + 0.007*"year" + 0.007*"work" + 0.007*"job" + '
  '0.007*"measure"'),
 (1,
  '0.025*"people" + 0.011*"travel" + 0.011*"double" + 0.007*"unnecessary" + '
  '0.007*"symptom" + 0.007*"get" + 0.007*"covid" + 0.007*"talk" + '
  '0.007*"quarantine" + 0.007*"even"'),
 (2,
  '0.016*"people" + 0.011*"government" + 0.011*"september" + 0.011*"back" + '
  '0.011*"pad" + 0.006*"seem" + 0.006*"make" + 0.006*"try" + 0.006*"early" + '
  '0.006*"good"'),
 (3,
  '0.016*"https" + 0.010*"people" + 0.010*"well" + 0.010*"say" + 0.010*"thing" '
  '+ 0.010*"test" + 0.006*"go" + 0.006*"get" + 0.006*"work" + 0.006*"much"'),
 (4,
  '0.018*"want" + 0.018*"mean" + 0.014*"day" + 0.009*"people" + 0.009*"agree" '
  '+ 0.009*"need" + 0.009*"see" + 0.009*"health" + 0.009*"mind" + '
  '0.009*"plan"'),
 (5,
  '0.010*"risk" + 0.010*"trial" + 0.008*"want" + 0.008*"people" + 0.008*"go" + '
  '0.008*"sneeze" + 0.008*"time"

In [12]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.37493778404559935


In [28]:
#Hyperparameter Tuning

def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    print(corpus)
    print(dictionary)
    print(k)
    print(a)
    print(b)
    coherence_model_lda = CoherenceModel(model=lda_model, 
                                         texts=data_lemmatized, 
                                         dictionary=id2word,
                                         coherence='c_v')
    
    return coherence_model_lda.get_coherence()


In [30]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

#Can take a while to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], 
                                                  dictionary=id2word, 
                                                  k=k, 
                                                  a=a,
                                                  b=b)
                    print('Alert')
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
                    
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv',
                                       index=False)
    pbar.close()



  0%|          | 0/540 [00:00<?, ?it/s][A

ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.

In [27]:
range(len(corpus_sets))

range(0, 2)