In [26]:
import re
import numpy as np
import pandas as pd
import os
import glob
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore
from nltk import TweetTokenizer

# scikit-learn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# spacy for lemmatization
import spacy

# Plotting tools
# import pyLDAvis
# import pyLDAvis.gensim_models  # don't skip this
# import matplotlib.pyplot as plt
# %matplotlib inline

# Enable logging for gensim - optional
import logging
import os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# NLTK Stop words
from nltk.corpus import stopwords

stop_words = stopwords.words('english')


In [4]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [5]:
# Read in data

globStr = "*"
dirName = "/home/tommy/PycharmProjects/nlpTwitter/april_29_12pm/"
data_files = [(x[0], x[2]) for x in os.walk(dirName)]
# print(data_files[0][1])
tweetList = []
for file in data_files[0][1]:
    docFile = open(dirName + file, "r")
    doc = docFile.read()
    entry = [file, doc]
    tweetList.append(entry)

tweetList = np.array(tweetList)

In [6]:
# Remove new line characters
for tweetIdx, tweet in enumerate(tweetList):
    test = tweet
    tweetList[tweetIdx][1] = re.sub('\s+', ' ', tweet[1])

In [7]:
tweet_tokenizer = TweetTokenizer()
data_words = []
for doc in tweetList[:,1]:
    data_words.append(tweet_tokenizer.tokenize(doc))

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# Human readable format of corpus (term-frequency)
#id2word[0]
print([[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])



In [9]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=15,
                                            random_state=1,
                                            passes=10)

In [10]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

base_perplexity = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity)

# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                   dictionary=id2word, coherence='c_v')
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

[(0,
  '0.073*"teacher" + 0.031*"thank" + 0.028*"week" + 0.017*"happy" + '
  '0.017*"appreciation" + 0.016*"student" + 0.012*"educator" + 0.011*"day" + '
  '0.009*"work" + 0.009*"love"'),
 (1,
  '0.002*"ozark" + 0.001*"man" + 0.001*"ant" + 0.001*"truth" + 0.001*"willie" '
  '+ 0.001*"get" + 0.001*"tree" + 0.000*"fridayfeeling" + 0.000*"rock" + '
  '0.000*"today"'),
 (2,
  '0.032*"tree" + 0.031*"rock" + 0.012*"day" + 0.011*"meet" + 0.011*"arborday" '
  '+ 0.011*"plant" + 0.010*"gala" + 0.009*"happy" + 0.009*"today" + '
  '0.007*"celebrate"'),
 (3,
  '0.022*"artist" + 0.020*"comic_strip" + 0.017*"comic" + 0.016*"book" + '
  '0.012*"rip" + 0.012*"creator" + 0.012*"adam" + 0.012*"great" + 0.011*"neal" '
  '+ 0.010*"right"'),
 (4,
  '0.032*"man" + 0.028*"ant" + 0.028*"fridayfeeling" + 0.022*"fridayfeele" + '
  '0.008*"get" + 0.008*"marvel" + 0.007*"weekend" + 0.006*"drpepper" + '
  '0.006*"way" + 0.005*"day"'),
 (5,
  '0.002*"rock" + 0.001*"tree" + 0.001*"happy" + 0.001*"day" + 0.001*"teach

In [11]:
tokens_back_to_text = [' '.join(map(str, l)) for l in data_lemmatized]

In [18]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(tokens_back_to_text)

# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

In [20]:
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             n_jobs=1,
             param_grid={'n_topics': [10, 15, 20, 30],'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs',
             refit=True,
             return_train_score='warn',
             scoring=None,
             verbose=3)

GridSearchCV(error_score='raise',
             estimator=LatentDirichletAllocation(learning_method=None,
                                                 n_jobs=1),
             n_jobs=1,
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_topics': [10, 15, 20, 30]},
             return_train_score='warn', verbose=3)

In [21]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -644387.365726891
Model Perplexity:  1770.8479949197294


In [28]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of tipics
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, verbose=False):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        if verbose:
            print(num_topics)
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic    

In [35]:
# Can take a long time to run.
model_list_topic, coherence_values_topic = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_lemmatized,
                                                        start=2, limit=26, step=2, verbose=True)

2
4
6
8
10
12
14
16
18
20
22
24


In [36]:
coherence_values_topic

[0.22700074092261363,
 0.24220982815847658,
 0.23621026692272618,
 0.24082865755610838,
 0.2640902575810781,
 0.24789685226305136,
 0.24587942188615375,
 0.2554813616074703,
 0.25599793561368456,
 0.23302203542613192,
 0.2496879491357352,
 0.2488869367250408]

Best topic number = 10

In [43]:
#Defining a function to loop over number of topics to be used to find an 
#optimal number of tipics
def compute_pass_coherence(dictionary, corpus, texts, limit, start=2, step=3, numTopics=10, verbose=False):
    """
    Compute c_v coherence for various number of passes

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective number of topics
    """
    coherence_values_pass = []
    model_list_pass = []
    pass_numbers = []
    for num_passes in range(start, limit, step):
        if verbose:
            print(num_passes)
        pass_numbers.append(num_passes)
        model = LdaMulticore(corpus=corpus, num_topics=numTopics, id2word=id2word, passes=num_passes)
        model_list_pass.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_pass.append(coherencemodel.get_coherence())

    return model_list_pass, coherence_values_pass, pass_numbers

In [44]:
# Can take a long time to run.
model_list_pass, coherence_values_pass, pass_numbers = compute_pass_coherence(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_lemmatized,
                                                        start=1, limit=100, step=10, verbose=True)

1
11
21
31
41
51
61
71
81
91


In [51]:
for e in sorted(zip(pass_numbers, coherence_values_pass)):
    print(e)

(1, 0.23873938325808836)
(11, 0.43546028714701873)
(21, 0.5497241492467927)
(31, 0.532404031116434)
(41, 0.4116355451042374)
(51, 0.4248497837499426)
(61, 0.48514630501515044)
(71, 0.517233483172031)
(81, 0.4680025912702247)
(91, 0.4618293957796082)


In [57]:
model_list_pass_2, coherence_values_pass_2, pass_numbers_2 = compute_pass_coherence(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_lemmatized,
                                                        start=10, limit=24, step=1, verbose=False)

In [58]:
for e in zip(pass_numbers_2, coherence_values_pass_2):
    print(e)

(10, 0.4862814045863183)
(11, 0.46347240099674886)
(12, 0.4759043623175293)
(13, 0.4816294212276676)
(14, 0.4278066182359309)
(15, 0.49403827974565206)
(16, 0.4197771645182108)
(17, 0.4292485554383463)
(18, 0.39728663264718983)
(19, 0.514947562089464)
(20, 0.4865514284912968)
(21, 0.4626509106696929)
(22, 0.4698424696923748)
(23, 0.4024689503154414)


In [65]:

#import decimal

def float_range(start, stop, step):
    while start < stop:
        yield float(start)
        start += float(step)

#Defining a function to loop over number of decay values to be used to find an 
#optimal value for decay
def compute_decay_coherence(dictionary, corpus, texts, limit, start=2, step=3, numTopics=10, numPasses = 10, verbose=False):
    """
    Compute c_v coherence for various values for decay

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the 
    LDA model with respective decay values
    """
    coherence_values_decay = []
    model_list_decay = []
    decay_numbers = []
    for num_decay in float_range(start, limit, step):
        if verbose:
            print(num_decay)
        decay_numbers.append(num_decay)
        model = LdaMulticore(corpus=corpus, num_topics=numTopics, id2word=id2word, passes=numPasses, decay=num_decay)
        model_list_decay.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_decay.append(coherencemodel.get_coherence())

    return model_list_decay, coherence_values_decay, decay_numbers

In [66]:
model_list_decay, coherence_values_decay, decay_numbers = compute_decay_coherence(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_lemmatized,
                                                        start=0.5, limit=1, step=0.05, verbose=True,
                                                        numPasses = 19)

0.5
0.55
0.6000000000000001
0.6500000000000001
0.7000000000000002
0.7500000000000002
0.8000000000000003
0.8500000000000003
0.9000000000000004
0.9500000000000004


In [67]:
for e in zip(decay_numbers, coherence_values_decay):
    print(e)

(0.5, 0.4079381052577268)
(0.55, 0.41265437439534025)
(0.6000000000000001, 0.37211009957033403)
(0.6500000000000001, 0.49554758295150636)
(0.7000000000000002, 0.4142512939725793)
(0.7500000000000002, 0.4423523412070815)
(0.8000000000000003, 0.35937744375903036)
(0.8500000000000003, 0.408392614538258)
(0.9000000000000004, 0.4862202265172012)
(0.9500000000000004, 0.4049296541405856)


[(0,
  '0.071*"teacher" + 0.031*"thank" + 0.028*"week" + 0.017*"happy" + '
  '0.017*"appreciation" + 0.016*"student" + 0.012*"educator" + 0.010*"day" + '
  '0.009*"work" + 0.009*"love"'),
 (1,
  '0.097*"citygirl" + 0.087*"topnotchfriday" + 0.015*"get" + 0.011*"graduate" '
  '+ 0.011*"thank" + 0.011*"bachelor" + 0.010*"need" + 0.007*"pay" + '
  '0.007*"blessing" + 0.007*"cashapp"'),
 (2,
  '0.086*"ozark" + 0.021*"episode" + 0.017*"watch" + 0.015*"season" + '
  '0.015*"ruth" + 0.014*"final" + 0.011*"part" + 0.010*"get" + 0.010*"end" + '
  '0.009*"day"'),
 (3,
  '0.033*"mubarak" + 0.019*"wish" + 0.019*"family" + 0.018*"celebrate" + '
  '0.018*"happy" + 0.015*"eid" + 0.013*"bless" + 0.012*"happiness" + '
  '0.011*"love" + 0.011*"day"'),
 (4,
  '0.032*"meet" + 0.028*"gala" + 0.027*"truth" + 0.014*"ministry" + '
  '0.012*"artist" + 0.011*"comic_strip" + 0.009*"comic" + 0.009*"book" + '
  '0.008*"go" + 0.008*"drpepper"'),
 (5,
  '0.002*"willie" + 0.001*"tree" + 0.001*"day" + 0.001*"happy" + '

In [70]:
bestEmojiModel = LdaMulticore(corpus=corpus,
                              num_topics=10,
                              id2word=id2word,
                              passes=19,
                              decay=0.65)
pprint(bestEmojiModel.print_topics())

[(0,
  '0.074*"tree" + 0.025*"arborday" + 0.025*"plant" + 0.019*"day" + '
  '0.014*"today" + 0.013*"celebrate" + 0.011*"planting" + 0.010*"happy" + '
  '0.007*"help" + 0.007*"year"'),
 (1,
  '0.071*"teacher" + 0.031*"thank" + 0.028*"week" + 0.017*"happy" + '
  '0.017*"appreciation" + 0.016*"student" + 0.012*"educator" + 0.010*"day" + '
  '0.009*"work" + 0.009*"love"'),
 (2,
  '0.039*"citygirl" + 0.035*"topnotchfriday" + 0.017*"artist" + '
  '0.015*"comic_strip" + 0.014*"comic" + 0.012*"book" + 0.011*"get" + '
  '0.010*"rip" + 0.010*"creator" + 0.009*"adam"'),
 (3,
  '0.002*"ozark" + 0.000*"get" + 0.000*"episode" + 0.000*"watch" + '
  '0.000*"tree" + 0.000*"day" + 0.000*"today" + 0.000*"happy" + 0.000*"go" + '
  '0.000*"season"'),
 (4,
  '0.071*"ozark" + 0.017*"episode" + 0.014*"watch" + 0.012*"season" + '
  '0.012*"ruth" + 0.011*"final" + 0.010*"day" + 0.009*"part" + 0.009*"get" + '
  '0.009*"end"'),
 (5,
  '0.091*"man" + 0.082*"ant" + 0.023*"marvel" + 0.013*"release" + 0.012*"date" '
