In [None]:
# import libraries
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the pre-processed tweets and keep only the columns used for topic modelling.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
keep_columns = [
    'Tweet',
    'tidy_tweet',
    'tidy_tweet_tokens',
    'tokens_no_stop',
    'no_stop_joined',
    'lemmatized',
    'lemmatized_joined',
    'stemmed',
]  # 'Authors' is currently not selected
df = pd.read_pickle('path/to/pre-processed.pkl')
df = df[keep_columns]

In [None]:
def rejoin_words(row):
    """Join a row's stemmed tokens back into a single space-separated string.

    Parameters
    ----------
    row : mapping-like
        Any object indexable with the key 'stemmed' whose value is an
        iterable of strings (for example a DataFrame row).

    Returns
    -------
    str
        The tokens joined with single spaces.
    """
    return " ".join(row['stemmed'])

In [None]:
# Join each tweet's stemmed tokens into one string.
# Mapping over the single 'stemmed' column avoids building a full row object per tweet,
# which is what a row-wise DataFrame.apply(axis=1) does.
df['stemmed_joined'] = df['stemmed'].map(" ".join)

In [None]:
# Bag-of-words vectorizer used to build the document-word matrix
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                     # convert all words to lowercase
    stop_words='english',               # remove English stop words
    token_pattern='[a-zA-Z0-9]{3,}',    # a token is a run of 3 or more alphanumeric characters
    max_df=0.9,                         # ignore terms that appear in more than 90% of the documents
    # Not enabled: min_df=25            (minimum required occurrences of a word)
    # Not enabled: max_features=50000   (maximum number of unique words)
)

In [None]:
# Fit the vectorizer on the first third of the corpus and build its document-word matrix
subset_size = round(len(df) / 3)
data_vectorized = vectorizer.fit_transform(df['stemmed_joined'][0:subset_size])

In [None]:
# Check the sparsity
# Materialize the sparse data (data_dense is also read by later cells through len(data_dense))
data_dense = data_vectorized.todense()

# Sparsicity = percentage of non-zero cells.
# The ratio is scaled to a percentage exactly once; multiplying by 100 again after
# rounding would inflate the printed value by a factor of 100.
sparsicity_pct = (data_dense > 0).sum() / data_dense.size * 100
print("Sparsicity: ", round(sparsicity_pct, 3), "%")

In [None]:
# Configure (but do not fit yet) the scikit-learn LDA topic model
lda_params = dict(
    n_components=10,            # number of topics
    learning_method='online',
    max_iter=10,                # max learning iterations
    batch_size=128,             # n docs in each learning iter
    evaluate_every=-1,          # compute perplexity every n iters; -1 means never
    random_state=100,           # fixed seed
    n_jobs=-1,                  # use all available CPUs
)
lda_model = LatentDirichletAllocation(**lda_params)

In [None]:
# Fit the LDA model on the document-word matrix and keep the transformed output
# (later overwritten with the output of the grid-searched model)
lda_output = lda_model.fit_transform(data_vectorized)

In [None]:
# Diagnose model performance with log-likelihood and perplexity
log_likelihood = lda_model.score(data_vectorized)      # higher is better
perplexity = lda_model.perplexity(data_vectorized)     # lower is better; exp(-1. * log-likelihood per word)

print("Log Likelihood: ", log_likelihood)
print("Perplexity: ", perplexity)


In [None]:
# GridSearch for the best LDA model
# Candidate hyper-parameters: 4..15 topics crossed with three learning-decay values
search_params = {'n_components': list(range(4, 16)), 'learning_decay': [.5, .7, .9]}

# Init the Model.
# learning_decay only influences the 'online' update rule; with the default 'batch'
# learning method every decay value would train the same model, so the search over
# learning_decay would be meaningless. The fixed seed makes the search reproducible
# and matches the seed of the first LDA model in this notebook.
lda = LatentDirichletAllocation(learning_method='online', random_state=100)

# Init Grid Search Class (scored with the estimator's own score(), i.e. approximate log-likelihood)
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

In [None]:
# Pull the winning estimator and its summary figures out of the grid search
best_lda_model = model.best_estimator_
best_params = model.best_params_
best_score = model.best_score_
best_perplexity = best_lda_model.perplexity(data_vectorized)

# Report: parameters, mean cross-validated log-likelihood score, perplexity on the full matrix
print("Best Model's Params: ", best_params)
print("Best Log Likelihood Score: ", best_score)
print("Model Perplexity: ", best_perplexity)

In [None]:
# Get Log Likelyhoods from Grid Search Output
# Look scores up by their actual (learning_decay, n_components) combination instead of
# slicing mean_test_score by position: the grid has one entry per combination, and
# positional slices do not line up with one curve per learning-decay value.
cv_params = model.cv_results_['params']
cv_scores = model.cv_results_['mean_test_score']
scores_by_params = {
    (params['learning_decay'], params['n_components']): round(score)
    for params, score in zip(cv_params, cv_scores)
}

# Axis values and curves are derived from the grid that was actually searched
n_topics = sorted({params['n_components'] for params in cv_params})
learning_decays = sorted({params['learning_decay'] for params in cv_params})
log_likelyhoods = {
    decay: [scores_by_params[(decay, n_components)] for n_components in n_topics]
    for decay in learning_decays
}

# Show graph: one line per learning-decay value
plt.figure(figsize=(12, 8))
for decay in learning_decays:
    plt.plot(n_topics, log_likelyhoods[decay], label=str(decay))
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [None]:
#!pip3 install --upgrade jinja2

In [None]:
# Create Document - Topic Matrix
# Re-compute the topic output with the grid-searched model; this overwrites the
# lda_output produced earlier by the first LDA model.
lda_output = best_lda_model.transform(data_vectorized)

In [None]:
# Number of rows in the dense document-word matrix (one row per vectorized document)
len(data_dense)

In [None]:
# Column labels: one per topic of the selected model
topicnames = [f"Topic{topic_idx}" for topic_idx in range(best_lda_model.n_components)]

# Row labels: one per row of the dense document-word matrix
docnames = [f"Tweet {doc_idx}" for doc_idx in range(len(data_dense))]

# Document-topic table, with the topic weights rounded to two decimals
df_document_topic = pd.DataFrame(
    np.round(lda_output, 2),
    index=docnames,
    columns=topicnames,
)


In [None]:

# Get dominant topic for each document.
# Only the topic-weight columns take part in the argmax, so re-running this cell after
# 'dominant_topic' (or any other non-topic column) has been added still gives the same result.
dominant_topic = np.argmax(df_document_topic[topicnames].values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic


In [None]:
# Inspect the document-topic table together with the dominant_topic column
df_document_topic

In [None]:
# Count how many documents have each topic as their dominant topic
dominant_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = dominant_counts.reset_index(name="Num Documents")

# Use readable column headers, then display the table
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

In [None]:
# Bar chart of how many tweets fall under each dominant topic.
# The rows of df_topic_distribution come from value_counts (ordered by frequency) and the
# number of topics depends on the grid-search result, so tick labels are built from the
# 'Topic Num' column rather than a fixed list; sorting keeps the bars in topic order.
distribution_by_topic = df_topic_distribution.sort_values('Topic Num')
label = ['Topic ' + str(topic_num) for topic_num in distribution_by_topic['Topic Num']]
freq = distribution_by_topic['Num Documents']
index = np.arange(len(freq))

print("Total Tweets", df_topic_distribution['Num Documents'].sum())
plt.figure(figsize=(8,6))
plt.bar(index, freq, alpha=0.8, color= 'black', width=0.7)
plt.xlabel('Topics', fontsize=13)
plt.ylabel('Number of Tweets', fontsize=13)
plt.xticks(index, label, fontsize=11, fontweight="bold")
plt.title('Topic Distribution', fontsize=12, fontweight="bold")
plt.show()

In [None]:
# Topic-keyword matrix: one row per topic, one column per vocabulary term,
# filled with the fitted model's components_ values
df_topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topicnames,
    columns=vectorizer.get_feature_names_out(),
)

# Preview the first rows
df_topic_keywords.head()


In [None]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    """Return the highest-weighted keywords of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must provide get_feature_names_out(); defaults to the notebook's
        `vectorizer` as bound when this function is defined.
    lda_model : fitted topic model
        Must expose a components_ array with one row of term weights per topic;
        defaults to the notebook's `lda_model` as bound when this function is defined.
    n_words : int
        Number of keywords to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic holding its keywords in descending weight order.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    # Sorting the negated weights ascending yields the indices in descending weight order
    return [
        vocabulary[np.argsort(-weights)[:n_words]]
        for weights in lda_model.components_
    ]

# Top 10 keywords per topic for the grid-searched model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)

# Topic - Keywords Dataframe (this re-binds df_topic_keywords to the top-keyword table)
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topic_rows, n_keyword_cols = df_topic_keywords.shape
df_topic_keywords.columns = [f'Word {position}' for position in range(n_keyword_cols)]
df_topic_keywords.index = [f'Topic {topic_num}' for topic_num in range(n_topic_rows)]
df_topic_keywords

In [None]:
#Cluster Documents with common topic
# Construct the k-means clusters on the document-topic output
# NOTE(review): n_clusters=4 is fixed and independent of the number of topics chosen by
# the grid search - confirm this is intended.
from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=4, random_state=100).fit_predict(lda_output)

# Build the Singular Value Decomposition(SVD) model.
# The seed is fixed (same value as the other models in this notebook) so that the
# 2-D projection is reproducible across runs.
svd_model = TruncatedSVD(n_components=2, random_state=100)  # 2 components
lda_output_svd = svd_model.fit_transform(lda_output)

# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

# Weights of each lda_output column (one column per topic) for each component
print("Component's weights: \n", np.round(svd_model.components_, 2))

# Percentage of total information in 'lda_output' explained by the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

In [None]:
# Scatter the two SVD components, coloured by k-means cluster
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(x, y, c=clusters)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title("Segregation of Topic Clusters")

In [None]:
# Attach the raw tweet text for the same first-third slice of df that was vectorized;
# to_numpy() drops df's index so the values are assigned by position
subset_size = round(len(df) / 3)
subset_tweets = df['Tweet'][0:subset_size]
df_document_topic['Tweet'] = subset_tweets.to_numpy()

In [None]:
# Persist the document-topic table (topic weights, dominant topic and tweet text) as a pickle
df_document_topic.to_pickle('path/to/topic_modelling_sklearn_results.pkl')