In [None]:
# imports

import pandas as pd
import numpy as np
import networkx as nx
import itertools
import collections
import spacy
from pprint import pprint

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
#from gensim.models.wrappers import LdaMallet

# nltk 
from nltk import bigrams
from nltk.stem import PorterStemmer

sns.set(font_scale = 1.5)
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

%matplotlib inline

In [None]:
# import pre-processed data
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that was produced by a trusted pre-processing step.
df = pd.read_pickle('path/to/pre-processed.pkl')

In [None]:
# Keep only the text columns used in this notebook. The explicit .copy() makes
# df an independent frame, so the column assignments made later on
# (e.g. df['lemmatized'] = ...) do not raise SettingWithCopyWarning.
df = df[['Tweet', 'tidy_tweet', 'tidy_tweet_tokens', 'tokens_no_stop', 'no_stop_joined']].copy()

In [None]:
# (rows, columns) of df after the column selection
df.shape

In [None]:
# Bigrams
# For every tweet, pair each token with its successor; the result is one list
# of bigrams per tweet.
terms_bigram = []
for tweet_tokens in df['tokens_no_stop']:
    terms_bigram.append(list(bigrams(tweet_tokens)))

# Inspect the bigrams of the first tweet
terms_bigram[0]

In [None]:
# Flatten the per-tweet bigram lists into one sequence.
# The result is deliberately NOT stored under the name `bigrams`: that would
# overwrite the nltk `bigrams` function imported at the top of the notebook
# and make the cell that builds `terms_bigram` fail when it is run again.
all_bigrams = list(itertools.chain.from_iterable(terms_bigram))

# Count how often each bigram occurs across all tweets
bigram_counts = collections.Counter(all_bigrams)

In [None]:
# Table of the 25 most frequent bigrams: one row per bigram with its count
bigram_df = pd.DataFrame(
    data=bigram_counts.most_common(25),
    columns=['bigram', 'count'],
)

In [None]:
# Visualize the bigrams
# One-element list holding a {bigram: count} dictionary
d = bigram_df.set_index('bigram').T.to_dict('records')

# Build an undirected graph: each bigram becomes an edge between its two words,
# weighted by five times its count
G = nx.Graph()
for (first_word, second_word), count in d[0].items():
    G.add_edge(first_word, second_word, weight=(count * 5))

fig, ax = plt.subplots(figsize=(11, 9))

# A fixed seed makes the force-directed layout (and so the figure) identical
# on every run; without it the node positions are random.
pos = nx.spring_layout(G, k=2, seed=100)

# Plot networks (labels are drawn separately below so they can be offset)
nx.draw_networkx(G, pos, font_size=10, width=3, edge_color='grey', node_color='purple', with_labels=False, ax=ax)

# Create offset labels: shift each label slightly up and to the right of its node
for node, (node_x, node_y) in pos.items():
    ax.text(node_x + .080, node_y + .050, s=node, bbox=dict(facecolor='red', alpha=0.15), horizontalalignment='center', fontsize=12)

plt.show()


In [None]:
# Bigrams and Trigrams
# Plain Python list with the 'no_stop_joined' value of every row
data = df['no_stop_joined'].tolist()

In [None]:
# Tokenize
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; the resulting
    token list is yielded, one per input item.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

In [None]:
# Materialize the tokenizer generator: one token list per entry of data
data_words = [tokens for tokens in sent_to_words(data)]

In [None]:
# Make Bigrams and Trigrams
# Phrase detectors: the bigram model is trained on the raw token lists, the
# trigram model on the token lists after the bigram model has been applied.
bigram = gensim.models.Phrases(
    sentences=data_words,
    min_count=3,
    threshold=100,
)
trigram = gensim.models.Phrases(
    sentences=bigram[data_words],
    threshold=100,
)

# Phraser wrappers around the two trained models, used for applying them to documents
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# See trigram example: apply the bigram and then the trigram model to the
# tweet at position 55 (raises IndexError if data_words has fewer than 56 entries)
print(trigram_mod[bigram_mod[data_words[55]]])

In [None]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Both models are read from module level. Returns a list with one
    transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [None]:
# Form Bigrams
# Token lists with the bigram model applied to each tweet
data_words_bigrams = make_bigrams(texts=data_words)

In [None]:
# Lemmatization
def lemmatization(tweets, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized tweets, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    tweets : iterable of token lists
        Each token list is joined with single spaces and parsed by the
        module-level ``nlp`` pipeline.
    allowed_postags : iterable of str
        Values of ``token.pos_`` to keep. The default list is never mutated.

    Returns
    -------
    list of list
        For every input tweet, the ``lemma_`` of each kept token, in order.
    """
    keep_pos = set(allowed_postags)  # constant-time membership checks
    joined_tweets = (' '.join(sent) for sent in tweets)
    # nlp.pipe processes the texts as a stream (in batches) instead of
    # invoking the whole pipeline separately for every tweet.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in nlp.pipe(joined_tweets)
    ]

In [None]:
# Initialize the spaCy English model used by lemmatization().
# No components are disabled here, so whatever pipeline "en_core_web_sm"
# ships with is loaded in full.
# NOTE(review): the earlier wording said only the tagger is kept; if that is the
# intent, the unused components would have to be disabled explicitly — confirm.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Do lemmatization keeping only noun, adj, vb, adv.
# The result list is in the same order as df's rows, so the Series is given
# df's own index. A bare pd.Series(...) would be labelled 0..n-1 and pandas
# would then align it on df's labels, leaving NaN (or wrong rows) whenever
# df's index is not exactly 0..n-1.
df['lemmatized'] = pd.Series(
    lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']),
    index=df.index,
)

In [None]:
# Dropping Duplicates
# join the tweet back together
def rejoin_words(row):
    """Return the tokens stored under ``row['lemmatized']`` joined by single spaces."""
    return ' '.join(row['lemmatized'])

In [None]:
# One space-joined string of lemmas per row (used below to detect duplicates)
df['lemmatized_joined'] = df.apply(func=rejoin_words, axis='columns')

In [None]:
# Keep the first occurrence of every distinct lemmatized text. The .copy()
# detaches the result from the original frame so that adding columns to it
# afterwards (e.g. df['stemmed'] = ...) does not raise SettingWithCopyWarning.
# Row labels are left untouched, so they may contain gaps from here on.
df = df.drop_duplicates(subset=['lemmatized_joined'], keep='first').copy()

In [None]:
# (rows, columns) of df after dropping duplicate lemmatized texts
df.shape

In [None]:
# Stemming
# Single PorterStemmer instance, reused for every token in the next cell
stemmer = PorterStemmer()

In [None]:
# Stem every lemma of every tweet; each row ends up with a list of stems
df['stemmed'] = df['lemmatized'].apply(
    lambda lemmas: [stemmer.stem(lemma) for lemma in lemmas]
)

In [None]:
# Only the first third of the rows (by position) is used for the analysis.
# NOTE(review): the author equates this with the first month of data — confirm
# that the rows are in chronological order.
df['stemmed'].iloc[:round(len(df) / 3)]

In [None]:
# Create Dictionary and Corpus
# Token -> id mapping built from the stemmed tokens of ALL rows of df
id2word_stemmed = corpora.Dictionary(documents=df['stemmed'])

In [None]:
# Create Dictionary
# NOTE(review): despite the "_lemma" suffix this dictionary is built from the
# 'tokens_no_stop' column, not from 'lemmatized' — confirm which is intended.
id2word_lemma = corpora.Dictionary(df['tokens_no_stop'])

In [None]:
# Create Dictionary
# Token -> id mapping built from the 'tidy_tweet_tokens' column
id2word_tidy = corpora.Dictionary(df['tidy_tweet_tokens'])

In [None]:
# Create Corpus
# Stemmed token lists of the first third of the rows (selected by position)
tweets_stemmed = df['stemmed'].iloc[:round(len(df) / 3)]

In [None]:
# Term Document Frequency
# Bag-of-words representation of every tweet in tweets_stemmed
corpus_stemmed = list(map(id2word_stemmed.doc2bow, tweets_stemmed))

In [None]:
# Human readable format of corpus (term-frequency) for the first three tweets
[
    [(id2word_stemmed[token_id], count) for token_id, count in bow]
    for bow in corpus_stemmed[:3]
]

In [None]:
# Build LDA model: 10 topics on the stemmed corpus, with a fixed random_state
lda_model_stemmed = gensim.models.ldamodel.LdaModel(
    corpus=corpus_stemmed,
    id2word=id2word_stemmed,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=15,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# View the Topics
pprint(lda_model_stemmed.print_topics())
# The model applied to the training corpus; not referenced again in the cells below
doc_lda_stemmed = lda_model_stemmed[corpus_stemmed]

In [None]:
# Model Perplexity and Coherence Score
# Compute Perplexity
# NOTE(review): the original remark here was "the lower the better", but the
# value printed is whatever log_perplexity returns — confirm against the gensim
# docs whether that number is the perplexity itself or a log-likelihood bound.
print('\nPerplexity: ', lda_model_stemmed.log_perplexity(corpus_stemmed))

# Compute Coherence Score (c_v) on the same first third of the stemmed tweets
coherence_model_lda_stemmed = CoherenceModel(
    model=lda_model_stemmed,
    texts=df['stemmed'].iloc[:round(len(df) / 3)],
    dictionary=id2word_stemmed,
    coherence='c_v',
)
coherence_lda_stemmed = coherence_model_lda_stemmed.get_coherence()  # check also get_coherence_per_topic
print('\nCoherence Score: ', coherence_lda_stemmed)


In [None]:
# Visualize the topics
# Switch pyLDAvis to notebook output, then prepare the visualization data from
# the 10-topic model, its corpus and its dictionary
pyLDAvis.enable_notebook()
visualization = pyLDAvis.gensim_models.prepare(lda_model_stemmed, corpus_stemmed, id2word_stemmed)

In [None]:
# Display the prepared pyLDAvis object as the cell output
visualization

In [None]:
# Find the Optimal number of Topics

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each of them.

    Parameters
    ----------
    dictionary : token/id mapping passed to the LDA model (as ``id2word``) and
        to the coherence model.
    corpus : bag-of-words corpus used for training and for ``log_perplexity``.
        It is iterated more than once, so it must be re-iterable (e.g. a list).
    texts : tokenized documents passed to the coherence model.
    limit : exclusive upper bound of the topic counts to try.
    start : first topic count to try.
    step : increment between topic counts.

    Returns
    -------
    (model_list, coherence_values, perplexity_values)
        Three parallel lists with one entry per value of
        ``range(start, limit, step)``: the trained model, its ``c_v`` coherence
        and the value returned by ``model.log_perplexity(corpus)``.
    """
    model_list = []
    coherence_values = []
    perplexity_values = []
    for num_topics in range(start, limit, step):
        # Train and evaluate on the arguments that were passed in, not on
        # notebook-level globals, so the function works for any corpus/dictionary.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=100,
            update_every=1,
            chunksize=100,
            passes=15,
            alpha='auto',
            per_word_topics=True,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity_values.append(model.log_perplexity(corpus))

    return model_list, coherence_values, perplexity_values
 

In [None]:
# Sweep the number of topics from 2 to 25 on the first third of the stemmed tweets
model_list, coherence_values, perplexity_values = compute_coherence_values(
    dictionary=id2word_stemmed,
    corpus=corpus_stemmed,
    texts=df['stemmed'].iloc[:round(len(df) / 3)],
    start=2,
    limit=26,
    step=1,
)

In [None]:
# Coherence (left panel) and perplexity (right panel) against the number of topics
plt.figure(figsize=(15, 6))
plt.suptitle('Evaluation Metrics for LDA', fontsize=14, fontweight='bold')

# Same topic-count range as the sweep that produced the two value lists
limit = 26
start = 2
step = 1
x = range(start, limit, step)

# (values, panel title, y-axis label, legend entry) for each panel, left to right
panels = [
    (coherence_values, 'Coherence Score for different number of topics', 'Coherence Score', 'Coherence Values'),
    (perplexity_values, 'Perplexity Score for different number of topics', 'Perplexity Score', 'Perplexity Values'),
]
for panel_no, (values, panel_title, y_label, legend_entry) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(x, values)
    plt.title(panel_title, fontsize=14, fontweight='bold')
    plt.xlabel('Number of Topics', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.tick_params(axis='both', labelsize=13)
    plt.legend([legend_entry], loc='lower right')
plt.show()

In [None]:
# Print the coherence score for each number of topics that was tried
for n_topics, coherence in zip(x, coherence_values):
    print('Num Topics =', n_topics, ' has Coherence Value of', round(coherence, 4))

In [None]:
# Select the model and print the topics
# model_list was filled for start=2, step=1, so position 8 is the model
# trained with 10 topics
optimal_model = model_list[8]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
# dominant topic

def format_topics_sentences(ldamodel=optimal_model, corpus=corpus_stemmed, texts=data):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : trained topic model. ``ldamodel[corpus]`` must yield, for each
        document, a sequence whose first element is that document's list of
        ``(topic_id, probability)`` pairs (presumably this relies on the model
        having been trained with ``per_word_topics=True`` — verify).
    corpus : bag-of-words documents to classify.
    texts : the documents' texts, in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals) and ``Topic_Keywords``, followed by one column holding
        ``texts``. Row ``i`` describes the ``i``-th document of ``corpus``.
    """
    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []
    for doc_result in ldamodel[corpus]:
        doc_topics = doc_result[0]
        if not doc_topics:
            # Keep a placeholder so later rows still line up with corpus/texts.
            records.append((float('nan'), float('nan'), None))
            continue
        # Highest-probability topic; on ties the first one listed wins, as with
        # a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once from the collected rows; DataFrame.append was removed
    # in pandas 2.0 and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output. Dropping the index makes the
    # texts line up with the rows by position instead of by row label.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)


# The texts must match corpus_stemmed document by document. That corpus was
# built from tweets_stemmed (the first third of df), so those are the texts to
# pass. Their index is dropped because df's row labels no longer run 0..n-1
# after duplicates were removed, and label-based alignment would pair topics
# with the wrong tweets.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=optimal_model,
    corpus=corpus_stemmed,
    texts=tweets_stemmed.reset_index(drop=True),
)

# Format: expose the row position as a document number and rename the columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(15)

In [None]:
# All documents whose dominant topic is topic 0
df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 0.0]


In [None]:
# Total topic distribution across documents
# Share of documents per dominant topic, as a fraction rounded to four decimals
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
topic_contribution = topic_contribution.rename_axis('Dominant_Topic').reset_index(name='percentage')

# One row per distinct (topic, keywords) pair, renumbered from 0
topic_num_keywords = (
    df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Attach the keywords to each topic's share
df_dominant_topics = topic_contribution.merge(topic_num_keywords, how='inner', on='Dominant_Topic')
df_dominant_topics



In [None]:
# Write the per-topic share/keywords table from the previous cell to CSV.
# The name df_dominant_topics is bound again to a differently shaped table
# further down, so this file reflects the table as it is at this point.
df_dominant_topics.to_csv('path/to/dominant_topic.csv')


In [None]:
# Most representative tweet per topic: within each dominant-topic group keep
# the single row with the highest Perc_Contribution (head(1), i.e. one row per
# topic, not five).
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the per-topic rows first and concatenate once. Growing a frame inside
# the loop, starting from an empty DataFrame, copies it on every iteration and
# involves an empty frame in the concatenation.
top_rows = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]

# Stack the rows and renumber them from 0
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Share of Documents for Each Topic (fraction, rounded to four decimals)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = sent_topics_sorteddf_mallet[['Topic_Num', 'Keywords']]

# Look the counts and shares up by topic id. Concatenating column-wise would
# align on the index instead (row position on one side, topic id on the other),
# which only matches when the topic ids are exactly 0..k-1.
df_dominant_topics = (
    topic_num_keywords
    .rename(columns={'Topic_Num': 'Topic', 'Keywords': 'Topic_Keywords'})
    .assign(
        Num_Tweets=lambda table: table['Topic'].map(topic_counts),
        # Expressed as a percentage
        Perc_Tweets=lambda table: table['Topic'].map(topic_contribution) * 100,
    )
)

In [None]:
# Add each topic's representative tweet; the assignment matches rows by index label
df_dominant_topics['Text'] = sent_topics_sorteddf_mallet['Text']

In [None]:
# Display the per-topic summary table
df_dominant_topics

In [None]:
sns.set_style("white")

# Derive the tick labels from the table itself so that there is always exactly
# one label per bar, whatever number of topics the table contains.
label = [
    'Topic {}'.format(int(topic)) if pd.notna(topic) else 'Topic ?'
    for topic in df_dominant_topics['Topic']
]
freq = df_dominant_topics['Num_Tweets']
index = np.arange(len(freq))

print("Total Tweets", df_dominant_topics['Num_Tweets'].sum())

# Bar chart: number of tweets whose dominant topic is each topic
plt.figure(figsize=(8,6), facecolor='white')
plt.bar(index, freq, alpha=0.8, color= 'black', width=0.7)
plt.xlabel('Topics', fontsize=13)
plt.ylabel('Number of Tweets', fontsize=13)
plt.xticks(index, label, fontsize=11, fontweight="bold", rotation=90)
plt.tick_params(axis='both', which='major', labelsize=12)
plt.title('Topic Distribution', fontsize=14, fontweight="bold")
plt.show()

In [None]:
# save df
# NOTE(review): this writes to the same path the notebook loaded df from at the
# top. By now df has had columns selected, duplicate rows dropped and new
# columns added, so a later run starts from this modified data — confirm that
# overwriting the input file is intended.
df.to_pickle('path/to/pre-processed.pkl')
# Per-topic summary table (keywords, tweet counts, percentages, representative text)
df_dominant_topics.to_pickle('path/to/topic_modelling_gensim_results.pkl')