# Topic Modeling

In [None]:
import re
import numpy as np
import pandas as pd
import joblib

# Gensim
import gensim
import logging
import warnings
import seaborn as sns
from wordcloud import WordCloud
import spacy
import gensim.corpora as corpora
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from tqdm import tqdm

# NLTK's English stop-word list plus two extra tokens ('rt', 'https') that
# this notebook also treats as noise.
stop_words = stopwords.words('english') + ['rt', 'https']

# Silence DeprecationWarning messages and only log records at ERROR level or above.
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
)

## Reading the data

In [None]:
# The file is read as tab-separated with two columns that are named here
# (no header row is expected): the label and the raw text.
data = pd.read_csv(
    '/data/metahate.csv',
    sep='\t',
    names=['label', 'text'],
)

# Split the texts by label value: 1 goes to the hate list, 0 to the non-hate list.
text_hate = data.loc[data['label'].eq(1), 'text'].tolist()
text_no_hate = data.loc[data['label'].eq(0), 'text'].tolist()

## We create one LDA model for hate data and another for non-hate data

In [None]:
# Corpus modelled in this run. Re-run the notebook with `text_no_hate` to
# build the second (non-hate) model.
text_data = text_hate # Later: `text_no_hate`

## Tokenizing and preprocessing the text data

In [None]:
def sent_to_words(sentences):
    """
    Tokenize sentences into words.

    Each item is converted to ``str``, runs of whitespace (including
    newlines) are collapsed to a single space, single quotes are removed,
    and the result is tokenized with gensim's ``simple_preprocess`` using
    ``deacc=True``.

    Parameters:
    - sentences (iterable): Sentences to tokenize. Non-string items are
      converted with ``str``.

    Yields:
    - list: A list of words for each sentence.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    whitespace_run = re.compile(r'\s+')

    for sent in sentences:
        text = whitespace_run.sub(' ', str(sent))  # Collapse newlines/whitespace
        text = text.replace("'", "")  # Remove single quotes
        yield gensim.utils.simple_preprocess(text, deacc=True)

# Materialise the generator: the tokenised documents are iterated several
# times below (phrase-model training and the cleaning pipeline).
data_words = list(sent_to_words(text_data))

## Creating n-grams

In [None]:
# Phrase detection models. `min_count=5` ignores all words and bigrams whose
# total collected count is lower than 5; `threshold=100` is the score
# threshold passed to gensim for accepting a phrase.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

# The trigram model is trained on the bigram-transformed corpus.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Phraser wrappers around the trained models; these are what the
# cleaning pipeline applies to each document.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

## Tokenizing, preprocessing, and lemmatizing each document

In [None]:
def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """
    Tokenize, preprocess, and lemmatize a list of texts.

    Pipeline, in order:
    1. Re-tokenize each document with ``simple_preprocess`` and drop stop words.
    2. Merge detected phrases using the module-level ``bigram_mod`` and
       ``trigram_mod`` models.
    3. Lemmatize with spaCy's ``en_core_web_sm`` model, keeping only tokens
       whose POS tag is in ``allowed_postags``.
    4. Re-tokenize and drop stop words once more, since lemmatization can
       produce stop words.

    Parameters:
    - texts (list): List of tokenized texts to process.
    - stop_words (iterable): Stop words to remove during processing. Any
      iterable of strings is accepted; it is converted to a set internally.
    - allowed_postags (iterable): Allowed POS (Part-of-Speech) tags.

    Returns:
    - list: List of processed and lemmatized texts (one token list per input).
    """
    # Build the lookup sets once: membership tests on a list are O(len(list))
    # and this check runs for every token of every document.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    def without_stop_words(doc):
        # `str(doc)` on a token list followed by `simple_preprocess`
        # re-tokenizes the document (and applies its length filtering).
        return [word for word in simple_preprocess(str(doc)) if word not in stop_set]

    # Remove stop words, then apply the bigram and trigram models exactly
    # once each (the bigram model is nested inside the trigram call).
    texts = [without_stop_words(doc) for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Parser and NER are not needed for POS tagging / lemmatization.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # `nlp.pipe` processes the documents as a stream in batches and yields
    # the results in input order.
    joined_docs = (" ".join(sent) for sent in texts)
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

    # Remove stopwords once more after lemmatization
    return [without_stop_words(doc) for doc in texts_out]

# Cleaned, lemmatized token lists (one per document); this is the input for
# the dictionary, the bag-of-words corpus and the plots below.
data_ready = process_words(data_words)

## Creating the dictionary and corpus for topic modeling using Gensim

In [None]:
# Vocabulary built from the processed documents.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words encoding of every processed document, in document order.
corpus = list(map(id2word.doc2bow, data_ready))

## Creating and training an LDA (Latent Dirichlet Allocation) model using Gensim

In [None]:
# Train the LDA model on the bag-of-words corpus with fixed hyper-parameters,
# then persist it to 'lda_model.jl' in the working directory.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # Bag-of-words representation of the documents
                                           id2word=id2word, # Dictionary mapping words to unique numerical IDs
                                           num_topics=8, # Number of topics to identify
                                           random_state=100, # Random seed for reproducibility
                                           update_every=1, # How often the model parameters should be updated
                                           chunksize=100, # Number of documents to be used in each training chunk
                                           passes=10, # Number of passes through the entire corpus during training
                                           alpha=0.31, # Parameter controlling the document-topic density
                                           eta=0.9, # Parameter controlling the topic-word density
                                           iterations=100, # Maximum number of iterations through the corpus when inferring topic distributions
                                           per_word_topics=True) # Also compute per-word topic information; each document's output is then a tuple whose first item is the topic distribution
# Serialise the trained model with joblib.
joblib.dump(lda_model, 'lda_model.jl')

## Calculating the dominant topic of each document and its percentage contribution

In [None]:
def process_row(row_list):
    """
    Summarise one document's LDA output as its dominant topic.

    Parameters:
    - row_list (list or tuple): Model output for a single document. When the
      model was built with ``per_word_topics`` enabled, the topic
      distribution is the first element; otherwise it is ``row_list`` itself.

    Returns:
    - list: ``[topic number (int), topic proportion rounded to 4 decimals,
      comma-separated topic keywords]``, or ``[None, None, None]`` when the
      document has no topic assignments.
    """
    topic_dist = row_list[0] if lda_model.per_word_topics else row_list

    # Guard clause: nothing to rank for a document without topics.
    if not topic_dist:
        return [None, None, None]

    # max() returns the first maximal pair, which is the same element a
    # stable descending sort would place first.
    topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])

    keywords = [word for word, _ in lda_model.show_topic(topic_num)]
    return [int(topic_num), round(prop_topic, 4), ", ".join(keywords)]

In [None]:
# Collect one processed row per document and build the DataFrame once.
# Growing a DataFrame with `pd.concat` inside the loop copies all previous
# rows on every iteration, i.e. quadratic time in the number of documents.
# Iterating the transformed corpus directly (without `enumerate`) lets tqdm
# pick up its length for the progress bar when one is available.
dominant_topic_rows = [process_row(row_list) for row_list in tqdm(lda_model[corpus])]

# One row per document, in corpus order, with a default 0..n-1 index.
# Rows for documents without any topic hold missing values and are
# filtered out in a later cell.
sent_topics_df = pd.DataFrame(
    dominant_topic_rows,
    columns=['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords'],
)

In [None]:
# Attach the processed tokens of each document; the assignment aligns the
# Series with the frame by index (both are built in document order).
contents = pd.Series(data_ready)
sent_topics_df['Text'] = contents
# Discard documents for which no dominant topic was found.
sent_topics_df = sent_topics_df.dropna(subset=['Dominant_Topic'])

## Visualization

### Distribution of document word counts

In [None]:
# Number of tokens in each processed document.
doc_lens = [len(d) for d in sent_topics_df.Text]

plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, bins = 1000, color='#3d3d3d')

# Summary statistics written onto the plot at x=800; each entry is
# (y position, label prefix, statistic).
summary_stats = [
    (32000, 'Mean: ', np.mean(doc_lens)),
    (28000, 'Median: ', np.median(doc_lens)),
    (23000, 'Standard deviation: ', np.std(doc_lens)),
    (18000, '1% quantile: ', np.quantile(doc_lens, q=0.01)),
    (13000, '99% quantile: ', np.quantile(doc_lens, q=0.99)),
]
for y_pos, prefix, stat in summary_stats:
    plt.text(800, y_pos, prefix + str(round(stat)))

plt.gca().set(
    xlim=(0, 1000),
    ylabel='Number of Documents',
    xlabel='Document Word Count',
)
plt.tick_params(size=16)
plt.xticks(np.linspace(0,1000,9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

### Distribution of document word counts by topic

In [None]:
colour = '#000'

fig, axes = plt.subplots(2,3,figsize=(16,14), dpi=160, sharex=True, sharey=True)

# One panel per topic id: a histogram of document lengths with a KDE curve
# drawn on a secondary y-axis.
# NOTE(review): the 2x3 grid covers topic ids 0-5 only, while the LDA model
# is trained with num_topics=8 — confirm that omitting 6 and 7 is intended.
for i, ax in enumerate(axes.flatten()):
    df_dominant_topic_sub = sent_topics_df.loc[sent_topics_df.Dominant_Topic == i, :]
    doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, bins=1000, color=colour)
    ax.tick_params(axis='y', labelcolor=colour, color=colour)
    # The deprecated `shade` keyword is dropped: an unfilled curve is
    # already the default, so the output is unchanged.
    sns.kdeplot(doc_lens, color='black', ax=ax.twinx())
    ax.set(xlim=(0, 1000), xlabel='Document Word Count')
    ax.set_ylabel('Number of Documents', color=colour)
    ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=colour))


fig.tight_layout()
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,1000,9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()

### Wordcloud

In [None]:
colour = '#000'

# A single WordCloud instance is reused for every topic. `color_func`
# returns the fixed colour above for every word.
cloud = WordCloud(
    stopwords=stop_words,
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    color_func=lambda *args, **kwargs: colour,
    prefer_horizontal=1.0,
)

# Each entry is (topic id, [(word, weight), ...]).
topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 3, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    current_ax = plt.gca()

    # Word weights of the i-th topic drive the font sizes.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=450)

    current_ax.imshow(cloud)
    current_ax.set_title('Topic ' + str(i), fontdict=dict(size=18))
    current_ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

### Word count and importance of topic keywords

In [None]:
from collections import Counter

# Corpus-wide frequency of every token in the processed documents.
data_flat = [w for w_list in data_ready for w in w_list]
counter = Counter(data_flat)

# One record per (topic, keyword): the keyword's weight inside the topic
# together with its overall count in the corpus.
topics = lda_model.show_topics(formatted=False)
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic_terms in topics
    for word, weight in topic_terms
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords
colour = '#000'
fig, axes = plt.subplots(2, 3, figsize=(16,10), sharey=True, dpi=160)

# Topic ids shown, one per panel.
topics = list(range(6))

for i, ax in enumerate(axes.flatten()):
    topic_rows = df.loc[df.topic_id == topics[i], :]

    # Wide, translucent bars: corpus word counts (left axis).
    ax.bar(x='word', height="word_count", data=topic_rows, color=colour, width=0.5, alpha=0.3, label='Word Count')

    # Narrow bars: keyword weights within the topic (right axis).
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=colour, width=0.2, label='Weights')

    ax.set_ylabel('Word Count', color=colour)
    ax_twin.set_ylim(0, 0.3)
    ax.set_ylim(0, 135000)
    ax.set_title('Topic: ' + str(i), color=colour, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
plt.show()

### t-SNE clustering

In [None]:
# Dense document-topic matrix: one row per document, one column per topic id.
# The model output lists (topic_id, weight) pairs and leaves out topics whose
# probability is below the model's minimum, so weights must be written at
# their topic id. Collecting only the weights in list order would shift the
# columns and make `argmax` report the wrong topic.
n_topics = lda_model.num_topics
topic_weights = []
for row_list in lda_model[corpus]:
    # With per_word_topics enabled the topic distribution is the first item.
    doc_topics = row_list[0] if lda_model.per_word_topics else row_list
    weights = [0.0] * n_topics
    for topic_id, weight in doc_topics:
        weights[topic_id] = weight
    topic_weights.append(weights)

arr = np.asarray(topic_weights, dtype=float)

# Keep only documents with a clearly dominant topic (weight above 0.5).
arr = arr[np.amax(arr, axis=1) > 0.5]

# Dominant topic id of each remaining document.
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca', perplexity=40)
tsne_lda = tsne_model.fit_transform(arr)

#### Plot

In [None]:
# Fixed light-to-dark green palette, indexed by topic id.
base_palette = ["#c7e9c0", "#a1d99b", "#74c476", "#31a354", "#006d2c", "#003816"]

# Number of colours required so that every topic id present can be indexed.
n_colours_needed = int(topic_num.max()) + 1 if topic_num.size else 0

if n_colours_needed > len(base_palette):
    # The six-colour palette cannot be indexed by topic ids >= 6 (that would
    # raise IndexError), so sample the 'Greens' colormap instead, giving one
    # RGBA row per topic id.
    colors = plt.get_cmap('Greens')(np.linspace(0.25, 1.0, n_colours_needed))
else:
    colors = np.array(base_palette)

plt.figure(figsize=(8, 8), dpi=300)
# One point per document, coloured by its dominant topic.
scatter = plt.scatter(tsne_lda[:,0], tsne_lda[:,1], c=colors[topic_num], s=8)

# `handles` is not used further in this cell.
handles, _ = scatter.legend_elements(prop='colors')
plt.suptitle("t-SNE Clustering")
plt.axis("off")
plt.show()