## Model Building: Latent Dirichlet Allocation
LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture over a set of topics.

The alpha parameter is the Dirichlet prior concentration parameter that represents document-topic density — with a higher alpha, documents are assumed to be made up of more topics, which results in a more specific topic distribution per document.
The beta parameter is the same kind of prior concentration parameter, but it represents topic-word density — with a high beta, topics are assumed to be made up of most of the words, which results in a more specific word distribution per topic. (In gensim's `LdaModel`, beta is passed as `eta`.)

In [20]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
import spacy

import gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
# Load the raw reviews export using the pyarrow parquet engine.
df_reviews = pd.read_parquet(
    '../data/raw/reviews_2024-11-25_22-13_PH.parquet',
    engine='pyarrow',
)

Filter influential comments: keep reviews with at least 3 words and at least 10 thumbs-up, or with at least 6 words and a reply.

In [3]:
# Number of whitespace-separated words per review; missing content counts as 0.
content_word_counts = df_reviews['content'].fillna('').str.split().apply(len)
has_many_likes = df_reviews['thumbsUpCount'] >= 10
has_reply = df_reviews['replyContent'].notna()

# A review is kept when it has >= 3 words and >= 10 thumbs-up,
# or when it has >= 6 words and a non-null replyContent.
influential_mask = ((content_word_counts >= 3) & has_many_likes) | ((content_word_counts >= 6) & has_reply)
filtered_df = df_reviews.loc[influential_mask, ['content', 'reviewId']]

In [4]:
import subprocess

def lemmatization(document, model):
    """Return `document` with every token replaced by its lemma.

    `model` is called on the text and must yield tokens exposing a `lemma_`
    attribute (a loaded spaCy pipeline in this notebook). The lemmas are
    joined with single spaces.
    """
    parsed = model(document)
    lemmas = (tok.lemma_ for tok in parsed)
    return " ".join(lemmas)

def named_entities(document, model):
    """Return the text of every entity found in `document`.

    `model` is called on the text and the result's `.ents` is read; the
    entities are returned as plain strings (their `.text`), not as the
    entity objects themselves.
    """
    entity_texts = []
    for entity in model(document).ents:
        entity_texts.append(entity.text)
    return entity_texts

def remove_stop_words(tokens_list, stop_words):
    """Return the tokens of `tokens_list` that are not in `stop_words`, in order."""
    return list(filter(lambda tok: tok not in stop_words, tokens_list))

def remove_one_character_or_letter_tokens(tokens_list):
    """Keep only alphabetic tokens that are not one letter repeated.

    A token is dropped when it is not purely alphabetic (`str.isalpha()` is
    False, which includes the empty string) or when it has two or more
    characters that are all the same letter (e.g. 'aa', 'zzz').
    Single-character alphabetic tokens are kept.
    """
    kept = []
    for tok in tokens_list:
        if not tok.isalpha():
            continue
        if len(tok) > 1 and len(set(tok)) == 1:
            continue
        kept.append(tok)
    return kept

def preprocess_dataframe(dataframe, text_column):
    """Lemmatize, tokenize and clean a text column for topic modelling.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is copied, not modified.
    text_column : str
        Name of the column holding the raw text. Missing values are treated
        as empty strings.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` with two added columns:
        `reviews_NER` (list of entity strings per row) and
        `processed_reviews` (cleaned lowercase tokens followed by the
        entity strings).

    Raises
    ------
    subprocess.CalledProcessError
        If the spaCy model is missing and downloading it fails.
    """
    # Local import so this block does not depend on a file-level `import sys`.
    import sys

    # NLTK files each resource under a category folder: 'stopwords' is a
    # corpus but 'punkt' is a tokenizer. Looking both up under 'corpora/'
    # never finds 'punkt' and re-triggers the download on every call.
    # NOTE(review): newer NLTK releases may require 'punkt_tab' for
    # word_tokenize - confirm against the installed version.
    required_resources = {
        'stopwords': 'corpora/stopwords',
        'punkt': 'tokenizers/punkt',
    }
    for resource, resource_path in required_resources.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(resource)
    try:
        nlp_spacy = spacy.load('en_core_web_sm')
    except OSError:
        print("spaCy model 'en_core_web_sm' not found. Downloading now...")
        # Use the running interpreter so the model is installed into the same
        # environment as this kernel, not whatever 'python' is on PATH.
        subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'], check=True)
        nlp_spacy = spacy.load('en_core_web_sm')

    data = dataframe.copy()
    row_docs = data[text_column].fillna('').apply(lambda x: lemmatization(x, nlp_spacy))
    # NOTE(review): entities are extracted from the lemmatized text (before
    # lowercasing), not from the original review text.
    data['reviews_NER'] = row_docs.apply(lambda x: named_entities(x, nlp_spacy))
    row_docs = row_docs.str.lower().apply(word_tokenize)

    stop_words = set(stopwords.words('english'))
    row_docs = row_docs.apply(lambda x: remove_stop_words(x, stop_words))
    row_docs = row_docs.apply(remove_one_character_or_letter_tokens)

    # Both Series hold lists, so `+` concatenates the lists row by row.
    data['processed_reviews'] = row_docs + data['reviews_NER']
    return data

In [5]:
# Only the review id and its text are needed for preprocessing.
review_text_df = filtered_df[['reviewId', 'content']]
preprocess_df = preprocess_dataframe(review_text_df, text_column='content')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Tune the LDA model

In [10]:
# Hyperparameter grid search, kept commented out: the saved results are
# loaded from 'training_hyperparameter_20241208.csv' in a later cell.
# It trains one LdaModel per (n_topics, alpha, beta) combination and records
# the c_v coherence score of each model in params_df['coherence'].

# from itertools import product

# n_topics = list(range(2,8))
# alpha = [0.05, 0.1 ,0.2, 0.4, 0.5, 1, 2, 5, 10]
# beta = [0.01, 0.2, 0.4, 0.8, 2]
# params_df = pd.DataFrame(list(product(n_topics,alpha,beta)), columns = ['n_topics','alpha','beta'])

# copy_df = preprocess_df.copy()
# docs = copy_df['processed_reviews']
# dictionary = Dictionary(docs.tolist())
# dictionary.filter_extremes(no_below=20, no_above=0.5)
# corpus = [dictionary.doc2bow(text) for text in docs.tolist()]

# scores = []
# for i in range(params_df.shape[0]):
    
#     # Train LDA model
#     lda_model = LdaModel(corpus, id2word=dictionary, passes=10, random_state=77, chunksize=200, 
#                             num_topics = params_df['n_topics'][i], alpha=params_df['alpha'][i],  eta = params_df['beta'][i])
#     coherence_model = CoherenceModel(model = lda_model, texts = docs.tolist(), 
#                                         dictionary=dictionary, coherence= 'c_v')
#     scores.append(coherence_model.get_coherence())
#     print(f"Number of topics: {params_df['n_topics'][i]}, alpha: {params_df['alpha'][i]}, beta: {params_df['beta'][i]}, coherence_score: {coherence_model.get_coherence()}")

# params_df['coherence'] = scores

In [None]:
# One-off export of the grid-search results to JSON and CSV.
# Presumably left commented out so a re-run does not overwrite the saved files.
# params_df.to_json('training_hyperparameter_20241208.json')
# params_df.to_csv('training_hyperparameter_20241208.csv')

In [23]:
# Reload the saved grid-search results; the first CSV column is used as the row index.
params_df = pd.read_csv(
    'training_hyperparameter_20241208.csv',
    index_col=0,
)

#### Selecting the final Model
Criteria:
1. Get top 5 coherence scores for each number of topics.
2. Manually review each topic.

In [35]:
# For every topic count, show the five settings with the highest coherence.
# Ties on coherence are ordered by lower alpha, then lower beta.
ranking_columns = ['coherence', 'alpha', 'beta']
ranking_ascending = [False, True, True]
for num in params_df['n_topics'].unique():
    display('*'*10)
    display(f"Topic {num}'s TOP 5")
    candidates = params_df[params_df['n_topics'] == num]
    ranked = candidates.sort_values(by=ranking_columns, ascending=ranking_ascending)
    display(ranked.head(5).reset_index())
    

'**********'

"Topic 2's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,36,2,5.0,0.2,0.594673
1,38,2,5.0,0.8,0.593004
2,39,2,5.0,2.0,0.593004
3,37,2,5.0,0.4,0.591943
4,40,2,10.0,0.01,0.587092


'**********'

"Topic 3's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,89,3,10.0,2.0,0.603055
1,82,3,5.0,0.4,0.601248
2,81,3,5.0,0.2,0.597751
3,84,3,5.0,2.0,0.592559
4,88,3,10.0,0.8,0.588612


'**********'

"Topic 4's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,99,4,0.1,2.0,0.589555
1,128,4,5.0,0.8,0.589425
2,129,4,5.0,2.0,0.587393
3,104,4,0.2,2.0,0.584051
4,124,4,2.0,2.0,0.581089


'**********'

"Topic 5's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,173,5,5.0,0.8,0.592784
1,172,5,5.0,0.4,0.583435
2,167,5,2.0,0.4,0.582734
3,174,5,5.0,2.0,0.580498
4,166,5,2.0,0.2,0.57969


'**********'

"Topic 6's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,218,6,5.0,0.8,0.600406
1,217,6,5.0,0.4,0.599242
2,199,6,0.4,2.0,0.597773
3,219,6,5.0,2.0,0.590813
4,216,6,5.0,0.2,0.590076


'**********'

"Topic 7's TOP 5"

Unnamed: 0,index,n_topics,alpha,beta,coherence
0,263,7,5.0,0.8,0.585824
1,264,7,5.0,2.0,0.584103
2,262,7,5.0,0.4,0.580338
3,261,7,5.0,0.2,0.579749
4,258,7,2.0,0.8,0.572348


### Selecting final model based on Coherence and Topic Interpretation

In [38]:
# Re-train the five best settings for each topic count and show their
# pyLDAvis visualisation for manual review.
copy_df = preprocess_df.copy()
docs = copy_df['processed_reviews']

# The vocabulary and bag-of-words corpus do not depend on the
# hyperparameters, so they are built once rather than once per topic count.
texts = docs.tolist()
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in texts]

for num in params_df['n_topics'].unique():
    # Five highest-coherence settings for this topic count
    # (ties ordered by lower alpha, then lower beta).
    params_per_topic = params_df[params_df['n_topics'] == num].sort_values(by = ['coherence','alpha','beta'], ascending=[False,True,True]).head(5).reset_index()
    for row in params_per_topic[['n_topics', 'alpha', 'beta']].itertuples(index=False):
        # Hand plain Python numbers to gensim instead of numpy scalars.
        n_topics, alpha, beta = int(row.n_topics), float(row.alpha), float(row.beta)
        lda_model = LdaModel(corpus, id2word=dictionary, passes=10, random_state=77, chunksize=200,
                             num_topics=n_topics, alpha=alpha, eta=beta)
        coherence_model = CoherenceModel(model=lda_model, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
        print(f"Number of topics: {n_topics}, alpha: {alpha}, beta: {beta},  coherence_score: {coherence_model.get_coherence()}")
        # Only the visualisation is best-effort; a TypeError raised while
        # preparing or rendering it should not stop the remaining candidates.
        try:
            vis = gensimvis.prepare(lda_model, corpus=corpus, dictionary=dictionary)
            display(pyLDAvis.display(vis))
        except TypeError as e:
            print(f"Error occured: {e}")
            print('Skipping visualization')

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  upcast = np.find_common_type(args, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  upcast = np.find_common_type(args, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  upcast = np.find_common_type(args, [])


Number of topics: 2, alpha: 5.0, beta: 0.2,  coherence_score: 0.5946732305714542


Number of topics: 2, alpha: 5.0, beta: 0.8,  coherence_score: 0.593003579372716


Number of topics: 2, alpha: 5.0, beta: 2.0,  coherence_score: 0.593003579372716


Number of topics: 2, alpha: 5.0, beta: 0.4,  coherence_score: 0.5919428294895778


Number of topics: 2, alpha: 10.0, beta: 0.01,  coherence_score: 0.587091906566859


Number of topics: 3, alpha: 10.0, beta: 2.0,  coherence_score: 0.6030548008374151


Number of topics: 3, alpha: 5.0, beta: 0.4,  coherence_score: 0.601247720934245


Number of topics: 3, alpha: 5.0, beta: 0.2,  coherence_score: 0.597750884779899


Number of topics: 3, alpha: 5.0, beta: 2.0,  coherence_score: 0.592559012224295


Number of topics: 3, alpha: 10.0, beta: 0.8,  coherence_score: 0.588611901155983


Number of topics: 4, alpha: 0.1, beta: 2.0,  coherence_score: 0.589554715276263


Number of topics: 4, alpha: 5.0, beta: 0.8,  coherence_score: 0.5894245245114547


Number of topics: 4, alpha: 5.0, beta: 2.0,  coherence_score: 0.5873925859967963


Number of topics: 4, alpha: 0.2, beta: 2.0,  coherence_score: 0.5840511486430661


Number of topics: 4, alpha: 2.0, beta: 2.0,  coherence_score: 0.5810892223498082


Number of topics: 5, alpha: 5.0, beta: 0.8,  coherence_score: 0.5927842878687252


Number of topics: 5, alpha: 5.0, beta: 0.4,  coherence_score: 0.5834347480463837


Number of topics: 5, alpha: 2.0, beta: 0.4,  coherence_score: 0.5827343367229411


Number of topics: 5, alpha: 5.0, beta: 2.0,  coherence_score: 0.5804975880011416


Number of topics: 5, alpha: 2.0, beta: 0.2,  coherence_score: 0.5796901047502829


Number of topics: 6, alpha: 5.0, beta: 0.8,  coherence_score: 0.6004062022986346


Number of topics: 6, alpha: 5.0, beta: 0.4,  coherence_score: 0.5992415342102925


Number of topics: 6, alpha: 0.4, beta: 2.0,  coherence_score: 0.5977731462399107


Number of topics: 6, alpha: 5.0, beta: 2.0,  coherence_score: 0.5908126082516528


Number of topics: 6, alpha: 5.0, beta: 0.2,  coherence_score: 0.5900764975649716


Number of topics: 7, alpha: 5.0, beta: 0.8,  coherence_score: 0.5858238165825966


Number of topics: 7, alpha: 5.0, beta: 2.0,  coherence_score: 0.5841026005023544


Number of topics: 7, alpha: 5.0, beta: 0.4,  coherence_score: 0.5803384866033333


Number of topics: 7, alpha: 5.0, beta: 0.2,  coherence_score: 0.5797493859674347


Number of topics: 7, alpha: 2.0, beta: 0.8,  coherence_score: 0.5723476208909453


### Final Tuned Model 

Model parameters.
 - `Token` : base(unigram)
 - `Topics`: 4
 - `alpha` : 0.1
 - `beta` : 2.0

coherence_score: 0.589554715276263

In [40]:
# Train the selected model (4 topics, alpha=0.1, eta=2) and show its pyLDAvis view.
copy_df = preprocess_df.copy()
docs = copy_df['processed_reviews']
tokenized_reviews = docs.tolist()

# Vocabulary: drop tokens in fewer than 20 documents or in more than 50% of them.
dictionary = Dictionary(tokenized_reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_reviews]

final_params = dict(num_topics=4, alpha=0.1, eta=2)
lda_model = LdaModel(
    corpus,
    id2word=dictionary,
    passes=10,
    random_state=77,
    chunksize=200,
    **final_params,
)
# NOTE(review): the coherence model is constructed here but get_coherence()
# is not called in this cell.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=tokenized_reviews,
    dictionary=dictionary,
    coherence='c_v',
)

vis = gensimvis.prepare(lda_model, corpus=corpus, dictionary=dictionary)
display(pyLDAvis.display(vis))

Topic weights for each review.

In [44]:
# One row per bag-of-words document in `corpus`, one column per topic id;
# each value is that topic's weight in the document.
doc_topic_rows = [
    dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    for bow in corpus
]
topics_weights_df = pd.DataFrame(doc_topic_rows)
# The dominant topic is the column (topic id) with the largest weight in the row.
topics_weights_df['dominant_topic'] = topics_weights_df.idxmax(axis=1)
topics_weights_df.head()

Unnamed: 0,0,1,2,3,dominant_topic
0,0.013532,0.251759,0.721176,0.013533,2
1,0.008779,0.973662,0.008781,0.008778,1
2,0.426699,0.320991,0.248207,0.004103,0
3,0.006103,0.569599,0.006104,0.418194,1
4,0.005157,0.984529,0.005157,0.005157,1


These topic weights will be combined with the main dataframe of influential users, together with their game info, to build the final dashboard for end users (community users/gamers).