## HDP Model with Genesim
### 1. Corpus Creation

In [1]:
# Retrieve data from Preprocessing
%store -r data_lemmatized df

from gensim import corpora, models
from pprint import pprint
import gensim

# Create Dictionary
id2word = gensim.corpora.Dictionary(data_lemmatized)

# Filtering out of tokens appearing in less than 20 documents or more than 70% of documents
id2word.filter_extremes(no_below=20, no_above=0.7)

# Create Corpus
bow_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

# Create the TF-IDF model based on the bag-of-words corpus
tfidf_model = models.TfidfModel(bow_corpus)

# Apply the TF-IDF transformation to the corpus
tfidf_corpus = tfidf_model[bow_corpus]

for doc in tfidf_corpus:
    pprint(doc)
    break

[(0, 0.17690613756060086),
 (1, 0.13205008895998877),
 (2, 0.14336469175790142),
 (3, 0.14913808879720644),
 (4, 0.18731716171680116),
 (5, 0.19524770945331216),
 (6, 0.04689342344440567),
 (7, 0.26331130930551977),
 (8, 0.26416774953322836),
 (9, 0.19340339809371676),
 (10, 0.31448092903974306),
 (11, 0.3189738583094442),
 (12, 0.096278199935693),
 (13, 0.10405330187201527),
 (14, 0.17849338528617428),
 (15, 0.06802944545411793),
 (16, 0.043326972794528074),
 (17, 0.08654357014989379),
 (18, 0.3695068229344004),
 (19, 0.37115918582026985),
 (20, 0.11042566660647649),
 (21, 0.20566964840912783),
 (22, 0.13371304691656322),
 (23, 0.208036474980719)]


### 2. Defauld HDP model

In [2]:
from gensim.models import HdpModel

# calculate hdp model with default parameters
hdp_model = HdpModel(tfidf_corpus, id2word)

In [3]:
# show initial topics
hdp_model.show_topics()

[(0,
  '0.004*canal + 0.003*decent + 0.003*attack + 0.003*freeze + 0.003*labor + 0.002*ready + 0.002*shoulder + 0.002*perhaps + 0.002*heap + 0.002*back + 0.002*join + 0.002*heaven + 0.002*sat + 0.002*combination + 0.002*target + 0.002*lifetime + 0.002*dress + 0.002*overcrowded + 0.002*nice + 0.002*test'),
 (1,
  '0.003*hamburger + 0.002*lesson + 0.002*still + 0.002*differently + 0.002*animation + 0.002*par + 0.002*unhappy + 0.002*thoroughly + 0.002*planning + 0.002*reasonable + 0.002*seeker + 0.002*entirely + 0.002*pretty + 0.002*asap + 0.002*snack + 0.002*human + 0.002*picture + 0.002*dinosaur + 0.002*vader + 0.002*treatment'),
 (2,
 (3,
  '0.003*close + 0.003*tuesday + 0.003*sept + 0.002*mobile + 0.002*alcohol + 0.002*apps + 0.002*factor + 0.002*brother + 0.002*grizzly + 0.002*scare + 0.002*lake + 0.002*pro + 0.002*low + 0.002*everybody + 0.002*jam + 0.002*delightful + 0.002*decide + 0.002*discount + 0.002*language + 0.002*february'),
 (4,
  '0.003*humid + 0.002*submarine + 0.002*bub

#### Default coherence score

In [4]:
from gensim.models import CoherenceModel

# Compute Coherence Score with default alpha and beta values
coherence_model = CoherenceModel(model=hdp_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('Coherence Score: ', coherence_score)

Coherence Score:  0.48511618616782565


### 3. Compute model performance metrics
#### Calculation of Coherence Score with varying Truncation Level

In [5]:
# Define function for calculation of coherence values
def compute_coherence_values(corpus, dictionary, model):
    coherence_model = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

In [6]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from gensim.models import HdpModel
import pandas as pd

# Create list with varying truncation levels between 5 and 50
t_list = [t for t in range(5, 50, 5)]
results = []

# Define funtion for training hdp model with varying truncation level
def train_hdp_model_t(t):
    hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, T=t)
    
    # Compute coherence value for the trained model
    cv = compute_coherence_values(corpus=tfidf_corpus, dictionary=id2word, model=hdp_model)
    
    # Get the number of topics generated by the model
    num_topics = len(hdp_model.get_topics())
    
    # Store the result in a dictionary
    result = {'Truncation Level': t, 'Coherence': cv, 'Num_Topics': num_topics}
    
    return result

# Create a progress bar and a thread pool executor for concurrent execution
with tqdm(total=len(t_list), desc="Training HDP Models") as pbar, ThreadPoolExecutor() as executor:
    # Submit tasks to the executor for each truncation level in t_list
    futures = [executor.submit(train_hdp_model_t, t) for t in t_list]
    
    # Wait for the tasks to complete and retrieve the results
    for future in as_completed(futures):
        result = future.result()
        results.append(result)
        pbar.update(1)

# Create a pandas DataFrame from the results
df2 = pd.DataFrame(results)

# Sort the DataFrame based on the 'Coherence' column in descending order
df2.sort_values('Coherence', ascending=False)


Training HDP Models: 100%|████████████████████████| 9/9 [04:16<00:00, 28.47s/it]


Unnamed: 0,Truncation Level,Coherence,Num_Topics
7,40,0.514168,40
3,20,0.505223,20
5,30,0.504326,30
2,15,0.504101,15
4,25,0.49776,25
1,10,0.49394,10
0,5,0.492919,5
6,35,0.473465,35
8,45,0.472601,45


#### Calculation of Coherence Score with varying alpha, beta & gamma

In [9]:
# Define function for training the model with different parameters and predefined truncation level
def train_hdp_model_params(alpha, beta, gamma):
    hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, alpha=alpha, eta=beta, gamma=gamma, T=30)
    
    # Compute coherence value for the trained model
    cv = compute_coherence_values(corpus=tfidf_corpus, dictionary=id2word, model=hdp_model)
    
    # Get the number of topics generated by the model
    num_topics = len(hdp_model.get_topics())
    return cv, num_topics

# define range for alpha, beta & gamma
alpha_list = [0.01, 0.31, 0.61, 0.91]
beta_list = [0.01, 0.31, 0.61, 0.91]
gamma_list = [1,2,3]

results = []


# Iterate over alpha and gamma values
for alpha in alpha_list:
    for beta in beta_list:
        for gamma in gamma_list:
            # Train the HDP model
            cv, num_topics = train_hdp_model_params(alpha, beta, gamma)

            # Store the results
            result = {'Alpha': alpha, 'Beta': beta, 'Gamma': gamma, 'Coherence': cv, 'Num_Topics': num_topics}
            results.append(result)
        
# create data frame with results                
df3 = pd.DataFrame(results)
df3.to_csv('hdp_tuning_results_gensim.csv', index=False)

In [10]:
# Read the model_results from the CSV file
model_results = pd.read_csv('hdp_tuning_results_gensim.csv')

# display the metrics and coherence score
metrics_df = model_results.sort_values('Coherence', ascending=False).head()   
metrics_df

Unnamed: 0,Alpha,Beta,Gamma,Coherence,Num_Topics
45,0.91,0.91,1,0.529403,30
34,0.61,0.91,2,0.509548,30
30,0.61,0.61,1,0.507113,30
43,0.91,0.61,2,0.506699,30
26,0.61,0.01,3,0.506229,30


#### Selection of the optimum alpha and beta values

In [11]:
# select alpha, beta & gamma with the highest coherence value from the dataframe
alpha = metrics_df.iloc[0,1]
beta = metrics_df.iloc[0,2]
gamma = metrics_df.iloc[0,3]

print(alpha, beta, gamma)
    

# calculate the hdp with the selected parameters
best_hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, alpha=alpha, eta=beta, gamma=gamma, T=30)

0.91 1 0.5294031868100869


#### Topics distribution across documents

In [12]:
import numpy as np

# Define function for extracting the topic probabilities and weights from the HDP model
def topic_prob_extractor(gensim_hdp):   
    shown_topics = gensim_hdp.show_topics(num_topics=30, formatted=False)
    topics_nos = [x[0] for x in shown_topics]
    weights = [sum([item[1] for item in x[1]]) for x in shown_topics]

    return pd.DataFrame({'topic_id': topics_nos, 'weight': weights})

# Retrieve the document-topic assignments
topic_assignments = best_hdp_model[tfidf_corpus]

# Count the number of documents assigned to each topic
topic_counts = {}
for doc_topics in topic_assignments:
    for topic_id, topic_prob in doc_topics:
        topic_counts[topic_id] = topic_counts.get(topic_id, 0) + 1

# Create the DataFrame with topic counts and weights
df_document_topic = pd.DataFrame({'Num Documents': topic_counts}).sort_values(by='Num Documents', ascending=False)

# Calculate the topic weights
df_topic_weights = topic_prob_extractor(best_hdp_model)

# Merge the topic weights into the DataFrame
df_document_topic = df_document_topic.merge(df_topic_weights, left_index=True, right_on='topic_id')
df_document_topic = df_document_topic[['topic_id', 'Num Documents', 'weight']]

# Sort the DataFrame by weights in descending order
df_document_topic = df_document_topic.sort_values(by='weight', ascending=False)

# Calculate the topic distribution
df_topic_distribution = df_document_topic['topic_id'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']

# Display the DataFrame
df_document_topic

Unnamed: 0,topic_id,Num Documents,weight
27,27,1014,0.046535
28,28,1029,0.044604
18,18,1191,0.043173
25,25,1057,0.04317
1,1,945,0.043155
17,17,1051,0.042805
7,7,1108,0.042653
2,2,949,0.04262
15,15,771,0.042584
4,4,1233,0.042273


#### Intertopic Distance Map

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings

# disable deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(best_hdp_model, tfidf_corpus, id2word)
vis

![Inter-topic Distance Map](Topic_Distance_Maps/Intertopic_Map_HDP_Gensim.jpg)

### 4. Final Results
#### Top 10 words assigned to each topic

In [None]:
# Show top 10 keywords for each topic
def show_topics(hdp_model, n_words=20):
    topic_keywords = []
    for topic_weights in hdp_model.get_topics():
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append([id2word[idx] for idx in top_keyword_locs])
    return topic_keywords

topic_keywords = show_topics(hdp_model=best_hdp_model, n_words=10)        

# Topic-Keywords DataFrame
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i+1) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i+1) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

#### Assignment of the reviews to the topics

In [None]:
# Set column width to maximum to see the whole review
pd.set_option('display.max_colwidth', None)

# Reset index of the DataFrame for matching
df = df.reset_index(drop=True)
df_document_topic = df_document_topic.reset_index(drop=True)

# Join the dataset with the 'topic_id' column from df_document_topic
df_joined = pd.merge(df, df_document_topic['topic_id'], left_index=True, right_index=True)

df_joined.head(10)