## HDP Model with Genesim
### 1. Corpus Creation

In [2]:
# Retrieve data from Preprocessing
%store -r data_lemmatized df

from gensim import corpora, models
from pprint import pprint
import gensim

# Create Dictionary
id2word = gensim.corpora.Dictionary(data_lemmatized)

# Filtering out of tokens appearing in less than 20 documents or more than 70% of documents
id2word.filter_extremes(no_below=20, no_above=0.7)

# Create Corpus
bow_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

# Create the TF-IDF model based on the bag-of-words corpus
tfidf_model = models.TfidfModel(bow_corpus)

# Apply the TF-IDF transformation to the corpus
tfidf_corpus = tfidf_model[bow_corpus]

for doc in tfidf_corpus:
    pprint(doc)
    break

[(0, 0.17690613756060086),
 (1, 0.13205008895998877),
 (2, 0.14336469175790142),
 (3, 0.14913808879720644),
 (4, 0.18731716171680116),
 (5, 0.19524770945331216),
 (6, 0.04689342344440567),
 (7, 0.26331130930551977),
 (8, 0.26416774953322836),
 (9, 0.19340339809371676),
 (10, 0.31448092903974306),
 (11, 0.3189738583094442),
 (12, 0.096278199935693),
 (13, 0.10405330187201527),
 (14, 0.17849338528617428),
 (15, 0.06802944545411793),
 (16, 0.043326972794528074),
 (17, 0.08654357014989379),
 (18, 0.3695068229344004),
 (19, 0.37115918582026985),
 (20, 0.11042566660647649),
 (21, 0.20566964840912783),
 (22, 0.13371304691656322),
 (23, 0.208036474980719)]


### 2. Defauld HDP model

In [3]:
from gensim.models import HdpModel

# calculate hdp model with default parameters
hdp_model = HdpModel(tfidf_corpus, id2word)

In [4]:
# show initial topics
hdp_model.show_topics()

[(0,
  '0.003*family + 0.003*frequently + 0.003*trash + 0.002*clock + 0.002*machine + 0.002*mad + 0.002*weather + 0.002*monster + 0.002*gotta + 0.002*theme + 0.002*weekday + 0.002*thus + 0.002*chaotic + 0.002*start + 0.002*tight + 0.002*huge + 0.002*marathon + 0.002*express + 0.002*delay + 0.002*position'),
 (1,
  '0.004*really + 0.003*grandma + 0.003*twain + 0.003*round + 0.003*folk + 0.003*certainly + 0.002*stall + 0.002*significant + 0.002*popcorn + 0.002*buzz + 0.002*manageable + 0.002*dislike + 0.002*earth + 0.002*addition + 0.002*enjoyment + 0.002*kinda + 0.002*classic + 0.002*notch + 0.002*tomorrow + 0.002*animatronics'),
 (2,
  '0.003*others + 0.002*romantic + 0.002*support + 0.002*solid + 0.002*closure + 0.002*generation + 0.002*basic + 0.002*constantly + 0.002*special + 0.002*whatever + 0.002*league + 0.002*race + 0.002*decor + 0.002*dad + 0.002*efficient + 0.002*away + 0.002*soooo + 0.002*caters + 0.002*start + 0.002*theme'),
 (3,
  '0.003*awesome + 0.003*costco + 0.002*amer

#### Default coherence score

In [5]:
from gensim.models import CoherenceModel

# Compute Coherence Score with default alpha and beta values
coherence_model = CoherenceModel(model=hdp_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('Coherence Score: ', coherence_score)

Coherence Score:  0.4922253290232601


### 3. Compute model performance metrics
#### Calculation of Coherence Score with varying Truncation Level

In [6]:
# Define function for calculation of coherence values
def compute_coherence_values(corpus, dictionary, model):
    coherence_model = CoherenceModel(model=model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

In [6]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from gensim.models import HdpModel
import pandas as pd

# Create list with varying truncation levels between 5 and 50
t_list = [t for t in range(5, 50, 5)]
results = []

# Define funtion for training hdp model with varying truncation level
def train_hdp_model_t(t):
    hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, T=t)
    
    # Compute coherence value for the trained model
    cv = compute_coherence_values(corpus=tfidf_corpus, dictionary=id2word, model=hdp_model)
    
    # Get the number of topics generated by the model
    num_topics = len(hdp_model.get_topics())
    
    # Store the result in a dictionary
    result = {'Truncation Level': t, 'Coherence': cv, 'Num_Topics': num_topics}
    
    return result

# Create a progress bar and a thread pool executor for concurrent execution
with tqdm(total=len(t_list), desc="Training HDP Models") as pbar, ThreadPoolExecutor() as executor:
    # Submit tasks to the executor for each truncation level in t_list
    futures = [executor.submit(train_hdp_model_t, t) for t in t_list]
    
    # Wait for the tasks to complete and retrieve the results
    for future in as_completed(futures):
        result = future.result()
        results.append(result)
        pbar.update(1)

# Create a pandas DataFrame from the results
df2 = pd.DataFrame(results)

# Sort the DataFrame based on the 'Coherence' column in descending order
df2.sort_values('Coherence', ascending=False)


Training HDP Models: 100%|████████████████████████| 9/9 [04:16<00:00, 28.47s/it]


Unnamed: 0,Truncation Level,Coherence,Num_Topics
7,40,0.514168,40
3,20,0.505223,20
5,30,0.504326,30
2,15,0.504101,15
4,25,0.49776,25
1,10,0.49394,10
0,5,0.492919,5
6,35,0.473465,35
8,45,0.472601,45


#### Calculation of Coherence Score with varying alpha, beta & gamma

In [9]:
# Define function for training the model with different parameters and predefined truncation level
def train_hdp_model_params(alpha, beta, gamma):
    hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, alpha=alpha, eta=beta, gamma=gamma, T=30)
    
    # Compute coherence value for the trained model
    cv = compute_coherence_values(corpus=tfidf_corpus, dictionary=id2word, model=hdp_model)
    
    # Get the number of topics generated by the model
    num_topics = len(hdp_model.get_topics())
    return cv, num_topics

# define range for alpha, beta & gamma
alpha_list = [0.01, 0.31, 0.61, 0.91]
beta_list = [0.01, 0.31, 0.61, 0.91]
gamma_list = [1,2,3]

results = []


# Iterate over alpha and gamma values
for alpha in alpha_list:
    for beta in beta_list:
        for gamma in gamma_list:
            # Train the HDP model
            cv, num_topics = train_hdp_model_params(alpha, beta, gamma)

            # Store the results
            result = {'Alpha': alpha, 'Beta': beta, 'Gamma': gamma, 'Coherence': cv, 'Num_Topics': num_topics}
            results.append(result)
        
# create data frame with results                
df3 = pd.DataFrame(results)
df3.to_csv('hdp_tuning_results_gensim.csv', index=False)

In [8]:
# Read the model_results from the CSV file
model_results = pd.read_csv('hdp_tuning_results_gensim.csv')

# display the metrics and coherence score
metrics_df = model_results.sort_values('Coherence', ascending=False).head()   
metrics_df

Unnamed: 0,Alpha,Beta,Gamma,Coherence,Num_Topics
45,0.91,0.91,1,0.529403,30
34,0.61,0.91,2,0.509548,30
30,0.61,0.61,1,0.507113,30
43,0.91,0.61,2,0.506699,30
26,0.61,0.01,3,0.506229,30


#### Selection of the optimum alpha and beta values

In [9]:
# select alpha, beta & gamma with the highest coherence value from the dataframe
alpha = metrics_df.iloc[0,1]
beta = metrics_df.iloc[0,2]
gamma = metrics_df.iloc[0,3]

print(alpha, beta, gamma)
    

# calculate the hdp with the selected parameters
best_hdp_model = HdpModel(corpus=tfidf_corpus, id2word=id2word, alpha=alpha, eta=beta, gamma=gamma, T=30)

0.91 1 0.5294031868100869


#### Topics distribution across documents

In [10]:
import numpy as np

# Define function for extracting the topic probabilities and weights from the HDP model
def topic_prob_extractor(gensim_hdp):   
    shown_topics = gensim_hdp.show_topics(num_topics=30, formatted=False)
    topics_nos = [x[0] for x in shown_topics]
    weights = [sum([item[1] for item in x[1]]) for x in shown_topics]

    return pd.DataFrame({'topic_id': topics_nos, 'weight': weights})

# Retrieve the document-topic assignments
topic_assignments = best_hdp_model[tfidf_corpus]

# Count the number of documents assigned to each topic
topic_counts = {}
for doc_topics in topic_assignments:
    for topic_id, topic_prob in doc_topics:
        topic_counts[topic_id] = topic_counts.get(topic_id, 0) + 1

# Create the DataFrame with topic counts and weights
df_document_topic = pd.DataFrame({'Num Documents': topic_counts}).sort_values(by='Num Documents', ascending=False)

# Calculate the topic weights
df_topic_weights = topic_prob_extractor(best_hdp_model)

# Merge the topic weights into the DataFrame
df_document_topic = df_document_topic.merge(df_topic_weights, left_index=True, right_on='topic_id')
df_document_topic = df_document_topic[['topic_id', 'Num Documents', 'weight']]

# Sort the DataFrame by weights in descending order
df_document_topic = df_document_topic.sort_values(by='weight', ascending=False)

# Calculate the topic distribution
df_topic_distribution = df_document_topic['topic_id'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']

# Display the DataFrame
df_document_topic

Unnamed: 0,topic_id,Num Documents,weight
14,14,950,0.043866
8,8,1163,0.04318
17,17,772,0.042536
11,11,956,0.042532
7,7,1043,0.042507
28,28,930,0.042169
20,20,891,0.04202
2,2,1057,0.041893
6,6,1018,0.041553
1,1,1333,0.041423


#### Intertopic Distance Map

In [11]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings

# disable deprecation warning
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(best_hdp_model, tfidf_corpus, id2word)
vis

![Inter-topic Distance Map](Topic_Distance_Maps/Intertopic_Map_HDP_Gensim.jpg)

### 4. Final Results
#### Top 10 words assigned to each topic

In [12]:
# Show top 10 keywords for each topic
def show_topics(hdp_model, n_words=20):
    topic_keywords = []
    for topic_weights in hdp_model.get_topics():
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append([id2word[idx] for idx in top_keyword_locs])
    return topic_keywords

topic_keywords = show_topics(hdp_model=best_hdp_model, n_words=10)        

# Topic-Keywords DataFrame
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i+1) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i+1) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,third,duck,bummer,impatient,personal,virtually,magic,significantly,button,fee
Topic 2,fan,room,downtown,extremely,inn,cent,type,sore,sunday,cut
Topic 3,colorful,road,rider,beautiful,invest,sooo,fan,baby,healthy,clam
Topic 4,peak,shoot,starbucks,bring,magical,behavior,still,enormous,moderate,citypass
Topic 5,yummy,get,autographs,felt,own,gate,final,barely,tripadvisor,possible
Topic 6,enthusiasm,price,aside,matter,above,booking,operate,nightly,vip,step
Topic 7,haven,resident,pickle,factor,racer,major,getaway,buggy,fellow,congestion
Topic 8,random,june,sooooo,denny,common,jerky,zone,today,theatre,suffer
Topic 9,chat,ride,contact,hassle,bay,starbucks,woody,tough,order,merchandise
Topic 10,much,shoe,tomorrow,spend,definitely,hr,computer,lovely,celebration,marvel


#### Assignment of the reviews to the topics

In [13]:
# Set column width to maximum to see the whole review
pd.set_option('display.max_colwidth', None)

# Reset index of the DataFrame for matching
df = df.reset_index(drop=True)
df_document_topic = df_document_topic.reset_index(drop=True)

# Join the dataset with the 'topic_id' column from df_document_topic
df_joined = pd.merge(df, df_document_topic['topic_id'], left_index=True, right_index=True)

df_joined.head(10)

Unnamed: 0,Review_Text,topic_id
0,"this place has always been and forever will be special. the feeling you get entering the park, seeing the characters and different attractions is just priceless. this is definitely a dream trip for all ages, especially young kids. spend the money and go to disneyland, you will not regret it",14
1,"a great day of simple fun and thrills. bring cash, nothing is cheap, but we knew that it's disney. but they are great letting you bring in your own food, drinks, etc but read the list closely, we list several items at the entry gates (selfy sticks, glass refill bottles, etc). it is worth buying the photo pass and fastpass. have fun!",8
2,all and all a great day was had. the crowds are huge and ride times sometimes up to a 50 min wait but worth it. very disappointed the castle was under repair and covered but we understood. if you want to take a short cut on the rides grab a max pass for 15 each it allows you to book a time every 90mins and believe me it's a great feeling floating past the ques.,17
3,"having been to the florida location numerous times over the years i didn't know how this one would compare. the much smaller footprint made it quicker easier to get from ride to ride. the cinderella's castle was much much smaller, but also being renovated. many of the rides were identical, while some were notably different. the indiana jones ride (which fl does not have) was awesome. space mountain was much more puke worthy. it's a small world was way more beautiful from the exterior. they had several of the old rides that fl doesn't have (like pinocchio and mr. toad). i love how their fast passes worked... it was easy to use and we were able to use them back to back on rides, fast passing almost everything we wanted to ride. we went the first week of april and while crowded, i didn't think it was too bad. we were only in the park for like 7 hours and i do wish i'd had about 3 more hours to do everything.",11
4,"had the 4 day pass, spent 3 at dl and one at ca. great place to visit. we will be back. saw a lot in those days, but there is more that we missed, and more we want to see again.",7
5,oh my god you can really forget your self and enjoy everything you face there its a huge word of fun,28
6,"we were so sad that the castle was under refurbishment, but we didn t let it ruin our good time. i purchased the maxpass before we went and it was super easy to use and very convenient. maxpass includes free pictures taken by disney photographers. i have been to disney world and there is no shortage of photographers. this was not the case here. they definitely need to add more. we were able to do all of the rides that we wanted to do, and we got the snacks that we wanted to try. we spent two days here and on the last day we watched the fireworks show. it was amazing. all of the workers were polite and helpful.",20
7,"took my son to the park on a weekend(fri sun). very busy all over the places. my son enjoyed it, but he was very sad that he could not try a lot of attractions as the wait time was huge.....( ranging from 60 90 minutes) if you have a lot of time to waste waiting in the line for the attractions, fastpass system at the park is just a ripoff. you can have only one attraction added to your plan at any point, once you complete that attraction you can add one more.",2
8,"there's nothing like disneyland and it should be on your bucket list if you've never been. park is well maintained and parking is ok, but not cheap at $25. tickets can cost up to $150 per adult (10 years old and up) during peak periods. food is not cheap either. if you take a family without a season's pass, you could easily spend hundreds of dollars just on admission tickets. if you go frequently, get a disney credit card to get free park perks and park promotions. downtown disney is also a fun place to visit.",6
9,"i think it s because going to disneyland is one of my earliest memories, but i just love visiting the park every few years. you won t find a place with friendlier more helpful staff and the entire park is so clean. yes, it s expensive for many and there are crowds, but you can adjust the timing of your visit for fewer crowds and you can visit on a budget if you plan ahead. it s such a special place for so many people and kids enjoy themselves too. there are loads of online resources on how to visit on a budget such as staying at an offsite hotel and bringing your own snacks and meals. i highly recommend that everyone try it once, but be warned, it s highly addictive.",1
