# Analysis of Musical and Lyrical Trends at Intelligent Interactive Systems (MIIS)

#### Oktay Ozan Güner   -  ID : OZAN_ID
#### Juan Miguel Alfonso Habana   -  ID : MIGUEL_ID

# Introduction

Since 1990, the way people interact with music has evolved significantly. This period marks a transition from the tangible, physical media of CDs and vinyl to the intangible, yet infinitely accessible world of digital music. 
* How have digitalization and streaming influenced listeners?
* How have listening habits changed over time?

We'll examine how our music listening habits have changed, from the duration of songs to the sentiment of their lyrics.


In [1]:
# INSTALLING RELATED PACKAGES
%%capture
!pip install --upgrade pip setuptools wheel
!pip install bertopic --no-cache-dir
!pip uninstall hdbscan -y
#!pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation
!pip install hdbscan==0.8.28

!pip install ctransformers[cuda]
!pip install bertopic
#!pip install --upgrade git+https://github.com/huggingface/transformers
!pip install nvidia-pyindex
!pip install ctransformers
!pip install -U accelerate
!pip3 install --upgrade scipy
!pip install transformers -U
!pip install numpy==1.21
!pip install llvmlite --ignore-installed
!pip install joblib==1.1.0
!conda install -c conda-forge hdbscan -y


!pip install umap-learn
!pip install sentence-transformers
!pip install keybert

# Installing the packages that is needed to measure coherence score
!pip install gensim

In [5]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings

filterwarnings('ignore')


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [6]:
# Load the precomputed chorus embeddings together with the final chorus table.
embeds = np.load('Embeds_Chorus_20240312.npy')
# The CSV contains an "Unnamed: 0" column (presumably a saved index); discard it on load.
doc_df = pd.read_csv("Chorus_Data_20240312.csv").drop(columns="Unnamed: 0")

In [8]:
# Collect every tokenized chorus into a plain Python list; this list is what the
# vocabulary builder and the topic model consume as their documents.
# NOTE(review): assumes every `Tokenized_Chorus` entry is a string (no NaN) -- confirm.
docs = doc_df["Tokenized_Chorus"].tolist()

# BERTopic Model

In [16]:
# Getting the frequency of the words in choruses to create a vocabulary
from sklearn.feature_extraction.text import CountVectorizer
import collections
from tqdm import tqdm

# Count how often each token occurs across all choruses.
word_counts = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(docs):
  word_counts.update(tokenizer(doc))

# Keep only the tokens seen at least twice; the vocabulary size is the cell output.
vocab = [word for word in word_counts if word_counts[word] >= 2]
len(vocab)

100%|██████████| 9903/9903 [00:00<00:00, 23758.69it/s]


10389

In [18]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim import corpora

def bert_coherence_score(topic_model, docs):
  """
  Measures the c_v coherence score of a fitted BERTopic model.

  All documents assigned to the same topic are joined into one long document,
  tokenized with the model's own vectorizer analyzer, and the top words of every
  non-outlier topic are scored against those tokens with gensim's CoherenceModel.

  Parameters:
  - topic_model (BERTopic): The fitted model; `topic_model.topics_` must hold one
    topic id per entry of `docs`.
  - docs (list): The documents the model was fitted on, in the same order.

  Returns:
  - float value: representing coherence score of the model.
  """

  # Preprocess Documents: one concatenated document per topic id
  topics = topic_model.topics_
  documents = pd.DataFrame({"Document": docs,
                            "ID": range(len(docs)),
                            "Topic": topics})
  documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
  cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

  # Only the analyzer of BERTopic's vectorizer is needed. The feature-name list
  # that used to be fetched here was never used, and fetching it relied on an
  # accessor that newer scikit-learn versions no longer provide.
  analyzer = topic_model.vectorizer_model.build_analyzer()

  # Extract features for Topic Coherence evaluation
  tokens = [analyzer(doc) for doc in cleaned_docs]
  dictionary = corpora.Dictionary(tokens)
  corpus = [dictionary.doc2bow(token) for token in tokens]

  # Score every real topic. Topic -1 is the outlier bucket and is skipped
  # explicitly, so no topic is lost when the model produced no outliers.
  topic_ids = sorted(topic for topic in set(topics) if topic != -1)
  # Empty strings are ignored so they are not scored as topic words.
  topic_words = [[word for word, _ in topic_model.get_topic(topic) if word]
                 for topic in topic_ids]

  # Evaluate
  coherence_model = CoherenceModel(topics=topic_words,
                                   texts=tokens,
                                   corpus=corpus,
                                   dictionary=dictionary,
                                   coherence='c_v')
  coherence = coherence_model.get_coherence()

  return coherence

## HYPERPARAMETER TUNING

In [20]:
# Importing related packages
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import time
from warnings import filterwarnings
import itertools
from llama_cpp import Llama

filterwarnings("ignore")

transformers_name = "all-MiniLM-L6-v2"

embed_model = SentenceTransformer(transformers_name)

# Candidate values for `nr_topics`; one BERTopic model is fitted and scored per value.
number_of_top_list = list(range(10,20))


# One result row per candidate; `opt_df` is rebuilt from these rows after every
# iteration so the running table can be printed and is available afterwards.
opt_records = []
opt_df = pd.DataFrame()
for number_of_top in number_of_top_list:
  start_time = time.time()

  print(f"Parameter Set : number_of_top -> {number_of_top}")

  print("UMAP Model")
  umap_model = UMAP(metric="cosine", random_state=99, n_jobs=-1)      # Dimension reduction.
  print("-"*20)

  print("HDBSCAN Model")
  hdbscan_model = HDBSCAN(metric='euclidean', cluster_selection_method='eom', prediction_data=True)     # Clustering algorithm
  vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english", ngram_range=(1,2))     # Determining frequency of the words.
  representation_model = KeyBERTInspired()        # Using for topic representation.


  print("-"*20)

  print("Bertopic Model")
  # The precomputed `embeds` are passed to fit, so the documents are not re-embedded here.
  topic_model= BERTopic(
      n_gram_range=(1,2),
      nr_topics=number_of_top,
      language="english",
      embedding_model=embed_model,
      umap_model=umap_model,
      hdbscan_model=hdbscan_model,
      vectorizer_model=vectorizer_model,
      representation_model=representation_model,
      verbose=True
  ).fit(docs, embeddings=embeds)

  # Measuring the number of outliers after modeling. Topic -1 represents outlier.
  # `topic_count` is the number of rows in the frequency table, so it includes
  # the outlier row when one exists.
  top_freq = topic_model.get_topic_freq()
  topic_count = len(top_freq)
  # Explicit "no outlier row" check instead of a bare `except:`, so that genuine
  # errors (e.g. a missing column) are no longer silently turned into 0.
  outlier_rows = top_freq.loc[top_freq["Topic"]==-1,"Count"]
  outlier_count = outlier_rows.iloc[0] if len(outlier_rows) else 0
  print("-"*20)

  print("Coherence Score is calculated")
  coh_score = bert_coherence_score(topic_model, docs)

  opt_records.append({"Nr_Topics": number_of_top,
                      "Topic_Count": topic_count,
                      "Outlier_Count": outlier_count,
                      "Coherence_Score": coh_score})
  opt_df = pd.DataFrame(opt_records)

  end_time = time.time()
  print(f"\nProcess Time: {(end_time-start_time)/60} minutes")
  print(opt_df)
  print("#"*50)



2024-03-12 18:40:59,084 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Parameter Set : number_of_top -> 8
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:41:16,966 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:41:16,968 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:41:17,281 - BERTopic - Cluster - Completed ✓
2024-03-12 18:41:17,282 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:43:08,472 - BERTopic - Representation - Completed ✓
2024-03-12 18:43:08,475 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:43:13,887 - BERTopic - Topic reduction - Reduced number of topics from 380 to 8


--------------------
Coherence Score is calculated


2024-03-12 18:43:29,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.5117960969607034 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
##################################################
Parameter Set : number_of_top -> 9
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:43:47,079 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:43:47,080 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:43:47,390 - BERTopic - Cluster - Completed ✓
2024-03-12 18:43:47,391 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:45:40,099 - BERTopic - Representation - Completed ✓
2024-03-12 18:45:40,101 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:45:45,622 - BERTopic - Topic reduction - Reduced number of topics from 380 to 9


--------------------
Coherence Score is calculated


2024-03-12 18:46:01,600 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.530096383889516 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
##################################################
Parameter Set : number_of_top -> 10
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:46:18,665 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:46:18,667 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:46:18,971 - BERTopic - Cluster - Completed ✓
2024-03-12 18:46:18,972 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:48:14,218 - BERTopic - Representation - Completed ✓
2024-03-12 18:48:14,220 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:48:20,542 - BERTopic - Topic reduction - Reduced number of topics from 380 to 10


--------------------
Coherence Score is calculated


2024-03-12 18:48:39,253 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.627496604124705 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
##################################################
Parameter Set : number_of_top -> 11
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:48:56,051 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:48:56,053 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:48:56,309 - BERTopic - Cluster - Completed ✓
2024-03-12 18:48:56,310 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:50:53,033 - BERTopic - Representation - Completed ✓
2024-03-12 18:50:53,035 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:50:58,808 - BERTopic - Topic reduction - Reduced number of topics from 380 to 11


--------------------
Coherence Score is calculated


2024-03-12 18:51:15,144 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.5981733322143556 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
##################################################
Parameter Set : number_of_top -> 12
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:51:32,228 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:51:32,229 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:51:32,535 - BERTopic - Cluster - Completed ✓
2024-03-12 18:51:32,536 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:53:18,160 - BERTopic - Representation - Completed ✓
2024-03-12 18:53:18,169 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:53:23,842 - BERTopic - Topic reduction - Reduced number of topics from 380 to 12


--------------------
Coherence Score is calculated


2024-03-12 18:53:38,727 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.3929887334505717 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
##################################################
Parameter Set : number_of_top -> 13
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:53:53,442 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:53:53,445 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:53:53,700 - BERTopic - Cluster - Completed ✓
2024-03-12 18:53:53,701 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:55:35,387 - BERTopic - Representation - Completed ✓
2024-03-12 18:55:35,389 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:55:41,545 - BERTopic - Topic reduction - Reduced number of topics from 380 to 13


--------------------
Coherence Score is calculated


2024-03-12 18:55:55,721 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.283165999253591 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
5         13           13           4605         0.590749
##################################################
Parameter Set : number_of_top -> 14
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:56:10,706 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:56:10,708 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:56:10,972 - BERTopic - Cluster - Completed ✓
2024-03-12 18:56:10,973 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 18:58:01,167 - BERTopic - Representation - Completed ✓
2024-03-12 18:58:01,169 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 18:58:08,215 - BERTopic - Topic reduction - Reduced number of topics from 380 to 14


--------------------
Coherence Score is calculated


2024-03-12 18:58:24,894 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.486223832766215 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
5         13           13           4605         0.590749
6         14           14           4605         0.589477
##################################################
Parameter Set : number_of_top -> 15
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 18:58:42,113 - BERTopic - Dimensionality - Completed ✓
2024-03-12 18:58:42,114 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 18:58:42,436 - BERTopic - Cluster - Completed ✓
2024-03-12 18:58:42,436 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:00:36,510 - BERTopic - Representation - Completed ✓
2024-03-12 19:00:36,512 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:00:43,914 - BERTopic - Topic reduction - Reduced number of topics from 380 to 15


--------------------
Coherence Score is calculated


2024-03-12 19:01:01,034 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.602269717057546 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
5         13           13           4605         0.590749
6         14           14           4605         0.589477
7         15           15           4605         0.593707
##################################################
Parameter Set : number_of_top -> 16
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 19:01:17,892 - BERTopic - Dimensionality - Completed ✓
2024-03-12 19:01:17,894 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 19:01:18,175 - BERTopic - Cluster - Completed ✓
2024-03-12 19:01:18,176 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:03:04,993 - BERTopic - Representation - Completed ✓
2024-03-12 19:03:04,994 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:03:12,611 - BERTopic - Topic reduction - Reduced number of topics from 380 to 16


--------------------
Coherence Score is calculated


2024-03-12 19:03:29,240 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.4700695673624673 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
5         13           13           4605         0.590749
6         14           14           4605         0.589477
7         15           15           4605         0.593707
8         16           16           4605         0.598513
##################################################
Parameter Set : number_of_top -> 17
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 19:03:46,177 - BERTopic - Dimensionality - Completed ✓
2024-03-12 19:03:46,179 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 19:03:46,445 - BERTopic - Cluster - Completed ✓
2024-03-12 19:03:46,446 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:05:36,237 - BERTopic - Representation - Completed ✓
2024-03-12 19:05:36,240 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:05:43,913 - BERTopic - Topic reduction - Reduced number of topics from 380 to 17


--------------------
Coherence Score is calculated


2024-03-12 19:06:01,031 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.529796318213145 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0          8            8           4605         0.626766
1          9            9           4605         0.623014
2         10           10           4605         0.585542
3         11           11           4605         0.595424
4         12           12           4605         0.599102
5         13           13           4605         0.590749
6         14           14           4605         0.589477
7         15           15           4605         0.593707
8         16           16           4605         0.598513
9         17           17           4605         0.616256
##################################################
Parameter Set : number_of_top -> 18
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 19:06:17,520 - BERTopic - Dimensionality - Completed ✓
2024-03-12 19:06:17,522 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 19:06:17,818 - BERTopic - Cluster - Completed ✓
2024-03-12 19:06:17,819 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:08:06,376 - BERTopic - Representation - Completed ✓
2024-03-12 19:08:06,379 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:08:14,102 - BERTopic - Topic reduction - Reduced number of topics from 380 to 18


--------------------
Coherence Score is calculated


2024-03-12 19:08:31,082 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm



Process Time: 2.5008034626642863 minutes
    Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0           8            8           4605         0.626766
1           9            9           4605         0.623014
2          10           10           4605         0.585542
3          11           11           4605         0.595424
4          12           12           4605         0.599102
5          13           13           4605         0.590749
6          14           14           4605         0.589477
7          15           15           4605         0.593707
8          16           16           4605         0.598513
9          17           17           4605         0.616256
10         18           18           4605         0.602966
##################################################
Parameter Set : number_of_top -> 19
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 19:08:47,671 - BERTopic - Dimensionality - Completed ✓
2024-03-12 19:08:47,672 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 19:08:47,946 - BERTopic - Cluster - Completed ✓
2024-03-12 19:08:47,947 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:10:36,249 - BERTopic - Representation - Completed ✓
2024-03-12 19:10:36,251 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:10:44,657 - BERTopic - Topic reduction - Reduced number of topics from 380 to 19


--------------------
Coherence Score is calculated

Process Time: 2.531596267223358 minutes
    Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0           8            8           4605         0.626766
1           9            9           4605         0.623014
2          10           10           4605         0.585542
3          11           11           4605         0.595424
4          12           12           4605         0.599102
5          13           13           4605         0.590749
6          14           14           4605         0.589477
7          15           15           4605         0.593707
8          16           16           4605         0.598513
9          17           17           4605         0.616256
10         18           18           4605         0.602966
11         19           19           4605         0.609011
##################################################


In [24]:
# Saving the parameter values in hyperparameter optimization process
opt_df.to_csv("Optimization_Topic_Model_20240312.csv")

# FINAL MODEL

In [42]:
# Read the tuning results back from disk and pick the `Nr_Topics` value of the
# row with the highest coherence score (higher coherence is better).
tuning_results = pd.read_csv("Optimization_Topic_Model_20240312.csv")
ranked_by_coherence = tuning_results.sort_values(by="Coherence_Score", ascending=False)
best_number_of_topics = ranked_by_coherence["Nr_Topics"].iloc[0]

In [43]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import time
from warnings import filterwarnings
import itertools
from llama_cpp import Llama

filterwarnings("ignore")

transformers_name = "all-MiniLM-L6-v2"

embed_model = SentenceTransformer(transformers_name)


# Select the best number of topic parameter.
number_of_top_list = [int(best_number_of_topics)]


# NOTE: `opt_df` is overwritten here with the single row of the final model.
# Re-running the earlier cell that saves `opt_df` to
# "Optimization_Topic_Model_20240312.csv" after this one would replace the
# tuning results on disk with just this row.
opt_records = []
opt_df = pd.DataFrame()
for number_of_top in number_of_top_list:
  start_time = time.time()

  print(f"Parameter Set : number_of_top -> {number_of_top}")

  print("UMAP Model")
  umap_model = UMAP(metric="cosine", random_state=99, n_jobs=-1)    # Dimension reduction.
  print("-"*20)

  print("HDBSCAN Model")
  hdbscan_model = HDBSCAN(metric='euclidean', cluster_selection_method='eom', prediction_data=True)   # Clustering
  vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english", ngram_range=(1,2))     # Determining frequency of the words.
  representation_model = KeyBERTInspired()        # Using for topic representation.


  print("-"*20)

  print("Bertopic Model")
  # The precomputed `embeds` are passed to fit, so the documents are not re-embedded here.
  topic_model= BERTopic(
      n_gram_range=(1,2),
      nr_topics=number_of_top,
      language="english",
      embedding_model=embed_model,
      umap_model=umap_model,
      hdbscan_model=hdbscan_model,
      vectorizer_model=vectorizer_model,
      representation_model=representation_model,
      verbose=True
  ).fit(docs, embeddings=embeds)

  # Measuring the number of outliers after modeling. Topic -1 represents outlier.
  # `topic_count` is the number of rows in the frequency table, so it includes
  # the outlier row when one exists.
  top_freq = topic_model.get_topic_freq()
  topic_count = len(top_freq)
  # Explicit "no outlier row" check instead of a bare `except:`, so that genuine
  # errors (e.g. a missing column) are no longer silently turned into 0.
  outlier_rows = top_freq.loc[top_freq["Topic"]==-1,"Count"]
  outlier_count = outlier_rows.iloc[0] if len(outlier_rows) else 0
  print("-"*20)

  print("Coherence Score is calculated")
  coh_score = bert_coherence_score(topic_model, docs)

  opt_records.append({"Nr_Topics": number_of_top,
                      "Topic_Count": topic_count,
                      "Outlier_Count": outlier_count,
                      "Coherence_Score": coh_score})
  opt_df = pd.DataFrame(opt_records)

  end_time = time.time()
  print(f"\nProcess Time: {(end_time-start_time)/60} minutes")
  print(opt_df)
  print("#"*50)



2024-03-12 19:52:54,824 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Parameter Set : number_of_top -> 17
UMAP Model
--------------------
HDBSCAN Model
--------------------
Bertopic Model


2024-03-12 19:53:10,143 - BERTopic - Dimensionality - Completed ✓
2024-03-12 19:53:10,145 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-12 19:53:10,405 - BERTopic - Cluster - Completed ✓
2024-03-12 19:53:10,406 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-12 19:54:51,631 - BERTopic - Representation - Completed ✓
2024-03-12 19:54:51,634 - BERTopic - Topic reduction - Reducing number of topics
2024-03-12 19:54:58,713 - BERTopic - Topic reduction - Reduced number of topics from 380 to 17


--------------------
Coherence Score is calculated

Process Time: 2.324249800046285 minutes
   Nr_Topics  Topic_Count  Outlier_Count  Coherence_Score
0         17           17           4605         0.616256
##################################################


In [44]:
# Final model Topic Information
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4605,-1_nigga_niggas_baby_wanna,"[nigga, niggas, baby, wanna, ain, stay, girl, ...",[girl you know i can provide yeah you can get ...
1,0,2807,0_love_wanna_stay_don,"[love, wanna, stay, don, babe, leave, ain, bab...",[you might also like because your love because...
2,1,1639,1_niggas_nigga_lil_ain,"[niggas, nigga, lil, ain, yo, gang, ride, noth...",[i put one leg on the headboard and leave the ...
3,2,368,2_dreamer_dreams_dream_heaven,"[dreamer, dreams, dream, heaven, pray, life, d...",[dream on dreamer life gets in your way then y...
4,3,270,3_nigga_bitch_molly_pussy,"[nigga, bitch, molly, pussy, girl, ain, say, w...",[baby it s your world ain t it uh baby it s yo...
5,4,33,4_rich_riches_money_cash,"[rich, riches, money, cash, spend, dollars, pa...",[big white mansion in my habitat habitat smoke...
6,5,31,5_angel_angels_hallelujah_sing,"[angel, angels, hallelujah, sing, choirs, sing...",[you re all i need to know tonight you re my a...
7,6,29,6_santa_christmas_rudolph_reindeer,"[santa, christmas, rudolph, reindeer, claus, s...",[but last christmas i gave you my heart this y...
8,7,22,7_girls_girl_wanna_shorty,"[girls, girl, wanna, shorty, tatted, laid, wan...",[i m tryna pick the right one tryna pick the r...
9,8,20,8_pitches_pitching_braves_bat,"[pitches, pitching, braves, bat, play, run, pr...",[this game is different you only get one shot ...


In [45]:
# Visualization of topics' distribution
topic_model.visualize_documents(docs, embeddings=embeds)

### We can observe that songs containing love lyrics (orange) and slang lyrics (green) dominate the lyrics in our sample. The grey points represent outliers that the topic model was not able to classify.

In [65]:
# Getting prediction: assign a topic to every chorus by running `transform` on
# the same documents and embeddings the model was fitted with.
# NOTE(review): labels obtained this way may not match `topic_model.topics_`
# from fitting exactly -- confirm which assignment is intended for the analysis.
topics, probs = topic_model.transform(docs, embeddings=embeds)

2024-03-12 21:11:36,304 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-03-12 21:11:36,358 - BERTopic - Dimensionality - Completed ✓
2024-03-12 21:11:36,359 - BERTopic - Clustering - Approximating new points with `hdbscan_model`


2024-03-12 21:11:36,861 - BERTopic - Cluster - Completed ✓


In [66]:
# Assigning topics into doc_df dataframe.
doc_df["Topics"] = topics

In [68]:
# Saving the final version of topic modeling
doc_df.to_csv("FINAL_TOPICS_20240312.csv")

In [55]:
# Loading the hot 100 charts.
bilboard_df = pd.read_csv("../Billboard_Lists_1960-01-01_2024-02-23.csv")

In [57]:
# Converting the dtype of 'Week' as datetime
bilboard_df["Week"] = pd.to_datetime(bilboard_df["Week"])

In [58]:
# Filtering the songs since 1990.
bilboard_df2 = bilboard_df[bilboard_df["Week"]>='1990-01-01'].reset_index(drop=True)

In [47]:
# SAVING Topic Model
#topic_model.save("Topic_Model_20240312")



In [73]:
# Left-join the Hot 100 chart rows (since 1990) with the per-song topic
# assignments on (Artist_Name, Song). Chart rows without a matching song keep
# NaN in the columns that come from doc_df.
# NOTE(review): assumes (Artist_Name, Song) is unique in doc_df; duplicates
# would multiply the matching chart rows -- confirm.
merged_df = pd.merge(bilboard_df2, doc_df[["Artist_Name","Song","Chorus","Tokenized_Chorus", "Topics"]], on = ["Artist_Name","Song"], how="left")

In [78]:
# Keep only the chart rows that were matched to a song with an assigned topic.
# The drop is restricted to the `Topics` column so that a missing value in an
# unrelated chart column does not remove an otherwise usable row.
merged_df2 = merged_df.dropna(subset=["Topics"]).reset_index(drop=True)

In [89]:
# Translate each numeric topic id into the label stored in the fitted model's
# `topic_labels_` mapping and keep it in a new `Topic_Labels` column.
merged_df2["Topic_Labels"] = merged_df2["Topics"].map(topic_model.topic_labels_)

In [95]:
# Count the chart rows per (Week, Topic_Labels). After this step the `Topics`
# column no longer holds a topic id: it holds the number of chart rows that
# carry the given topic label in the given week.
merged_df3 = merged_df2.groupby(["Week","Topic_Labels"], as_index=False)["Topics"].count()

In [115]:
# Getting topic proportions of each week: divide every per-topic count by the
# total count of its week. The aggregation is named by string ("sum") because
# passing the Python builtin `sum` to `transform` is deprecated in pandas.
merged_df3["Total_Topics_by_Week"] = merged_df3.groupby("Week")["Topics"].transform("sum")
merged_df3["Topics_Ratio_per_Week"] = merged_df3["Topics"] / merged_df3["Total_Topics_by_Week"]

In [103]:
# Getting year info from 'Week' column to group data based on year.
merged_df3["Year"] = merged_df3["Week"].dt.year

In [106]:
# Aggregate the weekly per-topic counts to yearly totals per (Year, Topic_Labels).
# `Topics` in merged_df3 is already a count per week, so the weekly values are
# summed; counting them would only give the number of weeks in which a topic
# appeared, not how many chart rows it had.
merged_df4 = merged_df3.groupby(["Year","Topic_Labels"], as_index=False)["Topics"].sum()

In [112]:
# Getting topic proportions of each year: divide every per-topic value by the
# total of its year. The aggregation is named by string ("sum") because
# passing the Python builtin `sum` to `transform` is deprecated in pandas.
merged_df4["Total_Topics_by_Year"] = merged_df4.groupby("Year")["Topics"].transform("sum")
merged_df4["Topics_Ratio_per_Year"] = merged_df4["Topics"] / merged_df4["Total_Topics_by_Year"]

In [118]:
# Saving the topic modeling info to make prompt engineering for finding appropriate topic names.
#topic_model.get_topic_info().to_csv("INFO_Topic_Model_20240312.csv")

In [120]:
# Load the topic-info table after custom topic names were obtained from ChatGPT
# (prompt engineering); the custom names live in the "Topic Name" column.
topic_table = pd.read_csv(filepath_or_buffer="INFO_Topic_Model_Updated_wPromptEng_20240312.csv")

In [121]:
# Display the topic table, including the custom "Topic Name" column.
topic_table

Unnamed: 0.1,Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Contains_Explicit,Topic Name
0,0,-1,4605,-1_nigga_niggas_baby_wanna,"['nigga', 'niggas', 'baby', 'wanna', 'ain', 's...",['girl you know i can provide yeah you can get...,True,Urban Tales
1,1,0,2807,0_love_wanna_stay_don,"['love', 'wanna', 'stay', 'don', 'babe', 'leav...",['you might also like because your love becaus...,False,Romantic Reflections
2,2,1,1639,1_niggas_nigga_lil_ain,"['niggas', 'nigga', 'lil', 'ain', 'yo', 'gang'...",['i put one leg on the headboard and leave the...,True,Hustle & Flow
3,3,2,368,2_dreamer_dreams_dream_heaven,"['dreamer', 'dreams', 'dream', 'heaven', 'pray...",['dream on dreamer life gets in your way then ...,False,Dreamscapes & Reality
4,4,3,270,3_nigga_bitch_molly_pussy,"['nigga', 'bitch', 'molly', 'pussy', 'girl', '...",['baby it s your world ain t it uh baby it s y...,True,Explicit Affairs
5,5,4,33,4_rich_riches_money_cash,"['rich', 'riches', 'money', 'cash', 'spend', '...",['big white mansion in my habitat habitat smok...,False,Wealth & Glamour
6,6,5,31,5_angel_angels_hallelujah_sing,"['angel', 'angels', 'hallelujah', 'sing', 'cho...",['you re all i need to know tonight you re my ...,False,Spiritual Journeys
7,7,6,29,6_santa_christmas_rudolph_reindeer,"['santa', 'christmas', 'rudolph', 'reindeer', ...",['but last christmas i gave you my heart this ...,False,Festive Vibes
8,8,7,22,7_girls_girl_wanna_shorty,"['girls', 'girl', 'wanna', 'shorty', 'tatted',...",['i m tryna pick the right one tryna pick the ...,False,Feminine Charms
9,9,8,20,8_pitches_pitching_braves_bat,"['pitches', 'pitching', 'braves', 'bat', 'play...",['this game is different you only get one shot...,False,Athletic Aspirations


In [133]:
# Build the lookup {topic id -> custom topic name} from the topic table.
topic_custom_map = dict(zip(topic_table["Topic"], topic_table["Topic Name"]))

In [134]:
# Register the custom names as the topic labels of the BERTopic model.
topic_model.set_topic_labels(topic_labels=topic_custom_map)

In [139]:
# Saving custom final version of the model. THIS IS THE MODEL VERSION THAT WE NEED TO USE WHEN WE MAKE PREDICTION.
#topic_model.save("Custom_Topic_Model_20240312")



In [140]:
# Restore the model with custom labels from disk.
# The save call for this path is currently commented out in the notebook, so the
# file must already exist from an earlier run.
loaded_model = BERTopic.load(path="Custom_Topic_Model_20240312")

In [167]:
# Word bar charts for the topics, titled with the custom labels. The number of
# topics requested equals the number of entries in the model's label mapping.
fig = loaded_model.visualize_barchart(
    top_n_topics=len(loaded_model.topic_labels_),
    custom_labels=True,
    height=150,
    width=300,
)
fig.show()

### We can observe the most frequent words in each topic above. While Romantic Reflections is dominated by love-related words, Hustle & Flow is dominated by slang.

In [143]:
# Getting info of final model version.
# The returned table is used below for its "Name" and "CustomName" columns.
custom_topic_info_df = loaded_model.get_topic_info()

In [146]:
# Build the lookup {default topic name -> custom topic name} from the model info.
custom_name_map = dict(zip(custom_topic_info_df["Name"], custom_topic_info_df["CustomName"]))

In [150]:
# Applying the custom topic names to our main predicted topic data.
# Looks up each default label in "Topic_Labels" in custom_name_map and stores
# the result in a new "Topic_Name" column.
merged_df3["Topic_Name"] = merged_df3["Topic_Labels"].map(custom_name_map)

In [151]:
# Display the weekly per-topic table with the custom "Topic_Name" column attached.
merged_df3

Unnamed: 0,Week,Topic_Labels,Topics,Year,Total_Topics_by_Week,Topics_Ratio_per_Week,Topic_Name
0,1990-01-05,-1_nigga_niggas_baby_wanna,20,1990,41,0.487805,Urban Tales
1,1990-01-05,0_love_wanna_stay_don,15,1990,41,0.365854,Romantic Reflections
2,1990-01-05,11_blame_blaming_fault_mistakes,1,1990,41,0.024390,Guilt & Redemption
3,1990-01-05,15_unknown_deep_sail_wind,1,1990,41,0.024390,Mystic Explorations
4,1990-01-05,2_dreamer_dreams_dream_heaven,2,1990,41,0.048780,Dreamscapes & Reality
...,...,...,...,...,...,...,...
10227,2024-02-23,12_fight_fightin_youth_screamin,1,2024,86,0.011628,Youthful Rebellion
10228,2024-02-23,1_niggas_nigga_lil_ain,19,2024,86,0.220930,Hustle & Flow
10229,2024-02-23,2_dreamer_dreams_dream_heaven,2,2024,86,0.023256,Dreamscapes & Reality
10230,2024-02-23,3_nigga_bitch_molly_pussy,4,2024,86,0.046512,Explicit Affairs


In [154]:
# Weekly share of each topic over time, one line per custom topic name.
import plotly.express as px

fig = px.line(
    data_frame=merged_df3,
    x="Week",
    y="Topics_Ratio_per_Week",
    color="Topic_Name",
).update_layout(
    # Human-readable axis titles instead of the raw column names.
    xaxis_title="Week",
    yaxis_title="Topic Ratio per Week",
)
fig.show()

### Following the red line, we can observe that interest in romantic songs is decreasing over time, while interest in offensive choruses (green line) is increasing.

In [187]:
# Final visual check: plot the chorus documents with the loaded model, using
# the embeddings loaded from disk and the custom topic labels.
fig = loaded_model.visualize_documents(
    docs=docs,
    embeddings=embeds,
    custom_labels=True,
    width=1300,
    height=650,
)
fig.show()

### Above, we can examine the distribution of the topic clusters after assigning appropriate topic names with the help of ChatGPT.

In [211]:
# Coherence score as a function of the number of topics.
fig = px.line(data_frame=opt_table, x="Nr_Topics", y="Coherence_Score")

# Axis titles plus fixed axis windows (10 to 19 topics, scores 0.4 to 0.7).
fig.update_layout(xaxis_title="Number of Topics", yaxis_title="Coherence Score")
fig.update_xaxes(range=[10, 19])
fig.update_yaxes(range=[0.4, 0.7])

# Annotate every point with its score rounded to two decimals.
for row in opt_table[["Nr_Topics", "Coherence_Score"]].itertuples(index=False):
    fig.add_annotation(
        x=row.Nr_Topics,
        y=row.Coherence_Score,
        text=str(round(row.Coherence_Score, 2)),
        showarrow=True,
        arrowhead=1,
    )

fig.show()

### After the hyperparameter optimization process, the coherence score metric (higher is better) indicates that the most appropriate number of topics is 17.