## BERTopic

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import os

import nltk
from nltk.corpus import PlaintextCorpusReader

In [None]:
import preprocess2

# Load the 'sephora_corpus' corpus through the project helper module and
# convert it to per-document items.
reviews_corpus = preprocess2.load_corpus('sephora_corpus')
reviews_docs = preprocess2.corpus2docs(reviews_corpus)

# Each document is re-joined with single spaces so it can be handed to a
# vectorizer as one string (presumably each entry is a list of tokens --
# confirm against preprocess2.corpus2docs).
reviews_docs_joined = list(map(" ".join, reviews_docs))

In [None]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# Baseline run: BERTopic with unigram topic representations, fitted on the
# full set of joined review documents.
topic_model = BERTopic(n_gram_range=(1, 1), verbose=True)
topics, _ = topic_model.fit_transform(reviews_docs_joined)

# Number of rows in the topic overview table (shown as the cell output).
len(topic_model.get_topic_info())

In [None]:
import numpy as np
from bertopic import BERTopic
from gensim import corpora
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
from hdbscan import HDBSCAN
from umap import UMAP
import gensim.corpora as corpora
import pandas as pd
import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Grid search over BERTopic settings, scored by c_v topic coherence.
#
# Three parameters are varied:
#   * k            -> passed to BERTopic as `nr_topics` (target topic count)
#   * n_neighbors  -> UMAP neighbourhood size
#   * n_components -> UMAP output dimensionality
# One row per combination is appended to `model_results`.

# Define the range of parameters
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Dimensionality Reduction Parameters
n_neighbors_range = [5, 10, 15]
n_components_range = [3, 5, 7]

model_results = {'Topics': [],
                 'N_Neighbors': [],
                 'N_Components': [],
                 'Coherence': []
                }

# Tune on a random 10% sample of the documents to keep the search tractable.
# `.tolist()` turns the sampled array back into a plain list of Python strings.
subset_size = int(len(reviews_docs_joined) * 0.1)
subset_docs = np.random.choice(reviews_docs_joined, subset_size, replace=False).tolist()

# The sentence-embedding model and the document embeddings do not depend on
# any of the tuned parameters, so load/compute them once instead of once per
# grid point.
embedding_model = SentenceTransformer('all-mpnet-base-v2')
embeddings = embedding_model.encode(subset_docs, show_progress_bar=False)

# Full Cartesian product of the three parameter ranges.
param_grid = [
    (k, n_neighbors, n_components)
    for k in topics_range
    for n_neighbors in n_neighbors_range
    for n_components in n_components_range
]

# Can take a long time to run
for k, n_neighbors, n_components in tqdm.tqdm(param_grid):
    # Per-run components; only UMAP actually varies with the grid point.
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, metric='manhattan', low_memory=True)
    hdbscan_model = HDBSCAN()
    vectorizer_model = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    representation_model = KeyBERTInspired()

    # `nr_topics=k` makes the recorded 'Topics' value an actual model setting.
    # NOTE(review): BERTopic can only reduce the number of topics, so runs
    # where HDBSCAN finds fewer than k topics keep the smaller count -- confirm
    # this is the intended meaning of the 'Topics' column.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        nr_topics=k,
    )

    # Fit on the sample, reusing the precomputed embeddings.
    topics, _ = topic_model.fit_transform(subset_docs, embeddings)

    # Concatenate all documents assigned to the same topic into one text.
    documents = pd.DataFrame(
        {"Document": subset_docs,
         "ID": range(len(subset_docs)),
         "Topic": topics}
    )
    documents_per_topic = documents.groupby(
        ['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(
        documents_per_topic.Document.values)

    # Tokenise with the fitted model's own analyzer so that the coherence
    # vocabulary matches the one the topic words were drawn from.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Use the topic ids that actually occur, excluding the outlier topic -1.
    # (Counting `len(set(topics)) - 1` drops a real topic whenever no
    # document was labelled as an outlier.)
    topic_ids = sorted(set(topics) - {-1})
    topic_words = []
    for topic_id in topic_ids:
        # Keep only words known to the dictionary; unknown or empty padding
        # entries would make CoherenceModel fail.
        words = [
            word
            for word, _ in topic_model.get_topic(topic_id)
            if word in dictionary.token2id
        ]
        if words:
            topic_words.append(words)

    # Calculate coherence (NaN when there is no usable topic to score).
    if topic_words:
        coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model.get_coherence()
    else:
        coherence_score = float('nan')

    # Save the model results
    model_results['Topics'].append(k)
    model_results['N_Neighbors'].append(n_neighbors)
    model_results['N_Components'].append(n_components)
    model_results['Coherence'].append(coherence_score)

In [None]:
# Write the collected grid-search rows to a CSV file without the index column.
pd.DataFrame.from_dict(model_results).to_csv(
    './bertopic_tuning_results.csv',
    index=False,
)

In [None]:
## Clustering

In [None]:
import numpy as np # linear algebra
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from bs4 import BeautifulSoup
from scipy.stats import multivariate_normal as mvn
import nltk
import os
import random


import string
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [None]:
import preprocess2

# Load the 'corpus_lemma' corpus and rebuild one space-joined string per
# document for the vectorizer.
reviews_corpus = preprocess2.load_corpus('corpus_lemma')
reviews_docs = preprocess2.corpus2docs(reviews_corpus)
reviews_docs_joined = list(map(" ".join, reviews_docs))

# TF-IDF features over unigrams. The token pattern accepts tokens of one or
# more word characters, English stop words are removed, and the vocabulary is
# capped at 1000 features (adjust max_features as needed).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 1),
    stop_words='english',
    max_features=1000,
)

# Learn the vocabulary and produce the document-term matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews_docs_joined)

In [None]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# Elbow method: fit k-means for a range of cluster counts on a 2-D
# TruncatedSVD projection of the TF-IDF matrix and plot the inertia of each
# fit against the number of clusters.
# NOTE(review): the curve is computed on the 2-D projection, whereas the final
# model in the next cell is fitted on the full `tfidf_matrix` -- confirm the
# chosen k carries over.
n_components = 2  # Number of components for TruncatedSVD
n_clusters_range = range(1, 10)  # Adjust the range as needed

svd = TruncatedSVD(n_components=n_components, random_state=0)
Y_svd = svd.fit_transform(tfidf_matrix)

# A fixed seed makes the elbow curve reproducible between runs, in line with
# the seeded SVD above.
kmeans = [KMeans(n_clusters=i, max_iter=600, random_state=0) for i in n_clusters_range]

# Inertia (within-cluster sum of squared distances) of each fitted model.
score = [model.fit(Y_svd).inertia_ for model in kmeans]

plt.plot(n_clusters_range, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

In [None]:
# Step 2: Number of clusters to extract (tune this for your problem).
k = 3

# Step 3: Seeded k-means fitted directly on the TF-IDF matrix.
kmeans = KMeans(random_state=0, n_clusters=k)

# `fit` returns the estimator itself, so the per-document cluster labels can
# be read straight off the fitted model.
cluster_assignments = kmeans.fit(tfidf_matrix).labels_

In [None]:
# 4.1. Dump every document, grouped by the cluster it was assigned to.
for cluster_id in range(k):
    print(f"Cluster {cluster_id} documents:")
    # Positions of the documents whose label equals this cluster id.
    member_positions = np.flatnonzero(cluster_assignments == cluster_id)
    for position in member_positions:
        print(reviews_docs_joined[position])
    print("\n")

In [None]:
# 4.2. Analyze cluster centroids
# Centroid coordinates of the fitted k-means model, one row per cluster.
# Since the model was fitted on the TF-IDF matrix, each column corresponds to
# one vocabulary term.
cluster_centroids = kmeans.cluster_centers_

In [None]:
# 4.3. Show the ten highest-weighted vocabulary terms of each centroid.
terms = tfidf_vectorizer.get_feature_names_out()

# Column indices of every centroid sorted by weight, largest first
# (ascending argsort with the column order reversed).
order_centroids = cluster_centroids.argsort()[:, ::-1]

for cluster_id in range(k):
    print(f"Cluster {cluster_id} top terms:")
    leading_columns = order_centroids[cluster_id, :10]
    top_terms = [terms[column] for column in leading_columns]
    print(top_terms)
    print("\n")

In [None]:
# 4.4. Evaluate the clustering quality.
# The silhouette score is computed from the feature matrix and the predicted
# cluster labels only, so no ground-truth labels are required for it.
# (Completeness and homogeneity, by contrast, do need true labels.)
from sklearn.metrics import silhouette_score

silhouette_avg = silhouette_score(X=tfidf_matrix, labels=cluster_assignments)
print(f"Silhouette Score: {silhouette_avg}")