## BERTopic

In [1]:
import numpy as np
from bertopic import BERTopic
from gensim import corpora
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
from hdbscan import HDBSCAN
from umap import UMAP
import gensim.corpora as corpora
import pandas as pd
import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

import pandas as pd
import numpy as np
import os

import nltk
from nltk.corpus import PlaintextCorpusReader

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
import preprocess2
reviews_corpus = preprocess2.load_corpus('sephora_corpus')
reviews_docs = preprocess2.corpus2docs(reviews_corpus)


# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(reviews_docs, min_count=5, threshold=100) # higher threshold fewer phrases. 

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# data_words_bigrams = make_bigrams(reviews_docs)

reviews_docs_joined = [" ".join(x) for x in reviews_docs]

In [3]:
from keybert import KeyBERT
# Extract keywords
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(reviews_docs_joined )

# Create our vocabulary
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))

KeyboardInterrupt: 

In [None]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
vectorizer_model= CountVectorizer(vocabulary=vocabulary)
embedding_model = SentenceTransformer('all-mpnet-base-v2')
embeddings = embedding_model.encode(reviews_docs_joined, show_progress_bar=False)
representation_model = KeyBERTInspired() #to reduce the appearance of stop words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

In [None]:
from sklearn.decomposition import PCA


def rescale(x, inplace=False):
    """ Rescale an embedding so optimization will not have convergence issues.
    """
    if not inplace:
        x = np.array(x, copy=True)

    x /= np.std(x[:, 0]) * 10000

    return x


# Initialize and rescale PCA embeddings
pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))

In [None]:
# Dimensionality Reduction Parameters
n_neighbors_range = [5, 10, 15]
n_components_range = [3, 5, 7]

# Define the range of parameters for clustering
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples = 10, metric='euclidean', prediction_data=True)
kmeans_model = KMeans(n_clusters=7)

cluster_range = range(hdbscan_model, kmeans_model)

# Define the range of parameters for ngrams
unigram = (1, 1)
bigram = (1, 2)
ngram_range = range(unigram, bigram)

model_results = {'N_Neighbors': [],
                 'N_Components': [],
                 'Clustering_methods': [],
                 'Ngram_range': [],
                 'Silhouette_score': []
                }

# Define the size of the subset (10%)
subset_size = int(len(reviews_docs_joined) * 0.3)
subset_docs = np.random.choice(reviews_docs_joined, subset_size, replace=False)

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(n_neighbors_range) *len(n_components_range) * len(cluster_range) * len(ngram_range)))

for n_neighbors in n_neighbors_range:
    for n_components in n_components_range:
        for cluster in cluster_range:
            for ngram in ngram_range:
                umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, low_memory=True, min_dist=0.0, metric="cosine", init=pca_embeddings,)
                # Create the BERTopic model
                topic_model = BERTopic(embedding_model=embedding_model, umap_model= umap_model, hdbscan_model=cluster, n_gram_range = ngram, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
                
                print(f'N_Neighbors: {n_neighbors}')
                print(f'N_Components: {n_components}')
                print(f'Clustering_methods: {cluster}')
                print(f'Ngram_range: {ngram}')

            # Fit the model on your subset of documents
                topics, _ = topic_model.fit_transform(subset_docs)
                
                # Preprocess documents
                documents = pd.DataFrame(
                    {"Document": subset_docs,
                    "ID": range(len(subset_docs)),
                    "Topic": topics}
                )
                documents_per_topic = documents.groupby(
                    ['Topic'], as_index=False).agg({'Document': ' '.join})
                cleaned_docs = topic_model._preprocess_text(
                    documents_per_topic.Document.values)
                
                # Extract vectorizer and analyzer from the fitted model
                vectorizer_model = topic_model.vectorizer_model
                analyzer = vectorizer_model.build_analyzer()
                
                # Extract features for topic coherence evaluation
                tokens = [analyzer(doc) for doc in cleaned_docs]
                dictionary = corpora.Dictionary(tokens)
                corpus = [dictionary.doc2bow(token) for token in tokens]
                topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics)) - 1)]
                
                # Calculate coherence
                coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='c_v')
                coherence_score = coherence_model.get_coherence()
                print(f'Coherence score: {coherence_score}')

                # Get list of topics
                print(topic_model.get_topic_info())
                
                # Save the model results
                model_results['N_Neighbors'].append(n_neighbors)
                model_results['N_Components'].append(n_components)
                model_results['Clustering_methods'].append(cluster)
                model_results['Ngram_range'].append(ngram)
                model_results['Coherence'].append(coherence_score)

                pbar.update(1)
    pbar.close()

IndentationError: unexpected indent (2534787867.py, line 40)

In [6]:
pd.DataFrame(model_results).to_csv('./bertopic_tuning_results.csv', index=False)