## BERTopic

In [2]:
import numpy as np
from bertopic import BERTopic
from gensim import corpora
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
from hdbscan import HDBSCAN
from umap import UMAP
import gensim.corpora as corpora
import pandas as pd
import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import os

import nltk
from nltk.corpus import PlaintextCorpusReader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import preprocess2
reviews_corpus = preprocess2.load_corpus('sephora_corpus')
reviews_docs = preprocess2.corpus2docs(reviews_corpus)


# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(reviews_docs, min_count=5, threshold=100) # higher threshold fewer phrases. 

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# data_words_bigrams = make_bigrams(reviews_docs)

reviews_docs_joined = [" ".join(x) for x in reviews_docs]

In [4]:
from keybert import KeyBERT
# Extract keywords
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(reviews_docs_joined )

# Create our vocabulary
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))

In [5]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from sklearn.cluster import KMeans

In [6]:
vectorizer_model= CountVectorizer(vocabulary=vocabulary)
embedding_model = SentenceTransformer('all-mpnet-base-v2')
embeddings = embedding_model.encode(reviews_docs_joined, show_progress_bar=False)
representation_model = KeyBERTInspired() #to reduce the appearance of stop words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) #to reduce the impact of frequent words

In [8]:

 # Dimensionality Reduction Parameters
n_neighbors_range = [5, 10, 15]
n_components_range = [3, 5, 7]

# Define the range of parameters for clustering
hdbscan_model = HDBSCAN()
kmeans_model = KMeans(n_clusters=7)

cluster_range = [hdbscan_model, kmeans_model]

# Define the range of parameters for ngrams
unigram = (1, 1)
bigram = (1, 2)
ngram_range = [unigram, bigram]

model_results = {'N_Neighbors': [],
                 'N_Components': [],
                 'Clustering_methods': [],
                 'Ngram_range': [],
                 'Coherence': []
                }

# Define the size of the subset (10%)
subset_size = int(len(reviews_docs_joined) * 0.3)
subset_docs = np.random.choice(reviews_docs_joined, subset_size, replace=False)

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(n_neighbors_range) *len(n_components_range) * len(cluster_range) * len(ngram_range)))

for n_neighbors in n_neighbors_range:
    for n_components in n_components_range:
        for cluster in cluster_range:
            for ngram in ngram_range:
                umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, low_memory=True, min_dist=0.0, metric="cosine")
                # Create the BERTopic model
                topic_model = BERTopic(embedding_model=embedding_model, umap_model= umap_model, hdbscan_model=cluster, n_gram_range = ngram, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
                
                print(f'N_Neighbors: {n_neighbors}')
                print(f'N_Components: {n_components}')
                print(f'Clustering_methods: {cluster}')
                print(f'Ngram_range: {ngram}')

            # Fit the model on your subset of documents
                topics, _ = topic_model.fit_transform(subset_docs)
                
                # Preprocess documents
                documents = pd.DataFrame(
                    {"Document": subset_docs,
                    "ID": range(len(subset_docs)),
                    "Topic": topics}
                )
                documents_per_topic = documents.groupby(
                    ['Topic'], as_index=False).agg({'Document': ' '.join})
                cleaned_docs = topic_model._preprocess_text(
                    documents_per_topic.Document.values)
                
                # Extract vectorizer and analyzer from the fitted model
                vectorizer_model = topic_model.vectorizer_model
                analyzer = vectorizer_model.build_analyzer()
                
                # Extract features for topic coherence evaluation
                tokens = [analyzer(doc) for doc in cleaned_docs]
                dictionary = corpora.Dictionary(tokens)
                corpus = [dictionary.doc2bow(token) for token in tokens]
                topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics)) - 1)]
                
                # Calculate coherence
                coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='c_v')
                coherence_score = coherence_model.get_coherence()
                print(f'Coherence score: {coherence_score}')

                # Get list of topics
                print(topic_model.get_topic_info())
                
                # Save the model results
                model_results['N_Neighbors'].append(n_neighbors)
                model_results['N_Components'].append(n_components)
                model_results['Clustering_methods'].append(cluster)
                model_results['Ngram_range'].append(ngram)
                model_results['Coherence'].append(coherence_score)

                pbar.update(1)
    pbar.close()

  0%|          | 0/36 [14:10<?, ?it/s]


N_Neighbors: 5
N_Components: 3
Clustering_methods: HDBSCAN()
Ngram_range: (1, 1)


  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3754327050489134
     Topic  Count                                              Name  \
0       -1  10484      -1_moisturizes_concealer_regimen_niacinamide   
1        0    951            0_lipstick_lipglosses_lipgloss_tasting   
2        1    712                 1_mask_masks_maskne_moisturizermy   
3        2    394                      2_tanning_tan_tanned_tanluxe   
4        3    267     3_mascara_makeupespecially_makeupnot_eyeliner   
..     ...    ...                                               ...   
793    792      5     792_moisturizing_moisturizer_skin_brightening   
794    793      5       793_cleanser_exfoliator_moisturizer_cleanse   
795    794      5                     794_skincare_serum_skin_toner   
796    795      5  795_moisturize_moisturizer_moisturizing_skincare   
797    796      5     796_exfoliant_exfoliants_exfoliating_cleanser   

                                        Representation  \
0    [moisturizes, concealer, regimen, niacinamide,..

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3786153402708534
     Topic  Count                                               Name  \
0       -1  10215    -1_moisturiser_moisturizes_moisturize_concealer   
1        0    696                           0_mask_masks_maskne_nose   
2        1    430  1_serummoisturizer_serumy_serumessence_serumgreat   
3        2    400             2_nuface_productsjust_device_worthless   
4        3    378                    3_tanning_tan_tanned_tanologist   
..     ...    ...                                                ...   
821    820      5           820_moisturizer_moisturized_skin_product   
822    821      5              821_serum_skincare_retinol_moisturize   
823    822      5                 822_skincare_moisturizer_skin_acne   
824    823      5              823_retinol_serum_skincare_hyaluronic   
825    824      5            824_moisturizing_moisturizer_cream_skin   

                                        Representation  \
0    [moisturiser, moisturizes, moisturiz

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.42944047351199527
   Topic  Count                                              Name  \
0      0   7376  0_moisturizer_moisturizing_moisturizers_skincare   
1      1   5894                 1_acne_skincare_serum_moisturizer   
2      2   4081               2_moisturizer_fragrance_serum_cream   
3      3   3494                3_cleanser_skincare_balm_exfoliant   
4      4   2770               4_sunscreen_moisturizer_tan_tanning   
5      5   2362               5_concealer_moisturizing_cream_balm   
6      6    950             6_lipstick_lipgloss_balm_moisturizing   

                                      Representation  \
0  [moisturizer, moisturizing, moisturizers, skin...   
1  [acne, skincare, serum, moisturizer, retinol, ...   
2  [moisturizer, fragrance, serum, cream, product...   
3  [cleanser, skincare, balm, exfoliant, soap, la...   
4  [sunscreen, moisturizer, tan, tanning, lotion,...   
5  [concealer, moisturizing, cream, balm, gel, ap...   
6  [lipstick, lipg

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.46738941181735627
   Topic  Count                                              Name  \
0      0   7654  0_skincare_moisturizer_moisturizing_moisturizers   
1      1   5632              1_serum_skincare_moisturizer_retinol   
2      2   4820                        2_acne_skin_pimple_product   
3      3   2937      3_cleanser_exfoliant_exfoliation_exfoliating   
4      4   2558            4_sunscreen_moisturizer_tanning_lotion   
5      5   2329              5_moisturizing_serum_concealer_cream   
6      6    997                6_lipstick_lipgloss_chapstick_balm   

                                      Representation  \
0  [skincare, moisturizer, moisturizing, moisturi...   
1  [serum, skincare, moisturizer, retinol, wrinkl...   
2  [acne, skin, pimple, product, redness, face, b...   
3  [cleanser, exfoliant, exfoliation, exfoliating...   
4  [sunscreen, moisturizer, tanning, lotion, make...   
5  [moisturizing, serum, concealer, cream, balm, ...   
6  [lipstick, lipg

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3752265335377597
     Topic  Count                                               Name  \
0       -1  10452            -1_concealer_moisturize_perfume_essence   
1        0    968                0_lipstick_lipglosses_lipgloss_balm   
2        1    705                       1_mask_masks_maskne_charcoal   
3        2    322  2_moisturizersunscreen_clarisonic_moisterizing...   
4        3    298                    3_tanning_tan_tanned_tanologist   
..     ...    ...                                                ...   
818    817      5       817_moisturizer_moisturizing_concealer_cream   
819    818      5                818_cleanser_makeup_rinsing_cleanse   
820    819      5          819_applicator_promo_advertised_packaging   
821    820      5    820_makeup_moisturizering_moisturizer_fragrance   
822    821      5                821_lotion_cream_regimen_ingredient   

                                        Representation  \
0    [concealer, moisturize, perfume, ess

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3746721927151484
     Topic  Count                                               Name  \
0       -1  10167        -1_moisturizes_moisturize_concealer_perfume   
1        0    975              0_lipstick_lipglosses_lipgloss_flavor   
2        1    699                       1_masks_mask_maskne_skinplus   
3        2    399                    2_tanning_tan_tanned_tanologist   
4        3    324       3_app_productsjust_appresults_freeproductgot   
..     ...    ...                                                ...   
819    818      5           818_sunscreen_moisturizer_sunburned_oily   
820    819      5  819_moisturizer_moisturizing_moisturizeri_mois...   
821    820      5              820_skincare_serum_product_ingredient   
822    821      5             821_cleanser_cleanseri_skincare_lather   
823    822      5      822_moisturize_complexion_moisturized_sephora   

                                        Representation  \
0    [moisturizes, moisturize, concealer,

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.45904194454746494
   Topic  Count                                              Name  \
0      0   7110  0_moisturizing_moisturizer_skincare_moisturizers   
1      1   7037              1_serum_moisturizer_skincare_retinol   
2      2   3923             2_moisturizer_cream_fragrance_product   
3      3   2931       3_cleanser_exfoliator_exfoliant_exfoliating   
4      4   2654                4_sunscreen_tan_moisturizer_lotion   
5      5   2287               5_moisturizing_concealer_serum_balm   
6      6    985                6_lipstick_lipgloss_chapstick_balm   

                                      Representation  \
0  [moisturizing, moisturizer, skincare, moisturi...   
1  [serum, moisturizer, skincare, retinol, acne, ...   
2  [moisturizer, cream, fragrance, product, serum...   
3  [cleanser, exfoliator, exfoliant, exfoliating,...   
4  [sunscreen, tan, moisturizer, lotion, tanning,...   
5  [moisturizing, concealer, serum, balm, cream, ...   
6  [lipstick, lipg

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.46005462253767465
   Topic  Count                                              Name  \
0      0   8895  0_moisturizer_skincare_moisturizers_moisturizing   
1      1   4637                     1_acne_skincare_serum_retinol   
2      2   4415                 2_serum_moisturizer_product_cream   
3      3   3140             3_cleanser_skincare_exfoliator_makeup   
4      4   2518            4_sunscreen_moisturizer_lotion_tanning   
5      5   2358              5_concealer_serum_moisturizing_cream   
6      6    964             6_lipstick_balm_lipgloss_moisturizing   

                                      Representation  \
0  [moisturizer, skincare, moisturizers, moisturi...   
1  [acne, skincare, serum, retinol, pimple, skin,...   
2  [serum, moisturizer, product, cream, fragrance...   
3  [cleanser, skincare, exfoliator, makeup, balm,...   
4  [sunscreen, moisturizer, lotion, tanning, make...   
5  [concealer, serum, moisturizing, cream, balm, ...   
6  [lipstick, balm

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3795991669841959
     Topic  Count                                               Name  \
0       -1   9964      -1_concealer_complexion_acneprone_exfoliating   
1        0    983               0_lipstick_lipglosses_tasting_flavor   
2        1    691  1_moisturizerreally_skincareovernight_maskne_mask   
3        2    396                    2_tanning_tan_tanned_tanologist   
4        3    300  3_moisturizersunscreen_moisterizing_clarisonic...   
..     ...    ...                                                ...   
832    831      5            831_skincare_fragrance_perfume_perfumed   
833    832      5          832_moisturizer_moisturizing_lotion_serum   
834    833      5                    833_serum_skincare_product_skin   
835    834      5               834_sunscreen_skin_concealer_peeling   
836    835      5                  835_skin_product_tingle_sensitive   

                                        Representation  \
0    [concealer, complexion, acneprone, e

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.38450955096498074
     Topic  Count                                               Name  \
0       -1  10362          -1_concealer_lotion_acneprone_formulation   
1        0    985                 0_lipstick_lipglosses_lipgloss_lip   
2        1    710                          1_mask_masks_maskne_maski   
3        2    356      2_mascara_makeupespecially_makeupnot_eyeliner   
4        3    306         3_retinoid_retinol_retinoids_retinoidsacne   
..     ...    ...                                                ...   
789    788      5                           788_eye_cream_gel_eyesit   
790    789      5                   789_fragrance_year_sephora_serum   
791    790      5  790_moisturizer_moisturizing_moisturized_fragr...   
792    791      5             791_moisturizer_moisturizing_acne_skin   
793    792      5   792_skincare_serum_makeup_spotshyperpigmentation   

                                        Representation  \
0    [concealer, lotion, acneprone, form

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.44550548147605357
   Topic  Count                                              Name  \
0      0   7171              0_serum_skincare_moisturizer_retinol   
1      1   6857  1_skincare_moisturizer_moisturizers_moisturizing   
2      2   3950                 2_serum_moisturizer_cream_product   
3      3   3156          3_skincare_cleanser_exfoliator_exfoliant   
4      4   2604                4_sunscreen_moisturizer_tan_lotion   
5      5   2188              5_moisturizing_serum_concealer_cream   
6      6   1001                      6_lipstick_lipgloss_balm_lip   

                                      Representation  \
0  [serum, skincare, moisturizer, retinol, vitami...   
1  [skincare, moisturizer, moisturizers, moisturi...   
2  [serum, moisturizer, cream, product, fragrance...   
3  [skincare, cleanser, exfoliator, exfoliant, ex...   
4  [sunscreen, moisturizer, tan, lotion, tanning,...   
5  [moisturizing, serum, concealer, cream, balm, ...   
6  [lipstick, lipg

  idf = np.log((avg_nr_samples / df)+1)
 33%|███▎      | 12/36 [4:32:00<9:04:01, 1360.05s/it]

Coherence score: 0.4616251009491177
   Topic  Count                                              Name  \
0      0   7120  0_moisturizer_skincare_moisturizers_moisturizing   
1      1   6676                 1_skincare_acne_moisturizer_serum   
2      2   3901             2_moisturizer_serum_fragrance_product   
3      3   2585               3_sunscreen_tanning_tan_moisturizer   
4      4   2511    4_exfoliator_exfoliating_exfoliant_exfoliation   
5      5   2482              5_moisturizing_serum_concealer_cream   
6      6   1652                 6_moisturizing_balm_chapstick_lip   

                                      Representation  \
0  [moisturizer, skincare, moisturizers, moisturi...   
1  [skincare, acne, moisturizer, serum, retinol, ...   
2  [moisturizer, serum, fragrance, product, cream...   
3  [sunscreen, tanning, tan, moisturizer, makeup,...   
4  [exfoliator, exfoliating, exfoliant, exfoliati...   
5  [moisturizing, serum, concealer, cream, balm, ...   
6  [moisturizing, b


  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3758642144415747
     Topic  Count                                              Name  \
0       -1   9948         -1_moisturizers_lotion_rosacea_complexion   
1        0   2055               0_concealer_eyecream_wrinkle_eyelid   
2        1   2036      1_cleanser_exfoliators_exfoliator_exfoliants   
3        2   1550             2_serum_hyaluronic_vitamin_complexion   
4        3    957           3_lipstick_lipglosses_lipgloss_vaseline   
..     ...    ...                                               ...   
469    468      5     468_moisturizer_moisturizers_sephoras_sephora   
470    469      5                469_jar_product_repurchase_hydrate   
471    470      5        470_repurchase_product_hopeful_buyingtried   
472    471      5  471_moisturizers_moisturizer_moisturizing_makeup   
473    472      5                   472_skincare_serum_skin_product   

                                        Representation  \
0    [moisturizers, lotion, rosacea, complexion, sp..

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3749017965041191
     Topic  Count                                           Name  \
0       -1   9910  -1_moisturizes_moisturizers_lotion_hyaluronic   
1        0   2165            0_concealer_eyecream_wrinkle_eyelid   
2        1   2058  1_exfoliator_exfoliators_exfoliants_exfoliant   
3        2    959             2_lipstick_lipglosses_lipgloss_lip   
4        3    697              3_masks_mask_maskne_moisturizermy   
..     ...    ...                                            ...   
472    471      5            471_serum_moisturizer_skincare_acne   
473    472      5  472_moisturizer_moisturizers_cream_hyaluronic   
474    473      5             473_moisturizer_scent_smell_smells   
475    474      5       474_moisturize_pimple_complexion_redness   
476    475      5           475_retinol_skin_discoloring_redness   

                                        Representation  \
0    [moisturizes, moisturizers, lotion, hyaluronic...   
1    [concealer, eyecream, wrin

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.4777366889443418
   Topic  Count                                              Name  \
0      0   8101  0_moisturizer_moisturizing_skincare_moisturizers   
1      1   5050                1_skincare_moisturizer_serum_cream   
2      2   4564                   2_acne_skincare_retinol_rosacea   
3      3   3323              3_cleanser_fragrance_exfoliator_soap   
4      4   2606             4_sunscreen_makeup_moisturizer_lotion   
5      5   2339               5_moisturizing_serum_concealer_balm   
6      6    944                      6_lipstick_lipgloss_balm_lip   

                                      Representation  \
0  [moisturizer, moisturizing, skincare, moisturi...   
1  [skincare, moisturizer, serum, cream, product,...   
2  [acne, skincare, retinol, rosacea, serum, pimp...   
3  [cleanser, fragrance, exfoliator, soap, exfoli...   
4  [sunscreen, makeup, moisturizer, lotion, tanni...   
5  [moisturizing, serum, concealer, balm, cream, ...   
6  [lipstick, lipgl

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.47929868593444214
   Topic  Count                                              Name  \
0      0   7665  0_moisturizer_skincare_moisturizing_moisturizers   
1      1   5948                1_skincare_serum_moisturizer_cream   
2      2   4536                     2_skincare_acne_rosacea_serum   
3      3   2805                3_sunscreen_moisturizer_lotion_tan   
4      4   2718       4_cleanser_exfoliator_exfoliating_exfoliant   
5      5   2204               5_moisturizing_concealer_serum_balm   
6      6   1051                      6_lipstick_lipgloss_balm_lip   

                                      Representation  \
0  [moisturizer, skincare, moisturizing, moisturi...   
1  [skincare, serum, moisturizer, cream, retinol,...   
2  [skincare, acne, rosacea, serum, skin, pimple,...   
3  [sunscreen, moisturizer, lotion, tan, makeup, ...   
4  [cleanser, exfoliator, exfoliating, exfoliant,...   
5  [moisturizing, concealer, serum, balm, cream, ...   
6  [lipstick, lipg

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3749346266680565
     Topic  Count                                               Name  \
0       -1  12735   -1_moisturizers_moisturizes_concealer_hyaluronic   
1        0   1620         0_cleansers_cleanser_clarisonic_exfoliates   
2        1    963              1_lipstick_lipglosses_lipgloss_flavor   
3        2    677                    2_mask_masks_maskne_exfoliation   
4        3    291                        3_tanning_tan_tanned_tanner   
..     ...    ...                                                ...   
531    530      5                     530_line_smooth_creamy_product   
532    531      5              531_makeup_skincare_moisturizer_serum   
533    532      5                         532_product_smell_use_used   
534    533      5        533_serum_skincare_moisturizing_moisturized   
535    534      5  534_moisturizers_moisturizing_hyaluronic_cream...   

                                        Representation  \
0    [moisturizers, moisturizes, conceale

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3760906823858615
     Topic  Count                                              Name  \
0       -1  11663  -1_moisturizers_moisturizes_concealer_moisturize   
1        0   2029      0_cleanser_exfoliators_exfoliator_exfoliants   
2        1    962             1_lipstick_lipglosses_lipgloss_flavor   
3        2    700                 2_serum_hyaluronic_vitamin_elixir   
4        3    678                    3_mask_masks_maskne_exfoliants   
..     ...    ...                                               ...   
481    480      5                 480_product_ingredient_skin_ilium   
482    481      5     481_moisturizer_lotion_moisturizing_cosmetics   
483    482      5     482_moisturizer_moisturizes_moisturizing_rash   
484    483      5                   483_scent_smell_smelling_smelly   
485    484      5           484_concealer_makeup_moisturizing_cream   

                                        Representation  \
0    [moisturizers, moisturizes, concealer, moistur..

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.5296854900326781
   Topic  Count                                              Name  \
0      0  10040                 0_serum_skincare_moisturizer_acne   
1      1   7899  1_skincare_moisturizer_moisturizers_moisturizing   
2      2   2993       2_cleanser_exfoliator_exfoliating_exfoliant   
3      3   2738             3_sunscreen_moisturizer_lotion_makeup   
4      4   2213               4_moisturizing_concealer_serum_balm   
5      5    955                      5_lipstick_lipgloss_lip_balm   
6      6     89                         6_bonne_vraiment_pour_bon   

                                      Representation  \
0  [serum, skincare, moisturizer, acne, retinol, ...   
1  [skincare, moisturizer, moisturizers, moisturi...   
2  [cleanser, exfoliator, exfoliating, exfoliant,...   
3  [sunscreen, moisturizer, lotion, makeup, tanni...   
4  [moisturizing, concealer, serum, balm, cream, ...   
5  [lipstick, lipgloss, lip, balm, chapstick, moi...   
6  [bonne, vraiment

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.4592261490080816
   Topic  Count                                              Name  \
0      0   7041  0_skincare_moisturizer_moisturizers_moisturizing   
1      1   6612              1_moisturizer_skincare_serum_retinol   
2      2   4362               2_serum_moisturizer_cream_fragrance   
3      3   2947          3_cleanser_skincare_exfoliator_exfoliant   
4      4   2799                4_sunscreen_moisturizer_lotion_tan   
5      5   2221              5_concealer_moisturizing_cream_serum   
6      6    945             6_lipstick_lipgloss_balm_moisturizing   

                                      Representation  \
0  [skincare, moisturizer, moisturizers, moisturi...   
1  [moisturizer, skincare, serum, retinol, acne, ...   
2  [serum, moisturizer, cream, fragrance, product...   
3  [cleanser, skincare, exfoliator, exfoliant, cl...   
4  [sunscreen, moisturizer, lotion, tan, makeup, ...   
5  [concealer, moisturizing, cream, serum, balm, ...   
6  [lipstick, lipgl

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3874054246613988
     Topic  Count                                          Name  \
0       -1  13307  -1_moisturizers_concealer_complexion_mascara   
1        0    960           0_lipstick_lipglosses_lipgloss_balm   
2        1    692             1_mask_masks_moisturizermy_maskne   
3        2    415   2_sunscreen_sunscreeniess_suncreen_sunblock   
4        3    397               3_tan_tanning_tanned_tanologist   
..     ...    ...                                           ...   
529    528      5                   528_serum_product_skin_oily   
530    529      5      529_moisturizer_moisturizers_serum_cream   
531    530      5   530_exfoliated_hydration_hydrating_hydrated   
532    531      5   531_influenster_received_complimentary_love   
533    532      5                  532_acne_skincare_daily_week   

                                        Representation  \
0    [moisturizers, concealer, complexion, mascara,...   
1    [lipstick, lipglosses, lipgloss, balm,

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3804883755633665
     Topic  Count                                              Name  \
0       -1  12219  -1_moisturizers_moisturizes_moisturize_concealer   
1        0   1552              0_cleansers_cleanser_exfoliates_balm   
2        1    941            1_lipstick_lipglosses_lipgloss_flavour   
3        2    699                 2_moisturizermy_mask_masks_maskne   
4        3    699                  3_sunscreen_suncreen_sunburn_spf   
..     ...    ...                                               ...   
485    484      5               484_puffiness_eye_puffy_eyesmassage   
486    485      5                485_hyaluronic_skin_cream_tingling   
487    486      5                     486_balm_product_algae_sample   
488    487      5               487_concealer_cream_serum_puffiness   
489    488      5        488_moisturizing_moisturizer_cream_texture   

                                        Representation  \
0    [moisturizers, moisturizes, moisturize, concea..

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.490091604756653
   Topic  Count                                              Name  \
0      0   7656  0_skincare_moisturizer_moisturizers_moisturizing   
1      1   7034              1_serum_skincare_moisturizer_retinol   
2      2   4092                     2_acne_skincare_rosacea_serum   
3      3   2776                    3_sunscreen_makeup_lotion_skin   
4      4   2222       4_cleanser_exfoliator_exfoliant_exfoliating   
5      5   2208               5_concealer_moisturizing_serum_balm   
6      6    939                      6_lipstick_lipgloss_lip_balm   

                                      Representation  \
0  [skincare, moisturizer, moisturizers, moisturi...   
1  [serum, skincare, moisturizer, retinol, vitami...   
2  [acne, skincare, rosacea, serum, pimple, skin,...   
3  [sunscreen, makeup, lotion, skin, tan, moistur...   
4  [cleanser, exfoliator, exfoliant, exfoliating,...   
5  [concealer, moisturizing, serum, balm, cream, ...   
6  [lipstick, lipglo

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.4853550234062825
   Topic  Count                                              Name  \
0      0   7895  0_moisturizer_moisturizing_moisturizers_skincare   
1      1   6216                     1_skincare_acne_serum_retinol   
2      2   3975               2_moisturizer_serum_cream_fragrance   
3      3   3035       3_exfoliant_exfoliator_cleanser_exfoliating   
4      4   2592            4_sunscreen_moisturizer_tanning_lotion   
5      5   2265               5_moisturizing_concealer_balm_cream   
6      6    949                      6_lipstick_lipgloss_balm_lip   

                                      Representation  \
0  [moisturizer, moisturizing, moisturizers, skin...   
1  [skincare, acne, serum, retinol, skin, vitamin...   
2  [moisturizer, serum, cream, fragrance, product...   
3  [exfoliant, exfoliator, cleanser, exfoliating,...   
4  [sunscreen, moisturizer, tanning, lotion, tan,...   
5  [moisturizing, concealer, balm, cream, serum, ...   
6  [lipstick, lipgl

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.36752995830931984
     Topic  Count                                           Name  \
0       -1  10324   -1_moisturizers_lotion_complexion_hyaluronic   
1        0   2216         0_concealer_eyecream_wrinkle_undereyes   
2        1   2051   1_cleanser_exfoliants_exfoliator_exfoliating   
3        2   1366          2_sunscreen_suncreen_sunblock_sunburn   
4        3    960         3_lipstick_lipglosses_lipgloss_tasting   
..     ...    ...                                            ...   
429    428      5        428_serum_lotionlike_skincare_fragrance   
430    429      5  429_fragrance_fragranced_moisturizing_scented   
431    430      5          430_spatula_moisturizer_cream_product   
432    431      5         431_toner_tonerserum_moisturizing_acne   
433    432      5          432_skincare_moisturizer_acne_product   

                                        Representation  \
0    [moisturizers, lotion, complexion, hyaluronic,...   
1    [concealer, eyecream, wri

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.37901165428631867
     Topic  Count                                               Name  \
0       -1  12157  -1_moisturizers_exfoliating_hyaluronic_complexion   
1        0   1405            0_concealers_concealer_eyecream_eyebags   
2        1    947                 1_lipstick_lipglosses_lipgloss_lip   
3        2    915              2_sunscreen_suncreen_sunblock_sunburn   
4        3    698                    3_mask_masks_maskne_exfoliation   
..     ...    ...                                                ...   
477    476      5            476_serum_retinol_freeproduct_fragrance   
478    477      5                  477_bottle_applicator_serum_cream   
479    478      5                   478_serum_wrinkle_chapstick_skin   
480    479      5                     479_lotion_review_redness_peel   
481    480      5                   480_moisturizer_balm_facial_skin   

                                        Representation  \
0    [moisturizers, exfoliating, hyaluro

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.4650058866511623
   Topic  Count                                              Name  \
0      0   6944  0_skincare_moisturizers_moisturizer_moisturizing   
1      1   6172                1_skincare_serum_moisturizer_cream   
2      2   4399                     2_skincare_acne_serum_rosacea   
3      3   3196                3_sunscreen_makeup_tan_moisturizer   
4      4   2971       4_cleanser_exfoliator_exfoliating_exfoliant   
5      5   2294               5_moisturizing_concealer_cream_balm   
6      6    951                      6_lipstick_lipgloss_balm_lip   

                                      Representation  \
0  [skincare, moisturizers, moisturizer, moisturi...   
1  [skincare, serum, moisturizer, cream, product,...   
2  [skincare, acne, serum, rosacea, skin, product...   
3  [sunscreen, makeup, tan, moisturizer, lotion, ...   
4  [cleanser, exfoliator, exfoliating, exfoliant,...   
5  [moisturizing, concealer, cream, balm, applica...   
6  [lipstick, lipgl

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.4242640764182138
   Topic  Count                                        Name  \
0      0   7851    0_skincare_moisturizing_moisturizer_balm   
1      1   6735           1_skincare_acne_moisturizer_serum   
2      2   4142  2_skincare_moisturizer_moisturizing_lotion   
3      3   4086          3_skincare_moisturizer_serum_cream   
4      4   2266        4_moisturizing_serum_concealer_cream   
5      5   1758           5_moisturizing_balm_lip_chapstick   
6      6     89                   6_bonne_vraiment_pour_bon   

                                      Representation  \
0  [skincare, moisturizing, moisturizer, balm, cl...   
1  [skincare, acne, moisturizer, serum, skin, cre...   
2  [skincare, moisturizer, moisturizing, lotion, ...   
3  [skincare, moisturizer, serum, cream, product,...   
4  [moisturizing, serum, concealer, cream, balm, ...   
5  [moisturizing, balm, lip, chapstick, mask, moi...   
6  [bonne, vraiment, pour, bon, seulement, voir, ...   

          

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.37924584225464886
     Topic  Count                                               Name  \
0       -1  13084      -1_moisturizers_moisturizes_concealer_retinol   
1        0   1978       0_cleanser_exfoliators_exfoliator_exfoliants   
2        1    936              1_lipstick_lipglosses_lipgloss_flavor   
3        2    700                   2_sunscreen_suncreen_sunburn_spf   
4        3    668                    3_mask_masks_maskne_exfoliation   
..     ...    ...                                                ...   
424    423      5  423_moisturizer_moisturizers_skincare_moisturi...   
425    424      5                424_applicator_moisturize_cream_eye   
426    425      5         425_moisturizing_moisturizer_cream_wrinkle   
427    426      5          426_moisturizer_lotion_moisturizing_scent   
428    427      5         427_allergic_drysensitiveredness_acne_love   

                                        Representation  \
0    [moisturizers, moisturizes, conceal

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.38442594783306255
     Topic  Count                                          Name  \
0       -1  12143  -1_moisturizers_lotion_hyaluronic_complexion   
1        0   2118        0_eyecream_concealer_applicator_eyelid   
2        1    940        1_lipstick_lipglosses_lipgloss_lipbalm   
3        2    705         2_sunscreen_suncreen_sunblock_sunburn   
4        3    664               3_mask_masks_maskne_exfoliation   
..     ...    ...                                           ...   
381    380      5                  380_aroma_scent_bottle_smell   
382    381      5                   381_acne_break_broke_pimple   
383    382      5       382_sunscreen_fragrance_blended_powdery   
384    383      5    383_turmeric_moisturizer_skincare_cleanser   
385    384      5               384_dryness_skin_dry_ingredient   

                                        Representation  \
0    [moisturizers, lotion, hyaluronic, complexion,...   
1    [eyecream, concealer, applicator, eye

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.47633082136756205
   Topic  Count                                              Name  \
0      0   8086  0_skincare_moisturizer_moisturizers_moisturizing   
1      1   5883                     1_skincare_acne_serum_retinol   
2      2   4227            2_skincare_moisturizer_fragrance_serum   
3      3   2924       3_cleanser_exfoliator_exfoliating_exfoliant   
4      4   2591             4_sunscreen_moisturizer_lotion_makeup   
5      5   2265              5_moisturizing_cream_concealer_serum   
6      6    951        6_chapstick_lipstick_lipgloss_moisturizing   

                                      Representation  \
0  [skincare, moisturizer, moisturizers, moisturi...   
1  [skincare, acne, serum, retinol, skin, wrinkle...   
2  [skincare, moisturizer, fragrance, serum, crea...   
3  [cleanser, exfoliator, exfoliating, exfoliant,...   
4  [sunscreen, moisturizer, lotion, makeup, skin,...   
5  [moisturizing, cream, concealer, serum, balm, ...   
6  [chapstick, lip

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.49539561323304193
   Topic  Count                                              Name  \
0      0   9283  0_moisturizer_skincare_moisturizing_moisturizers   
1      1   7242                1_serum_skincare_moisturizer_cream   
2      2   4237                     2_acne_skincare_rosacea_serum   
3      3   2948                       3_sunscreen_tan_tanning_spf   
4      4   2177               4_moisturizing_serum_concealer_balm   
5      5    951                     5_lipstick_lipgloss_lip_gloss   
6      6     89                         6_bonne_vraiment_pour_bon   

                                      Representation  \
0  [moisturizer, skincare, moisturizing, moisturi...   
1  [serum, skincare, moisturizer, cream, product,...   
2  [acne, skincare, rosacea, serum, skin, pimple,...   
3  [sunscreen, tan, tanning, spf, lotion, moistur...   
4  [moisturizing, serum, concealer, balm, cream, ...   
5  [lipstick, lipgloss, lip, gloss, moisturizing,...   
6  [bonne, vraimen

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.3905383672742523
     Topic  Count                                         Name  \
0       -1  12938  -1_moisturizers_acneprone_lotion_complexion   
1        0   2201          0_concealer_eyecream_wrinkle_eyelid   
2        1    917       1_lipstick_lipglosses_lipgloss_lipbalm   
3        2    652              2_mask_maskne_masks_exfoliation   
4        3    387        3_sunscreen_suncreen_sunblock_sunburn   
..     ...    ...                                          ...   
411    410      5                410_cleanser_wipes_wipe_brush   
412    411      5       411_cleanser_perfume_exfoliating_scent   
413    412      5               412_cleanser_scent_review_soap   
414    413      5        413_skincare_dermalogica_skin_rubbing   
415    414      5  414_wrinkle_effectiveness_moisturized_daily   

                                        Representation  \
0    [moisturizers, acneprone, lotion, complexion, ...   
1    [concealer, eyecream, wrinkle, eyelid, lauder,... 

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.37634645331523464
     Topic  Count                                               Name  \
0       -1  13141        -1_moisturizers_concealer_lotion_hyaluronic   
1        0   1926       0_cleanser_exfoliator_exfoliators_exfoliants   
2        1    945                 1_lipstick_lipglosses_lipgloss_lip   
3        2    693                    2_exfoliation_mask_masks_maskne   
4        3    643              3_sunscreen_suncreen_sunblock_sunburn   
..     ...    ...                                                ...   
396    395      5                 395_moisturizer_serum_makeup_cream   
397    396      5                   396_skin_oily_greasy_niacinamide   
398    397      5                   397_scent_smell_moisture_product   
399    398      5  398_moisturizer_moisturizers_moisturizing_frag...   
400    399      5                 399_moisturizing_face_skin_massage   

                                        Representation  \
0    [moisturizers, concealer, lotion, h

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.5512787072027406
   Topic  Count                                              Name  \
0      0  10107                 0_skincare_serum_moisturizer_acne   
1      1   7849  1_skincare_moisturizer_moisturizers_moisturizing   
2      2   2959       2_exfoliator_exfoliant_cleanser_exfoliating   
3      3   2762             3_sunscreen_lotion_moisturizer_makeup   
4      4   2212               4_moisturizing_concealer_serum_balm   
5      5    949                      5_lipstick_lipgloss_lip_lips   
6      6     89                         6_bonne_vraiment_pour_bon   

                                      Representation  \
0  [skincare, serum, moisturizer, acne, retinol, ...   
1  [skincare, moisturizer, moisturizers, moisturi...   
2  [exfoliator, exfoliant, cleanser, exfoliating,...   
3  [sunscreen, lotion, moisturizer, makeup, tanni...   
4  [moisturizing, concealer, serum, balm, cream, ...   
5  [lipstick, lipgloss, lip, lips, balm, gloss, v...   
6  [bonne, vraiment

  idf = np.log((avg_nr_samples / df)+1)


Coherence score: 0.48581239718216285
   Topic  Count                                              Name  \
0      0   9286  0_moisturizer_skincare_moisturizers_moisturizing   
1      1   7142              1_serum_moisturizer_skincare_retinol   
2      2   4318                     2_skincare_acne_rosacea_serum   
3      3   2849             3_sunscreen_makeup_lotion_moisturizer   
4      4   2185               4_concealer_moisturizing_balm_serum   
5      5   1058             5_lipstick_balm_lipgloss_moisturizing   
6      6     89                         6_bonne_vraiment_pour_bon   

                                      Representation  \
0  [moisturizer, skincare, moisturizers, moisturi...   
1  [serum, moisturizer, skincare, retinol, produc...   
2  [skincare, acne, rosacea, serum, retinol, skin...   
3  [sunscreen, makeup, lotion, moisturizer, tanni...   
4  [concealer, moisturizing, balm, serum, cream, ...   
5  [lipstick, balm, lipgloss, moisturizing, lip, ...   
6  [bonne, vraimen

In [9]:
pd.DataFrame(model_results).to_csv('./bertopic_tuning_results.csv', index=False)

In [10]:
map_model = UMAP(n_neighbors=15, n_components=7, low_memory=True, min_dist=0.0, metric="cosine")
topic_model = BERTopic(embedding_model=embedding_model, umap_model= umap_model, hdbscan_model=kmeans_model, n_gram_range = (1,1), vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
topics = topic_model.fit_transform(reviews_docs_joined)

print(topic_model.get_topic_info())

   Topic  Count                                              Name  \
0      0  27359  0_moisturizer_skincare_moisturizers_moisturizing   
1      1  18966               1_acne_skincare_retinol_moisturizer   
2      2  14114                 2_serum_moisturizer_cream_product   
3      3   9890     3_cleanser_exfoliator_exfoliating_exfoliation   
4      4   8518                4_sunscreen_moisturizer_lotion_spf   
5      5   7706               5_moisturizing_serum_concealer_balm   
6      6   3205             6_lipstick_lipgloss_moisturizing_balm   

                                      Representation  \
0  [moisturizer, skincare, moisturizers, moisturi...   
1  [acne, skincare, retinol, moisturizer, serum, ...   
2  [serum, moisturizer, cream, product, fragrance...   
3  [cleanser, exfoliator, exfoliating, exfoliatio...   
4  [sunscreen, moisturizer, lotion, spf, makeup, ...   
5  [moisturizing, serum, concealer, balm, cream, ...   
6  [lipstick, lipgloss, moisturizing, balm, lip, ...   

In [11]:
map_model = UMAP(n_neighbors=15, n_components=7, low_memory=True, min_dist=0.0, metric="cosine")
topic_model = BERTopic(embedding_model=embedding_model, umap_model= umap_model, hdbscan_model=hdbscan_model, n_gram_range = (1,1), vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
topics = topic_model.fit_transform(reviews_docs_joined)

print(topic_model.get_topic_info())

      Topic  Count                                             Name  \
0        -1  50485    -1_moisturizes_acneprone_concealer_complexion   
1         0   3174            0_lipstick_lipgloss_lipbalm_chapstick   
2         1   2239                  1_masks_mask_maskne_exfoliation   
3         2   1112  2_leavesskin_dermalogical_innbeautys_sandalwood   
4         3   1111                     3_tanning_tan_tanned_bronzer   
...     ...    ...                                              ...   
1107   1106      5                  1106_shade_shades_blended_blend   
1108   1107      5       1107_complexion_vitamin_application_months   
1109   1108      5     1108_moisturizing_serumlike_applicator_cream   
1110   1109      5               1109_fragrance_scent_product_smell   
1111   1110      5    1110_moisturizer_moisturizers_spatula_product   

                                         Representation  \
0     [moisturizes, acneprone, concealer, complexion...   
1     [lipstick, lipgloss, li