In [8]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from umap import UMAP
import nltk
nltk.download('stopwords')



def train_bertopic(data, names_csv='../data/names_and_surnames.csv'):
    """Fit a BERTopic model on a collection of documents.

    Parameters
    ----------
    data : iterable of str
        The documents to model (passed unchanged to ``BERTopic.fit_transform``).
    names_csv : str, optional
        Path to a CSV file with a ``names_&_surnames`` column; its values are
        added to the stop-word list. Defaults to the path the notebook
        originally hard-coded.

    Returns
    -------
    tuple
        ``(topic_model, topics)`` - the fitted ``BERTopic`` instance and the
        first element returned by ``fit_transform`` (the per-document topics).

    Raises
    ------
    Exception
        Any failure during set-up or fitting is re-raised as a generic
        ``Exception`` with a descriptive message; the original error is
        attached as ``__cause__``.

    Side effects
    ------------
    Prints the wall-clock duration of ``fit_transform`` in seconds.
    """
    import time  # function-local, as in the original cell

    try:
        # Dimensionality reduction handed to BERTopic; random_state is fixed
        # so repeated runs use the same seed.
        umap_model = UMAP(n_neighbors=15,
                          n_components=5,
                          min_dist=0.0,
                          metric='cosine',
                          random_state=100)

        # Stop words = NLTK English list + domain-specific words + names.
        airbnb_related_words = ['stay', 'airbnb', 'paris', 'would', 'time', 'apartment']
        names_and_surnames = pd.read_csv(names_csv)

        # CountVectorizer lowercases tokens by default before comparing them
        # with the stop list, so names must be lowercased to ever match.
        # Missing values are dropped so only real strings reach the vectorizer.
        name_words = (names_and_surnames['names_&_surnames']
                      .dropna()
                      .astype(str)
                      .str.strip()
                      .str.lower())

        stop_words = set(nltk.corpus.stopwords.words('english'))
        stop_words.update(airbnb_related_words)
        stop_words.update(word for word in name_words if word)

        vectorizer_model = CountVectorizer(stop_words=sorted(stop_words))
        representation_model = MaximalMarginalRelevance(diversity=0.8)

        # NOTE(review): calculate_probabilities=True is kept from the original,
        # although the second value returned by fit_transform is discarded
        # below - confirm whether it is still needed.
        topic_model = BERTopic(umap_model=umap_model,
                               vectorizer_model=vectorizer_model,
                               language="multilingual",
                               calculate_probabilities=True,
                               representation_model=representation_model)

        # Time only the fit itself; perf_counter is monotonic, so the
        # measurement is not affected by system clock adjustments.
        start = time.perf_counter()
        topics, _ = topic_model.fit_transform(data)
        print(time.perf_counter() - start)

        # Return the trained model and topics
        return topic_model, topics

    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"Error occurred during BERTopic training: {e}") from e
    


def main():
    """Train BERTopic on the preprocessed Paris reviews and save the model.

    Reads ``../data/paris_reviews_preprocessed.parquet``, fits a model on its
    ``comments`` column via ``train_bertopic`` and writes the result to
    ``../model/model_dir`` in safetensors format (including the c-TF-IDF data
    and a pointer to the embedding model).

    Returns nothing. Any exception is caught and reported with ``print``
    rather than propagated, so a failure does not raise.
    """
    parquet_file = "../data/paris_reviews_preprocessed.parquet"

    try:
        # Process the Parquet file
        df = pd.read_parquet(parquet_file)
        # BERTopic documents its input as a list of strings, so hand over a
        # plain list instead of a pandas Series.
        docs = df.comments.tolist()

        # Train the BERTopic model (the per-document topics are not used here)
        model, _ = train_bertopic(docs)

        # The model is built with language="multilingual", for which BERTopic
        # selects paraphrase-multilingual-MiniLM-L12-v2. The saved pointer must
        # name that same model; otherwise a reloaded model would embed new
        # documents in a different vector space from the one the topics were
        # fitted in.
        embedding_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        model.save("../model/model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

    except Exception as e:
        print(f"Error occurred during data processing: {e}")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/piyush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Run the full train-and-save pipeline defined by main().
if __name__ == "__main__":
    main()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [3]:
# Convert a duration from seconds to minutes; the operand is presumably the
# elapsed time printed by train_bertopic - TODO confirm.
2225.226475954056/60

37.0871079325676

In [10]:
# Reload the model from the directory that main() saves it to.
loaded_model = BERTopic.load("../model/model_dir")


In [11]:
# Show the first 20 rows of the topic overview table.
loaded_model.get_topic_info().head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,30191,-1_appartement_comfortable_recommend_two,"[appartement, comfortable, recommend, two, bed...",
1,0,982,0_using_bnb_airbnbs_booking,"[using, bnb, airbnbs, booking, next, bed, buil...",
2,1,911,1_propre_confortable_hôte_agencé,"[propre, confortable, hôte, agencé, commerces,...",
3,2,882,2_recommend_comfortable_beautiful_bed,"[recommend, comfortable, beautiful, bed, defin...",
4,3,749,3_lumineux_spacieux_agencé_goût,"[lumineux, spacieux, agencé, goût, confortable...",
5,4,687,4_cœur_lumineux_réactive_confortable,"[cœur, lumineux, réactive, confortable, toits,...",
6,5,676,5_recommandons_famille_appartement_attentes,"[recommandons, famille, appartement, attentes,...",
7,6,589,6_hôtel_personnel_déjeuner_accueillant,"[hôtel, personnel, déjeuner, accueillant, conf...",
8,7,587,7_spot_amazing_parís_distance,"[spot, amazing, parís, distance, recommend, co...",
9,8,566,8_subway_options_nearby_right,"[subway, options, nearby, right, stop, positio...",
