#### Imports

In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired

  from .autonotebook import tqdm as notebook_tqdm


#### Load Data

In [2]:
# Read the case table and pull the 'abstract' column out as a plain Python
# list, which is the document input handed to BERTopic further down.
df = pd.read_csv("./output_data/modified_case_data.csv")
abstracts = df["abstract"].to_list()

#### Load Embeddings

##### Embedding model: all-MiniLM-L6-v2

In [3]:
# Read the precomputed all-MiniLM-L6-v2 vectors from disk.
embeddings_0 = pd.read_csv("./embeddings/all-MiniLM-L6-v2.csv")

# Separate the 'ucid' key column from the rest; every remaining column is
# kept as part of the embedding matrix.
# NOTE(review): presumably one row per abstract, in the same order as
# `abstracts` -- confirm against the 'ucid' values before relying on it.
document_ids = embeddings_0["ucid"].to_numpy()
embeddings_0_loaded = embeddings_0.drop(columns="ucid").to_numpy()

##### Embedding model: stella_en_400M_v5 (MTEB Clustering Category rank 4)

In [3]:
# Read the precomputed stella_en_400M_v5 vectors from disk.
embeddings_1 = pd.read_csv("./embeddings/stella_en_400M_v5.csv")

# Drop the 'ucid' key column; every remaining column is kept as part of the
# embedding matrix.
# NOTE(review): presumably row-aligned with `abstracts` -- confirm.
embeddings_1_loaded = embeddings_1.drop(columns="ucid").to_numpy()


#### Initialize Vectorizer

In [4]:
# Vectorizer used later to rebuild the topic words: English stop words are
# removed, uni-, bi- and tri-grams are counted, and a term must reach a
# document frequency of at least 5 to be kept.
vec_model = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
    min_df=5,
)

#### Initialize Representation Model

In [5]:
# A single KeyBERTInspired instance is shared by both topic models as their
# representation model.
representation_model = KeyBERTInspired()

# Build two otherwise-default BERTopic models, one per embedding experiment.
topic_model_0, topic_model_1 = (
    BERTopic(representation_model=representation_model) for _ in range(2)
)

### Calculate topics (embedding: all-MiniLM-L6-v2)

In [11]:
# Cluster on the precomputed all-MiniLM-L6-v2 vectors loaded above. Without
# `embeddings=` BERTopic ignores them and re-embeds every abstract itself.
# NOTE(review): assumes the rows of `embeddings_0_loaded` are in the same
# order as `abstracts` -- confirm via the 'ucid' column.
#
# When document embeddings are supplied, BERTopic does not load an embedding
# backend on its own, but KeyBERTInspired still has to embed candidate words
# and representative documents. Make sure a backend is configured; the name
# below is the model these vectors were produced with.
if topic_model_0.embedding_model is None:
    topic_model_0.embedding_model = "all-MiniLM-L6-v2"
topics_0, probs_0 = topic_model_0.fit_transform(
    abstracts, embeddings=embeddings_0_loaded
)

#### Update topic model for fine tuning

In [None]:
# Recompute the topic words with the custom vectorizer (stop words removed,
# 1-3 grams, min_df=5). `update_topics` replaces the model's representation
# model with whatever is passed here (None by default), so KeyBERTInspired is
# handed over again; otherwise the update would silently fall back to plain
# c-TF-IDF keywords.
topic_model_0.update_topics(
    abstracts,
    vectorizer_model=vec_model,
    representation_model=representation_model,
)

#### Get topic info

In [None]:
# Fetch the per-topic overview for the all-MiniLM-L6-v2 run and leave it as
# the cell's last expression so the notebook renders it.
topic_info_0 = topic_model_0.get_topic_info()
topic_info_0

### Calculate topics (embedding: stella_en_400M_v5)

In [None]:
# Cluster on the precomputed stella_en_400M_v5 vectors. Without `embeddings=`
# BERTopic re-embeds the abstracts with its default model, so this run would
# merely repeat the all-MiniLM-L6-v2 experiment instead of evaluating stella.
# NOTE(review): assumes the rows of `embeddings_1_loaded` are in the same
# order as `abstracts` -- confirm via the 'ucid' column.
#
# KeyBERTInspired still needs an embedding backend to score candidate words
# even though document embeddings are supplied. all-MiniLM-L6-v2 is used for
# that keyword re-ranking step only; the clustering itself runs on the stella
# vectors.
# NOTE(review): pass a stella backend to BERTopic instead if the keyword
# ranking should live in the same embedding space as the clustering.
if topic_model_1.embedding_model is None:
    topic_model_1.embedding_model = "all-MiniLM-L6-v2"
topics_1, probs_1 = topic_model_1.fit_transform(
    abstracts, embeddings=embeddings_1_loaded
)

#### Update topic model for fine tuning

In [None]:
# Recompute the topic words with the custom vectorizer (stop words removed,
# 1-3 grams, min_df=5). `update_topics` replaces the model's representation
# model with whatever is passed here (None by default), so KeyBERTInspired is
# handed over again; otherwise the update would silently fall back to plain
# c-TF-IDF keywords.
topic_model_1.update_topics(
    abstracts,
    vectorizer_model=vec_model,
    representation_model=representation_model,
)

#### Get topic info

In [None]:
# Fetch the per-topic overview for the stella_en_400M_v5 run and leave it as
# the cell's last expression so the notebook renders it.
topic_info_1 = topic_model_1.get_topic_info()
topic_info_1