#### Imports

In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance

  from .autonotebook import tqdm as notebook_tqdm


#### Load Data

In [2]:
df = pd.read_csv('./output_data/modified_case_data.csv')
abstracts = df['abstract'].tolist()

#### Load Embeddings

##### Embedding model: all-MiniLM-L6-v2

In [3]:
# Load the DataFrame from the CSV file
embeddings_0 = pd.read_csv('./embeddings/all-MiniLM-L6-v2.csv')

# Extract embeddings and document IDs
document_ids = embeddings_0['ucid'].values
embeddings_0_loaded = embeddings_0.drop('ucid', axis=1).values

##### Embedding model: stella_en_400M_v5 (MTEB Clustering Category rank 4)

In [3]:
# Load the DataFrame from the CSV file
embeddings_1 = pd.read_csv('./embeddings/stella_en_400M_v5.csv')

# Extract embeddings
embeddings_1_loaded = embeddings_1.drop('ucid', axis=1).values

#### Initialize Vectorizer

In [112]:
vec_model = CountVectorizer(stop_words='english')

#### Initialize Representation Model

In [172]:
# Create your representation model
representation_model = MaximalMarginalRelevance(diversity=0.4)

# Use the representation model in BERTopic on top of the default pipeline
topic_model_0 = BERTopic(representation_model=representation_model)
topic_model_1 = BERTopic(representation_model=representation_model, vectorizer_model=vec_model, embedding_model=embeddings_1_loaded)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### Calculate topics (embedding: all-MiniLM-L6-v2.csv)

In [13]:
topics_0, probs_0 = topic_model_0.fit_transform(abstracts, embeddings=embeddings_0_loaded)

#### Update Topic Model

In [None]:
topic_model_0.update_topics(abstracts, vectorizer_model=vec_model)

#### Get topic info

In [14]:
topic_model_0.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,26808,-1_engine_power_motor_vehicle,"[engine, power, motor, vehicle, electric, cont...",[Restart control device applied to a vehicle (...
1,0,880,0_oil_oil pump_pump_lubricating,"[oil, oil pump, pump, lubricating, electric oi...","[According to the present invention, when both..."
2,1,683,1_utility model_utility_model_input shaft,"[utility model, utility, model, input shaft, s...",[The utility model provides a hybrid power dev...
3,2,658,2_braking_regenerative_braking force_regenerat...,"[braking, regenerative, braking force, regener...",[PROBLEM TO BE SOLVED: To prevent regenerative...
4,3,624,3_cooling_water_cooling water_radiator,"[cooling, water, cooling water, radiator, heat...",[A vehicle capable of efficiently recovering r...
...,...,...,...,...,...
403,402,10,402_vehicle engine driven_port power_vehicle e...,"[vehicle engine driven, port power, vehicle en...",[The present invention relates to a modular po...
404,403,10,403_ecu_cs mode_causes engine_ecu causes,"[ecu, cs mode, causes engine, ecu causes, cs, ...",[A vehicle provided with a motor generator as ...
405,404,10,404_abnormality diagnosis_abnormality_diagnosi...,"[abnormality diagnosis, abnormality, diagnosis...",[The present invention makes it easier to secu...
406,405,10,405_pumping_automobile chassis_concrete_leg,"[pumping, automobile chassis, concrete, leg, s...",[The invention discloses a kind of tandem type...


#### Visualize topics

### Calculate topics (embedding: stella_en_400M_v5)

In [169]:
topics_1, probs_1 = topic_model_1.fit_transform(abstracts, embeddings=embeddings_1_loaded)

#### Update topic model

In [127]:
topic_model_1.update_topics(abstracts, vectorizer_model=vec_model)

#### Get topic info 

In [154]:
topic_model_1.get_topic_info(11)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,11,261,11_countershaft_input_shafts_shaft,"[countershaft, input, shafts, shaft, stage, id...",[The invention relates to a transmission (4) c...


#### Visualize topics

In [167]:
topic_model_1.get_topic(300)

[('management', 0.032912736097917915),
 ('energy', 0.031042564774208),
 ('storage', 0.0194806694369169),
 ('circuitry', 0.016666779398388322),
 ('units', 0.013748793209901898),
 ('benefit', 0.011798993127112403),
 ('manager', 0.010679930114374072),
 ('converters', 0.008804514514363445),
 ('cost', 0.00856065287632451),
 ('charge', 0.008189341647966275)]

In [178]:
topic_model_1.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24677,-1_power_engine_motor_vehicle,"[power, engine, motor, vehicle, battery, gener...",[The invention relates to a power coupling tra...
1,0,1005,0_oil_pump_lubricating_passage,"[oil, pump, lubricating, passage, lubrication,...",[The present invention relates to a method of ...
2,1,648,1_machine_subtransmission_partial_internal,"[machine, subtransmission, partial, internal, ...",[Hybrid drive train for a hybrid-driven vehicl...
3,2,572,2_hydraulic_accumulator_fluid_pressure,"[hydraulic, accumulator, fluid, pressure, hydr...",[A hydrostatic hybrid drive system for road ve...
4,3,499,3_braking_regenerative_brake_force,"[braking, regenerative, brake, force, regenera...",[An engine and a second power generating devic...
...,...,...,...,...,...
451,450,10,450_module_multipart_subcomponents_subsystems,"[module, multipart, subcomponents, subsystems,...","[The invention relates to a method, apparatus ..."
452,451,10,451_rearguard_b1_b2_lawiener,"[rearguard, b1, b2, lawiener, em1, em2, c2, lo...",[A transmission for a longitudinal front-drive...
453,452,10,452_44s_45s_4s_distributor,"[44s, 45s, 4s, distributor, 44c, 45c, 5s, 5c, ...",[A power transmission apparatus of a hybrid el...
454,453,10,453_winterizing_dewinterizing_lowvoltage_phase,"[winterizing, dewinterizing, lowvoltage, phase...",[PROBLEM TO BE SOLVED: To provide a heating de...


In [None]:
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model_1.reduce_outliers(abstracts, topics_1 , strategy="embeddings")

# Reduce all outliers that are left with the "distributions" strategy
# new_topics = topic_model_1.reduce_outliers(abstracts, new_topics, strategy="distributions")


In [174]:
topic_distr, _ = topic_model_1.approximate_distribution(abstracts)


In [184]:
topic_model_1.visualize_distribution(topic_distr[6])


In [None]:
for doc_index, topic_probs in enumerate(topic_distr):
    print(doc_index, topic_probs)