In [1]:
# imports
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
#imports
import torch
import transformers
from transformers import pipeline
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration

In [3]:
# load data
df = pd.read_csv('./modified_case_data.csv')
abstracts = df['abstract'].tolist()

In [4]:
# Embedding model: stella_en_400M_v5 (MTEB Clustering Category rank 4)

# Load the DataFrame from the CSV file
embeddings = pd.read_csv('./stella_en_400M_v5.csv')

# Extract embeddings
document_ids = embeddings['ucid'].values
embeddings = embeddings.drop('ucid', axis=1).values


In [5]:
# Vectorizer Model
vec_model = CountVectorizer(min_df=2,stop_words='english', ngram_range=(1,3))

In [6]:
prompt = '''I've a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information above, extract only one short topic label. Output only the topic label in the following format: topic: <topic label>'''

In [7]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [9]:
# Text generation with Llama 2
gemma_2_2b_it = TextGeneration(generator, prompt=prompt,nr_docs=3,
    doc_length=2000,
    tokenizer=tokenizer,
    diversity=0.1)

mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, gemma_2_2b_it]

In [10]:

topic_model = BERTopic(
  representation_model=representation_models,
  vectorizer_model=vec_model,
  verbose=True,
    calculate_probabilities=True
)



In [11]:
# Train model
topics, probs = topic_model.fit_transform(abstracts, embeddings=embeddings)

2024-08-07 06:15:37,809 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-07 06:16:38,677 - BERTopic - Dimensionality - Completed ✓
2024-08-07 06:16:38,682 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-07 06:21:09,290 - BERTopic - Cluster - Completed ✓
2024-08-07 06:21:09,309 - BERTopic - Representation - Extracting topics from clusters using representation models.
  2%|▏         | 10/441 [00:13<09:25,  1.31s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 441/441 [22:57<00:00,  3.12s/it]
2024-08-07 06:44:31,036 - BERTopic - Representation - Completed ✓


In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24662,-1_\n\n**topic:** Hybrid Vehicle Powertrain Sy...,[\n\n**topic:** Hybrid Vehicle Powertrain Syst...,[An object of the present invention is to prov...
1,0,876,0_\n\n**topic:** Electric Oil Pump Control Dev...,[\n\n**topic:** Electric Oil Pump Control Devi...,[The present invention provides a hydraulic co...
2,1,671,1_\n\n**topic:** Hybrid Transmission Systems i...,[\n\n**topic:** Hybrid Transmission Systems in...,[Hybrid drive train for a hybrid-driven vehicl...
3,2,563,2_\n\n**Topic:** Hydraulic Hybrid Drive System...,[\n\n**Topic:** Hydraulic Hybrid Drive Systems...,[A series hydraulic hybrid system for a vehicl...
4,3,371,3_\n\n**topic:** Vehicle Technology \n___,"[\n\n**topic:** Vehicle Technology \n, , , , ,...",[A hybrid vehicle is controlled to preferably ...
...,...,...,...,...,...
436,435,10,435_\n\n**topic:** Electric Power Conversion D...,[\n\n**topic:** Electric Power Conversion Devi...,[The present invention provides a power conver...
437,436,10,436_\n\n**topic:** Vibration Suppression in Hy...,[\n\n**topic:** Vibration Suppression in Hybri...,[PROBLEM TO BE SOLVED: To provide a hybrid veh...
438,437,10,437_\n\n**topic:** Vehicle Control System for ...,[\n\n**topic:** Vehicle Control System for Imp...,[PROBLEM TO BE SOLVED: To provide a vehicle dr...
439,438,10,438_\n\n**topic:** Emergency Driving System fo...,[\n\n**topic:** Emergency Driving System for H...,[A vehicle having an emergency power device co...


In [25]:
topics_dict = topic_model.get_topics()

In [26]:
for k, v in topics_dict.items():
    print(k, v[0][0].strip())

-1 **topic:** Hybrid Vehicle Powertrain System
0 **topic:** Electric Oil Pump Control Device for Vehicle
1 **topic:** Hybrid Transmission Systems in Motor Vehicles
2 **Topic:** Hydraulic Hybrid Drive Systems
3 **topic:** Vehicle Technology
4 **topic:** Catalyst Warm-Up Control for Emission Reduction
5 **Topic:** Hydraulic Hybrid Power Systems for Loaders
6 **topic:**  Separating Clutch Design in Hybrid Modules
7 **topic:** Hybrid Power System Components and Connections
8 **topic:** Temperature Control of Electric Motors in Vehicles
9 **topic:** Planetary Gear Set Design for Motor Vehicle Transmission
10 **topic:** Noise Suppression in Hybrid Vehicle Operation Lines
11 **Topic Label:** Double Motor Hybrid Power System
12 **Topic Label:** Torsional Vibration Damper System in Hybrid Drive Train

**Explanation:**

The provided text describes various aspects of torsional vibration dampers used in hybrid drive trains. The focus is on their structure, components, and how they function within 