### BERTopic topic modelling with Llama 3.1 as representational model

In [None]:
#imports
import torch
import transformers
from transformers import pipeline
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration

In [17]:
# load data
df = pd.read_csv('./modified_case_data.csv')
abstracts = df['abstract'].tolist()

In [18]:
# Embedding model: stella_en_400M_v5 (MTEB Clustering Category rank 4)

# Load the DataFrame from the CSV file
embeddings = pd.read_csv('./stella_en_400M_v5.csv')

# Extract embeddings
document_ids = embeddings['ucid'].values
embeddings = embeddings.drop('ucid', axis=1).values

In [19]:
# Vectorizer Model
vec_model = CountVectorizer(stop_words='english', ngram_range=(1,2))

In [20]:
prompt = '''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

            Cutting Knowledge Date: December 2023
            Today Date: 06 Aug 2024

            You are a helpful assistant who helps to create labels for topics in a topic modelling pipeline. <|eot_id|><|start_header_id|>user<|end_header_id|>
            I've a topic that contains the following documents:
            [DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information above, extract only one short topic label in the following format:
topic: <topic label>
Output only the label and nothing else. <|eot_id|><|start_header_id|>assistant<|end_header_id|>'''

In [21]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

generator = transformers.pipeline(
    "text-generation", 
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    max_new_tokens=500,
    temperature = 0.1,
    repetition_penalty=1.1
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:

# Create your representation model

llama = TextGeneration(generator, prompt=prompt, doc_length = 2000, tokenizer='char', diversity=0.1)

mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [mmr, llama]

# Use the chained models
topic_model = BERTopic(representation_model=representation_models, vectorizer_model=vec_model, verbose=True, calculate_probabilities=True)

In [None]:
topics, probs = topic_model.fit_transform(abstracts, embeddings=embeddings)

In [24]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,23334,-1_\n\ntopic: Hybrid Vehicle Power Systems___,"[\n\ntopic: Hybrid Vehicle Power Systems, , , ...",[PROBLEM TO BE SOLVED: To provide a hybrid veh...
1,0,1355,0_\n\ntopic: Hybrid Power Transmission Systems___,"[\n\ntopic: Hybrid Power Transmission Systems,...",[The invention discloses a double-planet-row d...
2,1,1025,1_\n\ntopic: Vehicle Hydraulic Systems___,"[\n\ntopic: Vehicle Hydraulic Systems, , , , ,...",[To provide a vehicle hydraulic control device...
3,2,614,2_\n\ntopic: Hybrid Vehicle Emission Control S...,[\n\ntopic: Hybrid Vehicle Emission Control Sy...,[PROBLEM TO BE SOLVED: To suppress the deterio...
4,3,574,3_\n\ntopic: Hybrid Vehicle Drive Train System...,[\n\ntopic: Hybrid Vehicle Drive Train Systems...,[The present invention relates to a hybrid tra...
...,...,...,...,...,...
419,418,10,418_\n\ntopic: Hybrid Power Systems for Engine...,[\n\ntopic: Hybrid Power Systems for Engineeri...,[The invention relates to an energy-saving hyd...
420,419,10,419_\n\ntopic: Road Surface Gradient Estimatio...,[\n\ntopic: Road Surface Gradient Estimation a...,[PROBLEM TO BE SOLVED: To more properly estima...
421,420,10,420_\n\ntopic: Vehicle Cooling System Design___,"[\n\ntopic: Vehicle Cooling System Design, , ,...","[A vehicle that includes an electric motor (2,..."
422,421,10,421_\n\ntopic: Regenerative Retarder Control S...,[\n\ntopic: Regenerative Retarder Control Syst...,[PURPOSE:To prevent a vehicle from deteriorati...


In [None]:
print(topics)

In [33]:
topic_model.get_topic_info(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,10,288,10_\n\ntopic: Torsional Vibration Damper Techn...,[\n\ntopic: Torsional Vibration Damper Technol...,"[Torsional vibration damper (100, 200), in par..."


In [None]:

with open('topics_llama.txt', 'w') as f:
    # Loop over the topics and write the output to the file
    for topic in range(10):
        topic_info = topic_model.get_topic(topic)
        representative_docs = df[df.topic == topic][:10]['document'].tolist()

        f.write(f"Topic: {topic}\n")
        f.write("Topic Information: \n")
        f.write(f"{topic_info}\n")
        f.write("Representative Documents: \n")
        for doc in representative_docs:
            f.write(f"{doc}\n")
        f.write("\n")

In [57]:
topics_dict = topic_model.get_topics()

In [60]:
for k, v in topics_dict.items():
    print(k, v[0][0].strip())

-1 topic: Hybrid Vehicle Power Systems
0 topic: Hybrid Power Transmission Systems
1 topic: Vehicle Hydraulic Systems
2 topic: Hybrid Vehicle Emission Control Systems
3 topic: Hybrid Vehicle Drive Train Systems
4 topic: Hydraulic Hybrid Systems
5 topic: Hybrid Vehicle Cooling System Design
6 topic: Hybrid Vehicle Drive Control System
7 topic: Hybrid Vehicle Noise Reduction Control
8 topic: Hybrid Electric Vehicle Technology
9 topic: Vibration Damping Control Device
10 topic: Torsional Vibration Damper Technology
11 topic: Vehicle Transmission Systems
12 topic: Power Supply Systems and Control Devices
13 topic: Compressed Air Hybrid Vehicle Technology
14 topic: Fuel-Efficient Route Planning for Hybrid Vehicles
15 topic: Reducing Shift Shock in Vehicle Power Transmission Devices
16 topic: Flywheel Energy Storage Systems
17 topic: Regenerative Braking Control Systems
18 topic: Engine Start Control in Hybrid Vehicles
19 topic: Motor Temperature Regulation
20 topic: Hybrid Transmission Syste

In [64]:

# Reduce outliers using the `probabilities` strategy
topic_model.reduce_outliers(abstracts, topics, probabilities=probs, strategy="probabilities")

[]

In [65]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,23334,-1_\n\ntopic: Hybrid Vehicle Power Systems___,"[\n\ntopic: Hybrid Vehicle Power Systems, , , ...",[PROBLEM TO BE SOLVED: To provide a hybrid veh...
1,0,1355,0_\n\ntopic: Hybrid Power Transmission Systems___,"[\n\ntopic: Hybrid Power Transmission Systems,...",[The invention discloses a double-planet-row d...
2,1,1025,1_\n\ntopic: Vehicle Hydraulic Systems___,"[\n\ntopic: Vehicle Hydraulic Systems, , , , ,...",[To provide a vehicle hydraulic control device...
3,2,614,2_\n\ntopic: Hybrid Vehicle Emission Control S...,[\n\ntopic: Hybrid Vehicle Emission Control Sy...,[PROBLEM TO BE SOLVED: To suppress the deterio...
4,3,574,3_\n\ntopic: Hybrid Vehicle Drive Train System...,[\n\ntopic: Hybrid Vehicle Drive Train Systems...,[The present invention relates to a hybrid tra...
...,...,...,...,...,...
419,418,10,418_\n\ntopic: Hybrid Power Systems for Engine...,[\n\ntopic: Hybrid Power Systems for Engineeri...,[The invention relates to an energy-saving hyd...
420,419,10,419_\n\ntopic: Road Surface Gradient Estimatio...,[\n\ntopic: Road Surface Gradient Estimation a...,[PROBLEM TO BE SOLVED: To more properly estima...
421,420,10,420_\n\ntopic: Vehicle Cooling System Design___,"[\n\ntopic: Vehicle Cooling System Design, , ,...","[A vehicle that includes an electric motor (2,..."
422,421,10,421_\n\ntopic: Regenerative Retarder Control S...,[\n\ntopic: Regenerative Retarder Control Syst...,[PURPOSE:To prevent a vehicle from deteriorati...


In [68]:
topic_model.get_document_info(abstracts)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,A hydraulic regenerative braking and power sup...,-1,-1_\n\ntopic: Hybrid Vehicle Power Systems___,"[\n\ntopic: Hybrid Vehicle Power Systems, , , ...",[PROBLEM TO BE SOLVED: To provide a hybrid veh...,\n\ntopic: Hybrid Vehicle Power Systems - - ...,0.486581,False
1,"Higher, the lower planet row plug-in hybrid-po...",0,0_\n\ntopic: Hybrid Power Transmission Systems___,"[\n\ntopic: Hybrid Power Transmission Systems,...",[The invention discloses a double-planet-row d...,\n\ntopic: Hybrid Power Transmission Systems -...,0.195589,False
2,The traction arrangement comprises one or more...,-1,-1_\n\ntopic: Hybrid Vehicle Power Systems___,"[\n\ntopic: Hybrid Vehicle Power Systems, , , ...",[PROBLEM TO BE SOLVED: To provide a hybrid veh...,\n\ntopic: Hybrid Vehicle Power Systems - - ...,0.902328,False
3,A dual-motor hybrid power driving system compr...,362,362_\n\ntopic: Double-Motor Hybrid Power Trans...,[\n\ntopic: Double-Motor Hybrid Power Transmis...,[A double-motor hybrid power driving system co...,\n\ntopic: Double-Motor Hybrid Power Transmiss...,1.000000,False
4,FIELD: transport.SUBSTANCE: hybrid transmissio...,20,20_\n\ntopic: Hybrid Transmission Systems___,"[\n\ntopic: Hybrid Transmission Systems, , , ,...",[Hybrid transmission device with a first trans...,\n\ntopic: Hybrid Transmission Systems - - -...,0.199330,False
...,...,...,...,...,...,...,...,...
46661,A battery arrangement for a motor vehicle incl...,12,12_\n\ntopic: Power Supply Systems and Control...,[\n\ntopic: Power Supply Systems and Control D...,[The in-vehicle controller of the present disc...,\n\ntopic: Power Supply Systems and Control De...,0.134601,False
46662,The invention relates to a method for operatin...,58,58_\n\ntopic: Hybrid Semi-Trailer Drive System...,"[\n\ntopic: Hybrid Semi-Trailer Drive Systems,...",[PROBLEM TO BE SOLVED: To effectively improve ...,\n\ntopic: Hybrid Semi-Trailer Drive Systems -...,0.011609,False
46663,A torsional vibration damping system (7) for a...,10,10_\n\ntopic: Torsional Vibration Damper Techn...,[\n\ntopic: Torsional Vibration Damper Technol...,"[Torsional vibration damper (100, 200), in par...",\n\ntopic: Torsional Vibration Damper Technolo...,1.000000,False
46664,Various methods and systems are provided for a...,42,42_\n\ntopic: Automatic Transmissions for Elec...,[\n\ntopic: Automatic Transmissions for Electr...,"[Transmission (1) for a vehicle (100), compris...",\n\ntopic: Automatic Transmissions for Electri...,1.000000,False


In [69]:
topic_distr, _ = topic_model.approximate_distribution(abstracts)

100%|██████████| 47/47 [02:06<00:00,  2.70s/it]
