Backup from Google Colab


In [1]:
!pip install bertopic gensim openai llama-cpp-python

Collecting openai
  Downloading openai-1.68.2-py3-none-any.whl (606 kB)
Collecting pydantic<3,>=1.9.0
  Downloading pydantic-2.10.6-py3-none-any.whl (431 kB)
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting httpx<1,>=0.23.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
Collecting sniffio
  Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.9.0-cp38-cp38-win_amd64.whl (198 kB)
Collecting anyio<5,>=3.5.0
  Downloading anyio-4.5.2-py3-none-any.whl (89 kB)
Collecting annotated-types>=0.6.0
  Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Collecting pydantic-core==2.27.2
  Downloading pydantic_core-2.27.2-cp38-cp38-win_amd64.whl (2.0 MB)
Collecting httpcore==1.*
  Downloading httpcore-1.0.7-py3-none-any.whl (78 kB)
Collecting exceptiongroup>=1.0.2; python_version < "3.11"
  Using cached exceptiongroup-1.2.2-py3-none-any.whl (16 kB)
Collecting h11<0.15,>=0.13
  Downloading h11-0.14.0-

In [3]:
import pandas as pd
import re
import nltk
import numpy as np
import hdbscan
import umap
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Download required resources
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
csv_path = "C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/topic_model_sample_391k.csv"
df = pd.read_csv(csv_path)

In [5]:
# Ensure correct column name
if "Text" not in df.columns:
    raise ValueError("Column 'Text' not found in dataset")

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Text preprocessing function (Stopword removal is disabled to retain context)
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', str(text))  # Remove extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove special characters, numbers
    words = simple_preprocess(text)  # Tokenize
    words = [lemmatizer.lemmatize(word) for word in words if len(word) > 2]
    return " ".join(words)

# Apply text preprocessing
df["processed_text"] = df["Text"].apply(preprocess_text)


In [7]:
from bertopic import BERTopic
from bertopic.representation import LlamaCPP
from llama_cpp import Llama

# Custom prompt for LlamaCPP
summ_prompt = """
Q: I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you give a short label of the topic?
A:
"""

# Load the LLaMA model locally
llm = Llama(model_path="C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/LLM/zephyr-7b-alpha.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=8192, stop="Q:")

# Create the representation model using LlamaCPP
representation_model = LlamaCPP(llm, prompt=summ_prompt)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/LLM/zephyr-7b-alpha.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-alpha
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loa

llama_kv_cache_init: layer 12: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 13: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 14: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 15: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 16: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 17: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 18: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 19: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 20: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 21: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 22: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 23: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 24: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024
llama_kv_cache_init: layer 25: n_embd_k_gqa = 1024, n_embd_v_gqa

In [28]:

# Load embedding model (optimized for speed & performance)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

# ✅ Optimized UMAP: Reduce dimensionality while retaining local structure
umap_model = umap.UMAP(
    n_neighbors=10,  # Reduced to retain local document similarities
    n_components=10,  # Increased from 5 to preserve more information
    metric='cosine',
    random_state=42
)

# ✅ Optimized HDBSCAN: Reduce outlier sensitivity
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=500,  # Reduced from 1000 (allows smaller clusters)
    min_samples=5,  # Reduced from 10 (reduces noise sensitivity)
    metric='euclidean',
    cluster_selection_method='leaf',  # More fine-grained clustering
    prediction_data=True
)

# ✅ Optimized CountVectorizer: Retain rare but meaningful words
vectorizer_model = CountVectorizer(
    max_df=0.98,  # Increased to retain more common words
    min_df=3,  # Reduced to allow rarer words
    stop_words='english',  # Stopwords removal
    ngram_range=(1, 2)  # Allow bi-grams for better topic coherence
)

#Explicit classbased TF-IDF representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Initialize BERTopic with optimized settings
bertopic_model = BERTopic(
    representation_model=representation_model,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    verbose=True
)

# Fit BERTopic model
topics, probs = bertopic_model.fit_transform(df["processed_text"].tolist())

# Extract topics, filtering out -1 (outliers)
topic_words_list = [
    [word for word, _ in bertopic_model.get_topic(topic_id)]
    for topic_id in bertopic_model.get_topics().keys() if topic_id != -1
]



2025-03-22 23:42:17,109 - BERTopic - Embedding - Transforming documents to embeddings.


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=12221.0), HTML(value='')))




2025-03-22 23:45:24,311 - BERTopic - Embedding - Completed ✓
2025-03-22 23:45:24,311 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-22 23:54:11,552 - BERTopic - Dimensionality - Completed ✓
2025-03-22 23:54:11,576 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-23 00:04:10,733 - BERTopic - Cluster - Completed ✓
2025-03-23 00:04:10,867 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|                                                                                          | 0/152 [00:00<?, ?it/s]llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   75108.63 ms /  1630 tokens (   46.08 ms per token,    21.70 tokens per second)
llama_perf_context_print:        eval time =    2478.85 ms /    15 runs   (  165.26 ms per token,     6.05 tokens per second)
llama_perf_context_print:       total time =   78218.58 ms /  1645 tokens
  1%|▌ 

llama_perf_context_print:        eval time =    2326.92 ms /    15 runs   (  155.13 ms per token,     6.45 tokens per second)
llama_perf_context_print:       total time =   70191.82 ms /  1512 tokens
  9%|███████▎                                                                       | 14/152 [14:13<2:46:16, 72.29s/it]Llama.generate: 17 prefix-match hit, remaining 1401 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   63103.65 ms /  1401 tokens (   45.04 ms per token,    22.20 tokens per second)
llama_perf_context_print:        eval time =    2297.95 ms /    15 runs   (  153.20 ms per token,     6.53 tokens per second)
llama_perf_context_print:       total time =   65411.29 ms /  1416 tokens
 10%|███████▊                                                                       | 15/152 [15:18<2:40:22, 70.24s/it]Llama.generate: 17 prefix-match hit, remaining 1096 prompt tokens to eval
llama_perf_context_print:    

llama_perf_context_print:       total time =   65574.06 ms /  1427 tokens
 18%|██████████████▌                                                                | 28/152 [27:44<1:56:53, 56.56s/it]Llama.generate: 17 prefix-match hit, remaining 2018 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   93333.93 ms /  2018 tokens (   46.25 ms per token,    21.62 tokens per second)
llama_perf_context_print:        eval time =    2349.15 ms /    15 runs   (  156.61 ms per token,     6.39 tokens per second)
llama_perf_context_print:       total time =   95699.36 ms /  2033 tokens
 19%|███████████████                                                                | 29/152 [29:20<2:20:01, 68.31s/it]Llama.generate: 17 prefix-match hit, remaining 838 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   37070.88 ms /   838 tokens (   44.24 ms per token,

 28%|█████████████████████▊                                                         | 42/152 [44:03<2:10:03, 70.94s/it]Llama.generate: 18 prefix-match hit, remaining 1352 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   63352.37 ms /  1352 tokens (   46.86 ms per token,    21.34 tokens per second)
llama_perf_context_print:        eval time =    2619.31 ms /    15 runs   (  174.62 ms per token,     5.73 tokens per second)
llama_perf_context_print:       total time =   65981.80 ms /  1367 tokens
 28%|██████████████████████▎                                                        | 43/152 [45:09<2:06:10, 69.46s/it]Llama.generate: 17 prefix-match hit, remaining 893 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   40316.75 ms /   893 tokens (   45.15 ms per token,    22.15 tokens per second)
llama_perf_context_print:        eval time = 

 37%|█████████████████████████████                                                  | 56/152 [59:30<1:49:52, 68.67s/it]Llama.generate: 17 prefix-match hit, remaining 1164 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   51717.80 ms /  1164 tokens (   44.43 ms per token,    22.51 tokens per second)
llama_perf_context_print:        eval time =    2314.62 ms /    15 runs   (  154.31 ms per token,     6.48 tokens per second)
llama_perf_context_print:       total time =   54042.54 ms /  1179 tokens
 38%|████████████████████████████▉                                                | 57/152 [1:00:24<1:41:47, 64.29s/it]Llama.generate: 17 prefix-match hit, remaining 2046 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   94230.40 ms /  2046 tokens (   46.06 ms per token,    21.71 tokens per second)
llama_perf_context_print:        eval time =

 46%|███████████████████████████████████▍                                         | 70/152 [1:13:01<1:16:06, 55.69s/it]Llama.generate: 17 prefix-match hit, remaining 1619 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   73472.40 ms /  1619 tokens (   45.38 ms per token,    22.04 tokens per second)
llama_perf_context_print:        eval time =    2323.40 ms /    15 runs   (  154.89 ms per token,     6.46 tokens per second)
llama_perf_context_print:       total time =   75805.15 ms /  1634 tokens
 47%|███████████████████████████████████▉                                         | 71/152 [1:14:17<1:23:21, 61.75s/it]Llama.generate: 17 prefix-match hit, remaining 1057 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   47390.24 ms /  1057 tokens (   44.83 ms per token,    22.30 tokens per second)
llama_perf_context_print:        eval time =

 55%|██████████████████████████████████████████▌                                  | 84/152 [1:26:51<1:00:11, 53.11s/it]Llama.generate: 17 prefix-match hit, remaining 1108 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   49363.18 ms /  1108 tokens (   44.55 ms per token,    22.45 tokens per second)
llama_perf_context_print:        eval time =    2219.88 ms /    15 runs   (  147.99 ms per token,     6.76 tokens per second)
llama_perf_context_print:       total time =   51598.27 ms /  1123 tokens
 56%|████████████████████████████████████████████▏                                  | 85/152 [1:27:43<58:48, 52.66s/it]Llama.generate: 17 prefix-match hit, remaining 761 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   33482.59 ms /   761 tokens (   44.00 ms per token,    22.73 tokens per second)
llama_perf_context_print:        eval time = 

 64%|██████████████████████████████████████████████████▉                            | 98/152 [1:39:52<59:10, 65.75s/it]Llama.generate: 17 prefix-match hit, remaining 968 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   42764.01 ms /   968 tokens (   44.18 ms per token,    22.64 tokens per second)
llama_perf_context_print:        eval time =    2210.40 ms /    15 runs   (  147.36 ms per token,     6.79 tokens per second)
llama_perf_context_print:       total time =   44983.97 ms /   983 tokens
 65%|███████████████████████████████████████████████████▍                           | 99/152 [1:40:37<52:34, 59.52s/it]Llama.generate: 17 prefix-match hit, remaining 1357 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   61421.69 ms /  1357 tokens (   45.26 ms per token,    22.09 tokens per second)
llama_perf_context_print:        eval time = 

 74%|█████████████████████████████████████████████████████████▍                    | 112/152 [1:53:32<39:19, 58.98s/it]Llama.generate: 17 prefix-match hit, remaining 1606 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   73100.71 ms /  1606 tokens (   45.52 ms per token,    21.97 tokens per second)
llama_perf_context_print:        eval time =    2357.61 ms /    15 runs   (  157.17 ms per token,     6.36 tokens per second)
llama_perf_context_print:       total time =   75468.06 ms /  1621 tokens
 74%|█████████████████████████████████████████████████████████▉                    | 113/152 [1:54:48<41:33, 63.93s/it]Llama.generate: 17 prefix-match hit, remaining 959 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   42613.06 ms /   959 tokens (   44.43 ms per token,    22.50 tokens per second)
llama_perf_context_print:        eval time = 

 83%|████████████████████████████████████████████████████████████████▋             | 126/152 [2:08:09<25:56, 59.87s/it]Llama.generate: 17 prefix-match hit, remaining 984 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   43570.07 ms /   984 tokens (   44.28 ms per token,    22.58 tokens per second)
llama_perf_context_print:        eval time =    2244.69 ms /    15 runs   (  149.65 ms per token,     6.68 tokens per second)
llama_perf_context_print:       total time =   45824.02 ms /   999 tokens
 84%|█████████████████████████████████████████████████████████████████▏            | 127/152 [2:08:55<23:11, 55.66s/it]Llama.generate: 17 prefix-match hit, remaining 1801 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   82220.35 ms /  1801 tokens (   45.65 ms per token,    21.90 tokens per second)
llama_perf_context_print:        eval time = 

 92%|███████████████████████████████████████████████████████████████████████▊      | 140/152 [2:21:34<12:06, 60.58s/it]Llama.generate: 17 prefix-match hit, remaining 1780 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   81315.86 ms /  1780 tokens (   45.68 ms per token,    21.89 tokens per second)
llama_perf_context_print:        eval time =    2335.56 ms /    15 runs   (  155.70 ms per token,     6.42 tokens per second)
llama_perf_context_print:       total time =   83677.25 ms /  1795 tokens
 93%|████████████████████████████████████████████████████████████████████████▎     | 141/152 [2:22:58<12:22, 67.51s/it]Llama.generate: 17 prefix-match hit, remaining 1320 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   59237.30 ms /  1320 tokens (   44.88 ms per token,    22.28 tokens per second)
llama_perf_context_print:        eval time =

In [29]:
# Define save path
save_path = "C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/Clean/Experiment_3 - base bertopic + cTFIDF + LLM Rep/bertopic_model_v5"

# # Save the trained BERTopic model
bertopic_model.save(save_path)
# print(f"Model saved to {save_path}")



In [30]:
from bertopic import BERTopic

# Load the saved model
bertopic_model = BERTopic.load(save_path)
print("Model loaded successfully!")

Model loaded successfully!


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Get the c-TF-IDF matrix and vocabulary
ctfidf = bertopic_model.c_tf_idf_
vocab = bertopic_model.vectorizer_model.get_feature_names_out()

# Get topic-term weights for each topic
topic_words_list = []
for row in ctfidf:
    # Get indices of top 10 terms per topic
    top_indices = row.toarray().flatten().argsort()[::-1][:10]
    top_words = [vocab[i] for i in top_indices]
    topic_words_list.append(top_words)


In [32]:
for i, topic in enumerate(topic_words_list[:5]):
    print(f"Topic {i}: {topic}")


Topic 0: ['bar', 'cat', 'blend', 'chocolate', 'candy', 'roast', 'keurig', 'dark', 'green', 'chicken']
Topic 1: ['dog food', 'dog', 'food dog', 'beneful', 'puppy', 'kibble', 'stool', 'coat', 'dry dog', 'feeding']
Topic 2: ['email', 'business', 'timely', 'wa exactly', 'product arrived', 'seller', 'transaction', 'vendor', 'item wa', 'great service']
Topic 3: ['popcorn', 'kernel', 'popper', 'theater', 'movie', 'popped', 'pop', 'popping', 'movie theater', 'microwave popcorn']
Topic 4: ['love tea', 'tea bag', 'tea', 'tea wa', 'flowering', 'flowering tea', 'teapot', 'loose', 'loose tea', 'box tea']


In [33]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Flatten multi-word phrases into individual tokens
flattened_topic_words_list = []
for topic in topic_words_list:
    clean_topic = []
    for word in topic:
        clean_topic.extend(word.split())  # split multi-word expressions like "pop chip"
    flattened_topic_words_list.append(clean_topic)

# Tokenize documents
tokenized_docs = df["processed_text"].apply(lambda x: x.split()).tolist()
dictionary = Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.8)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Compute coherence
coherence_model = CoherenceModel(
    topics=flattened_topic_words_list,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print(f"Topic Coherence (c_v): {coherence_score:.4f}")


Topic Coherence (c_v): 0.7720


In [34]:
# Compute Topic Diversity
all_words = [word for topic in topic_words_list for word in topic]
unique_words = set(all_words)
diversity_score = len(unique_words) / len(all_words) if len(all_words) > 0 else 0

# Print results
print(f"Coherence Score (c_v): {coherence_score}")
print(f"Diversity Score: {diversity_score}")
print(f"Number of Topics: {len(topic_words_list)}")

Coherence Score (c_v): 0.772043193776765
Diversity Score: 0.9210526315789473
Number of Topics: 152


In [35]:
# Get topic frequency from BERTopic
topic_freq = bertopic_model.get_topic_freq()

# Remove outlier topic (-1)
topic_freq = topic_freq[topic_freq.Topic != -1]

# Get top 10 topic IDs by frequency
top_10_topic_ids = topic_freq.head(10)["Topic"].tolist()

topic_insights = []
for topic_id in top_10_topic_ids:
    keywords = topic_words_list[topic_id]
    keywords_str = ", ".join(keywords)
    
    reps = bertopic_model.get_representative_docs(topic_id)
    example_doc = reps[0][:300] if reps else "N/A"

    prompt = f"""
You are an NLP expert working for Amazon under the Fine Food Departments. Here's the output of a topic modeling analysis.

Topic {topic_id} Keywords: {keywords_str}

Representative Document: "{example_doc}"

Topic Coherence Score: {coherence_score:.4f}
Topic Diversity Score: {diversity_score:.4f}

Please:
1. Briefly explain what this topic is about.
2. Suggest how this topic might be useful for Amazon and how the Fine Food Departments, in terms of product characteristic or demands.

Be concise and insightful.
"""

    result = llm(prompt, max_tokens=300)
    topic_insights.append({
        "Topic": topic_id,
        "Keywords": keywords_str,
        "Insight": result["choices"][0]["text"]
    })


Llama.generate: 3 prefix-match hit, remaining 216 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =   23937.06 ms /   216 tokens (  110.82 ms per token,     9.02 tokens per second)
llama_perf_context_print:        eval time =   36379.66 ms /   195 runs   (  186.56 ms per token,     5.36 tokens per second)
llama_perf_context_print:       total time =   60961.99 ms /   411 tokens
Llama.generate: 35 prefix-match hit, remaining 176 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =    7517.76 ms /   176 tokens (   42.71 ms per token,    23.41 tokens per second)
llama_perf_context_print:        eval time =   43937.60 ms /   299 runs   (  146.95 ms per token,     6.81 tokens per second)
llama_perf_context_print:       total time =   51814.68 ms /   475 tokens
Llama.generate: 35 prefix-match hit, remaining 184 prompt tokens to eval
llama_perf_c

In [37]:
import pandas as pd

insight_df = pd.DataFrame(topic_insights)
insight_df.to_csv("C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/Clean/Experiment_3 - base bertopic + cTFIDF + LLM Rep/top_10_llm_topic_insights_v1.csv", index=False)

In [38]:
# Get topic frequency from BERTopic
topic_freq = bertopic_model.get_topic_freq()

# Remove outlier topic (-1)
topic_freq = topic_freq[topic_freq.Topic != -1]

# Get top 30 topic IDs by frequency
top_30_topic_ids = topic_freq.head(30)["Topic"].tolist()

topic_insights = []
for topic_id in top_30_topic_ids:
    keywords = topic_words_list[topic_id]
    keywords_str = ", ".join(keywords)
    
    reps = bertopic_model.get_representative_docs(topic_id)
    example_doc = reps[0][:300] if reps else "N/A"

    prompt = f"""
You are an NLP expert working for Amazon under the Fine Food Departments. Here's the output of a topic modeling analysis.

Topic {topic_id} Keywords: {keywords_str}

Representative Document: "{example_doc}"

Topic Coherence Score: {coherence_score:.4f}
Topic Diversity Score: {diversity_score:.4f}

Please:
1. Briefly explain what this topic is about.
2. Suggest how this topic might be useful for Amazon and how the Fine Food Departments, in terms of product characteristic or demands.

Be concise and insightful.
"""

    result = llm(prompt, max_tokens=300)
    topic_insights.append({
        "Topic": topic_id,
        "Keywords": keywords_str,
        "Insight": result["choices"][0]["text"]
    })

Llama.generate: 35 prefix-match hit, remaining 184 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =    6534.07 ms /   184 tokens (   35.51 ms per token,    28.16 tokens per second)
llama_perf_context_print:        eval time =   22527.08 ms /   160 runs   (  140.79 ms per token,     7.10 tokens per second)
llama_perf_context_print:       total time =   29129.13 ms /   344 tokens
Llama.generate: 35 prefix-match hit, remaining 176 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =    6697.25 ms /   176 tokens (   38.05 ms per token,    26.28 tokens per second)
llama_perf_context_print:        eval time =   35737.26 ms /   254 runs   (  140.70 ms per token,     7.11 tokens per second)
llama_perf_context_print:       total time =   42814.79 ms /   430 tokens
Llama.generate: 35 prefix-match hit, remaining 184 prompt tokens to eval
llama_perf_

Llama.generate: 36 prefix-match hit, remaining 199 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =    8654.29 ms /   199 tokens (   43.49 ms per token,    22.99 tokens per second)
llama_perf_context_print:        eval time =   42322.36 ms /   299 runs   (  141.55 ms per token,     7.06 tokens per second)
llama_perf_context_print:       total time =   51905.59 ms /   498 tokens
Llama.generate: 36 prefix-match hit, remaining 191 prompt tokens to eval
llama_perf_context_print:        load time =   75643.54 ms
llama_perf_context_print: prompt eval time =    8233.75 ms /   191 tokens (   43.11 ms per token,    23.20 tokens per second)
llama_perf_context_print:        eval time =   42411.31 ms /   299 runs   (  141.84 ms per token,     7.05 tokens per second)
llama_perf_context_print:       total time =   50775.16 ms /   490 tokens
Llama.generate: 35 prefix-match hit, remaining 189 prompt tokens to eval
llama_perf_

In [39]:
import pandas as pd

insight_df = pd.DataFrame(topic_insights)
insight_df.to_csv("C:/Users/User/OneDrive - Singapore Management University/Semester 5/ISSS609 Text Analytics and Applications/Group Project/Amazon Fine Food Reviews/Clean/Experiment_3 - base bertopic + cTFIDF + LLM Rep/top_30_llm_topic_insights_v1.csv", index=False)

In [None]:
# Get top topic frequencies, excluding -1
topic_freq = bertopic_model.get_topic_freq()
topic_freq = topic_freq[topic_freq["Topic"] != -1].head(20)

# Plot
plt.figure(figsize=(12, 6))
plt.bar(topic_freq["Topic"].astype(str), topic_freq["Count"], color="skyblue")
plt.xlabel("Topic Number")
plt.ylabel("Number of Documents")
plt.title("Top 20 Topics by Frequency (Excluding Outliers)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Get top topic frequencies, including -1, outliers
topic_freq = bertopic_model.get_topic_freq().head(20)

plt.figure(figsize=(12, 6))
plt.bar(topic_freq["Topic"].astype(str), topic_freq["Count"], color="skyblue")
plt.xlabel("Topic Number")
plt.ylabel("Number of Documents")
plt.title("Top 10 Topics by Frequency")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Ensure 'Assigned_Topic' column exists
if "Assigned_Topic" not in df.columns:
    df["Assigned_Topic"] = topics  # Assign topics

# Extract sample reviews for topics -1 to 19
for topic_num in range(-1, 20):
    print(f"\n🔹 Sample Reviews for Topic {topic_num}:")

    # Filter the DataFrame for the given topic
    topic_reviews = df[df["Assigned_Topic"] == topic_num]["Text"]

    # Check if reviews exist for the topic
    if not topic_reviews.empty:
        sample_reviews = topic_reviews.sample(min(3, len(topic_reviews)), random_state=42).tolist()
    else:
        sample_reviews = ["No reviews available for this topic."]

    print(sample_reviews)


In [None]:
docs = df["processed_text"].apply(lambda x: " ".join(x)).tolist()
timestamps = df["DateTime"].tolist()
topics, _ = bertopic_model.transform(docs)

# Add topic & timestamp back into a DataFrame
df_topics = pd.DataFrame({
    "Document": docs,
    "Timestamp": timestamps,
    "Topic": topics
})

# Get top 10 topics
top_10_topic_ids = (
    bertopic_model.get_topic_freq()
    .query("Topic != -1")
    .head(10)["Topic"]
    .tolist()
)

# Filter to only those documents
filtered_df = df_topics[df_topics.Topic.isin(top_10_topic_ids)]

# Then recompute topics_over_time on only those docs
topics_over_time_filtered = bertopic_model.topics_over_time(
    docs=filtered_df["Document"].tolist(),
    timestamps=filtered_df["Timestamp"].tolist(),
    topics=filtered_df["Topic"].tolist(),
    nr_bins=15
)

# Plot
fig = bertopic_model.visualize_topics_over_time(
    topics_over_time_filtered,
    topics=top_10_topic_ids
)
fig.show()
