In [None]:
import pandas as pd
from langchain_community.llms import Ollama
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Load the review dataset from a CSV file (relative path, resolved against the working directory).
df = pd.read_csv('Cleaned_reviews.csv')  # Update the path to point at your copy of the data
# Column of review text; each entry is later interpolated into the LLM prompt.
texts = df['review_body']

# Initialize the Ollama LLM client used by generate_category.
# NOTE(review): presumably needs a reachable Ollama server with the "llama3" model available — confirm.
ollama = Ollama(model="llama3")

def generate_category(text):
    """Ask the LLM for a single descriptive theme for one review.

    Args:
        text: Review text to embed at the end of the prompt.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    prompt = f"These reviews are for an Amazon Kindle Fire Tablet. Read each review and identify a single, descriptive theme or topic:\n\n{text}"
    response = ollama.invoke(prompt)  # use invoke instead of predict (which is deprecated)
    # The Ollama LLM wrapper returns a plain str from invoke(); a str has no
    # `.content` attribute and nothing to decode. Fall back to `.content` only
    # if a message-like object comes back (e.g. a chat model is swapped in).
    reply = response if isinstance(response, str) else getattr(response, "content", response)
    if isinstance(reply, bytes):
        reply = reply.decode()
    return str(reply).strip()

# Apply LLM to generate themes for each review using parallel processing.
# All reviews are submitted up front so the worker threads run concurrently.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(generate_category, text) for text in texts]
    # Collect results in submission order so each theme lines up with its own
    # review row. as_completed() yields Future objects in completion order,
    # which would both misalign rows and store Futures instead of strings.
    # .result() also re-raises any exception from a worker instead of hiding it.
    df['theme'] = [future.result() for future in tqdm(futures, total=len(futures))]

In [None]:
# Use KMeans clustering over TF-IDF features to group similar themes together.
# Vectorize the theme text, dropping English stop words.
# NOTE(review): assumes df['theme'] holds plain strings — confirm against the cell that fills it.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['theme'])
# random_state is fixed so the clustering is seeded the same way on every run.
kmeans = KMeans(n_clusters=10, random_state=42)  # You can adjust the number of clusters as needed
# Fit the model and obtain one cluster label per row of X.
clusters = kmeans.fit_predict(X)
df['category'] = clusters

# Display the first rows of the updated dataframe (now including the 'category' column)
print(df.head())