In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Read the review dataset into a DataFrame and preview its first rows.
# The file name is a placeholder: replace with actual path.
reviews_csv = 'simulated_amazon_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()


In [None]:

# Keep only the rows that have review text and lower-case that text.
# Written as one non-mutating chain instead of `inplace=True` plus a column
# assignment: the frame loaded above is left untouched and `df` is simply
# rebound to the cleaned result, so re-running this cell is always safe.
# The index is deliberately not reset; surviving rows keep their original labels.
df = (
    df.dropna(subset=['review_text'])
      .assign(review_text=lambda frame: frame['review_text'].str.lower())
)


In [None]:

# Convert the review text into a sparse TF-IDF document-term matrix.
# Terms found in more than 80% of the reviews or in fewer than 5 reviews are
# discarded, together with scikit-learn's built-in English stop words.
vectorizer_options = {'max_df': 0.8, 'min_df': 5, 'stop_words': 'english'}
tfidf = TfidfVectorizer(**vectorizer_options)
X_tfidf = tfidf.fit_transform(df['review_text'])


In [None]:

# Latent semantic analysis: project the TF-IDF matrix onto 100 SVD components,
# then L2-normalise every row of the reduced matrix.
# TruncatedSVD uses a randomized solver by default, so without a fixed seed the
# embedding (and every cluster assignment derived from it) changes from run to
# run. The seed is pinned to 42, the same value this notebook uses for KMeans.
svd = TruncatedSVD(n_components=100, random_state=42)
normalizer = Normalizer(copy=False)  # normalise in place; no extra copy of the reduced matrix
lsa = make_pipeline(svd, normalizer)
X_lsa = lsa.fit_transform(X_tfidf)


In [None]:

# Group the LSA vectors into 10 clusters (fixed seed) and store each review's
# cluster id both in `labels` and in a new `cluster` column of the frame.
kmeans = KMeans(n_clusters=10, random_state=42)
df['cluster'] = labels = kmeans.fit_predict(X_lsa)
# Preview the text next to its assigned cluster.
df[['review_text', 'cluster']].head()


In [None]:

# For demonstration only: score the clustering against the `category` column,
# which is treated as ground truth.
#
# K-means cluster ids are arbitrary integers with no relation to the category
# codes, so comparing the two directly yields meaningless scores. Each cluster
# is therefore relabelled with the category that is most common among its
# members (majority vote) before precision / recall / F1 are computed.
true_labels = df['category'].astype('category').cat.codes.values  # missing categories become code -1

# Contingency table: one row per cluster id, one column per category code.
cluster_vs_category = pd.crosstab(labels, true_labels)
# For every cluster, the category code with the highest count.
cluster_to_category = cluster_vs_category.idxmax(axis=1)
predicted_labels = pd.Series(labels).map(cluster_to_category).values

# Categories that are never the majority of any cluster have no predictions;
# zero_division=0 scores them as 0 without emitting a warning.
precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='macro', zero_division=0)
precision, recall, f1
