In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# Load dataset
df = pd.read_csv("apple_products.csv")

# Choose the text column (change this if your dataset has a different one)
text_column = "product_name"   # or "description" if available
documents = df[text_column].astype(str).fillna("")

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(documents)

# Define clustering algorithms
algorithms = {
    "KMeans": KMeans(n_clusters=5, random_state=42),
    "Agglomerative": AgglomerativeClustering(n_clusters=5),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5)
}

# Compare performance
for name, algo in algorithms.items():
    if name == "Agglomerative":
        labels = algo.fit_predict(X.toarray())  # Agglomerative needs dense arrays
    else:
        labels = algo.fit_predict(X)

    # For DBSCAN, check if clustering worked
    if len(set(labels)) > 1:
        score = silhouette_score(X, labels)
        print(f"{name} Silhouette Score: {score:.4f}")
    else:
        print(f"{name}: Could not compute Silhouette Score (only one cluster).")


KeyError: 'product_name'

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

# Load dataset
df = pd.read_csv("apple_products.csv")

# Print available columns
print("Available columns:", df.columns.tolist())

# Auto-detect a text column (first non-numeric column)
text_column = None
for col in df.columns:
    if df[col].dtype == "object":
        text_column = col
        break

if text_column is None:
    raise ValueError("No text column found in dataset. Please specify manually.")

print(f"\nUsing text column for clustering: {text_column}")

# Extract text data
documents = df[text_column].astype(str).fillna("")

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(documents)

# Define clustering algorithms
algorithms = {
    "KMeans": KMeans(n_clusters=5, random_state=42),
    "Agglomerative": AgglomerativeClustering(n_clusters=5),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5)
}

# Compare performance
for name, algo in algorithms.items():
    if name == "Agglomerative":
        labels = algo.fit_predict(X.toarray())  # Agglomerative needs dense arrays
    else:
        labels = algo.fit_predict(X)

    # For DBSCAN, check if clustering worked
    if len(set(labels)) > 1:
        score = silhouette_score(X, labels)
        print(f"{name} Silhouette Score: {score:.4f}")
    else:
        print(f"{name}: Could not compute Silhouette Score (only one cluster).")


Available columns: ['Product Name', 'Product URL', 'Brand', 'Sale Price', 'Mrp', 'Discount Percentage', 'Number Of Ratings', 'Number Of Reviews', 'Upc', 'Star Rating', 'Ram']

Using text column for clustering: Product Name
KMeans Silhouette Score: 0.1619
Agglomerative Silhouette Score: 0.1739
DBSCAN: Could not compute Silhouette Score (only one cluster).
