In [23]:
import json
import json
from collections import defaultdict
import gzip
import random
from tqdm import tqdm
import argparse
import os
from typing import Dict, Any, List
import pandas as pd

In [4]:
# Location of the JSON file holding the clustering output; it is loaded with
# read_json and then passed to create_cluster_dataframe as the `clusters` mapping.
pred_items_path = "clustering_results/prediction_item_names_per_cluster.json"

In [24]:
def read_json(file):
    """
    Load a JSON document from disk and return the parsed object.

    Parameters:
    -----------
    file : str or path-like
        Path of the JSON file to read

    Returns:
    --------
    Any
        The deserialized JSON content (dict, list, or scalar)

    Raises:
    -------
    OSError
        If the file cannot be opened
    json.JSONDecodeError
        If the file content is not valid JSON
    """
    # The handle gets its own name so the `file` argument is not shadowed, and the
    # encoding is pinned to UTF-8 so the result does not depend on the platform
    # default locale.
    with open(file, 'r', encoding='utf-8') as handle:
        return json.load(handle)
    
def extract_meta_data(path):
    """
    Build an item-metadata lookup from a gzip-compressed JSON-lines file.

    Each line is parsed as one JSON object; its 'asin' value becomes the key and
    its 'title', 'brand', 'category' and 'price' values become the attributes.
    The 'category' value is joined with single spaces into one string.

    Parameters:
    -----------
    path : str or path-like
        Path of the gzip file to read

    Returns:
    --------
    dict
        Mapping asin -> {'title', 'brand', 'category', 'price'}; a later line
        with the same asin replaces an earlier one

    Raises:
    -------
    KeyError
        If a record lacks any of the fields read below
    """
    catalog = {}
    with gzip.open(path) as stream:
        # tqdm only wraps the iteration to report progress.
        for raw_record in tqdm(stream):
            record = json.loads(raw_record)

            item_key = record['asin']
            joined_category = ' '.join(record['category'])
            item_brand = record['brand']
            item_title = record['title']
            item_price = record["price"]

            catalog[item_key] = {
                'title': item_title,
                'brand': item_brand,
                'category': joined_category,
                'price': item_price,
            }

    return catalog

def create_cluster_dataframe(clusters: Dict[str, List[str]], 
                           meta_data: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
    """
    Create a pandas DataFrame from cluster assignments and item metadata.

    One row is produced per (cluster, item) pair, in the iteration order of
    `clusters`. Items without an entry in `meta_data` still get a row; every
    known metadata key is set to None for them.

    Parameters:
    -----------
    clusters : Dict[str, List[str]]
        Dictionary where keys are cluster labels and values are lists of item IDs
    meta_data : Dict[str, Dict[str, Any]]
        Dictionary where keys are item IDs and values are metadata dictionaries

    Returns:
    --------
    pd.DataFrame
        DataFrame with columns item_id, cluster_label, followed by the metadata
        columns in sorted order. When `clusters` contains no items, an empty
        DataFrame with those columns is returned.
    """
    base_columns = ['item_id', 'cluster_label']

    # Union of all metadata keys, computed once up front. It is needed for every
    # item that lacks metadata, so rebuilding it inside the loop would rescan the
    # whole metadata dictionary per missing item.
    all_meta_keys = set()
    for meta in meta_data.values():
        all_meta_keys.update(meta.keys())

    rows = []
    for cluster_label, item_ids in clusters.items():
        for item_id in item_ids:
            row = {'item_id': item_id, 'cluster_label': cluster_label}

            if item_id in meta_data:
                row.update(meta_data[item_id])
            else:
                # No metadata for this item: add explicit None placeholders
                # without touching item_id / cluster_label.
                for key in all_meta_keys:
                    row.setdefault(key, None)

            rows.append(row)

    if not rows:
        # pd.DataFrame([]) has no columns, so selecting the base columns below
        # would raise; return a correctly-shaped empty frame instead.
        meta_columns = sorted(key for key in all_meta_keys if key not in base_columns)
        return pd.DataFrame(columns=base_columns + meta_columns)

    df = pd.DataFrame(rows)

    # Ensure consistent column order (item_id, cluster_label, then metadata columns)
    other_columns = [col for col in df.columns if col not in base_columns]
    return df[base_columns + sorted(other_columns)]

In [25]:
# Load the clustering output; it is passed to create_cluster_dataframe as the
# cluster-label -> item-ID-list mapping.
pred_items = read_json(pred_items_path)

In [26]:
# Parse the gzipped JSON-lines metadata file into an asin -> attributes lookup
# (title, brand, category, price).
meta_data = extract_meta_data("data/01_raw/Industrial_and_Scientific_metadata.jsonl.gz")

167442it [00:06, 26793.48it/s]


In [28]:
# Combine cluster assignments with item metadata: one row per (cluster, item).
df = create_cluster_dataframe(pred_items, meta_data)

In [33]:
# Rows assigned to the cluster labelled "0" (labels are compared as strings here).
df_label_0 = df.loc[df["cluster_label"] == "0"]

In [36]:
# Number of distinct (non-null) titles among the items in cluster "0".
df_label_0["title"].nunique()

414

In [46]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from typing import Dict, List, Any

def train_categorical_decision_tree(df, max_depth=4, min_samples_split=20, filename="categorical_tree.png"):
    """
    Train a decision tree on categorical features (brand, title, category) to predict cluster labels.

    The three columns are one-hot encoded, a DecisionTreeClassifier is fitted on
    the full frame (no hold-out split), a summary is printed, and a rendering of
    the top levels of the tree is written to `filename`.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with columns: item_id, cluster_label, title, brand, category.
        `cluster_label` must be convertible to int. The input is not modified.
    max_depth : int
        Maximum depth of the decision tree
    min_samples_split : int
        Minimum samples required to split a node
    filename : str
        Filename to save the tree visualization

    Returns:
    --------
    clf : DecisionTreeClassifier
        Trained decision tree classifier
    encoder : OneHotEncoder
        Fitted one-hot encoder for the features
    feature_names : numpy.ndarray of str
        Feature names after one-hot encoding, aligned with the encoded columns
    """
    categorical_features = ['title', 'brand', 'category']

    # Work on a copy so the caller's frame keeps its missing values and string labels.
    df_clean = df.copy()

    # Missing values become their own 'Unknown' category instead of NaN.
    for column in categorical_features:
        df_clean[column] = df_clean[column].fillna('Unknown')

    # Convert cluster labels to integers
    df_clean['cluster_label'] = df_clean['cluster_label'].astype(int)

    # One-hot encode categorical features.
    # NOTE(review): sparse_output=False requests a dense array with one column
    # per distinct value of each feature; with near-unique titles this can get
    # large -- consider a sparse encoding if memory becomes an issue.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = encoder.fit_transform(df_clean[categorical_features])

    # Get feature names after encoding
    feature_names = encoder.get_feature_names_out(categorical_features)

    # Target variable
    y = df_clean['cluster_label']

    print(f"Training data shape after one-hot encoding: {X_encoded.shape}")
    print(f"Number of clusters: {y.nunique()}")
    print(f"Number of features after encoding: {len(feature_names)}")

    # Train decision tree
    clf = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
        criterion='gini'
    )
    clf.fit(X_encoded, y)

    # Print feature importance (top 10 most important features)
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': clf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    # Class names in ascending label order, one per distinct cluster label.
    class_names = [f"Cluster {label}" for label in sorted(y.unique())]

    # Shorten long feature names so node labels stay readable in the figure.
    # The tree is plotted with ALL features; only the labels are truncated.
    simplified_feature_names = [
        name[:27] + "..." if len(name) > 30 else name
        for name in feature_names
    ]

    # Plot and save the tree. The figure is closed in `finally` so a failure
    # while drawing or saving does not leave an open figure behind.
    fig, ax = plt.subplots(figsize=(30, 20))
    try:
        plot_tree(
            clf,
            feature_names=simplified_feature_names,
            class_names=class_names,
            filled=True,
            rounded=True,
            fontsize=6,
            max_depth=2,  # Limit display depth for readability
            ax=ax
        )
        ax.set_title("Decision Tree for Cluster Classification\n(Based on One-Hot Encoded Title, Brand, Category)", fontsize=16)
        fig.tight_layout()
        fig.savefig(filename, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    print(f"\nDecision tree saved as: {filename}")

    # Accuracy is measured on the training data itself, so it is an optimistic
    # estimate rather than a generalization score.
    y_pred = clf.predict(X_encoded)
    accuracy = accuracy_score(y, y_pred)
    print(f"Training accuracy: {accuracy:.3f}")

    return clf, encoder, feature_names

def predict_cluster_with_tree(clf, encoder, feature_names, title, brand, category):
    """
    Predict cluster for a new item using the trained decision tree.

    Parameters:
    -----------
    clf : DecisionTreeClassifier
        Trained decision tree
    encoder : OneHotEncoder
        Fitted one-hot encoder
    feature_names : list
        List of feature names after encoding. Not used by this function; kept
        so the signature stays compatible with existing callers.
    title, brand, category : str
        Item features. Falsy values (None, '') are replaced by 'Unknown', the
        same placeholder used for missing values at training time.

    Returns:
    --------
    Predicted cluster label, as returned by clf.predict for the single input row
    """

    # Prepare the input data; column order must match the order used for fitting.
    input_data = pd.DataFrame({
        'title': [title or 'Unknown'],
        'brand': [brand or 'Unknown'], 
        'category': [category or 'Unknown']
    })

    # One-hot encode the input
    X_new = encoder.transform(input_data)

    # Make prediction
    prediction = clf.predict(X_new)[0]
    prediction_proba = clf.predict_proba(X_new)[0]

    # predict_proba columns follow clf.classes_, so key the probabilities by the
    # actual class labels; positional indices would mislabel them whenever the
    # cluster labels are not exactly 0..n-1.
    class_labels = np.asarray(clf.classes_).tolist()
    print(f"Prediction probabilities: {dict(zip(class_labels, prediction_proba))}")

    return prediction

In [47]:
# Train the tree on the full cluster/metadata frame. This also prints a summary
# and writes the tree plot to the default file name (categorical_tree.png).
clf, encoders, feature_names = train_categorical_decision_tree(df)

# # Make a prediction (feature_names is the third positional argument)
# prediction = predict_cluster_with_tree(clf, encoders, feature_names, 'New Product', 'Brand X', 'Category 1')
# print(f"\nPredicted cluster for new item: {prediction}")

Training data shape after one-hot encoding: (9805, 10150)
Number of clusters: 3
Number of features after encoding: 10150

Top 10 Most Important Features:
                                                feature  importance
9074  category_Industrial & Scientific Additive Manu...    0.653235
9014                                       brand_uxcell    0.154892
9065  category_Industrial & Scientific Additive Manu...    0.096679
6439                                      brand_AcuRite    0.072037
6173  title_eSUN 3D 1.75mm PETG Natural Filament 1kg...    0.007928
6311  title_uxcell Mandrel Mounted White Conical Fel...    0.007019
9263  category_Industrial & Scientific Hydraulics, P...    0.002215
4999  title_Smartbuy 1.75mm Grey ABS 3D Printer Fila...    0.002023
6163  title_eSUN 1.75mm Cool White PLA PRO (PLA+) 3D...    0.002006
3519  title_MakerBot MP06077 Build Plate Tape, Repli...    0.001968

Decision tree saved as: categorical_tree.png
Training accuracy: 0.588


In [39]:
# Display the classifier object.
# NOTE(review): this cell's execution count (39) is lower than the training
# cell's (47), so it was last run before `clf` was (re)trained -- re-run it
# after training on a fresh kernel.
clf