# Product Price Prediction - Data Curation Pipeline

**Dataset**: Amazon Reviews 2023 (McAuley-Lab)  
**Objective**: Curate a balanced dataset for training a price-prediction model

## Environment Setup

In [None]:
import os
import logging
from dotenv import load_dotenv
from huggingface_hub import login
import matplotlib.pyplot as plt

from items import Item
from loaders import ItemLoader
from data_utils import DatasetBalancer, DatasetSplitter, DatasetExporter, DatasetAnalyzer
from visualization import DataVisualizer
from config import DATASET_CATEGORIES

# Configure the root logger: INFO level and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output. This line only works when run inside IPython/Jupyter.
%matplotlib inline

In [None]:
# Read variables from a local .env file; override=True asks python-dotenv to
# let .env values replace ones already present in the process environment.
load_dotenv(override=True)

# Mirror each credential into os.environ. The literal default is a slot you can
# replace with a real key when you are not using a .env file.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')

# Make a missing credential visible here instead of letting the placeholder
# surface later as an opaque authentication failure.
for _credential_name in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'HF_TOKEN'):
    if os.environ[_credential_name] in ('', 'your-key-if-not-using-env'):
        logging.warning(
            "%s is not configured; a placeholder value is in use", _credential_name
        )

In [None]:
# Authenticate this session with the Hugging Face Hub.
hf_token = os.environ.get('HF_TOKEN')

# Fail early with an actionable message when the token is absent or is still
# the placeholder written by the credential-loading cell; passing such a value
# to login() would only produce a less helpful authentication error.
if not hf_token or hf_token == 'your-key-if-not-using-env':
    raise ValueError(
        "HF_TOKEN is missing or still set to the placeholder value; "
        "provide a real Hugging Face access token before logging in."
    )

# add_to_git_credential=True also stores the token with the git credential helper.
login(hf_token, add_to_git_credential=True)

## Single Category Analysis (Appliances)

In [None]:
from datasets import load_dataset

# Fetch the "raw_meta_Appliances" configuration (split "full") of the
# McAuley-Lab Amazon Reviews 2023 repository.
# NOTE(review): trust_remote_code=True allows code shipped in the dataset
# repository to run locally — keep it only for sources you trust.
dataset = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Appliances",
    split="full",
    trust_remote_code=True,
)

appliance_total = len(dataset)
print(f"Total appliances: {appliance_total:,}")

In [None]:
# Peek at the first record: its title, raw price value, and how many feature
# entries it carries.
datapoint = dataset[0]
print(
    f"Title: {datapoint['title']}",
    f"Price: {datapoint['price']}",
    f"Features: {len(datapoint['features'])} items",
    sep="\n",
)

In [None]:
# Count the records whose 'price' field parses to a strictly positive number.
prices_count = 0
for dp in dataset:
    price_str = dp.get('price')
    # Skip missing/empty prices and the literal string 'None'.
    if not price_str or price_str == 'None':
        continue
    try:
        price = float(price_str)
    except (ValueError, TypeError):
        # Unparseable price text counts as "no price" rather than an error.
        continue
    if price > 0:
        prices_count += 1

# Guard the percentage so an empty dataset reports 0.0% instead of raising
# ZeroDivisionError.
total_records = len(dataset)
coverage = prices_count / total_records * 100 if total_records else 0.0
print(f"Items with valid prices: {prices_count:,} ({coverage:.1f}%)")

In [None]:
# Collect, for every record with a positive numeric price, the price itself and
# the combined character length of its title, description, features and details.
prices = []
lengths = []

for dp in dataset:
    price_str = dp.get('price')
    # Skip missing/empty prices and the literal string 'None'.
    if not price_str or price_str == 'None':
        continue
    try:
        price = float(price_str)
    except (ValueError, TypeError):
        # Unparseable price text counts as "no price" rather than an error.
        continue
    if price > 0:
        prices.append(price)
        content = str(dp['title']) + str(dp['description']) + str(dp['features']) + str(dp['details'])
        lengths.append(len(content))

# The title needs an average and a maximum, both undefined for an empty list;
# report that case instead of failing with ZeroDivisionError / ValueError.
if lengths:
    plt.figure(figsize=(15, 6))
    plt.title(f"Content Length Distribution\nAvg: {sum(lengths)/len(lengths):,.0f} | Max: {max(lengths):,}")
    plt.xlabel('Characters')
    plt.ylabel('Count')
    # Fixed bins: 100-character buckets from 0 up to 5,900.
    plt.hist(lengths, rwidth=0.7, color='darkviolet', bins=range(0, 6000, 100))
    plt.show()
else:
    print("No items with a valid price; skipping content length plot")

In [None]:
# Histogram of the raw prices gathered in the previous cell.
# The title needs an average and a maximum, both undefined for an empty list;
# report that case instead of failing with ZeroDivisionError / ValueError.
if prices:
    plt.figure(figsize=(15, 6))
    plt.title(f"Raw Price Distribution\nAvg: ${sum(prices)/len(prices):.2f} | Max: ${max(prices):,.2f}")
    plt.xlabel('Price ($)')
    plt.ylabel('Count')
    # Fixed bins: $10 buckets from 0 up to 990; higher prices fall outside the plot.
    plt.hist(prices, rwidth=0.7, color='navy', bins=range(0, 1000, 10))
    plt.show()
else:
    print("No valid prices collected; skipping price plot")

## Process Single Category with Item Class

In [None]:
# Build Item objects for every record with a positive numeric price and keep
# only those the Item marks as includable.
items = []

for record in dataset:
    try:
        raw_price = record.get('price')
        # Missing/empty prices and the literal string 'None' are skipped.
        if not raw_price or raw_price == 'None':
            continue
        amount = float(raw_price)
        # Written as "not > 0" so a NaN price is rejected as well.
        if not amount > 0:
            continue
        candidate = Item(record, amount)
        if candidate.include:
            items.append(candidate)
    except (ValueError, TypeError):
        # Covers both unparseable prices and these errors raised while
        # constructing the Item; such records are dropped silently.
        continue

print(f"Curated items: {len(items):,}")

In [None]:
# Preview the training and test prompts of one curated item. Index 100 is used
# when it exists; smaller result sets fall back to the first item (the same
# pattern the validation cell uses) instead of raising IndexError.
if items:
    preview_item = items[100] if len(items) > 100 else items[0]
    print("Training Prompt:")
    print(preview_item.prompt)
    print("\n" + "="*60 + "\n")
    print("Test Prompt:")
    print(preview_item.test_prompt())
else:
    print("No curated items available to preview")

In [None]:
# One shared visualizer draws both the token and the price distribution for
# the curated single-category items; later cells reuse this instance.
visualizer = DataVisualizer()
for draw_plot in (visualizer.plot_token_distribution, visualizer.plot_price_distribution):
    draw_plot(items)

## Multi-Category Dataset Loading

In [None]:
# Load every configured category and pool the results into one list.
all_items = []

for category in DATASET_CATEGORIES:
    loader = ItemLoader(category)
    # Use a loop-local name so the curated single-category `items` list built
    # earlier in the notebook is not overwritten by the last category loaded
    # (which would silently change what the earlier preview cells show on re-run).
    category_items = loader.load(workers=8)
    all_items.extend(category_items)

print(f"\nTotal items loaded: {len(all_items):,}")

In [None]:
# Report on the pooled, not-yet-balanced item list under the label "Raw Dataset".
# NOTE(review): which statistics are printed is defined by DatasetAnalyzer in
# data_utils, which is not visible from this notebook.
DatasetAnalyzer.print_statistics(all_items, "Raw Dataset")

In [None]:
# Reuses the DataVisualizer instance created in the single-category section;
# that cell must have run first or `visualizer` is undefined.
visualizer.plot_comprehensive_analysis(all_items, "Raw Dataset")

## Dataset Balancing

Balance the price distribution and reduce the dominance of the automotive category.

In [None]:
# Produce the balanced item list from the pooled multi-category items.
# NOTE(review): the balancing rules live in DatasetBalancer (data_utils);
# presumably they even out prices and categories as the section text says — confirm.
balancer = DatasetBalancer()
balanced_items = balancer.balance(all_items)

# Report how many items remain after balancing.
print(f"Balanced dataset: {len(balanced_items):,} items")

In [None]:
# Same report as for the raw data, now over the balanced list, so the two
# outputs can be compared side by side.
DatasetAnalyzer.print_statistics(balanced_items, "Balanced Dataset")

In [None]:
# Reuses the DataVisualizer instance created in the single-category section;
# that cell must have run first or `visualizer` is undefined.
visualizer.plot_comprehensive_analysis(balanced_items, "Balanced Dataset")

## Train/Test Split

In [None]:
# Divide the balanced items into a training list and a test list, then report
# the size of each. Split proportions are whatever DatasetSplitter defaults to.
splitter = DatasetSplitter()
train_items, test_items = splitter.split(balanced_items)

print(
    f"Training set: {len(train_items):,}",
    f"Test set: {len(test_items):,}",
    sep="\n",
)

In [None]:
# Print the analyzer report for each split in turn: training first, then test.
for split_items, split_label in ((train_items, "Training Set"), (test_items, "Test Set")):
    DatasetAnalyzer.print_statistics(split_items, split_label)

In [None]:
# Plot prices for only the first 250 test items (the slice), with a title that
# says so. Relies on the `visualizer` instance created in the single-category section.
visualizer.plot_price_distribution(test_items[:250], "Test Set Sample (250 items)")

## Export Datasets

In [None]:
# Convert the two splits through DatasetExporter.to_huggingface and print the
# result's representation.
# NOTE(review): the return type is assumed to be a Hugging Face DatasetDict
# from the variable name — confirm in data_utils. Nothing is uploaded by
# this cell as far as the visible code shows.
dataset_dict = DatasetExporter.to_huggingface(train_items, test_items)
print(dataset_dict)

In [None]:
# Write each split to its own pickle file (relative paths, so they land in the
# current working directory), training split first.
for split_items, target_path in ((train_items, 'train.pkl'), (test_items, 'test.pkl')):
    DatasetExporter.to_pickle(split_items, target_path)
print("Datasets exported to pickle files")

## Validation

In [None]:
# Pick a training item to inspect: index 398000 when the list is long enough,
# otherwise the first item.
if len(train_items) > 398000:
    sample_item = train_items[398000]
else:
    sample_item = train_items[0]

print("Sample Training Prompt:")
print(sample_item.prompt)
print("\n" + "="*60 + "\n")

# Tokenize the prompt with the tokenizer attached to Item and show how the
# final ten token ids decode, one decoded string per id.
tokens = Item.tokenizer.encode(sample_item.prompt)
tail_tokens = tokens[-10:]
print(f"Token count: {len(tokens)}")
print(f"Last 10 tokens: {tail_tokens}")
print(f"Decoded: {Item.tokenizer.batch_decode(tail_tokens)}")

In [None]:
# Final summary banner: split sizes, mean training price, and the output files.
banner = "="*60
print("\n" + banner)
print("PIPELINE COMPLETED SUCCESSFULLY")
print(banner)
print(f"Training items: {len(train_items):,}")
print(f"Test items: {len(test_items):,}")
# The mean is undefined for an empty training set; say so instead of letting
# the summary die with ZeroDivisionError after announcing success.
if train_items:
    average_price = sum(item.price for item in train_items) / len(train_items)
    print(f"Average price: ${average_price:.2f}")
else:
    print("Average price: n/a (training set is empty)")
print("Files created: train.pkl, test.pkl")
print(banner)