In [27]:
import pandas as pd
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

# Read the raw purchase records from the project's CSV export
print("Loading dataset...")
data = pd.read_csv(
    "BusinessForecastingProject-main/data/amazon-purchases.csv"
)
print(f"Dataset loaded with {data.shape[0]} rows.")

# A row is only usable for training when both its title and its category
# are present, so discard rows where either one is missing
print("Cleaning data...")
data = data.dropna(how="any", subset=["Title", "Category"])
print(f"Dataset cleaned. Remaining rows: {data.shape[0]}")

# Label set for the classifier: one entry per department name.
# The order here is the order in which labels are later registered.
departments = [
    "Clothing, Shoes & Jewelry",
    "Everything Else",
    "Home & Kitchen",
    "Sports & Outdoors",
    "Tools & Home Improvement",
    "Electronics",
    "Apps & Games",
    "Health & Personal Care",
    "Office Products",
    "Industrial & Scientific",
    "Grocery & Gourmet Food",
    "Baby",
    "Pet Supplies",
    "Automotive Parts and Accessories",
    "Beauty & Personal Care",
    "Cell Phones & Accessories",
    "Garden & Outdoor",
    "Video Games",
    "Books",
    "Kindle Accessories",
    "Appliances",
    "Arts, Crafts & Sewing",
    "Gift Cards",
    "Musical Instruments",
    "Toys & Games",
    "Alexa Skills",
    "CDs & Vinyl",
    "Software",
    "Movies & TV",
]

# Hand-written seed examples, listed as (product title, department) pairs
print("Preparing predefined training data...")
_seed_pairs = (
    ("Simple Joys by Carter's Baby 3-Pack Neutral Cotton Sleeper Gown", "Clothing, Shoes & Jewelry"),
    ("Amazon Basics 2-Ply Flex-Sheets Paper Towels", "Home & Kitchen"),
    ("Bellivera Women's Stand Collar Quilted Puffer Jacket", "Clothing, Shoes & Jewelry"),
    ("Amazon Basics Stainless Steel Wire Whisk Set - 3-Piece", "Home & Kitchen"),
)
# Each seed example only marks its own department as positive (value 1);
# the remaining departments are left out of its "cats" mapping.
train_data = [
    (seed_title, {"cats": {seed_department: 1}})
    for seed_title, seed_department in _seed_pairs
]
print(f"Predefined training data contains {len(train_data)} examples.")

# Next step: turn the CSV rows into the same (text, annotations) format
print("Converting CSV data to training format...")
def create_cats(category, labels=None):
    """Build the one-hot ``cats`` mapping for a single category value.

    Args:
        category: The category of one record; it is compared with ``==``
            against every label.
        labels: Optional iterable of label names to use as the keys of the
            result. When omitted, the module-level ``departments`` list is
            used, so existing one-argument calls behave as before.

    Returns:
        A dict with one key per label, in label order, whose value is 1 for
        the label equal to ``category`` and 0 for every other label. A
        category that matches no label yields a mapping of all zeros.
    """
    if labels is None:
        # Resolved at call time so the default always reflects the current
        # contents of the module-level label list.
        labels = departments
    return {label: 1 if label == category else 0 for label in labels}

# Pair every cleaned row's title with its one-hot category annotation.
# The two columns are walked in lockstep rather than via DataFrame.iterrows(),
# which constructs a pandas Series for each row; only these two fields are
# needed, and the frame holds well over a million rows.
csv_train_data = [
    (title, {"cats": create_cats(category)})
    for title, category in zip(data["Title"], data["Category"])
]
print(f"CSV training data prepared with {len(csv_train_data)} examples.")

# Combine pre-defined training data and CSV data (seed examples first)
full_train_data = train_data + csv_train_data
print(f"Full training data contains {len(full_train_data)} examples.")

# Start from an empty English pipeline
nlp = spacy.blank("en")

# Attach the multi-label text classifier component
print("Adding text classification pipe...")
textcat = nlp.add_pipe("textcat_multilabel")

# Register one classifier label per department
for department in departments:
    textcat.add_label(department)
print(f"Added {len(departments)} labels to the model.")

# Wrap each (text, annotations) pair in a spaCy Example, using a Doc
# produced by nlp.make_doc for the text side
print("Converting training data into spaCy Example objects...")
examples = [
    Example.from_dict(nlp.make_doc(title), annotation)
    for title, annotation in full_train_data
]
print(f"Converted {len(examples)} examples.")

# Initialize the pipeline; the optimizer it returns is the one handed to
# nlp.update during training
print("Initializing the model...")
optimizer = nlp.initialize()

# Train for a fixed number of passes over the whole example list
n_iters = 10
print("Starting training loop...")
for epoch in range(1, n_iters + 1):
    # Reorder the examples in place before every pass
    random.shuffle(examples)
    # Fresh loss dict per epoch; nlp.update writes into it for each batch
    losses = {}
    # Walk the shuffled examples in chunks of eight
    for batch in minibatch(examples, size=8):
        nlp.update(batch, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch}, Loss: {losses}")

# Write the trained pipeline to a directory next to the script
print("Saving the trained model...")
nlp.to_disk("trained_textcat_model")
print("Model saved to 'trained_textcat_model'.")

# Read the pipeline back from that directory; from here on `nlp` refers to
# the reloaded copy rather than the in-memory object that was trained
print("Loading the trained model...")
nlp = spacy.load("trained_textcat_model")
print("Model loaded successfully.")

# Product titles used as a quick manual check of the reloaded model
test_titles = [
    "SanDisk Ultra 16GB Class 10 SDHC UHS-I Memory Card",
    "Amazon Echo Dot (3rd Gen) - Smart speaker with Alexa",
    "Men's Waterproof Hiking Boots"
]

print("Testing the model with new titles...")
for title in test_titles:
    doc = nlp(title)
    # doc.cats maps each label to its score; rank the (label, score) pairs
    # from highest to lowest score.
    sorted_cats = sorted(doc.cats.items(), key=lambda pair: pair[1], reverse=True)
    print(f"Title: {title}")
    print("Predicted categories (sorted):")
    for category, score in sorted_cats[:5]:  # Display top 5 categories
        print(f"  {category}: {score:.2f}")
    print("-" * 50)


Loading dataset...
Dataset loaded with 1850717 rows.
Cleaning data...
Dataset cleaned. Remaining rows: 1760351
Preparing predefined training data...
Predefined training data contains 4 examples.
Converting CSV data to training format...
CSV training data prepared with 1760351 examples.
Full training data contains 1760355 examples.
Adding text classification pipe...
Added 29 labels to the model.
Converting training data into spaCy Example objects...
Converted 1760355 examples.
Initializing the model...
Starting training loop...
Epoch 1, Loss: {'textcat_multilabel': 2.051887231845583}
Epoch 2, Loss: {'textcat_multilabel': 0.05603447493571418}
Epoch 3, Loss: {'textcat_multilabel': 0.056034479881152094}
Epoch 4, Loss: {'textcat_multilabel': 0.0560351221015285}
Epoch 5, Loss: {'textcat_multilabel': 0.05603447680968672}
Epoch 6, Loss: {'textcat_multilabel': 0.05603447983420525}
Epoch 7, Loss: {'textcat_multilabel': 0.0560344767492828}
Epoch 8, Loss: {'textcat_multilabel': 0.05603587435150806