In [2]:
import pandas as pd
import spacy
from spacy.training import Example

# Load your data
data = pd.read_csv("data/amazon-purchases.csv")

# Define Prime Day dates and add a "Prime Purchase" column
prime_days = ["2022-07-12", "2022-07-13", "2021-06-21", "2021-06-22", "2020-10-13", "2020-10-14", 
              "2019-07-15", "2019-07-16", "2018-07-17", "2018-07-18"]
data["Prime Purchase"] = data["Order Date"].apply(lambda x: 1 if x in prime_days else 0)

# Define department categories
departments = [
    'Clothing, Shoes & Jewelry', 'Everything Else', 'Home & Kitchen', 'Sports & Outdoors', 
    'Tools & Home Improvement', 'Electronics', 'Apps & Games', 'Health & Personal Care', 
    'Office Products', 'Industrial & Scientific', 'Grocery & Gourmet Food', 'Baby', 'Pet Supplies', 
    'Automotive Parts and Accessories', 'Beauty & Personal Care', 'Cell Phones & Accessories', 
    'Garden & Outdoor', 'Video Games', 'Books', 'Kindle Accessories', 'Appliances', 
    'Arts, Crafts & Sewing', 'Gift Cards', 'Musical Instruments', 'Toys & Games', 'Alexa Skills', 
    'CDs & Vinyl', 'Software', 'Movies & TV'
]

# Load blank spaCy model
nlp = spacy.blank("en")

# Add the textcat_multilabel pipe
textcat = nlp.add_pipe("textcat_multilabel")

# Add labels
for label in departments:
    textcat.add_label(label)

# Example training data
train_data = [
    ("Simple Joys by Carter's Baby 3-Pack Neutral Cotton Sleeper Gown", {"cats": {"Clothing, Shoes & Jewelry": 1}}),
    ("Amazon Basics 2-Ply Flex-Sheets Paper Towels", {"cats": {"Home & Kitchen": 1}}),
    ("Bellivera Women's Stand Collar Quilted Puffer Jacket", {"cats": {"Clothing, Shoes & Jewelry": 1}}),
    ("Amazon Basics Stainless Steel Wire Whisk Set - 3-Piece", {"cats": {"Home & Kitchen": 1}}),
]



In [4]:
products = pd.read_json("data/train_data.json")

In [5]:
products

Unnamed: 0,0,1
0,"Random Orbit Sanding Disc, 5-Inch",{'cats': {'Abrasive & Finishing Products': 1}}
1,2 Packs for DeWalt DWE64233 & N329079 Orbital ...,{'cats': {'Abrasive & Finishing Products': 1}}
2,A Set of 5 Wool Cloths for Buffing or Polishin...,{'cats': {'Abrasive & Finishing Products': 1}}
3,Wood Carving Disc 6 Teeth Grinder Wheel Disc A...,{'cats': {'Abrasive & Finishing Products': 1}}
4,"Sander Pads for Dewalt, 5"" 8-Hole Orbital Sand...",{'cats': {'Abrasive & Finishing Products': 1}}
...,...,...
4955,Kobo Elipsa 2E | eReader | 10.3” Glare-Free To...,{'cats': {'eBook Readers & Accessories': 1}}
4956,Energizer Clip-on Book Light for Reading in Be...,{'cats': {'eBook Readers & Accessories': 1}}
4957,BOOX Note Air 2 Plus 10.3 with Magnet ePaper E...,{'cats': {'eBook Readers & Accessories': 1}}
4958,Strapsicle Kindle Hand Strap | Pack of 2 Hand ...,{'cats': {'eBook Readers & Accessories': 1}}


In [6]:
products_train_data = [(row[0], row[1]) for _, row in products.iterrows()]
products_train_data

[('Random Orbit Sanding Disc, 5-Inch',
  {'cats': {'Abrasive & Finishing Products': 1}}),
 ('2 Packs for DeWalt DWE64233 & N329079 Orbital Sander Replacement Pad, 5 inch 8 Hole Hook and Loop Sanding Disc Backing Pads, Compatible with DWE6421-B2, DWE6421-B3, DWE6423/6423K, DWE6421/6421K',
  {'cats': {'Abrasive & Finishing Products': 1}}),
 ('A Set of 5 Wool Cloths for Buffing or Polishing with TechDiamonTools Diamond Pastes',
  {'cats': {'Abrasive & Finishing Products': 1}}),
 ('Wood Carving Disc 6 Teeth Grinder Wheel Disc Angle Grinder Chainsaw Wheel Woodworking Disc Wood Shaping Wheel Chain Saw Disc Circular Saw Blade for Cutting Carving Shaping Wood(Arc Blade)',
  {'cats': {'Abrasive & Finishing Products': 1}}),
 ('Sander Pads for Dewalt, 5" 8-Hole Orbital Sanding Replacement Hook and Loop Pads for DeWalt DW420 DW421 DW422 DW423 DW426 D26451 Random Orbital Sander, 2-Pack',
  {'cats': {'Abrasive & Finishing Products': 1}}),
 ('Diamond Dresser Single Point Grinding Wheel Tapered Point 

In [7]:
# Training the text classifier
optimizer = nlp.begin_training()
for i in range(3):  # Number of training iterations
    losses = {}
    for text, annotations in products_train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Iteration {i+1}, Losses: {losses}")

# Test with an example
doc = nlp("Amazon Basics Microfiber Cleaning Cloths, Pack of 24")
print(doc.cats)  # Outputs the category scores for each label

Iteration 1, Losses: {'textcat_multilabel': 0.18747377204587068}
Iteration 2, Losses: {'textcat_multilabel': 0.0012478982020132357}
Iteration 3, Losses: {'textcat_multilabel': 2.485939878580634e-07}
{'Clothing, Shoes & Jewelry': 0.9981885552406311, 'Everything Else': 0.7175805568695068, 'Home & Kitchen': 0.7575376629829407, 'Sports & Outdoors': 0.9998989105224609, 'Tools & Home Improvement': 0.9999958276748657, 'Electronics': 0.5583662986755371, 'Apps & Games': 0.7011405825614929, 'Health & Personal Care': 0.8251920938491821, 'Office Products': 0.9874451160430908, 'Industrial & Scientific': 0.9999970197677612, 'Grocery & Gourmet Food': 0.7625522613525391, 'Baby': 0.9981021285057068, 'Pet Supplies': 0.27120259404182434, 'Automotive Parts and Accessories': 0.49678486585617065, 'Beauty & Personal Care': 0.9995506405830383, 'Cell Phones & Accessories': 0.9993939399719238, 'Garden & Outdoor': 0.9815956950187683, 'Video Games': 0.9999959468841553, 'Books': 0.8185359835624695, 'Kindle Accesso

In [8]:
# Example prediction function
def predict_category(text):
    # Process the text through the trained model
    doc = nlp(text)
    
    # Get the category with the highest score
    highest_score_label = max(doc.cats, key=doc.cats.get)
    highest_score = doc.cats[highest_score_label]
    
    return highest_score_label, highest_score

# Test the prediction function with some example text
for i in range(0, 20):
    example_text = str(data['Title'][i])
    category, score = predict_category(example_text)
    print(data['Title'][i],f"Most likely category: {category} (Score: {score:.4f})")

SanDisk Ultra 16GB Class 10 SDHC UHS-I Memory Card up to 80MB/s (SDSDUNC-016G-GN6IN) Most likely category: Industrial & Scientific (Score: 1.0000)
Betron BS10 Earphones Wired Headphones in Ear Noise Isolating Earbuds with Microphone and Volume Control Powerful Bass Driven Sound, 12mm Large Drivers, Ergonomic Design Most likely category: Industrial & Scientific (Score: 1.0000)
nan Most likely category: Industrial & Scientific (Score: 1.0000)
Perfecto Stainless Steel Shaving Bowl. Durable Metal Mug For Shaving Soap & Cream. Perfect Addition To Your Wet Shaving Kit. Double Layer Smooth Shave Unbreakable Mug With Heat Insulation Most likely category: Industrial & Scientific (Score: 1.0000)
Proraso Shaving Cream for Men Most likely category: Toys & Games (Score: 1.0000)
Micro USB Cable Android Charger - Syncwire [2-Pack 6.6ft] Super-Durable Nylon Braided Fast Sync&Charging Cord for Samsung Galaxy S7 Edge/S7/S6, HTC, LG, Sony, Xbox One, PS4 - Space Grey Most likely category: Industrial & Sci

In [18]:
data['Title'][2]

nan

In [20]:
import os

#Save the model
output_dir = os.path.join(os.getcwd(), "models/spacy_textcat_model")
nlp.to_disk(output_dir)