In [4]:
import pandas as pd
import spacy
from spacy.training import Example

# Load your data
data = pd.read_csv("amazon-purchases.csv")

# Define Prime Day dates and add a "Prime Purchase" column
prime_days = ["2022-07-12", "2022-07-13", "2021-06-21", "2021-06-22", "2020-10-13", "2020-10-14", 
              "2019-07-15", "2019-07-16", "2018-07-17", "2018-07-18"]
data["Prime Purchase"] = data["Order Date"].apply(lambda x: 1 if x in prime_days else 0)

# Define department categories
departments = [
    'Clothing, Shoes & Jewelry', 'Everything Else', 'Home & Kitchen', 'Sports & Outdoors', 
    'Tools & Home Improvement', 'Electronics', 'Apps & Games', 'Health & Personal Care', 
    'Office Products', 'Industrial & Scientific', 'Grocery & Gourmet Food', 'Baby', 'Pet Supplies', 
    'Automotive Parts and Accessories', 'Beauty & Personal Care', 'Cell Phones & Accessories', 
    'Garden & Outdoor', 'Video Games', 'Books', 'Kindle Accessories', 'Appliances', 
    'Arts, Crafts & Sewing', 'Gift Cards', 'Musical Instruments', 'Toys & Games', 'Alexa Skills', 
    'CDs & Vinyl', 'Software', 'Movies & TV'
]

# Load blank spaCy model
nlp = spacy.blank("en")

# Add the textcat_multilabel pipe
textcat = nlp.add_pipe("textcat_multilabel")

# Add labels
for label in departments:
    textcat.add_label(label)

# Example training data
train_data = [
    ("Simple Joys by Carter's Baby 3-Pack Neutral Cotton Sleeper Gown", {"cats": {"Clothing, Shoes & Jewelry": 1}}),
    ("Amazon Basics 2-Ply Flex-Sheets Paper Towels", {"cats": {"Home & Kitchen": 1}}),
    ("Bellivera Women's Stand Collar Quilted Puffer Jacket", {"cats": {"Clothing, Shoes & Jewelry": 1}}),
    ("Amazon Basics Stainless Steel Wire Whisk Set - 3-Piece", {"cats": {"Home & Kitchen": 1}}),
]

# Training the text classifier
optimizer = nlp.begin_training()
for i in range(10):  # Number of training iterations
    losses = {}
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Iteration {i+1}, Losses: {losses}")

# Test with an example
doc = nlp("Amazon Basics Microfiber Cleaning Cloths, Pack of 24")
print(doc.cats)  # Outputs the category scores for each label



Iteration 1, Losses: {'textcat_multilabel': 0.03606108062012936}
Iteration 2, Losses: {'textcat_multilabel': 2.57037200412924e-06}
Iteration 3, Losses: {'textcat_multilabel': 5.833169849367437e-07}
Iteration 4, Losses: {'textcat_multilabel': 2.606413693762022e-07}
Iteration 5, Losses: {'textcat_multilabel': 1.5986447365889944e-07}
Iteration 6, Losses: {'textcat_multilabel': 1.1746851646954326e-07}
Iteration 7, Losses: {'textcat_multilabel': 9.585510474252601e-08}
Iteration 8, Losses: {'textcat_multilabel': 8.26054689095912e-08}
Iteration 9, Losses: {'textcat_multilabel': 7.470850482693692e-08}
Iteration 10, Losses: {'textcat_multilabel': 6.952352948896845e-08}
{'Clothing, Shoes & Jewelry': 0.9568436145782471, 'Everything Else': 0.617598831653595, 'Home & Kitchen': 0.9967989921569824, 'Sports & Outdoors': 0.14870500564575195, 'Tools & Home Improvement': 0.5263456106185913, 'Electronics': 0.304215669631958, 'Apps & Games': 0.48125430941581726, 'Health & Personal Care': 0.9293075203895569

In [8]:
# Example prediction function
def predict_category(text):
    # Process the text through the trained model
    doc = nlp(text)
    
    # Get the category with the highest score
    highest_score_label = max(doc.cats, key=doc.cats.get)
    highest_score = doc.cats[highest_score_label]
    
    return highest_score_label, highest_score

# Test the prediction function with some example text
example_text = data['Title'][0]
category, score = predict_category(example_text)
print(f"Most likely category: {category} (Score: {score:.4f})")

Most likely category: Clothing, Shoes & Jewelry (Score: 0.9368)


In [9]:
data['Title'][0]

'SanDisk Ultra 16GB Class 10 SDHC UHS-I Memory Card up to 80MB/s (SDSDUNC-016G-GN6IN)'