## Product Pricer Model

A model that estimates how much something costs.

### Baseline Models

In [None]:
!pip install pandas numpy scikit-learn gensim huggingface-hub

In [1]:
### General Imports
import os
import math
import json
import random
from dotenv import load_dotenv
import matplotlib.pyplot as plt 
import pickle 
from collections import Counter
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Imports For Traditional Machine Learning

import pandas as pd 
import numpy as np 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
### For NLP related machine learning

from sklearn.feature_extraction.text import CountVectorizer 
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [4]:
### More of machine learning related ones
from sklearn.svm import LinearSVR 
from sklearn.ensemble import RandomForestRegressor 

In [5]:
### Internal Classes

from items import Item

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
### Constants -- used for printing to stdout in color

# ANSI escape sequences: each colour switches the terminal foreground, RESET restores it
GREEN, YELLOW, RED = "\033[92m", "\033[93m", "\033[91m"
RESET = "\033[0m"

# Colour names used by Tester -> escape sequence ("orange" is printed in yellow)
COLOR_MAP = dict(green=GREEN, orange=YELLOW, red=RED)

In [7]:
### Environment 

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

# Each value is None when the corresponding variable is not set
OPENAI_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN_KEY = (
    os.getenv(name) for name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "HF_TOKEN")
)

# Authenticate against the Hugging Face Hub with the token read above
login(token=HF_TOKEN_KEY)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [8]:
%matplotlib inline

## Loading the pkl files

In [9]:
def _load_pickle(path):
    """Read and unpickle one file, closing it afterwards."""
    # NOTE: pickle can execute arbitrary code on load -- only open files you trust
    with open(path, "rb") as handle:
        return pickle.load(handle)


train = _load_pickle("train.pkl")
test = _load_pickle("test.pkl")

In [10]:
print(test[0].test_prompt())

How much does this cost to the nearest dollar?

Sydney Rustic Mirror - Vanity Mirror, Bathroom Mirror, Farmhouse Decor, Wood Mirror, Large Mirror - 4 Sizes & 20 Colors - Red Oak
The Sydney low profile thin wood framed mirror will blend effortlessly in your current decor creating a simple yet sophisticated look. Our mirror features a reclaimed rustic styled wood finish, strong decorative lines with a thin 2.25‚Äù inch wide frame maximizing the visible mirror. This hanging mirror can be mounted horizontally or vertically. Available in 20 Colors - Shown in Red Oak. Not sure on color we do color samples please contact us for details. Available in 4 sizes, 24x30, 36x30, 42x30 & 60x30, all measurements are overall dimensions including frame and mirror. We offer two types

Price is $


In [11]:
print(test[0].price)

189.99


## Tester Class

In [25]:
class Tester:
    """Run a price predictor over a sample of items, print each result and chart them.

    ``predictor`` is called with one datapoint at a time and must return a
    numeric price guess. Each datapoint must expose ``price`` and ``title``.
    The per-item truths, guesses, absolute errors, squared log errors and
    colour buckets are accumulated in the list attributes set in ``__init__``.
    """

    def __init__(self, predictor, title=None, data=test, size=250):
        """Store the predictor and the evaluation settings.

        Args:
            predictor: callable taking a datapoint and returning a price guess.
            title: chart title prefix; derived from the predictor's name when omitted.
            data: indexable collection of datapoints (defaults to the test set).
            size: number of leading datapoints to evaluate.
        """
        self.predictor = predictor
        self.data = data
        # Callables without __name__ (e.g. functools.partial objects) fall back
        # to their class name instead of raising AttributeError.
        default_name = getattr(predictor, "__name__", type(predictor).__name__)
        ### Conditionally used for either the text report or chart's title
        self.title = title or default_name.replace("_", " ").title()
        self.size = size
        self.truths = []
        self.guesses = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Bucket an absolute error as "green", "orange" or "red".

        A guess is green when it is within $20 or 20% of the truth, orange
        when within $80 or 40%, and red otherwise.
        """
        # A zero truth has no meaningful relative error; treating it as
        # infinite leaves only the absolute thresholds and avoids dividing by zero.
        relative_error = error / truth if truth else math.inf
        if error < 20 or relative_error < 0.2:
            return "green"
        elif error < 80 or relative_error < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint ``i``, record its metrics and print a coloured summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        # Negative guesses are clamped to 0 for the log term only: log(guess + 1)
        # is undefined for guess <= -1 and would abort the whole run.
        log_error = math.log(truth + 1) - math.log(max(guess, 0) + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        title = datapoint.title if len(datapoint.title) < 40 else datapoint.title[:40] + "..."
        self.truths.append(truth)
        self.guesses.append(guess)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i + 1}: Guess: ${guess:,.2f} | Truth: ${truth:,.2f} | Error: {error:,.2f} | SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the guesses against the truths, with the y = x line as reference."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color="deepskyblue", lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Chart the recorded results under a title holding mean error, RMSLE and hit rate.

        Raises:
            ValueError: if no datapoint has been evaluated yet.
        """
        # Average over what was actually recorded, not over self.size: the two
        # differ when run() is called more than once or the dataset is shorter than size.
        count = len(self.errors)
        if count == 0:
            raise ValueError("No datapoints have been evaluated; call run() before report()")
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = self.colors.count("green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits / count * 100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``size`` datapoints (or all of them, if fewer) and report."""
        # Never index past the end of the dataset
        for i in range(min(self.size, len(self.data))):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function):
        """Convenience entry point: evaluate ``function`` with the default settings."""
        cls(function).run()


In [26]:
### Simulator function

def random_pricer(item):
    """Baseline "model": ignore the item and guess a random whole-dollar price from 1 to 999."""
    # randint's upper bound is inclusive, so this draws from the same range as randrange(1, 1000)
    return random.randint(1, 999)

In [None]:
random.seed(42)

### Run Tester with a simulating guess predictor
Tester.test(random_pricer)

In [35]:
### Another test -- with average price from the train data

# Mean price over the whole training set, computed once so every call is free
training_prices = [entry.price for entry in train]
training_avr_price = sum(training_prices) / len(training_prices)


def constant_pricer(item):
    """Baseline "model": ignore the item and always guess the training-set mean price."""
    return training_avr_price

In [None]:
Tester.test(constant_pricer)

## Feature Engineering Mockup Practice

An educational example that uses product weight, best-seller rank, text length, and brand
to demonstrate feature engineering. It is not meant for real-world use.

In [40]:
train[0].details

'{"Package Dimensions": "12.4 x 10.08 x 7.76 inches", "Item Weight": "5.33 pounds", "Item model number": "MS300", "Best Sellers Rank": {"Photographic Lighting Monolights": 296}, "Date First Available": "August 4, 2019", "Manufacturer": "Godox", "Brand": "GODOX", "Compatible Mountings": "Nikon", "Camera Flash": "Studio", "Included Components": "MS Flash Head *1 Power Cord *1 Lamp Cover *1", "Wireless Communication Technology": "Optical Pulse"}'

In [45]:
### Attach a "features" dict to every train and test item
### by parsing the JSON string held in its "details" field

for dataset in (train, test):
    for item in dataset:
        item.features = json.loads(item.details)

In [46]:
print(train[0].features.keys())

dict_keys(['Package Dimensions', 'Item Weight', 'Item model number', 'Best Sellers Rank', 'Date First Available', 'Manufacturer', 'Brand', 'Compatible Mountings', 'Camera Flash', 'Included Components', 'Wireless Communication Technology'])


In [None]:
### Count how many training items carry each feature key, then show the 20 most common

feature_count = Counter(key for item in train for key in item.features)

feature_count.most_common(20)

### Product Weights

In [60]:
# Some janky code to pluck out the Item Weight

def get_weight(item):
    """Return the item's weight in pounds, or None when it is missing or unusable.

    Reads the 'Item Weight' entry of ``item.features``, which is expected to
    look like "<amount> <unit>" (e.g. "5.33 pounds") or
    "<amount> hundredths pounds". Values that cannot be parsed, or that use an
    unrecognised unit, are printed for inspection and yield None.
    """
    weight_str = item.features.get('Item Weight')
    if not weight_str:
        return None

    # split() with no argument tolerates repeated or surrounding whitespace
    parts = weight_str.split()
    try:
        amount = float(parts[0])
        unit = parts[1].lower()
    except (IndexError, ValueError):
        # No unit at all, or an amount that is not a number
        print(weight_str)
        return None

    # How many of each unit make up one pound
    units_per_pound = {
        "pounds": 1,
        "ounces": 16,
        "grams": 453.592,
        "milligrams": 453592,
        "kilograms": 0.453592,
    }
    if unit in units_per_pound:
        return amount / units_per_pound[unit]
    if unit == "hundredths" and len(parts) > 2 and parts[2].lower() == "pounds":
        return amount / 100

    # Unknown unit: surface it so it can be reviewed, and treat the weight as missing
    print(weight_str)
    return None

In [62]:
# Parsed weight of every training item, keeping only truthy values (drops None and 0)
weights = [w for w in map(get_weight, train) if w]

240 Tons


In [68]:
# Mean of the usable training weights; used later as the fallback for items without one
total_weight = sum(weights)
average_weight = total_weight / len(weights)
print(f"Avr Weight: {average_weight:.2f}")

Avr Weight: 5.82


In [128]:
### Set default weight for items with None value for weights
def get_defualt_weight(item):
    """Return the item's weight in pounds, falling back to the training average.

    The fallback applies whenever ``get_weight`` returns a falsy value, i.e.
    None (missing or unparseable weight) and also a weight of 0.
    """
    weight = get_weight(item)
    return weight or average_weight


# Correctly spelled alias, matching get_default_rank. The original misspelled
# name is kept because existing callers use it.
get_default_weight = get_defualt_weight

In [129]:
w = get_weight(train[1000])
print(w)

1.38


### Best Seller Ranks 

In [104]:
def get_rank(item):
    """
    Average the item's best-seller ranks over every category it is ranked in.

    Returns None when the item has no "Best Sellers Rank" entry or the entry is empty.
    """
    category_ranks = item.features.get("Best Sellers Rank")
    if not category_ranks:
        return None

    # One rank per category; a plain arithmetic mean collapses them to a single number
    positions = list(category_ranks.values())
    return sum(positions) / len(positions)
    
    

In [105]:
get_rank(train[100])

323746.0

In [110]:
# Per-item average rank across the training set, skipping items with no usable rank
ranks = [r for r in map(get_rank, train) if r]

total_rank = sum(ranks)
average_rank = total_rank / len(ranks)
print(f"Average Rank = {average_rank:,.2f}")

Average Rank = 123,129.66


In [111]:
def get_default_rank(item):
    """Return the item's average best-seller rank, or the training average when it has none."""
    rank = get_rank(item)
    return rank if rank else average_rank

In [113]:
def get_text_length(item):
    """Return the number of characters in the item's prompt text."""
    prompt_text = item.prompt
    return len(prompt_text)

In [114]:
### Look at 40 most common brands

# Items with a missing or empty "Brand" are left out of the tally
brand_counts = Counter(
    brand for brand in (t.features.get("Brand") for t in train) if brand
)

print(brand_counts.most_common(40))

[('HP', 4484), ('Dell', 3189), ('Lenovo', 2401), ('SAMSUNG', 1795), ('ASUS', 1377), ('Sony', 1309), ('Funko', 831), ('Canon', 815), ('Apple', 784), ('Intel', 669), ('Magic The Gathering', 637), ('Generic', 604), ('Nikon', 534), ('Garmin', 483), ('Pyle', 482), ('Panasonic', 448), ('MSI', 435), ('LG', 434), ('Rockville', 427), ('GODOX', 423), ('Microsoft', 414), ('Acer', 364), ('Fender', 336), ('AmScope', 317), ('Pokemon', 316), ('KICKER', 316), ('Yu-Gi-Oh!', 306), ('Logitech', 294), ('Gigabyte', 280), ('Harmony Audio', 277), ('Audio-Technica', 263), ('Fujifilm', 261), ('Western Digital', 248), ('Plantronics', 244), ('Ibanez', 243), ('acer', 241), ('Olympus', 235), ('DJI', 231), ('Bose', 229), ('Seagate', 229)]


In [127]:
# Lower-cased names of the electronics brands that dominate the training set
TOP_ELECTRONICS_BRANDS = ["hp", "dell", "lenovo", "samsung", "asus", "sony", "canon", "apple", "intel"]

def is_top_electronics_brand(item):
    """Return True when the item's "Brand" feature is one of TOP_ELECTRONICS_BRANDS.

    The comparison ignores case. Items with a missing or empty brand give False.
    """
    brand = item.features.get("Brand")
    # bool(...) keeps the result a real boolean instead of leaking None or ""
    return bool(brand) and brand.lower() in TOP_ELECTRONICS_BRANDS

print(is_top_electronics_brand(train[2300]))

True


### Result: get_features()

In [130]:
def get_features(item):
    """
    Build the feature dictionary for one item.

    Keys: "weight" and "rank" (each falling back to the training average),
    "text_length", and "is_top_electronics_brand" encoded as 1 or 0.
    """
    features = {}
    features["weight"] = get_defualt_weight(item)
    features["rank"] = get_default_rank(item)
    features["text_length"] = get_text_length(item)
    # Encode the brand flag as an integer (1 or 0)
    features["is_top_electronics_brand"] = int(bool(is_top_electronics_brand(item)))
    return features

In [132]:
get_features(train[0])

{'weight': 5.33,
 'rank': 296.0,
 'text_length': 723,
 'is_top_electronics_brand': 0}