# The Product Pricer With Linear Regression

This notebook builds traditional machine learning models that estimate a product's price from extracted numeric features — such as weight, brand, text length, and popularity rank — rather than from raw text descriptions.

## Purpose

To demonstrate a baseline regression model that can predict prices from structured data.
This serves as the foundation to later compare and improve upon with more advanced models.

### Baseline Models

In [None]:
!pip install pandas numpy scikit-learn gensim huggingface-hub

In [None]:
### General Imports
import os
import math
import json
import random
from dotenv import load_dotenv
import matplotlib.pyplot as plt 
import pickle 
from collections import Counter
from huggingface_hub import login

In [None]:
### Imports For Traditional Machine Learning

import pandas as pd 
import numpy as np 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
### For NLP related machine learning

from sklearn.feature_extraction.text import CountVectorizer 
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
### More of machine learning related ones
from sklearn.svm import LinearSVR 
from sklearn.ensemble import RandomForestRegressor 

In [None]:
### Internal Classes

from items import Item

In [None]:
### Constants -- ANSI escape sequences for colored stdout output, plus the
### lookup from a color label to its escape sequence

GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = dict(green=GREEN, orange=YELLOW, red=RED)

In [None]:
### Environment -- load_dotenv() runs first, then the API credentials are
### looked up in the process environment (each is None when the variable is unset)

load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
HF_TOKEN_KEY = os.environ.get("HF_TOKEN")

### Log in to the Hugging Face Hub with the token read above
login(HF_TOKEN_KEY)


In [None]:
%matplotlib inline

## Loading the pkl files

In [None]:
### NOTE(review): pickle.load can execute code embedded in the file, so these
### files must come from a trusted source.
with open("train.pkl", "rb") as train_file:
    train = pickle.load(train_file)

with open("test.pkl", "rb") as test_file:
    test = pickle.load(test_file)

In [None]:
### Show what test_prompt() returns for the first test item
print(test[0].test_prompt())

In [None]:
### Show the price stored on the first test item
print(test[0].price)

## Tester Class

In [None]:
class Tester:
    """
    Evaluate a pricing function on the first `size` items of a dataset.

    For every item the predictor's guess is compared with the item's `price`;
    a color-coded line is printed per item and, at the end, a scatter chart of
    guesses against ground truth is drawn with summary metrics in its title.
    """

    def __init__(self, predictor, title=None, data=None, size=250):
        """
        predictor: callable that takes one item and returns a numeric price guess
        title: chart heading; derived from the predictor's __name__ when omitted
        data: indexable collection of items; defaults to the notebook-level `test`
        size: maximum number of items, taken from the front of `data`, to evaluate
        """
        ### Resolve the default at call time: a `data=test` default would freeze
        ### whatever `test` was bound to when the class was defined
        if data is None:
            data = test
        self.predictor = predictor
        self.data = data
        ### Conditionally used for either the text report or chart's title
        self.title = title or predictor.__name__.replace("_", " ").title()
        ### Never index past the end of a dataset shorter than `size`
        self.size = min(size, len(data))
        self.truths = []
        self.guesses = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """
        Grade one guess from its absolute error and the true price.

        Returns "green" when the error is below 20 or below 20% of the truth,
        "orange" when it is below 80 or below 40% of the truth, else "red".
        """
        ### A zero truth has no relative error; fall back to the absolute
        ### thresholds only instead of raising ZeroDivisionError
        relative_error = error / truth if truth else float("inf")
        if error < 20 or relative_error < 0.2:
            return "green"
        if error < 80 or relative_error < 0.4:
            return "orange"
        return "red"

    @staticmethod
    def _squared_log_error(guess, truth):
        """Return (log(truth + 1) - log(guess + 1)) ** 2 with negative guesses treated as 0."""
        ### log(guess + 1) is undefined for guess <= -1, which a regression
        ### model can produce, so clamp the guess before taking the log
        clamped_guess = max(guess, 0)
        log_error = math.log(truth + 1) - math.log(clamped_guess + 1)
        return log_error ** 2

    def run_datapoint(self, i):
        """Score item `i` of the dataset, record the result and print one colored line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint)
        truth = datapoint.price
        error = abs(guess - truth)
        sle = self._squared_log_error(guess, truth)
        color = self.color_for(error, truth)
        ### Keep the printed line short: long titles are cut to 40 characters
        title = datapoint.title if len(datapoint.title) < 40 else datapoint.title[:40] + "..."
        self.truths.append(truth)
        self.guesses.append(guess)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i + 1}: Guess: ${guess:,.2f} | Truth: ${truth:,.2f} | Error: {error:,.2f} | SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Draw guesses against ground truth, with the y = x line as the reference for a perfect guess."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color="deepskyblue", lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """
        Put the average error, RMSLE and share of "green" guesses in the chart
        title and draw the chart.

        Raises ValueError when no datapoint has been evaluated yet.
        """
        ### Average over what was actually evaluated rather than the requested size
        evaluated = len(self.errors)
        if evaluated == 0:
            raise ValueError("No datapoints have been evaluated; call run() first")
        average_error = sum(self.errors) / evaluated
        rmsle = math.sqrt(sum(self.sles) / evaluated)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits / evaluated * 100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate items 0 .. size-1 in order, then produce the report."""
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function):
        """Shortcut: build a Tester with default settings for `function` and run it."""
        cls(function).run()


In [None]:
### Simulator function

def random_pricer(item):
    """Ignore the item and return a pseudo-random integer from 1 to 999 inclusive."""
    return random.randint(1, 999)

In [None]:
### Seed the random module so the random guesses repeat from run to run
random.seed(42)

### Run Tester with a simulating guess predictor
Tester.test(random_pricer)

In [None]:
### Another test -- always guess the mean price of the train data

training_prices = [entry.price for entry in train]
training_avr_price = sum(training_prices) / len(training_prices)


def constant_pricer(item):
    """Ignore the item and return the mean price computed from the train data."""
    return training_avr_price

In [None]:
### Run Tester with the constant (training-average) predictor
Tester.test(constant_pricer)

## Feature Engineering

This is an educational example that uses product weight, brand, and other fields
to demonstrate feature engineering; it is not meant for real-world use.

In [None]:
### Inspect the raw details field of the first training item
train[0].details

In [None]:
### Give every item in both splits a "features" field holding the result of
### JSON-decoding its details field

for dataset in (train, test):
    for item in dataset:
        item.features = json.loads(item.details)

In [None]:
### List the feature names parsed for the first training item
print(train[0].features.keys())

In [None]:
### Look at 20 most common features in training set

feature_count = Counter()

for item in train:
    ### Each item adds one count for every feature name it carries
    feature_count.update(item.features.keys())

feature_count.most_common(20)

### Product Weights

In [None]:
# Pluck out the Item Weight and convert it to pounds

### Divisor that turns an amount in the given unit into pounds
_WEIGHT_UNIT_DIVISORS = {
    "pounds": 1,
    "ounces": 16,
    "grams": 453.592,
    "milligrams": 453592,
    "kilograms": 0.453592,
}


def get_weight(item):
    """
    Return the item's 'Item Weight' feature converted to pounds.

    The feature is read as "<amount> <unit>", e.g. "2.5 pounds", or the
    three-word form "<amount> hundredths pounds". Units are matched
    case-insensitively.

    Returns None when the feature is missing or empty. A weight string that
    cannot be interpreted (no unit, non-numeric amount, unknown unit) is
    printed for inspection and also yields None.
    """
    weight_str = item.features.get('Item Weight')
    if not weight_str:
        return None

    ### split() with no argument tolerates repeated or surrounding whitespace
    parts = weight_str.split()
    try:
        amount = float(parts[0])
        unit = parts[1].lower()
    except (IndexError, ValueError):
        ### e.g. "5" (no unit) or "heavy pounds" (no numeric amount)
        print(weight_str)
        return None

    if unit in _WEIGHT_UNIT_DIVISORS:
        return amount / _WEIGHT_UNIT_DIVISORS[unit]
    ### len() check: "hundredths" without a following word must not raise
    if unit == "hundredths" and len(parts) > 2 and parts[2].lower() == "pounds":
        return amount / 100

    print(weight_str)
    return None

In [None]:
### Collect weights from the TRAINING split only: their average becomes the
### default feature value for items without a weight, and deriving it from the
### test split would leak test-set statistics into the model inputs
weights = [get_weight(t) for t in train]

### Filter out None values (if w) -- this also drops weights equal to 0
weights = [w for w in weights if w]

In [None]:
### Mean of the collected weights; serves as the fallback for items without a weight
average_weight = sum(weights) / len(weights)
print(f"Avr Weight: {average_weight:.2f}")

In [None]:
### Set default weight for items with None value for weights
def get_default_weight(item):
    """Return the item's weight, or `average_weight` when get_weight() gives None or 0."""
    weight = get_weight(item)
    return weight or average_weight


### Backward-compatible alias: the function was first defined under this
### misspelled name, and existing callers still use it
get_defualt_weight = get_default_weight

In [None]:
### Spot-check the weight extraction on one training item
w = get_weight(train[1000])
print(w)

### Best Seller Ranks 

In [None]:
def get_rank(item):
    """
    Average the values of the item's "Best Sellers Rank" feature into one number.

    Returns None when the feature is missing or empty.
    """
    ### A product can be ranked in several categories, one entry per category
    category_ranks = item.features.get("Best Sellers Rank")
    if not category_ranks:
        return None

    rank_values = category_ranks.values()
    return sum(rank_values) / len(rank_values)
    
    

In [None]:
### Spot-check the average rank of one training item
get_rank(train[100])

In [None]:
### Average rank over the training items that have one (None and 0 are dropped)
ranks = [rank for rank in map(get_rank, train) if rank]

average_rank = sum(ranks) / len(ranks)
print(f"Average Rank = {average_rank:,.2f}")

In [None]:
def get_default_rank(item):
    """Return the item's average rank, or `average_rank` when get_rank() gives None or 0."""
    own_rank = get_rank(item)
    if own_rank:
        return own_rank
    return average_rank

In [None]:
def get_text_length(item):
    """Return len() of the item's prompt."""
    prompt_text = item.prompt
    return len(prompt_text)

In [None]:
### Look at 40 most common brands

### Count every non-empty "Brand" value across the training items
brand_counts = Counter(
    brand
    for brand in (t.features.get("Brand") for t in train)
    if brand
)

print(brand_counts.most_common(40))

In [None]:
### Lower-case brand names treated as top electronics brands
TOP_ELECTRONICS_BRANDS = ["hp", "dell", "lenovo", "samsung", "asus", "sony", "canon", "apple", "intel"]


def is_top_electronics_brand(item):
    """
    Return True when the item's "Brand" feature, compared case-insensitively,
    is listed in TOP_ELECTRONICS_BRANDS; return False otherwise, including
    when the item has no brand.
    """
    brand = item.features.get("Brand")
    ### bool(...) keeps the result a real boolean; `brand and ...` on its own
    ### hands back None or "" for items without a brand
    return bool(brand) and brand.lower() in TOP_ELECTRONICS_BRANDS

### Spot-check the brand flag on one training item
print(is_top_electronics_brand(train[2300]))

### Result: get_features()

In [None]:
def get_features(item):
    """
    Build the feature dict for one item.

    Keys, in order: "weight", "rank", "text_length" and
    "is_top_electronics_brand" (the last one encoded as 1 or 0).
    """
    features = {}
    features["weight"] = get_defualt_weight(item)
    features["rank"] = get_default_rank(item)
    features["text_length"] = get_text_length(item)
    ### Encode the brand check as an integer flag
    features["is_top_electronics_brand"] = 1 if is_top_electronics_brand(item) else 0
    return features

In [None]:
### Inspect the feature dict built for the first training item
get_features(train[0])

## Traditional Linear Regression

In [None]:
# A utility function to convert a list of items into a pandas dataframe

def list_to_dataframe(items):
    """
    Build a DataFrame with one row per item: the get_features() columns
    followed by a "price" column holding each item's price.
    """
    feature_rows = [get_features(entry) for entry in items]
    prices = [entry.price for entry in items]
    ### "price" is the label column, appended after the feature columns
    return pd.DataFrame(feature_rows).assign(price=prices)

### Build the tables: all training items, and only the first 250 test items
train_df = list_to_dataframe(train)
test_df = list_to_dataframe(test[:250])

In [None]:
### Preview the first 10 rows of the test table
print(test_df[:10])

In [None]:
### Traditional Linear Regression

np.random.seed(42)

### Columns used as model inputs; "price" is the target
feature_columns = ["weight", "rank", "text_length", "is_top_electronics_brand"]

X_train, y_train = train_df[feature_columns], train_df["price"]
X_test, y_test = test_df[feature_columns], test_df["price"]

### Fit the linear regression on the training table
model = LinearRegression().fit(X_train, y_train)

print(model.coef_)

### Print each feature name next to its fitted coefficient
print("\n|==Metrics Report==|\n")
for column_name, coefficient in zip(feature_columns, model.coef_):
    print(f"{column_name}: {coefficient}")

### Fitted intercept
print(f"Intercept: {model.intercept_}")


### Predict the test table and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squred Error: {mse}")
print(f"R-Squared Score: {r2}")

In [None]:
### Function to predict price for a new item

def linear_regression_pricer(item):
    """
    Predict the price of one item with the fitted linear regression `model`.

    The item's get_features() dict becomes a single-row DataFrame, so its
    columns carry the same names the model was fitted on. Returns a plain
    float that is never below zero.
    """
    features_df = pd.DataFrame([get_features(item)])
    prediction = float(model.predict(features_df)[0])
    ### A linear model can extrapolate below zero, which is not a valid price
    ### and breaks log-based error metrics for guesses <= -1
    return max(prediction, 0.0)

In [None]:
### Run Tester with the linear regression predictor
Tester.test(linear_regression_pricer)