# This notebook is for your info only

This is where I created the features for the dataset that we use during the labs for this segment.

In [None]:
# imports

import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
from tqdm import tqdm
import pickle
import json

In [None]:
# Load the pickled train and test splits from the parent directory.
# Note: unpickling can execute arbitrary code, so only load files you created yourself.

with open('../train.pkl', 'rb') as train_fh:
    train = pickle.load(train_fh)

with open('../test.pkl', 'rb') as test_fh:
    test = pickle.load(test_fh)

In [None]:
# Concatenate both splits so each feature-engineering loop below runs once over everything.
# Assuming `train` and `test` are lists, `items` is a new list referencing the same
# objects, so attributes set via `items` are also visible through `train` and `test`.
items = train + test

In [None]:
# Sanity check: total number of items across both splits.
len(items)

In [None]:
# Decode each item's JSON `details` string and keep the result on the item as `features`.
for entry in tqdm(items):
    entry.features = json.loads(entry.details)

In [None]:
# Split the "Item Weight" feature (e.g. "<number> <units>") into a numeric weight
# and a units label. Items without the feature get None for both attributes.
for item in items:
    raw_weight = item.features.get('Item Weight')
    if not raw_weight:
        item.weight = None
        item.units = None
        continue
    # Split on the first space only: the units label may itself contain spaces.
    # A value with no space raises ValueError.
    amount, unit_label = raw_weight.split(' ', 1)
    item.weight = float(amount)
    item.units = unit_label

In [None]:
# Spot-check the units parsed for the first item (None if it had no "Item Weight").
items[0].units

In [None]:
# Distinct unit labels present in the data; every one needs an entry in the conversion table.
{item.units for item in items if item.units is not None}

In [None]:
# Conversion factors from each unit label found in the data to ounces.
# Both capitalised and lower-case spellings are listed because lookups are exact-match.
multipliers = {
    'Grams': 0.035274,
    'Hundredths Pounds': 0.16,   # 1/100 lb = 0.16 oz
    'Kilograms': 35.27396,
    'Milligrams': 0.000035274,
    'Ounces': 1,
    'Pounds': 16,
    'ounces': 1,
    'pounds': 16,
}

In [None]:
# Normalise every known weight to ounces, in place.
# Items with a missing (or zero) weight are skipped and keep their original units.
# Re-running the cell is harmless: already-converted items have units "ounces",
# whose multiplier is 1.
for item in items:
    if not item.weight:
        continue
    factor = multipliers[item.units]  # KeyError signals a unit missing from the table
    item.weight = item.weight * factor
    item.units = "ounces"

In [None]:
# Reduce "Best Sellers Rank" to a single number: the smallest value it contains.
# Presumably the feature maps category name -> rank, so this is the item's best rank
# in any category -- TODO confirm against the raw data.
for item in items:
    rank_by_category = item.features.get("Best Sellers Rank")
    item.rank = min(rank_by_category.values()) if rank_by_category else None

In [None]:
from datetime import datetime

# Presumably some raw dates carry a month token such as "03M" instead of the month
# name; swapping tokens for names lets one strptime format parse every record.
replacer = {
    "01M": "January",
    "02M": "February",
    "03M": "March",
    "04M": "April",
    "05M": "May",
    "06M": "June",
    "07M": "July",
    "08M": "August",
    "09M": "September",
    "10M": "October",
    "11M": "November",
    "12M": "December",
}

# Convert "Date First Available" to a POSIX timestamp; items without the feature get None.
for item in items:
    first_available = item.features.get("Date First Available")
    if not first_available:
        item.timestamp = None
        continue
    for token, month_name in replacer.items():
        first_available = first_available.replace(token, month_name)
    # NOTE(review): strptime returns a naive datetime and .timestamp() treats naive
    # values as local time, so the exact numbers depend on the machine's timezone.
    # strptime raises ValueError for any date not in "Month day, Year" form.
    item.timestamp = datetime.strptime(first_available, "%B %d, %Y").timestamp()

In [None]:
# Manufacturer names (lower-case) treated as top electronics / toy brands.
top_tech = [
    'samsung', 'sony', 'garmin', 'intel', 'dell computers',
    'hp', 'lg', 'asus', 'nikon',
]
top_toys = ['mattel', 'hasbro', 'lego']

# Flag each item by exact, case-insensitive match on its "Manufacturer" feature.
# A missing manufacturer is treated as the empty string, so both flags are False.
for item in items:
    maker = (item.features.get("Manufacturer") or "").lower()
    item.is_top_tech = maker in top_tech
    item.is_top_toys = maker in top_toys


In [None]:
# Average of each numeric feature, taken over the items that actually have a value.
# Falsy values (None, and also 0) are excluded from each average.
ranks = [i.rank for i in items if i.rank]
average_rank = sum(ranks) / len(ranks)
weights = [i.weight for i in items if i.weight]
average_weight = sum(weights) / len(weights)
# The original cell ended with a bare, undefined `timestamps`, which raised NameError
# on a fresh kernel; it is now built the same way as the other two features.
timestamps = [i.timestamp for i in items if i.timestamp]
average_timestamp = sum(timestamps) / len(timestamps)

# Show the three averages rather than dumping a whole list as cell output.
average_rank, average_weight, average_timestamp

In [None]:
# Keep only training items that have all three numeric features.
# The test is truthiness, so a value of 0 is dropped along with None.
train_features = [
    item
    for item in train
    if item.rank and item.weight and item.timestamp
]

In [None]:
# How many training items survived the complete-features filter.
len(train_features)

In [None]:
# Keep only test items that have all three numeric features.
# The test is truthiness, so a value of 0 is dropped along with None.
test_features = [
    item
    for item in test
    if item.rank and item.weight and item.timestamp
]

In [None]:
# How many test items survived the complete-features filter.
len(test_features)

In [None]:

# Persist the feature-complete subsets next to the original split files.
with open('../training_data.pkl', 'wb') as train_out:
    pickle.dump(train_features, train_out)

with open('../test_data.pkl', 'wb') as test_out:
    pickle.dump(test_features, test_out)

# Update the pickle files

Adding a text attribute to all the datapoints to simplify the code in the labs

In [None]:
# Reload the original splits from disk; this rebinds `train` and `test` to fresh
# objects without the feature attributes added earlier in the notebook.
with open('../train.pkl', 'rb') as train_fh:
    train = pickle.load(train_fh)

with open('../test.pkl', 'rb') as test_fh:
    test = pickle.load(test_fh)

In [None]:
# Give every datapoint a `text` attribute: its test prompt with the leading question
# and the trailing "Price is $" removed.
for split in (train, test):
    for t in split:
        prompt = t.test_prompt()
        without_question = prompt.replace('How much does this cost to the nearest dollar?\n\n', '')
        t.text = without_question.replace('\n\nPrice is $', '')

In [None]:
# Overwrite the original split files so they now include the `text` attribute.
with open('../train.pkl', 'wb') as train_out:
    pickle.dump(train, train_out)

with open('../test.pkl', 'wb') as test_out:
    pickle.dump(test, test_out)

In [None]:
# Load the feature-complete subsets. This reuses the names `train` and `test`,
# which from here on refer to the contents of the *_data.pkl files.
with open('../training_data.pkl', 'rb') as train_fh:
    train = pickle.load(train_fh)

with open('../test_data.pkl', 'rb') as test_fh:
    test = pickle.load(test_fh)

In [None]:
# Give every datapoint in the feature subsets a `text` attribute: its test prompt
# with the leading question and the trailing "Price is $" removed.
for split in (train, test):
    for t in split:
        prompt = t.test_prompt()
        without_question = prompt.replace('How much does this cost to the nearest dollar?\n\n', '')
        t.text = without_question.replace('\n\nPrice is $', '')

In [None]:
# Overwrite the feature-subset files so they now include the `text` attribute.
with open('../training_data.pkl', 'wb') as train_out:
    pickle.dump(train, train_out)

with open('../test_data.pkl', 'wb') as test_out:
    pickle.dump(test, test_out)