In [1]:
import keepa
import numpy as np
import requests
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
from sklearn import linear_model

In [2]:
with open("api_key.txt") as key_file :
    api_key = key_file.readline().strip()
api = keepa.Keepa(api_key)

In [3]:
# do a product search to filter out books that may be good to look at
book_params = {
    'deltaPercent30_USED_gte': 20,
    'deltaPercent30_USED_lte': 99,
    'deltaPercent90_COUNT_USED_gte': -50,
    'deltaPercent90_COUNT_USED_lte': 99,
    'avg180_SALES_gte': 0,
    'avg180_SALES_lte': 7000000,
    'rootCategory': 283155
}
# don't include list price because there's no way to compare the list price with the used price at this point in the process
# have Mark go through the other parameters and see if there's anything he thinks would also be helpful to include

In [27]:
test_params = {
    "avg180_SALES_gte": 10000,
    "avg180_SALES_lte": 6000000,
    "current_USED_gte": 800,
    "current_USED_lte": 1200,
    "avg30_USED_gte": 1800,
    "avg30_USED_lte": 1000000000,
    "current_LISTPRICE_gte": 5000,
    "current_LISTPRICE_lte": 100000000,
    "avg180_LISTPRICE_gte": 5000,
    "avg180_LISTPRICE_lte": 100000000,
    "rootCategory": 283155,
    "perPage": 10
}

In [28]:
test = api.product_finder(test_params)

Exception: REQUEST_REJECTED

In [3]:
asins = ['1888799838',
'1111987254',
'0769857698',
'185617610X',
'0521860962',
'0387708820',
'0072852631',
'1943876339',
'1580017304',
'1269773186']
book_data = api.query(asins)

100%|██████████| 10/10 [00:03<00:00,  2.81it/s]


In [16]:
# now call the api to get the data on all these books
book_data = api.query(test[40:50])

100%|██████████| 10/10 [00:05<00:00,  1.94it/s]


In [33]:
book_data[1]['asin']

'0155510088'

In [None]:
# ------- THE METRICS --------
# Used Count % change from 90 average now to 90 day average a year ago (maybe have it's weight be linear based off the average
# of the averages)
# Used Count change in number (90 day average to 90 day average)
# Current Used Count
# Current List Price
# Current New Price
# Max Trade-in value over the past year
# % ROI from current used price + shipping (see below) compared to average 
# price it sold at (or during peak?) minus Amazon ($10+15%) fees
# Residuals of rolling average
# Max rolling average (amount made during peak)

In [4]:
# when it sold, how much it sold for, and the number of used offers at that time
two_yrs_ago = datetime.datetime.now() - relativedelta(years=2)
sell_prices = pd.DataFrame()
sell_dates = pd.DataFrame()
used_counts = pd.DataFrame()
for book in range(len(book_data)) :
    last_two_years = book_data[book]['data']['SALES_time'] > two_yrs_ago
    drop_dates_list = []
    sell_prices_list = []
    used_counts_list = []
    for day in np.where(last_two_years)[0] :
        if ((book_data[book]['data']['SALES'][day - 1] - 
            book_data[book]['data']['SALES'][day]) / 
            (book_data[book]['data']['SALES'][day - 1]) >= .04) :
            day_sold = book_data[book]['data']['SALES_time'][day]
            drop_dates_list.append(day_sold)
            try :
                day_sold_Used_index = np.where(book_data[book]['data']['USED_time'] == day_sold)[0][0] - 1 
            except IndexError :
                previous_days = np.where(book_data[book]['data']['USED_time'] < day_sold)
                day_sold_Used_index = previous_days[0][-1]
            sell_prices_list.append(book_data[book]['data']['USED'][day_sold_Used_index])
            try :
                day_sold_used_count = np.where(book_data[book]['data']['COUNT_USED_time'] == day_sold)[0][0] - 1
            except IndexError :
                previous_days_count = np.where(book_data[book]['data']['COUNT_USED_time'] < day_sold)
                day_sold_used_count = previous_days_count[0][-1]
            used_counts_list.append(book_data[book]['data']['COUNT_USED'][day_sold_used_count])
    drop_dates_df = pd.DataFrame({book_data[book]['asin']:drop_dates_list})
    sell_dates = pd.concat([sell_dates, drop_dates_df], ignore_index=True, axis=1)
    sell_prices_df = pd.DataFrame({book_data[book]['asin']:sell_prices_list})
    sell_prices = pd.concat([sell_prices, sell_prices_df], ignore_index=True, axis=1)
    used_counts_df = pd.DataFrame({book_data[book]['asin']:used_counts_list})
    used_counts = pd.concat([used_counts, used_counts_df], ignore_index=True, axis=1)

In [40]:
# find the rolling 30 day sales total
sell_prices = sell_prices.fillna(0) #convert the NaNs to 0s so we can do math with them
rolling_averages = pd.DataFrame()
num_sold = pd.DataFrame()
for book in range(len(book_data)) :
    two_yrs_ago = datetime.datetime.now() - relativedelta(years=2)
    two_yrs_30days = two_yrs_ago + datetime.timedelta(days=30)
    mving_window = [two_yrs_ago, two_yrs_30days]
    rolling_average = []
    books_sold = []
    for day in range(701) : # there's 701 days between the end of the moving window and today
        start = sell_dates[book] > mving_window[0]
        end = sell_dates[book] < mving_window[1]
        window = np.where(start & end)[0]
        rolling_average.append(np.sum(sell_prices[book][window]))
        books_sold.append(np.count_nonzero(sell_prices[book][window]))
        for date in range(len(mving_window)) :
            mving_window[date] += datetime.timedelta(days=1)
    rolling_averages[book] = rolling_average
    num_sold[book] = books_sold

In [6]:
# Variance of rolling sales totals (residuals)
regr = linear_model.LinearRegression()
now = datetime.datetime.now().date()
start_date = now + relativedelta(years=-2,days=30) 
x = np.arange(701).reshape(-1,1)
variance = []
for book in range(len(book_data)) :
    y = rolling_averages[book]
    regr.fit(x, y)
    score = regr.score(x,y)
    y_mean = np.average(y)
    sum_sqs = []
    for day in range(701) :
        sum_sqs.append((rolling_averages[book][day] - y_mean) ** 2)
    total_sum_sqs = np.sum(sum_sqs)
    residual_sum_sqs = total_sum_sqs * (1 - score)
    variance.append(residual_sum_sqs)

In [45]:
# Highest peak amount and date
peaks = pd.DataFrame()
peak_amount = []
peak_end_date = []
peak_num_sold = []
now = datetime.datetime.now().date()
start = now + relativedelta(years=-2,days=30)
date_range = pd.date_range(start, now)
for book in range(len(book_data)) :
    peak = np.amax(rolling_averages[book][335:]) # only look at past year
    peak_amount.append(peak)
    peak_time = np.where(rolling_averages[book] == peak)[0]
    peak_end_date.append(date_range[peak_time[-1]])
    peak_num_sold.append(num_sold[book][peak_time[-1]])
peaks['Peak Amount'] = peak_amount  #### Do we even want to use the total amount it made during peak? 
# take this part out probably
# how can we calculate likelyhood to sell
# maybe take this and compare it with the average number of used offers during that time
peaks['Peak End Date'] = peak_end_date
peaks['Num Sold During Peak'] = peak_num_sold

In [24]:
# USED COUNT % change and difference for 90 averages over a year
# compute a reimann sum for the step graph, with delta x as 12 hours
used_count_avgs = pd.DataFrame()
thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=90)
for year in range(2) :    
    used_count_avg = []
    for book in range(len(book_data)) :
        start = thirty_days_ago - (datetime.timedelta(days=365) * year)
        total_used_sum = []
        for twelve_hours in range(180) :
            time = start + (datetime.timedelta(hours=12) * twelve_hours)
            current = book_data[book]['data']['COUNT_USED_time'] < time
            last_value = book_data[book]['data']['COUNT_USED'][np.where(current)[0][-1]]
            current_used_count = 0 if(last_value == -1) else last_value # the data lists -1 where there are really 0 used offers
            total_used_sum.append(current_used_count)
        used_count_avg.append(np.average(total_used_sum))
    used_count_avgs['This year' if(not year) else 'Last year'] = used_count_avg

# now use the averages to compute our metrics
# also take out the current used count, list price, and new price
used_count_metrics = pd.DataFrame()
used_count_per_change = []
used_count_diff = []
current_used_count = []
current_list_price = []
current_new_price = []
for book in range(len(book_data)) :
    per_change = (used_count_avgs['This year'][book] - 
                  used_count_avgs['Last year'][book]) / used_count_avgs['Last year'][book]
    used_count_per_change.append(per_change)
    used_count_diff.append(used_count_avgs['This year'][book] - used_count_avgs['Last year'][book])
    current_used_count.append(book_data[book]['data']['COUNT_USED'][-1])
    if(np.isnan(book_data[book]['data']['NEW'][-1])) :
        current_new_price.append(book_data[book]['data']['NEW'][-2])
    else :
        current_new_price.append(book_data[book]['data']['NEW'][-1])
    try :
        book_data[book]['data']['LISTPRICE']
        if(np.isnan(book_data[book]['data']['LISTPRICE'][-1])) :
            current_list_price.append(current_new_price[book])
        else :
            current_list_price.append(book_data[book]['data']['LISTPRICE'][-1])
    except KeyError :
        current_list_price.append(current_new_price[book])
    
used_count_metrics['USED_COUNT percent change'] = used_count_per_change
used_count_metrics['USED_COUNT difference'] = used_count_diff

In [9]:
# % ROI from current used price + shipping (see below) compared to average 
# price it sold at (or during peak?) minus Amazon ($10+15%) fees
# for current used price (how much we pay) add $3.99 for below $5.46 and $2 for between that and $10

# adjust each price it sold at for Amazon fees, counting anything less than $10 as not selling
# Sum up the Amazon-fee-adjusted prices it sold at
# add shipping to current used price
# calculate adjusted ROI

# change anytime it sold for less than $10 to counting it as not selling at all
for column in sell_prices.columns :
    for row in sell_prices.index :
        if(sell_prices[column][row] <= 10) :
            sell_prices[column][row] = 0

# create the function for deducting Amazon fees (15% + $10 flat fee)
Amazon_fees = lambda price : price - (price * .15) - 10

Amazon_fees(11.77) # below this it calculates lost money

# make function for adding shipping costs to cheaper books
def add_shipping(price) :
    if price < 5.46 :
        return(price + 3.99)
    elif price < 10 :
        return(price + 2)
    else :
        return price

# Calculate the total money the book made after Amazon fees in the past year
past_year = datetime.datetime.now() - relativedelta(years=1)
psuedo_roi = []
for book in sell_dates.columns :
    recently = np.where(sell_dates[book] > past_year)[0]
    total_revenue = np.sum(Amazon_fees(sell_prices[book][recently]))
    used_price = add_shipping(book_data[book]['data']['USED'][-1])
    fake_roi = (total_revenue - used_price) / used_price
    psuedo_roi.append(fake_roi)

In [10]:
# max trade-in value over the past year
max_trade_in = []
for book in range(len(book_data)) :
    try :
        last_365 = np.where(book_data[book]['data']['TRADE_IN_time'] > past_year)[0]
        trade_in_data = np.nan_to_num(book_data[book]['data']['TRADE_IN'][last_365])
        max_trade_in.append(np.amax(trade_in_data))
    except :
        max_trade_in.append(0)

In [25]:
metrics = pd.DataFrame({'Psuedo ROI': psuedo_roi, 
                        'USED COUNT % change': used_count_metrics['USED_COUNT percent change'],
                        'USED COUNT difference': used_count_metrics['USED_COUNT difference'],
                        'CURRENT USED COUNT': current_used_count,
                        'CURRENT LIST PRICE': current_list_price,
                        'CURRENT NEW PRICE': current_new_price,
                        'Max Trade-in Value': max_trade_in,
                        'Cyclicity': variance
                       })

In [26]:
metrics 

Unnamed: 0,Psuedo ROI,USED COUNT % change,USED COUNT difference,CURRENT USED COUNT,CURRENT LIST PRICE,CURRENT NEW PRICE,Max Trade-in Value,Cyclicity
0,6.279064,-0.397813,-5.255556,6,9.99,9.99,20.28,2077264.0
1,6.005884,-0.104161,-2.211111,13,109.99,81.88,22.07,1650239.0
2,18.585194,-0.322908,-8.488889,14,140.0,114.38,13.99,1694461.0
3,24.757296,0.610473,5.311111,9,193.0,661.14,48.7,18424330.0
4,37.778017,-0.332166,-9.488889,18,196.95,373.78,30.06,144934400.0
5,-0.836795,14.528736,7.022222,1,160.83,160.83,0.0,7518766.0
6,-0.232965,-0.623491,-7.461111,1,146.48,146.48,0.0,841644.8
7,3.183201,-0.665464,-11.272222,4,87.95,48.0,12.96,917543.2
8,10.735145,-0.521008,-10.677778,7,206.08,206.08,15.17,921461.4
9,26.107991,-0.655252,-25.088889,11,95.0,67.84,0.0,1433419.0


In [None]:
# now train the model
regr = linear_model.LinearRegression()
X = metrics
y = [#put the subjective 'scores' in here]
regr.fit(X,y)
regr.coef_

In [None]:
# Questions for Mark ~ 
# what can knowing the used offer count when it sold be good for?
# If a book sold many times at a low value, that's not bad, but only slightly good right?
# We care more about how much a book can sell in a year than in a peak period right?
# So really we should calculate the sum total it made after Amazon fees in the past year then use that to compute the ROI
# compare to the current priced + shipping

In [None]:
# Calculate the value/worth of it to us and expected value ~ compare it with current price
# when doing exp val if less than $10 just counts as not selling


In [46]:
peaks

Unnamed: 0,Peak Amount,Peak End Date,Num Sold During Peak
0,192.1,2020-02-23,5
1,130.23,2019-06-16,3
2,136.78,2020-02-16,3
3,394.05,2020-02-03,3
4,1884.72,2020-02-12,38
5,372.35,2019-09-21,3
6,111.2,2019-05-10,2
7,205.12,2020-02-08,5
8,179.5,2019-11-24,2
9,138.0,2020-01-12,3


In [None]:

# we can then create a graph of the likelyhood it will sell in that peak period at that given price
# ((probability it will sell) * (the price it would have sold at - Amazon fees) - (used price + shipping)) / (used pric+shipng)

# we calculate our price to be the average price for which it sold during the peak period this last year
peak_avg_price = []
for book in range(len(book_data)) :
    avg_price = peaks['Peak Amount'][book] / peaks['Num Sold During Peak'][book]
    peak_avg_price.append(avg_price)

# we set lambda to be the number of times it sold in that period divided by 30   
lambdas = []
for book in range(len(book_data)) :
    lambdas.append(peaks['Num Sold During Peak'] / 30)

# given that a book sold, probability that it was our book that sold and not someone else's
def prob_book_sold(used_count) :
    return .9 - (.13 * np.sqrt(used_count))

# given a certain lambda and given the probability that it was our book that sold, now find the probability that 
# the book sold during the peak period