In [56]:
import keepa
import numpy as np
import requests
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
from sklearn import linear_model

In [2]:
with open("api_key.txt") as key_file :
    api_key = key_file.readline().strip()
api = keepa.Keepa(api_key)

In [3]:
# do a product search to filter out books that may be good to look at
book_params = {
    'deltaPercent30_USED_gte': 20,
    'deltaPercent30_USED_lte': 99,
    'deltaPercent90_COUNT_USED_gte': -50,
    'deltaPercent90_COUNT_USED_lte': 99,
    'avg180_SALES_gte': 0,
    'avg180_SALES_lte': 7000000,
    'rootCategory': 283155
}
# don't include list price because there's no way to compare the list price with the used price at this point in the process
# have Mark go through the other parameters and see if there's anything he thinks would also be helpful to include

In [14]:
test_params = {
    "avg180_SALES_gte": 10000,
    "avg180_SALES_lte": 6000000,
    "current_USED_gte": 800,
    "current_USED_lte": 1200,
    "avg30_USED_gte": 1800,
    "avg30_USED_lte": 1000000000,
    "current_LISTPRICE_gte": 5000,
    "current_LISTPRICE_lte": 100000000,
    "avg180_LISTPRICE_gte": 5000,
    "avg180_LISTPRICE_lte": 100000000,
    "rootCategory": 283155,
    "perPage": 10
}

In [None]:
test = api.product_finder(test_params)

In [16]:
asins = ['1888799838',
'1111987254',
'0769857698',
'185617610X',
'0521860962',
'0387708820',
'0072852631',
'1943876339',
'1580017304',
'1269773186']
book_data = api.query(asins)

100%|██████████| 10/10 [00:03<00:00,  3.09it/s]


In [16]:
# now call the api to get the data on all these books
book_data = api.query(test[40:50])

100%|██████████| 10/10 [00:05<00:00,  1.94it/s]


In [33]:
book_data[1]['asin']

'0155510088'

In [None]:
# ------- THE METRICS --------
# Used Count % change from 90 average now to 90 day average a year ago (maybe have it's weight be linear based off the average
# of the averages)
# Used Count change in number (90 day average to 90 day average)
# Current Used Count
# Current List Price
# Current New Price
# Max Trade-in value over the past year
# % ROI from current used price + shipping (see below) compared to average 
# price it sold at (or during peak?) minus Amazon ($10+15%) fees
# Residuals of rolling average
# Max rolling average (amount made during peak)

In [None]:
# TO DO 
# find the used offer count when it sold
# Calculate the value/worth of it to us and expected value ~ compare it with current price
# when doing exp val if less than $10 just counts as not selling

# for current used price (how much we pay) add $3.99 for below $5.46 and $2 for between that and $10

# when 

In [57]:
# when it sold and how much it sold for
two_yrs_ago = datetime.datetime.now() - relativedelta(years=2)
sell_prices = pd.DataFrame()
sell_dates = pd.DataFrame()
for book in range(len(book_data)) :
    last_two_years = book_data[book]['data']['SALES_time'] > two_yrs_ago
    drop_dates_list = []
    sell_prices_list = []
    for day in np.where(last_two_years)[0] :
        if ((book_data[book]['data']['SALES'][day - 1] - 
            book_data[book]['data']['SALES'][day]) / 
            (book_data[book]['data']['SALES'][day - 1]) >= .04) :
            day_sold = book_data[book]['data']['SALES_time'][day]
            drop_dates_list.append(day_sold)
            try :
                day_sold_index = np.where(book_data[book]['data']['USED_time'] == day_sold)[0][0] - 1
            except IndexError :
                previous_days = np.where(book_data[book]['data']['USED_time'] < day_sold)
                day_sold_index = previous_days[0][-1]
            sell_prices_list.append(book_data[book]['data']['USED'][day_sold_index])
    drop_dates_df = pd.DataFrame({book_data[book]['asin']:drop_dates_list})
    sell_dates = pd.concat([sell_dates, drop_dates_df], ignore_index=True, axis=1)
    sell_prices_df = pd.DataFrame({book_data[book]['asin']:sell_prices_list})
    sell_prices = pd.concat([sell_prices, sell_prices_df], ignore_index=True, axis=1)

In [58]:
# find the rolling 30 day sales total
sell_prices = sell_prices.fillna(0) #convert the NaNs to 0s so we can do math with them
rolling_averages = pd.DataFrame()
for book in range(len(book_data)) :
    two_yrs_ago = datetime.datetime.now() - relativedelta(years=2)
    two_yrs_30days = two_yrs_ago + datetime.timedelta(days=30)
    mving_window = [two_yrs_ago, two_yrs_30days]
    rolling_average = []
    for day in range(701) : # there's 701 days between the end of the moving window and today
        start = sell_dates[book] > mving_window[0]
        end = sell_dates[book] < mving_window[1]
        window = np.where(start & end)[0]
        rolling_average.append(np.sum(sell_prices[book][window]))
        for date in range(len(mving_window)) :
            mving_window[date] += datetime.timedelta(days=1)
    rolling_averages[book] = rolling_average

In [82]:
# Variance of rolling sales totals (residuals)
regr = linear_model.LinearRegression()
now = datetime.datetime.now().date()
start_date = now + relativedelta(years=-2,days=30) 
x = np.arange(701).reshape(-1,1)
variance = []
for book in range(len(book_data)) :
    y = rolling_averages[book]
    regr.fit(x, y)
    score = regr.score(x,y)
    y_mean = np.average(y)
    sum_sqs = []
    for day in range(701) :
        sum_sqs.append((rolling_averages[book][day] - y_mean) ** 2)
    total_sum_sqs = np.sum(sum_sqs)
    residual_sum_sqs = total_sum_sqs * (1 - score)
    variance.append(residual_sum_sqs)

In [91]:
# Highest peak amount and date
peaks = pd.DataFrame()
peak_amount = []
peak_end_date = []
now = datetime.datetime.now().date()
start = now + relativedelta(years=-2,days=30)
date_range = pd.date_range(start, now)
for book in range(len(book_data)) :
    peak = np.amax(rolling_averages[book])
    peak_amount.append(peak)
    peak_time = np.where(rolling_averages[book] == peak)[0]
    peak_end_date.append(date_range[peak_time[-1]])
peaks['Peak Amount'] = peak_amount  #### Do we even want to use the total amount it made during peak? 
# take this part out probably
# how can we calculate likelyhood to sell
# maybe take this and compare it with the average number of used offers during that time
peaks['Peak End Date'] = peak_end_date

In [51]:
# USED COUNT % change and difference for 90 averages over a year
# compute a reimann sum for the step graph, with delta x as 12 hours
used_count_avgs = pd.DataFrame()
thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=90)
for year in range(2) :    
    used_count_avg = []
    for book in range(len(book_data)) :
        start = thirty_days_ago - (datetime.timedelta(days=365) * year)
        total_used_sum = []
        for twelve_hours in range(180) :
            time = start + (datetime.timedelta(hours=12) * twelve_hours)
            current = book_data[book]['data']['COUNT_USED_time'] < time
            last_value = book_data[book]['data']['COUNT_USED'][np.where(current)[0][-1]]
            current_used_count = 0 if(last_value == -1) else last_value # the data lists -1 where there are really 0 used offers
            total_used_sum.append(current_used_count)
        used_count_avg.append(np.average(total_used_sum))
    used_count_avgs['This year' if(not year) else 'Last year'] = used_count_avg

# now use the averages to compute our metrics
used_count_metrics = pd.DataFrame()
used_count_per_change = []
used_count_diff = []
for book in range(len(book_data)) :
    per_change = (used_count_avgs['This year'][book] - 
                  used_count_avgs['Last year'][book]) / used_count_avgs['Last year'][book]
    used_count_per_change.append(per_change)
    used_count_diff.append(used_count_avgs['This year'][book] - used_count_avgs['Last year'][book])
used_count_metrics['USED_COUNT percent change'] = used_count_per_change
used_count_metrics['USED_COUNT difference'] = used_count_diff

In [None]:
# % current used price is below average price it sold for after Amazon fees


In [46]:
for book in range(len(book_data)) :
    print(book_data[book]['asin'])

0072852631
0387708820
0521860962
0769857698
1111987254
1269773186
1580017304
185617610X
1888799838
1943876339


In [47]:
# Access new price history and associated time data
newprice = book_data[5]['data']['COUNT_USED']
newpricetime = book_data[5]['data']['COUNT_USED_time']

# print the first 10 prices
print('%20s   %s' % ('Date', 'Price'))
for i in range(len(newpricetime[10:])):
    print('%20s   $%.2f' % (newpricetime[i], newprice[i]))

                Date   Price
 2015-07-12 13:53:00   $1.00
 2015-07-24 07:32:00   $2.00
 2015-08-12 09:44:00   $-1.00
 2015-09-18 08:14:00   $1.00
 2015-09-30 08:21:00   $3.00
 2015-10-06 10:17:00   $2.00
 2015-11-06 16:38:00   $3.00
 2015-12-16 19:35:00   $2.00
 2015-12-30 12:55:00   $3.00
 2016-01-04 22:23:00   $4.00
 2016-01-07 00:06:00   $3.00
 2016-01-09 22:14:00   $4.00
 2016-01-16 03:17:00   $5.00
 2016-01-20 17:48:00   $6.00
 2016-01-25 01:09:00   $5.00
 2016-01-28 17:53:00   $4.00
 2016-01-29 04:21:00   $3.00
 2016-05-03 17:20:00   $2.00
 2016-05-05 17:43:00   $1.00
 2016-05-19 20:19:00   $2.00
 2016-06-04 00:42:00   $1.00
 2016-06-05 19:10:00   $2.00
 2016-06-15 18:52:00   $3.00
 2016-06-21 09:20:00   $4.00
 2016-07-20 12:08:00   $5.00
 2016-07-27 19:36:00   $4.00
 2016-08-05 20:10:00   $3.00
 2016-08-12 18:24:00   $2.00
 2016-08-17 01:24:00   $1.00
 2016-08-23 12:16:00   $-1.00
 2016-08-27 07:08:00   $1.00
 2016-09-01 02:24:00   $-1.00
 2016-09-20 03:24:00   $1.00
 2016-10-31