<a id="top"></a>
# Sections

[Intro](#intro)  
[Setup](#setup)  
[Load Data](#loaddata)  
 - [Load Sales History](#saleshistory)  
 - [Load Summary Table](#summarytable)  
 - [Merge](#merge)
 
[Exploration](#explore)  
[Export](#export)  
[Feature Engineering](#features)  
[Modeling](#model)


<a id='intro'></a>
# Intro
[Back to top](#top)

The objective is to forecast short-term (up to 7 days out) price behaviour for all of the sneakers in my dataset.  
In this notebook I'll load & clean the data, and run a regression model to make predictions.  
The results of this analysis will be written to a csv file for use in a flask app.

<a id='setup'></a>
# Setup
[Back to top](#top)

In [None]:
# Imports and options
import pandas as pd
import numpy as np
import matplotlib as plt
import pickle
import re
from random import randint
import re

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from seaborn import plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm

% matplotlib inline

# Pandas display options: show up to 40 columns and 200 rows, and render
# floats right-aligned in 20 characters with thousands separators and
# two decimals.
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:20,.2f}'.format)

Define functions for graphs:

In [None]:
def make_hist(df, column, title=None, xlabel='', pct=False, range=None):
    """Draw a 50-bin histogram of ``df[column]`` on a new 10x4 figure.

    When ``pct`` is True the values are multiplied by 100 before plotting.
    ``range`` is forwarded to ``plt.hist``. The title falls back to the
    column name when ``title`` is empty; the y axis is labelled 'Count'.
    """
    scale = 100 if pct == True else 1
    plt.figure(figsize=(10, 4))
    plt.hist(df[column] * scale, bins=50, range=range)
    plt.xlabel(xlabel or '')
    plt.ylabel('Count')
    plt.title(title or column)

In [None]:
def make_box(df, xcolumn, title=None, ylabel='', xlabel='', pct=False):
    """Draw a horizontal seaborn box plot of ``df[xcolumn]`` without outliers.

    When ``pct`` is True the values are multiplied by 100 first. Both the
    title and the x label fall back to the column name when left empty.
    ``ylabel`` is accepted but not used. Returns the axes from seaborn.
    """
    scale = 100 if pct == True else 1
    plt.figure(figsize=(10, 4))
    axes = sns.boxplot(x=df[xcolumn] * scale, showfliers=False, palette="deep")
    axes.set(title=title or xcolumn, xlabel=xlabel or xcolumn)
    return axes

In [None]:
def make_line(df, xcolumn, ycolumn, title=None, ylabel='', xlabel='Date', pct=False):
    """Plot ``df[ycolumn]`` against ``df[xcolumn]`` as a thin line.

    When ``pct`` is True the y values are multiplied by 100. The title
    falls back to ``xcolumn`` when ``title`` is empty, and the x tick
    labels are rotated 45 degrees. Returns the ``plt`` module so callers
    can keep customising the current figure.
    """
    scale = 100 if pct == True else 1
    plt.figure(figsize=(10, 4))
    plt.plot(df[xcolumn], df[ycolumn] * scale, linewidth=1)
    plt.title(title or xcolumn)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel or '')
    plt.xticks(rotation=45)
    return plt

In [None]:
def make_bar(df, xcolumn, ycolumn, title=None, xlabel='', ylabel='', pct=False):
    """Plot the mean of ``ycolumn`` per value of ``xcolumn`` as a bar chart.

    Parameters:
        df: DataFrame holding both columns; it is never modified.
        xcolumn: column to group by (one bar per distinct value).
        ycolumn: numeric column whose group mean sets the bar height.
        title: chart title; falls back to ``xcolumn`` when empty.
        xlabel, ylabel: axis labels; empty by default.
        pct: when true, bar heights are multiplied by 100.
    """
    plt.figure(figsize=(10, 4))
    group_means = df.groupby(xcolumn)[ycolumn].mean()
    if pct:
        # Scale the aggregated means rather than df[ycolumn] itself: writing
        # the scaled values back into the caller's frame would multiply the
        # column by 100 again on every call.
        group_means = group_means * 100
    group_means.plot(kind='bar')
    plt.xlabel(xlabel or '')
    plt.ylabel(ylabel or '')
    plt.title(title or xcolumn)

Define general helper functions:

In [None]:
def drop_columns(df, columns):
    '''Remove each of the named columns from ``df`` in place, one at a time.'''
    for column_name in columns:
        df.drop(labels=column_name, axis=1, inplace=True)
        
def get_percent(part, whole):
    '''Return ``part`` as a percentage of ``whole``, rounded to two decimals.'''
    scaled_part = 100 * float(part)
    return round(scaled_part / float(whole), 2)

def unique_list(l):
    '''Return the items of ``l`` with duplicates removed, keeping first-seen order.

    Membership is checked against the result list (not a set) so that
    unhashable items such as lists are still supported.
    '''
    ulist = []
    for item in l:
        if item not in ulist:
            ulist.append(item)
    return ulist

<a id='loaddata'></a>
# Load Data
[Back to top](#top)

### File Descriptions:
***sales_history.pkl*** - history by shoe by sale   
***shoe_file.json*** - descriptive information by shoe


<a id='saleshistory'></a>
## Load Sales History
[Back to top](#top)

In [None]:
# Open the cleaned sales_history data
# NOTE: unpickling runs arbitrary code from the file; only load pickles
# produced by a trusted source.
with open('data/sales_history.pkl', 'rb') as picklefile:
    sales_history = pickle.load(picklefile)

# Remember the starting row count so later filtering can be compared to it
original_sales_length = len(sales_history)
print(original_sales_length)

In [None]:
sales_history.head(2)

In [None]:
# Cleanup functions
def extract_date_part(df):
    '''Add year, month, day, hour and sale_date columns taken from ``df.index``.

    The index must expose datetime attributes; ``df`` is modified in place.
    '''
    for part in ('year', 'month', 'day', 'hour'):
        df[part] = getattr(df.index, part)
    df['sale_date'] = df.index.date
    
def price_to_float(df, cols):
    '''Convert formatted price strings in the given columns to numbers, in place.

    Every character that is neither a digit nor a decimal point is removed
    (currency symbols, thousands separators, stray text). Whatever is left
    is parsed with ``pd.to_numeric``; values that still cannot be parsed,
    including empty strings, become NaN.

    The decimal point is kept deliberately: stripping it would turn
    "$1,234.50" into 123450 instead of 1234.5.
    '''
    for c in cols:
        df[c] = df[c].replace({r'[^\d.]': ''}, regex=True)
        df[c] = pd.to_numeric(df[c], errors='coerce')
        
def shoe_brand(x):
    """Map a shoe line to its brand.

    Returns "adidas" or "nike" when ``x`` appears in the module-level
    ``adidas_list`` / ``nike_list``; any other value is returned unchanged.
    """
    if x in adidas_list:
        return "adidas"
    if x in nike_list:
        return "nike"
    return x

def clean_sizes(x):
    """Turn a shoe-size string into a number.

    Sizes containing "K" or "Y" are all mapped to 1. Otherwise letters,
    hyphens and slashes are treated as separators and the remaining
    numbers are averaged (so "8-9" becomes 8.5), rounded to two decimals.
    """
    if "K" in x or "Y" in x:
        return 1
    separated = re.sub(r'[a-zA-Z-/]', ' ', x)
    numbers = [float(token) for token in separated.split()]
    return round(sum(numbers) / float(len(numbers)), 2)

def core_fringe(x):
    """Flag a numeric shoe size: 1 for fringe (below 7.5 or above 14), 0 for core."""
    is_fringe = x < 7.5 or x > 14
    return 1 if is_fringe else 0


In [None]:
# Shoe lines (lower-cased first word of the shoe name) grouped by brand.
# shoe_brand() checks membership in adidas_list and nike_list.
adidas_list = ['y-3', 'y3', 'adidas', 'yeezy', 'nmd']
nike_list = ['jordan', 'nike', 'foamposite', 'air', 'kobe', 'lebron', 'kd', 'sf', 'kyrie',
             'lunar', 'cavs', 'pg', 'flyknit']
other_list = ['reebok', 'new', 'asics', 'vans', 'ua', 'puma', 'saucony', 'diadora', 'timberland']

In [None]:
# Copy the datetime index and the shoe name into regular columns
# ('sale_date_time' and 'shoe_name') alongside the originals
sales_history['sale_date_time'] = sales_history.index
sales_history['shoe_name'] = sales_history['name']

In [None]:
# Derive the date-part columns, make sale_price numeric and drop the raw
# date_time column
extract_date_part(sales_history)
price_to_float(sales_history, ['sale_price'])
drop_columns(sales_history, ['date_time'])

# The lower-cased first word of the name is the shoe line; map it to a brand
sales_history['line'] = sales_history['name'].str.split().str[0].str.lower()
sales_history['brand'] = sales_history['line'].apply(shoe_brand)

# Keep the numeric size in size_number, then overwrite shoe_size with the
# core (0) / fringe (1) flag
sales_history['size_number'] = sales_history['shoe_size'].apply(clean_sizes)
sales_history['shoe_size'] = sales_history['size_number'].apply(core_fringe)

In [None]:
sales_history.head(2)

<a id='summarytable'></a>
## Load Summary Table
[Back to top](#top)

In [None]:
summary = pd.read_json('data/shoe_file.json')

In [None]:
# Row count of the summary table before any cleaning
orig_summary_length = len(summary)
print(orig_summary_length)

In [None]:
summary.head(2)

In [None]:
# Cleanup functions for the summary table
def consolidate_colors(x):
    '''Reduce a colorway string to "basic" (only black and/or white) or "other".

    Separators and modifier words are blanked out first; what remains is
    de-duplicated and sorted, and the result is "basic" only when it is
    exactly "black", "white" or "black white". Matching is case-sensitive.
    '''
    noise_terms = ["/","-","core","true","pirate","fierce","turtledove","varsity",
                   "fire","royal","gym","wolf","infared","23","metallic","bone",
                   "dark","university","concord","anthracite","coin","cool","cement","   "]
    for term in noise_terms:
        x = x.replace(term, " ")

    # Splitting on whitespace makes any leftover padding irrelevant
    remaining = ' '.join(sorted(set(x.split())))
    return "basic" if remaining in ("black white", "black", "white") else "other"

In [None]:
# Rename columns
summary.rename(
    columns={'release date':'release_date', 'original retail':'original_retail','style':'style_code'}, inplace=True)

In [None]:
# Replace string values in price columns
price_to_float(summary, ['num_sales', 'highest_bid', 'lowest_ask', 'original_retail'])

# Drop shoes whose release date is "n/a". The explicit copy makes the
# filtered frame independent of the original, so the column assignments
# below are not chained assignments on a slice.
summary = summary[~summary.release_date.str.contains("n/a")].copy()

# Convert release date to datetime format (note the leading space in the format)
summary['release_date'] = pd.to_datetime(summary['release_date'], format=' %m.%d.%y')

# Remove newline from the name column
summary['name'] = summary.name.str.replace("\n", " ")

# Add a simplified color column for use in later modeling
summary['main_color'] = summary['colorway'].apply(consolidate_colors)

# Drop any duplicates across the whole dataframe
summary = summary.drop_duplicates()

In [None]:
# Sorted, ASCII-only list of distinct shoe names, pickled for use in the
# instagram scraper
unique_names = sorted(
    shoe_name.encode('ascii', 'ignore') for shoe_name in summary.name.unique()
)

with open('data/shoe_name_list.pkl', 'wb') as picklefile:
    pickle.dump(unique_names, picklefile)

In [None]:
sales_history.head(2)

<a id='merge'></a>
## Merge
[Back to top](#top)

In [None]:
# Inner-join the sales with the per-shoe summary on the shoe name, then
# order the merged rows by shoe name
sales_history = (
    sales_history
    .merge(summary, how='inner', on='name', sort=False)
    .sort_values(['name'])
)

In [None]:
sales_history.head(2)

In [None]:
# Columns that are removed from the merged table
drop_list = [
    'lowest_ask',
    'num_asks',
    'num_bids',
    'num_sales',
    'highest_bid',
]
drop_columns(sales_history, drop_list)

In [None]:
sales_history.head()

<a id='explore'></a>
# Exploration
[Back to top](#top)

In [None]:
make_bar(sales_history, 'month', 'sale_price', title="Avg. Sale Price by Month", 
         xlabel="Month", ylabel="Avg. Sale Price")

In [None]:
make_bar(sales_history, 'size_number', 'sale_price')

In [None]:
make_hist(sales_history, 'sale_price', range=[0,2000], title='Count of Sale Price')

<a id='export'></a>
# Export
[Back to top](#top)

Save the merged, cleaned sales history as a pickle file for the web app to load.

In [None]:
# Write the current sales_history dataframe to data/final_sales_history.pkl
with open('data/final_sales_history.pkl', 'wb') as picklefile:
    pickle.dump(sales_history, picklefile)

<a id='features'></a>
# Feature Engineering
[Back to top](#top)

In [None]:
## Add a num_sales (frequency) column
sales_history['num_sales'] = sales_history.groupby('shoe_name')['shoe_name'].transform('count')

In [None]:
## Keep only shoes with more than 50 recorded sales, printing the row
## count before and after the filter
print(len(sales_history))
sales_history = sales_history.loc[sales_history['num_sales'] > 50]
print(len(sales_history))

In [None]:
# One row per shoe per day: the mean sale price and the number of sales
shoes = (
    sales_history
    .groupby(['name', 'sale_date'])['sale_price']
    .agg(['mean', 'count'])
    .reset_index()
)
shoes.columns = ['name', 'sale_date', 'sale_price', 'volume']

sales_history = sales_history.reset_index()

# Per-shoe descriptive columns, de-duplicated, joined back onto the daily rows
info_columns = ['name', 'main_color', 'line', 'brand', 'style_code', 'image_url',
                'release_date', 'original_retail', 'colorway']
more_info = sales_history[info_columns].drop_duplicates()

shoes = shoes.merge(more_info, on='name')

In [None]:
# Take the log of the sale price
shoes['log_sale_price'] = np.log(shoes.sale_price)

In [None]:
shoes.sample(5)

# Time Features

In [None]:
## Overwrite sales_history - no more original sales_history dataframe anymore
## (from here on it is a copy of the daily `shoes` table)
sales_history = shoes.copy()
sales_history['shoe_name'] = sales_history['name']
sales_history['sale_date'] = pd.to_datetime(sales_history['sale_date'])
# Timedelta between the sale date and the shoe's release date
sales_history['time_since_release'] = sales_history['sale_date'] - sales_history['release_date']

# change the time delta into a count of days since release date & log it
# NOTE(review): np.log yields -inf or NaN when the day count is zero or
# negative; infinities are replaced and dropped later, before modeling.
sales_history['total_days_td'] = sales_history['time_since_release'].dt.total_seconds() / (24 * 60 * 60)
sales_history['log_total_days_td'] = np.log(sales_history.total_days_td)

In [None]:
# Lagged prices (1 to 4 rows back within each shoe) and the relative change
# between consecutive lags: pct_vs_last compares the current price with
# lag 1, pct_vs_last_k compares lag k-1 with lag k.
for lag in range(1, 5):
    lag_col = 'sale_lagged_%d' % lag
    newer_col = 'sale_price' if lag == 1 else 'sale_lagged_%d' % (lag - 1)
    pct_col = 'pct_vs_last' if lag == 1 else 'pct_vs_last_%d' % lag
    sales_history[lag_col] = sales_history.groupby(['shoe_name']).sale_price.shift(lag)
    sales_history[pct_col] = sales_history[newer_col] / sales_history[lag_col] - 1

### Note: computing rolling average requires unique index of name + datetime

In [None]:
sales_history = sales_history.set_index(['name', 'sale_date'])

In [None]:
# Rolling average of the four previous prices, computed separately per shoe.
# The shift happens inside each group: shifting the concatenated result
# instead would hand the first row of every shoe the last rolling average
# of the preceding shoe.
sales_history['rolling_avg_4'] = (
    sales_history
    .groupby(level=0)['sale_price']
    .transform(lambda prices: prices.rolling(4).mean().shift(1))
)
# sales_history['rolling_avg_20'] = sales_history.groupby(level=0)['sale_price'].transform(lambda prices: prices.rolling(20).mean().shift(1))

# Insert an intercept
sales_history['intercept'] = 1

In [None]:
# Check that the lagged sales are calculated properly
sales_history.head(100)

In [None]:
# Then drop the NaNs.
sales_history = sales_history.dropna()

In [None]:
# Replace the first three lagged prices with their natural logs
# (sale_lagged_4 is left on the original scale)
for lagged_col in ('sale_lagged_1', 'sale_lagged_2', 'sale_lagged_3'):
    sales_history[lagged_col] = np.log(sales_history[lagged_col])

In [None]:
sales_history.head(2)

In [None]:
column_names = list(sales_history.columns.values)
len(column_names)

<a id='model'></a>
# Model for all shoes
[Back to top](#top)

In [None]:
# Flatten the index back into columns, drop incomplete rows and order the
# observations chronologically
all_shoes = (
    sales_history
    .reset_index()
    .dropna()
    .sort_values(['sale_date'])
)

In [None]:
# save the min and max of rolling average for the forecast function later
# (captured here, before rolling_avg_4 is rescaled, so new rows can be
# normalized with the same bounds)
rolling_avg_4_min = all_shoes.rolling_avg_4.min()
rolling_avg_4_max = all_shoes.rolling_avg_4.max()
# rolling_avg_20_min = all_shoes.rolling_avg_20.min()
# rolling_avg_20_max = all_shoes.rolling_avg_20.max()

In [None]:
# Min-max normalize rolling_avg_4 so it lies between 0 and 1
# NOTE(review): the bounds come from all rows, including those later used
# as the test set.
all_shoes['rolling_avg_4'] = (all_shoes['rolling_avg_4'] - all_shoes.rolling_avg_4.min()) / (all_shoes.rolling_avg_4.max() - all_shoes.rolling_avg_4.min())
# all_shoes['rolling_avg_20'] = (all_shoes['rolling_avg_20'] - all_shoes.rolling_avg_20.min()) / (all_shoes.rolling_avg_20.max() - all_shoes.rolling_avg_20.min())

# all_shoes['rolling_avg_4'] = np.log(all_shoes.rolling_avg_4)
# all_shoes['rolling_avg_20'] = np.log(all_shoes.rolling_avg_20)
all_shoes.head(3)

In [None]:
all_shoes.loc[all_shoes.name == 'Adidas NMD R1 Black Red', ['sale_price', 'rolling_avg_4']].plot();

In [None]:
# Get the dummy variables for the non-numerical features. Each source
# column is replaced by indicator columns named "<column>_<value>".
for col in ['main_color', 'brand', 'line']:
    dummies = pd.get_dummies(all_shoes[col], prefix=col)
    all_shoes = all_shoes.join(dummies)
    # axis is passed by keyword: the positional form is not accepted by
    # current pandas releases
    all_shoes = all_shoes.drop(col, axis=1)

In [None]:
# drop the nas again?
all_shoes = all_shoes.dropna()
all_shoes.head()

In [None]:
# Treat infinite values as missing and drop those rows
all_shoes = all_shoes.replace([np.inf, -np.inf], np.nan)
all_shoes = all_shoes.dropna()
# Renumber the rows 0..n-1; the previous index is kept as an 'index' column
all_shoes = all_shoes.reset_index()


# Desired outcome variable
y = all_shoes.log_sale_price

# Full set of features - no shoe size anymore
X = all_shoes[['intercept', 'rolling_avg_4','pct_vs_last_2','pct_vs_last_3','log_total_days_td',
              'main_color_basic', 'original_retail','brand_nike','line_foamposite','line_air']]

In [None]:
X.head()

In [None]:
# Set test train split: the first three quarters of the rows (already in
# sale_date order) train the model, the remainder is held out.
cutpoint = (len(X) // 4) * 3

# Positional slicing keeps the two sets disjoint; a label-based slice up to
# cutpoint is inclusive and would place row `cutpoint` in both sets.
X_train = X.iloc[:cutpoint, :]
X_test = X.iloc[cutpoint:, :]

y_train = y.iloc[:cutpoint]
y_test = y.iloc[cutpoint:]

# Ordinary least squares on the training rows (X already has an intercept column)
model = sm.OLS(y_train, X_train)
est = model.fit()
y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)
r_2 = est.rsquared
est.summary()

In [None]:
# Means of the train and test targets, used as the baseline predictor
mean_y_train = np.mean(y_train)
mean_y_test = np.mean(y_test)

# print r_squared (train, then test): 1 - MSE(model) / MSE(predicting the mean)
for actual, predicted, baseline in [(y_train, y_train_pred, mean_y_train),
                                    (y_test, y_test_pred, mean_y_test)]:
    model_mse = mean_squared_error(actual, predicted)
    baseline_mse = mean_squared_error([baseline] * len(actual), actual)
    print(1 - (model_mse / baseline_mse))

# Tree Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 1000 trees with depth capped at 5, fitted on the training rows
# NOTE(review): no random_state is set, so results can differ between runs.
tree = RandomForestRegressor(n_estimators=1000, max_depth=5)

tree_model = tree.fit(X_train, y_train)

In [None]:
train_tree_pred = tree_model.predict(X_train)
test_tree_pred = tree_model.predict(X_test)
all_preds = tree_model.predict(X)

In [None]:
# print r_squared (train, then test) for the forest:
# 1 - MSE(model) / MSE(predicting the mean)
for actual, predicted, baseline in [(y_train, train_tree_pred, mean_y_train),
                                    (y_test, test_tree_pred, mean_y_test)]:
    model_mse = mean_squared_error(actual, predicted)
    baseline_mse = mean_squared_error([baseline] * len(actual), actual)
    print(1 - (model_mse / baseline_mse))

In [None]:
tree_model.feature_importances_

In [None]:
plt.scatter(y_test, test_tree_pred);

In [None]:
# import forestci as fci
# http://contrib.scikit-learn.org/forest-confidence-interval/auto_examples/plot_mpg.html#sphx-glr-auto-examples-plot-mpg-py
# calculate inbag and unbiased variance
# inbag = fci.calc_inbag(X_train.shape[0], tree)
# unbiased = fci.random_forest_error(tree, inbag, X_train, X_test)

# Evaluate test set results

In [None]:
# Get predicted and actual Y values from test data set
# Actual (log) price, renumbered from 0
y_test2 = y_test.reset_index(drop = True)
print(y_test2.tail(1))
# Predicted (log) price
yhat = pd.Series(test_tree_pred)
print(yhat.tail(1))
# Shoe and date info for exactly the test rows. X was sliced from all_shoes,
# so X_test's index labels identify the same rows; the date column in
# all_shoes is called 'sale_date'.
names_dates = all_shoes.loc[X_test.index, ['name', 'sale_date']].reset_index(drop = True)
print(names_dates.tail(1))

In [None]:
# How well does the model predict if the price will go up or down?
testing = pd.concat([names_dates, y_test2, yhat], axis = 1)
testing.columns = ['name', 'date', 'y', 'yhat']
# Back from log prices to prices
testing.y = np.exp(testing.y)
testing.yhat = np.exp(testing.yhat)
# Order the rows by shoe, then date
testing = testing.set_index(['name', 'date'])
testing = testing.sort_index()
testing = testing.reset_index()
# Lagged y and yhat
testing['y_last'] = testing.groupby(['name']).y.shift(1)
testing['yhat_last'] = testing.groupby(['name']).yhat.shift(1)
# Compare both y and yhat to previous y value
testing['y_change'] = testing.y - testing.y_last
testing['yhat_change'] = testing.yhat - testing.y_last ## less accurate if you use yhat last
# Flags for increase in y
testing['y_up'] = 0
testing.loc[testing.y_change >= 0, 'y_up'] = 1
testing['yhat_up'] = 0
testing.loc[testing.yhat_change >= 0, 'yhat_up'] = 1
# Rows without a previous price (the first row of each shoe) drop out here
testing = testing.dropna()
# Summary of results
testsum = testing.groupby(['y_up', 'yhat_up']).size()
testsum = pd.DataFrame(testsum).reset_index()
testsum.columns = ['y_up', 'yhat_up', 'n']
# Count the agreeing rows directly instead of reading fixed positions of
# testsum, which only works when all four (y_up, yhat_up) combinations occur.
pct_num = int((testing.y_up == testing.yhat_up).sum())
pct_denom = len(testing)
pct_correct = round(float(pct_num) / float(pct_denom), 3) if pct_denom else float('nan')
print(testsum)
print(pct_num)
print(pct_denom)
print(pct_correct)
testing.head(15)

In [None]:
testing.describe()

# Forecast

In [None]:
# Predictions for the flask app
# Shoe list for forecast
shoe_list = list(all_shoes['name'].unique())
shoe_n = len(shoe_list)

# have a random shoe to check whenever i want to check a shoe
# (randint includes both endpoints, so the upper bound is the last valid index)
chosen_shoe = shoe_list[randint(0, shoe_n - 1)]

# Data set for forecast - all the x variables (in the model's column order)
# but also name, sale price, pct vs last.
# The line dummies are built from lower-cased line names, so they are
# selected as line_foamposite / line_air and then exposed under the
# capitalized names this forecast frame uses.
forecast = all_shoes[['name', 'sale_price', 'pct_vs_last', 'intercept', 'rolling_avg_4','pct_vs_last_2','pct_vs_last_3','log_total_days_td',
              'main_color_basic', 'original_retail','brand_nike','line_foamposite','line_air']]
forecast = forecast.rename(columns={'line_foamposite': 'line_Foamposite', 'line_air': 'line_Air'})

# Choose a shoe for one run
x = forecast[forecast.name == chosen_shoe].reset_index(drop = True)
x.tail(1)

In [None]:
def make_new_row(df, mod):
    """Forecast the next sale price for one shoe and append it to ``df``.

    Parameters:
        df: per-shoe frame with a 0..n-1 index whose columns are 'name',
            'sale_price', 'pct_vs_last' plus the model inputs, the latter
            in the column order the model was fitted on.
        mod: fitted model exposing ``predict``; it is expected to return
            log prices.

    Returns:
        (pred_value, last_sale_price, df): the predicted price, the last
        price already in ``df`` before the call, and ``df`` itself with the
        forecast appended as a new last row (``df`` is modified in place).

    The rolling average is normalized with the module-level
    ``rolling_avg_4_min`` / ``rolling_avg_4_max``.
    """
    last_row = len(df) - 1
    new_row_num = len(df)
    last_sale_price = df.sale_price[last_row]

    # Every column other than these three is a model input. Taking the
    # names from the frame avoids hard-coding the dummy column names.
    meta_cols = ['name', 'sale_price', 'pct_vs_last']
    feature_cols = [c for c in df.columns if c not in meta_cols]

    # Start from the last observation so per-shoe constants (color, retail
    # price, brand and line flags) carry over unchanged.
    features = df.loc[last_row, feature_cols].copy()
    features['intercept'] = 1

    # Mean of the (up to) four most recent prices, scaled like the training data
    recent_mean = df.sale_price.iloc[-4:].mean()
    features['rolling_avg_4'] = (recent_mean - rolling_avg_4_min) / (rolling_avg_4_max - rolling_avg_4_min)

    # The percentage changes move one step further into the past
    features['pct_vs_last_2'] = df.pct_vs_last[last_row]
    features['pct_vs_last_3'] = df.pct_vs_last_2[last_row]

    # One more day since release: undo the log, add a day, log again
    features['log_total_days_td'] = np.log(np.exp(df.log_total_days_td[last_row]) + 1)

    new_row = [features[c] for c in feature_cols]
    new_array = np.asarray(new_row, dtype=float).reshape(1, -1)

    # Predict with the model that was passed in and convert back from logs
    log_pred_value = mod.predict(new_array)
    pred_value = float(np.exp(log_pred_value)[0])

    # The forecast becomes the sale price of the new row
    sale_price = pred_value
    pct_vs_last = sale_price / last_sale_price - 1

    new_values = dict(zip(feature_cols, new_row))
    new_values.update({'name': df['name'][last_row],
                       'sale_price': sale_price,
                       'pct_vs_last': pct_vs_last})
    df.loc[new_row_num] = [new_values[c] for c in df.columns]

    return pred_value, last_sale_price, df

# run it for the one shoe
pred_value, last_sale_price, newdf = make_new_row(x, tree_model)

In [None]:
## Forecast the next seven sales for every shoe
pred_results = []
for shoe in shoe_list:
    # rows for this shoe only, renumbered from 0
    xdf = forecast[forecast.name == shoe].reset_index(drop = True)
    # each step appends its own forecast to xdf, so later steps build on it
    for n_label in range(1, 8):
        pred_value, last_sale_price, xdf = make_new_row(xdf, tree_model)
        pred_results.append([shoe, n_label, pred_value, last_sale_price])
pred_results = pd.DataFrame(pred_results)
pred_results.columns = ['name', 'n', 'predicted', 'last_sale_price']

In [None]:
pred_results.head(20)

In [None]:
# Plot the forecast path of one randomly chosen shoe.
# randint includes both endpoints, so the upper bound is the last valid index.
chosen_shoe = shoe_list[randint(0, shoe_n - 1)]
pred_results.loc[pred_results.name == chosen_shoe, ['predicted']].plot()

In [None]:
# One row per shoe: the mean of its forecasts (rounded to cents) next to
# the last observed sale price, which is carried by the n == 1 row
pred2 = pred_results.copy()
pred2['avg_pred'] = pred2.groupby('name')['predicted'].transform('mean')
pred2 = (
    pred2
    .loc[pred2.n == 1, ['name', 'avg_pred', 'last_sale_price']]
    .rename(columns={'avg_pred': 'predicted'})
)
pred2['predicted'] = pred2['predicted'].apply(lambda value: round(value, 2))
pred2.head()

In [None]:
# Label each shoe 'up' when the mean forecast exceeds the last sale price
# and 'down' otherwise, then write the table to csv for the flask app
pred2['change'] = pred2['predicted'] - pred2['last_sale_price']
pred2['trend'] = np.where(pred2['change'] > 0, 'up', 'down')
pred2.to_csv('shoe_forecast.csv', index = False)
pred2.head()

In [None]:
pred2.describe()

In [None]:
pred2.trend.value_counts()