# _*Time Series Forecasting with Neural Network*_
## Bitcoin Price Prediction Model

Setup the notebook.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

## Imports

In [2]:
from IPython.display import HTML

from fastai.structured import *
from fastai.column_data import *

np.set_printoptions(threshold=50, edgeitems=20)

# The Data 

## Tuning Parameters


In [3]:
# Minimum absolute return that counts as an UP or DOWN move; smaller moves are HOLD.
# movement_threshold = 1e-5 # this is equal to 10 cents when BTC = $10,000
movement_threshold = 1e-4 # this is equal to $1.00 when BTC = $10,000
# NOTE(review): the dollar figures above assume returns are stored as fractions.
# The sample rows show close_price_returns in percent (300 -> 370 is 23.33), in
# which case 1e-4 is about $0.01 at BTC = $10,000 - confirm the units.

## Import
This is the output of the data generation notebook.

In [4]:
PATH='data/btc/'
fname = 'btc-out.csv'

In [5]:
data = pd.read_csv(f'{PATH}{fname}', low_memory=False, sep = '\t') 
data.head()

Unnamed: 0,DateTime_UTC,price_open,price_high,price_low,price_close,volume,close_price_returns,close_price_returns_bins,close_price_returns_labels
0,2014-12-01 00:30:00,300.0,300.0,300.0,300.0,0.01,0.0,"(-0.00282, 0.0]",4
1,2014-12-01 00:35:00,300.0,300.0,300.0,300.0,0.0,0.0,"(-0.00282, 0.0]",4
2,2014-12-01 00:40:00,300.0,300.0,300.0,300.0,0.01,0.0,"(-0.00282, 0.0]",4
3,2014-12-01 00:45:00,370.0,370.0,370.0,370.0,0.0,23.333333,"(0.181, 67.889]",9
4,2014-12-01 00:50:00,370.0,370.0,370.0,370.0,0.0,0.0,"(-0.00282, 0.0]",4


The following returns summarized aggregate information for each field of the table.

In [6]:
display(DataFrameSummary(data).summary())

Unnamed: 0,DateTime_UTC,price_open,price_high,price_low,price_close,volume,close_price_returns,close_price_returns_bins,close_price_returns_labels
count,,343229,343229,343229,343229,343229,343229,,343229
mean,,2097.8,2101.75,2093.56,2097.92,35.6203,0.0024071,,4.4679
std,,3553.47,3561.74,3544.46,3553.45,68.316,0.534662,,2.87211
min,,0.06,150,0.06,111.89,0,-42.0151,,0
25%,,319.94,320.48,319.42,320.2,7.54449,-0.042785,,2
50%,,601.83,602.37,601.49,602.13,17.8366,0,,4
75%,,1762.94,1764.95,1760.35,1762.87,39.2645,0.0527797,,7
max,,19892,19892,19891,19892,2682.82,67.8887,,9
counts,343229,343229,343229,343229,343229,343229,343229,343229,343229
uniques,343229,132410,120068,128032,131577,321649,291139,10,10


## Data Cleaning / Feature Engineering

As a structured data problem, we necessarily have to go through all the cleaning and feature engineering, even though we're using a neural network.

#### TODO
Do the train test split later....

In [7]:
# n = data.shape[0]
# splits = [[90, 10],
#           [80, 20],
#           [60, 40],
#           [50, 50]]
# train = 

### Up/Down

I will add one boolean column for each direction: UP, DOWN and HOLD.

In [8]:
# Direction flags derived from the per-period return:
#   UP   - return strictly greater than +movement_threshold
#   DOWN - return strictly less than -movement_threshold
#   HOLD - True where UP and DOWN agree (for a positive threshold: neither fired)
data['UP'] = data['close_price_returns'].gt(movement_threshold)
data['DOWN'] = data['close_price_returns'].lt(-movement_threshold)
data['HOLD'] = data['UP'].eq(data['DOWN'])

In [9]:
data.head(5)

Unnamed: 0,DateTime_UTC,price_open,price_high,price_low,price_close,volume,close_price_returns,close_price_returns_bins,close_price_returns_labels,UP,DOWN,HOLD
0,2014-12-01 00:30:00,300.0,300.0,300.0,300.0,0.01,0.0,"(-0.00282, 0.0]",4,False,False,True
1,2014-12-01 00:35:00,300.0,300.0,300.0,300.0,0.0,0.0,"(-0.00282, 0.0]",4,False,False,True
2,2014-12-01 00:40:00,300.0,300.0,300.0,300.0,0.01,0.0,"(-0.00282, 0.0]",4,False,False,True
3,2014-12-01 00:45:00,370.0,370.0,370.0,370.0,0.0,23.333333,"(0.181, 67.889]",9,True,False,False
4,2014-12-01 00:50:00,370.0,370.0,370.0,370.0,0.0,0.0,"(-0.00282, 0.0]",4,False,False,True


### Datetime

The following extracts particular date fields from a complete datetime for the purpose of constructing categoricals.

You should *always* consider this feature extraction step when working with date-time. Without expanding your date-time into these additional fields, you can't capture any trend/cyclical behavior as a function of time at any of these granularities. We'll add to every table with a date field.

`add_datepart` is from the structured fastai library

In [10]:
add_datepart(data, "DateTime_UTC", drop=False)
# ?add_datepart

In [30]:
# Preview only the first rows instead of rendering the whole frame; the
# date-part columns added by add_datepart appear on the right-hand side.
data.head(10)

Unnamed: 0,DateTime_UTC,price_open,price_high,price_low,price_close,volume,close_price_returns,close_price_returns_bins,close_price_returns_labels,UP,...,DateTime_UTCDay,DateTime_UTCDayofweek,DateTime_UTCDayofyear,DateTime_UTCIs_month_end,DateTime_UTCIs_month_start,DateTime_UTCIs_quarter_end,DateTime_UTCIs_quarter_start,DateTime_UTCIs_year_end,DateTime_UTCIs_year_start,DateTime_UTCElapsed
0,2014-12-01 00:30:00,300.00,300.00,300.00,300.00,0.010000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417393800
1,2014-12-01 00:35:00,300.00,300.00,300.00,300.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417394100
2,2014-12-01 00:40:00,300.00,300.00,300.00,300.00,0.010000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417394400
3,2014-12-01 00:45:00,370.00,370.00,370.00,370.00,0.000000,23.333333,"(0.181, 67.889]",9,True,...,1,0,335,False,True,False,False,False,False,1417394700
4,2014-12-01 00:50:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417395000
5,2014-12-01 00:55:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417395300
6,2014-12-01 01:00:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417395600
7,2014-12-01 01:05:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417395900
8,2014-12-01 01:10:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417396200
9,2014-12-01 01:15:00,370.00,370.00,370.00,370.00,0.000000,0.000000,"(-0.00282, 0.0]",4,False,...,1,0,335,False,True,False,False,False,False,1417396500


## Durations

It is common when working with time series data to extract data that explains relationships across rows as opposed to columns, e.g.:
* Running averages
* Time until next event
* Time since last event

This is often difficult to do with most table manipulation frameworks, since they are designed to work with relationships across columns. As such, we've created a class to handle this type of data.

We'll define two functions, `get_elapsed_since_updown` and `get_elapsed_since_percent`, for cumulative counting across a sorted dataframe. Given the returns field `fld` to monitor, they track the number of rows since the last qualifying event (an up or down move, or a move larger than a given size). When the event is seen again, the counter is reset to zero.

Until an event is first encountered, the counter simply counts rows from the start of the data. We'll see how to use this shortly.

##### Note
This could be used with quantiles, when was the last time there was a rise/fall over 10%?

#### Special Functions

In [12]:
# test = data.iloc[1,0]
# test2 = data.iloc[7,0]
# test2.minute #- test.minute


In [13]:
# count number of time periods since the last up movement
def get_elapsed_since_updown(fld):
    """Add ``SinceUP`` and ``SinceDown`` row counters to the global ``df``.

    Walks ``df[fld]`` from top to bottom. ``SinceUP`` is 0 on every row whose
    value is strictly positive and otherwise one more than on the previous
    row; ``SinceDown`` does the same for strictly negative values. Before the
    first qualifying row the counters count rows from the top, starting at 1.

    Parameters
    ----------
    fld : str
        Name of the column in the global ``df`` to scan.

    Returns
    -------
    None
        The two columns are written onto the module-level ``df`` in place.
    """
    global df

    rows_since_up = []
    rows_since_down = []
    up_gap = down_gap = 0

    for change in df[fld].values:
        # A positive value restarts the "up" clock and a negative value the
        # "down" clock; zero (or NaN) restarts neither.
        up_gap = 0 if change > 0 else up_gap + 1
        down_gap = 0 if change < 0 else down_gap + 1
        rows_since_up.append(up_gap)
        rows_since_down.append(down_gap)

    df['SinceUP'] = rows_since_up
    df['SinceDown'] = rows_since_down
    

In [14]:
def get_elapsed_since_percent(fld):
    """Add "rows since the last large move" counters to the global ``df``.

    For each magnitude level (0.1, 0.25, 0.5, 0.75, 1, 2, 3, 4 and 5, in the
    units of the ``fld`` column) three counters are tracked row by row:

    * ``Since_<level>_up``   - rows since a value strictly greater than +level
    * ``Since_<level>_down`` - rows since a value strictly less than -level
    * ``Since_<level>``      - rows since a move of that size in either
      direction (the smaller of the two counters above)

    A counter is 0 on the row where the move happens and grows by one on every
    following row. Before the first qualifying move it counts rows from the top
    of the frame, starting at 1. Levels appear in column names as ``01``,
    ``025``, ``05``, ``075``, ``1`` ... ``5``.

    Parameters
    ----------
    fld : str
        Name of the column in the global ``df`` holding the returns to scan.

    Returns
    -------
    None
        The module-level ``df`` is rebound to a copy carrying the 27 new
        columns, appended in the order: either-direction, ``_down``, ``_up``.
    """
    global df

    # (column-name suffix, magnitude) pairs; their order fixes the column order.
    levels = (('01', 0.1), ('025', 0.25), ('05', 0.5), ('075', 0.75),
              ('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5))

    # Running counters, one per level and direction.
    since_up = {name: 0 for name, _ in levels}
    since_down = {name: 0 for name, _ in levels}

    # Per-row history of each counter.
    res_any = {name: [] for name, _ in levels}
    res_down = {name: [] for name, _ in levels}
    res_up = {name: [] for name, _ in levels}

    for r in df[fld].values:
        for name, level in levels:
            # A move beyond a level resets that level's counter; -1 becomes 0
            # after the unconditional increment below. NaN resets nothing.
            if r > level:
                since_up[name] = -1
            elif r < -level:
                since_down[name] = -1

            since_up[name] += 1
            since_down[name] += 1

            res_up[name].append(since_up[name])
            res_down[name].append(since_down[name])
            # The either-direction counter is whichever side moved most recently.
            res_any[name].append(min(since_up[name], since_down[name]))

    # Build the result on a copy so a frame sliced from another one is never
    # written to in place, then rebind the global.
    out = df.copy()
    for suffix, results in (('', res_any), ('_down', res_down), ('_up', res_up)):
        for name, _ in levels:
            out['Since_' + name + suffix] = results[name]
    df = out

#### Apply these special functions

I'll apply these extra columns.

In [18]:
# Column whose values drive every "rows since ..." counter below.
fld = 'close_price_returns'
# Narrow working frame: the timestamp (used later as the merge key) and the returns.
df = data[['DateTime_UTC', fld]]

# Both helpers read and write the module-level `df` rather than taking a frame
# argument. get_elapsed_since_percent rebinds `df` to a new frame carrying the
# Since_* columns; get_elapsed_since_updown then writes SinceUP / SinceDown
# onto that frame, so the call order matters.
get_elapsed_since_percent(fld)
get_elapsed_since_updown(fld)

df.head(20)

Unnamed: 0,DateTime_UTC,close_price_returns,Since_01,Since_025,Since_05,Since_075,Since_1,Since_2,Since_3,Since_4,...,Since_025_up,Since_05_up,Since_075_up,Since_1_up,Since_2_up,Since_3_up,Since_4_up,Since_5_up,SinceUP,SinceDown
0,2014-12-01 00:30:00,0.0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2014-12-01 00:35:00,0.0,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
2,2014-12-01 00:40:00,0.0,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
3,2014-12-01 00:45:00,23.333333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,2014-12-01 00:50:00,0.0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,5
5,2014-12-01 00:55:00,0.0,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,6
6,2014-12-01 01:00:00,0.0,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,7
7,2014-12-01 01:05:00,0.0,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,8
8,2014-12-01 01:10:00,0.0,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,9
9,2014-12-01 01:15:00,0.0,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,10


We're going to set the active index to Date.

In [19]:
# df = df.set_index("DateTime_UTC")
# df = df.remove_in

In [20]:
list(data)

['DateTime_UTC',
 'price_open',
 'price_high',
 'price_low',
 'price_close',
 'volume',
 'close_price_returns',
 'close_price_returns_bins',
 'close_price_returns_labels',
 'UP',
 'DOWN',
 'HOLD',
 'DateTime_UTCYear',
 'DateTime_UTCMonth',
 'DateTime_UTCWeek',
 'DateTime_UTCDay',
 'DateTime_UTCDayofweek',
 'DateTime_UTCDayofyear',
 'DateTime_UTCIs_month_end',
 'DateTime_UTCIs_month_start',
 'DateTime_UTCIs_quarter_end',
 'DateTime_UTCIs_quarter_start',
 'DateTime_UTCIs_year_end',
 'DateTime_UTCIs_year_start',
 'DateTime_UTCElapsed']

In [21]:
list(df)

['DateTime_UTC',
 'close_price_returns',
 'Since_01',
 'Since_025',
 'Since_05',
 'Since_075',
 'Since_1',
 'Since_2',
 'Since_3',
 'Since_4',
 'Since_5',
 'Since_01_down',
 'Since_025_down',
 'Since_05_down',
 'Since_075_down',
 'Since_1_down',
 'Since_2_down',
 'Since_3_down',
 'Since_4_down',
 'Since_5_down',
 'Since_01_up',
 'Since_025_up',
 'Since_05_up',
 'Since_075_up',
 'Since_1_up',
 'Since_2_up',
 'Since_3_up',
 'Since_4_up',
 'Since_5_up',
 'SinceUP',
 'SinceDown']

### Rolling Quantiles
I don't think I have any variable this will work for

In [22]:
# bwd = df[['Store']+columns].sort_index().groupby("Store").rolling(7, min_periods=1).sum()
# 
# fwd = df[['Store']+columns].sort_index(ascending=False
#                                       ).groupby("Store").rolling(7, min_periods=1).sum()

### Backup Tables

It's usually a good idea to back up large tables of extracted / wrangled features before you join them onto another one, that way you can go back to it easily if you need to make changes to it.

In [25]:
# The feature frame was derived row-for-row from `data`, so the two must have
# the same length before the features are joined back on.
assert len(df) == len(data), f"row count mismatch: {len(df)} feature rows vs {len(data)} data rows"

# Both frames carry `close_price_returns`, so the merge emits it twice:
# `close_price_returns_x` (from `data`) and `close_price_returns_y` (from `df`).
# NOTE: re-running this cell merges the already-merged frame again; rebuild
# `df` from the feature cells first.
df = data.merge(df, on = 'DateTime_UTC')

# A duplicated DateTime_UTC key would multiply rows in the join.
assert len(df) == len(data), "merge changed the row count - check for duplicate DateTime_UTC keys"

##### Saving Table

In [26]:
df.to_feather(f'{PATH}df')

##### Loading Table

In [27]:
df = pd.read_feather(f'{PATH}df')

In [28]:
df["DateTime_UTC"] = pd.to_datetime(df.DateTime_UTC)

In [29]:
df.columns

Index(['DateTime_UTC', 'price_open', 'price_high', 'price_low', 'price_close',
       'volume', 'close_price_returns_x', 'close_price_returns_bins',
       'close_price_returns_labels', 'UP', 'DOWN', 'HOLD', 'DateTime_UTCYear',
       'DateTime_UTCMonth', 'DateTime_UTCWeek', 'DateTime_UTCDay',
       'DateTime_UTCDayofweek', 'DateTime_UTCDayofyear',
       'DateTime_UTCIs_month_end', 'DateTime_UTCIs_month_start',
       'DateTime_UTCIs_quarter_end', 'DateTime_UTCIs_quarter_start',
       'DateTime_UTCIs_year_end', 'DateTime_UTCIs_year_start',
       'DateTime_UTCElapsed', 'close_price_returns_y', 'Since_01', 'Since_025',
       'Since_05', 'Since_075', 'Since_1', 'Since_2', 'Since_3', 'Since_4',
       'Since_5', 'Since_01_down', 'Since_025_down', 'Since_05_down',
       'Since_075_down', 'Since_1_down', 'Since_2_down', 'Since_3_down',
       'Since_4_down', 'Since_5_down', 'Since_01_up', 'Since_025_up',
       'Since_05_up', 'Since_075_up', 'Since_1_up', 'Since_2_up', 'Since_3_up',
  

## Create features

### TODO Save Model
Save this to csv and edit it in R to do more feature engineering. time permitting.

In [38]:
df.to_csv(f"{PATH}df-out.csv", sep = "\t")

#### More feature engineering, maybe
Rolling quantiles for:
  - sum(volume)
  - sum(volume) / max(last7 volumes)
  - return labels
  - sequential down/up
  - return (like a moving average)

Might also be good to have number of ticks per time period (number of trades happening, not just overall volume). Need to go back to the data generator for that one.

Also want to get high and low incorporated in there somehow...
  - high's % over close
  - low's % under close
  
Also need more time features:
  - Hour, Minute, and Second

Redo volume:
  - need buy volume and sell volume as separate things. Need to go back to tick data.

# STOPPED HERE
this is continued in the part2 notebook

# STOPPED HERE

In [31]:
joined = df

In [33]:
len(joined.columns)

55

In [34]:
joined.head().T

Unnamed: 0,0,1,2,3,4
DateTime_UTC,2014-12-01 00:30:00,2014-12-01 00:35:00,2014-12-01 00:40:00,2014-12-01 00:45:00,2014-12-01 00:50:00
price_open,300,300,300,370,370
price_high,300,300,300,370,370
price_low,300,300,300,370,370
price_close,300,300,300,370,370
volume,0.01,0,0.01,0,0
close_price_returns_x,0,0,0,23.3333,0
close_price_returns_bins,"(-0.00282, 0.0]","(-0.00282, 0.0]","(-0.00282, 0.0]","(0.181, 67.889]","(-0.00282, 0.0]"
close_price_returns_labels,4,4,4,9,4
UP,False,False,False,True,False


Now that we've engineered all our features, we need to convert to input compatible with a neural network.

This includes converting categorical variables into contiguous integers or one-hot encodings, normalizing continuous features to standard normal, etc...

In [36]:
joined.columns


Index(['DateTime_UTC', 'price_open', 'price_high', 'price_low', 'price_close',
       'volume', 'close_price_returns_x', 'close_price_returns_bins',
       'close_price_returns_labels', 'UP', 'DOWN', 'HOLD', 'DateTime_UTCYear',
       'DateTime_UTCMonth', 'DateTime_UTCWeek', 'DateTime_UTCDay',
       'DateTime_UTCDayofweek', 'DateTime_UTCDayofyear',
       'DateTime_UTCIs_month_end', 'DateTime_UTCIs_month_start',
       'DateTime_UTCIs_quarter_end', 'DateTime_UTCIs_quarter_start',
       'DateTime_UTCIs_year_end', 'DateTime_UTCIs_year_start',
       'DateTime_UTCElapsed', 'close_price_returns_y', 'Since_01', 'Since_025',
       'Since_05', 'Since_075', 'Since_1', 'Since_2', 'Since_3', 'Since_4',
       'Since_5', 'Since_01_down', 'Since_025_down', 'Since_05_down',
       'Since_075_down', 'Since_1_down', 'Since_2_down', 'Since_3_down',
       'Since_4_down', 'Since_5_down', 'Since_01_up', 'Since_025_up',
       'Since_05_up', 'Since_075_up', 'Since_1_up', 'Since_2_up', 'Since_3_up',
  

In [37]:
# cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
#     'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
#     'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
#     'SchoolHoliday_fw', 'SchoolHoliday_bw']

# contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
#    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
#    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
#    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

# Continuous inputs: raw OHLC prices, traded volume and the per-period return.
# `close_price_returns_x` is the copy of the returns column that came from the
# left-hand frame (`data`) of the merge.
contin_vars = [
    'price_open',
    'price_high',
    'price_low',
    'price_close',
    'volume',
    'close_price_returns_x']

# Inputs treated as categorical: the return bucket label, the direction flags,
# calendar parts and every "rows since ..." counter.
# NOTE(review): the Since_* counters are unbounded integers, so treating them
# as categories may produce very high cardinality - confirm this is intended.
cat_vars = ['close_price_returns_labels', 'UP', 'DOWN', 'HOLD', 'DateTime_UTCYear', 
    'DateTime_UTCMonth', 'DateTime_UTCWeek', 'DateTime_UTCDay', 'DateTime_UTCDayofweek', 
    'Since_01', 'Since_025', 'Since_05', 'Since_075', 'Since_1', 'Since_2', 'Since_3', 
    'Since_4', 'Since_5', 'Since_01_down', 'Since_025_down', 'Since_05_down', 
    'Since_075_down', 'Since_1_down', 'Since_2_down', 'Since_3_down', 'Since_4_down', 
    'Since_5_down', 'Since_01_up', 'Since_025_up', 'Since_05_up', 'Since_075_up', 
    'Since_1_up', 'Since_2_up', 'Since_3_up', 'Since_4_up', 'Since_5_up', 'SinceUP', 
    'SinceDown']

# Total number of rows in the joined frame (shown as the cell output).
n = len(joined); n

343229

In [None]:
# Keep only the model inputs, the dependent variable and the date column.
# NOTE(review): 'Sales' and 'Date' are not among the columns listed for `joined`
# (the timestamp column there is 'DateTime_UTC'); this looks like carried-over
# template code - confirm the intended target and date column before running.
dep = 'Sales'
joined = joined[cat_vars+contin_vars+[dep, 'Date']].copy()

In [None]:
# Give the test frame a placeholder target so it has the same columns as `joined`.
# NOTE(review): `joined_test` is not created anywhere in the visible part of
# this notebook, and 'Date' / 'Id' are not columns seen above - confirm where
# the test frame comes from.
joined_test[dep] = 0
joined_test = joined_test[cat_vars+contin_vars+[dep, 'Date', 'Id']].copy()

In [None]:
for v in cat_vars: joined[v] = joined[v].astype('category').cat.as_ordered()

In [None]:
apply_cats(joined_test, joined)

In [None]:
for v in contin_vars:
    joined[v] = joined[v].astype('float32')
    joined_test[v] = joined_test[v].astype('float32')

We're going to run on a sample.

In [None]:
idxs = get_cv_idxs(n, val_pct=150000/n)
joined_samp = joined.iloc[idxs].set_index("Date")
samp_size = len(joined_samp); samp_size

To run on the full dataset, use this instead:

In [None]:
samp_size = n
joined_samp = joined.set_index("Date")

We can now process our data...

In [None]:
joined_samp.head(2)

In [None]:
# Note that this rebinds `df`, which earlier held the merged feature frame.
# NOTE(review): proc_df is presumably splitting off the 'Sales' column as `y`
# and scaling the continuous inputs (do_scale=True) - confirm against the
# fastai.structured docs; 'Sales' itself is not a column of this dataset.
df, y, nas, mapper = proc_df(joined_samp, 'Sales', do_scale=True)
# The target is modelled in log space (inverted later with np.exp).
yl = np.log(y)

In [None]:
joined_test = joined_test.set_index("Date")

In [None]:
df_test, _, nas, mapper = proc_df(joined_test, 'Sales', do_scale=True, skip_flds=['Id'],
                                  mapper=mapper, na_dict=nas)

In [None]:
df.head(2)

In time series data, cross-validation is not random. Instead, our holdout data is generally the most recent data, as it would be in real application. This issue is discussed in detail in [this post](http://www.fast.ai/2017/11/13/validation-sets/) on our web site.

One approach is to take the last 25% of rows (sorted by date) as our validation set.

In [None]:
train_ratio = 0.75
# train_ratio = 0.9
train_size = int(samp_size * train_ratio); train_size
val_idx = list(range(train_size, len(df)))

An even better option for picking a validation set is using the exact same length of time period as the test set uses - this is implemented here:

In [None]:
# Positions of the rows whose index falls between 2014-08-01 and 2014-09-17, inclusive.
# NOTE(review): the first row shown for this dataset is 2014-12-01, so this
# window presumably selects nothing here - looks like template dates; confirm.
val_idx = np.flatnonzero(
    (df.index<=datetime.datetime(2014,9,17)) & (df.index>=datetime.datetime(2014,8,1)))

In [None]:
# Overrides any val_idx chosen in the cells above with a single row (position 0).
# NOTE(review): presumably so that almost all rows are used for training - confirm.
val_idx=[0]

## DL

We're ready to put together our models.

Root-mean-squared percent error is the metric Kaggle used for this competition.

In [None]:
def inv_y(a):
    """Undo the log transform of the target: return ``exp(a)``."""
    return np.exp(a)


def exp_rmspe(y_pred, targ):
    """Root-mean-squared percentage error for log-space predictions.

    Both arguments are exponentiated with ``inv_y`` first; the error of each
    element is then taken relative to the exponentiated target.

    Parameters
    ----------
    y_pred : array-like
        Predictions in log space.
    targ : array-like
        Targets in log space.

    Returns
    -------
    float
        Square root of the mean squared relative error.
    """
    actual = inv_y(targ)
    predicted = inv_y(y_pred)
    rel_err = (actual - predicted) / actual
    mean_sq = (rel_err ** 2).mean()
    return math.sqrt(mean_sq)

max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

We can create a ModelData object directly from our data frame.

In [None]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl.astype(np.float32), cat_flds=cat_vars, bs=128,
                                       test_df=df_test)

Some categorical variables have a lot more levels than others. Store, in particular, has over a thousand!

In [None]:
# One (column name, size) pair per categorical: its number of category levels plus one.
# NOTE(review): the extra slot is presumably reserved for unknown/missing values - confirm.
cat_sz = [(c, len(joined_samp[c].cat.categories)+1) for c in cat_vars]

In [None]:
cat_sz

We use the *cardinality* of each variable (that is, its number of unique values) to decide how large to make its *embeddings*. Each level will be associated with a vector with length defined as below.

In [None]:
# One (cardinality, embedding width) pair per categorical: the width is half
# the cardinality, rounded up, and capped at 50.
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [None]:
emb_szs

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.lr_find()

In [None]:
m.sched.plot(100)

### Sample

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe])

In [None]:
m.fit(lr, 5, metrics=[exp_rmspe], cycle_len=1)

In [None]:
m.fit(lr, 2, metrics=[exp_rmspe], cycle_len=4)

### All

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.fit(lr, 1, metrics=[exp_rmspe])

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe])

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe], cycle_len=1)

### Test

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe])

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe], cycle_len=1)

In [None]:
m.save('val0')

In [None]:
m.load('val0')

In [None]:
x,y=m.predict_with_targs()

In [None]:
exp_rmspe(x,y)

In [None]:
pred_test=m.predict(True)

In [None]:
pred_test = np.exp(pred_test)

In [None]:
joined_test['Sales']=pred_test

In [None]:
csv_fn=f'{PATH}tmp/sub.csv'

In [None]:
joined_test[['Id','Sales']].to_csv(csv_fn, index=False)

In [None]:
FileLink(csv_fn)

## RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
((val,trn), (y_val,y_trn)) = split_by_idx(val_idx, df.values, yl)

In [None]:
m = RandomForestRegressor(n_estimators=40, max_features=0.99, min_samples_leaf=2,
                          n_jobs=-1, oob_score=True)
m.fit(trn, y_trn);

In [None]:
preds = m.predict(val)
m.score(trn, y_trn), m.score(val, y_val), m.oob_score_, exp_rmspe(preds, y_val)