In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

# Change the plot size.
plt.rcParams['figure.figsize'] = [18.0, 12.0]

In [None]:
import pandas as pd
import numpy as np

In [None]:
from pandas_summary import DataFrameSummary

In [None]:
# import my Finance Analysis Tools (fat)
import finance as fat

In [None]:
# full imports of fastai fail on windows, so just import some of it for now
from fastai.structured import add_datepart, apply_cats, proc_df

Don't run the next cell on Windows; only run it on Paperspace.

In [None]:
# Full fastai imports (the same three lines were previously pasted twice in this cell).
# NOTE(review): later cells presumably rely on names these wildcard imports provide
# (e.g. ColumnarModelData) — confirm which names are actually needed and import them explicitly.
from fastai.structured import *
from fastai.column_data import *
# Abbreviate printed numpy arrays: summarise arrays with more than 50 elements,
# showing 20 items at each edge.
np.set_printoptions(threshold=50, edgeitems=20)

In [None]:
# Working directory for this experiment: passed to ColumnarModelData.from_data_frame and
# used to build the output CSV path. Note there is no trailing slash.
PATH = 'data/gold-test'

## Load basic data

In [None]:
# Ticker symbol whose price history is loaded through the local finance (fat) helper module.
ticker = 'IAU'
data = fat.get_price_data(ticker)

In [None]:
data.tail()

## Create Features

We're going to try to predict tomorrow's closing price, so add it as a column.

In [None]:
# shift(-1) moves every value up one row, so each row holds the next row's 'Adj Close';
# the final row has no successor and becomes NaN.
data['Tomorrow Adj Close'] = data['Adj Close'].shift(-1)

In [None]:
data.tail()

We'll also add Google Trends data for searches for "gold price usd".

In [None]:
# The search term is also used later as a column name (see the plot cell and contin_vars).
search = "gold price usd"
trends = fat.get_google_trends_df(data, search)

In [None]:
trends.tail()

In [None]:
#related_queries

In [None]:
# Attach the trends columns to the price rows, matching on the index (left join by default).
data = data.join(trends)

In [None]:
# Discard rows that have any missing value, then preview the last 14 remaining rows.
data = data.dropna()
data.tail(14)

In [None]:
# Plot the adjusted close and the search-interest series on one chart; the search series
# is drawn against a secondary (right-hand) y-axis.
plot_data = data[['Adj Close', search]]
plot_data.plot(secondary_y = [search])

Add separate columns for various date parts.

In [None]:
# add_datepart is given the 'Date' column by name, so move the index into a regular column first.
data = data.reset_index()
# The return value is ignored, so this relies on add_datepart modifying `data` in place.
# drop=False keeps the original 'Date' column so it can become the index again below.
add_datepart(data, 'Date', drop=False)
data = data.set_index('Date')

In [None]:
data.tail()

Add columns holding the percent difference between the adjusted close and its Simple Moving Averages (SMAs) over several periods of days.

In [None]:
# Build the SMA percent-difference features for 'Adj Close' with the fat helper.
sma_pct_diff = fat.get_sma_pct_diff_df(data, 'Adj Close')
# `data` already has an 'Adj Close' column; remove the helper's copy so the join
# does not hit an overlapping column name.
del sma_pct_diff['Adj Close']
data = data.join(sma_pct_diff)

In [None]:
# Drop rows with any missing value.
# NOTE(review): presumably these are the leading rows without enough history for the
# longest moving average — confirm against fat.get_sma_pct_diff_df.
data = data.dropna()

In [None]:
data.tail()

In [None]:
DataFrameSummary(data).summary().T

In [None]:
data.columns

Break the columns up into "category variables" and "continuous variables".

In [None]:
# Columns the model treats as categorical (each gets an embedding further down).
cat_vars = [
    'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
    'Days since updated gold price usd',
    'Is_month_end', 'Is_month_start',
    'Is_quarter_end', 'Is_quarter_start',
    'Is_year_end', 'Is_year_start',
]

# Columns the model treats as continuous: raw price/volume fields, the search-interest
# series, elapsed time, and one SMA percent-difference column per period.
contin_vars = [
    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
    'gold price usd', 'Elapsed',
] + [f'pct diff Adj Close SMA{period}' for period in (3, 5, 10, 20, 50, 100, 200)]

In [None]:
# Sanity check that every column of `data` is accounted for: the categorical and
# continuous lists plus exactly one more column (the 'Tomorrow Adj Close' target).
print(f'len(cat_vars) = {len(cat_vars)}')
print(f'len(contin_vars) = {len(contin_vars)}')
print(f'len(data.columns) = {len(data.columns)}')
assert(len(data.columns) == (len(cat_vars) + len(contin_vars) + 1))

Split data into train and test sets.

In [None]:
# Positional 80/20 split without shuffling: the first 80% of rows become the training
# frame and the remaining rows the test frame. Copies keep later edits off `data`.
data_len = len(data)
train_len = int(0.8 * data_len)
joined = data.iloc[:train_len].copy()
joined_test = data.iloc[train_len:].copy()

In [None]:
#train.tail().T

In [None]:
#test.head().T

In [None]:
#test.tail().T

In [None]:
# Name of the dependent variable (the column the model is trained to predict).
dep = 'Tomorrow Adj Close'
# Turn the 'Date' index back into a column so it can be selected by name below.
joined = joined.reset_index()
# Keep only the model inputs, the target and the date, in a fixed column order.
joined = joined[cat_vars+contin_vars+[dep, 'Date']].copy()

In [None]:
joined.tail().T

In [None]:
# Fill the target column of the test frame with a placeholder value.
# NOTE(review): joined_test was sliced from `data`, so it held the real
# 'Tomorrow Adj Close' values; this overwrites them, and test predictions can no
# longer be scored against the truth from this frame — confirm that is intended.
joined_test[dep] = 0
joined_test = joined_test.reset_index()
joined_test.head().T

In [None]:
# Same column selection and order as the training frame.
joined_test = joined_test[cat_vars+contin_vars+[dep,'Date']].copy()

In [None]:
# Remove any rows that still contain missing values from both frames.
joined = joined.dropna()
joined_test = joined_test.dropna()

In [None]:
# Convert every categorical column of the training frame to an ordered pandas categorical.
# The test frame is not converted here; it is handled by apply_cats in the next cell.
for col in cat_vars:
    joined[col] = joined[col].astype('category').cat.as_ordered()

In [None]:
# The return value is unused, so apply_cats presumably updates joined_test in place, giving
# its columns the same categories as the matching columns in joined — TODO confirm against fastai.
apply_cats(joined_test, joined)

We're going to run on full sample size.

In [None]:
# No subsampling: the "sample" is the whole training frame, re-indexed by date.
samp_size = len(joined)
joined_samp = joined.set_index("Date")

We can now process our data...

In [None]:
# proc_df is called with the dependent column name and scaling enabled; it returns the
# feature frame, the targets, and the `nas` / `mapper` objects that are passed back in
# when the test frame is processed later. NOTE(review): exact semantics per fastai docs.
df, y, nas, mapper = proc_df(joined_samp, dep, do_scale=True)
# Train against log prices; predictions are mapped back with exp (see inv_y / np.exp below).
yl = np.log(y)

In [None]:
df.head(2).T

In time series data, cross-validation is not random. Instead, our holdout data is generally the most recent data, as it would be in real application. This issue is discussed in detail in [this post](http://www.fast.ai/2017/11/13/validation-sets/) on our web site.

One approach is to take the last 25% of rows (sorted by date) as our validation set.

In [None]:
# Hold out the trailing rows for validation: everything from position train_size onward.
train_ratio = 0.75
train_size = int(samp_size * train_ratio)
val_idx = list(range(train_size, len(df)))

In [None]:
#del joined_test[dep]

In [None]:
#for v in cat_vars: 
#    #joined[v] = joined[v].astype('category').cat.as_ordered()
#    joined_test[v] = joined_test[v].astype('category').cat.as_ordered()

In [None]:
# Index the test frame by date, mirroring joined_samp on the training side.
joined_test = joined_test.set_index('Date')

In [None]:
joined_test.tail().T

In [None]:
# Process the test frame with the `mapper` and `nas` obtained from the training frame,
# NOTE(review): presumably so it is scaled and NA-filled the same way — confirm in fastai docs.
# The returned targets are discarded because the test target column is only a placeholder.
df_test, _, nas, mapper = proc_df(joined_test, dep, do_scale=True,
                                  mapper=mapper, na_dict=nas)

In [None]:
# NOTE(review): both frames already had dropna() applied in an earlier cell, so this
# looks redundant — confirm and consider removing.
joined = joined.dropna()
joined_test = joined_test.dropna()

## Deep Learning

We're ready to put together our models.

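Root-mean-squared percent error (RMSPE) is our evaluation metric; it is computed on the original price scale after undoing the log transform. (The metric choice is carried over from a Kaggle competition.)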

In [None]:
def inv_y(a):
    """Undo the log transform applied to the target: return exp(a)."""
    return np.exp(a)

def exp_rmspe(y_pred, targ):
    """Root-mean-squared percent error, measured on the original (un-logged) scale.

    Both arguments are expected in log space; each is passed through inv_y first.

    Args:
        y_pred: predicted log values (array-like supporting numpy arithmetic).
        targ: true log values, same shape as y_pred.

    Returns:
        sqrt(mean(((targ - pred) / targ) ** 2)) as a Python float.
    """
    # Import locally so the metric does not depend on a wildcard import
    # elsewhere in the notebook happening to expose `math`.
    import math

    targ = inv_y(targ)
    pct_var = (targ - inv_y(y_pred)) / targ
    return math.sqrt((pct_var**2).mean())

# Output range later passed to get_learner as y_range (log scale): from 0 up to
# 20% above the largest log target in the training data.
max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

We can create a ModelData object directly from our data frame.

In [None]:
# before: bs=128, changed to bs=8 to get results while finding learning rate
# The test set must be the proc_df output (df_test): it has the target column removed
# and went through the same mapper / NA handling as the training frame `df`.
# Passing the raw joined_test would feed the model a frame that still contains the
# target column and was never processed.
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl.astype(np.float32), cat_flds=cat_vars, bs=8,
                                       test_df=df_test)

Compute the cardinality (number of categories, plus one) of each categorical variable.

In [None]:
# One (column name, number of categories + 1) pair per categorical variable.
# NOTE(review): the +1 presumably reserves a slot for unknown/missing values — confirm.
cat_sz = [(c, len(joined_samp[c].cat.categories)+1) for c in cat_vars]
cat_sz

We use the *cardinality* of each variable (that is, its number of unique values) to decide how large to make its *embeddings*. Each level will be associated with a vector with length defined as below.

In [None]:
# One (cardinality, embedding width) pair per categorical variable; the width is
# half the cardinality rounded up, capped at 50.
emb_szs = [(n_levels, min(50, (n_levels + 1) // 2)) for _, n_levels in cat_sz]
emb_szs

Find learning rate.

In [None]:
# Build a learner for the learning-rate search. The second argument is the number of
# continuous columns (all columns of df minus the categorical ones).
# NOTE(review): the remaining positional arguments are presumably embedding dropout (0.04),
# output size (1), hidden layer sizes and per-layer dropout — confirm against the
# get_learner signature.
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.lr_find(1e-7,1e-2)

In [None]:
m.sched.plot_lr()

In [None]:
m.sched.plot(100)

### Sample

In [None]:
# Recreate the learner with the same arguments as the learning-rate search cell,
# replacing the learner that lr_find was run on.
# NOTE(review): positional arguments are presumably embedding dropout, output size,
# hidden layer sizes and per-layer dropout — confirm against the get_learner signature.
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe])

In [None]:
m.fit(lr, 5, metrics=[exp_rmspe], cycle_len=1)

In [None]:
m.fit(lr, 2, metrics=[exp_rmspe], cycle_len=4)

In [None]:
m.save('val0')

In [None]:
m.load('val0')

In [None]:
# Predictions and targets, both passed straight to exp_rmspe in the next cell.
# NOTE(review): this rebinds `y`, which earlier held the targets returned by proc_df.
x,y=m.predict_with_targs()

In [None]:
exp_rmspe(x,y)

In [None]:
pred_test=m.predict(is_test=True)

In [None]:
# The model was trained on log targets (yl), so exponentiate to get back to the original scale.
pred_test = np.exp(pred_test)

In [None]:
# Replace the placeholder target column of the test frame with the model's predictions.
joined_test[dep]=pred_test

In [None]:
# PATH has no trailing slash, so a separator is required here; without it the file
# would land in a sibling directory named 'data/gold-testtmp'.
csv_fn = f'{PATH}/tmp/sub.csv'

In [None]:
# joined_test is indexed by 'Date'; write the index so each prediction stays
# attached to its date (index=False produced a bare column of numbers).
joined_test[[dep]].to_csv(csv_fn, index=True)