# Predicting Bitcoin Price

## Forecasting

Use the data exported from the 'Data Collection and Exploration' notebook to forecast the direction of the bitcoin price.

### Imports

In [2]:
import pandas as pd
from pandas.io.data import DataReader

import numpy as np
from datetime import datetime

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.cross_validation import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline


### Creating Lagged Series

Create a pandas DataFrame that contains the lagged percentage returns for a prior number of days (based on the approach described in [Forecasting Financial Time Series](https://www.quantstart.com/articles/Forecasting-Financial-Time-Series-Part-1)).

Define a method to read in the collated bitcoin dataset and keep only the rows dated from 2012-01-01 through 2015-12-31:

In [3]:
def get_data(filename, index, start='2012-01-01', end='2015-12-31'):
    """Load the price CSV and keep only rows whose ``date`` lies in a window.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pandas.read_csv``.
    index : str or int
        Column to use as the DataFrame index (``index_col`` of ``read_csv``).
    start, end : str
        Inclusive bounds compared against the ``date`` column. The defaults
        reproduce the window that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        The rows with ``start <= date <= end``, indexed by ``index``.

    Notes
    -----
    The file must contain a ``date`` column. ``read_csv`` is called without
    ``parse_dates``, so the comparison uses whatever dtype pandas infers for
    that column; with ISO ``YYYY-MM-DD`` text the string ordering matches the
    chronological ordering.
    """
    data = pd.read_csv(filename, index_col=index)

    # Bracket access cannot be shadowed by a DataFrame attribute or method.
    dates = data["date"]
    in_window = (dates >= start) & (dates <= end)

    return data[in_window]

Define methods to create a series of lagged prices and the daily percentage returns, i.e. how much the price went up or down relative to the previous day's price:

In [4]:
def create_lagged_df(data, lags):
    """Build a frame holding the price and its ``lags`` previous values.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``price_usd`` column.
    lags : int
        Number of lagged columns to create. ``0`` yields only ``price_usd``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``data`` with columns ``price_usd``, ``lag1`` ...
        ``lag<lags>``, where ``lagK`` is ``price_usd`` shifted down by K rows
        (so the first K rows of ``lagK`` are NaN).
    """
    datalag = pd.DataFrame(index=data.index)
    datalag["price_usd"] = data["price_usd"]

    # ``range`` (not the Python-2-only ``xrange``) keeps this runnable on
    # both Python 2 and Python 3; iterating 1..lags avoids the i+1 arithmetic.
    for lag in range(1, lags + 1):
        datalag["lag%s" % lag] = data["price_usd"].shift(lag)

    return datalag


def create_returns_df(datalag):
    """Return a one-column frame of row-over-row percentage price changes.

    The ``price_usd`` column of the result is ``datalag["price_usd"]``
    expressed as the percentage change from the previous row (first row is
    NaN); the index is that of ``datalag``.
    """
    pct_returns = datalag["price_usd"].pct_change() * 100.0
    return pct_returns.to_frame(name="price_usd")

In [5]:
def create_lagged_series(lags=30):
    """Build the modelling frame of percentage returns, lags and direction.

    Reads ``bitcoin_price.csv`` (indexed by ``trading_day``) through
    ``get_data``, then derives:

    * ``price_usd`` -- percentage change of the price from the previous row;
    * ``lag1`` ... ``lag<lags>`` -- percentage change of the price lagged by
      that many rows;
    * ``direction`` -- ``numpy.sign`` of ``price_usd`` (+1 / -1, NaN where the
      return is NaN).

    Parameters
    ----------
    lags : int, default 30
        Number of lagged-return columns to create.

    Returns
    -------
    pandas.DataFrame
        Indexed like the loaded data. Leading rows contain NaN wherever a
        lag is not yet available; callers are expected to drop them.
    """
    data = get_data('bitcoin_price.csv', 'trading_day')
    datalag = create_lagged_df(data, lags)
    returns = create_returns_df(datalag)

    # Any return with magnitude below 0.0001 (including small negative ones)
    # is replaced by +0.0001 so that ``direction`` is never 0 (stops issues
    # with the QDA model in scikit-learn). A single boolean-mask ``.loc``
    # assignment replaces the former per-element chained assignment
    # ``returns["price_usd"][i] = ...``, which wrote through an intermediate
    # Series using a positional integer on a label index and is not
    # guaranteed to modify ``returns``. NaN rows compare False and are left
    # untouched, exactly as before.
    near_zero = returns["price_usd"].abs() < 0.0001
    returns.loc[near_zero, "price_usd"] = 0.0001

    # Lagged percentage returns, one column per lag.
    for lag in range(1, lags + 1):
        column = "lag%s" % lag
        returns[column] = datalag[column].pct_change() * 100.0

    # +1 for an up day, -1 for a down day.
    returns["direction"] = np.sign(returns["price_usd"])

    return returns

Define a method to fit the model on the training set, predict on the test data whether the price of bitcoin will increase or decrease from the prior day's price, then calculate the hit rate:

In [6]:
def fit_model(name, model, X_train, y_train, X_test, pred):
    """Fit ``model``, predict on the test set and print its hit rate.

    Parameters
    ----------
    name : str
        Label for the model. Used as the name of the prediction column and
        as the prefix of the ``"<name>_Correct"`` column.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training predictors and targets, passed to ``model.fit``.
    X_test
        Predictors passed to ``model.predict``.
    pred : pandas.DataFrame
        Modified in place. Must already contain an ``"Actual"`` column. The
        hit-rate arithmetic assumes both ``"Actual"`` and the predictions are
        coded as +1 / -1.

    Returns
    -------
    None
        The hit rate and the fitted model's repr are printed, not returned.
    """
    # Fit on the training data, then predict on the test data.
    fitted_model = model.fit(X_train, y_train)
    pred[name] = model.predict(X_test)

    # With +1/-1 labels the product is +1 on agreement and -1 otherwise, so
    # (1 + product) / 2 maps it to 1.0 (correct) or 0.0 (wrong).
    correct_col = "%s_Correct" % name
    pred[correct_col] = (1.0 + pred[name] * pred["Actual"]) / 2.0
    hit_rate = np.mean(pred[correct_col])

    # Single-argument print(...) is valid on both Python 2 and Python 3 and
    # emits exactly what the former print statements did (including the
    # space before the trailing blank line).
    print("%s: %.3f" % (name, hit_rate))
    print("%s \n" % (fitted_model,))

Create the lagged series and exclude NAs:

In [9]:
# Build 14 lagged-return columns and drop every row that has a NaN in any
# column (the leading rows, where not all lags are available yet).
btc = create_lagged_series(lags=14).dropna()

# Only the previous day's return is used as a predictor of today's direction.
predictors = ["lag1"]
X, y = btc[predictors], btc["direction"]

Create the training and test sets

In [10]:
# Chronological hold-out: rows indexed before start_test are used for
# training, rows from start_test onwards for testing.
start_test = '2015-09-01'

# X and y are both taken from `btc`, so they share one index and one pair of
# masks selects matching rows from each.
in_train = X.index < start_test
in_test = X.index >= start_test

X_train, X_test = X[in_train], X[in_test]
y_train, y_test = y[in_train], y[in_test]

Create the prediction DataFrame:

In [11]:
# One row per test observation; each model later adds its own prediction and
# "<name>_Correct" columns next to the actual direction.
pred = pd.DataFrame({"Actual": y_test}, index=y_test.index)

Create and fit three basic models:

In [12]:
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the former print statements were a SyntaxError on Python 3.
print("Hit Rates:")

# (label, estimator) pairs; each one is fitted on the training set and scored
# on the test set, adding its columns to `pred`.
models = [("LR", LogisticRegression()), ("LDA", LDA()), ("QDA", QDA())]
for name, model in models:
    fit_model(name, model, X_train, y_train, X_test, pred)

print(pred.head())

Hit Rates:
LR: 0.598
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

LDA: 0.598
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001) 

QDA: 0.590
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariances=False, tol=0.0001) 

             Actual  LR  LR_Correct  LDA  LDA_Correct  QDA  QDA_Correct
trading_day                                                            
2015-09-01       -1   1           0    1            0    1            0
2015-09-02        1   1           1    1            1    1            1
2015-09-03       -1   1           0    1            0    1            0
2015-09-04        1   1           1    1            1    1            1
2015-