#Predicting Bitcoin Price

##Data Analysis

Use the exported data from 'Data Collection and Exploration' notebook to create a model for predicting the next day market price of bitcoin.

###Imports

In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import random
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

###Data Munging

Create the outcome variable `next_day`.

data = pd.read_csv('bitcoin_price.csv')

data['trend'] = 0

next_day = data['price_usd'].iloc[1:]
data = data.iloc[:-1,:]
data['next_day'] = next_day.values

Create features for moving averages for price and volume.

In [None]:
data['ma_price_3'] = pd.rolling_mean(data['price_usd'], 3)
data['ma_price_7'] = pd.rolling_mean(data['price_usd'], 7)

data['ma_volume_3'] = pd.rolling_mean(data['transaction_vol'], 3)
data['ma_volume_7'] = pd.rolling_mean(data['transaction_vol'], 7)

data.dropna()

Read in the Google Trends result and merge with the main data set.

In [None]:
def merge_columns(main, other):
    result = pd.merge(left=main,right=other, how='outer', left_on='date', right_on='date')
    
    return result


trends = pd.read_csv('bitcoin_trends.csv')
data = merge_columns(data, trends)


Select data subset of interest. Early timepoints with very low or sparse data are excluded from analysis.

In [None]:
data = data[(data.date >= '2011-01-01') & (data.date <= '2015-12-31')]

columns = set(data.columns.values)
selected_columns = list(columns - set(['date', 'Unnamed: 0', 'block_transactions']))


df = data[selected_columns]
print selected_columns

### Ordinary Least-Squares Regression Model

In [None]:
lm = smf.ols('next_day ~ exchange_vol_usd + blockchain_size + trend + miners_revenue + orphaned_blocks + transaction_vol + unique_addresses + sub100_transactions + nonpopular_transactions + block_size + total_bitcoins + total_transactions + index + sub10000_transactions + bitcoin + ma_price_7 + ma_price_3 + deficit + market_cap + percent_cost + short_transactions + sub1000_transactions + difficulty + sub10_transactions + all_transactions + trade_transaction_ratio + ma_volume_3 + price_usd + ma_volume_7 + transaction_fees_usd + confirmation_time + transaction_cost + output_vol + hash_rate + transaction_fees', data=df)
fit1 = lm.fit()
print fit1.summary()

TODO: Analyze the P-values of each feature to determine suitability for inclusion in further analysis.

###Linear Regression Analysis

Split the data into training and test set.

In [None]:
np.random.seed(1)
random.seed(1)
data = data.loc[np.random.permutation(data.index)]

# Select 70% of the dataset to be training data
highest_train_row = int(data.shape[0] * .7)
train = data.loc[:highest_train_row,:]

# Select 30% of the dataset to be test data.
test = data.loc[highest_train_row:,:]

Define the method to generate a regression model.

In [None]:
def regression_model(predictor):
    regressor= LinearRegression()

    regressor.fit(train[predictor], train['next_day'])
    predictions = regressor.predict(test[predictor])
    mse = sum((predictions - test['next_day']) ** 2) / len(predictions)

    return regressor, predictions, mse

Define the method to visualize the model statistics. Make scatterplots with the actual values in the training set and test set.

In [None]:
def draw_scatterplot(predictor, regressor, predictions):
    
    plt.scatter(train[predictor], train['next_day'])
    plt.plot(train[predictor], regressor.predict(train[predictor]))
    plt.show()
    plt.scatter(test[predictor], test['next_day'])
    plt.plot(test[predictor], predictions)
    plt.show()


Try out different models parameters.

In [None]:
rm , predictions, mse = regression_model(['price_usd'])
#draw_scatterplot('price_usd', rm, predictions)
print mse # 206.12217429c

rm, predictions, mse = regression_model(['trend'])
print mse # 46367.4115111

rm, predictions, mse = regression_model(['transaction_vol', 'price_usd'])
print mse # 206.373259397

TODO: Feature selection and feature engineering to further improve the model