In [65]:
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

#For Prediction
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing, cross_validation, svm

In [52]:
# Read Datasets 
research_data_raw = pd.read_csv('Datasets/bitcoin_research_data_v2.csv')
# Drop unnecessary columns
research_data = research_data_raw.drop('Unnamed: 0', axis=1)
research_data.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USD,Popularity,Popularity_Normalized
0,2019-05-11,BTCUSD,6337.9,6984.8,6337.9,6793.5,9855.87,66133073.75,12.0,1200.0
1,2019-05-10,BTCUSD,6151.4,6430.0,6110.1,6337.9,9872.86,62166515.28,12.0,1200.0
2,2019-05-09,BTCUSD,5939.6,6174.0,5933.2,6151.4,7405.18,44816872.39,12.0,1200.0
3,2019-05-08,BTCUSD,5744.0,5983.2,5660.0,5939.6,6007.11,35184697.52,12.0,1200.0
4,2019-05-07,BTCUSD,5687.4,5955.8,5687.4,5906.7,3668.12,21496575.65,12.0,1200.0


# Implementation 1 - Same as 2, but drop of columns happen after shift

In [162]:
# Prediction with -1 day window
research_data_imp1 = research_data.copy()
research_data_imp1['Prediction'] = research_data[['Close']].shift(-19) # Prediction has the closing price of previous day
research_data_imp1.dropna(inplace=True) # Drop last row which has NAN as prediction
research_data_imp1.tail()


Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USD,Popularity,Popularity_Normalized,Prediction
1784,2014-06-10,BTCUSD,643.44,657.72,643.44,646.89,7.13,4677.87,3.0,300.0,530.0
1785,2014-06-09,BTCUSD,646.54,656.44,636.5,643.44,4.65,3000.62,3.0,300.0,494.77
1786,2014-06-08,BTCUSD,659.8,659.8,646.0,646.54,0.888,582.0,3.0,300.0,493.43
1787,2014-06-07,BTCUSD,653.4,659.8,636.68,659.8,4.75,3083.79,3.0,300.0,445.56
1788,2014-06-06,BTCUSD,657.01,660.0,653.4,653.4,5.27,3468.93,3.0,300.0,445.56


In [163]:
# Split data - One day out (check if correct)
X = np.array(research_data_imp1.drop(['Prediction', 'Symbol', 'Date', 'Popularity'], 1)) # Create X with numeric columns only
Y = np.array(research_data_imp1['Prediction']) # Y with prediction column
X = preprocessing.scale(X) # Standardize values
X_prediction = X[-19:] # last (x) rows in X (most far out date)
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.5)

In [164]:
X_prediction

array([[-0.644441  , -0.64285748, -0.64832401, -0.64490084, -0.77236755,
        -0.58744856, -0.52504988],
       [-0.64108232, -0.64258127, -0.64585046, -0.64539254, -0.77242283,
        -0.58745282, -0.52504988],
       [-0.64268778, -0.64091582, -0.645752  , -0.64203306, -0.77092234,
        -0.58731882, -0.52504988],
       [-0.64174723, -0.64196322, -0.64684406, -0.6436389 , -0.77291662,
        -0.58749621, -0.45021009],
       [-0.6411704 , -0.64079002, -0.64761387, -0.64269814, -0.77158835,
        -0.58737919, -0.45021009],
       [-0.63834024, -0.63415554, -0.6429562 , -0.64212117, -0.77090391,
        -0.58731357, -0.45021009],
       [-0.63746505, -0.63471616, -0.64018428, -0.63929034, -0.77048536,
        -0.58727461, -0.45021009],
       [-0.64082658, -0.63909721, -0.64530443, -0.63841494, -0.77238071,
        -0.58744798, -0.45021009],
       [-0.6429833 , -0.63802246, -0.64890585, -0.64177726, -0.77030898,
        -0.58726342, -0.45021009],
       [-0.64694723, -0.6444

In [165]:
clf = LinearRegression()
clf.fit(X_train, Y_train)
prediction = (clf.predict(X_prediction))
confidence = clf.score(X_test, Y_test)

In [166]:
prediction

array([615.35843868, 620.74284071, 621.86880564, 558.84774505,
       563.67867511, 583.2131858 , 581.18788986, 566.88803088,
       568.49125621, 553.19547805, 566.64371177, 560.50536631,
       607.60806287, 616.88469632, 614.38082102, 616.79383038,
       624.33363071, 622.82904638, 621.176964  ])

In [167]:
confidence

0.9214270336822228

# Implementation 2 - Seems cleaner

In [211]:
research_data_imp2 = research_data.copy()
research_data_imp2.sort_values(by='Date', inplace=True, ascending=True) # Sort by date ascending
forecast_out = int(math.ceil(0.0001 * len(research_data_imp2))) # 0.01% of data
print(len(research_data_imp2))
print(forecast_out)

1808
1


In [212]:
research_data_imp2['Prediction'] = research_data_imp2['Close'].shift(-forecast_out)
research_data_imp2.tail()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USD,Popularity,Popularity_Normalized,Prediction
4,2019-05-07,BTCUSD,5687.4,5955.8,5687.4,5906.7,3668.12,21496575.65,12.0,1200.0,5939.6
3,2019-05-08,BTCUSD,5744.0,5983.2,5660.0,5939.6,6007.11,35184697.52,12.0,1200.0,6151.4
2,2019-05-09,BTCUSD,5939.6,6174.0,5933.2,6151.4,7405.18,44816872.39,12.0,1200.0,6337.9
1,2019-05-10,BTCUSD,6151.4,6430.0,6110.1,6337.9,9872.86,62166515.28,12.0,1200.0,6793.5
0,2019-05-11,BTCUSD,6337.9,6984.8,6337.9,6793.5,9855.87,66133073.75,12.0,1200.0,


In [222]:
X = np.array(research_data_imp2.drop(['Prediction', 'Symbol', 'Date', 'Popularity'], 1)) # Create X with numeric columns only
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# research_data_imp2.dropna(inplace=True)

y = np.array(research_data_imp2['Prediction'])
y = y[:-forecast_out]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
prediction = (clf.predict(X_test))

In [223]:
confidence

0.9958944546514997

In [225]:
prediction_lately = (clf.predict(X_lately))
prediction_lately

array([6805.626376])

In [224]:
# Evaluate the prediction accuracy of the model
from sklearn import metrics

print("The Explained Variance: %.2f" % clf.score(X_test, y_test))  
print("The Mean Absolute Error: %.2f" % metrics.mean_absolute_error(y_test, prediction))  
print("The Median Absolute Error: %.2f" % metrics.median_absolute_error(y_test, prediction)) 
print("The Mean Squared Error: %.2f" % metrics.mean_squared_error(y_test, prediction)) 
print("The Root Mean Squared Error: %.2f" % (np.sqrt(metrics.mean_squared_error(y_test, prediction))))

The Explained Variance: 1.00
The Mean Absolute Error: 97.12
The Median Absolute Error: 18.53
The Mean Squared Error: 52674.21
The Root Mean Squared Error: 229.51


# Implementation 3 - Wrong

In [83]:
research_data_imp3 = research_data.copy()
research_data_imp3 = research_data_imp3.drop(['Symbol', 'Date'], 1)
research_data_imp3.head()

Unnamed: 0,Open,High,Low,Close,Volume BTC,Volume USD,Popularity,Popularity_Normalized
0,6337.9,6984.8,6337.9,6793.5,9855.87,66133073.75,12.0,1200.0
1,6151.4,6430.0,6110.1,6337.9,9872.86,62166515.28,12.0,1200.0
2,5939.6,6174.0,5933.2,6151.4,7405.18,44816872.39,12.0,1200.0
3,5744.0,5983.2,5660.0,5939.6,6007.11,35184697.52,12.0,1200.0
4,5687.4,5955.8,5687.4,5906.7,3668.12,21496575.65,12.0,1200.0


In [85]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split

X = research_data_imp3['Close'].to_frame() # independent variable - Close
y = research_data_imp3['Popularity'].to_frame() # dependent variable - Popularity
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [86]:
print("X Train Size:", len(X_train))
print("y Train Size:", len(y_train))
print("X Test Size:", len(X_test))
print("y Test Size:", len(y_test))

X Train Size: 1446
y Train Size: 1446
X Test Size: 362
y Test Size: 362


In [87]:
from sklearn.linear_model import LinearRegression
# instantiate the regressor class
regressor = LinearRegression()
# fit the build the model by fitting the regressor to the training data
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [88]:
# make a prediction set using the test set
prediction = regressor.predict(X_test)

In [90]:
# Evaluate the prediction accuracy of the model
from sklearn import metrics

print("The Explained Variance: %.2f" % regressor.score(X_test, y_test))  
print("The Mean Absolute Error: %.2f" % metrics.mean_absolute_error(y_test, prediction))  
print("The Median Absolute Error: %.2f" % metrics.median_absolute_error(y_test, prediction)) 
print("The Mean Squared Error: %.2f" % metrics.mean_squared_error(y_test, prediction)) 
print("The Root Mean Squared Error: %.2f" % (np.sqrt(metrics.mean_squared_error(y_test, prediction))))

The Explained Variance: 0.64
The Mean Absolute Error: 3.83
The Median Absolute Error: 1.49
The Mean Squared Error: 44.10
The Root Mean Squared Error: 6.64
