# Bitcoin Price Predictor


We want to predict the price of Bitcoin using historical Bitcoin price data together with a measure of market sentiment over time. We plan to experiment with time-series models such as LSTM networks to predict the token price. To measure sentiment, we will use the Fear and Greed Index API, which measures engagement with Bitcoin.



In [1]:
import math # Mathematical functions 
import numpy as np # Fundamental package for scientific computing with Python
import pandas as pd # Additional functions for analysing and manipulating data
from datetime import date, timedelta, datetime # Date Functions
from pandas.plotting import register_matplotlib_converters # This function adds plotting functions for calender dates
import matplotlib.pyplot as plt # Important package for visualization - we use this to plot the market data
import matplotlib.dates as mdates # Formatting dates
# from sklearn.metrics import mean_absolute_error, mean_squared_error # Packages for measuring model performance / errors
from keras.models import Sequential # Deep learning library, used for neural networks
from keras.layers import LSTM, Dense, Dropout # Deep learning classes for recurrent and regular densely-connected layers
from keras.callbacks import EarlyStopping # EarlyStopping during model training
from sklearn.preprocessing import RobustScaler, MinMaxScaler # This Scaler removes the median and scales the data according to the quantile range to normalize the price data 
# import seaborn as sns

# Extracting and Training dataset

In [2]:
# Load the raw minute-level price data (file name indicates Bitstamp BTC/USD,
# 1-minute bars, 2012-01-01 to 2021-03-31) from the local data/ directory
df = pd.read_csv("data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv")

In [3]:
# Report column dtypes and frame size, discard every row that contains a
# missing value, then report again so the effect of the drop is visible
print(df.dtypes, df.shape, sep="\n")  # shape before: (4857377, 8)
df = df.dropna(axis="index", how="any")
print(df.dtypes, df.shape, sep="\n")  # shape after: (3613769, 8)

Timestamp              int64
Open                 float64
High                 float64
Low                  float64
Close                float64
Volume_(BTC)         float64
Volume_(Currency)    float64
Weighted_Price       float64
dtype: object
(4857377, 8)
Timestamp              int64
Open                 float64
High                 float64
Low                  float64
Close                float64
Volume_(BTC)         float64
Volume_(Currency)    float64
Weighted_Price       float64
dtype: object
(3613769, 8)


In [4]:
# Use the Timestamp column as the row index. This mutates df in place, so
# re-running this cell on the same frame raises a KeyError.
df.set_index('Timestamp', inplace=True)
# `data` is an alias, not a copy: both names refer to the same DataFrame
data = df

In [5]:
# Columns used as model inputs. Their order here determines the column order
# of every array derived from df[FEATURES].
FEATURES = [
    'Open',
    'High',
    'Low',
    'Close',
    'Volume_(BTC)',
    'Volume_(Currency)',
    'Weighted_Price',
]

In [6]:
# Keep every row but only the FEATURES columns, in FEATURES order
data_filtered = df.loc[:, FEATURES]

In [7]:
# Build an extended copy of the feature frame with a 'Prediction' column,
# seeded with the Weighted_Price values as placeholders ahead of scaling
data_filtered_ext = data_filtered.assign(
    Prediction=data_filtered['Weighted_Price'],
)

In [8]:
# Optional inspection of the last rows of the extended frame:
# print(data_filtered_ext.tail())

# Row count of the feature frame
nrows = len(data_filtered)

In [9]:

# Materialise the feature frame as a NumPy array, then view it as a 2-D
# (rows, features) matrix and show its shape
np_data_unscaled = np.array(data_filtered)
np_data = np_data_unscaled.reshape(nrows, -1)
print(np_data.shape)

(3613769, 7)


In [10]:
# Min-max scale each feature column independently into the range [0, 1].
# The fitted scaler is kept so the same transform can be reused later.
scaler = MinMaxScaler(feature_range=(0, 1))
np_data_scaled = scaler.fit(np_data_unscaled).transform(np_data_unscaled)

In [11]:
# A second scaler fitted on the target column alone, so that model outputs
# (a single column) can be mapped back to price units.
# Note: despite the "Close" in the names, the column used is Weighted_Price.
scaler_pred = MinMaxScaler()
df_Close = data_filtered_ext['Weighted_Price'].to_frame()
np_Close_scaled = scaler_pred.fit_transform(df_Close)

In [12]:
# Set the sequence length - this is the timeframe used to make a single prediction:
# each model input is a window of this many consecutive rows
sequence_length = 50

In [13]:
# Column position of the prediction target (Weighted_Price) inside the scaled
# feature matrix. That matrix is built from df[FEATURES], so the position is
# looked up in FEATURES itself rather than in `data.columns`, which would give
# the wrong column if `data` ever carried extra or reordered columns.
index_Close = FEATURES.index("Weighted_Price")

In [14]:
# Split the data into train and test sets.
# First step: number of rows in the training portion, i.e. 80% of all rows,
# rounded up to a whole row
train_data_len = math.ceil(0.8 * np_data_scaled.shape[0])

In [15]:
# Training rows: everything before the split point.
train_data = np_data_scaled[:train_data_len]
# Test rows: start sequence_length rows before the split point so that the
# first test window ends right where the training data stops.
test_data = np_data_scaled[train_data_len - sequence_length:]

In [16]:
# The RNN needs data with the format of [samples, time steps, features]
# Here, we create N samples, sequence_length time steps per sample, and one feature per data column (7 in this notebook)
def partition_dataset(sequence_length, data, target_index=None):
    """Cut a 2-D array into overlapping input windows and next-step targets.

    Sample ``k`` consists of rows ``k .. k + sequence_length - 1`` of ``data``
    as the input window, and the value in column ``target_index`` of row
    ``k + sequence_length`` as the single-step prediction target.

    Args:
        sequence_length: Number of consecutive rows in each input window.
            Must be at least 1.
        data: 2-D array of shape (rows, features).
        target_index: Column of ``data`` to predict. When omitted, the
            module-level ``index_Close`` is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        A tuple ``(x, y)`` where ``x`` has shape
        (rows - sequence_length, sequence_length, features) and ``y`` has
        shape (rows - sequence_length,). If ``data`` has no more than
        ``sequence_length`` rows, both are empty but ``x`` still has rank 3.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")
    if target_index is None:
        target_index = index_Close

    data = np.asarray(data)
    n_samples = data.shape[0] - sequence_length
    if n_samples <= 0:
        # Not enough rows for even one window plus its target: return empty
        # arrays that still have the rank downstream code expects.
        empty_x = np.empty((0, sequence_length, data.shape[1]), dtype=data.dtype)
        return empty_x, np.empty((0,), dtype=data.dtype)

    # The last row can only ever be a target, never part of an input window,
    # so windows are taken over data[:-1]. sliding_window_view yields shape
    # (n_samples, features, sequence_length); swap the last two axes to get
    # (n_samples, sequence_length, features). Requires NumPy >= 1.20.
    windows = np.lib.stride_tricks.sliding_window_view(
        data[:-1], sequence_length, axis=0
    )
    # Copy into a fresh contiguous array so the result is writable and
    # independent of `data`, as a list-built array would be.
    x = np.ascontiguousarray(windows.transpose(0, 2, 1))
    y = data[sequence_length:, target_index].copy()
    return x, y


In [17]:
# Window the training and test matrices into (inputs, targets) pairs
x_train, y_train = partition_dataset(sequence_length=sequence_length, data=train_data)
x_test, y_test = partition_dataset(sequence_length=sequence_length, data=test_data)

In [18]:
# Show the shapes as "(samples, window length, features) (targets,)",
# first for the training split and then for the test split
print(f"{x_train.shape} {y_train.shape}")
print(f"{x_test.shape} {y_test.shape}")

(2890966, 50, 7) (2890966,)
(722753, 50, 7) (722753,)


In [19]:
# Sanity check of the window/target alignment: the target column in the last
# row of the second input window must be the same value as the first target,
# so the two printed numbers should match
print(x_train[1, sequence_length - 1, index_Close])
print(y_train[0])

4.375133305987339e-05
4.375133305987339e-05


In [20]:
# Configure the neural network model: start from an empty Sequential
# container; layers are appended to it in the following cell
model = Sequential()

In [21]:
# Size both LSTM layers as (time steps per window) x (features per time step)
n_neurons = math.prod(x_train.shape[1:3])
print(n_neurons, *x_train.shape[1:3])
# Two stacked LSTM layers: the first returns its full output sequence so the
# second can consume it; the second returns only its final output, which
# feeds a small dense head ending in a single regression output.
model.add(LSTM(units=n_neurons, return_sequences=True))
model.add(LSTM(units=n_neurons, return_sequences=False))
model.add(Dense(units=5))
model.add(Dense(units=1))

350 50 7


In [22]:
# Compile the model: Adam optimizer, mean squared error as the training loss
model.compile(optimizer='adam', loss='mse')

In [24]:
# Training the model
# epochs is kept small for quick experiments; a full run would use more
# (the notebook previously used 50).
epochs = 3
batch_size = 16
# Stop early when the loss on the validation data stops improving for
# `patience` consecutive epochs. Validation data is passed to fit() below,
# so 'val_loss' is available; monitoring the training 'loss' instead would
# not react to overfitting.
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(x_test, y_test),
                    callbacks=[early_stop])


Epoch 1/3


TypeError: 'NoneType' object is not callable

In [None]:
# Plot training & validation loss values
#fig, ax = plt.subplots(figsize=(20, 10), sharex=True)
#plt.plot(history.history["loss"])
#plt.title("Model loss")
#plt.ylabel("Loss")
#plt.xlabel("Epoch")
#ax.xaxis.set_major_locator(plt.MaxNLocator(epochs))
#plt.legend(["Train", "Test"], loc="upper left")
#plt.grid()
#plt.show()

In [None]:
# Get the predicted values for every test window; these are still in the
# scaled space the model was trained on
y_pred_scaled = model.predict(x_test)


In [None]:
# Map predictions and true targets back to original units with the
# single-column scaler; the 1-D targets are first turned into a column vector
y_pred = scaler_pred.inverse_transform(y_pred_scaled)
y_test_unscaled = scaler_pred.inverse_transform(y_test[:, np.newaxis])

In [None]:
# Mean Absolute Error (MAE): average absolute gap between actual and
# predicted values. Computed with NumPy because the sklearn.metrics import
# is commented out in the imports cell, so mean_absolute_error is undefined.
MAE = np.mean(np.abs(y_test_unscaled - y_pred))
print(f'Mean Absolute Error (MAE): {np.round(MAE, 2)}')

In [None]:
# Mean Absolute Percentage Error (MAPE): mean of |actual - predicted| / actual,
# expressed as a percentage
MAPE = 100 * np.mean(np.abs((y_test_unscaled - y_pred) / y_test_unscaled))
print(f'Mean Absolute Percentage Error (MAPE): {np.round(MAPE, 2)} %')

In [None]:
# Median Absolute Percentage Error (MDAPE): median of |actual - predicted| / actual,
# expressed as a percentage
MDAPE = 100 * np.median(np.abs((y_test_unscaled - y_pred) / y_test_unscaled))
print(f'Median Absolute Percentage Error (MDAPE): {np.round(MDAPE, 2)} %')

In [None]:
# Add the date column.
# The original cell assigned an undefined name (`date_index`); the dates are
# derived here from the frame's Timestamp index instead.
data_filtered_sub = data_filtered.copy()
# NOTE(review): assumes the Timestamp index holds Unix epoch seconds - confirm
# against the data source before relying on the plotted dates.
data_filtered_sub['Date'] = pd.to_datetime(data_filtered_sub.index, unit='s')

In [None]:
# Split the dated frame at the train/test boundary and attach the predictions
# and their error to the validation part.
train = data_filtered_sub[:train_data_len + 1]
# Copy the slice so the column inserts below act on an independent frame
# rather than on a view of data_filtered_sub.
valid = data_filtered_sub[train_data_len:].copy()
valid.insert(1, "Prediction", y_pred.ravel(), True)
print(valid)
# The predicted quantity is the Weighted_Price column; the frame has no
# "price_in_usd" column, so that lookup raised a KeyError.
print(valid["Weighted_Price"])
# Difference = predicted minus actual (positive means the model overshot)
valid.insert(1, "Difference", valid["Prediction"] - valid["Weighted_Price"], True)

In [None]:
# # Zoom in to a closer timeframe
# valid = valid[valid['Date'] > display_start_date]
# train = train[train['Date'] > display_start_date]


In [None]:
# Visualize the data: training prices, then actual vs. predicted prices over
# the validation range. The price column is Weighted_Price; the frames have
# no "price_in_usd" column, so that lookup raised a KeyError.
fig, ax1 = plt.subplots(figsize=(22, 10), sharex=True)
xt = train['Date']; yt = train[["Weighted_Price"]]
xv = valid['Date']; yv = valid[["Weighted_Price", "Prediction"]]
plt.title("Predictions vs Actual Values", fontsize=20)
plt.ylabel("price in usd", fontsize=18)
# Plot order must match the legend order below
plt.plot(xt, yt, color="#039dfc", linewidth=2.0)
plt.plot(xv, yv["Prediction"], color="#E91D9E", linewidth=2.0)
plt.plot(xv, yv["Weighted_Price"], color="black", linewidth=2.0)
plt.legend(["Train", "Test Predictions", "Actual Values"], loc="upper left")
plt.show()

In [None]:
# Inputs for the difference bar plot: dates on the x axis, prediction error
# on the y axis
x, y = valid['Date'], valid["Difference"]

In [None]:
# Create custom color range for positive and negative differences:
# one color for rows where the difference is >= 0, another where it is < 0.
# Rows matching neither mask (e.g. NaN differences) get no color assigned.
valid.loc[y >= 0, 'diff_color'] = "#2BC97A"
valid.loc[y < 0, 'diff_color'] = "#C92B2B"

# Bar chart of the differences, one bar per x value, colored by sign
plt.bar(x, y, width=0.8, color=valid['diff_color'])
plt.grid()
plt.show()