In [25]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import r2_score
import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


In [26]:
# Loading the data
df = pd.read_csv('../data/BitCoin.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.drop(columns='Unnamed: 0', inplace=True)

In [27]:
# Calculating the moving averages
df['50-day MA'] = df['Close'].rolling(window=50).mean()
df['100-day MA'] = df['Close'].rolling(window=100).mean()
df['200-day MA'] = df['Close'].rolling(window=200).mean()

# Calculating the exponential smoothing
df['50-day ES'] = df['Close'].ewm(span=50, adjust=False).mean()
df['100-day ES'] = df['Close'].ewm(span=100, adjust=False).mean()

# Calculating the relative strength index (RSI)
df['RSI'] = df['Close'].rolling(window=14).apply(lambda x: 100 - (100 / (1 + x.iloc[-1] / x.iloc[-14])))

# Calculating the Bollinger Bands
df['20-day BB'] = df['Close'].rolling(window=20).mean()
df['50-day BB'] = df['Close'].rolling(window=50).mean()

# Calculating the momentum
df['10-day Momentum'] = df['Close'].rolling(window=10).apply(lambda x: x.iloc[-1] / x.iloc[-10])
df['20-day Momentum'] = df['Close'].rolling(window=20).apply(lambda x: x.iloc[-1] / x.iloc[-20])

# Calculating the volume
df['Daily Volume'] = df['Volume'].rolling(window=1).sum()
df['Weekly Volume'] = df['Volume'].rolling(window=7).sum()

In [28]:
# Log-Transforming the ata
df['log_open'] = np.log(df['Open'])
df['log_high'] = np.log(df['High'])
df['log_low'] = np.log(df['Low'])
df['log_close'] = np.log(df['Close'])
df['log_volume'] = np.log(df['Volume'])

In [29]:
# Replacing Na values with 0
df = df.fillna(0)
# df.isnull().sum()

In [30]:
# Dropping Leaky columns
df = df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap' ])

In [31]:
# Preprocess your data
scaler = MinMaxScaler()
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d') 
df['Date'] = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').timestamp())
df_scaled = scaler.fit_transform(df)

In [32]:

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_scaled[:, :-1], df_scaled[:, -1], test_size=0.2, random_state=42)

# Define your model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

# Train your model
early_stopping = EarlyStopping(patience=5)
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate your model
mse = model.evaluate(X_test, y_test)
mae = mean_absolute_error(y_test, model.predict(X_test))
mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
r2 = r2_score(y_test, model.predict(X_test))
print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}')
print(f'R-squared: {r2}')

  super().__init__(**kwargs)


Epoch 1/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 0.0793 - val_loss: 0.0127
Epoch 2/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0120 - val_loss: 0.0085
Epoch 3/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0093 - val_loss: 0.0080
Epoch 4/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0078 - val_loss: 0.0078
Epoch 5/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0083 - val_loss: 0.0076
Epoch 6/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0080 - val_loss: 0.0074
Epoch 7/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0083 - val_loss: 0.0072
Epoch 8/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0078 - val_loss: 0.0072
Epoch 9/100
[1m41/41[0m [32m━━━━━━━━━

In [33]:
# Defining the hyperparameter grid
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

In [34]:
# Defining the model
model = RandomForestRegressor()

In [35]:
# Performing grid search
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Performing the RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=10)
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Grid hyperparameters:", grid_search.best_params_)
print("Best Randomized hyperparameters:", random_search.best_params_)

Best Grid hyperparameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best Randomized hyperparameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}


In [36]:
# Evaluating the models with the best hyperparameters
print('GridSearchCV:')
y_pred = grid_search.best_estimator_.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}')
print(f'R-squared: {r2}')


print('\n RandomizedSearchCV:')
y_pred = random_search.best_estimator_.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}')
print(f'R-squared: {r2}')



GridSearchCV:
MSE: 9.268885546595941e-07
MAE: 0.0005751812639936512
MAPE: 0.0017176024950517417
R-squared: 0.9999769871822555

 RandomizedSearchCV:
MSE: 9.120196399120238e-07
MAE: 0.0005768560492598014
MAPE: 0.0017896913644868063
R-squared: 0.9999773563481313


In [37]:
# Performing cross-validation on training model
train_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Performing cross-validation on testing model
test_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='neg_mean_squared_error')

# Evaluating the scores
print("Cross-validation Train scores:", train_scores)
print("Cross-validation Train Mean:", train_scores.mean())
print("Cross-validation Test scores:", test_scores)
print("Cross-validation Test Mean:", test_scores.mean())

Cross-validation Train scores: [-2.56971824e-06 -7.21203213e-06 -7.67052093e-06 -1.94701307e-06
 -2.70973925e-06]
Cross-validation Train Mean: -4.42180472328542e-06
Cross-validation Test scores: [-1.98785493e-04 -2.64393494e-05 -6.10579784e-05 -5.33988641e-05
 -5.25105371e-05]
Cross-validation Test Mean: -7.843844439157207e-05


In [None]:
sns.set()
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('With high correlations')
plt.show()