In [29]:
import yfinance as yf
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
from pandas import read_csv
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import hvplot.pandas

In [2]:
# Fetch ETH-USD price history from Yahoo Finance for the fixed
# 2020-04-08 .. 2022-04-08 window and snapshot the raw download to CSV.
ethdata = yf.download(tickers="ETH-USD", start="2020-04-08", end="2022-04-08")
eth_df = pd.DataFrame(data=ethdata)
eth_df.to_csv("eth_data.csv")

[*********************100%***********************]  1 of 1 completed


In [3]:
# Preview the first five rows of the raw download.
eth_df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-08,165.240326,173.210266,164.493195,172.641739,172.641739,17063110836
2020-04-09,172.761261,172.897781,167.987122,170.807144,170.807144,14901696210
2020-04-10,170.829269,170.949768,154.914474,158.412445,158.412445,17980944616
2020-04-11,158.538986,161.167572,155.29834,158.216019,158.216019,13555089447
2020-04-12,158.232391,164.516953,156.320511,161.142426,161.142426,15123721386


In [4]:
# Keep only the closing price, as a single-column DataFrame, for plotting.
eth_close = eth_df.loc[:, ['Close']]
eth_close.head(n=5)

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2020-04-08,172.641739
2020-04-09,170.807144
2020-04-10,158.412445
2020-04-11,158.216019
2020-04-12,161.142426


In [5]:
# Plot the closing-price column.
close_plot_opts = {'title': 'ETH-USD', 'figsize': (16, 8)}
eth_close.hvplot(**close_plot_opts)

In [6]:
# Keep only the Open/High/Low/Close price columns for modelling.
# 'Adj Close' adjusts for dividend payouts, which ETH does not have (it is a
# stock-oriented field), and 'Volume' is left out of the feature set.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# skips columns that are already absent, so re-running this cell (or receiving
# a download without 'Adj Close') no longer raises KeyError.
eth_df = eth_df.drop(columns=['Adj Close', 'Volume'], errors='ignore')
eth_df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-08,165.240326,173.210266,164.493195,172.641739
2020-04-09,172.761261,172.897781,167.987122,170.807144
2020-04-10,170.829269,170.949768,154.914474,158.412445
2020-04-11,158.538986,161.167572,155.29834,158.216019
2020-04-12,158.232391,164.516953,156.320511,161.142426


In [7]:
# Feature matrix: every column of eth_df except the target ('Close').
# drop() hands back a new frame, so eth_df itself is left untouched.
# NOTE(review): the remaining columns include the same day's High/Low —
# confirm these are meant to be known when the Close is predicted.
X = eth_df.drop(columns="Close")
X.tail(n=5)

Unnamed: 0_level_0,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-04-04,3522.36499,3535.148193,3422.000977
2022-04-05,3521.239746,3546.706787,3410.547607
2022-04-06,3411.672119,3411.672119,3171.205078
2022-04-07,3172.197266,3263.474609,3155.581055
2022-04-08,3233.272461,3301.607422,3179.142334


In [8]:
# Target vector: the closing price as a 1-D NumPy array.
# Series.to_numpy() replaces Series.ravel(), which newer pandas releases
# deprecate (the underlying array is already one-dimensional).
y = eth_df["Close"].to_numpy()
y[:5]

array([172.64173889, 170.80714417, 158.41244507, 158.21601868,
       161.14242554])

In [9]:
# Hold out 20% of the rows for evaluation; random_state=0 makes the split
# repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test days are
# interleaved with training days — confirm that is acceptable for this
# time-ordered data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Number of rows that ended up in the training split.
X_train.shape[0]

584

In [11]:
# Number of rows in the held-out split; should be about 20% of the data.
X_test.shape[0]

147

In [12]:
# Fit a StandardScaler on the training features only. fit() returns the
# scaler itself, so `scaler` and `X_scaler` name the same fitted object.
X_scaler = scaler = StandardScaler().fit(X_train)

In [13]:
# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [14]:
# Base random-forest regressor with default settings; it is passed as the
# estimator to the randomized hyperparameter search below.
model = RandomForestRegressor()

In [15]:
# Hyperparameter search space for RandomizedSearchCV. Randomized search is
# preferred over GridSearchCV when each fit is computationally expensive or
# the number of parameter combinations is large.
param_grid = {
    'n_estimators': [50, 100, 500, 1000],
    'max_depth': [1, 5, 10, 15],
    'min_samples_split': [2, 4, 6, 8],
    # Previously np.arange(1, 2, 3, dtype=int), i.e. start=1, stop=2, step=3,
    # which produces only [1]. List the three leaf sizes explicitly so all of
    # them are actually searched.
    'min_samples_leaf': [1, 2, 3],
    'bootstrap': [True, False],
    # NOTE(review): the seed is searched like a hyperparameter here; consider
    # fixing one seed on the estimator instead.
    'random_state': [0, 1, 2, 3, 4],
}

In [16]:
# Randomized search with 3-fold cross-validation: 200 parameter settings are
# sampled from param_grid and each is fitted on every fold, using all CPU
# cores (n_jobs=-1).
# random_state pins which candidates get sampled, so the search (and therefore
# best_param) is reproducible across kernel restarts.
random_search_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=200,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
rscv_fit = random_search_cv.fit(X_train_scaled, y_train)
best_param = rscv_fit.best_params_
print(best_param)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
{'random_state': 3, 'n_estimators': 500, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}


In [17]:
# Refit a forest on the full training split using the best hyperparameters
# found by the randomized search, then predict the held-out rows.
# The settings are taken from best_param rather than hand-copied from the
# printed search output, so this cell cannot drift out of sync when the search
# is re-run.
model = RandomForestRegressor(**best_param)
model.fit(X_train_scaled, y_train)
predict = model.predict(X_test_scaled)
print(predict)
print(predict.shape)

[ 385.05787091  376.04948285  173.95547869  212.26003993 3243.39318017
 1956.57555793 1327.62154105 2617.62867064 1933.02022146  206.41189656
  212.85639027 2114.05724338  240.79813041  382.63453715  375.42768038
  349.05357045 3861.00034639 3109.56624233 2269.01456502  232.400686
 4055.14294512 3230.76032217 2980.21788904  375.14064424 2784.78295309
 4401.0387003   402.12605113  605.19610531 3701.14613618 3392.98824435
  163.51838927  604.49535515 3291.28124646  357.47883548 1826.94947961
 3401.14240746 4265.53125592 3489.02181536  237.33583013 1571.78160592
 2307.24426418  171.97769851 1113.06775277 2641.19645066 4541.75171165
 3700.28244517 2109.99239838  564.11887133 4046.90791341  227.85876738
 3847.41522891 4406.08674787 2863.18149256 3377.11440386  450.78009318
 1555.72080206 1694.29609907 2509.76658449 4528.96523446 2196.36127621
 2247.29867173 2821.4567667  4031.6481305  4010.54583644 3251.99602448
  583.34417324 1136.80119408 2544.41275401 3599.95391633 3028.08484353
 3236.57

In [18]:
# Evaluate the fitted forest on the held-out split.
mae = metrics.mean_absolute_error(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)  # computed once, reused for RMSE
r2 = metrics.r2_score(y_test, predict)
print('Mean Absolute Error:', round(mae, 4))
print('Mean Squared Error:', round(mse, 4))
print('Root Mean Squared Error:', round(np.sqrt(mse), 4))
print('(R^2) Score:', round(r2, 4))

# score() is evaluated on both splits so the two can be compared for overfitting.
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)
# Label corrected: the estimator is a random *forest* regressor.
print(f'Train Score : {train_score * 100:.2f}% and Test Score : {test_score * 100:.2f}% using Random Forest Regressor.')

# "Accuracy" here is 100 minus the mean absolute percentage error.
errors = abs(predict - y_test)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 47.9203
Mean Squared Error: 7081.8254
Root Mean Squared Error: 84.1536
(R^2) Score: 0.9967
Train Score : 99.96% and Test Score : 99.67% using Random Tree Regressor.
Accuracy: 97.68 %.


In [19]:
# Collect the held-out predictions in a DataFrame and write them to CSV.
# `predict` holds one value per row of X_test, in the same order, so each
# prediction is labelled with the date of the row it was made for and the
# table is then sorted chronologically.
# The earlier version attached a synthetic run of consecutive days starting at
# the last observed date, which presented predictions for shuffled historical
# rows as if they were forecasts of future days.
price_predictions = pd.DataFrame({'Predictions': predict}, index=X_test.index).sort_index()
price_predictions.to_csv('ETH_Price_Prediction_Date.csv')

In [20]:
# One-week slice: the first 7 rows of the prediction table, saved to CSV.
one_week_prediction = price_predictions.iloc[:7]
one_week_prediction.to_csv('One_Week_ETH_Price_Prediction.csv')

In [21]:
# Reload the saved one-week slice and plot it.
# index_col=0 with parse_dates=True restores the date index written by
# to_csv, and the reloaded frame is the one that gets plotted (previously it
# was read but never used).
weekly_prediction = pd.read_csv('One_Week_ETH_Price_Prediction.csv', index_col=0, parse_dates=True)
weekly_prediction.hvplot(title='1 Week ETH-USD Price Prediction', ylabel='USD', figsize=(16,8))

In [22]:
# One-month slice: the first 30 rows of the prediction table, saved to CSV.
one_month_prediction = price_predictions.iloc[:30]
one_month_prediction.to_csv('One_Month_ETH_Price_Prediction.csv')

In [23]:
# Reload the saved one-month slice and plot it.
# index_col=0 with parse_dates=True restores the date index written by
# to_csv; without them the dates come back as an ordinary text column and the
# x-axis is just the row number.
monthly_prediction = pd.read_csv('One_Month_ETH_Price_Prediction.csv', index_col=0, parse_dates=True)
monthly_prediction.hvplot(title='1 Month ETH-USD Price Prediction', ylabel='USD', figsize=(16,8))

In [24]:
# Three-month slice: the first 91 rows of the prediction table, saved to CSV.
three_month_prediction = price_predictions.iloc[:91]
three_month_prediction.to_csv('Three_Month_ETH_Price_Prediction.csv')

In [25]:
# Reload the saved three-month slice and plot it.
# index_col=0 with parse_dates=True restores the date index written by
# to_csv; without them the dates come back as an ordinary text column and the
# x-axis is just the row number.
quarterly_prediction = pd.read_csv('Three_Month_ETH_Price_Prediction.csv', index_col=0, parse_dates=True)
quarterly_prediction.hvplot(title='Quarterly ETH-USD Price Prediction', ylabel='USD', figsize=(16,8))

In [26]:
# Six-month slice: the first 182 rows of the prediction table (fewer if the
# table is shorter), saved to CSV.
six_month_prediction = price_predictions.iloc[:182]
six_month_prediction.to_csv('Six_Month_ETH_Price_Prediction.csv')

In [27]:
# Reload the saved six-month slice and plot it.
# index_col=0 with parse_dates=True restores the date index written by
# to_csv; without them the dates come back as an ordinary text column and the
# x-axis is just the row number.
half_year_prediction = pd.read_csv('Six_Month_ETH_Price_Prediction.csv', index_col=0, parse_dates=True)
half_year_prediction.hvplot(title='Half Year ETH-USD Price Prediction', ylabel='USD', figsize=(16,8))