In [1]:
import numpy as np
import pandas as pd

import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

#model
# from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

from modules.optimize import *
from modules.utils import preprocess, train_test_split_by_time, clean_dataset
from modules.visualization import candle_stick_chart, plot_monthly_open_close_comparison
from modules.visualization import plot_monthly_high_low_comparison, plot_ohlc_price_chart, prediction_visualization
from modules.financial_features import feature_engineering, calculate_feature_importance, get_top_features, hyperparameter_tuning
from modules.prediction_pipeline import prediction_model, models, train_mse, test_mse, train_r2, test_r2, train_rmse, test_rmse

In [2]:
# Read the semicolon-separated ETH/USDT export and run it through the project's
# preprocess() helper in one step, so `df` is only ever bound to the cleaned frame.
df = preprocess(pd.read_csv('data/ETHUSDT_data_new.csv', sep=';'))

In [3]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,datetime,open,high,low,close,volume
0,2021-01-01 00:00:00,737.18,740.0,730.44,731.64,46772.61
1,2021-01-01 00:15:00,731.7,732.99,730.0,732.36,20375.178
2,2021-01-01 00:30:00,732.36,735.1,732.21,734.18,14593.525
3,2021-01-01 00:45:00,734.18,736.35,733.04,734.6,16351.214
4,2021-01-01 01:00:00,734.61,744.49,734.0,744.47,42580.2


In [4]:
# Preview the last rows to see where the loaded data ends.
df.tail()

Unnamed: 0,datetime,open,high,low,close,volume
34589,2021-12-28 23:00:00,3808.65,3814.98,3793.3,3807.11,9375.533
34590,2021-12-28 23:15:00,3807.11,3817.56,3805.72,3805.72,6684.468
34591,2021-12-28 23:30:00,3811.25,3816.14,3801.26,3807.87,6094.621
34592,2021-12-28 23:45:00,3807.87,3809.02,3788.94,3791.99,7712.899
34593,2021-12-29 00:00:00,3792.0,3819.0,3783.0,3817.27,17594.83


In [5]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34594 entries, 0 to 34593
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  34594 non-null  datetime64[ns]
 1   open      34594 non-null  float64       
 2   high      34594 non-null  float64       
 3   low       34594 non-null  float64       
 4   close     34594 non-null  float64       
 5   volume    34594 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 1.8 MB


In [6]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,open,high,low,close,volume
count,34594.0,34594.0,34594.0,34594.0,34594.0
mean,2762.592846,2773.191161,2751.622746,2762.314601,32029.641588
std,1028.582494,1030.536771,1026.578592,1028.007699,31747.076555
min,716.67,721.9,714.55,716.65,0.0
25%,1879.5075,1888.31,1871.9625,1880.165,13471.88525
50%,2592.485,2605.92,2578.55,2598.925,22901.244
75%,3630.125,3640.93,3616.34,3638.0575,38940.07875
max,4852.33,4877.54,4847.0,4852.33,579046.3


In [7]:
# Count missing cells across the whole frame (every column, every row).
null_count = df.isna().to_numpy().sum()
print(f'Null Values: {null_count}')

Null Values: 0


### EDA

In [8]:
# Candlestick chart is intentionally disabled (left commented out).
# NOTE(review): the disabled call passes `data`, which is not defined in any visible
# cell; presumably `df` was intended — confirm before re-enabling.
# fig_candle_stick = candle_stick_chart(data)
# fig_candle_stick.show()

In [9]:
# Build the figure returned by plot_monthly_open_close_comparison() for the full
# preprocessed frame, then display it.
fig_monthly_open_close = plot_monthly_open_close_comparison(df)
fig_monthly_open_close.show()

In [10]:
# Build the figure returned by plot_monthly_high_low_comparison() for the full
# preprocessed frame, then display it.
fig_monthly_high_low = plot_monthly_high_low_comparison(df)
fig_monthly_high_low.show()

In [11]:
# Build the figure returned by plot_ohlc_price_chart() for the full preprocessed
# frame, then display it.
fig_plot_ohlc_price_chart = plot_ohlc_price_chart(df)
fig_plot_ohlc_price_chart.show()

### Train Test Split

In [12]:

# Chronological split boundaries, one per line so they are easy to tune.
# NOTE(review): train_end and test_start are one day apart; whether the bars in
# between are dropped depends on how train_test_split_by_time() slices — confirm.
train_start = '2021-01-02 00:00:00'
train_end = '2021-09-30 00:00:00'
test_start = '2021-10-01 00:00:00'
test_end = '2021-12-29 00:00:00'

# Not read again in the visible cells before the prediction loop rebinds it.
prediction_interval = 1

train_df, test_df = train_test_split_by_time(df, train_start, train_end, test_start, test_end)

In [13]:
# Peek at the first rows of the training split.
train_df.head()

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-02 00:00:00,729.7,730.95,724.64,728.5,33251.746
2021-01-02 00:15:00,728.5,728.51,717.0,718.52,67917.55
2021-01-02 00:30:00,718.51,723.9,717.0,722.9,34604.24
2021-01-02 00:45:00,722.99,723.2,719.61,720.8,16793.537
2021-01-02 01:00:00,720.81,721.9,715.6,716.65,26311.502


# Feature Engineering

## Optimizing Trading Indicators

In this code snippet, we are optimizing trading indicators for a financial dataset using various time periods.

### Exponential Moving Average (EMA) Optimization

We optimize two EMA indicators: one with a longer time period and another with a shorter time period. The search is restricted to 10–20 periods for the long EMA and 5–9 periods for the short EMA.

- `opt_long_ema`: Optimized long EMA period.
- `opt_short_ema`: Optimized short EMA period.

### Simple Moving Average (SMA) Optimization

Similar to EMA, we optimize two SMA indicators with different time periods.

- `opt_long_sma`: Optimized long SMA period.
- `opt_short_sma`: Optimized short SMA period.

### Relative Strength Index (RSI) Optimization

We optimize two RSI periods independently: a long period searched over 10–20 and a short period searched over 5–9.

- `opt_long_rsi`: Optimized long RSI period.
- `opt_short_rsi`: Optimized short RSI period.

### Rate of Change (ROC) Optimization

ROC indicators are optimized similarly to RSI.

- `opt_long_roc`: Optimized long ROC period.
- `opt_short_roc`: Optimized short ROC period.

These optimizations aim to find the most suitable time periods for these technical indicators based on historical price data. The resulting optimized parameters can be used in trading strategies and analysis.


In [14]:
# Training-window close prices on a positional (0..n-1) index, built once and
# shared by every search below. Only the training split is used for tuning.
train_close = train_df['close'].reset_index(drop=True)

# Trend indicators: one call searches the long and the short period together.
trend_search = dict(price=train_close, min_long=10, max_long=20, min_short=5, max_short=9)
opt_long_ema, opt_short_ema = optimize_trend(TI='EMA', **trend_search)
opt_long_sma, opt_short_sma = optimize_trend(TI='SMA', **trend_search)

# Oscillators: the long and the short period are searched in separate calls.
# (`optmizer_oscillator` is spelled this way where it is defined; it is not
# renamed here.)
long_search = dict(price=train_close, min_period=10, max_period=20)
short_search = dict(price=train_close, min_period=5, max_period=9)

opt_long_rsi = optmizer_oscillator(TI='RSI', **long_search)
opt_short_rsi = optmizer_oscillator(TI='RSI', **short_search)
opt_long_roc = optmizer_oscillator(TI='ROC', **long_search)
opt_short_roc = optmizer_oscillator(TI='ROC', **short_search)

In [15]:
# One report line per indicator: (label, tuned long period, tuned short period).
period_report = (
    ('EMA', opt_long_ema, opt_short_ema),
    ('SMA', opt_long_sma, opt_short_sma),
    ('RSI', opt_long_rsi, opt_short_rsi),
    ('ROC', opt_long_roc, opt_short_roc),
)
for indicator, long_period, short_period in period_report:
    print(f"Optimize Long {indicator} Timeperiod : {long_period} Optimize Short {indicator} Timeperiod : {short_period}")

Optimize Long EMA Timeperiod : 19 Optimize Short EMA Timeperiod : 8
Optimize Long SMA Timeperiod : 11 Optimize Short SMA Timeperiod : 8
Optimize Long RSI Timeperiod : 11 Optimize Short RSI Timeperiod : 6
Optimize Long ROC Timeperiod : 11 Optimize Short ROC Timeperiod : 7


In [16]:
# Stack the train and test splits back into a single frame so every indicator is
# computed over the whole close-price column in one pass.
train_test_df = pd.concat([train_df, test_df], axis=0)
close_prices = train_test_df['close']

# (short period, long period) per indicator, using the periods tuned above.
indicator_periods = {
    'EMA': (opt_short_ema, opt_long_ema),
    'SMA': (opt_short_sma, opt_long_sma),
    'RSI': (opt_short_rsi, opt_long_rsi),
    'ROC': (opt_short_roc, opt_long_roc),
}

# Each feature is the short-period reading minus the long-period reading.
# NOTE(review): `tb` is not imported explicitly in the visible cells; presumably it
# is exposed by `from modules.optimize import *` — confirm.
for indicator, (short_period, long_period) in indicator_periods.items():
    compute_indicator = getattr(tb, indicator)
    train_test_df[f"{indicator.lower()}_diff"] = (
        compute_indicator(close_prices, timeperiod=short_period)
        - compute_indicator(close_prices, timeperiod=long_period)
    )


### Prediction

## Model Selection

After evaluating the models, we selected the **RandomForestRegressor** as the best-performing model based on its R-squared (R2) score, which indicates a strong predictive performance on the test data.

We will proceed with further analysis and predictions using the RandomForestRegressor model as our chosen model.

This model selection process ensures that we are using the most suitable model for our specific dataset and problem.


In [17]:
# Forecast horizons in 15-minute bars, paired position-by-position with the label
# used in model names and output file names (2880 bars = 30 days).
# Horizons are negative; presumably feature_engineering() uses them as a shift
# offset to build the look-ahead target — TODO confirm in modules.financial_features.
prediction_intervals = [-1*1, -1*2, -1*4, -1*96, -1*2880]
intervals = ['15mins', '30mins', '1hour', '1day', '30days']

# Output folders
model_eval_folder = 'model_evaluation'
prediction_folder = 'model_predictions'

# Fixed seed so the random forest, and therefore the saved predictions and
# metrics, are reproducible across runs.
RANDOM_STATE = 42

# Create both output folders once, up front; exist_ok makes re-runs a no-op.
for output_folder in (model_eval_folder, prediction_folder):
    os.makedirs(output_folder, exist_ok=True)

# NOTE(review): models / train_mse / test_mse / ... are imported from
# modules.prediction_pipeline and are presumably appended to by prediction_model();
# if so, re-running this cell adds duplicate rows to the evaluation table — confirm.
for prediction_interval, interval_label in tqdm(list(zip(prediction_intervals, intervals))):
    # Start every horizon from a fresh copy of the indicator frame. Previously the
    # loop reassigned train_test_df, so each horizon was engineered and cleaned on
    # top of the previous horizon's output (extra columns and dropped rows
    # accumulated) and the cell gave different results when re-run.
    interval_df = feature_engineering(train_test_df.copy(), prediction_interval=prediction_interval)
    interval_df = clean_dataset(interval_df)

    # train_df / test_df are rebound here to the frames returned by
    # prediction_model(), exactly as in the original cell.
    train_df, test_df = prediction_model(interval_df,
                                         RandomForestRegressor(random_state=RANDOM_STATE),
                                         f'RandomForestRegressor_{interval_label}',
                                         train_end=train_end,
                                         test_start=test_start)

    final_df = pd.concat([train_df, test_df], axis=0)

    # Same file names as before: predictions_15mins.csv ... predictions_30days.csv
    predictions_file_name = f'predictions_{interval_label}.csv'
    model_prediction_file_path = os.path.join(prediction_folder, predictions_file_name)
    final_df.to_csv(model_prediction_file_path)

  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [03:11<12:46, 191.68s/it]



 40%|████      | 2/5 [05:04<07:15, 145.31s/it]



 60%|██████    | 3/5 [06:46<04:11, 125.65s/it]



 80%|████████  | 4/5 [07:59<01:44, 104.75s/it]



100%|██████████| 5/5 [09:02<00:00, 108.57s/it]


In [18]:
# Assemble the evaluation table column by column from the metric sequences
# imported from modules.prediction_pipeline.
metric_columns = {
    'Models': models,
    'train_mse': train_mse,
    'test_mse': test_mse,
    'train_r2': train_r2,
    'test_r2': test_r2,
    'train_rmse': train_rmse,
    'test_rmse': test_rmse,
}
model_eval = pd.DataFrame(metric_columns)

# Write the table, without the index column, into the evaluation folder.
model_eval_file_path = os.path.join(model_eval_folder, 'model_eval.csv')
model_eval.to_csv(model_eval_file_path, index=False)