In [1]:
import sys
sys.path.insert(0, sys.path[0].removesuffix('/src/jupyter_nb'))
from pycaret.classification import *
from src.utils import *
from src.calcEMA import *
from src.myenv import *
import plotly.express as px


In [2]:
# Variables
# ETCUSDT BTCUSDT
# symbol = 'ETHUSDT'
symbol = 'BTCUSDT'
# lightgbm  xgboost
estimator = 'xgboost'
_compare_models = False

start_train_date = '2010-01-01'  # train < and test >=
start_test_date = '2023-01-01'  # train < and test >=

stop_loss = 2.0
label = 'status'
regression_times = 24 * 30 * 2  # horas
regression_profit_and_loss = 24
# numeric_features=['open', 'high', 'low', 'volume', 'close', 'rsi']
numeric_features = ['open', 'high', 'low', 'volume', 'close', ]
_calc_rsi = True

### Metadata

<code>
Field Name - Description</br>
open_time - Kline Open time in unix time format</br>
open - Open Price</br>
high - High Price</br>
low	- Low Price</br>
close	- Close Price</br>
volume - Volume</br>
close_time - Kline Close time in unix time format</br>
quote_volume - Quote Asset Volume</br>
count	- Number of Trades</br>
taker_buy_volume - Taker buy base asset volume during this period</br>
taker_buy_quote_volume - Taker buy quote asset volume during this period</br>
ignore - Ignore</br>
</code>

In [3]:
use_cols = date_features + numeric_features
print(use_cols)
all_data = read_data(f'{datadir}/{symbol}', all_cols=None, use_cols=use_cols)

# Filter all data
all_data = all_data[(all_data['open_time'] >= start_train_date)].copy()

print(all_data.info())

['open_time', 'open', 'high', 'low', 'volume', 'close']
read_data: Start reading file:  /home/marcelo/des/mg_crypto_trader/src/data/BTCUSDT/BTCUSDT.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 52920 entries, 0 to 52919
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   open_time  52920 non-null  datetime64[ns]
 1   open       52920 non-null  float64       
 2   high       52920 non-null  float64       
 3   low        52920 non-null  float64       
 4   close      52920 non-null  float64       
 5   volume     52920 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 2.8 MB
None


In [4]:
if _calc_rsi:
  all_data = calc_RSI(all_data)
  numeric_features.append('rsi')
  all_data.dropna(inplace=True)
print(all_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52906 entries, 14 to 52919
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   open_time  52906 non-null  datetime64[ns]
 1   open       52906 non-null  float64       
 2   high       52906 non-null  float64       
 3   low        52906 non-null  float64       
 4   close      52906 non-null  float64       
 5   volume     52906 non-null  float64       
 6   rsi        52906 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 3.2 MB
None


In [5]:
all_cols = date_features + numeric_features
print('All Columns: ', all_cols)
all_data[all_cols]
#all_data = all_data[[all_cols]].copy()

All Columns:  ['open_time', 'open', 'high', 'low', 'volume', 'close', 'rsi']


Unnamed: 0,open_time,open,high,low,volume,close,rsi
14,2017-08-17 18:00:00,4289.24,4302.45,4218.68,51.275163,4256.97,44.57
15,2017-08-17 19:00:00,4241.91,4335.30,4200.74,46.710850,4325.23,51.96
16,2017-08-17 20:00:00,4307.56,4354.84,4258.56,48.975472,4346.74,54.05
17,2017-08-17 21:00:00,4346.74,4369.69,4309.23,61.773036,4333.55,52.54
18,2017-08-17 22:00:00,4333.55,4359.13,4310.00,57.376142,4336.80,52.89
...,...,...,...,...,...,...,...
52915,2023-09-04 21:00:00,25836.68,25842.23,25631.21,1856.109350,25677.71,30.14
52916,2023-09-04 22:00:00,25677.72,25798.54,25632.61,839.811150,25775.01,41.63
52917,2023-09-04 23:00:00,25775.02,25874.97,25764.02,1283.818470,25826.02,46.59
52918,2023-09-05 00:00:00,25826.03,25844.48,25737.69,860.375180,25768.49,42.23


In [6]:
def regress_until_diff(data: pd.DataFrame, diff_percent: float, max_regression_profit_and_loss=6):
    data['close_shift_x'] = 0.0
    data['diff_shift_x'] = 0.0
    data['shift_x'] = 0
    data[label] = 'ESTAVEL'
    for row_nu in range(1, data.shape[0]):
        diff = 0
        i = 1

        while (abs(diff) <= diff_percent):
            if (i > max_regression_profit_and_loss) or ((row_nu + i) >= data.shape[0]):
                break

            close = data.iloc[row_nu:row_nu + 1]['close'].values[0]
            close_px = data.iloc[row_nu + i:row_nu + i + 1]['close'].values[0]
            diff = -100 * (close - close_px) / close
            # print(f'ROW_NU: {row_nu} - regresssion_times: {i} - diff: {diff}')
            i += 1

        data['close_shift_x'].iloc[row_nu:row_nu + 1] = close_px
        data['diff_shift_x'].iloc[row_nu:row_nu + 1] = diff
        data['shift_x'].iloc[row_nu:row_nu + 1] = i - 1 if i == max_regression_profit_and_loss + 1 else i

        if diff >= diff_percent:
            data[label].iloc[row_nu:row_nu + 1] = 'SOBE_' + str(diff_percent)
            
        elif diff <= -diff_percent:
            data[label].iloc[row_nu:row_nu + 1] = 'CAI_' + str(diff_percent)

    return data.drop(columns=['close_shift_x', 'diff_shift_x', 'shift_x'])

In [7]:
all_data = regress_until_diff(all_data, stop_loss, regression_profit_and_loss)

all_data

Unnamed: 0,open_time,open,high,low,close,volume,rsi,status
14,2017-08-17 18:00:00,4289.24,4302.45,4218.68,4256.97,51.275163,44.57,ESTAVEL
15,2017-08-17 19:00:00,4241.91,4335.30,4200.74,4325.23,46.710850,51.96,CAI_2.0
16,2017-08-17 20:00:00,4307.56,4354.84,4258.56,4346.74,48.975472,54.05,CAI_2.0
17,2017-08-17 21:00:00,4346.74,4369.69,4309.23,4333.55,61.773036,52.54,CAI_2.0
18,2017-08-17 22:00:00,4333.55,4359.13,4310.00,4336.80,57.376142,52.89,CAI_2.0
...,...,...,...,...,...,...,...,...
52915,2023-09-04 21:00:00,25836.68,25842.23,25631.21,25677.71,1856.109350,30.14,ESTAVEL
52916,2023-09-04 22:00:00,25677.72,25798.54,25632.61,25775.01,839.811150,41.63,ESTAVEL
52917,2023-09-04 23:00:00,25775.02,25874.97,25764.02,25826.02,1283.818470,46.59,ESTAVEL
52918,2023-09-05 00:00:00,25826.03,25844.48,25737.69,25768.49,860.375180,42.23,ESTAVEL


In [8]:
for nf in numeric_features.copy():
    for i in range(1, regression_times + 1):
        col = nf + "_" + str(i)
        all_data[col] = all_data[nf].shift(i)
        numeric_features.append(col)

all_data.dropna(inplace=True)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51466 entries, 1454 to 52919
Columns: 8648 entries, open_time to rsi_1440
dtypes: datetime64[ns](1), float64(8646), object(1)
memory usage: 3.3+ GB


In [9]:
train_data = all_data[(all_data['open_time'] >= start_train_date) & (all_data['open_time'] < start_test_date)]
train_data = train_data.sort_values(date_features)
train_data

Unnamed: 0,open_time,open,high,low,close,volume,rsi,status,open_1,open_2,...,rsi_1431,rsi_1432,rsi_1433,rsi_1434,rsi_1435,rsi_1436,rsi_1437,rsi_1438,rsi_1439,rsi_1440
1454,2017-10-17 00:00:00,5760.00,5774.98,5726.97,5726.98,32.255686,55.90,CAI_2.0,5706.11,5722.99,...,48.89,45.76,42.61,47.17,46.99,52.89,52.54,54.05,51.96,44.57
1455,2017-10-17 01:00:00,5726.98,5745.51,5590.00,5590.00,83.628885,39.64,ESTAVEL,5760.00,5706.11,...,48.35,48.89,45.76,42.61,47.17,46.99,52.89,52.54,54.05,51.96
1456,2017-10-17 02:00:00,5590.02,5648.32,5590.00,5619.98,56.374651,43.51,CAI_2.0,5726.98,5760.00,...,51.66,48.35,48.89,45.76,42.61,47.17,46.99,52.89,52.54,54.05
1457,2017-10-17 03:00:00,5615.12,5620.01,5600.44,5620.01,66.317717,43.51,CAI_2.0,5590.02,5726.98,...,47.32,51.66,48.35,48.89,45.76,42.61,47.17,46.99,52.89,52.54
1458,2017-10-17 04:00:00,5612.02,5659.99,5600.60,5657.01,67.643293,48.26,CAI_2.0,5615.12,5590.02,...,50.07,47.32,51.66,48.35,48.89,45.76,42.61,47.17,46.99,52.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46976,2022-12-31 19:00:00,16577.78,16590.06,16565.10,16570.14,4044.433590,50.10,ESTAVEL,16581.45,16600.93,...,52.77,52.77,52.02,51.13,52.58,49.15,48.64,46.35,48.79,45.83
46977,2022-12-31 20:00:00,16570.14,16574.97,16564.09,16568.60,2622.143550,49.71,ESTAVEL,16577.78,16581.45,...,48.57,52.77,52.77,52.02,51.13,52.58,49.15,48.64,46.35,48.79
46978,2022-12-31 21:00:00,16568.19,16571.64,16544.12,16548.28,3618.773890,44.78,ESTAVEL,16570.14,16577.78,...,47.24,48.57,52.77,52.77,52.02,51.13,52.58,49.15,48.64,46.35
46979,2022-12-31 22:00:00,16548.28,16567.49,16470.00,16520.81,6695.136250,39.13,ESTAVEL,16568.19,16570.14,...,41.07,47.24,48.57,52.77,52.77,52.02,51.13,52.58,49.15,48.64


In [10]:
setup = setup(train_data,
              train_size=0.7,
              target=label,
              numeric_features=numeric_features,
              date_features=['open_time'],
              create_date_columns=["hour", "day", "month"],
              fold_strategy='timeseries',
              fold=3,
              session_id=123,
              normalize=True,
              use_gpu=False,
              verbose=True,
              n_jobs=20,
              )

Unnamed: 0,Description,Value
0,Session id,123
1,Target,status
2,Target type,Multiclass
3,Target mapping,"CAI_2.0: 0, ESTAVEL: 1, SOBE_2.0: 2"
4,Original data shape,"(45527, 8648)"
5,Transformed data shape,"(45527, 8650)"
6,Transformed train set shape,"(31868, 8650)"
7,Transformed test set shape,"(13659, 8650)"
8,Numeric features,8646
9,Date features,1


In [11]:
#Accuracy	AUC	Recall	Prec.	F1	Kappa	MCC
#model_name = 'lightgbm' 'xgboost'
if _compare_models:
  best = setup.compare_models()
  estimator = best.__class__.__name__
else:
  best = setup.create_model(estimator)

best

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6745,0.8441,0.6745,0.6742,0.6743,0.5117,0.5118
1,0.7462,0.8928,0.7462,0.7462,0.7461,0.6191,0.6192
2,0.7822,0.9156,0.7822,0.7824,0.7821,0.6732,0.6734
Mean,0.7343,0.8842,0.7343,0.7343,0.7342,0.6014,0.6015
Std,0.0448,0.0298,0.0448,0.045,0.0448,0.0671,0.0671


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
test_data = all_data[all_data['open_time'] >= start_test_date]
test_data = test_data.sort_values(date_features)

In [13]:
# predict on test set
holdout_pred = predict_model(best)
print(holdout_pred['prediction_score'].mean())

holdout_pred[[label, 'prediction_score']].groupby(label).mean()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.8077,0.9316,0.8077,0.8078,0.8077,0.7114,0.7115


0.67593557


Unnamed: 0_level_0,prediction_score
status,Unnamed: 1_level_1
0,0.65136
1,0.723546
2,0.654403


In [14]:
predict = predict_model(best, data=test_data.drop(columns=[label]))
predict[label] = test_data[label]
predict['_score'] = predict['prediction_label'] == predict[label]
print('Score Mean:', predict['_score'].mean())

Score Mean: 0.3054386260313184


In [15]:
predict[[label, '_score']].groupby(label).mean()

Unnamed: 0_level_0,_score
status,Unnamed: 1_level_1
CAI_2.0,0.599639
ESTAVEL,0.2036
SOBE_2.0,0.333595


In [16]:
final_predict = finalize_model(best) # data=test_data.sort_values(date_features).drop(columns=[label]))

In [17]:
_predict = predict_model(final_predict, data=test_data.sort_values(date_features).drop(columns=[label]))
_predict[label] = test_data[label]
_predict['_score'] = _predict['prediction_label'] == _predict[label]
print('Score Mean:', _predict['_score'].mean())

Score Mean: 0.31150025256777236


In [18]:
_predict[[label, '_score']].groupby(label).mean()

Unnamed: 0_level_0,_score
status,Unnamed: 1_level_1
CAI_2.0,0.695221
ESTAVEL,0.192351
SOBE_2.0,0.310047


In [19]:
for i in range(1, 9999):
  filename = f'{symbol}_{estimator}_SL_{stop_loss}_RT_{regression_times}_RPL_{regression_profit_and_loss}_{i}'
  print('Model file name: ', filename + '.pkl')
  if os.path.exists(filename + '.pkl'):
    continue
  else:    
    save_model(final_predict, filename)
    break

Model file name:  BTCUSDT_xgboost_SL_2.0_RT_1440_RPL_24_1.pkl
Transformation Pipeline and Model Successfully Saved
