### Treinamento do Modelo Machine Learning

In [25]:
%pip install pandas scikit-learn xgboost joblib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Importações de Bibliotecas

In [2]:
# Data load and processing
import pandas as pd
import numpy as np
from typing import List, Tuple
import boto3
from io import StringIO
from dotenv import load_dotenv
import os

In [3]:
## Pre-Processing and evaluation
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, accuracy_score

In [4]:
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from joblib import dump

#### Instruções

1. Renomear o arquivo .env_exemplo para somente .env
2. Adicionar popular as variaveis conforme o padrão de nomenclatura que voce utilizar

### Reading the Data

In [5]:
load_dotenv()
def read_from_s3(bucket_name:str):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    response = []
    for obj in bucket.objects.all():
        body = obj.get()['Body'].read()
        s=str(body,'utf-8')
        data = StringIO(s)
        response.append(pd.read_csv(data, index_col=0))
    return response

In [12]:
bucket_name = os.environ['BUCKET_NAME']
csv_file_name = os.environ['CSV_FILE_NAME']
bucket_layer = os.environ['BUCKET_LAYER']
object_name = f'{bucket_layer}/{csv_file_name}'
data = read_from_s3(bucket_name)[0]
data.head()

Unnamed: 0,open,high,low,close,volume,edit_count,tomorrow,sentiment,neg_sentiment,fng_index,fng_class,fng_in_yest,fng_class_yest,edit_count_y,sentiment_y,neg_sentiment_y,target
2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,6690570240,3.066667,11512.599609,-0.307224,0.541296,56,Greed,47.0,Neutral,3.066667,-0.307224,0.541296,1
2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,6084149760,2.933333,11573.299805,-0.258349,0.513519,44,Fear,56.0,Greed,3.066667,-0.307224,0.541296,1
2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,6468539904,2.866667,10779.900391,-0.259235,0.496852,55,Greed,44.0,Fear,2.933333,-0.258349,0.513519,0
2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,6832169984,2.666667,9965.570312,-0.249632,0.47463,59,Greed,55.0,Greed,2.866667,-0.259235,0.496852,0
2018-03-07,10803.900391,10929.5,9692.120117,9965.570312,8797910016,2.633333,9395.009766,-0.216476,0.441296,37,Fear,59.0,Greed,2.666667,-0.249632,0.47463,0


### Data Pre-Processing

In [7]:
# If working with data, please uncoment this line for safety, this will generate a backup of the dataframe prior to pre-processing
# df_backup = data.copy()

In [13]:
# Defining the predictors Columns
predictors: List[str] = [
    "close",
    "volume",
    "open",
    "high",
    "low",
    "edit_count_y",
    "sentiment_y",
    "neg_sentiment_y",
    "fng_in_yest",
    "fng_class_yest_adjusted"
]

map = {'Neutral': 0, 'Greed': 1, 'Fear': -1, 'Extreme Fear': -2, 'Extreme Greed': 2}

std_scaler = StandardScaler()

In [14]:
#Adjusting fng classification values
def adjust_df_for_ml(df:pd.DataFrame, map:dict) -> pd.DataFrame:
    df['fng_class_yest_adjusted'] = df['fng_class_yest'].map(map)
    df.drop(columns=['fng_class', 'fng_class_yest'], axis=1, inplace=True)
    return df

data = adjust_df_for_ml(df=data, map=map)
data.head()

Unnamed: 0,open,high,low,close,volume,edit_count,tomorrow,sentiment,neg_sentiment,fng_index,fng_in_yest,edit_count_y,sentiment_y,neg_sentiment_y,target,fng_class_yest_adjusted
2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,6690570240,3.066667,11512.599609,-0.307224,0.541296,56,47.0,3.066667,-0.307224,0.541296,1,0
2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,6084149760,2.933333,11573.299805,-0.258349,0.513519,44,56.0,3.066667,-0.307224,0.541296,1,1
2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,6468539904,2.866667,10779.900391,-0.259235,0.496852,55,44.0,2.933333,-0.258349,0.513519,0,-1
2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,6832169984,2.666667,9965.570312,-0.249632,0.47463,59,55.0,2.866667,-0.259235,0.496852,0,1
2018-03-07,10803.900391,10929.5,9692.120117,9965.570312,8797910016,2.633333,9395.009766,-0.216476,0.441296,37,59.0,2.666667,-0.249632,0.47463,0,1


Here we are going to test 3 approaches:

1. without any normalization or scalling
2. applying a normalization
3. applying scalling

In [15]:
#spliting data for gridsearch
train = data.iloc[0:2000]
test = data.iloc[2000:2378]

## prepare x_train for models
X_train = train[predictors]
X_train_norm = normalize(train[predictors])
X_train_scaled = std_scaler.fit_transform(train[predictors])

## prepate x_test for models
X_test = test[predictors]
X_test_norm = normalize(test[predictors])
X_test_scaled = std_scaler.fit_transform(test[predictors])

## Target
y_train = train['target']
y_test = test['target']

### Running Grid Search CV to find best Model

In [31]:
def run_grid_search_cv(model, params, X, y):
    search_cv = GridSearchCV(model, params)
    search_cv.fit(X, y)
    return search_cv.best_score_, \
           search_cv.best_params_, \
           search_cv.best_estimator_

def print_grid_evaluation_report(best_score: float, best_params: dict) -> None:
    print("----Evaluation Report----\n")
    print(f"Best Score Achieved: {best_score}\n")
    print(f"Best Params Found: {best_params}\n")
    print("---------------------------------\n")

def predict(train: pd.DataFrame, test: pd.DataFrame, 
            predictors: List[str], model, mode:str= 'raw') -> pd.DataFrame:
    y_train = train["target"]
    X_train = train[predictors]
    X_test = test[predictors]
    
    if mode == 'norm':
        X_train = normalize(X_train)
        X_test = normalize(X_test)
    elif mode == 'sca':
        X_train = std_scaler.fit_transform(X_train)
        X_test = std_scaler.fit_transform(X_test)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    preds = pd.Series(preds, index=test.index, name="predictions")
    return pd.concat([test["target"], preds], axis=1)

def backtest(data: pd.DataFrame, model, 
             predictors: List[str], start: int = 1095, 
             step: int = 150, mode:str = 'raw') -> pd.DataFrame:
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        predictions = predict(train, test, predictors, model, mode)
        all_predictions.append(predictions)

    return pd.concat(all_predictions)

def evaluate_model(predictions: pd.DataFrame) -> Tuple[float, float]:
    precision = precision_score(predictions["target"], predictions["predictions"])
    accuracy = accuracy_score(predictions["target"], predictions["predictions"])
    return precision, accuracy


#### Random Forest (RF)

In [32]:
rf_params = {
    'n_estimators':[100, 125, 150, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 5, 7, 9]
}
rf_model = RandomForestClassifier()

##### Random Forest (RF) raw

In [21]:
best_score, best_params, best_model_rf_cr = run_grid_search_cv(model=rf_model, params=rf_params, X=X_train, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.4710000000000001

Best Params Found: {'criterion': 'log_loss', 'max_depth': 3, 'n_estimators': 100}

---------------------------------



In [26]:
predictions = backtest(data, best_model_rf_cr, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.49
Acurácia do modelo: 0.49
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            1
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            1

[1282 rows x 2 columns]


##### Random Forest (RF) Scaled

In [27]:
best_score, best_params, best_model_rf_sca = run_grid_search_cv(model=rf_model, params=rf_params, X=X_train_scaled, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.46449999999999997

Best Params Found: {'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 125}

---------------------------------



In [28]:
predictions = backtest(data, best_model_rf_sca, predictors, mode='sca')

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.50
Acurácia do modelo: 0.50
            target  predictions
2021-03-05       0            1
2021-03-06       1            1
2021-03-07       1            1
2021-03-08       1            1
2021-03-09       1            1
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            1

[1282 rows x 2 columns]


##### Random Forest (RF) Normalized

In [29]:
best_score, best_params, best_model_rf_norm = run_grid_search_cv(model=rf_model, params=rf_params, X=X_train_norm, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.5165000000000001

Best Params Found: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 200}

---------------------------------



In [30]:
predictions = backtest(data, best_model_rf_norm, predictors, mode='norm')

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.49
Acurácia do modelo: 0.50
            target  predictions
2021-03-05       0            1
2021-03-06       1            1
2021-03-07       1            1
2021-03-08       1            1
2021-03-09       1            1
...            ...          ...
2024-09-02       0            1
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            1

[1282 rows x 2 columns]


#### KNN Classifier

In [34]:
knn_params = {
    'n_neighbors':[9,10,11,12,13,14],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'weights': ['uniform', 'distance']
}
knn_model = KNeighborsClassifier()

##### KNN Classifier Raw

In [36]:
best_score, best_params, best_model_knn_cr = run_grid_search_cv(model=knn_model, params=knn_params, X=X_train, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.518

Best Params Found: {'algorithm': 'auto', 'n_neighbors': 13, 'weights': 'distance'}

---------------------------------



In [37]:
predictions = backtest(data, best_model_knn_cr, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.53
Acurácia do modelo: 0.54
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            1
2021-03-08       1            1
2021-03-09       1            1
...            ...          ...
2024-09-02       0            1
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            0
2024-09-06       1            1

[1282 rows x 2 columns]


##### KNN Classifier Scaled

In [38]:
best_score, best_params, best_model_knn_sca = run_grid_search_cv(model=knn_model, params=knn_params, X=X_train_scaled, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.4664999999999999

Best Params Found: {'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'}

---------------------------------



In [39]:
predictions = backtest(data, best_model_knn_sca, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.52
Acurácia do modelo: 0.53
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            1
2021-03-08       1            1
2021-03-09       1            1
...            ...          ...
2024-09-02       0            1
2024-09-03       1            0
2024-09-04       0            1
2024-09-05       0            0
2024-09-06       1            1

[1282 rows x 2 columns]


##### KNN Classifier Normalized

In [40]:
best_score, best_params, best_model_knn_norm = run_grid_search_cv(model=knn_model, params=knn_params, X=X_train_norm, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.517

Best Params Found: {'algorithm': 'brute', 'n_neighbors': 10, 'weights': 'uniform'}

---------------------------------



In [41]:
predictions = backtest(data, best_model_knn_norm, predictors, mode='norm')

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.51
Acurácia do modelo: 0.52
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            1
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            1
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            1

[1282 rows x 2 columns]


#### XGBoost

In [46]:
xgb_params = {'random_state' : [1], 
              'learning_rate' : [0.1, 0.2, 0.3, 0.4], 
              'n_estimators' : [100,150,200,250,300], 
              'colsample_bytree' : [0.25, 0.5, 0.75, 1], 
              'max_depth' : [3,5,6,7,8]
}

model_xgb = XGBClassifier()

##### XGBoost Raw

In [47]:
best_score, best_params, best_model_xgb_cr = run_grid_search_cv(model=model_xgb, params=xgb_params, X=X_train, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.46399999999999997

Best Params Found: {'colsample_bytree': 0.5, 'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 300, 'random_state': 1}

---------------------------------



In [48]:
predictions = backtest(data, best_model_xgb_cr, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.50
Acurácia do modelo: 0.51
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            0
2024-09-05       0            1
2024-09-06       1            0

[1282 rows x 2 columns]


##### XGBoost Scaled

In [49]:
best_score, best_params, best_model_xgb_sca = run_grid_search_cv(model=model_xgb, params=xgb_params, X=X_train_scaled, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.46399999999999997

Best Params Found: {'colsample_bytree': 0.5, 'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 300, 'random_state': 1}

---------------------------------



In [50]:
predictions = backtest(data, best_model_xgb_sca, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.50
Acurácia do modelo: 0.51
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            0
2024-09-05       0            1
2024-09-06       1            0

[1282 rows x 2 columns]


##### XGBoost Norm

In [51]:
best_score, best_params, best_model_xgb_norm = run_grid_search_cv(model=model_xgb, params=xgb_params, X=X_train_norm, y=y_train)
print_grid_evaluation_report(best_score, best_params)

----Evaluation Report----

Best Score Achieved: 0.49749999999999994

Best Params Found: {'colsample_bytree': 0.25, 'learning_rate': 0.4, 'max_depth': 3, 'n_estimators': 250, 'random_state': 1}

---------------------------------



In [52]:
predictions = backtest(data, best_model_xgb_norm, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.48
Acurácia do modelo: 0.49
            target  predictions
2021-03-05       0            1
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            1

[1282 rows x 2 columns]


### Running Best Model

In [25]:
# Função de predição
def predict(train: pd.DataFrame, test: pd.DataFrame, predictors: List[str], model: XGBClassifier) -> pd.DataFrame:
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="predictions")
    return pd.concat([test["target"], preds], axis=1)

# Função de backtesting
# Usaremos uma abordagem de janela móvel para fazer o backtesting do modelo.
# Pegaremos uma janela de dados, usaremos para treinar o modelo e, em seguida, usaremos o modelo para prever a próxima janela de dados.
# Repetiremos esse processo até termos usado todos os dados.
# O parâmetro start significa 3 anos de dados para treinar o modelo.
# O parâmetro step significa 150 dias de dados para testar o modelo.
def backtest(data: pd.DataFrame, model: XGBClassifier, predictors: List[str], start: int = 1095, step: int = 150) -> pd.DataFrame:
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)

    return pd.concat(all_predictions)

# Função de avaliação do modelo
def evaluate_model(predictions: pd.DataFrame) -> Tuple[float, float]:
    precision = precision_score(predictions["target"], predictions["predictions"])
    accuracy = accuracy_score(predictions["target"], predictions["predictions"])
    return precision, accuracy

In [35]:
# Criar e treinar o modelo XGBoost
model = XGBClassifier(random_state=1, learning_rate=0.2, n_estimators=500, colsample_bytree = 1, max_depth = 8)
predictions = backtest(data, model, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.48
Acurácia do modelo: 0.49
            target  predictions
2021-03-05       0            0
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
2021-03-09       1            0
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       1            0

[1282 rows x 2 columns]


In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2378 entries, 2018-03-02 to 2024-09-06
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   open                         2378 non-null   float64
 1   high                         2378 non-null   float64
 2   low                          2378 non-null   float64
 3   close                        2378 non-null   float64
 4   volume                       2378 non-null   int64  
 5   edit_count                   2378 non-null   float64
 6   tomorrow                     2377 non-null   float64
 7   sentiment                    2378 non-null   float64
 8   neg_sentiment                2378 non-null   float64
 9   fng_index                    2378 non-null   int64  
 10  target                       2378 non-null   int64  
 11  fng_classification_adjusted  2378 non-null   int64  
dtypes: float64(8), int64(4)
memory usage: 241.5+ KB


#### Serialização do Modelo

In [28]:
# Salvar o modelo treinado em um arquivo
model_filename = "btc_trend_prediction_model.joblib"
dump(model, model_filename)

['btc_trend_prediction_model.joblib']