### Treinamento do Modelo Machine Learning

In [25]:
%pip install pandas scikit-learn xgboost joblib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


#### Importações de Bibliotecas

In [118]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, accuracy_score
from typing import List, Tuple
from joblib import dump
import boto3
from io import StringIO
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV

#### Treinamento e Predições do Modelo XGBoost

In [22]:
def read_from_s3(bucket_name:str):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    response = []
    for obj in bucket.objects.all():
        body = obj.get()['Body'].read()
        s=str(body,'utf-8')
        data = StringIO(s)
        response.append(pd.read_csv(data, index_col=0))
    return response

In [20]:
bucket_name = "dl-general-prd-julio"
csv_file_name = 'btc.csv'
bucket_layer = 'bronze/database/bitcoin/raw'
object_name = f'{bucket_layer}/{csv_file_name}'
df = read_from_s3(bucket_name)[0]
df.head()

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,edit_count,tomorrow,sentiment,neg_sentiment,fng_index,fng_classification,target
0,2018-03-02,10977.400391,11189.0,10850.099609,11086.400391,7620590080,3.066667,11489.700195,-0.307225,0.541296,47,Neutral,1
1,2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,6690570240,3.066667,11512.599609,-0.307225,0.541296,56,Greed,1
2,2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,6084149760,2.933333,11573.299805,-0.258349,0.513519,44,Fear,1
3,2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,6468539904,2.866667,10779.900391,-0.259235,0.496852,55,Greed,0
4,2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,6832169984,2.666667,9965.570312,-0.249632,0.47463,59,Greed,0


In [133]:
# Definir preditores
# Estas são as features que estamos usando para prever o target
predictors: List[str] = [
    "close",
    "volume",
    "open",
    "high",
    "low",
    "edit_count",
    "sentiment",
    "neg_sentiment",
    "fng_index",
    "fng_classification_adjusted"
]

# Função de predição
def predict(train: pd.DataFrame, test: pd.DataFrame, predictors: List[str], model: XGBClassifier) -> pd.DataFrame:
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="predictions")
    return pd.concat([test["target"], preds], axis=1)

# Função de backtesting
# Usaremos uma abordagem de janela móvel para fazer o backtesting do modelo.
# Pegaremos uma janela de dados, usaremos para treinar o modelo e, em seguida, usaremos o modelo para prever a próxima janela de dados.
# Repetiremos esse processo até termos usado todos os dados.
# O parâmetro start significa 3 anos de dados para treinar o modelo.
# O parâmetro step significa 150 dias de dados para testar o modelo.
def backtest(data: pd.DataFrame, model: XGBClassifier, predictors: List[str], start: int = 1095, step: int = 150) -> pd.DataFrame:
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)

    return pd.concat(all_predictions)

# Função de avaliação do modelo
def evaluate_model(predictions: pd.DataFrame) -> Tuple[float, float]:
    precision = precision_score(predictions["target"], predictions["predictions"])
    accuracy = accuracy_score(predictions["target"], predictions["predictions"])
    return precision, accuracy

In [77]:
# Carregar os dados do arquivo CSV
data = read_from_s3(bucket_name)[0]
#Adjusting fng classification values
map = {'Neutral': 0, 'Greed': 1, 'Fear': -1, 'Extreme Fear': -2, 'Extreme Greed': 2}
data['fng_classification_adjusted'] = data['fng_classification'].map(map)
data.drop(columns='fng_classification', axis=1, inplace=True)
data.head()


Unnamed: 0,open,high,low,close,volume,edit_count,tomorrow,sentiment,neg_sentiment,fng_index,target,fng_classification_adjusted
2018-03-02,10977.400391,11189.0,10850.099609,11086.400391,7620590080,3.066667,11489.700195,-0.307225,0.541296,47,1,0
2018-03-03,11101.900391,11528.200195,11002.400391,11489.700195,6690570240,3.066667,11512.599609,-0.307225,0.541296,56,1,1
2018-03-04,11497.400391,11512.599609,11136.099609,11512.599609,6084149760,2.933333,11573.299805,-0.258349,0.513519,44,1,-1
2018-03-05,11532.400391,11704.099609,11443.900391,11573.299805,6468539904,2.866667,10779.900391,-0.259235,0.496852,55,0,1
2018-03-06,11500.099609,11500.099609,10694.299805,10779.900391,6832169984,2.666667,9965.570312,-0.249632,0.47463,59,0,1


In [136]:
# Criar e treinar o modelo XGBoost
model = XGBClassifier(random_state=1, learning_rate=0.2, n_estimators=200, colsample_bytree = 0.5, max_depth = 5)
predictions = backtest(data, model, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.50
Acurácia do modelo: 0.51
            target  predictions
2021-03-04       1            1
2021-03-05       0            1
2021-03-06       1            1
2021-03-07       1            0
2021-03-08       1            1
...            ...          ...
2024-09-02       0            0
2024-09-03       1            1
2024-09-04       0            1
2024-09-05       0            1
2024-09-06       0            0

[1283 rows x 2 columns]


In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2378 entries, 2018-03-02 to 2024-09-06
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   open                         2378 non-null   float64
 1   high                         2378 non-null   float64
 2   low                          2378 non-null   float64
 3   close                        2378 non-null   float64
 4   volume                       2378 non-null   int64  
 5   edit_count                   2378 non-null   float64
 6   tomorrow                     2377 non-null   float64
 7   sentiment                    2378 non-null   float64
 8   neg_sentiment                2378 non-null   float64
 9   fng_index                    2378 non-null   int64  
 10  target                       2378 non-null   int64  
 11  fng_classification_adjusted  2378 non-null   int64  
dtypes: float64(8), int64(4)
memory usage: 241.5+ KB


#### Diferent models try

In [132]:
def predict_norm(train: pd.DataFrame, test: pd.DataFrame, predictors: List[str], model: XGBClassifier) -> pd.DataFrame:
    X_train = normalize(train[predictors])
    y_train = train["target"]
    model.fit(X_train, y_train)
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="predictions")
    return pd.concat([test["target"], preds], axis=1)

def backtest_norm(data: pd.DataFrame, model: XGBClassifier, predictors: List[str], start: int = 1095, step: int = 150) -> pd.DataFrame:
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        predictions = predict_norm(train, test, predictors, model)
        all_predictions.append(predictions)

    return pd.concat(all_predictions)

In [112]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [115]:
train = data.iloc[0:2000]
test = data.iloc[2000:2378]

## prepare data for models
X_train = normalize(train[predictors])
y_train = train["target"]

In [119]:
def run_grid_search_cv(model, params, X, y):
    search_cv = GridSearchCV(model, params)
    search_cv.fit(X, y)
    return search_cv.best_estimator_


In [120]:
rf_params = {
    'n_estimators':[100, 125, 150, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 5, 7, 9]
}
model = RandomForestClassifier()
rf_best = run_grid_search_cv(model=model, params=rf_params, X=X_train, y=y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [137]:
predictions = backtest_norm(data, rf_best, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)



Precisão do modelo: 0.49
Acurácia do modelo: 0.50
            target  predictions
2021-03-04       1            1
2021-03-05       0            1
2021-03-06       1            1
2021-03-07       1            1
2021-03-08       1            1
...            ...          ...
2024-09-02       0            0
2024-09-03       1            0
2024-09-04       0            0
2024-09-05       0            0
2024-09-06       0            0

[1283 rows x 2 columns]




In [130]:
knn_params = {
    'n_neighbors':[9,10,11,12,13,14],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'weights': ['uniform', 'distance']
}
model = KNeighborsClassifier()
knn_best = run_grid_search_cv(model=model, params=knn_params, X=X_train, y=y_train)
print(knn_best)

KNeighborsClassifier(n_neighbors=9)


In [138]:
predictions = backtest_norm(data, knn_best, predictors)

# Avaliar o modelo
precision, accuracy = evaluate_model(predictions)
print(f"Precisão do modelo: {precision:.2f}")
print(f"Acurácia do modelo: {accuracy:.2f}")
print(predictions)

Precisão do modelo: 0.50
Acurácia do modelo: 0.50
            target  predictions
2021-03-04       1            0
2021-03-05       0            0
2021-03-06       1            0
2021-03-07       1            0
2021-03-08       1            0
...            ...          ...
2024-09-02       0            0
2024-09-03       1            0
2024-09-04       0            0
2024-09-05       0            0
2024-09-06       0            0

[1283 rows x 2 columns]




#### Serialização do Modelo

In [28]:
# Salvar o modelo treinado em um arquivo
model_filename = "btc_trend_prediction_model.joblib"
dump(model, model_filename)

['btc_trend_prediction_model.joblib']