## Rennes Data Challenge 2023 

Mathis Derenne

---

### Random Forest Model

We will train a Random Forest Regressor that uses previous (lagged) values of the target and exogenous variables to predict Close_BTC values.

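We will implement a preprocessing pipeline for the exogenous variables as follows:
- Impute missing values with a KNN imputer (5 neighbours)
- Scale the data using RobustScaler
- Apply PCA to reduce the dimensionality, keeping enough components to explain 70% of the variance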


In [31]:
import pandas as pd
import numpy as np
from utils import load_data, validation_split
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and convergence warnings from the libraries used below.
# Consider narrowing it to the specific categories that are known to be noisy.
warnings.simplefilter('ignore')

In [32]:
# Load the exogenous features (X) and the target series (y).
X, y = load_data()

# Hold-out split on the index: rows strictly before `validation_split` are used
# for training, rows at or after it for testing.
# NOTE(review): presumably the index is a date and `validation_split` a
# timestamp -- confirm against utils.
X_train, X_test = X.loc[X.index < validation_split], X.loc[X.index >= validation_split]
y_train, y_test = y.loc[y.index < validation_split], y.loc[y.index >= validation_split]

# Shapes of the four partitions, in (X_train, y_train, X_test, y_test) order.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((1313, 199), (1313,), (148, 199), (148,))

### Preprocessing

#### Numeric preprocessing

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA

# Names of all numeric columns of X; only these are fed to the transformer below.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)

# Ordered steps applied to the numeric columns:
#   1. fill missing values from the 5 nearest neighbours,
#   2. scale with RobustScaler,
#   3. project with PCA, keeping as many components as needed to explain
#      70% of the variance (fractional n_components with the 'full' solver).
numeric_steps = [
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.7, svd_solver='full')),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Columns not listed in `numeric_features` are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

### RandomForestRegressor

In [34]:
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.ensemble import RandomForestRegressor
from skforecast.model_selection import backtesting_forecaster
from skforecast.plot import plot_residuals

# Autoregressive forecaster: a random forest fed with the 3 previous values of
# the target plus the preprocessed exogenous variables.
# The regressor is seeded so that Restart & Run All reproduces the same metric
# and the same saved predictions; an unseeded forest changes on every run.
forecaster = ForecasterAutoreg(
    regressor = RandomForestRegressor(random_state=42),
    lags = 3,
    transformer_exog = preprocessor
)

# One-step-ahead backtest over the validation period: the first
# `y_train.size` observations form the initial training window, the model is
# refitted before each fold (refit=True) and the training window is not kept
# at a fixed size (fixed_train_size=False).
# Returns the mean squared error and the out-of-sample predictions.
metric, y_pred = backtesting_forecaster(
    forecaster            = forecaster,
    y                     = y,
    exog                  = X,
    steps                 = 1,
    metric                = 'mean_squared_error',
    initial_train_size    = y_train.size,
    fixed_train_size      = False,
    gap                   = 0,
    allow_incomplete_fold = True,
    refit                 = True,
    n_jobs                = 'auto',
    verbose               = False,
    show_progress         = True
)

  0%|          | 0/148 [00:00<?, ?it/s]



### Save the model's predictions

In [35]:
from pathlib import Path

# Collapse the backtest output to a Series and label it for the CSV header.
# NOTE(review): assumes the backtest returned a single-column DataFrame -- confirm.
y_pred = y_pred.squeeze()
y_pred.name = 'Close_BTC'
y_pred.index.name = 'date'

# `to_csv` does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
pred_path = Path('prediction/rfr_pred.csv')
pred_path.parent.mkdir(parents=True, exist_ok=True)
y_pred.to_csv(pred_path)