### Time-series model

- [Autoformer](https://huggingface.co/blog/autoformer) model
- [Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer) model
- [Informer](https://huggingface.co/blog/informer) model
- [Sk Forecast](https://huggingface.co/blog/skforecast) model

In [146]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/output_data.csv')

# Chronological split on the 'date' column (half-open intervals):
#   training : 2017-08-01 <= date < 2022-09-01
#   testing  : 2022-09-01 <= date < 2023-05-01
# The comparison is done on the raw column values against ISO-formatted
# date strings.
train_mask = (df['date'] >= '2017-08-01') & (df['date'] < '2022-09-01')
test_mask = (df['date'] >= '2022-09-01') & (df['date'] < '2023-05-01')

X = df.loc[train_mask].copy()
X_test = df.loc[test_mask].copy()

# Detach the date column from each split: `pop` removes it from the frame
# and returns it, so the dates stay available as separate Series.
X_date = X.pop('date')
X_date_test = X_test.pop('date')

X.shape, X_test.shape, X_date

# X = X.dropna()


((1315, 200),
 (147, 200),
 0       2017-08-17
 1       2017-08-18
 2       2017-08-21
 3       2017-08-22
 4       2017-08-23
            ...    
 1310    2022-08-25
 1311    2022-08-26
 1312    2022-08-29
 1313    2022-08-30
 1314    2022-08-31
 Name: date, Length: 1315, dtype: object)

In [147]:
# Collect every column whose name starts with 'Close' (case-sensitive).
close_mask = df.columns.str.startswith('Close')
targets = df.columns[close_mask].tolist()
targets

['Close_BTC',
 'Close_ETH',
 'Close_DOGE',
 'Close_SHIB',
 'Close_DOT',
 'Close_BCH',
 'Close_SOL',
 'Close_ADA',
 'Close_MATIC',
 'Close_BNB',
 'Close_LTC',
 'Close_XRP']

### Preprocessing

In [148]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

### Outliers

In [149]:
from ipywidgets import interact
import seaborn as sns
import matplotlib.pyplot as plt

def plot(column):
    """Display a quick distribution plot for one column of ``df``.

    Columns whose dtype is ``int64`` or ``float64`` are drawn as a seaborn
    box plot; every other column is drawn as a seaborn count plot.

    Parameters
    ----------
    column : label
        Name of the column of the notebook-level ``df`` to plot.
    """
    series = df[column]
    is_numeric = series.dtype in ['int64', 'float64']
    # Pick the plotting function first, then draw once.
    draw = sns.boxplot if is_numeric else sns.countplot
    draw(x=series)
    plt.show()

# Build a dropdown listing every column of `df`; selecting one calls `plot`.
interact(plot, column=list(df.columns))

interactive(children=(Dropdown(description='column', options=('date', 'btc_tweet_count', 'eth_tweet_count', 'b…

<function __main__.plot(column)>

In [187]:
# TODO: write a function to handle outliers

### Missing data

In [163]:
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer

# Names of the training columns that contain at least one missing value.
has_missing = X.isna().any()
na_cols = X.columns[has_missing].tolist()

# Missing values will be replaced by the per-column median.
imputer = SimpleImputer(strategy='median')

print(len(na_cols))

153


### Numerical data

In [164]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [165]:
# Keep only the int64 / float64 columns of the training frame.
numeric_frame = X.select_dtypes(include=['int64', 'float64'])
numeric_columns = list(numeric_frame.columns)
print(f'Numeric columns : {len(numeric_columns)}')

Numeric columns : 200


### No categorical data

#### Preprocessing pipeline 

In [186]:
std = StandardScaler()
pca = PCA()

# ColumnTransformer applies each of its transformers independently to the
# raw input columns and concatenates the results side by side; it does NOT
# chain them. Listing the imputer, scaler and PCA as three separate entries
# therefore feeds un-imputed data (still containing NaN) to PCA, which
# rejects NaN, and produces three parallel copies of the features instead
# of one processed set.
# The steps that must run one after another are chained in a Pipeline:
#   1. median imputation (a no-op for columns without missing values),
#   2. standardisation,
#   3. PCA on the imputed, standardised data.
numeric_pipeline = Pipeline(
    steps=[
        ('imputer', imputer),
        ('scaler', std),
        ('pca', pca),
    ])

# Build preprocessing pipeline: the sequential numeric pipeline is applied
# to all numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        # ('outlier', outlier_remover, outlier_col),
        ('numeric', numeric_pipeline, numeric_columns),
    ])