### Time-series model

- modèle [Autoformer](https://huggingface.co/blog/autoformer)
- modèle [Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)
- modèle [Informer](https://huggingface.co/blog/informer)

In [31]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/output_data.csv')

# Split dataset in date between 08-2017 to 08-2022 and testing fom 09-2022 to 04-2023

X = df[(df['date'] >= '2017-08-01') & (df['date'] < '2022-09-01')]
X_test = df[(df['date'] >= '2022-09-01') & (df['date'] < '2023-05-01')]

X_date = X['date']
X.drop('date', axis = 1, inplace = True)

X_date_test = X_test['date']
X_test.drop('date', axis = 1, inplace = True)

X.shape, X_test.shape, X_date

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('date', axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop('date', axis = 1, inplace = True)


((1315, 200),
 (147, 200),
 0       2017-08-17
 1       2017-08-18
 2       2017-08-21
 3       2017-08-22
 4       2017-08-23
            ...    
 1310    2022-08-25
 1311    2022-08-26
 1312    2022-08-29
 1313    2022-08-30
 1314    2022-08-31
 Name: date, Length: 1315, dtype: object)

In [32]:
# Extract all column name starting by 'close'
targets = [col for col in df.columns if col.startswith('Close')]
targets

['Close_BTC',
 'Close_ETH',
 'Close_DOGE',
 'Close_SHIB',
 'Close_DOT',
 'Close_BCH',
 'Close_SOL',
 'Close_ADA',
 'Close_MATIC',
 'Close_BNB',
 'Close_LTC',
 'Close_XRP']

### Preprocessing

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

### Outliers

In [34]:
from ipywidgets import interact
import seaborn as sns
import matplotlib.pyplot as plt

def plot(column):
    if df[column].dtype in ['int64', 'float64']:
        sns.boxplot(x=df[column])
    else:
        sns.countplot(x=df[column])
    plt.show()

interact(plot, column=df.columns.tolist())

interactive(children=(Dropdown(description='column', options=('date', 'btc_tweet_count', 'eth_tweet_count', 'b…

<function __main__.plot(column)>

In [54]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaNChecker(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        nan_info = X.isnull().stack()
        nan_info = nan_info[nan_info]
        if not nan_info.empty:
            print("NaN values found!")
            for index, value in nan_info.items():
                print(f"Column: {index[1]}, Index: {index[0]}, Value: {value}")
        else:
            print("No NaN values found.")
        return X

# Créer une instance de NaNChecker
nan_checker = NaNChecker()

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, factor, verbose=False):
        self.factor = factor
        self.verbose = verbose

    def fit(self, X, y=None):
        self.iqr_ranges_ = {}
        for col in X.select_dtypes(include=np.number).columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            self.iqr_ranges_[col] = (Q1 - self.factor * IQR, Q3 + self.factor * IQR)
        return self

    def transform(self, X, y=None):
        X_out = X.copy()
        for col in X.select_dtypes(include=np.number).columns:
            low, high = self.iqr_ranges_[col]
            outliers = X_out[(X_out[col] < low) | (X_out[col] > high)]
            if self.verbose:
                print(f"Removing {len(outliers)} outliers from column {col}")
            X_out = X_out[(X_out[col] >= low) & (X_out[col] <= high)]
        return X_out
    
# Execute outlier to see how many row will be affected
OutlierRemover(factor = 1.5, verbose = True).fit_transform(X)

outlier_remover = OutlierRemover(factor = 1.5)

Removing 40 outliers from column btc_tweet_count
Removing 150 outliers from column eth_tweet_count
Removing 34 outliers from column bnb_tweet_count
Removing 32 outliers from column xrp_tweet_count
Removing 38 outliers from column matic_tweet_count
Removing 19 outliers from column ada_tweet_count
Removing 63 outliers from column doge_tweet_count
Removing 64 outliers from column shib_tweet_count
Removing 42 outliers from column dot_tweet_count
Removing 39 outliers from column bch_tweet_count
Removing 41 outliers from column ltc_tweet_count
Removing 15 outliers from column sol_tweet_count
Removing 6 outliers from column btc_posts_count
Removing 68 outliers from column btc_textblob_polarity_min
Removing 4 outliers from column btc_textblob_polarity_max
Removing 7 outliers from column btc_textblob_polarity_mean
Removing 7 outliers from column btc_vader_polarity_compound_min
Removing 0 outliers from column btc_vader_polarity_compound_max
Removing 10 outliers from column btc_vader_polarity_com

### Missing data

In [56]:
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer

# Get all col containing missing values
na_cols = [col for col in df.columns if df[col].isna().sum() > 0]
print(len(na_cols))

imputer = ColumnTransformer(
    transformers=[
        ('imputer', KNNImputer(), na_cols)
    ]
)

153


In [57]:
# Get dataframe without all na_cols
X_na = X.drop(na_cols, axis = 1)
X_na.isna().sum().sum()

0

In [58]:
KNN = KNNImputer()
res = KNNImputer().fit_transform(X)   
# Is there any missing value left?
np.isnan(res).sum()

0

### Numerical data

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [60]:
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f'Numeric columns : {len(numeric_columns)}')

Numeric columns : 200


In [61]:
num_pipeline = ColumnTransformer(
    transformers=[
    ('scaler', StandardScaler(), numeric_columns), 
    ('nan_check2', nan_checker, X.columns),
    ('pca', PCA(n_components= 'mle', svd_solver='full'), numeric_columns) 
])

### No categorical data

#### Preprocessing pipeline 

In [63]:
# Build preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('outlier', outlier_remover, X.columns.tolist()),
        ('nan_check3', NaNChecker(), X.columns),
        
        ('imputer', imputer, na_cols),
        ('nan_check4', nan_checker, X.columns),
        
        ('num', num_pipeline, numeric_columns),
        ('nan_check5', nan_checker, X.columns),
        
    ])

# Check for errors in preprocessing
preprocessor.fit(X)
# preprocessor

NaN values found!
Column: xrp_posts_count, Index: 0, Value: True
Column: xrp_textblob_polarity_min, Index: 0, Value: True
Column: xrp_textblob_polarity_max, Index: 0, Value: True
Column: xrp_textblob_polarity_mean, Index: 0, Value: True
Column: xrp_vader_polarity_compound_min, Index: 0, Value: True
Column: xrp_vader_polarity_compound_max, Index: 0, Value: True
Column: xrp_vader_polarity_compound_mean, Index: 0, Value: True
Column: matic_posts_count, Index: 0, Value: True
Column: matic_textblob_polarity_min, Index: 0, Value: True
Column: matic_textblob_polarity_max, Index: 0, Value: True
Column: matic_textblob_polarity_mean, Index: 0, Value: True
Column: matic_vader_polarity_compound_min, Index: 0, Value: True
Column: matic_vader_polarity_compound_max, Index: 0, Value: True
Column: matic_vader_polarity_compound_mean, Index: 0, Value: True
Column: ada_posts_count, Index: 0, Value: True
Column: ada_textblob_polarity_min, Index: 0, Value: True
Column: ada_textblob_polarity_max, Index: 0, V

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.