In [49]:
# Import all data visualisation libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import seaborn as sns
import ipywidgets as ipy
from IPython.display import Image

sns.set_theme()
sns.set_palette("colorblind")
sns.set_style('whitegrid')
%matplotlib inline

## Loading Data

In [50]:
import pandas as pd
import numpy as np
from datetime import datetime

df = pd.read_csv('data/output_data.csv', parse_dates = ['date'])
df = df.set_index('date')
df = df.asfreq('B')
df = df.sort_index()

start_train = '2017-08-01'
end_train = '2022-09-01'
start_test = '2022-09-01'
end_test = '2023-05-01'
timedelta_test = datetime.strptime(end_test, '%Y-%m-%d') - datetime.strptime(start_test, '%Y-%m-%d')
 
X_train = df.loc[start_train:end_train]
X_test = df.loc[start_test:end_test]

X_train.shape, X_test.shape

# Extract all column name starting by 'close'
targets = [col for col in df.columns if col.startswith('Close')]

## Preprocessing pipeline 

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

### Infinites values

In [52]:
def replace_infinities(X):
    return X.replace([np.inf, -np.inf], np.nan)

replace_infinities_transformer = FunctionTransformer(replace_infinities)


### null standard deviation

In [53]:
def drop_zero_std(X):
    zero_std_mask = np.std(X, axis=0) != 0
    return X[:, zero_std_mask]

drop_zero_std = FunctionTransformer(drop_zero_std)

### Outliers

In [54]:
from ipywidgets import interact
import seaborn as sns
import matplotlib.pyplot as plt

def plot(column):
    if df[column].dtype in ['int64', 'float64']:
        sns.boxplot(x=df[column])
    else:
        sns.countplot(x=df[column])
    plt.show()

interact(plot, column=df.columns.tolist())

interactive(children=(Dropdown(description='column', options=('btc_tweet_count', 'eth_tweet_count', 'bnb_tweet…

<function __main__.plot(column)>

In [55]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold, verbose=False):
        self.threshold = threshold
        self.verbose = verbose

    def fit(self, X, y=None):
        self.median_ = X.median()
        self.iqr_ = X.quantile(0.75) - X.quantile(0.25)
        return self

    def transform(self, X, y=None):
        X_new = X[~((X < (self.median_ - self.threshold * self.iqr_)) | (X > (self.median_ + self.threshold * self.iqr_))).any(axis=1)]
        if self.verbose:
            print(f"Removed {X.shape[0] - X_new.shape[0]} rows")
        return X_new

threshold = 10
OutlierRemover(threshold = threshold, verbose = True).fit_transform(X_train)

outlier_remover = OutlierRemover(threshold = threshold)

Removed 243 rows


### Missing data

In [56]:
from sklearn.impute import KNNImputer

imputer = KNNImputer()

# Display total number of missing values per column before imputation
print("Missing values before imputation:")
print(X_train.isna().sum().sort_values())

Missing values before imputation:
btc_tweet_count                        0
crisis_posts_count                     0
sdcc_vader_polarity_compound_mean      0
sdcc_vader_polarity_compound_max       0
sdcc_vader_polarity_compound_min       0
                                    ... 
Volume_SOL                           778
Close_DOT                            786
Volume_DOT                           786
Close_SHIB                           972
Volume_SHIB                          972
Length: 200, dtype: int64


### Numerical data

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [58]:
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f'Numeric columns : {len(numeric_columns)}')

Numeric columns : 200


### No categorical data

#### Preprocessing pipeline 

In [59]:
std = StandardScaler()
pca = PCA(n_components=70)

preprocessor = Pipeline(steps=[
    ('replace_infinities', replace_infinities_transformer),
    ('outlier_remover', OutlierRemover(threshold=40, verbose=True)),
    ('imputer', KNNImputer()),
    ('remove_zero_std', drop_zero_std),
    ('scaler', std),
    ('pca', pca)
])

preprocessor.fit_transform(X_train)

Removed 20 rows


array([[-1.11164609, -2.16138418, -4.13504482, ..., -0.75105211,
        -1.43210546, -0.3534062 ],
       [-2.04989163, -2.9014148 , -6.06705429, ...,  0.16937305,
         0.17026453, -2.69772227],
       [-1.82372038, -3.31301145, -5.81928701, ...,  0.28890944,
         0.81297175, -0.30051029],
       ...,
       [-5.46999217, -4.11560485, 13.67264713, ..., -0.63165573,
         1.60121346, -0.41332339],
       [-5.3456536 , -4.39141604, 14.57630531, ..., -1.88041319,
         2.33297378,  0.12609956],
       [-5.55408094, -4.32074953, 14.1564177 , ..., -0.5970994 ,
         1.49125718, -0.64953976]])

In [60]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def score(y_pred, y_test):
    """Return MSE, RMSE, MAE, MAPE, R2"""
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, mae, mape, r2

In [61]:
def plot_pred(y_train, y_pred, y_test, target_name, model_name):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(y_train, label='Train')
    ax.plot(y_test, label='Test')
    ax.plot(y_pred, label='Predicted')
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(mdates.AutoDateLocator()))

    ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('${x:,.0f}'))
    ax.set_xlabel('Date')
    ax.set_ylabel('')

    ax.set_title(f'Prediction on {target_name[5:]} value using {model_name}')
    ax.legend()
    plt.show()

In [62]:
def train_model(X_train, X_test, y_train):
    """Train Sarimax model"""
    
    return model

In [63]:
for target in targets:
    y_train = X_train[target]
    y_test = X_test[target]
    
    X_train = X_train.drop(columns=target)
    X_test = X_test.drop(columns=target)
    
    model = train_model(X_train, X_test, y_train)
    y_pred = model.predict(start=start_test, end=end_test)
    mse, rmse, mae, mape, r2 = score(y_pred, y_test)
    print(f'{target[5:]}: MSE={mse:.2f}, RMSE={rmse:.2f}, MAE={mae:.2f}, MAPE={mape:.2f}, R2={r2:.2f}')
    plot_pred(y_train, y_pred, y_test, target, 'Sarimax')

NameError: name 'model' is not defined