In [1]:
!pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=39df4c0dd33ac3f32ffa082aca106a9f74d412d8087a191926665d5e8cb01413
  Stored in directory: /root/.cache/pip/wheels/5f/67/4f/8a9f252836e053e532c6587a3230bc72a4deb16b03a829610b
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
import ta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')

In [3]:
# Load the training rows and derive a human-readable datetime from the
# epoch-seconds `timestamp` column.
train_df = (
    pd.read_csv('/kaggle/input/directional-forecasting-cryptocurrencies/train.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)

# Report the time span covered by the training rows.
print(
    "Training data date range:",
    f"Start: {train_df['datetime'].min()}",
    f"End: {train_df['datetime'].max()}",
    f"Total days: {(train_df['datetime'].max() - train_df['datetime'].min()).days}",
    sep="\n",
)

Training data date range:
Start: 2018-05-04 22:01:00
End: 2022-05-17 19:58:00
Total days: 1473


In [4]:
# (rows, columns) of the training frame; the column count includes the
# derived 'datetime' column added when the frame was loaded.
train_df.shape

(2122438, 12)

In [5]:
# Load the test rows and derive a human-readable datetime from the
# epoch-seconds `timestamp` column.
test_df = (
    pd.read_csv('/kaggle/input/directional-forecasting-cryptocurrencies/test.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)

# Report the time span covered by the test rows.
print(
    "Test data date range:",
    f"Start: {test_df['datetime'].min()}",
    f"End: {test_df['datetime'].max()}",
    f"Total days: {(test_df['datetime'].max() - test_df['datetime'].min()).days}",
    sep="\n",
)

Test data date range:
Start: 2022-05-17 19:58:00
End: 2024-02-08 12:14:00
Total days: 631


In [6]:
# (rows, columns) of the test frame; the column count includes the
# derived 'datetime' column added when the frame was loaded.
test_df.shape

(909617, 12)

In [7]:
import os
import pandas as pd
import numpy as np
import warnings
import ta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from statsmodels.tsa.statespace.sarimax import SARIMAX

warnings.filterwarnings('ignore')

def calculate_macro_f1(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Macro-averaged F1 over the two classes 0 and 1.

    Each class is scored in turn as the positive label with a binary F1,
    and the two scores are averaged with equal weight.
    """
    f1_negative, f1_positive = (
        f1_score(y_true, y_pred, pos_label=label, average='binary')
        for label in (0, 1)
    )
    return (f1_negative + f1_positive) / 2

class TimeSeriesModel:
    """
    SARIMAX-based directional forecaster driven by lagged technical indicators.

    ``train`` builds the indicator features, fits the scaler and fits an AR(1)
    SARIMAX on ``close`` with the scaled features as exogenous regressors.
    ``predict`` rebuilds the same features for new rows, forecasts ``close``
    and converts the forecast path into 0/1 direction labels.
    """

    # Exogenous regressors handed to SARIMAX, in the order the scaler sees them.
    EXOG_COLUMNS = [
        'returns', 'rsi', 'macd',
        'rolling_mean_5', 'volume_ma_5', 'price_range',
        'hour', 'day_of_week'
    ]

    def __init__(self):
        # Fitted SARIMAX results object; stays None until train() succeeds.
        self.model = None
        # Standardises the exogenous matrix; fitted on the training features.
        self.scaler = StandardScaler()

    def calculate_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the indicator and calendar features on a time-ordered copy of ``df``.

        Expects columns: ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        at a minimum. The input frame is not modified.

        Returns:
            A new frame sorted by ``timestamp`` (stable sort, fresh 0..n-1 index)
            with the added columns 'returns', 'rsi', 'macd', 'rolling_mean_5',
            'volume_ma_5', 'price_range', 'hour' and 'day_of_week'. Every column
            except 'timestamp', 'target', 'hour' and 'day_of_week' is lagged by
            one row, so the leading rows contain NaN.

        Raises:
            ValueError: if any required column is missing.
        """
        required_cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Order the rows BEFORE any shift/rolling computation: those operations
        # are positional, so computing them on unsorted rows and sorting
        # afterwards would attach the wrong "previous" values. A stable sort
        # keeps rows with equal timestamps in their input order. sort_values
        # returns a new frame, so the caller's data is left untouched.
        df = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

        # One-period simple return. A zero previous close would give +/-inf,
        # which is mapped to NaN and handled by the later NaN filling.
        prev_close = df['close'].shift(1)
        df['returns'] = (df['close'] / prev_close - 1).replace([np.inf, -np.inf], np.nan)
        df['rsi'] = ta.momentum.RSIIndicator(df['close']).rsi()
        df['macd'] = ta.trend.MACD(df['close']).macd()

        df['rolling_mean_5'] = df['close'].rolling(window=5, min_periods=1).mean()
        df['volume_ma_5'] = df['volume'].rolling(window=5, min_periods=1).mean()

        df['price_range'] = (df['high'] - df['low']) / df['open']

        # Time features (timestamp is interpreted as epoch seconds).
        when = pd.to_datetime(df['timestamp'], unit='s')
        df['hour'] = when.dt.hour
        df['day_of_week'] = when.dt.dayofweek

        # Lag every column except timestamp, target and the calendar features
        # by one row, so a row only carries values from the preceding row
        # (avoids label leakage).
        excluded_cols = ['timestamp', 'target', 'hour', 'day_of_week']
        feature_cols = [c for c in df.columns if c not in excluded_cols]
        df[feature_cols] = df[feature_cols].shift(1)

        return df

    def prepare_features(self, df: pd.DataFrame, fit: "bool | None" = None) -> np.ndarray:
        """
        Select the exogenous columns from ``df`` and standardise them.

        Args:
            df: frame holding (some of) ``EXOG_COLUMNS``. It is not modified;
                columns that are absent are treated as all-zero.
            fit: True to (re)fit the scaler on these rows, False to reuse the
                already fitted scaler. None keeps the historical rule: fit
                only while no model has been trained yet.

        Returns:
            2-D array with one column per entry of ``EXOG_COLUMNS``.
        """
        # reindex builds a new frame (missing columns become NaN) instead of
        # writing placeholder columns into the caller's frame. Infinite values
        # (e.g. from a zero 'open' in price_range) are treated as missing
        # because the scaler cannot handle them.
        exog = (
            df.reindex(columns=self.EXOG_COLUMNS)
            .replace([np.inf, -np.inf], np.nan)
            .fillna(0)
        )

        if fit is None:
            fit = self.model is None

        if fit:
            return self.scaler.fit_transform(exog)
        return self.scaler.transform(exog)

    def train(self, train_data: pd.DataFrame) -> None:
        """
        Train SARIMAX on the entire training set (no chunking).

        Raises:
            ValueError: if ``train_data`` is empty or lacks required columns.
        """
        if len(train_data) == 0:
            raise ValueError("train_data is empty, cannot train.")

        # 1) Calculate indicators
        df = self.calculate_technical_indicators(train_data)

        # 2) Fill NaNs instead of dropping rows. ffill()/bfill() replace the
        #    deprecated fillna(method=...) spelling with identical results.
        df = df.ffill().bfill()

        # 3) Check if DataFrame is still empty
        if df.empty:
            raise ValueError("No valid rows after fillna. Possibly all data was NaN.")

        # 4) Prepare exogenous features. Always refit the scaler here so that
        #    calling train() again does not reuse statistics from an older fit.
        exog = self.prepare_features(df, fit=True)

        # 5) Train the SARIMAX model
        self.model = SARIMAX(
            df['close'],
            exog=exog,
            order=(1, 0, 0),          # Simple AR(1) example
            enforce_stationarity=False
        ).fit(disp=False)

    def predict(self, future_df: pd.DataFrame) -> np.ndarray:
        """
        Predict a 0/1 direction label for every row of ``future_df``.

        The model forecasts ``len(future_df)`` steps of ``close``. A row is
        labelled 1 when its forecast exceeds the previous row's forecast; the
        first (earliest) row is compared against the first available close.

        Returns:
            Integer array of 0/1 labels aligned with the row order of
            ``future_df`` (empty array for empty input).

        Raises:
            ValueError: if the model has not been trained, or required columns
                are missing.
        """
        if self.model is None:
            raise ValueError("Model needs to be trained first.")

        if len(future_df) == 0:
            print("Warning: future_df is empty. Returning empty predictions.")
            return np.array([])

        # 1) Calculate indicators (rows come back sorted by timestamp)
        df = self.calculate_technical_indicators(future_df)

        # 2) Fill NaNs instead of dropping rows
        df = df.ffill().bfill()

        if df.empty:
            print("Warning: No valid rows after fillna in predict. Returning empty.")
            return np.array([])

        # 3) Prepare exogenous features with the scaler fitted during train()
        exog = self.prepare_features(df, fit=False)

        # 4) Forecast. Re-wrap in a positional Series so the logic below works
        #    whatever container/index the results object returns.
        raw_forecast = self.model.forecast(steps=len(df), exog=exog)
        forecast = pd.Series(np.asarray(raw_forecast, dtype=float))

        # 5) Compute directional predictions
        predictions = (forecast.diff() > 0).astype(int)

        # The first row has no previous forecast (diff is NaN), so compare it
        # with the first close instead. df is non-empty at this point.
        predictions.iloc[0] = int(forecast.iloc[0] > df['close'].iloc[0])

        # 6) calculate_technical_indicators sorted the rows with a stable sort;
        #    the same stable argsort recovers that permutation so each label is
        #    written back to the position of its input row.
        order = np.argsort(future_df['timestamp'].to_numpy(), kind='stable')
        labels = predictions.to_numpy()
        aligned = np.empty(len(labels), dtype=labels.dtype)
        aligned[order] = labels

        return aligned

def main(
    train_path: str = '/kaggle/input/directional-forecasting-cryptocurrencies/train.csv',
    test_path: str = '/kaggle/input/directional-forecasting-cryptocurrencies/test.csv',
    output_path: str = 'submission.csv',
) -> None:
    """
    Train on the full training CSV, predict the full test CSV, write a submission.

    Args:
        train_path: CSV used to fit the model.
        test_path: CSV to generate predictions for.
        output_path: where the submission CSV (columns 'row_id', 'target')
            is written.

    Raises:
        Exception: any failure while loading, training, predicting or writing
            is logged and then re-raised, so a failed run is never mistaken
            for a successful one.
    """
    try:
        # 1) Load train & test data
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        print("Training data shape:", train_df.shape)
        print("Test data shape:", test_df.shape)

        # 2) Train on the entire training dataset
        model = TimeSeriesModel()
        model.train(train_df)

        # 3) Predict on the entire test dataset
        print("\nPredicting on the entire test set...")
        predictions = model.predict(test_df)

        # 4) Check if length matches
        if len(predictions) != len(test_df):
            print(f"Warning: predictions length ({len(predictions)}) "
                  f"does not match test_df length ({len(test_df)})")

        # 5) Save the submission; row_id is the positional index of each prediction.
        pd.DataFrame({
            'row_id': range(len(predictions)),
            'target': predictions
        }).to_csv(output_path, index=False)

        print(f"\nSubmission file saved to {output_path}")
        print(f"Total predictions made: {len(predictions)}")

    except Exception as e:
        # Keep the short message, but do not swallow the error: without the
        # re-raise the cell would finish "successfully" with no submission
        # file and no traceback.
        print(f"Error in main: {e}")
        raise

# Run the full train -> predict -> submission pipeline when this cell/script
# is executed directly (skipped when the code is imported as a module).
if __name__ == "__main__":
    main()


Training data shape: (2122438, 11)
Test data shape: (909617, 11)

Predicting on the entire test set...

Submission file saved to submission.csv
Total predictions made: 909617


In [None]:
# NOTE(review): unexplained literal in an unexecuted cell — presumably the
# score obtained by this submission; confirm and record it in a markdown note.
0.49092