In [1]:
!pip install ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=b714dc2952da2d301a5c27856e85c81cbe1736b34e2d0e220b5da0b5d6c7ea0f
  Stored in directory: /root/.cache/pip/wheels/5f/67/4f/8a9f252836e053e532c6587a3230bc72a4deb16b03a829610b
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [2]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import warnings
import ta
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.statespace.sarimax import SARIMAX
import xgboost as xgb
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
import gc

class TimeSeriesModel:
    def __init__(self):
        self.sarimax_model = None
        self.xgb_model = None
        self.scaler = StandardScaler()
        self.best_params = None
        self.best_weight = None
        
    def _calculate_momentum(self, df):
        """Calculate momentum indicators using shifted data to prevent lookahead."""
        result = pd.DataFrame(index=df.index)
        
        # Ensure we're using shifted prices
        close = df['close']
        high = df['high']
        low = df['low']
        
        # Calculate indicators
        result['rsi'] = ta.momentum.RSIIndicator(close, window=14).rsi()
        result['macd'] = ta.trend.MACD(close, window_fast=12).macd()
        result['atr'] = ta.volatility.AverageTrueRange(high, low, close, window=14).average_true_range()
        
        return result
        
    def _calculate_volume(self, df):
        """Calculate volume-based indicators using shifted data."""
        result = pd.DataFrame(index=df.index)
        volume = df['volume']  # Already shifted in main function
        
        result['volume_ma_10'] = volume.rolling(window=10, min_periods=1).mean()
        result['volume_ratio'] = volume / result['volume_ma_10']
        
        return result
        
    def calculate_technical_indicators(self, df):
        """Calculate all technical indicators ensuring no lookahead bias."""
        df = df.copy()
        
        # First shift all base price and volume data
        base_cols = ['close', 'high', 'low', 'volume']
        df[base_cols] = df[base_cols].shift(1)
        
        # Calculate returns on shifted data
        df['returns'] = df['close'].pct_change()
        
        # Calculate technical indicators in parallel
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures = {
                executor.submit(self._calculate_momentum, df): 'momentum',
                executor.submit(self._calculate_volume, df): 'volume'
            }
            for future in futures:
                result = future.result()
                df = pd.concat([df, result], axis=1)
        
        # Add time-based features
        timestamp = pd.to_datetime(df['timestamp'], unit='s')
        df['hour'] = timestamp.dt.hour
        df['day_of_week'] = timestamp.dt.dayofweek
        
        # Calculate target (next period's price direction)
        df['target'] = df['close'].shift(-1).diff() > 0
        
        return df.fillna(0)
        
    def prepare_features(self, df):
        """Prepare and scale features, ensuring no target leakage."""
        # Define feature columns
        exog_columns = ['returns', 'volume_ratio', 'rsi', 'macd', 'atr', 'hour', 'day_of_week']
        
        # Create a copy with only the needed columns
        feature_df = df[exog_columns].copy()
        
        return self.scaler.fit_transform(feature_df) if self.sarimax_model is None else self.scaler.transform(feature_df)
        
    def tune_parameters(self, train_data):
        """Tune model parameters using a proper temporal train-validation split."""
        best_params = None
        best_weight = None
        best_score = 0
        print("Tuning parameters...")
        df = self.calculate_technical_indicators(train_data)
        
        # Ensure chronological split
        train_size = int(len(df) * 0.7)
        train_end = train_size
        train_start = max(0, train_end - 500000)  # Use last 500K rows of training set
        
        train_df = df[train_start:train_end]
        val_df = df[train_end:-1]  # -1 to exclude last row where target is NA
        
        if len(train_df) == 0:
            train_df = df[-500000:-100000]
            val_df = df[-100000:-1]
            
        print(f"Train set size: {len(train_df)}, Validation set size: {len(val_df)}")
        print(f"Validation set positive ratio: {val_df['target'].mean()}")
        
        xgb_params = {
            'learning_rate': [0.03, 0.05],
            'max_depth': [5, 6],
            'n_estimators': [200],
            'subsample': [0.9]
        }
        
        sarimax_orders = [(1,1,0), (2,1,0)]
        seasonal_orders = [(0,1,1,12)]
        weights = [0.2, 0.3, 0.4]
        
        total_combinations = (
            len(sarimax_orders) * len(seasonal_orders) * 
            len(xgb_params['learning_rate']) * len(xgb_params['max_depth']) * 
            len(xgb_params['n_estimators']) * len(xgb_params['subsample']) * len(weights)
        )
        print(f"\nStarting grid search with {total_combinations} combinations")
        
        best_score = 0
        total_tried = 0
        
        for order in sarimax_orders:
            for seasonal_order in seasonal_orders:
                print(f"\nSARIMAX order {order}, seasonal_order {seasonal_order}")
                for lr in xgb_params['learning_rate']:
                    for depth in xgb_params['max_depth']:
                        for n_est in xgb_params['n_estimators']:
                            for subsample in xgb_params['subsample']:
                                for weight in weights:
                                    try:
                                        total_tried += 1
                                        print(f"\nTrial {total_tried}/{total_combinations}")
                                        print(f"XGB: lr={lr}, depth={depth}, n_est={n_est}, subsample={subsample}, weight={weight}")
                                        
                                        # Train SARIMAX
                                        print("Training SARIMAX...")
                                        # Use last 50K points of training data for SARIMAX
                                        sarimax_train = train_df.iloc[-50000:]
                                        train_exog = self.prepare_features(sarimax_train)
                                        val_exog = self.prepare_features(val_df)
                                        
                                        sarimax = SARIMAX(
                                            sarimax_train['close'],
                                            exog=train_exog,
                                            order=order,
                                            seasonal_order=seasonal_order
                                        ).fit(disp=False, method='powell')
                                        
                                        # Train XGBoost
                                        print("Training XGBoost...")
                                        xgb_model = xgb.XGBClassifier(
                                            learning_rate=lr,
                                            max_depth=depth,
                                            n_estimators=n_est,
                                            subsample=subsample,
                                            tree_method='hist',
                                            device='cuda'
                                        )
                                        
                                        X_train = self.prepare_features(train_df)
                                        y_train = train_df['target'].astype(int)
                                        xgb_model.fit(X_train, y_train)
                                        
                                        # Make predictions
                                        sarimax_pred = sarimax.forecast(steps=len(val_df), exog=val_exog)
                                        xgb_pred = xgb_model.predict_proba(val_exog)[:, 1]
                                        
                                        final_pred = (
                                            (sarimax_pred.diff() > 0).astype(float) * weight + 
                                            xgb_pred * (1-weight) > 0.5
                                        ).astype(int)
                                        
                                        # Calculate F1 score
                                        y_val = val_df['target'].astype(int)
                                        score = f1_score(y_val, final_pred, average='macro')
                                        print(f"F1 Score: {score}")
                                        
                                        if score > best_score:
                                            best_score = score
                                            best_params = {
                                                'order': order,
                                                'seasonal_order': seasonal_order,
                                                'learning_rate': lr,
                                                'max_depth': depth,
                                                'n_estimators': n_est,
                                                'subsample': subsample
                                            }
                                            best_weight = weight
                                            print(f"New best F1: {best_score}")
                                            print(f"Best params: {best_params}")
                                            print(f"Best weight: {best_weight}")
                                            
                                    except Exception as e:
                                        print(f"Error in trial: {str(e)}")
                                        continue
        
        if best_params is None:
            raise Exception("No valid parameters found during tuning")
            
        self.best_params = best_params
        self.best_weight = best_weight
        print(f"\nTuning completed. Best F1: {best_score}")
        return best_params, best_weight
        
    def train(self, train_data):
        """Train the final model using the entire training dataset."""
        df = self.calculate_technical_indicators(train_data)
        
        if self.best_params is None:
            self.best_params, self.best_weight = self.tune_parameters(train_data)
        
        print("\nTraining final models...")
        # Use last 50K points for SARIMAX
        sarimax_df = df.iloc[-50000:-1].copy()  # Exclude last row where target is NA
        sarimax_exog = self.prepare_features(sarimax_df)
        
        self.sarimax_model = SARIMAX(
            sarimax_df['close'],
            exog=sarimax_exog,
            order=self.best_params['order'],
            seasonal_order=self.best_params['seasonal_order']
        ).fit(disp=False, method='powell')
        
        # Train XGBoost on all data
        X = self.prepare_features(df[:-1])  # Exclude last row where target is NA
        y = df[:-1]['target'].astype(int)
        
        self.xgb_model = xgb.XGBClassifier(
            learning_rate=self.best_params['learning_rate'],
            max_depth=self.best_params['max_depth'],
            n_estimators=self.best_params['n_estimators'],
            subsample=self.best_params['subsample'],
            tree_method='hist',
            device='cuda'
        )
        self.xgb_model.fit(X, y)
        gc.collect()
        
    def predict(self, future_df):
        """Make predictions for future data."""
        print("Making predictions...")
        df = self.calculate_technical_indicators(future_df)
        exog = self.prepare_features(df)
        
        sarimax_forecast = self.sarimax_model.forecast(steps=len(df), exog=exog)
        xgb_pred = self.xgb_model.predict_proba(exog)[:, 1]
        
        return (
            (sarimax_forecast.diff() > 0).astype(float) * self.best_weight + 
            xgb_pred * (1-self.best_weight) > 0.5
        ).astype(int)

def main():
    try:
        print("Loading data...")
        chunks = []
        for chunk in pd.read_csv('/kaggle/input/directional-forecasting-cryptocurrencies/train.csv',
                               usecols=['timestamp', 'close', 'high', 'low', 'volume'],
                               chunksize=500000):
            chunks.append(chunk)
        
        train_df = pd.concat(chunks)
        chunks = None
        gc.collect()
        
        test_df = pd.read_csv('/kaggle/input/directional-forecasting-cryptocurrencies/test.csv',
                             usecols=['timestamp', 'close', 'high', 'low', 'volume'])
        
        model = TimeSeriesModel()
        try:
            model.train(train_df)
        except Exception as e:
            print(f"Error during model training: {str(e)}")
            raise
            
        try:
            predictions = model.predict(test_df)
            submission = pd.DataFrame({
                'row_id': range(len(predictions)), 
                'target': predictions
            })
            submission.to_csv('submission.csv', index=False)
            print("Done!")
        except Exception as e:
            print(f"Error during prediction or saving: {str(e)}")
            raise
            
    except Exception as e:
        print(f"Fatal error in main: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Loading data...
Tuning parameters...
Train set size: 500000, Validation set size: 636731
Validation set positive ratio: 0.47318569380162107

Starting grid search with 24 combinations

SARIMAX order (1, 1, 0), seasonal_order (0, 1, 1, 12)

Trial 1/24
XGB: lr=0.03, depth=5, n_est=200, subsample=0.9, weight=0.2
Training SARIMAX...
Training XGBoost...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1 Score: 0.49338035243545053
New best F1: 0.49338035243545053
Best params: {'order': (1, 1, 0), 'seasonal_order': (0, 1, 1, 12), 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.9}
Best weight: 0.2

Trial 2/24
XGB: lr=0.03, depth=5, n_est=200, subsample=0.9, weight=0.3
Training SARIMAX...
Training XGBoost...
F1 Score: 0.49199780373933566

Trial 3/24
XGB: lr=0.03, depth=5, n_est=200, subsample=0.9, weight=0.4
Training SARIMAX...
Training XGBoost...
F1 Score: 0.4919846242185263

Trial 4/24
XGB: lr=0.03, depth=6, n_est=200, subsample=0.9, weight=0.2
Training SARIMAX...
Training XGBoost...
F1 Score: 0.4938281043367362
New best F1: 0.4938281043367362
Best params: {'order': (1, 1, 0), 'seasonal_order': (0, 1, 1, 12), 'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.9}
Best weight: 0.2

Trial 5/24
XGB: lr=0.03, depth=6, n_est=200, subsample=0.9, weight=0.3
Training SARIMAX...
Training XGBoost...
F1 Score: 0.49201074992211624

Trial 6/24
XGB