# GRU Stock Price Prediction

## Configuration
- Sequence Length: 5 consecutive article-level rows (sorted by date_index; not 5 calendar days)
- Hidden Dim: 64
- Num Layers: 2
- Dropout: 0.3
- Batch Size: 128
- Epochs: 100 (Early Stopping: patience=10)
- Sample Weighting: Enabled
- Target: Next day stock value
- Normalization: MinMaxScaler
- Aggregation: Article-level prediction → Daily average

## 0. Import Libraries

In [None]:
import os
import gc
import json
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Pin the visible GPU *before* the first CUDA query. CUDA reads
# CUDA_VISIBLE_DEVICES when it is initialised, and torch.cuda.is_available()
# can trigger that initialisation, so assigning the variable afterwards may be
# silently ignored.
# NOTE(review): index "2" presumably targets a specific card on a multi-GPU
# host; confirm for the machine this notebook runs on.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Path Configuration

In [None]:
# Input / output locations for this experiment.
#
# Google Colab alternative (uncomment to use instead of the local paths):
# DATA_DIR = Path("/content/drive/MyDrive/COSE362/data/feature_engineering")
# OUTPUT_DIR = Path("/content/drive/MyDrive/COSE362/data/prediction_output/results_gru")

# Local layout: feature datasets live one level up, outputs next to the notebook.
DATA_DIR = Path("../feature_datasets")
OUTPUT_DIR = Path("results_gru")
RESULTS_DIR = OUTPUT_DIR / "results"

# Create the output folders up front so later cells can write without checks.
for _dir in (OUTPUT_DIR, RESULTS_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

for _label, _path in (
    ("Data Source", DATA_DIR),
    ("Output Path", OUTPUT_DIR),
    ("Results Path", RESULTS_DIR),
):
    print(f"{_label}: {_path}")

## 2. Device Configuration

In [None]:
# Choose the compute device once; every later cell moves tensors to DEVICE.
_cuda_ready = torch.cuda.is_available()
DEVICE = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Using device: {DEVICE}")

if _cuda_ready:
    # Index 0 is the first *visible* device.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {_gpu_props.total_memory / 1e9:.2f} GB")

## 3. Hyperparameters

In [None]:
# GRU Hyperparameters
CONFIG = {
    'seq_length': 5,
    'hidden_dim': 64,
    'num_layers': 2,
    'dropout': 0.3,
    'batch_size': 128,
    'epochs': 100,
    'early_stopping_patience': 10,
    'learning_rate': 0.001,
}

print("GRU Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

## 4. Data Loading Functions

In [None]:
def load_data(file_path):
    """
    Read a parquet dataset and attach a datetime ``Date`` column.

    Rows are ordered by ``date_index`` when that column exists. ``Date`` is
    derived from the first available source, in this priority:

    1. ``pub_date``   - strings formatted as ``YYYY_MM_DD``
    2. ``date_str``   - strings formatted as ``YYYY_MM_DD``
    3. ``date_index`` - day offsets counted from 2017-01-01
    4. none of the above - a synthetic daily range starting 2017-01-01

    Args:
        file_path: path-like object with a ``.name`` attribute (e.g. ``Path``).

    Returns:
        The loaded DataFrame including the new ``Date`` column.
    """
    print(f"Loading {file_path.name}...")
    frame = pd.read_parquet(file_path)

    # Chronological ordering with a clean 0..n-1 index.
    has_index = 'date_index' in frame.columns
    if has_index:
        frame = frame.sort_values('date_index').reset_index(drop=True)

    # Both string-typed sources share the same 'YYYY_MM_DD' layout.
    string_source = next(
        (name for name in ('pub_date', 'date_str') if name in frame.columns),
        None,
    )

    if string_source is not None:
        frame['Date'] = pd.to_datetime(frame[string_source], format='%Y_%m_%d')
        print(f"   Using '{string_source}' for Date column")
    elif has_index:
        origin = pd.to_datetime('2017-01-01')
        frame['Date'] = origin + pd.to_timedelta(frame['date_index'], unit='D')
        print("   Using 'date_index' for Date column (base: 2017-01-01)")
    else:
        print("   [Warning] No date column found. Using default range.")
        frame['Date'] = pd.date_range(start='2017-01-01', periods=len(frame), freq='D')

    n_rows, n_cols = frame.shape
    print(f"   Loaded {n_rows} rows, {n_cols} columns")

    first_day = frame['Date'].min().strftime('%Y-%m-%d')
    last_day = frame['Date'].max().strftime('%Y-%m-%d')
    print(f"   ✅ Date range: {first_day} to {last_day}")

    return frame

## 5. Preprocessing Functions

In [None]:
def preprocess_and_split(df, target_col='value'):
    """
    Build the next-day target, per-row sample weights and a date-based
    train/valid/test split.

    Steps:
        1. Target: for every ``date_index`` the target is ``target_col`` of the
           next available ``date_index``. Rows of the last day have no target
           and are dropped.
        2. Sample weight: ``1 / (number of rows sharing the date_index)`` so
           every day contributes equally regardless of its row count.
        3. Feature matrix: the columns listed in ``cols_to_drop`` below
           (metadata, raw dates, the current price / target, ``lag_1..lag_5``,
           ``fg_value`` and the weight column) are removed, followed by any
           remaining datetime or object-typed column. Everything else is kept
           as a feature.
        4. Split on ``Date``: train <= 2018-12-31, valid 2019-01-01..2019-06-30,
           test >= 2019-07-01.

    Args:
        df: DataFrame containing at least ``date_index``, ``Date`` and
            ``target_col``.
        target_col: name of the column holding the current day's price.

    Returns:
        A 3-tuple ``(train, valid, test)`` where train and valid are
        ``(X, y, weights, date_indices)`` and test additionally carries the
        ``Date`` values as a fifth element.

    Raises:
        ValueError: if ``Date`` is missing, or the train or test split is empty.
    """
    # Fail fast: the split below cannot work without real dates.
    if 'Date' not in df.columns:
        raise ValueError("'Date' column not found in dataframe")

    # ========================================
    # 1. Target Creation (next day value)
    # ========================================
    # One price per day. De-duplicating on date_index alone guarantees a single
    # row per day, so shift(-1) always points at the *next* day and the merge
    # below can never multiply article rows.
    daily_prices = (
        df[['date_index', target_col]]
        .drop_duplicates(subset='date_index')
        .sort_values('date_index')
    )
    daily_prices['target'] = daily_prices[target_col].shift(-1)

    df = df.drop(columns=['target'], errors='ignore')
    df = df.merge(daily_prices[['date_index', 'target']], on='date_index', how='left')
    df = df.dropna(subset=['target'])

    print(f"   After target creation: {len(df)} rows")

    # ========================================
    # 2. Sample Weight Calculation
    # ========================================
    # Vectorised lookup of the per-day row count (no Python-level lambda).
    date_counts = df['date_index'].value_counts()
    df['sample_weight'] = 1.0 / df['date_index'].map(date_counts)

    print(f"   Sample weights: min={df['sample_weight'].min():.4f}, "
          f"max={df['sample_weight'].max():.4f}, mean={df['sample_weight'].mean():.4f}")

    # ========================================
    # 3. Extract Date for splitting
    # ========================================
    dates = df['Date'].copy()
    date_indices = df['date_index'].copy()
    df = df.drop(columns=['Date'])

    # ========================================
    # 4. Columns to Drop
    # ========================================
    # dict.fromkeys removes the duplicate that appears when target_col is the
    # default 'value' while keeping the original ordering.
    cols_to_drop = list(dict.fromkeys([
        # Metadata
        'person', 'person_id', 'article_id',

        # Date columns
        'pub_date', 'article_date',

        # Target related
        'value',      # Current stock price
        target_col,   # Current price under a custom name (must not leak into X)
        'target',     # Target (extracted separately)

        # Lag features (GRU learns temporal patterns itself)
        'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5',

        # Fear-Greed current value (look-ahead bias)
        'fg_value',

        # Weight (extracted separately)
        'sample_weight',
    ]))

    actual_drop = [c for c in cols_to_drop if c in df.columns]
    print(f"   Dropping columns: {actual_drop}")

    # ========================================
    # 5. Extract X, y, weights
    # ========================================
    X = df.drop(columns=actual_drop, errors='ignore')
    y = df['target'].copy()
    weights = df['sample_weight'].copy()

    # Remove datetime/object columns: the scaler and model need numeric input.
    datetime_cols = X.select_dtypes(include=['datetime64']).columns.tolist()
    if datetime_cols:
        print(f"   ⚠️ Removing datetime columns: {datetime_cols}")
        X = X.drop(columns=datetime_cols)

    object_cols = X.select_dtypes(include=['object']).columns.tolist()
    if object_cols:
        print(f"   ⚠️ WARNING: Found non-numeric columns: {object_cols}")
        print(f"   Removing them...")
        X = X.drop(columns=object_cols)

    print(f"   ✅ Feature columns ({len(X.columns)}): {list(X.columns[:10])}...")
    print(f"   ✅ All features are numeric: {X.dtypes.apply(lambda x: x.kind in 'biufc').all()}")

    # ========================================
    # 6. Train/Valid/Test Split
    # ========================================
    train_mask = (dates <= '2018-12-31')
    valid_mask = (dates >= '2019-01-01') & (dates <= '2019-06-30')
    test_mask = (dates >= '2019-07-01')

    print(f"   Train: {train_mask.sum()} rows")
    print(f"   Valid: {valid_mask.sum()} rows")
    print(f"   Test:  {test_mask.sum()} rows")

    if train_mask.sum() == 0 or test_mask.sum() == 0:
        raise ValueError("Train or Test set is empty!")

    return (
        (X[train_mask], y[train_mask], weights[train_mask], date_indices[train_mask]),
        (X[valid_mask], y[valid_mask], weights[valid_mask], date_indices[valid_mask]),
        (X[test_mask], y[test_mask], weights[test_mask], date_indices[test_mask], dates[test_mask])
    )

## 6. Weighted Time Series Dataset

In [None]:
class WeightedTimeSeriesDataset(Dataset):
    """
    Sliding-window dataset with a per-sample loss weight.

    Item ``i`` is the window of rows ``i .. i + seq_length - 1`` of ``X``
    together with the target and the weight of the window's last row.
    """

    def __init__(self, X, y, weights, seq_length):
        """
        Args:
            X: (n_samples, n_features) numpy array
            y: (n_samples,) numpy array
            weights: (n_samples,) numpy array
            seq_length: int, number of consecutive rows per window
        """
        self.X = X
        self.y = y
        self.weights = weights
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: with fewer rows than seq_length the raw expression is
        # negative, and Python's len() raises ValueError on a negative result.
        return max(0, len(self.X) - self.seq_length + 1)

    def __getitem__(self, idx):
        """
        Returns:
            X_seq: (seq_length, n_features) float32 tensor
            y_target: (1,) float32 tensor
            weight: (1,) float32 tensor

        Raises:
            IndexError: if ``idx`` does not address a complete window.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows  # standard negative indexing
        if not 0 <= idx < n_windows:
            # Without this check a bad index yields a truncated window.
            raise IndexError(f"window index {idx} out of range for {n_windows} windows")

        # Target and weight belong to the last time step of the window.
        last = idx + self.seq_length - 1
        X_seq = self.X[idx:last + 1]

        return (
            torch.tensor(X_seq, dtype=torch.float32),
            torch.tensor([self.y[last]], dtype=torch.float32),
            torch.tensor([self.weights[last]], dtype=torch.float32),
        )

## 7. GRU Model

In [None]:
class GRUModel(nn.Module):
    """
    Stacked GRU followed by a single linear read-out.

    The hidden state of the final time step is mapped to one scalar per sample.
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.3):
        """
        Args:
            input_dim: number of features per time step
            hidden_dim: size of the GRU hidden state
            num_layers: number of stacked GRU layers
            dropout: dropout between GRU layers (forced to 0 for a single layer)
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Inter-layer dropout is only meaningful with more than one layer.
        is_stacked = num_layers > 1
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if is_stacked else 0,
        )

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """
        Args:
            x: (batch, seq_length, input_dim)
        Returns:
            out: (batch, 1)
        """
        per_step_states = self.gru(x)[0]       # (batch, seq_length, hidden_dim)
        final_state = per_step_states[:, -1]   # (batch, hidden_dim)
        return self.fc(final_state)

## 8. Early Stopping

In [None]:
class EarlyStopping:
    """
    Signal that training should stop once the validation loss has failed to
    improve for ``patience`` consecutive calls.

    Attributes:
        patience: number of consecutive non-improving calls tolerated.
        min_delta: a loss counts as improved only if it is at most
            ``best_loss - min_delta``.
        counter: current run of non-improving calls.
        best_loss: lowest loss accepted so far (None before the first one).
        early_stop: becomes True once ``counter`` reaches ``patience``.
    """

    def __init__(self, patience=10, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one validation loss and update ``early_stop``."""
        # NaN is the only value unequal to itself. A NaN loss (diverged
        # training) must never be stored as the best loss: every later
        # comparison against NaN is False, which would disable early stopping.
        is_nan = val_loss != val_loss

        improved = not is_nan and (
            self.best_loss is None
            or val_loss <= self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

## 9. Training Function

In [None]:
def _load_cached_mse(output_path):
    """Recompute the daily MSE from a saved prediction file; None if unreadable."""
    try:
        with open(output_path, 'r', encoding='utf-8') as f:
            result_data = json.load(f)

        actuals = [item['actual'] for item in result_data]
        preds = [item['predicted'] for item in result_data]
        return mean_squared_error(actuals, preds)
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Corrupt, truncated or empty cache file: fall back to re-training.
        print(f"   ⚠️ Warning: Could not read cached MSE: {e}")
        return None


def _run_weighted_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the mean weighted batch loss.

    The model is trained when ``optimizer`` is given; otherwise it is only
    evaluated (eval mode, gradients disabled).
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    n_batches = 0

    with torch.set_grad_enabled(training):
        for X_batch, y_batch, w_batch in loader:
            X_batch = X_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)
            w_batch = w_batch.to(DEVICE)

            # Element-wise squared error scaled by the per-sample weight.
            losses = criterion(model(X_batch), y_batch)
            weighted_loss = (losses * w_batch).mean()

            if training:
                optimizer.zero_grad()
                weighted_loss.backward()
                optimizer.step()

            total_loss += weighted_loss.item()
            n_batches += 1

    return total_loss / n_batches


def train_gru_model(dataset_name, file_path):
    """
    Train a GRU on one dataset with sample weighting and early stopping, then
    evaluate it on the test split.

    If a prediction file for ``dataset_name`` already exists in RESULTS_DIR the
    experiment is skipped and the MSE is recomputed from that file. Otherwise
    the model is trained, the weights of the epoch with the lowest validation
    loss are restored, row-level test predictions are averaged per
    ``date_index`` and written to ``pred_gru_<dataset_name>.json``.

    Args:
        dataset_name: label used in log output and in the result file name.
        file_path: parquet file passed to ``load_data``.

    Returns:
        test_mse: float, MSE between daily actual values and daily mean
        predictions.

    Raises:
        ValueError: if any split has fewer rows than ``CONFIG['seq_length']``.
    """

    # ========================================
    # 0. Check if result already exists
    # ========================================
    output_path = RESULTS_DIR / f"pred_gru_{dataset_name}.json"

    if output_path.exists():
        print(f"\n{'='*60}")
        print(f"⏭️  SKIPPING: {dataset_name}")
        print(f"{'='*60}")
        print(f"   Result already exists: {output_path.name}")

        cached_mse = _load_cached_mse(output_path)
        if cached_mse is not None:
            print(f"   ✅ Cached Test MSE: {cached_mse:.4f}")
            return cached_mse

        print(f"   Re-running experiment...")

    # ========================================
    # 1. Load and preprocess data
    # ========================================
    print(f"\n{'='*60}")
    print(f"Processing: {dataset_name}")
    print(f"{'='*60}")

    df = load_data(file_path)

    (X_train, y_train, w_train, date_idx_train), \
    (X_valid, y_valid, w_valid, date_idx_valid), \
    (X_test, y_test, w_test, date_idx_test, dates_test) = preprocess_and_split(df)

    del df
    gc.collect()

    # ========================================
    # 2. Normalization (MinMaxScaler)
    # ========================================
    print(f"\n   Normalizing features...")

    # Both scalers are fitted on the training split only.
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()

    X_train_scaled = scaler_x.fit_transform(X_train.values).astype(np.float32)
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten().astype(np.float32)

    X_valid_scaled = scaler_x.transform(X_valid.values).astype(np.float32)
    y_valid_scaled = scaler_y.transform(y_valid.values.reshape(-1, 1)).flatten().astype(np.float32)

    X_test_scaled = scaler_x.transform(X_test.values).astype(np.float32)
    # Test targets stay unscaled: predictions are inverse-transformed instead.
    y_test_raw = y_test.values.astype(np.float32)

    w_train = w_train.values.astype(np.float32)
    w_valid = w_valid.values.astype(np.float32)
    w_test = w_test.values.astype(np.float32)

    print(f"   ✅ Features normalized to [0, 1]")

    # ========================================
    # 3. Create datasets and dataloaders
    # ========================================
    seq_length = CONFIG['seq_length']
    batch_size = CONFIG['batch_size']

    # Every split must hold at least one full window; otherwise the loaders
    # yield no batches and the loss averages below would divide by zero.
    split_rows = {
        'Train': len(X_train_scaled),
        'Valid': len(X_valid_scaled),
        'Test': len(X_test_scaled),
    }
    too_short = [name for name, rows in split_rows.items() if rows < seq_length]
    if too_short:
        raise ValueError(
            f"Not enough rows to build a sequence of length {seq_length} "
            f"in split(s): {too_short} (rows: {split_rows})"
        )

    train_dataset = WeightedTimeSeriesDataset(X_train_scaled, y_train_scaled, w_train, seq_length)
    valid_dataset = WeightedTimeSeriesDataset(X_valid_scaled, y_valid_scaled, w_valid, seq_length)
    test_dataset = WeightedTimeSeriesDataset(X_test_scaled, y_test_raw, w_test, seq_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    print(f"\n   Dataset sizes:")
    print(f"   - Train: {len(train_dataset)} samples")
    print(f"   - Valid: {len(valid_dataset)} samples")
    print(f"   - Test: {len(test_dataset)} samples")

    # ========================================
    # 4. Initialize model
    # ========================================
    input_dim = X_train_scaled.shape[1]

    model = GRUModel(
        input_dim=input_dim,
        hidden_dim=CONFIG['hidden_dim'],
        num_layers=CONFIG['num_layers'],
        dropout=CONFIG['dropout']
    ).to(DEVICE)

    # reduction='none' keeps per-sample errors so they can be weighted.
    criterion = nn.MSELoss(reduction='none')
    optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])
    early_stopping = EarlyStopping(patience=CONFIG['early_stopping_patience'])

    print(f"\n   Model initialized:")
    print(f"   - Input dim: {input_dim}")
    print(f"   - Hidden dim: {CONFIG['hidden_dim']}")
    print(f"   - Num layers: {CONFIG['num_layers']}")
    print(f"   - Dropout: {CONFIG['dropout']}")
    print(f"   - Parameters: {sum(p.numel() for p in model.parameters())}")

    # ========================================
    # 5. Training loop
    # ========================================
    print(f"\n   Training started...")

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(CONFIG['epochs']):
        train_loss = _run_weighted_epoch(model, train_loader, criterion, optimizer)
        val_loss = _run_weighted_epoch(model, valid_loader, criterion)

        # Logging
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"   Epoch [{epoch+1:3d}/{CONFIG['epochs']}] "
                  f"Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

        # Save best model: snapshot the weights, not just the loss value, so
        # the test predictions come from the best epoch rather than the last.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        # Early stopping
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"\n   ⚠️ Early stopping triggered at epoch {epoch+1}")
            break

    # best_state stays None only if no epoch produced a finite validation loss.
    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"\n   ✅ Training completed")
    print(f"   ✅ Best validation loss: {best_val_loss:.6f}")

    # ========================================
    # 6. Test prediction
    # ========================================
    print(f"\n   Predicting on test set...")

    model.eval()
    with torch.no_grad():
        all_preds = [
            model(X_batch.to(DEVICE)).cpu().numpy()
            for X_batch, _, _ in test_loader
        ]

    test_preds_scaled = np.vstack(all_preds).flatten()
    test_preds = scaler_y.inverse_transform(test_preds_scaled.reshape(-1, 1)).flatten()

    # Align test data (first seq_length-1 samples are lost)
    y_test_aligned = y_test_raw[seq_length-1:]
    date_idx_test_aligned = date_idx_test.values[seq_length-1:]
    dates_test_aligned = dates_test.values[seq_length-1:]

    # ========================================
    # 7. Daily aggregation (article-level → daily)
    # ========================================
    results_df = pd.DataFrame({
        'date_index': date_idx_test_aligned,
        'date': dates_test_aligned,
        'actual': y_test_aligned,
        'predicted': test_preds
    })

    # Aggregate by date
    daily_results = results_df.groupby('date_index').agg({
        'date': 'first',
        'actual': 'first',  # Same value for all articles on same day
        'predicted': 'mean'  # Average predictions
    }).reset_index()

    # Calculate MSE
    test_mse = mean_squared_error(daily_results['actual'], daily_results['predicted'])

    print(f"\n   ✅ Test MSE (Daily): {test_mse:.4f}")

    # ========================================
    # 8. Save results
    # ========================================
    day_labels = pd.to_datetime(daily_results['date']).dt.strftime('%Y-%m-%d')
    result_data = [
        {"date": day, "actual": float(actual), "predicted": float(predicted)}
        for day, actual, predicted in zip(
            day_labels, daily_results['actual'], daily_results['predicted']
        )
    ]

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result_data, f, indent=4)

    print(f"   Saved predictions to: results/{output_path.name}")

    # Memory cleanup
    del model, optimizer, train_loader, valid_loader, test_loader
    del X_train_scaled, X_valid_scaled, X_test_scaled
    del best_state
    torch.cuda.empty_cache()
    gc.collect()

    return test_mse

## 10. Main Execution Loop

In [None]:
# ========================================
# Main Execution
# ========================================
import traceback
from itertools import product

levels = ['B', 'C', 'D']
methods = ['headlines', 'chunking', 'bodyText', 'paragraphs']
types = ['pca', 'orig']

# Column order of the final metrics table; also lets an empty run produce a
# well-formed (header-only) CSV instead of crashing.
metric_columns = ["Feature_set", "Embeddings", "Dim_reduction", "Model", "MSE"]
metrics_list = []


def _run_and_record(dataset_name, file_path, feature_set, embeddings, dim_reduction, error_prefix):
    """Train one dataset and append its metrics row; log the error and continue on failure."""
    try:
        mse = train_gru_model(dataset_name, file_path)
        metrics_list.append({
            "Feature_set": feature_set,
            "Embeddings": embeddings,
            "Dim_reduction": dim_reduction,
            "Model": "GRU",
            "MSE": mse
        })
    except Exception as e:
        # Top-level boundary: one failing dataset must not abort the sweep.
        print(f"{error_prefix}: {e}")
        traceback.print_exc()
    finally:
        gc.collect()
        torch.cuda.empty_cache()


# ========================================
# Dataset A (Baseline)
# ========================================
path_A = DATA_DIR / "dataset_A.parquet"
if path_A.exists():
    print("\n" + "="*60)
    print("BASELINE: Dataset A")
    print("="*60)

    _run_and_record("A", path_A, "A", "-", "-", "❌ Error on Dataset A")
else:
    print(f"Warning: {path_A} not found. Skipping Dataset A.")

# ========================================
# Datasets B, C, D
# ========================================
total_datasets = len(levels) * len(methods) * len(types)

for current, (level, method, dtype) in enumerate(product(levels, methods, types), start=1):
    fname = f"dataset_{level}_{method}_{dtype}.parquet"
    fpath = DATA_DIR / fname

    if not fpath.exists():
        print(f"\n[{current}/{total_datasets}] Skipping {fname}: File not found.")
        continue

    dname = f"{level}_{method}_{dtype}"
    _run_and_record(dname, fpath, level, method, dtype, f"\n❌ Error on {dname}")

# ========================================
# Save final results
# ========================================
# The metric column is named "MSE"; sorting by "Test_MSE" raised KeyError.
final_df = pd.DataFrame(metrics_list, columns=metric_columns).sort_values("MSE")
csv_path = OUTPUT_DIR / "gru_evaluation_metrics.csv"
final_df.to_csv(csv_path, index=False)

print("\n" + "="*60)
print("ALL TASKS COMPLETED")
print("="*60)
print(f"\nResults saved to: {csv_path}")
print(f"\nTop 10 Models by Test MSE:")
print(final_df.head(10))

# Total runtime: roughly 51 minutes