In [None]:
"""
# 🧪 LSTM Forecasting Model for News Engagement Prediction
AI Course - NLP Track  
This notebook implements the main Bidirectional LSTM model to forecast 
users' future news engagement across 7 political stances.
"""

In [None]:
import json
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ParameterGrid
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [None]:
# -------------------- Configuration & Hyperparameters --------------------
class Config:
    """Single source of truth for every knob the notebook reads.

    All values are plain class attributes, so they can be read either from
    the class itself or from the module-level ``config`` instance.
    """

    # ---- Data ----
    SEQ_LENGTH = 8            # input window: 8 quarters = 2 years
    NUM_STANCES = 7           # political stances from -3 to +3
    SAMPLE_FRACTION = 0.05    # share of the records loaded for quick runs

    # ---- Model architecture ----
    HIDDEN_DIM = 128
    NUM_LAYERS = 2
    DROPOUT = 0.3
    BIDIRECTIONAL = True

    # ---- Training ----
    BATCH_SIZE = 32
    LEARNING_RATE = 0.001
    NUM_EPOCHS = 30
    PATIENCE = 5              # epochs without improvement before early stop
    RANDOM_SEED = 42

    # ---- Device: use the GPU when one is visible, otherwise the CPU ----
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


config = Config()

In [None]:
# Set random seeds for reproducibility
# Seed every random number generator the notebook touches with the same value.
seed = config.RANDOM_SEED
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# GPU runs additionally need the CUDA generators seeded and cuDNN pinned to
# deterministic kernels.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

print(f"üìå Device: {config.DEVICE}")
print(f"üìå Sample fraction: {config.SAMPLE_FRACTION*100:.0f}%")

In [None]:
# -------------------- 1. Load Data with Random Sampling --------------------
DATA_PATH = "../data/icwsm-2024-forecasting-data-anon.json"

print(f"üìÇ Loading {config.SAMPLE_FRACTION*100:.0f}% random sample from {DATA_PATH}...")

# The whole JSON document is read into memory; sampling happens afterwards.
with open(DATA_PATH, 'r', encoding='utf-8') as json_file:
    raw_data = json.load(json_file)

print(f"üìä Total records in full dataset: {len(raw_data):,}")

# Draw a random subset of record keys (records are sampled, not users).
all_keys = list(raw_data.keys())
n_sampled = int(len(all_keys) * config.SAMPLE_FRACTION)
sampled_keys = random.sample(all_keys, n_sampled)

# Keep only the three fields the rest of the notebook uses.
records = [
    {
        'user_id': value['user_id_anonymized'],
        'timestamp': pd.to_datetime(value['created_at']),
        'stances': value['partisan stance'],
    }
    for value in (raw_data[key] for key in tqdm(sampled_keys, desc="Loading records"))
]

# One row per sampled record, plus the calendar quarter it falls in.
df = pd.DataFrame(records).assign(
    quarter=lambda frame: frame['timestamp'].dt.to_period('Q')
)
print(f"‚úÖ Data loaded. Total records in sample: {len(df):,}")
print(f"‚úÖ Unique users: {df['user_id'].nunique():,}")

In [None]:
# -------------------- 2. Build Time Series Sequences --------------------
def build_sequences(df, seq_length=8, fill_gaps=True):
    """
    Convert user engagements into (X, y) sliding-window sequences.

    For every user, engagements are counted per quarter and per stance. Each
    run of ``seq_length`` consecutive quarters becomes one input sample and
    the quarter that follows it becomes the target.

    Args:
        df: DataFrame with columns 'user_id', 'quarter' and 'stances'.
            'stances' holds, per row, an iterable of stance values; each is
            converted with ``int()`` and values outside -3..3 are ignored.
            When ``fill_gaps`` is True, 'quarter' must hold pandas Period
            values (as produced by ``.dt.to_period('Q')``).
        seq_length: Number of quarters in each input window.
        fill_gaps: If True (default), quarters in which a user has no rows,
            between their first and last observed quarter, are inserted as
            all-zero counts so that every window covers truly consecutive
            quarters. If False, only observed quarters are used and a window
            may silently span a gap (the previous behaviour).

    Returns:
        X: float32 array [n_samples, seq_length, 7] - input quarters
        y: float32 array [n_samples, 7] - target quarter
        Both arrays keep these trailing dimensions even when n_samples is 0.
    """
    n_stances = 7
    stance_offset = 3  # Convert -3..3 to 0..6

    sequences = []
    labels = []
    users_with_sequences = 0

    for user_id, user_df in tqdm(df.groupby('user_id'), desc="Building sequences"):
        # Count engagements per observed quarter. groupby yields the quarters
        # in sorted order, so no explicit sort of the rows is needed.
        counts_by_quarter = {}
        for quarter, quarter_df in user_df.groupby('quarter'):
            counts = np.zeros(n_stances, dtype=np.float32)
            for stances in quarter_df['stances']:
                for stance in stances:
                    idx = int(stance) + stance_offset
                    if 0 <= idx < n_stances:
                        counts[idx] += 1
            counts_by_quarter[quarter] = counts

        if not counts_by_quarter:
            continue

        # Lay the counts out on the user's timeline. With fill_gaps the
        # timeline is every quarter from first to last activity; quarters
        # without rows become zero vectors instead of being skipped.
        if fill_gaps:
            timeline = pd.period_range(
                start=min(counts_by_quarter), end=max(counts_by_quarter)
            )
        else:
            timeline = list(counts_by_quarter)
        quarterly_counts = [
            counts_by_quarter[q] if q in counts_by_quarter
            else np.zeros(n_stances, dtype=np.float32)
            for q in timeline
        ]

        # Create sliding windows
        if len(quarterly_counts) >= seq_length + 1:
            users_with_sequences += 1
            for i in range(len(quarterly_counts) - seq_length):
                sequences.append(quarterly_counts[i:i + seq_length])
                labels.append(quarterly_counts[i + seq_length])

    print(f"‚úÖ Users with sequences: {users_with_sequences}")

    # np.array([]) would collapse to shape (0,); keep the documented rank so
    # downstream shape checks and tensor conversion still work.
    if not sequences:
        return (
            np.zeros((0, seq_length, n_stances), dtype=np.float32),
            np.zeros((0, n_stances), dtype=np.float32),
        )
    return np.array(sequences, dtype=np.float32), np.array(labels, dtype=np.float32)

# Turn the sampled engagement records into model inputs and targets.
sequences, labels = build_sequences(df, config.SEQ_LENGTH)

# Fail here with a clear message rather than later inside the training loop,
# where an empty data loader surfaces as an unrelated ZeroDivisionError.
if len(sequences) == 0:
    raise ValueError(
        "No sequences were created: no user in the sample has at least "
        f"{config.SEQ_LENGTH + 1} quarters of data. "
        "Increase Config.SAMPLE_FRACTION or lower Config.SEQ_LENGTH."
    )
assert len(sequences) == len(labels), "every input window needs exactly one target"

print(f"‚úÖ Total sequences created: {len(sequences):,}")
print(f"   Sequence shape: {sequences.shape}")
print(f"   Label shape: {labels.shape}")

In [None]:
# -------------------- 3. Train/Validation Split --------------------
# First 80% of the sequences (in the order they were built) go to training,
# the remainder to validation. No shuffling is applied.
split = int(0.8 * len(sequences))
X_train, X_val = np.split(sequences, [split])
y_train, y_val = np.split(labels, [split])

print(f"\nüìä Training set size: {len(X_train):,}")
print(f"üìä Validation set size: {len(X_val):,}")

In [None]:
# -------------------- 4. Define LSTM Model --------------------
class NewsForecaster(nn.Module):
    """LSTM encoder (bidirectional when configured) followed by a small MLP head.

    The encoder reads a sequence of per-quarter stance counts; the final
    hidden state of its last layer is used as a fixed-size embedding, which
    the head maps to one predicted value per stance.
    """

    def __init__(self, config):
        """Build the network from ``config``.

        Reads NUM_STANCES, HIDDEN_DIM, NUM_LAYERS, DROPOUT and BIDIRECTIONAL.
        """
        super().__init__()

        # Dropout between stacked LSTM layers is only applied when there is
        # more than one layer.
        if config.NUM_LAYERS > 1:
            recurrent_dropout = config.DROPOUT
        else:
            recurrent_dropout = 0

        self.lstm = nn.LSTM(
            input_size=config.NUM_STANCES,
            hidden_size=config.HIDDEN_DIM,
            num_layers=config.NUM_LAYERS,
            batch_first=True,
            dropout=recurrent_dropout,
            bidirectional=config.BIDIRECTIONAL,
        )

        # A bidirectional encoder yields two hidden vectors per layer.
        num_directions = 2 if config.BIDIRECTIONAL else 1
        encoder_width = config.HIDDEN_DIM * num_directions

        head_layers = [
            nn.Linear(encoder_width, 64),
            nn.ReLU(),
            nn.Dropout(config.DROPOUT),
            nn.Linear(64, config.NUM_STANCES),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x, return_embedding=False):
        """
        Args:
            x: Input tensor [batch_size, seq_len, NUM_STANCES]
            return_embedding: If True, also return the encoder embedding

        Returns:
            output: Prediction [batch_size, NUM_STANCES] when
                return_embedding is False (the default).
            (embedding, output): when return_embedding is True, where
                embedding is [batch_size, HIDDEN_DIM * num_directions].
        """
        _, (final_hidden, _) = self.lstm(x)

        if not self.lstm.bidirectional:
            embedding = final_hidden[-1]
        else:
            # The last two entries are the top layer's forward and backward
            # final states; join them feature-wise.
            embedding = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)

        output = self.fc(embedding)
        return (embedding, output) if return_embedding else output

In [None]:
# -------------------- 5. Training Function --------------------
def train_model(config, X_train, y_train, X_val, y_val, checkpoint_path="best_model.pt"):
    """Train the LSTM model with early stopping.

    Args:
        config: Object providing BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS,
            PATIENCE and DEVICE, plus the fields NewsForecaster reads.
        X_train, y_train: Training inputs and targets (array-likes accepted
            by ``torch.FloatTensor``).
        X_val, y_val: Validation inputs and targets.
        checkpoint_path: File the best-so-far weights are written to and
            restored from. Defaults to "best_model.pt" in the working dir.

    Returns:
        (model, train_losses, val_losses): the model carrying the weights of
        the epoch with the lowest validation loss, and the per-epoch mean
        L1 losses on the training and validation sets.

    Raises:
        ValueError: If the training or validation set is empty.
    """
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError(
            "train_model needs non-empty training and validation sets "
            f"(got {len(X_train)} train / {len(X_val)} val samples)."
        )

    # Convert to tensors
    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))

    train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

    # Initialize model
    model = NewsForecaster(config).to(config.DEVICE)
    criterion = nn.L1Loss()  # MAE loss
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)

    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    patience_counter = 0
    checkpoint_written = False

    for epoch in range(config.NUM_EPOCHS):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(config.DEVICE), batch_y.to(config.DEVICE)

            optimizer.zero_grad()
            pred = model(batch_X)
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer.step()

            # Weight each batch mean by its size so a short final batch does
            # not count as much as a full one.
            train_loss += loss.item() * batch_X.size(0)
        train_loss /= len(train_dataset)
        train_losses.append(train_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(config.DEVICE), batch_y.to(config.DEVICE)
                pred = model(batch_X)
                loss = criterion(pred, batch_y)
                val_loss += loss.item() * batch_X.size(0)
        val_loss /= len(val_dataset)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1:2d}/{config.NUM_EPOCHS} | "
              f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
        else:
            patience_counter += 1
            if patience_counter >= config.PATIENCE:
                print(f"‚ö†Ô∏è Early stopping triggered at epoch {epoch+1}")
                break

    # Load best model. Only restore a checkpoint written during this call:
    # if the loss never improved (e.g. it was NaN throughout), a file at
    # checkpoint_path would be left over from an earlier run.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=config.DEVICE))
    else:
        print("Warning: validation loss never improved; "
              "returning the weights from the last epoch.")
    return model, train_losses, val_losses


In [None]:
# -------------------- 6. Train Model --------------------
print("\nüöÄ Starting LSTM model training...")
# Fit on the training split; the validation split drives early stopping.
model, train_losses, val_losses = train_model(
    config=config,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

In [None]:
# -------------------- 7. Learning Curves --------------------
# Per-epoch training and validation loss on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 5))
for curve, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
    ax.plot(curve, label=curve_label, linewidth=2)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('MAE Loss', fontsize=12)
ax.set_title('üìâ Learning Curves - LSTM Forecaster', fontsize=16)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
# -------------------- 8. Evaluation --------------------
# Switch off dropout before predicting.
model.eval()
val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False)

# Collect predictions and targets batch by batch as NumPy arrays.
pred_chunks, label_chunks = [], []
with torch.no_grad():
    for features, targets in val_loader:
        batch_pred = model(features.to(config.DEVICE))
        pred_chunks.append(batch_pred.cpu().numpy())
        label_chunks.append(targets.numpy())

all_preds = np.vstack(pred_chunks)
all_labels = np.vstack(label_chunks)

# Mean absolute error for each stance column, then averaged over stances.
mae_per_stance = np.abs(all_preds - all_labels).mean(axis=0)
avg_mae = mae_per_stance.mean()

banner = "=" * 60
print("\n" + banner)
print("üìä MODEL EVALUATION ON VALIDATION SET")
print(banner)
stances = [-3, -2, -1, 0, 1, 2, 3]
for stance, stance_mae in zip(stances, mae_per_stance):
    print(f"  Stance {stance:2d}: MAE = {stance_mae:.4f}")
print(f"\n‚úÖ Average MAE: {avg_mae:.4f}")

In [None]:
# -------------------- 9. Baseline Comparison (Last Value) --------------------
def _format_improvement(baseline, candidate, decimals=1):
    """Relative improvement of ``candidate`` over ``baseline`` as a percent string.

    Returns "n/a" when the baseline error is exactly zero, because the ratio
    is undefined there and would otherwise print as inf% or nan%.
    """
    if baseline == 0:
        return "n/a"
    return f"{(baseline - candidate) / baseline * 100:.{decimals}f}%"


# Naive forecast: repeat the last quarter of each input window.
baseline_preds = X_val[:, -1, :]  # Last observed quarter
baseline_mae = np.mean(np.abs(baseline_preds - y_val), axis=0)
baseline_avg_mae = np.mean(baseline_mae)

print("\n" + "="*60)
print("üìä BASELINE COMPARISON (Last Value)")
print("="*60)
for i, stance in enumerate(stances):
    improvement = _format_improvement(baseline_mae[i], mae_per_stance[i])
    print(f"  Stance {stance:2d}: Baseline MAE = {baseline_mae[i]:.4f} | "
          f"LSTM MAE = {mae_per_stance[i]:.4f} | "
          f"Improvement = {improvement}")

print(f"\n‚úÖ Baseline Average MAE: {baseline_avg_mae:.4f}")
print(f"‚úÖ LSTM Average MAE: {avg_mae:.4f}")
print(f"üìà Overall Improvement: {_format_improvement(baseline_avg_mae, avg_mae, decimals=2)}")


In [None]:
# -------------------- 10. Scatter Plot: True vs Predicted --------------------
# One panel per stance on a 2x4 grid; the eighth panel stays unused.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

# Transposing gives one row per stance column of the label/prediction arrays.
for ax, stance, true_col, pred_col in zip(axes, stances, all_labels.T, all_preds.T):
    ax.scatter(true_col, pred_col, alpha=0.5, s=10, color='purple')

    # Identity line from the origin to the largest value in the panel
    upper = max(true_col.max(), pred_col.max())
    ax.plot([0, upper], [0, upper], 'r--', linewidth=1, label='Perfect Prediction')

    ax.set_xlabel('True Values', fontsize=10)
    ax.set_ylabel('Predicted Values', fontsize=10)
    ax.set_title(f'Stance {stance}', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.legend(fontsize=8)

# Hide the leftover panel.
axes[-1].axis('off')
fig.suptitle('üéØ True vs Predicted Engagement Counts', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# -------------------- 11. Error Distribution --------------------
# Signed errors (prediction minus truth), pooled over all stances.
errors = all_preds - all_labels

fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(errors.ravel(), bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_xlabel('Prediction Error', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('üìä Distribution of Prediction Errors', fontsize=16)
# Reference line at zero error
ax.axvline(0, color='red', linestyle='--', linewidth=2)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Summary statistics; the printed interval is the 2.5th-97.5th percentile
# range of the errors.
lower_bound, upper_bound = np.percentile(errors, [2.5, 97.5])
print(f"\nüìä Error Statistics:")
print(f"   Mean Error: {errors.mean():.4f}")
print(f"   Std Error: {errors.std():.4f}")
print(f"   95% CI: [{lower_bound:.4f}, {upper_bound:.4f}]")


In [None]:
# -------------------- 12. Save Model --------------------
# Single definition of the output location, shared by the save call and the
# status message so the two cannot drift apart. `os` is imported with the
# other modules in the import cell at the top of the notebook.
MODEL_PATH = '../models_saved/lstm_forecaster.pt'

os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
torch.save(model.state_dict(), MODEL_PATH)
print(f"\nüíæ Model saved to {MODEL_PATH}")


In [None]:
# Final status line; when the notebook is run top to bottom, it prints only
# after every earlier cell has finished without raising.
print("\n‚úÖ LSTM forecasting experiments completed successfully.")