# LSTM Business Sequence Prediction Training

This notebook trains an LSTM model to predict the next business a user will visit.

## Model Architecture
- **Input**: Sequence of business IDs (max length 20)
- **Embedding**: 128-dimensional embeddings for 20,002 businesses
- **LSTM**: 2 layers with 256 hidden units and 0.3 dropout
- **Output**: Logits over 20,002 businesses (softmax is applied by the loss during training and explicitly when generating predictions)

## Data Split Strategy
- **Train/Val**: Non-Atlanta users only (zero data leakage)
- **Test**: Atlanta users only (final inference set)

## GPU Configuration
- Automatically detects and uses a CUDA GPU if one is available; otherwise falls back to the CPU

## 1. Setup & GPU Check

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import polars as pl
import json
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt

# Seed NumPy and PyTorch so repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

if not torch.cuda.is_available():
    print('⚠️  No GPU detected. Training will be slow on CPU.')
else:
    # Report name, total memory (in GiB) and CUDA version for GPU 0.
    print(
        f'GPU: {torch.cuda.get_device_name(0)}',
        f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB',
        f'CUDA Version: {torch.version.cuda}',
        sep='\n',
    )

## 2. Data Loading

In [None]:
# Directory that holds the preprocessed LSTM inputs.
DATA_DIR = Path('../data/processed/ga/lstm_data')

# Parse the vocabulary JSON; it is indexed by token string below, so it is
# presumably a token -> index mapping (verify against the preprocessing step).
with open(DATA_DIR / 'business_vocab.json', 'r') as f:
    vocab = json.loads(f.read())

# Indices of the special tokens, plus the number of vocabulary entries.
pad_idx = vocab['<PAD>']
unk_idx = vocab['<UNK>']
vocab_size = len(vocab)

print(
    f'Vocabulary size: {vocab_size:,}',
    f'PAD index: {pad_idx}',
    f'UNK index: {unk_idx}',
    sep='\n',
)

In [None]:
# Read the three pre-split parquet files from DATA_DIR.
print('Loading data...')
train_df = pl.read_parquet(DATA_DIR.joinpath('business_train.parquet'))
val_df = pl.read_parquet(DATA_DIR.joinpath('business_val.parquet'))
test_df = pl.read_parquet(DATA_DIR.joinpath('business_test.parquet'))

# Report the number of rows in each split.
print(
    f'Train: {len(train_df):,} examples',
    f'Val:   {len(val_df):,} examples',
    f'Test:  {len(test_df):,} examples (Atlanta only)',
    sep='\n',
)

# Show the first training row as a quick schema check.
print('\nSample:', train_df.head(1), sep='\n')

In [None]:
class BusinessSequenceDataset(Dataset):
    """Map-style dataset of (input sequence, target) pairs held as tensors.

    The ``input_seq`` and ``target`` columns of ``df`` are converted once, at
    construction time, into ``torch.long`` tensors. ``df`` only needs to
    support ``df[column].to_list()``.
    """

    def __init__(self, df):
        self.sequences = self._column_as_long(df, 'input_seq')
        self.targets = self._column_as_long(df, 'target')

    @staticmethod
    def _column_as_long(df, name):
        """Return column ``name`` of ``df`` as a ``torch.long`` tensor."""
        values = df[name].to_list()
        return torch.tensor(values, dtype=torch.long)

    def __len__(self):
        # Number of examples is the size of the first tensor dimension.
        return self.sequences.shape[0]

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        target = self.targets[idx]
        return sequence, target

# Loader settings shared by all three splits.
BATCH_SIZE = 512
NUM_WORKERS = 4

# Wrap each split in a dataset and a loader. Only the training loader
# shuffles; validation and test keep the stored row order.
train_dataset = BusinessSequenceDataset(train_df)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

val_dataset = BusinessSequenceDataset(val_df)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

test_dataset = BusinessSequenceDataset(test_df)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Number of batches per split.
print(
    f'DataLoaders created:',
    f'  Train batches: {len(train_loader)}',
    f'  Val batches: {len(val_loader)}',
    f'  Test batches: {len(test_loader)}',
    sep='\n',
)

## 3. Model Architecture

In [None]:
class BusinessLSTM(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear next-item classifier.

    Args:
        vocab_size: Number of embedding rows and of output classes.
        embed_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden-state size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied between LSTM layers (only when
            ``num_layers > 1``) and on the state fed to the output layer.
        pad_idx: Index of the padding token; its embedding is held at zero and
            it is skipped when locating the last real token of a sequence.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # nn.LSTM warns when dropout is non-zero on a single layer, so it is
        # only enabled for stacked configurations.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True,
                            dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def _last_token_index(self, x):
        """Return, per row of ``x``, the position of the last non-pad token.

        Rows made only of padding (or a model built without a padding index)
        fall back to the final time step.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        final_step = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=x.device)
        pad_idx = self.embedding.padding_idx
        if pad_idx is None:
            return final_step
        is_token = x != pad_idx
        positions = torch.arange(seq_len, device=x.device)
        # Pad positions are zeroed out, so the row-wise max is the index of
        # the last real token.
        last_token = (is_token * positions).max(dim=1).values
        return torch.where(is_token.any(dim=1), last_token, final_step)

    def forward(self, x):
        """Compute class logits for a batch of index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Read the hidden state at the last real token rather than blindly at
        # the final time step: with right-padded input the final step is a
        # padding position. Left-padded or full-length rows are unaffected.
        rows = torch.arange(x.size(0), device=x.device)
        last_hidden = lstm_out[rows, self._last_token_index(x)]
        out = self.dropout(last_hidden)
        logits = self.fc(out)
        return logits

# Hyperparameters for the network and the optimizer.
EMBED_DIM = 128
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
LEARNING_RATE = 0.001

# Build the network and move its parameters to the selected device.
model = BusinessLSTM(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    pad_idx=pad_idx,
).to(device)

# Count every parameter element and confirm where the model lives.
total_params = sum(param.numel() for param in model.parameters())
print(
    f'Total parameters: {total_params:,}',
    f'Model on device: {next(model.parameters()).device}',
    sep='\n',
)

## 4. Training Loop

In [None]:
# Training settings; the checkpoint directory is created up front so that
# saving during training cannot fail on a missing path.
MAX_EPOCHS = 50
EARLY_STOPPING_PATIENCE = 5
CHECKPOINT_DIR = Path('../models/business_lstm')
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Cross-entropy over the class logits; targets equal to pad_idx are excluded
# from the loss. Adam updates every model parameter.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

print(
    f'Training configuration:',
    f'  Max epochs: {MAX_EPOCHS}',
    f'  Early stopping patience: {EARLY_STOPPING_PATIENCE}',
    f'  Learning rate: {LEARNING_RATE}',
    f'  Batch size: {BATCH_SIZE}',
    sep='\n',
)

In [None]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Puts the model in training mode, performs one optimizer step per batch,
    and returns ``(loss, accuracy)`` where ``loss`` is the mean of the
    per-batch losses and ``accuracy`` is the fraction of targets matched by
    the highest-scoring class.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for batch_inputs, batch_targets in tqdm(loader, desc='Training'):
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        matches = logits.argmax(dim=1) == batch_targets
        n_correct += matches.sum().item()
        n_seen += batch_targets.size(0)

    mean_loss = loss_sum / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without updating it.

    Puts the model in eval mode, disables gradient tracking, and returns
    ``(loss, accuracy)`` where ``loss`` is the mean of the per-batch losses
    and ``accuracy`` is the fraction of targets matched by the
    highest-scoring class.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in tqdm(loader, desc='Evaluating'):
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_targets).item()

            matches = logits.argmax(dim=1) == batch_targets
            n_correct += matches.sum().item()
            n_seen += batch_targets.size(0)

    mean_loss = loss_sum / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

def calculate_topk_accuracy(model, loader, device, k_values=(1, 5, 10)):
    """Measure top-k accuracy of ``model`` over ``loader`` for several k.

    Args:
        model: Module mapping a batch of sequences to per-class scores with
            classes along dimension 1.
        loader: Iterable yielding ``(sequences, targets)`` tensor batches.
        device: Device the batches are moved to before the forward pass.
        k_values: The k values to report. Duplicates are ignored.

    Returns:
        Dict mapping each k to the fraction of examples whose target index is
        among the k highest-scoring classes. Empty when ``k_values`` is empty.

    Raises:
        ValueError: If any k is negative.
        ZeroDivisionError: If ``loader`` yields no examples.
    """
    model.eval()
    # De-duplicate while keeping order: a repeated k would otherwise be
    # counted twice into the same dictionary slot.
    k_values = tuple(dict.fromkeys(k_values))
    if not k_values:
        return {}
    if min(k_values) < 0:
        raise ValueError(f'k values must be non-negative, got {k_values}')
    max_k = max(k_values)

    topk_correct = {k: 0 for k in k_values}
    total = 0
    with torch.no_grad():
        for sequences, targets in tqdm(loader, desc='Top-K'):
            sequences, targets = sequences.to(device), targets.to(device)
            outputs = model(sequences)
            # topk returns its columns sorted by descending score, so one
            # call at the largest k serves every smaller k via slicing.
            _, topk_pred = outputs.topk(max_k, dim=1)
            hits = topk_pred.eq(targets.view(-1, 1))
            for k in k_values:
                topk_correct[k] += hits[:, :k].any(dim=1).sum().item()
            total += targets.size(0)
    return {k: topk_correct[k] / total for k in k_values}

In [None]:
# Per-epoch metrics, plus the state needed for early stopping.
history = {name: [] for name in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}
patience_counter = 0
best_val_loss = float('inf')

print('Starting training...')
for epoch in range(1, MAX_EPOCHS + 1):
    print(f'\nEpoch {epoch}/{MAX_EPOCHS}')
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    # Record this epoch's metrics for the plots further down.
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(
        f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}',
        f'Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}',
        sep='\n',
    )

    if not (val_loss < best_val_loss):
        # No improvement: one more strike towards early stopping.
        patience_counter += 1
        print(f'Early stopping: {patience_counter}/{EARLY_STOPPING_PATIENCE}')
    else:
        # New best validation loss: reset the counter and checkpoint.
        best_val_loss, patience_counter = val_loss, 0
        torch.save(
            dict(
                epoch=epoch,
                model_state_dict=model.state_dict(),
                optimizer_state_dict=optimizer.state_dict(),
                val_loss=val_loss,
                val_acc=val_acc,
            ),
            CHECKPOINT_DIR / 'best_model.pt',
        )
        print(f'✓ Saved best model')

    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print(f'Early stopping at epoch {epoch}')
        break

print(f'\nTraining complete! Best val loss: {best_val_loss:.4f}')

In [None]:
# Draw loss and accuracy curves side by side, one point per epoch.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: training vs. validation loss.
ax1.plot(history['train_loss'], label='Train')
ax1.plot(history['val_loss'], label='Val')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Loss')
ax1.legend()
ax1.grid(True)

# Right panel: training vs. validation accuracy.
ax2.plot(history['train_acc'], label='Train')
ax2.plot(history['val_acc'], label='Val')
ax2.set(xlabel='Epoch', ylabel='Accuracy', title='Accuracy')
ax2.legend()
ax2.grid(True)

# Save the figure next to the checkpoint, then display it.
plt.tight_layout()
plt.savefig(CHECKPOINT_DIR.joinpath('training_history.png'), dpi=150)
plt.show()

## 5. Evaluation on Test Set (Atlanta)

In [None]:
# Load the best checkpoint written during training. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved from a CUDA
# run can still be loaded on a CPU-only machine.
checkpoint = torch.load(CHECKPOINT_DIR / 'best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print(f'Loaded best model from epoch {checkpoint["epoch"]}')

# Loss and top-1 accuracy of the restored weights on the test loader.
print('\nEvaluating on Atlanta test set...')
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_acc:.4f}')

# Top-K accuracy on the same loader, reported as percentages.
topk_acc = calculate_topk_accuracy(model, test_loader, device, k_values=[1, 5, 10, 20])
print('\nTop-K Accuracy:')
for k, acc in topk_acc.items():
    print(f'  Top-{k:2d}: {acc*100:.2f}%')

## 6. Generate Predictions

In [None]:
def generate_predictions(model, loader, device, top_k=10):
    """Collect the ``top_k`` most probable classes for every example.

    Runs ``model`` in eval mode without gradient tracking, applies softmax
    over dimension 1 of its output, and returns three parallel lists with one
    entry per example: the top-k class indices (NumPy array), the target
    taken from the loader, and the top-k probabilities (NumPy array).
    """
    model.eval()
    predictions, truths, scores = [], [], []
    with torch.no_grad():
        for batch_inputs, batch_targets in tqdm(loader, desc='Predictions'):
            logits = model(batch_inputs.to(device))
            probabilities = logits.softmax(dim=1)
            best_probs, best_indices = torch.topk(probabilities, top_k, dim=1)
            # Extending with a 2-D array appends one row per example.
            predictions.extend(best_indices.cpu().numpy())
            truths.extend(batch_targets.cpu().numpy())
            scores.extend(best_probs.cpu().numpy())
    return predictions, truths, scores

# Top-10 predictions for every example in the test loader.
predictions, targets, scores = generate_predictions(
    model, test_loader, device, top_k=10
)
print(f'Generated {len(predictions):,} predictions')

# Write one row per example: the target plus its ranked predictions and scores.
output_path = DATA_DIR.joinpath('atlanta_business_predictions.parquet')
predictions_df = pl.DataFrame(
    {
        'target_business_idx': targets,
        'predicted_business_indices': predictions,
        'prediction_scores': scores,
    }
)
predictions_df.write_parquet(output_path)
print(f'✓ Saved to {output_path}')

## 7. Summary

In [None]:
# Banner, headline metrics, then one line per top-k accuracy.
print('=' * 80, 'BUSINESS LSTM TRAINING SUMMARY', '=' * 80, sep='\n')
print(
    f'Best Val Loss: {best_val_loss:.4f}',
    f'Test Loss: {test_loss:.4f}',
    f'Test Accuracy: {test_acc:.4f}',
    '\nTop-K Accuracy:',
    sep='\n',
)
for k, acc in topk_acc.items():
    print(f'  Top-{k:2d}: {acc*100:.2f}%')
print('=' * 80)