# Campus Demand Forecasting with DeepAR

This notebook trains a DeepAR model to forecast occupancy rates for educational resources and generates 24-hour predictions for 5 random resources.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import json
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# SageMaker setup
# These four handles (session, IAM role, default bucket, region) are reused by
# the upload, training and deployment cells further down.
sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
region = boto3.Session().region_name

# Echo the resolved environment so a misconfigured kernel is obvious up front.
print(
    f"SageMaker Role: {role}",
    f"S3 Bucket: {bucket}",
    f"Region: {region}",
    sep="\n",
)

## 2. Load Data from S3

In [None]:
# Location of the occupancy dataset; replace the placeholder with a real S3 URI.
s3_path = '<your-s3-URI>'

# Load the whole parquet file into memory as a DataFrame.
df = pd.read_parquet(s3_path)

# Quick orientation: dimensions, column names, then a preview of the rows.
column_names = df.columns.tolist()
print(f"Data shape: {df.shape}")
print(f"\nColumns: {column_names}")
print(f"\nFirst few rows:")
df.head()

## 3. Data Preprocessing and Feature Engineering

In [None]:
# Parse the timestamp column into datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Columns that should be numeric; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
numeric_columns = ['current_occupancy', 'occupancy_rate', 'total_capacity', 'availability_hours']
for numeric_column in numeric_columns:
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Order rows chronologically within each resource and renumber the index.
df = df.sort_values(['resource_id', 'timestamp']).reset_index(drop=True)

print(f"Data types after conversion:")
print(df.dtypes)

In [None]:
# Create lag features (populate the null lag columns)
def create_lag_features(group):
    """Return a time-ordered copy of ``group`` with lag and rolling columns added.

    The input frame must contain ``timestamp`` and ``occupancy_rate`` columns and
    is not modified. Lags are taken by row position (1, 24 and 168 rows back), so
    they correspond to 1 hour / 1 day / 1 week only when the rows are evenly
    spaced hourly observations -- assumption, verify against the data.

    Added columns:
        occupancy_rate_lag_1h, occupancy_rate_lag_24h, occupancy_rate_lag_168h:
            ``occupancy_rate`` shifted by 1, 24 and 168 rows (NaN where no
            earlier row exists).
        occupancy_rate_avg_24h: mean over a trailing window of up to 24 rows,
            current row included.
    """
    ordered = group.sort_values('timestamp')
    rate = ordered['occupancy_rate']

    return ordered.assign(
        occupancy_rate_lag_1h=rate.shift(1),
        occupancy_rate_lag_24h=rate.shift(24),
        occupancy_rate_lag_168h=rate.shift(168),  # 1 week
        occupancy_rate_avg_24h=rate.rolling(window=24, min_periods=1).mean(),
    )

# Apply lag features per resource.
# The groups are iterated explicitly instead of calling `groupby(...).apply(...)`
# on the whole frame: passing the grouping column to `apply` was deprecated in
# pandas 2.2, and once it is excluded `resource_id` disappears from the result,
# which breaks every later cell that selects on it.
resource_frames = [
    create_lag_features(resource_rows)
    for _, resource_rows in df.groupby('resource_id')
]
# An empty frame yields no groups; still add the (empty) lag columns in that case
# so the column selection below does not fail.
df = pd.concat(resource_frames) if resource_frames else create_lag_features(df)

# Fill remaining NaN values in lag features with 0 (for initial periods).
# NOTE(review): 0 is indistinguishable from a genuinely empty resource; these
# columns are not passed to DeepAR below, so this only affects ad-hoc analysis.
lag_cols = ['occupancy_rate_lag_1h', 'occupancy_rate_lag_24h', 'occupancy_rate_lag_168h', 'occupancy_rate_avg_24h']
df[lag_cols] = df[lag_cols].fillna(0)

print(f"\nLag features created. Sample:")
print(df[['resource_id', 'timestamp', 'occupancy_rate'] + lag_cols].head(30))

In [None]:
# Encode categorical features as integer codes
from sklearn.preprocessing import LabelEncoder

# Candidate categorical columns; any that are absent from the data are skipped.
categorical_cols = ['day_of_week', 'is_exam_period', 'is_peak_hour', 'resource_type', 'location']

# Fitted encoder per source column, kept so codes can be mapped back to labels
# and so later cells can read the number of classes.
label_encoders = {}

for col in (name for name in categorical_cols if name in df.columns):
    encoder = LabelEncoder()
    # Values are cast to str first so mixed-type columns encode consistently.
    as_text = df[col].astype(str)
    df[f'{col}_encoded'] = encoder.fit_transform(as_text)
    label_encoders[col] = encoder
    print(f"{col}: {encoder.classes_}")

print(f"\nCategorical encoding complete.")

## 4. Prepare Data for DeepAR

In [None]:
# DeepAR reads JSON Lines: one JSON object per time series with "start" and
# "target", plus optional "dynamic_feat" (time-varying features) and "cat"
# (static categorical codes). This cell builds one train and one test entry per
# resource.

prediction_length = 24  # Forecast next 24 hours
context_length = 168    # Use last 7 days (168 hours) as context

# Get unique resources
resources = df['resource_id'].unique()
print(f"Total resources: {len(resources)}")

# Which optional feature columns exist is the same for every resource, so it is
# resolved once here instead of inside the loop.
dynamic_feature_cols = [
    col for col in ('day_of_week_encoded', 'is_exam_period_encoded', 'is_peak_hour_encoded')
    if col in df.columns
]
static_cat_cols = [
    col for col in ('resource_type_encoded', 'location_encoded')
    if col in df.columns
]

# Split data: use last 24 hours for testing
train_data = []
test_data = []
resource_mapping = []  # resource_id for each entry, aligned with train_data/test_data

for resource in resources:
    resource_df = df[df['resource_id'] == resource].sort_values('timestamp')

    # Skip resources without at least one context window plus one forecast window
    if len(resource_df) < context_length + prediction_length:
        continue

    # Target series. Missing observations (NaN after the numeric coercion) are
    # written as None so json.dump emits `null`; a bare `NaN` token is not valid
    # JSON, whereas `null` is how DeepAR expects missing target values.
    # NOTE(review): assumes exactly one row per hour with no gaps, because
    # DeepAR derives every timestamp from "start" plus the position -- confirm.
    target = [
        None if pd.isna(value) else value
        for value in resource_df['occupancy_rate'].values.tolist()
    ]
    start_time = resource_df['timestamp'].min()

    # Dynamic features (encoded categorical columns), one list per feature
    dynamic_feat = [resource_df[col].values.tolist() for col in dynamic_feature_cols]

    # Static categorical features, taken from the resource's first row
    cat_features = [int(resource_df[col].iloc[0]) for col in static_cat_cols]

    # Train data (all but last 24 hours)
    train_entry = {
        "start": str(start_time),
        "target": target[:-prediction_length],
    }
    if dynamic_feat:
        train_entry["dynamic_feat"] = [feat[:-prediction_length] for feat in dynamic_feat]
    if cat_features:
        train_entry["cat"] = cat_features

    train_data.append(train_entry)

    # Test data (full series, so the held-out 24 hours can be evaluated)
    test_entry = {
        "start": str(start_time),
        "target": target,
    }
    if dynamic_feat:
        test_entry["dynamic_feat"] = dynamic_feat
    if cat_features:
        test_entry["cat"] = cat_features

    test_data.append(test_entry)
    resource_mapping.append(resource)  # Store the actual resource_id

print(f"Train samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

In [None]:
# Save data to JSON Lines format
def write_jsonlines(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines.

    Args:
        data: Iterable of JSON-serializable entries.
        filename: Path of the file to create; an existing file is overwritten.

    Each entry becomes one JSON document on its own newline-terminated line.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in data)

# Write both splits to the working directory, one JSON Lines file each.
for split_entries, split_filename in ((train_data, 'train.json'), (test_data, 'test.json')):
    write_jsonlines(split_entries, split_filename)

print("Data files created: train.json, test.json")

In [None]:
# Upload to S3
# The s3:// URIs below are what the training channels consume later; the object
# keys used for the upload must stay in step with them.
s3_data_path = f"s3://{bucket}/deepar-occupancy-forecast"
s3_train_path = f"{s3_data_path}/train/train.json"
s3_test_path = f"{s3_data_path}/test/test.json"

# (object key, local file) pairs pushed through a single bucket handle.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
uploads = (
    ('deepar-occupancy-forecast/train/train.json', 'train.json'),
    ('deepar-occupancy-forecast/test/test.json', 'test.json'),
)
for object_key, local_file in uploads:
    s3_bucket.Object(object_key).upload_file(local_file)

print(f"Train data uploaded to: {s3_train_path}")
print(f"Test data uploaded to: {s3_test_path}")

## 5. Train DeepAR Model

In [None]:
# Look up the built-in DeepAR algorithm container for the current region.
image_name = sagemaker.image_uris.retrieve(framework='forecasting-deepar', region=region)

print(f"DeepAR image: {image_name}")

In [None]:
# Configure DeepAR estimator
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    role=role,
    instance_count=1,
    instance_type='ml.c5.2xlarge',
    output_path=f"s3://{bucket}/deepar-occupancy-forecast/output",
    sagemaker_session=sess
)

# Feature counts are derived from the prepared training entries and the fitted
# encoders rather than from a loop variable left over from the data-preparation
# cell, which is undefined when that loop never ran.
num_dynamic_feat = len(train_data[0].get("dynamic_feat", [])) if train_data else 0
cardinality = [
    len(label_encoders[col].classes_)
    for col in ('resource_type', 'location')
    if col in label_encoders
]

# Set hyperparameters
hyperparameters = {
    "time_freq": 'H',  # Hourly frequency
    "epochs": 100,
    "early_stopping_patience": 10,
    "mini_batch_size": 32,
    "learning_rate": 0.001,
    "context_length": context_length,
    "prediction_length": prediction_length,
    "num_cells": 40,
    "num_layers": 3,
    "likelihood": 'gaussian',
    "dropout_rate": 0.1,
    "embedding_dimension": 10,
}
# Only send these when there is something to report: 0 and an empty list are
# believed not to be accepted values for these hyperparameters (TODO confirm
# against the DeepAR hyperparameter reference), while leaving them unset lets
# DeepAR fall back to its defaults.
if num_dynamic_feat:
    hyperparameters["num_dynamic_feat"] = num_dynamic_feat
if cardinality:
    hyperparameters["cardinality"] = json.dumps(cardinality)

estimator.set_hyperparameters(**hyperparameters)

print("Estimator configured.")

In [None]:
# Train the model
# Channel name -> S3 location of the JSON Lines file uploaded earlier.
data_channels = dict(train=s3_train_path, test=s3_test_path)

# wait=True blocks this cell until the training job has finished.
estimator.fit(inputs=data_channels, wait=True)

print("Training complete!")

## 6. Deploy Model and Generate Predictions

In [None]:
# Deploy the model behind a real-time endpoint that sends and receives JSON.
json_serializer = sagemaker.serializers.JSONSerializer()
json_deserializer = sagemaker.deserializers.JSONDeserializer()

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=json_serializer,
    deserializer=json_deserializer,
)

print(f"Model deployed to endpoint: {predictor.endpoint_name}")

## 7. Generate 24-Hour Forecasts for 5 Random Resources

In [None]:
# Select up to 5 distinct resources; the fixed seed keeps the choice repeatable.
np.random.seed(42)
n_selected = min(5, len(test_data))
random_indices = np.random.choice(len(test_data), size=n_selected, replace=False)

# Pair each chosen test entry with the resource_id stored at the same position.
random_test_samples = [
    (test_data[position], resource_mapping[position])
    for position in random_indices
]

print(f"Selected {len(random_test_samples)} random resources for prediction")
print(f"Resource IDs: {[r[1] for r in random_test_samples]}")

In [None]:
# Generate predictions: one endpoint request per selected resource.
predictions = []

for sample, resource_id in random_test_samples:
    # History shown to the model: everything except the final 24 hours, which
    # are held back as the ground truth for the forecast.
    history = sample["target"][:-prediction_length]
    held_out = sample["target"][-prediction_length:]

    instance = {
        "start": sample["start"],
        "target": history,
    }
    # Optional fields are forwarded unchanged. "dynamic_feat" deliberately keeps
    # the full series, since it has to cover history plus the forecast range.
    for optional_key in ("dynamic_feat", "cat"):
        if optional_key in sample:
            instance[optional_key] = sample[optional_key]

    input_data = {
        "instances": [instance],
        "configuration": {
            "num_samples": 100,
            "output_types": ["mean", "quantiles"],
            "quantiles": ["0.1", "0.5", "0.9"]
        }
    }

    # Get prediction; a single instance was sent, so read the first result.
    result = predictor.predict(input_data)
    forecast = result["predictions"][0]
    predictions.append({
        "resource_id": resource_id,
        "actual": held_out,
        "predicted_mean": forecast["mean"],
        "predicted_quantiles": forecast["quantiles"]
    })

print(f"Generated predictions for {len(predictions)} resources")

## 8. Visualize Predictions

In [None]:
# Plot predictions vs actual, one stacked panel per resource.
n_panels = len(predictions)
# squeeze=False always returns a 2-D array of axes, so a single panel needs no
# special-casing.
fig, axes = plt.subplots(n_panels, 1, figsize=(15, 4*n_panels), squeeze=False)

# Forecast horizon on the x axis; identical for every panel.
hours = list(range(1, prediction_length + 1))

for ax, pred in zip(axes[:, 0], predictions):
    quantiles = pred["predicted_quantiles"]

    # Held-out observations
    ax.plot(hours, pred["actual"], label='Actual', marker='o', linewidth=2)

    # Mean forecast
    ax.plot(hours, pred["predicted_mean"], label='Predicted Mean', marker='s', linewidth=2, linestyle='--')

    # Band between the 10% and 90% forecast quantiles
    ax.fill_between(
        hours,
        quantiles["0.1"],
        quantiles["0.9"],
        alpha=0.3,
        label='80% Confidence Interval'
    )

    ax.set_xlabel('Hours Ahead')
    ax.set_ylabel('Occupancy Rate')
    ax.set_title(f'Resource {pred["resource_id"]}: 24-Hour Occupancy Rate Forecast')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('occupancy_forecasts.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved as 'occupancy_forecasts.png'")

## 9. Calculate Prediction Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate metrics for each resource by comparing the mean forecast with the
# held-out actual values.
metrics_summary = []

for pred in predictions:
    actual = np.array(pred["actual"], dtype=float)
    predicted = np.array(pred["predicted_mean"], dtype=float)

    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)

    # MAPE is undefined where the actual value is 0 (e.g. an empty room).
    # Dividing by a tiny epsilon there produces astronomically large
    # percentages that swamp the average, so those hours are left out; the
    # metric is NaN when every actual value is 0.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan

    metrics_summary.append({
        'Resource': pred["resource_id"],
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'MAPE (%)': mape
    })

metrics_df = pd.DataFrame(metrics_summary)
print("\nPrediction Metrics:")
print(metrics_df.to_string(index=False))

# Average only the metric columns: 'Resource' is an identifier, and averaging it
# is meaningless for numeric IDs and raises for string IDs.
print("\nAverage Metrics:")
print(metrics_df.drop(columns='Resource', errors='ignore').mean().to_string())

## 10. Cleanup (Optional)

In [None]:
# Uncomment and run to delete the endpoint when done (a deployed endpoint keeps incurring charges until it is deleted)
# predictor.delete_endpoint()
# print("Endpoint deleted")

## Summary

This notebook:
1. Loaded occupancy data from S3 parquet file
2. Created lag features (1h, 24h, 168h) and rolling averages
3. Encoded categorical features for machine learning
4. Prepared data in DeepAR JSON format
5. Trained a DeepAR forecasting model
6. Generated 24-hour forecasts for 5 random resources
7. Visualized predictions with confidence intervals
8. Calculated prediction accuracy metrics

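While the endpoint remains deployed, the model can be used to forecast occupancy rates for resources in the dataset. Note that only resources with at least `context_length + prediction_length` (168 + 24 = 192) hourly observations were included in training and evaluation.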