# Bowl Game Data Exploration

This notebook explores NCAA bowl game data from the CollegeFootballData API to understand:
- Available features and their distributions
- Relationships between features and outcomes
- Data quality and missing values
- Feature engineering opportunities

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bowl_mania.data import CFBDClient

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data from CFBD API

First, we'll set up the API client and fetch bowl game data for recent years.

In [None]:
# Build the API client with no explicit arguments.
# Note: the API key is expected in the CFBD_API_KEY environment variable,
# or it can be passed to the constructor instead.
client = CFBDClient()

# Seasons covered by this exploration (2019 through 2023 inclusive)
years = list(range(2019, 2024))

print(f"Fetching bowl game data for years: {years}")

In [None]:
# Pull all bowl-game datasets for the selected seasons in one call.
# This can be slow when the API rate-limits; save_cache=True is passed through
# to the client.
data = client.fetch_bowl_game_data(years, save_cache=True)

# Report each returned dataset as "<name>: (rows, columns)"
print("\nData shapes:")
for dataset_name, frame in data.items():
    print(f"{dataset_name}: {frame.shape}")

## 2. Explore Bowl Games Data

In [None]:
# Overview of the 'games' dataset: shape, column names, and a preview.
games_df = data['games']
games_columns = list(games_df.columns)
print(
    "Bowl Games Dataset:",
    f"Shape: {games_df.shape}",
    f"\nColumns: {games_columns}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the preview is rendered as rich output
games_df.head()

In [None]:
# Count nulls per column, keep only columns that have at least one,
# and list them from most to fewest missing values.
print("Missing values in games data:")
(
    games_df
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics for the two score columns.
# The columns are checked first (the score-distribution cell applies the same
# check), so a dataset without them produces a message instead of a KeyError.
print("Basic statistics for game scores:")
score_cols = ['home_points', 'away_points']
if all(col in games_df.columns for col in score_cols):
    # display() is needed because the frame is no longer the cell's last expression
    display(games_df[score_cols].describe())
else:
    print(f"Score columns not available; expected {score_cols}")

## 3. Visualize Score Distributions

In [None]:
# Distribution of scores (left panel) and of the home-minus-away margin (right panel).
score_cols = ['home_points', 'away_points']

if all(col in games_df.columns for col in score_cols):
    # Build the figure only when there is something to draw, so a dataset
    # without score columns does not produce two blank axes.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].hist(games_df['home_points'].dropna(), bins=30, alpha=0.7, label='Home')
    axes[0].hist(games_df['away_points'].dropna(), bins=30, alpha=0.7, label='Away')
    axes[0].set_xlabel('Points')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Points Scored')
    axes[0].legend()

    # Keep the differential in a local Series instead of writing a new column
    # into games_df: that frame is the same object as data['games'], so adding
    # a column would change what earlier cells displayed and inflate the
    # column / missing-value counts in the data quality summary.
    point_diff = games_df['home_points'] - games_df['away_points']
    axes[1].hist(point_diff.dropna(), bins=30, alpha=0.7, color='green')
    axes[1].set_xlabel('Point Differential (Home - Away)')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Distribution of Point Differentials')
    # Reference line at zero: games to the right were won by the home side
    axes[1].axvline(x=0, color='red', linestyle='--', label='Even')
    axes[1].legend()

    plt.tight_layout()
    plt.show()
else:
    print(f"Score columns not available; expected {score_cols}")

## 4. Explore Team Statistics

In [None]:
# Overview of the 'team_stats' dataset: shape, column names, and a preview.
team_stats = data['team_stats']
team_stats_columns = list(team_stats.columns)
print(
    "Team Statistics Dataset:",
    f"Shape: {team_stats.shape}",
    f"\nColumns: {team_stats_columns}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the preview is rendered as rich output
team_stats.head()

## 5. Explore Advanced Statistics

In [None]:
# Overview of the 'advanced_stats' dataset; an empty frame is reported
# with a short message instead of an overview.
advanced_stats = data['advanced_stats']
if advanced_stats.empty:
    print("No advanced statistics available")
else:
    advanced_columns = list(advanced_stats.columns)
    print(
        "Advanced Statistics Dataset:",
        f"Shape: {advanced_stats.shape}",
        f"\nColumns: {advanced_columns}",
        "\nFirst few rows:",
        sep="\n",
    )
    # Inside a branch the frame is not the cell's last expression, so display() it
    display(advanced_stats.head())

## 6. Explore Betting Lines

In [None]:
# Overview of the 'betting_lines' dataset, plus a histogram of the spread
# column when that column is present.
betting_lines = data['betting_lines']
if betting_lines.empty:
    print("No betting lines data available")
else:
    betting_columns = list(betting_lines.columns)
    print(
        "Betting Lines Dataset:",
        f"Shape: {betting_lines.shape}",
        f"\nColumns: {betting_columns}",
        "\nFirst few rows:",
        sep="\n",
    )
    display(betting_lines.head())

    # Spread histogram; rows with a null spread are dropped before plotting
    if 'spread' in betting_lines.columns:
        spreads = betting_lines['spread'].dropna()
        plt.figure(figsize=(10, 5))
        spreads.hist(bins=30, alpha=0.7)
        plt.xlabel('Point Spread')
        plt.ylabel('Frequency')
        plt.title('Distribution of Betting Spreads')
        # Vertical marker at a spread of zero
        plt.axvline(x=0, color='red', linestyle='--', label="Pick'em")
        plt.legend()
        plt.show()

## 7. Explore SP+ Ratings

In [None]:
# Overview of the 'sp_ratings' dataset; an empty frame is reported
# with a short message instead of an overview.
sp_ratings = data['sp_ratings']
if sp_ratings.empty:
    print("No SP+ ratings data available")
else:
    sp_columns = list(sp_ratings.columns)
    print(
        "SP+ Ratings Dataset:",
        f"Shape: {sp_ratings.shape}",
        f"\nColumns: {sp_columns}",
        "\nFirst few rows:",
        sep="\n",
    )
    # Inside a branch the frame is not the cell's last expression, so display() it
    display(sp_ratings.head())

## 8. Feature Correlation Analysis

In [None]:
# Placeholder cell: no correlations are computed here yet.
# TODO: add the numeric-feature correlation analysis once the datasets have
# been merged and features engineered.
print(
    "Correlation analysis will be performed after data merging and feature engineering"
)

## 9. Data Quality Summary

In [None]:
# Per-dataset quality report: record count, column count, and the share of
# null cells across the whole frame.
banner = "=" * 60
print(banner)
print("DATA QUALITY SUMMARY")
print(banner)

for dataset_name, frame in data.items():
    label = dataset_name.upper()

    # Empty frames have no cells, so report them and skip the percentage
    # (this also avoids dividing by zero below).
    if frame.empty:
        print(f"\n{label}: No data available")
        continue

    n_rows, n_cols = frame.shape
    null_cells = frame.isnull().sum().sum()
    missing_pct = (null_cells / (n_rows * n_cols)) * 100

    print(f"\n{label}:")
    print(f"  Total records: {n_rows}")
    print(f"  Total columns: {n_cols}")
    print(f"  Missing data: {missing_pct:.2f}%")

## Next Steps

Based on this exploration, the next steps are:

1. **Feature Engineering**: Create meaningful features by merging datasets and calculating differentials
2. **Data Preparation**: Handle missing values and prepare training/test sets
3. **Modeling**: Build and compare linear and Bayesian regression models
4. **Evaluation**: Assess model performance, including how accurately the models predict which team covers the betting spread