In [1]:
# Install dependencies
%pip install numpy
%pip install scikit-learn
%pip install matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Define the normalization helper and the data classes used to load games in batches for the model

In [7]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader

# Quick normalization first
def quick_normalize(game_objects, fit_games=None):
    """Min-max scale every game's sequence frames and convert dates to day offsets.

    The three DataFrames on each game (``matchups``, ``team_history`` and
    ``opponent_history``) are transformed with one shared ``MinMaxScaler``, and
    ``game_date`` is replaced by the number of days since 2000-01-01.

    The games are modified IN PLACE. Calling this twice on the same objects
    re-applies both the scaling and the date conversion, so always start from
    freshly loaded game objects.

    Args:
        game_objects: Sequence of game objects exposing ``game_date``,
            ``matchups``, ``team_history`` and ``opponent_history``.
            Assumes the three frames share the same numeric columns, since one
            scaler transforms all of them -- TODO confirm against the data.
        fit_games: Optional sequence of games used to fit the scaler. Defaults
            to ``game_objects`` (the original behaviour). Pass only the
            training games here to keep validation/test statistics out of the
            scaler; values outside the fitted range then fall outside [0, 1].

    Returns:
        Tuple ``(game_objects, scaler)``: the same (mutated) sequence that was
        passed in and the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If ``game_objects`` (or an explicitly given ``fit_games``)
            is empty.
    """
    print("Starting quick normalization...")

    # Fail early with a clear message; pd.concat([]) would otherwise raise a
    # less helpful ValueError ("No objects to concatenate").
    if len(game_objects) == 0:
        raise ValueError("quick_normalize() requires at least one game object")
    if fit_games is None:
        fit_games = game_objects
    elif len(fit_games) == 0:
        raise ValueError("fit_games must contain at least one game object")

    scaler = MinMaxScaler()

    # Collect all data to fit the scaler
    print("Collecting all data to fit scaler...")
    all_data = []
    for game in fit_games:
        all_data.append(game.matchups)
        all_data.append(game.team_history)
        all_data.append(game.opponent_history)

    # Concatenate all data and fit the scaler
    print("Fitting scaler...")
    combined_data = pd.concat(all_data)
    scaler.fit(combined_data)

    # The reference date is loop-invariant, so build it once.
    baseline_date = pd.to_datetime("2000-01-01")

    def _scale(frame):
        # Keep the original labels so columns can still be selected by name.
        return pd.DataFrame(
            scaler.transform(frame),
            columns=frame.columns,
            index=frame.index
        )

    # Now transform each game's data using the fitted scaler
    print("Transforming data...")
    for i, game in enumerate(game_objects):
        if i % 1000 == 0:
            print(f"Normalizing game {i}/{len(game_objects)}")

        # Normalize game_date to an integer day offset from the baseline
        game.game_date = (pd.to_datetime(game.game_date) - baseline_date).days

        # Transform each sequence using the same scaler
        game.matchups = _scale(game.matchups)
        game.team_history = _scale(game.team_history)
        game.opponent_history = _scale(game.opponent_history)

    return game_objects, scaler

# Classes for PyTorch handling
class GameData:
    """One game's metadata, its three sequence frames and the training target."""

    def __init__(self, game_id, game_date, is_regular_season, is_playoffs, 
                 is_pre_season, matchups, team_history, opponent_history, target):
        # Identification / metadata
        self.game_id = game_id
        self.game_date = game_date
        # Game-type flags (combined into the 'game_type' tensor later)
        self.is_regular_season = is_regular_season
        self.is_playoffs = is_playoffs
        self.is_pre_season = is_pre_season
        # Sequence data; `.values` and `len()` are used on these downstream
        self.matchups = matchups
        self.team_history = team_history
        self.opponent_history = opponent_history
        # Label for this game
        self.target = target

    def to_tensor_dict(self):
        """Convert this game into a dict of tensors.

        Returns a dict with a float tensor and a one-element length tensor for
        each of the three sequences, plus 'target' (one float) and 'game_type'
        (the three game-type flags as floats).
        """
        sequences = (
            ('matchups_tensor', 'matchups_lengths', self.matchups),
            ('team_history_tensor', 'team_history_lengths', self.team_history),
            ('opponent_history_tensor', 'opponent_history_lengths', self.opponent_history),
        )

        result = {}
        # Value tensors first, then lengths, so key order stays stable.
        for tensor_key, _, frame in sequences:
            result[tensor_key] = torch.FloatTensor(frame.values)
        for _, length_key, frame in sequences:
            result[length_key] = torch.LongTensor([len(frame)])

        result['target'] = torch.FloatTensor([self.target])
        flags = [self.is_regular_season, self.is_playoffs, self.is_pre_season]
        result['game_type'] = torch.FloatTensor(flags)
        return result

    @staticmethod
    def collate_batch(batch):
        """Merge a list of GameData objects into one batched dict of tensors.

        Sequence tensors are stacked along a new leading dimension, which
        requires every game in the batch to have same-shaped sequences; the
        per-game length tensors are therefore not included in the batch.
        """
        per_game = list(map(lambda game: game.to_tensor_dict(), batch))

        stacked_keys = ('matchups_tensor', 'team_history_tensor', 'opponent_history_tensor')
        collated = {
            key: torch.stack([entry[key] for entry in per_game])
            for key in stacked_keys
        }

        collated['game_type'] = torch.stack([entry['game_type'] for entry in per_game])
        # Each 'target' is a 1-element tensor, so cat yields shape (batch,)
        collated['target'] = torch.cat([entry['target'] for entry in per_game])

        return collated

class NBAGamesDataset(Dataset):
    """Thin Dataset wrapper that serves game objects by position."""

    def __init__(self, game_objects):
        # Held by reference (no copy); other code reads this attribute directly.
        self.game_objects = game_objects

    def __len__(self):
        """Number of games held by the dataset."""
        size = len(self.game_objects)
        return size

    def __getitem__(self, idx):
        """Return the game object stored at position ``idx``."""
        games = self.game_objects
        return games[idx]

Normalize the data, split it into train/validation/test sets, and save the results to pickle files

In [8]:
# Read the raw game objects from disk.
# NOTE: pickle.load executes arbitrary code; only open files you trust.
with open('rnn_game_objects.pkl', 'rb') as f:
    game_objects = pickle.load(f)

# Scale the sequence frames and turn each game_date into a day offset
normalized_games, scaler = quick_normalize(game_objects)

# Chronological split by calendar year:
#   up to 2019 -> train, 2020 -> validation, anything later -> test
baseline_date = pd.to_datetime("2000-01-01")
train_games, val_games, test_games = [], [], []

for game in normalized_games:
    # game_date is a day offset from the baseline, so rebuild the calendar date
    game_date = baseline_date + pd.Timedelta(days=int(game.game_date))

    if game_date.year > 2020:
        test_games.append(game)
    elif game_date.year == 2020:
        val_games.append(game)
    else:
        train_games.append(game)

print(f"Training games (2001-2019): {len(train_games)}")
print(f"Validation games (2020): {len(val_games)}")
print(f"Test games (2021-2023): {len(test_games)}")

# Wrap each split in a dataset, then in a loader; only training is shuffled
batch_size = 32
train_dataset = NBAGamesDataset(train_games)
val_dataset = NBAGamesDataset(val_games)
test_dataset = NBAGamesDataset(test_games)

loader_options = dict(batch_size=batch_size, collate_fn=GameData.collate_batch)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# Persist the fitted scaler
with open('game_stats_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Persist the three splits (the same lists the datasets wrap)
processed_data = {
    'train_dataset': train_dataset.game_objects,
    'val_dataset': val_dataset.game_objects,
    'test_dataset': test_dataset.game_objects
}

with open('processed_game_objects.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

# Report how many batches each loader yields
print(f"\nNumber of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

Starting quick normalization...
Collecting all data to fit scaler...
Fitting scaler...
Transforming data...
Normalizing game 0/28803
Normalizing game 1000/28803
Normalizing game 2000/28803
Normalizing game 3000/28803
Normalizing game 4000/28803
Normalizing game 5000/28803
Normalizing game 6000/28803
Normalizing game 7000/28803
Normalizing game 8000/28803
Normalizing game 9000/28803
Normalizing game 10000/28803
Normalizing game 11000/28803
Normalizing game 12000/28803
Normalizing game 13000/28803
Normalizing game 14000/28803
Normalizing game 15000/28803
Normalizing game 16000/28803
Normalizing game 17000/28803
Normalizing game 18000/28803
Normalizing game 19000/28803
Normalizing game 20000/28803
Normalizing game 21000/28803
Normalizing game 22000/28803
Normalizing game 23000/28803
Normalizing game 24000/28803
Normalizing game 25000/28803
Normalizing game 26000/28803
Normalizing game 27000/28803
Normalizing game 28000/28803

Number of training batches: 721
Number of validation batches: 9

Model architecture for RNN

In [9]:
import torch
import torch.nn as nn

class PredictorRNN(nn.Module):
    """Three-branch LSTM binary classifier for a single game.

    One LSTM encodes each input sequence (matchups, team history, opponent
    history). The last layer's final hidden state of each LSTM is concatenated
    with a linear projection of the 3-element game-type vector and passed
    through an MLP that ends in a sigmoid, so the output is a probability.

    Args:
        feature_names: Names of the per-timestep features; only the count is
            used, as the LSTM input size.
        hidden_size: Hidden size of every LSTM and of the game-type projection.
        num_layers: Number of stacked layers in every LSTM.
        dropout: Dropout probability for the classifier and, when
            ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, feature_names, hidden_size=128, num_layers=2, dropout=0.2):
        super().__init__()
        self.feature_names = feature_names
        self.input_size = len(feature_names)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM networks for each sequence type (created in a fixed order so the
        # parameter initialisation sequence stays the same)
        self.matchups_lstm = self._make_lstm(self.input_size, hidden_size, num_layers, dropout)
        self.team_history_lstm = self._make_lstm(self.input_size, hidden_size, num_layers, dropout)
        self.opponent_history_lstm = self._make_lstm(self.input_size, hidden_size, num_layers, dropout)

        # Game type embedding
        self.game_type_proj = nn.Linear(3, hidden_size)

        # Combination layer: three LSTM summaries plus the game-type projection
        combined_size = (hidden_size * 3) + hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(combined_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()
        )

    @staticmethod
    def _make_lstm(input_size, hidden_size, num_layers, dropout):
        """Build one unidirectional, batch-first LSTM branch."""
        return nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between stacked layers
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=False
        )

    def forward(self, batch_dict):
        """Return win probabilities of shape ``(batch, 1)``.

        Args:
            batch_dict: Dict with 'matchups_tensor', 'team_history_tensor' and
                'opponent_history_tensor' (batch-first sequences whose last
                dimension equals ``input_size``) and 'game_type' with three
                values per game. The original notebook comments state the
                sequences have fixed lengths 5, 10 and 10 -- not enforced here.
        """
        # Keep only the last layer's final hidden state of each branch
        _, (matchups_hidden, _) = self.matchups_lstm(batch_dict['matchups_tensor'])
        matchups_hidden = matchups_hidden[-1]

        _, (team_hidden, _) = self.team_history_lstm(batch_dict['team_history_tensor'])
        team_hidden = team_hidden[-1]

        _, (opponent_hidden, _) = self.opponent_history_lstm(batch_dict['opponent_history_tensor'])
        opponent_hidden = opponent_hidden[-1]

        # Process game type
        game_type_embedded = self.game_type_proj(batch_dict['game_type'])

        # Combine all features
        combined = torch.cat([
            matchups_hidden,
            team_hidden,
            opponent_hidden,
            game_type_embedded
        ], dim=1)

        # The classifier already ends in a sigmoid; the clamp is kept as a
        # safeguard so BCELoss always receives values within [0, 1].
        output = self.classifier(combined)
        output = torch.clamp(output, 0, 1)

        return output

    def _run_epoch(self, loader, criterion, optimizer=None):
        """Run one pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

        With an ``optimizer`` the model is put in train mode and updated after
        every batch; without one it is put in eval mode and gradients are
        disabled.
        """
        training = optimizer is not None
        self.train(training)

        total_loss = 0.0
        correct = 0
        seen = 0

        with torch.set_grad_enabled(training):
            for batch in loader:
                if training:
                    optimizer.zero_grad()

                outputs = self(batch)
                # Reshape target to match the (batch, 1) output shape
                target = batch['target'].view(-1, 1)
                loss = criterion(outputs, target)

                if training:
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                predictions = (outputs >= 0.5).float()
                correct += (predictions == target).sum().item()
                seen += target.size(0)

        return total_loss / len(loader), correct / seen

    def train_model(self, train_loader, val_loader, num_epochs=50, learning_rate=0.0005, patience=5):
        """Train with Adam + BCE loss and early stopping on validation loss.

        After training, the weights from the epoch with the lowest validation
        loss are restored.

        Args:
            train_loader: Iterable of batch dicts (must support ``len``).
            val_loader: Iterable of batch dicts (must support ``len``).
            num_epochs: Maximum number of epochs.
            learning_rate: Initial Adam learning rate.
            patience: Stop after this many consecutive epochs without a new
                best validation loss.

        Returns:
            Tuple ``(self, info)`` where ``info`` has 'best_val_loss' and
            'epochs_trained'.

        Raises:
            ValueError: If either loader yields no batches.
        """
        if len(train_loader) == 0 or len(val_loader) == 0:
            raise ValueError("train_loader and val_loader must each contain at least one batch")

        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        # NOTE(review): the scheduler patience is fixed at 5, the same as the
        # default early-stopping patience, so with default arguments training
        # will usually stop before the learning rate is ever reduced.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

        best_val_loss = float('inf')
        best_model = None
        patience_counter = 0
        epochs_trained = 0  # stays 0 when num_epochs == 0

        for epoch in range(num_epochs):
            epochs_trained = epoch + 1

            avg_train_loss, train_accuracy = self._run_epoch(train_loader, criterion, optimizer)
            avg_val_loss, val_accuracy = self._run_epoch(val_loader, criterion)

            scheduler.step(avg_val_loss)

            # Early stopping if val loss isn't less than best
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                # state_dict() tensors share storage with the live parameters,
                # so a shallow dict copy would keep changing as training goes
                # on. Clone every tensor to take a real snapshot.
                best_model = {
                    name: tensor.detach().clone()
                    for name, tensor in self.state_dict().items()
                }
                patience_counter = 0
            else:
                patience_counter += 1

            print(f'Epoch {epoch+1}/{num_epochs}')
            print(f'Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
            print(f'Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
            print(f'Patience Counter: {patience_counter}/{patience}')
            print('--------------------')

            if patience_counter >= patience:
                print(f'Early stopping triggered after epoch {epoch+1}')
                break

        # No snapshot exists if no epoch ran or the loss never improved
        # (e.g. it was NaN); keep the current weights in that case.
        if best_model is not None:
            self.load_state_dict(best_model)

        return self, {
            'best_val_loss': best_val_loss,
            'epochs_trained': epochs_trained
        }

def create_feature_filtered_loader(original_loader, feature_names, batch_size, shuffle=None):
    """Build a DataLoader restricted to ``feature_names`` and free of NaN rows.

    Every game in ``original_loader.dataset.game_objects`` is copied into a new
    ``GameData`` whose three sequence frames contain only the requested
    columns. Games with a NaN in any of those columns are dropped. The games
    held by the original loader are not modified.

    Args:
        original_loader: DataLoader whose dataset exposes ``game_objects``.
        feature_names: List of column names to keep in each sequence frame.
        batch_size: Batch size of the returned loader.
        shuffle: Whether the returned loader shuffles. ``None`` (default)
            copies the behaviour of ``original_loader``: shuffle only if it
            uses a ``RandomSampler``, which is what ``shuffle=True`` creates.

    Returns:
        A new DataLoader over the filtered games, collated with
        ``GameData.collate_batch``.
    """
    # Local import: the notebook's import cell only brings in Dataset/DataLoader.
    from torch.utils.data import RandomSampler

    source_games = original_loader.dataset.game_objects
    filtered_games = []

    for game in source_games:
        # Select the requested columns once and reuse them for the NaN check
        # and for the filtered copy.
        matchups = game.matchups[feature_names]
        team_history = game.team_history[feature_names]
        opponent_history = game.opponent_history[feature_names]

        # Check for NaN values in all sequences
        has_nan = any(
            frame.isna().any().any()
            for frame in (matchups, team_history, opponent_history)
        )
        if has_nan:  # Only include games without NaN values
            continue

        filtered_games.append(GameData(
            game_id=game.game_id,
            game_date=game.game_date,
            is_regular_season=game.is_regular_season,
            is_playoffs=game.is_playoffs,
            is_pre_season=game.is_pre_season,
            matchups=matchups,
            team_history=team_history,
            opponent_history=opponent_history,
            target=game.target
        ))

    print(f"Filtered out {len(source_games) - len(filtered_games)} games with NaN values")
    print(f"Remaining games: {len(filtered_games)}")

    if shuffle is None:
        # The previous check compared the Dataset object with the string
        # 'train', which is never equal, so the training loader was never
        # shuffled. Inspect the original loader's sampler instead.
        shuffle = isinstance(getattr(original_loader, 'sampler', None), RandomSampler)

    filtered_dataset = NBAGamesDataset(filtered_games)

    return DataLoader(
        filtered_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=GameData.collate_batch
    )

Train the model and check accuracies

In [10]:
import pickle

# Reload the saved splits from disk.
# NOTE(review): `processed_data` is not referenced again in this cell; the
# loaders below are built from the in-memory train_loader / val_loader.
with open('processed_game_objects.pkl', 'rb') as f:
    processed_data = pickle.load(f)

# Columns fed to the model, in order: context flags, then the team's own box
# score ("_for"), then the opposing box score ("_against").
features = [
    'days_ago', 'games_ago', 'is_home_team',
    'is_regular_season_matchup', 'is_playoffs_matchup', 'is_pre_season_matchup',
    'wl',
    'pts_for', 'fg_pct_for', 'fg3_pct_for', 'fg3m_for', 'ft_pct_for', 'ftm_for',
    'reb_for', 'ast_for', 'stl_for', 'blk_for', 'tov_for',
    'pts_against', 'fg_pct_against', 'fg3_pct_against', 'fg3m_against',
    'ft_pct_against', 'ftm_against',
    'reb_against', 'ast_against', 'stl_against', 'blk_against', 'tov_against',
]

# Replace the loaders with versions restricted to `features` and free of NaNs
train_loader = create_feature_filtered_loader(
    original_loader=train_loader, feature_names=features, batch_size=32
)
val_loader = create_feature_filtered_loader(
    original_loader=val_loader, feature_names=features, batch_size=32
)

# Create and train model
model = PredictorRNN(feature_names=features)
trained_model, history = model.train_model(train_loader, val_loader)

Filtered out 10 games with NaN values
Remaining games: 23032
Filtered out 0 games with NaN values
Remaining games: 2880
Epoch 1/50
Train Loss: 0.6539, Train Accuracy: 0.6220
Val Loss: 0.6602, Val Accuracy: 0.6062
Patience Counter: 0/5
--------------------
Epoch 2/50
Train Loss: 0.6430, Train Accuracy: 0.6310
Val Loss: 0.6601, Val Accuracy: 0.6066
Patience Counter: 0/5
--------------------
Epoch 3/50
Train Loss: 0.6411, Train Accuracy: 0.6342
Val Loss: 0.6583, Val Accuracy: 0.6038
Patience Counter: 0/5
--------------------
Epoch 4/50
Train Loss: 0.6402, Train Accuracy: 0.6376
Val Loss: 0.6602, Val Accuracy: 0.6059
Patience Counter: 1/5
--------------------
Epoch 5/50
Train Loss: 0.6400, Train Accuracy: 0.6370
Val Loss: 0.6579, Val Accuracy: 0.6080
Patience Counter: 0/5
--------------------
Epoch 6/50
Train Loss: 0.6393, Train Accuracy: 0.6371
Val Loss: 0.6611, Val Accuracy: 0.6035
Patience Counter: 1/5
--------------------
Epoch 7/50
Train Loss: 0.6391, Train Accuracy: 0.6372
Val Loss: 

Evaluate the model

In [11]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score

def evaluate_model(model, test_loader, save_path='evaluation_metrics.png'):
    """Evaluate a binary classifier on ``test_loader`` and save diagnostic plots.

    Runs the model in eval mode without gradients, thresholds its outputs at
    0.5, prints accuracy / precision / recall / F1 / ROC AUC / average
    precision, and writes a four-panel figure (confusion matrix, ROC curve,
    precision-recall curve, prediction histogram) to ``save_path``.

    Args:
        model: Callable module that maps a batch dict to probabilities.
        test_loader: Iterable of batch dicts that contain a 'target' entry.
            Targets are assumed to be 0/1 labels -- TODO confirm.
        save_path: Where the figure is written. Defaults to the original
            hard-coded 'evaluation_metrics.png'.

    Returns:
        Dict with the scalar metrics, the 2x2 confusion matrix and the raw
        probabilities, binary predictions and targets as numpy arrays.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    all_predictions_raw = []  # Store raw probabilities
    all_predictions = []      # Store binary predictions
    all_targets = []

    with torch.no_grad():
        for batch in test_loader:
            outputs = model(batch)
            target = batch['target'].view(-1, 1)

            # Store raw probabilities
            all_predictions_raw.extend(outputs.cpu().numpy().flatten())

            # Get binary predictions
            predictions = (outputs >= 0.5).float()
            all_predictions.extend(predictions.cpu().numpy().flatten())
            all_targets.extend(target.cpu().numpy().flatten())

    if not all_targets:
        raise ValueError("evaluate_model() received no samples from test_loader")

    # Convert to numpy arrays for easier manipulation
    predictions_raw = np.array(all_predictions_raw)
    predictions = np.array(all_predictions)
    targets = np.array(all_targets)

    # Pin the label set so the matrix is always 2x2. Without it, a split where
    # only one class occurs yields a 1x1 matrix and the 4-way unpacking fails.
    # Integer casts keep the label values aligned with labels=[0, 1].
    cm = confusion_matrix(targets.astype(int), predictions.astype(int), labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Create figure with subplots
    fig = plt.figure(figsize=(20, 10))

    # 1. Confusion Matrix
    ax1 = fig.add_subplot(221)
    im = ax1.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax1.set_title('Confusion Matrix')

    # Add numbers to confusion matrix (white text on dark cells)
    for i in range(2):
        for j in range(2):
            ax1.text(j, i, str(cm[i, j]),
                    ha="center", va="center", color="white" if cm[i, j] > cm.max()/2 else "black")

    fig.colorbar(im, ax=ax1)
    ax1.set_xlabel('Predicted Label')
    ax1.set_ylabel('True Label')
    ax1.set_xticks([0, 1])
    ax1.set_yticks([0, 1])
    ax1.set_xticklabels(['0', '1'])
    ax1.set_yticklabels(['0', '1'])

    # 2. ROC Curve
    ax2 = fig.add_subplot(222)
    fpr, tpr, _ = roc_curve(targets, predictions_raw)
    roc_auc = auc(fpr, tpr)

    ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax2.set_xlim([0.0, 1.0])
    ax2.set_ylim([0.0, 1.05])
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax2.legend(loc="lower right")
    ax2.grid(True)

    # 3. Precision-Recall Curve
    ax3 = fig.add_subplot(223)
    precision_curve, recall_curve, _ = precision_recall_curve(targets, predictions_raw)
    avg_precision = average_precision_score(targets, predictions_raw)

    ax3.plot(recall_curve, precision_curve, color='blue', lw=2,
             label=f'Precision-Recall curve (AP = {avg_precision:.2f})')
    ax3.set_xlabel('Recall')
    ax3.set_ylabel('Precision')
    ax3.set_title('Precision-Recall Curve')
    ax3.legend(loc="lower left")
    ax3.grid(True)

    # 4. Prediction Distribution
    ax4 = fig.add_subplot(224)
    ax4.hist(predictions_raw, bins=50, alpha=0.5, label='All Predictions')
    ax4.hist(predictions_raw[targets == 1], bins=50, alpha=0.5, label='Actual Positive')
    ax4.hist(predictions_raw[targets == 0], bins=50, alpha=0.5, label='Actual Negative')
    ax4.set_xlabel('Predicted Probability')
    ax4.set_ylabel('Count')
    ax4.set_title('Distribution of Predictions')
    ax4.legend()
    ax4.grid(True)

    # Save and close this specific figure rather than whichever is "current"
    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

    # Print metrics
    print("\nModel Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")

    print("\nConfusion Matrix:")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'roc_auc': roc_auc,
        'avg_precision': avg_precision,
        'confusion_matrix': cm,
        'predictions_raw': predictions_raw,
        'predictions': predictions,
        'targets': targets
    }

# Usage: restrict the test loader to the model's features, then evaluate
test_loader = create_feature_filtered_loader(
    original_loader=test_loader,
    feature_names=features,
    batch_size=32,
)
metrics = evaluate_model(model=trained_model, test_loader=test_loader)

Filtered out 0 games with NaN values
Remaining games: 2881

Test Results:
Total Test Games: 2881
Correct Predictions: 1731
Test Accuracy: 0.6008
