In [1]:
"""
Hands-On Practice: Pipelines & Cross-Validation
================================================

Work through these exercises to master the concepts.
Each exercise builds on the previous one.

Author: Oleksandr
Date: November 28, 2024
"""

import os

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier

# =============================================================================
# EXERCISE 1: Basic Pipeline Construction
# =============================================================================

print("\n" + "="*80)
print("EXERCISE 1: Build Your First Pipeline")
print("="*80)

"""
TASK: Create a pipeline with:
1. SimpleImputer (median strategy)
2. DecisionTreeClassifier (max_depth=5)

Then fit and evaluate it.
"""

# Load data.
# The CSV location can be overridden with the NBA_DATA_PATH environment
# variable so the exercises also run on machines other than the author's.
# When the variable is unset (or empty) the original location is used.
NBA_DATA_PATH = os.environ.get('NBA_DATA_PATH') or (
    'C:\\Users\\userPC\\projects\\predictive-modeling-platform\\data\\processed\\nba\\final\\nba_train_data_enhanced.csv'
)

# Read the file and drop rows without a target in a single expression, so
# `df` is always derived directly from the CSV.
# NOTE(review): the positional 80/20 splits used in these exercises are only
# chronological if the CSV rows are already ordered by game date - confirm.
df = pd.read_csv(NBA_DATA_PATH).dropna(subset=['HOME_WIN'])

# Basic features
features = ['NET_RATING_DIFF', 'REST_ADVANTAGE']
X = df[features]
y = df['HOME_WIN']

# YOUR CODE HERE:
# Create pipeline named 'my_first_pipeline'
# Step 1: 'imputer' - SimpleImputer with strategy='median'
# Step 2: 'classifier' - DecisionTreeClassifier with max_depth=5

my_first_pipeline = None  # Placeholder: replace with your pipeline

# Uncomment to test:
# # Simple train/test split
# split_idx = int(0.8 * len(df))
# X_train = X.iloc[:split_idx]
# y_train = y.iloc[:split_idx]
# X_test = X.iloc[split_idx:]
# y_test = y.iloc[split_idx:]
# 
# # Fit and evaluate
# my_first_pipeline.fit(X_train, y_train)
# accuracy = my_first_pipeline.score(X_test, y_test)
# print(f"\n✓ Pipeline accuracy: {accuracy:.4f}")

# =============================================================================
# EXERCISE 2: Understanding fit() vs transform()
# =============================================================================

print(f"\n{'=' * 80}")
print("EXERCISE 2: Understand fit() vs transform()")
print("=" * 80)

"""
TASK: Build a scaler by hand and apply it step by step.
Compare the statistics learned from the train sample with how the test
sample ends up being scaled.
"""

# Tiny toy dataset: four training rows and two test rows, two columns each.
X_train_sample = np.array([
    [1, 2],
    [2, 4],
    [3, 6],
    [4, 8],
])
X_test_sample = np.array([
    [5, 10],
    [6, 12],
])

# YOUR CODE HERE:
# 1. Create a StandardScaler
# 2. Fit it on X_train_sample (this is where mean and std are learned)
# 3. Transform X_train_sample with the fitted scaler
# 4. Transform X_test_sample with the SAME fitted scaler
# 5. Print the mean and std the scaler learned

# Reference solution - uncomment to test:
# scaler = StandardScaler()
# scaler.fit(X_train_sample)
# 
# X_train_scaled = scaler.transform(X_train_sample)
# X_test_scaled = scaler.transform(X_test_sample)
# 
# print(f"\nTrained on: {X_train_sample}")
# print(f"Learned Mean: {scaler.mean_}")
# print(f"Learned Std: {scaler.scale_}")
# print(f"\nTrain Scaled:\n{X_train_scaled}")
# print(f"\nTest Scaled:\n{X_test_scaled}")
# print("\n✓ Notice: Test data scaled using TRAIN statistics!")

# =============================================================================
# EXERCISE 3: Time-Series Cross-Validation
# =============================================================================

print(f"\n{'=' * 80}")
print("EXERCISE 3: Implement Time-Series Cross-Validation")
print("=" * 80)

"""
TASK: Evaluate a model with TimeSeriesSplit using 3 folds.
Print the performance for each fold.
"""

# Feature matrix and target for this exercise, taken from the loaded frame.
features = ['NET_RATING_DIFF', 'HOME_B2B', 'AWAY_B2B']
X, y = df[features], df['HOME_WIN']

# Median imputation followed by a shallow, seeded decision tree.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('classifier', DecisionTreeClassifier(max_depth=4, random_state=42)),
])

# YOUR CODE HERE:
# 1. Create TimeSeriesSplit with n_splits=3
# 2. Use cross_validate to evaluate the pipeline
# 3. Print mean and std of test scores

# Reference solution - uncomment to test:
# (TimeSeriesSplit and cross_validate are already imported at the top.)
# 
# tscv = TimeSeriesSplit(n_splits=3)
# 
# cv_results = cross_validate(
#     pipeline, X, y,
#     cv=tscv,
#     scoring='accuracy',
#     return_train_score=True
# )
# 
# print(f"\nCV Accuracy: {cv_results['test_score'].mean():.4f} (±{cv_results['test_score'].std():.4f})")
# print(f"\nFold Results:")
# for i, (train_score, test_score) in enumerate(zip(cv_results['train_score'], cv_results['test_score'])):
#     print(f"  Fold {i+1}: Train={train_score:.4f}, Test={test_score:.4f}")

# =============================================================================
# EXERCISE 4: Detecting Data Leakage
# =============================================================================

print("\n" + "="*80)
print("EXERCISE 4: Detect Data Leakage")
print("="*80)

"""
TASK: Compare two approaches and identify which one has data leakage.
"""

# Both approaches slice the current X / y at the same 80/20 positional
# boundary, so compute it once from the data that is actually being split.
split_idx = int(0.8 * len(X))
y_train_b = y.iloc[:split_idx]
y_test_b = y.iloc[split_idx:]

# NOTE: a decision tree only compares each feature against thresholds, and
# standardisation rescales each feature without changing the order of its
# values. Both approaches are therefore expected to print the same accuracy
# here (the recorded run shows 0.6686 for both). The leakage in one of them is
# still a methodology error; a scale-sensitive model would expose it in the
# score as well.

# Approach A: Fit scaler on all data
print("\n--- Approach A ---")
scaler_a = StandardScaler()
X_scaled_a = scaler_a.fit_transform(X)  # Fits on ALL data

X_train_a = X_scaled_a[:split_idx]
X_test_a = X_scaled_a[split_idx:]

model_a = DecisionTreeClassifier(max_depth=4, random_state=42)
model_a.fit(X_train_a, y_train_b)
accuracy_a = model_a.score(X_test_a, y_test_b)

print(f"Approach A Accuracy: {accuracy_a:.4f}")

# Approach B: Fit scaler only on training data
print("\n--- Approach B ---")
X_train_b = X.iloc[:split_idx]
X_test_b = X.iloc[split_idx:]

scaler_b = StandardScaler()
scaler_b.fit(X_train_b)  # Fits only on TRAIN data
X_train_b_scaled = scaler_b.transform(X_train_b)
X_test_b_scaled = scaler_b.transform(X_test_b)

model_b = DecisionTreeClassifier(max_depth=4, random_state=42)
model_b.fit(X_train_b_scaled, y_train_b)
accuracy_b = model_b.score(X_test_b_scaled, y_test_b)

print(f"Approach B Accuracy: {accuracy_b:.4f}")

# YOUR ANSWER HERE:
# Which approach has data leakage? Why?
# Which accuracy is more reliable?

print("\n✓ Which approach is correct? Think about it!")
print("  Hint: Which scaler sees test data during training?")

# =============================================================================
# EXERCISE 5: GridSearchCV with Pipeline
# =============================================================================

print(f"\n{'=' * 80}")
print("EXERCISE 5: Hyperparameter Tuning with GridSearchCV")
print("=" * 80)

"""
TASK: Find the best max_depth for DecisionTreeClassifier with GridSearchCV.
"""

# Feature matrix and target for this exercise.
features = [
    'NET_RATING_DIFF',
    'REST_ADVANTAGE',
    'HOME_B2B',
    'AWAY_B2B',
]
X, y = df[features], df['HOME_WIN']

# Positional 80/20 split: the first 80% of rows train, the remainder tests.
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# YOUR CODE HERE:
# 1. Create a pipeline with SimpleImputer and DecisionTreeClassifier
# 2. Define param_grid testing max_depth values [3, 4, 5, 6]
# 3. Use GridSearchCV with TimeSeriesSplit(n_splits=3)
# 4. Fit on training data
# 5. Evaluate on test data
# 6. Print best parameters and test accuracy

# Reference solution - uncomment to test:
# pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('classifier', DecisionTreeClassifier(random_state=42))
# ])
# 
# param_grid = {
#     'classifier__max_depth': [3, 4, 5, 6]
# }
# 
# grid_search = GridSearchCV(
#     pipeline,
#     param_grid,
#     cv=TimeSeriesSplit(n_splits=3),
#     scoring='accuracy',
#     n_jobs=-1
# )
# 
# grid_search.fit(X_train, y_train)
# 
# print(f"\nBest Parameters: {grid_search.best_params_}")
# print(f"Best CV Score: {grid_search.best_score_:.4f}")
# 
# # Test set evaluation
# best_pipeline = grid_search.best_estimator_
# test_accuracy = best_pipeline.score(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy:.4f}")

# =============================================================================
# EXERCISE 6: Complete Model Comparison
# =============================================================================

print(f"\n{'=' * 80}")
print("EXERCISE 6: Compare Multiple Models")
print("=" * 80)

"""
TASK: Compare Decision Tree vs XGBoost under time-series cross-validation.
"""

# Feature matrix and target for this exercise.
features = [
    'NET_RATING_DIFF', 'EFG_PCT_DIFF', 'TOV_PCT_DIFF',
    'OREB_PCT_DIFF', 'FTA_RATE_DIFF', 'REST_ADVANTAGE',
]
X, y = df[features], df['HOME_WIN']

# YOUR CODE HERE:
# 1. Create two pipelines:
#    - Decision Tree (max_depth=5)
#    - XGBoost (n_estimators=100, max_depth=4, learning_rate=0.1)
# 2. Evaluate both using TimeSeriesSplit(n_splits=5)
# 3. Compare accuracy and log_loss
# 4. Determine which is better

# Reference solution - uncomment to test:
# (No extra imports are needed: scoring is given as built-in scorer names.)
# 
# # Decision Tree Pipeline
# dt_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('classifier', DecisionTreeClassifier(max_depth=5, random_state=42))
# ])
# 
# # XGBoost Pipeline
# xgb_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('classifier', XGBClassifier(
#         n_estimators=100,
#         max_depth=4,
#         learning_rate=0.1,
#         random_state=42,
#         eval_metric='logloss'
#     ))
# ])
# 
# # Scoring metrics ('neg_log_loss' is negated, hence the sign flips below)
# scoring = {
#     'accuracy': 'accuracy',
#     'log_loss': 'neg_log_loss'
# }
# 
# # Time-series CV
# tscv = TimeSeriesSplit(n_splits=5)
# 
# # Evaluate Decision Tree
# dt_cv = cross_validate(dt_pipeline, X, y, cv=tscv, scoring=scoring)
# print("\nDecision Tree Results:")
# print(f"  Accuracy:  {dt_cv['test_accuracy'].mean():.4f} (±{dt_cv['test_accuracy'].std():.4f})")
# print(f"  Log Loss:  {-dt_cv['test_log_loss'].mean():.4f} (±{dt_cv['test_log_loss'].std():.4f})")
# 
# # Evaluate XGBoost
# xgb_cv = cross_validate(xgb_pipeline, X, y, cv=tscv, scoring=scoring)
# print("\nXGBoost Results:")
# print(f"  Accuracy:  {xgb_cv['test_accuracy'].mean():.4f} (±{xgb_cv['test_accuracy'].std():.4f})")
# print(f"  Log Loss:  {-xgb_cv['test_log_loss'].mean():.4f} (±{xgb_cv['test_log_loss'].std():.4f})")
# 
# # Winner
# if -xgb_cv['test_log_loss'].mean() < -dt_cv['test_log_loss'].mean():
#     print("\n✓ Winner: XGBoost (lower log loss is better)")
# else:
#     print("\n✓ Winner: Decision Tree (lower log loss is better)")

# =============================================================================
# EXERCISE 7: Challenge - Build Production Pipeline
# =============================================================================

print("\n" + "="*80)
print("EXERCISE 7: Challenge - Production-Ready Pipeline")
print("="*80)

"""
TASK: Create a complete production pipeline that:
1. Uses all relevant features
2. Tunes XGBoost hyperparameters
3. Evaluates on hold-out test set
4. Calculates Brier Skill Score
"""

# YOUR CODE HERE:
# Build the complete pipeline following best practices:
# - Chronological train/val/test split (70/15/15)
# - Grid search on validation set
# - Final evaluation on test set
# - Report all metrics

# This exercise has no reference solution; only the hints below are printed.
# The leading check mark was mis-encoded in the original string and is
# restored here.
print("\n✓ Complete this challenge to master Pipelines & CV!")
print("  Hints:")
print("  1. Use features from FEATURES_FULL in baseline model")
print("  2. Tune max_depth, learning_rate, n_estimators")
print("  3. Use TimeSeriesSplit for CV")
print("  4. Evaluate ONCE on test set")

# =============================================================================
# SOLUTIONS
# =============================================================================

# Closing banner: points the learner at the commented-out reference solutions.
print("\n" + "="*80)
print("Ready to check your solutions?")
print("Uncomment the code blocks above and run each exercise!")
print("="*80 + "\n")

# Tips for success.
# The emoji in the first and last lines were mis-encoded in the original
# strings and are restored here.
print("💡 Learning Tips:")
print("1. Work through exercises in order")
print("2. Uncomment code blocks one at a time")
print("3. Modify parameters to see effects")
print("4. Compare results with expected values")
print("5. Ask questions when stuck!")
print("\n✨ You've got this! Let's build some models! 🏀")


EXERCISE 1: Build Your First Pipeline

EXERCISE 2: Understand fit() vs transform()

EXERCISE 3: Implement Time-Series Cross-Validation

EXERCISE 4: Detect Data Leakage

--- Approach A ---
Approach A Accuracy: 0.6686

--- Approach B ---
Approach B Accuracy: 0.6686

✓ Which approach is correct? Think about it!
  Hint: Which scaler sees test data during training?

EXERCISE 5: Hyperparameter Tuning with GridSearchCV

EXERCISE 6: Compare Multiple Models

EXERCISE 7: Challenge - Production-Ready Pipeline

✓ Complete this challenge to master Pipelines & CV!
  Hints:
  1. Use features from FEATURES_FULL in baseline model
  2. Tune max_depth, learning_rate, n_estimators
  3. Use TimeSeriesSplit for CV
  4. Evaluate ONCE on test set

Ready to check your solutions?
Uncomment the code blocks above and run each exercise!

💡 Learning Tips:
1. Work through exercises in order
2. Uncomment code blocks one at a time
3. Modify parameters to see effects
4. Compare results with expected values
5. A