# In [None]:
# # Notebook 3: Cross-Validation for Time Series Data
#
# ## Goals
# * Understand why standard shuffled CV methods are incorrect for time-dependent data.
# * Learn about **Lookahead Bias**.
# * Implement time series cross-validation using `sklearn.model_selection.TimeSeriesSplit`.
# * Visualize the training and validation indices used in time series CV.

# ## 1. The Problem: Temporal Dependency
#
# Many datasets, especially in monitoring or forecasting scenarios, have an inherent **temporal order**. Examples:
# *   Predicting patient deterioration based on vital signs over the last few hours.
# *   Forecasting disease outbreaks based on weekly case counts.
# *   Analyzing sequential events in EHRs.
#
# In these cases, the order of data points matters. The value at time `t` might depend on values at `t-1`, `t-2`, etc.
#
# **Why standard K-Fold fails:** Standard methods (such as `KFold` or `StratifiedKFold` with `shuffle=True`) randomly shuffle the data before splitting, which breaks the temporal order. Even without shuffling, K-Fold still validates on early folds using models trained on later ones. Either way, a model could be trained on data from the future (e.g., Wednesday, Friday) to predict data from the past (e.g., Monday), which is impossible in a real-world scenario.
#
# **Lookahead Bias:** Using future information to make predictions about the past or present leads to overly optimistic performance estimates that are not achievable in practice.

# ## 2. The Solution: Time Series Cross-Validation
#
# Time series CV techniques respect the temporal order. They ensure that the validation set always consists of data points that occurred *after* the data points in the training set.
#
# **`TimeSeriesSplit` in Scikit-learn:**
# This splitter yields folds where the training set grows (or slides) over time, and the validation set consists of the points immediately following the training set.
#
# ```
# Fold 1: [----------] [validation]
# Fold 2: [--------------------] [validation]
# Fold 3: [------------------------------] [validation]
# ...
# ```
# (`[-----]` denotes the training window for that fold; `[validation]` is the block of samples held out for testing in that fold)

# ## 3. Setup and Generating Time Series Data
#
# Let's create some simple synthetic time series data. We'll create features `X_time` and a target `y_time`. For simplicity, we'll assume the data is already sorted chronologically.

# +
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression # Example: Regression task
from sklearn.metrics import mean_squared_error

# %matplotlib inline

# Seed NumPy's global generator so the noise draws below are reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Size of the synthetic dataset: rows are consecutive time steps.
N_SAMPLES_TIME = 300
N_FEATURES_TIME = 3  # number of columns assembled into X_time below

# Integer time steps 0 .. N_SAMPLES_TIME - 1 (already in chronological order)
time_index = np.arange(N_SAMPLES_TIME)

# Feature matrix, one column per signal. The noise column is the first
# random draw after seeding; the target noise further down is the second.
X_time = np.column_stack([
    np.sin(time_index / 20),                # column 0: cyclical feature
    time_index / N_SAMPLES_TIME,            # column 1: trend feature
    0.5 * np.random.randn(N_SAMPLES_TIME),  # column 2: noise feature
])

# Target: time-shifted versions of the sine and trend signals plus noise,
# accumulated term by term in the same order as a single summed expression.
y_time = 0.8 * np.sin((time_index - 5) / 20)           # sine shifted by 5 steps
y_time += 0.5 * ((time_index - 10) / N_SAMPLES_TIME)   # trend shifted by 10 steps
y_time += 0.3 * np.random.randn(N_SAMPLES_TIME)        # observation noise


# Optional sanity plot: features on the top panel, target on the bottom one.
fig_overview, (ax_features, ax_target) = plt.subplots(2, 1, figsize=(12, 6))

# Top panel: the two structured features (the noise column is not drawn)
ax_features.plot(time_index, X_time[:, 0], label='Feature 1 (Sine)')
ax_features.plot(time_index, X_time[:, 1], label='Feature 2 (Trend)')
ax_features.set_title('Synthetic Features')
ax_features.legend()
ax_features.set_ylabel('Value')

# Bottom panel: the target series over the same time axis
ax_target.plot(time_index, y_time, label='Target Variable (y_time)', color='C0')
ax_target.set_title('Synthetic Target Variable')
ax_target.legend()
ax_target.set_xlabel('Time Step')
ax_target.set_ylabel('Value')

plt.tight_layout()
plt.show()

# Report the array shapes as a quick consistency check
print(f"Generated X_time shape: {X_time.shape}")
print(f"Generated y_time shape: {y_time.shape}")
# -

# ## 4. Applying TimeSeriesSplit
#
# We instantiate `TimeSeriesSplit` and use it with `cross_val_score`. Common parameters include:
# *   `n_splits`: Number of folds to create.
# *   `max_train_size`: Optional, to create a sliding window instead of an expanding one.
# *   `gap`: Optional, number of samples to leave between train and test sets.

# +
# Estimator under evaluation: plain linear regression for this synthetic task
model_time = LinearRegression()

# Chronological splitter with the default expanding training window
N_SPLITS_TIME = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS_TIME)

print(f"\n--- Running Time Series Cross-Validation ({N_SPLITS_TIME} Splits) ---")

# The scorer is the *negated* MSE because cross_val_score treats larger
# scores as better; the sign is flipped back right after.
time_scores = cross_val_score(
    estimator=model_time,
    X=X_time,
    y=y_time,
    scoring='neg_mean_squared_error',
    cv=tscv,
    n_jobs=-1,
)

# Recover ordinary (positive) per-fold MSE values
time_mse_scores = np.negative(time_scores)

# Per-fold values first, then their mean and spread
print(f"\nIndividual fold MSEs (TimeSeriesSplit): {time_mse_scores}")
print(f"Mean MSE:      {time_mse_scores.mean():.4f}")
print(f"Std deviation: {time_mse_scores.std():.4f}")
# -

# ## 5. Visualizing TimeSeriesSplit Indices
#
# Let's manually iterate through the splits to see how the training and validation indices are assigned, confirming the temporal separation.

# +
def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
    """Draw one horizontal band per CV split showing each sample's role.

    Parameters
    ----------
    cv : object
        Splitter whose ``split(X=..., y=...)`` yields ``(train, test)``
        index arrays.
    X, y : array-like
        Data handed to ``cv.split``; ``len(X)`` sets the x-axis extent.
    ax : matplotlib Axes
        Axes that receives the bands.
    n_splits : int
        Number of rows labelled on the y-axis.
    lw : int, default 10
        Line width of the band markers.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    n_samples = len(X)
    sample_positions = range(n_samples)

    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y)):
        # Code each sample for this split: 0 = train, 1 = test,
        # NaN = used by neither set.
        membership = np.full(n_samples, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0

        # One row of flat markers at height fold_idx + 0.5, colored by role
        ax.scatter(
            sample_positions,
            [fold_idx + .5] * n_samples,
            c=membership,
            marker='_',
            lw=lw,
            cmap=plt.cm.coolwarm,
            vmin=-.2,
            vmax=1.2,
        )

    # Axis cosmetics; the reversed ylim puts split 0 at the top of the plot
    ax.set(
        yticks=np.arange(n_splits) + .5,
        yticklabels=list(range(n_splits)),
        xlabel='Sample Index',
        ylabel="CV Iteration",
        ylim=[n_splits, -.2],
        xlim=[0, n_samples],
    )
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

# Visualize TimeSeriesSplit
fig, ax = plt.subplots(figsize=(10, 3))
plot_cv_indices(tscv, X_time, y_time, ax, N_SPLITS_TIME)

# Build the legend from proxy artists: small dummy lines that only carry the
# colors of the two ends of the coolwarm colormap. Both entries go into ONE
# legend call, because calling ax.legend() a second time replaces the legend
# created by the first call and would leave only the last entry visible.
legend_handles = [
    plt.Line2D([0], [0], color=plt.cm.coolwarm(0.), lw=4),
    plt.Line2D([0], [0], color=plt.cm.coolwarm(1.), lw=4),
]
legend_labels = ['Training set', 'Validation set']
ax.legend(legend_handles, legend_labels, loc=(1.02, .8))

plt.tight_layout()
plt.show()

# Show the index ranges of the first two folds straight from the splitter
print("\n--- Manual Time Series Split Example (First 2 Folds) ---")
for fold_counter, (train_index, val_index) in enumerate(tscv.split(X_time), start=1):
    # Only the first two folds are printed
    if fold_counter > 2:
        break
    print(f"\nFold {fold_counter}:")
    print(f"  Train indices (Range): {train_index.min()}-{train_index.max()} (Size: {len(train_index)})")
    print(f"  Val indices (Range):   {val_index.min()}-{val_index.max()} (Size: {len(val_index)})")
    # Temporal-order check: earliest validation index must exceed latest train index
    print(f"  Validation starts after Train ends: {val_index.min() > train_index.max()}")
# -

# **Observation:** The visualization clearly shows that the validation set (red) always comes *after* the training set (blue) in each CV iteration, respecting the temporal order. The training set size increases with each fold in the default expanding window setup.

# ## 6. Conclusion
#
# When dealing with time-ordered data, using specialized cross-validation techniques like `TimeSeriesSplit` is crucial to avoid lookahead bias and obtain realistic performance estimates. Randomly shuffling time series data before applying standard K-Fold CV will lead to misleadingly optimistic results. Always consider the nature of your data and choose the appropriate CV strategy.