In [4]:
# [1] Setup & Imports (Absolute Path Fix)
# --------------------------------------------------------------------------
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier 
from sklearn.metrics import roc_auc_score
import joblib 

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

# ðŸš¨ THE FIX: Manually set the Project Root to the known correct location.
# This bypasses any confusion about the notebook's current working directory.
PROJECT_ROOT = Path("/Users/hc/Documents/projects/roku-patch-stability-analytics") 

DATA_DIR = PROJECT_ROOT / "data"
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(exist_ok=True) 
MODEL_PATH = MODEL_DIR / "catboost_classifier_v001.cbm"

print(f"Project Root (Fixed): {PROJECT_ROOT.resolve()}")
print(f"Data Directory (Fixed): {DATA_DIR.resolve()}")

Project Root (Fixed): /Users/hc/Documents/projects/roku-patch-stability-analytics
Data Directory (Fixed): /Users/hc/Documents/projects/roku-patch-stability-analytics/data


In [3]:
# [1] Setup & Imports
# --------------------------------------------------------------------------
import os
from pathlib import Path
# ... (rest of imports)

# Define Project and Data Paths relative to the notebook's location
# ASSUMPTION: Notebook lives in <project_root>/notebooks/
PROJECT_ROOT = Path.cwd().parent 
DATA_DIR = PROJECT_ROOT / "data"

print(f"Project Root (Calculated): {PROJECT_ROOT.resolve()}")
print(f"Data Directory (Calculated): {DATA_DIR.resolve()}")

Project Root (Calculated): /Users/hc/Documents/projects
Data Directory (Calculated): /Users/hc/Documents/projects/data


In [5]:
# [2] Load Historical Data and Simulate New Data Arrival
# --------------------------------------------------------------------------

# --- A. Load Historical (Training) Data ---
# This is the original synthetic feature file (simulates your entire historical dataset)
historical_df = pd.read_csv(DATA_DIR / "synthetic_firmware_features_50rows.csv")

# Ensure target column is present
if 'high_risk_flag' not in historical_df.columns:
    # Assuming 'error_rate_per_10k' is the raw target column from synthesis
    historical_df['high_risk_flag'] = (historical_df['error_rate_per_10k'] >= 1000).astype(int) 

# --- B. Simulate New Data Arrival (New Patches + Known Outcomes) ---
# In a real environment, this data would come from the live deployment and monitoring systems
# We'll simulate fetching 5 new patch records (with known outcomes)
new_data_df = historical_df.sample(n=5, random_state=99).copy()
new_data_df['firmware_version'] = new_data_df['firmware_version'].astype(str) + '.NEW'

# --- C. Combine for Retraining ---
retrain_df = pd.concat([historical_df, new_data_df], ignore_index=True)

print(f"Historical (Training) Size: {historical_df.shape[0]} rows")
print(f"Combined Retraining Dataset Size: {retrain_df.shape[0]} rows")

Historical (Training) Size: 50 rows
Combined Retraining Dataset Size: 55 rows


In [6]:
# [3] Drift Analysis and Retraining Trigger Check
# --------------------------------------------------------------------------

# 1. Check for Population Drift (Feature Change)
# Compares the mean of the most important feature (code_churn_score)
hist_mean_churn = historical_df['code_churn_score'].mean()
new_mean_churn = new_data_df['code_churn_score'].mean()
churn_drift = abs(new_mean_churn - hist_mean_churn) / hist_mean_churn * 100

print(f"Historical Mean Churn: {hist_mean_churn:.2f}")
print(f"New Data Mean Churn: {new_mean_churn:.2f}")
print(f"Feature Drift (Churn): {churn_drift:.2f}%")

# 2. Check for Prediction Drift (Model Performance Degradation)
# We load the existing model and test its performance on the *new* data
# If the performance drops below a threshold, we trigger a full retraining.

# NOTE: For this simulation, we will assume drift is found and always retrain.
RETRAIN_TRIGGERED = True # In a real system, this would be a calculated boolean

if RETRAIN_TRIGGERED:
    print("\nðŸš¨ RETRAINING TRIGGERED: (Simulated performance degradation or significant drift detected.)")
else:
    print("\nâœ… Drift within acceptable limits. No retraining needed.")

Historical Mean Churn: 0.34
New Data Mean Churn: 0.31
Feature Drift (Churn): 9.67%

ðŸš¨ RETRAINING TRIGGERED: (Simulated performance degradation or significant drift detected.)


In [7]:
# [4] Model Retraining and Versioning
# --------------------------------------------------------------------------

if RETRAIN_TRIGGERED:
    
    FEATURES = [
        "code_churn_score",
        "previous_version_error_rate",
        "avg_device_age_days",
        "is_hotfix",
        "patch_security",
    ]
    TARGET = "high_risk_flag"

    # Define X and y for the combined retraining dataset
    X_retrain = retrain_df[FEATURES]
    y_retrain = retrain_df[TARGET]

    # --- Retrain the CatBoost Classifier ---
    cbc_v002 = CatBoostClassifier(
        iterations=500,
        learning_rate=0.01,
        loss_function='Logloss',
        verbose=0,
        random_seed=42
    )

    cbc_v002.fit(X_retrain, y_retrain)
    
    # --- Versioning and Saving ---
    # In a real system, we would increment the version number dynamically
    new_model_path = MODEL_DIR / "catboost_classifier_v002.cbm"
    cbc_v002.save_model(new_model_path)
    
    print(f"\nâœ… Retraining complete. New model saved as: {new_model_path.name}")
    print(f"New model ROC AUC (Approximate): {roc_auc_score(y_retrain, cbc_v002.predict_proba(X_retrain)[:, 1]):.4f}")

    # Update the CI Gate to use the new version in production
    print("\n--- NEXT ACTION: Update CI Gate to load 'catboost_classifier_v002.cbm' ---")
else:
    print("Model version v001 remains active.")


âœ… Retraining complete. New model saved as: catboost_classifier_v002.cbm
New model ROC AUC (Approximate): 1.0000

--- NEXT ACTION: Update CI Gate to load 'catboost_classifier_v002.cbm' ---
