In [19]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

In [20]:
DATA_DIR = "../data"  # relative to your project root

firmware_features = pd.read_csv(f"{DATA_DIR}/synthetic_firmware_features_50rows.csv")
error_rates = pd.read_csv(f"{DATA_DIR}/error_rates.csv")
pre_post = pd.read_csv(f"{DATA_DIR}/pre_post_errors.csv")
rma_fw = pd.read_csv(f"{DATA_DIR}/rma_by_firmware.csv")
rma_err = pd.read_csv(f"{DATA_DIR}/rma_by_error.csv")
error_spikes = pd.read_csv(f"{DATA_DIR}/error_spikes.csv")
stability_curves = pd.read_csv(f"{DATA_DIR}/stability_curves.csv")
device_inventory = pd.read_csv(f"{DATA_DIR}/device_inventory.csv")
device_events = pd.read_csv(f"{DATA_DIR}/device_events.csv")
merged = pd.read_csv(f"{DATA_DIR}/firmware_features_ml.csv")
ml_df = firmware_features.copy()

print("Loaded synthetic firmware dataset!")
print(firmware_features.head())
print("\nShape:", firmware_features.shape)
print("ML dataframe columns:")
print(ml_df.columns.tolist())
print("firmware_features:\n", firmware_features.head(), "\n")
print("pre_post:\n", pre_post.head(), "\n")
print("rma_by_firmware:\n", rma_fw.head(), "\n")
print("error_rates:\n", error_rates.head(), "\n")
print("Merged ML dataset loaded. Rows:", len(merged))

Loaded synthetic firmware dataset!
  firmware_version release_date previous_version  days_since_previous_release  patch_size_mb  files_changed  \
0           10.0.1   2024-01-07            9.9.9                         90.0      19.437984            466   
1           10.0.2   2024-01-24           10.0.1                         17.0      11.815477            283   
2           10.0.3   2024-02-19           10.0.2                         26.0      10.562794            253   
3           10.0.4   2024-03-29           10.0.3                         39.0       9.799312            235   
4           10.1.0   2024-04-11           10.0.4                         13.0       8.381174            201   

   lines_changed  is_hotfix  patch_security  code_churn_score  avg_device_age_days  previous_version_error_rate  \
0           8436      False           False          0.279867           702.219142                     0.000000   
1           5123      False           False          0.292176       

In [10]:
print("firmware_features_ml columns:\n", firmware_features.columns.tolist(), "\n")
print("pre_post_errors columns:\n", pre_post.columns.tolist(), "\n")
print("rma_by_firmware columns:\n", rma_fw.columns.tolist(), "\n")

firmware_features_ml columns:
 ['firmware_version', 'release_date', 'previous_version', 'days_since_previous_release', 'patch_size_mb', 'files_changed', 'lines_changed', 'is_hotfix', 'patch_security', 'code_churn_score', 'avg_device_age_days', 'previous_version_error_rate', 'error_rate_per_10k'] 

pre_post_errors columns:
 ['release_version', 'error_code', 'pre_errors', 'post_errors', 'delta_errors', 'post_pre_ratio'] 

rma_by_firmware columns:
 ['firmware_version', 'tickets', 'rmas', 'rma_rate'] 



In [11]:
# --- Prepare pre_post_errors for merging ---

pre_post_clean = pre_post.copy()

# 1. rename key
pre_post_clean = pre_post_clean.rename(columns={
    "release_version": "firmware_version"
})

# 2. aggregate at firmware level (collapse per-error rows)
agg_pre_post = pre_post_clean.groupby("firmware_version").agg({
    "pre_errors": "sum",
    "post_errors": "sum",
    "delta_errors": "sum",
    "post_pre_ratio": "mean"
}).reset_index()

print("agg_pre_post:\n", agg_pre_post.head())

agg_pre_post:
   firmware_version  pre_errors  post_errors  delta_errors  post_pre_ratio
0           10.0.0           0         2112          2112             NaN
1           10.0.5        1993         2059            66        1.023333
2           10.1.0        1932         2065           133        1.098333
3           10.1.2        1933         1981            48        1.038333
4           10.2.0        1889         2034           145        1.081667


In [12]:
ml_df = firmware_features.copy()

print("Starting ML DF shape:", ml_df.shape)

# --- Merge aggregated pre/post errors ---
ml_df = ml_df.merge(
    agg_pre_post,
    on="firmware_version",
    how="left"
)

print("After merging pre_post:", ml_df.shape)

# --- Merge RMA data ---
ml_df = ml_df.merge(
    rma_fw,
    on="firmware_version",
    how="left",
    suffixes=("", "_rma")
)

print("After merging RMA:", ml_df.shape)

ml_df.head()

Starting ML DF shape: (50, 13)
After merging pre_post: (50, 17)
After merging RMA: (50, 20)


Unnamed: 0,firmware_version,release_date,previous_version,days_since_previous_release,patch_size_mb,files_changed,lines_changed,is_hotfix,patch_security,code_churn_score,avg_device_age_days,previous_version_error_rate,error_rate_per_10k,pre_errors,post_errors,delta_errors,post_pre_ratio,tickets,rmas,rma_rate
0,10.0.1,2024-01-07,9.9.9,90.0,19.437984,466,8436,False,False,0.279867,702.219142,0.0,2240.901031,,,,,,,
1,10.0.2,2024-01-24,10.0.1,17.0,11.815477,283,5123,False,False,0.292176,426.954965,452.75906,1645.261002,,,,,,,
2,10.0.3,2024-02-19,10.0.2,26.0,10.562794,253,4580,False,False,0.214577,336.319311,2133.975495,2590.104774,,,,,,,
3,10.0.4,2024-03-29,10.0.3,39.0,9.799312,235,4254,False,False,0.172038,340.087869,3624.725127,2429.288368,,,,,,,
4,10.1.0,2024-04-11,10.0.4,13.0,8.381174,201,3639,False,True,0.240462,467.025999,2519.057153,1150.953631,1932.0,2065.0,133.0,1.098333,1569.0,235.0,0.15


In [13]:
ml_df.info()

# Show key modeling features
ml_df[[
    "firmware_version",
    "error_rate_per_10k",
    "pre_errors",
    "post_errors",
    "post_pre_ratio",
    "tickets",
    "rmas",
    "rma_rate"
]].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   firmware_version             50 non-null     object 
 1   release_date                 50 non-null     object 
 2   previous_version             50 non-null     object 
 3   days_since_previous_release  50 non-null     float64
 4   patch_size_mb                50 non-null     float64
 5   files_changed                50 non-null     int64  
 6   lines_changed                50 non-null     int64  
 7   is_hotfix                    50 non-null     bool   
 8   patch_security               50 non-null     bool   
 9   code_churn_score             50 non-null     float64
 10  avg_device_age_days          50 non-null     float64
 11  previous_version_error_rate  50 non-null     float64
 12  error_rate_per_10k           50 non-null     float64
 13  pre_errors            

Unnamed: 0,firmware_version,error_rate_per_10k,pre_errors,post_errors,post_pre_ratio,tickets,rmas,rma_rate
0,10.0.1,2240.901031,,,,,,
1,10.0.2,1645.261002,,,,,,
2,10.0.3,2590.104774,,,,,,
3,10.0.4,2429.288368,,,,,,
4,10.1.0,1150.953631,1932.0,2065.0,1.098333,1569.0,235.0,0.15


In [14]:
ml_df = ml_df.copy()

# 1) Regression target: just reuse the observed error rate per 10k
ml_df["target_error_rate"] = ml_df["error_rate_per_10k"]

# 2) Regression-style severity: use post_pre_ratio,
#    but fill missing with 1.0 (meaning "no change / neutral")
ml_df["regression_ratio"] = ml_df["post_pre_ratio"].fillna(1.0)

# 3) Classification target: "high risk" firmware
#    Define high risk as being in the top 25% of error rates
threshold = ml_df["error_rate_per_10k"].quantile(0.75)
ml_df["high_risk_flag"] = (ml_df["error_rate_per_10k"] >= threshold).astype(int)

print("High-risk threshold (75th percentile):", threshold)
ml_df[[
    "firmware_version",
    "error_rate_per_10k",
    "target_error_rate",
    "pre_errors",
    "post_errors",
    "post_pre_ratio",
    "regression_ratio",
    "tickets",
    "rmas",
    "rma_rate",
    "high_risk_flag"
]]

High-risk threshold (75th percentile): 2723.581797358553


Unnamed: 0,firmware_version,error_rate_per_10k,target_error_rate,pre_errors,post_errors,post_pre_ratio,regression_ratio,tickets,rmas,rma_rate,high_risk_flag
0,10.0.1,2240.901031,2240.901031,,,,1.0,,,,0
1,10.0.2,1645.261002,1645.261002,,,,1.0,,,,0
2,10.0.3,2590.104774,2590.104774,,,,1.0,,,,0
3,10.0.4,2429.288368,2429.288368,,,,1.0,,,,0
4,10.1.0,1150.953631,1150.953631,1932.0,2065.0,1.098333,1.098333,1569.0,235.0,0.15,0
5,10.2.0,3553.338329,3553.338329,1889.0,2034.0,1.081667,1.081667,875.0,122.0,0.139,1
6,10.3.0,2662.086254,2662.086254,1960.0,2097.0,1.088333,1.088333,986.0,175.0,0.177,0
7,10.4.0,2968.478875,2968.478875,,,,1.0,,,,1
8,10.4.1,2956.309284,2956.309284,,,,1.0,,,,1
9,10.4.2,1577.17214,1577.17214,,,,1.0,,,,0


In [15]:
# Work on a copy so we don't accidentally mutate ml_df in weird ways
model_df = ml_df.copy()

# Convert bools to integers for consistency
bool_cols = ["patch_feature", "patch_hotfix", "patch_security"]
for col in bool_cols:
    model_df[col] = model_df[col].astype(int)

# -------------------------
# 1) Define feature columns
# -------------------------

feature_cols = [
    # Patch complexity / size
    "patch_size_mb",
    "files_changed",
    "lines_changed",
    "code_churn_score",
    
    # Release cadence / timing
    "days_since_previous_release",
    
    # Patch type flags / hotfix indicator
    "is_hotfix",
    "patch_feature",
    "patch_hotfix",
    "patch_security",
    
    # Historical performance context
    "previous_version_error_rate",
    "avg_device_age_days",
    
    # Aggregated error behavior
    "pre_errors",
    "post_errors",
    "delta_errors",
    "post_pre_ratio",
    "regression_ratio",
    
    # Support / RMA behavior
    "tickets",
    "rmas",
    "rma_rate"
]

# Keep only rows where we have everything we care about
required_cols = feature_cols + ["target_error_rate", "high_risk_flag"]
model_df = model_df.dropna(subset=required_cols)

print("Model DF shape after dropping NA:", model_df.shape)

# -------------------------
# 2) Build X and targets
# -------------------------

X = model_df[feature_cols].copy()
y_reg = model_df["target_error_rate"].copy()
y_clf = model_df["high_risk_flag"].copy()

print("Feature matrix X shape:", X.shape)
print("Regression target y_reg shape:", y_reg.shape)
print("Classification target y_clf shape:", y_clf.shape)

X.head()

KeyError: 'patch_feature'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# -------------------------
# Train / test split
# -------------------------

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_reg,
    test_size=0.33,   # ~ 1/3 for test, given small sample size
    random_state=42
)

print("Train size:", X_train_reg.shape[0], "rows")
print("Test size:", X_test_reg.shape[0], "rows")

# -------------------------
# Random Forest Regressor
# -------------------------

rf_reg = RandomForestRegressor(
    n_estimators=300,
    random_state=42
)

rf_reg.fit(X_train_reg, y_train_reg)

reg_preds = rf_reg.predict(X_test_reg)

# Evaluate: MSE -> RMSE and R²
mse = mean_squared_error(y_test_reg, reg_preds)
rmse = mse ** 0.5
r2 = r2_score(y_test_reg, reg_preds)

print("RandomForestRegressor Performance")
print("  RMSE:", rmse)
print("  R²  :", r2)
print("\nActual vs Predicted (test set):")
for actual, pred in zip(y_test_reg, reg_preds):
    print(f"  actual={actual:.1f}  predicted={pred:.1f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get feature importances from the fitted regressor
importances = rf_reg.feature_importances_
fi_reg = (
    pd.DataFrame({
        "feature": feature_cols,
        "importance": importances
    })
    .sort_values("importance", ascending=False)
)

fi_reg

In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(data=fi_reg, x="importance", y="feature")
plt.title("RandomForestRegressor - Feature Importance (Firmware Error Rate)")
plt.tight_layout()
plt.show()

In [None]:
# Poisson Regression on synthetic firmware data (scaled)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Make sure ml_df is the synthetic firmware dataframe
print("Columns in ml_df:\n", ml_df.columns.tolist(), "\n")

# 1. Choose features that actually exist in the synthetic dataset
poisson_features = [
    "patch_size_mb",
    "files_changed",
    "lines_changed",
    "code_churn_score",
    "avg_device_age_days",
    "days_since_previous_release",
    "previous_version_error_rate",
    "is_hotfix",
    "patch_security",
]

# Design matrix X (cast to float where needed) 
X_pois = ml_df[poisson_features].astype(float)

# Poisson needs a non-negative target. We’ll turn the rate into a
# pseudo “count” by scaling it down.
y_pois = (ml_df["error_rate_per_10k"] / 10.0).clip(lower=0)

print("Poisson dataset shape:", X_pois.shape)

# 2. Train/test split
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_pois, y_pois, test_size=0.3, random_state=42
)

# 3. Pipeline: scale features -> Poisson regression
poisson_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", PoissonRegressor(alpha=0.1, max_iter=1000))
])

poisson_pipeline.fit(X_train_p, y_train_p)

# 4. Predictions and evaluation
y_pred_p = poisson_pipeline.predict(X_test_p)

rmse_p = mean_squared_error(y_test_p, y_pred_p) ** 0.5
r2_p = r2_score(y_test_p, y_pred_p)

print("\nPoisson Regression performance:")
print(f"  RMSE: {rmse_p:.2f}")
print(f"  R²  : {r2_p:.3f}")

# 5. Inspect coefficients (after the scaler)
poisson_model = poisson_pipeline.named_steps["model"]

print("\nPoisson coefficients:")
for feat, coef in zip(poisson_features, poisson_model.coef_):
    print(f"  {feat:30s}: {coef:.4f}")

In [None]:
y = ml_df["error_rate_per_10k"]

print("Mean of target:", y.mean())
print("Variance of target:", y.var())
print("Variance / Mean:", y.var() / y.mean())

In [None]:
# --- Rerunning Negative Binomial with reduced feature set ---

# 1. Choose a reduced, less collinear feature set
reduced_features = [
    "code_churn_score", 
    "previous_version_error_rate",
    "avg_device_age_days", 
    "is_hotfix",
    "patch_security"
]

# 2. Re-build design matrix X and target y
X_nb_reduced = ml_df[reduced_features].copy().astype('float64')
y_nb_reduced = ml_df["error_rate_per_10k"].copy().astype('float64') 

# Optional: ensure no NaNs
nb_df_reduced = pd.concat([X_nb_reduced, y_nb_reduced], axis=1).dropna()
X_nb_reduced = nb_df_reduced[reduced_features]
y_nb_reduced = nb_df_reduced["error_rate_per_10k"]

print("Reduced Negative Binomial dataset shape:", X_nb_reduced.shape)

# 3. Train/test split (using the same split ratio)
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    X_nb_reduced, y_nb_reduced, test_size=0.33, random_state=42
)

# 4. Add constant
X_train_red_sm = sm.add_constant(X_train_red, prepend=False)
X_test_red_sm = sm.add_constant(X_test_red, prepend=False)

# 5. FIT NEGATIVE BINOMIAL MODEL (Reduced)
nb_model_reduced = sm.GLM(
    y_train_red, 
    X_train_red_sm, 
    family=sm.families.NegativeBinomial()
).fit()

# Predictions and evaluation
y_pred_nb_red = nb_model_reduced.predict(X_test_red_sm)
rmse_nb_red = mean_squared_error(y_test_red, y_pred_nb_red) ** 0.5
r2_nb_red = r2_score(y_test_red, y_pred_nb_red)

print("\n--- Negative Binomial Regression (Reduced Features) Performance ---")
print(f" RMSE: {rmse_nb_red:.2f}")
print(f" R² : {r2_nb_red:.3f}")
print("\nReduced Negative Binomial coefficients:")
for feat, coef in zip(X_train_red_sm.columns, nb_model_reduced.params):
    print(f" {feat:30s}: {coef:.4f}")

In [None]:
print("Creating classification target...")

error_threshold = ml_df['error_rate_per_10k'].quantile(0.75)
ml_df['high_risk_flag'] = (ml_df['error_rate_per_10k'] > error_threshold).astype(int)

print("Threshold:", error_threshold)
print(ml_df['high_risk_flag'].value_counts())

In [None]:
# 1. Create Classification Target (assuming 75th percentile threshold)
error_threshold = ml_df['error_rate_per_10k'].quantile(0.75)
ml_df['high_risk_flag'] = (ml_df['error_rate_per_10k'] > error_threshold).astype(int)

# 2. Define Feature Set (Using the reduced set for stability)
rf_features = [
    "code_churn_score", 
    "previous_version_error_rate",
    "avg_device_age_days", 
    "is_hotfix",
    "patch_security"
]

X = ml_df[rf_features].astype('float64')
y_reg = ml_df["error_rate_per_10k"].astype('float64') 
y_clf = ml_df["high_risk_flag"].astype('int') 

# Split data (using the same random state 42)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.33, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.33, random_state=42)

# --- A. RANDOM FOREST REGRESSOR ---

rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train_r, y_train_r)
y_pred_rfr = rfr.predict(X_test_r)

rmse_rfr = mean_squared_error(y_test_r, y_pred_rfr) ** 0.5
r2_rfr = r2_score(y_test_r, y_pred_rfr)

print("="*60)
print("A. Random Forest Regressor Performance")
print(f" RMSE: {rmse_rfr:.2f}")
print(f" R² : {r2_rfr:.3f}")

# --- B. RANDOM FOREST CLASSIFIER ---

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train_c, y_train_c)
y_pred_rfc = rfc.predict(X_test_c)
y_proba_rfc = rfc.predict_proba(X_test_c)[:, 1]

print("\nB. Random Forest Classifier Performance")
print(f" Accuracy: {accuracy_score(y_test_c, y_pred_rfc):.3f}")
print(f" ROC-AUC Score: {roc_auc_score(y_test_c, y_proba_rfc):.3f}")
print("\nClassification Report:\n", classification_report(y_test_c, y_pred_rfc))

# --- C. DIAGNOSTICS: FEATURE IMPORTANCE & CONFUSION MATRIX ---

print("\nC. Random Forest Feature Importance:")
feature_importances = pd.Series(rfr.feature_importances_, index=rf_features).sort_values(ascending=False)
print(feature_importances)

# Optional: Generate Plots for Diagnostics
# 
#

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score
from catboost import CatBoostRegressor, CatBoostClassifier

# NOTE: The data splits (X_train_r, y_train_r, etc.) must be defined from the previous step.
# We will use the reduced feature set (5 features) for stability.
rf_features = [
    "code_churn_score", 
    "previous_version_error_rate",
    "avg_device_age_days", 
    "is_hotfix",
    "patch_security"
]

# --- A. CATBOOST REGRESSOR ---

# Using minimal verbosity and early stopping for speed
cbr = CatBoostRegressor(
    random_seed=42, 
    verbose=0, 
    early_stopping_rounds=10, 
    n_estimators=1000,
    loss_function='RMSE'
)
# Assuming X_train_r and y_train_r exist
cbr.fit(X_train_r, y_train_r, eval_set=(X_test_r, y_test_r), use_best_model=True)
y_pred_cbr = cbr.predict(X_test_r)

rmse_cbr = mean_squared_error(y_test_r, y_pred_cbr) ** 0.5
r2_cbr = r2_score(y_test_r, y_pred_cbr)

print("="*60)
print("D. CatBoost Regressor Performance")
print(f" RMSE: {rmse_cbr:.2f}")
print(f" R² : {r2_cbr:.3f}")


# --- B. CATBOOST CLASSIFIER ---

# Using minimal verbosity and early stopping for speed
cbc = CatBoostClassifier(
    random_seed=42, 
    verbose=0, 
    early_stopping_rounds=10, 
    n_estimators=1000,
    loss_function='Logloss'
)
# Assuming X_train_c and y_train_c exist
cbc.fit(X_train_c, y_train_c, eval_set=(X_test_c, y_test_c), use_best_model=True)
y_pred_cbc = cbc.predict(X_test_c)
y_proba_cbc = cbc.predict_proba(X_test_c)[:, 1]

print("\nE. CatBoost Classifier Performance")
print(f" Accuracy: {accuracy_score(y_test_c, y_pred_cbc):.3f}")
print(f" ROC-AUC Score: {roc_auc_score(y_test_c, y_proba_cbc):.3f}")

# --- C. DIAGNOSTICS: CATBOOST FEATURE IMPORTANCE ---

print("\nF. CatBoost Feature Importance (Regressor):")
feature_importances_cbr = pd.Series(cbr.get_feature_importance(), index=rf_features).sort_values(ascending=False)
print(feature_importances_cbr)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming Random Forest Regressor (rfr) had the best performance (use cbr if better)
y_test_final = y_test_r
y_pred_final = y_pred_rfr # or y_pred_cbr if CatBoost is better

plt.figure(figsize=(7, 7))
sns.scatterplot(x=y_test_final, y=y_pred_final)
plt.plot([y_test_final.min(), y_test_final.max()], 
         [y_test_final.min(), y_test_final.max()], 
         'r--', lw=2)
plt.title('Actual vs. Predicted Error Rate (Best Regressor)')
plt.xlabel('Actual Error Rate per 10k')
plt.ylabel('Predicted Error Rate per 10k')
plt.savefig('actual_vs_predicted_regressor.png')
plt.close()
print("Generated actual_vs_predicted_regressor.png")

In [None]:
from sklearn.metrics import confusion_matrix

# Assuming Random Forest Classifier (rfc) had the best performance (use cbc if better)
y_test_final_clf = y_test_c
y_pred_final_clf = y_pred_rfc # or y_pred_cbc if CatBoost is better

cm = confusion_matrix(y_test_final_clf, y_pred_final_clf)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Low Risk', 'High Risk'],
            yticklabels=['Low Risk', 'High Risk'])
plt.title('Confusion Matrix (Best Classifier)')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix_classifier.png')
plt.close()
print("Generated confusion_matrix_classifier.png")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Assuming CatBoost Regressor (cbr) is the best
y_test_final = y_test_r
y_pred_final = y_pred_cbr 

plt.figure(figsize=(7, 7))
sns.scatterplot(x=y_test_final, y=y_pred_final)
plt.plot([y_test_final.min(), y_test_final.max()], 
         [y_test_final.min(), y_test_final.max()], 
         'r--', lw=2)
plt.title('Actual vs. Predicted Error Rate (CatBoost Regressor)')
plt.xlabel('Actual Error Rate per 10k')
plt.ylabel('Predicted Error Rate per 10k')
plt.savefig('actual_vs_predicted_regressor.png')
plt.close()
print("Generated actual_vs_predicted_regressor.png")

In [None]:
# Assuming CatBoost Classifier (cbc) is the best
y_test_final_clf = y_test_c
y_pred_final_clf = y_pred_cbc 

cm = confusion_matrix(y_test_final_clf, y_pred_final_clf)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Low Risk', 'High Risk'],
            yticklabels=['Low Risk', 'High Risk'])
plt.title('Confusion Matrix (CatBoost Classifier)')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix_classifier.png')
plt.close()
print("Generated confusion_matrix_classifier.png")

In [None]:
from sklearn.metrics import roc_curve, auc

# Reusing variables from the CatBoost Classifier (cbc)
fpr, tpr, thresholds = roc_curve(y_test_final_clf, y_proba_cbc)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig('roc_curve.png')
plt.close()
print("Generated roc_curve.png")

In [None]:
# After model training is complete and performance is confirmed
cbc.save_model("catboost_risk_classifier.cbm")

In [None]:
from catboost import CatBoostClassifier
import numpy as np

# Same feature set you used to train the classifier
CLASSIFIER_FEATURES = [
    "code_churn_score",
    "previous_version_error_rate",
    "avg_device_age_days",
    "is_hotfix",
    "patch_security",
]

# 1. Load the saved model
cbc_loaded = CatBoostClassifier()
cbc_loaded.load_model("catboost_risk_classifier.cbm")

# 2. Take a few rows from ml_df as a proxy for “new patches”
X_sample = ml_df[CLASSIFIER_FEATURES].tail(5)

# 3. Get high-risk probabilities
proba_sample = cbc_loaded.predict_proba(X_sample)[:, 1]
flag_sample = (proba_sample >= 0.5).astype(int)

print("Sample high-risk probabilities:", np.round(proba_sample, 3))
print("Sample flags (1 = high risk):  ", flag_sample.tolist())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error

# ----------------------------------------------------
# STEP A: Re-Establish Data & Splits 
# ----------------------------------------------------

# UNCOMMENT AND CORRECT THE PATH TO LOAD YOUR DATA
# If you are running this from your project root, the path below should work.
ml_df = pd.read_csv('../data/synthetic_firmware_features_50rows.csv') 

# Define features and targets
rf_features = [
    "code_churn_score", "previous_version_error_rate",
    "avg_device_age_days", "is_hotfix", "patch_security"
]

# UNCOMMENT THESE LINES TO RE-CREATE TARGETS & SPLITS
error_threshold = ml_df['error_rate_per_10k'].quantile(0.75)
ml_df['high_risk_flag'] = (ml_df['error_rate_per_10k'] > error_threshold).astype(int)
X = ml_df[rf_features].astype('float64')
y_reg = ml_df["error_rate_per_10k"].astype('float64')
y_clf = ml_df["high_risk_flag"].astype('int')
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.33, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.33, random_state=42)


# ----------------------------------------------------
# STEP B: Train and Save CatBoost Regressor (Rec #2)
# ----------------------------------------------------
cbr = CatBoostRegressor(random_seed=42, verbose=0, n_estimators=1000, loss_function='RMSE')
cbr.fit(X_train_r, y_train_r, eval_set=(X_test_r, y_test_r), use_best_model=True)
cbr.save_model("catboost_error_regressor.cbm") 
print("Regressor model saved as catboost_error_regressor.cbm")


# ----------------------------------------------------
# STEP C: Train and Save CatBoost Classifier (Rec #1)
# ----------------------------------------------------
cbc = CatBoostClassifier(random_seed=42, verbose=0, n_estimators=1000, loss_function='Logloss')
cbc.fit(X_train_c, y_train_c, eval_set=(X_test_c, y_test_c), use_best_model=True)
cbc.save_model("catboost_risk_classifier.cbm") 
print("Classifier model saved as catboost_risk_classifier.cbm")

In [None]:
# STEP 1: Compute historical churn bounds from training data (ml_df)

print("Columns in ml_df:", ml_df.columns.tolist())

churn_min = ml_df["code_churn_score"].min()
churn_max = ml_df["code_churn_score"].max()

print(f"\nHistorical code_churn_score min: {churn_min:.6f}")
print(f"Historical code_churn_score max: {churn_max:.6f}")

In [17]:
# Assuming ml_df is loaded or you can load it here
# ml_df = pd.read_csv('../data/synthetic_firmware_features_50rows.csv') 

churn_min = ml_df["code_churn_score"].min()
churn_max = ml_df["code_churn_score"].max()

print(f"Historical Churn MIN: {churn_min:.2f}")
print(f"Historical Churn MAX: {churn_max:.2f}")

Historical Churn MIN: 0.14
Historical Churn MAX: 0.87


In [18]:
# --- 2. DEFINE CONSTANTS FOR STABLE SCALING ---
# Replace placeholder values with actual historical churn min/max
CHURN_HISTORICAL_MIN = 0.14
CHURN_HISTORICAL_MAX = 0.87

# Weights for composite score (can adjust based on QA preference)
W_MODEL = 0.6
W_CHURN = 0.4