In [15]:
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

%run ml_project_normalized.ipynb

Note: you may need to restart the kernel to use updated packages.
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed
Done running ml_project.ipynb.


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import numpy as np
import pandas as pd

def _select_classes(prob_matrix, class_order, thresholds=None,
                    excluded_fallback='Adoption', default_threshold=0.5):
    """
    Turn per-class One-vs-Rest probabilities into one label per row.

    For every row, the class with the highest probability among those that
    reach their threshold (and have a probability above 0) is selected; ties
    go to the class that comes first in ``class_order``. When no class
    qualifies, the highest-probability class other than ``excluded_fallback``
    is used instead.

    Parameters:
        prob_matrix (array-like): One row per sample, one column per class,
            columns in the same order as ``class_order``.
        class_order (list): Class labels matching the columns.
        thresholds (dict): Optional per-class cut-offs; classes missing from
            the dict (or a missing/empty dict) use ``default_threshold``.
        excluded_fallback: Label that must not be chosen by the fallback.
        default_threshold (float): Cut-off for classes without an entry.

    Returns:
        (list, int): Selected labels and the number of rows that needed the
        fallback.
    """
    # Resolve thresholds once instead of once per row and class.
    cutoffs = [
        thresholds.get(cls, default_threshold) if thresholds else default_threshold
        for cls in class_order
    ]

    fallback_idx = [i for i, cls in enumerate(class_order) if cls != excluded_fallback]
    if not fallback_idx:
        # Only the excluded class exists: fall back to it rather than crash on
        # an empty max().
        fallback_idx = list(range(len(class_order)))

    predictions = []
    fallback_count = 0
    for prob_row in prob_matrix:
        best_idx = None
        best_score = 0
        for idx, score in enumerate(prob_row):
            if score >= cutoffs[idx] and score > best_score:
                best_idx = idx
                best_score = score

        # Compare against None explicitly so falsy labels (0, "") are not
        # mistaken for "nothing selected".
        if best_idx is None:
            best_idx = max(fallback_idx, key=lambda i: prob_row[i])
            fallback_count += 1

        predictions.append(class_order[best_idx])

    return predictions, fallback_count


def train_classifier(X_train, y_train_raw, X_test, thresholds=None, base_model=None):
    """
    Trains separate One-vs-Rest classifiers for each class.
    Includes per-fold CV scoring and accuracy on the training set.
    Defaults to the highest-probability non-Adoption class when no class
    reaches its threshold.

    Parameters:
        X_train (DataFrame): Training features
        y_train_raw (Series): Target labels (strings)
        X_test (DataFrame): Test features
        thresholds (dict): Optional per-class thresholds; classes without an
            entry use 0.5
        base_model (sklearn estimator): Optional custom classifier. A fresh
            clone is fitted for every class, so it must implement
            ``predict_proba``. When omitted, a 200-tree Random Forest with
            balanced class weights (computed on the full binary target) is
            used for each class.

    Returns:
        final_preds (Series): Final predictions on X_test, indexed like X_test
        ovr_models (dict): Dictionary of trained models keyed by class label
    """
    if base_model is not None:
        # Local import: only the custom-estimator path needs it.
        from sklearn.base import clone

    class_labels = sorted(y_train_raw.unique())
    ovr_models = {}
    test_probs = []
    train_probs = []

    print("\n=== [ OvR Model Training and Cross-Validation ] ===")
    for target_class in class_labels:
        print(f"\n[INFO] OvR model for class: '{target_class}'")
        y_binary = (y_train_raw == target_class).astype(int)

        if base_model is None:
            weights = compute_class_weight(
                class_weight='balanced',
                classes=np.array([0, 1]),
                y=y_binary
            )
            clf = RandomForestClassifier(
                n_estimators=200,
                random_state=42,
                n_jobs=-1,
                class_weight={0: weights[0], 1: weights[1]}
            )
        else:
            # Unfitted copy per class so the caller's estimator is never mutated
            # and models do not share state.
            clf = clone(base_model)

        print("[INFO] 5-Fold CV (balanced accuracy):")
        scores = cross_val_score(
            clf,
            X_train,
            y_binary,
            cv=5,
            scoring='balanced_accuracy',
            n_jobs=-1
        )
        for i, score in enumerate(scores):
            print(f"  Fold {i+1}: {score:.4f}")
        print(f"  Mean CV score: {scores.mean():.4f}")

        # cross_val_score fits clones only; fit the kept model on all training data.
        clf.fit(X_train, y_binary)
        ovr_models[target_class] = clf

        # Column 1 is the probability of the positive (target) class.
        test_probs.append(clf.predict_proba(X_test)[:, 1])
        train_probs.append(clf.predict_proba(X_train)[:, 1])

    # Stack to shape (n_samples, n_classes), columns ordered like class_labels.
    final_preds, fallback_count = _select_classes(
        np.vstack(test_probs).T, class_labels, thresholds
    )
    print(f"\n[INFO] Fallback to best non-Adoption class used for {fallback_count} samples")

    # === Accuracy on the training set ===
    # NOTE: the models are scored on the data they were fitted on, so these
    # numbers are optimistic; the per-class CV scores above are the better
    # estimate of generalization.
    print("\n=== [ Generalization Accuracy on Training Set ] ===")
    train_preds, fallback_train = _select_classes(
        np.vstack(train_probs).T, class_labels, thresholds
    )

    acc = accuracy_score(y_train_raw, train_preds)
    bal_acc = balanced_accuracy_score(y_train_raw, train_preds)
    print(f"[INFO] Accuracy:           {acc:.4f}")
    print(f"[INFO] Balanced Accuracy:  {bal_acc:.4f}")
    print(f"[INFO] Fallbacks (train):  {fallback_train}")

    return pd.Series(final_preds, index=X_test.index), ovr_models


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# === [1] EXTRA DATA CLEANING ===
# df_train, df_test and bucket_seasons are not defined in this cell; presumably
# they come from the notebook executed earlier via %run -- TODO confirm.
# NOTE(review): df_train/df_test are rebound here, so this cell is not
# idempotent: after one run 'outcome_type' has been dropped from df_train, and
# re-running without re-running the upstream cell fails at the lookup below.
df_train = bucket_seasons(df_train)
df_test  = bucket_seasons(df_test)

# Keep the string labels aside before removing the target from the features.
y_train_raw = df_train['outcome_type']
# Only the training frame has these columns dropped here; df_test is left as is.
df_train = df_train.drop(columns=['outcome_type', 'is_mix', 'intake_hour', 'intake_month'])

# Work on copies so the encoding steps further down do not modify df_train/df_test.
X_train = df_train.copy()
X_test  = df_test.copy()

# === [2] Label encode outcome_type for reporting and consistency ===
# y_encoded is only used in this cell for the mapping printout and the
# rare-class listing below.
le = LabelEncoder()
y_encoded = le.fit_transform(y_train_raw)
print("Encoding mapping:", dict(zip(le.classes_, range(len(le.classes_)))))

# === [3] Identify rare classes (< 5% frequency, for logging/debugging) ===
# Collect the encoded labels whose count is below 5% of all training rows.
rare_classes = [
    label for label, count in pd.Series(y_encoded).value_counts().items()
    if count < 0.05 * len(y_encoded)
]
print("Rare classes (by encoded value):")
for cls in rare_classes:
    # le.classes_[cls] maps the encoded integer back to its original label.
    print(f"  {cls}: {le.classes_[cls]}")

# === [4] Categorical Feature Encoding ===
cat_cols_onehot = ['intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake', 'season', 'breed']
cat_cols_freq   = ['primary_color']

# Frequency encode high-cardinality features.
# The counts come from the training set only; values that appear only in the
# test set have no entry in the map and are encoded as 0.
for col in cat_cols_freq:
    freq_map = X_train[col].value_counts()
    X_train[col] = X_train[col].map(freq_map)
    X_test[col] = X_test[col].map(freq_map).fillna(0)

# Snapshot for segmented analysis: taken after frequency encoding but before
# one-hot encoding, so the one-hot columns (e.g. animal_type) are still raw.
raw_X_train = X_train.copy()

# One-hot encode categorical features.
# drop_first is applied to the training set only. Dropping the first level of
# the test set independently would remove a different level whenever the test
# data lacks the training baseline, and the reindex below would then zero-fill
# that column, silently turning those rows into the baseline category.
X_train = pd.get_dummies(X_train, columns=cat_cols_onehot, drop_first=True)
X_test = pd.get_dummies(X_test,  columns=cat_cols_onehot)

# Ensure same columns in both sets: this drops the training baseline level and
# any test-only columns/levels, and adds all-zero columns for levels the test
# set never contains.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# === [5] Define Thresholds for Each Class ===
# Minimum positive-class probability a class's One-vs-Rest model must reach for
# that class to be eligible; train_classifier uses 0.5 for any class not listed.
# Rows where no class reaches its threshold get the highest-scoring
# non-Adoption class.
thresholds = {
    'Adoption': 0.9,
    'Died': 0.2,
    'Euthanasia': 0.3,
    'Return to Owner': 0.5,
    'Transfer': 0.5
}

# === [6] Train OvR Model and Predict ===
# Returns the test-set predictions (a Series indexed like X_test) and a dict of
# the fitted per-class models keyed by class label.
final_predictions, ovr_models = train_classifier(
    X_train=X_train,
    y_train_raw=y_train_raw,
    X_test=X_test,
    thresholds=thresholds
)

# === [7] Save final predictions ===
# The predictions already hold the original string labels (the classifier is
# trained on y_train_raw), so no decoding through `le` is needed.
# save_predictions is not defined in this cell -- presumably provided by the
# notebook executed via %run; TODO confirm.
save_predictions(final_predictions, model_name='multi_ovr_with_thresholds')

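# === [8] Optional: Per-animal-type reports ===
# NOTE: the disabled reports below use `train_preds`, which is local to
# train_classifier and is not returned to this cell; it must be exposed before
# this code can be re-enabled.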
# print("\n[INFO] Prediction Distribution:")
# print(final_predictions.value_counts())

# # Masks derived from raw training data
# cat_mask = raw_X_train['animal_type'] == 'Cat'
# dog_mask = raw_X_train['animal_type'] == 'Dog'

# print("\n[INFO] Classification Report (Cats only):")
# print(classification_report(
#     y_true=y_train_raw[cat_mask],
#     y_pred=train_preds[cat_mask]
# ))

# print("\n[INFO] Classification Report (Dogs only):")
# print(classification_report(
#     y_true=y_train_raw[dog_mask],
#     y_pred=train_preds[dog_mask]
# ))

Encoding mapping: {'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Return to Owner': 3, 'Transfer': 4}
Rare classes (by encoded value):
  2: Euthanasia
  1: Died

=== [ OvR Model Training and Cross-Validation ] ===

[INFO] OvR model for class: 'Adoption'
[INFO] 5-Fold CV (balanced accuracy):
  Fold 1: 0.6795
  Fold 2: 0.6813
  Fold 3: 0.6814
  Fold 4: 0.6881
  Fold 5: 0.6922
  Mean CV score: 0.6845

[INFO] OvR model for class: 'Died'
[INFO] 5-Fold CV (balanced accuracy):
  Fold 1: 0.5216
  Fold 2: 0.5261
  Fold 3: 0.5347
  Fold 4: 0.5359
  Fold 5: 0.5229
  Mean CV score: 0.5282

[INFO] OvR model for class: 'Euthanasia'
[INFO] 5-Fold CV (balanced accuracy):
  Fold 1: 0.5882
  Fold 2: 0.5824
  Fold 3: 0.5781
  Fold 4: 0.5861
  Fold 5: 0.5849
  Mean CV score: 0.5839

[INFO] OvR model for class: 'Return to Owner'
[INFO] 5-Fold CV (balanced accuracy):
  Fold 1: 0.6927
  Fold 2: 0.6924
  Fold 3: 0.6916
  Fold 4: 0.6901
  Fold 5: 0.6929
  Mean CV score: 0.6920

[INFO] OvR model for class: 'Transf