In [20]:
# Install the extra packages this notebook needs. %pip (rather than !pip)
# installs into the environment of the running kernel; after a fresh install
# the kernel may need a restart before the packages can be imported.
%pip install nbformat
%pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [21]:
import nbformat

# Load the project notebook as an nbformat v4 document.
nb = nbformat.read("ml_project.ipynb", as_version=4)

# validate() only checks the notebook against the schema; normalize() is the
# call that actually repairs it (e.g. adds missing cell ids). It works on a
# copy and returns (number_of_changes, normalized_notebook).
n_changes, nb = nbformat.validator.normalize(nb)

# Fail fast here if the notebook is still invalid after normalization.
nbformat.validator.validate(nb)

# Write the normalized notebook to a new file, leaving the original untouched.
nbformat.write(nb, "ml_project_normalized.ipynb")

In [22]:
# Execute the normalized project notebook from this kernel.
# NOTE(review): presumably this is what defines df_train, df_test,
# apply_freq_encode, classification_report_with_accuracy_score and
# save_predictions used by the cells below — confirm against ml_project.ipynb.
%run ml_project_normalized.ipynb

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
Done running ml_project.ipynb.


In [23]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import make_scorer
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

def train_classifier(X_train, y_train, X_test, rare_classes, categorical_features):
    """
    Tune an XGBoost classifier with a randomized hyperparameter search and
    predict labels for the test set.

    The pipeline applies ``apply_freq_encode`` (through ``FunctionTransformer``),
    one-hot encodes the result and fits an ``XGBClassifier``. Twenty random
    hyperparameter candidates are ranked by balanced accuracy under 5-fold
    stratified cross-validation.

    Note: SMOTENC oversampling is currently disabled in the pipeline, so
    ``rare_classes`` and ``categorical_features`` are accepted only to keep the
    call signature stable and do not influence training.

    Parameters:
    X_train (pd.DataFrame): Training features.
    y_train (pd.Series or np.array): Training target values.
    X_test (pd.DataFrame): Test features.
    rare_classes (list): Integer-encoded classes intended for oversampling (currently unused).
    categorical_features (list): Column names of categorical features (currently unused).

    Returns:
        best_estimator: The best pipeline found by RandomizedSearchCV.
        test_predictions: The predicted labels for X_test from the best estimator.
    """

    # Preprocessing + model. No resampling step: the SMOTENC stage that used
    # `rare_classes` / `categorical_features` is disabled.
    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('xgb', XGBClassifier(eval_metric='logloss', verbosity=1))
    ])

    # Candidate values sampled by RandomizedSearchCV; the `xgb__` prefix routes
    # each parameter to the 'xgb' pipeline step.
    param_distributions = {
        "xgb__max_depth": [2, 3, 5, 7, 9],
        "xgb__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
        "xgb__n_estimators": [50, 100, 200, 300, 500],
        "xgb__subsample": [0.3, 0.5, 0.7, 1.0],
        "xgb__colsample_bytree": [0.3, 0.5, 0.7, 1.0],
        "xgb__min_child_weight": [1, 3, 5, 7],
        "xgb__gamma": [0, 0.1, 0.3, 0.5, 1.0],
        "xgb__reg_alpha": [0, 0.01, 0.1, 0.5, 1.0],
        "xgb__reg_lambda": [0.5, 1.0, 1.5, 2.0, 3.0]
    }

    # Stratified folds keep the class proportions of y_train in every split.
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    randomized_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=20,
        cv=stratified_cv,
        scoring='balanced_accuracy',
        verbose=3,
        random_state=42,
    )

    print(f"\n[INFO] Starting training with {len(X_train)} samples, {len(y_train)} labels")

    # Fit the model with hyperparameter search
    randomized_search.fit(X_train, y_train)
    print(f"[INFO] Training complete. Best model fitted on {len(X_train)} samples.\n")

    print('Best parameters:', randomized_search.best_params_)
    # The search is scored with balanced accuracy, so label the number as such.
    print('Best cross-validation balanced accuracy:', randomized_search.best_score_)

    # Re-evaluate the winning pipeline with a second scorer.
    # classification_report_with_accuracy_score is expected to be defined
    # elsewhere in the notebook before this function is called.
    cv_scores = cross_val_score(
        randomized_search.best_estimator_,
        X_train,
        y_train,
        cv=5,
        verbose=3,
        scoring=make_scorer(classification_report_with_accuracy_score),
    )
    print('Generalization accuracy (via cross_val_score):', cv_scores.mean())

    # Predict on test data with the refitted best estimator
    test_predictions = randomized_search.predict(X_test)

    return randomized_search.best_estimator_, test_predictions


In [24]:
# from sklearn.utils import resample
# df_train_downsample = resample(df_train, replace=True, n_samples=10000, random_state=42)
# print(df_train_downsample.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# df_train = resample(df_train, replace=True, n_samples=25000, random_state=42)

## Encode targets with LabelEncoder
# Map the outcome labels to integer codes; `le` is kept so the integer
# predictions can be decoded back to the original label strings afterwards.
le = LabelEncoder()
X_train = df_train.drop(columns=['outcome_type'])
y_train = le.fit_transform(df_train['outcome_type'])
print('Encoding mapping:', le.classes_)

# Define rare classes that need oversampling: every class that makes up less
# than RARE_CLASS_FRACTION of the training rows.
RARE_CLASS_FRACTION = 0.05
class_counts = pd.Series(y_train).value_counts()
rare_classes = [
  label for label, count in class_counts.items()
  if count < RARE_CLASS_FRACTION * len(y_train)
]
print("Rare classes:")
for cls in rare_classes:
  print(f"  {cls}: {le.classes_[cls]}")

# Keep only the known categorical columns that are actually present in X_train,
# in X_train's column order.
cat_cols = {'intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake', 'breed', 'primary_color'}
categorical_features = [col for col in X_train.columns if col in cat_cols]

print("Training model for Dog data:")
best_model, test_predictions = train_classifier(
  X_train=X_train,
  y_train=y_train,
  X_test=df_test,
  rare_classes=rare_classes,
  categorical_features=categorical_features
)

# Decode the integer predictions back to outcome labels.
predictions = le.inverse_transform(test_predictions)

# Tag the output with the model name so the saved file identifies which model
# produced the predictions.
save_predictions(predictions, 'xg_boost')

Encoding mapping: ['Adoption' 'Died' 'Euthanasia' 'Return to Owner' 'Transfer']
Rare classes:
  2: Euthanasia
  1: Died
Training model for Dog data:

[INFO] Starting training with 111155 samples, 111155 labels
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END xgb__colsample_bytree=0.3, xgb__gamma=0.3, xgb__learning_rate=0.1, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=500, xgb__reg_alpha=0.1, xgb__reg_lambda=3.0, xgb__subsample=0.7;, score=0.369 total time=  10.0s
[CV 2/5] END xgb__colsample_bytree=0.3, xgb__gamma=0.3, xgb__learning_rate=0.1, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=500, xgb__reg_alpha=0.1, xgb__reg_lambda=3.0, xgb__subsample=0.7;, score=0.374 total time=   9.4s
[CV 3/5] END xgb__colsample_bytree=0.3, xgb__gamma=0.3, xgb__learning_rate=0.1, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=500, xgb__reg_alpha=0.1, xgb__reg_lambda=3.0, xgb__subsample=0.7;, score=0.370 total time=   9.4s
[CV 4/5] EN

In [None]:
# Decode the integer class predictions back to the original outcome labels
# using the LabelEncoder fitted on the training targets.
predictions = le.inverse_transform(test_predictions)

# Save the decoded predictions under the 'xg_boost' tag (the tag appears in the
# output file name reported by save_predictions).
save_predictions(predictions, 'xg_boost')

Combined test predictions saved to: ./test_xg_boost_predictions_combined.csv


In [None]:
|