In [1]:
%pip install nbformat

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

In [2]:
%run ml_project_normalized.ipynb

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
Done running ml_project.ipynb.


In [5]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier

def train_classifier(X_train, y_train, X_test):
    """
    Trains an XGBoost model using a pipeline that includes a frequency encoding transformation,
    OneHotEncoder, and hyperparameter tuning via RandomizedSearchCV.
    
    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series or np.array): Training target values.
        X_test (pd.DataFrame): Test features.
    
    Returns:
        best_estimator: The best estimator from RandomizedSearchCV.
        test_predictions: The predicted labels for X_test from the best estimator.
    """
    # Construct the pipeline:
    #   1. Apply frequency encoding (for example, on 'primary_color' & 'breed' if implemented in apply_freq_encode)
    #   2. OneHotEncode the features (adjust handle_unknown and sparse_output as needed)
    #   3. Fit an XGBClassifier.
    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('xgb', XGBClassifier(eval_metric='logloss', verbosity=1))
    ])
    
    # Set up parameter distributions for XGBoost.
    param_distributions = {
        "xgb__max_depth": [3, 6, 9],
        "xgb__learning_rate": [0.01, 0.1, 0.2],
        "xgb__n_estimators": [50, 100, 200],
        "xgb__subsample": [0.5, 0.7, 1.0],
        "xgb__colsample_bytree": [0.5, 0.7, 1.0]
    }
    
    # Perform hyperparameter search using RandomizedSearchCV.
    randomized_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=param_distributions,
        n_iter=1,
        cv=5, 
        scoring='accuracy', 
        verbose=3,
    )
    
    randomized_search.fit(X_train, y_train)
    
    print('Best parameters:', randomized_search.best_params_)
    print('Best cross-validation accuracy:', randomized_search.best_score_)
    
    cv_scores = cross_val_score(randomized_search.best_estimator_, X_train, y_train, cv=5, verbose=3)
    print('Generalization accuracy (via cross_val_score):', cv_scores.mean())
    
    # Make predictions on the test set using the best estimator.
    test_predictions = randomized_search.predict(X_test)
    
    return randomized_search.best_estimator_, test_predictions


In [3]:
from sklearn.utils import resample
df_train_downsample = resample(df_train, replace=True, n_samples=50000, random_state=42)
print(df_train_downsample.shape)

(50000, 10)


In [6]:
from sklearn.preprocessing import LabelEncoder

# For Dog:
train_dog = df_train_downsample[df_train_downsample['animal_type'] == 'Dog'].copy()
X_train_dog = train_dog.drop(columns=['animal_type', 'outcome_type'])
y_train_dog = train_dog['outcome_type']

test_dog = df_test[df_test['animal_type'] == 'Dog'].copy()
X_test_dog = test_dog.drop(columns=['animal_type'])

# For Cat:
train_cat = df_train_downsample[df_train_downsample['animal_type'] == 'Cat'].copy()
X_train_cat = train_cat.drop(columns=['animal_type', 'outcome_type'])
y_train_cat = train_cat['outcome_type']

test_cat = df_test[df_test['animal_type'] == 'Cat'].copy()
X_test_cat = test_cat.drop(columns=['animal_type'])

## Encode targets with LabelEncoder
# Dog encoding
le_dog = LabelEncoder()
y_train_dog_encoded = le_dog.fit_transform(y_train_dog)

# Cat encoding
le_cat = LabelEncoder()
y_train_cat_encoded = le_cat.fit_transform(y_train_cat)

print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_classifier(X_train_dog, y_train_dog_encoded, X_test_dog)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

print("\nTraining model for Cat data:")
best_estimator_cat, cat_predictions_encoded = train_classifier(X_train_cat, y_train_cat_encoded, X_test_cat)
cat_predictions = le_cat.inverse_transform(cat_predictions_encoded)

Training model for Dog data:
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END xgb__colsample_bytree=0.5, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=0.7;, score=0.602 total time=  55.0s
[CV 2/5] END xgb__colsample_bytree=0.5, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=0.7;, score=0.592 total time=  55.8s
[CV 3/5] END xgb__colsample_bytree=0.5, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=0.7;, score=0.593 total time=  55.3s
[CV 4/5] END xgb__colsample_bytree=0.5, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=0.7;, score=0.597 total time=  56.2s
[CV 5/5] END xgb__colsample_bytree=0.5, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=200, xgb__subsample=0.7;, score=0.580 total time=  57.4s
Best parameters: {'xgb__subsample': 0.7, 'xgb__n_estimators': 200, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.2, 'xgb__colsam

KeyboardInterrupt: 