In [34]:
import nbformat

# Read the notebook, asking nbformat to hand it back as a version-4 document.
nb = nbformat.read("ml_project.ipynb", as_version=4)

# validate() only checks the notebook against the schema; it does not modify it.
# normalize() is the call that fills in missing cell ids and applies similar
# fix-ups. It returns (number_of_changes, normalized_notebook), so rebind `nb`.
_n_changes, nb_normalized = nbformat.validator.normalize(nb)
nb = nbformat.from_dict(nb_normalized)

# Confirm the normalized notebook is schema-valid before writing it out.
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

In [35]:
# Execute the normalized notebook via IPython's %run magic.
# NOTE(review): presumably this is where df_train, df_test, apply_freq_encode and
# combine_predictions used by the cells below come from — confirm.
%run ml_project_normalized.ipynb

Note: you may need to restart the kernel to use updated packages.
dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
ml_project successfully imported.


In [36]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier

def train_classifier(X_train, y_train, X_test, n_iter=1, cv=5, random_state=42):
    """
    Tune and fit an XGBoost pipeline with RandomizedSearchCV, then predict on X_test.

    The pipeline runs three steps in order: ``apply_freq_encode`` (wrapped in a
    FunctionTransformer), a OneHotEncoder, and an XGBClassifier. Only the
    XGBoost hyperparameters are searched.

    Parameters:
        X_train (pd.DataFrame): Training features. Must contain every column
            that ``apply_freq_encode`` (defined elsewhere in the notebook) reads.
        y_train (pd.Series or np.array): Training target values.
        X_test (pd.DataFrame): Test features, with the same columns as X_train.
        n_iter (int): Number of hyperparameter candidates sampled by the search.
            The default of 1 evaluates a single candidate, i.e. no real tuning;
            raise it to actually explore the grid.
        cv (int or CV splitter): Cross-validation strategy used both by the
            search and by the follow-up cross_val_score call.
        random_state (int or None): Seed for the candidate sampling so repeated
            runs pick the same candidates. Pass None for unseeded sampling.

    Returns:
        best_estimator: The best pipeline found by the search, refit on all of
            X_train / y_train.
        test_predictions: The predicted labels for X_test from that pipeline.
    """
    # Pipeline steps:
    #   1. Frequency encoding via apply_freq_encode.
    #   2. One-hot encoding; categories unseen during fit are ignored at predict time.
    #   3. XGBClassifier.
    # NOTE(review): the one-hot step encodes every column it receives, which
    # would include any numeric columns produced by step 1 — confirm intended.
    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('xgb', XGBClassifier(eval_metric='logloss', verbosity=1))
    ])

    # Candidate values for the XGBoost step (the "xgb__" prefix routes each
    # parameter to the pipeline step named 'xgb').
    param_distributions = {
        "xgb__max_depth": [3, 6, 9],
        "xgb__learning_rate": [0.01, 0.1, 0.2],
        "xgb__n_estimators": [50, 100, 200],
        "xgb__subsample": [0.5, 0.7, 1.0],
        "xgb__colsample_bytree": [0.5, 0.7, 1.0]
    }

    # Randomized hyperparameter search. random_state makes the sampled
    # candidates reproducible across runs.
    randomized_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring='accuracy',
        verbose=3,
        random_state=random_state,
    )

    randomized_search.fit(X_train, y_train)

    print('Best parameters:', randomized_search.best_params_)
    print('Best cross-validation accuracy:', randomized_search.best_score_)

    # This re-scores the already-selected configuration on the same training
    # data that was used to select it, so it is not an independent estimate of
    # generalization; a held-out set or nested CV would be needed for that.
    cv_scores = cross_val_score(randomized_search.best_estimator_, X_train, y_train, cv=cv, verbose=3)
    print('Generalization accuracy (via cross_val_score):', cv_scores.mean())

    # Predict with the refit best estimator (RandomizedSearchCV delegates predict to it).
    test_predictions = randomized_search.predict(X_test)

    return randomized_search.best_estimator_, test_predictions


In [37]:
from sklearn.preprocessing import LabelEncoder
# ---- Split the frames by species: one feature/target set per animal type ----
# Dogs
dog_train_mask = df_train['animal_type'] == 'Dog'
train_dog = df_train[dog_train_mask].copy()
y_train_dog = train_dog['outcome_type']
X_train_dog = train_dog.drop(['animal_type', 'outcome_type'], axis=1)

dog_test_mask = df_test['animal_type'] == 'Dog'
test_dog = df_test[dog_test_mask].copy()
X_test_dog = test_dog.drop(['animal_type'], axis=1)

# Cats
cat_train_mask = df_train['animal_type'] == 'Cat'
train_cat = df_train[cat_train_mask].copy()
y_train_cat = train_cat['outcome_type']
X_train_cat = train_cat.drop(['animal_type', 'outcome_type'], axis=1)

cat_test_mask = df_test['animal_type'] == 'Cat'
test_cat = df_test[cat_test_mask].copy()
X_test_cat = test_cat.drop(['animal_type'], axis=1)

# ---- Integer-encode the targets; each species keeps its own encoder so the
# ---- predictions can be mapped back to the original outcome labels later.
le_dog = LabelEncoder().fit(y_train_dog)
y_train_dog_encoded = le_dog.transform(y_train_dog)

le_cat = LabelEncoder().fit(y_train_cat)
y_train_cat_encoded = le_cat.transform(y_train_cat)

# ---- Train one model per species and decode its test predictions ----
print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_classifier(
    X_train_dog,
    y_train_dog_encoded,
    X_test_dog,
)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

print("\nTraining model for Cat data:")
best_estimator_cat, cat_predictions_encoded = train_classifier(
    X_train_cat,
    y_train_cat_encoded,
    X_test_cat,
)
cat_predictions = le_cat.inverse_transform(cat_predictions_encoded)

# Hand the decoded predictions and their test features to combine_predictions
# (defined elsewhere in the notebook).
combine_predictions(dog_predictions, cat_predictions, X_test_dog, X_test_cat)


Training model for Dog data:
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=50, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 2/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=50, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 3/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=50, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 4/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=50, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 5/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.2, xgb__max_depth=6, xgb__n_estimators=50, xgb__subsample=1.0;, score=nan total time=   0.0s


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nneom\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'primary_color'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 252, in transform
    out = self._transform(X, func=self.func, kw_args=self.kw_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\sklearn\preprocessing\_function_transformer.py", line 379, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nneom\AppData\Local\Temp\ipykernel_28548\2690814270.py", line 5, in apply_freq_encode
    df = freq_encode(df, 'primary_color')
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nneom\AppData\Local\Temp\ipykernel_28548\2931022905.py", line 33, in freq_encode
    freq_series = df[col].value_counts()
                  ~~^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\pandas\core\frame.py", line 4102, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nneom\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'primary_color'
