In [57]:
%pip install nbformat

Note: you may need to restart the kernel to use updated packages.


In [58]:
import nbformat

# Read your notebook (assuming version 4 for example purposes)
nb = nbformat.read("ml_project.ipynb", as_version=4)

# Normalize the notebook to add missing id fields and other updates
nbformat.validator.validate(nb)

# Write the normalized notebook back to a file
nbformat.write(nb, "ml_project_normalized.ipynb")

In [59]:
%run ml_project_normalized.ipynb

Note: you may need to restart the kernel to use updated packages.
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color
Done running ml_project.ipynb.


In [60]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [61]:
# Trying to figure out SMOTE
from imblearn.over_sampling import SMOTE, SMOTENC
#df_train_downsample.head()

In [62]:

def train_NB_classifier(X_train, y_train, X_test):
    """
    Trains an Naive Bayes Multinominal model using a pipeline that includes a frequency encoding transformation,
    OneHotEncoder, and hyperparameter tuning via RandomizedSearchCV.
    
    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series or np.array): Training target values.
        X_test (pd.DataFrame): Test features.
    
    Returns:
        best_estimator: The best estimator from RandomizedSearchCV.
        test_predictions: The predicted labels for X_test from the best estimator.
    """
    # Construct the pipeline:
    #   1. Apply frequency encoding (for example, on 'primary_color' & 'breed' if implemented in apply_freq_encode)
    #   2. OneHotEncode the features (adjust handle_unknown and sparse_output as needed)
    #   3. Fit an XGBClassifier.
    smote = SMOTENC(categorical_features=['intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake', 'breed', 'is_mix', 'primary_color'], random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)



    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('nb', MultinomialNB())
    ])
    
    # Set up parameter distributions for XGBoost.
    param_distributions = {
        "nb__alpha": [0.1, 0.5, 1, 2, 5, 10],
    }
    
    # Perform hyperparameter search using RandomizedSearchCV.
    randomized_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=param_distributions,
        n_iter=1,
        cv=5, 
        scoring='balanced_accuracy', 
        verbose=3,
    )
    
    randomized_search.fit(X_resampled, y_resampled)
    
    print('Best parameters:', randomized_search.best_params_)
    print('Best cross-validation accuracy:', randomized_search.best_score_)
    
    cv_scores = cross_val_score(randomized_search.best_estimator_, X_resampled, y_resampled, cv=5, verbose=3, scoring='balanced_accuracy')
    print('Generalization Balanced accuracy (via cross_val_score):', cv_scores.mean())

    # Make predictions on the test set using the best estimator.
    test_predictions = randomized_search.predict(X_test)
    
    return randomized_search.best_estimator_, test_predictions


In [63]:
# downsampling
from sklearn.utils import resample
df_train_downsample = resample(df_train, replace=False, n_samples=50000, random_state=42)
print(df_train_downsample.shape)

(50000, 10)


In [64]:
# Cross Validation
practice_dog = df_train#[df_train_downsample['animal_type'] == 'Dog']
practice_dog_labels = practice_dog['outcome_type']
practice_dog_data = practice_dog.drop(columns=['outcome_type'])
practice_train, practice_test, labels_train, labels_test = train_test_split(practice_dog_data,practice_dog_labels , test_size=0.20, random_state=42)

smote = SMOTENC(categorical_features=['intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake', 'breed', 'is_mix', 'primary_color'], random_state=42)
X_resampled, y_resampled = smote.fit_resample(practice_train, labels_train)

pipeline = Pipeline([
    ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
   # ('smote', SMOTE(random_state=42)),  # <-- insert SMOTE here
    ('nb', MultinomialNB())
])

# Set up parameter distributions for XGBoost.
param_distributions = {
    "nb__alpha": [0.1,.2, .3, .4, 0.5, 1, 2, 5, 10],
}

# Perform hyperparameter search using RandomizedSearchCV.
randomized_search = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions=param_distributions,
    n_iter=1,
    cv=5, 
    scoring='balanced_accuracy', 
    verbose=3,
)

randomized_search.fit(X_resampled, y_resampled)

print('Best parameters:', randomized_search.best_params_)
print('Best cross-validation accuracy:', randomized_search.best_score_)

cv_scores = cross_val_score(randomized_search.best_estimator_, X_resampled, y_resampled, cv=5, verbose=3, scoring='balanced_accuracy')
print('Generalization Balanced accuracy (via cross_val_score):', cv_scores.mean())

test_pred = randomized_search.predict(practice_test)


NB_conf = confusion_matrix(y_true=labels_test, y_pred=test_pred)
print("Confusion Matrix:\n", NB_conf)

# Generate classification report
report = classification_report(y_true=labels_test, y_pred=test_pred)
print("Classification Report:\n", report)


Fitting 5 folds for each of 1 candidates, totalling 5 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 1/5] END .....................nb__alpha=0.1;, score=0.493 total time=   0.6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 2/5] END .....................nb__alpha=0.1;, score=0.533 total time=   0.6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 3/5] END .....................nb__alpha=0.1;, score=0.533 total time=   0.6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 4/5] END .....................nb__alpha=0.1;, score=0.521 total time=   0.6s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 5/5] END .....................nb__alpha=0.1;, score=0.506 total time=   0.6s
Best parameters: {'nb__alpha': 0.1}
Best cross-validation accuracy: 0.517074586463689


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.496) total time=   0.5s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.535) total time=   0.5s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.535) total time=   0.5s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.522) total time=   0.5s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.508) total time=   0.5s
Generalization Balanced accuracy (via cross_val_score): 0.5192557240780762
Confusion Matrix:
 [[8966  104  195 1079  687]
 [  80   29   21   11   65]
 [ 207   48  255  104   83]
 [1085    4  124 1939   84]
 [2816   95  250  616 3284]]
Classification Report:
                  precision    recall  f1-score   support

       Adoption       0.68      0.81      0.74     11031
           Died       0.10      0.14      0.12       206
     Euthanasia       0.30      0.37      0.33       697
Return to Owner       0.52      0.60      0.56      3236
       Transfer       0.78      0.47      0.58      7061

       accuracy                           0.65     22231
      macro avg       0.48      0.48      0.47     22231
   weighted avg       0.67      0.65      0.65     22231



In [65]:
from sklearn.preprocessing import LabelEncoder
# For Dog:
'''
train_dog = df_train_downsample[df_train_downsample['animal_type'] == 'Dog'].copy()
X_train_dog = train_dog.drop(columns=['animal_type', 'outcome_type'])
y_train_dog = train_dog['outcome_type']

test_dog = df_test[df_test['animal_type'] == 'Dog'].copy()
X_test_dog = test_dog.drop(columns=['animal_type'])

# For Cat:
train_cat = df_train_downsample[df_train_downsample['animal_type'] == 'Cat'].copy()
X_train_cat = train_cat.drop(columns=['animal_type', 'outcome_type'])
y_train_cat = train_cat['outcome_type']

test_cat = df_test[df_test['animal_type'] == 'Cat'].copy()
X_test_cat = test_cat.drop(columns=['animal_type'])

## Encode targets with LabelEncoder
# Dog encoding
le_dog = LabelEncoder()
y_train_dog_encoded = le_dog.fit_transform(y_train_dog)

# Cat encoding
le_cat = LabelEncoder()
y_train_cat_encoded = le_cat.fit_transform(y_train_cat)
'''

'''
print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_NB_classifier(X_train_dog, y_train_dog_encoded, X_test_dog)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

'''
# Don't split cat and dog
X_train = df_train.drop(columns=['outcome_type'])
y_train = df_train['outcome_type']

X_test = df_test

le_train = LabelEncoder()
y_train_encoded = le_train.fit_transform(y_train)


print("\nTraining model")
best_estimator, predictions_encoded = train_NB_classifier(X_train, y_train_encoded, X_test)
predictions = le_train.inverse_transform(predictions_encoded)


save_predictions(predictions, "NB")
print("DONE!\n", best_estimator)



Training model
Fitting 5 folds for each of 1 candidates, totalling 5 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 1/5] END .....................nb__alpha=0.1;, score=0.495 total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 2/5] END .....................nb__alpha=0.1;, score=0.524 total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 3/5] END .....................nb__alpha=0.1;, score=0.537 total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 4/5] END .....................nb__alpha=0.1;, score=0.514 total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV 5/5] END .....................nb__alpha=0.1;, score=0.511 total time=   0.3s
Best parameters: {'nb__alpha': 0.1}
Best cross-validation accuracy: 0.5162015189210941


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.495) total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.523) total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.538) total time=   0.3s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)


[CV] END ................................ score: (test=0.513) total time=   0.3s
[CV] END ................................ score: (test=0.512) total time=   0.3s
Generalization Balanced accuracy (via cross_val_score): 0.5162233120047656
Combined test predictions saved to: ./test_4_NB_predictions_combined.csv
DONE!
 Pipeline(steps=[('freq',
                 FunctionTransformer(func=<function apply_freq_encode at 0x7f0690de4c10>)),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
                ('nb', MultinomialNB(alpha=0.1))])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(freq_series)
