In [2]:
# # If needed, create a virtual environment by uncommenting the line below:
# !python -m venv .venv

# # Activate the virtual environment by running the appropriate command for your OS:
# # On Windows:
# # !.\.venv\Scripts\activate

# # On macOS/Linux:
# !source .agentic_ai_venv/bin/activate 

# # Install the required packages by uncommenting the line below:
# %pip install -r ./requirements.txt

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn 
from IPython.display import display
from pathlib import Path
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from typing import Tuple, cast
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
df = pd.read_csv("./data/pet_adoption_data.csv")

In [5]:
df

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.498100,0,0,28,14,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,2502,Dog,Poodle,72,Orange,Small,27.039045,1,0,66,26,1,1
2003,2503,Rabbit,Rabbit,124,Brown,Small,4.726954,1,1,59,150,0,0
2004,2504,Rabbit,Rabbit,113,Orange,Small,1.758592,1,0,68,302,0,0
2005,2505,Dog,Labrador,12,Gray,Large,20.961592,1,0,59,478,0,0


### Baseline Model

To have a starting point against which to measure our improvements, let's quickly create a baseline model.

Gradient-boosted trees are generally a good starting point, so let's start from there.

Bonus question: Why are gradient-boosted trees a good choice to get a baseline?

<!-- Answer: Gradient-boosted trees are a good choice because they generally perform well on structured data, can handle both numerical and categorical features, and they are robust to outliers. -->

For the `HistGradientBoostingRegressor` model to work, we need to specify the categorical features, and each category must have less than 256 values.

To specify the categorical features, we can either:
- Use the `categorical_features='from_dtype'` option, which will consider columns that are of the pandas dtype `category` as categorical.
- Explicitly specify the names of the categorical columns using  `categorical_features=['PetType'...]`.

It is more flexible to use the first option (since we can change our pre-processing without changing the model), but it also means that we must convert the relevant columns to the `category` dtype.

As you can see using `df.dtypes`, most of the categorical columns are of type `object`, and the "Address" column, if converted to a category, would have too many categories.

The 'Address' column is not really categorical, so for now let's drop it in the pre-processing step.

We can use `scikit-learn`'s `ColumnTransformer` and `Pipeline` to achieve this in a convenient way.

First, let's identify the categorical features in the dataset.

In [6]:
from typing import Callable, cast
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted
from IPython.display import display
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import pandas._typing

In [7]:
X = df[['PetType', 'Breed', 'AgeMonths', 'Color', 'Size', 'WeightKg',
       'Vaccinated', 'HealthCondition', 'TimeInShelterDays', 'AdoptionFee','PreviousOwner']]
y = df['AdoptionLikelihood']

X_train,X_test, y_train,y_test = train_test_split(X,y , test_size=0.3)
print(f"X_train Shape: {X_train.shape}")
print(f"Y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train Shape: (1404, 11)
Y_train Shape: (1404,)
X_test Shape: (603, 11)
y_test Shape: (603,)


In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def baseline_accuracy(X_train, y_train, X_test, y_test):
    dummy = DummyClassifier(strategy='most_frequent')
    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_test)
    return accuracy_score(y_test, y_pred)


baseline_acc = baseline_accuracy(X_train, y_train, X_test, y_test)


In [9]:
results_df = pd.DataFrame({
    "Model": ["DummyClassifier"],
    "Accuracy": [baseline_acc]
})

print(results_df)

             Model  Accuracy
0  DummyClassifier   0.66335


### Full Preprocessing + Pipeline with Feature Engineering

In [10]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [11]:
X_ = df.drop(columns=["AdoptionLikelihood"])
Y_ = cast(pd.Series, df["AdoptionLikelihood"])

### 1. Feature Engineering

In [12]:
def feature_eng() -> ColumnTransformer :
    
    numeric_features = ['AgeMonths', 'WeightKg', 'TimeInShelterDays', 'AdoptionFee']
    column_transformer = ColumnTransformer(transformers=[
    # One-hot encode categorical columns
    ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['PetType', 'Breed', 'Color']),
            
    # Ordinal encode 'Size'
    ('ordinal', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']]), ['Size']),
            
    # Standard scale numeric columns
    ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough')
    return  column_transformer

feat_eng_transformer = feature_eng()


In [13]:
def build_preprocessing_pipeline(feat_eng_transformer: ColumnTransformer) -> Pipeline:
    
    # Drop the target column ["AdoptionLikelihood"] 
    drop_column = ColumnTransformer(
        transformers=[
            ("drop_target", "drop", ["AdoptionLikelihood"])
        ],
        remainder="passthrough",
        verbose_feature_names_out=False
    )

    # type casting
    type_casting = ColumnTransformer(
        transformers=[
            ("cast_category", FunctionTransformer(lambda x: x.astype("category")), ['PetType', 'Breed', 'Color', 'Size'])
        ],
        remainder="passthrough",
        verbose_feature_names_out=False
    )
    # build pipeline
    preprocessing_pipeline = Pipeline([
        ("drop_column", drop_column),
        ("type_casting", type_casting),
        ("feature_eng", feat_eng_transformer)
    ])

    preprocessing_pipeline.set_output(transform="pandas")  # return DataFrame
    return preprocessing_pipeline

preprocessing_pipeline = build_preprocessing_pipeline(feat_eng_transformer)


In [14]:
# Build final pipeline with preprocessing + model
final_pipeline = Pipeline([
    ("preprocessing", preprocessing_pipeline),
    ("model", RandomForestClassifier())
])

In [15]:
display(final_pipeline)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('drop_column', ...), ('type_casting', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('drop_target', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,transformers,"[('cast_category', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function bui...t 0x1179bc5e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Small', 'Medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Train-Test Validation 
We used Stratify to split our dataset and then evaluate our metrics

In [16]:
# (80% train, 20% test)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["AdoptionLikelihood"]  # maintain class distribution
)

In [17]:
# fitting
final_pipeline.fit(train_df, train_df["AdoptionLikelihood"])

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('drop_column', ...), ('type_casting', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('drop_target', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,transformers,"[('cast_category', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function bui...t 0x1179bc5e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Small', 'Medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Evaluate metrics

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# prediction
y_pred = final_pipeline.predict(X_)

# metrics
accuracy = accuracy_score(Y_, y_pred)
report = classification_report(Y_, y_pred)
cm = confusion_matrix(Y_, y_pred)

print (
    "accuracy :", accuracy, 
    "\nreport :\n", report,
    "\nconfusion matrix :\n", cm
)


accuracy : 0.9845540607872446 
report :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1348
           1       0.98      0.97      0.98       659

    accuracy                           0.98      2007
   macro avg       0.98      0.98      0.98      2007
weighted avg       0.98      0.98      0.98      2007
 
confusion matrix :
 [[1337   11]
 [  20  639]]


In [19]:
#put as dict and 
report_dict = classification_report(Y_, y_pred, output_dict=True)
classification_metrics_df = pd.DataFrame(report_dict).transpose()

classification_metrics_df.loc["accuracy", :] = [accuracy, None, None, None]

class_labels = sorted(Y_.unique())  # assumes Y_ is a pandas Series
confusion_matrix_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

print("Classification Metrics:\n", classification_metrics_df)
print("\nConfusion Matrix:\n", confusion_matrix_df)

Classification Metrics:
               precision    recall  f1-score  support
0              0.985262  0.991840  0.988540   1348.0
1              0.983077  0.969651  0.976318    659.0
accuracy       0.984554       NaN       NaN      NaN
macro avg      0.984169  0.980745  0.982429   2007.0
weighted avg   0.984544  0.984554  0.984527   2007.0

Confusion Matrix:
       0    1
0  1337   11
1    20  639


Compare our final pipeline to our baseline

In [20]:
def compare_metrics(
    metrics_1: dict[str, float],
    metrics_2: dict[str, float],
    name_1: str,
    name_2: str,
) -> pd.DataFrame:
    """
    Return a DataFrame comparing the metrics of two experiments.
    """
    return pd.DataFrame(
        [metrics_1, metrics_2],
        index=cast(pandas._typing.Axes, [name_1, name_2]),
    ).T.assign(Delta=lambda df: df[name_1] - df[name_2])

In [21]:
display(
    compare_metrics(
        metrics_1=baseline_acc,
        name_1="Baseline",
        metrics_2=accuracy,
        name_2="Final pipeline",
    )
)

Unnamed: 0,Baseline,Final pipeline,Delta
0,0.66335,0.984554,-0.321204


# Model Validation

In [22]:
def model_validation(df: pd.DataFrame):

    df = df.copy()  # Avoid modifying the original DataFrame

    X, y = df.drop(columns=["AdoptionLikelihood"]), df["AdoptionLikelihood"]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )

    model = make_pipeline(
        OneHotEncoder(handle_unknown="ignore"),
        GradientBoostingClassifier()
    )

    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return train_score, test_score

train_test_output = model_validation(df=df)

In [23]:
display(
    pd.DataFrame(
        [
            {"score": "train", "value": train_test_output[0]},
            {"score": "test", "value": train_test_output[1]},
        ]
    )
)

Unnamed: 0,score,value
0,train,0.907165
1,test,0.89801


# Basic gridsearch

In [24]:
def hp_grid() -> ParameterGrid:

    hyperparameter_grid = ParameterGrid(
        {
            "n_estimators": [500, 1000],
            "max_depth": [5, 10, 50],
        }
    )

    return hyperparameter_grid


def traintest(
    df: pd.DataFrame,
    hyperparameter_grid: ParameterGrid,
) -> list[Tuple[dict, Tuple[float, float]]]:

    df = df.copy()
    X, y = df.drop(columns=["AdoptionLikelihood"]), df["AdoptionLikelihood"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )

    results = []
    for i_hyperparameter_grid, hyperparameters in enumerate(hyperparameter_grid):
        print(
            f"Testing hyperparameters: {hyperparameters} ({i_hyperparameter_grid+1}/{len(hyperparameter_grid)})"
        )
        model = make_pipeline(
            OneHotEncoder(handle_unknown="ignore"),
            RandomForestClassifier(**hyperparameters, random_state=42),
        )

        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)

        print(f"  train_score={train_score}, test_score={test_score}")
        results.append(
            (
                hyperparameters,
                (train_score, test_score),
            )
        )

    return results



hyperparameter_grid = hp_grid()
tt_output = traintest(
    df=df,
    hyperparameter_grid=hyperparameter_grid,
)


display(
    pd.DataFrame(
        [{**k, "train_score": v[0], "test_score": v[1]} for k, v in tt_output]
    ).sort_values(by="test_score", ascending=False)
)

Testing hyperparameters: {'max_depth': 5, 'n_estimators': 500} (1/6)
  train_score=0.6716510903426791, test_score=0.6716417910447762
Testing hyperparameters: {'max_depth': 5, 'n_estimators': 1000} (2/6)
  train_score=0.6716510903426791, test_score=0.6716417910447762
Testing hyperparameters: {'max_depth': 10, 'n_estimators': 500} (3/6)
  train_score=0.6940809968847352, test_score=0.6840796019900498
Testing hyperparameters: {'max_depth': 10, 'n_estimators': 1000} (4/6)
  train_score=0.6940809968847352, test_score=0.681592039800995
Testing hyperparameters: {'max_depth': 50, 'n_estimators': 500} (5/6)
  train_score=0.9931464174454828, test_score=0.8855721393034826
Testing hyperparameters: {'max_depth': 50, 'n_estimators': 1000} (6/6)
  train_score=0.9956386292834891, test_score=0.8880597014925373


Unnamed: 0,max_depth,n_estimators,train_score,test_score
5,50,1000,0.995639,0.88806
4,50,500,0.993146,0.885572
2,10,500,0.694081,0.68408
3,10,1000,0.694081,0.681592
0,5,500,0.671651,0.671642
1,5,1000,0.671651,0.671642


In [25]:
from sklearn.model_selection import cross_val_score

def crossvalidation(
    df: pd.DataFrame,
    hyperparameter_grid: ParameterGrid,
    n_folds: int,
) -> list[Tuple[dict, Tuple[float, float]]]:

    df = df.copy()
    X, y = df.drop(columns=["AdoptionLikelihood"]), df["AdoptionLikelihood"]

    results = []
    for i_hyperparameter_grid, hyperparameters in enumerate(hyperparameter_grid):
        print(
            f"Testing hyperparameters: {hyperparameters} ({i_hyperparameter_grid+1}/{len(hyperparameter_grid)})"
        )
        model = make_pipeline(
            OneHotEncoder(handle_unknown="ignore"),
            RandomForestClassifier(**hyperparameters, random_state = 42),
        )

        val_scores = cross_val_score(model, X, y, cv=n_folds)
        mean_val_score = val_scores.mean()
        std_val_score = val_scores.std()

        print(f"  mean_val_score={mean_val_score}, std_val_score={std_val_score}")

        results.append(
            (
                hyperparameters,
                (mean_val_score, std_val_score),
            )
        )

    return results


print("Exercise 3: running")
crossval_output = crossvalidation(
    df=df,
    hyperparameter_grid=hyperparameter_grid,
    n_folds=5,
)


display(
    pd.DataFrame(
        [
            {
                **k,
                "mean_val_score": v[0],
                "std_val_score": v[1],
            }
            for k, v in crossval_output
        ]
    ).sort_values(by="mean_val_score", ascending=False)
)

Exercise 3: running
Testing hyperparameters: {'max_depth': 5, 'n_estimators': 500} (1/6)
  mean_val_score=0.6716492351211525, std_val_score=0.0009106147278593625
Testing hyperparameters: {'max_depth': 5, 'n_estimators': 1000} (2/6)
  mean_val_score=0.6716492351211525, std_val_score=0.0009106147278593625
Testing hyperparameters: {'max_depth': 10, 'n_estimators': 500} (3/6)
  mean_val_score=0.6771330380516372, std_val_score=0.00584761353143536
Testing hyperparameters: {'max_depth': 10, 'n_estimators': 1000} (4/6)
  mean_val_score=0.6836019404225754, std_val_score=0.008081812631561944
Testing hyperparameters: {'max_depth': 50, 'n_estimators': 500} (5/6)
  mean_val_score=0.8863599707199663, std_val_score=0.08629073134249204
Testing hyperparameters: {'max_depth': 50, 'n_estimators': 1000} (6/6)
  mean_val_score=0.8858624582821554, std_val_score=0.08622927792051614


Unnamed: 0,max_depth,n_estimators,mean_val_score,std_val_score
4,50,500,0.88636,0.086291
5,50,1000,0.885862,0.086229
3,10,1000,0.683602,0.008082
2,10,500,0.677133,0.005848
0,5,500,0.671649,0.000911
1,5,1000,0.671649,0.000911


**mean_val_score** → mean validation accuracy across folds

**std_val_score** → how much the accuracy varies across folds (higher = less stable)
ours is high so it is very unstable

# Run cross validation on training dataset

In [26]:
def test_score(
    df: pd.DataFrame,
) -> Tuple[GridSearchCV, float]:

    df = df.copy() 
    X, y = df.drop(columns=["AdoptionLikelihood"]), df["AdoptionLikelihood"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )

    grid_search = GridSearchCV(
        estimator=Pipeline(
            steps=[
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
                ("rfc", RandomForestClassifier()),
            ],
        ),
        param_grid={
            "rfc__n_estimators": [500, 1000],
            "rfc__max_depth": [5, 10, 50],
        },
        cv=5,
        verbose=2,  # display computation time for each fold, parameter candidate and score
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    test_score: float = cast(float, grid_search.score(X_test, y_test))

    return grid_search, test_score

final_output = test_score(df=df)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ............rfc__max_depth=5, rfc__n_estimators=500; total time=   0.4s
[CV] END ............rfc__max_depth=5, rfc__n_estimators=500; total time=   0.3s
[CV] END ............rfc__max_depth=5, rfc__n_estimators=500; total time=   0.3s
[CV] END ............rfc__max_depth=5, rfc__n_estimators=500; total time=   0.3s
[CV] END ............rfc__max_depth=5, rfc__n_estimators=500; total time=   0.4s
[CV] END ...........rfc__max_depth=5, rfc__n_estimators=1000; total time=   0.7s
[CV] END ...........rfc__max_depth=5, rfc__n_estimators=1000; total time=   0.7s
[CV] END ...........rfc__max_depth=5, rfc__n_estimators=1000; total time=   0.7s
[CV] END ...........rfc__max_depth=10, rfc__n_estimators=500; total time=   0.4s
[CV] END ...........rfc__max_depth=10, rfc__n_estimators=500; total time=   0.3s
[CV] END ...........rfc__max_depth=5, rfc__n_estimators=1000; total time=   0.7s
[CV] END ...........rfc__max_depth=10, rfc__n_est

In [27]:
print("Final test score:", final_output[1])

Final test score: 0.8880597014925373


### Drop PetID and Rebuild
Restart by dropping PetID and rebuild.

In [28]:
# Drop PetID and any other suspicious columns
df_clean = df.drop(columns=['PetID'], errors='ignore')

print(f"Original shape: {df.shape}")
print(f"Clean shape: {df_clean.shape}")
print(f"Columns removed: {set(df.columns) - set(df_clean.columns)}")

# REDEFINE FEATURES AND TARGET
X_clean = df_clean.drop(columns=["AdoptionLikelihood"])
y_clean = df_clean["AdoptionLikelihood"]

print(f"\nClean features: {list(X_clean.columns)}")
print(f"Target distribution:\n{y_clean.value_counts(normalize=True)}")

Original shape: (2007, 13)
Clean shape: (2007, 12)
Columns removed: {'PetID'}

Clean features: ['PetType', 'Breed', 'AgeMonths', 'Color', 'Size', 'WeightKg', 'Vaccinated', 'HealthCondition', 'TimeInShelterDays', 'AdoptionFee', 'PreviousOwner']
Target distribution:
AdoptionLikelihood
0    0.671649
1    0.328351
Name: proportion, dtype: float64


In [29]:
def build_clean_preprocessing_pipeline() -> Pipeline:
    """
    Build preprocessing pipeline without any data leakage risks
    """
    # Define feature groups
    categorical_features = ['PetType', 'Breed', 'Color']
    ordinal_features = ['Size']  # Known categories
    numeric_features = ['AgeMonths', 'WeightKg', 'TimeInShelterDays', 'AdoptionFee']
    binary_features = ['Vaccinated', 'PreviousOwner']  # Assuming these are yes/no
    
    # Feature engineering transformer
    feature_eng = ColumnTransformer(transformers=[
        # One-hot encode categorical columns
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), 
         categorical_features),
        
        # Ordinal encode 'Size'
        ('ordinal', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']]), 
         ordinal_features),
        
        # Standard scale numeric columns
        ('numeric', StandardScaler(), numeric_features),
        
        # Pass through binary features
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop',  # Explicitly drop any unexpected columns
    verbose_feature_names_out=False
    )
    
    # Build clean pipeline
    preprocessing_pipeline = Pipeline([
        ("feature_engineering", feature_eng)
    ])
    
    preprocessing_pipeline.set_output(transform="pandas")
    return preprocessing_pipeline

# Create clean pipeline
clean_preprocessing_pipeline = build_clean_preprocessing_pipeline()

In [30]:
# Clean train-test split
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, 
    test_size=0.2, 
    random_state=42,
    stratify=y_clean
)

print(f"X_train_clean shape: {X_train_clean.shape}")
print(f"X_test_clean shape: {X_test_clean.shape}")
print(f"y_train_clean shape: {y_train_clean.shape}")
print(f"y_test_clean shape: {y_test_clean.shape}")

# New baseline with clean data
def clean_baseline(X_train, y_train, X_test, y_test):
    dummy = DummyClassifier(strategy='most_frequent')
    dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_test)
    return accuracy_score(y_test, y_pred)

baseline_clean = clean_baseline(X_train_clean, y_train_clean, X_test_clean, y_test_clean)
print(f"📊 Clean Baseline Accuracy: {baseline_clean:.4f}")

X_train_clean shape: (1605, 11)
X_test_clean shape: (402, 11)
y_train_clean shape: (1605,)
y_test_clean shape: (402,)
📊 Clean Baseline Accuracy: 0.6716


In [31]:
def test_simple_model_clean(X_train, y_train, X_test, y_test):
    """
    Test a simple, well-regularized model to check for overfitting
    """
    # Simple preprocessing (no complex pipeline)
    preprocessor = ColumnTransformer([
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), 
         ['PetType', 'Breed', 'Color']),
        ('ordinal', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']]), ['Size']),
        ('scaler', StandardScaler(), ['AgeMonths', 'WeightKg', 'TimeInShelterDays', 'AdoptionFee'])
    ], remainder='passthrough')
    
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=100,
            max_depth=10,  # Limited depth to prevent overfitting
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42
        ))
    ])
    
    model.fit(X_train, y_train)
    
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    overfit_gap = train_score - test_score
    
    print("🧪 SIMPLE MODEL TEST (Clean Data):")
    print(f"   Train Score: {train_score:.4f}")
    print(f"   Test Score: {test_score:.4f}")
    print(f"   Overfit Gap: {overfit_gap:.4f}")
    
    if overfit_gap < 0.03:
        print("   ✅ EXCELLENT: No overfitting detected!")
    elif overfit_gap < 0.05:
        print("   ⚠️  ACCEPTABLE: Minor overfitting")
    else:
        print("   🚨 PROBLEM: Significant overfitting")
    
    return model, train_score, test_score

simple_model, simple_train, simple_test = test_simple_model_clean(
    X_train_clean, y_train_clean, X_test_clean, y_test_clean
)

🧪 SIMPLE MODEL TEST (Clean Data):
   Train Score: 0.9576
   Test Score: 0.9154
   Overfit Gap: 0.0422
   ⚠️  ACCEPTABLE: Minor overfitting


In [32]:
import mlflow
import mlflow.sklearn

# Set up MLflow experiment for clean restart
mlflow.set_experiment("AdoptionLikelihood_Clean_Restart")

def track_clean_baseline_mlflow(X_train, y_train, X_test, y_test):
    """Track baseline with clean data"""
    with mlflow.start_run(run_name="Clean_Baseline"):
        dummy = DummyClassifier(strategy='most_frequent')
        dummy.fit(X_train, y_train)
        accuracy = dummy.score(X_test, y_test)
        
        mlflow.log_param("strategy", "most_frequent")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(dummy, "baseline_model")
        
        print(f"📊 Clean Baseline: {accuracy:.4f}")
        return accuracy

clean_baseline_acc = track_clean_baseline_mlflow(X_train_clean, y_train_clean, X_test_clean, y_test_clean)

  return FileStore(store_uri, store_uri)


📊 Clean Baseline: 0.6716


In [33]:
def clean_hyperparameter_optimization(X_train, y_train, X_test, y_test, n_iter=10):
    """
    Hyperparameter optimization with clean data and proper regularization
    """
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    
    with mlflow.start_run(run_name="Clean_RandomizedSearch"):
        # Use simple preprocessing
        preprocessor = ColumnTransformer([
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), 
             ['PetType', 'Breed', 'Color']),
            ('ordinal', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']]), ['Size']),
            ('scaler', StandardScaler(), ['AgeMonths', 'WeightKg', 'TimeInShelterDays', 'AdoptionFee'])
        ], remainder='passthrough')
        
        # Conservative parameter space to prevent overfitting
        param_dist = {
            'classifier__n_estimators': randint(50, 150),
            'classifier__max_depth': randint(5, 20),
            'classifier__min_samples_split': randint(2, 10),
            'classifier__min_samples_leaf': randint(1, 5),
            'classifier__max_features': ['sqrt', 'log2']
        }
        
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
        
        random_search = RandomizedSearchCV(
            pipeline,
            param_dist,
            n_iter=n_iter,
            cv=5,  # More folds for better validation
            scoring='accuracy',
            random_state=42,
            n_jobs=-1,
            verbose=1
        )
        
        print("🚀 Starting clean hyperparameter optimization...")
        random_search.fit(X_train, y_train)
        
        # Log results
        train_score = random_search.score(X_train, y_train)
        test_score = random_search.score(X_test, y_test)
        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("train_score", train_score)
        mlflow.log_metric("test_score", test_score)
        mlflow.log_metric("overfit_gap", train_score - test_score)
        mlflow.log_metric("cv_score", random_search.best_score_)
        
        mlflow.sklearn.log_model(random_search.best_estimator_, "best_model")
        
        print("🎯 CLEAN OPTIMIZATION RESULTS:")
        print(f"   Best Parameters: {random_search.best_params_}")
        print(f"   CV Score: {random_search.best_score_:.4f}")
        print(f"   Train Score: {train_score:.4f}")
        print(f"   Test Score: {test_score:.4f}")
        print(f"   Overfit Gap: {train_score - test_score:.4f}")
        
        return random_search

# Run clean optimization
clean_optimization = clean_hyperparameter_optimization(
    X_train_clean, y_train_clean, X_test_clean, y_test_clean, n_iter=15
)

🚀 Starting clean hyperparameter optimization...
Fitting 5 folds for each of 15 candidates, totalling 75 fits




🎯 CLEAN OPTIMIZATION RESULTS:
   Best Parameters: {'classifier__max_depth': 14, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 141}
   CV Score: 0.9271
   Train Score: 0.9595
   Test Score: 0.9179
   Overfit Gap: 0.0416


In [34]:
def final_clean_validation(best_model, X_test, y_test, baseline_accuracy):
    """
    Final validation with clean data
    """
    print("=" * 60)
    print("FINAL CLEAN VALIDATION")
    print("=" * 60)
    
    from sklearn.metrics import classification_report, confusion_matrix
    
    # Predictions
    y_pred = best_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_pred)
    
    print(f"🎯 Final Test Accuracy: {final_accuracy:.4f}")
    print(f"📊 Baseline Accuracy: {baseline_accuracy:.4f}")
    print(f"📈 Improvement: +{(final_accuracy - baseline_accuracy)*100:.2f}%")
    
    # Detailed metrics
    print(f"\n📋 Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # Feature importance if available
    if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        importances = best_model.named_steps['classifier'].feature_importances_
        feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
        
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        print(f"\n🔍 TOP 10 FEATURE IMPORTANCES (Clean):")
        print(importance_df.head(10).to_string(index=False))
    
    return final_accuracy

# Run final validation
if 'clean_optimization' in locals():
    final_accuracy = final_clean_validation(
        clean_optimization.best_estimator_, 
        X_test_clean, y_test_clean, 
        clean_baseline_acc
    )

FINAL CLEAN VALIDATION
🎯 Final Test Accuracy: 0.9179
📊 Baseline Accuracy: 0.6716
📈 Improvement: +24.63%

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       270
           1       0.90      0.84      0.87       132

    accuracy                           0.92       402
   macro avg       0.91      0.90      0.91       402
weighted avg       0.92      0.92      0.92       402


🔍 TOP 10 FEATURE IMPORTANCES (Clean):
                   feature  importance
             ordinal__Size    0.230443
         scaler__AgeMonths    0.184396
     remainder__Vaccinated    0.116525
remainder__HealthCondition    0.082226
          scaler__WeightKg    0.076376
       scaler__AdoptionFee    0.074069
    onehot__Breed_Labrador    0.070749
 scaler__TimeInShelterDays    0.063022
       onehot__PetType_Dog    0.022005
  remainder__PreviousOwner    0.010990
