In [111]:
%pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [112]:
import pandas as pd
import numpy as np

In [113]:
# THIS IS FOR DATA CLEANING - all the stuff should be here
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train = df_train[df_train['intake_type'] != 'Wildlife']
df_train = df_train.dropna(subset=['age_upon_intake'])

In [114]:
def drop_cols(df):
    cols_to_drop = ['id', 'name', 'date_of_birth', 'outcome_time', 'found_location']
    existing_cols = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=existing_cols)
    return df

In [115]:
def clean_intake_time(df):
    # Intake Time
    # Convert string timestamps to UNIX timestamp
    dt_series = pd.to_datetime(df['intake_time'], errors='coerce')
    df['intake_time'] = dt_series.astype('int64') // (10 ** 9)
    return df

# Age Upon Intake
def convert_age(age_str):
    """
    Convert age strings to years.
    Expected format: "<number> <unit>" e.g., "2 years", "8 months", "3 weeks", "15 days"
    """
    if not isinstance(age_str, str):
        return None

    parts = age_str.split()
    if len(parts) != 2:
        return None

    try:
        num = float(parts[0])
    except ValueError:
        return None

    unit = parts[1].lower()
    if "year" in unit:
        return num
    elif "month" in unit:
        return num / 12
    elif "week" in unit:
        return num / 52
    elif "day" in unit:
        return num / 365
    else:
        return None

def clean_age_and_sex_upon_intake(df):
    df.loc[df['sex_upon_intake'].isna(), 'sex_upon_intake'] = 'Unknown'
    df['age_upon_intake'] = df['age_upon_intake'].apply(convert_age)
    df.loc[df['age_upon_intake'] < 0, 'age_upon_intake'] = 0
    return df

# Breed
def clean_breed(df):
# Create is_mix column
    df['is_mix'] = df['breed'].str.contains('mix', case=False, na=False).astype(int)
    # remove mix from all breeds
    df['breed'] = df['breed'].str.replace(' mix', '', case=False)
    return df

In [116]:
color_group_map = {
  'blue tabby':       'gray tabby',
  'silver tabby':     'gray tabby',
  'silver':           'gray',
  'blue':             'gray',
  'orange tabby':     'orange',
  'orange tiger':     'orange',
  'red':              'orange',
  'red tabby':        'orange',
  'red tick':         'orange',
  'yellow':           'orange',
  'tan':              'cream',
}

def clean_color(df):
  # lowercase
  df['color'] = df['color'].str.lower().str.strip()

  # feature engineering -> primary color 
  df['primary_color'] = df['color'].astype(str).apply(
      lambda x: x.split('/')[0].strip() if '/' in x else x.strip()
  )

  # simplify synonymous colors if in map
  df['primary_color'] = df['primary_color'].map(color_group_map).fillna(df['primary_color'])

  df = df.drop(columns=['color'])
  return df


def freq_encode(df, col):
  # count frequencies
  freq_series = df[col].value_counts()

  # map frequencies back to the original column, replacing values
  df[col] = df[col].map(freq_series)

  return df

In [117]:
def clean_intake_cond(df):
    # # Known mappings
    # df['intake_condition'] = df['intake_condition'].replace({
    #     'Unknown': 'Unknown Condition / Other', 
    #     'Other': 'Unknown Condition / Other',
    #     'Space': 'Unknown Condition / Other',
    #     'Behavior': 'Normal / Behavior', 
    #     'Normal': 'Normal / Behavior',
    #     'Neonatal': 'Nursing / Neonatal', 
    #     'Nursing': 'Nursing / Neonatal',
    #     'Neurologic': 'Med Urgent', 
    #     'Agonal': 'Med Urgent', 
    #     'Parvo': 'Med Urgent',
    #     'Congenital': 'Sick'
    # })

    # # Replace any unknown conditions (those not in the mapping) with 'Med Urgent'
    # df['intake_condition'] = df['intake_condition'].apply(lambda x: x if x in df['intake_condition'].unique() else 'Med Urgent')

    df['intake_condition'] = df['intake_condition'].replace({'Unknown': 'Unknown Condition / Other', 'Other': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Space': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Behavior': 'Normal / Behavior', 'Normal': 'Normal / Behavior'})
    df['intake_condition'] = df['intake_condition'].replace({'Neonatal': 'Nursing / Neonatal', 'Nursing': 'Nursing / Neonatal'})
    df['intake_condition'] = df['intake_condition'].replace({'Neurologic': 'Med Urgent', 'Agonal': 'Med Urgent', 'Parvo': 'Med Urgent'})
    df['intake_condition'] = df['intake_condition'].replace({'Agonal': 'Med Urgent / Neurological'})
    df['intake_condition'] = df['intake_condition'].replace({'Congenital': 'Sick'})
    
    return df


In [118]:
# def clean_animal_type(df):
#     dummies = pd.get_dummies(df['animal_type'], drop_first=True)
#     df = df.drop('animal_type', axis=1)
#     df = pd.concat([df, dummies], axis=1)
#     return df

In [119]:
def clean_data(df):
    df = drop_cols(df)
    print('dropped columns')
    df = clean_intake_time(df)
    print('cleaned intake time')
    df = clean_intake_cond(df)
    print('cleaned intake condition')
    df = clean_age_and_sex_upon_intake(df)
    print('cleaned age and sex')
    df = clean_breed(df)
    print('cleaned breed')
    df = clean_color(df)
    print('cleaned color')
    # df = clean_animal_type(df)
    # print('cleaned animal type')
    return df

In [120]:
from sklearn.preprocessing import OneHotEncoder
def encode_columns(df):

    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder.fit(df[['category_column']])

    dummies = pd.get_dummies(df['intake_type'])
    df = df.drop('intake_type', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['intake_condition'])
    df = df.drop('intake_condition', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['sex_upon_intake'])
    df = df.drop('sex_upon_intake', axis=1)
    df = pd.concat([df, dummies], axis=1)
    return df

In [121]:
df_train = clean_data(df_train)
# df_train = encode_columns(df_train)
label_column = df_train.pop('outcome_type')
df_train.insert(df_train.shape[1], 'outcome_type', label_column)    
df_train.head()

dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color


Unnamed: 0,intake_time,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,is_mix,primary_color,outcome_type
0,1436101140,Stray,Normal / Behavior,Dog,Spayed Female,8.0,English Springer Spaniel,0,white,Return to Owner
1,1460659380,Stray,Normal / Behavior,Dog,Intact Male,0.916667,Basenji,1,sable,Return to Owner
2,1652314980,Public Assist,Normal / Behavior,Cat,Neutered Male,2.0,Domestic Shorthair,0,orange,Transfer
3,1487421960,Owner Surrender,Normal / Behavior,Dog,Neutered Male,2.0,Labrador Retriever,1,chocolate,Return to Owner
4,1555408380,Public Assist,Normal / Behavior,Dog,Neutered Male,6.0,Great Dane,1,black,Return to Owner


In [122]:
df_test = clean_data(df_test)

dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color


In [123]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    df = freq_encode(df, 'primary_color')
    df = freq_encode(df, 'breed')

    return df

In [None]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier

def train_classifier(X_train, y_train, X_test):
    """
    Trains an XGBoost model using a pipeline that includes a frequency encoding transformation,
    OneHotEncoder, and hyperparameter tuning via RandomizedSearchCV.
    
    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series or np.array): Training target values.
        X_test (pd.DataFrame): Test features.
    
    Returns:
        best_estimator: The best estimator from RandomizedSearchCV.
        test_predictions: The predicted labels for X_test from the best estimator.
    """
    # Construct the pipeline:
    #   1. Apply frequency encoding (for example, on 'primary_color' & 'breed' if implemented in apply_freq_encode)
    #   2. OneHotEncode the features (adjust handle_unknown and sparse_output as needed)
    #   3. Fit an XGBClassifier.
    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('xgb', XGBClassifier(eval_metric='logloss', verbosity=1))
    ])
    
    # Set up parameter distributions for XGBoost.
    param_distributions = {
        "xgb__max_depth": [3, 6, 9],
        "xgb__learning_rate": [0.01, 0.1, 0.2],
        "xgb__n_estimators": [50, 100, 200],
        "xgb__subsample": [0.5, 0.7, 1.0],
        "xgb__colsample_bytree": [0.5, 0.7, 1.0]
    }
    
    # Perform hyperparameter search using RandomizedSearchCV.
    randomized_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=param_distributions,
        n_iter=1,
        cv=5, 
        scoring='accuracy', 
        verbose=3,
    )
    
    randomized_search.fit(X_train, y_train)
    
    print('Best parameters:', randomized_search.best_params_)
    print('Best cross-validation accuracy:', randomized_search.best_score_)
    
    cv_scores = cross_val_score(randomized_search.best_estimator_, X_train, y_train, cv=5, verbose=3)
    print('Generalization accuracy (via cross_val_score):', cv_scores.mean())
    
    # Make predictions on the test set using the best estimator.
    test_predictions = randomized_search.predict(X_test)
    
    return randomized_search.best_estimator_, test_predictions


: 

In [None]:
from sklearn.preprocessing import LabelEncoder

# For Dog:
train_dog = df_train[df_train['animal_type'] == 'Dog'].copy()
X_train_dog = train_dog.drop(columns=['animal_type', 'outcome_type'])
y_train_dog = train_dog['outcome_type']

test_dog = df_test[df_test['animal_type'] == 'Dog'].copy()
X_test_dog = test_dog.drop(columns=['animal_type'])

# For Cat:
train_cat = df_train[df_train['animal_type'] == 'Cat'].copy()
X_train_cat = train_cat.drop(columns=['animal_type', 'outcome_type'])
y_train_cat = train_cat['outcome_type']

test_cat = df_test[df_test['animal_type'] == 'Cat'].copy()
X_test_cat = test_cat.drop(columns=['animal_type'])

## Encode targets with LabelEncoder
# Dog encoding
le_dog = LabelEncoder()
y_train_dog_encoded = le_dog.fit_transform(y_train_dog)

# Cat encoding
le_cat = LabelEncoder()
y_train_cat_encoded = le_cat.fit_transform(y_train_cat)

print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_classifier(X_train_dog, y_train_dog_encoded, X_test_dog)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

print("\nTraining model for Cat data:")
best_estimator_cat, cat_predictions_encoded = train_classifier(X_train_cat, y_train_cat_encoded, X_test_cat)
cat_predictions = le_cat.inverse_transform(cat_predictions_encoded)

Training model for Dog data:
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.01, xgb__max_depth=9, xgb__n_estimators=200, xgb__subsample=1.0;, score=0.585 total time= 6.2min
[CV 2/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.01, xgb__max_depth=9, xgb__n_estimators=200, xgb__subsample=1.0;, score=0.590 total time= 6.4min
[CV 3/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.01, xgb__max_depth=9, xgb__n_estimators=200, xgb__subsample=1.0;, score=0.591 total time= 7.1min
[CV 4/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.01, xgb__max_depth=9, xgb__n_estimators=200, xgb__subsample=1.0;, score=0.595 total time= 7.3min
[CV 5/5] END xgb__colsample_bytree=0.7, xgb__learning_rate=0.01, xgb__max_depth=9, xgb__n_estimators=200, xgb__subsample=1.0;, score=0.596 total time= 9.7min
Best parameters: {'xgb__subsample': 1.0, 'xgb__n_estimators': 200, 'xgb__max_depth': 9, 'xgb__learning_rate': 0.01, 'xgb__

In [None]:
## PREDICTION STITCHING ##
def combine_predictions(dog_pred, cat_pred, dog_true, cat_true):
    """
    Pass in list of dog predictions and cat predictions
    Returns -> stitched together predictions based on original test set order
    """
    dog_pred_series = pd.Series(dog_pred, index=dog_true.index)
    cat_pred_series = pd.Series(cat_pred, index=cat_true.index)
    # Concatenate both series and sort by the original index so the output 
    # reflects the same order as the original test dataset
    all_predictions = pd.concat([dog_pred_series, cat_pred_series]).sort_index()
    final_df = pd.DataFrame({'Predicted_Label': all_predictions})
    csv_path = './test_predictions_combined.csv'
    final_df.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")

combine_predictions(dog_predictions, cat_predictions, X_test_dog, X_test_cat)