In [20]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas as pd
import numpy as np

In [22]:
# THIS IS FOR DATA CLEANING - all the stuff should be here
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train = df_train[df_train['intake_type'] != 'Wildlife']
df_train = df_train.dropna(subset=['age_upon_intake'])

In [23]:
def drop_cols(df):
    cols_to_drop = ['id', 'name', 'date_of_birth', 'outcome_time', 'found_location']
    existing_cols = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=existing_cols)
    return df

In [24]:
def clean_intake_time(df):
    # Intake Time
    # Convert string timestamps to UNIX timestamp
    dt_series = pd.to_datetime(df['intake_time'], errors='coerce')
    df['intake_time'] = dt_series.astype('int64') // (10 ** 9)
    return df

# Age Upon Intake
def convert_age(age_str):
    """
    Convert age strings to years.
    Expected format: "<number> <unit>" e.g., "2 years", "8 months", "3 weeks", "15 days"
    """
    if not isinstance(age_str, str):
        return None

    parts = age_str.split()
    if len(parts) != 2:
        return None

    try:
        num = float(parts[0])
    except ValueError:
        return None

    unit = parts[1].lower()
    if "year" in unit:
        return num
    elif "month" in unit:
        return num / 12
    elif "week" in unit:
        return num / 52
    elif "day" in unit:
        return num / 365
    else:
        return None

def clean_age_and_sex_upon_intake(df):
    df.loc[df['sex_upon_intake'].isna(), 'sex_upon_intake'] = 'Unknown'
    df['age_upon_intake'] = df['age_upon_intake'].apply(convert_age)
    df.loc[df['age_upon_intake'] < 0, 'age_upon_intake'] = 0
    return df

# Breed
def clean_breed(df):
# Create is_mix column
    df['is_mix'] = df['breed'].str.contains('mix', case=False, na=False).astype(int)
    # remove mix from all breeds
    df['breed'] = df['breed'].str.replace(' mix', '', case=False)
    return df

In [25]:
color_group_map = {
  'blue tabby':       'gray tabby',
  'silver tabby':     'gray tabby',
  'silver':           'gray',
  'blue':             'gray',
  'orange tabby':     'orange',
  'orange tiger':     'orange',
  'red':              'orange',
  'red tabby':        'orange',
  'red tick':         'orange',
  'yellow':           'orange',
  'tan':              'cream',
}

def clean_color(df):
  # lowercase
  df['color'] = df['color'].str.lower().str.strip()

  # feature engineering -> primary color 
  df['primary_color'] = df['color'].astype(str).apply(
      lambda x: x.split('/')[0].strip() if '/' in x else x.strip()
  )

  # simplify synonymous colors if in map
  df['primary_color'] = df['primary_color'].map(color_group_map).fillna(df['primary_color'])

  df = df.drop(columns=['color'])
  return df


def freq_encode(df, col):
  # count frequencies
  freq_series = df[col].value_counts()

  # map frequencies back to the original column, replacing values
  df[col] = df[col].map(freq_series)

  return df

In [26]:
def clean_intake_cond(df):
    # # Known mappings
    # df['intake_condition'] = df['intake_condition'].replace({
    #     'Unknown': 'Unknown Condition / Other', 
    #     'Other': 'Unknown Condition / Other',
    #     'Space': 'Unknown Condition / Other',
    #     'Behavior': 'Normal / Behavior', 
    #     'Normal': 'Normal / Behavior',
    #     'Neonatal': 'Nursing / Neonatal', 
    #     'Nursing': 'Nursing / Neonatal',
    #     'Neurologic': 'Med Urgent', 
    #     'Agonal': 'Med Urgent', 
    #     'Parvo': 'Med Urgent',
    #     'Congenital': 'Sick'
    # })

    # # Replace any unknown conditions (those not in the mapping) with 'Med Urgent'
    # df['intake_condition'] = df['intake_condition'].apply(lambda x: x if x in df['intake_condition'].unique() else 'Med Urgent')

    df['intake_condition'] = df['intake_condition'].replace({'Unknown': 'Unknown Condition / Other', 'Other': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Space': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Behavior': 'Normal / Behavior', 'Normal': 'Normal / Behavior'})
    df['intake_condition'] = df['intake_condition'].replace({'Neonatal': 'Nursing / Neonatal', 'Nursing': 'Nursing / Neonatal'})
    df['intake_condition'] = df['intake_condition'].replace({'Neurologic': 'Med Urgent', 'Agonal': 'Med Urgent', 'Parvo': 'Med Urgent'})
    df['intake_condition'] = df['intake_condition'].replace({'Agonal': 'Med Urgent / Neurological'})
    df['intake_condition'] = df['intake_condition'].replace({'Congenital': 'Sick'})
    
    return df


In [27]:
# def clean_animal_type(df):
#     dummies = pd.get_dummies(df['animal_type'], drop_first=True)
#     df = df.drop('animal_type', axis=1)
#     df = pd.concat([df, dummies], axis=1)
#     return df

In [28]:
def clean_data(df):
    df = drop_cols(df)
    print('dropped columns')
    df = clean_intake_time(df)
    print('cleaned intake time')
    df = clean_intake_cond(df)
    print('cleaned intake condition')
    df = clean_age_and_sex_upon_intake(df)
    print('cleaned age and sex')
    df = clean_breed(df)
    print('cleaned breed')
    df = clean_color(df)
    print('cleaned color')
    # df = clean_animal_type(df)
    # print('cleaned animal type')
    return df

In [29]:
from sklearn.preprocessing import OneHotEncoder
def encode_columns(df):

    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder.fit(df[['category_column']])

    dummies = pd.get_dummies(df['intake_type'])
    df = df.drop('intake_type', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['intake_condition'])
    df = df.drop('intake_condition', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['sex_upon_intake'])
    df = df.drop('sex_upon_intake', axis=1)
    df = pd.concat([df, dummies], axis=1)
    return df

In [30]:
'''
df_train = clean_data(df_train)
# df_train = encode_columns(df_train)
label_column = df_train.pop('outcome_type')
df_train.insert(df_train.shape[1], 'outcome_type', label_column)    
df_train.head()
'''

"\ndf_train = clean_data(df_train)\n# df_train = encode_columns(df_train)\nlabel_column = df_train.pop('outcome_type')\ndf_train.insert(df_train.shape[1], 'outcome_type', label_column)    \ndf_train.head()\n"

In [31]:
df_test = clean_data(df_test)
df_test

dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color


Unnamed: 0,intake_time,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_type,is_mix,primary_color
0,-9223372037,Normal,Dog,Neutered Male,2 years,,Tricolor,,0,1
1,-9223372037,Sick,Cat,Intact Female,4 weeks,,Calico,,0,9
2,-9223372037,Normal,Dog,Neutered Male,4 years,,Tan/Gray,,0,6
3,-9223372037,Normal,Dog,Intact Female,5 months,,Brown/White,,0,1
4,-9223372037,Injured,Cat,Intact Female,2 years,,Black/White,,0,2
...,...,...,...,...,...,...,...,...,...,...
27786,-9223372037,Normal,Dog,Intact Female,8 months,,Brown Brindle/White,,0,10
27787,-9223372037,Normal,Cat,Intact Female,2 months,,Black,,0,6
27788,-9223372037,Normal,Dog,Neutered Male,8 years,,White,,0,11
27789,-9223372037,Injured,Cat,Spayed Female,9 years,,Brown/Brown Tabby,,0,8


In [14]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    df = freq_encode(df, 'primary_color')
    df = freq_encode(df, 'breed')

    return df

In [15]:
'''
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier

def train_classifier(X_train, y_train, X_test):
    """
    Trains an XGBoost model using a pipeline that includes a frequency encoding transformation,
    OneHotEncoder, and hyperparameter tuning via RandomizedSearchCV.
    
    Parameters:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series or np.array): Training target values.
        X_test (pd.DataFrame): Test features.
    
    Returns:
        best_estimator: The best estimator from RandomizedSearchCV.
        test_predictions: The predicted labels for X_test from the best estimator.
    """
    # Construct the pipeline:
    #   1. Apply frequency encoding (for example, on 'primary_color' & 'breed' if implemented in apply_freq_encode)
    #   2. OneHotEncode the features (adjust handle_unknown and sparse_output as needed)
    #   3. Fit an XGBClassifier.
    pipeline = Pipeline([
        ('freq', FunctionTransformer(apply_freq_encode, validate=False)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('xgb', XGBClassifier(eval_metric='logloss', verbosity=1))
    ])
    
    # Set up parameter distributions for XGBoost.
    param_distributions = {
        "xgb__max_depth": [3, 6, 9],
        "xgb__learning_rate": [0.01, 0.1, 0.2],
        "xgb__n_estimators": [50, 100, 200],
        "xgb__subsample": [0.5, 0.7, 1.0],
        "xgb__colsample_bytree": [0.5, 0.7, 1.0]
    }
    
    # Perform hyperparameter search using RandomizedSearchCV.
    randomized_search = RandomizedSearchCV(
        estimator=pipeline, 
        param_distributions=param_distributions,
        n_iter=1,
        cv=5, 
        scoring='accuracy', 
        verbose=3,
    )
    
    randomized_search.fit(X_train, y_train)
    
    print('Best parameters:', randomized_search.best_params_)
    print('Best cross-validation accuracy:', randomized_search.best_score_)
    
    cv_scores = cross_val_score(randomized_search.best_estimator_, X_train, y_train, cv=5, verbose=3)
    print('Generalization accuracy (via cross_val_score):', cv_scores.mean())
    
    # Make predictions on the test set using the best estimator.
    test_predictions = randomized_search.predict(X_test)
    
    return randomized_search.best_estimator_, test_predictions
'''

'\nfrom sklearn.preprocessing import OneHotEncoder, FunctionTransformer\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import RandomizedSearchCV, cross_val_score\nfrom xgboost import XGBClassifier\n\ndef train_classifier(X_train, y_train, X_test):\n    """\n    Trains an XGBoost model using a pipeline that includes a frequency encoding transformation,\n    OneHotEncoder, and hyperparameter tuning via RandomizedSearchCV.\n\n    Parameters:\n        X_train (pd.DataFrame): Training features.\n        y_train (pd.Series or np.array): Training target values.\n        X_test (pd.DataFrame): Test features.\n\n    Returns:\n        best_estimator: The best estimator from RandomizedSearchCV.\n        test_predictions: The predicted labels for X_test from the best estimator.\n    """\n    # Construct the pipeline:\n    #   1. Apply frequency encoding (for example, on \'primary_color\' & \'breed\' if implemented in apply_freq_encode)\n    #   2. OneHotEncode the features (a

In [16]:

'''
from sklearn.preprocessing import LabelEncoder
# For Dog:
train_dog = df_train[df_train['animal_type'] == 'Dog'].copy()
X_train_dog = train_dog.drop(columns=['animal_type', 'outcome_type'])
y_train_dog = train_dog['outcome_type']

test_dog = df_test[df_test['animal_type'] == 'Dog'].copy()
X_test_dog = test_dog.drop(columns=['animal_type'])

# For Cat:
train_cat = df_train[df_train['animal_type'] == 'Cat'].copy()
X_train_cat = train_cat.drop(columns=['animal_type', 'outcome_type'])
y_train_cat = train_cat['outcome_type']

test_cat = df_test[df_test['animal_type'] == 'Cat'].copy()
X_test_cat = test_cat.drop(columns=['animal_type'])

## Encode targets with LabelEncoder
# Dog encoding
le_dog = LabelEncoder()
y_train_dog_encoded = le_dog.fit_transform(y_train_dog)

# Cat encoding
le_cat = LabelEncoder()
y_train_cat_encoded = le_cat.fit_transform(y_train_cat)

print("Training model for Dog data:")
best_estimator_dog, dog_predictions_encoded = train_classifier(X_train_dog, y_train_dog_encoded, X_test_dog)
dog_predictions = le_dog.inverse_transform(dog_predictions_encoded)

print("\nTraining model for Cat data:")
best_estimator_cat, cat_predictions_encoded = train_classifier(X_train_cat, y_train_cat_encoded, X_test_cat)
cat_predictions = le_cat.inverse_transform(cat_predictions_encoded)

'''

'\nfrom sklearn.preprocessing import LabelEncoder\n# For Dog:\ntrain_dog = df_train[df_train[\'animal_type\'] == \'Dog\'].copy()\nX_train_dog = train_dog.drop(columns=[\'animal_type\', \'outcome_type\'])\ny_train_dog = train_dog[\'outcome_type\']\n\ntest_dog = df_test[df_test[\'animal_type\'] == \'Dog\'].copy()\nX_test_dog = test_dog.drop(columns=[\'animal_type\'])\n\n# For Cat:\ntrain_cat = df_train[df_train[\'animal_type\'] == \'Cat\'].copy()\nX_train_cat = train_cat.drop(columns=[\'animal_type\', \'outcome_type\'])\ny_train_cat = train_cat[\'outcome_type\']\n\ntest_cat = df_test[df_test[\'animal_type\'] == \'Cat\'].copy()\nX_test_cat = test_cat.drop(columns=[\'animal_type\'])\n\n## Encode targets with LabelEncoder\n# Dog encoding\nle_dog = LabelEncoder()\ny_train_dog_encoded = le_dog.fit_transform(y_train_dog)\n\n# Cat encoding\nle_cat = LabelEncoder()\ny_train_cat_encoded = le_cat.fit_transform(y_train_cat)\n\nprint("Training model for Dog data:")\nbest_estimator_dog, dog_predictio

In [17]:
## PREDICTION STITCHING ##
def combine_predictions(dog_pred, cat_pred, dog_true, cat_true):
    """
    Pass in list of dog predictions and cat predictions
    Returns -> stitched together predictions based on original test set order
    """
    dog_pred_series = pd.Series(dog_pred, index=dog_true.index)
    cat_pred_series = pd.Series(cat_pred, index=cat_true.index)
    # Concatenate both series and sort by the original index so the output 
    # reflects the same order as the original test dataset
    all_predictions = pd.concat([dog_pred_series, cat_pred_series]).sort_index()
    final_df = pd.DataFrame({'Predicted_Label': all_predictions})
    csv_path = './test_predictions_combined.csv'
    final_df.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")

# combine_predictions(dog_predictions, cat_predictions, X_test_dog, X_test_cat)

In [None]:
# At the end of the notebook, or wherever you want to save the functions
with open('ml_project.py', 'w') as f:
    f.write(""" 
    # All your function code goes here
    """)

'\n# At the end of the notebook, or wherever you want to save the functions\nwith open(\'ml_project_func.py\', \'w\') as f:\n    f.write(""" \n    # All your function code goes here\n    """)\n'

In [None]:
print('ml_project successfully imported.')