In [49]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [50]:
import pandas as pd
import numpy as np

In [51]:
# DATA CLEANING: every row-level filter on the raw data belongs in this cell.
# Training rows are kept only when intake_type is not 'Wildlife' and
# age_upon_intake is present; the test split is loaded unfiltered.
df_train = (
    pd.read_csv('train.csv')
    .loc[lambda frame: frame['intake_type'] != 'Wildlife']
    .dropna(subset=['age_upon_intake'])
)
df_test = pd.read_csv('test.csv')

In [52]:
def drop_cols(df):
    """Return a copy of ``df`` without the columns that are not used as features.

    Columns from the drop list that are absent from ``df`` are skipped, so the
    same helper works on both the train and the test frame. The input frame
    itself is left untouched.
    """
    unwanted = ['id', 'name', 'date_of_birth', 'outcome_time', 'found_location']
    # errors='ignore' skips labels that are not present instead of raising.
    return df.drop(columns=unwanted, errors='ignore')

In [73]:
# Use this when hour and month should be NUMERICAL features: the sin/cos encoding lets models see that hours and months wrap around (e.g. 23 -> 0, December -> January).
def time_as_cyclical(df):
    """Replace ``intake_hour`` / ``intake_month`` with sine/cosine encodings.

    Adds ``hour_sin``, ``hour_cos``, ``month_sin`` and ``month_cos`` to ``df``
    (these four columns are written onto the frame that was passed in) and
    returns a new frame from which the raw ``intake_hour`` and ``intake_month``
    columns have been dropped.
    """
    def _angle(values, period):
        # Position within the cycle, in radians.
        return 2 * np.pi * values / period

    hour_angle = _angle(df['intake_hour'], 24)
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    month_angle = _angle(df['intake_month'], 12)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    return df.drop(columns=['intake_hour', 'intake_month'])

def bucket_seasons(df):
    """Add a categorical ``season`` column derived from ``intake_month``.

    Months 3-5 map to 'Spring', 6-8 to 'Summer', 9-11 to 'Fall' and 12, 1, 2
    to 'Winter'; anything else becomes 'Unknown'. The column is written onto
    ``df`` itself, which is also the return value.
    """
    season_table = (
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
        ((9, 10, 11), 'Fall'),
        ((12, 1, 2), 'Winter'),
    )

    def month_to_season(month):
        for months, label in season_table:
            if month in months:
                return label
        return 'Unknown'

    df['season'] = df['intake_month'].apply(month_to_season)
    return df

def bucket_days(df):
    """Add a categorical ``time_of_day`` column derived from ``intake_hour``.

    Hours 5-11 are 'Morning', 12-17 'Afternoon', 18-19 'Evening' and
    20-23 plus 0-4 'Night'; any other value becomes 'Unknown'. The column is
    written onto ``df`` itself, which is also the return value.
    """
    day_parts = (
        (tuple(range(5, 12)), 'Morning'),
        (tuple(range(12, 18)), 'Afternoon'),
        ((18, 19), 'Evening'),
        (tuple(range(20, 24)) + tuple(range(0, 5)), 'Night'),
    )

    def hour_to_time_of_day(hour):
        for hours, label in day_parts:
            if hour in hours:
                return label
        return 'Unknown'

    df['time_of_day'] = df['intake_hour'].apply(hour_to_time_of_day)
    return df


In [54]:
# Converts intake_time (UNIX seconds) into separate intake_year, intake_month and intake_hour columns; the day-of-week / weekend features are currently disabled.
def engineer_time(df):
    """Split ``intake_time`` (seconds since the UNIX epoch) into calendar parts.

    Writes ``intake_datetime``, ``intake_year``, ``intake_month`` and
    ``intake_hour`` onto ``df`` and returns a new frame in which the raw
    ``intake_time`` and the helper ``intake_datetime`` columns are dropped.
    Day-of-week / is_weekend features are disabled for now.
    """
    stamps = pd.to_datetime(df['intake_time'], unit='s')
    df['intake_datetime'] = stamps

    parts = stamps.dt
    df['intake_year'] = parts.year
    df['intake_month'] = parts.month
    df['intake_hour'] = parts.hour

    return df.drop(columns=['intake_time', 'intake_datetime'])

In [55]:
def clean_intake_time(df):
    """Parse ``intake_time`` strings and expand them into year / month / hour.

    The column is first rewritten on ``df`` as seconds since the UNIX epoch and
    the frame is then handed to ``engineer_time``, whose result is returned.

    Unparseable timestamps are coerced to NaT and stay missing (NaN seconds)
    instead of being pushed through an integer cast, which has no way to
    represent a missing value.
    """
    dt_series = pd.to_datetime(df['intake_time'], errors='coerce')

    # Timezone-aware input is expressed in UTC and made naive so it can be
    # compared with the naive epoch below.
    if dt_series.dt.tz is not None:
        dt_series = dt_series.dt.tz_convert('UTC').dt.tz_localize(None)

    # Timedelta arithmetic gives whole seconds whatever unit the datetime64
    # values are stored in; dividing the raw int64 view by 10**9 is only right
    # for nanosecond storage.
    epoch = pd.Timestamp('1970-01-01')
    df['intake_time'] = (dt_series - epoch) // pd.Timedelta(seconds=1)
    return engineer_time(df)

# Age Upon Intake
def convert_age(age_str):
    """
    Turn an age description into a number of years.

    The input must look like "<number> <unit>", for example "2 years",
    "8 months", "3 weeks" or "15 days". Months are divided by 12, weeks by 52
    and days by 365. Returns None for non-string input, for text that does not
    split into exactly two tokens, for a non-numeric amount, and for an
    unrecognised unit.
    """
    if not isinstance(age_str, str):
        return None

    tokens = age_str.split()
    if len(tokens) != 2:
        return None
    amount_text, unit_text = tokens

    try:
        amount = float(amount_text)
    except ValueError:
        return None

    unit_text = unit_text.lower()
    # Checked in this order; the first keyword contained in the unit wins.
    units_per_year = (("year", 1), ("month", 12), ("week", 52), ("day", 365))
    for keyword, per_year in units_per_year:
        if keyword in unit_text:
            return amount / per_year
    return None

def clean_age_and_sex_upon_intake(df):
    """Fill missing sex with 'Unknown' and convert intake age to years.

    ``age_upon_intake`` is parsed with ``convert_age`` and negative results are
    set to 0. All changes are written onto ``df``, which is also returned.
    """
    sex_missing = df['sex_upon_intake'].isna()
    df.loc[sex_missing, 'sex_upon_intake'] = 'Unknown'

    df['age_upon_intake'] = df['age_upon_intake'].apply(convert_age)
    negative_age = df['age_upon_intake'].lt(0)
    df.loc[negative_age, 'age_upon_intake'] = 0
    return df

# Breed
def clean_breed(df):
    """Flag mixed breeds and strip the ' mix' marker from the breed text.

    ``is_mix`` is 1 when the breed contains 'mix' (case-insensitive, missing
    breeds count as 0). Every case-insensitive occurrence of ' mix' is then
    removed from ``breed``. Both columns are written onto ``df``, which is
    also returned.
    """
    breed = df['breed']
    mix_flag = breed.str.contains('mix', case=False, na=False)
    df['is_mix'] = mix_flag.astype(int)
    df['breed'] = breed.str.replace(' mix', '', case=False)
    return df

In [56]:
# Maps a colour name to the canonical label it should be merged into.
# Written as canonical -> aliases so that each merged group reads as a unit;
# the comprehension flattens it into the alias -> canonical lookup.
color_group_map = {
  alias: canonical
  for canonical, aliases in (
    ('gray tabby', ('blue tabby', 'silver tabby')),
    ('gray', ('silver', 'blue')),
    ('orange', ('orange tabby', 'orange tiger', 'red', 'red tabby', 'red tick', 'yellow')),
    ('cream', ('tan',)),
  )
  for alias in aliases
}

def clean_color(df):
  """Derive ``primary_color`` from ``color`` and drop the raw column.

  The colour text is lower-cased and stripped (written back onto ``df``), the
  part before the first '/' becomes the primary colour, and names found in
  ``color_group_map`` are replaced by their canonical label. Returns a new
  frame without the ``color`` column.
  """
  normalized = df['color'].str.lower().str.strip()
  df['color'] = normalized

  # The values are cast with astype(str) before splitting.
  # NOTE(review): a missing colour may come out as the literal text 'nan' here — confirm.
  primary = normalized.astype(str).apply(lambda text: text.partition('/')[0].strip())

  # Names without an entry in the map are kept as they are.
  df['primary_color'] = primary.map(color_group_map).fillna(primary)

  return df.drop(columns=['color'])


def freq_encode(df, col, freq=None):
  """Replace the values of ``df[col]`` with how often each value occurs.

  Parameters:
      df: frame to encode; ``col`` is overwritten on this frame.
      col: name of the column to encode.
      freq: optional value -> count lookup (Series or dict), e.g. the
          ``value_counts()`` of the training column. When omitted, the counts
          are computed from ``df[col]`` itself, which is the original
          behaviour. Passing the training counts when encoding the test set
          keeps both sets on the same scale.

  Returns:
      ``df`` with ``col`` frequency-encoded. Missing values stay missing.
      With an explicit ``freq``, values that do not appear in it are encoded
      as 0 (they occurred zero times in the reference data).
  """
  values = df[col]

  if freq is None:
    # count frequencies within the frame itself
    df[col] = values.map(values.value_counts())
    return df

  encoded = values.map(freq)
  # Keep genuinely missing inputs as NaN; only unseen categories become 0.
  df[col] = encoded.where(values.isna(), encoded.fillna(0))
  return df

In [57]:
def clean_intake_cond(df):
    """Collapse rare ``intake_condition`` values into broader groups.

    Values without an entry in the mapping are left unchanged. The column is
    rewritten on ``df``, which is also returned.

    The mapping is applied in a single pass. The previous chain of ``replace``
    calls contained a second rule for 'Agonal' ('Med Urgent / Neurological')
    that could never fire because 'Agonal' had already been rewritten to
    'Med Urgent' one line earlier; that dead rule is dropped, so the output is
    unchanged.
    """
    condition_groups = {
        'Unknown': 'Unknown Condition / Other',
        'Other': 'Unknown Condition / Other',
        'Space': 'Unknown Condition / Other',
        'Behavior': 'Normal / Behavior',
        'Normal': 'Normal / Behavior',
        'Neonatal': 'Nursing / Neonatal',
        'Nursing': 'Nursing / Neonatal',
        'Neurologic': 'Med Urgent',
        'Agonal': 'Med Urgent',
        'Parvo': 'Med Urgent',
        'Congenital': 'Sick',
    }
    df['intake_condition'] = df['intake_condition'].replace(condition_groups)
    return df


In [58]:
def clean_data(df):
    """Run the cleaning steps in order and return the cleaned frame.

    After each step a short progress message is printed. The steps are:
    drop unused columns, intake time, intake condition, age and sex, breed,
    colour. (An animal-type cleaning step is currently disabled.)
    """
    steps = (
        (drop_cols, 'dropped columns'),
        (clean_intake_time, 'cleaned intake time'),
        (clean_intake_cond, 'cleaned intake condition'),
        (clean_age_and_sex_upon_intake, 'cleaned age and sex'),
        (clean_breed, 'cleaned breed'),
        (clean_color, 'cleaned color'),
    )
    for step, done_message in steps:
        df = step(df)
        print(done_message)
    return df

In [59]:
from sklearn.preprocessing import OneHotEncoder
def encode_columns(df):
    """One-hot encode ``intake_type``, ``intake_condition`` and ``sex_upon_intake``.

    Each of the three columns is removed and replaced by its
    ``pd.get_dummies`` indicator columns, appended at the end of the frame in
    that order. The indicator columns are named after the category values
    themselves (no prefix). Returns a new frame; ``df`` is not modified.

    The previous version also built a ``OneHotEncoder`` and fitted it on a
    ``'category_column'`` that this function never creates, so every call
    failed with a KeyError before any encoding happened; the encoder was
    otherwise unused and has been removed.
    """
    # NOTE(review): without a prefix, two source columns that share a category
    # value (e.g. 'Unknown') would produce duplicate column names — confirm
    # this cannot happen for the data passed in.
    for col in ('intake_type', 'intake_condition', 'sex_upon_intake'):
        dummies = pd.get_dummies(df[col])
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df

In [None]:
# Clean the training split, then move the label so that outcome_type is the
# last column of the frame.
df_train = clean_data(df_train)
# df_train = encode_columns(df_train)
label_column = df_train.pop('outcome_type')
df_train['outcome_type'] = label_column  # the column was just popped, so this appends it at the end
df_train.head()

dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color


Unnamed: 0,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,intake_year,intake_month,intake_hour,is_mix,primary_color,outcome_type,season
0,Stray,Normal / Behavior,Dog,Spayed Female,8.0,English Springer Spaniel,2015,7,12,0,white,Return to Owner,Summer
1,Stray,Normal / Behavior,Dog,Intact Male,0.916667,Basenji,2016,4,18,1,sable,Return to Owner,Spring
2,Public Assist,Normal / Behavior,Cat,Neutered Male,2.0,Domestic Shorthair,2022,5,0,0,orange,Transfer,Spring
3,Owner Surrender,Normal / Behavior,Dog,Neutered Male,2.0,Labrador Retriever,2017,2,12,1,chocolate,Return to Owner,Winter
4,Public Assist,Normal / Behavior,Dog,Neutered Male,6.0,Great Dane,2019,4,9,1,black,Return to Owner,Spring


In [61]:
# Run the test split through the same clean_data pipeline as the training
# split, then preview the first rows.
df_test = clean_data(df_test)
df_test.head()

dropped columns


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned breed
cleaned color


Unnamed: 0,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,intake_year,intake_month,intake_hour,is_mix,primary_color
0,Stray,Normal / Behavior,Dog,Neutered Male,2.0,Beagle,2019,1,16,1,tricolor
1,Stray,Sick,Cat,Intact Female,0.076923,Domestic Shorthair,2013,10,7,1,calico
2,Stray,Normal / Behavior,Dog,Neutered Male,4.0,Doberman Pinsch/Australian Cattle Dog,2014,6,10,0,cream
3,Stray,Normal / Behavior,Dog,Intact Female,0.416667,Pit Bull,2015,7,18,0,brown
4,Stray,Injured,Cat,Intact Female,2.0,Domestic Shorthair,2017,2,10,1,black


In [62]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    """Frequency-encode the ``primary_color`` and ``breed`` columns of ``df``.

    Each column is passed through ``freq_encode`` in turn and the resulting
    frame is returned.
    """
    for column in ('primary_color', 'breed'):
        df = freq_encode(df, column)
    return df

In [63]:
def combine_predictions(dog_pred, cat_pred, dog_true, cat_true,
                        csv_path='./test_predictions_combined.csv'):
    """
    Combines dog and cat predictions, ordering them by the original test set indices,
    and writes a CSV with sequential Ids and Outcome Types.

    Example CSV output:
    Id,Outcome Type
    1,Died
    2,Euthanasia
    3,Adoption
    ...

    Parameters:
        dog_pred, cat_pred: predicted labels (array-like), in the same row
            order as ``dog_true`` / ``cat_true``.
        dog_true, cat_true: pandas objects whose ``.index`` holds the original
            row positions of the dog / cat subsets.
        csv_path: where the CSV is written; defaults to the original
            hard-coded location.

    Raises:
        ValueError: if a prediction array and its index differ in length.
    """
    # np.asarray discards any index the predictions may carry. Without it, a
    # prediction passed as a pandas Series would be re-aligned by label against
    # the target index and silently filled with NaN wherever labels differ.
    dog_pred_series = pd.Series(np.asarray(dog_pred), index=dog_true.index)
    cat_pred_series = pd.Series(np.asarray(cat_pred), index=cat_true.index)

    # Concatenate both series and sort by the original index so the output
    # reflects the same order as the original test dataset.
    all_predictions = pd.concat([dog_pred_series, cat_pred_series]).sort_index()

    # Generate sequential IDs starting from 1 and create the final DataFrame
    final_df = pd.DataFrame({
        'Id': range(1, len(all_predictions) + 1),
        'Outcome Type': all_predictions.values
    })

    # Save the DataFrame to CSV with a header row; commas separate each field.
    final_df.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")

In [64]:
def save_predictions(y_pred, model_name):
    """
    Write predictions to a submission CSV without any dog/cat stitching.

    Example CSV output:
    Id,Outcome Type
    1,Died
    2,Euthanasia
    3,Adoption
    ...

    Parameters:
        y_pred (numpy.ndarray): Predicted labels, one per test row.
        model_name (str): Inserted into the output file name.
    """
    n_rows = len(y_pred)

    # Ids are sequential and start at 1.
    submission = pd.DataFrame({
        'Id': range(1, n_rows + 1),
        'Outcome Type': y_pred,
    })

    # Header row included, index column omitted.
    csv_path = "".join(['./test_', model_name, '_predictions_combined.csv'])
    submission.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")


In [65]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
# Scorer object built from balanced_accuracy_score via make_scorer.
balanced_acc_scorer = make_scorer(balanced_accuracy_score)
# 5-fold stratified splitter; shuffling with a fixed random_state keeps the folds the same between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report and return the balanced accuracy.

    Despite the function name, the returned value comes from
    ``balanced_accuracy_score``, not from plain ``accuracy_score``.
    """
    report = classification_report(y_true, y_pred)
    print (report)
    return balanced_accuracy_score(y_true, y_pred)

In [66]:
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights(y_train):
    """Return a ``{class_label: weight}`` dict for the labels in ``y_train``.

    The weights come from ``compute_class_weight`` with
    ``class_weight='balanced'``.
    """
    labels = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=labels, y=y_train)
    return dict(zip(labels, weights))

In [67]:
# Final cell: prints a marker once execution has reached the end of the notebook.
print('Done running ml_project.ipynb.')

Done running ml_project.ipynb.
