In [2]:
%pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
# Data cleaning starts here: load the raw splits and filter the training rows.
# Only the training frame is filtered; the test frame is loaded as-is.
df_train = (
    pd.read_csv('train.csv')
    .loc[lambda frame: frame['intake_type'] != 'Wildlife']   # drop wildlife intakes
    .dropna(subset=['age_upon_intake'])                       # drop rows with no age
)
df_test = pd.read_csv('test.csv')

In [5]:
def drop_cols(df):
    """Return a copy of ``df`` without the unused columns.

    Columns from the drop list that are not present in ``df`` are skipped, so
    the same helper can be applied to frames with different column sets.
    """
    unwanted = ('id', 'name', 'date_of_birth', 'outcome_time', 'found_location')
    present = [name for name in unwanted if name in df.columns]
    return df.drop(columns=present)

In [6]:
# Use this when hour and month should be NUMERICAL features: the sin/cos pair
# lets a model see that hours and months wrap around.
def time_as_cyclical(df):
    """Encode ``intake_hour`` and ``intake_month`` as sin/cos pairs.

    Adds ``hour_sin``, ``hour_cos``, ``month_sin`` and ``month_cos`` to ``df``
    (the passed frame is modified in place) and returns a new frame with the
    raw ``intake_hour`` / ``intake_month`` columns removed.
    """
    cyclical_features = (
        ('hour', 'intake_hour', 24),
        ('month', 'intake_month', 12),
    )
    for prefix, source, period in cyclical_features:
        angle = 2 * np.pi * df[source] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    return df.drop(columns=['intake_hour', 'intake_month'])

def bucket_seasons(df):
    """Add a ``season`` column derived from ``intake_month``.

    Months 3-5 map to 'Spring', 6-8 to 'Summer', 9-11 to 'Fall' and 12, 1, 2
    to 'Winter'; any other value becomes 'Unknown'. ``df`` is modified in
    place and returned.
    """
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter',
    }

    df['season'] = df['intake_month'].apply(
        lambda month: season_by_month.get(month, 'Unknown')
    )
    return df

def bucket_days(df):
    """Add a ``time_of_day`` column derived from ``intake_hour``.

    Hours 5-11 are 'Morning', 12-17 'Afternoon', 18-19 'Evening' and
    20-23 plus 0-4 'Night'; anything else becomes 'Unknown'. ``df`` is
    modified in place and returned.
    """
    label_by_hour = {
        **dict.fromkeys(range(5, 12), 'Morning'),
        **dict.fromkeys(range(12, 18), 'Afternoon'),
        **dict.fromkeys(range(18, 20), 'Evening'),
        **dict.fromkeys(range(20, 24), 'Night'),
        **dict.fromkeys(range(0, 5), 'Night'),
    }

    df['time_of_day'] = df['intake_hour'].apply(
        lambda hour: label_by_hour.get(hour, 'Unknown')
    )
    return df


In [7]:
# Splits intake_time (seconds since the Unix epoch) into year, month and hour
# columns. Day-of-week / weekend features were tried and are currently disabled.
def engineer_time(df):
    """Expand ``intake_time`` into ``intake_year``, ``intake_month`` and ``intake_hour``.

    ``intake_time`` is interpreted with ``unit='s'``. The derived columns (and
    a helper ``intake_datetime`` column) are written onto ``df`` itself; the
    returned frame is a new one without ``intake_time`` and ``intake_datetime``.
    """
    df['intake_datetime'] = pd.to_datetime(df['intake_time'], unit='s')

    stamp = df['intake_datetime'].dt
    for part in ('year', 'month', 'hour'):
        df[f'intake_{part}'] = getattr(stamp, part)

    return df.drop(columns=['intake_time', 'intake_datetime'])

In [8]:
def age_in_months(age_str):
    """
    Convert an age string to a number of months.

    Expected format: "<number> <unit>", e.g. "2 years", "8 months",
    "3 weeks", "15 days". Weeks are divided by 4 and days by 30 (both
    approximations). Returns None for non-strings, strings that do not split
    into exactly two tokens, non-numeric amounts, or unrecognised units.
    """
    if not isinstance(age_str, str):
        return None

    tokens = age_str.split()
    if len(tokens) != 2:
        return None
    amount_text, unit_text = tokens

    try:
        amount = float(amount_text)
    except ValueError:
        return None

    # Checked in this order; the first keyword found inside the unit wins.
    conversions = (
        ("year", lambda n: n * 12),
        ("month", lambda n: n),
        ("week", lambda n: n / 4),     # approx. weeks in a month
        ("day", lambda n: n / 30.0),   # approx. days in a month
    )
    unit = unit_text.lower()
    for keyword, to_months in conversions:
        if keyword in unit:
            return to_months(amount)
    return None


In [9]:
def clean_intake_time(df):
    """Parse ``intake_time`` strings and expand them into year/month/hour columns.

    The column is parsed with ``errors='coerce'``, converted to whole seconds
    since the Unix epoch (written back onto ``df['intake_time']``), and then
    handed to ``engineer_time``, whose result is returned.

    Unparseable values stay missing: they become NaN seconds rather than the
    integer that represents NaT, so they are not silently decoded as a real
    (but meaningless) date further down the pipeline.
    """
    parsed = pd.to_datetime(df['intake_time'], errors='coerce')

    # Normalise timezone-aware results to naive UTC so the epoch subtraction
    # below is well defined.
    if isinstance(parsed.dtype, pd.DatetimeTZDtype):
        parsed = parsed.dt.tz_convert('UTC').dt.tz_localize(None)

    # Floor-dividing by a one-second Timedelta gives epoch seconds independent
    # of the datetime resolution pandas chose, and maps NaT to NaN.
    epoch = pd.Timestamp('1970-01-01')
    df['intake_time'] = (parsed - epoch) // pd.Timedelta(seconds=1)
    return engineer_time(df)

# Age Upon Intake (year-based variant)
def convert_age(age_str):
    """
    Convert an age string to a number of years.

    Expected format: "<number> <unit>", e.g. "2 years", "8 months",
    "3 weeks", "15 days". Returns None for non-strings, strings that do not
    split into exactly two tokens, non-numeric amounts, or unrecognised units.
    """
    if not isinstance(age_str, str):
        return None

    tokens = age_str.split()
    if len(tokens) != 2:
        return None
    amount_text, unit_text = tokens

    try:
        amount = float(amount_text)
    except ValueError:
        return None

    # Checked in this order; the first keyword found inside the unit wins.
    conversions = (
        ("year", lambda n: n),
        ("month", lambda n: n / 12),
        ("week", lambda n: n / 52),
        ("day", lambda n: n / 365),
    )
    unit = unit_text.lower()
    for keyword, to_years in conversions:
        if keyword in unit:
            return to_years(amount)
    return None

def clean_age_and_sex_upon_intake(df):
    """Fill missing sex values and convert age strings to months.

    Missing ``sex_upon_intake`` entries become 'Unknown'. ``age_upon_intake``
    is converted with ``age_in_months`` (``convert_age`` is the year-based
    alternative) and negative ages are set to 0. ``df`` is modified in place
    and returned.
    """
    missing_sex = df['sex_upon_intake'].isna()
    df.loc[missing_sex, 'sex_upon_intake'] = 'Unknown'

    df['age_upon_intake'] = df['age_upon_intake'].apply(age_in_months)
    negative_age = df['age_upon_intake'] < 0
    df.loc[negative_age, 'age_upon_intake'] = 0
    return df

In [10]:
def map_cat_rarity(df):
    """Collapse cat breeds into 'Common' / 'Rare' and assign a fixed size.

    A breed whose name contains "domestic" (case-insensitive) becomes
    'Common'; every other breed, including a missing one, becomes 'Rare'.
    Every row gets ``size`` 2 as an integer. ``df`` is modified in place and
    returned.
    """
    is_domestic = df["breed"].str.lower().str.contains("domestic", na=False)
    df["breed"] = np.where(is_domestic, "Common", "Rare")

    df["size"] = 2
    df["size"] = df["size"].astype(int)  # enforce int type
    return df

In [11]:
def clean_breed(df):
    """Derive ``is_mix`` and replace raw breeds with dog clusters / cat rarity.

    ``is_mix`` is 1 when the breed contains "mix" (any case) or a '/'. The
    text " mix" is then removed from the breed. Dog rows go through
    ``map_dog_cluster`` and cat rows through ``map_cat_rarity``; the two
    results are concatenated and sorted back into index order. Rows whose
    ``animal_type`` is neither 'Dog' nor 'Cat' are not part of the result.
    Missing sizes are filled with 3. ``df`` itself receives the ``is_mix``
    column and the " mix"-stripped breed.
    """
    breed = df['breed']
    has_mix_word = breed.str.contains('mix', case=False, na=False).astype(int)
    has_slash = breed.str.contains('/', case=False, na=False).astype(int)
    df['is_mix'] = has_mix_word | has_slash

    # Strip the " mix" suffix so breed names can match the lookup tables.
    df['breed'] = df['breed'].str.replace(' mix', '', case=False)

    dogs = map_dog_cluster(df[df['animal_type'] == 'Dog'].copy())
    cats = map_cat_rarity(df[df['animal_type'] == 'Cat'].copy())

    # Stitch both species together and restore the original row order.
    df_cleaned = pd.concat([dogs, cats]).sort_index()
    df_cleaned['size'] = df_cleaned['size'].fillna(3).astype(int)

    return df_cleaned

In [12]:
# Manual breed corrections. map_dog_cluster consults this table BEFORE it
# splits a breed string on '/', so whole "A/B" strings can be mapped here.
# Keys must match the breed text exactly (no typos, no surrounding spaces),
# otherwise the entry can never be hit.
manual_fixes = {
  'Black/Tan Hound': 'English Coonhound',
  'Black/Tan Hound/Black Mouth Cur': 'English Coonhound',
  'Black/Tan Hound/Catahoula': 'English Coonhound',
  'Black/Tan Hound/Doberman Pinsch': 'English Coonhound',
  'Black/Tan Hound/Great Dane': 'English Coonhound',
  'Black/Tan Hound/Labrador Retriever': 'English Coonhound',
  'Black/Tan Hound/Siberian Husky': 'English Coonhound',
  'Catahoula/Black/Tan Hound': 'Catahoula',
  'Dachshund Stan': 'Dachshund',
  'Dachshund/Black/Tan Hound': 'Dachshund',
  'Doberman Pinsch/Black/Tan Hound': 'Doberman Pinsch',
  'German Shepherd/Black/Tan Hound': 'German Shepherd',
  'Labrador Retriever/Black/Tan Hound': 'Labrador Retriever',
  'Plott Hound/Black/Tan Hound': 'Plott Hound',
  'Dachshund Stan Mix': 'Dachshund',

  # nneomas mappings (may have created new breeds in dog_info)
  "Boxer/Miniature Poodle" : "Boxerdoodle",
  "Miniature Poodle/Boxer" : "Boxerdoodle",  # key was misspelled "Minature"
  "Miniature Poodle/Cocker Spaniel": "Cockapoo",
  "Cocker Spaniel/Miniature Poodle": "Cockapoo",
  "Miniature Poodle/English Cocker Spaniel": "Cockapoo",
  "English Cocker Spaniel/Miniature Poodle": "Cockapoo",
  "Toy Poodle/Cocker Spaniel": "Cockapoo",
  "Cocker Spaniel/Toy Poodle": "Cockapoo",
  "Toy Poodle/English Cocker Spaniel": "Cockapoo",
  "English Cocker Spaniel/Toy Poodle": "Cockapoo",
  "Standard Poodle/Cocker Spaniel": "Cockapoo",
  "Cocker Spaniel/Standard Poodle": "Cockapoo",
  "Standard Poodle/English Cocker Spaniel": "Cockapoo",
  "English Cocker Spaniel/Standard Poodle": "Cockapoo",
  "Miniature Poodle/Lhasa Apso": "Lhasapoo",
  "Lhasa Apso/Miniature Poodle": "Lhasapoo",
  "Lhasa Apso/Toy Poodle": "Lhasapoo",
  "Toy Poodle/Lhasa Apso": "Lhasapoo",
  "Standard Poodle/Lhasa Apso": "Lhasapoo",
  "Lhasa Apso/Standard Poodle": "Lhasapoo",
  "Standard Poodle/Labrador Retriever": "Labradoodle",
  "Labrador Retriever/Standard Poodle": "Labradoodle",
  "Labrador Retriever/Miniature Poodle": "Labradoodle Miniature",
  "Miniature Poodle/Labrador Retriever": "Labradoodle Miniature",
  "Labrador Retriever/Toy Poodle": "Labradoodle Miniature",
  "Toy Poodle/Labrador Retriever": "Labradoodle Miniature",
  "Yorkshire Terrier/Miniature Poodle" : "Yorkipoo",
  "Miniature Poodle/Yorkshire Terrier" : "Yorkipoo",
  "Yorkshire Terrier/Toy Poodle" : "Yorkipoo",
  "Toy Poodle/Yorkshire Terrier" : "Yorkipoo",
  "Yorkshire Terrier/Standard Poodle" : "Yorkipoo",  # key had a trailing space
  "Standard Poodle/Yorkshire Terrier" : "Yorkipoo",  # key was misspelled "Poodlee"
  # Mini Schnoodle (Mini Schnauzer x Mini or Toy Poodle)
  "Miniature Schnauzer/Miniature Poodle": "Mini Schnoodle",
  "Miniature Poodle/Miniature Schnauzer": "Mini Schnoodle",
  "Miniature Schnauzer/Toy Poodle": "Mini Schnoodle",
  "Toy Poodle/Miniature Schnauzer": "Mini Schnoodle",
  # Standard Schnoodle (Miniature or Standard Schnauzer x Mini or Standard Poodle)
  "Standard Schnauzer/Miniature Poodle": "Standard Schnoodle",
  "Miniature Poodle/Standard Schnauzer": "Standard Schnoodle",
  "Miniature Schnauzer/Standard Poodle": "Standard Schnoodle",
  "Standard Poodle/Miniature Schnauzer": "Standard Schnoodle",
  "Standard Schnauzer/Toy Poodle": "Standard Schnoodle",
  "Toy Poodle/Standard Schnauzer": "Standard Schnoodle",
  # Giant Schnoodle (Giant Schnauzer x Standard Poodle)
  "Standard Schnauzer/Standard Poodle": "Giant Schnoodle",
  "Standard Poodle/Standard Schnauzer": "Giant Schnoodle",
  "Schnauzer Giant/Standard Poodle": "Giant Schnoodle",
  "Standard Poodle/Schnauzer Giant": "Giant Schnoodle",
  "German Shepherd/Pit Bull" : "German Shepherd Pit Bull",
  "Pit Bull/German Shepherd" : "German Shepherd Pit Bull",
  "Labrador Retriever/Pit Bull":"Labrabull",
  "Pit Bull/Labrador Retriever":"Labrabull",
  "Labrador Retriever/American Pit Bull Terrier": "Labrabull",
  "American Pit Bull Terrier/Labrador Retriever": "Labrabull",
  "German Shepherd/Labrador Retriever": "German Sheprador",
  "Labrador Retriever/German Shepherd": "German Sheprador",
  "Australian Shepherd/Labrador Retriever": "Australian Shepherd Lab",
  "Labrador Retriever/Australian Shepherd": "Australian Shepherd Lab",
  "Chihuahua Shorthair/Dachshund": "Chiweenie",
  "Dachshund/Chihuahua Shorthair": "Chiweenie",
  "Dachshund Longhair/Chihuahua Longhair": "Chiweenie",
  "Chihuahua Shorthair/Dachshund Wirehair": "Chiweenie",
  "Dachshund Wirehair/Chihuahua Shorthair": "Chiweenie",
  "Chihuahua Longhair/Dachshund": "Chiweenie",
  "Dachshund/Chihuahua Longhair": "Chiweenie",
  "Dachshund Longhair/Chihuahua Shorthair": "Chiweenie",
  "Chihuahua Longhair/Dachshund Longhair": "Chiweenie",
  "Dachshund Wirehair/Chihuahua Longhair": "Chiweenie",
  "Chihuahua Longhair/Dachshund Wirehair": "Chiweenie",
  "Chihuahua Shorthair/Dachshund Longhair": "Chiweenie",
  "Labrador Retriever/Border Collie": "Borador",
  "Border Collie/Labrador Retriever": "Borador",
  "Maltese/Miniature Poodle": "Maltipoo",
  "Miniature Poodle/Maltese": "Maltipoo",
  "Maltese/Toy Poodle": "Maltipoo",
  "Toy Poodle/Maltese": "Maltipoo",
  "Maltese/Standard Poodle": "Maltipoo",
  "Standard Poodle/Maltese": "Maltipoo",
  "Pit Bull/Boxer": "Bullboxer Pit",
  "Boxer/Pit Bull" : "Bullboxer Pit",
  "Boxer/American Pit Bull Terrier" : "Bullboxer Pit",
  "American Pit Bull Terrier/Boxer" : "Bullboxer Pit",
  "Siberian Husky/German Shepherd" : "Shepsky",
  "German Shepherd/Siberian Husky" : "Shepsky",
  "Australian Shepherd/Siberian Husky" : "Australian Shepherd Husky",
  "Siberian Husky/Australian Shepherd" : "Australian Shepherd Husky",
  "Australian Shepherd/Pit Bull" : "Australian Shepherd Pit Bull",
  "Pit Bull/Australian Shepherd" : "Australian Shepherd Pit Bull"
}

In [13]:
import logging

# Write log records (INFO and above) to a file with timestamped lines.
logging.basicConfig(
    level=logging.INFO,                                   # switch to logging.DEBUG for more detail
    filename='breed_mapping.log',                         # destination log file
    datefmt='%Y-%m-%d %H:%M:%S',
    format='[%(asctime)s] %(levelname)s - %(message)s',
)

In [14]:
def map_dog_cluster(df):
    """Resolve dog breeds to known names, attach a size, and map breeds to clusters.

    Steps, in order:
      1. Read ``dog_info.csv`` and keep only rows of ``df`` whose
         ``animal_type`` is "Dog" (working on a copy).
      2. For every breed with no exact 'Breed Name' match in ``dog_info``,
         pick a replacement: a ``manual_fixes`` entry if present; otherwise,
         for names ending in "Mix", the name with "Mix" removed if that is a
         known breed; otherwise the first '/'-separated part that is a known
         breed. Names that cannot be resolved are collected and logged.
      3. Apply the replacements, then look up ``size`` from the 'Size' column
         of ``dog_info`` (nullable ``Int64``; missing when there is no match).
      4. Read ``dog_breed_to_cluster_map.csv`` and replace ``breed`` with its
         'Cluster' value; breeds not in that map become "Rare", while the
         literal "Unknown" is kept as "Unknown".

    Progress and failures are reported through the ``logging`` module.

    Parameters:
        df: DataFrame with at least ``animal_type`` and ``breed`` columns.

    Returns:
        The dog-only copy of ``df`` with ``breed`` replaced by a cluster
        label and a new ``size`` column.
    """
    # Load data
    dog_info = pd.read_csv('dog_info.csv')
    known_breeds = set(dog_info['Breed Name'].dropna().unique())

    # Clean dog records
    df_dog = df[df["animal_type"] == "Dog"].copy()

    # Merge to find unmatched: after the left join, rows with a missing
    # 'Breed Name' are the breeds that have no exact match in dog_info.
    breeds = pd.merge(df_dog, dog_info, left_on='breed', right_on='Breed Name', how='left')
    unmapped = breeds[breeds['Breed Name'].isna()]
    # NOTE(review): names are stripped here, but the rename maps built from
    # them are later applied to the raw column values, so a breed with
    # surrounding whitespace would not be renamed.
    unmapped_breed_list = sorted([str(b).strip() for b in unmapped['breed'].dropna().unique()])
    # print(breeds.head())
    # Create final rename mapping and unmatched list
    breed_rename_map = {}
    unmatched_breeds = []
    
    logging.info("\n[LOG] Starting breed resolution process...\n")

    for original_breed in unmapped_breed_list:
        # Apply manual correction if available (takes priority over every
        # automatic rule below)
        if original_breed in manual_fixes:
            fixed = manual_fixes[original_breed]
            logging.info(f"[MANUAL] '{original_breed}' → '{fixed}'")
            breed_rename_map[original_breed] = fixed
            continue

        # Split the breed string
        parts = [p.strip() for p in original_breed.split('/')]

        # Check for "Mix" at the end. The replace() removes every "Mix"
        # occurrence in the name, not only the trailing one.
        # NOTE(review): clean_breed strips " mix" before calling this
        # function, so this branch presumably fires rarely - confirm.
        if original_breed.strip().endswith("Mix"):
            candidate = original_breed.replace("Mix", "").strip()
            if candidate in known_breeds:
                logging.info(f"[MIX] '{original_breed}' ends in 'Mix', resolved to known breed '{candidate}'")
                breed_rename_map[original_breed] = candidate
            else:
                logging.warning(f"[MIX-FAIL] '{original_breed}' ends in 'Mix' but '{candidate}' not found in dog_info")
                unmatched_breeds.append(original_breed)
            continue

        # Try resolving multi-part breed: the first part that is a known
        # breed wins; later parts are ignored.
        matched = None
        for part in parts:
            if part in known_breeds:
                matched = part
                logging.info(f"[MATCH] '{original_breed}' → first matched breed '{matched}'")
                breed_rename_map[original_breed] = matched
                break

        if not matched:
            logging.warning(f"[UNMATCHED] No parts of '{original_breed}' matched known breeds")
            unmatched_breeds.append(original_breed)

    # Apply initial mapping (manual fixes first, then the automatic renames,
    # otherwise keep the value unchanged)
    df_dog['breed'] = df_dog['breed'].apply(
        lambda b: manual_fixes.get(b, breed_rename_map.get(b, b))
    )

    # Create a lookup dictionary from dog_info
    breed_to_size = dict(zip(dog_info['Breed Name'], dog_info['Size']))

    # Apply the mapping; breeds absent from dog_info get a missing size
    df_dog['size'] = df_dog['breed'].map(breed_to_size)

    # Set Unknown breeds to 3
    # df_dog.loc[df_dog['breed'] == 'Unknown', 'size'] = 3

    # If you want it to be integer type
    df_dog['size'] = df_dog['size'].astype('Int64')  # Allows NA-safe ints

    # Combine all rename logic (manual_fixes entries override automatic ones)
    combined_rename_map = {**breed_rename_map, **manual_fixes}

    # Second rename pass over the already-renamed values; 'size' above was
    # computed before this pass.
    df_dog['breed'] = df_dog['breed'].apply(lambda b: combined_rename_map.get(b, b))
    
    # Load breed-to-cluster mapping
    breed_cluster_df = pd.read_csv('dog_breed_to_cluster_map.csv')
    breed_to_cluster = dict(zip(breed_cluster_df['Breed Name'], breed_cluster_df['Cluster']))

    # Map breed to cluster: "Unknown" stays "Unknown", anything missing from
    # the cluster map becomes "Rare"
    df_dog['breed'] = df_dog['breed'].apply(
        lambda b: breed_to_cluster.get(b, "Rare") if b != "Unknown" else "Unknown"
    )
    
    # Optional: Log all final unmapped as rare
    logging.info("\n[SUMMARY] Breeds marked as 'Rare' due to no valid match:")
    for breed in unmatched_breeds:
        logging.info(f" - {breed}")


    return df_dog

In [15]:
# Synonymous colour names folded into one canonical name.
# Built from (canonical, aliases) groups; the resulting dict maps alias -> canonical.
color_group_map = {
    alias: canonical
    for canonical, aliases in (
        ('gray tabby', ('blue tabby', 'silver tabby')),
        ('gray', ('silver', 'blue')),
        ('orange', ('orange tabby', 'orange tiger', 'red', 'red tabby', 'red tick', 'yellow')),
        ('cream', ('tan',)),
    )
    for alias in aliases
}

def clean_color(df):
    """Derive ``primary_color`` from ``color`` and drop the raw column.

    ``color`` is lower-cased and stripped on ``df`` itself. The primary colour
    is the text before the first '/' (the whole value when there is no '/'),
    computed on the string form of the column, then folded through
    ``color_group_map`` where a synonym exists. Returns a new frame without
    ``color``.
    """
    # lowercase
    df['color'] = df['color'].str.lower().str.strip()

    # feature engineering -> primary color
    # split('/') returns the whole string as its only item when no '/' is present
    first_color = df['color'].astype(str).apply(
        lambda text: text.split('/')[0].strip()
    )

    # simplify synonymous colors if in map, otherwise keep the value as is
    df['primary_color'] = first_color.map(color_group_map).fillna(first_color)

    return df.drop(columns=['color'])


def freq_encode(df, col):
    """Replace each value in ``df[col]`` with how often it occurs in that column.

    Counts come from ``value_counts()`` on the frame that is passed in. ``df``
    is modified in place and returned.
    """
    occurrences = df[col].value_counts()
    df[col] = df[col].map(occurrences)
    return df

In [16]:
def clean_intake_cond(df):
    """Merge fine-grained ``intake_condition`` labels into broader groups.

    Values not listed in the mapping are left untouched. ``df`` is modified
    in place and returned.
    """
    # No target label is itself a key, so one pass is equivalent to applying
    # the groups one after another. 'Agonal' maps to 'Med Urgent'.
    grouped_label = {
        'Unknown': 'Unknown Condition / Other',
        'Other': 'Unknown Condition / Other',
        'Space': 'Unknown Condition / Other',
        'Behavior': 'Normal / Behavior',
        'Normal': 'Normal / Behavior',
        'Neonatal': 'Nursing / Neonatal',
        'Nursing': 'Nursing / Neonatal',
        'Neurologic': 'Med Urgent',
        'Agonal': 'Med Urgent',
        'Parvo': 'Med Urgent',
        'Congenital': 'Sick',
    }
    df['intake_condition'] = df['intake_condition'].replace(grouped_label)

    return df


In [17]:
def clean_data(df):
    """Run every cleaning step on ``df`` in order, printing a message after each.

    Order: drop unused columns, intake time, intake condition, age and sex,
    color, breed. Returns the frame produced by the last step.
    """
    steps = (
        (drop_cols, 'dropped columns'),
        (clean_intake_time, 'cleaned intake time'),
        (clean_intake_cond, 'cleaned intake condition'),
        (clean_age_and_sex_upon_intake, 'cleaned age and sex'),
        (clean_color, 'cleaned color'),
        (clean_breed, 'cleaned breed'),
    )
    for step, done_message in steps:
        df = step(df)
        print(done_message)
    return df

In [18]:
from sklearn.preprocessing import OneHotEncoder
def encode_columns(df):
    """One-hot encode ``intake_type``, ``intake_condition`` and ``sex_upon_intake``.

    Each of the three columns is replaced by its ``pd.get_dummies`` indicator
    columns, appended on the right in that order; the indicator columns are
    named after the category values. The input frame is not modified.

    The previous version also built a ``OneHotEncoder`` and fitted it on a
    non-existent ``'category_column'``, so every call raised ``KeyError``;
    that unused encoder has been removed.

    Parameters:
        df: DataFrame containing the three categorical columns.

    Returns:
        A new DataFrame with the categorical columns replaced by indicators.
    """
    for column in ('intake_type', 'intake_condition', 'sex_upon_intake'):
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df.drop(column, axis=1), dummies], axis=1)
    return df

In [19]:
df_train = clean_data(df_train)
# One-hot encoding is currently disabled:
# df_train = encode_columns(df_train)

# Move the target to the right-most position: once popped, re-assigning the
# column appends it at the end of the frame.
label_column = df_train.pop('outcome_type')
df_train['outcome_type'] = label_column
df_train.head()

dropped columns
cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed


Unnamed: 0,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,intake_year,intake_month,intake_hour,primary_color,is_mix,size,outcome_type
0,Stray,Normal / Behavior,Dog,Spayed Female,96.0,2,2015,7,12,white,0,3,Return to Owner
1,Stray,Normal / Behavior,Dog,Intact Male,11.0,7,2016,4,18,sable,1,2,Return to Owner
2,Public Assist,Normal / Behavior,Cat,Neutered Male,24.0,Common,2022,5,0,orange,0,2,Transfer
3,Owner Surrender,Normal / Behavior,Dog,Neutered Male,24.0,2,2017,2,12,chocolate,1,4,Return to Owner
4,Public Assist,Normal / Behavior,Dog,Neutered Male,72.0,3,2019,4,9,black,1,5,Return to Owner


In [20]:
df_test = clean_data(df_test)
# Show which breed labels/clusters occur in the cleaned test set.
df_test['breed'].unique()

dropped columns


  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


cleaned intake time
cleaned intake condition
cleaned age and sex
cleaned color
cleaned breed


array([7, 'Common', 3, 4, 1, 5, 0, 2, 'Rare', 8, 6, 'Unknown'],
      dtype=object)

In [44]:
# Outcome distribution per primary colour: count each (colour, outcome) pair,
# then divide every row by its total to get within-colour proportions.
d = (
    df_train
    .groupby(['primary_color', 'outcome_type'])
    .size()
    .unstack(fill_value=0)
)
d_percent = d.div(d.sum(axis=1), axis=0)

d_percent


outcome_type,Adoption,Died,Euthanasia,Return to Owner,Transfer
primary_color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
agouti,0.636364,0.0,0.0,0.090909,0.272727
apricot,0.43038,0.0,0.012658,0.278481,0.278481
black,0.503646,0.010866,0.030902,0.149319,0.305267
black brindle,0.512821,0.005698,0.031339,0.213675,0.236467
black smoke,0.601695,0.004237,0.025424,0.059322,0.309322
black tabby,0.367698,0.013746,0.051546,0.020619,0.546392
black tiger,0.625,0.0,0.125,0.0,0.25
blue cream,0.52381,0.0,0.083333,0.071429,0.321429
blue merle,0.51626,0.002033,0.018293,0.27439,0.189024
blue point,0.564356,0.009901,0.049505,0.049505,0.326733


In [88]:
# Transformer code for pipelines

# Meant to be wrapped in a FunctionTransformer so frequency encoding can run
# as a pipeline step.
def apply_freq_encode(df):
    """Frequency-encode the ``primary_color`` and ``breed`` columns of ``df``."""
    for column in ('primary_color', 'breed'):
        df = freq_encode(df, column)
    return df

In [89]:
def combine_predictions(dog_pred, cat_pred, dog_true, cat_true,
                        csv_path='./test_predictions_combined.csv'):
    """
    Combines dog and cat predictions, ordering them by the original test set
    indices, and writes a CSV with sequential Ids and Outcome Types.

    Example CSV output:
    Id,Outcome Type
    1,Died
    2,Euthanasia
    3,Adoption
    ...

    Parameters:
        dog_pred, cat_pred: Predicted labels, one per row of ``dog_true`` /
            ``cat_true`` and in the same order (array, list or Series).
        dog_true, cat_true: Objects whose ``.index`` holds the original test
            set row labels for the dog / cat subsets.
        csv_path: Destination file; defaults to the previously hard-coded path.

    Returns:
        None. The result is written to ``csv_path``.
    """
    # Pair predictions with the original row labels by POSITION. np.asarray
    # discards any index a pandas Series of predictions carries; without it
    # pandas would re-align by label and could produce NaN or misordered rows.
    dog_pred_series = pd.Series(np.asarray(dog_pred), index=dog_true.index)
    cat_pred_series = pd.Series(np.asarray(cat_pred), index=cat_true.index)

    # Concatenate both series and sort by the original index so the output
    # reflects the same order as the original test dataset.
    all_predictions = pd.concat([dog_pred_series, cat_pred_series]).sort_index()

    # Generate sequential IDs starting from 1 and create the final DataFrame
    final_df = pd.DataFrame({
        'Id': range(1, len(all_predictions) + 1),
        'Outcome Type': all_predictions.values
    })

    # Save the DataFrame to CSV with a header row; commas separate each field.
    final_df.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")

In [90]:
def save_predictions(y_pred, model_name):
    """
    Write predictions to a CSV without any dog/cat stitching.

    Example CSV output:
    Id,Outcome Type
    1,Died
    2,Euthanasia
    3,Adoption
    ...

    Parameters:
        y_pred (numpy.ndarray): Predicted labels, already in output order.
        model_name (str): Inserted into the output file name.
    """
    # Ids are sequential, starting at 1, in the order the predictions arrive.
    row_ids = range(1, len(y_pred) + 1)
    submission = pd.DataFrame({'Id': row_ids, 'Outcome Type': y_pred})

    # Header row included, comma separated, no index column.
    csv_path = './test_' + model_name + '_predictions_combined.csv'
    submission.to_csv(csv_path, index=False)
    print(f"Combined test predictions saved to: {csv_path}")


In [91]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
# Scorer object built from balanced_accuracy_score via make_scorer.
balanced_acc_scorer = make_scorer(balanced_accuracy_score)
# 5-fold stratified splitter; shuffling is enabled with a fixed random_state of 42.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report and return the balanced accuracy.

    Despite the function name, the returned value is
    ``balanced_accuracy_score(y_true, y_pred)``, not plain accuracy.
    """
    report = classification_report(y_true, y_pred)
    print(report)

    score = balanced_accuracy_score(y_true, y_pred)
    return score

In [92]:
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights(y_train):
    """Return a ``{class label: weight}`` dict using 'balanced' class weights.

    Labels are the unique values of ``y_train``; weights come from
    ``compute_class_weight(class_weight='balanced', ...)``.
    """
    labels = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=labels, y=y_train)
    return dict(zip(labels, weights))

In [93]:
# Completion message for this notebook.
print('Done running ml_project.ipynb.')

Done running ml_project.ipynb.
