In [97]:
import pandas as pd
import numpy as np

In [98]:
def drop_cols(df):
    cols_to_drop = ['id', 'name', 'date_of_birth', 'outcome_time', 'found_location']
    existing_cols = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=existing_cols)
    return df

In [99]:
# THIS IS FOR DATA CLEANING - all the stuff should be here
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train = df_train[df_train['intake_type'] != 'Wildlife']
df_train = df_train.dropna(subset=['age_upon_intake'])

In [100]:
def clean_intake_time(df):
    # Intake Time
    # Convert string timestamps to UNIX timestamp
    dt_series = pd.to_datetime(df['intake_time'], errors='coerce')
    df['intake_time'] = dt_series.astype('int64') // (10 ** 9)
    return df

# Age Upon Intake
def convert_age(age_str):
    """
    Convert age strings to years.
    Expected format: "<number> <unit>" e.g., "2 years", "8 months", "3 weeks", "15 days"
    """
    if not isinstance(age_str, str):
        return None

    parts = age_str.split()
    if len(parts) != 2:
        return None

    try:
        num = float(parts[0])
    except ValueError:
        return None

    unit = parts[1].lower()
    if "year" in unit:
        return num
    elif "month" in unit:
        return num / 12
    elif "week" in unit:
        return num / 52
    elif "day" in unit:
        return num / 365
    else:
        return None

def clean_age_and_sex_upon_intake(df):
    df.loc[df['sex_upon_intake'].isna(), 'sex_upon_intake'] = 'Unknown'
    df['age_upon_intake'] = df['age_upon_intake'].apply(convert_age)
    df.loc[df['age_upon_intake'] < 0, 'age_upon_intake'] = 0
    return df

# Breed
def clean_breed(df):
# Create is_mix column
    df['is_mix'] = df['breed'].str.contains('mix', case=False, na=False).astype(int)
    # remove mix from all breeds
    df['breed'] = df['breed'].str.replace(' mix', '', case=False)
    return df

In [101]:
color_group_map = {
  'blue tabby':       'gray tabby',
  'silver tabby':     'gray tabby',
  'silver':           'gray',
  'blue':             'gray',
  'orange tabby':     'orange',
  'orange tiger':     'orange',
  'red':              'orange',
  'red tabby':        'orange',
  'red tick':         'orange',
  'yellow':           'orange',
  'tan':              'cream',
}

def clean_color(df):
  # lowercase
  df['color'] = df['color'].str.lower().str.strip()

  # feature engineering -> primary color 
  df['primary_color'] = df['color'].astype(str).apply(
      lambda x: x.split('/')[0].strip() if '/' in x else x.strip()
  )

  # simplify synonymous colors if in map
  df['primary_color'] = df['primary_color'].map(color_group_map).fillna(df['primary_color'])

  df = df.drop(columns=['color'])
  return df


def freq_encode(df, col):
  # count frequencies
  freq_series = df[col].value_counts()

  # map frequencies back to the original column, replacing values
  df[col] = df[col].map(freq_series)

  return df

In [None]:
def clean_intake_cond(df):
    # Known mappings
    df['intake_condition'] = df['intake_condition'].replace({
        'Unknown': 'Unknown Condition / Other', 
        'Other': 'Unknown Condition / Other',
        'Space': 'Unknown Condition / Other',
        'Behavior': 'Normal / Behavior', 
        'Normal': 'Normal / Behavior',
        'Neonatal': 'Nursing / Neonatal', 
        'Nursing': 'Nursing / Neonatal',
        'Neurologic': 'Med Urgent', 
        'Agonal': 'Med Urgent', 
        'Parvo': 'Med Urgent',
        'Congenital': 'Sick'
    })

# Replace any unknown conditions (those not in the mapping) with 'Med Urgent'
df['intake_condition'] = df['intake_condition'].apply(lambda x: x if x in df['intake_condition'].unique() else 'Med Urgent')








    df['intake_condition'] = df['intake_condition'].replace({'Unknown': 'Unknown Condition / Other', 'Other': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Space': 'Unknown Condition / Other'})
    df['intake_condition'] = df['intake_condition'].replace({'Behavior': 'Normal / Behavior', 'Normal': 'Normal / Behavior'})
    df['intake_condition'] = df['intake_condition'].replace({'Neonatal': 'Nursing / Neonatal', 'Nursing': 'Nursing / Neonatal'})
    df['intake_condition'] = df['intake_condition'].replace({'Neurologic': 'Med Urgent', 'Agonal': 'Med Urgent', 'Parvo': 'Med Urgent'})
    df['intake_condition'] = df['intake_condition'].replace({'Agonal': 'Med Urgent / Neurological'})
    df['intake_condition'] = df['intake_condition'].replace({'Congenital': 'Sick'})
    return df


In [103]:
def clean_animal_type(df):
    dummies = pd.get_dummies(df['animal_type'], drop_first=True)
    df = df.drop('animal_type', axis=1)
    df = pd.concat([df, dummies], axis=1)
    return df

In [104]:
def clean_data(df):
    df = drop_cols(df)
    df = clean_intake_time(df)
    df = clean_intake_cond(df)
    df = clean_age_and_sex_upon_intake(df)
    df = clean_breed(df)
    df = clean_color(df)
    df = clean_animal_type(df)
    return df

In [105]:
def encode_columns(df):
    dummies = pd.get_dummies(df['intake_type'])
    df = df.drop('intake_type', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['intake_condition'])
    df = df.drop('intake_condition', axis=1)
    df = pd.concat([df, dummies], axis=1)

    dummies = pd.get_dummies(df['sex_upon_intake'])
    df = df.drop('sex_upon_intake', axis=1)
    df = pd.concat([df, dummies], axis=1)
    return df

In [106]:
df_train = clean_data(df_train)
df_train = encode_columns(df_train)
label_column = df_train.pop('outcome_type')
df_train.insert(df_train.shape[1], 'outcome_type', label_column)    
df_train.head()

Unnamed: 0,intake_time,age_upon_intake,breed,is_mix,primary_color,Dog,Abandoned,Euthanasia Request,Owner Surrender,Public Assist,...,Nursing / Neonatal,Pregnant,Sick,Unknown Condition / Other,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown,outcome_type
0,1436101140,8.0,English Springer Spaniel,0,white,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,Return to Owner
1,1460659380,0.916667,Basenji,1,sable,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,Return to Owner
2,1652314980,2.0,Domestic Shorthair,0,orange,False,False,False,False,True,...,False,False,False,False,False,False,True,False,False,Transfer
3,1487421960,2.0,Labrador Retriever,1,chocolate,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,Return to Owner
4,1555408380,6.0,Great Dane,1,black,True,False,False,False,True,...,False,False,False,False,False,False,True,False,False,Return to Owner


In [107]:

# cats = df_train[df_train['animal_type'] == 'Cat']
# dogs = df_train[df_train['animal_type'] == 'Dog']

In [108]:
# cats = cats.drop('animal_type', axis=1)
# dogs = dogs.drop('animal_type', axis=1)

# cats_data = cats.iloc[:, :-1]
# cats_labels = cats.iloc[:, -1:]

# dogs_data = dogs.iloc[:, :-1]
# dogs_labels = dogs.iloc[:, -1:]

In [109]:
# cats.head()

In [110]:
# Working with Decision Trees
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [111]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    df = freq_encode(df, 'primary_color')
    df = freq_encode(df, 'breed')

    return df

In [112]:
sample = df_train.sample(n=1000)
df_data     = sample.iloc[:, :-1]
df_labels   = sample.iloc[:, -1:]

# we are now going to clean test so we can test
df_test = clean_data(df_test)
df_test = encode_columns(df_test)

  dt_series = pd.to_datetime(df['intake_time'], errors='coerce')


In [113]:
# SLAY CHAT!
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
import pandas as pd

tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')

pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', tree)
])

# Hyperparameter grid
HP = {
    "DT__max_depth": [10, 12, 15, 17, 20, 25],
    "DT__max_features": [None, 5, 15, 20, 28],  
    "DT__min_samples_leaf": [5, 10, 20, 30, 40, 50]
}

# Grid search with cross-validation
griddy = GridSearchCV(estimator=pipe, param_grid=HP, cv=10, scoring='accuracy')

# Now actually fit the model
griddy.fit(df_data, df_labels.values.ravel())

# Output results
print('The best parameters for our model are:', griddy.best_params_)
print('The best accuracy we obtained using the best hyperparameter values is:', griddy.best_score_)

# Generalization accuracy using cross_val_score (optional)
accs = cross_val_score(griddy.best_estimator_, X=df_data, y=df_labels.values.ravel(), cv=10)
print('The generalization accuracy of the tuned CV model is:', accs.mean())

# Apply predictions using the best estimator from the grid search
test_predictions = griddy.predict(df_test)

# Save test predictions to CSV
df_test_output = pd.DataFrame({
  'Predicted_Label': test_predictions
})

csv_test_path = './test_predictions.csv'
df_test_output.to_csv(csv_test_path, index=False)
print(f'Test predictions saved to: {csv_test_path}')



The best parameters for our model are: {'DT__max_depth': 25, 'DT__max_features': None, 'DT__min_samples_leaf': 5}
The best accuracy we obtained using the best hyperparameter values is: 0.42800000000000005
The generalization accuracy of the tuned CV model is: 0.398




ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Panleuk
