In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [2]:
# THIS IS FOR DATA CLEANING - all the stuff should be here
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# we are dropping color after we establish a primary color columns
df_train = df_train.drop(columns=['id', 'name', 'date_of_birth', 'outcome_time', 'found_location'])
df_train = df_train.dropna(subset=['age_upon_intake'])

In [3]:
# Intake Time
# Convert string timestamps to UNIX timestamp
dt_series = pd.to_datetime(df_train['intake_time'])
df_train['intake_time'] = dt_series.astype('int64') // (10 ** 9)
print(df_train['intake_time'])

# Sex Upon Intake
# Replace NA values with 'Unknown' using .loc for proper assignment
df_train.loc[df_train['sex_upon_intake'].isna(), 'sex_upon_intake'] = 'Unknown'

# Age Upon Intake
def convert_age(age_str):
    """
    Convert age strings to years.
    Expected format: "<number> <unit>" e.g., "2 years", "8 months", "3 weeks", "15 days"
    """
    if not isinstance(age_str, str):
        return None

    parts = age_str.split()
    if len(parts) != 2:
        return None

    try:
        num = float(parts[0])
    except ValueError:
        return None

    unit = parts[1].lower()
    if "year" in unit:
        return num
    elif "month" in unit:
        return num / 12
    elif "week" in unit:
        return num / 52
    elif "day" in unit:
        return num / 365
    else:
        return None

df_train['age_upon_intake'] = df_train['age_upon_intake'].apply(convert_age)
df_train.loc[df_train['age_upon_intake'] < 0, 'age_upon_intake'] = 0

# Breed
# Create is_mix column
df_train['is_mix'] = df_train['breed'].str.contains('mix', case=False, na=False).astype(int)
# remove mix from all breeds
df_train['breed'] = df_train['breed'].str.replace(' mix', '', case=False)

0         1436101140
1         1460659380
2         1652314980
3         1487421960
4         1555408380
             ...    
111152    1724930100
111153    1725115920
111154    1725200160
111155    1715623860
111156    1724509500
Name: intake_time, Length: 111156, dtype: int64


In [4]:
color_group_map = {
  'blue tabby':       'gray tabby',
  'silver tabby':     'gray tabby',
  'silver':           'gray',
  'blue':             'gray',
  'orange tabby':     'orange',
  'orange tiger':     'orange',
  'red':              'orange',
  'red tabby':        'orange',
  'red tick':         'orange',
  'yellow':           'orange',
  'tan':              'cream',
  'tricolor':         'calico'
}

# lowercase
df_train['color'] = df_train['color'].str.lower().str.strip()

# feature engineering -> primary color 
df_train['primary_color'] = df_train['color'].astype(str).apply(
    lambda x: x.split('/')[0].strip() if '/' in x else x.strip()
)

# simplify synonymous colors if in map
df_train['primary_color'] = df_train['primary_color'].map(color_group_map).fillna(df_train['primary_color'])

df_train = df_train.drop(columns=['color'])


def freq_encode(df, col):
  # count frequencies
  freq_series = df[col].value_counts()

  # map frequencies back to the original column, replacing values
  df[col] = df[col].map(freq_series)

  return df

In [5]:
# cleaning intake type + condition
df_train = df_train[df_train['intake_type'] != 'Wildlife']


df_train['intake_condition'] = df_train['intake_condition'].replace({'Unknown': 'Unknown Condition / Other', 'Other': 'Unknown Condition / Other'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Space': 'Unknown Condition / Other'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Behavior': 'Normal / Behavior', 'Normal': 'Normal / Behavior'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Neonatal': 'Nursing / Neonatal', 'Nursing': 'Nursing / Neonatal'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Neurologic': 'Med Urgent', 'Agonal': 'Med Urgent', 'Parvo': 'Med Urgent'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Agonal': 'Med Urgent / Neurological'})
df_train['intake_condition'] = df_train['intake_condition'].replace({'Congenital': 'Sick'})



In [6]:
# One hot encode intake type
df = pd.get_dummies(df_train['intake_type'])
df_train = df_train.drop('intake_type', axis=1)
df_train = pd.concat([df_train, df], axis=1)
df_train.head()

Unnamed: 0,intake_time,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_type,is_mix,primary_color,Abandoned,Euthanasia Request,Owner Surrender,Public Assist,Stray
0,1436101140,Normal / Behavior,Dog,Spayed Female,8.0,English Springer Spaniel,Return to Owner,0,white,False,False,False,False,True
1,1460659380,Normal / Behavior,Dog,Intact Male,0.916667,Basenji,Return to Owner,1,sable,False,False,False,False,True
2,1652314980,Normal / Behavior,Cat,Neutered Male,2.0,Domestic Shorthair,Transfer,0,orange,False,False,False,True,False
3,1487421960,Normal / Behavior,Dog,Neutered Male,2.0,Labrador Retriever,Return to Owner,1,chocolate,False,False,True,False,False
4,1555408380,Normal / Behavior,Dog,Neutered Male,6.0,Great Dane,Return to Owner,1,black,False,False,False,True,False


In [7]:
# One hot encode intake condition
df = pd.get_dummies(df_train['intake_condition'])
df_train = df_train.drop('intake_condition', axis=1)
df_train = pd.concat([df_train, df], axis=1)
df_train.head()

Unnamed: 0,intake_time,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_type,is_mix,primary_color,Abandoned,Euthanasia Request,...,Feral,Injured,Med Attn,Med Urgent,Medical,Normal / Behavior,Nursing / Neonatal,Pregnant,Sick,Unknown Condition / Other
0,1436101140,Dog,Spayed Female,8.0,English Springer Spaniel,Return to Owner,0,white,False,False,...,False,False,False,False,False,True,False,False,False,False
1,1460659380,Dog,Intact Male,0.916667,Basenji,Return to Owner,1,sable,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1652314980,Cat,Neutered Male,2.0,Domestic Shorthair,Transfer,0,orange,False,False,...,False,False,False,False,False,True,False,False,False,False
3,1487421960,Dog,Neutered Male,2.0,Labrador Retriever,Return to Owner,1,chocolate,False,False,...,False,False,False,False,False,True,False,False,False,False
4,1555408380,Dog,Neutered Male,6.0,Great Dane,Return to Owner,1,black,False,False,...,False,False,False,False,False,True,False,False,False,False


In [8]:
# One hot encode sex upon intake
df = pd.get_dummies(df_train['sex_upon_intake'])
df_train = df_train.drop('sex_upon_intake', axis=1)
df_train = pd.concat([df_train, df], axis=1)
df_train.head()

Unnamed: 0,intake_time,animal_type,age_upon_intake,breed,outcome_type,is_mix,primary_color,Abandoned,Euthanasia Request,Owner Surrender,...,Normal / Behavior,Nursing / Neonatal,Pregnant,Sick,Unknown Condition / Other,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
0,1436101140,Dog,8.0,English Springer Spaniel,Return to Owner,0,white,False,False,False,...,True,False,False,False,False,False,False,False,True,False
1,1460659380,Dog,0.916667,Basenji,Return to Owner,1,sable,False,False,False,...,True,False,False,False,False,False,True,False,False,False
2,1652314980,Cat,2.0,Domestic Shorthair,Transfer,0,orange,False,False,False,...,True,False,False,False,False,False,False,True,False,False
3,1487421960,Dog,2.0,Labrador Retriever,Return to Owner,1,chocolate,False,False,True,...,True,False,False,False,False,False,False,True,False,False
4,1555408380,Dog,6.0,Great Dane,Return to Owner,1,black,False,False,False,...,True,False,False,False,False,False,False,True,False,False


In [9]:
print(df_train.shape)
print(df_train.head())

(111155, 28)
   intake_time animal_type  age_upon_intake                     breed  \
0   1436101140         Dog         8.000000  English Springer Spaniel   
1   1460659380         Dog         0.916667                   Basenji   
2   1652314980         Cat         2.000000        Domestic Shorthair   
3   1487421960         Dog         2.000000        Labrador Retriever   
4   1555408380         Dog         6.000000                Great Dane   

      outcome_type  is_mix primary_color  Abandoned  Euthanasia Request  \
0  Return to Owner       0         white      False               False   
1  Return to Owner       1         sable      False               False   
2         Transfer       0        orange      False               False   
3  Return to Owner       1     chocolate      False               False   
4  Return to Owner       1         black      False               False   

   Owner Surrender  ...  Normal / Behavior  Nursing / Neonatal  Pregnant  \
0            False  .

In [10]:
label_column = df_train.pop('outcome_type')
df_train.insert(df_train.shape[1], 'outcome_type', label_column)
cats = df_train[df_train['animal_type'] == 'Cat']
dogs = df_train[df_train['animal_type'] == 'Dog']

In [11]:
cats = cats.drop('animal_type', axis=1)
dogs = dogs.drop('animal_type', axis=1)

cats_data = cats.iloc[:, :-1]
cats_labels = cats.iloc[:, -1:]

dogs_data = dogs.iloc[:, :-1]
dogs_labels = dogs.iloc[:, -1:]

In [12]:
cats.head()

Unnamed: 0,intake_time,age_upon_intake,breed,is_mix,primary_color,Abandoned,Euthanasia Request,Owner Surrender,Public Assist,Stray,...,Nursing / Neonatal,Pregnant,Sick,Unknown Condition / Other,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown,outcome_type
2,1652314980,2.0,Domestic Shorthair,0,orange,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,Transfer
5,1634298000,0.5,Domestic Shorthair,0,brown tabby,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,Adoption
7,1592491980,0.076923,Domestic Shorthair,0,cream tabby,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,Adoption
8,1528703100,0.076923,Domestic Shorthair,1,black,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,Transfer
9,1470678720,0.416667,Domestic Shorthair,1,cream tabby,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,Transfer


In [13]:
# Working with Decision Trees
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [14]:
def print_runtime(start, end):
    total_time = end - start
    min = int(total_time // 60)
    sec = int(total_time % 60)
    print(f"Time taken: {min}:{sec:02d}")

In [15]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    df = freq_encode(df, 'primary_color')
    df = freq_encode(df, 'breed')

    return df

In [16]:
# THIS IS A TEST

df_data = cats.head(n=1000).iloc[:, :-1]
df_labels = cats.head(n=1000).iloc[:, -1:]

tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced') # it's balancced to handle class imbalances
pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', tree)
])


# make a matrix of HP values to tune
HP = {
    "DT__max_depth": [10, 25],
    "DT__max_features" : [None, 28],  
    "DT__min_samples_leaf": [5, 10]
    }

start = time.time()

griddy = GridSearchCV(estimator=pipe, param_grid=HP, cv=10, scoring='accuracy')
accs = cross_val_score(griddy, X=df_data, y=df_labels, cv=10)    
end = time.time()

print_runtime(start, end)

print('The best parameters for our model are: ', griddy.best_params_)
print('The best accuracies we obtained using the best hyperparameter values are: ', griddy.best_score_)
print('The Generalization accuracy of tuned, CV model is ', griddy.mean())

Time taken: 0:06


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# chat 

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
import pandas as pd

tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')

pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', tree)
])

# Hyperparameter grid
HP = {
    "DT__max_depth": [10, 12, 15, 17, 20, 25],
    "DT__max_features": [None, 5, 15, 20, 28],  
    "DT__min_samples_leaf": [5, 10, 20, 30, 40, 50]
}

# Grid search with cross-validation
griddy = GridSearchCV(estimator=pipe, param_grid=HP, cv=10, scoring='accuracy')

# Now actually fit the model
griddy.fit(df_data, df_labels.values.ravel())

# Output results
print('The best parameters for our model are:', griddy.best_params_)
print('The best accuracy we obtained using the best hyperparameter values is:', griddy.best_score_)

# Generalization accuracy using cross_val_score (optional)
accs = cross_val_score(griddy.best_estimator_, X=df_data, y=df_labels.values.ravel(), cv=10)
print('The generalization accuracy of the tuned CV model is:', accs.mean())

# Load test data
test_df = pd.read_csv('test.csv')  # Ensure the file is in your working directory

# Apply predictions using the best estimator from the grid search
test_predictions = griddy.predict(test_df)

# Save test predictions to CSV
df_test_output = pd.DataFrame({
  'Predicted_Label': test_predictions
})

csv_test_path = './test_predictions.csv'
df_test_output.to_csv(csv_test_path, index=False)
print(f'Test predictions saved to: {csv_test_path}')

The best parameters for our model are: {'DT__max_depth': 10, 'DT__max_features': 28, 'DT__min_samples_leaf': 5}
The best accuracy we obtained using the best hyperparameter values is: 0.561
The generalization accuracy of the tuned CV model is: 0.562


KeyError: 'primary_color'