### Load Packages

In [1]:
# Reading in, manipulations
import pandas as pd
import numpy as np
import re

# Plotting
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt

# Modeling
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier

#### Set a theme

In [2]:
pio.templates.default = "plotly_white"
my_color_scheme = [
    ['#890000','#890000','#5c0000'],
    ['#2a6b28','#0b4c07','#003206'],
    ['#4f5a90','#374798','#30375a'],
    ['#fff4b1','#ffed86','#ffdb00']
]

### Read data and manipulate it

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

Convert from CamelCase to snake

In [4]:
def to_snake(camel_case_str):
    return ''.join(['_' + i.lower() if i.isupper() else i for i in camel_case_str]).lstrip('_')
df_train.columns = [to_snake(camel_str) for camel_str in df_train.columns]
df_test.columns = [to_snake(camel_str) for camel_str in df_test.columns]

#### Add names
The data is manually inputted and probably not validated, therefore I'm  
applying some changes and adding new variables:

1. Dates
2. Age in days and age in years
3. Sex (M, F, unknown)
4. Has Name (no name versus named)
5. Mix (is mix versus not a mix)
6. Intactness (intact versus not versus unknown)
7. Color


Add dates and time measures

In [5]:
df_train['date_time'] = pd.to_datetime(df_train['date_time'])
df_test['date_time'] = pd.to_datetime(df_test['date_time'])

df_train['date'] = df_train['date_time'].dt.date
df_test['date'] = df_test['date_time'].dt.date

In [6]:
df_train['time'] = df_train['date_time'].dt.time
df_test['time'] = df_test['date_time'].dt.time
df_train['day_of_week'] = df_train['date_time'].dt.day_name()
df_train['hour'] = df_train['date_time'].dt.hour

Add months

In [14]:
df_train['month'] = df_train['date_time'].dt.month
df_test['month'] = df_test['date_time'].dt.month
df_train['year_month'] = df_train['date_time'].dt.strftime("%Y/%m")
df_test['year_month'] = df_test['date_time'].dt.strftime("%Y/%m")

Convert the age of pets in days

In [7]:
def convert_to_days(age_str):
    if pd.isna(age_str):
        return None
    match = re.match(r'(\d+)\s*(year|month|week|day)s?', str(age_str))
    if match:
        value, unit = match.groups()
        value = int(value)
        if unit == 'year':
            return value * 365
        elif unit == 'month':
            return value * 30
        elif unit == 'week':
            return value * 7
        elif unit == 'day':
            return value
    return None 

df_train['age_in_days'] = df_train['ageupon_outcome'].apply(convert_to_days)
df_train['age_in_years'] = df_train['age_in_days'] / 365

df_test['age_in_days'] = df_test['ageupon_outcome'].apply(convert_to_days)
df_test['age_in_years'] = df_test['age_in_days'] / 365

Extract sex

In [8]:
df_train['sex'] = df_train['sexupon_outcome'].str.split().str[-1]
df_train['reproduction'] = df_train['sexupon_outcome'].str.split().str[0].fillna('Unknown')

df_test['sex'] = df_test['sexupon_outcome'].str.split().str[-1]
df_test['reproduction'] = df_test['sexupon_outcome'].str.split().str[0].fillna('Unknown')

Simplify color

In [9]:
df_train['simple_color'] = df_train['color'].apply(lambda x: x.split('/')[0])
df_test['simple_color'] = df_test['color'].apply(lambda x: x.split('/')[0])

Extract mix from breed

In [10]:
df_train['is_mix'] = df_train['breed'].str.contains('mix', case=False, na = False).map({True: 1, False: 0})
df_test['is_mix'] = df_test['breed'].str.contains('mix', case=False, na = False).map({True: 1, False: 0})

Add name flag

In [34]:
df_train['has_name'] = df_train.name.notna()
df_test['has_name'] = df_test.name.notna()

And finally, let's see how many babies and adult animals we have.  
This is a bit hard to derive, as this differes by animal and by breed. Also it's a process not an outcome.  

Will make it simple for this analysis and have 3 categories:
- newborn (8 weeks)
- baby (up to 1 year)
- adults (from 1 year or more)

In [39]:
newborn_max_age = 8 * 7  # 8 weeks converted to days
baby_max_age = 365       # 1 year converted to days

df_train['age_category'] = pd.cut(df_train['age_in_days'], bins = [0, newborn_max_age, baby_max_age, float('inf')],
                                  labels = ['newborn', 'baby', 'adult'], right = False)

Now, let's analyze!

In [12]:
df_animals_date = df_train.groupby(['date', 'animal_type']).size().reset_index(name = 'count')
px.line(df_animals_date, x = 'date', y = 'count', color = 'animal_type')

* The data seems to be more tracked from end of 2013 until end of 2016.   
* The animal shelter gets both cats and dogs, and more likely to shelter dogs. 
* Interestingly, there were some peaks when a lot of cats had to be sheltered.

In [49]:
px.box(df_train, x = 'animal_type', y = 'age_in_years', log_y = True) 

Most sheltered animals are young, especially the cats.


In [44]:
df_animals_date = df_train.groupby(['year_month', 'animal_type', 'age_category']).size().reset_index(name = 'count')
px.line(df_animals_date, x = 'year_month', y = 'count', color = 'age_category', facet_col = 'animal_type')

* That's because most cats are newborns or babies.
* Most dogs that end up in shelters are already adults.
* There are some peaks, most likely because of transfers?

In [51]:
df_animals_date = df_train.groupby(['year_month', 'animal_type', 'age_category', 'outcome_type']).size().reset_index(name = 'count')
px.line(df_animals_date, x = 'year_month', y = 'count', color = 'outcome_type', facet_col = 'age_category', facet_row = 'animal_type', log_y = True)

In [35]:
df_animal_name = df_train.groupby(['animal_type', 'has_name', 'outcome_type']).size().reset_index(name = 'count')
px.bar(df_animal_name, x = 'has_name', y = 'count', color = 'outcome_type', facet_col = 'animal_type')

* More than 50% of the cats have no name, whereas dogs are less likely to not have one.
* Dogs with names are more likely to be returned to their owner.
* Cats with names are more likely to be adopted. There must be something more to that...

In [36]:
df_animal_name = df_train.groupby(['animal_type', 'has_name', 'outcome_type', 'reproduction']).size().reset_index(name = 'count')
px.bar(df_animal_name, x = 'has_name', y = 'count', color = 'outcome_type', facet_col = 'animal_type', facet_row = 'reproduction')

In [None]:
df_count = df_train.groupby(['date', 'outcome_type']).size().reset_index(name = 'count')
fig = px.scatter(df_count, x = 'date', y = 'count', color = 'outcome_type', title = 'Count of Rows by Date and Target')
fig.update_layout(yaxis_type='log')

In [None]:
# Outcomes for cats and dogs.
df_outcome_cnt = df_train.groupby(['outcome_subtype', 'animal_type']).size().reset_index(name = 'count')
df_outcome_cnt['proportion'] = df_outcome_cnt.groupby('animal_type')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(df_outcome_cnt, x = 'animal_type', y = 'proportion', color='outcome_subtype', title='Count of Rows by Date and Target')
fig.show()

In [None]:
df_outcome_cnt = df_train.groupby(['outcome_type', 'animal_type']).size().reset_index(name = 'count')
df_outcome_cnt['proportion'] = df_outcome_cnt.groupby('animal_type')['count'].transform(lambda x: x / x.sum() * 100)

In [21]:
px.box(df_train, x = 'outcome_type', y = 'age_in_years')

* Most adopted pets are young, a huge part being babies. 
* When it comes to pets returned to owners, they are more likely already adults.

In [26]:
px.box(df_train, x = 'has_name', y = 'age_in_years', facet_col = 'animal_type')

* Also, it is more likely that pets without name are also younger. 
* For cats, they are much younger, because in general they tend to be younger.

In [53]:
px.histogram(df_train, x = 'hour', facet_row = 'outcome_type', facet_col = 'animal_type', height = 1000, width = 800)

* Adoption is more likely to happen in the late hours.
* A lot of cat transfers happened in the early hours. Did this happen at a particular time?

In [26]:
df_transfers = df_train[df_train['outcome_type'] == 'Transfer'].groupby(['year_month', 'animal_type']).size().reset_index(name = 'count')
px.line(df_transfers, x = 'year_month', y = 'count', color = 'animal_type')

There were some transfer peaks for cats in June...  
Not sure if it was by chance or particularly during that time period there are more transfers?

In [28]:
df_transfers_cats = df_train[(df_train['outcome_type'] == 'Transfer') & (df_train['animal_type'] == 'Cat')].groupby(['year_month', 'hour']).size().reset_index(name = 'count')
px.imshow(df_transfers_cats.pivot("year_month", "hour", "count"),
            labels=dict(x="hour", y="year_month", count="Count"),
            title="Day of Week vs Time of Day Heatmap")


In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.



* So it is true, they do tend to happen early in the morning.
* Nevertheless, we should not use these timestamps for the probabilistic model, as it would act as a data leakage. In reality, we do not know when a pet will be transferred or adopted.

### Modeling with Random Forest
1. Fill NAs
2. One-hot encode

1. Filling NAs

In [None]:
px.histogram(df_train, x = 'age_in_years')

Will use the median for imputation.

In [137]:
df_train['age_in_years'].fillna(df_train['age_in_years'].median(), inplace = True)

2. Finally, one-hot encoding ...

In [138]:
df_dummies = pd.get_dummies(df_train[['sex', 'reproduction', 'simple_color', 'animal_type']], drop_first = True)
df_train = pd.concat([df_dummies, df_train], axis = 1)

df_test_dummies = pd.get_dummies(df_test[['sex', 'reproduction', 'simple_color', 'animal_type']], drop_first = True)
df_test = pd.concat([df_test_dummies, df_test], axis = 1)

In [139]:
features = ['is_mix', 'sex_Male', 'has_name', 'age_in_years', 'reproduction_Unknown', 'animal_type_Dog']
X = df_train[features].values.reshape(-1, 6)
y = df_train['outcome_type']
X_test = df_test[features].values.reshape(-1, 6)

In [140]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size =  0.2)

In [None]:
X_train.shape, y_train.shape

Modeling in Random Forest ...

In [141]:
model = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
result = cross_val_score(model, X_train, y_train, cv = cv, scoring = 'roc_auc', n_jobs = -1, verbose = 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.7s finished


In [143]:
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.6001028854697658