In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.dummy
import sklearn.metrics
import sklearn.model_selection
import tqdm

# Intro
This challenge is a fairly simple multiclass classification challenge. The goal is to predict the probabilities of each type of outcome for an animal shelter pet based upon basic infromation about the pet.

[Here is the link to the Kaggle competition page.](https://www.kaggle.com/c/shelter-animal-outcomes)

In [2]:
train_df = pd.read_csv('data/train.csv.gz')
test_df = pd.read_csv('data/test.csv.gz')
train_target = train_df.pop('OutcomeType')

# dog named "Diego" in training set has NaN for Sex,
# replace with "Male"
train_df.loc[3174, ['SexuponOutcome']] = 'Male'

# Removing unfair features
The rules of the competition state that “Your model should only use information which was available prior to the time for which it is forecasting.” Accordingly, we remove outcome information from the features.

In [3]:
def get_features(df):
    res = df.copy()
    res['Sex'] = 'Male'
    res.loc[res['SexuponOutcome'].str.contains('Female')] = 'Female'
    features = ['Name', 'AnimalType', 'Breed', 'Color', 'Sex']
    res = res[features].copy()
    return res

In [4]:
train_df = get_features(train_df)
test_df = get_features(test_df)

In [5]:
train_df.head()

Unnamed: 0,Name,AnimalType,Breed,Color,Sex
0,Hambone,Dog,Shetland Sheepdog Mix,Brown/White,Male
1,Female,Female,Female,Female,Female
2,Pearce,Dog,Pit Bull Mix,Blue/White,Male
3,,Cat,Domestic Shorthair Mix,Blue Cream,Male
4,,Dog,Lhasa Apso/Miniature Poodle,Tan,Male


In [6]:
train_target.head()

0    Return_to_owner
1         Euthanasia
2           Adoption
3           Transfer
4           Transfer
Name: OutcomeType, dtype: object

# Defining a baseline model

In [7]:
# define a "dummy" model that just predicts the averages over the training set
model = sklearn.dummy.DummyClassifier(strategy='prior')
cv = sklearn.model_selection.StratifiedKFold(10, shuffle=True, random_state=0)
dummy_logloss = -np.mean(sklearn.model_selection.cross_val_score(
    model, train_df, train_target, cv=cv, scoring='neg_log_loss'))
print(f'Dummy model gets a logloss of {dummy_logloss:.3f} in cross-validation')

Dummy model gets a logloss of 1.243 in cross-validation
