# Pet Adoption Classification

## Problem Definition
Predict pet adoption speed. Adoption speed is expressed as ranges of the number of days (i.e., classes labeled 0, 1, 2, 3, 4).

In [None]:
# import necessary libraries
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# use seaborn's 'whitegrid' style for the plots drawn in this notebook
sns.set_style('whitegrid')

## Load Data

In [None]:
# read train.csv and test.csv from the working directory, then report each frame's (rows, columns)
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(df_train.shape, df_test.shape)

## Clean Up and Examine Dataset

In [None]:
# remove the RescuerID and PetID columns from both frames
df_train = df_train.drop(columns=['RescuerID', 'PetID'])
df_test = df_test.drop(columns=['RescuerID', 'PetID'])

In [None]:
# names of the columns that are treated as categorical features
cat_cols = [
    'Type',
    'Breed1', 'Breed2',
    'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized',
    'Health', 'Quantity', 'State',
]

In [None]:
# list the training frame's column labels, then preview its first five rows
print(df_train.columns)
df_train.head(n=5)

In [None]:
# list the test frame's column labels, then preview its first five rows
print(df_test.columns)
df_test.head(n=5)

## Missing Values

In [None]:
# per-column count of missing values, first for the train frame and then for the test frame
print(
    'Train:',
    df_train.isna().sum(),
    '',
    'Test:',
    df_test.isna().sum(),
    sep='\n',
)

## Feature Engineering

In [None]:
# count the distinct values found in every column, separately for each pet Type
# (the frame is df_train; the name `df` is not defined anywhere in this notebook)
print(df_train.groupby('Type').nunique())
print('')


In [None]:
# one-hot encode relevant categorical columns in train dataset
# The dummies are built from df_train itself (the name `df` is not defined in
# this notebook). Each dummy column is named '<prefix>_<value>'; the prefix is
# the source column name, except MaturitySize, which uses 'MatSize'.
dummy_prefixes = {col: col for col in cat_cols}
dummy_prefixes['MaturitySize'] = 'MatSize'

# one dummy frame per categorical column, in cat_cols order
train_dummies = [
    pd.get_dummies(df_train[col], prefix=prefix)
    for col, prefix in dummy_prefixes.items()
]

# append all dummy columns to the right of the existing columns; the original
# categorical columns are kept as they are
df_train = pd.concat([df_train] + train_dummies, axis=1)

In [None]:
# one-hot encode relevant categorical columns in test dataset
# The dummies are built from df_test itself (the name `df` is not defined in
# this notebook). Each dummy column is named '<prefix>_<value>'; the prefix is
# the source column name, except MaturitySize, which uses 'MatSize'.
dummy_prefixes = {col: col for col in cat_cols}
dummy_prefixes['MaturitySize'] = 'MatSize'

# one dummy frame per categorical column, in cat_cols order
# NOTE: get_dummies only creates columns for values present in df_test, so the
# resulting columns can differ from the training frame's dummy columns.
test_dummies = [
    pd.get_dummies(df_test[col], prefix=prefix)
    for col, prefix in dummy_prefixes.items()
]

# append all dummy columns to the right of the existing columns; the original
# categorical columns are kept as they are
df_test = pd.concat([df_test] + test_dummies, axis=1)

## Model Training

In [None]:
# import model utils, training, evaluation libraries

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

In [None]:
# feature columns used as model input, and the target column to predict
# (the Health_<n> names are expected to come from one-hot encoding the Health column)
X_columns = ['Age', 'Fee'] + ['Health_' + str(level) for level in (1, 2, 3)]
y_column = ['AdoptionSpeed']

In [None]:
# split the data using sklearn, then fit the model that the evaluation cells score
threshold = 0.8  # fraction of rows kept for training; the rest is the hold-out set
X = df_train[X_columns]
y = df_train[y_column]
# a fixed random_state makes the shuffled split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1.0 - threshold, shuffle=True, random_state=42
)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

# The evaluation and cross-validation cells read `model` and `y_pred`, but no
# visible cell creates them. Fit a baseline classifier here so those cells run.
# NOTE(review): RandomForestClassifier is one of the four imported models; swap
# in whichever estimator is intended.
model = RandomForestClassifier(n_estimators=100, random_state=42)
# y_train is a single-column DataFrame; ravel() yields the 1-D target array
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)

## Model Evaluation

In [None]:
# plot of predicted vs actual adoption speed class

In [None]:
# score the hold-out predictions: quadratic-weighted Cohen's kappa, then the confusion matrix
# (y_pred is expected to hold the predictions for X_test, produced in an earlier cell)
kappa = cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic')
print('kappa', round(kappa, 4))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Cross Validation
# Refit `model` on each of k folds of (X, y) and score the held-out fold with
# quadratic-weighted kappa. `model` is expected to be an sklearn-style
# estimator created in an earlier cell.
k = 10
results = []
kf = KFold(n_splits=k)  # default KFold: folds follow the row order, no shuffling
X_values = X.values
y_values = y.values.ravel()  # 1-D target for both fit and scoring
for train_index, test_index in kf.split(X_values):
    # fold-local names leave the hold-out split (X_train, X_test, y_train,
    # y_test, y_pred) from the earlier cells untouched
    model.fit(X_values[train_index], y_values[train_index])
    fold_pred = model.predict(X_values[test_index])
    fold_kappa = cohen_kappa_score(y_values[test_index], fold_pred, weights='quadratic')
    results.append(fold_kappa)

# round only when printing, so the mean and std come from the exact fold scores
print('Kappa for each fold:', [round(score, 4) for score in results])
print('AVG(kappa)', round(np.mean(results), 4))
print('STD(kappa)', round(np.std(results), 4))

## Submission