In [6]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np

# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split

# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB

# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
df = pd.read_csv("aac_shelter_outcomes.csv")
df.head()


Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown


In [8]:
# detect values that are NaN in each row
# there are no null values in our data
df.isnull().sum()

age_upon_outcome        8
animal_id               0
animal_type             0
breed                   0
color                   0
date_of_birth           0
datetime                0
monthyear               0
name                23886
outcome_subtype     42293
outcome_type           12
sex_upon_outcome        2
dtype: int64

In [9]:
df["outcome_type"].value_counts()

Adoption           33112
Transfer           23499
Return to Owner    14354
Euthanasia          6080
Died                 680
Disposal             307
Rto-Adopt            150
Missing               46
Relocate              16
Name: outcome_type, dtype: int64

In [10]:
df["outcome_type"].fillna("Adoption", inplace=True)

In [11]:
df["outcome_type"].isnull().sum()

0

In [12]:
df["outcome_subtype"].value_counts()

Partner                19660
Foster                  5558
SCRP                    3211
Suffering               2514
Rabies Risk             2417
Snr                      626
Aggressive               506
Offsite                  367
In Kennel                343
Medical                  259
In Foster                182
Behavior                 142
At Vet                    59
Enroute                   45
Underage                  28
Court/Investigation       18
In Surgery                16
Possible Theft             9
Barn                       3
Name: outcome_subtype, dtype: int64

In [13]:
df["outcome_subtype"].fillna("Partner", inplace=True)

In [14]:
df["outcome_subtype"].isnull().sum()

0

In [15]:
df["animal_type"].value_counts()

Dog          44242
Cat          29422
Other         4249
Bird           334
Livestock        9
Name: animal_type, dtype: int64

## fill missing values

we fill the missing values with the most frequently appeared data

we do not fill missing values for the name column because it doesn't make sense

In [16]:
df["age_upon_outcome"].value_counts()

1 year       14355
2 years      11194
2 months      9213
3 years       5157
3 months      3442
1 month       3344
4 years       2990
5 years       2691
4 months      2425
5 months      1951
6 months      1897
6 years       1810
8 years       1554
7 years       1537
3 weeks       1467
2 weeks       1330
10 months     1204
4 weeks       1194
8 months      1178
10 years      1159
7 months       963
9 years        822
9 months       673
12 years       609
1 weeks        513
11 months      490
11 years       429
1 week         427
13 years       389
14 years       253
3 days         235
2 days         217
15 years       208
1 day          153
6 days         152
4 days         136
5 days         116
16 years       101
0 years         95
5 weeks         61
17 years        58
18 years        26
19 years        13
20 years        12
22 years         4
25 years         1
Name: age_upon_outcome, dtype: int64

In [17]:
df["age_upon_outcome"].fillna("1 year", inplace=True)

In [18]:
df["sex_upon_outcome"].value_counts()

Neutered Male    27784
Spayed Female    25203
Intact Male       9549
Intact Female     9143
Unknown           6575
Name: sex_upon_outcome, dtype: int64

In [19]:
df["sex_upon_outcome"].fillna("Neutered Male", inplace=True)

In [20]:
df.isnull().sum()

age_upon_outcome        0
animal_id               0
animal_type             0
breed                   0
color                   0
date_of_birth           0
datetime                0
monthyear               0
name                23886
outcome_subtype         0
outcome_type            0
sex_upon_outcome        0
dtype: int64

## convert adoption column

In [21]:
dummy = pd.get_dummies(df["outcome_type"])
dummy.head()

Unnamed: 0,Adoption,Died,Disposal,Euthanasia,Missing,Relocate,Return to Owner,Rto-Adopt,Transfer
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0


In [22]:
dummy['Adoption'].value_counts()

0    45132
1    33124
Name: Adoption, dtype: int64

In [23]:
dummy['Adoption'] =  dummy['Adoption'] + dummy['Return to Owner']

In [24]:
dummy['Adoption'].value_counts()

1    47478
0    30778
Name: Adoption, dtype: int64

In [25]:
dummy['Adoption'] =  dummy['Adoption'] + dummy['Rto-Adopt'] 

In [26]:
dummy['Adoption'].value_counts()

1    47628
0    30628
Name: Adoption, dtype: int64

In [27]:
# convert the outcome column to a new column consisting of only 1 = adoption and 0 = not adopted
df = pd.concat([df, dummy["Adoption"]], axis = 1)

In [28]:
df

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome,Adoption
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male,0
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female,0
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,Partner,Adoption,Neutered Male,1
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male,0
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78251,1 month,A764894,Dog,Golden Retriever/Labrador Retriever,Brown/White,2017-12-04T00:00:00,2018-02-01T18:26:00,2018-02-01T18:26:00,,Foster,Adoption,Spayed Female,1
78252,3 years,A764468,Dog,Mastiff Mix,Blue/White,2014-12-30T00:00:00,2018-02-01T18:06:00,2018-02-01T18:06:00,Max,Partner,Adoption,Neutered Male,1
78253,1 year,A766098,Other,Bat Mix,Brown,2017-02-01T00:00:00,2018-02-01T18:08:00,2018-02-01T18:08:00,,Rabies Risk,Euthanasia,Unknown,0
78254,2 months,A765858,Dog,Standard Schnauzer,Red,2017-11-13T00:00:00,2018-02-01T18:32:00,2018-02-01T18:32:00,,Partner,Adoption,Spayed Female,1


From previous value_counts, we could see that there are no **outliers**, wrong input data

There are no data that needs to be **centered and scaled**, since we have all categorical values

No **data transformation** needed

## We chose "age_upon_outcome", "animal_type", "outcome_subtype", "sex_upon_outcome" as predictors

Age - because younger animals tend to be more easily adopted compared to animals that were 25 years old


Animal_type - because animals such as bats, livestock .. etc tend to be less adopted


outcome_subtype - because animals with outcomes such as rabies or diseases tend to be less adopted


sex_upon_outcome - this is debatable, because I am not exactly sure if the gender really affects the adoption rate

name - whether an animal has a name or not may also affect the adoption result, but it doesnt seem to affect as significant as other columns so we decide to leave it out

## Deal with labels

In [29]:
def label_encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        #le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed

In [30]:
features = df[['age_upon_outcome','animal_type','outcome_subtype','sex_upon_outcome']]

label_encode(features, features.columns.values)

target = df[['Adoption']]
label_encode(target, target.columns.values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [31]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.33, random_state = 10)


In [32]:
features

Unnamed: 0,age_upon_outcome,animal_type,outcome_subtype,sex_upon_outcome
0,20,1,12,1
1,5,2,12,3
2,5,2,12,2
3,45,2,12,2
4,34,4,14,4
...,...,...,...,...
78251,2,2,6,3
78252,28,2,12,2
78253,5,4,14,4
78254,19,2,12,3


## Naive Bayes

In [33]:
nb = GaussianNB()
nb.fit(features_train, target_train)
target_pred = nb.predict(features_test)
target_pred

  y = column_or_1d(y, warn=True)


array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [34]:
from sklearn import metrics
print(metrics.accuracy_score(target_test, target_pred))

0.8049951597289449


In [35]:
print(metrics.roc_auc_score(target_test, target_pred))

0.764625172951617


## SVM

In [31]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.20, random_state=1)

In [32]:
from sklearn import svm

# instantiate model
model = svm.SVC() 

# fit model
model.fit(features_train, target_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [35]:
# make class predictions for the testing set
target_pred = model.predict(features_test)
target_pred

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [34]:
# calculate accuracy
from sklearn import metrics
print(metrics.accuracy_score(target_test, target_pred))

0.8473677485305392


In [36]:
# IMPORTANT: first argument is true values, second argument is predicted probabilities
print(metrics.roc_auc_score(target_test, target_pred))

0.8195881775885233


## KNN

In [74]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.20, random_state=1)

In [75]:
from sklearn.neighbors import KNeighborsClassifier

# instantiate model
model = KNeighborsClassifier(n_neighbors=2)

# fit model
model.fit(features_train, target_train)

  import sys


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [76]:
# make class predictions for the testing set
target_pred = model.predict(features_test)
target_pred

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [77]:
# calculate accuracy
from sklearn import metrics
print(metrics.accuracy_score(target_test, target_pred))

0.7048939432660363


In [78]:
# IMPORTANT: first argument is true values, second argument is predicted probabilities
print(metrics.roc_auc_score(target_test, target_pred))

0.7246527666357836


## Random forest

In [42]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.20, random_state=1)

In [43]:
from sklearn.ensemble import RandomForestClassifier

# instantiate model
model = RandomForestClassifier(random_state=1, max_depth=10)

# fit model
model.fit(features_train, target_train)

  import sys


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [44]:
# make class predictions for the testing set
target_pred = model.predict(features_test)
target_pred

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

In [45]:
# calculate accuracy
from sklearn import metrics
print(metrics.accuracy_score(target_test, target_pred))

0.8450038333759264


In [46]:
# IMPORTANT: first argument is true values, second argument is predicted probabilities
print(metrics.roc_auc_score(target_test, target_pred))

0.816891735516526


**Let's see how well our model will hold up with k-fold cross validation**

In [47]:
from sklearn.model_selection import KFold # import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict

In [50]:
# Retrain model on the whole dataset
model.fit(features, target)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [51]:
# Perform 10-fold cross validation
kf = KFold(n_splits=10, random_state=1, shuffle=False)
scores = cross_val_score(model, features, target, cv=kf)
print('Cross-validated scores:', scores)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Cross-validated scores: [0.84040378 0.84002044 0.8446205  0.84692052 0.850115   0.8439816
 0.85060703 0.84920128 0.8572524  0.84741214]


In [52]:
print(scores.mean())

0.8470534688582092


In [54]:
# Make cross validated predictions
pred = cross_val_predict(model, features, target, cv=kf)
pred

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

## Accuracy :

## Navie Bayes  


accuracy score : 0.8049951597289449


roc score      : 0.764625172951617
               
               
## SVM           

accuracy score : 0.8473677485305392


roc score      : 0.81958817758852337
               
               
## KNN          

accuracy score : 0.7048939432660363


roc score      : 0.8229036944047988
               
               
## Random Forest 

accuracy score : 0.8450038333759264


roc score      : 0.816891735516526


10-fold cross validation score: 0.8470534688582092

**The highest score we could get would be using the SVM or Random Forest or 10 fold validation method**

SVM takes a lot of computation time, so random forest is recommended

FIrst, naive bayes is not considered to be a appropriate model here because it is very likely that according to different animals there may be different traits (outcome subtypes) for each individual and naive bayes would not be able to react to new categorical columns. Naive Bayes is bad when facing new unknown data.


I would say the SVM model is not the best, because it has a relatively low roc score and it is extremely slow, takes a long time to compile.
 

The KNN model has a  nice performace with the ROC score, and may also be calculated fast. But one major weakness is that it is hard to calculate the differences between categorical data.

In conclusion, I would consider the Random Forest model to be my best model, it doesn't take super long calculation time and is also accurate and precise with many different categorical datas especially if they have low correlation.