In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import evaluate as ev
import prepare as prep
import acquire as ac

# importing machine learning libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import LabelEncoder
# set seed number to be used for all random states so my results can be duplicated
seed=100

In [99]:
a = ac.animals()

In [3]:
a.head()

Unnamed: 0,name,outcome_type,animal_type,color,age,gender,neut_spay,condition,breed1,breed2
0,False,Adoption,Dog,Brown/White,180,male,True,Normal,Other,purebred
1,True,Adoption,Dog,Tan,730,male,True,Normal,Pit Bull,mix
2,True,Adoption,Dog,Other,120,male,True,Normal,Labrador Retriever,purebred
3,False,Transfer,Cat,Brown Tabby/White,1095,unk,False,Normal,Domestic Shorthair,mix
4,False,Transfer,Cat,Black/White,1095,unk,False,Normal,Domestic Shorthair,mix


In [4]:
train, val, test = prep.train_val_test(a, 'outcome_type', stratify=True)

(72426, 10) (15520, 10) (15521, 10)


In [5]:
list(a.select_dtypes('number').columns)

['age']

In [6]:
train, val, test = prep.scale(train, val, test, scaled_cols=['age'])

In [7]:
train.head()

Unnamed: 0,name,outcome_type,animal_type,color,age,gender,neut_spay,condition,breed1,breed2
102324,False,Transfer,Dog,White,1.0,female,False,Normal,Chihuahua Shorthair,mix
61026,True,Adoption,Dog,Tan,1.0,male,True,Normal,Australian Cattle Dog,Cardigan Welsh Corgi
44543,True,Adoption,Dog,Tricolor,0.314554,female,True,Normal,Chihuahua Shorthair,mix
49660,True,Return to Owner,Dog,Other,1.0,male,True,Normal,Jack Russell Terrier,Chihuahua Shorthair
72159,True,Adoption,Dog,Brown,0.314554,female,True,Normal,Other,mix


In [8]:
train, val, test = prep.dummies(train, val, test, drop_first=['name', 'gender', 'neut_spay', 'condition'],
                                normal_list=['animal_type','color', 'breed1', 'breed2'])

print(train.shape, val.shape, test.shape)

(72426, 89) (15520, 89) (15521, 89)
(72426, 89) (15520, 89) (15521, 89)


In [9]:
train.head()

Unnamed: 0,outcome_type,age,name_True,gender_male,gender_unk,neut_spay_True,condition_Normal,animal_type_Bird,animal_type_Cat,animal_type_Dog,...,breed2_Other,breed2_Pit Bull,breed2_Pointer,breed2_Pug,breed2_Rat Terrier,breed2_Rottweiler,breed2_Tan Hound,breed2_Yorkshire Terrier,breed2_mix,breed2_purebred
102324,Transfer,1.0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
61026,Adoption,1.0,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
44543,Adoption,0.314554,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
49660,Return to Owner,1.0,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
72159,Adoption,0.314554,1,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [10]:
x_train, y_train = prep.split_xy(train,'outcome_type')  # split data to remove the target variable churn
x_val, y_val = prep.split_xy(val,'outcome_type')
x_test, y_test = prep.split_xy(test,'outcome_type')

print(x_train.shape, x_val.shape, x_test.shape)

(72426, 88) (15520, 88) (15521, 88)


In [11]:
def preprocess(dataframe):
    a = dataframe.copy()
    train, val, test = prep.train_val_test(a, 'outcome_type', stratify=True)
    train, val, test = prep.scale(train, val, test, scaled_cols=['age'])
    train, val, test = prep.dummies(train, val, test, drop_first=['name', 'gender', 'neut_spay', 'condition'],
                                                     normal_list=['animal_type','color', 'breed1', 'breed2'])
    x_train, y_train = prep.split_xy(train,'outcome_type')  # split data to remove the target variable churn
    x_val, y_val = prep.split_xy(val,'outcome_type')
    x_test, y_test = prep.split_xy(test,'outcome_type')
    return x_train, y_train, x_val, y_val, x_test, y_test, train, val, test

In [12]:
x_train, y_train, x_val, y_val, x_test, y_test, train, val, test = preprocess(a)

(72426, 10) (15520, 10) (15521, 10)
(72426, 89) (15520, 89) (15521, 89)


# All animals

In [13]:
res = ev.baseline_classification(train, 'outcome_type', 'Adoption')

Baseline accuracy is: 43.38%.
Baseline recall is: 100.0%.
Baseline precision is: 43.38%.



In [14]:
res.head()

Unnamed: 0,outcome_type,baseline
102324,Transfer,Adoption
61026,Adoption,Adoption
44543,Adoption,Adoption
49660,Return to Owner,Adoption
72159,Adoption,Adoption


In [15]:
clf = DecisionTreeClassifier(random_state=100)
clf.fit(x_train, y_train)
print(clf.score(x_train, y_train))
print(clf.score(x_val, y_val))

0.7851876397978627
0.6933634020618556


In [16]:
preds = clf.predict(x_train)

In [17]:
res['preds'] = preds

In [18]:
rf1 = RandomForestClassifier(random_state=seed)
ev.train_model(rf1, x_train, y_train, x_val, y_val)

0.7851876397978627
0.6978737113402061


In [19]:
knn = KNeighborsClassifier()
ev.train_model(knn, x_train, y_train, x_val, y_val)

0.7148123602021373
0.6735180412371135


In [20]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.fit_transform(y_val)

In [21]:
y_train_encoded

array([5, 0, 0, ..., 5, 5, 0])

In [22]:
xgb = XGBClassifier()

ev.train_model(xgb, x_train, y_train_encoded, x_val, y_val_encoded)

0.7309529726893657
0.7177835051546392


In [23]:
poly = PolynomialFeatures()
x_train_s = poly.fit_transform(x_train)
x_val_s = poly.transform(x_val)

In [24]:
clf = DecisionTreeClassifier(random_state=100)

ev.train_model(rf1, x_train_s, y_train, x_val_s, y_val)

0.7851876397978627
0.6990979381443299


# Dogs

In [71]:
def preprocess_dog(dataframe):
    a = dataframe.copy()
    train, val, test = prep.train_val_test(a, 'outcome_type', stratify=True)
    train, val, test = prep.scale(train, val, test, scaled_cols=['age'])
    train, val, test = prep.dummies(train, val, test, drop_first=['name', 'gender', 'neut_spay', 'condition'],
                                                     normal_list=['animal_type', 'breed1', 'breed2'])
    x_train, y_train = prep.split_xy(train,'outcome_type')  # split data to remove the target variable churn
    x_val, y_val = prep.split_xy(val,'outcome_type')
    x_test, y_test = prep.split_xy(test,'outcome_type')
    return x_train, y_train, x_val, y_val, x_test, y_test, train, val, test

In [100]:
bird = a[a.animal_type == 'Bird']
dog = a[a.animal_type == 'Dog']
cat = a[a.animal_type == 'Cat']
other = a[(a.animal_type == 'Livestock') | (a.animal_type == 'Other')]

In [73]:
dog = dog.drop(columns='color')

In [74]:
x_train, y_train, x_val, y_val, x_test, y_test, train, val, test = preprocess_dog(dog)

(41300, 9) (8850, 9) (8850, 9)
(41300, 59) (8850, 59) (8850, 59)


In [75]:
res = ev.baseline_classification(train, 'outcome_type', 'Adoption')

Baseline accuracy is: 46.31%.
Baseline recall is: 100.0%.
Baseline precision is: 46.31%.



In [76]:
clf = DecisionTreeClassifier(random_state=100)
ev.train_model(clf, x_train, y_train, x_val, y_val)

0.6753268765133172
0.6274576271186441


In [77]:
rf1 = RandomForestClassifier(random_state=seed)
ev.train_model(rf1, x_train, y_train, x_val, y_val)

0.6753026634382566
0.628361581920904


In [78]:
knn = KNeighborsClassifier()
ev.train_model(knn, x_train, y_train, x_val, y_val)

0.6064406779661017
0.5833898305084746


In [79]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.fit_transform(y_val)

xgb = XGBClassifier()

ev.train_model(xgb, x_train, y_train_encoded, x_val, y_val_encoded)

0.6516707021791768
0.639774011299435


# Cats

In [80]:
def preprocess_cat(dataframe):
    a = dataframe.copy()
    train, val, test = prep.train_val_test(a, 'outcome_type', stratify=True)
    train, val, test = prep.scale(train, val, test, scaled_cols=['age'])
    train, val, test = prep.dummies(train, val, test, drop_first=['name', 'gender', 'neut_spay', 'condition'],
                                                     normal_list=['animal_type', 'breed1', 'breed2'])
    x_train, y_train = prep.split_xy(train,'outcome_type')  # split data to remove the target variable churn
    x_val, y_val = prep.split_xy(val,'outcome_type')
    x_test, y_test = prep.split_xy(test,'outcome_type')
    return x_train, y_train, x_val, y_val, x_test, y_test, train, val, test

In [82]:
cat = cat.drop(columns='color')

In [84]:
x_train, y_train, x_val, y_val, x_test, y_test, train, val, test = preprocess_cat(cat)

(26964, 9) (5778, 9) (5778, 9)
(26964, 16) (5778, 16) (5778, 16)


In [33]:
train.head()

Unnamed: 0,outcome_type,age,name_True,gender_male,gender_unk,neut_spay_True,condition_Normal,animal_type_Cat,color_Black,color_Black/Brown,...,color_White/Black,color_White/Brown,breed1_Domestic Longhair,breed1_Domestic Medium Hair,breed1_Domestic Shorthair,breed1_Other,breed1_Siamese,breed2_Other,breed2_mix,breed2_purebred
79345,Transfer,0.028169,1,0,0,1,1,1,0,0,...,0,0,0,0,1,0,0,0,1,0
82764,Adoption,1.0,1,0,0,1,1,1,0,0,...,0,0,0,0,1,0,0,0,1,0
40071,Adoption,1.0,1,1,0,1,1,1,0,0,...,1,0,0,0,1,0,0,0,1,0
89761,Transfer,0.314554,1,0,0,0,1,1,1,0,...,0,0,0,0,1,0,0,0,1,0
19646,Adoption,0.0,0,1,0,1,1,1,0,0,...,0,0,0,0,1,0,0,0,1,0


In [85]:
res = ev.baseline_classification(train, 'outcome_type', 'Adoption')

Baseline accuracy is: 44.64%.
Baseline recall is: 0.0%.
Baseline precision is: 0.0%.



In [86]:
#x_train = x_train.drop(columns='color_Tricolor')

In [87]:
clf = DecisionTreeClassifier(random_state=100)
ev.train_model(clf, x_train, y_train, x_val, y_val)

0.8246180091974484
0.8153340256143995


In [88]:
rf1 = RandomForestClassifier(random_state=seed)
ev.train_model(rf1, x_train, y_train, x_val, y_val)

0.8246180091974484
0.8156801661474559


In [89]:
knn = KNeighborsClassifier()
ev.train_model(knn, x_train, y_train, x_val, y_val)

0.8090787716955942
0.8068535825545171


In [90]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.fit_transform(y_val)

xgb = XGBClassifier()

ev.train_model(xgb, x_train, y_train_encoded, x_val, y_val_encoded)

0.8238762794837561
0.8161993769470405


# Bird

In [101]:
bird = bird.drop(columns='color')

In [102]:
x_train, y_train, x_val, y_val, x_test, y_test, train, val, test = preprocess_dog(bird)

(334, 9) (72, 9) (72, 9)
(334, 10) (72, 10) (72, 10)


In [103]:
res = ev.baseline_classification(train, 'outcome_type', 'Adoption')

Baseline accuracy is: 36.83%.
Baseline recall is: 100.0%.
Baseline precision is: 36.83%.



In [104]:
clf = DecisionTreeClassifier(random_state=100)
ev.train_model(clf, x_train, y_train, x_val, y_val)

0.718562874251497
0.5138888888888888


In [105]:
rf1 = RandomForestClassifier(random_state=seed)
ev.train_model(rf1, x_train, y_train, x_val, y_val)

0.718562874251497
0.5694444444444444


In [106]:
knn = KNeighborsClassifier()
ev.train_model(knn, x_train, y_train, x_val, y_val)

0.6586826347305389
0.5694444444444444


In [107]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.fit_transform(y_val)

xgb = XGBClassifier()

ev.train_model(xgb, x_train, y_train_encoded, x_val, y_val_encoded)

0.7155688622754491
0.5277777777777778


# Other

In [64]:
other = other.drop(columns='color')

In [66]:
def preprocess_other(dataframe):
    a = dataframe.copy()
    train, val, test = prep.train_val_test(a, 'outcome_type', stratify=True)
    train, val, test = prep.scale(train, val, test, scaled_cols=['age'])
    train, val, test = prep.dummies(train, val, test, drop_first=['name', 'gender', 'neut_spay', 'condition'],
                                                     normal_list=['animal_type', 'breed1', 'breed2'])
    x_train, y_train = prep.split_xy(train,'outcome_type')  # split data to remove the target variable churn
    x_val, y_val = prep.split_xy(val,'outcome_type')
    x_test, y_test = prep.split_xy(test,'outcome_type')
    return x_train, y_train, x_val, y_val, x_test, y_test, train, val, test

In [92]:
x_train, y_train, x_val, y_val, x_test, y_test, train, val, test = preprocess_other(other)

(3828, 9) (820, 9) (821, 9)
(3828, 14) (820, 14) (821, 14)


In [68]:
res = ev.baseline_classification(train, 'outcome_type', 'Adoption')

Baseline accuracy is: 70.45%.
Baseline recall is: 0.0%.
Baseline precision is: 0.0%.



In [69]:
res.head()

Unnamed: 0,outcome_type,baseline
40662,Euthanasia,Euthanasia
41788,Euthanasia,Euthanasia
3490,Adoption,Euthanasia
49616,Euthanasia,Euthanasia
13071,Adoption,Euthanasia


In [62]:
#x_train = x_train.drop(columns=['color_Tortie','color_Tan/White', 'color_Tan', 'color_Tan/White', 'color_Black/Tan'])
#x_val = x_val.drop(columns=['color_Tan/White', 'color_Tan', 'color_Tortie','color_Tan/White', 'color_Black/Tan'])

In [70]:
clf = DecisionTreeClassifier(random_state=100)
ev.train_model(clf, x_train, y_train, x_test, y_test)

0.9098746081504702
0.881851400730816


In [93]:
clf = DecisionTreeClassifier(random_state=100)
ev.train_model(clf, x_train, y_train, x_val, y_val)

0.9098746081504702
0.8853658536585366


In [94]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.fit_transform(y_val)

xgb = XGBClassifier()

ev.train_model(xgb, x_train, y_train_encoded, x_val, y_val_encoded)

0.9096133751306165
0.8865853658536585
