In [1]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np

# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split

# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB

# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("aac_shelter_outcomes.csv")
#df.head(100)

In [3]:
df.isnull().sum()

age_upon_outcome        8
animal_id               0
animal_type             0
breed                   0
color                   0
date_of_birth           0
datetime                0
monthyear               0
name                23886
outcome_subtype     42293
outcome_type           12
sex_upon_outcome        2
dtype: int64

In [4]:
# for the missing values in name i am going to assign animals with null names the value of "no name"
# null values in outcome_subtype are the same as not applicable values so i will replace them with that
# null values in sex_upon_outcome, age_upon_outcome and outcome_type are unknown values so i replace them with unknown
df["name"].fillna("No Name", inplace=True)
df["outcome_subtype"].fillna("Not Applicable", inplace=True)
df["sex_upon_outcome"].fillna("Unknown", inplace=True)
df["age_upon_outcome"].fillna("Unknown", inplace=True)
df["outcome_type"].fillna("Unknown", inplace=True)
df.isnull().sum()

age_upon_outcome    0
animal_id           0
animal_type         0
breed               0
color               0
date_of_birth       0
datetime            0
monthyear           0
name                0
outcome_subtype     0
outcome_type        0
sex_upon_outcome    0
dtype: int64

In [5]:
# there are no continuous values so i dont have to scale, center and check for outliers in the data

# now i can start thinking about what features would predict adoption
# age of the animal would be good for predicting because people want to adopt puppies and kittens more than older dogs and cats
# animal id is irrelevant
# animal type is important because people adopt more dogs and cats than bats for example
# breed is important as well
# color could be useful
# the date of birth, datetime and monthyear columns are just age in a differnt form so i wont use that
# name could be useful because no name pets could be adopted less or something
# i wont use outcome_subtype because this is only applicable if the animal is not adopted
# outcome_type is what im predicting so no
# gender could matter so ill use it


In [6]:
def label_encode(df, columns):
    for col in columns:
        le = LabelEncoder()
        col_values_unique = list(df[col].unique())
        le_fitted = le.fit(col_values_unique)
 
        col_values = list(df[col].values)
        #le.classes_
        col_values_transformed = le.transform(col_values)
        df[col] = col_values_transformed
        

In [7]:
#for idx, row in df.iterrows():
  #  print(row['name'])

In [8]:
df['adopted_or_not'] = ''
for idx, row in df.iterrows():
    if "Adoption" in row['outcome_type']:
        row['outcome_type'] = 1
    else:
        row['outcome_type'] = 0

In [9]:
df['name_or_not'] = ''
for idx, row in df.iterrows():
    if "Name" in row['name']:
        row['name'] = 0
    else:
        row['name'] = 1

In [10]:
for idx, row in df.iterrows():
    if "weeks" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "really young"
    elif "1 month" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "really young"    
    elif "months" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "young"
    elif "1 year" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "young"   
    elif "2 years" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "medium"
    elif "3 years" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "medium"
    elif "4 years" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "medium"
    elif "5 years" in row['age_upon_outcome']:
        row['age_upon_outcome'] = "medium"
    else:
        row['age_upon_outcome'] = "old"

In [11]:
df['color_filtered'] = ''
for idx, row in df.iterrows():
    if "Black" in row['color']:
        row['color_filtered'] = 'Black'
    elif "Brown" in row['color']:
        row['color_filtered'] = 'Brown'
    elif "White" in row['color']:
        row['color_filtered'] = 'White'
    elif "Tan" in row['color']:
        row['color_filtered'] = 'Tan'
    elif "Orange" in row['color']:
        row['color_filtered'] = 'Orange'
    elif "Cream" in row['color']:
        row['color_filtered'] = 'Cream'
    elif "Yellow" in row['color']:
        row['color_filtered'] = 'Yellow'    
    else:
        row['color_filtered'] = 'Other'


In [12]:

for idx, row in df.iterrows():
    if "Retriever " in row['breed']:
        row['breed'] = 'Retriever'
    elif "Terrier" in row['breed']:
        row['breed'] = 'Terrier'
    elif "Pit Bull" in row['breed']:
        row['breed'] = 'Pit Bull'
    elif "Domestic" in row['breed']:
        row['breed'] = 'Domestic '
    elif "Husky " in row['breed']:
        row['breed'] = 'Husky '
    elif "Pointer" in row['breed']:
        row['breed'] = 'Pointer'    
    else:
        row['breed'] = 'Other'

In [13]:
features = df[['age_upon_outcome','animal_type','breed','color_filtered','sex_upon_outcome']]
label_encode(features, features.columns.values)

target = df[['outcome_type']]
label_encode(target, target.columns.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [14]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.33, random_state = 10)

# Naive Bayes Model

In [43]:
nb = GaussianNB()
nb.fit(features_train, target_train)
target_pred = nb.predict(features_test)
target_pred

  y = column_or_1d(y, warn=True)


array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

# Accuracy Score, AUC Score

In [44]:
from sklearn import metrics
print(metrics.accuracy_score(target_test, target_pred))
print(metrics.roc_auc_score(target_test, target_pred))

0.7273959341723136
0.7297157708291955


In [17]:
# SVM
X = features
y = target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=None)

In [18]:
#SVM never finishes running so i commented it out, but this is how svm is done

#from sklearn import svm
#model = svm.SVC()
#model.fit(X_train, y_train)

In [19]:
#y_pred_class = model.predict(X_test)

In [20]:
#from sklearn import metrics
#print(metrics.accuracy_score(y_test, y_pred_class))
#print(metrics.roc_auc_score(y_test, y_pred_class))

# KNN

In [21]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [23]:
y_pred_class = model.predict(X_test)

# Accuracy Score and AUC Score

In [24]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
print(metrics.roc_auc_score(y_test, y_pred_class))

0.7229108101201125
0.7148112612997745


# Random Forest Regressor

In [26]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500, random_state = 42)
rf.fit(X_train, y_train);
y_pred_class = rf.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
print(metrics.accuracy_score(y_test, y_pred_class.round()))
print(metrics.roc_auc_score(y_test, y_pred_class.round()))

0.7574111934577051
0.7502522973273086


# Random Forest Classifier

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [36]:
from sklearn.ensemble import RandomForestClassifier

# instantiate model
model = RandomForestClassifier(random_state=1, max_depth=10)

# fit model
model.fit(X_train, y_train)

  import sys


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [37]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [38]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

0.7582417582417582


In [39]:
from sklearn.model_selection import KFold # import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
model.fit(X, y)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [42]:
kf = KFold(n_splits=10, random_state=1, shuffle=False)
scores = cross_val_score(model, X, y, cv=kf)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [41]:
#10-fold cross validation
print(scores.mean())

0.7575904680801033


In [28]:
# my model is pretty decent because all three algorithms returned an accuracy score of 70-75.
# the best model was the random forest because it had the highest accuracy score.
# the worst was either the SVM model because it never finished running or the naive bayes model because it had the lowest score
