In [1]:
# Essentials 
import pandas as pd
import sklearn as sk
import numpy as np

# Feature Engineering
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Model Processing and Evaluating
import imblearn
import sklearn.pipeline
from imblearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

# Models
from sklearn import tree
from sklearn import neighbors
from sklearn import neural_network

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Read the CSV into a DataFrame and preview its first rows.
csv_path = 'aac_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,outcome_type,name,age_upon_outcome,sex_upon_outcome,animal_type
0,Adoption,*Bradley,1 year,Neutered Male,Dog
1,Adoption,*Rajah,5 months,Neutered Male,Cat
2,Adoption,*Pebbles,10 months,Spayed Female,Cat
3,Return to Owner,Benji,5 years,Neutered Male,Dog
4,Euthanasia,,2 years,Unknown,Other


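The `name` and `age_upon_outcome` features have too many possible values. To narrow them down, `name` is reduced to a 1/0 flag indicating whether the animal has a name, and every age under one year (given in days, weeks, or months, or as `0 years`) is grouped into a single `< 1 year` category.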

In [2]:
# Replace every missing value (in all columns) with the sentinel False.
data = data.fillna(False)


def simplify_age(age):
    """Collapse any age below one year into the single bucket '< 1 year'.

    An age counts as below one year when it is exactly '0 years' or when its
    text mentions weeks, months, or days. Any other value is returned as is.
    """
    if age == '0 years':
        return '< 1 year'
    age_text = str(age)
    if any(unit in age_text for unit in ('week', 'month', 'day')):
        return '< 1 year'
    return age


def simplify_name(name):
    """Return 1 when a name is present, 0 when it is the False sentinel."""
    # Equality (not identity) comparison on purpose: keeps the original semantics.
    if name != False:
        return 1
    return 0


data['age_upon_outcome'] = data['age_upon_outcome'].apply(simplify_age)
data['name'] = data['name'].apply(simplify_name)

data

Unnamed: 0,outcome_type,name,age_upon_outcome,sex_upon_outcome,animal_type
0,Adoption,1,1 year,Neutered Male,Dog
1,Adoption,1,< 1 year,Neutered Male,Cat
2,Adoption,1,< 1 year,Spayed Female,Cat
3,Return to Owner,1,5 years,Neutered Male,Dog
4,Euthanasia,0,2 years,Unknown,Other
...,...,...,...,...,...
95351,Return to Owner,1,1 year,Intact Male,Dog
95352,Return to Owner,1,< 1 year,Intact Female,Dog
95353,Return to Owner,1,6 years,Neutered Male,Dog
95354,Adoption,1,3 years,Neutered Male,Cat


To prepare for model creation, each categorical feature is converted into "dummy variables": multiple 1 (true)/0 (false) features, one per category. The resulting column names are output below.

In [3]:
# Categorical columns to one-hot encode.
cat_vars = ['outcome_type','age_upon_outcome','sex_upon_outcome','animal_type']

# Encode all categorical columns in a single call. Each dummy column is named
# '<column>_<value>', and the columns come out in the order listed in cat_vars.
dummies = pd.get_dummies(data[cat_vars], columns=cat_vars)

# `data` is rebound with the dummy columns attached, so on a re-run of this
# cell they are already present. Drop any such leftovers before joining;
# otherwise join() raises "columns overlap but no suffix specified".
data = data.drop(columns=dummies.columns, errors='ignore').join(dummies)

# Keep everything except the original (un-encoded) categorical columns.
data_vars = data.columns.values.tolist()
to_keep = [col for col in data_vars if col not in cat_vars]

data_final = data[to_keep]
data_final.columns.values

array(['name', 'outcome_type_Adoption', 'outcome_type_Died',
       'outcome_type_Disposal', 'outcome_type_Euthanasia',
       'outcome_type_Missing', 'outcome_type_Relocate',
       'outcome_type_Return to Owner', 'outcome_type_Rto-Adopt',
       'outcome_type_Transfer', 'age_upon_outcome_1 year',
       'age_upon_outcome_10 years', 'age_upon_outcome_11 years',
       'age_upon_outcome_12 years', 'age_upon_outcome_13 years',
       'age_upon_outcome_14 years', 'age_upon_outcome_15 years',
       'age_upon_outcome_16 years', 'age_upon_outcome_17 years',
       'age_upon_outcome_18 years', 'age_upon_outcome_19 years',
       'age_upon_outcome_2 years', 'age_upon_outcome_20 years',
       'age_upon_outcome_22 years', 'age_upon_outcome_24 years',
       'age_upon_outcome_25 years', 'age_upon_outcome_3 years',
       'age_upon_outcome_4 years', 'age_upon_outcome_5 years',
       'age_upon_outcome_6 years', 'age_upon_outcome_7 years',
       'age_upon_outcome_8 years', 'age_upon_outcome_9 y

In [4]:
# Separate the target (label) from the input features.

# `name` already holds a 1/0 "has a name" flag, so it is used directly as the
# target instead of being one-hot encoded and having a column dropped. This
# keeps the labels as integers 0/1 regardless of pandas version (get_dummies
# returns booleans by default from pandas 2.0 on) and does not fail when one
# of the two classes is absent. The result is still a one-column DataFrame
# whose column is labelled 1, as before.
label = data['name'].astype(int).to_frame(name=1)

# Everything else in the encoded frame is an input feature.
features = data_final.drop(columns='name')

In [5]:
# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
feat_train, feat_test, label_train, label_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=1,
    stratify=label,
)

# Squeeze the training labels before they are passed to the classifiers' fit().
label_train = label_train.squeeze()

In [6]:
# Multi-layer perceptron with default hyperparameters. The seed is fixed
# (same value as the train/test split) so that weight initialisation and
# batch shuffling, and therefore the metrics below, are reproducible.
mlp_model = MLPClassifier(random_state=1)

# Fit on the training split, then predict the held-out rows.
mlp_model.fit(feat_train, label_train)
predictions = mlp_model.predict(feat_test)

# Predicted class counts, followed by per-class precision/recall/F1.
print(pd.Series(predictions).value_counts())
print(classification_report(label_test, predictions))

1    22017
0     6590
dtype: int64
              precision    recall  f1-score   support

           0       0.82      0.61      0.70      8827
           1       0.84      0.94      0.89     19780

    accuracy                           0.84     28607
   macro avg       0.83      0.77      0.79     28607
weighted avg       0.84      0.84      0.83     28607



In [7]:
# k-nearest-neighbours classifier using the 2 nearest training rows.
knn_model = neighbors.KNeighborsClassifier(n_neighbors=2)

# Fit on the training split, then predict the held-out rows.
knn_model.fit(feat_train, label_train)
predictions = knn_model.predict(feat_test)

# Predicted class counts, followed by per-class precision/recall/F1.
print(pd.Series(predictions).value_counts())
print(classification_report(label_test, predictions))

1    16910
0    11697
dtype: int64
              precision    recall  f1-score   support

           0       0.54      0.72      0.62      8827
           1       0.85      0.73      0.79     19780

    accuracy                           0.73     28607
   macro avg       0.70      0.72      0.70     28607
weighted avg       0.76      0.73      0.73     28607



In [8]:
# Decision tree limited to depth 5. scikit-learn randomly permutes the
# features at every split, so equally good splits can be resolved differently
# between runs; fixing the seed (same value as the train/test split) makes the
# fitted tree and the metrics below reproducible.
decision_tree = tree.DecisionTreeClassifier(max_depth=5, random_state=1)

# Fit on the training split, then predict the held-out rows.
decision_tree.fit(feat_train, label_train)
predictions = decision_tree.predict(feat_test)

# Predicted class counts, followed by per-class precision/recall/F1.
print(pd.Series(predictions).value_counts())
print(classification_report(label_test, predictions))

1    21905
0     6702
dtype: int64
              precision    recall  f1-score   support

           0       0.80      0.61      0.69      8827
           1       0.84      0.93      0.89     19780

    accuracy                           0.83     28607
   macro avg       0.82      0.77      0.79     28607
weighted avg       0.83      0.83      0.83     28607

