In [2]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
plt.style.use('metis')
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score, precision_recall_curve, make_scorer, recall_score, accuracy_score, f1_score, precision_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from graphviz import Source
from IPython.display import SVG
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [3]:
#Connection to AWS postgres host
connection_args = {
    'host': '52.53.162.215',
    'user': 'ubuntu',    
    'dbname': 'pets',   
    'port': 5432         
}

connection = pg.connect(**connection_args)

In [4]:
#Querying DB for relevent cols
pupper_query = "SELECT data.outcome_type,data.color,data.days_in_shelter,data.age,data.is_female,data.big,raw_data.intake_condition FROM data INNER JOIN raw_data ON data.animal_id=raw_data.animal_id WHERE data.type = '1' AND (data.outcome_type = 'ADOPTION' or data.outcome_type ='EUTHANIZE');"

model_df = pd_sql.read_sql(pupper_query, connection)

In [5]:
model_df.head()

Unnamed: 0,outcome_type,color,days_in_shelter,age,is_female,big,intake_condition
0,ADOPTION,0,52,7.961833576,1,1,TREATABLE/REHAB
1,ADOPTION,0,16,1.998672115,1,1,HEALTHY
2,EUTHANIZE,0,34,5.952209833,1,1,UNTREATABLE
3,ADOPTION,0,212,4.974777032,0,1,TREATABLE/REHAB
4,ADOPTION,0,11,3.531900039,0,0,TREATABLE/MANAGEABLE


In [6]:
model_df.dropna(0, inplace=True)

In [7]:
#Check that our outcomes are not too unbalanced
model_df['outcome_type'].value_counts()

ADOPTION     3016
EUTHANIZE    1158
Name: outcome_type, dtype: int64

In [8]:
#Binary value mapping for intake condition 1 will be healthy
model_df.loc[model_df.intake_condition != 'HEALTHY', 'healthy'] = 0
model_df.loc[model_df.intake_condition == 'HEALTHY', 'healthy'] = 1

In [9]:
model_df.head()

Unnamed: 0,outcome_type,color,days_in_shelter,age,is_female,big,intake_condition,healthy
0,ADOPTION,0,52,7.961833576,1,1,TREATABLE/REHAB,0.0
1,ADOPTION,0,16,1.998672115,1,1,HEALTHY,1.0
2,EUTHANIZE,0,34,5.952209833,1,1,UNTREATABLE,0.0
3,ADOPTION,0,212,4.974777032,0,1,TREATABLE/REHAB,0.0
4,ADOPTION,0,11,3.531900039,0,0,TREATABLE/MANAGEABLE,0.0


In [10]:
#Encode for outcomes(1 = euthanized)
encoded_column_vector = label_binarize(model_df['outcome_type'], classes=['ADOPTION', 'EUTHANIZE']) 
encoded_labels = np.ravel(encoded_column_vector)
X = model_df[['color','days_in_shelter','age','is_female','big', 'healthy']]
y = encoded_labels

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1838425944)
ssX = StandardScaler()
X_train_scaled = ssX.fit_transform(X_train)
X_test_scaled  = ssX.transform(X_test) 

In [12]:
#Decision Tree
estimator = DecisionTreeClassifier(random_state = 1838425944) 
estimator.fit(X_train,y_train)
estimator.score(X_train,y_train)
graph_options = {
    'feature_names': X.columns,
    'class_names': ['Adoption', 'Euthanize'],
    'out_file': 'tree.svg',
    'filled': True,
    'rounded': True
}

graphviz_code = Source(tree.export_graphviz(estimator, **graph_options))

In [13]:
my_tree = DecisionTreeClassifier(random_state = 1838425944)

params = {
    'max_depth': [1,2,3],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}

grid = GridSearchCV(my_tree, param_grid=params, scoring='f1', cv=10)
grid.fit(X_train,y_train)

importances = sorted(zip(grid.best_estimator_.feature_importances_, X.columns), reverse=True)

for importance, name in importances:
    print(f'{importance:6.4f}:  {name}')

0.7881:  healthy
0.1933:  days_in_shelter
0.0186:  age
0.0000:  is_female
0.0000:  color
0.0000:  big


In [14]:
#Stupid Baseline Model
records = list(X_train['color'])
num_euth = []
for i in range(len(records)):
    num_euth.append(1)
stupid_confusion = confusion_matrix(y_train, num_euth)
sns.heatmap(stupid_confusion, annot=True, cmap='RdYlGn', square=True, fmt='d',
           xticklabels=['EUTHANIZE', 'ADOPTION'],
           yticklabels=['EUTHANIZE', 'ADOPTION']);
plt.xlabel('prediction')
plt.ylabel('actual')
plt.savefig('stupid confusion.png', bbox_inches="tight", dpi=1000)

In [15]:
f1_score(y_train, (num_euth))

0.4291476203280452

In [27]:
models = [('knn', KNeighborsClassifier), 
          ('logistic', LogisticRegression),
          ('tree', DecisionTreeClassifier),
          ('forest', RandomForestClassifier)
         ]

param_choices = [
    {
        'n_neighbors': range(1, 12)
    },
    {
        'C': np.logspace(-3,6, 12),
        'penalty': ['l1', 'l2'],
        'random_state': [1838425944]
    },
    {
        'max_depth': [1,2,3,4,5],
        'min_samples_leaf': [3,6,10], 
        'random_state': [1838425944]
    },
    {
        'n_estimators': [50, 100, 200],
        'max_depth': [1,2,3,4,5],
        'min_samples_leaf': [3,6,10], 
        'random_state': [1838425944]
    }
]

grids = {}
for model_info, params in zip(models, param_choices):
    name, model = model_info
    grid = GridSearchCV(model(), params, scoring = 'f1')
    grid.fit(X_train[['age', 'days_in_shelter', 'healthy']], y_train)
    s = f"{name}: best score: {grid.best_score_}"
    print(s)
    grids[name] = grid

knn: best score: 0.6540424086379024


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


logistic: best score: 0.7074846632888584
tree: best score: 0.7089588383062096


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


forest: best score: 0.7053151390557522


In [17]:
grids['knn'].best_params_

{'n_neighbors': 3}

In [18]:
grids['logistic'].best_params_

{'C': 0.2848035868435802, 'penalty': 'l2', 'random_state': 1838425944}

In [19]:
grids['tree'].best_params_

{'max_depth': 4, 'min_samples_leaf': 10, 'random_state': 1838425944}

In [20]:
grids['forest'].best_params_

{'max_depth': 4,
 'min_samples_leaf': 3,
 'n_estimators': 50,
 'random_state': 1838425944}

In [21]:
#Bayes
model_bayes = GaussianNB()
model_bayes.fit(X_train[['age', 'days_in_shelter', 'healthy']], y_train)
predictions = model_bayes.predict(X_train[['age', 'days_in_shelter', 'healthy']])
f1_score(y_train, predictions)

0.6939704209328782

In [22]:
model_svc = SVC(C=10, random_state = 1838425944)
model_svc.fit(X_train[['age', 'days_in_shelter', 'healthy']], y_train)
predictions = model_svc.predict(X_train[['age', 'days_in_shelter', 'healthy']])
f1_score(y_train, predictions)

0.8211488250652742

In [26]:
eclf = VotingClassifier(estimators=[('knn', grids['knn']), ('lr', grids['logistic']), ('bayes',model_bayes), ('svc', model_svc),('d_tree', grids['tree']), ('random_forest', grids['forest'])])
eclf.fit(X_train[['age', 'days_in_shelter', 'healthy']], y_train)
pred_vals = eclf.predict(X_train[['age', 'days_in_shelter', 'healthy']])
f1_score(y_train,pred_vals)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if diff:


0.7351627313337588

In [27]:
eclf = VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors = 3)),('lr', LogisticRegression(C = 0.2848035868435802, penalty = 'l2', random_state = 1838425944)), ('bayes',GaussianNB()), 
                                    ('svc', SVC(C=10, probability=True, random_state = 1838425944)),('d_tree', DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 10, random_state = 1838425944)), 
                                    ('random_forest', RandomForestClassifier(max_depth = 5, min_samples_leaf = 3, n_estimators = 50, random_state = 1838425944))],voting='soft')
eclf.fit(X_train[['age', 'days_in_shelter', 'healthy']], y_train)
pred_vals = eclf.predict(X_train[['age', 'days_in_shelter', 'healthy']])
f1_score(y_train,pred_vals)

  if diff:


0.7527579493835171

In [28]:
pred_test = eclf.predict(X_test[['age', 'days_in_shelter', 'healthy']])
f1_score(y_test,pred_test)

  if diff:


0.7048710601719197

In [29]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('ensemble_soft', eclf)])

pipeline.fit(X[['age', 'days_in_shelter', 'healthy']],y)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ensemble_soft', VotingClassifier(estimators=[('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')), ('...      warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None))])

In [30]:
import pickle

pickle.dump(pipeline, open('model.pkl', 'wb'))

In [31]:
example = {
  'Age': 8,  
  'Days in Shelter': 52,    
  'Good Health': False,  
}

def make_prediction(features):
    X = np.array([features['Age'], features['Days in Shelter'],int(features['Good Health'] == True)]).reshape(1,-1)
    prob_euth = pipeline.predict_proba(X)[0, 1]
    
    result = {
        'prediction': int(prob_euth > 0.5),
        'prob_euthanization': prob_euth
    }
    return result

In [32]:
make_prediction(example)



{'prediction': 0, 'prob_euthanization': 0.39931227065440195}