In [201]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier

from skopt import BayesSearchCV

from xgboost import XGBClassifier

import pickle

import animalhelper as ah

%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [138]:
!ls data

[31maac_intakes.csv[m[m          df_dog.csv               master12218.csv
[31maac_intakes_outcomes.csv[m[m df_mod.csv               master_df.csv
[31maac_outcomes.csv[m[m         engineered.csv           master_df_1128.csv
all_categoricals.csv     group_shade.csv          master_df_1129.csv
concat_backup.csv        hour_ids.csv             model12218.csv
df_cat.csv               hr_name.csv


In [139]:
df = pd.read_csv('./data/aac_intakes_outcomes.csv')

In [233]:
df2 = pd.read_csv('./data/aac_intakes.csv')

In [235]:
df2 = df2[['animal_id', 'name']]

In [236]:
name_freqs = pickle.load(open('name_freqs.pkl', 'rb'))

In [238]:
def get_name_frequencies(x):
    return name_freqs[x]

In [240]:
df2['name_freqs'] = df2['name'].apply(lambda x: get_name_frequencies(x))

In [242]:
df2 = df2.drop(labels=['name'], axis=1)

In [243]:
df2.head()

Unnamed: 0,animal_id,name_freqs
0,A706918,52
1,A724273,1
2,A665644,0
3,A682524,22
4,A743852,32


In [140]:
df.head()

Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_subtype,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_monthyear,intake_weekday,intake_hour,intake_number,time_in_shelter,time_in_shelter_days
0,10 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,...,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,2017-12,Thursday,14,1.0,0 days 14:07:00.000000000,0.588194
1,7 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,...,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,2014-12,Friday,10,2.0,1 days 06:14:00.000000000,1.259722
2,6 years,A006100,2007-07-09 00:00:00,,Return to Owner,Neutered Male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,...,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,2014-03,Friday,14,3.0,1 days 02:44:00.000000000,1.113889
3,10 years,A047759,2004-04-02 00:00:00,Partner,Transfer,Neutered Male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,...,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,2014-04,Wednesday,15,1.0,4 days 23:17:00.000000000,4.970139
4,16 years,A134067,1997-10-16 00:00:00,,Return to Owner,Neutered Male,5840,16.0,"(15.0, 17.5]",2013-11-16 11:54:00,...,"(15.0, 17.5]",2013-11-16 09:02:00,11,2013,2013-11,Saturday,9,1.0,0 days 02:52:00.000000000,0.119444


In [141]:
df.columns.values

array(['age_upon_outcome', 'animal_id_outcome', 'date_of_birth',
       'outcome_subtype', 'outcome_type', 'sex_upon_outcome',
       'age_upon_outcome_(days)', 'age_upon_outcome_(years)',
       'age_upon_outcome_age_group', 'outcome_datetime', 'outcome_month',
       'outcome_year', 'outcome_monthyear', 'outcome_weekday',
       'outcome_hour', 'outcome_number', 'dob_year', 'dob_month',
       'dob_monthyear', 'age_upon_intake', 'animal_id_intake',
       'animal_type', 'breed', 'color', 'found_location',
       'intake_condition', 'intake_type', 'sex_upon_intake', 'count',
       'age_upon_intake_(days)', 'age_upon_intake_(years)',
       'age_upon_intake_age_group', 'intake_datetime', 'intake_month',
       'intake_year', 'intake_monthyear', 'intake_weekday', 'intake_hour',
       'intake_number', 'time_in_shelter', 'time_in_shelter_days'],
      dtype=object)

In [142]:
df = df.iloc[:, 18:]

In [143]:
df2 = pd.read_csv('./data/aac_intakes_outcomes.csv')

In [144]:
df = pd.concat([df, df2['outcome_type']], axis=1)

In [145]:
len(df['animal_id_intake'].unique())

71961

In [146]:
df = df.drop_duplicates(subset='animal_id_intake')

In [147]:
df = df.drop(labels=['found_location', 'count', 'age_upon_intake_(years)', 'intake_month', 'intake_year', 
                'intake_monthyear', 'intake_number', 'time_in_shelter'], axis=1)

In [148]:
df = df[(df['animal_type'] == 'Dog') | (df['animal_type'] == 'Cat')]

In [149]:
df['intake_datetime'] = pd.to_datetime(df['intake_datetime'])

In [150]:
df.columns.values

array(['dob_monthyear', 'age_upon_intake', 'animal_id_intake',
       'animal_type', 'breed', 'color', 'intake_condition', 'intake_type',
       'sex_upon_intake', 'age_upon_intake_(days)',
       'age_upon_intake_age_group', 'intake_datetime', 'intake_weekday',
       'intake_hour', 'time_in_shelter_days', 'outcome_type'],
      dtype=object)

In [151]:
df['intake_hour'] = df['intake_datetime'].dt.hour

In [167]:
df = df.drop(labels=['age_upon_intake'], axis=1)

KeyError: "labels ['age_upon_intake'] not contained in axis"

In [168]:
df = df.drop(labels=['dob_monthyear'], axis=1)

## Reduce color to basic colors

In [153]:
df['color'] = df['color'].apply(lambda x: ah.color(x))

## Group breeds

In [154]:
df['breed'] = df['breed'].apply(lambda x: ah.group_dogs(x, False))

In [155]:
df_cat = df[df['animal_type'] == 'Cat']

In [156]:
df_cat['breed'] = df_cat['breed'].apply(lambda x: ah.cat_breed(x, False))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [157]:
df = pd.concat([df, df_cat])

In [158]:
df.columns.values

array(['dob_monthyear', 'animal_id_intake', 'animal_type', 'breed',
       'color', 'intake_condition', 'intake_type', 'sex_upon_intake',
       'age_upon_intake_(days)', 'age_upon_intake_age_group',
       'intake_datetime', 'intake_weekday', 'intake_hour',
       'time_in_shelter_days', 'outcome_type'], dtype=object)

In [159]:
df = df.drop_duplicates(subset='animal_id_intake', keep='last')

In [160]:
df['breed'].value_counts()

short hair       27060
Terrier           9524
Toy               7376
Sporting          6529
Herding           5897
Working           4604
Hound             2996
Non-Sporting      1871
long hair         1243
Terrier & Toy      111
Unknown              1
Name: breed, dtype: int64

In [161]:
def fix_breed(s):
    if s == 'Terrier & Toy':
        return 'Terrier'
    else:
        return s

In [162]:
df['breed'] = df['breed'].apply(lambda x: fix_breed(x))

In [163]:
df['breed'].value_counts()

short hair      27060
Terrier          9635
Toy              7376
Sporting         6529
Herding          5897
Working          4604
Hound            2996
Non-Sporting     1871
long hair        1243
Unknown             1
Name: breed, dtype: int64

In [164]:
df = df[df['breed'] != 'Unknown']

In [169]:
df.head()

Unnamed: 0,animal_id_intake,animal_type,breed,color,intake_condition,intake_type,sex_upon_intake,age_upon_intake_(days),age_upon_intake_age_group,intake_datetime,intake_weekday,intake_hour,time_in_shelter_days,outcome_type
0,A006100,Dog,Sporting,Yellow,Normal,Stray,Neutered Male,3650,"(7.5, 10.0]",2017-12-07 00:00:00,Thursday,0,0.588194,Return to Owner
3,A047759,Dog,Hound,Tricolor,Normal,Owner Surrender,Neutered Male,3650,"(7.5, 10.0]",2014-04-02 15:55:00,Wednesday,15,4.970139,Transfer
4,A134067,Dog,Herding,Brown,Injured,Public Assist,Neutered Male,5840,"(15.0, 17.5]",2013-11-16 09:02:00,Saturday,9,0.119444,Return to Owner
5,A141142,Dog,Sporting,Black,Aged,Stray,Spayed Female,5475,"(12.5, 15.0]",2013-11-16 14:46:00,Saturday,14,0.870833,Return to Owner
6,A163459,Dog,Terrier,Black,Normal,Stray,Intact Female,5475,"(12.5, 15.0]",2014-11-14 15:11:00,Friday,15,0.178472,Return to Owner


## Add in name frequencies

In [246]:
df = df2.merge(df, left_on='animal_id', right_on='animal_id_intake')

In [250]:
df = df.drop_duplicates()

## Remove rare occurences of features and response

In [176]:
df = df[(df['outcome_type'] != 'Rto-Adopt') & (df['outcome_type'] != 'Missing') & (df['outcome_type'] != 'Disposal')]

In [180]:
df = df[df['intake_type'] != 'Euthanasia Request']

In [181]:
df.head()

Unnamed: 0,animal_id_intake,animal_type,breed,color,intake_condition,intake_type,sex_upon_intake,age_upon_intake_(days),age_upon_intake_age_group,intake_datetime,intake_weekday,intake_hour,time_in_shelter_days,outcome_type
0,A006100,Dog,Sporting,Yellow,Normal,Stray,Neutered Male,3650,"(7.5, 10.0]",2017-12-07 00:00:00,Thursday,0,0.588194,Return to Owner
3,A047759,Dog,Hound,Tricolor,Normal,Owner Surrender,Neutered Male,3650,"(7.5, 10.0]",2014-04-02 15:55:00,Wednesday,15,4.970139,Transfer
4,A134067,Dog,Herding,Brown,Injured,Public Assist,Neutered Male,5840,"(15.0, 17.5]",2013-11-16 09:02:00,Saturday,9,0.119444,Return to Owner
5,A141142,Dog,Sporting,Black,Aged,Stray,Spayed Female,5475,"(12.5, 15.0]",2013-11-16 14:46:00,Saturday,14,0.870833,Return to Owner
6,A163459,Dog,Terrier,Black,Normal,Stray,Intact Female,5475,"(12.5, 15.0]",2014-11-14 15:11:00,Friday,15,0.178472,Return to Owner


## Model

In [251]:
df_model = df.copy()

In [252]:
df_model = df_model.drop(labels=['animal_id_intake', 'intake_datetime', 'time_in_shelter_days'], axis=1)

In [254]:
df_model = df_model.drop(labels=['animal_id'], axis=1)

In [255]:
df_model.head()

Unnamed: 0,name_freqs,animal_type,breed,color,intake_condition,intake_type,sex_upon_intake,age_upon_intake_(days),age_upon_intake_age_group,intake_weekday,intake_hour,outcome_type
0,52,Dog,Sporting,White,Normal,Stray,Spayed Female,2920,"(7.5, 10.0]",Sunday,12,Return to Owner
1,1,Dog,Hound,Sable,Normal,Stray,Intact Male,330,"(-0.025, 2.5]",Thursday,18,Return to Owner
2,0,Cat,short hair,Tricolor,Sick,Stray,Intact Female,28,"(-0.025, 2.5]",Monday,7,Transfer
3,22,Dog,Working,Yellow,Normal,Stray,Neutered Male,1460,"(2.5, 5.0]",Sunday,10,Return to Owner
4,32,Dog,Sporting,Brown,Normal,Owner Surrender,Neutered Male,730,"(-0.025, 2.5]",Saturday,12,Return to Owner


In [256]:
def simplify_response(s):
    if s == 'Adoption':
        return s
    else:
        return 'Not Adoption'

In [257]:
df_model['outcome_type'] = df_model['outcome_type'].apply(lambda x: simplify_response(x))

In [258]:
df_model = df_model.dropna()

In [259]:
df_model.head()

Unnamed: 0,name_freqs,animal_type,breed,color,intake_condition,intake_type,sex_upon_intake,age_upon_intake_(days),age_upon_intake_age_group,intake_weekday,intake_hour,outcome_type
0,52,Dog,Sporting,White,Normal,Stray,Spayed Female,2920,"(7.5, 10.0]",Sunday,12,Not Adoption
1,1,Dog,Hound,Sable,Normal,Stray,Intact Male,330,"(-0.025, 2.5]",Thursday,18,Not Adoption
2,0,Cat,short hair,Tricolor,Sick,Stray,Intact Female,28,"(-0.025, 2.5]",Monday,7,Not Adoption
3,22,Dog,Working,Yellow,Normal,Stray,Neutered Male,1460,"(2.5, 5.0]",Sunday,10,Not Adoption
4,32,Dog,Sporting,Brown,Normal,Owner Surrender,Neutered Male,730,"(-0.025, 2.5]",Saturday,12,Not Adoption


In [260]:
X = df_model.iloc[:, :10]

In [261]:
y = df_model.iloc[:, -1]

In [262]:
le = LabelEncoder()
y = le.fit_transform(y)

In [263]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2325)

In [264]:
X_train.columns.values

array(['name_freqs', 'animal_type', 'breed', 'color', 'intake_condition',
       'intake_type', 'sex_upon_intake', 'age_upon_intake_(days)',
       'age_upon_intake_age_group', 'intake_weekday'], dtype=object)

In [227]:
from catboost import CatBoostClassifier

In [266]:
c = CatBoostClassifier(cat_features=[1, 2, 3, 4, 5, 6, 8, 9, 10], depth=8, learning_rate = 0.23,
                   n_estimators = 800, one_hot_max_size=24, od_type = 'IncToDec', od_pval=1e-5,
                       l2_leaf_reg=1, eval_metric='AUC', class_weights=[1.1887, 1])

scores = cross_val_score(c, X_train, y_train, cv=3, n_jobs=-1, scoring='roc_auc')

CatboostError: Invalid cat_features[8] = 10 value: must be < 10.

In [None]:
scores