# Feature Engineering

## Import libraries

In [2]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib as jl

## Import dataset

In [3]:
# Relative path of the working intake/outcome extract read by this notebook.
DATASET_PATH = "Data/dogs_intakes_outcomes_working.csv"

# Load the extract and list its columns as a quick sanity check.
dataset = pd.read_csv(DATASET_PATH)
print(dataset.columns)

Index(['age_upon_outcome', 'animal_id_outcome', 'date_of_birth',
       'outcome_subtype', 'outcome_type', 'sex_upon_outcome',
       'age_upon_outcome_(days)', 'age_upon_outcome_(years)',
       'age_upon_outcome_age_group', 'outcome_datetime', 'outcome_month',
       'outcome_year', 'outcome_monthyear', 'outcome_weekday', 'outcome_hour',
       'outcome_number', 'dob_year', 'dob_month', 'dob_monthyear',
       'age_upon_intake', 'animal_id_intake', 'animal_type', 'breed', 'color',
       'found_location', 'intake_condition', 'intake_type', 'sex_upon_intake',
       'count', 'age_upon_intake_(days)', 'age_upon_intake_(years)',
       'age_upon_intake_age_group', 'intake_datetime', 'intake_month',
       'intake_year', 'intake_monthyear', 'intake_weekday', 'intake_hour',
       'intake_number', 'time_in_shelter', 'time_in_shelter_days'],
      dtype='object')


## Split sex upon intake into two features

In [4]:
# Split the combined 'sex_upon_intake' text into a sex feature and a
# spay/neuter feature.
#
# Rows whose text contains neither 'Male' nor 'Female' (including missing
# values) are left as NaN in BOTH features instead of being guessed: a plain
# np.where() on str.contains() would label a missing value 'Male' (NaN is
# truthy) and any other text 'Female' / 'Yes'.
sex_upon_intake = dataset['sex_upon_intake']

# First 'Male' / 'Female' token found in the text; NaN when there is none.
dataset['sex'] = sex_upon_intake.str.extract(r'(Male|Female)', expand=False)

# 'Intact' in the text -> 'No'; any other text with a recognised sex -> 'Yes'.
is_intact = sex_upon_intake.str.contains('Intact', na=False)
spay_neuter = pd.Series(np.where(is_intact, 'No', 'Yes'), index=dataset.index, dtype=object)
spay_neuter[dataset['sex'].isna()] = np.nan  # unknown sex -> unknown status
dataset['spay/neuter'] = spay_neuter

## Denote if intake is mixed or purebred

In [5]:
# A breed label counts as mixed when it contains 'Mix' or a '/';
# every other label is marked 'Purebred'.
breed_label = dataset['breed']
labelled_mix = breed_label.str.contains('Mix')
has_slash = breed_label.str.contains('/')
dataset['mixed_breed'] = np.where(labelled_mix | has_slash, 'Mixed', 'Purebred')

## Split color feature into two colors and one pattern feature

In [6]:
# Pattern keywords searched for inside the (lower-cased) color description.
coat_patterns = [
    'bicolor', 'tricolor', 'merle', 'tuxedo', 'harlequin', 'spotted',
    'tick', 'ticked', 'fleck', 'flecked', 'speck', 'speckled',
    'brindle', 'saddle', 'blanket', 'sable', 'hairless',
]

# Single capturing group with every keyword as an alternative. Alternatives are
# tried in list order, so e.g. 'ticked' is captured as 'tick'.
coat_reg = '({})'.format('|'.join(coat_patterns))

# Lower-case the color text so it can match the lower-case keywords.
color_lower = dataset['color'].str.lower()
dataset['color'] = color_lower

# First keyword found in each color string; NaN when none is present.
dataset['coat_pattern'] = dataset['color'].str.extract(coat_reg, expand=False)



## Split color into separate color columns and build the coat field

In [7]:
# Split 'color' on '/' into one column per part; the first two parts are named
# 'color1' and 'color2' (any further parts keep their integer column labels).
color_parts = (
    dataset['color']
    .str.split('/', expand=True)
    .rename(columns={0: 'color1', 1: 'color2'})
)

# Remove split columns left over from an earlier run of this cell before
# attaching the fresh ones. Without this, re-running the cell appends duplicate
# 'color1'/'color2' columns and the assignments below stop working.
dataset = pd.concat(
    [dataset.drop(columns=color_parts.columns, errors='ignore'), color_parts],
    axis=1,
)

# An empty first color is relabelled 'Breed Specific' ...
dataset['color1'] = np.where(dataset['color1'] == '', 'Breed Specific', dataset['color1'])

# ... and such rows take their coat from the extracted pattern; all other rows
# use the first color as the coat.
dataset['coat'] = np.where(dataset['color1'] == 'Breed Specific', dataset['coat_pattern'], dataset['color1'])

## Clean coat colors

In [8]:
# Collapse color variants into a smaller set of coat labels.
# Keys are matched against the whole coat value; no replacement value is itself
# a key, so applying the mapping in one pass is order-independent.
coat_aliases = {
    'fawn': 'tan',
    'chocolate': 'brown',
    'ruddy': 'red',
    'liver': 'brown',
    'agouti': 'brown brindle',
    'liver tick': 'brown brindle',
    'brown tiger': 'brown',
    'blue tiger': 'blue',
    'black tiger': 'black',
    'blue smoke': 'blue',
    'black smoke': 'black',
    'apricot': 'orange',
    'blue cream': 'blue',
}
dataset['coat'] = dataset['coat'].replace(coat_aliases)

# Frequency of each coat label after cleaning.
coats = dataset['coat'].value_counts()
print(coats)

black             11606
white              8064
brown              6967
tan                5947
brown brindle      2163
tricolor           2087
red                2064
blue               1754
sable               745
cream               646
yellow              615
buff                547
gray                501
blue merle          394
black brindle       288
gold                187
brown merle         175
blue tick           126
silver              112
red merle           110
red tick            107
yellow brindle       82
orange               79
Name: coat, dtype: int64


## Bin time in shelter (days) into 12 quantile buckets

In [9]:
# Cut the shelter stay length into 12 quantile-based buckets; `bins` keeps the
# bucket edges returned by qcut.
stay_buckets, bins = pd.qcut(dataset['time_in_shelter_days'], q=12, retbins=True)
dataset['time_in_shelter_days_12'] = stay_buckets

# Number of rows that fall in each bucket.
time = dataset['time_in_shelter_days_12'].value_counts()
print(time)

(4.197, 5.065]        3786
(-0.001, 0.21]        3784
(2.093, 3.853]        3782
(42.261, 1606.194]    3781
(8.161, 11.117]       3781
(19.116, 42.261]      3780
(11.117, 19.116]      3780
(3.853, 4.197]        3779
(1.04, 2.093]         3779
(0.21, 1.04]          3779
(5.065, 6.156]        3778
(6.156, 8.161]        3777
Name: time_in_shelter_days_12, dtype: int64


## Create bully breed field

In [10]:
# Flag breeds whose label mentions 'Pit Bull', 'Staffordshire' or 'Bull Terrier'.
breed_text = dataset['breed']
mentions_pit_bull = breed_text.str.contains('Pit Bull')
mentions_staffordshire = breed_text.str.contains('Staffordshire')
mentions_bull_terrier = breed_text.str.contains('Bull Terrier')

is_bully = mentions_pit_bull | mentions_staffordshire | mentions_bull_terrier
dataset['Bully_breed'] = np.where(is_bully, 'Bully', 'Not Bully')

## Create new feature for puppy/adult

In [11]:
# Puppies are defined as animals younger than 365 days at intake; everything
# else is labelled 'Dog'.
under_one_year = dataset['age_upon_intake_(days)'] < 365
dataset['Puppy/Dog_intake'] = np.where(under_one_year, 'Puppy', 'Dog')

## Clean intake conditions

In [16]:
# Build a simplified 'intake_condition_clean' label from the recorded intake
# condition and the outcome subtype, then drop 'Other' / 'Feral' intakes.
#
# NOTE(review): several flags are read from 'outcome_subtype', which is
# presumably only known at outcome time -- confirm this is acceptable if the
# feature is later used to predict outcomes.
outcome_subtype = dataset['outcome_subtype']
intake_condition = dataset['intake_condition']

# na=False makes a missing value count as "does not mention the keyword"
# instead of relying on how NaN behaves inside `|` and np.where.
is_sick = (
    outcome_subtype.str.contains('Suffering', na=False)
    | outcome_subtype.str.contains('Medical', na=False)
    | outcome_subtype.str.contains('Rabies Risk', na=False)
    | intake_condition.str.contains('Sick', na=False)
)
is_injured = intake_condition.str.contains('Injured', na=False)
is_nursing = intake_condition.str.contains('Nursing', na=False)
is_aggressive = (
    outcome_subtype.str.contains('Aggressive', na=False)
    | outcome_subtype.str.contains('Behavior', na=False)
)

# np.select takes the first matching condition, so the order below is the
# label priority: Aggressive > Nursing > Injured > Sick > Normal.
dataset['intake_condition_clean'] = np.select(
    [is_aggressive, is_nursing, is_injured, is_sick],
    ['Aggressive', 'Nursing', 'Injured', 'Sick'],
    default='Normal',
)

# Remove rows whose recorded intake condition mentions 'Other' or 'Feral'.
is_excluded = (
    intake_condition.str.contains('Other', na=False)
    | intake_condition.str.contains('Feral', na=False)
)
dataset.drop(index=dataset.index[is_excluded.to_numpy()], inplace=True)

# Count the remaining conditions AFTER the drop so the printed table describes
# the rows that are actually kept (and is the same on a fresh Run All).
condition = dataset['intake_condition'].value_counts()
print(condition)

# Kept under its original name for any later cell that refers to it.
intake_types = condition.copy()

Normal      41558
Injured      1793
Sick         1016
Nursing       585
Aged          277
Pregnant       32
Name: intake_condition, dtype: int64


## Drop rows with missing values in important features

In [13]:
# Discard, in place, every row that is missing 'sex' or 'outcome_type'.
required_columns = ["sex", "outcome_type"]
dataset.dropna(subset=required_columns, inplace=True)


## Save cleaned dataset

In [14]:
# Write the cleaned frame to CSV (UTF-8, without the index column).
CLEAN_DATASET_PATH = 'Data/dogs_intakes_outcomes_clean.csv'
dataset.to_csv(CLEAN_DATASET_PATH, encoding='utf-8', index=False)

## Save bins

In [15]:
# Persist the qcut bucket edges with joblib.
BINS_PATH = 'Data/bins.pkl'
jl.dump(bins, BINS_PATH)

['Data/bins.pkl']