# Feature Engineering

## Import libraries

In [18]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib as jl

## Import dataset

In [19]:
# Read the working intake/outcome extract into the frame that the later cells modify.
working_csv = "Data/dogs_intakes_outcomes_working.csv"
dataset = pd.read_csv(working_csv)

## Split sex upon intake into two features

In [20]:
# Derive two features from the combined 'sex_upon_intake' label:
#   sex          -> 'Male' / 'Female'
#   spay/neuter  -> 'No' when the label contains 'Intact', otherwise 'Yes'
#
# str.extract is used instead of np.where(str.contains(...)) so that labels that
# carry neither 'Male' nor 'Female' (and missing labels) stay NaN instead of being
# forced into a definite category. Those rows can then be removed by a dropna on
# 'sex'.
sex_label = dataset['sex_upon_intake']

dataset['sex'] = sex_label.str.extract('(Male|Female)', expand=False)

# na=False: a missing label is simply "not intact" here; it is blanked out below.
is_intact = sex_label.str.contains('Intact', na=False)
sterilized = pd.Series(np.where(is_intact, 'No', 'Yes'), index=dataset.index)

# Only report a sterilization status for rows whose sex could be determined;
# Series.where leaves NaN wherever the condition is False.
dataset['spay/neuter'] = sterilized.where(dataset['sex'].notna())

## Denote if intake is mixed or purebred

In [21]:
# A breed counts as mixed when its name contains 'Mix' or lists several breeds
# separated by '/'; everything else is labelled purebred.
breed_name = dataset['breed']
says_mix = breed_name.str.contains('Mix')
lists_several = breed_name.str.contains('/')
dataset['mixed_breed'] = np.where(says_mix | lists_several, 'Mixed', 'Purebred')

## Split color feature into two colors and one pattern feature

In [22]:
# Coat-pattern keywords to look for inside the (lower-cased) color text.
coat_patterns = [
    'bicolor', 'tricolor', 'merle', 'tuxedo', 'harlequin', 'spotted',
    'tick', 'ticked', 'fleck', 'flecked', 'speck', 'speckled',
    'brindle', 'saddle', 'blanket', 'sable', 'hairless',
]

# Single capture group alternating over every keyword, as str.extract requires.
coat_reg = '({})'.format('|'.join(coat_patterns))

# Lower-case the color text so it can match the lower-case keywords above.
color_lower = dataset['color'].str.lower()
dataset['color'] = color_lower

# First keyword found in each color string; rows without any keyword get NaN.
dataset['coat_pattern'] = color_lower.str.extract(coat_reg, expand=False)



## Split colors and combine color and coat pattern into a coat field

In [23]:
# Split 'color' on '/' into one column per part; the first two parts are named
# 'color1' and 'color2'.
color_parts = (
    dataset['color']
    .str.split('/', expand=True)
    .rename(columns={0: 'color1', 1: 'color2'})
)

# Remove split columns left behind by an earlier run of this cell before attaching
# the fresh ones. Without this, re-running the cell makes pd.concat add duplicate
# 'color1'/'color2' columns, after which dataset['color1'] is a DataFrame rather
# than a Series and the assignments below fail.
dataset = pd.concat(
    [dataset.drop(columns=list(color_parts.columns), errors='ignore'), color_parts],
    axis=1,
)

# An empty first color part is relabelled 'Breed Specific'.
first_color = dataset['color1']
dataset['color1'] = np.where(first_color == '', 'Breed Specific', first_color)

# 'coat' is the first color, except for 'Breed Specific' rows, which take the
# extracted coat pattern instead.
dataset['coat'] = np.where(
    dataset['color1'] == 'Breed Specific',
    dataset['coat_pattern'],
    dataset['color1'],
)



## Clean coat colors

In [24]:
# Collapse rarer coat labels onto a smaller set of canonical colors.
# Keys are the labels to replace; values are what they become. No value is itself
# a key, so the order in which the aliases are applied does not matter.
coat_color_aliases = {
    'fawn': 'tan',
    'chocolate': 'brown',
    'ruddy': 'red',
    'liver': 'brown',
    'agouti': 'brown brindle',
    'liver tick': 'brown brindle',
    'brown tiger': 'brown',
    'blue tiger': 'blue',
    'black tiger': 'black',
    'blue smoke': 'blue',
    'black smoke': 'black',
    'apricot': 'orange',
    'blue cream': 'blue',
}

for raw_label, canonical_label in coat_color_aliases.items():
    # Exact-match replacement; every other value (including NaN) is left as is.
    dataset['coat'] = np.where(dataset['coat'] == raw_label, canonical_label, dataset['coat'])




## Bin time in shelter (days) into 12 quantile buckets

In [38]:

# Cut time in shelter into 12 quantile-based buckets. retbins=True also returns
# the bucket edges, which are kept in `bins`.
shelter_days = dataset['time_in_shelter_days']
day_buckets, bins = pd.qcut(shelter_days, q=12, retbins=True)
dataset['time_in_shelter_days_12'] = day_buckets

# Row count per bucket, read back from the new column.
time = dataset['time_in_shelter_days_12'].value_counts()
print(time)

(3.853, 4.197]        3776
(4.197, 5.065]        3774
(0.21, 1.04]          3774
(42.147, 1606.194]    3772
(8.159, 11.115]       3772
(-0.001, 0.21]        3772
(19.103, 42.147]      3771
(11.115, 19.103]      3771
(2.092, 3.853]        3771
(6.156, 8.159]        3770
(1.04, 2.092]         3770
(5.065, 6.156]        3768
Name: time_in_shelter_days_12, dtype: int64


## Create bully breed field

In [32]:
# Flag a row as 'Bully' when its breed name mentions any of these three breed
# families; every other row is 'Not Bully'.
breed_text = dataset['breed']
is_pit_bull = breed_text.str.contains('Pit Bull')
is_staffordshire = breed_text.str.contains('Staffordshire')
is_bull_terrier = breed_text.str.contains('Bull Terrier')

dataset['Bully_breed'] = np.where(
    is_pit_bull | is_staffordshire | is_bull_terrier,
    'Bully',
    'Not Bully',
)

## Create new feature for puppy/adult

In [33]:
# Puppies are defined here as dogs younger than 365 days at intake.
# NOTE(review): this cell was previously described as using a 6-month cutoff, but
# the threshold applied is 365 days — confirm which one is intended.
puppy_age_limit_days = 365

age_at_intake_days = dataset['age_upon_intake_(days)']
dataset['Puppy/Dog_intake'] = np.where(age_at_intake_days < puppy_age_limit_days, 'Puppy', 'Dog')

## Clean intake conditions

In [34]:
# Build a simplified intake-condition label from 'intake_condition' and
# 'outcome_subtype', then drop the 'Other' and 'Feral' intake conditions.
# NOTE(review): part of this label is derived from 'outcome_subtype', i.e. from
# information about the outcome — confirm that is acceptable for how the feature
# is used downstream.
outcome_subtype = dataset['outcome_subtype']
intake_condition = dataset['intake_condition']

# na=False makes a missing subtype/condition count as "no match". Without it the
# masks contain NaN, which np.where treats as true and boolean indexing rejects.
is_sick = (
    outcome_subtype.str.contains('Suffering|Medical|Rabies Risk', na=False)
    | intake_condition.str.contains('Sick', na=False)
)
is_injured = intake_condition.str.contains('Injured', na=False)
is_nursing = intake_condition.str.contains('Nursing', na=False)
is_aggressive = outcome_subtype.str.contains('Aggressive|Behavior', na=False)

# np.select uses the first condition that matches, so the list runs from highest
# to lowest priority: Aggressive > Nursing > Injured > Sick, and 'Normal' otherwise.
dataset['intake_condition_clean'] = np.select(
    [is_aggressive, is_nursing, is_injured, is_sick],
    ['Aggressive', 'Nursing', 'Injured', 'Sick'],
    default='Normal',
)

# Remove rows whose intake condition mentions 'Other' or 'Feral'.
is_excluded = intake_condition.str.contains('Other|Feral', na=False)
dataset = dataset.loc[~is_excluded].copy()

# Summaries are computed after the filter so they describe the rows that remain.
condition = dataset['intake_condition'].value_counts()
print(condition)

intake_types = dataset['intake_type'].value_counts()
print(intake_types)

Normal      41558
Injured      1793
Sick         1016
Nursing       585
Aged          277
Pregnant       32
Name: intake_condition, dtype: int64
Stray                 32090
Owner Surrender        8915
Public Assist          4075
Euthanasia Request      181
Name: intake_type, dtype: int64


## Change transfer outcomes into adoption (TODO: no code for this step yet)

## Drop rows with missing values in important features

In [35]:
# Keep only rows that have a value for every feature listed here.
required_features = ["sex", "outcome_type"]
dataset = dataset.dropna(subset=required_features)


## Reorder columns (TODO: no code for this step yet)

## Save cleaned dataset

In [36]:
# Write the engineered frame to disk as UTF-8 CSV, leaving out the row index.
clean_csv = 'Data/dogs_intakes_outcomes_clean.csv'
dataset.to_csv(clean_csv, encoding='utf-8', index=False)

## Save bins

In [37]:
# Persist the bucket edges returned by pd.qcut with joblib.
bins_file = 'Data/bins.pkl'
jl.dump(bins, bins_file)

['Data/bins.pkl']