# Feature Engineering

## Import libraries

In [19]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import dataset

In [20]:
dataset = pd.read_csv("Data/dogs_intakes_outcomes_working.csv")

## Split sex upon intake into two features

In [21]:
dataset['sex'] = np.where(dataset['sex_upon_intake'].str.contains('Male'), 'Male', 'Female')    
dataset['spay/neuter'] = np.where(dataset['sex_upon_intake'].str.contains('Intact'), 'No', 'Yes')

In [22]:
dataset.dropna(subset = ["sex"], inplace=True)

age_upon_outcome              0.000000
animal_id_outcome             0.000000
date_of_birth                 0.000000
outcome_subtype               0.696513
outcome_type                  0.000044
sex_upon_outcome              0.000022
age_upon_outcome_(days)       0.000000
age_upon_outcome_(years)      0.000000
age_upon_outcome_age_group    0.000000
outcome_datetime              0.000000
outcome_month                 0.000000
outcome_year                  0.000000
outcome_monthyear             0.000000
outcome_weekday               0.000000
outcome_hour                  0.000000
outcome_number                0.000000
dob_year                      0.000000
dob_month                     0.000000
dob_monthyear                 0.000000
age_upon_intake               0.000000
animal_id_intake              0.000000
animal_type                   0.000000
breed                         0.000000
color                         0.000000
found_location                0.000000
intake_condition         

## Denote if intake is mixed or purebred

In [24]:
dataset['mixed_breed'] = np.where((dataset['breed'].str.contains('Mix')) | (dataset['breed'].str.contains('/')), True, False)

## Split color feature into two colors and one pattern feature

In [25]:
coat_patterns = ['bicolor', 'tricolor', 'merle', 'tuxedo', 'harlequin', 'spotted', 'tick', 'ticked', 'fleck', 
                 'flecked', 'speck', 'speckled', 'brindle', 'saddle', 'blanket', 'sable', 'hairless']
coat_reg = '(' + '|'.join(coat_patterns) + ')'

dataset['color'] = dataset['color'].str.lower()

dataset['coat_pattern'] = dataset['color'].str.extract(coat_reg, expand=False)

# Concat colors and coat into coat field

In [26]:
dataset = pd.concat([dataset, 
                     dataset['color'].str.split('/', expand=True).rename(columns={0:'color1', 1:'color2'})], axis=1)

dataset['color1'] = np.where(dataset['color1'] == '', 'Breed Specific', dataset['color1'])

dataset['coat'] = np.where(dataset['color1'] == 'Breed Specific', dataset['coat_pattern'], dataset['color1'])

# Turn weekday strings into numeric values

In [None]:
dataset['intake_weekday_num'] = np.

# Bin continuous column into multiple buckets

In [27]:
dataset['time_in_shelter_days'].describe()
pd.qcut(dataset['time_in_shelter_days'], q=6)


0        (-0.001, 1.04]
1         (1.04, 3.853]
2         (1.04, 3.853]
3        (3.853, 5.065]
4        (-0.001, 1.04]
              ...      
45361    (3.853, 5.065]
45362    (5.065, 8.161]
45363    (-0.001, 1.04]
45364     (1.04, 3.853]
45365    (-0.001, 1.04]
Name: time_in_shelter_days, Length: 45366, dtype: category
Categories (6, interval[float64]): [(-0.001, 1.04] < (1.04, 3.853] < (3.853, 5.065] < (5.065, 8.161] < (8.161, 19.116] < (19.116, 1606.194]]

## Save cleaned dataset

In [28]:
dataset.to_csv('Data/dogs_intakes_outcomes_clean.csv', index=False, encoding='utf-8')

In [29]:
dataset.apply(lambda x: sum(x.isnull()/len(dataset)))

age_upon_outcome              0.000000
animal_id_outcome             0.000000
date_of_birth                 0.000000
outcome_subtype               0.696513
outcome_type                  0.000044
sex_upon_outcome              0.000022
age_upon_outcome_(days)       0.000000
age_upon_outcome_(years)      0.000000
age_upon_outcome_age_group    0.000000
outcome_datetime              0.000000
outcome_month                 0.000000
outcome_year                  0.000000
outcome_monthyear             0.000000
outcome_weekday               0.000000
outcome_hour                  0.000000
outcome_number                0.000000
dob_year                      0.000000
dob_month                     0.000000
dob_monthyear                 0.000000
age_upon_intake               0.000000
animal_id_intake              0.000000
animal_type                   0.000000
breed                         0.000000
color                         0.000000
found_location                0.000000
intake_condition         