# Feature Engineering

## Import libraries

In [22]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import dataset

In [23]:
# Read the working dogs intake/outcome CSV into `dataset`.
# The relative path is resolved against the current working directory.
dataset = pd.read_csv(
    filepath_or_buffer="Data/dogs_intakes_outcomes_working.csv",
)

## Split sex upon intake into two features

In [24]:
# Derive `sex` and `spay/neuter` from the combined `sex_upon_intake` label.
#
# Rows whose label is missing, or names neither 'Male' nor 'Female', get NaN
# in both derived columns instead of silently defaulting to 'Female' / 'Yes'
# (or, for missing values, to 'Male' / 'No'), so a dropna on `sex` can
# actually remove them.
# NOTE(review): assumes female labels literally contain 'Female'
# (e.g. 'Spayed Female') - confirm against the raw values.
sex_upon_intake = dataset['sex_upon_intake']

# Case-sensitive match: 'Female' does not contain a capital-M 'Male'.
dataset['sex'] = sex_upon_intake.str.extract(r'(Male|Female)', expand=False)

# 'Intact' means not sterilised; any other label with a recognised sex is
# treated as sterilised, exactly as before. Unknown sex -> NaN.
is_intact = sex_upon_intake.str.contains('Intact', na=False)
dataset['spay/neuter'] = (
    pd.Series(np.where(is_intact, 'No', 'Yes'), index=dataset.index)
    .where(dataset['sex'].notna())
)

## Denote if intake is mixed or purebred

In [25]:
# Flag each dog as 'Mixed' or 'Purebred' from its breed string: a breed
# counts as mixed when it carries the 'Mix' marker or lists several breeds
# separated by '/'.
breed_names = dataset['breed']
labelled_mix = breed_names.str.contains('Mix')
slash_cross = breed_names.str.contains('/')
dataset['mixed_breed'] = np.where(labelled_mix | slash_cross, 'Mixed', 'Purebred')

## Split color feature into two colors and one pattern feature

In [26]:
# Vocabulary of coat-pattern words searched for inside the colour text.
coat_patterns = [
    'bicolor', 'tricolor', 'merle', 'tuxedo', 'harlequin', 'spotted',
    'tick', 'ticked', 'fleck', 'flecked', 'speck', 'speckled',
    'brindle', 'saddle', 'blanket', 'sable', 'hairless',
]

# One capture group alternating over every pattern word.
# NOTE: alternatives are tried left to right, so 'tick', 'fleck' and 'speck'
# win over 'ticked', 'flecked' and 'speckled'; the longer forms are never
# returned by the extraction below.
coat_reg = '({})'.format('|'.join(coat_patterns))

# Normalise colours to lower case so they match the lower-case vocabulary,
# then pull out the first pattern word found (NaN when there is none).
color_lower = dataset['color'].str.lower()
dataset['color'] = color_lower
dataset['coat_pattern'] = color_lower.str.extract(coat_reg, expand=False)

## Combine colors and coat pattern into a single coat field

In [27]:
# Split the '/'-separated colour string into one column per part; the first
# two parts are named `color1` and `color2` (any further parts keep the
# integer labels produced by the split).
color_parts = (
    dataset['color']
    .str.split('/', expand=True)
    .rename(columns={0: 'color1', 1: 'color2'})
)

# Remove split columns left over from a previous run of this cell before
# appending the fresh ones. Without this, re-running the cell duplicates the
# `color1`/`color2` labels, `dataset['color1']` becomes a DataFrame, and the
# assignments below fail.
dataset = pd.concat(
    [dataset.drop(columns=color_parts.columns, errors='ignore'), color_parts],
    axis=1,
)

# An empty first colour is relabelled as 'Breed Specific'.
dataset['color1'] = np.where(dataset['color1'] == '', 'Breed Specific', dataset['color1'])

# `coat` is the extracted pattern for 'Breed Specific' rows, otherwise the
# first colour.
dataset['coat'] = np.where(dataset['color1'] == 'Breed Specific', dataset['coat_pattern'], dataset['color1'])

## Bin the continuous time-in-shelter column into quantile buckets

In [28]:
# Quantile-bin the time spent in the shelter at two resolutions
# (12 and 6 buckets); each new column holds the interval a row falls into.
dataset['time_in_shelter_days_12'] = pd.qcut(dataset['time_in_shelter_days'], q=12)
dataset['time_in_shelter_days_6'] = pd.qcut(dataset['time_in_shelter_days'], q=6)

# Kept as the cell's last expression so the notebook actually displays the
# summary; as the first statement its result was silently discarded.
dataset['time_in_shelter_days'].describe()


## Create bully breed field

In [32]:
# Tag a dog as 'Bully' when its breed string mentions any of the three
# bully-type breed names below; everything else is 'Not Bully'.
breed_text = dataset['breed']
mentions_pit_bull = breed_text.str.contains('Pit Bull')
mentions_staffordshire = breed_text.str.contains('Staffordshire')
mentions_bull_terrier = breed_text.str.contains('Bull Terrier')

is_bully = mentions_pit_bull | mentions_staffordshire | mentions_bull_terrier
dataset['Bully_breed'] = np.where(is_bully, 'Bully', 'Not Bully')

## Create new feature for puppy/adult

In [33]:
# Puppies are defined as younger than PUPPY_MAX_AGE_DAYS (365 days, i.e. one
# year) at intake; every other row, including rows with a missing age, is
# labelled 'Dog'.
# NOTE(review): the original comment said "6 months" while the code compared
# against 365 days. The 365-day cutoff is kept so outputs are unchanged -
# confirm which threshold is intended.
PUPPY_MAX_AGE_DAYS = 365

dataset['Puppy/Dog_intake'] = np.where(
    dataset['age_upon_intake_(days)'] < PUPPY_MAX_AGE_DAYS, 'Puppy', 'Dog'
)

## Drop rows with NaN values in important features

In [34]:
# Discard, in place, every row that is missing a value in either of the
# fields the analysis depends on.
required_fields = ["sex", "outcome_type"]
dataset.dropna(subset=required_fields, inplace=True)

# Reorder columns

## Save cleaned dataset

In [35]:
# Write the engineered dataset to disk as UTF-8 CSV, without the index column.
dataset.to_csv(
    path_or_buf='Data/dogs_intakes_outcomes_clean.csv',
    index=False,
    encoding='utf-8',
)