# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import numpy as np
import re

#### Read in data

In [2]:
# Load the raw intake records from the project's data directory
intakes = pd.read_csv(filepath_or_buffer='../data/intakes.csv')

#### Standardize column names

In [3]:
# clean_names comes from the janitor (pyjanitor) package; called here through its functional API
intakes = janitor.clean_names(intakes)

## Clean columns

#### Name

In [4]:
# Proportion of names that start with "*" versus those that do not
intakes['name'].str.startswith("*").value_counts(normalize=True)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
#after doing some research, I couldn't find any info on why some names have the *
#regex=False: '*' is a regex quantifier, so it must be matched as a literal character
intakes['name'] = intakes['name'].str.replace('*', '', regex=False)

#### Date time

In [6]:
# Convert the datetime column to a pandas datetime dtype
intakes['datetime'] = intakes['datetime'].pipe(pd.to_datetime)

#### Month year

In [7]:
# we'll add this as a feature later if needed
intakes = intakes.drop('monthyear', axis='columns')

#### Found location

In [8]:
# Preview the raw found_location values before parsing them
intakes['found_location']

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

Values in this column follow one of two patterns: <br>
    1) A street address (usually starting with a house number) followed by "in __ (TX)" <br>
    2) A city only, formatted like "__ (TX)"

In [9]:
# Split on the LAST ' in ' only, so an address that itself contains ' in '
# still yields exactly [address, city]; city-only values yield a one-element list.
intakes['found_location_split'] = intakes['found_location'].str.rsplit(' in ', n=1)

In [10]:
# Rows that did not split (city only) have no address; otherwise take the first piece
intakes['found_address'] = [
    np.nan if len(parts) <= 1 else parts[0]
    for parts in intakes['found_location_split']
]

In [11]:
# City-only rows keep their single piece; split rows take the piece after the address
intakes['found_city'] = [
    parts[0] if len(parts) <= 1 else parts[1]
    for parts in intakes['found_location_split']
]

In [12]:
# function returns the string if it starts with a number otherwise returns NaN

def contains_numbers(string):
    """Return ``string`` if its first character is a digit, otherwise NaN.

    Despite the name, only the start of the string is inspected; digits that
    appear later do not count.

    Parameters
    ----------
    string : str
        Text to check. Non-string input (e.g. None or NaN) is treated as
        "no match" instead of raising.

    Returns
    -------
    str or float
        The unchanged input when it begins with a digit, ``np.nan`` otherwise.
    """
    # A missing or non-text value cannot start with a digit
    if not isinstance(string, str):
        return np.nan

    # re.match only matches at the beginning of the string (same as '^\d' with search)
    if re.match(r'\d', string) is None:
        return np.nan
    return string

In [13]:
# str() turns a missing address into the text 'nan', which does not start with a digit
intakes['found_address'] = [
    contains_numbers(str(address))
    for address in intakes['found_address']
]

#### Intake Type

In [14]:
# Frequency of each intake type
intakes['intake_type'].value_counts()

Stray                 87635
Owner Surrender       25085
Public Assist          7734
Wildlife               5047
Abandoned               385
Euthanasia Request      257
Name: intake_type, dtype: int64

#### Animal Type

In [15]:
# Frequency of each animal type
intakes['animal_type'].value_counts()

Dog          71465
Cat          47272
Other         6789
Bird           595
Livestock       22
Name: animal_type, dtype: int64

#### Sex upon intake

In [16]:
# Give 'Unknown' a second token so it splits into two parts like the other values.
# Series.replace matches whole values only, so re-running this cell cannot double it again.
intakes['sex_upon_intake'] = intakes['sex_upon_intake'].replace('Unknown', 'Unknown Unknown')

In [17]:
# Helper column holding the space-separated tokens of sex_upon_intake
intakes['sex_split'] = intakes['sex_upon_intake'].str.split(pat=' ')

In [18]:
# First token of sex_upon_intake (presumably the spay/neuter status, per the column name)
intakes['spay_neuter'] = intakes['sex_upon_intake'].str.split(pat=' ').str.get(0)

In [19]:
# Second token of sex_upon_intake (presumably the sex itself, per the column name)
intakes['sex'] = intakes['sex_upon_intake'].str.split(pat=' ').str.get(1)

#### Age upon intake

Strategy: split the number from the unit of time, then standardize everything to years. A helper function returns how many of each unit make up one year, and the number is divided by that value.

In [20]:
# The numeric part is the text before the first space of age_upon_intake
intakes['age_digit'] = [
    age_text.split(' ')[0]
    for age_text in intakes['age_upon_intake']
]

There are 7 negative values in the dataset that suggest the animal is less than __ years old (ex: -1 years old means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age numbers should not be taken as absolute truth

In [21]:
# Drop the minus sign so negative ages become their positive counterpart
intakes['age_digit'] = intakes['age_digit'].str.replace(pat='-', repl='')

In [22]:
# The digits are still text at this point; cast them to integers
intakes['age_digit'] = intakes['age_digit'].astype(int)

In [23]:
# The unit is the second space-separated token of age_upon_intake
intakes['age_unit'] = [
    age_text.split(' ')[1]
    for age_text in intakes['age_upon_intake']
]

In [24]:
# Singular unit labels and the plural form they are normalized to
age_unit_dict = dict(year='years', month='months', week='weeks', day='days')

# Labels missing from the dict map to NaN; fillna restores the original label for those
intakes['age_unit'] = (
    intakes['age_unit']
    .map(age_unit_dict)
    .fillna(intakes['age_unit'])
)

In [25]:
def age_in_years(unit):
    """Return how many of ``unit`` make up one year.

    Recognized units are 'years', 'months', 'weeks' and 'days'. Any other
    value falls through and yields None.
    """
    units_per_year = (
        ('years', 1),
        ('months', 12),
        ('weeks', 52.143),
        ('days', 365),
    )
    for label, per_year in units_per_year:
        if unit == label:
            return per_year
    return None

In [26]:
# Look up the units-per-year divisor for every row's age unit
intakes['age_conversion'] = list(map(age_in_years, intakes['age_unit']))



In [27]:
# Age expressed in years, rounded to two decimals
intakes['age_in_years'] = (intakes['age_digit'] / intakes['age_conversion']).round(2)

#### Breed

In [28]:
#Mixed breeds either contain "Mix" or "/"
#regex=False matches both markers literally; na=False flags a missing breed as not mixed
intakes['mix_flag'] = (
    intakes['breed'].str.contains(' Mix', regex=False, na=False)
    | intakes['breed'].str.contains('/', regex=False, na=False)
)

In [29]:
#Remove "Mix" from breed since we created a flag
#(keeps only the text that precedes the first " Mix")
intakes['breed'] = intakes['breed'].str.split(pat=' Mix').str.get(0)

In [30]:
#Split the "/"-separated breeds once, BEFORE overwriting breed with its first component;
#otherwise breed no longer contains "/" and breed_2 / breed_3 would always be NaN
breed_parts = intakes['breed'].str.split('/')
intakes['breed_2'] = breed_parts.str[1]
intakes['breed_3'] = breed_parts.str[2]
intakes['breed'] = breed_parts.str[0]

#### Color

In [33]:
#Primary color is the text before the first "/" ('/' needs no escaping; '\/' is an invalid escape sequence)
intakes['color_1'] = intakes['color'].str.split('/').str[0]

In [34]:
#Secondary color is the text after the first "/" (NaN when there is none); '/' needs no escaping
intakes['color_2'] = intakes['color'].str.split('/').str[1]

## Feature Creation

#### Intake Count

In [35]:
# Dense rank of each intake's datetime within its animal_id: 1 for the earliest, 2 for the next, ...
intakes['intake_count'] = (
    intakes
    .groupby('animal_id')['datetime']
    .rank(method='dense')
)

#### Dropping unnecessary columns

In [36]:
# Raw source columns and intermediate helper columns that are not kept in the output
cols_to_drop = [
    'found_location', 'sex_upon_intake', 'age_upon_intake',
    'found_location_split', 'sex_split',
    'age_digit', 'age_unit', 'age_conversion',
]

intakes = intakes.drop(cols_to_drop, axis='columns')

In [37]:
# Persist the cleaned table next to the raw file, without the index column
intakes.to_csv(path_or_buf='../data/intakes_clean.csv', index=False)