# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import numpy as np
import re

#### Read in data

In [2]:
# Load the raw intake records from the project's data folder.
raw_intakes_path = '../data/intakes.csv'
intakes = pd.read_csv(raw_intakes_path)

#### Standardize column names

In [3]:
# Standardise the column names with pyjanitor's clean_names, called as a plain
# function so the dependency on the janitor import is visible.
intakes = janitor.clean_names(intakes)

## Clean columns

#### Name

In [4]:
# Proportion of names that begin with an asterisk versus those that do not.
starts_with_asterisk = intakes['name'].str.startswith("*")
starts_with_asterisk.value_counts(normalize=True)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
# After doing some research, I couldn't find any info on why some names have the *.
# regex=False makes pandas treat '*' as a literal character. A lone '*' is not a
# valid regular expression, so the result should not hinge on the library's
# default for `regex`.
intakes['name'] = intakes['name'].str.replace('*', '', regex=False)

#### Date time

In [6]:
# Convert the intake timestamp column to datetime so the .dt accessor can be
# used on it in the following cells.
intakes['datetime'] = intakes['datetime'].pipe(pd.to_datetime)

#### Year and Month

In [7]:
# Discard the source's monthyear column; year and month are rebuilt from the
# parsed datetime column in the next cells.
intakes = intakes.drop('monthyear', axis='columns')

In [8]:
# Calendar year of the intake, taken from the parsed datetime.
intakes = intakes.assign(intake_year=intakes['datetime'].dt.year)

In [9]:
# Calendar month (1-12) of the intake, taken from the parsed datetime.
intakes = intakes.assign(intake_month=intakes['datetime'].dt.month)

#### Found location

In [10]:
# Look at the raw location strings to learn their format before parsing them.
found_location_raw = intakes['found_location']
found_location_raw

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

Values in this column follow one of two patterns: <br>
    1) A street address (usually starting with a number) followed by "in __ (TX)" <br>
    2) A city only, formatted like "__ (TX)"

In [11]:
# Split on the LAST ' in ' only, giving at most [address, city]. Splitting on
# every occurrence would push the city out of position 1 whenever the street
# part itself contains ' in '.
intakes['found_location_split'] = intakes['found_location'].str.rsplit(' in ', n=1)

In [12]:
# The address is the piece before ' in '. Rows holding only a city (a single
# piece) get NaN. Using the vectorised .str accessors also leaves a missing
# location as NaN instead of raising on len() of a non-list value.
location_parts = intakes['found_location_split']
intakes['found_address'] = location_parts.str[0].where(location_parts.str.len() > 1)

In [13]:
# The city is the second piece when an address is present, otherwise the only
# piece. The vectorised .str accessors keep a missing location as NaN instead
# of raising on len() of a non-list value.
location_parts = intakes['found_location_split']
has_address = location_parts.str.len() > 1
intakes['found_city'] = location_parts.str[1].where(has_address, location_parts.str[0])

In [14]:
# Remove the literal ' (TX)' suffix from the city. A plain substring match with
# regex=False avoids both the invalid '\(' escape in a normal string literal
# and any dependence on the library's default for `regex` (a literal match on
# the escaped pattern would find nothing and silently leave the suffix).
intakes['found_city'] = intakes['found_city'].str.replace(' (TX)', '', regex=False)

In [15]:
def contains_numbers(string):
    """Return ``string`` if it starts with a digit, otherwise NaN.

    Despite the name, only the first character is inspected: a value with
    digits later on (but not at the start) yields NaN.

    Parameters
    ----------
    string : str
        Candidate address text. Any non-string value (for example a missing
        value) is treated as "does not start with a digit".

    Returns
    -------
    str or float
        The unchanged input when it begins with a digit, else ``np.nan``.
    """
    # Guard first so a missing value produces NaN instead of a TypeError in re.
    if not isinstance(string, str):
        return np.nan

    # re.match anchors at the start of the string, same as searching for '^\d'.
    if re.match(r'\d', string) is None:
        return np.nan
    return string

In [16]:
# Keep only addresses that start with a digit. Every value is converted to text
# first, so a missing address is checked as the string 'nan' and ends up NaN.
intakes['found_address'] = intakes['found_address'].map(
    lambda address: contains_numbers(str(address))
)

#### Intake Type

In [17]:
# Frequency of each intake type; the column is only inspected here.
intake_type_counts = intakes['intake_type'].value_counts()
intake_type_counts

Stray                 87635
Owner Surrender       25085
Public Assist          7734
Wildlife               5047
Abandoned               385
Euthanasia Request      257
Name: intake_type, dtype: int64

#### Animal Type

In [18]:
# Frequency of each animal type; the column is only inspected here.
animal_type_counts = intakes['animal_type'].value_counts()
animal_type_counts

Dog          71465
Cat          47272
Other         6789
Bird           595
Livestock       22
Name: animal_type, dtype: int64

#### Sex upon intake

In [19]:
# Turn 'Unknown' into two words so that splitting on a space below yields
# 'Unknown' for both the first and the second piece.
intakes['sex_upon_intake'] = intakes['sex_upon_intake'].str.replace(
    'Unknown', 'Unknown Unknown', regex=False
)

In [20]:
# Helper column holding the space-separated pieces of sex_upon_intake; it is
# removed again in the final column clean-up.
intakes['sex_split'] = intakes['sex_upon_intake'].str.split(pat=' ')

In [21]:
# First word of sex_upon_intake.
sex_words = intakes['sex_upon_intake'].str.split(' ')
intakes['spay_neuter'] = sex_words.str.get(0)

In [22]:
# Second word of sex_upon_intake.
sex_words = intakes['sex_upon_intake'].str.split(' ')
intakes['sex'] = sex_words.str.get(1)

#### Age upon intake

The strategy is to split the number from the unit of time, then standardize to years by dividing the number by how many of that unit make up one year (e.g. months are divided by 12).

In [23]:
# Leading token of each age string (the numeric part), still stored as text.
intakes['age_digit'] = intakes['age_upon_intake'].map(
    lambda age_text: str.split(age_text, ' ')[0]
)

There are 7 negative values in the dataset, which suggest the animal is less than __ years old (e.g. -1 years old means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age numbers should not be taken as absolute truth.

In [24]:
# Drop the minus sign so negative ages become their positive counterpart.
intakes['age_digit'] = intakes['age_digit'].str.replace('-', '', regex=False)

In [25]:
# The digits are still text at this point; cast them to integers for arithmetic.
age_digit_text = intakes['age_digit']
intakes['age_digit'] = age_digit_text.astype(int)

In [26]:
# Second token of each age string (the unit of time).
intakes['age_unit'] = intakes['age_upon_intake'].map(
    lambda age_text: str.split(age_text, ' ')[1]
)

In [27]:
# Map singular unit names to their plural form so each unit has one spelling.
age_unit_dict = {unit: unit + 's' for unit in ('year', 'month', 'week', 'day')}

# Units without a dictionary entry come back as NaN from map(); for those rows
# keep the original value.
pluralized_unit = intakes['age_unit'].map(age_unit_dict)
intakes['age_unit'] = pluralized_unit.where(pluralized_unit.notna(), intakes['age_unit'])

In [28]:
def age_in_years(unit):
    """Return how many of ``unit`` make up one year.

    The result is used as a divisor: an age expressed in ``unit`` divided by
    this number gives the age in years. Units other than 'years', 'months',
    'weeks' and 'days' return None.
    """
    units_per_year = {
        'years': 1,
        'months': 12,
        'weeks': 52.143,
        'days': 365,
    }
    return units_per_year.get(unit)

In [29]:
# Divisor for each row: how many of that row's unit make up one year.
conversion_factors = [age_in_years(unit) for unit in intakes['age_unit']]
intakes['age_conversion'] = conversion_factors



In [30]:
# Age in years = numeric age / units per year, rounded to two decimals.
age_ratio = intakes['age_digit'] / intakes['age_conversion']
intakes['age_in_years'] = age_ratio.round(2)

#### Categorical age grouping

In [31]:
def age_group(number):
    """Bucket an age in years into a coarse label.

    Buckets are contiguous half-open ranges:
    ``< 1`` -> 'under 1', ``[1, 3)`` -> 'between 1 and 3',
    ``[3, 5)`` -> 'between 3 and 5', ``>= 5`` -> 'over 5'.

    A missing age (NaN) is returned unchanged instead of being labelled.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # every comparison below is False and a missing age lands in 'over 5'.
    if number != number:
        return number

    if number < 1:
        return 'under 1'
    # Upper bounds are the bucket edges themselves (3 and 5), so values such
    # as 2.99 or 4.99 stay in their own bucket rather than falling through
    # to 'over 5'.
    if number < 3:
        return 'between 1 and 3'
    if number < 5:
        return 'between 3 and 5'
    return 'over 5'

In [32]:
# Label every row with its age bucket.
intakes['age_group'] = intakes['age_in_years'].map(age_group)

#### Breed

In [33]:
# 'Black and Tan Hound' is listed as 'Black/Tan Hound'; the slash in that name
# causes problems for the mix flag and the breed split below, so rewrite it.
intakes['breed'] = intakes['breed'].str.replace(
    'Black/Tan Hound', 'Black and Tan Hound', regex=False
)

In [34]:
# Mixed breeds either contain " Mix" or a "/".
# Both checks are literal substring tests (regex=False), which avoids the
# invalid '\/' escape in a normal string literal. na=False makes a missing
# breed count as "not mixed"; passed through np.where, a NaN result would be
# truthy and flag the row as True.
breed_text = intakes['breed']
intakes['mix_flag'] = (
    breed_text.str.contains(' Mix', regex=False, na=False)
    | breed_text.str.contains('/', regex=False, na=False)
)

In [35]:
# Flag for breeds containing "Pit Bull".
# str.contains already returns booleans, so no np.where wrapper is needed.
# na=False makes a missing breed False; passed through np.where, a NaN result
# would be truthy and flag the row as True.
intakes['pit_flag'] = intakes['breed'].str.contains('Pit Bull', regex=False, na=False)

In [36]:
# Split the breed on a plain '/'. The slash needs no escaping, and '\/' in a
# normal string literal is an invalid escape sequence.
intakes['breed_split'] = intakes['breed'].str.split('/')

In [37]:
# Put multi-breed lists in alphabetical order so the same pair of breeds is
# recorded the same way whichever order it was entered in. Single-breed lists
# are passed through untouched.
intakes['breed_split_sort'] = intakes['breed_split'].map(
    lambda breeds: sorted(breeds) if len(breeds) > 1 else breeds
)

In [38]:
# First and second entries of the sorted breed list; breed_2 is missing when
# the list has only one entry.
sorted_breeds = intakes['breed_split_sort']
intakes['breed_1'] = sorted_breeds.str.get(0)
intakes['breed_2'] = sorted_breeds.str.get(1)

In [39]:
# Remove the ' Mix' marker from both breed columns (mix status is already
# captured in mix_flag).
for breed_column in ('breed_1', 'breed_2'):
    intakes[breed_column] = intakes[breed_column].str.replace(' Mix', '', regex=False)

#### Color

In [40]:
# First colour: the text before the first '/'. The slash is split on as a
# plain character; '\/' in a normal string literal is an invalid escape.
intakes['color_1'] = intakes['color'].str.split('/').str[0]

In [41]:
# Second colour: the text after the first '/', missing when there is no slash.
# The slash is split on as a plain character; '\/' in a normal string literal
# is an invalid escape.
intakes['color_2'] = intakes['color'].str.split('/').str[1]

## Additional Feature Creation

#### Category
This will be used for an intermediate table along the way to properly align intakes and outcomes

In [42]:
# Constant label marking every row of this table as an intake record.
intakes = intakes.assign(category='intake')

#### Dropping unnecessary columns

In [43]:
# Source columns that were parsed into new features above, plus the helper
# columns created along the way.
cols_to_drop = [
    'found_location', 'sex_upon_intake', 'age_upon_intake',
    'found_location_split', 'sex_split',
    'age_digit', 'age_unit', 'age_conversion',
    'breed_split', 'breed_split_sort',
]

intakes = intakes.drop(cols_to_drop, axis='columns')

In [44]:
# Write the cleaned table next to the raw file, without the row index.
clean_intakes_path = '../data/intakes_clean.csv'
intakes.to_csv(clean_intakes_path, index=False)

In [49]:
# Sort the intake timestamps to see the earliest and latest records.
sorted_datetimes = intakes['datetime'].sort_values()
sorted_datetimes

67176    2013-10-01 07:51:00
106680   2013-10-01 08:33:00
2185     2013-10-01 08:33:00
35879    2013-10-01 08:33:00
108286   2013-10-01 08:53:00
                 ...        
126136   2021-05-15 16:15:00
126135   2021-05-15 16:36:00
126137   2021-05-15 18:04:00
126138   2021-05-16 08:01:00
126142   2021-05-16 09:57:00
Name: datetime, Length: 126143, dtype: datetime64[ns]