# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import numpy as np

#### Read in data

In [2]:
# Load the raw intakes CSV (path is relative to this notebook's folder).
intakes = pd.read_csv(filepath_or_buffer='../data/intakes.csv')

#### Standardize column names

In [3]:
# clean_names is not a built-in pandas method: it becomes available on DataFrames
# through the `import janitor` in the imports cell. The rest of the notebook
# relies on the resulting snake_case column names (e.g. 'found_location').
intakes = intakes.clean_names() #from janitor package

## Clean columns

#### Name

In [4]:
# Share of names that start with an asterisk (missing names are left out of the
# tally because value_counts drops NaN by default).
(
    intakes['name']
    .str.startswith("*")
    .value_counts(normalize=True)
)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
# After doing some research, I couldn't find any info on why some names have the *,
# so it is simply removed.
# regex=False makes '*' a literal character: interpreted as a regular expression,
# a lone '*' is an invalid pattern ("nothing to repeat"), so the result would
# otherwise depend on the installed pandas version's default for `regex`.
intakes['name'] = intakes['name'].str.replace('*', '', regex=False)

#### Date time

In [6]:
# Parse the intake timestamp strings into pandas datetimes.
intakes['datetime'] = intakes['datetime'].pipe(pd.to_datetime)

#### Month year

In [7]:
# 'monthyear' is dropped here; it can be re-created as a feature later if needed.
intakes = intakes.drop('monthyear', axis='columns')

#### Found location

In [8]:
# Peek at the raw location strings before parsing them.
intakes.loc[:, 'found_location']

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

Each value in this column follows one of two patterns: <br>
    1) A street address (usually starting with a house number) followed by " in __ (TX)" <br>
    2) Only a city, formatted like "__ (TX)"

In [9]:
# Split on the LAST ' in ' only (rsplit with n=1), so each value yields at most
# [address, city]. Splitting on every ' in ' would break an address that itself
# contains ' in ' into extra pieces and misplace the city.
intakes['found_location_split'] = intakes['found_location'].str.rsplit(' in ', n=1)

In [10]:
# The address is everything before the final piece (the city); values that hold
# only a city get NaN. The isinstance guard keeps a missing location (NaN, which
# has no len()) from raising a TypeError, and re-joining parts[:-1] preserves an
# address that was itself split on ' in '.
intakes['found_address'] = [
    ' in '.join(parts[:-1]) if isinstance(parts, list) and len(parts) > 1 else np.nan
    for parts in intakes['found_location_split']
]

In [11]:
# The city is always the LAST piece of the split: the only piece for city-only
# values, the second piece for "address in city". Taking the last element (rather
# than index 1) stays correct when the split produced more than two pieces, and
# the .str accessor passes missing locations through as NaN instead of raising.
intakes['found_city'] = intakes['found_location_split'].str[-1]

In [12]:
# 25 most frequent addresses -- the same place appears under several spellings,
# so this is a candidate for fuzzy matching.
intakes['found_address'].value_counts().iloc[:25] #potentially fuzzy match?

7201 Levander Loop                   846
4434 Frontier Trl                    208
124 W Anderson Ln                    193
4434 Frontier Trail                  178
124 West Anderson Lane               163
1156 W Cesar Chavez                  147
12034 Research Blvd                  136
12034 Research                       115
1834 Ferguson                         90
1156 W Cesar Chavez St                82
4106 N Lamar                          74
4106 N Lamar Blvd                     70
14811 Chicadee                        69
4434 Frontier                         64
1156 West Cesar Chavez                62
508 East Howard Lane                  62
5800 Techni Center Dr                 59
7619 Scenic Brook                     53
1156 Cesar Chavez                     53
6600 Elm Creek Dr                     48
7601 Daffan Lane                      48
600 Barwood Park                      47
9201 Circuit Of The Americas Blvd     45
21413 Webber Oaks Cv                  45
1601 E Slaughter

#### Intake Type

In [13]:
# Frequency of each intake type.
intakes.loc[:, 'intake_type'].value_counts()

Stray                 87635
Owner Surrender       25085
Public Assist          7734
Wildlife               5047
Abandoned               385
Euthanasia Request      257
Name: intake_type, dtype: int64

#### Animal Type

In [14]:
# Frequency of each animal type.
intakes.loc[:, 'animal_type'].value_counts()

Dog          71465
Cat          47272
Other         6789
Bird           595
Livestock       22
Name: animal_type, dtype: int64

#### Sex upon intake

In [15]:
# Frequency of each raw "<status> <sex>" value before it is split apart.
intakes.loc[:, 'sex_upon_intake'].value_counts()

Intact Male      41015
Intact Female    38794
Neutered Male    19389
Spayed Female    16530
Unknown          10414
Name: sex_upon_intake, dtype: int64

In [16]:
# 'Unknown' is a single word, so it is doubled to give it the same two-word shape
# as the other values before splitting on the space.
# Series.replace matches the WHOLE value, unlike the substring-based str.replace,
# so re-running this cell leaves 'Unknown Unknown' untouched instead of doubling
# it again.
intakes['sex_upon_intake'] = intakes['sex_upon_intake'].replace('Unknown', 'Unknown Unknown')

In [17]:
# Break each value into its space-separated words, e.g. ['Intact', 'Male'].
intakes['sex_split'] = intakes['sex_upon_intake'].str.split(pat=' ')

In [18]:
# Confirm every non-missing value was split into exactly two words.
intakes.loc[:, 'sex_split'].value_counts()

[Intact, Male]        41015
[Intact, Female]      38794
[Neutered, Male]      19389
[Spayed, Female]      16530
[Unknown, Unknown]    10414
Name: sex_split, dtype: int64

In [19]:
# Spot check: first word (spay/neuter status) of the row labelled 1.
intakes.loc[1, 'sex_split'][0]

'Spayed'

In [20]:
# Spot check: second word (sex) of the row labelled 0.
intakes.loc[0, 'sex_split'][1]

'Male'

In [21]:
# First word of "<status> <sex>": Intact / Neutered / Spayed / Unknown.
intakes['spay_neuter'] = (
    intakes['sex_upon_intake']
    .str.split(' ')
    .str.get(0)
)

In [22]:
# Second word of "<status> <sex>": Male / Female / Unknown.
intakes['sex'] = (
    intakes['sex_upon_intake']
    .str.split(' ')
    .str.get(1)
)

#### Age upon intake

In [23]:
# Frequency of each raw "<number> <unit>" age string.
intakes.loc[:, 'age_upon_intake'].value_counts()

1 year       22083
2 years      19446
1 month      12052
3 years       7537
2 months      6823
4 weeks       4515
4 years       4513
5 years       4128
3 weeks       3701
3 months      3326
4 months      3237
5 months      3112
6 years       2755
2 weeks       2586
6 months      2414
7 years       2357
8 years       2308
7 months      1885
10 years      1856
9 months      1844
8 months      1507
9 years       1346
1 week        1054
10 months     1030
1 weeks        927
12 years       888
11 months      806
0 years        765
11 years       756
1 day          671
3 days         591
13 years       583
2 days         500
14 years       395
15 years       341
4 days         329
6 days         327
5 weeks        320
5 days         188
16 years       146
17 years        82
18 years        50
19 years        27
20 years        20
22 years         5
-1 years         5
25 years         1
24 years         1
-3 years         1
23 years         1
-2 years         1
21 years         1
Name: age_up

Strategy: split the number from the unit of time, then standardize everything to years. To do that, write a function that returns how many of each unit make up one year, and divide the number by that factor.

In [24]:
# Numeric part of "<number> <unit>" (still a string at this point).
# The vectorized .str accessor gives the same result as a Python-level
# str.split per row, but passes a missing age through as NaN instead of raising
# a TypeError, and matches how the sex columns are split above.
intakes['age_digit'] = intakes['age_upon_intake'].str.split(' ').str[0]

In [25]:
# Distribution of the numeric part -- note the handful of negative values.
intakes.loc[:, 'age_digit'].value_counts()

1     36787
2     29355
3     15155
4     12594
5      7748
6      5496
7      4242
8      3815
9      3190
10     2886
11     1562
12      888
0       765
13      583
14      395
15      341
16      146
17       82
18       50
19       27
20       20
22        5
-1        5
21        1
-2        1
-3        1
23        1
24        1
25        1
Name: age_digit, dtype: int64

There are 7 negative values in the dataset that suggest the animal is less than __ years old (ex: -1 years old means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age numbers should not be taken as absolute truth

In [26]:
# Drop the minus sign so the negative estimates become positive numbers.
intakes['age_digit'] = intakes['age_digit'].str.replace(pat='-', repl='')

In [27]:
# The digits are still strings; cast them to integers for arithmetic.
intakes['age_digit'] = intakes['age_digit'].astype(int)

In [28]:
# Unit part of "<number> <unit>" (year(s), month(s), week(s), day(s)).
# The vectorized .str accessor gives the same result as a Python-level
# str.split per row, but passes a missing age through as NaN instead of raising
# a TypeError.
intakes['age_unit'] = intakes['age_upon_intake'].str.split(' ').str[1]

In [29]:
# Units appear in both singular and plural form.
intakes.loc[:, 'age_unit'].value_counts()

years     50315
months    25984
year      22083
month     12052
weeks     12049
days       1935
week       1054
day         671
Name: age_unit, dtype: int64

In [30]:
# Singular -> plural spelling for each unit; plural values are not keys and are
# therefore left as they are.
age_unit_dict = {'year': 'years', 'month': 'months', 'week': 'weeks', 'day': 'days'}

# Look each unit up in the mapping, falling back to the value itself when it is
# not a key (already plural, or missing).
intakes['age_unit'] = intakes['age_unit'].map(
    lambda unit: age_unit_dict.get(unit, unit)
)

In [31]:
# After normalisation only the four plural units should remain.
intakes.loc[:, 'age_unit'].value_counts()

years     72398
months    38036
weeks     13103
days       2606
Name: age_unit, dtype: int64

In [32]:
def age_in_years(unit):
    """Return how many of `unit` make up one year.

    Dividing an age expressed in `unit` by the returned factor converts it
    to years.

    Parameters
    ----------
    unit : str
        One of 'years', 'months', 'weeks' or 'days'.

    Returns
    -------
    int or float or None
        1, 12, 52.143 or 365 respectively; None for any other value.
    """
    units_per_year = (
        ('years', 1),
        ('months', 12),
        ('weeks', 52.143),
        ('days', 365),
    )
    for known_unit, factor in units_per_year:
        if unit == known_unit:
            return factor
    # Unrecognised unit: no conversion factor is available.
    return None

In [33]:
# Units-per-year factor for each row's unit.
intakes['age_conversion'] = intakes['age_unit'].map(age_in_years)



In [34]:
# Age in years = number / units-per-year, rounded to two decimals.
intakes['age_in_years'] = (
    intakes['age_digit']
    .div(intakes['age_conversion'])
    .round(2)
)

#### Breed

In [35]:
# Number of distinct (non-missing) breed labels.
len(intakes['breed'].dropna().unique())

2647

## Feature Creation

#### Intake Count

In [36]:
# Within each animal_id, rank the intake datetimes in ascending order with dense
# ranking: the earliest intake gets 1, the next distinct datetime 2, and so on.
# NOTE(review): rank() returns floats (1.0, 2.0, ...) -- confirm that is the
# intended dtype for a count.
intakes['intake_count'] = (
    intakes
    .groupby('animal_id')['datetime']
    .rank(method='dense')
)

#### Dropping unnecessary columns

In [37]:
# Raw columns and intermediate helpers that the cleaned features above replace.
cols_to_drop = [
    'found_location', 'sex_upon_intake', 'age_upon_intake',  # raw source columns
    'found_location_split', 'sex_split',                     # intermediate splits
    'age_digit', 'age_unit', 'age_conversion',               # pieces of age_in_years
]

intakes = intakes.drop(cols_to_drop, axis='columns')

In [38]:
# Persist the cleaned frame next to the raw file, without the index column.
intakes.to_csv(path_or_buf='../data/intakes_clean.csv', index=False)