# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import numpy as np
import re

#### Read in data

In [2]:
# Load the raw intake records (path is relative to the notebook's working directory).
intakes = pd.read_csv(filepath_or_buffer='../data/intakes.csv')

#### Standardize column names

In [3]:
# Standardize the column labels. clean_names() is not a core pandas method; it is
# made available by the `import janitor` in the imports cell. Later cells address
# columns by lower-case, underscore-separated names (e.g. 'found_location').
intakes = intakes.clean_names() #from janitor package

## Clean columns

#### Name

In [4]:
# What share of the non-missing names begin with an asterisk?
has_star_prefix = intakes['name'].str.startswith("*")
has_star_prefix.value_counts(normalize=True)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
# After doing some research, I couldn't find any info on why some names have the '*',
# so it is removed. regex=False makes pandas treat '*' as a literal character on
# every pandas version; on its own, '*' is not a valid regular expression.
intakes['name'] = intakes['name'].str.replace('*', '', regex=False)

#### Date time

In [6]:
# Convert the intake timestamp strings into pandas datetimes.
intakes['datetime'] = intakes['datetime'].pipe(pd.to_datetime)

#### Month year

In [7]:
# 'monthyear' is dropped here; we'll add it back as a feature later if needed.
intakes = intakes.drop('monthyear', axis='columns')

#### Found location

In [8]:
# Look at the raw location strings before parsing them.
intakes.loc[:, 'found_location']

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

Values in this column follow one of two patterns: <br>
    1) A street address (usually starting with a number) followed by "in __ (TX)" <br>
    2) A city only, formatted like "__ (TX)"

In [9]:
# Separate the trailing "<city> (TX)" part from the street address.
# Splitting once from the right keeps a street name that itself contains " in "
# in one piece, so every value becomes either [city] or [address, city].
intakes['found_location_split'] = intakes['found_location'].str.rsplit(' in ', n=1)

In [10]:
# Address = everything before the final piece (the city). A value with a single
# piece is a city only, so it has no address. Re-joining with ' in ' restores
# street names that themselves contained " in ".
intakes['found_address'] = [
    ' in '.join(parts[:-1]) if len(parts) > 1 else np.nan
    for parts in intakes['found_location_split']
]

In [11]:
# City is always the last piece: the only piece for city-only values, and the
# piece after the final " in " otherwise (x[1] would be wrong when the street
# name itself produced extra pieces).
intakes['found_city'] = [parts[-1] for parts in intakes['found_location_split']]

In [12]:
def contains_numbers(string):
    """Return ``string`` if it starts with a digit, otherwise ``np.nan``.

    Despite the name, only the first character is inspected.

    Parameters
    ----------
    string : str
        Candidate address text. Anything that is not a ``str`` (for example a
        missing value) is treated as "no address".

    Returns
    -------
    str or float
        The unchanged input when it begins with a digit, else ``np.nan``.
    """
    # Guard against missing values so callers do not have to stringify first.
    if not isinstance(string, str):
        return np.nan

    # re.match anchors at the start of the string; the raw string avoids the
    # invalid '\d' escape sequence in a normal string literal.
    if re.match(r'\d', string) is None:
        return np.nan
    return string

In [13]:
# Keep only addresses that begin with a digit; everything else becomes NaN.
# Values are stringified first, so a missing address is checked as the text 'nan'.
intakes['found_address'] = list(map(contains_numbers, map(str, intakes['found_address'])))

#### Intake Type

In [14]:
# Frequency of each intake type.
intake_type_counts = intakes['intake_type'].value_counts()
intake_type_counts

Stray                 87635
Owner Surrender       25085
Public Assist          7734
Wildlife               5047
Abandoned               385
Euthanasia Request      257
Name: intake_type, dtype: int64

#### Animal Type

In [15]:
# Frequency of each animal type.
animal_type_counts = intakes['animal_type'].value_counts()
animal_type_counts

Dog          71465
Cat          47272
Other         6789
Bird           595
Livestock       22
Name: animal_type, dtype: int64

#### Sex upon intake

In [16]:
# Frequency of each combined "status sex" label.
sex_upon_intake_counts = intakes['sex_upon_intake'].value_counts()
sex_upon_intake_counts

Intact Male      41015
Intact Female    38794
Neutered Male    19389
Spayed Female    16530
Unknown          10414
Name: sex_upon_intake, dtype: int64

In [17]:
# Give 'Unknown' two words so it splits into (status, sex) like the other labels.
# Series.replace matches whole values only, so re-running this cell is a no-op
# instead of growing the label to 'Unknown Unknown Unknown Unknown'.
intakes['sex_upon_intake'] = intakes['sex_upon_intake'].replace({'Unknown': 'Unknown Unknown'})

In [18]:
# Break each label into its space-separated words.
intakes['sex_split'] = intakes['sex_upon_intake'].str.split(pat=' ')

In [19]:
# Check that every label split into exactly two words.
sex_split_counts = intakes['sex_split'].value_counts()
sex_split_counts

[Intact, Male]        41015
[Intact, Female]      38794
[Neutered, Male]      19389
[Spayed, Female]      16530
[Unknown, Unknown]    10414
Name: sex_split, dtype: int64

In [20]:
# Spot check: first word (spay/neuter status) of the row labelled 1.
intakes.loc[1, 'sex_split'][0]

'Spayed'

In [21]:
# Spot check: second word (sex) of the row labelled 0.
intakes.loc[0, 'sex_split'][1]

'Male'

In [22]:
# First word of the label is the spay/neuter status.
intakes['spay_neuter'] = intakes['sex_upon_intake'].str.split(' ').str.get(0)

In [23]:
# Second word of the label is the sex.
intakes['sex'] = intakes['sex_upon_intake'].str.split(' ').str.get(1)

#### Age upon intake

In [24]:
# Inspect the raw "<number> <unit>" age strings.
age_upon_intake_counts = intakes['age_upon_intake'].value_counts()
age_upon_intake_counts

1 year       22083
2 years      19446
1 month      12052
3 years       7537
2 months      6823
4 weeks       4515
4 years       4513
5 years       4128
3 weeks       3701
3 months      3326
4 months      3237
5 months      3112
6 years       2755
2 weeks       2586
6 months      2414
7 years       2357
8 years       2308
7 months      1885
10 years      1856
9 months      1844
8 months      1507
9 years       1346
1 week        1054
10 months     1030
1 weeks        927
12 years       888
11 months      806
0 years        765
11 years       756
1 day          671
3 days         591
13 years       583
2 days         500
14 years       395
15 years       341
4 days         329
6 days         327
5 weeks        320
5 days         188
16 years       146
17 years        82
18 years        50
19 years        27
20 years        20
-1 years         5
22 years         5
25 years         1
-2 years         1
24 years         1
-3 years         1
23 years         1
21 years         1
Name: age_up

Strategy: split the number from the unit of time and standardize the unit labels. Then write a function that returns how many of each unit make up a year, and divide the number by that value to express every age in years.

In [25]:
# Number part of "<number> <unit>". The vectorized .str accessor replaces the
# Python-level loop and propagates a missing value as NaN instead of raising.
intakes['age_digit'] = intakes['age_upon_intake'].str.split(' ').str[0]

In [26]:
# Distribution of the extracted number part (still text at this point).
age_digit_counts = intakes['age_digit'].value_counts()
age_digit_counts

1     36787
2     29355
3     15155
4     12594
5      7748
6      5496
7      4242
8      3815
9      3190
10     2886
11     1562
12      888
0       765
13      583
14      395
15      341
16      146
17       82
18       50
19       27
20       20
-1        5
22        5
-3        1
25        1
24        1
23        1
21        1
-2        1
Name: age_digit, dtype: int64

There are 7 negative values in the dataset, which suggest the animal is less than __ years old (e.g. -1 years means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age values should not be taken as absolute truth.

In [27]:
# Remove the minus sign so the negative estimates become positive numbers.
intakes['age_digit'] = intakes['age_digit'].str.replace(pat='-', repl='')

In [28]:
# The number part is still text; cast it to an integer dtype.
intakes['age_digit'] = intakes['age_digit'].astype(int)

In [29]:
# Unit part of "<number> <unit>". The vectorized .str accessor replaces the
# Python-level loop and propagates a missing value as NaN instead of raising.
intakes['age_unit'] = intakes['age_upon_intake'].str.split(' ').str[1]

In [30]:
# Units appear in both singular and plural spellings.
age_unit_counts = intakes['age_unit'].value_counts()
age_unit_counts

years     50315
months    25984
year      22083
month     12052
weeks     12049
days       1935
week       1054
day         671
Name: age_unit, dtype: int64

In [31]:
# Singular unit label -> plural label; labels that are already plural are not
# keys of this mapping and therefore stay as they are.
age_unit_dict = dict(year='years', month='months', week='weeks', day='days')

intakes['age_unit'] = intakes['age_unit'].replace(age_unit_dict)

In [32]:
# Confirm that only the four plural units remain.
normalized_age_unit_counts = intakes['age_unit'].value_counts()
normalized_age_unit_counts

years     72398
months    38036
weeks     13103
days       2606
Name: age_unit, dtype: int64

In [33]:
def age_in_years(unit):
    """Return how many of ``unit`` make up one year.

    The result is a divisor: an age expressed in ``unit`` divided by this value
    gives the age in years.

    Parameters
    ----------
    unit : str
        'years', 'months', 'weeks' or 'days'. Singular spellings, surrounding
        whitespace and mixed case are accepted as well.

    Returns
    -------
    int or float
        1, 12, 52.143 or 365 for the four units; ``np.nan`` for anything else
        (previously an unrecognised unit fell through and returned ``None``).
    """
    units_per_year = {'year': 1, 'month': 12, 'week': 52.143, 'day': 365}

    if not isinstance(unit, str):
        return np.nan

    # Normalize so 'Year', 'years' and ' years ' all resolve to the same key.
    key = unit.strip().lower()
    if key.endswith('s'):
        key = key[:-1]
    return units_per_year.get(key, np.nan)

In [34]:
# Per-row divisor: how many of the row's age unit fit in one year.
intakes['age_conversion'] = list(map(age_in_years, intakes['age_unit']))



In [35]:
# Age in years = number / units-per-year, kept to two decimals.
# At that precision a 1-day-old animal (1 / 365) is stored as 0.0.
age_ratio = intakes['age_digit'] / intakes['age_conversion']
intakes['age_in_years'] = age_ratio.round(2)

#### Breed

In [36]:
# First breed listed. A plain '/' splits on the slash; the previous '\/' was an
# invalid escape sequence in a normal string literal.
intakes['breed_1'] = intakes['breed'].str.split('/').str[0]

In [37]:
# Second breed listed, NaN when there is none. A plain '/' splits on the slash;
# the previous '\/' was an invalid escape sequence in a normal string literal.
intakes['breed_2'] = intakes['breed'].str.split('/').str[1]

In [38]:
# Third breed listed, NaN when there is none. A plain '/' splits on the slash;
# the previous '\/' was an invalid escape sequence in a normal string literal.
intakes['breed_3'] = intakes['breed'].str.split('/').str[2]

In [39]:
# Which primary-breed strings mention "mix" (case-insensitive)?
breed_1_mentions_mix = intakes['breed_1'].str.contains('Mix', case=False)
breed_1_mentions_mix

0          True
1         False
2          True
3          True
4         False
          ...  
126138    False
126139    False
126140    False
126141    False
126142     True
Name: breed_1, Length: 126143, dtype: bool

In [40]:
# Twenty most common primary breeds.
breed_1_counts = intakes['breed_1'].value_counts()
breed_1_counts.head(20)

Domestic Shorthair Mix       31086
Pit Bull Mix                  8471
Domestic Shorthair            7629
Labrador Retriever Mix        6926
Chihuahua Shorthair Mix       6253
Labrador Retriever            3141
Domestic Medium Hair Mix      3118
German Shepherd Mix           3038
Pit Bull                      2088
Chihuahua Shorthair           2038
Bat Mix                       1756
German Shepherd               1723
Bat                           1571
Domestic Longhair Mix         1543
Australian Cattle Dog Mix     1521
Siamese Mix                   1291
Dachshund Mix                 1048
Boxer Mix                      984
Border Collie Mix              951
Miniature Poodle Mix           861
Name: breed_1, dtype: int64

In [41]:
# Compare the original breed text with the columns derived from it.
breed_review_columns = ['animal_id', 'animal_type', 'breed', 'breed_1', 'breed_2', 'breed_3']
intakes[breed_review_columns]

Unnamed: 0,animal_id,animal_type,breed,breed_1,breed_2,breed_3
0,A786884,Dog,Beagle Mix,Beagle Mix,,
1,A706918,Dog,English Springer Spaniel,English Springer Spaniel,,
2,A724273,Dog,Basenji Mix,Basenji Mix,,
3,A665644,Cat,Domestic Shorthair Mix,Domestic Shorthair Mix,,
4,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Doberman Pinsch,Australian Cattle Dog,
...,...,...,...,...,...,...
126138,A834632,Other,Guinea Pig,Guinea Pig,,
126139,A834626,Cat,Domestic Shorthair,Domestic Shorthair,,
126140,A834634,Dog,Unknown,Unknown,,
126141,A834633,Cat,Domestic Shorthair,Domestic Shorthair,,


In [42]:
# True when the breed text lists more than one breed (contains a slash).
# Literal matching (regex=False) replaces the pattern '\/', which was an
# invalid escape sequence in a normal string literal.
intakes['mix_1'] = intakes['breed'].str.contains('/', regex=False)

In [43]:
# True when the breed text mentions "mix" in any letter case.
intakes['mix_2'] = intakes['breed'].str.contains(pat='Mix', case=False)

In [44]:
# An animal counts as mixed breed when either indicator is True.
is_multi_breed = intakes['mix_1'].eq(True)
is_labelled_mix = intakes['mix_2'].eq(True)
intakes['mix_breed'] = is_multi_breed | is_labelled_mix

#### Color

In [45]:
# First color listed. A plain '/' splits on the slash; the previous '\/' was an
# invalid escape sequence in a normal string literal.
intakes['color_1'] = intakes['color'].str.split('/').str[0]

In [46]:
# Second color listed, NaN when there is none. A plain '/' splits on the slash;
# the previous '\/' was an invalid escape sequence in a normal string literal.
intakes['color_2'] = intakes['color'].str.split('/').str[1]

## Feature Creation

#### Intake Count

In [47]:
# Dense rank of each intake's timestamp within its animal: the earliest intake
# of an animal gets 1.0, the next distinct timestamp 2.0, and so on.
intake_times_by_animal = intakes.groupby('animal_id')['datetime']
intakes['intake_count'] = intake_times_by_animal.rank(method='dense')

#### Dropping unnecessary columns

In [48]:
cols_to_drop = [
    # raw columns that were parsed into separate features above
    'found_location', 'sex_upon_intake', 'age_upon_intake',
    # intermediate helper columns
    'found_location_split', 'sex_split',
    'age_digit', 'age_unit', 'age_conversion',
    'mix_1', 'mix_2',
]

intakes = intakes.drop(cols_to_drop, axis='columns')

In [49]:
# Print the remaining columns with their non-null counts and dtypes as a final check.
intakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126143 entries, 0 to 126142
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   animal_id         126143 non-null  object        
 1   name              86384 non-null   object        
 2   datetime          126143 non-null  datetime64[ns]
 3   intake_type       126143 non-null  object        
 4   intake_condition  126143 non-null  object        
 5   animal_type       126143 non-null  object        
 6   breed             126143 non-null  object        
 7   color             126143 non-null  object        
 8   found_address     74175 non-null   object        
 9   found_city        126143 non-null  object        
 10  spay_neuter       126142 non-null  object        
 11  sex               126142 non-null  object        
 12  age_in_years      126143 non-null  float64       
 13  breed_1           126143 non-null  object        
 14  bree

In [None]:
# Write the cleaned frame next to the raw file, without the index column.
intakes.to_csv(path_or_buf='../data/intakes_clean.csv', index=False)