# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import re
import numpy as np

#### Read in data

In [2]:
intakes = pd.read_csv('../data/intakes.csv')

#### Standardize column names

In [3]:
intakes = intakes.clean_names() #from janitor package

## Clean columns

#### Name

In [4]:
intakes['name'].str.startswith("*").value_counts(normalize=True)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
#after doing some research, I couldn't any info on why some names have the *
intakes['name'] = intakes['name'].str.replace('*', '') 

#### Date time

In [6]:
intakes['datetime'] = pd.to_datetime(intakes['datetime'])

#### Month year

In [7]:
intakes = intakes.drop(columns='monthyear') # we'll add this as a feature later if needed

#### Found location

In [8]:
intakes['found_location']

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

The pattern of this column is: <br>
    1) Street address that starts with numbers and ends with "in __ (TX)" <br>
    2) City that is formatted like "__ (TX)"

In [9]:
intakes['found_city'] = intakes['found_location'].str.split(' in ', expand=True)[1]

In [10]:
intakes['found_location'].str.endswith('in Austin (TX)').value_counts()

True     81377
False    44766
Name: found_location, dtype: int64

In [11]:
intakes['found_location'].str.endswith(')').value_counts() #the 1558 are 'Outside Jurisdiction'

True     124585
False      1558
Name: found_location, dtype: int64

In [12]:
[x.str.split(' in ') if x.str.contains(' in ') else x for x in intakes['found_location']]

AttributeError: 'str' object has no attribute 'str'

#### Intake Type

In [13]:
intakes['intake_type'].value_counts()

Stray                 87635
Owner Surrender       25085
Public Assist          7734
Wildlife               5047
Abandoned               385
Euthanasia Request      257
Name: intake_type, dtype: int64

#### Animal Type

In [14]:
intakes['animal_type'].value_counts()

Dog          71465
Cat          47272
Other         6789
Bird           595
Livestock       22
Name: animal_type, dtype: int64

#### Sex upon intake

In [15]:
intakes['sex_upon_intake'].value_counts()

Intact Male      41015
Intact Female    38794
Neutered Male    19389
Spayed Female    16530
Unknown          10414
Name: sex_upon_intake, dtype: int64

#### Age upon intake

In [16]:
intakes['age_upon_intake'].value_counts()

1 year       22083
2 years      19446
1 month      12052
3 years       7537
2 months      6823
4 weeks       4515
4 years       4513
5 years       4128
3 weeks       3701
3 months      3326
4 months      3237
5 months      3112
6 years       2755
2 weeks       2586
6 months      2414
7 years       2357
8 years       2308
7 months      1885
10 years      1856
9 months      1844
8 months      1507
9 years       1346
1 week        1054
10 months     1030
1 weeks        927
12 years       888
11 months      806
0 years        765
11 years       756
1 day          671
3 days         591
13 years       583
2 days         500
14 years       395
15 years       341
4 days         329
6 days         327
5 weeks        320
5 days         188
16 years       146
17 years        82
18 years        50
19 years        27
20 years        20
-1 years         5
22 years         5
-3 years         1
25 years         1
21 years         1
24 years         1
-2 years         1
23 years         1
Name: age_up

Strategy is to split the digit from the unit of time. Standardize to year. Make function to multiply digit based on unit of time

In [17]:
intakes['age_digit'] = [str.split(x, ' ')[0] for x in intakes['age_upon_intake']]

In [18]:
intakes['age_digit'].value_counts()

1     36787
2     29355
3     15155
4     12594
5      7748
6      5496
7      4242
8      3815
9      3190
10     2886
11     1562
12      888
0       765
13      583
14      395
15      341
16      146
17       82
18       50
19       27
20       20
22        5
-1        5
-3        1
-2        1
25        1
24        1
21        1
23        1
Name: age_digit, dtype: int64

There are 7 negative values in the dataset that suggest the animal is less than __ years old (ex: -1 years old means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age numbers should not be taken as absolute truth

In [19]:
intakes['age_digit'] = intakes['age_digit'].str.replace('-', '')

In [20]:
intakes['age_digit'] = intakes['age_digit'].astype('int')

In [21]:
intakes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126143 entries, 0 to 126142
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   animal_id         126143 non-null  object        
 1   name              86384 non-null   object        
 2   datetime          126143 non-null  datetime64[ns]
 3   found_location    126143 non-null  object        
 4   intake_type       126143 non-null  object        
 5   intake_condition  126143 non-null  object        
 6   animal_type       126143 non-null  object        
 7   sex_upon_intake   126142 non-null  object        
 8   age_upon_intake   126143 non-null  object        
 9   breed             126143 non-null  object        
 10  color             126143 non-null  object        
 11  found_city        96792 non-null   object        
 12  age_digit         126143 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(11)
memory usage: 12

In [22]:
intakes['age_unit'] = [str.split(x, ' ')[1] for x in intakes['age_upon_intake']]

In [23]:
intakes['age_unit'].value_counts()

years     50315
months    25984
year      22083
month     12052
weeks     12049
days       1935
week       1054
day         671
Name: age_unit, dtype: int64

In [33]:
age_unit_dict = {
    'year':'years',
    'month':'months',
    'week': 'weeks',
    'day':'days'
}

intakes['age_unit'] = intakes['age_unit'].map(age_unit_dict).fillna(intakes['age_unit'])

In [35]:
intakes['age_unit'].value_counts()

years     72398
months    38036
weeks     13103
days       2606
Name: age_unit, dtype: int64

### EDA

In [None]:
intakes['intake_type'].value_counts(normalize=True).plot(kind='barh');

In [None]:
intakes['intake_condition'].value_counts(normalize=True).plot(kind='barh');

In [None]:
intakes['animal_type'].value_counts(normalize=True).plot(kind='barh');

In [None]:
intakes['sex_upon_intake'].value_counts(normalize=True).plot(kind='barh');

In [None]:
intakes['color'].value_counts()

In [None]:
intakes[intakes['color'].str.contains('/')]

In [None]:
intakes.info()

# To Do

#### 1) Figure out if * means anything in names
 done
#### 2) Deal with 'in ____ (TX)' from found locations
#### 3) Standardize age to years