# Intakes Data

#### Imports

In [1]:
import pandas as pd
import janitor
import re
import numpy as np

#### Read in data

In [2]:
# Load the raw intake records; the path is relative to the notebook's working directory
intakes = pd.read_csv('../data/intakes.csv')

#### Standardize column names

In [3]:
# Standardize the column labels (later cells rely on names such as 'found_location')
intakes = intakes.clean_names()  # DataFrame method made available by the janitor import

## Clean columns

#### Name

In [4]:
# What fraction of names start with an asterisk?
intakes['name'].str.startswith("*").value_counts(normalize=True)

False    0.669175
True     0.330825
Name: name, dtype: float64

In [5]:
# After doing some research, I couldn't find any info on why some names have the '*',
# so remove it. regex=False makes pandas treat '*' as a literal character rather than
# a regex quantifier, independent of the pandas version's default for `regex`.
intakes['name'] = intakes['name'].str.replace('*', '', regex=False)

#### Date time

In [6]:
# Convert the datetime column to pandas datetime values so it can be used for date arithmetic
intakes['datetime'] = pd.to_datetime(intakes['datetime'])

#### Month year

In [7]:
# Drop monthyear for now; we'll add it back as a feature later if needed.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
intakes = intakes.drop(columns='monthyear', errors='ignore')

#### Found location

In [8]:
# Look at the raw found_location values to understand their format
intakes['found_location']

0             2501 Magin Meadow Dr in Austin (TX)
1                9409 Bluegrass Dr in Austin (TX)
2              2818 Palomino Trail in Austin (TX)
3                                     Austin (TX)
4                   800 Grove Blvd in Austin (TX)
                           ...                   
126138           124 W Anderson Ln in Austin (TX)
126139    1912 E William Cannon Dr in Austin (TX)
126140         Cesar Chavez Street in Austin (TX)
126141       1000 East 41St Street in Austin (TX)
126142          5020Bonneville Bnd in Austin (TX)
Name: found_location, Length: 126143, dtype: object

Values in this column follow one of two patterns: <br>
    1) A street address (usually starting with a house number) followed by "in __ (TX)" <br>
    2) A city only, formatted like "__ (TX)"

In [9]:
# Split on the LAST ' in ' only, so an address that itself contains ' in ' still yields
# exactly [address, city]; city-only values yield a one-item list.
intakes['found_location_split'] = intakes['found_location'].str.rsplit(' in ', n=1)

In [10]:
# The address is the first piece of the split, but only when a separator was present
# (more than one piece); city-only rows have no address and become NaN.
# The vectorized .str accessors also leave missing locations as NaN instead of
# raising when len() is called on a non-list value.
intakes['found_address'] = intakes['found_location_split'].str[0].where(
    intakes['found_location_split'].str.len() > 1
)

In [11]:
# The city is always the final piece of the split: the text after ' in ' when an
# address is present, or the whole value for city-only rows. Taking the last element
# (rather than index 1) stays correct when the address itself contained ' in ',
# and missing locations remain NaN instead of raising.
intakes['found_city'] = intakes['found_location_split'].str[-1]

In [12]:
# 25 most frequent addresses. The same place shows up under several spellings
# (e.g. "4434 Frontier Trl" vs "4434 Frontier Trail"), so fuzzy matching could be
# used to consolidate them -- TODO
intakes['found_address'].value_counts().head(25)

7201 Levander Loop                   846
4434 Frontier Trl                    208
124 W Anderson Ln                    193
4434 Frontier Trail                  178
124 West Anderson Lane               163
1156 W Cesar Chavez                  147
12034 Research Blvd                  136
12034 Research                       115
1834 Ferguson                         90
1156 W Cesar Chavez St                82
4106 N Lamar                          74
4106 N Lamar Blvd                     70
14811 Chicadee                        69
4434 Frontier                         64
1156 West Cesar Chavez                62
508 East Howard Lane                  62
5800 Techni Center Dr                 59
7619 Scenic Brook                     53
1156 Cesar Chavez                     53
7601 Daffan Lane                      48
6600 Elm Creek Dr                     48
600 Barwood Park                      47
21413 Webber Oaks Cv                  45
9201 Circuit Of The Americas Blvd     45
1601 E Slaughter

#### Intake Type

In [None]:
# Count how many intake records fall under each intake type
intakes['intake_type'].value_counts()

#### Animal Type

In [None]:
# Count how many intake records there are per animal type
intakes['animal_type'].value_counts()

#### Sex upon intake

In [None]:
# Count the distinct sex_upon_intake labels to see which categories exist
intakes['sex_upon_intake'].value_counts()

#### Age upon intake

In [None]:
# Inspect the raw age strings (a digit followed by a unit of time) before parsing them
intakes['age_upon_intake'].value_counts()

Strategy: split the number from the unit of time, then standardize every age to years by writing a function that scales the number according to its unit.

In [None]:
# The first space-separated token of age_upon_intake is the digit. The vectorized
# .str accessor replaces the Python-level loop and leaves missing ages as NaN
# instead of raising inside str.split.
intakes['age_digit'] = intakes['age_upon_intake'].str.split(' ').str[0]

In [None]:
# Inspect the distinct digit values to spot anything unexpected (e.g. negatives) before casting
intakes['age_digit'].value_counts()

There are 7 negative values in the dataset, which suggest the animal is less than __ years old (e.g., -1 years means less than 1 year old). Because these are estimates from the shelter, we'll convert them to positive numbers. Overall, the age values should not be taken as absolute truth.

In [None]:
# Remove the minus sign so the shelter's "less than N" estimates become plain positive digits
intakes['age_digit'] = intakes['age_digit'].str.replace(pat='-', repl='', regex=False)

In [None]:
# Cast the digit strings to integers so they can be used in arithmetic
intakes['age_digit'] = intakes['age_digit'].astype(int)

In [None]:
# Review column dtypes and non-null counts (e.g. to confirm age_digit is now an integer column)
intakes.info()

In [None]:
# The second space-separated token of age_upon_intake is the unit of time. The
# vectorized .str accessor replaces the Python-level loop and yields NaN (instead of
# raising) for missing ages or values that have no second token.
intakes['age_unit'] = intakes['age_upon_intake'].str.split(' ').str[1]

In [None]:
# List the unit labels present to see which spellings need standardizing
intakes['age_unit'].value_counts()

In [None]:
# Singular -> plural lookup so each unit of time ends up with a single spelling
age_unit_dict = dict(year='years', month='months', week='weeks', day='days')

# Swap exact matches for their plural form; labels without an entry are left untouched
intakes['age_unit'] = intakes['age_unit'].replace(age_unit_dict)

In [None]:
# Re-check the unit labels after standardizing: only the plural forms should remain
intakes['age_unit'].value_counts()

### EDA

In [None]:
# Proportion of intake records per intake type; the title lets the figure stand alone
intakes['intake_type'].value_counts(normalize=True).plot(
    kind='barh', title='Share of intakes by intake type'
);

In [None]:
# Proportion of intake records per intake condition; the title lets the figure stand alone
intakes['intake_condition'].value_counts(normalize=True).plot(
    kind='barh', title='Share of intakes by intake condition'
);

In [None]:
# Proportion of intake records per animal type; the title lets the figure stand alone
intakes['animal_type'].value_counts(normalize=True).plot(
    kind='barh', title='Share of intakes by animal type'
);

In [None]:
# Proportion of intake records per sex_upon_intake label; the title lets the figure stand alone
intakes['sex_upon_intake'].value_counts(normalize=True).plot(
    kind='barh', title='Share of intakes by sex upon intake'
);

In [None]:
# Count how often each color label occurs
intakes['color'].value_counts()

In [None]:
# Rows whose color label contains a '/'.
# regex=False matches the slash literally; na=False treats missing colors as
# non-matches, because a mask containing NaN makes boolean indexing raise.
intakes[intakes['color'].str.contains('/', regex=False, na=False)]

In [None]:
# Final overview of the columns, their dtypes, and non-null counts after cleaning
intakes.info()

# To Do

#### 1) Figure out if * means anything in names
 done
#### 2) Deal with 'in ____ (TX)' from found locations
#### 3) Standardize age to years