In [1]:
import pandas as pd
import numpy as np

In [2]:
#bloomington = pd.read_csv("Data/Bloomington_Animal_Shelter_Animals.csv")

## Ongoing Data Wrangling To Do List
- explore observations with negative "time_in" values, seems to represent same animal id appearing multiple times without updating intake_date data for each instance
- plot years of data to determine if there's enough data from the first recorded years in each location
- standardize categorical intake and outcomes entries across each location

### Austin Dataframe Formatting

In [3]:
austin_outcomes = pd.read_csv("Data/Austin_Animal_Center_Outcomes.csv")
austin_intakes = pd.read_csv("Data/Austin_Animal_Center_Intakes.csv")

In [4]:
austin_intakes.drop(['Name', 'MonthYear', 'Found Location'],axis=1, inplace=True)

austin_intakes.rename(columns={'Animal ID':'animal_id',
                      'Animal Type':'animal_type',
                      'DateTime':'intake_date',
                      'Intake Type':'intake_type',
                      'Intake Condition':'intake_condition',
                      'Breed':'breed',
                      'Color':'color',
                      'Sex upon Intake':'intake_sex',
                      'Age upon Intake':'intake_age'}, inplace=True)

In [5]:
austin_outcomes.drop(['Name', 'MonthYear'],axis=1, inplace=True)
austin_outcomes.rename(columns={'Animal ID':'animal_id',
                      'Animal Type':'animal_type',
                      'DateTime':'outcome_date',
                      'Outcome Type':'outcome_type',
                      'Outcome Subtype':'outcome_subtype',
                      'Breed':'breed',
                      'Color':'color',
                      'Sex upon Outcome':'outcome_sex',
                      'Age upon Outcome':'outcome_age',
                      'Date of Birth':'birth_date'}, inplace=True)

In [6]:
austin = austin_intakes.merge(austin_outcomes, how='left')
austin.head()

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,intake_sex,intake_age,breed,color,outcome_date,birth_date,outcome_type,outcome_subtype,outcome_sex,outcome_age
0,A786884,01/03/2019 04:19:00 PM,Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,01/08/2019 03:11:00 PM,01/03/2017,Transfer,Partner,Neutered Male,2 years
1,A706918,07/05/2015 12:59:00 PM,Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,07/05/2015 03:13:00 PM,07/05/2007,Return to Owner,,Spayed Female,8 years
2,A724273,04/14/2016 06:43:00 PM,Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,04/17/2015,Return to Owner,,Neutered Male,1 year
3,A665644,10/21/2013 07:59:00 AM,Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,09/21/2013,Transfer,Partner,Intact Female,4 weeks
4,A682524,06/29/2014 10:38:00 AM,Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,07/02/2014 02:16:00 PM,06/29/2010,Return to Owner,,Neutered Male,4 years


In [7]:
# split intake_sex into gender and intake_repro
austin[['intake_repro','gender']]= austin['intake_sex'].str.split(" ", expand=True)
# split outcome_sex into gender and outcome_repro
austin[['outcome_repro','gender']]= austin['outcome_sex'].str.split(" ", expand=True)

# set the new city variable to Austin
austin['city'] = 'austin'

# after creating the new gender and repro columns, drop the old columns
austin.drop(['intake_sex','outcome_sex'], axis=1, inplace=True)

# subset to only Dogs
austin.drop(austin.loc[austin['animal_type']!='Dog'].index, inplace=True)

# drop observations where outcome is unknown
austin.drop(austin.loc[austin['outcome_type'].isna()].index, inplace=True)

# set all string columns to lower case for consistency across all datasets
austin = austin.applymap(lambda s: s.lower() if type(s) == str else s)


austin.tail()

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,intake_age,breed,color,outcome_date,birth_date,outcome_type,outcome_subtype,outcome_age,intake_repro,gender,outcome_repro,city
172475,a844230,11/05/2021 10:18:00 am,owner surrender,normal,dog,1 year,pit bull,white/brown,11/01/2021 06:08:00 pm,04/13/2020,adoption,,1 year,neutered,male,neutered,austin
172510,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,09/29/2021 01:39:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172511,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,07/16/2021 05:32:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172512,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,11/01/2021 04:55:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172547,a812375,01/21/2020 01:57:00 pm,owner surrender,normal,dog,10 months,great dane,black/white,01/25/2020 02:29:00 pm,03/21/2019,adoption,,10 months,spayed,female,spayed,austin


In [8]:
# transform the single column of date and time into a column for each
# for both intake and outcome

austin['intake_time'] = pd.to_datetime(austin['intake_date']).dt.time
austin['intake_date'] = pd.to_datetime(austin['intake_date']).dt.date

austin['outcome_time'] = pd.to_datetime(austin['outcome_date']).dt.time
austin['outcome_date'] = pd.to_datetime(austin['outcome_date']).dt.date

# create a column for the difference between outcome and intake
# representing how long the animal has been in the system

austin.loc[:,'time_in'] = austin.loc[:,'outcome_date'] - austin.loc[:,'intake_date']
austin['time_in'] = pd.to_numeric(austin['time_in'])/86400000000000

In [9]:
# to correctly track age, split the number and length of measurement into two columns
austin[['intake_age_n', 'intake_age_t']] = austin['intake_age'].str.split(" ",1,expand=True)

# set the number to int for calculating
austin['intake_age_n'] = austin['intake_age_n'].astype('int')

# when age is listed in years, months, or weeks, transform into days
i=0
while i < len(austin['intake_age_n']):
    if austin.iloc[i,-1] in ['year', 'years']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 365
    elif austin.iloc[i,-1] in ['month', 'months']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 30
    elif austin.iloc[i,-1] in ['week', 'weeks']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 7    
    i+=1

# drop the now unnecessary text portion of the original age column and the original column itself    
austin.drop(['intake_age_t', 'intake_age'], axis=1, inplace=True)
austin.rename(columns={'intake_age_n':'age'},inplace=True)


### Louisville Dataframe Formatting

In [10]:
louisville = pd.read_csv("Data/Louisville_Animal_IO_Data_5.csv")

In [11]:
louisville.drop(['SecondaryColor',
                 'IntakeReason',
                 'IntakeInternalStatus',
                 'OutcomeReason',
                 'OutcomeInternalStatus',
                 'SecondaryBreed'], axis=1, inplace=True)

In [12]:
louisville.rename(columns={'AnimalID':'animal_id',
                          'AnimalType':'animal_type',
                          'IntakeDate':'intake_date',
                          'IntakeType':'intake_type',
                          'IntakeSubtype':'intake_subtype',
                          'PrimaryColor':'color',
                          'PrimaryBreed':'breed',
                          'Gender':'gender',
                          'DOB':'birth_date',
                          'IntakeAsilomarStatus':'intake_condition',
                          'ReproductiveStatusAtIntake':'intake_repro',
                          'OutcomeDate':'outcome_date',
                          'OutcomeType':'outcome_type',
                          'OutcomeSubtype':'outcome_subtype',
                          'OutcomeAsilomarStatus':'outcome_condition',
                          'ReproductiveStatusAtOutcome':'outcome_repro',
                          }, inplace=True)

In [13]:
# filter to only dogs
louisville.drop(louisville.loc[louisville['animal_type']!='DOG'].index, inplace=True)

# drop observations where outcome is unknown
louisville.drop(louisville.loc[louisville['outcome_type'].isna()].index, inplace=True)

# turn all string columns to all lowercase
louisville = louisville.applymap(lambda s: s.lower() if type(s) == str else s)

In [14]:
# transform the single column of date and time into a column for each
# for both intake and outcome and in the case of louisville, birth_date

louisville['intake_time'] = pd.to_datetime(louisville['intake_date']).dt.time
louisville['intake_date'] = pd.to_datetime(louisville['intake_date']).dt.date

louisville['outcome_time'] = pd.to_datetime(louisville['outcome_date']).dt.time
louisville['outcome_date'] = pd.to_datetime(louisville['outcome_date']).dt.date

louisville['birth_date'] = pd.to_datetime(louisville['birth_date']).dt.date

# calculate age as the difference between their intake date and date of birth where available
louisville.loc[:,'age'] = louisville.loc[:,'intake_date'] - louisville.loc[:,'birth_date']
louisville['age'] = pd.to_numeric(louisville['age'])/86400000000000

# calculate time in system
louisville.loc[:,'time_in'] = louisville.loc[:,'outcome_date'] - louisville.loc[:,'intake_date']
louisville['time_in'] = pd.to_numeric(louisville['time_in'])/86400000000000

In [15]:
louisville['city'] = 'louisville'

In [16]:
louisville['gender'] = louisville['gender'].str.split().str[-1]

In [17]:
louisville.drop(['birth_date'],axis=1,inplace=True)

In [18]:
louisville

Unnamed: 0,animal_id,animal_type,intake_date,intake_type,intake_subtype,color,breed,gender,intake_condition,intake_repro,outcome_date,outcome_type,outcome_subtype,outcome_condition,outcome_repro,intake_time,outcome_time,age,time_in,city
4,a281756,dog,2006-09-11,owner sur,otc,white,pit bull terrier,male,healthy,fertile,2006-09-12,euth,time/space,healthy,fertile,18:10:00,13:44:00,365.000000,1.0,louisville
6,a256128,dog,2005-11-26,stray,field,brown,american pit bull terrier,male,healthy,fertile,2005-12-08,euth,medical,healthy,fertile,12:35:00,23:59:00,-106751.991167,12.0,louisville
8,a316619,dog,2007-06-29,stray,field,white,labrador retriever,male,healthy,fertile,2007-07-04,euth,time/space,healthy,fertile,20:10:00,13:12:00,-106751.991167,5.0,louisville
15,a319056,dog,2007-07-19,stray,otc,tricolor,beagle,male,healthy,altered,2007-08-07,euth,time/space,healthy,altered,22:32:00,12:13:00,-106751.991167,19.0,louisville
17,a258842,dog,2005-12-21,confiscate,neglect,white,pit bull terrier,male,healthy,fertile,2005-12-29,euth,breed,healthy,fertile,14:30:00,11:05:00,-106751.991167,8.0,louisville
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150835,a489623,dog,2013-04-17,stray,field,apricot,dandie dinmont terrier,male,healthy,fertile,2013-05-08,adoption,internet,healthy,altered,23:01:00,16:19:00,1096.000000,21.0,louisville
150836,a493152,dog,2013-06-04,stray,field,black,dachshund - wirehaired,female,healthy,fertile,2013-06-16,adoption,web pf,healthy,altered,11:02:00,12:55:00,1096.000000,12.0,louisville
150837,a591486,dog,2016-06-02,owner sur,otc,yellow brindle,greyhound,male,healthy,fertile,2016-06-25,adoption,friend,healthy,altered,14:48:00,16:22:00,548.000000,23.0,louisville
150838,a523743,dog,2014-04-16,stray,otc,cream,cairn terrier,female,healthy,fertile,2014-04-23,adoption,internet,healthy,altered,12:05:00,15:11:00,212.000000,7.0,louisville


### Dallas Dataframe Formatting

In [19]:
dallas14 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2014.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas15 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2015.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas16 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2016.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas17 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2017.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas18 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2018.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas19 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2019.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas20 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2020.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})

In [20]:
dallas1718 = dallas17.append(dallas18)
dallas1718.rename(columns={'Animal_Id':'Animal Id',
                           'Animal_Type':'Animal Type',
                           'Animal_Breed':'Animal Breed',
                           'Kennel_Number':'Kennel Number',
                           'Kennel_Status':'Kennel Status',
                           'Tag_Type':'Tag Type',
                           'Activity_Number':'Activity Number',
                           'Activity_Sequence':'Activity Sequence',
                           'Source_Id':'Source Id',
                           'Census_Tract':'Census Tract',
                           'Council_District':'Council District',
                           'Intake_Type':'Intake Type',
                           'Intake_Subtype':'Intake Subtype',
                           'Intake_Total':'Intake Total',
                           'Staff_Id':'Staff Id',
                           'Intake_Date':'Intake Date',
                           'Intake_Time':'Intake Time',
                           'Due_Out':'Due Out',
                           'Intake_Condition':'Intake Condition',
                           'Hold_Request':'Hold Request',
                           'Outcome_Type':'Outcome Type',
                           'Outcome_Subtype':'Outcome Subtype',
                           'Outcome_Date':'Outcome Date',
                           'Outcome_Time':'Outcome Time',
                           'Receipt_Number':'Receipt Number',
                           'Impound_Number':'Impound Number',
                           'Service_Request_Number':'Service Request Number',
                           'Outcome_Condition':'Outcome Condition',
                           'Chip_Status':'Chip Status',
                           'Animal_Origin':'Animal Origin',
                           'Additional_Information':'Additional Information'
                          }, inplace=True)

dallas16.rename(columns={
                        'Animal ID':'Animal Id',
                        'Staff ID':'Staff Id',
                        'Source ID':'Source Id'
                        }, inplace=True)

In [21]:
dallas = dallas14.append([dallas15,dallas16,dallas1718, dallas19, dallas20])
dallas.shape

(236972, 34)

In [22]:
dallas.drop(['Kennel Number','Kennel Status','Tag Type',
             'Activity Number','Activity Sequence','Source Id',
             'Census Tract', 'Council District', 'Intake Total', 'Reason', 'Staff Id',
            'Due Out', 'Hold Request','Receipt Number', 'Impound Number',
            'Service Request Number', 'Chip Status', 'Animal Origin',
            'Additional Information', 'Month','Year'], axis=1, inplace=True)

In [23]:
dallas.rename(columns={'Animal Id':'animal_id',
                       'Animal Type':'animal_type',
                       'Animal Breed':'breed',
                       'Intake Type':'intake_type',
                       'Intake Subtype':'intake_subtype',
                       'Intake Date':'intake_date',
                       'Intake Time':'intake_time',
                       'Intake Condition':'intake_condition',
                       'Outcome Type':'outcome_type',
                       'Outcome Date':'outcome_date',
                       'Outcome Time':'outcome_time',
                       'Outcome Condition':'outcome_condition',
                       'Outcome Subtype':'outcome_subtype',
                       'Month':'month',
                       'Year':'year'
    
}, inplace=True)
dallas.head()

Unnamed: 0,animal_id,animal_type,breed,intake_type,intake_subtype,intake_date,intake_time,intake_condition,outcome_type,outcome_date,outcome_time,outcome_condition,outcome_subtype
0,A0000575,CAT,DOMESTIC SH,STRAY,CONFINED,10/02/2014 12:00:00 AM,12/31/1899 11:56:00 AM,TREATABLE REHABILITABLE NON-CONTAGIOUS,ADOPTION,10/12/2014 12:00:00 AM,12/31/1899 03:25:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,
1,A0008962,DOG,LABRADOR RETR,CONFISCATED,KEEP SAFE,09/24/2015 12:00:00 AM,12/31/1899 03:50:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,EUTHANIZED,10/04/2015 12:00:00 AM,12/31/1899 12:22:00 PM,TREATABLE MANAGEABLE NON-CONTAGIOUS,
2,A0121376,DOG,GERM SHEPHERD,STRAY,CONFINED,05/01/2015 12:00:00 AM,12/31/1899 12:09:00 PM,TREATABLE MANAGEABLE NON-CONTAGIOUS,EUTHANIZED,05/03/2015 12:00:00 AM,12/31/1899 11:53:00 AM,TREATABLE MANAGEABLE NON-CONTAGIOUS,
3,A0129114,CAT,DOMESTIC SH,OWNER SURRENDER,GENERAL,09/19/2015 12:00:00 AM,12/31/1899 04:46:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,ADOPTION,10/26/2015 12:00:00 AM,12/31/1899 02:09:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,
4,A0157434,DOG,ROTTWEILER,OWNER SURRENDER,- DEAD ON ARRIVAL,12/03/2014 12:00:00 AM,12/31/1899 08:06:00 PM,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,DEAD ON ARRIVAL,12/04/2014 12:00:00 AM,12/31/1899 12:00:00 PM,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,


In [24]:
# filter to only dogs
dallas.drop(dallas.loc[dallas['animal_type']!='DOG'].index, inplace=True)

# drop observations where outcome is unknown
dallas.drop(dallas.loc[dallas['outcome_type'].isna()].index, inplace=True)

# turn all string columns to all lowercase
dallas = dallas.applymap(lambda s: s.lower() if type(s) == str else s)

dallas['city'] = 'dallas'

In [25]:
# transform the single column of date into a column for each
# for both intake and outcome, not using time because it doesn't contain meaningful data for this particular
# dataset

dallas['intake_date'] = pd.to_datetime(dallas['intake_date']).dt.date
dallas['intake_time'] = pd.to_datetime(dallas['intake_time']).dt.time
dallas['outcome_date'] = pd.to_datetime(dallas['outcome_date']).dt.date
dallas['outcome_time'] = pd.to_datetime(dallas['outcome_time']).dt.time


# create a column for the difference between outcome and intake
# representing how long the animal has been in the system

dallas.loc[:,'time_in'] = dallas.loc[:,'outcome_date'] - dallas.loc[:,'intake_date']
dallas['time_in'] = pd.to_numeric(dallas['time_in'])/86400000000000

In [26]:
dallas.head()

Unnamed: 0,animal_id,animal_type,breed,intake_type,intake_subtype,intake_date,intake_time,intake_condition,outcome_type,outcome_date,outcome_time,outcome_condition,outcome_subtype,city,time_in
6,a0183589,dog,aust cattle dog,owner surrender,general,2015-06-01,15:47:00,unhealthy untreatable non-contagious,euthanized,2015-06-01,00:00:00,unhealthy untreatable non-contagious,,dallas,0.0
17,a0275199,dog,shih tzu,owner surrender,general,2014-12-03,13:09:00,unhealthy untreatable non-contagious,euthanized,2014-12-03,14:17:00,unhealthy untreatable non-contagious,,dallas,0.0
26,a0337666,dog,rat terrier,owner surrender,euthanasia requested,2015-05-13,17:08:00,unhealthy untreatable non-contagious,euthanized,2015-05-13,17:29:00,unhealthy untreatable non-contagious,,dallas,0.0
29,a0350222,dog,labrador retr,owner surrender,general,2015-07-07,11:52:00,unhealthy untreatable non-contagious,euthanized,2015-07-07,12:29:00,unhealthy untreatable non-contagious,,dallas,0.0
30,a0350630,dog,papillon,owner surrender,euthanasia requested,2015-03-02,15:32:00,treatable manageable non-contagious,euthanized,2015-03-02,16:04:00,unhealthy untreatable non-contagious,,dallas,0.0


In [150]:
all = austin.merge(louisville, how="outer")
all = all.merge(dallas, how='outer')



In [28]:
all.shape

(220606, 22)

In [30]:
all.dtypes

animal_id             object
intake_date           object
intake_type           object
intake_condition      object
animal_type           object
breed                 object
color                 object
outcome_date          object
birth_date            object
outcome_type          object
outcome_subtype       object
outcome_age           object
intake_repro          object
gender                object
outcome_repro         object
city                  object
intake_time           object
outcome_time          object
time_in              float64
age                  float64
intake_subtype        object
outcome_condition     object
dtype: object

## Predictors to Standardize
- breed
- intake_subtype

In [151]:
# replace frequent shorthands and typos to make the rest of the processing
# below easier

all['breed'].replace({
    'alask ':'alaskan ',
    'belg ':'belgian ',
    'span ':'spaniel ',
    'span/':'spaniel/',
    'mtn':'mountain',
    'amer ':'american ',
    'aust ':'australian ',
    'retr':'retriever',
    'chesa':'chesapeake bay',
    ' lh':' longhair',
    ' bordx':' bordeaux',
    'eng ':'english ',
    ' sprngr ':' springer ',
    'flat-coated':'flat coat',
    ' terr ':' terrier ',
    'gr swiss ':'great swiss ',
    'ital ':'italian ',
    'mex ':'mexican ',
    'min ':'minature ',
    'norw ':'norwegian ',
    'ns ':'nova scotia',
    'oldeng':'old english',
    '(jack) ':'',
    'terr/':'terrier/',
    'pbgv':'petit basset griffon vendeen',
    'pitbull':'pit bull',
    'rhod ':'rhodesian ',
    'scot ':'scottish ',
    'sheltd ':'shetland ',
    'soft-coated':'soft coated',
    ' rgh':' rough coat',
    ' smth':' smooth coat',
    'swed ':'swedish',
    'tenn tr ':'tennesee treeing',
    'tr ':'tree ',
    ' - ':' '
}, inplace=True, regex=True)

In [152]:
for u in all.breed.unique():
    print(u)

beagle mix
english springer spaniel
basenji mix
doberman pinsch/australian cattle dog
labrador retrieveriever mix
great dane mix
chihuahua shorthair
pit bull
australian cattle dog/labrador retrieveriever
parson russell terrier mix
norfolk terrier
yorkshire terrier mix
maltese mix
dachshund mix
boxer mix
plott hound mix
labrador retrieveriever
pit bull mix
tibetan spaniel mix
miniature pinscher mix
chihuahua shorthair mix
pit bull/australian cattle dog
yorkshire terrier
dachshund/chihuahua shorthair
german shepherd/chow chow
miniature schnauzer mix
german shepherd/australian cattle dog
german shepherd
great dane
australian kelpie mix
alaskan husky
great pyrenees mix
german shepherd mix
feist/beagle
american pit bull terrier mix
norfolk terrier mix
rottweiler mix
chihuahua shorthair/russell terrier
italian greyhound mix
treeing walker coonhound mix
boxer
staffordshire/english bulldog
bull terrier mix
australian shepherd/chow chow
chihuahua longhair mix
black mouth cur/pit bull
boston ter

bulldog/boston terrier
harrier/english coonhound
queensland heeler/pointer
miniature poodle/english cocker spaniel
golden retrieveriever/pit bull
alaskan malamute/akita
cardigan welsh corgi/american eskimo
siberian husky/alaskan malamute
rat terrier/miniature pinscher
plott hound/german shepherd
welsh springer spaniel/labrador retrieveriever
australian cattle dog/american staffordshire terrier
weimaraner/vizsla
irish setter/golden retrieveriever
maltese/dachshund
miniature poodle/pomeranian
dachshund/bull terrier
german shepherd/queensland heeler
pug/cocker spaniel
mastiff/staffordshire
pug/staffordshire
beagle/staffordshire
pit bull/pit bull
rat terrier/smooth fox terrier
bulldog/american bulldog
treeing walker coonhound/plott hound
pit bull/siberian husky
chihuahua longhair/border terrier
cocker spaniel/toy poodle
collie rough/great pyrenees
dachshund/greyhound
labrador retrieveriever/english pointer
collie rough/catahoula
dachshund wirehair/west highland
airedale terrier/labrador re

In [None]:
# replace one-off breed names that are the entire entry

all['breed'].replace({
    'airedale terr':'airedale terrier',
    'am pit bull ter':'american pit bull terrier',
    'american staff':'american staffordshire terrier',
    'anatolian shepherd':'anatol shepherd',
    'black and tan coonound':'black/tan hound',
    'bluetick coonhound':'bluetick hound'
    'bouvier des flandres':'bouv flandres',
    'boykin span':'boykin spaniel',
    'brussels griffon':'bruss griffon',
    'bull terrier min':'bull terrier miniature',
    'catahoula leopard hound':'catahoula',
    'cavalier span':'cavalier spaniel',
    'chihuahua long haired':'chihuahua longhair',
    'chihuahua sh':'chihuahua shorthair',
    'chihuahua smooth coated':'chihuahua shorthair',
    'cocker amer':'cocker spaniel',
    'cocker-poo':'cocker spaniel poodle',
    'collie':'border collie',
    'dutch sheepdog':'dutch shepherd',
    'gbgv':'',
    'germ sh point':
}, inplace=True)


In [None]:

breeds = ['affenpinscher', 'afghan hound', 'airedale terrier', 'akbash', 'akita',
          'australian shepherd', 'alaskan husky', 'alaskan klee kai', 'alaskan malamute',
          'australian kelpie', 'american bulldog', 'american eskimo',
          'american foxhound' 'american pit bull terrier',
          'american staffordshire terrier', 'anatol shepherd',
          'australian cattle dog', 'australian kelpie', 'australian shepherd',
          'australian terrier', 'basenji', 'basset hound',
          'beagle', 'bearded collie', 'beauceron', 'bedlington terrier',
          'belgian laekenois', 'belgian malinois', 'belgian sheepdog',
          'belgian tervuren', 'bernese hound', 'bernese mountain dog', 'bichon frise',
          'black mouth cur', 'black/tan hound', 'bloodhound', 'blue lacy',
          'bluetick hound', 'boerboel', 'border collie', 'border terrier',
          'borzoi', 'boston terrier', 'bouv flandres', 'boxer', 'boykin spaniel',
          'briard', 'brittany', 'bruss griffon', 'bull terrier',
          'bull terrier miniature', 'bulldog', 'bullmastiff', 'cairn terrier',
          'canann dog', 'cane corso', 'cardigan welsh corgi', 'carolina dog',
          'catahoula', 'cavalier spaniel', 'chesapeake bay retriever', 'chihuahua longhair',
          'chihuahua shorthair', 'chinese crested', 'chinese sharpei', 'chow chow',
          'clumber spaniel', 'cocker spaniel', 'collie rough', 'collie smooth',
          'coonhound', 'coton de tulear', 'dachshund', 'dachsund longhair', 'dachshund wirehair',
          'dalmatian', 'dandie dinmont', 'doberman pinsch', 'dogo aregentino', 'dogue de bordeaux',
          'dutch shepherd', 'english bulldog', 'english cocker spaniel', 'english coonhound',
          'english foxhound', 'english pointer', 'english setter', 'english shepherd',
          'english springer spaniel', 'english toy spaniel', 'entlebucher',
          'feist', 'field spaniel', 'fila brasileiro', 'finnish spitz', 'flat coat retriever',
          'fox terrier', 'french bulldog'
          
          
         ]

In [138]:
all['color'].replace({
    'apricot':'yellow',
    'beige':'tan',
    'blonde':'yellow',
    'liver':'brindle',
    'orange':'yellow',
    'pink':'red',
    'ruddy':'red',
    'tiger':'brindle',
    'gold':'yellow',
    'silver':'gray'
}, inplace=True, regex=True)


colors = ['black','blue','brindle','brown','buff','chocolate',
          'cream','fawn','gray','merle','red',
          'sable','tan','tick','tricolor','white','yellow']

for c in colors:
    all[c] = all['color'].apply(lambda x: int(c in x) if isinstance(x,str) else 0)

In [59]:
all['intake_type'].replace({
# standardize across different datasets
    'confiscate':'confiscated',
    'foster':'return',
    'owner sur':'owner surrender',
# set all euthanasia and disposal to euthanasia request
    'euthanasia':'euthanasia request',
    'euth req':'euthanasia request',
    'dispos req':'euthanasia request',
    'et request':'euthanasia request',
    'disposal':'euthanasia request',
# aggregate all medical reasons
    'outsurgery':'medical',
    'treatment':'medical',
    'quarantine':'medical',
    'med observ':'medical',
# generalize several niche stray conditions to stray
    'lost':'stray',
    'found':'stray',
    'wildlife':'stray',
    'evacuee':'stray',
    'abandoned':'stray',
# add reported animals to public assist    
    'lost report':'public assist',
    'found report':'public assist',
# add all infrequent specifications to other to reduce number of categories
    'khs':'other',
    'investigat':'other',
    'keepsafe':'other',
    'transport':'other',
    'for transp':'other',
    'transfer':'other'
}, inplace=True)

all.intake_type.value_counts()

stray                 137778
owner surrender        52978
public assist          11747
confiscated             6018
euthanasia request      4353
medical                 3643
return                  3565
other                    524
Name: intake_type, dtype: int64

In [45]:
all['intake_condition'].replace({
# mark all normal, treatable+manageable+non-contagious as relatively healthy
# as there is no "more positive" denotation in the dataset
    'normal':'healthy', 
    'treatable/manageable':'treatable manageable non-contagious',
    'treatable manageable non-contagious':'healthy',
# mark all non-contagious rehabilitable instances to injured
    'treatable rehabilitable non-contagious':'injured',
# if not clear whether sick or injured, list as unhealthy
    'unhealthy untreatable non-contagious':'unhealthy',
    'unhealthy/untreatable':'unhealthy',
    'critical':'unhealthy',
    'med urgent':'unhealthy',
    'fatal':'unhealthy',
    'medical':'unhealthy',
# mark all "contagious" instances to sick
    'unhealthy untreatable contagious':'sick',
    'treatable rehabilitable contagious':'sick',
    'treatable manageable contagious':'sick',
# mark behavior issues as other given relative infrequency
    'feral':'other',
    'behavior':'other',
# combine nursing and pregnant
    'nursing':'pregnant/nursing',
    'pregnant':'pregnant/nursing',
# standardize other category names
    'app well':'healthy',
    'app inj':'injured',
    'app sick':'sick',
    'app wnl':'healthy',
    'unknown':'other',
    'deceased':'dead',
# drop aged and neonatal since age is captured in another variable
# assume that if age is the only qualifier, they are likely otherwise healthy
    'underage':'healthy',
    'aged':'healthy',
    'neonatal':'healthy',
}, inplace=True)

all.intake_condition.value_counts()

In [69]:
all['gender'].replace({'litter':'unknown'},inplace=True)

all.gender.value_counts()

In [128]:
all['intake_repro'].replace({
    'fertile':'intact',
    'neutered':'altered',
    'spayed':'altered'
},inplace=True)

all.intake_repro.value_counts()

intact     108857
altered     74808
unknown      1742
Name: intake_repro, dtype: int64

In [79]:
all['intake_subtype'].replace({
    'with id':'other',
    'spca texas':'other',
    'trap':'field',
    'aid':'otc',
    'night':'field',
    'died':'dead',
    'weather':'field',
    'left at':'field',
    'vehicletow':'field'
}, inplace=True)

all.intake_subtype.value_counts()

field            30804
otc              30150
at large         16176
general           7582
confined          3415
                 ...  
injured              3
transport            3
owner surr           3
dead                 2
other shelter        1
Name: intake_subtype, Length: 79, dtype: int64

In [82]:
all['intake_subtype'].replace({
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
    '':'',
}, inplace=True)

## Need to fix/examine age in Louisville - lots of large negative numbers related to not having birth date data or it being in the wrong format

In [None]:
# drop dead
# drop euthanasia request

In [None]:
NET DOG VARIABLE - how at capacity is the shelter?