# Data Wrangling

In [1]:
import pandas as pd
import numpy as np

### Austin Dataframe Formatting

In [2]:
austin_outcomes = pd.read_csv("Data/Austin_Animal_Center_Outcomes.csv")
austin_intakes = pd.read_csv("Data/Austin_Animal_Center_Intakes.csv")

In [3]:
austin_intakes.drop(['Name', 'MonthYear', 'Found Location'],axis=1, inplace=True)

austin_intakes.rename(columns={'Animal ID':'animal_id',
                      'Animal Type':'animal_type',
                      'DateTime':'intake_date',
                      'Intake Type':'intake_type',
                      'Intake Condition':'intake_condition',
                      'Breed':'breed',
                      'Color':'color',
                      'Sex upon Intake':'intake_sex',
                      'Age upon Intake':'intake_age'}, inplace=True)

In [4]:
austin_outcomes.drop(['Name', 'MonthYear'],axis=1, inplace=True)
austin_outcomes.rename(columns={'Animal ID':'animal_id',
                      'Animal Type':'animal_type',
                      'DateTime':'outcome_date',
                      'Outcome Type':'outcome_type',
                      'Outcome Subtype':'outcome_subtype',
                      'Breed':'breed',
                      'Color':'color',
                      'Sex upon Outcome':'outcome_sex',
                      'Age upon Outcome':'outcome_age',
                      'Date of Birth':'birth_date'}, inplace=True)

In [5]:
austin = austin_intakes.merge(austin_outcomes, how='left')
austin.head()

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,intake_sex,intake_age,breed,color,outcome_date,birth_date,outcome_type,outcome_subtype,outcome_sex,outcome_age
0,A786884,01/03/2019 04:19:00 PM,Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,01/08/2019 03:11:00 PM,01/03/2017,Transfer,Partner,Neutered Male,2 years
1,A706918,07/05/2015 12:59:00 PM,Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,07/05/2015 03:13:00 PM,07/05/2007,Return to Owner,,Spayed Female,8 years
2,A724273,04/14/2016 06:43:00 PM,Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,04/17/2015,Return to Owner,,Neutered Male,1 year
3,A665644,10/21/2013 07:59:00 AM,Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,10/21/2013 11:39:00 AM,09/21/2013,Transfer,Partner,Intact Female,4 weeks
4,A682524,06/29/2014 10:38:00 AM,Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,07/02/2014 02:16:00 PM,06/29/2010,Return to Owner,,Neutered Male,4 years


In [6]:
# split intake_sex into gender and intake_repro
austin[['intake_repro','gender']]= austin['intake_sex'].str.split(" ", expand=True)
# split outcome_sex into gender and outcome_repro
austin[['outcome_repro','gender']]= austin['outcome_sex'].str.split(" ", expand=True)

# set the new city variable to Austin
austin['city'] = 'austin'

# after creating the new gender and repro columns, drop the old columns
austin.drop(['intake_sex','outcome_sex'], axis=1, inplace=True)

# subset to only Dogs
austin.drop(austin.loc[austin['animal_type']!='Dog'].index, inplace=True)

# drop observations where outcome is unknown
austin.drop(austin.loc[austin['outcome_type'].isna()].index, inplace=True)

# set all string columns to lower case for consistency across all datasets
austin = austin.applymap(lambda s: s.lower() if type(s) == str else s)


austin.tail()

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,intake_age,breed,color,outcome_date,birth_date,outcome_type,outcome_subtype,outcome_age,intake_repro,gender,outcome_repro,city
172475,a844230,11/05/2021 10:18:00 am,owner surrender,normal,dog,1 year,pit bull,white/brown,11/01/2021 06:08:00 pm,04/13/2020,adoption,,1 year,neutered,male,neutered,austin
172510,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,09/29/2021 01:39:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172511,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,07/16/2021 05:32:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172512,a836314,11/06/2021 11:58:00 am,owner surrender,normal,dog,2 years,pit bull mix,white/blue,11/01/2021 04:55:00 pm,06/08/2019,adoption,,2 years,neutered,male,neutered,austin
172547,a812375,01/21/2020 01:57:00 pm,owner surrender,normal,dog,10 months,great dane,black/white,01/25/2020 02:29:00 pm,03/21/2019,adoption,,10 months,spayed,female,spayed,austin


In [7]:
# transform the single column of date and time into a column for each
# for both intake and outcome

austin['intake_time'] = pd.to_datetime(austin['intake_date']).dt.time
austin['intake_date'] = pd.to_datetime(austin['intake_date']).dt.date

austin['outcome_time'] = pd.to_datetime(austin['outcome_date']).dt.time
austin['outcome_date'] = pd.to_datetime(austin['outcome_date']).dt.date

# create a column for the difference between outcome and intake
# representing how long the animal has been in the system

austin.loc[:,'days_in'] = austin.loc[:,'outcome_date'] - austin.loc[:,'intake_date']
austin['days_in'] = pd.to_numeric(austin['days_in'])/86400000000000

In [8]:
# to correctly track age, split the number and length of measurement into two columns
austin[['intake_age_n', 'intake_age_t']] = austin['intake_age'].str.split(" ",1,expand=True)

# set the number to int for calculating
austin['intake_age_n'] = austin['intake_age_n'].astype('int')

# when age is listed in years, months, or weeks, transform into days
i=0
while i < len(austin['intake_age_n']):
    if austin.iloc[i,-1] in ['year', 'years']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 365
    elif austin.iloc[i,-1] in ['month', 'months']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 30
    elif austin.iloc[i,-1] in ['week', 'weeks']:
        austin.iloc[i,-2] = austin.iloc[i,-2] * 7    
    i+=1

# drop the now unnecessary text portion of the original age column and the original column itself    
austin.drop(['intake_age_t', 'intake_age'], axis=1, inplace=True)
austin.rename(columns={'intake_age_n':'age_days'},inplace=True)


### Louisville Dataframe Formatting

In [15]:
louisville = pd.read_csv("Data/Louisville_Animal_IO_Data_5.csv")

In [16]:
louisville.rename(columns={'AnimalID':'animal_id',
                          'AnimalType':'animal_type',
                          'IntakeDate':'intake_date',
                          'IntakeType':'intake_type',
                          'IntakeSubtype':'intake_subtype',
                          'PrimaryColor':'primary_color',
                          'PrimaryBreed':'primary_breed',
                          'SecondaryBreed':'secondary_breed',
                          'SecondaryColor':'secondary_color',
                          'Gender':'gender',
                          'DOB':'birth_date',
                          'IntakeAsilomarStatus':'intake_condition',
                          'ReproductiveStatusAtIntake':'intake_repro',
                          'OutcomeDate':'outcome_date',
                          'OutcomeType':'outcome_type',
                          'OutcomeSubtype':'outcome_subtype',
                          'OutcomeAsilomarStatus':'outcome_condition',
                          'ReproductiveStatusAtOutcome':'outcome_repro',
                          }, inplace=True)

# filter to only dogs
louisville.drop(louisville.loc[louisville['animal_type']!='DOG'].index, inplace=True)

# drop observations where outcome is unknown
louisville.drop(louisville.loc[louisville['outcome_type'].isna()].index, inplace=True)

louisville.sample(25)

Unnamed: 0,animal_id,animal_type,intake_date,intake_type,intake_subtype,primary_color,primary_breed,secondary_breed,gender,secondary_color,...,IntakeInternalStatus,intake_condition,intake_repro,outcome_date,outcome_type,outcome_subtype,OutcomeReason,OutcomeInternalStatus,outcome_condition,outcome_repro
117122,A339010,DOG,2008-01-18 10:40:00,STRAY,OTC,RED,DACHSHUND - LONGHAIRED,,NEUTERED MALE,BLACK,...,NORMAL,HEALTHY,ALTERED,2008-01-29 11:04:00,ADOPTION,WALK IN,,,HEALTHY,ALTERED
101574,A493114,DOG,2013-06-03 16:56:00,STRAY,OTC,RED,PIT BULL TERRIER,,MALE,,...,SICK,HEALTHY,FERTILE,2013-06-03 18:09:00,EUTH,CONTAG DIS,,PARVO,HEALTHY,FERTILE
90210,A440053,DOG,2011-06-27 20:55:00,STRAY,FIELD,BLACK,ROTTWEILER,MIX,MALE,BROWN,...,NORMAL,HEALTHY,FERTILE,2011-06-28 12:34:00,RTO,,,,HEALTHY,FERTILE
88058,A351008,DOG,2008-05-23 18:18:00,OWNER SUR,OTC,BROWN,AMERICAN PIT BULL TERRIER,,FEMALE,BLUE,...,NORMAL,HEALTHY,FERTILE,2008-05-25 23:59:00,EUTH,BEHAV OBSV,,,HEALTHY,FERTILE
110604,A567000,DOG,2015-07-29 12:27:00,STRAY,OTC,BROWN,BEAGLE,CHIHUAHUA - SMOOTH COATED,SPAYED FEMALE,WHITE,...,NORMAL,HEALTHY,FERTILE,2015-08-08 12:50:00,ADOPTION,INTERNET,,NORMAL,HEALTHY,ALTERED
94346,A607755,DOG,2017-02-21 12:09:00,RETURN,ADOPTION,BROWN,BEAGLE,,NEUTERED MALE,WHITE,...,NORMAL,HEALTHY,ALTERED,2017-03-02 18:20:00,ADOPTION,INTERNET,,NORMAL,HEALTHY,ALTERED
27428,A309495,DOG,2007-05-09 18:45:00,STRAY,FIELD,BLACK,FOX TERRIER - SMOOTH,,FEMALE,WHITE,...,FEARFUL,HEALTHY,FERTILE,2007-05-23 14:17:00,TRANSFER,RESCUE GRP,,,HEALTHY,FERTILE
60352,A512059,DOG,2013-11-29 11:20:00,STRAY,FIELD,BROWN BRINDLE,BOXER,,NEUTERED MALE,,...,NORMAL,HEALTHY,ALTERED,2013-12-02 17:34:00,RTO,,,NORMAL,HEALTHY,ALTERED
51137,A362094,DOG,2008-09-10 21:25:00,STRAY,FIELD,TAN,BOXER,MIX,FEMALE,WHITE,...,AGGRESSIVE,UNHEALTHY/UNTREATABLE,FERTILE,2008-09-16 23:59:00,EUTH,BEHAV OBSV,,AGGRESSIVE,HEALTHY,FERTILE
136155,A484183,DOG,2013-01-13 17:50:00,OWNER SUR,FIELD,BROWN BRINDLE,GERMAN SHEPHERD DOG,MIX,SPAYED FEMALE,WHITE,...,NORMAL,HEALTHY,FERTILE,2013-01-29 16:05:00,ADOPTION,INTERNET,,,HEALTHY,ALTERED


In [17]:
louisville['secondary_breed'].fillna("", inplace=True)
louisville['secondary_color'].fillna("", inplace=True)

In [18]:
louisville['breed'] = louisville['primary_breed'] + " / " + louisville['secondary_breed']
louisville['color'] = louisville['primary_color'] + " / " + louisville['secondary_color']

In [19]:
louisville

Unnamed: 0,animal_id,animal_type,intake_date,intake_type,intake_subtype,primary_color,primary_breed,secondary_breed,gender,secondary_color,...,intake_repro,outcome_date,outcome_type,outcome_subtype,OutcomeReason,OutcomeInternalStatus,outcome_condition,outcome_repro,breed,color
4,A281756,DOG,2006-09-11 18:10:00,OWNER SUR,OTC,WHITE,PIT BULL TERRIER,,MALE,BROWN,...,FERTILE,2006-09-12 13:44:00,EUTH,TIME/SPACE,,,HEALTHY,FERTILE,PIT BULL TERRIER /,WHITE / BROWN
6,A256128,DOG,2005-11-26 12:35:00,STRAY,FIELD,BROWN,AMERICAN PIT BULL TERRIER,MIX,MALE,WHITE,...,FERTILE,2005-12-08 23:59:00,EUTH,MEDICAL,,,HEALTHY,FERTILE,AMERICAN PIT BULL TERRIER / MIX,BROWN / WHITE
8,A316619,DOG,2007-06-29 20:10:00,STRAY,FIELD,WHITE,LABRADOR RETRIEVER,MIX,MALE,TAN,...,FERTILE,2007-07-04 13:12:00,EUTH,TIME/SPACE,,,HEALTHY,FERTILE,LABRADOR RETRIEVER / MIX,WHITE / TAN
15,A319056,DOG,2007-07-19 22:32:00,STRAY,OTC,TRICOLOR,BEAGLE,MIX,NEUTERED MALE,,...,ALTERED,2007-08-07 12:13:00,EUTH,TIME/SPACE,,,HEALTHY,ALTERED,BEAGLE / MIX,TRICOLOR /
17,A258842,DOG,2005-12-21 14:30:00,CONFISCATE,NEGLECT,WHITE,PIT BULL TERRIER,,MALE,,...,FERTILE,2005-12-29 11:05:00,EUTH,BREED,,,HEALTHY,FERTILE,PIT BULL TERRIER /,WHITE /
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150835,A489623,DOG,2013-04-17 23:01:00,STRAY,FIELD,APRICOT,DANDIE DINMONT TERRIER,MIX,NEUTERED MALE,,...,FERTILE,2013-05-08 16:19:00,ADOPTION,INTERNET,,NORMAL,HEALTHY,ALTERED,DANDIE DINMONT TERRIER / MIX,APRICOT /
150836,A493152,DOG,2013-06-04 11:02:00,STRAY,FIELD,BLACK,DACHSHUND - WIREHAIRED,,SPAYED FEMALE,TAN,...,FERTILE,2013-06-16 12:55:00,ADOPTION,WEB PF,,,HEALTHY,ALTERED,DACHSHUND - WIREHAIRED /,BLACK / TAN
150837,A591486,DOG,2016-06-02 14:48:00,OWNER SUR,OTC,YELLOW BRINDLE,GREYHOUND,MIX,NEUTERED MALE,,...,FERTILE,2016-06-25 16:22:00,ADOPTION,FRIEND,,NORMAL,HEALTHY,ALTERED,GREYHOUND / MIX,YELLOW BRINDLE /
150838,A523743,DOG,2014-04-16 12:05:00,STRAY,OTC,CREAM,CAIRN TERRIER,,SPAYED FEMALE,,...,FERTILE,2014-04-23 15:11:00,ADOPTION,INTERNET,,NORMAL,HEALTHY,ALTERED,CAIRN TERRIER /,CREAM /


In [20]:
louisville.drop(['secondary_color',
                 'IntakeReason',
                 'IntakeInternalStatus',
                 'OutcomeReason',
                 'OutcomeInternalStatus',
                 'secondary_breed',
                'primary_breed',
                'primary_color'], axis=1, inplace=True)

In [21]:


# turn all string columns to all lowercase
louisville = louisville.applymap(lambda s: s.lower() if type(s) == str else s)

In [22]:
# transform the single column of date and time into a column for each
# for both intake and outcome and in the case of louisville, birth_date

louisville['intake_time'] = pd.to_datetime(louisville['intake_date']).dt.time
louisville['intake_date'] = pd.to_datetime(louisville['intake_date']).dt.date

louisville['outcome_time'] = pd.to_datetime(louisville['outcome_date']).dt.time
louisville['outcome_date'] = pd.to_datetime(louisville['outcome_date']).dt.date

louisville['birth_date'] = pd.to_datetime(louisville['birth_date']).dt.date

# calculate age as the difference between their intake date and date of birth where available
louisville.loc[:,'age_days'] = louisville.loc[:,'intake_date'] - louisville.loc[:,'birth_date']
louisville['age_days'] = pd.to_numeric(louisville['age_days'])/86400000000000

# calculate time in system
louisville.loc[:,'days_in'] = louisville.loc[:,'outcome_date'] - louisville.loc[:,'intake_date']
louisville['days_in'] = pd.to_numeric(louisville['days_in'])/86400000000000

In [23]:
louisville['city'] = 'louisville'

louisville['gender'] = louisville['gender'].str.split().str[-1]

louisville.drop(['birth_date'],axis=1,inplace=True)

louisville.head()

Unnamed: 0,animal_id,animal_type,intake_date,intake_type,intake_subtype,primary_color,primary_breed,gender,intake_condition,intake_repro,...,outcome_subtype,outcome_condition,outcome_repro,breed,color,intake_time,outcome_time,age_days,days_in,city
4,a281756,dog,2006-09-11,owner sur,otc,white,pit bull terrier,male,healthy,fertile,...,time/space,healthy,fertile,pit bull terrier /,white / brown,18:10:00,13:44:00,365.0,1.0,louisville
6,a256128,dog,2005-11-26,stray,field,brown,american pit bull terrier,male,healthy,fertile,...,medical,healthy,fertile,american pit bull terrier / mix,brown / white,12:35:00,23:59:00,-106751.991167,12.0,louisville
8,a316619,dog,2007-06-29,stray,field,white,labrador retriever,male,healthy,fertile,...,time/space,healthy,fertile,labrador retriever / mix,white / tan,20:10:00,13:12:00,-106751.991167,5.0,louisville
15,a319056,dog,2007-07-19,stray,otc,tricolor,beagle,male,healthy,altered,...,time/space,healthy,altered,beagle / mix,tricolor /,22:32:00,12:13:00,-106751.991167,19.0,louisville
17,a258842,dog,2005-12-21,confiscate,neglect,white,pit bull terrier,male,healthy,fertile,...,breed,healthy,fertile,pit bull terrier /,white /,14:30:00,11:05:00,-106751.991167,8.0,louisville


### Dallas Dataframe Formatting

In [24]:
dallas14 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2014.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas15 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2015.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas16 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2016.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas17 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2017.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas18 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2018.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas19 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2019.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})
dallas20 = pd.read_csv("Data/Dallas_Animal_Shelter_Data_Fiscal_Year_2020.csv", dtype={'Tag Type': 'string', 'Activity Number': 'string', "Service Request Number":'string'})

In [25]:
# standardize the column names across each year of the dallas datasets, including 1718 which featured
# a different naming convention that every other year.

dallas1718 = dallas17.append(dallas18)
dallas1718.rename(columns={'Animal_Id':'Animal Id',
                           'Animal_Type':'Animal Type',
                           'Animal_Breed':'Animal Breed',
                           'Kennel_Number':'Kennel Number',
                           'Kennel_Status':'Kennel Status',
                           'Tag_Type':'Tag Type',
                           'Activity_Number':'Activity Number',
                           'Activity_Sequence':'Activity Sequence',
                           'Source_Id':'Source Id',
                           'Census_Tract':'Census Tract',
                           'Council_District':'Council District',
                           'Intake_Type':'Intake Type',
                           'Intake_Subtype':'Intake Subtype',
                           'Intake_Total':'Intake Total',
                           'Staff_Id':'Staff Id',
                           'Intake_Date':'Intake Date',
                           'Intake_Time':'Intake Time',
                           'Due_Out':'Due Out',
                           'Intake_Condition':'Intake Condition',
                           'Hold_Request':'Hold Request',
                           'Outcome_Type':'Outcome Type',
                           'Outcome_Subtype':'Outcome Subtype',
                           'Outcome_Date':'Outcome Date',
                           'Outcome_Time':'Outcome Time',
                           'Receipt_Number':'Receipt Number',
                           'Impound_Number':'Impound Number',
                           'Service_Request_Number':'Service Request Number',
                           'Outcome_Condition':'Outcome Condition',
                           'Chip_Status':'Chip Status',
                           'Animal_Origin':'Animal Origin',
                           'Additional_Information':'Additional Information'
                          }, inplace=True)

dallas16.rename(columns={
                        'Animal ID':'Animal Id',
                        'Staff ID':'Staff Id',
                        'Source ID':'Source Id'
                        }, inplace=True)

In [26]:
# combine each year into one dataset
dallas = dallas14.append([dallas15,dallas16,dallas1718, dallas19, dallas20])
dallas.shape

(236972, 34)

In [27]:
dallas.sample(10)

Unnamed: 0,Animal Id,Animal Type,Animal Breed,Kennel Number,Kennel Status,Tag Type,Activity Number,Activity Sequence,Source Id,Census Tract,...,Receipt Number,Impound Number,Service Request Number,Outcome Condition,Chip Status,Animal Origin,Additional Information,Month,Year,Outcome Subtype
8387,A1015605,WILDLIFE,HAMSTER,HABITAT,UNAVAILABLE,,,1,P0813739,7102.0,...,R17-520058,K17-401438,,TREATABLE REHABILITABLE NON-CONTAGIOUS,SCAN NO CHIP,OVER THE COUNTER,,NOV.2017,FY2018,WALK IN
574,A0852018,DOG,PIT BULL,LFD 155,UNAVAILABLE,,,1,P0878955,20500.0,...,R19-545808,K19-456024,,TREATABLE REHABILITABLE NON-CONTAGIOUS,SCAN CHIP,OVER THE COUNTER,ADOPTED,APR.2019,FY2019,WALK IN
16885,A1003930,DOG,SHIH TZU,CC 02,UNAVAILABLE,,,1,P0797885,9000.0,...,R17-513094,K17-389360,,TREATABLE REHABILITABLE NON-CONTAGIOUS,SCAN NO CHIP,OVER THE COUNTER,ADOPT PEND SX8/8/17,AUG.2017,FY2017,
18016,A1025262,CAT,DOMESTIC SH,LFC 050,LAB,,,1,P0828341,10804.0,...,,K18-412269,,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,SCAN NO CHIP,OVER THE COUNTER,,MAR.2018,FY2018,HUMANE
28692,A1067339,DOG,PIT BULL,VIP27,UNAVAILABLE,,,1,P0887236,20500.0,...,R19-555596,K19-477749,,TREATABLE REHABILITABLE NON-CONTAGIOUS,SCAN CHIP,OVER THE COUNTER,ADOPTED,AUG.2019,FY2019,WALK IN
13989,A1072121,DOG,BOXER,VT 01,IMPOUNDED,,A21-264795,1,P0888659,8701.0,...,,K21-528113,,CRITICAL,SCAN CHIP,FIELD,,MAY.2021,FY2021,HUMANE
32611,A0979013,CAT,DOMESTIC SH,CC 25,UNAVAILABLE,,,1,P0772948,11500.0,...,R17-486863,K17-370420,,TREATABLE REHABILITABLE NON-CONTAGIOUS,SCAN NO CHIP,OVER THE COUNTER,ADOPT PEND SX 2/8/17,FEB.2017,FY2017,
17816,A0938117,CAT,DOMESTIC SH,VT 06,LAB,,A16-011692,1,P0733020,900.0,...,,K16-345850,,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,SCAN NO CHIP,FIELD,ORAL DS/URI,MAY.2016,FY2016,
30453,A1068758,CAT,DOMESTIC SH,FOSTER,UNAVAILABLE,,,1,P0884981,9301.0,...,,K19-462477,,TREATABLE REHABILITABLE NON-CONTAGIOUS,UNABLE TO SCAN,OVER THE COUNTER,CC 40,MAY.2019,FY2019,UNDERAGE
7465,A1002552,CAT,DOMESTIC SH,RECEIVING,IMPOUNDED,,A17-067135,1,P0000000,1600.0,...,,K17-388058,,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,SCAN NO CHIP,FIELD,,JUL.2017,FY2017,


In [28]:
dallas.drop(['Kennel Number','Kennel Status','Tag Type',
             'Activity Number','Activity Sequence','Source Id',
             'Census Tract', 'Council District', 'Intake Total', 'Reason', 'Staff Id',
            'Due Out', 'Hold Request','Receipt Number', 'Impound Number',
            'Service Request Number', 'Chip Status', 'Animal Origin',
            'Additional Information', 'Month','Year'], axis=1, inplace=True)

In [29]:
dallas.rename(columns={'Animal Id':'animal_id',
                       'Animal Type':'animal_type',
                       'Animal Breed':'breed',
                       'Intake Type':'intake_type',
                       'Intake Subtype':'intake_subtype',
                       'Intake Date':'intake_date',
                       'Intake Time':'intake_time',
                       'Intake Condition':'intake_condition',
                       'Outcome Type':'outcome_type',
                       'Outcome Date':'outcome_date',
                       'Outcome Time':'outcome_time',
                       'Outcome Condition':'outcome_condition',
                       'Outcome Subtype':'outcome_subtype',
                       'Month':'month',
                       'Year':'year'
    
}, inplace=True)
dallas.head()

Unnamed: 0,animal_id,animal_type,breed,intake_type,intake_subtype,intake_date,intake_time,intake_condition,outcome_type,outcome_date,outcome_time,outcome_condition,outcome_subtype
0,A0000575,CAT,DOMESTIC SH,STRAY,CONFINED,10/02/2014 12:00:00 AM,12/31/1899 11:56:00 AM,TREATABLE REHABILITABLE NON-CONTAGIOUS,ADOPTION,10/12/2014 12:00:00 AM,12/31/1899 03:25:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,
1,A0008962,DOG,LABRADOR RETR,CONFISCATED,KEEP SAFE,09/24/2015 12:00:00 AM,12/31/1899 03:50:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,EUTHANIZED,10/04/2015 12:00:00 AM,12/31/1899 12:22:00 PM,TREATABLE MANAGEABLE NON-CONTAGIOUS,
2,A0121376,DOG,GERM SHEPHERD,STRAY,CONFINED,05/01/2015 12:00:00 AM,12/31/1899 12:09:00 PM,TREATABLE MANAGEABLE NON-CONTAGIOUS,EUTHANIZED,05/03/2015 12:00:00 AM,12/31/1899 11:53:00 AM,TREATABLE MANAGEABLE NON-CONTAGIOUS,
3,A0129114,CAT,DOMESTIC SH,OWNER SURRENDER,GENERAL,09/19/2015 12:00:00 AM,12/31/1899 04:46:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,ADOPTION,10/26/2015 12:00:00 AM,12/31/1899 02:09:00 PM,TREATABLE REHABILITABLE NON-CONTAGIOUS,
4,A0157434,DOG,ROTTWEILER,OWNER SURRENDER,- DEAD ON ARRIVAL,12/03/2014 12:00:00 AM,12/31/1899 08:06:00 PM,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,DEAD ON ARRIVAL,12/04/2014 12:00:00 AM,12/31/1899 12:00:00 PM,UNHEALTHY UNTREATABLE NON-CONTAGIOUS,


In [30]:
# filter to only dogs
dallas.drop(dallas.loc[dallas['animal_type']!='DOG'].index, inplace=True)

# drop observations where outcome is unknown
dallas.drop(dallas.loc[dallas['outcome_type'].isna()].index, inplace=True)
dallas.drop(dallas.loc[dallas['outcome_date'].isna()].index, inplace=True)

# turn all string columns to all lowercase
dallas = dallas.applymap(lambda s: s.lower() if type(s) == str else s)

dallas['city'] = 'dallas'

In [31]:
# transform the single column of date into a column for each
# for both intake and outcome, not using time because it doesn't contain meaningful data for this particular
# dataset

dallas['intake_date'] = pd.to_datetime(dallas['intake_date']).dt.date
dallas['intake_time'] = pd.to_datetime(dallas['intake_time']).dt.time
dallas['outcome_date'] = pd.to_datetime(dallas['outcome_date']).dt.date
dallas['outcome_time'] = pd.to_datetime(dallas['outcome_time']).dt.time

# create a column for the difference between outcome and intake
# representing how long the animal has been in the system

dallas.loc[:,'days_in'] = dallas.loc[:,'outcome_date'] - dallas.loc[:,'intake_date']
dallas['days_in'] = pd.to_numeric(dallas['days_in'])/86400000000000



In [32]:
dallas.shape

(34809, 15)

In [33]:
austin.shape

(108503, 20)

In [34]:
louisville.shape

(76905, 22)

In [50]:
all = austin.merge(louisville, how="outer")
all = all.merge(dallas, how='outer')



In [51]:
# drop variables not used after subsetting individual data sets
all.drop(['intake_time','outcome_time','animal_type', 'outcome_subtype', 'birth_date'], inplace=True, axis=1)

# set all negative ages to NaN, implies missing birth_date data
all['age_days'] = all['age_days'].mask(all['age_days'] < 0, 'NaN')

all.shape

(220217, 19)

In [52]:
print(all.isnull().sum())

animal_id                 0
intake_date               0
intake_type               0
intake_condition          0
breed                     0
color                 34823
outcome_date              6
outcome_type              0
outcome_age          111714
intake_repro          34810
gender                35300
outcome_repro         34816
city                      0
days_in                   0
age_days              34809
intake_subtype       112182
primary_color        143326
primary_breed        143312
outcome_condition    110140
dtype: int64


In [38]:
d = all[all['city']=='dallas']
print(d.isnull().sum())

animal_id                0
intake_date              0
intake_type              0
intake_condition         0
breed                    0
color                34809
outcome_date             0
outcome_type             0
outcome_age          34809
intake_repro         34809
gender               34809
outcome_repro        34809
city                     0
days_in                  0
age_days             34809
intake_subtype         671
primary_color        34809
primary_breed        34809
outcome_condition     1631
dtype: int64


In [39]:
a = all[all['city']=='austin']
print(a.isnull().sum())

animal_id                 0
intake_date               0
intake_type               0
intake_condition          0
breed                     0
color                     0
outcome_date              0
outcome_type              0
outcome_age               0
intake_repro              1
gender                  491
outcome_repro             1
city                      0
days_in                   0
age_days                  0
intake_subtype       108503
primary_color        108503
primary_breed        108503
outcome_condition    108503
dtype: int64


In [40]:
l = all[all['city']=='louisville']
print(l.isnull().sum())

animal_id                0
intake_date              0
intake_type              0
intake_condition         0
breed                    0
color                   14
outcome_date             6
outcome_type             0
outcome_age          76905
intake_repro             0
gender                   0
outcome_repro            6
city                     0
days_in                  0
age_days                 0
intake_subtype        3008
primary_color           14
primary_breed            0
outcome_condition        6
dtype: int64


## Process Categorical Variables

### Consolidate Intake_Condition

In [53]:
all['intake_condition'].replace({
# mark all normal, treatable+manageable+non-contagious as relatively healthy
# as there is no "more positive" denotation in the dataset
    'normal':'healthy', 
    'treatable/manageable':'healthy',
    'treatable manageable non-contagious':'healthy',
# mark all non-contagious rehabilitable instances to injured
    'treatable rehabilitable non-contagious':'injured',
# if not clear whether sick or injured, list as unhealthy
    'unhealthy untreatable non-contagious':'unhealthy',
    'unhealthy/untreatable':'unhealthy',
    'critical':'unhealthy',
    'med urgent':'unhealthy',
    'fatal':'unhealthy',
    'medical':'unhealthy',
# mark all "contagious" instances to sick
    'unhealthy untreatable contagious':'sick',
    'treatable rehabilitable contagious':'sick',
    'treatable manageable contagious':'sick',
# mark behavior issues as other given relative infrequency
    'feral':'other',
    'behavior':'other',
# combine nursing and pregnant
    'nursing':'pregnant/nursing',
    'pregnant':'pregnant/nursing',
# standardize other category names
    'app well':'healthy',
    'app inj':'injured',
    'app sick':'sick',
    'app wnl':'healthy',
    'unknown':'other',
    'deceased':'dead',
# drop aged and neonatal since age is captured in another variable
# assume that if age is the only qualifier, they are likely otherwise healthy
    'underage':'healthy',
    'aged':'healthy',
    'neonatal':'healthy',
}, inplace=True)

all.intake_condition.value_counts()

healthy             181966
injured              25673
unhealthy             7729
sick                  3020
pregnant/nursing      1405
other                  369
dead                    55
Name: intake_condition, dtype: int64

### Consolidate Intake_Subtype

In [54]:
all['intake_subtype'].replace({
    'with id':'other',
    'spca texas':'other',
    'trap':'field',
    'aid':'assist',
    'night':'field',
    'died':'dead',
    'weather':'field',
    'left at':'abuse',
    'vehicletow':'field',
    'neglect':'abuse',
    'cruelty':'abuse',
    'otc owned':'surrender',
    'mas':'other',
    'abandoned':'abuse',
    'sick':'condition',
    'field own':'surrender',
    'unpermited':'owner',
    'chaining':'abuse',
    'bite':'behavior',
    'aggressive':'behavior',
    'potdanger':'behavior',
    'police':'assist',
    'k humane s':'assist',
    'an control':'assist',
    'return':'adoption',
    'danger dog':'behavior',
    'old':'condition',
    'restraint':'behavior',
    'owner died':'owner',
    'hospital':'owner',
    'web':'adoption',
    'eviction':'owner',
    'owner sur':'surrender',
    'return 30':'return',
    'neuter':'medical',
    'post surg':'medical',
    'court ord':'owner',
    'rescue grp':'transfer',
    'quarantine':'medical',
    'at large':'behavior',
    'possibly owned':'assist',
    'confined':'abuse',
    'keep safe':'other',
    '- dead on arrival':'doa',
    'return30':'return',
    'quarantine - dead on arrival':'doa',
    'trap program':'field',
    'keep safe - dead on arrival':'doa',
    'cruelt - dead on arrival':'doa',
    'dangerous':'behavior',
    'dead on arrival':'doa',
    'quarantine dead on arrival':'doa',
    'heart worm':'condition',
    'keep safe dead on arrival':'doa',
    'surgery':'medical',
    'cruelt dead on arrival':'doa',
    'treatment':'medical',
    'sx post op':'medical',
    'dead':'doa',
    'appoint':'medical',
    'followup':'medical',
    'alumni':'other',
    'sac':'other',
    'own hospit':'owner',
    'spay/neut':'owner',
    'own deceas':'owner',
    'illness':'condition',
    'agg opps':'field',
    'own arrest':'owner',
    'for adopt':'adoption',
    'disaster':'owner',
    'injured':'condition',
    'walk in':'assist',
    'transport':'other',
    'pick up':'other',
    'urgent':'medical',
    'owner surr':'surrender',
    'euthanasia request':'euth req',
    'euthanasia requested':'euth req',
    'transfer':'other',
    ' ':'other'
}, inplace=True)

all.intake_subtype.value_counts()

field        30817
otc          30148
behavior     16828
general       7501
abuse         6072
adoption      3302
surrender     2638
euth req      2422
medical       1757
owner         1337
assist        1257
return        1121
other         1104
condition     1078
stray          373
doa            267
transfer        11
dead             2
Name: intake_subtype, dtype: int64

### Remove all dogs that were DOA :(

In [55]:
all = all[all.intake_condition != 'dead']
all = all[all.intake_subtype != 'doa']
all.shape

(219908, 19)

### Transform Breed into Dummy Variables

In [57]:
# replace frequent shorthands and typos to make the rest of the processing
# below easier

all['breed'].replace({
    'alask ':'alaskan ',
    'belg ':'belgian ',
    'span ':'spaniel ',
    'span/':'spaniel/',
    'mtn':'mountain',
    'amer ':'american ',
    'aust ':'australian ',
    'retr':'retriever',
    'chesa':'chesapeake bay',
    ' lh':' longhair',
    ' bordx':' bordeaux',
    'eng ':'english ',
    ' sprngr ':' springer ',
    'flat-coated':'flat coat',
    ' terr ':' terrier ',
    'gr swiss ':'great swiss ',
    'ital ':'italian ',
    'mex ':'mexican ',
    'min ':'minature ',
    'norw ':'norwegian ',
    'ns ':'nova scotia',
    'oldeng':'old english',
    '(jack) ':'',
    'russ terr':'russell terrier',
    'terr/':'terrier/',
    'pbgv':'petit basset griffon vendeen',
    'pitbull':'pit bull',
    'rhod ':'rhodesian ',
    'scot ':'scottish ',
    'sheltd ':'shetland ',
    'soft-coated':'soft coated',
    'soft coated':'soft coat',
    ' rgh':' rough coat',
    'rough coated':'rough coat',
    ' smth':' smooth coat',
    'swed ':'swedish',
    'tenn tr ':'tennesee treeing',
    'tr ':'tree ',
    ' - ':' ',
    'austalian':'australian',
    'stffordshire':'staffordshire',
    'bostn':'boston',
    'mastff':'mastiff',
    'chinese crestd':'chinese crested',
    'dachsund':'dachshund',
    'manchestr':'manchester',
    'munstrlander':'munsterlander',
    'stndard':'standard',
    'anatol shepherd':'anatolian shepherd',
    'bluetick hound':'bluetick coonhound',
    'bouv flandres':'bouvier des flandres',
    'bruss griffon':'brussels griffon',
    'catahoula leopard hound':'catahoula',
    'cavalier spaniel':'cavalier king charles spaniel',
    'chihuahua shorthair':'chihuahua',
    'chihuahua longhair':'chihuahua',
    'chihuahua short haired':'chihuahua',
    'chihuahua long haired':'chihuahua',
    'chihuahua smooth coated':'chihuahua',
    'cocker span':'cocker spaniel',
    'cocker american':'cocker spaniel',
    'cocker-poo':'cocker spaniel poodle',
    'dandie dinmont terrier':'dandie dinmont',
    'doberman pinsch':'doberman pinscher',
    'german shorthaired pointer':'german shorthair pointer',
    'german wirehaired pointer':'german wirehair pointer',
    'picardy sheepdg':'picardy sheepdog',
    'redbone hound':'redbone coonhound',
    'soft coated wheaten terrier':'soft coat wheaten terrier',
    'st bernard smooth coated':'st bernard smooth coat',
    'schnauzerard':'schnauzer'
}, inplace=True, regex=True)

In [63]:
# replace one-off breed names that are the entire entry

all['breed'].replace({
    'airedale terr':'airedale terrier',
    'am pit bull ter':'american pit bull terrier',
    'american staff':'american staffordshire terrier',
    'black and tan coonound':'black/tan hound',
    'boykin span':'boykin spaniel',
    'bull terrier min':'bull terrier miniature',
    'cavalier span':'cavalier spaniel',
    'chihuahua sh':'chihuahua shorthair',
    'cocker amer':'cocker spaniel',
    'cocker-poo':'cocker spaniel poodle',
    'collie':'border collie',
    'dutch sheepdog':'dutch shepherd',
    'gbgv':'grand basset griffon vendeen',
    'germ sh point':'german shorthair pointer',
    'germ shepherd':'german shepherd',
    'germ wh point':'german wirehair pointer',
    'german shorthaired pointer':'german shorthair pointer',
    'german wirehaired pointer':'german wirehair pointer',
    'neopolitan mast':'neopolitan mastiff',
    'nova scotiaduck tolling':'nova scotia duck tolling retriever',
    'sheepdg':'sheepdog',
    'polish lowland sheepdog':'polish lowland',
    'poodle min':'minature poodle',
    'poodle minature':'minature poodle',
    'poodle standard':'standard poodle',
    'poodle stnd':'standard poodle',
    'poodle toy':'toy poodle',
    'port water dog':'portuguese water dog',
    'queensland heeler':'queensland heel',
    'redbone coonhound':'redbone hound',
    'schnauzer min':'minature schnauzer',
    'schnauzer standard':'schnauzer stand',
    'tennesee treeingbrindle':'tennesee treeing brindle hound',
    'treeing tennesse brindle':'tennesee treeing brindle hound',
    'tibetan span':'tibetan spaniel',
    'kerry blue terr':'kerry blue terrier',
    'manchester terr':'manchester terrier',
    'westhighland':'west highland',
    'st\. bernard rough coat':'st bernard',
    'st\. bernard smooth coat':'st bernard',
    'pit bull':'pitbull',
    'pit bull mix':'pitbull',
    'lakeland terr':'lakeland terrier',
    'neapolitan mast':'neapolitan mastiff',
    'parson russ ter':'parson russel terrier'
}, inplace=True)

# replace nonsensical consequencial errors that occur due to the above changes
all['breed'].replace({'retrieveriever':'retriever',
                      'schnauzer stand':'standard schnauzer',
                      'queensland heel':'queensland heeler',
                      'bay bay':'bay',
                      'chesapeake baypeake bay retriever':'chesapeake bay retriever',
                      'doberman pinscherer':'doberman pinscher',
                      'st\.':'st'
                     }, inplace=True, regex=True)

In [60]:
# create comphrehensive list of breeds to turn into dummy variables
# based on their appearance in the breed description column

breeds = ['affenpinscher', 'afghan hound', 'airedale terrier', 'akbash', 'akita',
          'australian shepherd', 'alaskan husky', 'alaskan klee kai', 'alaskan malamute',
          'australian kelpie', 'american bulldog', 'american eskimo',
          'american foxhound', 'american pit bull terrier',
          'american staffordshire terrier', 'anatolian shepherd',
          'australian cattle dog', 'australian kelpie', 'australian shepherd',
          'australian terrier', 'basenji', 'basset hound',
          'beagle', 'bearded collie', 'beauceron', 'bedlington terrier',
          'belgian laekenois', 'belgian malinois', 'belgian sheepdog',
          'belgian tervuren', 'bernese hound', 'bernese mountain dog', 'bichon frise',
          'black mouth cur', 'black/tan hound', 'bloodhound', 'blue lacy',
          'bluetick coonhound', 'boerboel', 'border collie', 'border terrier',
          'borzoi', 'boston terrier', 'bouvier des flandres', 'boxer', 'boykin spaniel',
          'briard', 'brittany', 'brussels griffon', 'bull terrier',
          'bull terrier miniature', 'bulldog', 'bullmastiff', 'cairn terrier',
          'canaan dog', 'cane corso', 'cardigan welsh corgi', 'carolina dog',
          'catahoula', 'cavalier king charles spaniel', 'chesapeake bay retriever', 'chihuahua longhair',
          'chihuahua shorthair', 'chinese crested', 'chinese sharpei', 'chow chow',
          'clumber spaniel', 'cocker spaniel', 'collie rough', 'collie smooth',
          'coonhound', 'coton de tulear', 'dachshund',
          'dalmatian', 'dandie dinmont', 'doberman pinscher', 'dogo argentino', 'dogue de bordeaux',
          'dutch shepherd', 'english bulldog', 'english cocker spaniel', 'english coonhound',
          'english foxhound', 'english pointer', 'english setter', 'english shepherd',
          'english springer spaniel', 'english toy spaniel', 'entlebucher',
          'feist', 'field spaniel', 'fila brasileiro', 'finnish spitz', 'flat coat retriever',
          'fox terrier', 'french bulldog', 'german pinscher', 'german shepherd', 'german shorthair pointer',
          'german wirehair pointer','glen of imaal', 'golden retriever', 'gordon setter', 'great dane',
          'great pyrenees', 'great swiss mountain', 'greyhound', 'harrier', 'havanese', 'hound',
          'hovawart', 'ibizan hound', 'irish setter', 'irish terrier', 'irish wolfhound', 'italian greyhound',
          'japanese chin', 'jindo', 'kangal', 'karelian bear dog', 'keeshond', 'kerry blue terrier',
          'korean jindo', 'kuvasz', 'labrador retriever', 'lakeland terrier', 'landseer', 'leonberger',
          'lhasa apso', 'lowchen', 'maltese', 'manchester terrier', 'mastiff', 'mexican hairless',
          'minature pinscher', 'minature poodle', 'minature schnauzer', 'munsterlander', 'neapolitan mastiff',
          'newfoundland', 'norfolk terrier', 'norwegian buhund', 'norwegian elkhound', 'norwich terrier',
          'nova scotia duck tolling retriever', 'old english bulldog', 'old english sheepdog',
          'otterhound', 'papillon', 'parson russell terrier', 'patterdale terrier', 'pekapoo',
          'pekingese','pembroke welsh corgi', 'petit basset griffon vendeen', 'pharaoh hound',
          'picardy sheepdog', 'pit bull', 'plott hound', 'podengo pequeno', 'pointer', 'polish lowland',
          'pomeranian', 'toy poodle', 'portuguese water dog', 'presa canario', 'pug',
          'puli', 'queensland heeler', 'rat terrier', 'redbone coonhound', 'rhodesian ridgeback',
          'rottweiler', 'russell terrier', 'saluki', 'samoyed', 'schipperke', 'schnauzer giant',
          'scottish deerhound', 'scottish terrier', 'shepherd', 'shetland sheepdog',
          'shiba inu', 'shih tzu', 'siberian husky', 'silky terrier', 'smooth fox terrier',
          'soft coated wheaten terrier', 'spanish mastiff', 'spanish water dog', 'spinone italiano',
          'st bernard rough coat', 'st bernard smooth coat', 'staffordshire', 'standard poodle',
          'standard schnauzer', 'sussex spaniel', 'swedish vallhund', 'swiss hound',
          'tennesee treeing brindle hound', 'terrier', 'tibetan mastiff', 'tibetan spaniel',
          'tibetan terrier', 'toy fox terrier', 'toy poodle', 'tree walker hound', 'treeing cur',
          'treeing walker coonhound', 'vizsla', 'weimaraner', 'welsh springer spaniel', 'west highland',
          'whippet', 'wire hair fox terrier', 'wirehaired pointing griffon', 'wirehaired vizsla', 
          'wolf hybrid', 'yorkshire terrier'     
         ]
len(breeds)

218

In [64]:
for b in all['breed'].unique():
    print(b)

beagle mix
english springer spaniel
basenji mix
doberman pinscher/australian cattle dog
labrador retriever mix
great dane mix
chihuahua
pitbull
australian cattle dog/labrador retriever
parson russell terrier mix
norfolk terrier
yorkshire terrier mix
maltese mix
dachshund mix
boxer mix
plott hound mix
labrador retriever
tibetan spaniel mix
miniature pinscher mix
chihuahua mix
pit bull/australian cattle dog
yorkshire terrier
dachshund/chihuahua
german shepherd/chow chow
miniature schnauzer mix
german shepherd/australian cattle dog
german shepherd
great dane
australian kelpie mix
alaskan husky
great pyrenees mix
german shepherd mix
feist/beagle
american pit bull terrier mix
norfolk terrier mix
rottweiler mix
chihuahua/russell terrier
italian greyhound mix
treeing walker coonhound mix
boxer
staffordshire/english bulldog
bull terrier mix
australian shepherd/chow chow
black mouth cur/pit bull
boston terrier/pembroke welsh corgi
black mouth cur mix
german shepherd/great pyrenees
cairn terrier

miniature pinscher/smooth fox terrier
labrador retriever/vizsla
doberman pinscher/beauceron
catahoula/cardigan welsh corgi
lhasa apso/pekingese
pointer/harrier
labrador retriever/belgian malinois
bluetick coonhound/australian kelpie
rat terrier/pit bull
whippet/borzoi
cavalier spaniel/border collie
boxer/dogue de bordeaux
pug/french bulldog
black mouth cur/blue lacy
rottweiler/doberman pinscher
skye terrier/miniature poodle
rottweiler/australian shepherd
catahoula/great dane
chihuahua/doberman pinscher
boxer/dachshund
pit bull/rat terrier
collie smooth/saluki
bulldog/boxer
border collie/basset hound
pit bull/cardigan welsh corgi
dachshund/english foxhound
basset hound/beauceron
welsh springer spaniel
english coonhound/beagle
blue lacy/labrador retriever
siberian husky/american pit bull terrier
chesapeake bay retriever
labrador retriever/pekingese
wire hair fox terrier/golden retriever
norfolk terrier/dachshund
standard schnauzer/australian shepherd
old english bulldog
presa canario
whi

shih tzu / cairn terrier
anatolian shepherd / great dane
polish lowland sheepdog / 
bichon frise / pure bred
shetland sheepdog / tibetan terrier
chow chow / rat terrier
shetland sheepdog / chow chow
shetland sheepdog / beagle
collie rough / siberian husky
chihuahua smooth coated / scottish terrier
whippet / labrador retriever
schnauzer miniature / schnauzer miniature
cairn terrier / poodle miniature
parson (jack) russell terrier / dachshund
mastiff / rottweiler
korean jindo / carolina dog
chihuahua smooth coated / basenji
pomeranian / poodle miniature
pomeranian / parson (jack) russell terrier
carolina dog / mix
fox terrier wirehaired / beagle
labrador retriever / american bulldog
pug / pit bull terrier
bloodhound / german shepherd dog
pit bull terrier / black and tan coonound
australian shepherd / australian shepherd
yorkshire terrier / schnauzer miniature
shih tzu / poodle miniature
patterdale terrier / mix
chihuahua smooth coated / fox terrier smooth
pug / boxer
chow chow / shetland

feist / parson (jack) russell terrier
norfolk terrier / shih tzu
beauceron / german shepherd dog
cane corso / st bernard smooth coated
russ ter / patterdale terrier
leonberger / 
welsh corgi pembroke / pug
german shepherd dog / whippet
bernese mountain dog / german shepherd dog
chihuahua smooth coated / french bulldog
english shepherd / australian shepherd
miniature pinscher / pomeranian
st bernard smooth coated / english coonhound (redtick coonhound)
schnauzer miniature / pug
english toy spaniel / pug
black mouth cur / mastiff
boxer / st bernard rough coat
boston terrier / welsh corgi pembroke
brussels griffon / cairn terrier
chinese sharpei / boxer
cairn terrier / petit basset griffon vendeen
basenji / labrador retriever
border terrier / pekingese
akita / bearded collie
english coonhound (redtick coonhound) / redbone coonhound
bulldog / german shepherd dog
bull terrier / weimaraner
whippet / beagle
boston terrier / scottish terrier
soft coated wheaten terrier / australian shepherd
du

In [61]:
# create a dummy variable for each breed in the above list
# if the breed is listed in the breed variable column, set new dummy variable to 1 else 0

for b in breeds:
    all[b] = all['breed'].apply(lambda x: int(b in x) if isinstance(x,str) else 0)

In [79]:
for c in all.columns[19:]:
    print(c + ": " + str(all[c].sum()))

affenpinscher: 62
afghan hound: 7
airedale terrier: 142
akbash: 16
akita: 788
australian shepherd: 2783
alaskan husky: 1407
alaskan klee kai: 5
alaskan malamute: 337
australian kelpie: 930
american bulldog: 1973
american eskimo: 291
american foxhound: 196
american pit bull terrier: 3200
american staffordshire terrier: 1354
anatolian shepherd: 1294
australian cattle dog: 5315
australian terrier: 93
basenji: 369
basset hound: 1389
beagle: 8911
bearded collie: 104
beauceron: 104
bedlington terrier: 13
belgian laekenois: 1
belgian malinois: 441
belgian sheepdog: 18
belgian tervuren: 21
bernese hound: 8
bernese mountain dog: 112
bichon frise: 375
black mouth cur: 1149
black/tan hound: 320
bloodhound: 284
blue lacy: 459
bluetick coonhound: 174
boerboel: 17
border collie: 4979
border terrier: 631
borzoi: 6
boston terrier: 924
bouvier des flandres: 19
boxer: 7114
boykin spaniel: 17
briard: 11
brittany: 189
brussels griffon: 188
bull terrier: 19736
bull terrier miniature: 34
bulldog: 3120
bullm

### Create Breed Group Column

In [94]:
# create a new empty string column for the breed group names to be appended to
all['group'] = ''

# assign all relevant breeds to a list based on AKS classification
herding_list = [
    'australian cattle dog', 'australian shepherd', 'bearded collie', 'beauceron', 'belgian laekenois',
    'belgian malinois', 'belgian sheepdog', 'belgian tervuren', 'border collie', 'bouvier des flandres', 'briard',
    'canaan dog', 'cardigan welsh corgi', 'collie', 'entlebucher', 'german shepherd', 'norwegian buhund',
    'old english sheepdog', 'pembroke welsh corgi', 'puli', 'shetland sheepdog', 'swedish vallhund'
]

hound_list = [
    'afghan hound', 'american foxhound', 'basenji', 'basset hound', 'beagle', 'black/tan hound', 'coonhound',
    'bloodhound', 'bluetick coonhound', 'borzoi', 'dachshund', 'english foxhound', 'greyhound',
    'harrier', 'ibizan hound', 'irish wolfhound', 'norwegian elkhound', 'otterhound',
    'petit basset griffon vendeen', 'pharaoh hound', 'plott hound', 'podengo pequeno',
    'redbone hound', 'rhodesian ridgeback', 'saluki', 'scotish deerhound', 'treeing walker coonhound',
    'whippet'
]

toy_list = [
    'affenpinscher', 'brussels griffon', 'cavalier king charles spaniel', 'chihuahua', 'chinese crested',
    'english toy spaniel', 'havanese', 'italian greyhound', 'japanese chin', 'maltese', 'manchester terrier',
    'miniature pinscher', 'papillon', 'pekingese', 'pomeranian', 'toy poodle', 'pug', 'shih tzu',
    'silky terrier', 'toy fox terrier', 'yorkshire terrier'
]

nonsport_list = ['american eskimo', 'bichon frise', 'boston terrier', 'bulldog', 'chinese sharpei',
                'chow chow', 'coton de tulear', 'dalmatian', 'finnish spitz', 'french bulldog', 'keeshond',
                'lhasa apso', 'lowchen', 'standard poodle', 'shiba inu', 'tibetan spaniel', 'tibetan terrier'
                ]

sport_list = [
    'boykin spaniel', 'brittany', 'chesapeake bay retriever', 'clumber spaniel', 'cocker spaniel',
    'english cocker spaniel', 'english setter', 'english springer spaniel', 'field spaniel',
    'flat coat retriever', 'german shorthair pointer', 'german wirehair pointer', 'golden retriever',
    'gordon setter', 'irish setter', 'labrador retriever', 'nova scotia duck tolling retriever', 'pointer',
    'spinone italiano', 'sussex spaniel', 'vizsla', 'weimaraner', 'welsh springer spaniel',
    'wirehaired pointed griffon', 'wirehaired vizsla'
            ]

terrier_list = [
    'airedale terrier', 'american staffordshire terrier', 'australian terrier', 'bedlington terrier',
    'border terrier', 'bull terrier', 'cairn terrier', 'dandie dinmont', 'glen of imaal',
    'irish terrier', 'kerry blue terrier', 'lakeland terrier', 'manchester terrier', 'bull terrier minature',
    'norfolk terrier', 'norwich terrier', 'parson russell terrier', 'rat terrier', 'russell terrier',
    'scottish terrier', 'smooth fox terrier', 'soft coated wheaten terrier', 'staffordshire', 'welsh terrier',
    'west highland', 'wire hair fox terrier'
]

working_list = [
    'akita', 'alaskan malamute', 'anatolian shepherd', 'bernese mountain dog', 'boerboel', 'boxer',
    'bullmastiff', 'cane corso', 'doberman pinscher', 'dogo argentino', 'dogue de bordeaux',
    'german pinscher', 'schnauzer giant', 'great dane', 'great pyrenees', 'great swiss mountain',
    'kuvasz', 'leonberger', 'mastiff', 'neapolitan mastiff', 'newfoundland', 'portuguese water dog',
    'rottweiler', 'st bernard', 'samoyed', 'siberian husky', 'standard schnauzer', 'tibetan mastiff'
]

misc_list = [
    'belgian laekenois', 'dutch shepherd'
]

fss_list = [
    'american bulldog', 'australian kelpie', 'carolina dog', 'catahoula', 'hovawart', 'jindo',
    'karelian bear dog'
]

# in addition to the AKS classification, create a subset of breeds typically considered pit bulls
# based on the American Humane Society's classification
pit_list = [
    'american staffordshire terrier', 'american bull terrier', 'american pit bull terrier', 'pitbull',
    'american bulldog', 'staffordshire', 'pit bull'
]

In [81]:
# evaluate each row and each column that represents a breed
# from 22 up to the newly created group column
r=0
c=22

while r < len(all):
    while c < len(all.columns):
        
        # if the column name is in a given list and the value in the given cell is 1
            # assign that breed group name to the group variable, appending it to the existing string
        
        if (all.columns[c] in herding_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "herding, "
            
        elif (all.columns[c] in hound_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "hound, "
       
        elif (all.columns[c] in toy_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "toy, "
        
        elif (all.columns[c] in nonsport_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "non-sporting, "
        
        elif (all.columns[c] in sport_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "sporting, "
        
        elif (all.columns[c] in terrier_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "terrier, "
        
        elif (all.columns[c] in working_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "working, "
        
        elif (all.columns[c] in misc_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "misc, "
        
        elif (all.columns[c] in fss_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "fss, "
        
        elif (all.columns[c] in pit_list) and (all.iloc[r,c] == 1):
            all.iloc[r,-1] = all.iloc[r,-1] + "pitbulls, "
    
    #iterate through each column, then reset to the first breed column and repeat for the next row
        c+=1
    c=22    
    r+=1

In [96]:
## SPLIT GROUPS INTO EACH COLUMN
all['group'].value_counts()
groups = ['herding', 'hound', 'toy', 'non-sporting',
               'sporting', 'terrier', 'working', 'misc', 'fss', 'pitbulls']

for g in groups:
    all[g] = all['group'].apply(lambda x: int(g in x) if isinstance(x,str) else 0)

In [98]:
all.drop(['group'], inplace=True, axis=1)

### Transform Color into Dummy Variables

In [99]:
all['color'].replace({
    'apricot':'yellow',
    'beige':'tan',
    'blonde':'yellow',
    'liver':'brindle',
    'orange':'yellow',
    'pink':'red',
    'ruddy':'red',
    'tiger':'brindle',
    'gold':'yellow',
    'silver':'gray'
}, inplace=True, regex=True)


colors = ['black','blue','brindle','brown','buff','chocolate',
          'cream','fawn','gray','merle','red',
          'sable','tan','tick','tricolor','white','yellow']

for c in colors:
    all[c] = all['color'].apply(lambda x: int(c in x) if isinstance(x,str) else 0)

### Consolidate Intake_Type

In [100]:
all['intake_type'].replace({
# standardize across different datasets
    'confiscate':'confiscated',
    'foster':'return',
    'owner sur':'owner surrender',
# set all euthanasia and disposal to euthanasia request
    'euthanasia':'euthanasia request',
    'euth req':'euthanasia request',
    'dispos req':'euthanasia request',
    'et request':'euthanasia request',
    'disposal':'euthanasia request',
# aggregate all medical reasons
    'outsurgery':'medical',
    'treatment':'medical',
    'quarantine':'medical',
    'med observ':'medical',
# generalize several niche stray conditions to stray
    'lost':'stray',
    'found':'stray',
    'wildlife':'stray',
    'evacuee':'stray',
    'abandoned':'stray',
# add reported animals to public assist    
    'lost report':'public assist',
    'found report':'public assist',
# add all infrequent specifications to other to reduce number of categories
    'khs':'other',
    'investigat':'other',
    'keepsafe':'other',
    'transport':'other',
    'for transp':'other',
    'transfer':'other'
}, inplace=True)

all.intake_type.value_counts()

stray                 137443
owner surrender        52750
public assist          11738
confiscated             5922
euthanasia request      4343
medical                 3636
return                  3553
other                    523
Name: intake_type, dtype: int64

### Consolidate Gender & Reproductive Status

In [101]:
all['gender'].replace({'litter':'unknown'},inplace=True)

all.gender.value_counts()

male       101317
female      82348
unknown      1252
Name: gender, dtype: int64

In [102]:
all['intake_repro'].replace({
    'fertile':'intact',
    'neutered':'altered',
    'spayed':'altered'
},inplace=True)

all.intake_repro.value_counts()

intact     108857
altered     74808
unknown      1742
Name: intake_repro, dtype: int64

### Consolidate Outcome Condition

In [103]:
all['outcome_condition'].replace({
# mark all normal, treatable+manageable+non-contagious as relatively healthy
# as there is no "more positive" denotation in the dataset
    'normal':'healthy', 
    'treatable/manageable':'healthy',
    'treatable manageable non-contagious':'healthy',
# mark all non-contagious rehabilitable instances to injured
    'treatable rehabilitable non-contagious':'injured',
# if not clear whether sick or injured, list as unhealthy
    'unhealthy untreatable non-contagious':'unhealthy',
    'unhealthy/untreatable':'unhealthy',
    'critical':'unhealthy',
    'med urgent':'unhealthy',
    'fatal':'unhealthy',
    'medical':'unhealthy',
# mark all "contagious" instances to sick
    'unhealthy untreatable contagious':'sick',
    'treatable rehabilitable contagious':'sick',
    'treatable manageable contagious':'sick',
# mark behavior issues as other given relative infrequency
    'feral':'other',
    'behavior':'other',
# combine nursing and pregnant
    'nursing':'pregnant/nursing',
    'pregnant':'pregnant/nursing',
# standardize other category names
    'app well':'healthy',
    'app inj':'injured',
    'app sick':'sick',
    'app wnl':'healthy',
    'unknown':'other',
    'deceased':'dead',
# drop aged and neonatal since age is captured in another variable
# assume that if age is the only qualifier, they are likely otherwise healthy
    'underage':'healthy',
    'aged':'healthy',
    'neonatal':'healthy',
}, inplace=True)

all.outcome_condition.value_counts()

healthy      85377
injured      19433
unhealthy     3443
sick          1420
other           82
dead            19
Name: outcome_condition, dtype: int64

### Create Euthanasia Outcome Dummy Variable

In [104]:
# consolidate the euthanasia outcomes to create a dummy variable
# for whether the dog was euthanized or not

all['outcome_type'].replace({
   'euth':'euthanasia',
    'euthanized':'euthanasia',
}, inplace=True)
all.outcome_type.value_counts()

# 
all['euthanasia'] = 1*(all['outcome_type'] == 'euthanasia')

## Finalize City Datasets

In [106]:
# set final variables as categorical
for col in ['intake_type', 'intake_condition', 'outcome_type', 
            'intake_repro','gender','outcome_repro',
            'city', 'intake_subtype', 'outcome_condition']:
    all[col] = all[col].astype('category')
    

# drop variables used for subsetting and creating other variables
all.drop(['breed','color', 'outcome_type'], inplace=True, axis=1)

all.sample(5)

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,outcome_date,outcome_age,intake_repro,gender,outcome_repro,city,...,gray,merle,red,sable,tan,tick,tricolor,white,yellow,euthanasia
143628,a313306,2007-07-10,owner surrender,healthy,2007-07-27,,altered,female,altered,louisville,...,0,0,0,0,0,0,0,0,0,0
12280,a746960,2017-04-11,stray,healthy,2017-04-16,5 years,intact,male,neutered,austin,...,0,0,0,0,0,0,0,0,1,0
21745,a708411,2020-09-30,public assist,healthy,2016-02-15,2 years,altered,male,neutered,austin,...,0,0,1,0,0,0,0,1,0,0
208029,a1074496,2019-07-07,stray,injured,2019-07-07,,,,,dallas,...,0,0,0,0,0,0,0,0,0,0
43860,a713208,2015-11-07,stray,healthy,2015-11-17,7 months,altered,male,neutered,austin,...,0,0,0,0,0,0,0,1,0,0


In [125]:
all.drop(['primary_color','primary_breed'], inplace=True, axis=1)

In [134]:
# subset each city with consolidated and consistent categories and breed data structure
austin = all[all['city'] == 'austin']
louisville = all[all['city'] == 'louisville']
dallas = all[all['city'] == 'dallas']

In [127]:
# drop columns not present in each respective city
dallas.drop(['outcome_age', 'intake_repro', 'gender', 'outcome_repro', 'age_days',
            'black','blue','brindle','brown','buff','chocolate',
            'cream','fawn','gray','merle','red',
            'sable','tan','tick','tricolor','white','yellow'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [129]:
louisville.drop(['outcome_age'], inplace=True, axis=1)

In [135]:
austin.drop(['intake_subtype','outcome_condition'], inplace=True, axis=1)
austin['gender'].fillna('unknown', inplace=True)
austin['intake_repro'].fillna('unknown', inplace=True)
austin['outcome_repro'].fillna('unknown', inplace=True)
austin['age_days'].fillna(austin['age_days'].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [136]:
austin.to_csv('austin_dw.csv', index=False)