### 1. Data Import

In [291]:
# import animal shelter data set and investigate
% matplotlib inline
import pylab as p
import pandas as pd
import numpy as np

animaltrain = pd.read_csv('train.csv', header = 0)
animaltest = pd.read_csv('test.csv', header = 0)

In [292]:
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [293]:
animaltest.head(3)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby


### 2. Data Exploration and Adaptation

In [294]:
animaltrain.shape == animaltrain.dropna().shape # there are nan's in the training set

False

In [295]:
animaltest.shape == animaltest.dropna().shape # there are nan's in the test set

False

In [296]:
animaltrain[ animaltrain['SexuponOutcome'].isnull()]

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
3174,A667395,Diego,2013-11-27 16:11:00,Return_to_owner,,Dog,,7 years,Dachshund,Brown Merle


In [297]:
animaltrain[ animaltrain['AgeuponOutcome'].isnull()].head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
2480,A720674,,2016-02-16 10:17:00,Transfer,SCRP,Cat,Intact Male,,Domestic Shorthair Mix,Blue Tabby/White
2912,A720973,,2016-02-18 19:07:00,Transfer,SCRP,Cat,Unknown,,Domestic Shorthair Mix,Gray Tabby
3766,A720820,,2016-02-16 18:55:00,Transfer,SCRP,Cat,Intact Female,,Domestic Shorthair Mix,Brown Tabby


- the nan's are in SexuponOutcome and AgeuponOutcome
- convert everything into numbers

#### 2.1 Animal Types

In [298]:
# find unique animal types
animaltrain['AnimalType'].unique()

array(['Dog', 'Cat'], dtype=object)

In [299]:
# only cats and dogs. Map animal type to an integer
animaltrain['Type'] = animaltrain['AnimalType'].map({'Dog': 0, 'Cat': 1}).astype(int)
animaltest['Type'] = animaltest['AnimalType'].map({'Dog': 0, 'Cat': 1}).astype(int)
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0


#### 2.2 Animal Sex

In [300]:
# find unique values for SexuponOutcome
animaltrain['SexuponOutcome'].unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown', nan], dtype=object)

In [301]:
# copy to new column 'Sex'
animaltrain['Sex'] = animaltrain['SexuponOutcome']
animaltest['Sex'] = animaltest['SexuponOutcome']

In [302]:
# if the value is nan, replace with Unknown
animaltrain.loc[animaltrain['SexuponOutcome'].isnull(), 'Sex'] = 'Unknown' 
animaltest.loc[animaltest['SexuponOutcome'].isnull(), 'Sex'] = 'Unknown'

In [303]:
# map the unique values for SexuponOutcome to integers
animaltrain['Sex'] = animaltrain['Sex'].map({'Neutered Male': 0, 'Spayed Female': 1, 'Intact Male': 2, 'Intact Female': 3, 'Unknown': 4}).astype(int)
animaltest['Sex'] = animaltest['Sex'].map({'Neutered Male': 0, 'Spayed Female': 1, 'Intact Male': 2, 'Intact Female': 4, 'Unknown': 4})

In [304]:
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0


#### 2.3 Age

In [305]:
# check for ages
animaltrain['AgeuponOutcome'].unique()

array(['1 year', '2 years', '3 weeks', '1 month', '5 months', '4 years',
       '3 months', '2 weeks', '2 months', '10 months', '6 months',
       '5 years', '7 years', '3 years', '4 months', '12 years', '9 years',
       '6 years', '1 weeks', '11 years', '4 weeks', '7 months', '8 years',
       '11 months', '4 days', '9 months', '8 months', '15 years',
       '10 years', '1 week', '0 years', '14 years', '3 days', '6 days',
       '5 days', '5 weeks', '2 days', '16 years', '1 day', '13 years', nan,
       '17 years', '18 years', '19 years', '20 years'], dtype=object)

In [306]:
# function for calculating numerical value of ages

def agecalculator(agedf):
    ages = []
    for age in agedf:
        if 'day' in age:
            ages.append(int(age.split(' ')[0]) * 1)
        elif 'week' in age:
            ages.append(int(age.split(' ')[0]) * 7)
        elif 'month' in age:
            ages.append(int(age.split(' ')[0]) * 30)
        elif 'year' in age:
            ages.append(int(age.split(' ')[0]) * 365)
    return np.array(ages)

In [307]:
# age
animaltrain.loc[animaltrain['AgeuponOutcome'].isnull(), 'AgeuponOutcome'] = '0 days'
trainages = agecalculator(animaltrain['AgeuponOutcome'])

animaltest.loc[animaltest['AgeuponOutcome'].isnull(), 'AgeuponOutcome'] = '0 days'
testages = agecalculator(animaltest['AgeuponOutcome'])

In [308]:
# calculate mean and standard deviation of training set to scale ages
mean_age = trainages.mean()
std_age = trainages.std()

# scale ages
trainages = (trainages - mean_age) / std_age
testages = (testages - mean_age) / std_age

In [309]:
# assign to the dataframes
animaltrain['Age'] = trainages
animaltest['Age'] = testages
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex,Age
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,-0.39588
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1,-0.39588
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,-0.058742


#### 2.4 Breeds

In [310]:
# divide into mix or not mix
def mixornot(breedsinput):
    breeds = []
    for breed in breedsinput:
        if 'Mix' in breed:
            breeds.append(1)
        else:
            breeds.append(0)
    return breeds

In [311]:
animaltrain['Mix'] = mixornot(animaltrain['Breed'])
animaltest['Mix'] = mixornot(animaltest['Breed'])
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex,Age,Mix
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,-0.39588,1
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1,-0.39588,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,-0.058742,1


In [312]:
# find unique breeds
#len(animaltrain['Breed'].unique())

In [313]:
#breeds = animaltrain['Breed'].append(animaltest['Breed'])
#len(breeds.unique())

In [314]:
# lots of breeds...find labels with factorize
#labels = pd.factorize(breeds)
#labels

In [315]:
# map breeds to labels
#animaltrain['BreedNb'] = labels[0][0:len(animaltrain['Breed'])]
#animaltest['BreedNb'] = labels[0][len(animaltrain['Breed'])::] # doesn't work...
#animaltrain.head(3)

#### 2.5 Colors

In [316]:
# find unique colors
#len(animaltrain['Color'].unique())

In [317]:
#colors = animaltrain['Color'].append(animaltest['Color'])
#len(colors.unique())

In [318]:
# also lots of colors...find labels with factorize
#labels = pd.factorize(colors)
#labels

In [319]:
# map colors to labels
#animaltrain['ColorNb'] = labels[0][0:len(animaltrain['Color'])]
#animaltest['colorNb'] = labels[0][len(animaltrain['Color'])::] # doesn't work...
#animaltrain.head(3)

#### 2.6 Names

In [320]:
# add feature if the animal has a name or not
animaltrain['NameOrNot'] = animaltrain['Name'].isnull()
animaltest['NameOrNot'] = animaltest['Name'].isnull()
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex,Age,Mix,NameOrNot
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,-0.39588,1,False
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1,-0.39588,1,False
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,-0.058742,1,False


#### 2.7 Time

In [321]:
# divide into time of day categories
def timeofday(times):
    timecategory = []
    for datestring in times:
        time = datestring.split(' ') # split date string into date and time of day
        time = int(time[1][0:2]) # separate hour from time of day
        # add to a category
        if time > 6 and time <= 10:
            timecategory.append(0) # morning
        elif time > 10 and time <= 14:
            timecategory.append(1) # mid day
        elif time > 14 and time <= 18:
            timecategory.append(2) # afternoon
        elif time > 18 and time <= 24:
            timecategory.append(3) # evening
        else:
            timecategory.append(4) # night
    return timecategory

In [322]:
# add feature for time
animaltrain['Time'] = timeofday(animaltrain['DateTime'])
animaltest['Time'] = timeofday(animaltest['DateTime'])
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex,Age,Mix,NameOrNot,Time
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,-0.39588,1,False,2
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1,-0.39588,1,False,1
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,-0.058742,1,False,1


#### 2.8 Outcomes

In [323]:
# Outcome Type
animaltrain['OutcomeType'].unique()

array(['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'], dtype=object)

In [324]:
animaltrain['Outcome'] = animaltrain['OutcomeType'].map({'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Return_to_owner': 3, 'Transfer': 4})

In [325]:
animaltrain.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Type,Sex,Age,Mix,NameOrNot,Time,Outcome
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,-0.39588,1,False,2,3
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,1,-0.39588,1,False,1,2
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,-0.058742,1,False,1,0


### 3. Prediction

- Prediction using random forest classifier

In [326]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

# split hourly data into a training set and test set -> randomly selects 10% of data as test set
train, test = train_test_split(animaltrain, test_size = 0.1)

# fit the random forest classifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train.values[0::, 10:16], train.values[0::, 16].astype(int))
# calculate score
r_squared = forest.score(test.values[0::, 10:16], test.values[0::, 16].astype(int))
r_squared

0.65095398428731766

In [327]:
# fit the random forest classifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(animaltrain.values[0::, 10:16], animaltrain.values[0::, 16].astype(int))

In [328]:
# predict_proba
predictions = forest.predict_proba(animaltest.values[0::, 8:14])
predictions

array([[ 0.        ,  0.        ,  0.17125   ,  0.42366883,  0.40508117],
       [ 0.60962758,  0.        ,  0.        ,  0.25702017,  0.13335226],
       [ 0.26994977,  0.        ,  0.        ,  0.11272875,  0.61732147],
       ..., 
       [ 0.        ,  0.0540865 ,  0.04121407,  0.        ,  0.90469942],
       [ 0.30867315,  0.        ,  0.08982796,  0.46256754,  0.13893134],
       [ 0.        ,  0.        ,  0.46095217,  0.10576854,  0.4332793 ]])

In [329]:
data = {'ID': animaltest['ID'], 'Adoption': predictions[0::, 0], 'Died': predictions[0::, 1], 'Euthanasia': predictions[0::, 2], 'Return_to_owner': predictions[0::, 3], 'Transfer': predictions[0::, 4]}
outputdf = pd.DataFrame(data, columns = ['ID', 'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])

In [330]:
outputdf.to_csv('AnimalPredictions.csv', index=False)