In [1]:
# Part of the data preprocessing was already done during the data exploration phase.

# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from __future__ import print_function
from sklearn import preprocessing
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is
# shown for the rest of the session.
# NOTE(review): this also hides any pandas / scikit-learn warnings that later
# cells may trigger -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Reload the DataFrame that the analysis notebook pickled, then peek at two rows.
FIRST_STEP_PICKLE = "data_after_first_step.pkl"
data = pd.read_pickle(FIRST_STEP_PICKLE)
data.head(n=2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,DayoftheWeek,Hour,Date,AgeDaysuponOutcome,LargeBreed,BasicColor
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 years,Shetland Sheepdog Mix,Brown/White,2014,2,2,18,2014-02-12,365,Shetland Sheepdog,Brown
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 years,Domestic Shorthair Mix,Cream Tabby,2013,10,6,12,2013-10-13,365,Domestic,Cream


In [3]:
# Keep the prediction target (the outcome column) in a variable of its own.
labels = data.loc[:, "OutcomeType"]

In [4]:
# Map every outcome string to an integer class id and wrap the resulting array
# in a one-column DataFrame.
le = preprocessing.LabelEncoder()
labels = pd.DataFrame(le.fit_transform(labels))

In [5]:
# Columns that must not be fed to the model, and why:
#   AnimalID       - a unique identifier, not needed
#   OutcomeType    - the label to predict
#   Breed          - LargeBreed is used instead
#   AgeuponOutcome - AgeDaysuponOutcome is used instead
#   DateTime       - Year, Month and DayoftheWeek are used instead
#   Date           - only used for forecasting, not needed anymore
#   Color          - BasicColor is used instead
#   OutcomeSubtype - could leak information into the prediction model
unused_columns = [
    "AnimalID",
    "OutcomeType",
    "Breed",
    "AgeuponOutcome",
    "DateTime",
    "Date",
    "Color",
    "OutcomeSubtype",
]
data = data.drop(unused_columns, axis=1)

In [6]:
# Preview the first five rows of the remaining columns.
data.head(n=5)

Unnamed: 0,Name,AnimalType,SexuponOutcome,Year,Month,DayoftheWeek,Hour,AgeDaysuponOutcome,LargeBreed,BasicColor
0,Hambone,Dog,Neutered Male,2014,2,2,18,365,Shetland Sheepdog,Brown
1,Emily,Cat,Spayed Female,2013,10,6,12,365,Domestic,Cream
2,Pearce,Dog,Neutered Male,2015,1,5,12,730,Pit Bull,Blue
3,,Cat,Intact Male,2014,7,4,19,21,Domestic,Blue
4,,Dog,Neutered Male,2013,11,4,12,730,Lhasa Apso,Tan


In [7]:
# Count the missing values in each column.
data.isnull().sum()

Name                  7691
AnimalType               0
SexuponOutcome           1
Year                     0
Month                    0
DayoftheWeek             0
Hour                     0
AgeDaysuponOutcome       0
LargeBreed               0
BasicColor               0
dtype: int64

In [8]:
# SexuponOutcome has a single NaN -- show the affected row.
sex_is_missing = pd.isnull(data["SexuponOutcome"])
data[sex_is_missing]

Unnamed: 0,Name,AnimalType,SexuponOutcome,Year,Month,DayoftheWeek,Hour,AgeDaysuponOutcome,LargeBreed,BasicColor
3174,Diego,Dog,,2013,11,2,16,2555,Dachshund,Brown


In [9]:
# Its name is Diego and it is 7 years old, so its sex is set to the most
# plausible value, "Neutered Male".
# The write is a single .iloc[row, column] assignment. The chained form
# data[col].iloc[row] = value may assign into a temporary object and leave
# `data` unchanged (pandas reports this as SettingWithCopyWarning).
sex_column_position = data.columns.get_loc("SexuponOutcome")
data.iloc[3174, sex_column_position] = "Neutered Male"
data.iloc[3174]

Name                          Diego
AnimalType                      Dog
SexuponOutcome        Neutered Male
Year                           2013
Month                            11
DayoftheWeek                      2
Hour                             16
AgeDaysuponOutcome             2555
LargeBreed                Dachshund
BasicColor                    Brown
Name: 3174, dtype: object

In [10]:
# Name
# Animals without a name get their own category, "NoName".
data["Name"] = data["Name"].fillna("NoName")


# Encode as integers
def intenger_encode(feature):
    """Return integer codes for the values of one column of ``data``.

    Parameters
    ----------
    feature : str
        Name of the column of the notebook-level ``data`` frame to encode.

    Returns
    -------
    numpy.ndarray
        One integer code per row, as produced by scikit-learn's
        ``LabelEncoder`` fitted on that column.
    """
    # Use a fresh encoder on every call. Refitting the module-level `le`
    # would overwrite the classes it learned from the outcome labels, making
    # it impossible to map predicted class ids back to outcome names.
    encoder = preprocessing.LabelEncoder()
    return encoder.fit_transform(data[feature])


data["Name"] = intenger_encode("Name")

In [11]:
# Convert the low-cardinality categorical features with one-hot encoding.
# The column list is deliberately not called `list`: binding that name would
# shadow the builtin for every cell executed afterwards.
one_hot_columns = ["AnimalType", "SexuponOutcome", "Year", "Month", "DayoftheWeek"]
data = pd.get_dummies(data, columns=one_hot_columns)

In [12]:
# BasicColor and LargeBreed have many distinct values, so one-hot encoding them
# could lead to curse-of-dimensionality problems. They are converted to integer
# codes here and normalized afterwards.
for column in ("LargeBreed", "BasicColor"):
    data[column] = intenger_encode(column)

In [13]:
# Finally, normalize the integer features
def normalize_feature(feature):
    """Standardize one column of ``data`` with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    feature : str
        Name of the column of the notebook-level ``data`` frame to scale.

    Returns
    -------
    numpy.ndarray
        1-D array with one scaled value per row of ``data``.
    """
    scaler = preprocessing.StandardScaler()
    # StandardScaler expects 2-D input (n_samples, n_features), so the column is
    # selected as a one-column frame. fit_transform fits and scales in a single
    # pass; a separate fit() beforehand would only repeat the same work.
    scaled = scaler.fit_transform(data[[feature]])
    # Flatten the (n_samples, 1) result so it can be assigned back to a column.
    return scaled.ravel()


for column in ["LargeBreed", "BasicColor", "Name", "AgeDaysuponOutcome", "Hour"]:
    data[column] = normalize_feature(column)

In [14]:
# Show the first row of the final feature matrix.
data.head(n=1)

Unnamed: 0,Name,Hour,AgeDaysuponOutcome,LargeBreed,BasicColor,AnimalType_Cat,AnimalType_Dog,SexuponOutcome_Intact Female,SexuponOutcome_Intact Male,SexuponOutcome_Neutered Male,...,Month_10,Month_11,Month_12,DayoftheWeek_0,DayoftheWeek_1,DayoftheWeek_2,DayoftheWeek_3,DayoftheWeek_4,DayoftheWeek_5,DayoftheWeek_6
0,-0.695432,1.065124,-0.397404,1.905664,-0.704667,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [15]:
# Persist the feature matrix and the encoded labels to disk as pickles.
for frame, target_path in ((data, "features.pkl"), (labels, "labels.pkl")):
    frame.to_pickle(target_path)