# <center>Transforming Shelter Data</center>

In [3]:
import pandas as pd, numpy as np

In [6]:
cleaned = pd.read_csv('data/shelter-clean.csv')
transformed = cleaned.copy()

In [9]:
cleaned.describe(include="all")

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,25612,25612,25612,25612,25612,25612,25612,25612,25612,25612
unique,25612,6365,22328,5,17,2,4,43,1378,361
top,A705677,noName,2015-08-11 00:00:00,Adoption,noSubOutcome,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,6612,19,10769,13594,15510,9779,3853,7892,2700


In [21]:
cleaned.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,noSubOutcome,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [19]:
# check cleaned as no nulls
np.sum(cleaned.isnull())

AnimalID          0
Name              0
DateTime          0
OutcomeType       0
OutcomeSubtype    0
AnimalType        0
SexuponOutcome    0
AgeuponOutcome    0
Breed             0
Color             0
dtype: int64

## Extract Binary Name vs NoName From Name Column

In [17]:
transformed['Name'] = transformed['Name'].apply(lambda name: "hasName" if name!="noName" else name)

In [23]:
# check transformed has "hasName" and "noName" as values
[name for name in transformed['Name'].unique()]

['hasName', 'noName']

## Drop OutcomeSubtype for Target Leakage

Since we are unaware of the OutcomeSubtype until the OutcomeType, our label, is determined, we should drop it to prevent target leakage. Besides, OutcomeSubtype is not a feature in the test.csv set.

In [63]:
transformed = transformed.drop('OutcomeSubtype', axis=1)

ValueError: labels ['OutcomeSubtype'] not contained in axis

## Extract Days from AgeuponOutcome

In [33]:
# check string structure of AgeuponOutcome
transformed['AgeuponOutcome'].unique()

array(['1 year', '2 years', '3 weeks', '1 month', '5 months', '4 years',
       '3 months', '2 weeks', '2 months', '10 months', '6 months',
       '5 years', '7 years', '3 years', '4 months', '12 years', '9 years',
       '6 years', '1 weeks', '11 years', '7 months', '8 years',
       '11 months', '4 days', '4 weeks', '9 months', '8 months',
       '15 years', '10 years', '14 years', '3 days', '5 weeks', '2 days',
       '16 years', '6 days', '1 day', '13 years', '1 week', '17 years',
       '18 years', '5 days', '19 years', '20 years'], dtype=object)

In [36]:
# extract days, the lowest common denominator, from AgeuponOutcome
# age_to_days will be saved to src folder

def age_to_days(age_str):
    age_val = int(str(age_str).split()[0])
    
    if "day" in age_str or "days" in "age_str":
        return age_val
    elif "week" in age_str or "weeks" in "age_str":
        return age_val * 7
    elif "month" in age_str or "months" in "age_str":
        return age_val * 30
    elif "year" in age_str or "years" in "age_str":
        return age_val * 365
    else:
        return "unknownAge"

In [41]:
transformed['AgeuponOutcome'] = transformed['AgeuponOutcome'].apply(age_to_days)

In [43]:
# check unique values and datatype for AgeuponOutcome
print transformed['AgeuponOutcome'].unique()
print transformed['AgeuponOutcome'].dtype

[ 365  730   21   30  150 1460   90   14   60  300  180 1825 2555 1095  120
 4380 3285 2190    7 4015  210 2920  330    4   28  270  240 5475 3650 5110
    3   35    2 5840    6    1 4745 6205 6570    5 6935 7300]
int64


In [44]:
transformed.describe(include="all")

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,25612,25612,25612,25612,25612,25612,25612.0,25612,25612
unique,25612,2,22328,5,2,4,,1378,361
top,A705677,hasName,2015-08-11 00:00:00,Adoption,Dog,Neutered Male,,Domestic Shorthair Mix,Black/White
freq,1,19000,19,10769,15510,9779,,7892,2700
mean,,,,,,,818.863775,,
std,,,,,,,1093.967793,,
min,,,,,,,1.0,,
25%,,,,,,,90.0,,
50%,,,,,,,365.0,,
75%,,,,,,,1095.0,,


## Extract Day of the Week and Month from DateTime

Studying the day of the week and month of the outcome may help understand when animals are most prone to unfavorable outcomes. It would help us if we also knew when the animals where admitted so we can determined if there is correlation between how long an animals stays (date of outcome - date of admission) and its outcome.

In [46]:
# convert "DateTime" to pandas.DateTime objects
transformed['DateTime'] = pd.to_datetime(transformed['DateTime'])

In [60]:
# create date features
transformed['dow'] = transformed['DateTime'].apply(lambda dt: dt.dayofweek)
# transformed['dom'] = transformed['DateTime'].apply(lambda dt: dt.day) # day of month might be useful
transformed['month'] = transformed['DateTime'].apply(lambda dt: dt.month)
transformed['doy'] = transformed['DateTime'].apply(lambda dt: dt.dayofyear)

In [61]:
# convert day of week to weekday names
weekdays = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
transformed['dow'] = transformed['dow'].apply(lambda dow: weekdays[dow])

In [62]:
# convert nominal month to month names
months = {1: 'Jan', 2: 'Feb', 3: 'March', 4: 'April', 5: 'May', 6: "June", 7: "July",\
           8: 'Aug', 9: 'Sept', 10: "Oct", 11: 'Nov', 12: 'Dec'}

transformed['month'] = transformed['month'].apply(lambda month: months[month])

## Extract Mixed or Pure from Breed

In [85]:
# too many unique breeds and mixes to dummify
len(transformed['Breed'].unique())

1378

In [92]:
transformed.groupby('AnimalType').count()

Unnamed: 0_level_0,AnimalID,Name,DateTime,OutcomeType,SexuponOutcome,AgeuponOutcome,Breed,Color,dow,month,doy
AnimalType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Cat,10102,10102,10102,10102,10102,10102,10102,10102,10102,10102,10102
Dog,15510,15510,15510,15510,15510,15510,15510,15510,15510,15510,15510


In [96]:
cat = transformed[transformed['AnimalType']=='Cat']
dog = transformed[transformed['AnimalType']=='Dog']

In [110]:
# breed names with "/" look like mixed breeds
cat[cat['Breed'].apply(lambda breed: "/" in breed)].head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,dow,month,doy
1872,A459845,hasName,2014-05-05 13:25:00,Return_to_owner,Cat,Spayed Female,2555,Snowshoe/Ragdoll,Seal Point/White,Mon,May,125
2356,A682368,noName,2014-06-28 09:00:00,Transfer,Cat,Intact Male,365,Domestic Medium Hair/Siamese,Lynx Point,Sat,June,179
2843,A701771,hasName,2015-05-20 11:22:00,Transfer,Cat,Spayed Female,730,Siamese/Domestic Shorthair,Lynx Point,Wed,May,140
3097,A717131,hasName,2015-12-16 13:00:00,Transfer,Cat,Intact Female,60,Domestic Shorthair/Manx,Brown Tabby,Wed,Dec,350
8352,A668342,hasName,2013-12-08 14:31:00,Transfer,Cat,Neutered Male,60,Siamese/Domestic Shorthair,Lynx Point/Brown Tabby,Sun,Dec,342


In [112]:
# breed names with "Mix" are presumed to mean mixed breed
cat[cat['Breed'].apply(lambda breed: "Mix" in breed)].head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,dow,month,doy
1,A656520,hasName,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,365,Domestic Shorthair Mix,Cream Tabby,Sun,Oct,286
3,A683430,noName,2014-07-11 19:09:00,Transfer,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Cream,Fri,July,192
6,A699218,hasName,2015-03-28 13:11:00,Transfer,Cat,Intact Male,21,Domestic Shorthair Mix,Blue Tabby,Sat,March,87
13,A678825,hasName,2014-07-12 12:10:00,Adoption,Cat,Neutered Male,90,Domestic Shorthair Mix,Orange Tabby/White,Sat,July,193
14,A678050,noName,2014-05-03 16:15:00,Transfer,Cat,Intact Male,21,Domestic Shorthair Mix,Brown Tabby,Sat,May,123


In [116]:
# breed names with "/" look like mixed
cat[cat['Breed'].apply(lambda breed: "Mix" not in breed and "/" not in breed and "Domestic" not in breed)].head(20)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,dow,month,doy
718,A669173,noName,2013-12-20 17:52:00,Adoption,Cat,Neutered Male,180,Burmese,Chocolate,Fri,Dec,354
2325,A701800,noName,2015-05-05 16:38:00,Transfer,Cat,Intact Female,30,Siamese,Lynx Point/White,Tues,May,125
2820,A708402,noName,2015-07-28 11:29:00,Transfer,Cat,Intact Male,21,Siamese,Seal Point,Tues,July,209
3031,A677091,hasName,2014-06-29 16:00:00,Adoption,Cat,Spayed Female,120,Siamese,Flame Point,Sun,June,180
3067,A694791,hasName,2015-01-05 13:01:00,Transfer,Cat,Intact Female,240,Siamese,Lynx Point,Mon,Jan,5
3433,A665496,hasName,2013-10-22 17:54:00,Adoption,Cat,Neutered Male,1095,Bengal,Brown Tiger,Tues,Oct,295
3475,A682943,hasName,2014-07-09 16:52:00,Adoption,Cat,Neutered Male,60,Siamese,Lilac Point,Wed,July,190
4611,A680908,noName,2014-06-15 18:48:00,Adoption,Cat,Spayed Female,60,Siamese,Flame Point,Sun,June,166
4670,A680909,noName,2014-06-12 17:46:00,Adoption,Cat,Spayed Female,60,Siamese,Flame Point,Thurs,June,163
4729,A631781,hasName,2014-07-30 17:57:00,Adoption,Cat,Spayed Female,3285,British Shorthair,Orange Tabby/White,Wed,July,211


In [117]:
# most cats that end up in the shelter are mixed
print "unique cat breeds:", len(cat['Breed'].unique())
print "% mixed breed cats", sum(cat['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)) / float(cat.shape[0])

unique cat breeds: 59
% mixed breed cats 0.976044347654


In [119]:
# same w/ dogs; most dogs in the shelter are mixed
print "Unique dog breeds:", len(dog['Breed'].unique())
print "% mixed breed dogs", sum(dog['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)) / float(dog.shape[0])

 Unique dog breeds: 1319
% mixed breed dogs 0.92695035461


In [120]:
mixed = transformed['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)
print "% mixed breed animals", float(sum(mixed))/transformed.shape[0]

% mixed breed animals 0.946314227706


In [122]:
# extract mixed vs. pure for Breed
transformed['Breed'] = transformed['Breed'].apply(lambda breed: "Mix" if "Mix" in breed or "/" in breed else "Pure")

In [123]:
# check Breed only has "mixed" and "pure" as values
[breed_type for breed_type in transformed['Breed'].unique()]

['Mix', 'Pure']

If a breeds database exists where the characteristics of each breed are explored (i.e. long vs. short hair, demeanor, personality, etc.), we can imply further features about each breed that may have predictive value on the outcome.

## Drop Color

In [128]:
# there are too many unique colors
# drop Color for now and see how our model performs
len(transformed['Color'].unique())

361

## Dummify Categorical Variables

In [125]:
transformed.dtypes

AnimalID                  object
Name                      object
DateTime          datetime64[ns]
OutcomeType               object
AnimalType                object
SexuponOutcome            object
AgeuponOutcome             int64
Breed                     object
Color                     object
dow                       object
month                     object
doy                        int64
dtype: object

In [129]:
categoricals = ['Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'dow', 'month']

In [144]:
# extract dummy variables
final_dframe = pd.get_dummies(transformed, columns=categoricals)

In [145]:
final_dframe.head(4)

Unnamed: 0,AnimalID,DateTime,OutcomeType,AgeuponOutcome,Color,doy,Name_hasName,Name_noName,AnimalType_Cat,AnimalType_Dog,...,month_Dec,month_Feb,month_Jan,month_July,month_June,month_March,month_May,month_Nov,month_Oct,month_Sept
0,A671945,2014-02-12 18:22:00,Return_to_owner,365,Brown/White,43,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A656520,2013-10-13 12:44:00,Euthanasia,365,Cream Tabby,286,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,A686464,2015-01-31 12:28:00,Adoption,730,Blue/White,31,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A683430,2014-07-11 19:09:00,Transfer,21,Blue Cream,192,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Return Final Select Columns

In [150]:
# take only the columns we want
final_dframe = final_dframe.drop(['AnimalID', 'DateTime', 'Color'], axis=1)

## Transformation Pipeline

In [153]:
# transform_df() saved to src folder

def transform_df(df):
    dframe = df.copy()
    
    dframe['Name'] = dframe['Name'].apply(lambda name: "hasName" if name!="noName" else name)

    dframe = dframe.drop('OutcomeSubtype', axis=1)

    dframe['AgeuponOutcome'] = dframe['AgeuponOutcome'].apply(age_to_days)
    
    # create date features
    dframe['DateTime'] = pd.to_datetime(dframe['DateTime'])
    dframe['dow'] = dframe['DateTime'].apply(lambda dt: dt.dayofweek)
    # dframe['dom']= dframe['DateTime'].apply(lambda dt: dt.day) # day of month might be useful
    dframe['month'] = dframe['DateTime'].apply(lambda dt: dt.month)
    dframe['doy'] = dframe['DateTime'].apply(lambda dt: dt.dayofyear)
    
    # convert day of week to weekday names
    weekdays = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
    dframe['dow'] = dframe['dow'].apply(lambda dow: weekdays[dow])
    
    # convert nominal month to month names
    months = {1: 'Jan', 2: 'Feb', 3: 'March', 4: 'April', 5: 'May', 6: "June", 7: "July",\
           8: 'Aug', 9: 'Sept', 10: "Oct", 11: 'Nov', 12: 'Dec'}
    dframe['month'] = dframe['month'].apply(lambda month: months[month])
    
    # extract mixed vs. pure for Breed
    dframe['Breed'] = dframe['Breed'].apply(lambda breed: "Mix" if "Mix" in breed or "/" in breed else "Pure")
    
    # get dummies
    categoricals = ['Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'dow', 'month']
    dframe = pd.get_dummies(dframe, columns=categoricals)

    # return only select columns
    return dframe.drop(['AnimalID', 'DateTime', 'Color'], axis=1)
    

In [154]:
test_dframe = transform_df(cleaned)

In [161]:
# any nulls in test_dframe
np.any(np.sum(test_dframe.isnull()) > 0)

False

In [170]:
print test_dframe.shape == final_dframe.shape
print test_dframe.shape
print final_dframe.shape

True
(25612, 32)
(25612, 32)


In [172]:
## save transformed dataframe
#final_dframe.to_csv('data/transformed.csv', index=False)

# <center>Further Transformations That May Be Necessary</center>

## Encode Labels (Revisit)

I'm not sure if we need to encode the labels for the sklearn random forest classifer so I will perform a basic label encoding here just in case we need it.

In [66]:
from sklearn import preprocessing

In [67]:
le = preprocessing.LabelEncoder()
le.fit(transformed['OutcomeType'])

LabelEncoder()

In [68]:
le.classes_

array(['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'], dtype=object)

In [69]:
labels = le.transform(transformed['OutcomeType'])
# transformed['OutcomeType'] = le.transform(transformed['OutcomeType']) # if encoded labels are needed, use this

**Update 2016-07-24:**

Scikit Learn classification models like RandomForestClassifier and GradientBoostClassifier both can handle multi-classe

## Oversampling for Euthanasia and Died (Revisit)

I'm not sure if we need to oversample for the outcomes of 'euthanasia' and 'died' until I run a baseline model first and check its precision and recall for performance. I have a function here for outsampling if needed.

In [70]:
def oversample(X, y, target):
    """
    INPUT:
    X, y - your data
    target - the percentage of positive class 
             observations in the output
    OUTPUT:
    X_oversampled, y_oversampled - oversampled data
    `oversample` randomly replicates positive observations
    in X, y to achieve the target proportion
    """
    if target < sum(y)/float(len(y)):
        return X, y
    # determine how many new positive observations to generate
    positive_count = sum(y)
    negative_count = len(y) - positive_count
    target_positive_count = target*negative_count / (1. - target)
    target_positive_count = int(round(target_positive_count))
    number_of_new_observations = target_positive_count - positive_count
    # randomly generate new positive observations
    positive_obs_indices = np.where(y==1)[0] # np.where returns a tuple containing an array of indices
    new_obs_indices = np.random.choice(positive_obs_indices, 
                                       size=number_of_new_observations, 
                                       replace=True)
    X_new, y_new = X[new_obs_indices], y[new_obs_indices]
    X_positive = np.vstack((X[positive_obs_indices], X_new))
    y_positive = np.concatenate((y[positive_obs_indices], y_new))
    X_negative = X[y==0]
    y_negative = y[y==0]
    X_oversampled = np.vstack((X_negative, X_positive))
    y_oversampled = np.concatenate((y_negative, y_positive))

    return X_oversampled, y_oversampled