# <center>Transforming Shelter Data</center>

In [1]:
import pandas as pd, numpy as np

In [None]:
cleaned = pd.read_csv('../data/shelter-clean.csv')
transformed = cleaned.copy()

In [None]:
cleaned.describe(include="all")

In [None]:
cleaned.head(3)

In [None]:
# check cleaned has no nulls
np.sum(cleaned.isnull())

## Extract Binary Name vs NoName From Name Column

In [None]:
transformed['Name'] = transformed['Name'].apply(lambda name: "hasName" if name!="noName" else name)

In [None]:
# check transformed has "hasName" and "noName" as values
[name for name in transformed['Name'].unique()]

## Drop OutcomeSubtype for Target Leakage

Since we are unaware of the OutcomeSubtype until the OutcomeType, our label, is determined, we should drop it to prevent target leakage. Besides, OutcomeSubtype is not a feature in the test.csv set.

In [None]:
transformed = transformed.drop('OutcomeSubtype', axis=1)

## Extract Days from AgeuponOutcome

In [None]:
# check string structure of AgeuponOutcome
transformed['AgeuponOutcome'].unique()

In [None]:
# extract days, the lowest common denominator, from AgeuponOutcome
# age_to_days will be saved to src folder

def age_to_days(age_str):
    """
    Convert an age description such as "2 years" or "3 weeks" into days.

    INPUT:
    age_str - text made of an integer count followed by a unit containing
              "day", "week", "month" or "year" (singular or plural)
    OUTPUT:
    int number of days, using 7 days per week, 30 days per month and
    365 days per year; the string "unknownAge" when the value has no
    leading integer or no recognised unit (e.g. "", NaN, "unknownAge")
    """
    # checked in this order; a substring test covers singular and plural
    days_per_unit = (("day", 1), ("week", 7), ("month", 30), ("year", 365))

    # str() lets non-string values (e.g. NaN) fall through to "unknownAge"
    text = str(age_str)

    try:
        age_val = int(text.split()[0])
    except (IndexError, ValueError):
        # empty text or a non-numeric first token: the age cannot be parsed
        return "unknownAge"

    for unit, factor in days_per_unit:
        if unit in text:
            return age_val * factor

    return "unknownAge"

In [None]:
transformed['AgeuponOutcome'] = transformed['AgeuponOutcome'].apply(age_to_days)

In [None]:
# check unique values and datatype for AgeuponOutcome
print transformed['AgeuponOutcome'].unique()
print transformed['AgeuponOutcome'].dtype

In [None]:
transformed.describe(include="all")

## Extract Day of the Week and Month from DateTime

Studying the day of the week and month of the outcome may help us understand when animals are most prone to unfavorable outcomes. It would help if we also knew when the animals were admitted, so we could determine whether there is a correlation between how long an animal stays (date of outcome - date of admission) and its outcome.

In [None]:
# convert "DateTime" to pandas.DateTime objects
transformed['DateTime'] = pd.to_datetime(transformed['DateTime'])

In [None]:
# create date features
transformed['dow'] = transformed['DateTime'].apply(lambda dt: dt.dayofweek)
# transformed['dom'] = transformed['DateTime'].apply(lambda dt: dt.day) # day of month might be useful
transformed['month'] = transformed['DateTime'].apply(lambda dt: dt.month)
transformed['doy'] = transformed['DateTime'].apply(lambda dt: dt.dayofyear)

In [None]:
# convert day of week to weekday names
weekdays = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
transformed['dow'] = transformed['dow'].apply(lambda dow: weekdays[dow])

In [None]:
# convert nominal month to month names
months = {1: 'Jan', 2: 'Feb', 3: 'March', 4: 'April', 5: 'May', 6: "June", 7: "July",\
           8: 'Aug', 9: 'Sept', 10: "Oct", 11: 'Nov', 12: 'Dec'}

transformed['month'] = transformed['month'].apply(lambda month: months[month])

## Extract Mixed or Pure from Breed

In [None]:
# too many unique breeds and mixes to dummify
len(transformed['Breed'].unique())

In [None]:
transformed.groupby('AnimalType').count()

In [None]:
cat = transformed[transformed['AnimalType']=='Cat']
dog = transformed[transformed['AnimalType']=='Dog']

In [None]:
# breed names with "/" look like mixed breeds
cat[cat['Breed'].apply(lambda breed: "/" in breed)].head()

In [None]:
# breed names with "Mix" are presumed to mean mixed breed
cat[cat['Breed'].apply(lambda breed: "Mix" in breed)].head()

In [None]:
# breed names without "Mix", "/" or "Domestic" look like pure breeds
cat[cat['Breed'].apply(lambda breed: "Mix" not in breed and "/" not in breed and "Domestic" not in breed)].head(20)

In [None]:
# most cats that end up in the shelter are mixed
print "unique cat breeds:", len(cat['Breed'].unique())
print "% mixed breed cats", sum(cat['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)) / float(cat.shape[0])

In [None]:
# same w/ dogs; most dogs in the shelter are mixed
print "Unique dog breeds:", len(dog['Breed'].unique())
print "% mixed breed dogs", sum(dog['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)) / float(dog.shape[0])

In [None]:
mixed = transformed['Breed'].apply(lambda breed: "Mix" in breed or "/" in breed)
print "% mixed breed animals", float(sum(mixed))/transformed.shape[0]

In [None]:
# extract mixed vs. pure for Breed
transformed['Breed'] = transformed['Breed'].apply(lambda breed: "Mix" if "Mix" in breed or "/" in breed else "Pure")

In [None]:
# check Breed only has "Mix" and "Pure" as values
[breed_type for breed_type in transformed['Breed'].unique()]

If a breeds database exists where the characteristics of each breed are explored (e.g. long vs. short hair, demeanor, personality, etc.), we can derive further features about each breed that may have predictive value for the outcome.

## Drop Color

In [None]:
# there are too many unique colors
# drop Color for now and see how our model performs
len(transformed['Color'].unique())

In [None]:
transformed['Color'].unique()

## Dummify Categorical Variables

In [None]:
transformed.dtypes

In [None]:
categoricals = ['Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'dow', 'month']

In [None]:
# extract dummy variables
final_dframe = pd.get_dummies(transformed, columns=categoricals)

In [None]:
final_dframe.head(4)

# Return Final Select Columns

In [None]:
# take only the columns we want
final_dframe = final_dframe.drop(['AnimalID', 'DateTime', 'Color'], axis=1)

## Transformation Pipeline

In [None]:
# transform_df() saved to src folder

def transform_df(df):
    """
    Run the whole transformation pipeline on a cleaned shelter dataframe.

    INPUT:
    df - dataframe with the columns 'Name', 'AgeuponOutcome', 'DateTime',
         'Breed', 'AnimalType', 'SexuponOutcome', 'AnimalID' and 'Color';
         'OutcomeSubtype' is dropped when present and may be absent
    OUTPUT:
    a new dataframe (df itself is not modified) in which
      * 'Name' is reduced to "hasName" / "noName"
      * 'AgeuponOutcome' is converted with age_to_days
      * 'dow', 'month' and 'doy' are derived from 'DateTime'
      * 'Breed' is reduced to "Mix" / "Pure"
      * 'Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'dow' and 'month'
        are replaced by dummy columns
      * 'AnimalID', 'DateTime' and 'Color' are removed
    Any other column is passed through untouched.
    """
    dframe = df.copy()

    # only distinguish animals that have a name from those that do not
    dframe['Name'] = dframe['Name'].apply(lambda name: "hasName" if name!="noName" else name)

    # OutcomeSubtype leaks the target; errors='ignore' lets frames that
    # never had the column go through the same pipeline
    dframe = dframe.drop('OutcomeSubtype', axis=1, errors='ignore')

    dframe['AgeuponOutcome'] = dframe['AgeuponOutcome'].apply(age_to_days)

    # create date features
    dframe['DateTime'] = pd.to_datetime(dframe['DateTime'])
    dframe['dow'] = dframe['DateTime'].apply(lambda dt: dt.dayofweek)
    # dframe['dom']= dframe['DateTime'].apply(lambda dt: dt.day) # day of month might be useful
    dframe['month'] = dframe['DateTime'].apply(lambda dt: dt.month)
    dframe['doy'] = dframe['DateTime'].apply(lambda dt: dt.dayofyear)

    # convert day of week (0 = Monday) to weekday names
    weekdays = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
    dframe['dow'] = dframe['dow'].apply(lambda dow: weekdays[dow])

    # convert numeric month (1-12) to month names
    months = {1: 'Jan', 2: 'Feb', 3: 'March', 4: 'April', 5: 'May', 6: "June", 7: "July",
              8: 'Aug', 9: 'Sept', 10: "Oct", 11: 'Nov', 12: 'Dec'}
    dframe['month'] = dframe['month'].apply(lambda month: months[month])

    # extract mixed vs. pure for Breed: "Mix" or "/" in the name means mixed
    dframe['Breed'] = dframe['Breed'].apply(lambda breed: "Mix" if "Mix" in breed or "/" in breed else "Pure")

    # get dummies
    categoricals = ['Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'dow', 'month']
    dframe = pd.get_dummies(dframe, columns=categoricals)

    # return only select columns
    return dframe.drop(['AnimalID', 'DateTime', 'Color'], axis=1)
    

In [None]:
test_dframe = transform_df(cleaned)

In [None]:
# any nulls in test_dframe
np.any(np.sum(test_dframe.isnull()) > 0)

In [None]:
print test_dframe.shape == final_dframe.shape
print test_dframe.shape
print final_dframe.shape

In [None]:
## save transformed dataframe
#final_dframe.to_csv('data/transformed.csv', index=False)

# <center>Further Transformations That May Be Necessary</center>

## Encode Labels (Revisit)

I'm not sure if we need to encode the labels for the sklearn random forest classifier, so I will perform a basic label encoding here just in case we need it.

In [None]:
from sklearn import preprocessing

In [None]:
le = preprocessing.LabelEncoder()
le.fit(transformed['OutcomeType'])

In [None]:
le.classes_

In [None]:
labels = le.transform(transformed['OutcomeType'])
# transformed['OutcomeType'] = le.transform(transformed['OutcomeType']) # if encoded labels are needed, use this

**Update 2016-07-24:**

Scikit-learn classification models like RandomForestClassifier and GradientBoostingClassifier can both handle multi-class labels given as strings, so there is no need to encode the labels.

## Oversampling for Euthanasia and Died (Revisit)

I'm not sure if we need to oversample for the outcomes of 'euthanasia' and 'died' until I run a baseline model first and check its precision and recall. I have a function here for oversampling if needed.

In [None]:
def oversample(X, y, target):
    """
    INPUT:
    X, y - your data; any array-likes whose first axis indexes
           observations, with y holding 0 (negative) / 1 (positive)
    target - the percentage of positive class
             observations in the output, as a fraction below 1
    OUTPUT:
    X_oversampled, y_oversampled - oversampled data as numpy arrays,
    negatives first, then the original positives, then the replicas
    `oversample` randomly replicates positive observations
    in X, y to achieve the target proportion
    X and y are returned unchanged when y is empty or when the data
    already holds more than the target proportion of positives.
    RAISES:
    ValueError - if target is 1 or more (or NaN), or if positives are
                 requested but y contains none to replicate
    """
    features = np.asarray(X)
    labels = np.asarray(y)
    total = labels.shape[0]
    if total == 0:
        # nothing to resample; also avoids dividing by zero below
        return X, y

    # row positions of each class; counting indices keeps the counts
    # integral even when the labels are floats or booleans
    positive_obs_indices = np.flatnonzero(labels == 1)
    negative_obs_indices = np.flatnonzero(labels == 0)
    positive_count = positive_obs_indices.size
    negative_count = negative_obs_indices.size

    if target < positive_count / float(total):
        return X, y
    if not target < 1:
        raise ValueError(
            "target must be a fraction below 1, got %r" % (target,))

    # determine how many new positive observations to generate
    target_positive_count = int(round(target * negative_count / (1. - target)))
    number_of_new_observations = max(target_positive_count - positive_count, 0)

    if number_of_new_observations and not positive_count:
        raise ValueError("y holds no positive observations to replicate")

    # randomly generate new positive observations
    if number_of_new_observations:
        new_obs_indices = np.random.choice(positive_obs_indices,
                                           size=number_of_new_observations,
                                           replace=True)
    else:
        new_obs_indices = positive_obs_indices[:0]

    # one gather along the first axis works for X of any dimensionality
    order = np.concatenate((negative_obs_indices,
                            positive_obs_indices,
                            new_obs_indices))
    X_oversampled = features[order]
    y_oversampled = labels[order]

    return X_oversampled, y_oversampled

**Update 2016-07-24:**

Because I decided on scikit-learn's GradientBoostingClassifier, I chose not to oversample or undersample our imbalanced classes. Gradient boosting fits each subsequent estimator (iteration) to the errors left by the estimators before it, so each iteration works harder on the samples that are still misclassified. In the end, the predictions of all the estimators in the "ensemble" are combined when making a prediction. This helps with our class imbalance problem, where the "Died" and "Euthanasia" outcomes are under-represented.