# Project 1: Data Exploration, Visualization and Dimensionality Reduction
## Animal Shelter Outcomes: A Classification Problem
#### Alex Matsunami, RJ Smith, Cory Nichols

### Data Preprocessing 

In [340]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler,Imputer
import seaborn as sns
%matplotlib inline

In [351]:
df = pd.read_csv('~/desktop/train.csv')

In [352]:
df.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [353]:
df.describe()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A705677,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


In [354]:
print df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.2+ MB
None


In [346]:
del df['OutcomeSubtype'], df['AnimalID']
df = df.dropna(subset=['SexuponOutcome'])

print 'Number of unique, non nan ages:', len(np.unique(df.AgeuponOutcome[~pd.isnull(df['AgeuponOutcome'])])) 

from collections import Counter
from random import randint

randChooser = []
for i in ['AgeuponOutcome', 'SexuponOutcome']:
    randChooser.append(Counter(df[i].values).most_common(3))

# fill the nulls with top 3 randomly which make up ~40% of the dataset and have roughly similar counts
# further, there is a string for age "0 years" with 22 instances. This provides about "0" value. 
# We will fill these columns by iterating and randomly choosing between the top 3 values instead of assigning
# an arbitrary value

df['AgeuponOutcome'] = [randChooser[0][randint(0,2)][0]
                        if pd.isnull(i) or i.lower() == '0 years' else i for i in df['AgeuponOutcome'].values]
 
# Fill 'Unknown' values in SexuponOutcome feature with the same logic as above
df['SexuponOutcome'] = [randChooser[1][randint(0,2)][0]\
                        if i.lower() == 'unknown' else i for i in df['SexuponOutcome'].values]

Number of unique, non nan ages: 44


In [355]:
# now that we have taken care of the dataset's integrity, it's time to slice it up
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.2+ MB


In [348]:
print 'There are %d unique names in the data set' % len(np.unique(df['Name'].values))

# 6375 unique names. That is quite a few names. Let's generalize into a binary feature called has_name instead:
df['has_name'] = [0 if pd.isnull(i) else 1 for i in df['Name']]

# Further, let's split out the intactness of an animal: whether or not the animal has been spayed or neutered
# as well as the gender:
df['intact'] = [i.split()[0] for i in df['SexuponOutcome']]
df['gender'] = [i.split()[1] for i in df['SexuponOutcome']]

# we will also convert ageuponoutcome into days:
ageMap = {'year': 365, 'week': 7, 'month': 30, 'day': 1} # use 30 for month
df['AgeuponOutcome'] = [i.rstrip('s').split() for i in df['AgeuponOutcome'].values] # clean text to keep mapping simple

ageList = []
for i in df['AgeuponOutcome'].values:
    for k,v in ageMap.items():
        if i[1] == k:
            ageList.append(int(i[0]) * v)
df['AgeuponOutcome'] = ageList

There are 6375 unique names in the data set


In [349]:
# lets also identify potentially 'aggressive' breeds using a bit of regex and top 10 aggressive breeds list: 
# http://www.therichest.com/rich-list/the-biggest/the-worlds-10-most-dangerous-dog-breeds/?view=all

import re
agg_breeds = ['Bull','Doberman','Rottweiler','Husky','German','Boxer','Malamute']
df['Breed'] = [re.split('\W+',i) for i in df['Breed']]

aggColumn = []
for i in df['Breed'].values:
    if len(agg_breeds) != len(set(agg_breeds).difference(i)):
        aggColumn.append((1))
    else:
        aggColumn.append(0)
        
df['is_aggressive'] = aggColumn