# Project 1: Data Exploration, Visualization and Dimensionality Reduction
## Animal Shelter Outcomes: A Classification Problem
#### Alex Matsunami, RJ Smith, Cory Nichols

### Data Preprocessing 

In [310]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler,Imputer
import seaborn as sns
%matplotlib inline

In [311]:
df = pd.read_csv('~/desktop/train.csv')

In [317]:
df.head(3)

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [313]:
df.describe()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A705677,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


In [314]:
print df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.2+ MB
None


In [315]:
del df['OutcomeSubtype'], df['AnimalID']

In [319]:
df = df.dropna(subset=['SexuponOutcome'])

print 'Number of unique, non nan ages:', len(np.unique(df.AgeuponOutcome[~pd.isnull(df['AgeuponOutcome'])])) 

# There are 44 null age values, lets use a random combination of the most frequent strings to fill our 11 objects - 
# this should not have a large impact on the analysis

from collections import Counter
from random import randint

def pickRand(frame = 'df', stringCol = 'col', topN = 3):
    return Counter(frame[stringCol].values).most_common(topN)[randint(0, topN-1)][0]



# fill the nulls with top 3 randomly which make up ~40% of the dataset and have roughly similar counts
# further, there is a string for age "0 years" with 22 instances. This provides about "0" value. We will fill this
# column the same way

df['AgeuponOutcome'] = [pickRand(df,'AgeuponOutcome',3)\
                        if pd.isnull(i) or i.lower() == '0 years' else i for i in df['AgeuponOutcome'].values]
 
# Fill 'Unknown' values in SexuponOutcome feature with the same logic as above
df['SexuponOutcome'] = [pickRand(df,'SexuponOutcome',3)\
                        if i.lower() == 'unknown' else i for i in df['SexuponOutcome'].values]

Number of unique, non nan ages: 43


Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,Elsa,2014-04-25 13:04:00,Transfer,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,Jimmy,2015-03-28 13:11:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,,2015-04-30 17:02:00,Transfer,Cat,Neutered Male,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,Lucy,2014-02-04 17:17:00,Adoption,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,,2014-05-03 07:48:00,Adoption,Dog,Spayed Female,1 year,Cairn Terrier,White


In [309]:
# now that we have taken care of the dataset's integrity, it's time to slice it up
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26728 entries, 0 to 26728
Data columns (total 8 columns):
Name              19037 non-null object
DateTime          26728 non-null object
OutcomeType       26728 non-null object
AnimalType        26728 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26728 non-null object
Breed             26728 non-null object
Color             26728 non-null object
dtypes: object(8)
memory usage: 1.8+ MB


In [320]:
print 'There are %d unique names in the data set' % len(np.unique(df['Name'].values))

# 6375 unique names. That is quite a few names. Let's generalize into a binary feature called has_name instead:

df['has_name'] = [0 if pd.isnull(i) else 1 for i in df['Name']]

# Further, let's split out sex: more to come

# df['test'] = [i.split() for i in df['SexuponOutcome']]

There are 6375 unique names in the data set
