In [256]:
%matplotlib inline

In [257]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [258]:
folder_path = "c:/users/jeff/dropbox/kaggle_animal_shelter/"

In [259]:
train = pd.read_csv(folder_path + "train.csv", encoding='utf-8')
test = pd.read_csv(folder_path + "test.csv", encoding='utf-8')

In [273]:
combined = train.append(test)

In [274]:
combined

Unnamed: 0,AgeuponOutcome,AnimalID,AnimalType,Breed,Color,DateTime,ID,Name,OutcomeSubtype,OutcomeType,SexuponOutcome
0,1 year,A671945,Dog,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00,,Hambone,,Return_to_owner,Neutered Male
1,1 year,A656520,Cat,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00,,Emily,Suffering,Euthanasia,Spayed Female
2,2 years,A686464,Dog,Pit Bull Mix,Blue/White,2015-01-31 12:28:00,,Pearce,Foster,Adoption,Neutered Male
3,3 weeks,A683430,Cat,Domestic Shorthair Mix,Blue Cream,2014-07-11 19:09:00,,,Partner,Transfer,Intact Male
4,2 years,A667013,Dog,Lhasa Apso/Miniature Poodle,Tan,2013-11-15 12:52:00,,,Partner,Transfer,Neutered Male
5,1 month,A677334,Dog,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014-04-25 13:04:00,,Elsa,Partner,Transfer,Intact Female
6,3 weeks,A699218,Cat,Domestic Shorthair Mix,Blue Tabby,2015-03-28 13:11:00,,Jimmy,Partner,Transfer,Intact Male
7,3 weeks,A701489,Cat,Domestic Shorthair Mix,Brown Tabby,2015-04-30 17:02:00,,,Partner,Transfer,Unknown
8,5 months,A671784,Dog,American Pit Bull Terrier Mix,Red/White,2014-02-04 17:17:00,,Lucy,,Adoption,Spayed Female
9,1 year,A677747,Dog,Cairn Terrier,White,2014-05-03 07:48:00,,,Offsite,Adoption,Spayed Female


### Separating out sex and spay/neuter info

In [264]:
def cleanFeatures(df):
    df['Male'] = (df['SexuponOutcome'] == u'Intact Male') | (df['SexuponOutcome'] == u'Neutered Male')
    df['NeuteredSpayed'] = (df['SexuponOutcome'] == u'Spayed Female') | (df['SexuponOutcome'] == u'Neutered Male')
    df['SexuponOutcomeKnown'] = ~df['SexuponOutcome'].isnull()

    df = df.drop('SexuponOutcome',1)

    df['Dog'] = df['AnimalType'] == u'Dog'
    df = df.drop('AnimalType',1)

    (df['Age (Weeks)'],df['Units']) = (df['AgeuponOutcome'].str.split(' ', expand=True)[0],
                               df['AgeuponOutcome'].str.split(' ', expand=True)[1])

    df['Age (Weeks)'] = df['Age (Weeks)'].convert_objects(convert_numeric=True)

    df.loc[df['Units'] == 'years','Units'] = 'year'
    df.loc[df['Units'] == 'months','Units'] = 'month'
    df.loc[df['Units'] == 'weeks','Units'] = 'week'

    df.loc[df['Units'] == 'year','Age (Weeks)'] = df.loc[df['Units'] == 'year','Age (Weeks)'] * 52
    df.loc[df['Units'] == 'month','Age (Weeks)'] = df.loc[df['Units'] == 'month','Age (Weeks)'] * 4

    df.loc[df['Age (Weeks)'].isnull(),'Age (Weeks)'] = 0

    df = df.drop(['AgeuponOutcome','Units'],1)

    df['DateTime'] = pd.to_datetime(df['DateTime'])

    df['YearOutcome'] = df['DateTime'].dt.year
    df['MonthOutcome'] = df['DateTime'].dt.month

    df = df.drop('DateTime',1)

    df['HasName'] = ~df['Name'].isnull()

    df = df.drop('Name',axis=1)

    df['IsMix'] = df['Breed'].str.contains('Mix')

    color_list = []

    for color in df['Color'].unique():
        new_colors = color.split('/')
        for new_color in new_colors:
            new_color_list = new_color.split(' ')
            for identifier in new_color_list:
                if str(identifier) not in color_list:
                    color_list.append(str(identifier))

    for color in color_list:
        df[str(color)] = df['Color'].str.contains(color)

    df = df.drop('Color', axis=1)

    breed_list = []

    df['Mixed'] = df['Breed'].str.contains('Mix')
    df['Breed'] = df['Breed'].str.rstrip('Mix').str.rstrip()

    df['PitBull'] = df['Breed'].str.contains('Pit Bull')
    df['Terrier'] = df['Breed'].str.contains('Terrier')
    df['Mini'] = df['Breed'].str.contains('Miniature')

    df_breeds = df['Breed'].reset_index().drop('index',axis=1)
    df_breeds_counts = df_breeds['Breed'].value_counts().to_frame().rename(columns={0L: 'Frequency'})
    df_breeds_counts.insert(0, 'Name', df_breeds_counts)

    df['Corgi'] = df['Breed'].str.contains('Corgi')
    df['Retriever'] = df['Breed'].str.contains('Retriever')
    df['Hound'] = df['Breed'].str.contains('Hound')
    df['Husky'] = df['Breed'].str.contains('Husky')
    df['Beagle'] = df['Breed'].str.contains('Beagle')
    df['Chihuahua'] = df['Breed'].str.contains('Chichuahua')
    df['Bulldog'] = df['Breed'].str.contains('Bulldog')

    df['CatMixed'] = (df['Breed'].str.contains('Domestic Shorthair') |
                     df['Breed'].str.contains('Domestic Longhair') |
                     df['Breed'].str.contains('Domestic Medium Hair'))

    df = df.drop('Breed',axis=1)

    if 'OutcomeSubtype' in df.columns:
        df = df.drop('OutcomeSubtype',axis=1)

In [277]:
combined = cleanFeatures(combined)

In [278]:
combined

Unnamed: 0,AnimalID,ID,OutcomeType,Male,NeuteredSpayed,SexuponOutcomeKnown,Dog,Age (Weeks),YearOutcome,MonthOutcome,...,Terrier,Mini,Corgi,Retriever,Hound,Husky,Beagle,Chihuahua,Bulldog,CatMixed
0,A671945,,Return_to_owner,True,True,True,True,52,2014,2,...,False,False,False,False,False,False,False,False,False,False
1,A656520,,Euthanasia,False,True,True,False,52,2013,10,...,False,False,False,False,False,False,False,False,False,True
2,A686464,,Adoption,True,True,True,True,104,2015,1,...,False,False,False,False,False,False,False,False,False,False
3,A683430,,Transfer,True,False,True,False,3,2014,7,...,False,False,False,False,False,False,False,False,False,True
4,A667013,,Transfer,True,True,True,True,104,2013,11,...,False,True,False,False,False,False,False,False,False,False
5,A677334,,Transfer,False,False,True,True,4,2014,4,...,True,False,False,False,False,False,False,False,False,False
6,A699218,,Transfer,True,False,True,False,3,2015,3,...,False,False,False,False,False,False,False,False,False,True
7,A701489,,Transfer,False,False,True,False,3,2015,4,...,False,False,False,False,False,False,False,False,False,True
8,A671784,,Adoption,False,True,True,True,20,2014,2,...,True,False,False,False,False,False,False,False,False,False
9,A677747,,Adoption,False,True,True,True,52,2014,5,...,True,False,False,False,False,False,False,False,False,False


In [279]:
train_cleaned = combined[combined['ID'].isnull()].drop('ID',axis=1)

In [282]:
test_cleaned = combined[combined['AnimalID'].isnull()].drop('AnimalID',axis=1)

## RandomForest Implementation

In [283]:
train_y = train_cleaned['OutcomeType']
train_X = train_cleaned.drop(['OutcomeType','AnimalID'],axis=1)

In [289]:
test_X = test_cleaned.drop(['ID','OutcomeType'],axis=1)

In [285]:
rf = RandomForestClassifier(n_estimators=100)

In [291]:
rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [306]:
predictions = rf.predict_proba(test_X)

In [303]:
headers = ['AnimalID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']

In [312]:
output_df = pd.DataFrame(columns=headers)

In [317]:
test_cleaned['ID'] = 'A' + test_cleaned['ID'].astype(str)

In [318]:
test_cleaned

Unnamed: 0,ID,OutcomeType,Male,NeuteredSpayed,SexuponOutcomeKnown,Dog,Age (Weeks),YearOutcome,MonthOutcome,HasName,...,Terrier,Mini,Corgi,Retriever,Hound,Husky,Beagle,Chihuahua,Bulldog,CatMixed
0,A1.0,,False,False,True,True,40,2015,10,True,...,False,False,False,True,False,False,False,False,False,False
1,A2.0,,False,True,True,True,104,2014,7,True,...,False,False,False,False,False,True,False,False,False,False
2,A3.0,,True,True,True,False,52,2016,1,True,...,False,False,False,False,False,False,False,False,False,True
3,A4.0,,True,False,True,True,16,2013,12,True,...,False,False,False,False,False,False,False,False,False,False
4,A5.0,,True,True,True,True,104,2015,9,True,...,False,True,False,False,False,False,False,False,False,False
5,A6.0,,True,True,True,True,156,2015,6,True,...,False,False,False,False,False,False,True,False,False,False
6,A7.0,,True,True,True,False,676,2014,3,True,...,False,False,False,False,False,False,False,False,False,True
7,A8.0,,False,True,True,False,24,2014,6,True,...,False,False,False,False,False,False,False,False,False,True
8,A9.0,,False,True,True,True,12,2014,11,True,...,True,False,False,False,False,False,False,False,False,False
9,A10.0,,False,True,True,True,52,2014,4,True,...,False,False,False,False,False,False,False,False,False,False
