In [1]:
%matplotlib inline

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [3]:
folder_path = "c:/users/jeff/dropbox/kaggle_animal_shelter/"

In [4]:
train = pd.read_csv(folder_path + "train.csv", encoding='utf-8')
test = pd.read_csv(folder_path + "test.csv", encoding='utf-8')

In [5]:
combined = train.append(test)

### Feature engineering/cleaning

In [6]:
def cleanFeatures(df):
    
    # Separate out the sex and if the animal's genitals are intact
    df['Male'] = (df['SexuponOutcome'] == u'Intact Male') | (df['SexuponOutcome'] == u'Neutered Male')
    df['NeuteredSpayed'] = (df['SexuponOutcome'] == u'Spayed Female') | (df['SexuponOutcome'] == u'Neutered Male')
    df['SexuponOutcomeKnown'] = ~df['SexuponOutcome'].isnull()
    df = df.drop('SexuponOutcome',1)

    # Determine if the animal is a dog or not
    df['Dog'] = df['AnimalType'] == u'Dog'
    df = df.drop('AnimalType',1)

    # Transform each animal's age into a more standard form
    # Note: 99999 indicates an unknown age. Of course, it's not
    # an actual age, but something to separate the knowns from
    # unknowns and effectively "discretize" on a continuous spectrum
    (df['Age (Weeks)'],df['Units']) = (df['AgeuponOutcome'].str.split(' ', expand=True)[0],
                               df['AgeuponOutcome'].str.split(' ', expand=True)[1])
    df['Age (Weeks)'] = df['Age (Weeks)'].convert_objects(convert_numeric=True)
    df.loc[df['Units'] == 'years','Units'] = 'year'
    df.loc[df['Units'] == 'months','Units'] = 'month'
    df.loc[df['Units'] == 'weeks','Units'] = 'week'
    df.loc[df['Units'] == 'year','Age (Weeks)'] = df.loc[df['Units'] == 'year','Age (Weeks)'] * 52
    df.loc[df['Units'] == 'month','Age (Weeks)'] = df.loc[df['Units'] == 'month','Age (Weeks)'] * 4
    df.loc[df['Age (Weeks)'].isnull(),'Age (Weeks)'] = 99999
    df = df.drop(['AgeuponOutcome','Units'],1)

    # New feature: neutered young (less than ~1 year of age)
    df['NeuteredYoung'] = (df['Age (Weeks)'].astype(int) < 53 &
                           df['NeuteredSpayed'])
    
    # Separate out the date/time into its individual components
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    df['YearOutcome'] = df['DateTime'].dt.year
    df['MonthOutcome'] = df['DateTime'].dt.month
    df['DayOutcome'] = df['DateTime'].dt.day
    df['HourOutcome'] = df['DateTime'].dt.hour
    df['MinuteOutcome'] = df['DateTime'].dt.minute
    df['SecondOutcome'] = df['DateTime'].dt.second
    df['DayOfWeekOutcome'] = df['DateTime'].dt.dayofweek
    df = df.drop('DateTime',1)

    # Features from names
    df['HasName'] = ~df['Name'].isnull()
    df['ShortName'] = df['Name'].str.len() < 5
    names = df[~df['Name'].isnull()]['Name']
    names_count = names.value_counts().to_frame()
    names_list = names_count.index
    tolerance_common = 0.005
    tolerance_rare = 0.001
    df['Common'] = pd.Series()
    df['Uncommon'] = pd.Series()
    df['Rare'] = pd.Series()
        # For animals with unknown names, I'll assume it's equally likely
        # for it to take on a name of varying rarity. Really, though,
        # it should follow some probability distribution
    df.loc[df['Name'].isnull(),'Common'] = True
    df.loc[df['Name'].isnull(),'Uncommon'] = True
    df.loc[df['Name'].isnull(),'Rare'] = True
    names_count['Rarity'] = pd.Series()
    names_count.loc[names_count[0L].astype(float) / names.shape[0] > 
                    tolerance_common,
                    'Rarity'] = 0
    names_count.loc[(names_count[0L].astype(float) / names.shape[0] <= 0.005) & 
                    names_count[0L].astype(float) / names.shape[0] > 0.001,
                    'Rarity'] = 1
    names_count.loc[names_count[0L].astype(float) / names.shape[0] <= 0.001,
                    'Rarity'] = 2
    for name in names_list:
        if names_count['Rarity'][name] == 0:
            df.loc[df['Name'] == name,'Common'] = True
            df.loc[df['Name'] == name,'Uncommon'] = False
            df.loc[df['Name'] == name,'Rare'] = False
        elif names_count['Rarity'][name] == 1:
            df.loc[df['Name'] == name,'Common'] = False
            df.loc[df['Name'] == name,'Uncommon'] = True
            df.loc[df['Name'] == name,'Rare'] = False
        else:
            df.loc[df['Name'] == name,'Common'] = False
            df.loc[df['Name'] == name,'Uncommon'] = False
            df.loc[df['Name'] == name,'Rare'] = True
    df = df.drop('Name',axis=1)

    # Features from colors
    color_list = []
    for color in df['Color'].unique():
        new_colors = color.split('/')
        for new_color in new_colors:
            new_color_list = new_color.split(' ')
            for identifier in new_color_list:
                if str(identifier) not in color_list:
                    color_list.append(str(identifier))
    for color in color_list:
        df[str(color)] = df['Color'].str.contains(color)
    df = df.drop('Color', axis=1)

    # There were so many breeds, so I tried my best to categorize based on
    # what seemed to be popular and recognizable breeds. Later, I think
    # this could be improved by looking a bit more at histograms of these
    # various categorizations I created, and more
    df['Mixed'] = (df['Breed'].str.contains('Mix') |
                   df['Breed'].str.contains('/'))
    df['Breed'] = df['Breed'].str.rstrip('Mix').str.rstrip()
    df['PitBull'] = df['Breed'].str.contains('Pit Bull')
    df['Terrier'] = df['Breed'].str.contains('Terrier')
    df['Mini'] = df['Breed'].str.contains('Miniature')
    df['Corgi'] = df['Breed'].str.contains('Corgi')
    df['Retriever'] = df['Breed'].str.contains('Retriever')
    df['Hound'] = df['Breed'].str.contains('Hound')
    df['Husky'] = df['Breed'].str.contains('Husky')
    df['Beagle'] = df['Breed'].str.contains('Beagle')
    df['Chihuahua'] = df['Breed'].str.contains('Chichuahua')
    df['Bulldog'] = df['Breed'].str.contains('Bulldog')
    df['ShireDog'] = df['Breed'].str.contains('shire')
    df['GreatPyrenees'] = df['Breed'].str.contains('Great Pyrenees')
    df['Shepherd'] = df['Breed'].str.contains('Shepherd')
    df['Dachshund'] = df['Breed'].str.contains('Dachshund')
    df['Rottweiler'] = df['Breed'].str.contains('Rottweiler')
    df['CatMixed'] = (df['Breed'].str.contains('Domestic Shorthair') |
                     df['Breed'].str.contains('Domestic Longhair') |
                     df['Breed'].str.contains('Domestic Medium Hair'))
    df['ExoticForeignCat'] = ( (df['Dog'] == False) & 
                       ( df['Breed'].str.contains('Siamese') |
                        df['Breed'].str.contains('Himalayan') |
                        df['Breed'].str.contains('Persian') |
                        df['Breed'].str.contains('Angora') |
                        df['Breed'].str.contains('Bombay') |
                        df['Breed'].str.contains('Japanese') |
                        df['Breed'].str.contains('Bengal') |
                        df['Breed'].str.contains('Cymric') |
                        df['Breed'].str.contains('Abyssinian') |
                        df['Breed'].str.contains('Sphynx') |
                        df['Breed'].str.contains('Javanese') |
                        df['Breed'].str.contains('Turkish') |
                        df['Breed'].str.contains('Chartreaux') |
                        df['Breed'].str.contains('Norwegian') |
                        df['Breed'].str.contains('Russian') ) )
    df['MaineCoon'] = df['Breed'].str.contains('Maine Coon')
    df['Shorthair'] = df['Breed'].str.contains('Shorthair')
    df['Longhair'] = df['Breed'].str.contains('Longhair')
    df['Ragdoll'] = df['Breed'].str.contains('Ragdoll')
    df['American'] = df['Breed'].str.contains('American')
    df['Australian'] = df['Breed'].str.contains('Australian')
    df['German'] = df['Breed'].str.contains('German')
    df['Japanese'] = df['Breed'].str.contains('Japanese')
    df['Munchkin'] = df['Breed'].str.contains('Munchkin')
    df['RexCat'] = df['Breed'].str.contains('Rex')
    df['ColdWeather'] = (df['Breed'].str.contains('Siberian') |
                         df['Breed'].str.contains('Russian') |
                         df['Breed'].str.contains('Longhair') |
                         df['Breed'].str.contains('Norwegian'))
    df = df.drop('Breed',axis=1)

    # This is actually not useful because these are usually just comments
    # on the outcome and exist because the outcome existed in the first place
    if 'OutcomeSubtype' in df.columns:
        df = df.drop('OutcomeSubtype',axis=1)
        
    return df

In [7]:
combined = cleanFeatures(combined)

In [8]:
train_cleaned = combined[combined['ID'].isnull()].drop(['ID'],axis=1)

In [9]:
test_cleaned = combined[combined['AnimalID'].isnull()].drop(['AnimalID'],axis=1)

## RandomForest Implementation

In [10]:
train_y = train_cleaned['OutcomeType']
train_X = train_cleaned.drop(['OutcomeType','AnimalID'],axis=1)

In [11]:
test_X = test_cleaned.drop(['ID','OutcomeType'],axis=1)

In [12]:
rf = RandomForestClassifier(n_estimators=250, max_depth=None, 
                            min_samples_split=1)

In [13]:
rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
predictions = rf.predict_proba(test_X)

In [15]:
headers = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']

In [16]:
output_df = pd.DataFrame(columns=headers)

In [17]:
for outcome in range(predictions.shape[1]): 
    
    new_list = []

    for row in range(predictions.shape[0]):
        new_list.append(predictions[row][outcome])
        
    output_df[headers[1+outcome]] = new_list



In [18]:
output_df['ID'] = test_cleaned['ID'].astype(int)

In [19]:
output_df.to_csv(folder_path + 'predicted.csv', index=False, encoding='utf-8')