# Initial procedures

To outline what I did here, I combined the training and testing sets in order to guarantee that whatever automatically-generated features I obtain are trained by the model and will work in predicting (possibly or not possibly well) for the testing set. In a more math-y way of speaking, I'm making sure that the basis vectors are consistent.

I separated out and created a lot of features here. I'm guessing that there probably is a much more efficient way of doing all of this, but I think that in order to explain the model well, it's best if I have more control over what I think is important (and later on, I will be verifying that it is or is not important for the model).

There's a very trivial procedure to decompose the combined set (the very last lines).

In [115]:
%matplotlib inline

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

folder_path = '/home/pbnjeff/Dropbox/KaggleAnimalShelter/'
combined_cleaned_path = '/home/pbnjeff/Dropbox/KaggleAnimalShelter/combined_cleaned.csv'


train = pd.read_csv(folder_path + "train.csv", encoding='utf-8')
test = pd.read_csv(folder_path + "test.csv", encoding='utf-8')

test = test.rename(columns={'ID':'AnimalID'})

In [116]:
def sexGenitalFeatures(df):
    
    # Separate out the sex and if the animal's genitals are intact
    df['Male'] = ((df['SexuponOutcome'] == u'Intact Male') | (df['SexuponOutcome'] == u'Neutered Male')).astype(int)
    df['NeuteredSpayed'] = ((df['SexuponOutcome'] == u'Spayed Female') | (df['SexuponOutcome'] == u'Neutered Male')).astype(int)
    df['SexuponOutcomeKnown'] = (~df['SexuponOutcome'].isnull()).astype(int)
    df = df.drop('SexuponOutcome',1)
    
    return df

In [117]:
def dogOrNah(df):

    # Determine if the animal is a dog or not
    df['Dog'] = (df['AnimalType'] == u'Dog').astype(int)
    df = df.drop('AnimalType',1)

    return df

In [118]:
def timeFeatures(df):
    
    # Transform each animal's age into a more standard form
    # Note: 99999 indicates an unknown age. I deal with this later
    # by using a RandomForestRegressor to predict what age an
    # animal should be
    
    (df['Age (Weeks)'],df['Units']) = (df['AgeuponOutcome'].str.split(' ', expand=True)[0],
                               df['AgeuponOutcome'].str.split(' ', expand=True)[1])
    df.loc[df['Age (Weeks)'].isnull(),'Age (Weeks)'] = 99999
    df['Age (Weeks)'] = df['Age (Weeks)'].astype(int)
    df.loc[df['Units'] == 'years','Units'] = 'year'
    df.loc[df['Units'] == 'months','Units'] = 'month'
    df.loc[df['Units'] == 'weeks','Units'] = 'week'
    df.loc[df['Units'] == 'year','Age (Weeks)'] = df.loc[df['Units'] == 'year','Age (Weeks)'] * 52
    df.loc[df['Units'] == 'month','Age (Weeks)'] = df.loc[df['Units'] == 'month','Age (Weeks)'] * 4
    df = df.drop(['AgeuponOutcome','Units'],1)

    # New feature: neutered young (less than ~1 year of age)
    # df['NeuteredYoung'] = (df['Age (Weeks)'].astype(int) < 53 & df['NeuteredSpayed'])
    
    # Separate out the date/time into its individual components
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    # df['YearOutcome'] = df['DateTime'].dt.year
    # df['MonthOutcome'] = df['DateTime'].dt.month
    # df['DayOfWeekOutcome'] = df['DateTime'].dt.dayofweek
    df = df.drop('DateTime',1)

    return df
    

In [119]:
def nameFeatures(df):
    
    # Features from names
    df['HasName'] = (~df['Name'].isnull()).astype(int)
    df = df.drop('Name',axis=1)

    return df
  

In [120]:
 
def colorFeatures(df):
    
    # Features from colors
    color_list = []
    #for color in df['Color'].unique():
    #    new_colors = color.split('/')
    #    for new_color in new_colors:
    #        new_color_list = new_color.split(' ')
    #        for identifier in new_color_list:
    #            if str(identifier) not in color_list:
    #                color_list.append(str(identifier))
    #for color in color_list:
    #    df[str(color)] = df['Color'].str.contains(color)
    df = df.drop('Color', axis=1)

    return df

In [121]:
def breedFeatures(df):
    
    # There were so many breeds, so I tried my best to categorize based on
    # what seemed to be popular and recognizable breeds. Later, I think
    # this could be improved by looking a bit more at histograms of these
    # various categorizations I created, and more
    #df['Mixed'] = (df['Breed'].str.contains('Mix') |
    #               df['Breed'].str.contains('/'))
    #df['Breed'] = df['Breed'].str.rstrip('Mix').str.rstrip()
    #df['PitBull'] = df['Breed'].str.contains('Pit Bull')
    #df['Terrier'] = df['Breed'].str.contains('Terrier')
    #df['Mini'] = df['Breed'].str.contains('Miniature')
    #df['Corgi'] = df['Breed'].str.contains('Corgi')
    #df['Retriever'] = df['Breed'].str.contains('Retriever')
    #df['Hound'] = df['Breed'].str.contains('Hound')
    #df['Husky'] = df['Breed'].str.contains('Husky')
    #df['Beagle'] = df['Breed'].str.contains('Beagle')
    #df['Chihuahua'] = df['Breed'].str.contains('Chichuahua')
    #df['Bulldog'] = df['Breed'].str.contains('Bulldog')
    #df['ShireDog'] = df['Breed'].str.contains('shire')
    #df['GreatPyrenees'] = df['Breed'].str.contains('Great Pyrenees')
    #df['Shepherd'] = df['Breed'].str.contains('Shepherd')
    #df['Dachshund'] = df['Breed'].str.contains('Dachshund')
    #df['Rottweiler'] = df['Breed'].str.contains('Rottweiler')
    #df['CatMixed'] = (df['Breed'].str.contains('Domestic Shorthair') |
    #                 df['Breed'].str.contains('Domestic Longhair') |
    #                 df['Breed'].str.contains('Domestic Medium Hair'))
    #df['ExoticForeignCat'] = ( (df['Dog'] == False) & 
    #                   ( df['Breed'].str.contains('Siamese') |
    #                    df['Breed'].str.contains('Himalayan') |
    #                    df['Breed'].str.contains('Persian') |
    #                    df['Breed'].str.contains('Angora') |
    #                    df['Breed'].str.contains('Bombay') |
    #                    df['Breed'].str.contains('Japanese') |
    #                    df['Breed'].str.contains('Bengal') |
    #                    df['Breed'].str.contains('Cymric') |
    #                    df['Breed'].str.contains('Abyssinian') |
    #                    df['Breed'].str.contains('Sphynx') |
    #                    df['Breed'].str.contains('Javanese') |
    #                    df['Breed'].str.contains('Turkish') |
    #                    df['Breed'].str.contains('Chartreaux') |
    #                    df['Breed'].str.contains('Norwegian') |
    #                    df['Breed'].str.contains('Russian') ) )
    #df['MaineCoon'] = df['Breed'].str.contains('Maine Coon')
    #df['Shorthair'] = df['Breed'].str.contains('Shorthair')
    #df['Longhair'] = df['Breed'].str.contains('Longhair')
    #df['Ragdoll'] = df['Breed'].str.contains('Ragdoll')
    #df['American'] = df['Breed'].str.contains('American')
    #df['Australian'] = df['Breed'].str.contains('Australian')
    #df['German'] = df['Breed'].str.contains('German')
    #df['Japanese'] = df['Breed'].str.contains('Japanese')
    #df['Munchkin'] = df['Breed'].str.contains('Munchkin')
    #df['RexCat'] = df['Breed'].str.contains('Rex')
    #df['ColdWeather'] = (df['Breed'].str.contains('Siberian') |
    #                     df['Breed'].str.contains('Russian') |
    #                     df['Breed'].str.contains('Longhair') |
    #                     df['Breed'].str.contains('Norwegian'))
    df = df.drop('Breed',axis=1)

    return df

In [122]:
combined = train.append(test)

combined = sexGenitalFeatures(combined)
combined = dogOrNah(combined)
combined = timeFeatures(combined)
combined = nameFeatures(combined)
combined = colorFeatures(combined)
combined = breedFeatures(combined)

if 'OutcomeSubtype' in combined.columns:
    combined = combined.drop('OutcomeSubtype', axis = 1)

In [163]:
train_clean = combined[~combined['OutcomeType'].isnull()]
test_clean = combined[combined['OutcomeType'].isnull()]

In [156]:
def fillMissingAges(df):
    
    '''
    df is a pandas DataFrame object which contains
    only the columns with the relevant X and y
    matrix elements
    '''
    
    known = df[df['Age (Weeks)'] < 99999]
    unknown = df[df['Age (Weeks)'] == 99999]
    
    rf = RandomForestRegressor(n_estimators=250, max_depth=None, min_samples_split=1)
    
    
        
    train_X = known.drop(['OutcomeType','Age (Weeks)','AnimalID'], axis = 1)
    train_y = pd.DataFrame(known['Age (Weeks)'])
    
    rf.fit(train_X, train_y)
    
    for index, row in unknown.iterrows():
        
        predict_X = row.drop(['OutcomeType','AnimalID','Age (Weeks)'])
        df.loc[index, 'Age (Weeks)'] = rf.predict(predict_X.reshape(1, -1))
    
    return df

In [164]:
train_clean = fillMissingAges(train_clean)



In [165]:
test_clean = fillMissingAges(test_clean)



In [166]:
train_clean.to_csv('/home/pbnjeff/Dropbox/KaggleAnimalShelter/traincleaned.csv',index=False)
test_clean.to_csv('/home/pbnjeff/Dropbox/KaggleAnimalShelter/testcleaned.csv',index=False)

In [167]:
train_y = pd.DataFrame(train_clean['OutcomeType'])
train_X = train_clean.drop(['OutcomeType','AnimalID'],axis=1)
test_X = test_clean.drop(['AnimalID','OutcomeType'],axis=1)

In [170]:
train_y.head(3)

Unnamed: 0,OutcomeType
0,Return_to_owner
1,Euthanasia
2,Adoption


In [169]:
train_X.head(3)

Unnamed: 0,Male,NeuteredSpayed,SexuponOutcomeKnown,Dog,Age (Weeks),HasName
0,1,1,1,1,52.0,1
1,0,1,1,0,52.0,1
2,1,1,1,1,104.0,1


In [171]:
test_X.head(3)

Unnamed: 0,Male,NeuteredSpayed,SexuponOutcomeKnown,Dog,Age (Weeks),HasName
0,0,0,1,1,40.0,1
1,0,1,1,1,104.0,1
2,1,1,1,0,52.0,1


In [172]:
def createYmatrix(df):
    
    outcomes = df['OutcomeType'].unique()
    
    for outcome in outcomes:
        df[str(outcome)] = pd.Series(data = [0] * df.shape[0])
        df.loc[df['OutcomeType'] == str(outcome), str(outcome)] = 1
        
    return df.drop('OutcomeType', axis = 1)

In [173]:
train_y = createYmatrix(train_y)

## Using Random Forests to predict

In [176]:
rf = RandomForestClassifier(n_estimators=250, max_depth=None, min_samples_split=1)

In [177]:
rf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [188]:
predictions = np.array(rf.predict_proba(test_X))

In [195]:
predictions.shape

(5, 11456, 2)

In [224]:
for i in range(5):
    print(predictions[i][1][1])

0.281526443793
0.0277100890159
0.53044678435
0.15829306776
0.00202361508164


In [235]:
headers = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']

In [236]:
output = pd.DataFrame(columns=headers)

In [246]:
for i in range(predictions.shape[1]):
    
    # order should be 2,5,1,0,3
    output.loc[i, 'ID'] = test_clean.iloc[i]['AnimalID']
    output.loc[i, 'Adoption'] = predictions[2][i][1]
    output.loc[i, 'Died'] = predictions[4][i][1]
    output.loc[i, 'Euthanasia'] = predictions[1][i][1]
    output.loc[i, 'Return_to_owner'] = predictions[0][i][1]
    output.loc[i, 'Transfer'] = predictions[3][i][1]

In [249]:
output.to_csv('/home/pbnjeff/Dropbox/KaggleAnimalShelter/predicted.csv',index=False)

## Using TensorFlow to predict

In [250]:
import tensorflow as tf

In [255]:
x = tf.placeholder(tf.float32, shape=[None, train_X.shape[1]])

In [256]:
y_ = tf.placeholder(tf.float32, shape=[None, train_y.shape[1]])

In [257]:
W = tf.Variable(tf.zeros([train_X.shape[1],train_y.shape[1]]))
b = tf.Variable(tf.zeros([train_y.shape[1]]))

In [258]:
sess.run(tf.initialize_all_variables())

In [259]:
y = tf.nn.softmax(tf.matmul(x,W) + b)

In [260]:
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

In [261]:
train_step = tf.train.GradientDescentOptimizer(0.25).minimize(cross_entropy)

In [263]:
start = 0

while start < train_X.shape[0]:
    end = start + 500
    if end > train_X.shape[0]:
        end = train_X.shape[0]
    batch_x = train_X[start:end]
    batch_y = train_y[start:end]
    train_step.run(feed_dict={x: batch_x, y_: batch_y})
    start += 500

In [271]:
probabilities = y

In [272]:
print(probabilities.eval(feed_dict={x: test_X[50:100]}))

[[ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  nan  nan]
 [ nan  nan  nan  na

In [269]:
print(prediction.eval(feed_dict={x: test_X[50:100]}))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [308]:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

In [309]:
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [310]:
print(accuracy.eval(feed_dict={x: train_X, y_:train_y}))

0.179056


In [311]:
feed_dict = {x: test_X}
classification = sess.run(tf.argmax(y,1), feed_dict)

In [312]:
np.unique(classification)

array([0])

### Output

In [None]:
headers = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']

In [None]:
output_df = pd.DataFrame(columns=headers)

In [None]:
for outcome in range(predictions.shape[1]): 
    
    new_list = []

    for row in range(predictions.shape[0]):
        new_list.append(predictions[row][outcome])
        
    output_df[headers[1+outcome]] = new_list



In [None]:
output_df['ID'] = test_cleaned['ID'].astype(int)

In [None]:
output_df.to_csv(folder_path + 'predicted.csv', index=False, encoding='utf-8')