# Initial procedures

To outline what I did here: I combined the training and testing sets so that every automatically generated feature is produced in exactly the same way for both. That guarantees that each feature the model is trained on also exists in the testing set, so the model can make predictions for it (whether or not those predictions turn out to be good). In more mathematical terms, I'm making sure that the basis vectors are consistent.

I separated out and created a lot of features here. I'm guessing that there probably is a much more efficient way of doing all of this, but I think that in order to explain the model well, it's best if I have more control over what I think is important (and later on, I will be verifying that it is or is not important for the model).

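Decomposing the combined set back into training and testing sets is trivial: rows without an `ID` are training rows and rows without an `AnimalID` are testing rows (see the cell right after the cleaning step).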

In [33]:
%matplotlib inline

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os

# Directory holding the raw train/test CSV files; every other path in this
# cell is derived from it so the location only has to be changed once.
folder_path = '/home/pbnjeff/Dropbox/KaggleAnimalShelter/'
combined_cleaned_path = os.path.join(folder_path, 'combined_cleaned.csv')


train = pd.read_csv(os.path.join(folder_path, 'train.csv'), encoding='utf-8')
test = pd.read_csv(os.path.join(folder_path, 'test.csv'), encoding='utf-8')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way. The original row labels are kept on purpose (no ignore_index):
# later cells align on them when splitting the frame back apart.
combined = pd.concat([train, test])

def cleanFeatures(df):
    """Replace the raw shelter columns with model-ready features.

    The frame passed in is copied first and is never modified.

    Args:
        df: DataFrame containing the raw columns ``SexuponOutcome``,
            ``AnimalType``, ``AgeuponOutcome``, ``DateTime``, ``Name``,
            ``Color`` and ``Breed``. ``OutcomeSubtype`` is optional.

    Returns:
        A new DataFrame in which those raw columns have been dropped and
        the boolean columns ``Male``, ``NeuteredSpayed``,
        ``SexuponOutcomeKnown``, ``Dog`` and ``HasName`` plus the numeric
        column ``Age (Weeks)`` have been added. An age that cannot be
        parsed is stored as the sentinel 99999. All other columns are
        passed through unchanged.
    """
    # Work on a copy: the column assignments below would otherwise add
    # columns to the caller's frame before the first drop() makes a copy.
    df = df.copy()

    # Separate out the sex and if the animal's genitals are intact
    sex = df['SexuponOutcome']
    df['Male'] = sex.isin([u'Intact Male', u'Neutered Male'])
    df['NeuteredSpayed'] = sex.isin([u'Spayed Female', u'Neutered Male'])
    df['SexuponOutcomeKnown'] = sex.notnull()
    df = df.drop('SexuponOutcome', axis=1)

    # Determine if the animal is a dog or not
    df['Dog'] = df['AnimalType'] == u'Dog'
    df = df.drop('AnimalType', axis=1)

    # Transform each animal's age into a more standard form
    # Note: 99999 indicates an unknown age. Of course, it's not
    # an actual age, but something to separate the knowns from
    # unknowns and effectively "discretize" on a continuous spectrum
    weeks_per_unit = {
        'year': 52, 'years': 52,
        'month': 4, 'months': 4,
        'week': 1, 'weeks': 1,
        # Days used to be left unconverted, so "14 days" counted as 14 weeks.
        'day': 1.0 / 7, 'days': 1.0 / 7,
    }
    # Split "<count> <unit>" once; reindex guarantees both columns exist
    # even when no value contains a space.
    age_parts = (df['AgeuponOutcome'].str.split(' ', expand=True)
                 .reindex(columns=[0, 1]))
    age_count = pd.to_numeric(age_parts[0], errors='coerce')
    # An unrecognised unit keeps the raw count (factor 1), as before.
    age_factor = age_parts[1].map(weeks_per_unit).fillna(1)
    df['Age (Weeks)'] = (age_count * age_factor).fillna(99999)
    df = df.drop('AgeuponOutcome', axis=1)

    # New feature: neutered young (less than ~1 year of age)
    # df['NeuteredYoung'] = ((df['Age (Weeks)'].astype(int) < 53) & df['NeuteredSpayed'])

    # Separate out the date/time into its individual components
    # (the derived features are currently disabled, so the parsed column
    # is dropped again straight away)
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    # df['YearOutcome'] = df['DateTime'].dt.year
    # df['MonthOutcome'] = df['DateTime'].dt.month
    # df['DayOfWeekOutcome'] = df['DateTime'].dt.dayofweek
    df = df.drop('DateTime', axis=1)

    # Features from names
    df['HasName'] = df['Name'].notnull()
    df = df.drop('Name', axis=1)

    # Features from colors (currently disabled)
    #color_list = []
    #for color in df['Color'].unique():
    #    new_colors = color.split('/')
    #    for new_color in new_colors:
    #        new_color_list = new_color.split(' ')
    #        for identifier in new_color_list:
    #            if str(identifier) not in color_list:
    #                color_list.append(str(identifier))
    #for color in color_list:
    #    df[str(color)] = df['Color'].str.contains(color)
    df = df.drop('Color', axis=1)

    # There were so many breeds, so I tried my best to categorize based on
    # what seemed to be popular and recognizable breeds. Later, I think
    # this could be improved by looking a bit more at histograms of these
    # various categorizations I created, and more
    # (currently disabled)
    #df['Mixed'] = (df['Breed'].str.contains('Mix') |
    #               df['Breed'].str.contains('/'))
    #df['Breed'] = df['Breed'].str.rstrip('Mix').str.rstrip()
    #df['PitBull'] = df['Breed'].str.contains('Pit Bull')
    #df['Terrier'] = df['Breed'].str.contains('Terrier')
    #df['Mini'] = df['Breed'].str.contains('Miniature')
    #df['Corgi'] = df['Breed'].str.contains('Corgi')
    #df['Retriever'] = df['Breed'].str.contains('Retriever')
    #df['Hound'] = df['Breed'].str.contains('Hound')
    #df['Husky'] = df['Breed'].str.contains('Husky')
    #df['Beagle'] = df['Breed'].str.contains('Beagle')
    #df['Chihuahua'] = df['Breed'].str.contains('Chihuahua')
    #df['Bulldog'] = df['Breed'].str.contains('Bulldog')
    #df['ShireDog'] = df['Breed'].str.contains('shire')
    #df['GreatPyrenees'] = df['Breed'].str.contains('Great Pyrenees')
    #df['Shepherd'] = df['Breed'].str.contains('Shepherd')
    #df['Dachshund'] = df['Breed'].str.contains('Dachshund')
    #df['Rottweiler'] = df['Breed'].str.contains('Rottweiler')
    #df['CatMixed'] = (df['Breed'].str.contains('Domestic Shorthair') |
    #                 df['Breed'].str.contains('Domestic Longhair') |
    #                 df['Breed'].str.contains('Domestic Medium Hair'))
    #df['ExoticForeignCat'] = ( (df['Dog'] == False) &
    #                   ( df['Breed'].str.contains('Siamese') |
    #                    df['Breed'].str.contains('Himalayan') |
    #                    df['Breed'].str.contains('Persian') |
    #                    df['Breed'].str.contains('Angora') |
    #                    df['Breed'].str.contains('Bombay') |
    #                    df['Breed'].str.contains('Japanese') |
    #                    df['Breed'].str.contains('Bengal') |
    #                    df['Breed'].str.contains('Cymric') |
    #                    df['Breed'].str.contains('Abyssinian') |
    #                    df['Breed'].str.contains('Sphynx') |
    #                    df['Breed'].str.contains('Javanese') |
    #                    df['Breed'].str.contains('Turkish') |
    #                    df['Breed'].str.contains('Chartreaux') |
    #                    df['Breed'].str.contains('Norwegian') |
    #                    df['Breed'].str.contains('Russian') ) )
    #df['MaineCoon'] = df['Breed'].str.contains('Maine Coon')
    #df['Shorthair'] = df['Breed'].str.contains('Shorthair')
    #df['Longhair'] = df['Breed'].str.contains('Longhair')
    #df['Ragdoll'] = df['Breed'].str.contains('Ragdoll')
    #df['American'] = df['Breed'].str.contains('American')
    #df['Australian'] = df['Breed'].str.contains('Australian')
    #df['German'] = df['Breed'].str.contains('German')
    #df['Japanese'] = df['Breed'].str.contains('Japanese')
    #df['Munchkin'] = df['Breed'].str.contains('Munchkin')
    #df['RexCat'] = df['Breed'].str.contains('Rex')
    #df['ColdWeather'] = (df['Breed'].str.contains('Siberian') |
    #                     df['Breed'].str.contains('Russian') |
    #                     df['Breed'].str.contains('Longhair') |
    #                     df['Breed'].str.contains('Norwegian'))
    df = df.drop('Breed', axis=1)

    # This is actually not useful because these are usually just comments
    # on the outcome
    if 'OutcomeSubtype' in df.columns:
        df = df.drop('OutcomeSubtype', axis=1)

    return df

# Swap the raw columns of the stacked train+test frame for the engineered
# features, so both halves end up with exactly the same feature columns.
combined = cleanFeatures(combined)



In [34]:
features = combined.columns

# Turn the boolean flags into 0/1 integers. Object columns are deliberately
# left alone: the ones remaining after cleaning (AnimalID, OutcomeType) hold
# text, and astype(int) raises on them.
for feature in features:
    if combined[feature].dtype == 'bool':
        combined[feature] = combined[feature].astype(int)

Index([           u'AnimalID',                  u'ID',         u'OutcomeType',
                      u'Male',      u'NeuteredSpayed', u'SexuponOutcomeKnown',
                       u'Dog',         u'Age (Weeks)',             u'HasName'],
      dtype='object')

In [36]:
# Undo the stacking: a row with no 'ID' is treated as a training row and a
# row with no 'AnimalID' as a testing row. Each half drops the identifier
# column that is empty for it.
is_train_row = combined['ID'].isnull()
is_test_row = combined['AnimalID'].isnull()
train_cleaned = combined.loc[is_train_row].drop('ID', axis=1)
test_cleaned = combined.loc[is_test_row].drop('AnimalID', axis=1)

I save the cleaned data to disk so that I don't have to re-run the cleaning functions every time I need it.

In [37]:
# Write next to the raw data, reusing folder_path instead of repeating the
# absolute directory (the resulting file names are unchanged).
train_cleaned.to_csv(folder_path + 'traincleaned.csv', index=False)
test_cleaned.to_csv(folder_path + 'testcleaned.csv', index=False)

In [38]:
# Target and feature matrices: identifiers and the outcome itself must not
# be fed to the model as inputs.
target_column = 'OutcomeType'
train_y = train_cleaned[target_column]
train_X = train_cleaned.drop([target_column, 'AnimalID'], axis=1)
test_X = test_cleaned.drop(['ID', target_column], axis=1)

In [40]:
features = train_X.columns

# Any bool/object feature column is cast to int in both matrices, with the
# decision taken from the dtype seen in the training matrix.
castable_dtypes = ('bool', 'object')
for column in features:
    if train_X[column].dtype in castable_dtypes:
        train_X[column] = train_X[column].astype(int)
        test_X[column] = test_X[column].astype(int)

In [47]:
ages_known = train_cleaned[train_cleaned['Age (Weeks)'] < 90000]

In [48]:
ages_unknown = train_cleaned[train_cleaned['Age (Weeks)'] == 99999]

In [None]:
# Distinct outcome labels, in order of first appearance (train_y is still a
# Series at this point).
outcomes = train_y.unique()

In [None]:
# Rebind train_y as a one-column DataFrame so indicator columns can be
# added next to 'OutcomeType' in the cells below.
train_y = pd.DataFrame(train_cleaned['OutcomeType'])

In [None]:
# Same labels as above, recomputed from the DataFrame form of train_y.
outcomes = train_y['OutcomeType'].unique()

In [None]:
# List the feature columns that go into the model, one per line.
for column_name in train_X.columns:
    print(column_name)

In [None]:
# One 0/1 indicator column per outcome label. The column is built directly
# from the comparison so it always carries train_y's own index; a fresh
# Series of zeros gets a 0..n-1 index and would leave NaN in any row whose
# label does not line up, which the later dropna() would silently remove.
for outcome in outcomes:
    label = str(outcome)
    train_y[label] = (train_y['OutcomeType'] == label).astype(int)

In [None]:
# Discard any row that still has a missing value.
# NOTE(review): train_X is not filtered the same way, so if this ever drops
# rows the two frames will no longer have matching lengths — confirm.
train_y = train_y.dropna()

In [None]:
# Only the indicator columns are needed as the training target.
train_y = train_y.drop('OutcomeType', axis = 1)

# From here on `features` names the target columns, not the input features.
features = train_y.columns

In [None]:
# Preview the first three rows of the feature matrix.
train_X.head(3)

In [None]:
# Preview the first three rows of the one-hot target frame.
train_y.head(3)

In [None]:
# (rows, columns) of the feature matrix.
train_X.shape

In [None]:
# (rows, columns) of the target frame; the row count should match train_X.
train_y.shape

In [69]:
short_X = train_X[['Male','NeuteredSpayed','SexuponOutcomeKnown','Dog','Age']]

In [83]:
short_X_test = test_X[['Male','NeuteredSpayed','SexuponOutcomeKnown','Dog','Age']]

## Using TensorFlow to predict

In [1]:
import tensorflow as tf

In [293]:
x = tf.placeholder(tf.float32, shape=[None, train_X.shape[1]])

In [294]:
y_ = tf.placeholder(tf.float32, shape=[None, train_y.shape[1]])

In [295]:
W = tf.Variable(tf.zeros([train_X.shape[1],train_y.shape[1]]))
b = tf.Variable(tf.zeros([train_y.shape[1]]))

In [296]:
# No visible cell creates `sess`, so create it here. An InteractiveSession
# also installs itself as the default session, which the later
# `train_step.run(...)` and `accuracy.eval(...)` calls rely on.
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

In [297]:
y = tf.nn.softmax(tf.matmul(x,W) + b)

In [298]:
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

In [306]:
train_step = tf.train.GradientDescentOptimizer(0.25).minimize(cross_entropy)

In [307]:
start = 0

while start < short_X.shape[0]:
    end = start + 500
    if end > train_X.shape[0]:
        end = train_X.shape[0]
    batch_x = train_X[start:end]
    batch_y = train_y[start:end]
    train_step.run(feed_dict={x: batch_x, y_: batch_y})
    start += 500

In [308]:
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

In [309]:
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [310]:
print(accuracy.eval(feed_dict={x: train_X, y_:train_y}))

0.179056


In [311]:
# Most probable class index for every row of the testing features.
feed_dict = {x: test_X}
predicted_class_op = tf.argmax(y, 1)
classification = sess.run(predicted_class_op, feed_dict)

In [312]:
# Which class indices were actually predicted; a single value means the
# model assigns every test row to the same class.
np.unique(classification)

array([0])

### Output

In [None]:
# Column layout of the file written at the end: the row identifier followed
# by one column per outcome.
headers = ['ID','Adoption','Died','Euthanasia','Return_to_owner','Transfer']

In [None]:
# Empty frame with those columns, filled in by the next cells.
output_df = pd.DataFrame(columns=headers)

In [None]:
# Copy the predictions column by column into output_df, placing column k
# under headers[1 + k] (headers[0] is 'ID').
# NOTE(review): `predictions` is not defined in any visible cell, and this
# assumes its column order matches `headers` — verify against the column
# order of train_y.
n_rows, n_outcomes = predictions.shape[0], predictions.shape[1]
for outcome in range(n_outcomes):
    output_df[headers[1 + outcome]] = [
        predictions[row][outcome] for row in range(n_rows)
    ]



In [None]:
# Attach the test-row identifiers as integers; the assignment aligns on the
# row index of test_cleaned.
output_df['ID'] = test_cleaned['ID'].astype(int)

In [None]:
# Write the result next to the input data, without the index column.
output_df.to_csv(folder_path + 'predicted.csv', index=False, encoding='utf-8')