In [243]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare

In [244]:
# Load the training data and discard the columns this analysis does not use.
unused_columns = ["Name", "OutcomeSubtype"]
train = pd.read_csv("train.csv").drop(unused_columns, axis=1)

train.head()

Unnamed: 0,AnimalID,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


## Chi Square Test
More details can be found on wikipedia: https://en.wikipedia.org/wiki/Chi-squared_test
but in essence we will test whether the outcomes follow a uniform distribution for each breed; this will tell us when knowing the breed is useful and when it is not.

## Building the table
First we need to build a table in the following way:

In [245]:
# Breeds whose total number of animals is at or below this value are dropped.
# 5 is the conventional cutoff, but for future analysis it may be better to be
# more strict.
MIN_BREED_COUNT = 5

# Count the animals for every (breed, outcome) pair, then reshape so that each
# row is a breed and each column is an outcome type.
chi_table = train.groupby(["Breed", "OutcomeType"]).size().reset_index(name="Occurances")
chi_table = chi_table.pivot_table('Occurances', 'Breed', 'OutcomeType')

# A (breed, outcome) pair that never occurs is absent from the groupby result
# and therefore NaN after pivoting; it really means a count of zero.
chi_table = chi_table.fillna(0)

# Filter on the table's own row totals. (The previous version referenced an
# undefined name `table`, which fails on a fresh kernel.)
chi_table = chi_table.loc[chi_table.sum(axis=1) > MIN_BREED_COUNT]
chi_table.head()

OutcomeType,Adoption,Died,Euthanasia,Return_to_owner,Transfer
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Affenpinscher Mix,4,0,0,2,0
Akita Mix,4,0,0,5,2
Alaskan Husky Mix,5,0,0,5,0
American Bulldog,2,0,0,4,0
American Bulldog Mix,37,0,11,35,26


In [246]:
# Test each breed (one row of outcome counts) independently. With no expected
# frequencies supplied, chisquare tests the row against a uniform distribution
# over the outcome types; element [1] of its result is the p-value.
# Built in one pass instead of growing a Series with the removed
# `Series.set_value`; the explicit dtype keeps an empty table well-defined.
p_vals = pd.Series([chisquare(row)[1] for row in chi_table.values], dtype=float)

# Re-key the p-values by breed so they line up with chi_table's rows.
pvals = pd.DataFrame({'p_values': p_vals.values}, index=chi_table.index)

# Outcome counts and the corresponding p-value side by side, one row per breed.
res = pd.concat([chi_table, pvals], axis=1)
res.head()

OutcomeType,Adoption,Died,Euthanasia,Return_to_owner,Transfer,p_values
Breed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Affenpinscher Mix,4,0,0,2,0,0.03057702
Akita Mix,4,0,0,5,2,0.05068966
Alaskan Husky Mix,5,0,0,5,0,0.004701217
American Bulldog,2,0,0,4,0,0.03057702
American Bulldog Mix,37,0,11,35,26,1.891732e-09


In [254]:
# Persist the preliminary results. `columns=` is the supported keyword for
# selecting which columns to write (the old `cols=` spelling is not accepted
# by current pandas).
# NOTE(review): 'Euthanasia' and 'Return_to_owner' are not in this list, so
# they are left out of the file - confirm that this is intentional.
res.to_csv("chi_square_prelim_analysis.csv", columns=['Adoption', 'Died', 'Transfer','p_values'])

## Future analysis
For this analysis we did not manipulate the breed data in any way; for future tests it might be worth putting the breeds into their respective breed groups as outlined here: https://github.com/Kristjansson/Shelter_animals/blob/master/dog_breed_info.csv