In [1]:
import numpy as np
import pandas as pd

# Load the dog dataset from the CSV file into a DataFrame
dogs = pd.read_csv(filepath_or_buffer='dog_data.csv')

# Preview the first five rows to check which columns were loaded
dogs.head(n=5)

Unnamed: 0,is_rescue,weight,tail_length,age,color,likes_children,is_hypoallergenic,name,breed
0,0,6,2.25,2,black,1,0,Huey,chihuahua
1,0,4,5.36,4,black,0,0,Cherish,chihuahua
2,0,7,3.63,3,black,0,1,Becka,chihuahua
3,0,5,0.19,2,black,0,0,Addie,chihuahua
4,0,5,0.37,1,black,1,1,Beverlee,chihuahua


In [2]:
# Calculate the number of whippets that are rescues

# is_rescue values for whippet rows only (reused by the next cell for the total count)
whippet_rescue = dogs.loc[dogs.breed == 'whippet', 'is_rescue']

# Count the entries equal to 1 by summing the boolean mask
num_whippet_rescues = (whippet_rescue == 1).sum()
num_whippet_rescues

6

In [3]:
# Total number of whippets: one entry per whippet in the filtered is_rescue Series
num_whippets = whippet_rescue.shape[0]
num_whippets

100

Null: 8% of whippets are rescues

Alternative: more or less than 8% of whippets are rescues

In [4]:
# Run a two-sided binomial test of the null hypothesis that 8% of whippets are rescues.
# scipy.stats.binom_test was deprecated in SciPy 1.10 and removed in SciPy 1.12;
# binomtest (available since SciPy 1.7) is its replacement. It returns a result
# object rather than a bare p-value, so the p-value is read from `.pvalue`.
from scipy.stats import binomtest
pval = binomtest(num_whippet_rescues, n=num_whippets, p=0.08, alternative='two-sided').pvalue

# True means the result is significant at the 0.05 level (reject the null)
pval < 0.05

False

Null: whippets, terriers, and pitbulls all weigh the same amount on average
    
Alternative: whippets, terriers, and pitbulls do not all weigh the same amount on average (at least one pair of breeds has differing average weights)

In [5]:
# Pull out the weight column for each of the three breeds being compared
wt_whippets = dogs.loc[dogs.breed == 'whippet', 'weight']
wt_terriers = dogs.loc[dogs.breed == 'terrier', 'weight']
wt_pitbulls = dogs.loc[dogs.breed == 'pitbull', 'weight']

# One-way ANOVA across the three weight samples; keep the F statistic and p-value
from scipy.stats import f_oneway
Fstat, pval = f_oneway(wt_whippets, wt_terriers, wt_pitbulls)

# True means the result is significant at the 0.05 level
pval < 0.05

True

Next, run a hypothesis test to determine which pairs of those breeds (whippets, terriers, and pitbulls) weigh different amounts on average.

In [7]:
# Keep only the rows for the three breeds compared in the ANOVA
dogs_wtp = dogs.loc[dogs.breed.isin(['whippet', 'terrier', 'pitbull'])]

# Tukey's range test: pairwise comparison of mean weight, grouped by breed
from statsmodels.stats.multicomp import pairwise_tukeyhsd
output = pairwise_tukeyhsd(endog=dogs_wtp.weight, groups=dogs_wtp.breed)

# Print the summary table of pairwise results
print(output)

  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower    upper  reject
-------------------------------------------------------
pitbull terrier   -13.24    0.0 -16.7278 -9.7522   True
pitbull whippet    -3.34 0.0638  -6.8278  0.1478  False
terrier whippet      9.9    0.0   6.4122 13.3878   True
-------------------------------------------------------


In [8]:
# Keep only the rows for poodles and shihtzus
dogs_ps = dogs.loc[dogs.breed.isin(['poodle', 'shihtzu'])]

# Cross-tabulate counts with color as rows and breed as columns
Xtab = pd.crosstab(index=dogs_ps.color, columns=dogs_ps.breed)
print(Xtab)

breed  poodle  shihtzu
color                 
black      17       10
brown      13       36
gold        8        6
grey       52       41
white      10        7


Null: There is not an association between breed (poodle vs. shihtzu) and color.
    
Alternative: There is an association between breed (poodle vs. shihtzu) and color.

In [9]:
# Chi-square test on the color-by-breed contingency table
from scipy.stats import chi2_contingency

# Unpack the four results; only the p-value is used below
chi2, pval, dof, exp = chi2_contingency(observed=Xtab)

# True means the result is significant at the 0.05 level
pval < 0.05

True