In [None]:
"""
https://github.com/rlowd/python-bigdata/blob/master/intro2stats/
notebooks/Hypothesis-Testing-executed-no-solutions.ipynb

Fill-in-the-blank notebook
"""

In [7]:
import numpy as np
import pandas as pd
from scipy import stats
import datetime as dt
import matplotlib as mpl
%matplotlib inline

In [8]:
import seaborn as sns
sns.set(color_codes = True)

In [9]:
weed_pd = pd.read_csv("https://raw.githubusercontent.com/rlowd/python-bigdata/master/intro2stats/data/Weed_Price.csv",
                     parse_dates = [-1])
weed_pd.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01


In [11]:
# Parse the date column to create two new columns: month & year 

weed_pd['month'] = weed_pd.date.dt.month
weed_pd['year'] = weed_pd.date.dt.year

In [12]:
weed_pd.head()

Unnamed: 0,State,HighQ,HighQN,MedQ,MedQN,LowQ,LowQN,date,month,year
0,Alabama,339.06,1042,198.64,933,149.49,123,2014-01-01,1,2014
1,Alaska,288.75,252,260.6,297,388.58,26,2014-01-01,1,2014
2,Arizona,303.31,1941,209.35,1625,189.45,222,2014-01-01,1,2014
3,Arkansas,361.85,576,185.62,544,125.87,112,2014-01-01,1,2014
4,California,248.78,12096,193.56,12812,192.92,778,2014-01-01,1,2014


In [23]:
# Parse the weed_pd DataFrame so it contains only entries from CA in 2014

weed_ca_2014 = weed_pd[(weed_pd.year == 2014) & (weed_pd.State == 'California')]

In [24]:
print(weed_ca_2014['State'].unique())
print(weed_ca_2014['year'].unique())

['California']
[2014]


In [25]:
# Find the mean and standard deviation of high quality weed's price in CA

ca_highq_mean = weed_ca_2014.HighQ.mean()
ca_highq_stdv = weed_ca_2014.HighQ.std()

print("Mean: ", ca_highq_mean, "\n Standard Deviation: ", ca_highq_stdv)

Mean:  245.8942307692309 
 Standard Deviation:  1.289907939371412


In [28]:
# Calculate the 95% confidence interval on the mean

highq_conf_int = stats.norm.interval(0.95, loc=ca_highq_mean,
                                     scale= ca_highq_stdv / np.sqrt(len(weed_ca_2014)))
print(highq_conf_int)

(245.761718492726, 246.02674304573577)


In [31]:
# Are high quality weed prices in Jan 2014 significantly higher than in Jan 2015?
# Make two numpy arrays by subetting the original DataFrame

# Get the data
weed_ca_jan2014 = np.array(weed_pd.HighQ[(weed_pd.year == 2014) & (weed_pd.month == 1)
                                         & (weed_pd.State == 'California')])
weed_ca_jan2015 = np.array(weed_pd.HighQ[(weed_pd.year == 2015) & (weed_pd.month == 1)
                                         & (weed_pd.State == 'California')])

print(type(weed_ca_jan2014))
weed_ca_jan2014

<class 'numpy.ndarray'>


array([248.78, 248.67, 248.67, 248.65, 248.68, 248.68, 248.64, 248.63,
       248.58, 248.56, 248.54, 248.47, 248.45, 248.41, 248.48, 248.44,
       248.43, 248.49, 248.44, 248.39, 248.33, 248.28, 248.25, 248.22,
       248.18, 248.21, 248.24, 248.28, 248.23, 248.28, 248.23])

In [32]:
# Find the mean value for each of the arrays

jan2014_mean = weed_ca_jan2014.mean()
jan2015_mean = weed_ca_jan2015.mean()

print("Mean for Jan. 2014: ", jan2014_mean, "\n Mean for Jan. 2015: ", jan2015_mean)

Mean for Jan. 2014:  248.4454838709677 
 Mean for Jan. 2015:  243.60225806451612


In [36]:
# Calculate the effect size of one year on the mean value of weed prices in California
# What difference does one year make in terms of average weed price?

effect_size = jan2015_mean - jan2014_mean
print("Effect size: ", abs(effect_size))

Effect size:  4.843225806451585


In [37]:
# Now test the null hypothesis: Mean prices aren't significantly different
# Perform a t-test and determine the p-value

stats.ttest_ind(weed_ca_jan2014, weed_ca_jan2015, equal_var=True)

Ttest_indResult(statistic=98.01132523815805, pvalue=6.297971818508403e-68)

In [38]:
# Since the p-value is almost 0, we can reject the null hypothesis
# Conclusion: The price difference is significant

In [None]:
"""
t-test challenge: Determine if prices of medium quality weed
for Jan and Feb 2015 are significantly different for New York
"""

In [39]:
ny_medq_jan = weed_pd.MedQ[(weed_pd.State == 'New York') & (weed_pd.year == 2015)
                           & (weed_pd.month == 1)]
ny_medq_feb = weed_pd.MedQ[((weed_pd.State == 'New York') & (weed_pd.year == 2015)
                            & (weed_pd.month == 2))]

In [40]:
stats.ttest_ind(ny_medq_jan, ny_medq_feb, equal_var=True)

Ttest_indResult(statistic=6.889244564601663, pvalue=1.4876340462804882e-08)

In [41]:
# Result: The price difference for medium quality weed in NY is significant between Jan and Feb.

In [None]:
"""
Impact of regulation and deregulation on weed prices in Alaska and Maryland
"""

In [44]:
# In Nov 2014, Alaska legalized it: Did price change significantly from Oct to Dec?
ak_highq_oct = weed_pd.HighQ[(weed_pd.State == 'Alaska') & (weed_pd.year == 2014) 
                            & (weed_pd.month == 10)]
ak_highq_dec = weed_pd.HighQ[(weed_pd.State == 'Alaska') & (weed_pd.year == 2014) 
                            & (weed_pd.month == 12)]

stats.ttest_ind(ak_highq_oct, ak_highq_dec, equal_var=True)

Ttest_indResult(statistic=-15.14330239573019, pvalue=3.655192936628986e-22)

In [46]:
# Yes, prices did change significantly after it was legalized.

ak_effect_size = abs(ak_highq_oct.mean() - ak_highq_dec.mean())
print(ak_effect_size)

10.284193548387236


In [47]:
# Maryland decriminalized it in Oct 2014: Did prices change significantly compared to Sep 2014?
md_highq_oct = weed_pd.HighQ[(weed_pd.State == 'Maryland') & (weed_pd.year == 2014) 
                            & (weed_pd.month == 10)]
md_highq_sep = weed_pd.HighQ[(weed_pd.State == 'Maryland') & (weed_pd.year == 2014) 
                            & (weed_pd.month == 9)]

stats.ttest_ind(md_highq_oct, md_highq_sep, equal_var=True)

Ttest_indResult(statistic=-11.79234284307281, pvalue=3.742582041613048e-17)

In [48]:
# Yes, prices did change significantly after decriminalization

md_effect_size = abs(md_highq_oct.mean() - md_highq_sep.mean())
print(md_effect_size)

0.8180752688171538


In [49]:
# However, though significant, the difference is not especially important.

In [None]:
"""
Chi-square tests for goodness of fit:
Assuming the proportion of people who bough High, Med, and Low quality weed in Jan 2014
is the expected proportion, find out if the proportion of people who bought weed in
Jan 2015 conformed to that norm.
"""

In [52]:
# Find the total numbers of people who bought High/Med/Low quality weed in Jan of each year

weed_jan2014 = weed_pd[(weed_pd.year==2014) & (weed_pd.month==1)][["HighQN", "MedQN", "LowQN"]]
weed_jan2015 = weed_pd[(weed_pd.year==2015) & (weed_pd.month==1)][["HighQN", "MedQN", "LowQN"]]


Expected = np.array(weed_jan2014.apply(sum, axis=0))
Observed = np.array(weed_jan2015.apply(sum, axis=0))

In [53]:
print("Expected: ", Expected, "\n Observed: ", Observed)

Expected:  [2918004 2644757  263958] 
 Observed:  [4057716 4035049  358088]


In [54]:
# Print the proportions of High/Med/Low quality weed expected and observed

print("Expected:", Expected/np.sum(Expected.astype(float)), "\n" , 
      "Observed:", Observed/np.sum(Observed.astype(float)))

Expected: [0.5007971  0.45390159 0.04530131] 
 Observed: [0.48015461 0.47747239 0.042373  ]


In [55]:
# Do the Chi-Squared test

stats.chisquare(Observed, Expected)

Power_divergenceResult(statistic=1209562.2775169075, pvalue=0.0)

In [None]:
# Result: we reject the null hypothesis; the proportions in Jan 2015 are different than expected.