In [1]:
import pandas as pd
import numpy as np
import random 

import statsmodels.formula.api as sm
import sys
from matplotlib import pyplot as plt
%matplotlib inline 

sys.version

'3.6.3 |Anaconda custom (64-bit)| (default, Oct  6 2017, 12:04:38) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [81]:
!pip show statsmodels

Name: statsmodels
Version: 0.8.0
Summary: Statistical computations and models for Python
Home-page: http://www.statsmodels.org/
Author: Skipper Seabold, Josef Perktold
Author-email: pystatsmodels@googlegroups.com
License: BSD License
Location: /anaconda3/lib/python3.6/site-packages
Requires: 
[33mYou are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [89]:
### Load the raw CSV and keep only fully populated rows
# NOTE(review): the path is absolute and the recorded output of this cell is a
# FileNotFoundError -- confirm where the file lives before re-running.
# dropna(how='any') discards every row that has at least one missing value.
d = pd.read_csv('/data/raw_data.csv').dropna(how='any')
d.head()

FileNotFoundError: File b'/data/raw_data.csv' does not exist

In [65]:
# Dimensions of d as a (rows, columns) tuple.
d.shape

(206, 9)

In [66]:
### Summary statistics of amount_used for the treatment group (Treatment == 1)
d['amount_used'][d['Treatment'] == 1].describe()

count     77.000000
mean      12.051948
std       31.195013
min       -4.000000
25%        3.000000
50%        7.000000
75%       12.000000
max      273.000000
Name: amount_used, dtype: float64

In [67]:
### Summary statistics of amount_used for the control group (Treatment == 0)
d['amount_used'][d['Treatment'] == 0].describe()

count    129.000000
mean       7.736434
std       27.301419
min     -158.000000
25%        1.000000
50%        5.000000
75%        9.000000
max      203.000000
Name: amount_used, dtype: float64

In [68]:
### Difference in mean amount_used: treatment (Treatment == 1) minus control (Treatment == 0)
# Series.mean() yields the same value as describe()['mean'] without computing
# the count, std and quantiles that describe() also produces.
diff_mean = (
    d['amount_used'][d['Treatment'] == 1].mean()
    - d['amount_used'][d['Treatment'] == 0].mean()
)
diff_mean

4.3155139434209202

In [70]:
### Assign an integer code to every distinct Location value
# value_counts() lists locations by descending row count, so code 0 goes to
# the location with the most rows.
unique_locs = d['Location'].value_counts().index.tolist()
loc_map = list(range(len(unique_locs)))
loc_set = {loc: code for loc, code in zip(unique_locs, loc_map)}

# Any Location that is not a key of loc_set falls back to -1.
d['numeric_location'] = d['Location'].map(lambda loc: loc_set.get(loc, -1))
d.head()

Unnamed: 0,Date,Location,Treatment,as_measured,amount_used,collector,google_second_wave_trend,google_drug_resistant_trend,is_suspicious,numeric_location
0,3/10/18,eca- front desk,0.0,1594,0.0,Nikki,0.0,0.0,0.0,0
1,3/11/18,eca- front desk,0.0,1583,11.0,Nikki,0.0,0.0,0.0,0
2,3/12/18,eca- front desk,0.0,1573,10.0,Nikki,0.0,0.0,0.0,0
3,3/13/18,eca- front desk,0.0,1566,7.0,Nikki,0.0,48.0,0.0,0
4,3/14/18,eca- front desk,0.0,1549,17.0,Nikki,0.0,0.0,0.0,0


In [82]:
# Display the Location -> integer code dictionary.
loc_set

{'Avalon - Gym': 9,
 'Avalon - Lobby': 10,
 'concur - 10th floor': 7,
 'concur - 16th floor': 2,
 'concur - 3rd floor': 5,
 'concur - 5th floor': 8,
 'concur -12th floor': 6,
 'eca- front desk': 0,
 'ped - well': 3,
 'ped- not well': 4,
 'yarn shop': 1}

In [83]:
# Show the last five rows of d.
d.tail()

Unnamed: 0,Date,Location,Treatment,as_measured,amount_used,collector,google_second_wave_trend,google_drug_resistant_trend,is_suspicious,numeric_location
212,4/2/18,Avalon - Gym,1.0,1799,7.0,Carmen,46.0,0.0,0.0,9
213,4/3/18,Avalon - Gym,1.0,1787,12.0,Carmen,0.0,0.0,0.0,9
214,4/4/18,Avalon - Gym,1.0,1776,11.0,Carmen,14.0,49.0,0.0,9
215,4/5/18,Avalon - Gym,1.0,1767,9.0,Carmen,22.0,40.0,0.0,9
216,4/6/18,Avalon - Gym,1.0,1763,4.0,Carmen,11.0,50.0,0.0,9


In [72]:
# Number of rows per numeric location code, most frequent first.
d['numeric_location'].value_counts()

0     28
1     26
5     20
4     20
3     20
2     20
7     19
6     19
8     16
9     15
10     3
Name: numeric_location, dtype: int64

In [85]:
### Baseline: plain OLS of amount_used on Treatment
### (no clustering and no outlier removal)
# The recorded run of this cell (statsmodels 0.8.0) labelled its statistics
# z / P>|z| even though use_t=True was passed. Omitting the argument relies on
# the default, which is t-based inference for a nonrobust OLS fit.
ols_revised = sm.ols(
    formula="amount_used ~ Treatment",
    data=d,
).fit()
ols_revised.summary()

0,1,2,3
Dep. Variable:,amount_used,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.082
Date:,"Thu, 12 Apr 2018",Prob (F-statistic):,0.3
Time:,18:14:47,Log-Likelihood:,-983.63
No. Observations:,206,AIC:,1971.0
Df Residuals:,204,BIC:,1978.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,7.7364,2.537,3.050,0.002,2.764,12.709
Treatment,4.3155,4.149,1.040,0.298,-3.817,12.448

0,1,2,3
Omnibus:,255.15,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21728.736
Skew:,4.912,Prob(JB):,0.0
Kurtosis:,52.346,Cond. No.,2.43


In [75]:
### OLS of amount_used on Treatment with standard errors clustered by location
# Re-binds ols_revised, replacing whatever fit it held before.
ols_revised = sm.ols(
    formula="amount_used ~ Treatment",
    data=d,
).fit(
    cov_type='cluster',
    cov_kwds={'groups': d['numeric_location']},  # one cluster per location code
    use_t=True,  # request t-based rather than z-based inference
)
ols_revised.summary()

0,1,2,3
Dep. Variable:,amount_used,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,0.8586
Date:,"Thu, 12 Apr 2018",Prob (F-statistic):,0.376
Time:,17:48:07,Log-Likelihood:,-983.63
No. Observations:,206,AIC:,1971.0
Df Residuals:,204,BIC:,1978.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.7364,3.153,2.454,0.034,0.712,14.761
Treatment,4.3155,4.657,0.927,0.376,-6.061,14.692

0,1,2,3
Omnibus:,255.15,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21728.736
Skew:,4.912,Prob(JB):,0.0
Kurtosis:,52.346,Cond. No.,2.43


In [76]:
### The p-value for Treatment (0.376) is above the conventional 0.05 significance level, so the estimated treatment effect is not statistically significant

In [78]:
### Clustered OLS adding google_second_wave_trend and its interaction with Treatment
# In the formula language Treatment*google_second_wave_trend already stands for
# both main effects plus their interaction, so the separately listed main
# effects are redundant; the fit has four terms including the intercept.
ols_revised = sm.ols(
    formula="amount_used ~ Treatment + google_second_wave_trend + Treatment*google_second_wave_trend",
    data=d,
).fit(
    cov_type='cluster',
    cov_kwds={'groups': d['numeric_location']},  # one cluster per location code
    use_t=True,  # request t-based rather than z-based inference
)
ols_revised.summary()

0,1,2,3
Dep. Variable:,amount_used,R-squared:,0.007
Model:,OLS,Adj. R-squared:,-0.008
Method:,Least Squares,F-statistic:,0.8464
Date:,"Thu, 12 Apr 2018",Prob (F-statistic):,0.499
Time:,17:50:15,Log-Likelihood:,-983.45
No. Observations:,206,AIC:,1975.0
Df Residuals:,202,BIC:,1988.0
Df Model:,3,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9850,2.204,3.170,0.010,2.075,11.895
Treatment,4.6082,3.066,1.503,0.164,-2.222,11.439
google_second_wave_trend,0.0493,0.132,0.372,0.718,-0.246,0.345
Treatment:google_second_wave_trend,-0.0336,0.157,-0.214,0.834,-0.383,0.316

0,1,2,3
Omnibus:,251.971,Durbin-Watson:,2.032
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21274.093
Skew:,4.803,Prob(JB):,0.0
Kurtosis:,51.849,Cond. No.,121.0


In [None]:
###next steps:
### add in dummies for location and then add them into the OLS formula
