In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import random 

import statsmodels.formula.api as sm
import math
from sklearn import linear_model

from matplotlib import pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
%matplotlib inline 

import datetime
import sys

### for debugging purposes
###sys.version
### should say:
###'3.6.3 |Anaconda custom (64-bit)| (default, Oct  6 2017, 12:04:38) \n
###[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [5]:
### begin ETL
### Load the raw CSV, drop incomplete rows, and encode `location` both as an
### integer code (`numeric_location`) and as one-hot `loc_<code>` columns.

d = pd.read_csv('../data/raw_data.csv')
# Take an explicit copy: the column assignments below must write to an
# independent frame, otherwise pandas emits SettingWithCopyWarning because
# it cannot tell whether `d_working` is still tied to `d`.
d_working = d.dropna(how='any').copy()


### create numeric mapping for locations
# Codes are derived from the unfiltered frame `d` (value_counts() lists the
# most frequent location first), so a location whose rows were all dropped
# above still consumes a code and the codes present in `d_working` may have gaps.
unique_locs = d.location.value_counts().index.tolist()
loc_map = list(range(len(unique_locs)))
loc_set = dict(zip(unique_locs, loc_map))
#loc_set

# Any location absent from the mapping is encoded with the sentinel -1.
d_working['numeric_location'] = d_working['location'].map(lambda x: loc_set.get(x, -1))

### get dummies for locations
# One indicator column per location code, named loc_<code>.
dum = pd.get_dummies(d_working['numeric_location'])
dum.columns = ['loc_%s' % (x) for x in dum.columns.tolist()]
d_working = pd.concat([d_working, dum], axis = 1)

### create week aggregates

def week_num(date_string):
    """Return the ISO week number of a date written as month/day/2-digit-year.

    Parameters
    ----------
    date_string : str
        Date formatted as ``'%m/%d/%y'``, e.g. ``'3/10/18'``.

    Returns
    -------
    int
        ISO calendar week number (weeks start on Monday).

    Raises
    ------
    ValueError
        If ``date_string`` does not match ``'%m/%d/%y'``.

    Only the week component is returned; the ISO year is discarded, so dates
    from different years can yield the same number.
    """
    parsed = datetime.datetime.strptime(date_string, '%m/%d/%y')
    _iso_year, iso_week, _iso_weekday = parsed.isocalendar()
    return iso_week

# Tag every daily row with the week number parsed from its date string.
d_working['week_number'] = d_working['date'].map(week_num)

# Preview the first rows of the enriched daily frame.
d_working.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,date,location,location_type,treatment,as_measured,amount_used,collector,google_second_wave_trend,google_drug_resistant_trend,stagger_second_wave_trend,...,loc_2,loc_3,loc_4,loc_5,loc_6,loc_7,loc_8,loc_9,loc_10,week_number
0,3/10/18,eca- front desk,gym,0.0,1594,0.0,Nikki,0,0,0,...,0,0,0,0,0,0,0,0,0,10
1,3/11/18,eca- front desk,gym,0.0,1583,11.0,Nikki,0,44,0,...,0,0,0,0,0,0,0,0,0,10
2,3/12/18,eca- front desk,gym,0.0,1573,10.0,Nikki,0,0,0,...,0,0,0,0,0,0,0,0,0,11
3,3/13/18,eca- front desk,gym,0.0,1566,7.0,Nikki,0,62,0,...,0,0,0,0,0,0,0,0,0,11
4,3/14/18,eca- front desk,gym,0.0,1549,17.0,Nikki,0,12,0,...,0,0,0,0,0,0,0,0,0,11


In [6]:
### aggregate daily rows into weekly totals per (week, treatment, location)

group_keys = ['week_number', 'treatment', 'numeric_location']
measure_cols = ['amount_used',
       'google_second_wave_trend', 'google_drug_resistant_trend',
       'is_suspicious', 'stagger_second_wave_trend', 'stagger_drug_resistant_trend']

# Select the measure columns *before* summing: a bare groupby(...).sum() also
# tries to aggregate the remaining columns of d_working (dates, names, daily
# dummies), which is wasted work and depends on pandas silently discarding
# non-numeric columns -- behaviour that newer pandas versions no longer provide.
# as_index=False keeps the grouping keys as ordinary columns, so no in-place
# reset_index is needed and the cell can be re-run safely.
d_week = d_working.groupby(group_keys, as_index=False)[measure_cols].sum()

# Rebuild the one-hot location columns at weekly granularity (loc_<code>);
# the daily dummies are deliberately not carried over through the sum.
loc_dum = pd.get_dummies(d_week['numeric_location'])
loc_dum.columns = ['loc_%s' % (x) for x in loc_dum.columns.tolist()]
d_week = pd.concat([d_week, loc_dum], axis = 1)

d_week.head()

Unnamed: 0,week_number,treatment,numeric_location,amount_used,google_second_wave_trend,google_drug_resistant_trend,is_suspicious,stagger_second_wave_trend,stagger_drug_resistant_trend,loc_0,loc_1,loc_2,loc_3,loc_4,loc_5,loc_6,loc_7,loc_8,loc_9,loc_10
0,10,0.0,0,4.0,0,52,0.0,0,52,1,0,0,0,0,0,0,0,0,0,0
1,10,0.0,1,11.0,0,44,0.0,0,39,0,1,0,0,0,0,0,0,0,0,0
2,10,0.0,4,73.0,0,139,0.0,0,115,0,0,0,0,1,0,0,0,0,0,0
3,10,0.0,6,53.0,0,139,0.0,0,115,0,0,0,0,0,0,1,0,0,0,0
4,10,0.0,10,4.0,0,44,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
