In [174]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
import tensorflow as tf
from tensorflow.python.framework import ops

In [240]:
def _non_negative_sales(raw_value):
    """Parse a raw `unit_sales` field as a float; values that are not positive become 0."""
    parsed = float(raw_value)
    return parsed if parsed > 0 else 0


# Only the first 1,000,000 rows of the training file are read.
# 'onpromotion' is kept as text rather than letting pandas infer a dtype.
train_sample = pd.read_csv(
    '/home/sophia/Downloads/grocery_dataFiles/train.csv',
    nrows=1000000,
    dtype={'onpromotion': str},
    converters={'unit_sales': _non_negative_sales},
)

# Side tables: store attributes, item attributes, daily store transactions, daily oil price.
stores = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/stores.csv')
items = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/items.csv')
transactions = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/transactions.csv')
oil = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/oil.csv')
train_sample.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,


In [241]:
# Report the shape of every loaded table first, then the column dtypes of every table.
loaded_tables = [
    ('train_sample', train_sample),
    ('stores', stores),
    ('items', items),
    ('oil', oil),
    ('transactions', transactions),
]

for table_name, table in loaded_tables:
    print('dimention of ' + table_name + ' = ', table.shape)

for table_name, table in loaded_tables:
    print('dtypes of ' + table_name + ' = ', table.dtypes)

dimention of train_sample =  (1000000, 6)
dimention of stores =  (54, 5)
dimention of items =  (4100, 4)
dimention of oil =  (1218, 2)
dimention of transactions =  (83488, 3)
dtypes of train_sample =  id               int64
date            object
store_nbr        int64
item_nbr         int64
unit_sales     float64
onpromotion     object
dtype: object
dtypes of stores =  store_nbr     int64
city         object
state        object
type         object
cluster       int64
dtype: object
dtypes of items =  item_nbr       int64
family        object
class          int64
perishable     int64
dtype: object
dtypes of oil =  date           object
dcoilwtico    float64
dtype: object
dtypes of transactions =  date            object
store_nbr        int64
transactions     int64
dtype: object


In [242]:
# Build the modelling frame: attach to every sales row the oil price for its date,
# the attributes of its store, the attributes of its item, and the store's
# transaction count for that date.
join1 = train_sample.join(oil.set_index('date'), on='date')
join2 = join1.join(stores.set_index('store_nbr'), on='store_nbr', rsuffix='_store')
join3 = join2.join(items.set_index('item_nbr'), on='item_nbr', rsuffix='_item')
train = join3.join(transactions.set_index(['date', 'store_nbr']), on=['date', 'store_nbr'],
                   rsuffix='_store')

holidays = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/holidays_events.csv')
# A holiday row counts as an event unless it was transferred or is a 'Work Day' row.
holidays['holiday_event'] = holidays['transferred'].ne(True) & holidays['type'].ne('Work Day')

# Match holiday rows to sales rows on a narrow helper frame instead of joining them
# onto `train` directly: a date that carries several holiday rows would otherwise
# duplicate the sales rows for that date. The matches are reduced back to one flag
# per sales row below with groupby(level=0).any().
holiday_matches = train[['date', 'city', 'state']].join(holidays.set_index('date'), on='date')

# Dates with no holiday row come back as NaN from the join. eq(True) maps NaN to
# False, whereas astype('bool') would silently turn NaN into True.
celebrated = holiday_matches['holiday_event'].eq(True)
locale = holiday_matches['locale']
locale_name = holiday_matches['locale_name']

# A celebrated holiday applies to a store when it is
#   - National, or
#   - Regional and the store's state equals locale_name, or
#   - Local and the store's city equals locale_name.
applies_to_store = celebrated & (
    (locale == 'National')
    | ((locale == 'Regional') & (holiday_matches['state'] == locale_name))
    | ((locale == 'Local') & (holiday_matches['city'] == locale_name))
)

# One flag per sales row: True if any holiday row matched for that row.
train['holiday_event'] = celebrated.groupby(level=0).any()
train['holiday_loc_spec'] = applies_to_store.groupby(level=0).any()

# Calendar features are cast to object so that they are one-hot encoded
# (rather than treated as numbers) by the later get_dummies cell.
dates = pd.DatetimeIndex(train['date'])
train['year'] = dates.year.astype('object')
train['month'] = dates.month.astype('object')
train['dayofweek'] = dates.dayofweek.astype('object')

# Target used for modelling: log(unit_sales + 1).
train['unit_sales_log'] = np.log1p(train['unit_sales'])
print(train['unit_sales_log'].describe())
print(train.columns)

# Fix dtypes: perishable is a flag; cluster and class are categorical codes.
train['perishable'] = train['perishable'].astype('bool')
train['cluster'] = train['cluster'].astype('object')
train['class'] = train['class'].astype('object')


count    1000000.000000
mean           1.785632
std            0.874475
min            0.000000
25%            1.098612
50%            1.609438
75%            2.302585
max            8.560403
Name: unit_sales_log, dtype: float64
Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion',
       'dcoilwtico', 'city', 'state', 'type', 'cluster', 'family', 'class',
       'perishable', 'transactions', 'holiday_event', 'holiday_loc_spec',
       'year', 'month', 'dayofweek', 'unit_sales_log'],
      dtype='object')


In [243]:
# Columns carried over unchanged.
passthrough_columns = ['unit_sales', 'unit_sales_log', 'dcoilwtico',
                       'perishable', 'transactions', 'holiday_loc_spec', 'onpromotion']
# Columns expanded into indicator columns, each prefixed with its own column name.
categorical_columns = ['city', 'state', 'type', 'cluster', 'family',
                       'class', 'year', 'month', 'dayofweek']

train_none_cate = train.loc[:, passthrough_columns]
train_cate = pd.get_dummies(train.loc[:, categorical_columns], prefix=categorical_columns)

# `train` is rebound to the passthrough columns followed by the indicator columns.
train = pd.concat([train_none_cate, train_cate], axis=1)
print(train_cate.shape)

(1000000, 262)


In [187]:
# Plain-text preview of the first rows of the combined feature frame.
preview = train.head()
print(preview)



   unit_sales  unit_sales_log  dcoilwtico  perishable  transactions  \
0         7.0        2.079442         NaN        True           770   
1         1.0        0.693147         NaN       False           770   
2         2.0        1.098612         NaN       False           770   
3         1.0        0.693147         NaN       False           770   
4         1.0        0.693147         NaN        True           770   

   holiday_loc_spec onpromotion  city_Quito  city_Salinas  city_Santo Domingo  \
0              True         NaN           0             1                   0   
1              True         NaN           0             1                   0   
2              True         NaN           0             1                   0   
3              True         NaN           0             1                   0   
4              True         NaN           0             1                   0   

      ...       class_6810  class_6918  class_6920  class_7002  class_7016  \
0     ..

In [246]:
# Features are every column from 'dcoilwtico' onwards; the target is the
# log-transformed sales. .copy() gives X its own data, so the edits below neither
# write through to `train` nor trigger SettingWithCopyWarning.
X = train.loc[:, 'dcoilwtico':].copy()
y = train.loc[:, 'unit_sales_log']

# 'onpromotion' was read from the CSV as text, so astype(bool) would map every
# non-empty string (including 'False') to True. Compare against the text 'true'
# instead; missing values become False.
X['onpromotion'] = X['onpromotion'].astype(str).str.lower().eq('true')

print('infomation about missing value')
print(X.isnull().sum())
print(X.head())
print(type(X))
X_columns = X.columns

# Fill the remaining NaNs with the column mean.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is
# its replacement. The fallback keeps the cell working on older installations.
# NOTE(review): the means are computed on all rows, before the train/test split
# made in a later cell.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Cast to float first so the imputer receives a homogeneous numeric array.
X = imp.fit_transform(X.astype('float64'))

# The imputer returns a numpy array, hence np.isnan here and the re-wrap below.
print('after impute number of missing value: ', np.isnan(X).sum())

X = pd.DataFrame(X, columns=X_columns)
X.head()



infomation about missing value
dcoilwtico            327733
perishable                 0
transactions               0
holiday_loc_spec           0
onpromotion                0
city_Ambato                0
city_Babahoyo              0
city_Cayambe               0
city_Cuenca                0
city_Daule                 0
city_El Carmen             0
city_Esmeraldas            0
city_Guaranda              0
city_Guayaquil             0
city_Ibarra                0
city_Latacunga             0
city_Loja                  0
city_Machala               0
city_Playas                0
city_Quevedo               0
city_Quito                 0
city_Riobamba              0
city_Salinas               0
city_Santo Domingo         0
state_Azuay                0
state_Bolivar              0
state_Chimborazo           0
state_Cotopaxi             0
state_El Oro               0
state_Esmeraldas           0
                       ...  
class_3044                 0
class_3046                 0
class_3060  

Unnamed: 0,dcoilwtico,perishable,transactions,holiday_loc_spec,onpromotion,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,...,class_7034,year_2013,month_1,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6
0,94.153816,1.0,770.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,94.153816,0.0,770.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,94.153816,0.0,770.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,94.153816,0.0,770.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,94.153816,1.0,770.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [247]:
# Only StandardScaler is needed here. The previously imported (and unused) Imputer
# no longer exists in scikit-learn >= 0.22 and made this cell fail on import.
from sklearn.preprocessing import StandardScaler

# Standardise the two continuous features to zero mean and unit variance.
# StandardScaler scales each column independently, so fitting both columns in one
# call yields the same values as fitting them one at a time, and `std` keeps the
# statistics of both columns instead of only the last one fitted.
scaled_columns = ['dcoilwtico', 'transactions']
std = StandardScaler()
X[scaled_columns] = std.fit_transform(X[scaled_columns].values)

In [248]:
# Summary statistics of the 'dcoilwtico' column (as a Series) and of the
# 'transactions' column (as a one-column DataFrame).
oil_summary = X['dcoilwtico'].describe()
transactions_summary = X[['transactions']].describe()
print(oil_summary)
print(transactions_summary)


count    1.000000e+06
mean     5.856023e-14
std      1.000001e+00
min     -1.379772e+00
25%     -1.041769e+00
50%      4.968951e-14
75%      1.470713e-01
max      2.256680e+00
Name: dcoilwtico, dtype: float64
       transactions
count  1.000000e+06
mean  -8.645884e-17
std    1.000001e+00
min   -1.390118e+00
25%   -7.422046e-01
50%   -2.928455e-01
75%    5.337663e-01
max    3.724216e+00


In [None]:
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_squared_log_error


# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# NOTE(review): kernel SVR may be very slow on a training set of this size;
# consider LinearSVR or a subsample if this cell does not finish - confirm.
model1 = SVR(kernel='linear', C=1)
model2 = DecisionTreeRegressor(max_depth=2)

model1.fit(X_train, y_train)
y_hat1_train = model1.predict(X_train)
y_hat1_test = model1.predict(X_test)

model2.fit(X_train, y_train)
y_hat2_train = model2.predict(X_train)
# Fixed: the tree's test predictions were previously taken from model1.
y_hat2_test = model2.predict(X_test)

# y is already log(unit_sales + 1), so the plain mean squared error on y is the
# mean squared log error of the raw sales. mean_squared_log_error would apply the
# log a second time and raises on negative predictions.
MSLE1_train = mean_squared_error(y_train, y_hat1_train)
MSLE2_train = mean_squared_error(y_train, y_hat2_train)
MSLE1_test = mean_squared_error(y_test, y_hat1_test)
MSLE2_test = mean_squared_error(y_test, y_hat2_test)

# Fixed: the list previously repeated MSLE1_train and never showed MSLE1_test.
print('SVR train MSLE, SVR test MSLE, tree train MSLE, tree test MSLE is ',
      [MSLE1_train, MSLE1_test, MSLE2_train, MSLE2_test])

In [None]:
# Work with the holiday file. The goal is to determine whether the city of a store
# was celebrating a holiday event on a particular date.
holidays = pd.read_csv('/home/sophia/Downloads/grocery_dataFiles/holidays_events.csv')
list1 = list(holidays.locale_name.unique())
list2 = list(stores.city.unique())
print('holiday locale names:', sorted(list1))
print('store city names:', sorted(list2))
print('states name: ', sorted(list(stores.state.unique())))
# `normed` was removed from plt.hist in Matplotlib 3.1; `density=True` replaces it.
n, bins, patches = plt.hist(holidays['type'], 50, density=True, facecolor='green', alpha=0.75)
plt.show()


# Step one: a transferred holiday was not celebrated on that date, so it is
#           labelled as not a holiday_event.
# Step two: a holiday that was not transferred remains a holiday_event.
# Step three: 'Work Day' rows are meant to make up for a Bridge day, so they are
#             not celebrated as a holiday_event.
holidays.loc[holidays['transferred'] == True, 'holiday_event'] = False
holidays.loc[holidays['transferred'] != True, 'holiday_event'] = True
holidays.loc[holidays['type'] == 'Work Day', 'holiday_event'] = False
# A bare expression in the middle of a cell is not displayed, so print the rows.
print(holidays[holidays['type'] == 'Work Day'])
print(holidays.dtypes)

In [None]:
# Visual check of the holiday_event labelling: count the holiday rows per
# (type, holiday_event) pair and draw one stacked bar per event type.
plt.style.use('dark_background')
holiday_locale_type = holidays.groupby(['type', 'holiday_event']).size()
counts_by_label = holiday_locale_type.unstack()
ax = counts_by_label.plot(kind='bar', stacked=True, figsize=(12, 10), grid=False)
ax.set_title('Stacked Barplot of holiday_event label against event type')
ax.set_ylabel('Count of entries')
plt.show()
