In [18]:
import h5py as h5
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR
from sklearn import preprocessing
import calendar as cal
import seaborn as sns
import datetime as dt
%matplotlib inline

In [2]:
path = '/home/sadat/Documents/DSE/Data/yogurt.h5'
df = pd.read_hdf(path, key='panel')
df.head()

Unnamed: 0,COLUPC,DOLLARS,IRI_KEY,MINUTE,OUTLET,PANID,UNITS,WEEK,file
0,17047000300,0.99,1097117,5068.0,DK,3138826,1.0,1514,yogurt_PANEL_DK_1479_1530.DAT
1,17047000302,0.89,1097117,2224.0,DK,3138826,1.0,1489,yogurt_PANEL_DK_1479_1530.DAT
2,17047000650,0.89,1097117,6551.0,DK,3138826,1.0,1488,yogurt_PANEL_DK_1479_1530.DAT
3,17047000650,2.97,1097117,6177.0,DK,3182048,3.0,1526,yogurt_PANEL_DK_1479_1530.DAT
4,17047001372,1.49,1097117,973.0,DK,3184911,1.0,1495,yogurt_PANEL_DK_1479_1530.DAT


#Feature Engineering

####Fix adjust time features

In [3]:
def get_datetime(week, minute):
    week, minute = int(week), int(minute)
    # Reference orginal date is September 3, 1979
    ref_orginal = dt.date.toordinal(dt.date(1979, 9, 3))
    # Convert minutes to days, hours, minutes
    days, hours, mins = (minute / 1440), ((minute % 1440) / 60), ((minute % 1440) % 60)
    # Convert from ordinal.
    return dt.datetime.fromordinal(ref_orginal + (week - 1) * 7 + days).replace(
        hour=hours, minute=mins)

week = df['WEEK'].values
minute = df['MINUTE'].fillna(df['MINUTE'].median()).values
datetime = []

for w,m in zip(week,minute):
    datetime.append(get_datetime(w,m))

df = df.drop(['MINUTE','WEEK'], axis=1)
df['DATETIME']=datetime

In [4]:
df.head()

Unnamed: 0,COLUPC,DOLLARS,IRI_KEY,OUTLET,PANID,UNITS,file,DATETIME
0,17047000300,0.99,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,2008-09-04 12:28:00
1,17047000302,0.89,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,2008-03-11 13:04:00
2,17047000650,0.89,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,2008-03-07 13:11:00
3,17047000650,2.97,1097117,DK,3182048,3.0,yogurt_PANEL_DK_1479_1530.DAT,2008-11-28 06:57:00
4,17047001372,1.49,1097117,DK,3184911,1.0,yogurt_PANEL_DK_1479_1530.DAT,2008-04-21 16:13:00


In [5]:
df['MONTH'] = df.DATETIME.dt.month
df['HOUR'] = df.DATETIME.dt.hour
df['WEEK'] = df.DATETIME.dt.week
df = df.drop(['DATETIME'], axis=1)

In [6]:
df.head()

Unnamed: 0,COLUPC,DOLLARS,IRI_KEY,OUTLET,PANID,UNITS,file,MONTH,HOUR,WEEK
0,17047000300,0.99,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,9,12,36
1,17047000302,0.89,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,3,13,11
2,17047000650,0.89,1097117,DK,3138826,1.0,yogurt_PANEL_DK_1479_1530.DAT,3,13,10
3,17047000650,2.97,1097117,DK,3182048,3.0,yogurt_PANEL_DK_1479_1530.DAT,11,6,48
4,17047001372,1.49,1097117,DK,3184911,1.0,yogurt_PANEL_DK_1479_1530.DAT,4,16,17


####Drop redundant columns

In [7]:
df = df.drop(['COLUPC','IRI_KEY','PANID','file'], axis=1)
df.head()

Unnamed: 0,DOLLARS,OUTLET,UNITS,MONTH,HOUR,WEEK
0,0.99,DK,1.0,9,12,36
1,0.89,DK,1.0,3,13,11
2,0.89,DK,1.0,3,13,10
3,2.97,DK,3.0,11,6,48
4,1.49,DK,1.0,4,16,17


####Create dummies for Outlet

In [8]:
dummies_Outlet = pd.get_dummies(df['OUTLET'], prefix= 'OUTLET') 
df = pd.concat([df, dummies_Outlet], axis=1)
df = df.drop(['OUTLET'], axis=1)
df.head()

Unnamed: 0,DOLLARS,UNITS,MONTH,HOUR,WEEK,OUTLET_DK,OUTLET_DR,OUTLET_GK,OUTLET_GR,OUTLET_KK,OUTLET_MA,OUTLET_MK
0,0.99,1.0,9,12,36,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.89,1.0,3,13,11,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.89,1.0,3,13,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.97,3.0,11,6,48,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.49,1.0,4,16,17,1.0,0.0,0.0,0.0,0.0,0.0,0.0


####Normalize Data

In [9]:
df_norm = df / (df.max() - df.min())

In [10]:
df_norm.head()

Unnamed: 0,DOLLARS,UNITS,MONTH,HOUR,WEEK,OUTLET_DK,OUTLET_DR,OUTLET_GK,OUTLET_GR,OUTLET_KK,OUTLET_MA,OUTLET_MK
0,0.013842,0.014961,0.818182,0.521739,0.692308,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.012444,0.014961,0.272727,0.565217,0.211538,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.012444,0.014961,0.272727,0.565217,0.192308,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.041527,0.044883,1.0,0.26087,0.923077,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.020833,0.014961,0.363636,0.695652,0.326923,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#Split Data

In [11]:
X = df.ix[:,1:]
y = df['DOLLARS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44)

#Feature selection and regression

In [22]:
clf = Pipeline([
  ('feature_selection', SelectKBest(k=6)),
  ('classification', RandomForestRegressor(n_jobs=-1))
])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.34196106131569942

In [None]:
clf = Pipeline([
  ('feature_selection', SelectKBest(k=6)),
  ('classification', SVR())
])
clf.fit(X_train, y_train)
clf.score(X_test, y_test)