In [1]:
import pandas as pd
from pandas import *
import numpy as np

from datetime import datetime

# Get the training data
df = pd.read_csv("data/train.csv.gz")

# TODO Read test data

# Just get a subset for fast results
df = df[1:15000]

# Show first 5 columns
df.head(5)
df.describe()

Unnamed: 0,store,item,sales
count,14999.0,14999.0,14999.0
mean,4.617841,1.0,21.404694
std,2.372351,0.0,8.500957
min,1.0,1.0,1.0
25%,3.0,1.0,15.0
50%,5.0,1.0,20.0
75%,7.0,1.0,27.0
max,9.0,1.0,59.0


In [2]:
# convert day of week into 3 booleans, so that hidden layers of neural network do the rest
def day_of_week_and_month(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    week_day = datetime_object.weekday()
    dow_0 = week_day % 2
    dow_1 = int(week_day / 2) % 2
    dow_2 = int(week_day / 4) % 2
    
    # Same for month
    month = datetime_object.month
    month_0 = month % 2
    month_1 = int(month / 2) % 2
    month_2 = int(month / 4) % 2
    month_3 = int(month / 8) % 2
    
    # TODO, same for calendar week + year 
    return (dow_0, dow_1, dow_2, month_0, month_1, month_2, month_3)

df['dow_0'], df['dow_1'], df['dow_2'], df['month_0'], df['month_1'], df['month_2'], df['month_3'] = zip(*df["date"].map(day_of_week_and_month))
# Column not needed anymore
df = df.drop(['date'], axis=1)
df.head(5)

Unnamed: 0,store,item,sales,dow_0,dow_1,dow_2,month_0,month_1,month_2,month_3
1,1,1,11,0,1,0,1,0,0,0
2,1,1,14,1,1,0,1,0,0,0
3,1,1,13,0,0,1,1,0,0,0
4,1,1,10,1,0,1,1,0,0,0
5,1,1,12,0,1,1,1,0,0,0


In [3]:
# Same for the store
def store_type(store):
    d_0 = store % 2
    d_1 = int(store / 2) % 2
    d_2 = int(store / 4) % 2
    d_3 = int(store / 8) % 2
    return (d_0, d_1, d_2, d_3)

df['store_0'], df['store_1'], df['store_2'], df['store_3'] = zip(*df["store"].map(store_type))

# And for the item as well
def item_type(item):
    d_0 = item % 2
    d_1 = int(item / 2) % 2
    d_2 = int(item / 4) % 2
    d_3 = int(item / 8) % 2
    d_4 = int(item / 16) % 2
    d_5 = int(item / 32) % 2
    return (d_0, d_1, d_2, d_3, d_4, d_5)

df['item_0'], df['item_1'], df['item_2'], df['item_3'], df['item_4'], df['item_5'] = zip(*df["item"].map(item_type))

# Columns not needed anymore
df = df.drop(['item', 'store'], axis=1)
df.head(5)

Unnamed: 0,sales,dow_0,dow_1,dow_2,month_0,month_1,month_2,month_3,store_0,store_1,store_2,store_3,item_0,item_1,item_2,item_3,item_4,item_5
1,11,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
2,14,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
3,13,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
4,10,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
5,12,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0


In [4]:
# Get maximum number of sale, adding 30%
max = df['sales'].max()
theory_max = int(max * 1.3)
print("Maximum value is {0}, assuming maximum of {1}".format(max, theory_max))

# Should we do it like that?
df['sales'] = df['sales'] / theory_max
# Remove result
#df = df.drop(['sales'], axis=1)
df.head(5)

Maximum value is 59, assuming maximum of 76


Unnamed: 0,sales,dow_0,dow_1,dow_2,month_0,month_1,month_2,month_3,store_0,store_1,store_2,store_3,item_0,item_1,item_2,item_3,item_4,item_5
1,0.144737,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
2,0.184211,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0
3,0.171053,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
4,0.131579,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
5,0.157895,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0


In [5]:
# Generate our training/validation datasets
from sklearn import model_selection

array = df.values

X = array[:,1:]
Y = array[:,0]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [6]:
# Import algorithm
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC




# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []

# Let's deactivate it
models = []
for name, model in models:
    print("Executing for model {0}".format(name))
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [8]:
from sklearn.neural_network import MLPClassifier


clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X_train, Y_train)     

ValueError: Unknown label type: (array([ 0.21052632,  0.38157895,  0.30263158, ...,  0.25      ,
        0.17105263,  0.22368421]),)