In [1]:
import pandas
from pandas import *
import numpy
from datetime import datetime

# Get the training data
df_train = pandas.read_csv("data/train.csv.gz")

# Only small sample for now
#
df_train = df_train.sample(100000)
# Reading test data set
df_test = pandas.read_csv("data/test.csv.gz")


print("Starting with {0}".format(df_train.count()))
display(df_train.sample(10))
df_train.describe()

Starting with date     100000
store    100000
item     100000
sales    100000
dtype: int64


Unnamed: 0,date,store,item,sales
566889,2015-04-10,1,32,48
241818,2015-02-26,3,14,43
489601,2013-08-22,9,27,22
742150,2015-03-06,7,41,21
127451,2016-12-28,10,7,47
889052,2017-06-05,7,49,23
412387,2017-03-18,6,23,26
613755,2013-08-08,7,34,14
517326,2014-07-23,4,29,78
666143,2017-01-19,5,37,12


Unnamed: 0,store,item,sales
count,100000.0,100000.0,100000.0
mean,5.49242,25.51429,52.31664
std,2.874388,14.454745,28.88542
min,1.0,1.0,2.0
25%,3.0,13.0,30.0
50%,5.0,26.0,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,201.0


In [2]:
# Adding various information to the dataframe for later 
# DayOfWeek
# DayOfMonth
# Month
# Year
# CalendarWeek


# Add all variables that we would like to this model

# Extracting some variable from date
def date_info(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    
    week_day = datetime_object.weekday()
    cw = datetime_object.isocalendar()[1]
    
    day = datetime_object.day
    month = datetime_object.month
    year = datetime_object.year
    
    return (week_day, cw, day, month, year)


def extrapolate(df):
    df['week_day'], df['cw'], df['day'], df['month'], df['year'] = zip(*df["date"].map(date_info))

    # Set type of store and item to category
    df['item'] = df['item'].astype('category')
    df['store'] = df['store'].astype('category')
    
    
    # Convert date as proper time as well
    df['time'] = df['date'].astype('datetime64[ns]')
    
    # Month should be a category as well
    df['week_day'] = df['week_day'].astype('category')
    df['month'] = df['month'].astype('category')

    return df

# Add those information to df_train and df_test
df_train = extrapolate(df_train)
df_test = extrapolate(df_test)

# Understand how scale change depending on dow
df_dow = df_train
display(df_train.sample(10))


Unnamed: 0,date,store,item,sales,week_day,cw,day,month,year,time
704430,2016-11-21,6,39,26,0,47,21,11,2016,2016-11-21
107584,2017-08-04,9,6,92,4,31,4,8,2017,2017-08-04
216159,2014-11-23,9,12,80,6,47,23,11,2014,2014-11-23
803857,2014-02-22,1,45,61,5,8,22,2,2014,2014-02-22
487249,2017-03-14,7,27,12,1,11,14,3,2017,2017-03-14
112731,2016-09-07,2,7,57,2,36,7,9,2016,2016-09-07
563591,2016-03-29,9,31,53,1,13,29,3,2016,2016-03-29
33447,2014-08-03,9,2,77,6,31,3,8,2014,2014-08-03
603681,2016-01-07,1,34,14,3,1,7,1,2016,2016-01-07
297424,2017-06-01,3,17,57,3,22,1,6,2017,2017-06-01


In [3]:
# Generate our training/validation datasets
from sklearn import model_selection

result_cols = ['sales']
input_cols = ['store', 'item', 'week_day', 'cw', 'day', 'month', 'year']

def get_values(df, cols=[]):
    final_df = df
    
    print(final_df.columns.values)
    # Remove all columns that are not inside the list
    for column in final_df.columns.values:
        print(column)
        if column not in cols:
            final_df = final_df.drop(columns=[column])
    return final_df.values


X = get_values(df_train, input_cols)
Y = get_values(df_train, result_cols).ravel()
X_test = get_values(df_test, input_cols)

validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Create validation + training set

['date' 'store' 'item' 'sales' 'week_day' 'cw' 'day' 'month' 'year' 'time']
date
store
item
sales
week_day
cw
day
month
year
time
['date' 'store' 'item' 'sales' 'week_day' 'cw' 'day' 'month' 'year' 'time']
date
store
item
sales
week_day
cw
day
month
year
time
['id' 'date' 'store' 'item' 'week_day' 'cw' 'day' 'month' 'year' 'time']
id
date
store
item
week_day
cw
day
month
year
time


In [None]:
# Import algorithm
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC




# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
#models.append(('LR', LogisticRegression()))
#models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []

print("Starting all models")
#Y_train=Y_train.astype('int')

for name, model in models:
    print("Executing for model {0}".format(name))
    #kfold = model_selection.KFold(n_splits=10, random_state=seed)
    #cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    #results.append(cv_results)
    #names.append(name)
    #msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    #print(msg)
    
    # Writing ht
    
    model.fit(X_train, Y_train)
    Y_test = model.predict(X_test)
    print("Result should be {0}".format(Y_test))
    print(len(Y_test))
    print(len(X_test))

    # Create new result dataframe:
    result_df = df_test['id']
    df_test['sales'] = Y_test
    df_test.head(5)
    df_result = df_test[['id', 'sales']]

    # Write the dataframe as output
    df_result.head(5)

    df_result.to_csv("{0}.csv".format(name), index=False)

Starting all models
Executing for model LDA
Result should be [49 49 49 ..., 34 34 34]
45000
45000
Executing for model KNN
Result should be [12 12 12 ..., 34 29 29]
45000
45000
Executing for model CART
Result should be [19 19 18 ..., 52 52 52]
45000
45000
Executing for model NB
Result should be [17 17 17 ..., 34 34 34]
45000
45000
Executing for model LR
Result should be [ 9 17 17 ..., 34 34 34]
45000
45000
Executing for model SVM
