In [None]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (16, 8)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Read the raw workbook. header=None keeps the first sheet row as data and
# gives the columns integer labels 0..N-1, which later cells index by number.
data = pd.read_excel(io='./data/shop.xls', header=None)

In [None]:
# Per-row total over every odd-numbered column of `data` (1, 3, 5, ...).
# A cell that holds a single space is counted as 0; anything else must be
# convertible to float, otherwise astype() raises.
sales_total = pd.Series(0, index=data.index)

for i in range(1, len(data.columns), 2):
    sales = data[i].replace(' ', 0).astype('float64')
    sales_total = sales_total + sales

# Build the result frame directly rather than assigning into a slice of
# `data` (which triggers SettingWithCopyWarning) and then dropping column 0.
df = pd.DataFrame({'sales': sales_total})

In [None]:
# For every row, count how many odd-numbered columns (1, 3, 5, ...) hold
# something other than a single space. The comparison is done on the whole
# sub-frame at once instead of cell by cell, and selects rows by position so
# it does not rely on the index labels matching the row numbers.
odd_columns = data.iloc[:, 1::2]
not_empty = (odd_columns != ' ').sum(axis=1).astype(int).tolist()

In [None]:
# Average per counted column: row total divided by that row's non-blank count.
# A row whose count is 0 yields NaN/inf rather than raising.
df['sales_avg'] = df['sales'] / np.asarray(not_empty)

In [None]:
# Target series used by all the cells below.
y = df.loc[:, 'sales_avg']

In [None]:
# Chronological split: the first 60% of the observations are the training
# segment; the remainder is halved into a test segment and a holdout segment
# (the holdout takes the extra observation when the remainder is odd).
n_obs = len(y)
train_size = int(n_obs * 0.6)
test_size = (n_obs - train_size) // 2
holdout_start = train_size + test_size

train = y[:train_size]
test = y[train_size:holdout_start]

# Positional indices of each segment.
X_train = np.arange(0, train_size)
X_test = np.arange(train_size, holdout_start)
X_holdout = np.arange(holdout_start, n_obs)

In [None]:
def create_dataset(dataset, look_back=1):
    """Turn a series into sliding-window samples for one-step-ahead prediction.

    Sample ``i`` consists of the ``look_back`` consecutive values starting at
    position ``i``; its target is the value that immediately follows them.

    Parameters
    ----------
    dataset : array-like
        Either a 2-D array whose first column holds the series (other columns
        are ignored), or a 1-D array-like (list, ndarray, pandas Series).
    look_back : int, default 1
        Number of consecutive values in each input window.

    Returns
    -------
    dataX : numpy.ndarray
        Windows, shape ``(n_samples, look_back)`` with
        ``n_samples = max(len(dataset) - look_back, 0)``.
    dataY : numpy.ndarray
        Targets, shape ``(n_samples,)``.
    """
    values = np.asarray(dataset)
    # Only the first column is used for 2-D input; 1-D input is that column.
    column = values if values.ndim == 1 else values[:, 0]

    # Clamp at zero so a series no longer than look_back yields empty arrays
    # that still have the (0, look_back) / (0,) shapes downstream code expects.
    n_samples = max(len(column) - look_back, 0)
    starts = np.arange(n_samples)

    # Row i of window_idx is [i, i + 1, ..., i + look_back - 1].
    window_idx = starts[:, None] + np.arange(look_back)
    return column[window_idx], column[starts + look_back]

In [None]:
# Number of consecutive past values fed to the model to predict the next one.
look_back = 5

# `train` and `test` are slices of a pandas Series, which has no .reshape();
# convert to an ndarray first, then shape it as the single-column 2-D array
# that create_dataset indexes.
trainX, trainY = create_dataset(np.asarray(train).reshape(-1, 1), look_back)
testX, testY = create_dataset(np.asarray(test).reshape(-1, 1), look_back)

In [None]:
# Standardise the lagged values, then regress with k-nearest neighbours.
knn_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_jobs=-1)),
])
# Candidate neighbourhood sizes: k = 1 .. 19.
knn_params = {'knn__n_neighbors': range(1, 20)}

# 5-fold cross-validated search over k, fitted on the training windows.
model = GridSearchCV(estimator=knn_pipe, param_grid=knn_params, cv=5, n_jobs=-1)
model.fit(trainX, trainY)

# One-step-ahead predictions for every training and test window.
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

train_mse = mean_squared_error(trainY, trainPredict)
test_mse = mean_squared_error(testY, testPredict)
print("kNN(train) mse =", train_mse)
print("kNN(test) mse =", test_mse)
print(model.best_params_)

In [None]:
# Observed series.
plt.plot(y, label='actual');

# The first `look_back` points of a segment have no prediction of their own
# (they only seed the first window), so the observed values are used in their
# place to keep each curve the same length as its x-axis.
y_train_pred = np.r_[y[0:look_back], trainPredict]
plt.plot(X_train, y_train_pred, label='train prediction');
y_test_pred = np.r_[y[train_size: train_size + look_back], testPredict]
plt.plot(X_test, y_test_pred, label='test prediction');

# Recursive forecast over the holdout range: start from the last `look_back`
# observed values before the holdout and feed each prediction back in as the
# newest value of the next window.
# np.asarray is required because `y` is a pandas Series, which has no .reshape().
yy = np.asarray(y[train_size + test_size - look_back: train_size + test_size])

nextX = yy

for i in range(len(X_holdout)):
    pred = model.predict(nextX.reshape(1, -1))
    yy = np.append(yy, pred)
    # Slide the window: drop the oldest value, append the new prediction.
    nextX = np.append(nextX[1:], pred)

# Skip the seed values; what remains is one forecast per holdout position.
plt.plot(X_holdout, yy[look_back:], label='holdout forecast');
plt.legend();
# plt.savefig('graph.svg', format='svg')

In [None]:
# Predicted counterpart of `y`, assembled segment by segment. The first
# `look_back` positions of the train and test segments have no prediction, so
# the observed values are used there.
res = np.r_[y[0:look_back], trainPredict, y[train_size: train_size + look_back], testPredict]

# Recursive forecast over the holdout range, seeded with the last `look_back`
# observed values before it. np.asarray is required because `y` is a pandas
# Series, which has no .reshape().
yy = np.asarray(y[train_size + test_size - look_back: train_size + test_size])

nextX = yy

for i in range(len(X_holdout)):
    pred = model.predict(nextX.reshape(1, -1))
    yy = np.append(yy, pred)
    # Slide the window: drop the oldest value, append the new prediction.
    nextX = np.append(nextX[1:], pred)

# Drop the seed values and append one forecast per holdout position, which
# brings `res` to the same length as `y`.
res = np.r_[res, yy[look_back:]]

In [None]:
# Observed values next to their predicted counterparts, one row per observation.
sales_avg = np.array(y)
res_df = pd.DataFrame({
    'sales_avg': sales_avg, 'sales_avg_pred': res
})

# The context manager saves and closes the workbook even if to_excel raises;
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
# NOTE(review): writing the legacy .xls format needs the xlwt engine, which
# pandas 2.0 no longer supports -- confirm the target pandas version or
# switch the output to .xlsx.
with pd.ExcelWriter('res.xls') as writer:
    res_df.to_excel(writer, sheet_name='Sheet1')