In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data_dir = 'store-sales-time-series-forecasting'
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
stores_df = pd.read_csv(os.path.join(data_dir, 'stores.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))

In [2]:
print(oil_df.columns.tolist())

['date', 'dcoilwtico']


In [3]:
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))
oil_df['date'] = pd.to_datetime(oil_df['date'])
oil_df.set_index('date', inplace=True)

train_df['date'] = pd.to_datetime(train_df['date'])
train_df.set_index('date', inplace=True)

sales_mean = train_df.groupby(['date'])['sales'].mean()
sales_std = train_df.groupby(['date'])['sales'].std()

train_df = pd.merge(train_df, oil_df, how='left', left_index=True, right_index=True)

In [4]:
oil_mean = oil_df['dcoilwtico'].mean()
oil_std = oil_df['dcoilwtico'].std()

In [6]:
train_df = pd.merge(train_df, oil_df, how='left', left_index=True, right_index=True)

train_df['sales'] = train_df['sales'].fillna(method='ffill')
train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(oil_mean)

In [7]:
train_df['sales_mean'] = train_df['sales'].rolling(window=7).mean()
train_df['sales_std'] = train_df['sales'].rolling(window=7).std()

train_df['oil_mean'] = train_df['dcoilwtico'].rolling(window=7).mean()
train_df['oil_std'] = train_df['dcoilwtico'].rolling(window=7).std()

train_df['year'] = train_df.index.year
train_df['month'] = train_df.index.month
train_df['week'] = train_df.index.week
train_df['dayofweek'] = train_df.index.dayofweek

  train_df['week'] = train_df.index.week


In [21]:
train_df

Unnamed: 0_level_0,id,store_nbr,family,sales,onpromotion,dcoilwtico_x,dcoilwtico_y,dcoilwtico,sales_mean,sales_std,oil_mean,oil_std,year,month,week,dayofweek
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-01-01,0,1,AUTOMOTIVE,0.000,0,,,67.714366,,,,,2013,1,1,1
2013-01-01,1,1,BABY CARE,0.000,0,,,67.714366,,,,,2013,1,1,1
2013-01-01,2,1,BEAUTY,0.000,0,,,67.714366,,,,,2013,1,1,1
2013-01-01,3,1,BEVERAGES,0.000,0,,,67.714366,,,,,2013,1,1,1
2013-01-01,4,1,BOOKS,0.000,0,,,67.714366,,,,,2013,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-15,3000883,9,POULTRY,438.133,0,47.57,47.57,47.570000,208.194429,246.188912,47.57,0.0,2017,8,33,1
2017-08-15,3000884,9,PREPARED FOODS,154.553,1,47.57,47.57,47.570000,226.702000,234.724439,47.57,0.0,2017,8,33,1
2017-08-15,3000885,9,PRODUCE,2419.729,148,47.57,47.57,47.570000,570.806143,843.065816,47.57,0.0,2017,8,33,1
2017-08-15,3000886,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,47.57,47.570000,523.916429,859.914106,47.57,0.0,2017,8,33,1


In [8]:
import xgboost as xgb

def train_xgb(train_df):
    feature_cols = ['store_nbr', 'oil_mean', 'oil_std', 'sales_mean', 'sales_std', 'year', 'month', 'week', 'dayofweek']
    dtrain = xgb.DMatrix(train_df[feature_cols], label=train_df['sales'])
    params = {
        'max_depth': 5,
        'eta': 0.1,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }
    model = xgb.train(params, dtrain)
    return model

def predict_sales(train_df, test_df):
    test_df = test_df.copy()
    test_df['date'] = pd.to_datetime(test_df['date'])
    test_df.set_index('date', inplace=True)

    test_df = pd.merge(test_df, oil_df, how='left', left_index=True, right_index=True)
    test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(oil_mean)

    test_df['sales_mean'] = test_df['sales'].rolling(window=7).mean()
    test_df['sales_std'] = test_df['sales'].rolling(window=7).std()

    test_df['oil_mean'] = test_df['dcoilwtico'].rolling(window=7).mean()
    test_df['oil_std'] = test_df['dcoilwtico'].rolling(window=7).std()

    test_df['year'] = test_df.index.year
    test_df['month'] = test_df.index.month
    test_df['week'] = test_df.index.week
    test_df['dayofweek'] = test_df.index.dayofweek







In [10]:
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error



In [13]:
model=train_xgb(train_df)

In [17]:
test_df

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [19]:
features=['store_nbr', 'family', 'date']
X_test=test_df[features]

In [20]:
y_pred=model.predict(X_test)

output = pd.DataFrame({'id': test_df['id'], 'sales': y_pred})
output.to_csv('submission.csv', index=False)

TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'pandas.core.frame.DataFrame'>)