In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data_dir = 'store-sales-time-series-forecasting'
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
stores_df = pd.read_csv(os.path.join(data_dir, 'stores.csv'))
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))

In [6]:
print(oil_df.columns.tolist())

['dcoilwtico']


In [3]:
oil_df = pd.read_csv(os.path.join(data_dir, 'oil.csv'))
oil_df['date'] = pd.to_datetime(oil_df['date'])
oil_df.set_index('date', inplace=True)

train_df['date'] = pd.to_datetime(train_df['date'])
train_df.set_index('date', inplace=True)

sales_mean = train_df.groupby(['date'])['sales'].mean()
sales_std = train_df.groupby(['date'])['sales'].std()

train_df = pd.merge(train_df, oil_df, how='left', left_index=True, right_index=True)

In [4]:
oil_mean = oil_df['dcoilwtico'].mean()
oil_std = oil_df['dcoilwtico'].std()

In [5]:
train_df = pd.merge(train_df, oil_df, how='left', left_index=True, right_index=True)

train_df['sales'] = train_df['sales'].fillna(method='ffill')
train_df['dcoilwtico'] = train_df['dcoilwtico'].fillna(oil_mean)

KeyError: 'dcoilwtico'

In [None]:
train_df['sales_mean'] = train_df['sales'].rolling(window=7).mean()
train_df['sales_std'] = train_df['sales'].rolling(window=7).std()

train_df['oil_mean'] = train_df['dcoilwtico'].rolling(window=7).mean()
train_df['oil_std'] = train_df['dcoilwtico'].rolling(window=7).std()

train_df['year'] = train_df.index.year
train_df['month'] = train_df.index.month
train_df['week'] = train_df.index.week
train_df['dayofweek'] = train_df.index.dayofweek

In [None]:
import xgboost as xgb

def train_xgb(train_df):
    feature_cols = ['store_nbr', 'family', 'oil_mean', 'oil_std', 'sales_mean', 'sales_std', 'year', 'month', 'week', 'dayofweek']
    dtrain = xgb.DMatrix(train_df[feature_cols], label=train_df['sales'])
    params = {
        'max_depth': 5,
        'eta': 0.1,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }
    model = xgb.train(params, dtrain)
    return model

def predict_sales(train_df, test_df):
    test_df = test_df.copy()
    test_df['date'] = pd.to_datetime(test_df['date'])
    test_df.set_index('date', inplace=True)

    test_df = pd.merge(test_df, oil_df, how='left', left_index=True, right_index=True)
    test_df['dcoilwtico'] = test_df['dcoilwtico'].fillna(oil_mean)

    test_df['sales_mean'] = test_df['sales'].rolling(window=7).mean()
    test_df['sales_std'] = test_df['sales'].rolling(window=7).std()

    test_df['oil_mean'] = test_df['dcoilwtico'].rolling(window=7).mean()
    test_df['oil_std'] = test_df['dcoilwtico'].rolling(window=7).std()

    test_df['year'] = test_df.index.year
    test_df['month'] = test_df.index.month
    test_df['week'] = test_df.index.week
    test_df['dayofweek'] = test_df.index.dayofweek




from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate_model(train_df, test_df):
    model = train_xgb(train_df)
    y_true = test_df['sales'].values
    y_pred = predict_sales(train_df, test_df)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    print(f'RMSE: {rmse:.2f}')
    print(f'MAE: {mae:.2f}')


In [None]:
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
evaluate_model(train_df, test_df)

