In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

# --- Load the Data ---
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'])
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])

# --- Feature Engineering Functions ---
def create_date_features(df):
    df['day'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype('int')
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
    return df

def create_lag_features(df, lags=[1, 7, 14, 28]):
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)
    return df

def create_rolling_features(df, windows=[7, 14, 28]):
    for window in windows:
        df[f'sales_roll_mean_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(1).rolling(window=window).mean()
    return df

# --- Label Encoding ---
le = LabelEncoder()
train['family'] = le.fit_transform(train['family'])
test['family'] = le.transform(test['family'])

# --- Merge train and test for feature engineering ---
test['sales'] = np.nan  # Dummy sales column
all_data = pd.concat([train, test], axis=0).sort_values(['store_nbr', 'family', 'date'])

# --- Feature Engineering ---
all_data = create_date_features(all_data)
all_data = create_lag_features(all_data)
all_data = create_rolling_features(all_data)

# --- Split back ---
train = all_data[~all_data['sales'].isna()]
test = all_data[all_data['sales'].isna()]

# --- Fill missing ---
train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

# --- Prepare Features and Target ---
features = [
    'store_nbr', 'family', 'onpromotion', 'day', 'weekofyear', 'month', 'year', 'dayofweek',
    'is_weekend', 'is_month_start', 'is_month_end',
    'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_28',
    'sales_roll_mean_7', 'sales_roll_mean_14', 'sales_roll_mean_28'
]

target = 'sales'

# --- Train-Validation Split ---
X_train = train[train['date'] < '2017-07-01'][features]
y_train = train[train['date'] < '2017-07-01'][target]
X_valid = train[train['date'] >= '2017-07-01'][features]
y_valid = train[train['date'] >= '2017-07-01'][target]

# --- Important Trick: Log Transform Target ---
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

# --- LightGBM Dataset ---
train_data = lgb.Dataset(X_train, label=y_train_log, categorical_feature=['family'])
valid_data = lgb.Dataset(X_valid, label=y_valid_log, categorical_feature=['family'])

# --- LightGBM Parameters ---
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.02,
    'num_leaves': 256,
    'max_depth': 8,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'seed': 42,
    'verbosity': -1,
}

# --- Train the Model ---
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    num_boost_round=20000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=300),
        lgb.log_evaluation(period=500)
    ]
)

# --- Validation RMSLE ---
y_pred_valid_log = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_valid = np.expm1(y_pred_valid_log)  # Inverse transform

def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, np.maximum(0, y_pred)))

print(f'Validation RMSLE: {rmsle(y_valid, y_pred_valid):.5f}')

# --- Predict on Test Set ---
X_test = test[features]
y_test_pred_log = model.predict(X_test, num_iteration=model.best_iteration)
y_test_pred = np.expm1(y_test_pred_log)  # Inverse transform
y_test_pred = np.maximum(0, y_test_pred)

# --- Submission ---
submission = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')
submission['sales'] = y_test_pred
submission.to_csv('submission.csv', index=False)

print("✅ Submission file created successfully brother (Optimized)!")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.fillna(-1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.fillna(-1, inplace=True)


Training until validation scores don't improve for 300 rounds
[500]	training's rmse: 0.374564	valid_1's rmse: 0.375972
[1000]	training's rmse: 0.362269	valid_1's rmse: 0.372369
[1500]	training's rmse: 0.355273	valid_1's rmse: 0.370785
[2000]	training's rmse: 0.350275	valid_1's rmse: 0.370003
[2500]	training's rmse: 0.346521	valid_1's rmse: 0.369552
[3000]	training's rmse: 0.343396	valid_1's rmse: 0.369444
[3500]	training's rmse: 0.340539	valid_1's rmse: 0.369198
[4000]	training's rmse: 0.337966	valid_1's rmse: 0.369017
[4500]	training's rmse: 0.335471	valid_1's rmse: 0.36897
Early stopping, best iteration is:
[4295]	training's rmse: 0.336486	valid_1's rmse: 0.3689
Validation RMSLE: 0.36889
✅ Submission file created successfully brother (Optimized)!
