In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset file names are visible before loading them.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from sklearn.linear_model import LinearRegression
from sklearn import tree

In [3]:
# Load the four competition tables. The reads are independent of each other;
# read_csv infers compression from the ".zip" suffix.
stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')
features = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
train = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
test = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')

# merging data

In [4]:
# Left-join the feature table onto the training rows. No `on=` is given, so
# pandas joins on every column name the two frames have in common.
df = train.merge(features, how="left")

In [5]:
# Left-join the store table onto the merged frame, again on the shared column names.
df = df.merge(stores, how="left")

# transform data to datetime

In [6]:
# Convert the Date column to pandas datetimes.
df["Date"] = pd.to_datetime(df["Date"])

In [7]:
# Move the Date column into the index.
df = df.set_index("Date")

In [8]:
# Remove the five MarkDown columns from the modelling frame.
df = df.drop(
    columns=["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
)

# handle categoricals 

In [9]:
# Store and Dept are identifiers, not quantities: mark them categorical so
# get_dummies later one-hot encodes them.
df["Store"] = pd.Categorical(df["Store"])
df["Dept"] = pd.Categorical(df["Dept"])

In [11]:
# Show the row(s) holding the largest weekly sales figure. Series.max() is
# vectorised and skips NaN, whereas the builtin max() iterates the Series
# element by element in Python.
df["Weekly_Sales"][df["Weekly_Sales"] == df["Weekly_Sales"].max()]

Date
2010-11-26    693099.36
Name: Weekly_Sales, dtype: float64

In [13]:
# Mark the remaining discrete columns as categorical as well.
df["IsHoliday"] = pd.Categorical(df["IsHoliday"])
df["Type"] = pd.Categorical(df["Type"])

In [14]:
# Feature matrix: every column except the target.
df_X = df.drop(columns="Weekly_Sales")

In [15]:
# Target vector: the weekly sales column.
df_Y = df["Weekly_Sales"]

In [16]:
# One-hot encode the categorical columns (dropping the first level of each),
# then discard the Date index so the features carry a plain positional index.
df_X = pd.get_dummies(df_X, drop_first=True).reset_index(drop=True)

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. A fixed random_state makes the
# shuffle, and therefore every score reported below, reproducible across
# kernel restarts.
# NOTE(review): this is a random split of date-indexed weekly rows, so
# validation weeks are interleaved with training weeks — confirm that is intended.
df_X_train, df_X_validation, df_Y_train, df_Y_validation = train_test_split(
    df_X, df_Y, test_size=0.2, random_state=42
)

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [19]:
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
from lightgbm import LGBMRegressor

In [22]:
# Baseline: LightGBM regressor with library-default hyperparameters.
lgbm = LGBMRegressor()
# fit() returns the estimator itself, so both names refer to the fitted model.
lgbm_model = lgbm.fit(X=df_X_train, y=df_Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 130
[LightGBM] [Info] Start training from score 15971.940286


In [23]:
# Baseline model's score() on the held-out validation split.
lgbm_model.score(X=df_X_validation, y=df_Y_validation)

0.868223750985688

In [24]:
# Baseline predictions for the validation rows.
y_pred = lgbm_model.predict(X=df_X_validation)

In [25]:
# Root-mean-squared error of the baseline predictions on the validation split.
rmse = np.sqrt(
    mean_squared_error(y_true=df_Y_validation, y_pred=y_pred)
)
rmse

8305.268278450305

In [26]:
# Hyperparameter grid for the search: 6 ensemble sizes x 3 learning rates = 18 candidates.
lgbm_grid = dict(
    n_estimators=[20, 40, 100, 200, 500, 1000],
    learning_rate=[0.1, 0.01, 0.5],
)

# use grid search for best params

In [27]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Fresh, unfitted estimator for the search (rebinds the name used by the baseline).
lgbm = LGBMRegressor()
# Exhaustive search over lgbm_grid with 5-fold cross-validation, using all
# available cores and per-fit progress logging.
lgbm_cv_model = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [30]:
# Run the grid search on the training split only; the validation split stays untouched.
lgbm_cv_model.fit(X=df_X_train, y=df_Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.088509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 269805, number of used features: 130
[LightGBM] [Info] Start training from score 15972.514868
[CV] END .................learning_rate=0.1, n_estimators=20; total time=   4.7s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022484 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 269805, number of used features: 130
[LightGBM] [Info] Start training from score 15972.514868
[CV] END .................learning_rate=0.1, n_estimators=40; total time=   4.6s
[LightGBM] [Info]

In [31]:
# Hyperparameter combination selected by the grid search.
lgbm_cv_model.best_params_

{'learning_rate': 0.5, 'n_estimators': 1000}

In [32]:
# Build the tuned model from the grid-search result instead of re-typing the
# winning values by hand, so this cell cannot drift out of sync when the grid
# or the data split changes on a later run.
lgbm_tuned = LGBMRegressor(**lgbm_cv_model.best_params_)

# Fit on the full training split (the search scored candidates on CV folds of it).
lgbm_tuned = lgbm_tuned.fit(df_X_train, df_Y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1300
[LightGBM] [Info] Number of data points in the train set: 337256, number of used features: 130
[LightGBM] [Info] Start training from score 15971.940286


In [33]:
# Tuned-model predictions for the validation rows (overwrites the baseline y_pred).
y_pred = lgbm_tuned.predict(X=df_X_validation)

# final score

In [34]:
# Root-mean-squared error of the tuned model on the validation split.
np.sqrt(
    mean_squared_error(y_true=df_Y_validation, y_pred=y_pred)
)

4969.7372343221805

In [35]:
# Tuned model's score() on the held-out validation split.
lgbm_tuned.score(X=df_X_validation, y=df_Y_validation)

0.9528157926665131