# Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")



In [8]:
sales = pd.read_csv('/kaggle/input/build-model/Sales data-Set.csv')
print("Sales Total Col.",len(sales.columns),"\nShape:",sales.shape)

Sales Total Col. 5 
Shape: (421570, 5)


In [9]:
stores = pd.read_csv('/kaggle/input/build-model/stores.csv')
print("Stores Total Col.",len(stores.columns),"\nShape:",stores.shape)

Stores Total Col. 3 
Shape: (45, 3)


In [12]:
features = pd.read_csv('/kaggle/input/build-model/features.csv')
print("Features Total Col.",len(features.columns),"\nShape:",features.shape)

Features Total Col. 12 
Shape: (8190, 12)


# Merging Data

In [50]:
data = sales.merge(features,how="left", on=['Store', 'Date', 'IsHoliday'])
data.head()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106


In [51]:
data = data.merge(stores, how= "left", on=['Store'])
data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,A,151315


In [52]:
data['Date']=pd.to_datetime(data['Date'], infer_datetime_format=True)  

In [56]:
#create new column
data["Month"] = pd.DatetimeIndex(data['Date']).month
data["Year"] = pd.DatetimeIndex(data['Date']).year
data["WeekofYear"] = pd.DatetimeIndex(data['Date']).week
data.drop(['Date'], axis=1, inplace=True)
data.head()


KeyError: 'Date'

In [57]:
categorical = [var for var in data.columns if data[var].dtype=='O']
print('There are {} categorical variables : \n'.format(len(categorical)), categorical)


There are 1 categorical variables : 
 ['Type']


In [58]:
# view the categorical variables
data[categorical].head()

Unnamed: 0,Type
0,A
1,A
2,A
3,A
4,A


# check and replacing missing values in numerical variables

In [59]:
data.isnull().sum()

Store                0
Dept                 0
Weekly_Sales         0
IsHoliday            0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
Type                 0
Size                 0
Month                0
Year                 0
WeekofYear           0
dtype: int64

In [60]:
# Replace NaN with default values
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan,strategy='constant', fill_value = 0) 
imputer.fit(data[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']]) 
data[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']]=imputer.transform(data[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']])
data.isnull().sum()

Store           0
Dept            0
Weekly_Sales    0
IsHoliday       0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
Type            0
Size            0
Month           0
Year            0
WeekofYear      0
dtype: int64

In [61]:
data.head()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Month,Year,WeekofYear
0,1,1,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,5
1,1,1,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315,2,2010,6
2,1,1,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315,2,2010,7
3,1,1,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315,2,2010,8
4,1,1,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315,3,2010,9


In [62]:
# use LabelEncoder to replace Type 
from sklearn.preprocessing import LabelEncoder
data['Type']= LabelEncoder().fit_transform(data['Type'])
data.head()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Month,Year,WeekofYear
0,1,1,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,151315,2,2010,5
1,1,1,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,0,151315,2,2010,6
2,1,1,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,0,151315,2,2010,7
3,1,1,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,0,151315,2,2010,8
4,1,1,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,0,151315,3,2010,9


# Split Data

In [64]:
x = data[['Store','Dept','IsHoliday','Size','Month','Year','WeekofYear']]
y = data['Weekly_Sales']
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


Build and Evaluate Model
XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way. The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

LightGBM, short for Light Gradient Boosting Machine, is a free and open source distributed gradient boosting framework for machine learning originally developed by Microsoft. It is based on decision tree algorithms and used for ranking, classification and other machine learning tasks. The development focus is on performance and scalability.



In [65]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.linear_model import ElasticNet, Lasso, RidgeCV,LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb


In [None]:
modelList= [LinearRegression(),ElasticNet(),RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]),Lasso(alpha =16, random_state=100),ElasticNet(alpha=0.8),DecisionTreeRegressor(),RandomForestRegressor(),GradientBoostingRegressor(),AdaBoostRegressor(), SVR(),LinearSVR(random_state=0, tol=1e-5, C=1500), NuSVR(),xgb.XGBRegressor(),lgb.LGBMRegressor()]
name = []
score = []
models = []
rmse = []
r2 = []
mae = []
i = 0

for model in modelList:
    print(model)
    models.append(model)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    name.append(type(model).__name__)
    score.append(model.score(x_test, y_test))
    r2.append(r2_score(y_test, y_pred))
    mae.append(mean_absolute_error(y_test, y_pred))
    rmse.append(mean_squared_error(y_test, y_pred, squared=False)) # Setting squared to False to get RMSE    

LinearRegression()
ElasticNet()
RidgeCV(alphas=[0.001, 0.01, 0.1, 1])
Lasso(alpha=16, random_state=100)
ElasticNet(alpha=0.8)
DecisionTreeRegressor()
RandomForestRegressor()
GradientBoostingRegressor()


In [None]:
df_score = pd.DataFrame(list(zip(name,r2,mae,rmse, score, models)),columns=['name','r2','mae','rmse','score',"models"])
df_score.set_index('name',inplace=True)
df_score.sort_values(by=['score'],inplace=True)
df_score

In [None]:
model = df_score.loc["RandomForestRegressor","models"]
y_pred = model.predict(x_test)
print(y_pred)  