In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
import os
os.chdir('C:\\AADeloitte\\learning\\Data Science Practise\\Big Mart Sales')

In [4]:
train = pd.read_csv('train.csv',na_values={'Item_Visibility': 0})
test = pd.read_csv('test.csv', na_values={'Item_Visibility': 0})

In [5]:
print(train.shape, test.shape)

(8523, 12) (5681, 11)


In [6]:
train['source'] = 'train'
test['source'] = 'test'

In [7]:
data = pd.concat([train, test],sort=False)

In [8]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [None]:
# cat_col = data.select_dtypes(include='object').columns
# # print(cat_col)

In [None]:
# cat_col = [x for x in cat_col if x not in ['Item_Identifier']]

In [None]:
# num_col = data.select_dtypes(exclude='object')

In [11]:
#imputing data in Item_Weight columns
data.Item_Weight = data.groupby('Item_Identifier')['Item_Weight'].transform(lambda x: x.fillna(x.mean()))

In [12]:
## imputing data in Item_visibility columns
data.Item_Visibility = data.groupby('Item_Identifier')['Item_Visibility'].transform(lambda x: x.fillna(x.mean()))

In [13]:
data.Outlet_Size.fillna(value='Medium', inplace=True)

In [14]:
data.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [None]:
plt.style.use('ggplot')

In [None]:
data.boxplot(column='Item_Outlet_Sales', by=['Outlet_Type', 'Outlet_Establishment_Year'],
             figsize=(30,7))

In [None]:
# cat_col = data.select_dtypes(include=['object', 'category']).columns
# col_list = [x for x in cat_col if x in ['Outlet_Type', 'Outlet_Location_Type',
#                                         'Outlet_Establishment_Year']]

In [None]:
print(data.info())
print('``````````````````````````````````````')
data.Outlet_Size = data.Outlet_Size.astype(np.object)
print('``````````````````````````````````````')
print(data.info())

In [15]:
# Extracting the first two  letters of the strings Item_Identifiers

data['Item_type_combined'] = data.Item_Identifier.str[0:2]

data['Item_type_combined'] = data['Item_type_combined'].map({'FD':'Food',
                               'NC': 'Non-Consumable',
                               'DR': 'Drinks'})

data['Outlet_Years'] = np.subtract(2013, data['Outlet_Establishment_Year'])

In [16]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_type_combined,Outlet_Years
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train,Food,14
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks,4
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train,Food,14
3,FDX07,19.2,Regular,0.02293,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,train,Food,15
4,NCD19,8.93,Low Fat,0.01467,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-Consumable,26


In [17]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF': 'Low Fat',
                                 'reg': 'Regular',
                                 'low fat': 'Low Fat'})

In [18]:
data.loc[data['Item_type_combined'] == 'Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'

In [19]:
data.Item_type_combined.value_counts(dropna=False)

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_type_combined, dtype: int64

In [None]:
data.head()

In [36]:
data_pre = data.copy() # pre-processed data

In [111]:
data_pre.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_type_combined,Outlet_Years
0,FDA15,9.3,0,0.016047,Dairy,249.8092,9,1999,1,0,1,3735.138,train,1,14
1,DRC01,5.92,2,0.019278,Soft Drinks,48.2692,3,2009,1,2,2,443.4228,train,0,4
2,FDN15,17.5,0,0.01676,Meat,141.618,9,1999,1,0,1,2097.27,train,1,14
3,FDX07,19.2,2,0.02293,Fruits and Vegetables,182.095,0,1998,1,2,0,732.38,train,1,15
4,NCD19,8.93,1,0.01467,Household,53.8614,1,1987,0,2,1,994.7052,train,2,26


In [112]:
# print(data_pre.info())
cat_col = data_pre.select_dtypes(include='object').columns

In [113]:
cat_col = [x for x in cat_col if x not in ['Item_Identifier', 'source', 'Item_Type']]

In [114]:
cat_col

[]

In [115]:
le = LabelEncoder()

In [116]:
for i in cat_col:
    data_pre[i] = (le.fit_transform(data_pre[i]))

In [117]:
data_pre.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_type_combined,Outlet_Years
0,FDA15,9.3,0,0.016047,Dairy,249.8092,9,1999,1,0,1,3735.138,train,1,14
1,DRC01,5.92,2,0.019278,Soft Drinks,48.2692,3,2009,1,2,2,443.4228,train,0,4
2,FDN15,17.5,0,0.01676,Meat,141.618,9,1999,1,0,1,2097.27,train,1,14
3,FDX07,19.2,2,0.02293,Fruits and Vegetables,182.095,0,1998,1,2,0,732.38,train,1,15
4,NCD19,8.93,1,0.01467,Household,53.8614,1,1987,0,2,1,994.7052,train,2,26


In [124]:
train = data_pre[data_pre['source'] == 'train']

In [125]:
type(train)

pandas.core.frame.DataFrame

In [126]:
train.drop(['Item_Type', 'Outlet_Establishment_Year', 'source', 'Item_Identifier'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [127]:
train.head(2)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_type_combined,Outlet_Years
0,9.3,0,0.016047,249.8092,9,1,0,1,3735.138,1,14
1,5.92,2,0.019278,48.2692,3,1,2,2,443.4228,0,4


In [132]:
X, y = train.drop('Item_Outlet_Sales', axis=1).values, train.Item_Outlet_Sales.values

In [133]:
pd.Series(y)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
5        556.6088
6        343.5528
7       4022.7636
8       1076.5986
9       4710.5350
10      1516.0266
11      2187.1530
12      1589.2646
13      2145.2076
14      1977.4260
15      1547.3192
16      1621.8888
17       718.3982
18      2303.6680
19      2748.4224
20      3775.0860
21      4064.0432
22      1587.2672
23       214.3876
24      4078.0250
25       838.9080
26      1065.2800
27       308.9312
28       178.4344
29       125.8362
          ...    
8493    4727.1800
8494    1637.8680
8495    4314.3840
8496    3715.1640
8497    2247.0750
8498    1601.2490
8499    2976.1260
8500    1907.5170
8501    1508.0370
8502    3020.0688
8503    3392.9168
8504    4138.6128
8505    2117.2440
8506    3944.8650
8507     515.9950
8508    2587.9646
8509     424.7804
8510    7182.6504
8511    4207.8560
8512    2479.4392
8513     595.2252
8514     468.7232
8515    1571.2880
8516     858.8820
8517    36

In [134]:
for i in enumerate(data_model.columns):
    print(i)

(0, 'Item_Weight')
(1, 'Item_Fat_Content')
(2, 'Item_Visibility')
(3, 'Item_MRP')
(4, 'Outlet_Identifier')
(5, 'Outlet_Size')
(6, 'Outlet_Location_Type')
(7, 'Outlet_Type')
(8, 'Item_Outlet_Sales')
(9, 'Item_type_combined')
(10, 'Outlet_Years')


In [135]:
ohe = OneHotEncoder(categorical_features=[1,4,5,6,7,9])

In [136]:
X = ohe.fit_transform(X).toarray()

In [137]:
print(X.shape, y.shape)

(8523, 36) (8523,)


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
models = []
models.append(('LR', LinearRegression()))
models.append(('LSS', Lasso()))
models.append(('RG', Ridge()))
models.append(('RF', RandomForestRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('KN', KNeighborsRegressor()))
models.append(('SR', SVR()))

results = []
names = []

for name, model in models:
    
#     Kfold estimation
    kfold = KFold(n_splits=5, random_state=421, shuffle=True)
    
#     Cross-validation
    cv_results = cross_val_score(model, X_train, y_train, scoring='r2', cv=kfold)
    
    results.append(cv_results)
    names.append(name)
    
#     printing the message
    msg = '%s %f (%f)' % (name, (1- cv_results.mean()), cv_results.std())
    print(msg)

In [139]:
xgr = XGBRegressor(n_estimators=1000, n_jobs=-1)

In [140]:
xgr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [141]:
y_pred = xgr.predict(X_test)

In [144]:
r2_score(y_test, y_pred)

0.45611981785536493

In [145]:
mean_squared_error(y_test, y_pred)

1334998.6005194013