# Big Data Mart Sales Problem Project

In [None]:
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
import joblib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Download the training split of the Big Data Mart sales data as a DataFrame.
training_dataset = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/bigdatamart_rep/master/bigdatamart_Train.csv")

In [None]:
# Notebook display: preview the training DataFrame.
training_dataset

In [None]:
# Download the testing split; kept untouched so a working copy can be made later.
testing_dataset_original = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/bigdatamart_rep/master/bigdatamart_Test.csv")

In [None]:
# Notebook display: preview the testing DataFrame.
testing_dataset_original

# Exploratory Data Analysis (EDA)

In [None]:
# List the column names of the training data.
training_dataset.columns

In [None]:
# Number of missing values in each column.
training_dataset.isnull().sum()

In [None]:
# Summary statistics produced by DataFrame.describe().
training_dataset.describe()

In [None]:
# Column dtypes and non-null counts.
training_dataset.info()

In [None]:
# Number of distinct values per column; NaN is counted as a value of its own.
training_dataset.nunique(dropna=False)

In [None]:
# Collect the names of all object-typed (categorical) columns ...
obj_columns = [
    name
    for name, dtype in training_dataset.dtypes.items()
    if dtype == 'object'
]

# ... except the two identifier columns, which are not categories to inspect.
for identifier in ('Item_Identifier', 'Outlet_Identifier'):
    obj_columns.remove(identifier)

obj_columns

In [None]:
# Print the frequency of every category for each categorical column,
# with a ruler line between columns.
separator = "="*40
for col in obj_columns:
    print(col, training_dataset[col].value_counts(), separator, sep="\n")

# Filling missing values present in our training dataset.

In [None]:
# Aggregate Item_Weight per Item_Identifier (pivot_table's default aggregation
# is the mean). The result is a DataFrame indexed by Item_Identifier with a
# single 'Item_Weight' column.
item_weight_mean = training_dataset.pivot_table(values = "Item_Weight", index = 'Item_Identifier')
item_weight_mean

In [None]:
# Fill missing Item_Weight values with the mean weight recorded for the same
# Item_Identifier, falling back to the overall column mean when an item has
# no recorded weight at all.
missing_data = training_dataset['Item_Weight'].isnull()

# `item in item_weight_mean` tests the pivot table's *columns*, so the
# per-item branch could never run. Series.map looks each identifier up in the
# pivot table's index instead and yields NaN for identifiers it does not hold.
per_item_weight = training_dataset['Item_Identifier'].map(item_weight_mean['Item_Weight'])
overall_weight_mean = training_dataset['Item_Weight'].mean()

# One .loc assignment replaces the chained df[col][i] = ... writes, which may
# land on a temporary copy instead of the DataFrame itself.
training_dataset.loc[missing_data, 'Item_Weight'] = per_item_weight[missing_data].fillna(overall_weight_mean)

In [None]:
# Count the missing Item_Weight values that remain after imputation.
training_dataset['Item_Weight'].isnull().sum()

In [None]:
# Same check as the previous cell: count of missing Item_Weight values.
training_dataset['Item_Weight'].isnull().sum()

In [None]:
# Fill missing Outlet_Size values with the most frequent size observed for
# outlets of the same Outlet_Type.
#
# The lookup table has to be built from the training data here: it was
# otherwise referenced before any definition, which raises NameError.
# Result: one row ('Outlet_Size') with one column per Outlet_Type.
outlet_size_mode = training_dataset.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))

missing_val = training_dataset['Outlet_Size'].isnull()
# Map each row's Outlet_Type to its modal size (scalar per row).
training_dataset.loc[missing_val, 'Outlet_Size'] = training_dataset.loc[missing_val, 'Outlet_Type'].map(outlet_size_mode.loc['Outlet_Size'])

In [None]:
# Count the missing Outlet_Size values that remain after imputation.
training_dataset['Outlet_Size'].isnull().sum()

In [None]:
# How many rows report an Item_Visibility of exactly zero.
(training_dataset['Item_Visibility'] == 0).sum()

In [None]:
# Replace Item_Visibility values of exactly 0 with the column mean.
# The result is assigned back explicitly: replace(..., inplace=True) on the
# object returned by .loc may modify a temporary copy and leave the
# DataFrame unchanged.
training_dataset['Item_Visibility'] = training_dataset['Item_Visibility'].replace(0, training_dataset['Item_Visibility'].mean())

In [None]:
# Re-count the rows whose Item_Visibility is exactly zero.
(training_dataset['Item_Visibility'] == 0).sum()

In [None]:
# Collapse the alternative spellings of the fat-content labels into the two
# canonical ones, then show the resulting category frequencies.
fat_content_aliases = {'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'}
training_dataset['Item_Fat_Content'] = training_dataset['Item_Fat_Content'].replace(fat_content_aliases)
training_dataset['Item_Fat_Content'].value_counts()

# Adding more columns/categories from the existing ones

In [None]:
# Derive a coarse item category from the first two characters of the
# Item_Identifier, then show how many rows fall in each category.
item_type_by_prefix = {'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'}
identifier_prefix = training_dataset['Item_Identifier'].apply(lambda code: code[:2])
training_dataset['New_Item_Type'] = identifier_prefix.map(item_type_by_prefix)
training_dataset['New_Item_Type'].value_counts()

In [None]:
# Non-consumable items get their own fat-content label instead of Low Fat/Regular.
is_non_consumable = training_dataset['New_Item_Type'] == 'Non-Consumable'
training_dataset.loc[is_non_consumable, 'Item_Fat_Content'] = 'Non-Edible'
training_dataset['Item_Fat_Content'].value_counts()

In [None]:
# Outlet age in years, measured against the fixed reference year 2013.
# NOTE(review): 2013 is presumably the year the data was collected - confirm.
training_dataset['Outlet_Years'] = 2013 - training_dataset['Outlet_Establishment_Year']
training_dataset['Outlet_Years'].head()

In [None]:
# The establishment year is now represented by Outlet_Years, so drop it.
training_dataset = training_dataset.drop(columns="Outlet_Establishment_Year")
training_dataset

In [None]:
# Skewness of the numeric columns. The frame still holds object (string)
# columns at this point, and pandas >= 2.0 raises on them unless the
# computation is explicitly restricted to numeric data.
training_dataset.skew(numeric_only=True)

# Visualization

In [None]:
# Bar chart of the number of rows per Item_Type category.
plt.figure(figsize=(15,9))
# Name the column via x=/data=: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
diag = sns.countplot(x="Item_Type", data=training_dataset)
# Rotate the tick labels seaborn generated instead of overwriting them with a
# separately computed list, so a label can never end up under the wrong bar.
plt.xticks(rotation=90)
plt.title("Item Type Column Details\n")
plt.xlabel("Product category names")
plt.ylabel("Count of rows in the dataset")
plt.show()

In [None]:
# Bar chart of the number of rows per Item_Fat_Content category.
plt.figure(figsize=(7,5))
# Name the column via x=/data=: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Item_Fat_Content", data=training_dataset)
plt.ylabel("Count of rows in the dataset")
plt.xlabel("Item Categories with respect to Fat")
plt.title("Item_Fat_Content Column Details\n")
plt.show()

In [None]:
# Bar chart of the number of rows per Outlet_Size category.
plt.figure(figsize=(7,5))
# Name the column via x=/data=: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Outlet_Size", data=training_dataset)
plt.ylabel("Count of rows in the dataset")
plt.xlabel("Outlet Size Variations")
plt.title("Outlet_Size Column Details\n")
plt.show()

In [None]:
# Bar chart of the number of rows per Outlet_Location_Type category.
plt.figure(figsize=(7,5))
# Name the column via x=/data=: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Outlet_Location_Type", data=training_dataset)
plt.ylabel("Count of rows in the dataset")
plt.xlabel("Outlet Location on Tier Level")
plt.title("Outlet_Location_Type Column Details\n")
plt.show()

In [None]:
# Bar chart of the number of rows per Outlet_Type category.
plt.figure(figsize=(10,5))
# Name the column via x=/data=: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
sns.countplot(x="Outlet_Type", data=training_dataset)
plt.ylabel("Count of rows in the dataset")
plt.xlabel("Outlet Type based on it's interiors")
plt.title("Outlet_Type Column Details\n")
plt.show()

In [None]:
# Pairwise relationship plots for the training data.
sns.pairplot(training_dataset)

In [None]:
# One box plot per numeric column, laid out side by side.
numeric_column_names = ["Item_Weight", "Item_Visibility", "Item_MRP", "Item_Outlet_Sales"]
fig, ax = plt.subplots(ncols=4, nrows=1, figsize=(12,7))
ax = ax.flatten()
for index, col in enumerate(numeric_column_names):
    sns.boxplot(y=col, data=training_dataset, ax=ax[index], palette="bone")
plt.tight_layout(pad=0.2, w_pad=0.9, h_pad=5.0)
plt.show()

In [None]:
# One filled kernel-density curve per numeric column, laid out side by side.
fig, ax = plt.subplots(ncols=4, nrows=1, figsize=(12,7))
ax = ax.flatten()
numeric_column_names = ["Item_Weight", "Item_Visibility", "Item_MRP", "Item_Outlet_Sales"]
for index, col in enumerate(numeric_column_names):
    # sns.distplot and the `shade` keyword are deprecated in seaborn;
    # kdeplot(fill=True) is the supported way to draw the same kind of curve.
    sns.kdeplot(x=training_dataset[col], ax=ax[index], color="r", fill=True)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()


# Correlation using a Heatmap

In [None]:
# Correlation heatmap of the numeric columns (upper triangle only).
# Restrict to numeric columns explicitly: the frame still contains string
# columns here, and pandas >= 2.0 raises when corr() meets them.
numeric_corr = training_dataset.select_dtypes(include="number").corr()

# Boolean mask that hides the lower triangle and the diagonal. Building it
# from ones (not from the correlation values) keeps a cell hidden even when
# its correlation happens to be exactly 0.
lower_triangle = np.tril(np.ones_like(numeric_corr, dtype=bool))

plt.figure(figsize=(12,6))
sns.heatmap(numeric_corr, vmin=-1, vmax=1, annot=True, square=True, fmt='0.5f',
            annot_kws={'size':10}, cmap="Set3", mask=lower_triangle)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Log Transformation on the Target Label column

In [None]:
# log transformation on "Item_Outlet_Sales" column
# The target becomes log(1 + sales); the exact inverse is exp(value) - 1.

training_dataset['Item_Outlet_Sales'] = np.log(1+training_dataset['Item_Outlet_Sales'])

# Encoding

In [None]:
# The identifier columns are not used as features; remove them.
training_dataset = training_dataset.drop(columns=["Item_Identifier", "Outlet_Identifier"])

In [None]:
# Replace the Item_Type strings with integer codes assigned by LabelEncoder.
label_encoder = LabelEncoder()
col_name = ["Item_Type"]
for col in col_name:
    # fit_transform learns the categories of this column and encodes it in one step.
    training_dataset[col] = label_encoder.fit_transform(training_dataset[col])

In [None]:
# One-hot encode the remaining categorical columns.
one_hot_columns = ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type']
training_dataset = pd.get_dummies(training_dataset, columns=one_hot_columns)
training_dataset

# Heatmap

In [None]:
# Correlation heatmap of the fully encoded training data. The upper triangle
# of the correlation matrix is passed as the mask, so those cells are hidden.
encoded_corr = training_dataset.corr()
triangle = np.triu(encoded_corr)

plt.figure(figsize=(20,15))
sns.heatmap(
    encoded_corr,
    vmin=-1,
    vmax=1,
    annot=True,
    square=True,
    fmt='0.3f',
    annot_kws={'size':10},
    cmap="YlGnBu",
    mask=triangle,
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

# Splitting the dataset into 2 variables namely 'X' and 'Y' for feature and label

In [None]:
# Y is the (log-transformed) sales target; X is every other column.
target_column = 'Item_Outlet_Sales'
Y = training_dataset[target_column]
X = training_dataset.drop(columns=target_column)

# Feature scaling

In [None]:
# Standardise every feature column; `scaler` keeps the statistics learned
# from the training features, and the column names are restored afterwards.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X # Displaying all the features after applying scaling technique to avoid bias output

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=111)

# Machine Learning Model for Regression and Evaluation Metrics

In [None]:
# Regression Model Function

def reg(model, X, Y):
    """Fit *model* on a 70/30 split of (X, Y) and print its evaluation metrics.

    Parameters:
        model: an unfitted estimator exposing ``fit`` and ``predict``; it is
            fitted in place on the training part of the split.
        X: feature matrix.
        Y: target values aligned with the rows of ``X``.

    Prints the RMSE and R2 (as a percentage) on the held-out 30%, the mean
    5-fold cross-validation score on the full data (as a percentage), and the
    difference between the R2 and cross-validation percentages.
    Returns None.
    """
    # Same split parameters on every call so different models are comparable.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=111)

    # Training the model
    model.fit(X_train, Y_train)

    # Predicting Y_test
    pred = model.predict(X_test)

    # RMSE - a lower RMSE score is better than a higher one.
    # Take the square root ourselves: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    rmse = mean_squared_error(Y_test, pred) ** 0.5
    print("RMSE Score is:", rmse)

    # R2 score
    r2 = r2_score(Y_test, pred, multioutput='variance_weighted')*100
    print("R2 Score is:", r2)

    # Cross Validation Score (mean over 5 folds, computed on all of X and Y)
    cv_score = (cross_val_score(model, X, Y, cv=5).mean())*100
    print("Cross Validation Score:", cv_score)

    # Result of r2 score minus cv score
    result = r2 - cv_score
    print("R2 Score - Cross Validation Score is", result)

In [None]:
# Linear Regression Model
# Evaluated with default settings through the shared reg() helper.

model=LinearRegression()
reg(model, X, Y)

In [None]:
# Support Vector Regression
# Polynomial kernel with gamma='auto'; evaluated through the shared reg() helper.

model=SVR(C=1.0, epsilon=0.2, kernel='poly', gamma='auto')
reg(model, X, Y)

In [None]:
# Decision Tree Regressor
# Uses the Poisson split criterion and a fixed seed for repeatable results.

model=DecisionTreeRegressor(criterion="poisson", random_state=111)
reg(model, X, Y)

In [None]:
# Random Forest Regressor
# Shallow trees (max_depth=2). No random_state is passed, so the printed
# scores can differ from run to run.

model=RandomForestRegressor(max_depth=2, max_features="sqrt")
reg(model, X, Y)

In [None]:
# K Neighbors Regressor
# The estimator must be bound to `model`; otherwise reg() silently
# re-evaluates whatever model the previous cell left behind.

model=KNeighborsRegressor(n_neighbors=2, algorithm='kd_tree')
reg(model, X, Y)

In [None]:
# Gradient Boosting Regressor
# NOTE(review): loss='quantile' fits a quantile of the target rather than its
# mean (the quantile level `alpha` is left at its default) - confirm this is
# intended when comparing against the other models' RMSE/R2.

model=GradientBoostingRegressor(loss='quantile', n_estimators=200, max_depth=5)
reg(model, X, Y)

In [None]:
# Ada Boost Regressor
# 300 boosting rounds with a fixed seed; evaluated through the shared reg() helper.

model=AdaBoostRegressor(n_estimators=300, learning_rate=1.05, random_state=42)
reg(model, X, Y)

In [None]:
# Extra Trees Regressor
# 200 trees built with 6 parallel jobs. No random_state is passed, so the
# printed scores can differ from run to run.

model=ExtraTreesRegressor(n_estimators=200, max_features='sqrt', n_jobs=6)
reg(model, X, Y)

# Hyper parameter tuning

In [None]:
# Choosing Support Vector Regression
# Candidate hyper-parameter values; the grid search tries every combination.

fmod_param = dict(
    kernel=["linear", "rbf"],
    gamma=["scale", "auto"],
    C=[2.0, 4.0],
    epsilon=[0.2, 0.4],
)

In [None]:
# Exhaustive search over fmod_param for SVR, scored with 5-fold cross-validation.
GSCV = GridSearchCV(SVR(), fmod_param, cv=5)

In [None]:
# Run the search on the training part of the notebook-level split only.
GSCV.fit(X_train,Y_train)

In [None]:
# Notebook display: the best parameter combination found by the search.
GSCV.best_params_

In [None]:
# Train the final SVR and report its R2 (as a percentage) on the held-out split.
# NOTE(review): the hyper-parameters are hard-coded - presumably copied from
# GSCV.best_params_; re-check them whenever the grid or the data changes.
Final_Model = SVR(C=2.0, epsilon=0.4, gamma="auto", kernel="rbf")
Model_Training = Final_Model.fit(X_train, Y_train)
fmod_pred = Final_Model.predict(X_test)
fmod_r2 = r2_score(Y_test, fmod_pred, multioutput='variance_weighted')*100
print("R2 score for the Best Model is:", fmod_r2)

# Pre processing the Testing Dataset to predict the Sales column

In [None]:
# Work on a copy so the originally loaded testing data stays unmodified.
testing_dataset = testing_dataset_original.copy()
testing_dataset

In [None]:
# filling missing data in the testing dataset for column "Item_Weight"
# Each missing weight takes the mean weight recorded for the same
# Item_Identifier, or the overall column mean when the item has none.

item_weight_mean = testing_dataset.pivot_table(values = "Item_Weight", index = 'Item_Identifier')
missing_data = testing_dataset['Item_Weight'].isnull()
# `item in item_weight_mean` tests the pivot table's *columns*, so the
# per-item branch could never run. Series.map looks each identifier up in the
# pivot table's index and yields NaN for identifiers it does not hold.
per_item_weight = testing_dataset['Item_Identifier'].map(item_weight_mean['Item_Weight'])
overall_weight_mean = testing_dataset['Item_Weight'].mean()
# One .loc assignment replaces the chained df[col][i] = ... writes, which may
# land on a temporary copy instead of the DataFrame itself.
testing_dataset.loc[missing_data, 'Item_Weight'] = per_item_weight[missing_data].fillna(overall_weight_mean)

# filling missing data in the testing dataset for column "Outlet_Size"
# Lookup table: one row ('Outlet_Size') with one column per Outlet_Type,
# holding the most frequent size for that outlet type.

outlet_size_mode = testing_dataset.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
missing_val = testing_dataset['Outlet_Size'].isnull()
# Map each row's Outlet_Type to its modal size (scalar per row).
testing_dataset.loc[missing_val, 'Outlet_Size'] = testing_dataset.loc[missing_val, 'Outlet_Type'].map(outlet_size_mode.loc['Outlet_Size'])

# filling zero values in the testing dataset for column "Item_Visibility"
# Assigned back explicitly: replace(..., inplace=True) on the object returned
# by .loc may modify a temporary copy and leave the DataFrame unchanged.

testing_dataset['Item_Visibility'] = testing_dataset['Item_Visibility'].replace(0, testing_dataset['Item_Visibility'].mean())

In [None]:
# Clubbing similar data rows together for column "Item_Fat_Content" using replace

fat_content_aliases = {'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'}
testing_dataset['Item_Fat_Content'] = testing_dataset['Item_Fat_Content'].replace(fat_content_aliases)
testing_dataset['Item_Fat_Content'].value_counts()

In [None]:
# Creating a new column named "New_Item_Type" and adding proper categories using map
# The category is derived from the first two characters of the Item_Identifier.

item_type_by_prefix = {'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'}
identifier_prefix = testing_dataset['Item_Identifier'].apply(lambda code: code[:2])
testing_dataset['New_Item_Type'] = identifier_prefix.map(item_type_by_prefix)
testing_dataset['New_Item_Type'].value_counts()

In [None]:
# Relabelling the existing "Item_Fat_Content" column: non-consumable items get 'Non-Edible'

is_non_consumable = testing_dataset['New_Item_Type'] == 'Non-Consumable'
testing_dataset.loc[is_non_consumable, 'Item_Fat_Content'] = 'Non-Edible'
testing_dataset['Item_Fat_Content'].value_counts()

In [None]:
# Creating a new column named "Outlet_Years" and removing the column "Outlet_Establishment_Year" that was used to derive it

establishment_year = testing_dataset['Outlet_Establishment_Year']
testing_dataset['Outlet_Years'] = 2013 - establishment_year
testing_dataset = testing_dataset.drop(columns="Outlet_Establishment_Year")

In [None]:
# Dropping the ID columns "Item_Identifier" and "Outlet_Identifier"

testing_dataset = testing_dataset.drop(columns=["Item_Identifier", "Outlet_Identifier"])

In [None]:
# Label Encoder
# NOTE(review): a fresh encoder is fitted on the testing data, so the integer
# codes only match the ones used in training if both splits contain exactly
# the same set of Item_Type values - confirm.

label_encoder = LabelEncoder()
col_name = ["Item_Type"]
for col in col_name:
    testing_dataset[col] = label_encoder.fit_transform(testing_dataset[col])

# One Hot Encoder
# Same categorical columns as were one-hot encoded for the training data.
testing_dataset = pd.get_dummies(testing_dataset, columns=['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type'])

In [None]:
# Feature Scaling on all testing dataset rows
# The model was trained on features standardised with the statistics of the
# training data, so the testing features must go through that same fitted
# `scaler`; fitting a new StandardScaler here would shift and rescale them
# differently from what the model saw during training.

# Align with the training feature columns (same set, same order); a one-hot
# column that has no rows in the testing data is filled with 0.
testing_dataset = testing_dataset.reindex(columns=X.columns, fill_value=0)
testing_dataset = pd.DataFrame(scaler.transform(testing_dataset), columns=testing_dataset.columns)

# Prediction result

In [None]:
# Predicting the Item_Outlet_Sales from the feature columns of our Testing dataset

Predicted_Sales = Final_Model.predict(testing_dataset)

# Reversing the Log Transformation that was performed on the Target column while training the ML Model
# The target was transformed as log(1 + sales), so the inverse is
# exp(prediction) - 1; a plain exp() would overstate every prediction by 1.

Predicted_Sales = np.expm1(Predicted_Sales)
Predicted_Sales

# Converting the sales output back in CSV format

In [None]:
# Wrap the predictions in a single-column DataFrame ready for export.
predicted_output = pd.DataFrame({'Item_Outlet_Sales': Predicted_Sales})
predicted_output

In [None]:
# Write the predictions to a CSV file in the working directory, without the row index.
predicted_output.to_csv("Predicted_Sales_Data.csv", index=False)