In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor

# Problem Statement

The data scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. There are multiple attributes included in the dataset for different products and stores. BigMart's goal is to build a machine learning model that can predict the sales of each product at a particular store.

Using this model, BigMart will try to understand the properties of products and stores which play a key role in increasing sales.

In order to start, we need to figure out the essential features that have the most effect on the sales of a particular product.

# Hypothesis Generation

After exploring the data by eye in Excel, the following attributes look like potential features that will affect product sales:

**The Store Related:**
* Outlet_Establishment_Year: People are likely to buy from an older store that they know
* Outlet_Location_Type: Stores that are located in popular areas are more likely to have high sales
* Outlet_Size: Larger stores tend to have more space to display products, resulting in more advertising

**The Product Related:**
* Item_Visibility: Items that are more visible will sell more
* Item_Type: People may prefer one type over the others
* Item_MRP: People tend to spend less, so the better the price matches the product's value, the more it will sell

# Data Exploration

In [None]:
## load the training and test splits from the Kaggle input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/bignmart/Train.csv",
                     "/kaggle/input/bignmart/Test.csv")
)

## (number of records, number of attributes) for each split
train.shape, test.shape

In [None]:
## list the column labels of both frames so that each attribute can be described below
train.columns, test.columns

**Here's a description of these columns**

* **Item_Identifier**: Unique product ID
* **Item_Weight**: Weight of product
* **Item_Fat_Content**: Whether the product is low fat or not
* **Item_Visibility**: The % of total display area of product in store
* **Item_Type**: The category to which the product belongs
* **Item_MRP**: Maximum Retail Price of the product
* **Outlet_Identifier**: Unique store ID
* **Outlet_Establishment_Year**: The year the store was established
* **Outlet_Size**: The size of the store (Area covered)
* **Outlet_Location_Type**: The type of city in which the store is located
* **Outlet_Type**: Whether the store is a grocery store or a supermarket
* **Item_Outlet_Sales**: Sales of the product in the particular store. This is to be predicted.

In [None]:
## inspect the dtype pandas inferred for every column to tell the
## categorical attributes (object) apart from the numerical ones (e.g. float64)
train.dtypes

In [None]:
## look at the first rows of the data
## NOTE: .loc slices by label and includes the end label, so with the default
## integer index this shows rows 0 through 10 (11 rows), not 10
train.loc[:10]

In [None]:
## summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
train.describe()

**Some Comments**
* The minimum value of Item_Visibility is zero, which makes no sense: if a product is being sold in a store, it has to be displayed in some dedicated area.
* The minimum value of Item_Outlet_Sales is not zero, thankfully; otherwise we would have to deal with it using imputation methods.

In [None]:
## count the missing values in each column
## (no trailing comma: the cell result is the Series itself, not a 1-tuple wrapping it)
train.isnull().sum()

**Some Comments**
* The Item_Weight and Outlet_Size attributes have many missing values. A likely explanation is that some stores do not report all of their data due to technical glitches.

In [None]:
# Print how often each distinct value occurs, one column at a time.
for column_name in train.columns:
    print('\n%s column: ' % column_name)
    print(train[column_name].value_counts())

**Some Comments**
* The Item_Fat_Content column needs to be cleaned up because low-fat products are expressed in 3 ways: low fat, Low Fat, LF. The same applies to regular (Regular, reg), so we need to unify each into a single form.

In [None]:
# Scatter each numeric attribute against the sales target,
# one panel per attribute in a 3-row, 1-column grid.
for position, feature in enumerate(["Item_Weight", "Item_Visibility", "Item_MRP"], start=1):
    plt.subplot(3, 1, position)
    sns.scatterplot(x=train[feature], y=train["Item_Outlet_Sales"])

# Add spacing between the panels and extend the subplot area past the default top edge.
plt.subplots_adjust(wspace=0.5, hspace=0.4, top=1.5)

In [None]:
# Box plot of the sales target grouped by each categorical attribute,
# one panel per attribute in a 3-row, 1-column grid.
for position, feature in enumerate(["Item_Fat_Content", "Item_Type", "Outlet_Identifier"], start=1):
    plt.subplot(3, 1, position)
    sns.boxplot(x=train[feature], y=train["Item_Outlet_Sales"])

# Add spacing between the panels and extend the subplot area past the default top edge.
plt.subplots_adjust(wspace=0.5, hspace=0.4, top=1.5)

In [None]:
# Box plot of the sales target grouped by each outlet attribute,
# one panel per attribute in a 3-row, 1-column grid.
for position, feature in enumerate(["Outlet_Size", "Outlet_Location_Type", "Outlet_Type"], start=1):
    plt.subplot(3, 1, position)
    sns.boxplot(x=train[feature], y=train["Item_Outlet_Sales"])

# Add spacing between the panels and extend the subplot area past the default top edge.
plt.subplots_adjust(wspace=0.5, hspace=0.4, top=1.5)

**Based on that, the features we'll be using are the following:**
* Item_MRP
* Item_Type
* Outlet_Identifier
* Outlet_Size
* Outlet_Location_Type
* Outlet_Type

# Fixing the data

In [None]:
## collapse the alternate spellings of each fat-content label into one canonical
## form: every low fat variant becomes 'Low Fat', every regular variant 'Regular'
fat_content_aliases = {'LF': 'Low Fat',
                       'reg': 'Regular',
                       'low fat': 'Low Fat'}

## apply the same mapping to both splits
for frame in (train, test):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
## Imputation statistics are computed from the training data only and then
## applied to both splits, so a missing value gets the same fill value whether
## the row is in train or test, and nothing about the test set leaks into
## preprocessing.
weight_fill = train['Item_Weight'].mean()        # mean of the observed weights
size_fill = train['Outlet_Size'].mode()[0]       # most frequently appearing size

## imputing missing weight data with the training mean
train['Item_Weight'] = train['Item_Weight'].fillna(weight_fill)
test['Item_Weight'] = test['Item_Weight'].fillna(weight_fill)

## imputing missing outlet_size data with the training mode
train['Outlet_Size'] = train['Outlet_Size'].fillna(size_fill)
test['Outlet_Size'] = test['Outlet_Size'].fillna(size_fill)

## checking if there's any more null values
train.isnull().sum()

In [None]:
## A visibility of exactly 0 is treated as a missing value and replaced with a mean.
## The mean is taken over the non-zero training values only: including the zero
## placeholders would bias the fill value low. The same training statistic is
## applied to both splits so they are filled consistently.
visible_mean = train.loc[train['Item_Visibility'] != 0, 'Item_Visibility'].mean()

train['Item_Visibility'] = train['Item_Visibility'].replace({0: visible_mean})
test['Item_Visibility'] = test['Item_Visibility'].replace({0: visible_mean})

## sanity check: the smallest test visibility should no longer be 0
test["Item_Visibility"].min()

In [None]:
## one hot encoding: expand each categorical column listed below into
## indicator columns, using the same column list for both splits
categorical_columns = ['Item_Type', 'Outlet_Location_Type', 'Outlet_Size', 'Outlet_Type',
                       'Outlet_Identifier']

train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

# Model Creation

In [None]:
## list the columns (and their dtypes) now present in the training frame,
## to pick the feature names used for modelling
train.dtypes

**Getting training data**

In [None]:
# Model inputs: the item price plus the one-hot indicator columns for item type,
# outlet location tier, outlet size, outlet type and outlet identifier.
feature_columns = [
    'Item_MRP',
    'Item_Type_Baking Goods',
    'Item_Type_Breads',
    'Item_Type_Breakfast',
    'Item_Type_Canned',
    'Item_Type_Dairy',
    'Item_Type_Frozen Foods',
    'Item_Type_Fruits and Vegetables',
    'Item_Type_Hard Drinks',
    'Item_Type_Health and Hygiene',
    'Item_Type_Household',
    'Item_Type_Meat',
    'Item_Type_Others',
    'Item_Type_Seafood',
    'Item_Type_Snack Foods',
    'Item_Type_Soft Drinks',
    'Item_Type_Starchy Foods',
    'Outlet_Location_Type_Tier 1',
    'Outlet_Location_Type_Tier 2',
    'Outlet_Location_Type_Tier 3',
    'Outlet_Size_High',
    'Outlet_Size_Medium',
    'Outlet_Size_Small',
    'Outlet_Type_Grocery Store',
    'Outlet_Type_Supermarket Type1',
    'Outlet_Type_Supermarket Type2',
    'Outlet_Type_Supermarket Type3',
    'Outlet_Identifier_OUT010',
    'Outlet_Identifier_OUT013',
    'Outlet_Identifier_OUT017',
    'Outlet_Identifier_OUT018',
    'Outlet_Identifier_OUT019',
    'Outlet_Identifier_OUT027',
    'Outlet_Identifier_OUT035',
    'Outlet_Identifier_OUT045',
    'Outlet_Identifier_OUT046',
    'Outlet_Identifier_OUT049',
]

# Feature matrix, with columns in the order listed above.
xtrain = train[feature_columns]

# Target column the model learns to predict.
ytrain = train['Item_Outlet_Sales']


**Creating the model**

In [None]:
# Build a decision tree regressor with a fixed random_state and fit it on the
# training features and target; fit() returns the estimator itself.
mymodel = DecisionTreeRegressor(random_state=1).fit(xtrain, ytrain)

# Show the fitted estimator as the cell output.
mymodel

In [None]:
# Select exactly the columns the model was trained on, in the same order.
# Deriving the list from xtrain (instead of repeating the 36 names by hand)
# keeps the training and prediction feature sets from drifting apart.
# A KeyError here means the test frame lacks one of the training columns.
xtest = test[list(xtrain.columns)]

In [None]:
# Predict sales for every test row and store them next to the test attributes.
predicted_sales = mymodel.predict(xtest)
test['Item_Outlet_Sales'] = predicted_sales

# Preview the first rows, including the new prediction column.
test.head()