In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
%cd "/gdrive/MyDrive/BlackBelt/DLFundamentals/009A_Assignment_1_BigMart/"
!pwd

/gdrive/MyDrive/BlackBelt/DLFundamentals/009A_Assignment_1_BigMart
/gdrive/MyDrive/BlackBelt/DLFundamentals/009A_Assignment_1_BigMart


# Steps to build a Neural Network using Keras

<ol>1. Loading the dataset</ol>
<ol>2. Pre-process the data: impute the missing values, normalize the variables,etc </ol>
<ol>3. Creating training and validation set</ol>
<ol>4. Defining the architecture of the model</ol>
<ol>5. Compiling the model (defining loss function, optimizer)</ol>
<ol>6. Training the model</ol>
<ol>7. Evaluating model performance on training and validation set</ol>
<ol>8. Do the same pre-processing steps on the test set as you did for the training set.</ol>
<ol>9. Generate the predictions for the test set using the trained model.</ol>
<ol>10. Save the predictions in a csv file (to check the format, refer to the sample submission file provided on the problem page.</ol>
<ol>11. Submit your predictions on the problem page and check your rank on the leaderboard.</ol>

## 1. Loading the dataset

In [3]:
# importing the required libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# loading the pre-processed dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train['source']='train'
test['source']='test'
data = pd.concat([train, test],ignore_index=True)
print(train.head())
print(test.head())
print(train.shape, test.shape, data.shape)

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.000000   
4           NCD19         8.93          Low Fat         0.000000   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  182.0950            OUT010   
4              Household   53.8614            OUT013   

   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type  \
0                       1999      Medium               Tier 1   
1                       2009      Medium               Tier 3   
2                       1999      Medium               Tier

In [5]:
# looking at the first five rows of the data
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [6]:
data.shape

(14204, 13)

In [7]:
# checking missing values in the data
data.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [8]:
# data types of the variables
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
source                        object
dtype: object

## 2. Filling the missing values

### Categorical Data: Mode

In [9]:
# filling missing values of categorical variables with mode
data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0], inplace=True)

In [10]:
#Fill missing values of weight of Item According to means of Item Identifier
data['Item_Weight']=data['Item_Weight'].fillna(data.groupby('Item_Identifier')['Item_Weight'].transform('mean'))

In [11]:
data['Item_Weight']=data['Item_Weight'].fillna(data.groupby('Item_Type')['Item_Weight'].transform('mean'))

## 3. Feature Engineering

In [12]:
# Create a broad category of Type of Item
#Get the first two characters of ID:
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])
#Rename them to more intuitive categories:
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',
                                                             'NC':'Non-Consumable',
                                                             'DR':'Drinks'})
data['Item_Type_Combined'].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

In [13]:
#Years:
data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']
data['Outlet_Years'].describe()

count    14204.000000
mean        15.169319
std          8.371664
min          4.000000
25%          9.000000
50%         14.000000
75%         26.000000
max         28.000000
Name: Outlet_Years, dtype: float64

In [14]:
# Create a new mapping (dictionary) 
mapping = {
    'Low Fat' : 'LF',
    'Regular' : 'R',
    'LF' : 'LF',
    'reg': 'R',
    'low fat' : 'LF'
}
# use the  map function to update the values
data.Item_Fat_Content = data.Item_Fat_Content.map(mapping)

In [15]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Type_Combined,Outlet_Years
0,FDA15,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train,Food,14
1,DRC01,5.92,R,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks,4
2,FDN15,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train,Food,14
3,FDX07,19.2,R,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,train,Food,15
4,NCD19,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-Consumable,26


## 4. Encoding Categorical Variables

In [16]:
#One Hot Coding:
data['Outlet_Identifier_1'] = data['Outlet_Identifier']
data = pd.get_dummies(data, columns=['Item_Fat_Content','Item_Type','Outlet_Identifier_1', 'Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_Type_Combined'])

In [17]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,source,Outlet_Years,Item_Fat_Content_LF,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Item_Type_Combined_Drinks,Item_Type_Combined_Food,Item_Type_Combined_Non-Consumable
0,FDA15,9.3,0.016047,249.8092,OUT049,1999,3735.138,train,14,1,...,1,0,0,0,1,0,0,0,1,0
1,DRC01,5.92,0.019278,48.2692,OUT018,2009,443.4228,train,4,0,...,0,0,1,0,0,1,0,1,0,0
2,FDN15,17.5,0.01676,141.618,OUT049,1999,2097.27,train,14,1,...,1,0,0,0,1,0,0,0,1,0
3,FDX07,19.2,0.0,182.095,OUT010,1998,732.38,train,15,0,...,0,0,1,1,0,0,0,0,1,0
4,NCD19,8.93,0.0,53.8614,OUT013,1987,994.7052,train,26,1,...,0,0,1,0,1,0,0,0,0,1


In [18]:
data.shape

(14204, 50)

In [19]:
data.dtypes

Item_Identifier                       object
Item_Weight                          float64
Item_Visibility                      float64
Item_MRP                             float64
Outlet_Identifier                     object
Outlet_Establishment_Year              int64
Item_Outlet_Sales                    float64
source                                object
Outlet_Years                           int64
Item_Fat_Content_LF                    uint8
Item_Fat_Content_R                     uint8
Item_Type_Baking Goods                 uint8
Item_Type_Breads                       uint8
Item_Type_Breakfast                    uint8
Item_Type_Canned                       uint8
Item_Type_Dairy                        uint8
Item_Type_Frozen Foods                 uint8
Item_Type_Fruits and Vegetables        uint8
Item_Type_Hard Drinks                  uint8
Item_Type_Health and Hygiene           uint8
Item_Type_Household                    uint8
Item_Type_Meat                         uint8
Item_Type_

In [20]:
# Drop the columns which have been converted to different types:
data.drop(['Outlet_Establishment_Year'],axis=1,inplace=True)

#Divide into test and train:
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

#Drop unnecessary columns:
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)



#Export files as modified versions:
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
