# BASIC PRE-PROCESSING - {"BIGMART SALES" DATASET}

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from pickle import dump, load

In [2]:
# Pandas display options (they change how frames are rendered, not the data)

pd.set_option('display.min_rows', 5)     # rows shown once a frame is truncated
pd.set_option('display.max_rows', 25)    # frames longer than this are truncated
pd.set_option('display.precision', 4)    # decimal places shown for floats

## 2. Import Dataset

### 2.1 Train Dataset

In [3]:
# Load the initial train split from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only read files produced by this project.
tr = pd.read_pickle('bms_train_init.pkl')

In [4]:
tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8323 entries, 2171 to 760
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8323 non-null   object 
 1   Item_Weight                6896 non-null   float64
 2   Item_Fat_Content           8323 non-null   object 
 3   Item_Visibility            8323 non-null   float64
 4   Item_Type                  8323 non-null   object 
 5   Item_MRP                   8323 non-null   float64
 6   Outlet_Identifier          8323 non-null   object 
 7   Outlet_Establishment_Year  8323 non-null   int64  
 8   Outlet_Size                5966 non-null   object 
 9   Outlet_Location_Type       8323 non-null   object 
 10  Outlet_Type                8323 non-null   object 
 11  Item_Outlet_Sales          8323 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 845.3+ KB


In [5]:
tr.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
2171,FDP28,13.65,Regular,0.0808,Frozen Foods,262.8936,OUT049,1999,Medium,Tier 1,Supermarket Type1,4958.8784
5657,FDK14,6.98,Low Fat,0.0412,Canned,82.8934,OUT045,2002,,Tier 2,Supermarket Type1,818.934
2156,FDX50,20.1,Low Fat,0.0746,Dairy,110.3228,OUT046,1997,Small,Tier 1,Supermarket Type1,1768.3648
110,FDD03,13.3,Low Fat,0.0798,Dairy,232.53,OUT046,1997,Small,Tier 1,Supermarket Type1,699.09
6709,NCV05,,Low Fat,0.0301,Health and Hygiene,154.3656,OUT027,1985,Medium,Tier 3,Supermarket Type3,2471.4496


## 3. Pre-Processing Level - II, After Data Assessment Process

### 0. Pre_Computed Values
#### - Pre_Computed values are computed separately, once, and then applied to the Train, Validation, and Test datasets.
#### - Pre_Computed values created using Train dataset, must be used in pipeline later to transform Train, Validation, and Test datasets
#### - Learning from Train and applying on Train, Validation, Test datasets

##### Note: Maps are created separately by first creating 'Item_Category' feature on Train dataset

In [6]:
# # Creating 'Item_Category' to calculate imputation values below

# tr['Item_Category'] = tr['Item_Identifier'].str[:2]
# # replacing created categories with more appropriate names
# tr['Item_Category'] = tr['Item_Category'].replace({'FD':'Foods', 'DR':'Drinks', 'NC':'Non Consumables'}) 


# # --------------------------------------ITEM_WEIGHT ------------------------------------------------------------------------------
# # if the item is present in the train dataset, use the mapping ('Item_Identifier' - 'Item_Weight') to impute the missing value,
# # else, use the median weight of the 'Item_Category' for imputation.

# # mapping for existing item in DS
# itid_weight = tr.loc[:,['Item_Identifier','Item_Weight']].drop_duplicates().dropna().sort_values(by=['Item_Identifier'])
# itid_weight_map = dict(zip(itid_weight['Item_Identifier'],itid_weight['Item_Weight']))
# dump(itid_weight_map, open('itid_wt_map_1.pkl','wb'))

# # mapping for new item in DS
# itca_weight = tr.pivot_table(index='Item_Category', values='Item_Weight', aggfunc='median').reset_index()   
# itca_weight_map = dict(zip(itca_weight['Item_Category'],itca_weight['Item_Weight']))
# print(itca_weight_map)
# dump(itca_weight_map, open('itid_wt_map_2.pkl','wb'))


# # ---------------------------------------OUTLET SIZE -----------------------------------------------------------------------------
# ot_os_mode = tr.pivot_table(index='Outlet_Type', values='Outlet_Size', aggfunc=(lambda x: x.mode()[0])).reset_index()
# ot_os_mode_map = dict(zip(ot_os_mode['Outlet_Type'],ot_os_mode['Outlet_Size']))
# print(ot_os_mode_map)
# dump(ot_os_mode_map, open('ot_os_mode_map.pkl','wb'))

##### Note: 'Item_Visibility' values of 0 are to be replaced with the mean 'Item_Visibility' of the Train dataset (as visibility can't be 0).

In [7]:
# mean_val = tr['Item_Visibility'].mean()
# dump(mean_val, open('it_vis_mean.pkl','wb'))

### 3.1 Creating new high level feature "Item_Category" from the existing feature "Item_Identifier"

In [8]:
def create_item_category(df):
    """Add an 'Item_Category' column derived from the 'Item_Identifier' prefix.

    The first two characters of every identifier are taken as a category code;
    the codes 'FD', 'DR' and 'NC' are then renamed to 'Foods', 'Drinks' and
    'Non Consumables'. Any other code is kept as it is.

    Args:
        df: DataFrame holding a string 'Item_Identifier' column. It is
            modified in place.

    Returns:
        The same DataFrame object, now carrying 'Item_Category'.
    """
    print('Creating new high level feature "Item_Category" from "Item_Identifier" ... \n')

    readable_names = {'FD': 'Foods', 'DR': 'Drinks', 'NC': 'Non Consumables'}

    # two-character identifier prefix, swapped for a readable label where one exists
    code = df['Item_Identifier'].str.slice(stop=2)
    df['Item_Category'] = code.replace(readable_names)

    return df

### 3.2 Missing value imputation for features "Item_Weight" and "Outlet_Size"

In [9]:
def mvi_item_weight(dff):
    """Impute missing 'Item_Weight' values from two pickled lookup maps.

    Missing weights are filled in two passes:
      1. by 'Item_Identifier', using the map stored in 'itid_wt_map_1.pkl';
      2. for anything still missing, by 'Item_Category', using the map stored
         in 'itid_wt_map_2.pkl'.

    Args:
        dff: DataFrame with 'Item_Weight', 'Item_Identifier' and
            'Item_Category' columns. It is modified in place.

    Returns:
        The same DataFrame object with 'Item_Weight' imputed where a map
        entry was available.

    Raises:
        FileNotFoundError: if either map file is absent from the working
            directory.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load map
    # files that were produced by this project.
    # Context managers close the handles deterministically (the bare
    # load(open(...)) form left them to the garbage collector).
    with open('itid_wt_map_1.pkl', 'rb') as map_file:
        itid_weight_map = load(map_file)
    with open('itid_wt_map_2.pkl', 'rb') as map_file:
        itca_weight_map = load(map_file)

    # pass 1: the item is known -> use its recorded weight
    dff.loc[:, 'Item_Weight'] = dff.loc[:, 'Item_Weight'].fillna(dff.loc[:, 'Item_Identifier'].map(itid_weight_map))
    # pass 2: the item is new -> fall back to the value stored for its category
    dff.loc[:, 'Item_Weight'] = dff.loc[:, 'Item_Weight'].fillna(dff.loc[:, 'Item_Category'].map(itca_weight_map))

    return dff
    
def mvi_outlet_size(dff):
    """Impute missing 'Outlet_Size' values from a pickled 'Outlet_Type' map.

    Args:
        dff: DataFrame with 'Outlet_Size' and 'Outlet_Type' columns. It is
            modified in place.

    Returns:
        The same DataFrame object; rows whose 'Outlet_Type' has no entry in
        the map keep their missing 'Outlet_Size'.

    Raises:
        FileNotFoundError: if 'ot_os_mode_map.pkl' is absent from the working
            directory.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load map
    # files that were produced by this project.
    # The context manager closes the handle deterministically (the bare
    # load(open(...)) form left it to the garbage collector).
    with open('ot_os_mode_map.pkl', 'rb') as map_file:
        ot_os_mode_map = load(map_file)

    dff.loc[:, 'Outlet_Size'] = dff.loc[:, 'Outlet_Size'].fillna(dff.loc[:, 'Outlet_Type'].map(ot_os_mode_map))

    return dff

def mvi(trf):
    """Impute 'Item_Weight' and 'Outlet_Size', reporting missing counts.

    Prints the number of missing values in both columns, runs
    mvi_item_weight followed by mvi_outlet_size, then prints the counts again.

    Args:
        trf: DataFrame to impute; it is handed to both imputation helpers.

    Returns:
        The DataFrame returned by the second imputation helper.
    """
    imputed_cols = ['Item_Weight', 'Outlet_Size']

    print('Missing values before imputation : ')
    print(trf[imputed_cols].isna().sum(), '\n\n')

    # item weight first, outlet size second
    trf = mvi_outlet_size(mvi_item_weight(trf))

    print('Missing values after imputation : ')
    print(trf[imputed_cols].isna().sum())

    return trf

### 3.3 Correcting "Item_Fat_Content" values as per the context

In [10]:
def replace_item_fat_content(df):
    """Clean up the 'Item_Fat_Content' labels.

    Two corrections are applied, in this order:
      1. rows whose 'Item_Category' is 'Non Consumables' get the label
         'Non Edible';
      2. the spelling variants 'LF' and 'low fat' become 'Low Fat', and 'reg'
         becomes 'Regular'.

    The label distribution (in percent) is printed before and after.

    Args:
        df: DataFrame with 'Item_Fat_Content' and 'Item_Category' columns.
            It is modified in place.

    Returns:
        The same DataFrame object with corrected labels.
    """
    print(f'Feature values and Distribution of "Item_Fat_Content" : \n {df["Item_Fat_Content"].value_counts(normalize=True)*100} \n')


    print('Correcting the "Item_Fat_Content" values, w.r.t "Item_Category" value "Non Consumables" ... as "Non Edible" \n ')
    filt = (df['Item_Category'] == 'Non Consumables')
    df.loc[filt, 'Item_Fat_Content'] = 'Non Edible'

    # --------------------------------------------------------------------------------------------------------------------------------

    print('Remapping the feature values as per the Data Assessment ... \n')
    # Assign the result back to the column. Calling replace(..., inplace=True)
    # on df['Item_Fat_Content'] is chained assignment through an inplace
    # method: under pandas Copy-on-Write it updates a temporary object and
    # leaves df untouched.
    df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF':'Low Fat','low fat':'Low Fat','reg':'Regular'})


    print(f'Feature values and Distribution of "Item_Fat_Content" : \n {df["Item_Fat_Content"].value_counts(normalize=True)*100} \n')

    return df

### 3.4 Correcting "Item_Type" values for "Foods" and "Drinks" of "Dairy" type

In [11]:
def replace_item_type(df):
    """Split the 'Dairy' item type by item category.

    Rows with 'Item_Type' == 'Dairy' are relabelled 'Dairy Foods' when their
    'Item_Category' is 'Foods' and 'Dairy Drinks' when it is 'Drinks'. Rows of
    any other category are left alone. Progress and row counts are printed,
    with categories handled in their order of first appearance.

    Args:
        df: DataFrame with 'Item_Category' and 'Item_Type' columns. It is
            modified in place.

    Returns:
        The same DataFrame object with the corrected 'Item_Type' labels.
    """
    category_col = df['Item_Category']
    affected = (category_col.eq('Foods') | category_col.eq('Drinks')) & df['Item_Type'].eq('Dairy')
    print(f'No of observations to be corrected : {int(affected.sum())} ... \n')


    print('Correcting the "Item_Type"="Dairy" --->>> "Dairy Foods", "Dairy Drinks" ... \n')

    # (category, corrected label) pairs, checked in this order for every category
    relabel_rules = (('Foods', 'Dairy Foods'), ('Drinks', 'Dairy Drinks'))

    for present in df['Item_Category'].unique().tolist():
        for category, corrected in relabel_rules:
            if present == category:
                print(f'{present} .....')
                # mask is taken before the write, so it also gives the corrected row count
                rows = df['Item_Category'].eq(category) & df['Item_Type'].eq('Dairy')
                df.loc[rows, 'Item_Type'] = corrected
                print(f'No of observations corrected to "{corrected}" : {int(rows.sum())} \n')

    return df

### 3.5 Creating new feature "Outlet_Age" from the existing "Outlet_Establishment_Year" 

In [12]:
def create_outlet_age(df, reference_year=2013):
    """Add an 'Outlet_Age' column computed from 'Outlet_Establishment_Year'.

    Args:
        df: DataFrame with a numeric 'Outlet_Establishment_Year' column. It
            is modified in place.
        reference_year: Year the age is measured against. Defaults to 2013,
            the value this notebook has always used (presumably the year the
            data was collected -- TODO confirm).

    Returns:
        The same DataFrame object, now carrying 'Outlet_Age'
        (reference_year - establishment year).
    """
    print('Creating new feature "Outlet_Age" from "Outlet_Establishment_Year" : \n')

    df['Outlet_Age'] = reference_year - df['Outlet_Establishment_Year']

    return df

### 3.6 Correcting 'Item_Visibility' values. If 0, replace with Mean

In [13]:
def check_replace_item_vis(df):
    """Replace 'Item_Visibility' values equal to 0 with a pickled mean value.

    Args:
        df: DataFrame with an 'Item_Visibility' column. It is modified in
            place.

    Returns:
        The same DataFrame object with every 0 visibility replaced by the
        value stored in 'it_vis_mean.pkl'.

    Raises:
        FileNotFoundError: if 'it_vis_mean.pkl' is absent from the working
            directory.
    """
    print('Correcting "Item_Visibility" values by replacing with mean, if 0 ...')

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were produced by this project.
    # The context manager closes the handle deterministically (the bare
    # load(open(...)) form left it to the garbage collector).
    with open('it_vis_mean.pkl', 'rb') as mean_file:
        mean_val = load(mean_file)

    df['Item_Visibility'] = df['Item_Visibility'].replace({0: mean_val})

    return df

## 4. PreProcess Dataset

### 4.1 PreProcess Pipeline

In [14]:
def preprocess_dataset(ppdf):
    """Run every pre-processing step on one dataset, in a fixed order.

    Steps: create 'Item_Category', impute 'Item_Weight' / 'Outlet_Size',
    correct the 'Item_Fat_Content' labels, correct the 'Dairy' 'Item_Type'
    labels, create 'Outlet_Age', and correct 'Item_Visibility'. A separator
    line is printed between consecutive steps.

    Args:
        ppdf: DataFrame to pre-process; it is passed through each step in turn.

    Returns:
        The DataFrame returned by the last step.
    """
    separator = '\n --------------------------------------------------------------------------------------------------------------- \n'

    pipeline = (
        create_item_category,        # create 'Item_Category'
        mvi,                         # impute missing values in 'Item_Weight', 'Outlet_Size'
        replace_item_fat_content,    # correct 'Item_Fat_Content' labels
        replace_item_type,           # correct 'Item_Type' labels ('Dairy')
        create_outlet_age,           # create 'Outlet_Age'
        check_replace_item_vis,      # correct 'Item_Visibility'
    )

    for position, step in enumerate(pipeline):
        if position > 0:
            # separator goes between steps, never after the last one
            print(separator)
        ppdf = step(ppdf)

    return ppdf

### 4.2 PreProcess Train Dataset

In [15]:
# Run the full pre-processing pipeline on the train split and rebind the result to `tr`.
tr = preprocess_dataset(tr)

Creating new high level feature "Item_Category" from "Item_Identifier" ... 


 --------------------------------------------------------------------------------------------------------------- 

Missing values before imputation : 
Item_Weight    1427
Outlet_Size    2357
dtype: int64 


Missing values after imputation : 
Item_Weight    0
Outlet_Size    0
dtype: int64

 --------------------------------------------------------------------------------------------------------------- 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat    59.5939
Regular    34.0262
LF          3.6886
reg         1.3817
low fat     1.3096
Name: proportion, dtype: float64 

Correcting the "Item_Fat_Content" values, w.r.t "Item_Category" value "Non Consumables" ... as "Non Edible" 
 
Remapping the feature values as per the Data Assessment ... 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat       45.8609
Regular       35.4079
Non Edible    18.731

In [16]:
tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8323 entries, 2171 to 760
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8323 non-null   object 
 1   Item_Weight                8323 non-null   float64
 2   Item_Fat_Content           8323 non-null   object 
 3   Item_Visibility            8323 non-null   float64
 4   Item_Type                  8323 non-null   object 
 5   Item_MRP                   8323 non-null   float64
 6   Outlet_Identifier          8323 non-null   object 
 7   Outlet_Establishment_Year  8323 non-null   int64  
 8   Outlet_Size                8323 non-null   object 
 9   Outlet_Location_Type       8323 non-null   object 
 10  Outlet_Type                8323 non-null   object 
 11  Item_Outlet_Sales          8323 non-null   float64
 12  Item_Category              8323 non-null   object 
 13  Outlet_Age                 8323 non-null   int64  


In [17]:
tr.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Category,Outlet_Age
2171,FDP28,13.65,Regular,0.0808,Frozen Foods,262.8936,OUT049,1999,Medium,Tier 1,Supermarket Type1,4958.8784,Foods,14
5657,FDK14,6.98,Low Fat,0.0412,Canned,82.8934,OUT045,2002,Small,Tier 2,Supermarket Type1,818.934,Foods,11
2156,FDX50,20.1,Low Fat,0.0746,Dairy Foods,110.3228,OUT046,1997,Small,Tier 1,Supermarket Type1,1768.3648,Foods,16
110,FDD03,13.3,Low Fat,0.0798,Dairy Foods,232.53,OUT046,1997,Small,Tier 1,Supermarket Type1,699.09,Foods,16
6709,NCV05,10.1,Non Edible,0.0301,Health and Hygiene,154.3656,OUT027,1985,Medium,Tier 3,Supermarket Type3,2471.4496,Non Consumables,28


### 4.3 PreProcess Validation Dataset

In [18]:
# Load the initial validation split from its pickle file.
va = pd.read_pickle('bms_valid_init.pkl')

# Preview the first rows before pre-processing.
va.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
945,FDO10,13.65,Regular,0.0213,Snack Foods,58.3588,OUT010,1998,,Tier 3,Grocery Store,114.5176
1794,FDF16,7.3,Low Fat,0.0861,Frozen Foods,149.8076,OUT013,1987,High,Tier 3,Supermarket Type1,2808.3444
3022,FDA04,11.3,Regular,0.0667,Frozen Foods,257.2962,OUT046,1997,Small,Tier 1,Supermarket Type1,4920.9278
23,FDC37,,Low Fat,0.0576,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
2601,FDJ16,9.195,LF,0.1149,Frozen Foods,59.0246,OUT035,2004,Small,Tier 2,Supermarket Type1,1853.5872


In [19]:
# Apply the same pipeline (it reads the pickled imputation maps and mean) to the validation split.
va = preprocess_dataset(va)

# Preview the first rows after pre-processing.
va.head()

Creating new high level feature "Item_Category" from "Item_Identifier" ... 


 --------------------------------------------------------------------------------------------------------------- 

Missing values before imputation : 
Item_Weight    22
Outlet_Size    24
dtype: int64 


Missing values after imputation : 
Item_Weight    0
Outlet_Size    0
dtype: int64

 --------------------------------------------------------------------------------------------------------------- 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat    61.0
Regular    30.0
LF          7.0
reg         1.0
low fat     1.0
Name: proportion, dtype: float64 

Correcting the "Item_Fat_Content" values, w.r.t "Item_Category" value "Non Consumables" ... as "Non Edible" 
 
Remapping the feature values as per the Data Assessment ... 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat       47.0
Regular       31.0
Non Edible    22.0
Name: proportion, dtype: f

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Category,Outlet_Age
945,FDO10,13.65,Regular,0.0213,Snack Foods,58.3588,OUT010,1998,Small,Tier 3,Grocery Store,114.5176,Foods,15
1794,FDF16,7.3,Low Fat,0.0861,Frozen Foods,149.8076,OUT013,1987,High,Tier 3,Supermarket Type1,2808.3444,Foods,26
3022,FDA04,11.3,Regular,0.0667,Frozen Foods,257.2962,OUT046,1997,Small,Tier 1,Supermarket Type1,4920.9278,Foods,16
23,FDC37,15.5,Low Fat,0.0576,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876,Foods,28
2601,FDJ16,9.195,Low Fat,0.1149,Frozen Foods,59.0246,OUT035,2004,Small,Tier 2,Supermarket Type1,1853.5872,Foods,9


### 4.4 PreProcess Test Dataset

In [20]:
# Load the initial test split from its pickle file.
te = pd.read_pickle('bms_test_init.pkl')

# Preview the first rows before pre-processing.
te.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
2412,NCQ06,13.0,Low Fat,0.0418,Household,254.5014,OUT035,2004,Small,Tier 2,Supermarket Type1,4845.0266
4984,FDC26,10.195,Low Fat,0.1264,Canned,112.1886,OUT046,1997,Small,Tier 1,Supermarket Type1,667.1316
953,FDZ23,17.75,Regular,0.113,Baking Goods,185.424,OUT010,1998,,Tier 3,Grocery Store,745.696
5100,NCE19,8.97,Low Fat,0.0932,Household,55.7956,OUT049,1999,Medium,Tier 1,Supermarket Type1,1037.3164
4297,FDC14,14.5,Regular,0.069,Canned,41.4454,OUT010,1998,,Tier 3,Grocery Store,41.9454


In [21]:
# Apply the same pipeline (it reads the pickled imputation maps and mean) to the test split.
te = preprocess_dataset(te)

# Preview the first rows after pre-processing.
te.head()

Creating new high level feature "Item_Category" from "Item_Identifier" ... 


 --------------------------------------------------------------------------------------------------------------- 

Missing values before imputation : 
Item_Weight    14
Outlet_Size    29
dtype: int64 


Missing values after imputation : 
Item_Weight    0
Outlet_Size    0
dtype: int64

 --------------------------------------------------------------------------------------------------------------- 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat    68.0
Regular    27.0
LF          2.0
low fat     2.0
reg         1.0
Name: proportion, dtype: float64 

Correcting the "Item_Fat_Content" values, w.r.t "Item_Category" value "Non Consumables" ... as "Non Edible" 
 
Remapping the feature values as per the Data Assessment ... 

Feature values and Distribution of "Item_Fat_Content" : 
 Item_Fat_Content
Low Fat       54.0
Regular       28.0
Non Edible    18.0
Name: proportion, dtype: f

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Category,Outlet_Age
2412,NCQ06,13.0,Non Edible,0.0418,Household,254.5014,OUT035,2004,Small,Tier 2,Supermarket Type1,4845.0266,Non Consumables,9
4984,FDC26,10.195,Low Fat,0.1264,Canned,112.1886,OUT046,1997,Small,Tier 1,Supermarket Type1,667.1316,Foods,16
953,FDZ23,17.75,Regular,0.113,Baking Goods,185.424,OUT010,1998,Small,Tier 3,Grocery Store,745.696,Foods,15
5100,NCE19,8.97,Non Edible,0.0932,Household,55.7956,OUT049,1999,Medium,Tier 1,Supermarket Type1,1037.3164,Non Consumables,14
4297,FDC14,14.5,Regular,0.069,Canned,41.4454,OUT010,1998,Small,Tier 3,Grocery Store,41.9454,Foods,15


## 5. Saving the Pre-Processed Data to CSV File and PKL File

In [22]:
def train_valid_test_save_CSV_PKL(trdf, vadf, tedf):
    """Write the three pre-processed splits to CSV and pickle files.

    Each split is saved twice in the current working directory: as a CSV
    without the index column, and as a pickle.

    Args:
        trdf: Pre-processed train DataFrame   -> 'bms_train_pp.csv' / '.pkl'.
        vadf: Pre-processed validation DataFrame -> 'bms_valid_pp.csv' / '.pkl'.
        tedf: Pre-processed test DataFrame    -> 'bms_test_pp.csv' / '.pkl'.

    Returns:
        None. A confirmation message is printed once all six files exist.
    """
    targets = (
        (trdf, 'bms_train_pp.csv', 'bms_train_pp.pkl'),
        (vadf, 'bms_valid_pp.csv', 'bms_valid_pp.pkl'),
        (tedf, 'bms_test_pp.csv', 'bms_test_pp.pkl'),
    )

    for frame, csv_path, pkl_path in targets:
        frame.to_csv(csv_path, index=False)
        frame.to_pickle(pkl_path)

    print('Train, Validation, and Test files created successfully ...')

In [23]:
# Persist the three pre-processed splits as CSV and pickle files in the working directory.
train_valid_test_save_CSV_PKL(tr, va, te)

Train, Validation, and Test files created successfully ...
