# BASIC EDA - {"BIGMART SALES" DATASET} 

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Pandas display settings: rows shown once a frame is truncated, the row
# count above which truncation starts, and decimals shown for floats.

pd.options.display.min_rows = 5
pd.options.display.max_rows = 25
pd.options.display.precision = 4

## 2. Import Dataset

### 2.1 Train Dataset

In [3]:
# `data` keeps the training CSV exactly as loaded; `df` is an independent
# deep copy, so changes made to `df` do not touch `data`.
data = pd.read_csv('bms_train.csv')
df = data.copy(deep=True)

### 2.2 Test Dataset

In [4]:
# Load the test split into its own frame and report its (rows, columns) shape.
dfte = pd.read_csv('bms_test.csv')

print(f'Shape of the Test dataset : {dfte.shape}')

Shape of the Test dataset : (5681, 11)


## 3. Basic EDA

### 3.1 Sample of the Dataset

In [5]:
# Show five randomly chosen rows. The fixed seed makes the same rows appear
# on every fresh-kernel run, so the displayed sample is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7809,FDM08,10.1,Regular,0.0897,Fruits and Vegetables,225.5088,OUT010,1998,,Tier 3,Grocery Store,1342.2528
5027,FDC45,,Low Fat,0.1351,Fruits and Vegetables,170.3106,OUT027,1985,Medium,Tier 3,Supermarket Type3,3422.212
2786,FDG26,18.85,Low Fat,0.0428,Canned,254.433,OUT018,2009,Medium,Tier 3,Supermarket Type2,2050.664
5310,DRI11,8.26,Low Fat,0.0576,Hard Drinks,113.7834,OUT010,1998,,Tier 3,Grocery Store,115.1834
8011,FDV48,9.195,Regular,0.0518,Baking Goods,77.4644,OUT018,2009,Medium,Tier 3,Supermarket Type2,1414.1592


### 3.2 Shape of the Dataset

In [6]:
# df.shape is (rows, columns); report each dimension on its own line.
print(f'No. of Observations (rows) : {df.shape[0]}')
print(f'No. of Features (columns)  : {df.shape[1]}')

No. of Observations (rows) : 8523
No. of Features (columns)  : 12


### 3.3 Dataset Information

In [7]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


### 3.4 Checking Any Missing Values in the Dataset

In [8]:
# One row per column of df: how many values are missing and what share of
# all rows that represents.
miss_cnt = (
    df.isna()
    .sum()
    .rename_axis('Features')              # name the index so it becomes the 'Features' column
    .reset_index(name='Missing Count')
)
# Vectorized percentage of rows with a missing value (no per-element lambda).
miss_cnt['Missing PCT'] = miss_cnt['Missing Count'] / len(df) * 100
miss_cnt

Unnamed: 0,Features,Missing Count,Missing PCT
0,Item_Identifier,0,0.0
1,Item_Weight,1463,17.1653
2,Item_Fat_Content,0,0.0
3,Item_Visibility,0,0.0
4,Item_Type,0,0.0
5,Item_MRP,0,0.0
6,Outlet_Identifier,0,0.0
7,Outlet_Establishment_Year,0,0.0
8,Outlet_Size,2410,28.2764
9,Outlet_Location_Type,0,0.0


### 3.5 Checking Unique Values for Individual Features

In [9]:
# One row per column of df: number of distinct (non-null) values and that
# number expressed as a percentage of the row count.
unq_cnt = (
    df.nunique()
    .rename_axis('Features')              # name the index so it becomes the 'Features' column
    .reset_index(name='Unique Values')
)
# Vectorized percentage relative to the number of rows (no per-element lambda).
unq_cnt['Unique Values PCT'] = unq_cnt['Unique Values'] / len(df) * 100
unq_cnt

Unnamed: 0,Features,Unique Values,Unique Values PCT
0,Item_Identifier,1559,18.2917
1,Item_Weight,415,4.8692
2,Item_Fat_Content,5,0.0587
3,Item_Visibility,7880,92.4557
4,Item_Type,16,0.1877
5,Item_MRP,5938,69.6703
6,Outlet_Identifier,10,0.1173
7,Outlet_Establishment_Year,9,0.1056
8,Outlet_Size,3,0.0352
9,Outlet_Location_Type,3,0.0352


### 3.6 Checking Any Duplicate Records

In [10]:
# duplicated() flags rows that repeat an earlier row in full; sum() counts them.
print(f'Total No. of Duplicated Observations in the Dataset : {df.duplicated().sum()}')

Total No. of Duplicated Observations in the Dataset : 0


### 3.7 Generating Statistical Summary on Numerical Columns

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.8576,0.0661,140.9928,1997.8319,2181.2889
std,4.6435,0.0516,62.2751,8.3718,1706.4996
min,4.555,0.0,31.29,1985.0,33.29
25%,8.7737,0.027,93.8265,1987.0,834.2474
50%,12.6,0.0539,143.0128,1999.0,1794.331
75%,16.85,0.0946,185.6437,2004.0,3101.2964
max,21.35,0.3284,266.8884,2009.0,13086.9648


### 3.8 Features Specific to a Datatype

In [12]:
# Count and list the distinct column dtypes present in df.
print(f'Number of Unique Data Types : {df.dtypes.nunique()}')
print(f'Unique Data Types : {df.dtypes.unique().tolist()}')

Number of Unique Data Types : 3
Unique Data Types : [dtype('O'), dtype('float64'), dtype('int64')]


In [13]:
# Collect the column names of each dtype group and report every group with
# its size.
fea_int = df.select_dtypes(include='int64').columns.tolist()
fea_flo = df.select_dtypes(include='float64').columns.tolist()
fea_obj = df.select_dtypes(include='object').columns.tolist()

print(f'Integer Type Features  : {len(fea_int)} \n{fea_int} \n\n')
print(f'Float Type Features    : {len(fea_flo)} \n{fea_flo} \n\n')
# Label fixed: this group holds the object-dtype columns, not the float ones.
print(f'Object Type Features   : {len(fea_obj)} \n{fea_obj}')

Integer Type Features  : 1 
['Outlet_Establishment_Year'] 


Float Type Features    : 4 
['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales'] 


Float Type Features    : 7 
['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
