# Basic EDA - "Black Friday Sales" Dataset

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
# Silence all warnings for the rest of the session so library notices do not
# clutter cell output.
# NOTE(review): with no category given this also hides deprecation and
# runtime warnings — consider narrowing the filter if that matters here.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Pandas display configuration, kept in one table so every tuned option is
# visible at a glance: option name -> value applied for this session.
display_options = {
    'display.min_rows': 5,
    'display.max_rows': 25,
    'display.precision': 4,
}

for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## 2. Import Dataset

In [3]:
# Read the training CSV once, then explore a deep copy so that `data`
# keeps the frame exactly as it was loaded.
data = pd.read_csv(filepath_or_buffer='bfs_train.csv')
df = data.copy(deep=True)

## 3. Basic EDA

### 3.1 Sample of the Dataset

In [4]:
# Preview five randomly drawn rows. The fixed random_state makes the draw
# repeatable, so a fresh Restart & Run All shows the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
101425,1003685,P00041842,M,26-35,5,C,1,1,6,8.0,,12420
505991,1005952,P00128042,F,46-50,1,C,4+,1,1,5.0,18.0,15617
374683,1003690,P00300642,M,18-25,0,B,2,0,5,14.0,,3539
139055,1003504,P00357442,M,18-25,7,B,1,0,5,8.0,,5144
96524,1002951,P00032842,F,36-45,14,B,0,1,8,,,9795


### 3.2 Shape of the Dataset

In [5]:
# Report the frame's dimensions: rows are observations, columns are features.
row_total, column_total = df.shape
print(f'No. of Observations (rows) : {row_total}')
print(f'No. of Features (columns)  : {column_total}')

No. of Observations (rows) : 550068
No. of Features (columns)  : 12


### 3.3 Dataset Information

In [6]:
# Print a per-column overview (non-null count and dtype for each feature)
# together with the frame's index range and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


### 3.4 Checking Any Missing Values in the Dataset

In [7]:
# Missing-value summary: one row per feature with the absolute number of
# missing entries and that number as a percentage of all rows.
miss_cnt = (
    df.isna()
    .sum()
    # Name the index and the value column directly instead of renaming the
    # implicit 'index' / 0 labels produced by a bare reset_index().
    .rename_axis('Features')
    .reset_index(name='Missing Count')
)

# Vectorised column arithmetic replaces the per-element Python lambda.
miss_cnt['Missing PCT'] = miss_cnt['Missing Count'] / len(df) * 100
miss_cnt

Unnamed: 0,Features,Missing Count,Missing PCT
0,User_ID,0,0.0
1,Product_ID,0,0.0
2,Gender,0,0.0
3,Age,0,0.0
4,Occupation,0,0.0
5,City_Category,0,0.0
6,Stay_In_Current_City_Years,0,0.0
7,Marital_Status,0,0.0
8,Product_Category_1,0,0.0
9,Product_Category_2,173638,31.5666


### 3.5 Checking Unique Values for Individual Features

In [8]:
# Cardinality summary: one row per feature with the number of distinct
# values and that number as a percentage of all rows.
unq_cnt = (
    df.nunique()
    # Name the index and the value column directly instead of renaming the
    # implicit 'index' / 0 labels produced by a bare reset_index().
    .rename_axis('Features')
    .reset_index(name='Unique Values')
)

# Vectorised column arithmetic replaces the per-element Python lambda.
unq_cnt['Unique Values PCT'] = unq_cnt['Unique Values'] / len(df) * 100
unq_cnt

Unnamed: 0,Features,Unique Values,Unique Values PCT
0,User_ID,5891,1.071
1,Product_ID,3631,0.6601
2,Gender,2,0.0004
3,Age,7,0.0013
4,Occupation,21,0.0038
5,City_Category,3,0.0005
6,Stay_In_Current_City_Years,5,0.0009
7,Marital_Status,2,0.0004
8,Product_Category_1,20,0.0036
9,Product_Category_2,17,0.0031


### 3.6 Checking Any Duplicate Records

In [9]:
# Count the rows flagged by DataFrame.duplicated() (all columns compared).
duplicate_rows = df.duplicated().sum()
print(f'Total No. of Duplicated Observations in the Dataset : {duplicate_rows}')

Total No. of Duplicated Observations in the Dataset : 0


### 3.7 Generating Statistical Summary on Numerical Columns

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric-dtype columns.
# NOTE(review): this includes columns that look like identifiers or coded
# categories (e.g. User_ID, Occupation, Marital_Status); their mean/std are
# probably not meaningful quantities — confirm before interpreting them.
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550070.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003000.0,8.0767,0.4097,5.4043,9.8423,12.6682,9263.9687
std,1727.6,6.5227,0.4918,3.9362,5.0866,4.1253,5023.0654
min,1000000.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001500.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003100.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004500.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006000.0,20.0,1.0,20.0,18.0,18.0,23961.0


### 3.8 Features Specific to a Datatype

In [11]:
# Look up the per-column dtypes once and report how many distinct dtypes
# occur and which ones they are.
column_dtypes = df.dtypes
print(f'Number of Unique Data Types : {column_dtypes.nunique()}')
print(f'Unique Data Types : {column_dtypes.unique().tolist()}')

Number of Unique Data Types : 3
Unique Data Types : [dtype('int64'), dtype('O'), dtype('float64')]


In [12]:
# Group the column names by storage dtype. The same selection runs once per
# dtype and the three resulting lists are unpacked in matching order.
fea_flo, fea_int, fea_obj = [
    df.select_dtypes(include=dtype_label).columns.tolist()
    for dtype_label in ('float64', 'int64', 'object')
]

# For each group, print the number of features followed by their names.
print(f'Float Type Features    : {len(fea_flo)} \n{fea_flo} \n\n')
print(f'Integer Type Features  : {len(fea_int)} \n{fea_int} \n\n')
print(f'Object Type Features   : {len(fea_obj)} \n{fea_obj}')

Float Type Features    : 2 
['Product_Category_2', 'Product_Category_3'] 


Integer Type Features  : 5 
['User_ID', 'Occupation', 'Marital_Status', 'Product_Category_1', 'Purchase'] 


Object Type Features   : 5 
['Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
