## Import modules

In [2]:
# Import the pandas library, often used for data manipulation and analysis
import pandas as pd

# Import the numpy library, used for numerical operations
import numpy as np

# Import the seaborn library, a data visualization library based on matplotlib
import seaborn as sns

# Import the pyplot module from the matplotlib library for plotting
import matplotlib.pyplot as plt

# Import the warnings module to manage warnings in the code
import warnings

# Ensure that matplotlib plots are displayed inline in Jupyter Notebooks
%matplotlib inline

# Silence every warning raised for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (for example
# chained-assignment warnings) that can point at real bugs; consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')


## Loading the dataset

In [7]:
# Path to Train.csv, relative to the directory the notebook is run from
train_csv = 'Big-Mart-Sales-Prediction/Train.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(train_csv)

# Preview the first five rows to check the column names and value formats
df.head(5)


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [9]:
# Data type and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [11]:
# Count the distinct values in every column of the dataset.
# DataFrame.nunique is the built-in for this; dropna=False keeps NaN as a
# value of its own, exactly like len(x.unique()) did, so a column with
# missing data reports one more "value" than its value_counts() shows.
df.nunique(dropna=False)


Item_Identifier              1559
Item_Weight                   416
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     4
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

## Preprocessing the dataset

In [13]:
# Count the missing entries in each column:
# isna() marks every cell as missing (True) or present (False), and summing
# those flags column by column gives the number of missing cells per column
df.isna().sum()


Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [19]:
# Collect the categorical attributes: every column whose dtype is 'object'
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Show the resulting list of column names
cat_col


['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [20]:
# Drop the two identifier columns from the list of categorical columns.
# The list is rebuilt with a filter instead of calling list.remove(), so the
# cell can be re-run safely: list.remove() raises ValueError once the names
# have already been removed by a previous run.
cat_col = [
    name
    for name in cat_col
    if name not in ('Item_Identifier', 'Outlet_Identifier')
]

# Display the updated list of categorical columns
cat_col


['Item_Fat_Content',
 'Item_Type',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [22]:
# For every categorical column, print its name followed by the frequency of
# each distinct value, then a blank line to separate it from the next column
for col in cat_col:
    print(col, df[col].value_counts(), sep='\n', end='\n\n')


Item_Fat_Content
Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

Item_Type
Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64

Outlet_Size
Outlet_Size
Medium    2793
Small     2388
High       932
Name: count, dtype: int64

Outlet_Location_Type
Outlet_Location_Type
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: count, dtype: int64

Outlet_Type
Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2   

In [24]:
# Fill the missing values

# Build a lookup table of the average Item_Weight per item:
# rows are grouped by 'Item_Identifier' and the 'Item_Weight' values of each
# group are averaged ('mean' is pivot_table's default aggregation; it is
# spelled out here for clarity)
item_weight_mean = df.pivot_table(
    index='Item_Identifier',
    values='Item_Weight',
    aggfunc='mean',
)

# Show the lookup table (indexed by Item_Identifier, one 'Item_Weight' column)
item_weight_mean


Unnamed: 0_level_0,Item_Weight
Item_Identifier,Unnamed: 1_level_1
DRA12,11.600
DRA24,19.350
DRA59,8.270
DRB01,7.390
DRB13,6.115
...,...
NCZ30,6.590
NCZ41,19.850
NCZ42,10.500
NCZ53,9.600


In [26]:
# Boolean mask over df's rows: True where 'Item_Weight' is missing,
# False where a weight is present
miss_bool = df['Item_Weight'].isna()

# Show the mask
miss_bool


0       False
1       False
2       False
3       False
4       False
        ...  
8518    False
8519    False
8520    False
8521    False
8522    False
Name: Item_Weight, Length: 8523, dtype: bool

In [27]:
# Impute the missing 'Item_Weight' values in one vectorized pass.
#
# Each row without a weight receives the mean weight recorded for its
# 'Item_Identifier' in item_weight_mean; if that identifier has no entry in
# the table, the mean of all observed weights is used instead.
#
# The previous row-by-row loop wrote through chained indexing
# (df['Item_Weight'][i] = ...), which pandas does not guarantee to reach the
# original DataFrame (it is a no-op under Copy-on-Write), and it recomputed
# the overall mean on every iteration, so the fallback value drifted as rows
# were filled.

# Rows that still lack a weight; computed here so re-running the cell is a no-op
weight_missing = df['Item_Weight'].isnull()

# Fallback value: mean of the weights that are actually present
# (Series.mean skips NaN)
overall_weight_mean = df['Item_Weight'].mean()

# Look up the per-item mean for every affected row; identifiers that are not
# in the pivot table map to NaN and then take the overall mean
imputed_weight = (
    df.loc[weight_missing, 'Item_Identifier']
    .map(item_weight_mean['Item_Weight'])
    .fillna(overall_weight_mean)
)

# A single .loc assignment writes into df itself (no chained indexing)
df.loc[weight_missing, 'Item_Weight'] = imputed_weight


In [28]:
# Count the rows whose 'Item_Weight' is still missing:
# isna() flags each missing value as True and sum() counts those flags
df['Item_Weight'].isna().sum()


0