## Exploratory Data Analysis of Supermart Grocery Sales

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.io as pio
import os
# List every file that is available under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the Supermart Grocery Sales dataset into a DataFrame.
DATA_PATH = (
    '../input/supermart-grocery-sales-retail-analytics-dataset/'
    'Supermart Grocery Sales - Retail Analytics Dataset.csv'
)
df = pd.read_csv(DATA_PATH)
# Last expression of the cell: renders a preview of the loaded frame.
df

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

## Let's check whether any invalid data, such as negative values, is present in the sales, discount and profit data

In [None]:
# True if at least one Sales value is zero or negative.
(df['Sales'].values <= 0).any()

In [None]:
# A notebook cell only displays the value of its last expression, so the
# Profit check is printed explicitly; otherwise its result is silently lost.
print('Any Profit <= 0:', (df['Profit'].values <= 0.00).any())
# Last expression of the cell: True if any Discount value is zero or negative.
(df['Discount'].values <= 0.00).any()

## Let's see the regions of the sales

In [None]:
# Distinct values that occur in the Region column.
df['Region'].unique()

In [None]:
# Number of rows recorded for each Region.
df['Region'].value_counts()

**As there is only one sale in the North region, this may be a mistake, so that row is dropped below.**

In [None]:
# Remove the rows whose Region is 'North', modifying df in place.
is_north = df['Region'] == 'North'
df.drop(index=df[is_north].index, inplace=True)

In [None]:
# Switch Plotly's default template to the dark theme.
pio.templates.default = "plotly_dark"
# One bar per Region; with `y` given, px.histogram aggregates Sales per bar.
fig = px.histogram(
    df,
    x="Region",
    y="Sales",
    color="Region",
    title="Sales by the Regions",
    width=500,
)
fig.show()

## Total sales by categories

In [None]:
# One bar per product Category, aggregating the Sales column.
fig = px.histogram(
    df,
    x="Category",
    y="Sales",
    color="Category",
    title="Sales by the Product Categories",
)
fig.show()

## Total sales and profit by category and sub category

In [None]:
# Sum Profit and Sales for every (Category, Sub Category) pair, then order
# the rows by Category and, within a category, by ascending Sales.
sale_profit = (
    pd.pivot_table(
        data=df,
        index=['Category', 'Sub Category'],
        values=['Profit', 'Sales'],
        aggfunc='sum',
    )
    .reset_index()
    .sort_values(by=['Category', 'Sales'], ascending=True)
)
sale_profit

## Total sales in sub categories

In [None]:
# One bar per Sub Category, drawn from the pre-aggregated sale_profit table.
fig = px.histogram(
    sale_profit,
    x="Sub Category",
    y="Sales",
    color="Sub Category",
    title="Sales by the Product sub categories",
)
fig.show()

## Profit Margin in each sub category

In [None]:
# Total Sales and Profit per Sub Category.
profit = pd.pivot_table(
    data=df,
    index='Sub Category',
    values=['Sales', 'Profit'],
    aggfunc='sum',
).reset_index()
# Margin = profit as a percentage of sales, rounded to two decimals.
profit['Profit Margin'] = (profit['Profit'] / profit['Sales'] * 100).round(2)
# Highest-margin sub categories first.
profit = profit.sort_values(by='Profit Margin', ascending=False)
profit

## Yearly sales and orders

In [None]:
# Parse the Order Date column once and derive both calendar parts from it.
# NOTE(review): parsing relies on pandas' date inference - confirm that
# 'Order Date' uses a single, consistent format.
order_dates = pd.DatetimeIndex(df['Order Date'])
df['Order Year'] = order_dates.year
df['Order month'] = order_dates.month

In [None]:
# Per-year summary: number of orders plus total Sales and Profit.
yearly_aggs = {'Order ID': 'count', 'Sales': 'sum', 'Profit': 'sum'}
year_sale = pd.pivot_table(
    data=df,
    index='Order Year',
    values=list(yearly_aggs),
    aggfunc=yearly_aggs,
).reset_index()
year_sale


In [None]:
# Switch Plotly's default template to the white theme.
pio.templates.default = "plotly_white"
# Sales aggregated per Order Year, one colour per year.
fig = px.histogram(
    df,
    x="Order Year",
    y="Sales",
    color="Order Year",
    title="Sales by the Year",
    width=500,
)
fig.show()

In [None]:
# Sales aggregated per Order month, one colour per month.
fig = px.histogram(
    df,
    x="Order month",
    y="Sales",
    color="Order month",
    title="Sales by the Months",
    width=500,
)
fig.show()