Maud Lecerf | AI for Retail Consumer Goods

# Challenge 1 - POS analysis 


## Imports (files and packages)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.stats import gaussian_kde

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
DeprecationWarning



In [3]:
pos = pd.read_csv('POS_data.csv')
supermarket = pd.read_csv('supermarket_POS_data.csv')

## Initial Analysis & Cleaning

In [4]:
pos.head()

Unnamed: 0,Date,Time,Transaction,Item
0,10/30/2016,9:58:11,1,Bread
1,10/30/2016,10:05:34,2,Scandinavian
2,10/30/2016,10:05:34,2,Scandinavian
3,10/30/2016,10:07:57,3,Hot chocolate
4,10/30/2016,10:07:57,3,Jam


In [5]:
supermarket.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [6]:
pos.shape

(21293, 4)

In [7]:
supermarket.shape

(1000, 17)

### Changing Column names 
To be able to manipulate and claculate more easily, and make sure they all follow the same format

In [8]:
pos.columns = [col.replace(" ", "_").lower() for col in pos.columns]

In [9]:
pos.columns

Index(['date', 'time', 'transaction', 'item'], dtype='object')

In [10]:
supermarket.columns = [col.replace(" ", "_").lower() for col in supermarket.columns]


In [11]:
supermarket.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5%', 'total', 'date',
       'time', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
       'rating'],
      dtype='object')

### Null Values

In [12]:
pos.isna().sum()

date           0
time           0
transaction    0
item           0
dtype: int64

In [13]:
supermarket.isna().sum()

invoice_id                 0
branch                     0
city                       0
customer_type              0
gender                     0
product_line               0
unit_price                 0
quantity                   0
tax_5%                     0
total                      0
date                       0
time                       0
payment                    0
cogs                       0
gross_margin_percentage    0
gross_income               0
rating                     0
dtype: int64

No null values, no need for cleaning on this part. 

### Values types 

In [14]:
pos.dtypes

date           object
time           object
transaction     int64
item           object
dtype: object

In [15]:
# changing the date and time to a datetime object 
pos['datetime'] = pd.to_datetime(pos['date'].astype(str) + ' ' + pos['time'].astype(str))


In [16]:
#removing the date and time columns as they are now redundant 
pos = pos.drop(columns = ['date', 'time'], axis=1)

In [17]:
supermarket.dtypes

invoice_id                  object
branch                      object
city                        object
customer_type               object
gender                      object
product_line                object
unit_price                 float64
quantity                     int64
tax_5%                     float64
total                      float64
date                        object
time                        object
payment                     object
cogs                       float64
gross_margin_percentage    float64
gross_income               float64
rating                     float64
dtype: object

In [18]:
#changing the date and time to a datetime object as well
supermarket['datetime'] = pd.to_datetime(supermarket['date'].astype(str) + ' ' + supermarket['time'].astype(str))

In [19]:
#removing the date and time columns as they are now redundant 
supermarket = supermarket.drop(columns = ['date', 'time'], axis=1)

### Dates Dataframes

Creating dataframes with date-based indexes to be able to do analysis based on dates. 

In [20]:
pos_dt = pos.copy()
pos_dt.set_index('datetime', inplace=True)

In [21]:
#addding date-related columns for visuals later 
pos_dt['year'] = pos_dt.index.year
pos_dt['month'] = pos_dt.index.month_name()
pos_dt['day_of_month'] = pos_dt.index.day
pos_dt['weekday'] = pos_dt.index.day_name()
pos_dt['date'] = pos_dt.index.date
pos_dt['hour_of_day'] = pos_dt.index.hour

In [22]:
supermarket_dt = supermarket.copy()
supermarket_dt.set_index('datetime', inplace=True)

In [23]:
#addding date-related columns for visuals later 
supermarket_dt['day_of_month'] = supermarket_dt.index.day
supermarket_dt['month'] = supermarket_dt.index.month_name()
supermarket_dt['weekday'] = supermarket_dt.index.day_name()
supermarket_dt['date'] = supermarket_dt.index.date
supermarket_dt['hour_of_day'] = supermarket_dt.index.hour

### Non-date related dataframes 

In [24]:
#putting transaction as the index of the original pos dataset as it is already a unique identifier
pos.set_index('transaction', inplace=True)  

In [25]:
pos.head()

Unnamed: 0_level_0,item,datetime
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Bread,2016-10-30 09:58:11
2,Scandinavian,2016-10-30 10:05:34
2,Scandinavian,2016-10-30 10:05:34
3,Hot chocolate,2016-10-30 10:07:57
3,Jam,2016-10-30 10:07:57


In [26]:
#putting the invoice number as the index 
supermarket.set_index('invoice_id', inplace=True) 

In [27]:
supermarket.head()

Unnamed: 0_level_0,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,payment,cogs,gross_margin_percentage,gross_income,rating,datetime
invoice_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,Ewallet,465.76,4.761905,23.288,8.4,2019-01-27 20:33:00
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,Ewallet,604.17,4.761905,30.2085,5.3,2019-02-08 10:37:00


## Exploratory Data Analysis

### POS Data EDA

#### All Items

We are looking at the number of items bought. 

In [72]:
px.bar(pos['item'].value_counts(), template='plotly_dark', title='Count of each Item Bought')

In [29]:
# Group by year and month, and count the number of items
items_monthly_counts = (pos_dt.groupby(['year', 'month']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
items_monthly_counts['date'] = pd.to_datetime(
    items_monthly_counts['year'].astype(str) + '-' + items_monthly_counts['month'].astype(str))

# Plot the data using px
px.bar( items_monthly_counts, x='date', y='count', template='plotly_dark', title='Monthly Sales (in number of items)', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Let's look at why we have such low values for the months of october and april

In [30]:
# Group by year and month, and count the number of items
items_daily_counts = (pos_dt.groupby(['date']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
items_daily_counts['date'] = pd.to_datetime(items_daily_counts['date'].astype(str))

# Plot the data using px
px.line( items_daily_counts, x='date', y='count', template='plotly_dark', title='Daily Sales (in number of items)', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})

By looking at coffee sales daily, we see that the low values in october 2016 and april 2017 are due to the dataset starting at the end of october and finishing beginning of april. Therefore we don't have data for all those months and cannot properly compare. 

Let's now look at values by day of the week.

In [31]:
items_avg_by_weekday = (pos_dt.groupby('weekday').size().groupby(level=0).mean().reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( items_avg_by_weekday, x='weekday', y='average_count', template='plotly_dark', 
       title='Average Item Sales by Day of the Week', labels={'weekday': 'Day of the Week', 'average_count': 'Average Count'})

In [32]:
items_avg_by_hour = (pos_dt.groupby('hour_of_day').size().groupby(level=0).mean().reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( items_avg_by_hour, x='hour_of_day', y='average_count', template='plotly_dark', 
       title='Average Item Sales by Day of the Week', labels={'hour_of_day': 'Hour of the Day', 'average_count': 'Average Count'})

In [33]:
# Group by weekday and hour_of_day, and calculate the count of all items
hourly_avg_by_day = ( pos_dt.groupby(['weekday', 'hour_of_day']).size().reset_index(name='item_count'))

# Plot the data using Plotly Express
fig = px.line(hourly_avg_by_day, x='hour_of_day', y='item_count', color='weekday', 
              title='Hourly Item Count by Day of the Week', template='plotly_dark',
              labels={'hour_of_day': 'Hour of the Day', 'item_count': 'Item Count', 'weekday': 'Day of the Week'})

# Customize the x-axis to show labels for every hour
fig.update_layout(xaxis=dict(tickmode='array', tickvals=list(range(24)), ticktext=[f"{hour:02d}:00" for hour in range(24)]))
fig.show()


It makes sense that most sales happen in between 8 am and 9 pm, probably matching with the store opening hours. 
Furthermore it also makes sense that most sales happen on saturday and sundays, when people usually do their shopping. 

#### Coffee Data
As coffee is the item most sold, let's look at the selling patterns of this item first. 

In [34]:
# Filter for Coffee items
coffee_data = pos_dt[pos_dt['item'] == 'Coffee']

# Group by year and month, and count the number of items
coffee_monthly_counts = (coffee_data.groupby(['year', 'month']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
coffee_monthly_counts['date'] = pd.to_datetime(
    coffee_monthly_counts['year'].astype(str) + '-' + coffee_monthly_counts['month'].astype(str))

# Plot the data using px
px.bar( coffee_monthly_counts, x='date', y='count', template='plotly_dark', title='Monthly Coffee Sales', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



As for the total number of items, we have a similar distribution for coffee. 

In [35]:
# Filter for Coffee items
coffee_data = pos_dt[pos_dt['item'] == 'Coffee']

# Group by year and month, and count the number of items
coffee_daily_counts = (coffee_data.groupby(['date']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
coffee_daily_counts['date'] = pd.to_datetime(coffee_daily_counts['date'].astype(str))

# Plot the data using px
px.line( coffee_daily_counts, x='date', y='count', template='plotly_dark', title='Daily Coffee Sales', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})

As we don't have values for a whole year, it might be more interesting to look at weekly and hourly changes and values in the POS data. 

In [36]:
# Filter for Coffee and calculate the average count by day of the week
coffee_avg_by_weekday = (pos_dt[pos_dt['item'] == 'Coffee'].groupby('weekday').size().groupby(level=0).mean()
                         .reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( coffee_avg_by_weekday, x='weekday', y='average_count', template='plotly_dark', 
       title='Average Coffee Sales by Day of the Week', labels={'weekday': 'Day of the Week', 'average_count': 'Average Count'})

In [37]:
# Filter for Coffee and calculate the average count by day of the week
coffee_avg_by_hour = (pos_dt[pos_dt['item'] == 'Coffee'].groupby('hour_of_day').size().groupby(level=0).mean()
                         .reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( coffee_avg_by_hour, x='hour_of_day', y='average_count', template='plotly_dark', 
       title='Average Coffee Sales by Hour of the Day', labels={'weekday': 'Hour of the Day', 'average_count': 'Average Count'})

#### Bread Data 
As bread is the second most sold item, let's analyze it and see if it's seasonality is different from Coffe and Total Sales.

In [38]:
# Filter for Bread items
bread_data = pos_dt[pos_dt['item'] == 'Bread']

# Group by year and month, and count the number of items
bread_monthly_counts = (bread_data.groupby(['year', 'month']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
bread_monthly_counts['date'] = pd.to_datetime(
    bread_monthly_counts['year'].astype(str) + '-' + bread_monthly_counts['month'].astype(str))

# Plot the data using px
px.bar( bread_monthly_counts, x='date', y='count', template='plotly_dark', title='Monthly Bread Sales', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



As for the total number of items and coffe, we have a similar distribution for bread. 

In [39]:
# Filter for Bread items
bread_data = pos_dt[pos_dt['item'] == 'Bread']

# Group by year and month, and count the number of items
bread_daily_counts = (bread_data.groupby(['date']).size().reset_index(name='count'))

# Create a column for full date (e.g., '2025-01')
bread_daily_counts['date'] = pd.to_datetime(bread_daily_counts['date'].astype(str))

# Plot the data using px
px.line( bread_daily_counts, x='date', y='count', template='plotly_dark', title='Daily Bread Sales', 
       labels={'date': 'Month-Year', 'count': 'Number of Items Sold'})

In the graph above, we see more low values in november and january than for coffee, we have a bigger ranges of values on those months. 

As we don't have values for a whole year, it might be more interesting to look at weekly and hourly changes and values in the POS data. 

In [40]:
# Filter for Bread and calculate the average count by day of the week
bread_avg_by_weekday = (pos_dt[pos_dt['item'] == 'Bread'].groupby('weekday').size().groupby(level=0).mean()
                         .reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( bread_avg_by_weekday, x='weekday', y='average_count', template='plotly_dark', 
       title='Average Bread Sales by Day of the Week', labels={'weekday': 'Day of the Week', 'average_count': 'Average Count'})

The distribution is very similar to the total items and coffee ones. As bread is an item that is bought a lot, it makes sense that its distribution follows the days where most shopping is done. 

In [41]:
# Filter for Bread and calculate the average count by day of the week
bread_avg_by_hour = (pos_dt[pos_dt['item'] == 'Bread'].groupby('hour_of_day').size().groupby(level=0).mean()
                         .reset_index(name='average_count'))

# Plot the average count using Plotly Express
px.bar( bread_avg_by_hour, x='hour_of_day', y='average_count', template='plotly_dark', 
       title='Average Bread Sales by Hour of the Day', labels={'weekday': 'Hour of the Day', 'average_count': 'Average Count'})

Both bread and coffee are products bought by almost every every household, it makes sense that their distribution follows the distribution of the whole products.

### Supermarket Data EDA

#### Values Distributions 

In [42]:
supermarket.head()

Unnamed: 0_level_0,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,payment,cogs,gross_margin_percentage,gross_income,rating,datetime
invoice_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,Ewallet,465.76,4.761905,23.288,8.4,2019-01-27 20:33:00
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,Ewallet,604.17,4.761905,30.2085,5.3,2019-02-08 10:37:00


#### Supermarket Sales

Let's look at the distribution of sales based on the branches, amount payed, categories of products...

In [43]:
# Count the occurrences of each branch
branch_counts = supermarket['branch'].value_counts().reset_index()
branch_counts.columns = ['branch', 'count']

px.bar(branch_counts, x='branch', y='count', template='plotly_dark', 
title='Count of Each Branch',labels={'branch': 'Branch', 'count': 'Count'})

Each branch has sold almost the same amount of products. 

In [44]:
# Group by product line and calculate the sum of unit_price
branch_sums = supermarket.groupby('branch')['unit_price'].sum().reset_index()

# Rename columns for clarity
branch_sums.columns = ['branch', 'total_unit_price']
branch_sums = branch_sums.sort_values(by='total_unit_price', ascending=False)

px.bar(branch_sums, x='branch', y='total_unit_price', template='plotly_dark', 
    title='Total Amount Payed by Branch',labels={'branch': 'Branch', 'total_unit_price': 'Total Unit Price'})

The difference seems even smaller between branches when looking at the total payed per branch. 

In [45]:
# Count the occurrences of each product line
product_line_counts = supermarket['product_line'].value_counts().reset_index()
product_line_counts.columns = ['product_line', 'count']

px.bar(product_line_counts, x='product_line', y='count', template='plotly_dark', 
title='Count of Each Product Line',labels={'product_line': 'Product Line', 'count': 'Count'})

In [46]:
# Group by product line and calculate the sum of unit_price
product_line_sums = supermarket.groupby('product_line')['unit_price'].sum().reset_index()

# Rename columns for clarity
product_line_sums.columns = ['product_line', 'total_unit_price']
product_line_sums = product_line_sums.sort_values(by='total_unit_price', ascending=False)

px.bar(product_line_sums, x='product_line', y='total_unit_price', template='plotly_dark', 
    title='Total  Price by Product Line',labels={'product_line': 'Product Line', 'total_unit_price': 'Total Unit Price'})

We can see the same order for the categories based on amount sold and number of products sold, which mean that the average product price should be very similar across categories. 

In [75]:
average_price_per_category = supermarket.groupby('product_line')['unit_price'].mean().round(2)
average_price_per_category


product_line
Electronic accessories    53.55
Fashion accessories       57.15
Food and beverages        56.01
Health and beauty         54.85
Home and lifestyle        55.32
Sports and travel         56.99
Name: unit_price, dtype: float64

Indeed, by calculating the average unit price per category we can see that they are very similar. 

Let's now look at the distribution of some of these values 

In [47]:
# Generate the KDE data using scipy
kde = gaussian_kde(supermarket['total'], bw_method=0.1)
x_values = np.linspace(supermarket['total'].min(), supermarket['total'].max(), 100)  # Range of total amounts
y_values = kde(x_values)  # Density values

# Create a DataFrame for Plotly
kde_data = pd.DataFrame({'total': x_values, 'density': y_values})

px.line(kde_data, x='total', y='density', template='plotly_dark', title='Distribution of Totals Payed',
              labels={'total': 'Total Payed', 'density': 'Density'},line_shape='spline')

This distribution is right skewed; with most totals around 90$. It makes sense that this skew is positive as it is more rare to leave the supermarket and pay amounts higher that 200$. 

In [48]:
kde = gaussian_kde(supermarket['cogs'], bw_method=0.1)
x_values = np.linspace(supermarket['cogs'].min(), supermarket['cogs'].max(), 100)  # Range of costs of goods sold
y_values = kde(x_values)  # Density values

# Create a DataFrame for Plotly
kde_data = pd.DataFrame({'cogs': x_values, 'density': y_values})

px.line(kde_data, x='cogs', y='density', template='plotly_dark', title='Distribution of Costs of Goods Sold',
              labels={'cogs': 'Cost of Goods Sold', 'density': 'Density'},line_shape='spline')

The cost of goods sold also follows a right skewed distribution, showing that most of the totals have a cost of goods sold lower that 200$. This distribution follows closely the total payed distribution as cogs is a percentage of total payed.

In [49]:
# Count the occurrences of each branch
quantity_counts = supermarket['quantity'].value_counts().reset_index()
quantity_counts.columns = ['quantity', 'count']

px.bar(quantity_counts, x='quantity', y='count', template='plotly_dark', 
title='Quantities Bought Count',labels={'quantity': 'Quantity', 'count': 'Count'})

There is no clear trend of the quantity bought. But there is surprisingly no basket with more than 10 items bought. 

#### Customer Informations

Let's look at the supermarket data based on customer information. 

In [50]:
# Count the occurrences of each branch
customer_type_counts = supermarket['customer_type'].value_counts().reset_index()
customer_type_counts.columns = ['customer_type', 'count']

px.bar(customer_type_counts, x='customer_type', y='count', template='plotly_dark', 
title='Count of Each Customer Type',labels={'customer_type': 'Customer Type', 'count': 'Count'})

We almost have the same number and non-member client shopping at our store. 

In [51]:
# Group by product line and calculate the sum of unit_price
customer_type_sums = supermarket.groupby('customer_type')['unit_price'].sum().reset_index()

# Rename columns for clarity
customer_type_sums.columns = ['customer_type', 'total_unit_price']
customer_type_sums = customer_type_sums.sort_values(by='total_unit_price', ascending=False)

px.bar(customer_type_sums, x='customer_type', y='total_unit_price', template='plotly_dark', 
    title='Total Amount Payed by Customer Type',labels={'customer_type': 'Customer Type', 'total_unit_price': 'Total Unit Price'})

The total amount payed by members is slightly higher than the total amount payed by non-member customers. 

In [52]:
# Count the occurrences of each branch
gender_counts = supermarket['gender'].value_counts().reset_index()
gender_counts.columns = ['gender', 'count']

px.bar(gender_counts, x='gender', y='count', template='plotly_dark', 
title='Count of Each Gender',labels={'gender': 'Gender', 'count': 'Count'})

Here the values are also almost the same. 

In [53]:
# Group by product line and calculate the sum of unit_price
gender_sums = supermarket.groupby('gender')['unit_price'].sum().reset_index()

# Rename columns for clarity
gender_sums.columns = ['gender', 'total_unit_price']
gender_sums = gender_sums.sort_values(by='total_unit_price', ascending=False)

px.bar(gender_sums, x='gender', y='total_unit_price', template='plotly_dark', 
    title='Total Amount Payed by Gender',labels={'gender': 'Gender', 'total_unit_price': 'Total Unit Price'})

In [54]:
# Generate the KDE data using scipy
kde = gaussian_kde(supermarket['rating'], bw_method=0.1)
x_values = np.linspace(supermarket['rating'].min(), supermarket['rating'].max(), 100)  # Range of ratings
y_values = kde(x_values)  # Density values

# Create a DataFrame for Plotly
kde_data = pd.DataFrame({'rating': x_values, 'density': y_values})

px.line(kde_data, x='rating', y='density', template='plotly_dark', title='Distribution of Ratings',
              labels={'rating': 'Rating', 'density': 'Density'},line_shape='spline')


The ratings customers have given go from 4 to 10 and don't follow a special distribution. 

In [55]:
# Count the occurrences of each branch
payment_counts = supermarket['payment'].value_counts().reset_index()
payment_counts.columns = ['payment', 'count']

px.bar(payment_counts, x='payment', y='count', template='plotly_dark', 
title='Count of Each Payment Type',labels={'payment': 'Payment Type', 'count': 'Count'})

Most people pay with an e-wallet or cash, although card payment is still very important. 

In [57]:
kde = gaussian_kde(supermarket['gross_income'], bw_method=0.1)
x_values = np.linspace(supermarket['gross_income'].min(), supermarket['gross_income'].max(), 100)  # Range of gross income
y_values = kde(x_values)  # Density values

# Create a DataFrame for Plotly
kde_data = pd.DataFrame({'gross_income': x_values, 'density': y_values})

px.line(kde_data, x='gross_income', y='density', template='plotly_dark', title='Distribution of Gross Income',
              labels={'gross_income': 'Total Payed', 'density': 'Density'},line_shape='spline')

The distribution of the customers gross income is also right skewed, but we would need more information of how this gross income is calculated. 

#### Cross-Analysis 

In [58]:
px.scatter(supermarket, 'total', 'rating', template='plotly_dark', trendline='ols', trendline_color_override='red', 
           title = 'Relationship between the total payed and the rating')

In [59]:
px.scatter(supermarket, 'gross_income', 'rating', template='plotly_dark', trendline='ols', trendline_color_override='red', 
           title= 'Relationship between Ratings and Gross Income')

From these plots, we can see that gross income, or total payed do not impact the rating given. 

In [60]:
# Group by product line and gender, summing the total amount spent
gender_spending = supermarket.groupby(['product_line', 'gender'])['total'].sum().reset_index()

fig = px.bar(gender_spending, x='product_line', y='total', color='gender', barmode='group', template='plotly_dark',
    title='Total Amount Spent by Gender in Each Product Line',
    labels={'product_line': 'Product Line', 'total': 'Total Amount Spent', 'gender': 'Gender'})

fig.show()


Women spend more than men in almost all categories but Electronics and Accessories and Health & Beauty. 

In [None]:
# Group by product line and payment type, summing the total amount spent
payment_type_spending = supermarket.groupby(['product_line', 'payment'])['total'].sum().reset_index()

fig = px.bar(payment_type_spending, x='product_line', y='total', color='payment', barmode='group', template='plotly_dark',
    title='Total Amount Spent based on payment method in each Product Line',
    labels={'product_line': 'Product Line', 'total': 'Total Amount Spent', 'payment': 'Gender'})

fig.show()

While across all categories, the payment method is almost perfectly distrubuted, it is interesting to note high discrepancies in the Home & Lifestyle category. 

In [None]:
# Group by product line and branch, summing the total amount spent
branch_spending = supermarket.groupby(['product_line', 'branch'])['total'].sum().reset_index()

fig = px.bar(branch_spending, x='product_line', y='total', color='branch', barmode='group', template='plotly_dark',
    title='Total Amount Spent by Product Line in each Branch',
    labels={'product_line': 'Product Line', 'total': 'Total Amount Spent', 'branch': 'Branch'})

fig.show()

Branch C is usually prefered for electronic and fashion accessories as well as F&B, whereas most Home & Lifestyle spendings are in branch A. 

In [81]:
# Group by product line and customer_type, summing the total amount spent
member_spending = supermarket.groupby(['product_line', 'customer_type'])['total'].sum().reset_index()

fig = px.bar(member_spending, x='product_line', y='total', color='customer_type', barmode='group', template='plotly_dark',
    title='Total Amount Spent by Product Line per Customer Type',
    labels={'product_line': 'Product Line', 'total': 'Total Amount Spent', 'customer_type': 'Customer Type'})

fig.show()

Non-members tend to spend more in electronic and fashion accessories, whereas member spend more in other categories. This can lead us to think, based on the previous graph, that they might be shopping more in the branch C. Let's check with another graph. 

In [82]:
# Group by customer type and branch, summing the total amount spent
branch_member_spending = supermarket.groupby(['customer_type', 'branch'])['total'].sum().reset_index()

fig = px.bar(branch_member_spending, x='branch', y='total', color='customer_type', barmode='group', template='plotly_dark',
    title='Total Amount Spent by Customer Type by Branch',
    labels={'customer_typee': 'Customer Type', 'total': 'Total Amount Spent', 'branch': 'Branch'})

fig.show()

From this graph, the theory above that non-members spend more in branch C is wrong. We can however see a similar distribution of member and non-members in each branch. 

### Date based analysis

In [62]:
# Group by day of the week and sum the total sales
sales_per_day = supermarket_dt.groupby('weekday')['total'].sum().reset_index()

fig = px.bar( sales_per_day, x='weekday', y='total', template='plotly_dark', 
    title='Total Sales per Day of the Week',
    labels={'weekday': 'Day of the Week', 'total': 'Total Sales'},)

fig.show()


In [63]:
# Group by weekday and hour_of_day, calculate average total sales per hour
hourly_avg_by_day = supermarket_dt.groupby(['weekday', 'hour_of_day'])['total'].mean().reset_index()

fig = px.line(hourly_avg_by_day, x='hour_of_day', y='total', color='weekday', 
    title='Average Total Sales by Hour for Each Day of the Week',
    labels={'hour_of_day': 'Hour of the Day', 'total': 'Total Sales', 'weekday': 'Day of the Week'},
    template='plotly_dark')

fig.update_layout(
    xaxis=dict(
        tickmode='array', 
        tickvals=[x for x in range(24)],  # Only tick for full hours (0 to 23)
        ticktext=[f"{x:02d}:00" for x in range(24)],  # Label as HH:00
        dtick=1 ))

fig.show()


In [64]:
sales_per_hour_mean = supermarket_dt.groupby('hour_of_day')['total'].mean().reset_index()

px.line(sales_per_hour_mean, 'hour_of_day', 'total', template='plotly_dark', title = 'Average total Sales by Hour of the day')

In [65]:
sales_per_hour_sum = supermarket_dt.groupby('hour_of_day')['total'].sum().reset_index()

px.line(sales_per_hour_sum, 'hour_of_day', 'total', template='plotly_dark', title = 'Total Sales by Hour of the day')

In [66]:
px.box(supermarket_dt, 'hour_of_day', 'total', template='plotly_dark')

## Correlations 

In [71]:
# Split numerical and categorical features
supermarket_numeric = supermarket.select_dtypes(include=['number'])

# Remove constant columns (with only one unique value)
supermarket_numeric = supermarket_numeric.loc[:, supermarket_numeric.nunique() > 1]

# Compute the correlation matrix
correlation_matrix = supermarket_numeric.corr().abs().round(2).dropna(axis=1, how='all')

# Plot the full correlation heatmap with larger size
fig = px.imshow(correlation_matrix,
                labels=dict(x="Variables", y="Variables", color="Correlation"),
                x=correlation_matrix.columns, y=correlation_matrix.index,
                color_continuous_scale="dense",
                title="Full Correlation Matrix (Numerical Variables Only)",
                template="plotly_dark",
                text_auto=True)

# Make the plot larger
fig.update_layout(width=1000, height=800)

fig.show()

