Maud Lecerf | AI for Retail Consumer Goods

# Challenge 1 - POS analysis 


## Imports (files and packages)

In [56]:
import pandas as pd
import numpy as np
import plotly.express as px

In [57]:
import warnings
# Warning categories that are silenced so they do not clutter the cell outputs.
SILENCED_WARNING_CATEGORIES = [FutureWarning, DeprecationWarning]

# NOTE(review): SettingWithCopyWarning is looked up defensively in case the
# installed pandas release does not expose it - confirm against the pinned version.
_setting_with_copy_warning = getattr(pd.errors, "SettingWithCopyWarning", None)
if _setting_with_copy_warning is not None:
    SILENCED_WARNING_CATEGORIES.append(_setting_with_copy_warning)

for category in SILENCED_WARNING_CATEGORIES:
    warnings.filterwarnings("ignore", category=category)



In [58]:
# Load the two raw point-of-sale extracts that sit one directory above the notebook.
pos, supermarket = (
    pd.read_csv(csv_path)
    for csv_path in ('../POS_data.csv', '../supermarket_POS_data.csv')
)

## Initial Analysis & Cleaning

In [59]:
# Preview the first rows of the POS data to see its columns and value formats.
pos.head()

Unnamed: 0,Date,Time,Transaction,Item
0,10/30/2016,9:58:11,1,Bread
1,10/30/2016,10:05:34,2,Scandinavian
2,10/30/2016,10:05:34,2,Scandinavian
3,10/30/2016,10:07:57,3,Hot chocolate
4,10/30/2016,10:07:57,3,Jam


In [60]:
# Preview the first rows of the supermarket data to see its columns and value formats.
supermarket.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [61]:
# (rows, columns) of the POS data.
pos.shape

(21293, 4)

In [62]:
# (rows, columns) of the supermarket data.
supermarket.shape

(1000, 17)

### Changing Column names 
Renaming the columns to lowercase snake_case makes them easier to manipulate and use in calculations, and ensures both datasets follow the same format.

In [63]:
# Normalise the POS column labels to lowercase snake_case.
pos.columns = pos.columns.str.replace(" ", "_", regex=False).str.lower()

In [64]:
# Check the renamed POS column labels.
pos.columns

Index(['date', 'time', 'transaction', 'item'], dtype='object')

In [65]:
# Normalise the supermarket column labels to lowercase snake_case.
supermarket.columns = supermarket.columns.str.replace(" ", "_", regex=False).str.lower()


In [66]:
# Check the renamed supermarket column labels.
supermarket.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5%', 'total', 'date',
       'time', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
       'rating'],
      dtype='object')

### Null Values

In [67]:
# Number of missing values in each POS column.
pos.isna().sum()

date           0
time           0
transaction    0
item           0
dtype: int64

In [68]:
# Number of missing values in each supermarket column.
supermarket.isna().sum()

invoice_id                 0
branch                     0
city                       0
customer_type              0
gender                     0
product_line               0
unit_price                 0
quantity                   0
tax_5%                     0
total                      0
date                       0
time                       0
payment                    0
cogs                       0
gross_margin_percentage    0
gross_income               0
rating                     0
dtype: int64

No null values, no need for cleaning on this part. 

### Values types 

In [69]:
# Column dtypes of the POS data (date and time are still plain strings here).
pos.dtypes

date           object
time           object
transaction     int64
item           object
dtype: object

In [70]:
# Combine the separate date and time strings into a single datetime column.
# The format is spelled out (month/day/year and a 24-hour clock, matching the
# rows shown in the preview) so the month/day order is never guessed and a row
# in any other layout raises instead of being silently misparsed.
pos['datetime'] = pd.to_datetime(
    pos['date'].astype(str) + ' ' + pos['time'].astype(str),
    format='%m/%d/%Y %H:%M:%S',
)


In [71]:
# The combined 'datetime' column supersedes the raw 'date' and 'time' strings.
# Note: this cell rebinds `pos`, so the datetime cell cannot be re-run after it.
pos = pos.drop(columns=['date', 'time'])

### Dates Dataframes

In [72]:
# Separate frame indexed by timestamp, used for the date-based insights below.
# `pos` itself is left untouched.
pos_dt = pos.set_index('datetime')

In [91]:
# Derive calendar attributes from the datetime index for the visuals later on.
pos_dt = pos_dt.assign(
    year=pos_dt.index.year,
    month=pos_dt.index.month_name(),
    day_of_month=pos_dt.index.day,
    weekday=pos_dt.index.day_name(),
    date=pos_dt.index.date,
    hour_of_day=pos_dt.index.hour,
)

In [73]:
# Index the original POS data by transaction id. The id is shared by every item
# bought in the same transaction, so index values repeat (it is not a unique row key).
# Note: not re-runnable as-is, because 'transaction' is no longer a column afterwards.
pos = pos.set_index('transaction')

In [74]:
# Preview the POS data after the datetime conversion and re-indexing.
pos.head()

Unnamed: 0_level_0,item,datetime
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Bread,2016-10-30 09:58:11
2,Scandinavian,2016-10-30 10:05:34
2,Scandinavian,2016-10-30 10:05:34
3,Hot chocolate,2016-10-30 10:07:57
3,Jam,2016-10-30 10:07:57


In [75]:
# Column dtypes of the supermarket data (date and time are still plain strings here).
supermarket.dtypes

invoice_id                  object
branch                      object
city                        object
customer_type               object
gender                      object
product_line                object
unit_price                 float64
quantity                     int64
tax_5%                     float64
total                      float64
date                        object
time                        object
payment                     object
cogs                       float64
gross_margin_percentage    float64
gross_income               float64
rating                     float64
dtype: object

In [76]:
# Combine the separate date and time strings into a single datetime column.
# The format is spelled out (month/day/year and hour:minute, matching the rows
# shown in the preview) so ambiguous dates such as 1/5/2019 are never guessed
# and a row in any other layout raises instead of being silently misparsed.
supermarket['datetime'] = pd.to_datetime(
    supermarket['date'].astype(str) + ' ' + supermarket['time'].astype(str),
    format='%m/%d/%Y %H:%M',
)

In [77]:
# The combined 'datetime' column supersedes the raw 'date' and 'time' strings.
# Note: this cell rebinds `supermarket`, so the datetime cell cannot be re-run after it.
supermarket = supermarket.drop(columns=['date', 'time'])

In [78]:
# Separate frame indexed by timestamp, used for the date-based insights below.
# `supermarket` itself is left untouched.
supermarket_dt = supermarket.set_index('datetime')

In [86]:
# Derive calendar attributes from the datetime index for the visuals later on.
supermarket_dt = supermarket_dt.assign(
    day_of_month=supermarket_dt.index.day,
    month=supermarket_dt.index.month_name(),
    weekday=supermarket_dt.index.day_name(),
    date=supermarket_dt.index.date,
    hour_of_day=supermarket_dt.index.hour,
)

In [79]:
# Index the original supermarket data by invoice id.
# Note: not re-runnable as-is, because 'invoice_id' is no longer a column afterwards.
supermarket = supermarket.set_index('invoice_id')

In [80]:
# Preview the supermarket data after the datetime conversion and re-indexing.
supermarket.head()

Unnamed: 0_level_0,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,payment,cogs,gross_margin_percentage,gross_income,rating,datetime
invoice_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,Ewallet,465.76,4.761905,23.288,8.4,2019-01-27 20:33:00
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,Ewallet,604.17,4.761905,30.2085,5.3,2019-02-08 10:37:00


## Exploratory Data Analysis

### POS Data EDA

#### All Items

In [81]:
# Number of rows recorded for each item, most frequent first.
item_counts = pos['item'].value_counts()
px.bar(item_counts, template='plotly_dark', title='Item Counts')

In [104]:
# Count the rows recorded in each (year, month) pair.
items_monthly_counts = (
    pos_dt.groupby(['year', 'month']).size().reset_index(name='count')
)

# Turn the year and month name (e.g. '2016-October') into a first-of-month
# timestamp. The explicit format avoids the "Could not infer format" fallback to
# element-by-element parsing.
# NOTE(review): %B assumes the English month names produced by month_name()
# under the default locale - confirm if the locale is ever changed.
items_monthly_counts['date'] = pd.to_datetime(
    items_monthly_counts['year'].astype(str) + '-' + items_monthly_counts['month'].astype(str),
    format='%Y-%B',
)

# Bar chart of the monthly totals, positioned on a real time axis.
px.bar(
    items_monthly_counts, x='date', y='count', template='plotly_dark',
    title='Monthly Sales (in number of items)',
    labels={'date': 'Month-Year', 'count': 'Number of Items Sold'},
)



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Let's look at why the values are so low for October and April.

In [107]:
# Count the rows recorded on each calendar date.
items_daily_counts = (
    pos_dt.groupby(['date']).size().reset_index(name='count')
)

# Convert the date objects to timestamps so plotly draws a continuous time axis.
items_daily_counts['date'] = pd.to_datetime(items_daily_counts['date'].astype(str))

# Line chart of the daily totals; the x axis is a day-level date, so it is
# labelled 'Date' rather than 'Month-Year'.
px.line(
    items_daily_counts, x='date', y='count', template='plotly_dark',
    title='Daily Sales (in number of items)',
    labels={'date': 'Date', 'count': 'Number of Items Sold'},
)

Looking at daily item sales, we see that the low values in October 2016 and April 2017 are due to the dataset starting at the end of October and ending at the beginning of April. Therefore, we do not have complete data for those two months and cannot properly compare them with the others.

In [108]:
# Average number of items (all items, no filter) sold per day, for each weekday.
# Rows are first counted per calendar date and only then averaged per weekday;
# averaging the per-weekday totals directly would just return the totals.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
items_avg_by_weekday = (
    pos_dt.groupby(['date', 'weekday']).size()
    .groupby(level='weekday').mean()
    .reindex(weekday_order)   # calendar order instead of alphabetical order
    .rename_axis('weekday')
    .reset_index(name='average_count')
)

# Bar chart of the per-weekday daily average.
px.bar(
    items_avg_by_weekday, x='weekday', y='average_count', template='plotly_dark',
    title='Average Daily Item Sales by Day of the Week',
    labels={'weekday': 'Day of the Week', 'average_count': 'Average Count'},
)

In [111]:
# Average number of items (all items, no filter) sold in each hour of the day.
# Rows are first counted per (date, hour) and only then averaged per hour;
# averaging the per-hour totals directly would just return the totals.
# Note: an hour is averaged only over the dates that recorded a sale in that hour.
items_avg_by_hour = (
    pos_dt.groupby(['date', 'hour_of_day']).size()
    .groupby(level='hour_of_day').mean()
    .reset_index(name='average_count')
)

# Bar chart of the per-hour average.
px.bar(
    items_avg_by_hour, x='hour_of_day', y='average_count', template='plotly_dark',
    title='Average Item Sales by Hour of the Day',
    labels={'hour_of_day': 'Hour of the Day', 'average_count': 'Average Count'},
)

In [115]:
# Total number of rows for every (weekday, hour of day) combination.
# Despite the variable name, these are counts, not averages.
hourly_avg_by_day = (
    pos_dt
    .groupby(['weekday', 'hour_of_day'])
    .size()
    .reset_index(name='item_count')
)

# One line per weekday, hours along the x axis.
axis_labels = {
    'hour_of_day': 'Hour of the Day',
    'item_count': 'Item Count',
    'weekday': 'Day of the Week',
}
fig = px.line(
    hourly_avg_by_day,
    x='hour_of_day',
    y='item_count',
    color='weekday',
    title='Hourly Item Count by Day of the Week',
    template='plotly_dark',
    labels=axis_labels,
)

# Force a tick for every hour of the day, rendered as HH:00.
all_hours = list(range(24))
fig.update_layout(
    xaxis={
        'tickmode': 'array',
        'tickvals': all_hours,
        'ticktext': [f"{hour:02d}:00" for hour in all_hours],
    }
)
fig.show()


#### Coffee Data
As coffee is the best-selling item, let's look at its selling patterns first.

In [None]:
# Keep only the rows whose item is Coffee.
coffee_data = pos_dt[pos_dt['item'] == 'Coffee']

# Count the Coffee rows recorded in each (year, month) pair.
coffee_monthly_counts = (
    coffee_data.groupby(['year', 'month']).size().reset_index(name='count')
)

# Turn the year and month name (e.g. '2016-October') into a first-of-month
# timestamp. The explicit format avoids the "Could not infer format" fallback to
# element-by-element parsing.
# NOTE(review): %B assumes the English month names produced by month_name()
# under the default locale - confirm if the locale is ever changed.
coffee_monthly_counts['date'] = pd.to_datetime(
    coffee_monthly_counts['year'].astype(str) + '-' + coffee_monthly_counts['month'].astype(str),
    format='%Y-%B',
)

# Bar chart of the monthly Coffee totals, positioned on a real time axis.
px.bar(
    coffee_monthly_counts, x='date', y='count', template='plotly_dark',
    title='Monthly Coffee Sales',
    labels={'date': 'Month-Year', 'count': 'Number of Items Sold'},
)



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Coffee sales follow a monthly distribution similar to that of the total number of items sold.

In [97]:
# Keep only the rows whose item is Coffee.
coffee_data = pos_dt[pos_dt['item'] == 'Coffee']

# Count the Coffee rows recorded on each calendar date.
coffee_daily_counts = (
    coffee_data.groupby(['date']).size().reset_index(name='count')
)

# Convert the date objects to timestamps so plotly draws a continuous time axis.
coffee_daily_counts['date'] = pd.to_datetime(coffee_daily_counts['date'].astype(str))

# Line chart of the daily Coffee totals; the x axis is a day-level date, so it
# is labelled 'Date' rather than 'Month-Year'.
px.line(
    coffee_daily_counts, x='date', y='count', template='plotly_dark',
    title='Daily Coffee Sales',
    labels={'date': 'Date', 'count': 'Number of Items Sold'},
)

As we don't have values for a whole year, it might be more interesting to look at weekly and hourly changes and values in the POS data. 

In [None]:
# Average number of Coffee rows sold per day, for each weekday.
# Rows are first counted per calendar date and only then averaged per weekday;
# averaging the per-weekday totals directly would just return the totals.
# Note: a weekday is averaged only over the dates that recorded a Coffee sale.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
coffee_avg_by_weekday = (
    pos_dt[pos_dt['item'] == 'Coffee']
    .groupby(['date', 'weekday']).size()
    .groupby(level='weekday').mean()
    .reindex(weekday_order)   # calendar order instead of alphabetical order
    .rename_axis('weekday')
    .reset_index(name='average_count')
)

# Bar chart of the per-weekday daily Coffee average.
px.bar(
    coffee_avg_by_weekday, x='weekday', y='average_count', template='plotly_dark',
    title='Average Coffee Sales by Day of the Week',
    labels={'weekday': 'Day of the Week', 'average_count': 'Average Count'},
)

In [103]:
# Average number of Coffee rows sold in each hour of the day.
# Rows are first counted per (date, hour) and only then averaged per hour;
# averaging the per-hour totals directly would just return the totals.
# Note: an hour is averaged only over the dates that recorded a Coffee sale in that hour.
coffee_avg_by_hour = (
    pos_dt[pos_dt['item'] == 'Coffee']
    .groupby(['date', 'hour_of_day']).size()
    .groupby(level='hour_of_day').mean()
    .reset_index(name='average_count')
)

# Bar chart of the per-hour Coffee average. The labels dict is keyed by the
# actual x column ('hour_of_day') so the axis title is applied.
px.bar(
    coffee_avg_by_hour, x='hour_of_day', y='average_count', template='plotly_dark',
    title='Average Coffee Sales by Hour of the Day',
    labels={'hour_of_day': 'Hour of the Day', 'average_count': 'Average Count'},
)

### Supermarket Data EDA