Maud Lecerf | AI for Retail Consumer Goods

# Challenge 1 - POS analysis 


## Imports (files and packages)

In [45]:
import pandas as pd
import numpy as np
import plotly.express as px

In [90]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
DeprecationWarning



In [46]:
# Load both raw CSV exports; the paths are relative to the notebook's parent directory.
pos, supermarket = (
    pd.read_csv(csv_path)
    for csv_path in ('../POS_data.csv', '../supermarket_POS_data.csv')
)

## Initial Analysis & Cleaning

In [47]:
# Preview the first rows of the raw POS data (one row per item sold).
pos.head()

Unnamed: 0,Date,Time,Transaction,Item
0,10/30/2016,9:58:11,1,Bread
1,10/30/2016,10:05:34,2,Scandinavian
2,10/30/2016,10:05:34,2,Scandinavian
3,10/30/2016,10:07:57,3,Hot chocolate
4,10/30/2016,10:07:57,3,Jam


In [48]:
# Preview the first rows of the raw supermarket data (one row per invoice).
supermarket.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [49]:
# (rows, columns) of the POS data.
pos.shape

(21293, 4)

In [50]:
# (rows, columns) of the supermarket data.
supermarket.shape

(1000, 17)

### Changing Column names 
Column names are normalised so that they are easier to manipulate and calculate with, and so that both datasets follow the same format.

In [51]:
# Normalise the headers: lower-case, with underscores instead of spaces.
pos.columns = [header.lower().replace(" ", "_") for header in pos.columns]

In [52]:
# Confirm the renamed POS headers.
pos.columns

Index(['date', 'time', 'transaction', 'item'], dtype='object')

In [53]:
# Normalise the headers: lower-case, with underscores instead of spaces.
# Only spaces are replaced, so other characters (e.g. the '%' in 'Tax 5%') are kept as-is.
supermarket.columns = [
    header.lower().replace(" ", "_") for header in supermarket.columns
]


In [54]:
# Confirm the renamed supermarket headers.
supermarket.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5%', 'total', 'date',
       'time', 'payment', 'cogs', 'gross_margin_percentage', 'gross_income',
       'rating'],
      dtype='object')

### Null Values

In [55]:
# Number of missing values per POS column.
pos.isna().sum()

date           0
time           0
transaction    0
item           0
dtype: int64

In [56]:
# Number of missing values per supermarket column.
supermarket.isna().sum()

invoice_id                 0
branch                     0
city                       0
customer_type              0
gender                     0
product_line               0
unit_price                 0
quantity                   0
tax_5%                     0
total                      0
date                       0
time                       0
payment                    0
cogs                       0
gross_margin_percentage    0
gross_income               0
rating                     0
dtype: int64

No null values, no need for cleaning on this part. 

### Value types

In [57]:
# Inspect the column dtypes; date and time are still plain object columns at this point.
pos.dtypes

date           object
time           object
transaction     int64
item           object
dtype: object

In [58]:
# Merge the separate date and time strings into one proper datetime column.
# NOTE(review): the format is inferred by pandas; the previewed rows look like
# MM/DD/YYYY H:MM:SS - consider passing format= explicitly once confirmed.
pos_timestamp_text = pos['date'].astype(str) + ' ' + pos['time'].astype(str)
pos['datetime'] = pd.to_datetime(pos_timestamp_text)


In [64]:
# 'date' and 'time' are now redundant with 'datetime', so drop them.
pos = pos.drop(columns=['date', 'time'])

In [66]:
# Use the transaction id as the row index.
# Note: the id is NOT unique per row - a transaction containing several items
# spans several rows - so the resulting index contains duplicates.
pos = pos.set_index('transaction')

In [68]:
# Check the cleaned POS frame: indexed by transaction, with a single datetime column.
pos.head()

Unnamed: 0_level_0,item,datetime
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Bread,2016-10-30 09:58:11
2,Scandinavian,2016-10-30 10:05:34
2,Scandinavian,2016-10-30 10:05:34
3,Hot chocolate,2016-10-30 10:07:57
3,Jam,2016-10-30 10:07:57


In [59]:
# Inspect the column dtypes; date and time are still plain object columns at this point.
supermarket.dtypes

invoice_id                  object
branch                      object
city                        object
customer_type               object
gender                      object
product_line                object
unit_price                 float64
quantity                     int64
tax_5%                     float64
total                      float64
date                        object
time                        object
payment                     object
cogs                       float64
gross_margin_percentage    float64
gross_income               float64
rating                     float64
dtype: object

In [60]:
# Merge the separate date and time strings into one proper datetime column.
# NOTE(review): the format is inferred by pandas; the previewed rows look like
# MM/DD/YYYY HH:MM - consider passing format= explicitly once confirmed.
supermarket_timestamp_text = (
    supermarket['date'].astype(str) + ' ' + supermarket['time'].astype(str)
)
supermarket['datetime'] = pd.to_datetime(supermarket_timestamp_text)

In [61]:
# 'date' and 'time' are now redundant with 'datetime', so drop them.
supermarket = supermarket.drop(columns=['date', 'time'])

In [None]:
# Use the invoice id as the row index.
# TODO: decide whether to keep this index or switch to 'datetime' for time-based analysis.
# NOTE(review): invoice_id is presumably unique per row - not verified in this notebook.
supermarket = supermarket.set_index('invoice_id')

In [69]:
# Check the cleaned supermarket frame: indexed by invoice_id, with a single datetime column.
supermarket.head()

Unnamed: 0_level_0,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,payment,cogs,gross_margin_percentage,gross_income,rating,datetime
invoice_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,Ewallet,522.83,4.761905,26.1415,9.1,2019-01-05 13:08:00
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,Cash,76.4,4.761905,3.82,9.6,2019-03-08 10:29:00
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,Credit card,324.31,4.761905,16.2155,7.4,2019-03-03 13:23:00
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,Ewallet,465.76,4.761905,23.288,8.4,2019-01-27 20:33:00
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,Ewallet,604.17,4.761905,30.2085,5.3,2019-02-08 10:37:00


## Exploratory Data Analysis

### POS Data EDA

In [88]:
# Frequency of every item in the POS data.
# Counts are aggregated first so that plotly receives one row per item instead
# of the full un-aggregated frame (~21k rows), which keeps the figure light.
# Bars are ordered from most to least frequent.
item_counts = pos['item'].value_counts().reset_index()
item_counts.columns = ['item', 'count']

px.bar(
    item_counts,
    x='item',
    y='count',
    title='Number of POS rows per item',
    template='plotly_dark',
)

In [87]:
# value_counts() sorts by descending frequency, so the first 20 entries are the
# 20 most frequent items.
top_20 = pos['item'].value_counts().iloc[:20].reset_index()
top_20.columns = ['item', 'count']

# Bar chart of those 20 items
px.bar(
    top_20,
    x='item',
    y='count',
    template='plotly_dark',
)


In [105]:
# Daily number of 'Coffee' rows.
# Grouping on the raw 'datetime' column would create one group per exact
# (second-resolution) timestamp, so the line would only show how many Coffee
# rows share a timestamp rather than any trend over time. Bucketing by calendar
# day gives a real time series; days without any Coffee row appear as 0.
coffee_counts = (
    pos.loc[pos['item'] == 'Coffee']
    .groupby(pd.Grouper(key='datetime', freq='D'))
    .size()
    .reset_index(name='count')
)

px.line(
    coffee_counts,
    x='datetime',
    y='count',
    title='Evolution of Coffee Transactions',
    template='plotly_dark',
)


### Supermarket Data EDA