# Challenge 1 - Analyzing POS Data
Completed by Christopher Stephan

## Loading & Understanding the Data

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
import math
from plotly.subplots import make_subplots
import plotly.graph_objects as go

### POS Data

In [16]:
# Load the two raw CSV exports; both paths are relative to the working directory.
pos_data = pd.read_csv(filepath_or_buffer="POS_data.csv")
super_data = pd.read_csv(filepath_or_buffer="supermarket_POS_data.csv")

In [17]:
# Preview the first five rows of the POS data (one row per item in a transaction).
pos_data.head()

Unnamed: 0,Date,Time,Transaction,Item
0,10/30/2016,9:58:11,1,Bread
1,10/30/2016,10:05:34,2,Scandinavian
2,10/30/2016,10:05:34,2,Scandinavian
3,10/30/2016,10:07:57,3,Hot chocolate
4,10/30/2016,10:07:57,3,Jam


In [19]:
# Size of the POS dataset as a (rows, columns) tuple.
pos_data.shape

(21293, 4)

In [21]:
# Column dtypes as loaded: Date and Time come in as plain strings (object),
# so they still need to be converted before any time-based analysis.
pos_data.dtypes

Date           object
Time           object
Transaction     int64
Item           object
dtype: object

In [24]:
# Combine the separate Date and Time strings into a single datetime column.
# The raw values look like "10/30/2016" and "9:58:11" (month/day/year with
# seconds), so the format is stated explicitly rather than left to pandas'
# inference; a row that does not match now raises instead of being guessed.
pos_data['datetime'] = pd.to_datetime(
    pos_data['Date'].astype(str) + ' ' + pos_data['Time'].astype(str),
    format='%m/%d/%Y %H:%M:%S',
)

In [26]:
# Date and Time are now captured by the combined datetime column, so drop them.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
pos_data = pos_data.drop(columns=['Date', 'Time'])

In [27]:
# Confirm the conversion: Date/Time are gone and datetime is datetime64[ns].
pos_data.dtypes

Transaction             int64
Item                   object
datetime       datetime64[ns]
dtype: object

In [28]:
# Count rows that are exact copies of an earlier row.
# NOTE(review): each row is one item line of a transaction (the preview shows
# transaction 2 listing "Scandinavian" twice), so duplicates may be legitimate
# repeat purchases rather than data errors; confirm before dropping them.
print(f'There are {pos_data.duplicated().sum()} duplicated rows in our pos dataset.')

There are 1653 duplicated rows in our pos dataset.


In [30]:
# Use lowercase column names so they match the new `datetime` column.
pos_data = pos_data.rename(columns=dict(Transaction='transaction', Item='item'))

# Show the frame summary to verify the new names, dtypes and non-null counts.
pos_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21293 entries, 0 to 21292
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   transaction  21293 non-null  int64         
 1   item         21293 non-null  object        
 2   datetime     21293 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 499.2+ KB


In [32]:
# Fraction of missing values per column (0.0 means no missing values).
pos_data.isna().mean()

transaction    0.0
item           0.0
datetime       0.0
dtype: float64

**The POS dataset is complete: no column contains missing values.**

In [33]:
# Summary statistics for the numeric and datetime columns.
# NOTE(review): transaction looks like an identifier, so its mean/std are
# probably not meaningful; confirm before interpreting them.
pos_data.describe()

Unnamed: 0,transaction,datetime
count,21293.0,21293
mean,4951.990889,2017-01-17 14:54:19.669234176
min,1.0,2016-10-30 09:58:11
25%,2548.0,2016-12-03 14:07:07
50%,5067.0,2017-01-21 12:34:58
75%,7329.0,2017-02-28 10:58:04
max,9684.0,2017-04-09 15:04:24
std,2787.7584,


### SuperMarket Data

In [18]:
# Preview the first five rows of the supermarket data (one row per invoice).
super_data.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [10]:
# Size of the supermarket dataset as a (rows, columns) tuple.
super_data.shape

(1000, 17)

In [34]:
# Column dtypes as loaded: Date and Time are plain strings (object) here too.
super_data.dtypes

Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object

In [35]:
# Count rows that are exact copies of an earlier row in the supermarket data.
# The message now names the supermarket dataset; it previously said "pos dataset",
# a leftover from the equivalent POS cell.
print(f'There are {super_data.duplicated().sum()} duplicated rows in our supermarket dataset.')

There are 0 duplicated rows in our pos dataset.


In [36]:
# Combine the separate Date and Time strings into a single datetime column.
# The raw values look like "1/27/2019" and "13:08" (month/day/year, no seconds).
# A value such as "1/5/2019" is ambiguous on its own, so the format is stated
# explicitly rather than left to pandas' inference.
super_data['datetime'] = pd.to_datetime(
    super_data['Date'].astype(str) + ' ' + super_data['Time'].astype(str),
    format='%m/%d/%Y %H:%M',
)

In [38]:
# Date and Time are now captured by the combined datetime column, so drop them.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
super_data = super_data.drop(columns=['Date', 'Time'])

In [39]:
# Normalise column names to snake_case: spaces become underscores, then lowercase.
super_data.columns = super_data.columns.str.replace(" ", "_", regex=False).str.lower()

In [40]:
# Confirm the renamed columns (note that "tax_5%" keeps its percent sign).
super_data.columns

Index(['invoice_id', 'branch', 'city', 'customer_type', 'gender',
       'product_line', 'unit_price', 'quantity', 'tax_5%', 'total', 'payment',
       'cogs', 'gross_margin_percentage', 'gross_income', 'rating',
       'datetime'],
      dtype='object')

In [41]:
# Fraction of missing values per column (0.0 means no missing values).
super_data.isna().mean()

invoice_id                 0.0
branch                     0.0
city                       0.0
customer_type              0.0
gender                     0.0
product_line               0.0
unit_price                 0.0
quantity                   0.0
tax_5%                     0.0
total                      0.0
payment                    0.0
cogs                       0.0
gross_margin_percentage    0.0
gross_income               0.0
rating                     0.0
datetime                   0.0
dtype: float64

**The supermarket dataset is complete: no column contains missing values.**

In [43]:
# Summary statistics for the numeric columns and the datetime column.
# gross_margin_percentage is effectively constant across all rows
# (min == max and std is ~0), so it carries no information on its own.
super_data.describe()

Unnamed: 0,unit_price,quantity,tax_5%,total,cogs,gross_margin_percentage,gross_income,rating,datetime
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000
mean,55.67213,5.51,15.379369,322.966749,307.58738,4.761905,15.379369,6.9727,2019-02-14 15:30:27.480000
min,10.08,1.0,0.5085,10.6785,10.17,4.761905,0.5085,4.0,2019-01-01 10:39:00
25%,32.875,3.0,5.924875,124.422375,118.4975,4.761905,5.924875,5.5,2019-01-24 17:58:45
50%,55.23,5.0,12.088,253.848,241.76,4.761905,12.088,7.0,2019-02-13 17:37:00
75%,77.935,8.0,22.44525,471.35025,448.905,4.761905,22.44525,8.5,2019-03-08 15:29:30
max,99.96,10.0,49.65,1042.65,993.0,4.761905,49.65,10.0,2019-03-30 20:37:00
std,26.494628,2.923431,11.708825,245.885335,234.17651,6.131498e-14,11.708825,1.71858,
