## Pizza Sales Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Relative folder that holds the pizza store CSV tables
path = "pizza_store_tables"


In [3]:
# List every entry in the data folder. Despite the name, the result is not
# restricted to CSV files: os.listdir returns all entries, including the
# '.ipynb_checkpoints' directory shown in the output below.
csv_list = os.listdir(path)
csv_list

['.ipynb_checkpoints',
 'orders.csv',
 'order_details - Copy.csv',
 'order_details.csv',
 'pizzas.csv',
 'pizza_types - Copy.csv',
 'pizza_types.csv']

### Orders Table EDA

In [17]:
# Load the orders table from the data folder.
order_path = os.path.join(path, 'orders.csv')
orders_df = pd.read_csv(order_path)
# Preview the first rows to confirm the columns were read as expected.
orders_df.head()

Unnamed: 0,order_id,date,time
0,1,2015-01-01,11:38:36
1,2,2015-01-01,11:57:40
2,3,2015-01-01,12:12:28
3,4,2015-01-01,12:16:31
4,5,2015-01-01,12:21:30


In [5]:
# Summarise the orders table: row count, column dtypes and non-null counts.
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21350 entries, 0 to 21349
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   order_id  21350 non-null  int64 
 1   date      21350 non-null  object
 2   time      21350 non-null  object
dtypes: int64(1), object(2)
memory usage: 500.5+ KB


In [7]:
# info() reported no nulls; now count the distinct order_ids. The count
# matches the number of rows only if no order_id is repeated.
order_ids = orders_df['order_id']
order_ids.nunique()

21350

### Order Details table EDA

In [8]:
# Load the order_details table from the data folder and preview it
order_details_path = os.path.join(path, "order_details.csv")
order_details_df = pd.read_csv(filepath_or_buffer=order_details_path)
order_details_df.head(n=5)

Unnamed: 0,order_details_id,order_id,pizza_id,quantity
0,1,1,hawaiian_m,1
1,2,2,classic_dlx_m,1
2,3,2,five_cheese_l,1
3,4,2,ital_supr_l,1
4,5,2,mexicana_m,1


In [9]:
# Summarise the order_details table: row count, column dtypes and non-null counts.
order_details_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48620 entries, 0 to 48619
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   order_details_id  48620 non-null  int64 
 1   order_id          48620 non-null  int64 
 2   pizza_id          48620 non-null  object
 3   quantity          48620 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [10]:
# Count distinct values per column (e.g. how many different orders and
# pizza_ids appear across the detail lines).
order_details_df.nunique()

order_details_id    48620
order_id            21350
pizza_id               91
quantity                4
dtype: int64

In [13]:
# Largest quantity recorded on any single order-detail line
quantity_col = order_details_df['quantity']
quantity_col.max()


4

### Pizzas table EDA

In [14]:
# Load the pizzas table from the data folder and preview it
pizzas_path = os.path.join(path, "pizzas.csv")
pizzas_df = pd.read_csv(filepath_or_buffer=pizzas_path)
pizzas_df.head(n=5)

Unnamed: 0,pizza_id,pizza_type_id,size,price
0,bbq_ckn_s,bbq_ckn,S,12.75
1,bbq_ckn_m,bbq_ckn,M,16.75
2,bbq_ckn_l,bbq_ckn,L,20.75
3,cali_ckn_s,cali_ckn,S,12.75
4,cali_ckn_m,cali_ckn,M,16.75


In [15]:
# Summarise the pizzas table: row count, column dtypes and non-null counts.
pizzas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pizza_id       96 non-null     object 
 1   pizza_type_id  96 non-null     object 
 2   size           96 non-null     object 
 3   price          96 non-null     float64
dtypes: float64(1), object(3)
memory usage: 3.1+ KB


In [18]:
# Build a single timestamp column from the separate date and time strings.
# An explicit format keeps parsing deterministic and raises on any row that
# does not match, instead of relying on format inference.
# The original 'date' and 'time' columns are kept on purpose: later cells
# still read them directly.
orders_df['order_date'] = pd.to_datetime(
    orders_df['date'] + ' ' + orders_df['time'],
    format='%Y-%m-%d %H:%M:%S',
)
orders_df.head()

Unnamed: 0,order_id,date,time,order_date
0,1,2015-01-01,11:38:36,2015-01-01 11:38:36
1,2,2015-01-01,11:57:40,2015-01-01 11:57:40
2,3,2015-01-01,12:12:28,2015-01-01 12:12:28
3,4,2015-01-01,12:16:31,2015-01-01 12:16:31
4,5,2015-01-01,12:21:30,2015-01-01 12:21:30


In [21]:
# Earliest and latest order dates. 'date' is still a string column, so this
# is a lexical min/max, which agrees with chronological order for
# YYYY-MM-DD values.
first_date = orders_df['date'].min()
last_date = orders_df['date'].max()
print(first_date, last_date)

2015-01-01 2015-12-31


In [22]:
# Earliest and latest order times of day. 'time' is a string column, so this
# is a lexical min/max, which agrees with clock order for HH:MM:SS values.
earliest_time = orders_df['time'].min()
latest_time = orders_df['time'].max()
print(earliest_time, latest_time)

09:52:21 23:05:52


### Pizza Types EDA

In [28]:
# Load the pizza_types table, decoding the file with the 'unicode_escape' codec.
# NOTE(review): 'unicode_escape' also interprets backslash sequences in the
# text; confirm the file's real encoding (possibly latin-1 / cp1252).
pizza_types_df = pd.read_csv(
    os.path.join(path, "pizza_types.csv"),
    encoding="unicode_escape",
)
pizza_types_df.head(n=5)

Unnamed: 0,pizza_type_id,name,category,ingredients
0,bbq_ckn,The Barbecue Chicken Pizza,Chicken,"Barbecued Chicken, Red Peppers, Green Peppers,..."
1,cali_ckn,The California Chicken Pizza,Chicken,"Chicken, Artichoke, Spinach, Garlic, Jalapeno ..."
2,ckn_alfredo,The Chicken Alfredo Pizza,Chicken,"Chicken, Red Onions, Red Peppers, Mushrooms, A..."
3,ckn_pesto,The Chicken Pesto Pizza,Chicken,"Chicken, Tomatoes, Red Peppers, Spinach, Garli..."
4,southw_ckn,The Southwest Chicken Pizza,Chicken,"Chicken, Tomatoes, Red Peppers, Red Onions, Ja..."


In [29]:
# Summarise the pizza_types table: row count, column dtypes and non-null counts.
pizza_types_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pizza_type_id  32 non-null     object
 1   name           32 non-null     object
 2   category       32 non-null     object
 3   ingredients    32 non-null     object
dtypes: object(4)
memory usage: 1.1+ KB


In [30]:
# Count distinct values per column in the pizza_types table.
pizza_types_df.nunique()

pizza_type_id    32
name             32
category          4
ingredients      32
dtype: int64

In [31]:
# Number of orders per calendar month, binned on order_date ('M' = month-end
# frequency). Note: the result is a Series, despite the _df suffix.
orders_by_month = orders_df.resample('M', on='order_date')
monthly_orders_df = orders_by_month['order_id'].count()
monthly_orders_df

order_date
2015-01-31    1845
2015-02-28    1685
2015-03-31    1840
2015-04-30    1799
2015-05-31    1853
2015-06-30    1773
2015-07-31    1935
2015-08-31    1841
2015-09-30    1661
2015-10-31    1646
2015-11-30    1792
2015-12-31    1680
Freq: M, Name: order_id, dtype: int64

In [33]:
# First step towards monthly sales $: attach each order's date/time to its
# detail lines by joining on order_id.
# validate='one_to_many' makes the merge raise if order_id is ever duplicated
# in orders_df, which would otherwise silently multiply the detail lines and
# inflate every downstream total.
orders_df2 = pd.merge(
    left=orders_df,
    right=order_details_df,
    how='inner',
    on='order_id',
    validate='one_to_many',
)
orders_df2.head()

Unnamed: 0,order_id,date,time,order_date,order_details_id,pizza_id,quantity
0,1,2015-01-01,11:38:36,2015-01-01 11:38:36,1,hawaiian_m,1
1,2,2015-01-01,11:57:40,2015-01-01 11:57:40,2,classic_dlx_m,1
2,2,2015-01-01,11:57:40,2015-01-01 11:57:40,3,five_cheese_l,1
3,2,2015-01-01,11:57:40,2015-01-01 11:57:40,4,ital_supr_l,1
4,2,2015-01-01,11:57:40,2015-01-01 11:57:40,5,mexicana_m,1


In [34]:
# Add pizza_type_id, size and price to every detail line by joining on pizza_id.
# Being an inner join, any line whose pizza_id is absent from pizzas_df is dropped.
line_item_df = orders_df2.merge(pizzas_df, how="inner", on="pizza_id")
line_item_df.head(n=5)

Unnamed: 0,order_id,date,time,order_date,order_details_id,pizza_id,quantity,pizza_type_id,size,price
0,1,2015-01-01,11:38:36,2015-01-01 11:38:36,1,hawaiian_m,1,hawaiian,M,13.25
1,77,2015-01-02,12:22:46,2015-01-02 12:22:46,179,hawaiian_m,1,hawaiian,M,13.25
2,146,2015-01-03,14:22:10,2015-01-03 14:22:10,357,hawaiian_m,1,hawaiian,M,13.25
3,163,2015-01-03,16:54:54,2015-01-03 16:54:54,389,hawaiian_m,1,hawaiian,M,13.25
4,247,2015-01-04,20:55:29,2015-01-04 20:55:29,568,hawaiian_m,1,hawaiian,M,13.25


In [35]:
# Dollar amount of each detail line: units ordered times the unit price
line_item_df['line_cost'] = line_item_df['quantity'].mul(line_item_df['price'])
line_item_df.head(n=5)

Unnamed: 0,order_id,date,time,order_date,order_details_id,pizza_id,quantity,pizza_type_id,size,price,line_cost
0,1,2015-01-01,11:38:36,2015-01-01 11:38:36,1,hawaiian_m,1,hawaiian,M,13.25,13.25
1,77,2015-01-02,12:22:46,2015-01-02 12:22:46,179,hawaiian_m,1,hawaiian,M,13.25,13.25
2,146,2015-01-03,14:22:10,2015-01-03 14:22:10,357,hawaiian_m,1,hawaiian,M,13.25,13.25
3,163,2015-01-03,16:54:54,2015-01-03 16:54:54,389,hawaiian_m,1,hawaiian,M,13.25,13.25
4,247,2015-01-04,20:55:29,2015-01-04 20:55:29,568,hawaiian_m,1,hawaiian,M,13.25,13.25


In [39]:
# Collapse detail lines to one row per order: latest timestamp, total units
# and total dollar amount for the order.
per_order_aggs = {
    'order_date': 'max',
    'quantity': 'sum',
    'line_cost': 'sum',
}
orders_grouped = line_item_df.groupby(['order_id'])
order_df3 = orders_grouped.agg(per_order_aggs).reset_index()
order_df3.head(n=5)

Unnamed: 0,order_id,order_date,quantity,line_cost
0,1,2015-01-01 11:38:36,1,13.25
1,2,2015-01-01 11:57:40,5,92.0
2,3,2015-01-01 12:12:28,2,37.25
3,4,2015-01-01 12:16:31,1,16.5
4,5,2015-01-01 12:21:30,1,16.5


In [43]:
# Monthly roll-up of the per-order table: number of orders, units sold and
# total dollar amount, binned on order_date at month-end frequency.
monthly_aggs = {
    'order_id': 'count',
    'quantity': 'sum',
    'line_cost': 'sum',
}
monthly_sales_df = order_df3.resample(rule='M', on='order_date').agg(monthly_aggs)
monthly_sales_df

Unnamed: 0_level_0,order_id,quantity,line_cost
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-31,1845,4232,69793.3
2015-02-28,1685,3961,65159.6
2015-03-31,1840,4261,70397.1
2015-04-30,1799,4151,68736.8
2015-05-31,1853,4328,71402.75
2015-06-30,1773,4107,68230.2
2015-07-31,1935,4392,72557.9
2015-08-31,1841,4168,68278.25
2015-09-30,1661,3890,64180.05
2015-10-31,1646,3883,64027.6
