In [1]:
# Imports 
import pandas as pd

In [2]:
# Data Path
path = 'Data/POS_Data.xlsx'

# Retrieve Data
raw_data = pd.read_excel(path, sheet_name='POS')

raw_data.head()

Unnamed: 0,Check ID,Item Name,Gross Revenue,Date,Sale Time - Exact,Category,Day Part,Is Beverage on Check,Cost Center
0,5723,BVC - WATER SMARTWATER 20OZ,1.59,2020-11-20,21:22:30,ADD>BreakfstAdds,Late Night,Yes,Hospital B
1,27172,BVC - WATER SMARTWATER 20OZ,0.0,2020-10-28,12:47:50,ADD>BreakfstAdds,Lunch,Yes,Hospital A
2,34799,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-28,07:15:04,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
3,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
4,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Mid Morning,Yes,Hospital A


In [3]:
# Clean Column Names
raw_data.columns = (
    raw_data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('-', '_')
    .str.replace('___', '_')
    .str.replace('__', '_')
    .str.replace('-', '_')
    .str.replace(r'[^a-z0-9_]', '', regex=True)
)

raw_data.head()

Unnamed: 0,check_id,item_name,gross_revenue,date,sale_time_exact,category,day_part,is_beverage_on_check,cost_center
0,5723,BVC - WATER SMARTWATER 20OZ,1.59,2020-11-20,21:22:30,ADD>BreakfstAdds,Late Night,Yes,Hospital B
1,27172,BVC - WATER SMARTWATER 20OZ,0.0,2020-10-28,12:47:50,ADD>BreakfstAdds,Lunch,Yes,Hospital A
2,34799,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-28,07:15:04,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
3,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
4,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Mid Morning,Yes,Hospital A


In [4]:
raw_data['group'] = raw_data['item_name'].str.split(' - ', n=1).str[0]
raw_data['item_name'] = raw_data['item_name'].str.split(' - ', n=1).str[-1]
raw_data['sub_category'] = raw_data['category'].str.split('>').str[-1]
raw_data['category'] = raw_data['category'].str.split('>').str[0]

line_items = raw_data.assign(
    timestamp=lambda df: pd.to_datetime(raw_data['date'].astype(str) + ' ' + raw_data['sale_time_exact'].astype(str), format='%Y-%m-%d %H:%M:%S', errors='coerce'),
)

# Drop unneeded date/time columns
line_items = line_items.drop(columns=['date', 'sale_time_exact'])

line_items.head()

Unnamed: 0,check_id,item_name,gross_revenue,category,day_part,is_beverage_on_check,cost_center,group,sub_category,timestamp
0,5723,WATER SMARTWATER 20OZ,1.59,ADD,Late Night,Yes,Hospital B,BVC,BreakfstAdds,2020-11-20 21:22:30
1,27172,WATER SMARTWATER 20OZ,0.0,ADD,Lunch,Yes,Hospital A,BVC,BreakfstAdds,2020-10-28 12:47:50
2,34799,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,Yes,Hospital A,BVC,BreakfstAdds,2020-10-28 07:15:04
3,64634,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,Yes,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39
4,64634,WATER SMARTWATER 20OZ,1.59,ADD,Mid Morning,Yes,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39


In [5]:
line_items['is_beverage_on_check'] = line_items['is_beverage_on_check'].str.lower().str.strip() == 'yes'

line_items = line_items.astype({
    'check_id': 'string',
    'item_name': 'string',
    'gross_revenue': 'float',
    'category': 'string',
    'sub_category': 'string',
    'day_part': 'string',
    'is_beverage_on_check': 'boolean',
    'cost_center': 'string',
    'group': 'string',
    'timestamp': 'datetime64[ns]'
})
#line_items.to_sql('line_items', 'sqlite:///C1_case_study.db', if_exists='replace', index=False)

line_items.head()

Unnamed: 0,check_id,item_name,gross_revenue,category,day_part,is_beverage_on_check,cost_center,group,sub_category,timestamp
0,5723,WATER SMARTWATER 20OZ,1.59,ADD,Late Night,True,Hospital B,BVC,BreakfstAdds,2020-11-20 21:22:30
1,27172,WATER SMARTWATER 20OZ,0.0,ADD,Lunch,True,Hospital A,BVC,BreakfstAdds,2020-10-28 12:47:50
2,34799,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,True,Hospital A,BVC,BreakfstAdds,2020-10-28 07:15:04
3,64634,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,True,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39
4,64634,WATER SMARTWATER 20OZ,1.59,ADD,Mid Morning,True,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39


In [6]:
# build transactions table and enforce data types
transactions = line_items.groupby(['check_id']).agg(
    timestamp = ('timestamp', 'first'),
    total_amount=('gross_revenue', 'sum'),
    num_items=('item_name', 'count'),
    cost_center=('cost_center', 'first'),
    day_part=('day_part', 'first'),
    top_group = ('group', lambda x: x.mode()[0])
    ).reset_index().astype({
        'check_id': 'string',
        'timestamp': 'datetime64[ns]',
        'total_amount': 'float64',
        'num_items': 'Int64',
        'cost_center': 'string',
        'day_part': 'string',
        'top_group': 'string',
    })
#transactions.to_sql('transactions', 'sqlite:///C1_case_study.db', if_exists='replace', index=False)

transactions.head()

Unnamed: 0,check_id,timestamp,total_amount,num_items,cost_center,day_part,top_group
0,10000003,2020-10-23 11:58:15,4.09,1,Hospital A,Lunch,GRL
1,10000360,2020-10-23 13:05:04,9.66,4,Hospital B,Lunch,GRL
2,10000365,2020-10-23 12:33:39,5.49,1,Hospital B,Lunch,TUKEY PROVOLONE
3,10000385,2020-10-23 12:41:35,12.46,5,Hospital A,Lunch,GRL
4,10000390,2020-10-23 13:15:30,0.99,1,Hospital A,Lunch,BFK


In [7]:
items = line_items[['item_name', 'group', 'category', 'sub_category', 'gross_revenue']].drop_duplicates().sort_values('item_name').reset_index(drop=True)
items = items.astype({
    'item_name': 'string',
    'group': 'string',
    'category': 'string',
    'sub_category': 'string',
    'gross_revenue': 'float64'
}) 

items = items.rename(columns={'gross_revenue': 'price'})

items

Unnamed: 0,item_name,group,category,sub_category,price
0,CAFE KITCHEN CHICKEN,ENT,Entree,Entree,6.99
1,CAFE KITCHEN CHICKEN,ENT,Entree,Entree,0.00
2,CAFE KITCHEN PASTA,ENT,Entree,Entree,0.00
3,CAFE KITCHEN PASTA,ENT,Entree,Entree,5.99
4,GUEST TRAY 7.50,ENT,Entree,Entree,6.93
...,...,...,...,...,...
2808,YOGURT YOPLAIT STRAWBERRY,IMP,Impulse/Snack,Impulse/Snack,1.69
2809,YOGURT YOPLAIT STRAWBERRY,IMP,Impulse/Snack,Impulse/Snack,3.38
2810,YOPLAIT YOGURT LITE,IMP,Impulse/Snack,Impulse/Snack,2.98
2811,YOPLAIT YOGURT LITE,IMP,Impulse/Snack,Impulse/Snack,1.49


In [13]:
categories = line_items[['category', 'sub_category']].drop_duplicates().sort_values(['category']).reset_index(drop=True)

print(categories.head(70))

       category  sub_category
0           ADD  BreakfstAdds
1           ADD   Coffee Adds
2           ADD    Combo Adds
3           ADD    Pizza Adds
4           ADD     Sand Adds
..          ...           ...
64  Soup/Salads   Soup/Salads
65     Sushi DO      Sushi DO
66    Trattoria     Trattoria
67     Wellness      Wellness
68   zBeverage          Water

[69 rows x 2 columns]
