In [1]:
# Imports 
import pandas as pd
import sqlalchemy

In [None]:
# Create SQLAlchemy engine for connecting to SQLite database.
# The tuning options are NOT given as URL query parameters: the pysqlite dialect
# only forwards the driver arguments it knows about, so journal_mode /
# synchronous / temp_store in the query string would be dropped without any error.
engine = sqlalchemy.create_engine("sqlite:///test.db")


@sqlalchemy.event.listens_for(engine, "connect")
def _apply_sqlite_pragmas(dbapi_connection, connection_record):
    """Run the SQLite tuning PRAGMAs on every new DBAPI connection.

    Parameters
    ----------
    dbapi_connection
        The raw DBAPI connection that was just opened.
    connection_record
        SQLAlchemy's pool record for that connection (unused).
    """
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute("PRAGMA journal_mode=DELETE")
        cursor.execute("PRAGMA synchronous=NORMAL")
        cursor.execute("PRAGMA temp_store=MEMORY")
    finally:
        # Always release the cursor, even if one of the PRAGMAs fails.
        cursor.close()

In [3]:
# Location of the POS workbook, relative to the notebook's working directory
path = 'Data/POS_Data.xlsx'

# Read the 'POS' worksheet into a DataFrame
raw_data = pd.read_excel(io=path, sheet_name='POS')

# Preview the first rows
raw_data.head()

Unnamed: 0,Check ID,Item Name,Gross Revenue,Date,Sale Time - Exact,Category,Day Part,Is Beverage on Check,Cost Center
0,5723,BVC - WATER SMARTWATER 20OZ,1.59,2020-11-20,21:22:30,ADD>BreakfstAdds,Late Night,Yes,Hospital B
1,27172,BVC - WATER SMARTWATER 20OZ,0.0,2020-10-28,12:47:50,ADD>BreakfstAdds,Lunch,Yes,Hospital A
2,34799,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-28,07:15:04,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
3,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
4,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Mid Morning,Yes,Hospital A


In [4]:
# Clean Column Names: normalise headers to lowercase snake_case identifiers,
# e.g. 'Sale Time - Exact' -> 'sale_time_exact'.
raw_data.columns = (
    raw_data.columns
    .str.strip()
    .str.lower()
    # Spaces and hyphens act as word separators.
    .str.replace(r'[ \-]', '_', regex=True)
    # Drop every character that is not a lowercase letter, digit or underscore.
    .str.replace(r'[^a-z0-9_]', '', regex=True)
    # Collapse underscore runs of any length. This runs last because removing
    # a character (e.g. '&' in 'a_&_b') can leave new adjacent underscores.
    .str.replace(r'_{2,}', '_', regex=True)
)

raw_data.head()

Unnamed: 0,check_id,item_name,gross_revenue,date,sale_time_exact,category,day_part,is_beverage_on_check,cost_center
0,5723,BVC - WATER SMARTWATER 20OZ,1.59,2020-11-20,21:22:30,ADD>BreakfstAdds,Late Night,Yes,Hospital B
1,27172,BVC - WATER SMARTWATER 20OZ,0.0,2020-10-28,12:47:50,ADD>BreakfstAdds,Lunch,Yes,Hospital A
2,34799,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-28,07:15:04,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
3,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Breakfast,Yes,Hospital A
4,64634,BVC - WATER SMARTWATER 20OZ,1.59,2020-10-27,08:39:39,ADD>BreakfstAdds,Mid Morning,Yes,Hospital A


In [5]:
# Derive the line-item frame from raw_data WITHOUT modifying raw_data, so this
# cell gives the same result every time it is run. (Overwriting
# raw_data['item_name'] / raw_data['category'] in place would make a second run
# split the already-split values and produce wrong group / sub_category columns.)
#
# assign() evaluates its keyword arguments in order, so `group` and
# `sub_category` are computed from the original text before `item_name` and
# `category` are replaced by their trimmed versions.
line_items = (
    raw_data
    .assign(
        # 'BVC - WATER SMARTWATER 20OZ' -> group 'BVC', item_name 'WATER SMARTWATER 20OZ'.
        # Names without a ' - ' separator end up with group == item_name.
        group=lambda df: df['item_name'].str.split(' - ', n=1).str[0],
        item_name=lambda df: df['item_name'].str.split(' - ', n=1).str[-1],
        # 'ADD>BreakfstAdds' -> category 'ADD', sub_category 'BreakfstAdds'.
        sub_category=lambda df: df['category'].str.split('>').str[-1],
        category=lambda df: df['category'].str.split('>').str[0],
        # Combine the date and time columns into one timestamp; values that do
        # not match the format become NaT (errors='coerce').
        timestamp=lambda df: pd.to_datetime(
            df['date'].astype(str) + ' ' + df['sale_time_exact'].astype(str),
            format='%Y-%m-%d %H:%M:%S',
            errors='coerce',
        ),
    )
    # Drop unneeded date/time columns
    .drop(columns=['date', 'sale_time_exact'])
)

line_items.head()

Unnamed: 0,check_id,item_name,gross_revenue,category,day_part,is_beverage_on_check,cost_center,group,sub_category,timestamp
0,5723,WATER SMARTWATER 20OZ,1.59,ADD,Late Night,Yes,Hospital B,BVC,BreakfstAdds,2020-11-20 21:22:30
1,27172,WATER SMARTWATER 20OZ,0.0,ADD,Lunch,Yes,Hospital A,BVC,BreakfstAdds,2020-10-28 12:47:50
2,34799,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,Yes,Hospital A,BVC,BreakfstAdds,2020-10-28 07:15:04
3,64634,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,Yes,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39
4,64634,WATER SMARTWATER 20OZ,1.59,ADD,Mid Morning,Yes,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39


In [6]:
# Convert the Yes/No text flag to a boolean; anything other than 'yes'
# (case-insensitive, surrounding whitespace ignored) becomes False.
# The guard keeps the cell re-runnable: once the column is boolean the `.str`
# accessor is no longer available and would raise AttributeError.
if not pd.api.types.is_bool_dtype(line_items['is_beverage_on_check']):
    line_items['is_beverage_on_check'] = (
        line_items['is_beverage_on_check'].str.lower().str.strip() == 'yes'
    )

# Enforce the column types that are written to the database
line_items = line_items.astype({
    'check_id': 'string',
    'item_name': 'string',
    'gross_revenue': 'float',
    'category': 'string',
    'sub_category': 'string',
    'day_part': 'string',
    'is_beverage_on_check': 'boolean',
    'cost_center': 'string',
    'group': 'string',
    'timestamp': 'datetime64[ns]'
})


# Add the primary key column (1-based index)
line_items['line_item_id'] = line_items.index + 1


# Load line items to SQLite database.
# The default insert method (one executemany per chunk) is used on purpose:
# method="multi" packs a whole chunk into a single INSERT, and 8192 rows times
# the number of columns exceeds SQLite's default limit on bound parameters per
# statement ("too many SQL variables") on stock SQLite builds.
line_items.to_sql("line_items", engine, if_exists="replace", index=True, chunksize=8192)


line_items.head()

Unnamed: 0,check_id,item_name,gross_revenue,category,day_part,is_beverage_on_check,cost_center,group,sub_category,timestamp,line_item_id
0,5723,WATER SMARTWATER 20OZ,1.59,ADD,Late Night,True,Hospital B,BVC,BreakfstAdds,2020-11-20 21:22:30,1
1,27172,WATER SMARTWATER 20OZ,0.0,ADD,Lunch,True,Hospital A,BVC,BreakfstAdds,2020-10-28 12:47:50,2
2,34799,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,True,Hospital A,BVC,BreakfstAdds,2020-10-28 07:15:04,3
3,64634,WATER SMARTWATER 20OZ,1.59,ADD,Breakfast,True,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39,4
4,64634,WATER SMARTWATER 20OZ,1.59,ADD,Mid Morning,True,Hospital A,BVC,BreakfstAdds,2020-10-27 08:39:39,5


In [7]:
def _most_common(values):
    """Return the most frequent non-null value of a Series, or pd.NA if it has none.

    Series.mode() is empty when every value is missing, so indexing its first
    element unconditionally would raise for such a group.
    """
    modes = values.mode()
    return modes.iat[0] if len(modes) else pd.NA


# build transactions table (one row per check_id) and enforce data types
transactions = (
    line_items
    .groupby(['check_id'])
    .agg(
        timestamp=('timestamp', 'first'),
        total_amount=('gross_revenue', 'sum'),
        num_items=('item_name', 'count'),
        cost_center=('cost_center', 'first'),
        day_part=('day_part', 'first'),
        top_group=('group', _most_common),
        is_beverage_on_check=('is_beverage_on_check', 'first'),
    )
    .reset_index()
    .astype({
        'check_id': 'string',
        'timestamp': 'datetime64[ns]',
        'total_amount': 'float64',
        'num_items': 'Int64',
        'cost_center': 'string',
        'day_part': 'string',
        'top_group': 'string',
    })
)

# check_id is unique per row after the groupby, so no separate surrogate key
# column is added here.

# Load transactions table to SQLite database.
# The default insert method is used instead of method="multi": a single
# multi-row INSERT of 8192 rows binds more parameters than SQLite's default
# per-statement limit allows on stock builds.
transactions.to_sql("transactions", engine, if_exists="replace", index=True, chunksize=8192)

transactions.head()

Unnamed: 0,check_id,timestamp,total_amount,num_items,cost_center,day_part,top_group,is_beverage_on_check
0,10000003,2020-10-23 11:58:15,4.09,1,Hospital A,Lunch,GRL,False
1,10000360,2020-10-23 13:05:04,9.66,4,Hospital B,Lunch,GRL,True
2,10000365,2020-10-23 12:33:39,5.49,1,Hospital B,Lunch,TUKEY PROVOLONE,False
3,10000385,2020-10-23 12:41:35,12.46,5,Hospital A,Lunch,GRL,True
4,10000390,2020-10-23 13:15:30,0.99,1,Hospital A,Lunch,BFK,False


In [10]:
# Build items table: one row per distinct combination of item, group, category,
# sub-category, price and cost center.
# NOTE: 'price' is the per-line gross_revenue, so the same item appears once for
# every distinct revenue value seen for it (including 0.0).
items = (
    line_items[['item_name', 'group', 'category', 'sub_category', 'gross_revenue', 'cost_center']]
    .drop_duplicates()
    .sort_values('item_name')
    .reset_index(drop=True)
    .astype({
        'item_name': 'string',
        'group': 'string',
        'category': 'string',
        'sub_category': 'string',
        'gross_revenue': 'float64',
        'cost_center': 'string'
    })
    .rename(columns={'gross_revenue': 'price'})
)

# Load items table to SQLite database.
# The default insert method is used instead of method="multi", which sends each
# 8192-row chunk as one INSERT and can exceed SQLite's default limit on bound
# parameters per statement when the table is large.
items.to_sql("items", engine, if_exists="replace", index=True, chunksize=8192)

items.head()

Unnamed: 0,item_name,group,category,sub_category,price,cost_center
0,CAFE KITCHEN CHICKEN,ENT,Entree,Entree,6.99,Hospital A
1,CAFE KITCHEN CHICKEN,ENT,Entree,Entree,0.0,Hospital A
2,CAFE KITCHEN PASTA,ENT,Entree,Entree,5.99,Hospital A
3,CAFE KITCHEN PASTA,ENT,Entree,Entree,0.0,Hospital A
4,GUEST TRAY 7.50,ENT,Entree,Entree,13.86,Hospital A


In [11]:
# Build categories table: the distinct (category, sub_category, cost_center)
# combinations, ordered by those same three columns
category_cols = ['category', 'sub_category', 'cost_center']
categories = (
    line_items[category_cols]
    .drop_duplicates()
    .sort_values(category_cols)
    .reset_index(drop=True)
)

# Lift pandas' display limits for this one call so the whole table is shown
full_view = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*full_view):
    display(categories)


# Load categories table into SQLite database
categories.to_sql("categories", engine, if_exists="replace", index=True, chunksize=8192, method="multi")

Unnamed: 0,category,sub_category,cost_center
0,ADD,BreakfstAdds,Hospital A
1,ADD,BreakfstAdds,Hospital B
2,ADD,Coffee Adds,Hospital A
3,ADD,Coffee Adds,Hospital B
4,ADD,Combo Adds,Hospital A
5,ADD,Combo Adds,Hospital B
6,ADD,Pizza Adds,Hospital A
7,ADD,Pizza Adds,Hospital B
8,ADD,Sand Adds,Hospital A
9,ALC,Beer,Hospital A


103