#### AI for Retail & Consumer Goods | Group Project | MBD April 2024

Group Members: Maud Lecerf | Cristina Mosquera | Christopher Stephan

# Imports & Color Palette

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

In [None]:
import warnings

# Silence noisy library warnings so the notebook output stays readable.
# SettingWithCopyWarning is looked up defensively: newer pandas releases
# (Copy-on-Write by default) may no longer expose it under pd.errors, and a
# plain attribute access would then abort this cell with AttributeError.
_ignored_warning_categories = [
    FutureWarning,
    getattr(pd.errors, "SettingWithCopyWarning", None),
    DeprecationWarning,
]
for _category in _ignored_warning_categories:
    if _category is not None:
        warnings.filterwarnings("ignore", category=_category)

In [None]:
# Custom Plotly theme: 15-color palette on a white canvas with black text
# and light-gray gridlines on both axes.
iowa = dict(
    layout=dict(
        colorway=[
            "#4c3624", "#ff0651", "#ff7e06", "#74B36B", "#628395",
            "#996C48", "#FF5C8D", "#FF9633", "#8CC084", "#84A0AE",
            "#B78B66", "#FF85A9", "#FFB570", "#A7CEA1", "#A9BCC7",
        ],
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(color="black"),
        xaxis=dict(gridcolor="lightgray"),
        yaxis=dict(gridcolor="lightgray"),
    )
)

# Make the theme available by name, i.e. template="iowa"
pio.templates["iowa"] = iowa

In [None]:
# Load the post-Covid Iowa liquor sales extract.
# Every later cell works on `data_post_covid`, so bind that name here; the
# original `daclean` name is kept as an alias of the same DataFrame in case
# another cell still refers to it.
daclean = pd.read_csv("G:\\Drive partagés\\RCG\\Iowa_Liquor_Sales_Post_Covid.csv") #Maud
data_post_covid = daclean


## Splitting the data into alcohol types

In [None]:
## Regrouping the original categories into alcohol types
# Keyword -> alcohol type. A keyword matches when it appears anywhere in
# `category_name` (case-insensitive, literal substring). Keywords are applied
# in the order listed and later matches overwrite earlier ones, so e.g. a name
# containing both WHISKIES and BOURBON ends up as BOURBON.
category_mapping = {
    'RUM': 'RUM',
    'VODKA': 'VODKAS',
    'VODKAS': 'VODKAS',
    'SCHNAPPS': 'SCHNAPPS',
    'BRANDIES': 'BRANDIES',
    'WHISKIES': 'WHISKIES',
    'SCOTCH': 'WHISKIES',
    'GINS': 'GINS',
    'GIN': 'GINS',
    'MEZCAL': 'MEZCAL',
    'TEQUILA': 'TEQUILA',
    'BOURBON': 'BOURBON',
    'LIQUEURS': 'LIQUEURS',
    'LIQUEUR':'LIQUEURS',
    'AMARETTO': 'AMARETTO',
    'CREME': 'CREME'
}


def _alcohol_type_for(category_name):
    """Return the alcohol type for one category name.

    Falls back to 'OTHERS' when no keyword matches or when the value is not
    a string (e.g. a missing category).
    """
    alcohol_type = 'OTHERS'
    if isinstance(category_name, str):
        upper_name = category_name.upper()
        for keyword, mapped_type in category_mapping.items():
            if keyword.upper() in upper_name:
                # No break on purpose: the last matching keyword wins.
                alcohol_type = mapped_type
    return alcohol_type


# Classify each distinct category name once, then broadcast the result to all
# rows with a single map, instead of scanning the whole column once per keyword.
_type_by_category = {
    name: _alcohol_type_for(name)
    for name in data_post_covid['category_name'].dropna().unique()
}

# Rows without a category name are not in the lookup and default to "OTHERS"
data_post_covid['alcohol_type'] = (
    data_post_covid['category_name'].map(_type_by_category).fillna('OTHERS')
)



In [None]:
# Number of distinct values in each column (sanity check after adding alcohol_type)
data_post_covid.nunique()

invoice_item_number      10414198
date                         1213
store_number                 2445
store_name                   2515
address                      2531
city                          477
zip_code                      514
store_location              10803
county_number                  99
county                         99
category                       59
category_name                  48
vendor_number                 345
vendor_name                   357
item_number                  8278
item_description             7554
pack                           22
bottle_volume_(ml)             26
state_bottle_cost            2231
state_bottle_retail          2235
bottles_sold                  610
sale_(dollars)              20487
volume_sold_(liters)         1227
volume_sold_(gallons)        1214
alcohol_type                   11
dtype: int64

In [None]:
# Number of invoice rows assigned to each alcohol type
data_post_covid['alcohol_type'].value_counts()

alcohol_type
VODKAS      2481955
WHISKIES    2093746
LIQUEURS    1377236
RUM          903040
OTHERS       849708
BOURBON      842461
TEQUILA      686286
SCHNAPPS     486618
BRANDIES     421626
GINS         260273
MEZCAL        11249
Name: count, dtype: int64

In [None]:
# Column groups reused further down: numeric measures on one side,
# identifier / descriptive fields treated as categorical on the other.
numerical_cols = [
    'pack',
    'bottle_volume_(ml)',
    'state_bottle_cost',
    'state_bottle_retail',
    'bottles_sold',
    'sale_(dollars)',
    'volume_sold_(liters)',
    'volume_sold_(gallons)',
]
categorical_cols = [
    'invoice_item_number',
    'date',
    'store_number',
    'store_name',
    'address',
    'city',
    'zip_code',
    'store_location',
    'county_number',
    'county',
    'category',
    'category_name',
    'vendor_number',
    'vendor_name',
    'item_number',
    'item_description',
]

In [None]:
# Parse the invoice dates so the .dt accessors below are available
data_post_covid['date'] = pd.to_datetime(data_post_covid['date'])

# Calendar features derived from the invoice date
_invoice_date = data_post_covid['date'].dt
data_post_covid['year'] = _invoice_date.year
data_post_covid['month'] = _invoice_date.month
data_post_covid['year_month'] = _invoice_date.to_period('M')
data_post_covid['weekday'] = _invoice_date.weekday


def _sum_by_alcohol_type(*time_keys):
    """Sum every numerical column per (time_keys..., alcohol_type) group."""
    grouped = data_post_covid.groupby([*time_keys, 'alcohol_type'])
    return grouped[numerical_cols].sum().reset_index()


# Aggregated views at the different time granularities
data_yearly = _sum_by_alcohol_type('year')
data_monthly = _sum_by_alcohol_type('year', 'month')
data_year_month = _sum_by_alcohol_type('year_month')
data_weekday = _sum_by_alcohol_type('weekday')

In [None]:
data_yearly.columns

Index(['year', 'alcohol_type', 'pack', 'bottle_volume_(ml)',
       'state_bottle_cost', 'state_bottle_retail', 'bottles_sold',
       'sale_(dollars)', 'volume_sold_(liters)', 'volume_sold_(gallons)'],
      dtype='object')

In [None]:
# Bar chart: liters sold per year, colored by alcohol type
px.bar(
    data_yearly,
    x='year',
    y='volume_sold_(liters)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Line chart: liters sold per year, one line per alcohol type
px.line(
    data_yearly,
    x='year',
    y='volume_sold_(liters)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Bar chart: bottles sold per year, colored by alcohol type
px.bar(
    data_yearly,
    x='year',
    y='bottles_sold',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Line chart: bottles sold per year, one line per alcohol type
px.line(
    data_yearly,
    x='year',
    y='bottles_sold',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Bar chart: sales in dollars per year, colored by alcohol type
px.bar(
    data_yearly,
    x='year',
    y='sale_(dollars)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Line chart: sales in dollars per year, one line per alcohol type
px.line(
    data_yearly,
    x='year',
    y='sale_(dollars)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Creating new features to understand the performance of each category

# Price related features
# (written to data_post_covid like every other feature; the previous target,
# `data_price`, is not defined anywhere in this notebook and raised NameError)
# Retail price scaled from the bottle volume in ml to one liter
data_post_covid['price_per_liter'] = data_post_covid['state_bottle_retail'] / data_post_covid['bottle_volume_(ml)'] * 1000
data_post_covid['profit_per_bottle'] = data_post_covid['state_bottle_retail'] - data_post_covid['state_bottle_cost']
# Margin expressed as a percentage of the retail price
data_post_covid['profit_margin'] = (data_post_covid['profit_per_bottle'] / data_post_covid['state_bottle_retail']) * 100


# Sales performance features
# Total sales of the row's store within the row's year_month, repeated on
# every row of that (store, month) group
data_post_covid['sales_per_store'] = data_post_covid.groupby(['store_number', 'year_month'])['sale_(dollars)'].transform('sum')


# Time based features
# weekday 5 and 6 are flagged as weekend (1), all other days as 0
data_post_covid['is_weekend'] = data_post_covid['weekday'].isin([5,6]).astype(int)
data_post_covid['quarter'] = data_post_covid['date'].dt.quarter

In [None]:
# Plotting total sales for each category
# Aggregate first: passing the raw invoice rows makes Plotly draw one bar
# segment per row (millions of marks) instead of one bar per category.
sales_by_category = (
    data_post_covid
    .groupby('category_name', as_index=False)['sale_(dollars)']
    .sum()
)
fig_sales_category = px.bar(sales_by_category,
                           x='category_name',
                           y='sale_(dollars)',
                           color='category_name',
                           title='Total Sales by Category',
                           template='iowa')
fig_sales_category.show()


In [None]:
# Plotting total sales for each alcohol type
# Aggregate first: passing the raw invoice rows makes Plotly draw one bar
# segment per row (millions of marks) instead of one bar per alcohol type.
sales_by_alcohol_type = (
    data_post_covid
    .groupby('alcohol_type', as_index=False)['sale_(dollars)']
    .sum()
)
fig_sales_alcohol_type = px.bar(sales_by_alcohol_type,
                           x='alcohol_type',
                           y='sale_(dollars)',
                           color='alcohol_type',
                           title='Total Sales by Alcohol Type',
                           template='iowa')
fig_sales_alcohol_type.show()

In [None]:
# Plotting sales trend over time for each alcohol type (using year_month)
# `year_month` was built with .dt.to_period('M'); Plotly cannot serialize
# pandas Period values to JSON, so plot from a copy whose periods are
# converted to timestamps (start of each month).
sales_trend_data = data_year_month.copy()
if isinstance(sales_trend_data['year_month'].dtype, pd.PeriodDtype):
    sales_trend_data['year_month'] = sales_trend_data['year_month'].dt.to_timestamp()

fig_sales_trend = px.line(sales_trend_data,
                         x='year_month',
                         y='sale_(dollars)',
                         color='alcohol_type',
                         title='Sales Trend by Alcohol Type Over Time',
                         template='iowa')
fig_sales_trend.show()


In [None]:
# Plotting daily sales over time for each alcohol_type
for alcohol_type in data_post_covid['alcohol_type'].unique():
    subset = data_post_covid[data_post_covid['alcohol_type'] == alcohol_type]
    # Sum the invoice rows per day: a line drawn through the raw, unsorted
    # rows zig-zags between individual invoices instead of showing a trend.
    # groupby returns the dates in ascending order.
    daily_sales = subset.groupby('date', as_index=False)['sale_(dollars)'].sum()
    fig = px.line(daily_sales,
                  x='date',
                  y='sale_(dollars)',
                  title=f'Sales of {alcohol_type} over time',
                  template='iowa')
    fig.show()
