<a href="https://colab.research.google.com/github/maudlcrf/rcg/blob/main/group_project/eda_post_covid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### AI for Retail & Consumer Goods | Group Project | MBD April 2024

Group Members: Maud Lecerf | Cristina Mosquera | Christopher Stephan

# Imports & Color Palette

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from google.colab import drive

In [None]:
import warnings

# Silence noisy library warnings so cell output stays readable.
# SettingWithCopyWarning is looked up with getattr because referencing
# `pd.errors.SettingWithCopyWarning` directly raises AttributeError on pandas
# builds that do not expose it.
# NOTE(review): believed to be absent from pandas >= 3.0 — confirm.
_ignored_warning_categories = (
    FutureWarning,
    getattr(pd.errors, "SettingWithCopyWarning", None),
    DeprecationWarning,
)
for _category in _ignored_warning_categories:
    if _category is not None:
        warnings.filterwarnings("ignore", category=_category)

In [None]:
# Custom plotly template: a 15-color sequence on a white canvas with black
# text and light-gray grid lines on both axes.
iowa = dict(
    layout=dict(
        colorway=[
            "#4c3624", "#ff0651", "#ff7e06", "#74B36B", "#628395",
            "#996C48", "#FF5C8D", "#FF9633", "#8CC084", "#84A0AE",
            "#B78B66", "#FF85A9", "#FFB570", "#A7CEA1", "#A9BCC7",
        ],
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(color="black"),
        xaxis=dict(gridcolor="lightgray"),
        yaxis=dict(gridcolor="lightgray"),
    )
)

# Register the template under the name "iowa" so charts can request it
# with template='iowa'.
pio.templates["iowa"] = iowa

In [None]:
# Alternative local source (Maud) — uncomment to read from the shared drive:
# data_post_covid = pd.read_csv("G:\\Drive partagés\\RCG\\Iowa_Liquor_Sales_Post_Covid.csv")

# Colab source (Cris): mount Google Drive, then read the CSV stored in it.
drive.mount('/content/drive')
csv_path = "/content/drive/MyDrive/Iowa_Liquor_Sales_Post_Covid.csv"
data_post_covid = pd.read_csv(csv_path)

## Splitting the data into alcohol types

In [None]:
## Collapse the detailed category names into broader alcohol types.
# Each key is a substring searched for (case-insensitively) in
# `category_name`; each value is the alcohol type assigned on a match.
category_mapping = {
    'RUM': 'RUM',
    'VODKA': 'VODKAS',
    'VODKAS': 'VODKAS',
    'SCHNAPPS': 'SCHNAPPS',
    'BRANDIES': 'BRANDIES',
    'WHISKIES': 'WHISKIES',
    'SCOTCH': 'WHISKIES',
    'GINS': 'GINS',
    'GIN': 'GINS',
    'MEZCAL': 'MEZCAL',
    'TEQUILA': 'TEQUILA',
    'BOURBON': 'BOURBON',
    'LIQUEURS': 'LIQUEURS',
    'LIQUEUR': 'LIQUEURS',
    'AMARETTO': 'AMARETTO',
    'CREME': 'CREME',
}

# Every row starts as "OTHERS"; keyword matches below overwrite that default.
data_post_covid['alcohol_type'] = 'OTHERS'

# Keywords are applied in dictionary order, so when a category name matches
# several keywords the one listed last wins (e.g. a name containing both
# WHISKIES and BOURBON ends up as BOURBON).
category_names = data_post_covid['category_name']
for keyword, alcohol_type in category_mapping.items():
    matches = category_names.str.contains(keyword, case=False, na=False, regex=True)
    data_post_covid.loc[matches, 'alcohol_type'] = alcohol_type



In [None]:
# Number of distinct values in each column of the dataset.
data_post_covid.nunique()

In [None]:
# Row count per derived alcohol type.
data_post_covid['alcohol_type'].value_counts()

In [None]:
# Row count per original category name.
data_post_covid['category_name'].value_counts()

In [None]:
# Column names split into numerical measures and categorical/identifier
# fields, as this may be helpful later on.
numerical_cols = [
    'pack',
    'bottle_volume_(ml)',
    'state_bottle_cost',
    'state_bottle_retail',
    'bottles_sold',
    'sale_(dollars)',
    'volume_sold_(liters)',
    'volume_sold_(gallons)',
]
categorical_cols = [
    'invoice_item_number',
    'date',
    'store_number',
    'store_name',
    'address',
    'city',
    'zip_code',
    'store_location',
    'county_number',
    'county',
    'category',
    'category_name',
    'vendor_number',
    'vendor_name',
    'item_number',
    'item_description',
]

In [None]:
# Parse the date column so the .dt accessors below are available.
data_post_covid['date'] = pd.to_datetime(data_post_covid['date'])

# Calendar breakdowns derived from the parsed date.
_date_parts = data_post_covid['date'].dt
data_post_covid['year'] = _date_parts.year
data_post_covid['month'] = _date_parts.month
data_post_covid['year_month'] = _date_parts.to_period('M')
data_post_covid['weekday'] = _date_parts.weekday  # pandas convention: 0 = Monday


def _alcohol_totals(*time_keys):
    """Sum every numerical column per (time_keys..., alcohol_type)."""
    keys = [*time_keys, 'alcohol_type']
    return data_post_covid.groupby(keys)[numerical_cols].sum().reset_index()


# Totals per alcohol type at several time granularities.
data_yearly_alcohol = _alcohol_totals('year')
data_monthly_alcohol = _alcohol_totals('year', 'month')
data_year_month_alcohol = _alcohol_totals('year_month')
data_weekday_alcohol = _alcohol_totals('weekday')

In [None]:
def _category_totals(*time_keys):
    """Sum every numerical column per (time_keys..., category_name)."""
    keys = [*time_keys, 'category_name']
    return data_post_covid.groupby(keys)[numerical_cols].sum().reset_index()


# Totals per original category at several time granularities.
data_yearly_category = _category_totals('year')
data_monthly_category = _category_totals('year', 'month')
data_year_month_category = _category_totals('year_month')
data_weekday_category = _category_totals('weekday')

In [None]:
# Yearly liters sold as bars, colored by alcohol type.
px.bar(
    data_yearly_alcohol,
    x='year',
    y='volume_sold_(liters)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Yearly liters sold, one line per alcohol type.
px.line(
    data_yearly_alcohol,
    x='year',
    y='volume_sold_(liters)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Yearly bottles sold as bars, colored by alcohol type.
px.bar(
    data_yearly_alcohol,
    x='year',
    y='bottles_sold',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Yearly bottles sold, one line per alcohol type.
px.line(
    data_yearly_alcohol,
    x='year',
    y='bottles_sold',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Yearly dollar sales as bars, colored by alcohol type.
px.bar(
    data_yearly_alcohol,
    x='year',
    y='sale_(dollars)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Yearly dollar sales, one line per alcohol type.
px.line(
    data_yearly_alcohol,
    x='year',
    y='sale_(dollars)',
    color='alcohol_type',
    template=iowa,
)

In [None]:
# Creating new features to understand the performance of each category

# Price related features
# Zero denominators are masked to NaN before dividing: a bottle volume or
# retail price of 0 would otherwise produce +/-inf, which distorts any later
# mean or sum over these columns.
retail_price = data_post_covid['state_bottle_retail']
bottle_volume_ml = data_post_covid['bottle_volume_(ml)']

data_post_covid['price_per_liter'] = (
    retail_price / bottle_volume_ml.where(bottle_volume_ml != 0) * 1000
)
data_post_covid['profit_per_bottle'] = retail_price - data_post_covid['state_bottle_cost']
data_post_covid['profit_margin'] = (
    data_post_covid['profit_per_bottle'] / retail_price.where(retail_price != 0)
) * 100


# Sales performance features
# Total dollar sales of the row's store within the row's year_month,
# repeated on every row of that (store, month) group.
data_post_covid['sales_per_store'] = (
    data_post_covid.groupby(['store_number', 'year_month'])['sale_(dollars)'].transform('sum')
)


# Time based features
# weekday 5 and 6 are Saturday and Sunday (pandas weekday: 0 = Monday).
data_post_covid['is_weekend'] = data_post_covid['weekday'].isin([5, 6]).astype(int)
data_post_covid['quarter'] = data_post_covid['date'].dt.quarter

# Sales

In [None]:
# Yearly dollar sales, one line per category.
yearly_sales_plot_options = {
    'x': 'year',
    'y': 'sale_(dollars)',
    'color': 'category_name',
    'title': 'Evolution of Sales by Category per Year',
    'template': 'iowa',
}
fig = px.line(data_yearly_category, **yearly_sales_plot_options)
fig.show()


Since we cannot see any peaks or find a specific pattern, we'll look for one in the % growth of sales.

In [None]:
# Year-over-year % change in dollar sales, computed within each category.
sales_growth = data_yearly_category.copy()
sales_per_category = sales_growth.groupby('category_name')['sale_(dollars)']

# Rows without a previous value in their category come out as NaN and are
# set to 0.
sales_growth['sales_growth'] = (sales_per_category.pct_change() * 100).fillna(0)

# Show the resulting growth table.
growth_columns = ['year', 'category_name', 'sale_(dollars)', 'sales_growth']
print(sales_growth[growth_columns])


In [None]:
# Year-over-year growth (%), one line per category.
growth_plot_options = {
    'x': 'year',
    'y': 'sales_growth',
    'color': 'category_name',
    'title': 'Year-over-Year Sales Growth by Category',
    'template': 'iowa',
}
fig_sales_growth = px.line(sales_growth, **growth_plot_options)
fig_sales_growth.show()


That is a lot of information, which makes it hard to analyze. Let's filter by the % fluctuation in sales growth to see which categories have grown or decreased their sales the most.


In [None]:
# Magnitude of the yearly growth, ignoring its direction.
sales_growth['abs_sales_growth'] = sales_growth['sales_growth'].abs()

# Mean absolute growth per category, largest first.
most_fluctuating = (
    sales_growth.groupby('category_name')['abs_sales_growth']
    .mean()
    .sort_values(ascending=False)
)

# Keep the (year, category) rows whose absolute growth exceeds 20%.
threshold = 20
exceeds_threshold = sales_growth['abs_sales_growth'] > threshold
highly_fluctuating_categories = sales_growth[exceeds_threshold]
print("\nCategories with absolute sales growth greater than", threshold, "%:")
highly_fluctuating_categories


In [None]:
# Restrict the growth table to 2021 and later.
from_2021 = sales_growth['year'] >= 2021
sales_growth_2021 = sales_growth[from_2021]

# Keep only categories that appear in the highly fluctuating table, then
# plot their growth, one line per category.
fluctuating_names = highly_fluctuating_categories['category_name']
is_fluctuating = sales_growth_2021['category_name'].isin(fluctuating_names)
fig_highly_fluctuating = px.line(
    sales_growth_2021[is_fluctuating],
    x='year',
    y='sales_growth',
    color='category_name',
    title='Year-over-Year Sales Growth of Highly Fluctuating Categories (2021-2024)',
    template='iowa',
)
fig_highly_fluctuating.show()


The following categories don't have a continuous line; they stop at year 2022. Let's understand what happened:
- imported gins
- imported whiskies
- distilled spirits specialty

In [None]:
def _yearly_rows_for(category):
    """Return the yearly aggregate rows of one category."""
    return data_yearly_category[data_yearly_category['category_name'] == category]


# Yearly rows for the three categories whose lines stop early.
imported_gins = _yearly_rows_for('IMPORTED GINS')
imported_whiskies = _yearly_rows_for('IMPORTED WHISKIES')
distilled_spirits = _yearly_rows_for('DISTILLED SPIRITS SPECIALTY')

# For each category: print its yearly sales if any rows exist, otherwise
# report that nothing was found.
_availability_reports = (
    (imported_gins,
     "Data available for Imported Gins:",
     ['year', 'sale_(dollars)'],
     "No data found for Imported Gins."),
    (imported_whiskies,
     "\nData available for Imported Whiskies:",
     ['year', 'sale_(dollars)'],
     "No data found for Imported Whiskies."),
    (distilled_spirits,
     "\nData available for Distilled Spirits Specialty:",
     ['year', 'category_name', 'sale_(dollars)'],
     "No data found for Distilled Spirits Specialty."),
)
for _rows, _found_msg, _columns, _missing_msg in _availability_reports:
    if _rows.empty:
        print(_missing_msg)
    else:
        print(_found_msg)
        print(_rows[_columns])

Since we don't have data for these 3 categories for the last 2 years, we are not going to take them into consideration in the post-covid analysis.

These three categories were the worst performers in 2022. This could be the reason they stopped buying in 2023.

In [None]:
# Count distinct category names per alcohol type among the highly
# fluctuating rows.
# `highly_fluctuating_categories` is derived from the yearly category
# aggregates, which carry no `alcohol_type` column, so grouping on it directly
# raises KeyError. The type is looked up here from the transaction table
# instead (one row per category name).
category_to_type = (
    data_post_covid[['category_name', 'alcohol_type']]
    .drop_duplicates(subset='category_name')
)
fluctuating_names = highly_fluctuating_categories[['category_name']].drop_duplicates()
category_counts_fluctuating = (
    fluctuating_names.merge(category_to_type, on='category_name', how='left')
    .groupby('alcohol_type')['category_name']
    .nunique()
)
category_counts_fluctuating


In [None]:
# Number of distinct category names under each alcohol type, over the whole
# dataset.
category_counts = (
    data_post_covid
    .groupby('alcohol_type')['category_name']
    .nunique()
)
category_counts


- Bourbon: 66% of the categories have more than 20% fluctuations in sales growth/decrease.
- Gins: 40% of the categories have more than 20% fluctuations in sales growth/decrease.
- Others: 44% of the categories have more than 20% fluctuations in sales growth/decrease.
- Whiskies: 30% of the categories have more than 20% fluctuations in sales growth/decrease.


Let's dig into the sales per alcohol type so we have more details about these fluctuations.


## Sales per Alcohol type

In [None]:
# Attach `alcohol_type` to the yearly growth table.
# The lookup must hold exactly one row per category name: merging against the
# raw transaction table would repeat every (year, category) row once per
# transaction and inflate any later sum over `sales_growth`.
category_to_type = (
    data_post_covid[['category_name', 'alcohol_type']]
    .drop_duplicates(subset='category_name')
)

# Drop a previously merged column first so re-running this cell does not
# create `alcohol_type_x` / `alcohol_type_y`.
sales_growth = pd.merge(
    sales_growth.drop(columns='alcohol_type', errors='ignore'),
    category_to_type,
    on='category_name',
    how='left',
)

# Display the updated sales_growth DataFrame
sales_growth


### Bourbon

In [None]:
# Yearly totals restricted to the BOURBON alcohol type.
is_bourbon = data_yearly_alcohol['alcohol_type'] == 'BOURBON'
bourbon_data = data_yearly_alcohol[is_bourbon]

# Dollar sales per year as a single line.
fig = px.line(
    bourbon_data,
    x='year',
    y='sale_(dollars)',
    title='Bourbon Sales Over Time',
    template='iowa',
)

# Treat years as categories so the axis shows whole years only.
fig.update_xaxes(type='category')

fig.show()


We can see a clear increase in sales over the past 4 years.

In [None]:
# Growth rows whose alcohol type is BOURBON.
bourbon_rows = sales_growth['alcohol_type'] == 'BOURBON'
bourbon_sales_growth = sales_growth[bourbon_rows]

# Year-over-year growth, one line per bourbon category.
bourbon_growth_options = {
    'x': 'year',
    'y': 'sales_growth',
    'color': 'category_name',
    'title': 'Bourbon Sales Growth by Category over the Years',
    'template': 'iowa',
}
fig_bourbon_growth = px.line(bourbon_sales_growth, **bourbon_growth_options)
fig_bourbon_growth.show()


- SINGLE BARREL BOURBON WHISKIES experienced significant growth, peaking in 2023 with around 69% growth but then saw a sharp decline to below zero in 2024, indicating a major drop in sales.

- BOTTLED IN BOND BOURBON had steady growth, reaching its highest point in 2022 (37%) but slightly declined in 2023 and then dropped close to zero in 2024.

- STRAIGHT BOURBON WHISKIES maintained modest and steady growth throughout the period, peaking slightly in 2023 before a minor decline in 2024, though still positive.

The fluctuations show highly volatile categories within the bourbon.









In [None]:
# Rows of the growth table that belong to BOURBON.
bourbon_categories = sales_growth[sales_growth['alcohol_type'] == 'BOURBON']

# Total dollar sales per category, largest first.
# NOTE(review): assumes `sales_growth` holds one row per (year, category);
# duplicated rows would inflate these totals.
bourbon_sales_by_category = (
    bourbon_categories.groupby('category_name')['sale_(dollars)']
    .sum()
    .reset_index()
    .sort_values('sale_(dollars)', ascending=False)
)

# Bar chart of the totals.
fig = px.bar(
    bourbon_sales_by_category,
    x='category_name',
    y='sale_(dollars)',
    title='Total Bourbon Sales by Category',
    template='iowa',
)
fig.update_xaxes(title_text='Bourbon Category')
fig.update_yaxes(title_text='Total Sales ($)')
fig.show()


Bourbon sales peaked in 2023 but saw a sharp decline in 2024, especially in single barrel varieties. Despite growth fluctuations, Straight Bourbon Whiskies dominate total sales. The market remains steady, driven by traditional bourbon preferences.

### Gins

In [None]:
# Yearly totals restricted to the GINS alcohol type.
is_gins = data_yearly_alcohol['alcohol_type'] == 'GINS'
gins_data = data_yearly_alcohol[is_gins]

# Dollar sales per year as a single line.
fig = px.line(
    gins_data,
    x='year',
    y='sale_(dollars)',
    title='Gins Sales Over Time',
    template='iowa',
)

# Treat years as categories so the axis shows whole years only.
fig.update_xaxes(type='category')

fig.show()


In [None]:
# Growth rows whose alcohol type is GINS.
gins_rows = sales_growth['alcohol_type'] == 'GINS'
gins_sales_growth = sales_growth[gins_rows]

# Year-over-year growth, one line per gin category.
gins_growth_options = {
    'x': 'year',
    'y': 'sales_growth',
    'color': 'category_name',
    'title': 'Gins Sales Growth by Category over the Years',
    'template': 'iowa',
}
fig_gins_growth = px.line(gins_sales_growth, **gins_growth_options)
fig_gins_growth.show()


In [None]:
# Yearly rows for IMPORTED GINS limited to 2023 and 2024.
is_imported_gins = data_yearly_category['category_name'] == 'IMPORTED GINS'
in_2023_or_2024 = data_yearly_category['year'].isin([2023, 2024])
imported_gins_2023_2024 = data_yearly_category[is_imported_gins & in_2023_or_2024]

# Report the yearly sales if any rows exist, otherwise say none were found.
if imported_gins_2023_2024.empty:
    print("No data found for Imported Gins in 2023 and 2024.")
else:
    print("Data available for Imported Gins in 2023 and 2024:")
    print(imported_gins_2023_2024[['year', 'sale_(dollars)']])


Since it had a bad performance in 2022 (decreasing sales by 50%), they stopped buying this category.

In [None]:
# Rows of the growth table that belong to GINS.
gins_categories = sales_growth[sales_growth['alcohol_type'] == 'GINS']

# Total dollar sales per category, largest first.
# NOTE(review): assumes `sales_growth` holds one row per (year, category);
# duplicated rows would inflate these totals.
gins_sales_by_category = (
    gins_categories.groupby('category_name')['sale_(dollars)']
    .sum()
    .reset_index()
    .sort_values('sale_(dollars)', ascending=False)
)

# Bar chart of the totals.
fig = px.bar(
    gins_sales_by_category,
    x='category_name',
    y='sale_(dollars)',
    title='Total Gins Sales by Category',
    template='iowa',
)
fig.update_xaxes(title_text='Gins Category')
fig.update_yaxes(title_text='Total Sales ($)')
fig.show()


Gin experienced peak growth in 2022 but has since declined. Total sales remain strong, driven mainly by dry gins. Fluctuations suggest evolving consumer preferences, though overall demand stays stable.

### Others


In [None]:
# Yearly totals restricted to the OTHERS alcohol type.
is_others = data_yearly_alcohol['alcohol_type'] == 'OTHERS'
others_data = data_yearly_alcohol[is_others]

# Dollar sales per year as a single line.
fig = px.line(
    others_data,
    x='year',
    y='sale_(dollars)',
    title='Others Sales Over Time',
    template='iowa',
)

# Treat years as categories so the axis shows whole years only.
fig.update_xaxes(type='category')

fig.show()


In [None]:
# Growth rows whose alcohol type is OTHERS.
others_rows = sales_growth['alcohol_type'] == 'OTHERS'
others_sales_growth = sales_growth[others_rows]

# Year-over-year growth, one line per category in OTHERS.
others_growth_options = {
    'x': 'year',
    'y': 'sales_growth',
    'color': 'category_name',
    'title': 'Others Sales Growth by Category over the Years',
    'template': 'iowa',
}
fig_others_growth = px.line(others_sales_growth, **others_growth_options)
fig_others_growth.show()


In [None]:
# Rows of the growth table that belong to OTHERS.
others_categories = sales_growth[sales_growth['alcohol_type'] == 'OTHERS']

# Total dollar sales per category, largest first.
# NOTE(review): assumes `sales_growth` holds one row per (year, category);
# duplicated rows would inflate these totals.
others_sales_by_category = (
    others_categories.groupby('category_name')['sale_(dollars)']
    .sum()
    .reset_index()
    .sort_values('sale_(dollars)', ascending=False)
)

# Bar chart of the totals; x labels are rotated and shrunk to stay legible.
fig = px.bar(
    others_sales_by_category,
    x='category_name',
    y='sale_(dollars)',
    title='Total Others Sales by Category',
    template='iowa',
)
fig.update_xaxes(title_text='', tickangle=45, tickfont=dict(size=10))
fig.update_yaxes(title_text='Total Sales ($)')
fig.show()


Sales in the "Others" alcohol category show volatility, with sharp peaks and dips across years. Despite fluctuating growth, Temporary & Specialty Packages and Cocktails/RTD dominate total sales.

### Whiskies

In [None]:
# Yearly totals restricted to the WHISKIES alcohol type.
is_whiskies = data_yearly_alcohol['alcohol_type'] == 'WHISKIES'
whiskies_data = data_yearly_alcohol[is_whiskies]

# Dollar sales per year as a single line.
fig = px.line(
    whiskies_data,
    x='year',
    y='sale_(dollars)',
    title='Whiskies Sales Over Time',
    template='iowa',
)

# Treat years as categories so the axis shows whole years only.
fig.update_xaxes(type='category')

fig.show()


In [None]:
# Growth rows whose alcohol type is WHISKIES.
whiskies_rows = sales_growth['alcohol_type'] == 'WHISKIES'
whiskies_sales_growth = sales_growth[whiskies_rows]

# Year-over-year growth, one line per whisky category.
whiskies_growth_options = {
    'x': 'year',
    'y': 'sales_growth',
    'color': 'category_name',
    'title': 'Whiskies Sales Growth by Category over the Years',
    'template': 'iowa',
}
fig_whiskies_growth = px.line(whiskies_sales_growth, **whiskies_growth_options)
fig_whiskies_growth.show()


In [None]:
# Yearly rows for IMPORTED WHISKIES limited to 2023 and 2024.
is_imported_whiskies = data_yearly_category['category_name'] == 'IMPORTED WHISKIES'
in_2023_or_2024 = data_yearly_category['year'].isin([2023, 2024])
imported_whiskies_2023_2024 = data_yearly_category[is_imported_whiskies & in_2023_or_2024]

# Report the yearly sales if any rows exist, otherwise say none were found.
if imported_whiskies_2023_2024.empty:
    print("No data found for Imported Whiskies in 2023 and 2024.")
else:
    print("Data available for Imported Whiskies in 2023 and 2024:")
    print(imported_whiskies_2023_2024[['year', 'sale_(dollars)']])


In [None]:
# Rows of the growth table that belong to WHISKIES.
whiskies_categories = sales_growth[sales_growth['alcohol_type'] == 'WHISKIES']

# Total dollar sales per category, largest first.
# NOTE(review): assumes `sales_growth` holds one row per (year, category);
# duplicated rows would inflate these totals.
whiskies_sales_by_category = (
    whiskies_categories.groupby('category_name')['sale_(dollars)']
    .sum()
    .reset_index()
    .sort_values('sale_(dollars)', ascending=False)
)

# Bar chart of the totals; x labels are rotated and shrunk to stay legible.
fig = px.bar(
    whiskies_sales_by_category,
    x='category_name',
    y='sale_(dollars)',
    title='Total Whiskies Sales by Category',
    template='iowa',
)
fig.update_xaxes(title_text='', tickangle=45, tickfont=dict(size=10))
fig.update_yaxes(title_text='Total Sales ($)')
fig.show()


Whisky sales show overall market stability despite category-specific fluctuations. Canadian Whiskies dominate total sales, maintaining a strong market lead. While some types saw minor growth shifts, consumer preference remains largely consistent.

**Sales per Alcohol Type Takeaways**

The overall sales fluctuations often stem from categories with volatile growth but minimal market share, which can disproportionately affect sales trends without significantly impacting total revenue. In the graphs:

- Gin: The large spike in flavored gin sales caused notable growth, but since its total market share is low, it had limited impact on overall revenue.
- Others: Temporary & Specialty Packages show extreme volatility yet hold a dominant share, directly influencing total sales.
- Whiskies: Imported Whiskies experienced major declines, but since Canadian Whiskies dominate total sales, the impact was cushioned.
- Bourbon: Single Barrel Bourbons showed sharp growth and decline, but the market is led by Straight Bourbon Whiskies, stabilizing overall sales.

Large fluctuations in minor categories create spikes in growth rates but often don’t alter total sales trends unless the volatile category holds significant market share.

In [None]:
## We could also analyze the least-fluctuating categories: fewer fluctuations mean more consistent sales, and fewer short-lived spikes suggest sustained market success.