## DEAD

Fitting a model (or two) based on our proposal.

- **Goal:** Build a model to predict sales in a month for any given store.
- **Response Variable:** Monthly Sales
- **Possible Features:** store, month, county, population demographics, store proximity, alcohol categories

In [2]:
import duckdb as db 
con = db.connect()
import pandas as pd 
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
# MAIN TABLE
# Load the liquor sales parquet file into a DuckDB table named `sales`, keeping
# the month and year extracted from `date`, the store/city/county identifiers,
# and the category, bottle count and dollar amount under shorter column names.
con.execute("""
        DROP TABLE IF EXISTS sales;
        CREATE TABLE sales AS 
        SELECT EXTRACT(MONTH FROM date) AS month, EXTRACT (YEAR FROM date) AS year,
            store, city, county, 
            category_name AS category, sale_bottles AS bottles, sale_dollars AS dollars
        FROM read_parquet('../data/iowa_liquor_2023_2025.parquet');
""")
# Pull the whole table into pandas as well.
sales = con.execute("SELECT * FROM sales").df()

# POPULATION
# Population table from pop.csv; pct_male is derived in SQL as
# POPEST_MALE / (POPEST_FEM + POPEST_MALE).
con.execute(
"""
        DROP TABLE IF EXISTS population;
        CREATE TABLE population AS
        SELECT name AS county, year_1 AS year, popestimate AS population, over21, propOver21, median_age_tot AS median_age, (POPEST_MALE / (POPEST_FEM + POPEST_MALE)) AS pct_male
        FROM read_csv_auto('../data/pop.csv');
"""
)
pop = con.execute("SELECT * FROM population").df()

# PROXIMITY
# Only the county column is kept from proximity.csv.
# NOTE(review): presumably one row per store, since a later cell counts rows
# per county and calls the result store_count — confirm against the CSV.
con.execute(
"""
        DROP TABLE IF EXISTS proximity;
        CREATE TABLE proximity AS
        SELECT county
        FROM read_csv_auto('../data/proximity.csv');
"""
)
prox = con.execute("SELECT * FROM proximity").df()




In [191]:
# Median household income by county: strip the trailing " County" from the
# county names, switch to the snake_case column names used by the other
# tables, and parse the comma-formatted dollar strings into floats.
income = pd.read_csv("../data/HDPulse_data_export.csv")

short_names = income["County"].str.replace(r"\s+County$", "", regex=True)
income = income.assign(County=short_names).rename(
    columns={"County": "county", "Value (Dollars)": "median_income"}
)
income["median_income"] = (
    income["median_income"].str.replace(",", "").astype(float)
)

income.head()

Unnamed: 0,county,FIPS,median_income,Rank within US (of 3141 counties)
0,Appanoose,19007.0,51146.0,2607
1,Audubon,19009.0,54152.0,2408
2,Union,19175.0,56813.0,2202
3,Jefferson,19101.0,56824.0,2201
4,Decatur,19053.0,57146.0,2175


In [40]:
# Collapse the proximity rows to one row per county with the number of rows
# seen for it (assumes each row of proximity.csv is one store — TODO confirm).
# Counting the `county` Series and naming the result explicitly avoids relying
# on the auto-generated 'count' column name, which only exists in pandas >= 2.0;
# on older versions the previous rename was a silent no-op and the later join
# on prox.store_count failed.
prox = (
    prox["county"]
    .value_counts()            # sorted by descending count, NaN counties dropped
    .rename_axis("county")     # make sure the index carries the column name
    .reset_index(name="store_count")
)
prox.head()

Unnamed: 0_level_0,count
county,Unnamed: 1_level_1
POLK,317
LINN,168
SCOTT,112
BLACK HAWK,108
JOHNSON,93


In [4]:
# Show the column names of the sales DataFrame.
sales.columns

Index(['month', 'year', 'store', 'city', 'county', 'category', 'bottles',
       'dollars'],
      dtype='object')

## Creating the Dataset

First, I am going to engineer the category column a little bit to use as features. Knowing which alcohol sells best could be useful for telling Booze R Us what they should buy in order to increase profits.

In [5]:
# Rebuild the `sales` table with an extra `super_category` column that buckets
# the detailed category names into Vodka / Whiskey / Tequila / Rum / Other using
# case-insensitive substring matches. The WHEN branches are tried in order, so
# the first matching pattern decides the bucket.
# NOTE(review): categories whose names do not contain one of these substrings
# (e.g. a whiskey style named without "WHISK") land in 'Other' — verify against
# the distinct category values.
# NOTE(review): the SELECT * means re-running this cell would carry the existing
# super_category column along and add another — confirm before re-executing.
con.execute("""
    CREATE OR REPLACE TABLE sales AS
    SELECT *,
        CASE
            WHEN category ILIKE '%VODKA%' THEN 'Vodka'
            WHEN category ILIKE '%WHISK%' THEN 'Whiskey'
            WHEN category ILIKE '%TEQUILA%' OR category ILIKE '%MEZCAL%' THEN 'Tequila'
            WHEN category ILIKE '%RUM%' THEN 'Rum'
            ELSE 'Other'
        END AS super_category
    FROM sales
""")
# Refresh the pandas copy so it includes the new column.
sales = con.execute("SELECT * FROM sales").df()

In [6]:
# Preview the first rows of sales, including the super_category column.
sales.head()

Unnamed: 0,month,year,store,city,county,category,bottles,dollars,super_category
0,1,2023,4829,DES MOINES,POLK,100% AGAVE TEQUILA,12,261.0,Tequila
1,1,2023,4829,DES MOINES,POLK,AMERICAN VODKAS,60,418.8,Vodka
2,1,2023,4829,DES MOINES,POLK,IMPORTED FLAVORED VODKA,24,358.56,Vodka
3,1,2023,4829,DES MOINES,POLK,CREAM LIQUEURS,12,306.0,Other
4,1,2023,4829,DES MOINES,POLK,SPICED RUM,60,1124.4,Rum


Now I need to aggregate to create our observational units: monthly sales per county (the query below groups by year, month, and county).

- Dollars (our response variable) will be summed. 
- Category will be made into new columns representing the distribution of category sales
    - e.g. 70% tequila, 20% vodkas, etc.
    - we will not use total bottles because this would be almost perfectly collinear 
    - answer questions like: 'what liquor should we sell more/less of?'

In [7]:
# Build one row per (year, month, county):
#   - month_totals: total dollars for that county-month (`revenue`)
#   - category_totals: dollars per super_category for that county-month
# The final SELECT divides each category's dollars by the county-month revenue
# and rounds to 2 decimals, giving the *_ptc share columns next to `revenue`.
# NOTE(review): the grouping key is county, not store, so the observational
# unit is a county-month — confirm this matches the modelling goal.
# NOTE(review): rows with a NULL county cannot match the LEFT JOIN's equality
# condition, so their share columns would all come out as 0 — confirm whether
# such rows exist.
monthly_sales = con.execute(
"""
    WITH month_totals AS (
        SELECT
            year,
            month,
            county,
            SUM(dollars) AS revenue
        FROM sales
        GROUP BY year, month, county
    ), category_totals AS (
        SELECT
            year,
            month,
            county,
            super_category,
            SUM(dollars) AS category_sales
        FROM sales
        GROUP BY year, month, county, super_category
    )
    SELECT
        mt.year,
        mt.month,
        mt.county,
        ROUND(SUM(CASE WHEN ct.super_category = 'Vodka' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS vodka_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Whiskey' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS whiskey_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Tequila' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS tequila_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Rum' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS rum_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Other' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS other_ptc,
        mt.revenue
    FROM month_totals mt
    LEFT JOIN category_totals ct
        ON mt.year = ct.year
        AND mt.month = ct.month
        AND mt.county = ct.county
    GROUP BY mt.year, mt.month, mt.county, mt.revenue
"""
).fetchdf()


In [8]:
# Preview two rows of the county-month aggregate.
monthly_sales.head(2)

Unnamed: 0,year,month,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue
0,2023,3,HOWARD,0.22,0.39,0.03,0.11,0.24,63183.04
1,2023,3,BREMER,0.21,0.42,0.05,0.13,0.19,173736.94


Now I will join with our other datasets: population, proximity, and income. Using an inner join because it still leaves plenty of complete data for modelling. 

In [192]:
# Inner-join the county-month sales with the population, store-count and income
# data. County names are compared in lower case because the sources use
# different casing; population is additionally matched on year, while the
# store-count and income tables are matched on county only.
# NOTE(review): monthly_sales, pop, prox and income are not created as DuckDB
# tables in the cells shown here, so this presumably relies on DuckDB reading
# the pandas DataFrames of those names directly — confirm.
df = con.execute(
    """
        SELECT sales.*, 
            pop.population, pop.over21, pop.propOver21, pop.median_age, pop.pct_male, prox.store_count, income.median_income
        FROM monthly_sales sales
        JOIN pop
            ON LOWER(sales.county) = LOWER(pop.county) AND sales.year = pop.year
        JOIN prox
            on LOWER(pop.county) = LOWER(prox.county)
        JOIN income
            on LOWER(pop.county) = LOWER(income.county)

        
    """
    ).fetchdf()

In [193]:
# Preview two rows of the joined modelling table.
df.head(2)

Unnamed: 0,year,month,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue,population,over21,propOver21,median_age,pct_male,store_count,median_income
0,2023,3,HOWARD,0.22,0.39,0.03,0.11,0.24,63183.04,9357,7031,0.751416,41.2,0.505504,10,67336.0
1,2023,3,BREMER,0.21,0.42,0.05,0.13,0.19,173736.94,25304,19650,0.776557,39.8,0.500356,16,83343.0


Now encode month to use as a categorical feature:

In [194]:
# One-hot encode the calendar month. drop_first=True leaves out the first
# month level so it acts as the baseline; the original `month` column is then
# swapped for the month_* indicator columns, which are appended on the right.
months = pd.get_dummies(df["month"], prefix="month", drop_first=True)
without_month = df.drop(columns=["month"])
df = pd.concat((without_month, months), axis="columns")
df.head()

Unnamed: 0,year,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue,population,over21,propOver21,median_age,pct_male,store_count,median_income,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2023,HOWARD,0.22,0.39,0.03,0.11,0.24,63183.04,9357,7031,0.751416,41.2,0.505504,10,67336.0,False,True,False,False,False,False,False,False,False,False,False
1,2023,BREMER,0.21,0.42,0.05,0.13,0.19,173736.94,25304,19650,0.776557,39.8,0.500356,16,83343.0,False,True,False,False,False,False,False,False,False,False,False
2,2023,HANCOCK,0.26,0.45,0.03,0.09,0.17,60634.91,10631,8343,0.78478,43.9,0.505785,8,70212.0,False,True,False,False,False,False,False,False,False,False,False
3,2023,RINGGOLD,0.16,0.55,0.03,0.04,0.21,17068.38,4646,3594,0.773569,44.1,0.49957,2,69821.0,False,True,False,False,False,False,False,False,False,False,False
4,2023,HANCOCK,0.28,0.41,0.02,0.09,0.19,50004.69,10631,8343,0.78478,43.9,0.505785,8,70212.0,False,False,False,False,False,True,False,False,False,False,False


In [195]:
# Coerce revenue to numeric (unparseable values become NaN), keep only rows
# with strictly positive revenue (this also drops the NaN rows), and save the
# modelling table to disk.
df = df.assign(revenue=pd.to_numeric(df['revenue'], errors='coerce'))
positive_revenue = df['revenue'] > 0
df = df[positive_revenue]
df.to_csv('../data/dead_model_data.csv', index=False)

## Linear Regression (from scratch)

First, the function to fit the model. It takes the observed X and Y matrices and returns the estimated coefficients (predictions come from a separate function below).

In [13]:
def fit_lr(X, Y):
    """Fit ordinary least squares with an intercept and return the coefficients.

    Parameters
    ----------
    X : numpy array of predictors, one row per observation. Cast to float, so
        boolean/object columns holding numbers are accepted.
    Y : array-like response with one entry per row of X.

    Returns
    -------
    B : numpy array of coefficients; B[0] is the intercept and the remaining
        entries follow the column order of X.
    """
    # add column of ones for the intercept
    design = np.column_stack([np.ones((X.shape[0], 1)), X.astype(float)])
    target = np.asarray(Y, dtype=float)

    # Solve the least-squares problem on the design matrix itself rather than
    # via pinv(X.T @ X) @ X.T @ Y: forming X.T @ X squares the condition
    # number, which loses accuracy when features live on very different
    # scales. lstsq still returns the minimum-norm solution when columns are
    # collinear, as the pseudo-inverse did.
    B, *_ = np.linalg.lstsq(design, target, rcond=None)

    return B

In [14]:
def predict_lr(X, B):
    """Return the linear-model predictions for X given coefficients B.

    An intercept column of ones is placed in front of X (cast to float), so
    B[0] is treated as the intercept and the remaining entries line up with
    the columns of X.
    """
    predictors = X.astype(float)
    intercept = np.ones(predictors.shape[0])
    design = np.column_stack((intercept, predictors))
    return design @ B

A cross validation function:

In [15]:
def lr_cross_val(data, features, response, k=5, seed=None):
    """Return out-of-fold linear-regression predictions from k-fold CV.

    The rows of `data` are shuffled and split into k folds; for each fold a
    model is fitted with `fit_lr` on the remaining rows and used via
    `predict_lr` to predict the held-out rows.

    Parameters
    ----------
    data : pandas DataFrame holding the feature and response columns.
    features : list of column names used as predictors.
    response : name of the response column.
    k : number of folds; must satisfy 2 <= k <= number of rows.
    seed : optional int. When given, the shuffle is reproducible; when None
        (the default) numpy's global random state is used, as before.

    Returns
    -------
    numpy array of length len(data); entry i is the prediction for positional
    row i made by the model that did not see that row.

    Raises
    ------
    ValueError
        If k < 2 or k exceeds the number of rows. Previously these cases left
        a fold with an empty training set and silently produced all-zero
        predictions (or a ZeroDivisionError for k == 0).
    """
    n = data.shape[0]
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k}")
    if n < k:
        raise ValueError(f"cannot split {n} rows into {k} folds")

    # make k folds from a shuffled list of positional indices
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = np.arange(n)
    rng.shuffle(indices)
    fold_size = n // k
    folds = [indices[i * fold_size:(i + 1) * fold_size] for i in range(k - 1)]
    folds.append(indices[(k - 1) * fold_size:])  # last fold gets the remainder

    # loop over folds and store their predictions
    predictions = np.zeros(n)
    for test_idx in folds:
        # training indices = everything not in this fold
        train_idx = np.setdiff1d(indices, test_idx)

        train = data.iloc[train_idx]
        test = data.iloc[test_idx]

        # fit on the training rows, predict the held-out rows
        B = fit_lr(train[features].values, train[response].values)
        predictions[test_idx] = predict_lr(test[features].values, B)

    return predictions


Fit the model:

- *NOTE:* I first fit with all possible features, then after assessing model fit, feature importance, and multicollinearity (all below), I came back up and changed the features. So this is just the final model here, not all iterations.

In [None]:
# Stores per person in the over21 column for each row (store_count / over21).
# NOTE(review): store_ratio does not appear in the feature list used by the fit
# cell shown in this notebook — confirm whether it is still needed.
df["store_ratio"] = df["store_count"] / df["over21"]


In [211]:
# The model is fitted on the natural log of revenue.
response = 'log_revenue'
df[response] = np.log(df['revenue'])

# Final feature set: three of the month indicators plus county-level covariates.
features = [
    'month_5',
    'month_6',
    'month_11',
    'propOver21',
    'median_age',
    'pct_male',
    'store_count',
    'median_income',
]

# 5-fold cross-validated predictions of log revenue
Yhat = lr_cross_val(df, features, response, 5)

 Assess model fit:

In [212]:
# Compare the cross-validated predictions (Yhat) with the observed response.
Y = df[response].values
X = df[features].values
RSS = np.sum((Y - Yhat)**2)          # residual sum of squares
TSS = np.sum((Y - np.mean(Y))**2)    # total sum of squares around the mean
R2 = 1 - RSS/TSS
n, p = X.shape  # p counts the features only; X has no intercept column
# Adjusted R2 penalises for the p slopes plus the intercept, hence n - p - 1
# residual degrees of freedom (the earlier n - p left the intercept out).
adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MAE = np.mean(np.abs(Y - Yhat))
RMSE = np.sqrt(RSS / n)

print(f"R2 = {R2:.4f}")
print(f"Adjusted R2 = {adj_R2:.4f}")
print(f"MAE = {MAE:.2f}")
print(f"RMSE = {RMSE:.2f}")

R2 = 0.6224
Adjusted R2 = 0.6212
MAE = 0.60
RMSE = 0.76


Fit the finalized model now on all the data and look at the estimators:

In [149]:
# Fit a final model on all the data (no held-out folds); X and Y are the
# feature matrix and response vector built when assessing model fit.
B_final = fit_lr(X, Y)

In [150]:
# Pair each fitted coefficient with its feature name, leave out the intercept,
# and list the rest from largest to smallest absolute value. The betas are on
# the features' original (unstandardised) scales.
coef_names = ['intercept', *df[features].columns]
nonstd_df = pd.DataFrame({'feature': coef_names, 'beta': B_final})

is_slope = nonstd_df['feature'] != 'intercept'
nonstd_df = nonstd_df[is_slope]

by_magnitude = nonstd_df['beta'].abs().sort_values(ascending=False).index
nonstd_df = nonstd_df.loc[by_magnitude]
print(nonstd_df)

       feature      beta
6     pct_male -8.936625
4   propOver21  6.286309
8         POLK -6.017989
3     month_11  0.130215
1      month_5  0.103540
2      month_6  0.083750
5   median_age -0.056131
7  store_count  0.033682
