## DEAD

Fitting a model (or two) based on our proposal.

- **Goal:** Build a model to predict sales in a month for any given store.
- **Response Variable:** Monthly Sales
- **Possible Features:** store, month, county, population demographics (e.g. population size, share over 21, median age), store proximity (number of stores per county), alcohol categories

In [1]:
import duckdb as db 
con = db.connect()
import pandas as pd 
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# MAIN TABLE
# Build the `sales` table from the liquor-sales parquet file: month and year
# are extracted from `date`, and the category/bottle/dollar columns are
# renamed to shorter names. The table is then pulled into pandas.
con.execute("""
        DROP TABLE IF EXISTS sales;
        CREATE TABLE sales AS 
        SELECT EXTRACT(MONTH FROM date) AS month, EXTRACT (YEAR FROM date) AS year,
            store, city, county, 
            category_name AS category, sale_bottles AS bottles, sale_dollars AS dollars
        FROM read_parquet('../data/iowa_liquor_2023_2025.parquet');
""")
sales = con.execute("SELECT * FROM sales").df()

# POPULATION
# County/year population table. `pct_male` is computed here as
# POPEST_MALE / (POPEST_FEM + POPEST_MALE); the other columns are renamed
# copies of the CSV columns.
con.execute(
"""
        DROP TABLE IF EXISTS population;
        CREATE TABLE population AS
        SELECT name AS county, year_1 AS year, popestimate AS population, over21, propOver21, median_age_tot AS median_age, (POPEST_MALE / (POPEST_FEM + POPEST_MALE)) AS pct_male
        FROM read_csv_auto('../data/pop.csv');
"""
)
pop = con.execute("SELECT * FROM population").df()

# PROXIMITY
# Only the `county` column of the proximity file is kept.
# Presumably each row of proximity.csv is one store, so counting rows per
# county later yields a store count -- TODO confirm against the file.
con.execute(
"""
        DROP TABLE IF EXISTS proximity;
        CREATE TABLE proximity AS
        SELECT county
        FROM read_csv_auto('../data/proximity.csv');
"""
)
prox = con.execute("SELECT * FROM proximity").df()




In [3]:
# Mapping from the raw HDPulse column headers to snake_case names.
# "pct_*" columns hold the percentage variant of each measure; the
# un-prefixed columns hold the matching counts.
_SOCIOECONOMIC_RENAMES = {
    "County": "county",
    "FIPS": "fips",
    # education
    "% People (Education: Less Than 9th Grade)": "pct_less_than_9th_grade",
    "People (Education: Less Than 9th Grade)": "less_than_9th_grade",
    # employment / poverty
    "% People (Unemployed)": "pct_unemployed",
    "People (Unemployed)": "unemployed",
    "% People (Below Poverty)": "pct_below_poverty",
    "People (Below Poverty)": "below_poverty",
    "% People (<150% Of Poverty)": "pct_below_150pct_poverty",
    "People (<150% Of Poverty)": "below_150pct_poverty",
    "% Families (Below Poverty)": "pct_families_below_poverty",
    "Families (Below Poverty)": "families_below_poverty",
    # race / ethnicity / origin
    "% People (White)": "pct_white",
    "People (White)": "white",
    "% People (Hispanic)": "pct_hispanic",
    "People (Hispanic)": "hispanic",
    "% People (Foreign Born)": "pct_foreign_born",
    "People (Foreign Born)": "foreign_born",
    "% People (Black)": "pct_black",
    "People (Black)": "black",
    "% People (API)": "pct_api",
    "People (API)": "api",
    "% People (AI/AN)": "pct_aian",
    "People (AI/AN)": "aian",
    "% Households (language Isolation)": "pct_language_isolation",
    "Households (language Isolation)": "language_isolation",
    # education (continued; these raw headers have no space before the parenthesis)
    "% People(Education: Less Than High School)": "pct_less_than_high_school",
    "People(Education: Less Than High School)": "less_than_high_school",
    "% People (Education: At Least Bachelor's Degree)": "pct_at_bachelor_degree",
}

socioeconomic = pd.read_csv("../data/HDPulse_socioeconomic_data.csv")

# Strip the trailing " County" suffix so names can be matched against the
# county names used by the other tables.
socioeconomic["County"] = socioeconomic["County"].str.replace(r"\s+County$", "", regex=True)

socioeconomic = socioeconomic.rename(columns=_SOCIOECONOMIC_RENAMES)
socioeconomic.head()

Unnamed: 0,county,fips,pct_less_than_9th_grade,less_than_9th_grade,pct_unemployed,unemployed,pct_below_poverty,below_poverty,pct_below_150pct_poverty,below_150pct_poverty,pct_families_below_poverty,families_below_poverty,pct_white,white,pct_hispanic,hispanic,pct_foreign_born,foreign_born,pct_black,black,pct_api,api,pct_aian,aian,pct_language_isolation,language_isolation,pct_less_than_high_school,less_than_high_school,pct_at_bachelor_degree
0,Adair,19001.0,2.1,110.0,4.2,162.0,10.3,753.0,21.8,1591.0,5.0,92.0,94.9,7091.0,2.9,214.0,1.6,120.0,1.4,101.0,0.5,34.0,0.4,27.0,0.0,0.0,5.5,287.0,20.0
1,Adams,19003.0,2.6,68.0,2.9,53.0,9.5,339.0,17.7,629.0,3.6,35.0,95.4,3475.0,0.7,27.0,0.2,9.0,0.4,13.0,0.0,0.0,0.5,20.0,0.3,5.0,7.4,197.0,24.5
2,Allamakee,19005.0,3.0,293.0,3.6,252.0,12.1,1657.0,19.8,2716.0,9.4,350.0,90.4,12690.0,8.7,1225.0,6.7,937.0,1.8,246.0,0.5,76.0,0.8,115.0,4.4,265.0,7.5,721.0,19.9
3,Appanoose,19007.0,5.5,480.0,6.9,375.0,20.4,2468.0,31.9,3851.0,10.8,346.0,95.1,11641.0,1.7,212.0,1.2,150.0,1.0,121.0,0.4,53.0,0.1,7.0,1.0,51.0,10.3,907.0,19.3
4,Audubon,19009.0,2.7,108.0,1.9,52.0,11.8,649.0,20.9,1147.0,7.1,109.0,95.2,5351.0,2.0,115.0,0.9,48.0,0.3,15.0,0.1,7.0,0.2,12.0,0.9,23.0,8.0,322.0,20.6


In [4]:
# Combined low-education share: "less than high school" plus "less than 9th grade".
# NOTE(review): if the source's "less than high school" figure already includes
# people with less than a 9th-grade education, this sum double-counts them --
# confirm against the HDPulse data dictionary.
socioeconomic["pct_high_school_lower"] = socioeconomic["pct_less_than_high_school"].add(
    socioeconomic["pct_less_than_9th_grade"]
)

In [5]:
# Collapse the proximity table to one row per county with the number of rows
# (presumably stores -- TODO confirm) recorded for it, most frequent first.
# The count series is named explicitly instead of relying on value_counts()
# calling it 'count', which is only true for pandas >= 2.0; on older versions
# the previous rename was a silent no-op and `store_count` never existed.
prox = (
    prox.value_counts()
    .rename('store_count')
    .reset_index()
)
prox.head()

Unnamed: 0,county,store_count
0,POLK,317
1,LINN,168
2,SCOTT,112
3,BLACK HAWK,108
4,JOHNSON,93


In [6]:
sales.columns

Index(['month', 'year', 'store', 'city', 'county', 'category', 'bottles',
       'dollars'],
      dtype='object')

## Creating the Dataset

First, I am going to engineer the category column a little bit to use as features. Knowing which alcohol sells the best could be useful for telling Booze R Us what they should buy in order to increase profits.

In [7]:
# Rebuild `sales` with an extra `super_category` column that buckets the raw
# category name into Vodka / Whiskey / Tequila / Rum / Other.
# The CASE branches are tested in order and matching is case-insensitive
# (ILIKE); mezcal is grouped with tequila, and anything that matches no
# pattern falls through to 'Other'.
# NOTE(review): the table is replaced by a SELECT * over itself, so re-running
# this cell without first re-creating `sales` would try to add a second
# `super_category` column -- presumably an error; confirm before re-running.
con.execute("""
    CREATE OR REPLACE TABLE sales AS
    SELECT *,
        CASE
            WHEN category ILIKE '%VODKA%' THEN 'Vodka'
            WHEN category ILIKE '%WHISK%' THEN 'Whiskey'
            WHEN category ILIKE '%TEQUILA%' OR category ILIKE '%MEZCAL%' THEN 'Tequila'
            WHEN category ILIKE '%RUM%' THEN 'Rum'
            ELSE 'Other'
        END AS super_category
    FROM sales
""")
# Refresh the pandas copy so it includes the new column.
sales = con.execute("SELECT * FROM sales").df()

In [8]:
sales.head()

Unnamed: 0,month,year,store,city,county,category,bottles,dollars,super_category
0,1,2023,4829,DES MOINES,POLK,100% AGAVE TEQUILA,12,261.0,Tequila
1,1,2023,4829,DES MOINES,POLK,AMERICAN VODKAS,60,418.8,Vodka
2,1,2023,4829,DES MOINES,POLK,IMPORTED FLAVORED VODKA,24,358.56,Vodka
3,1,2023,4829,DES MOINES,POLK,CREAM LIQUEURS,12,306.0,Other
4,1,2023,4829,DES MOINES,POLK,SPICED RUM,60,1124.4,Rum


Now I need to aggregate to create our observational units: monthly sales per county (the query below groups by year, month, and county).

- Dollars (our response variable) will be summed. 
- Category will be made into new columns representing the distribution of category sales
    - e.g. 70% tequila, 20% vodkas, etc.
    - we will not use total bottles because this would be almost perfectly collinear 
    - answer questions like: 'what liquor should we sell more/less of?'

In [9]:
# Aggregate `sales` to one row per (year, month, county).
#   - month_totals:    total dollars per (year, month, county) -> `revenue`
#   - category_totals: dollars per (year, month, county, super_category)
# The final SELECT joins the two and, for each super-category, divides its
# dollars by the county-month revenue, rounded to 2 decimals. The resulting
# share columns carry the suffix `_ptc`.
# Because each share is rounded independently, the five shares of a row need
# not sum to exactly 1.
monthly_sales = con.execute(
"""
    WITH month_totals AS (
        SELECT
            year,
            month,
            county,
            SUM(dollars) AS revenue
        FROM sales
        GROUP BY year, month, county
    ), category_totals AS (
        SELECT
            year,
            month,
            county,
            super_category,
            SUM(dollars) AS category_sales
        FROM sales
        GROUP BY year, month, county, super_category
    )
    SELECT
        mt.year,
        mt.month,
        mt.county,
        ROUND(SUM(CASE WHEN ct.super_category = 'Vodka' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS vodka_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Whiskey' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS whiskey_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Tequila' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS tequila_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Rum' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS rum_ptc,
        ROUND(SUM(CASE WHEN ct.super_category = 'Other' THEN ct.category_sales ELSE 0 END) / mt.revenue, 2) AS other_ptc,
        mt.revenue
    FROM month_totals mt
    LEFT JOIN category_totals ct
        ON mt.year = ct.year
        AND mt.month = ct.month
        AND mt.county = ct.county
    GROUP BY mt.year, mt.month, mt.county, mt.revenue
"""
).fetchdf()


In [10]:
monthly_sales.head(2)

Unnamed: 0,year,month,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue
0,2023,4,TAMA,0.23,0.47,0.06,0.1,0.14,69418.54
1,2023,4,HAMILTON,0.21,0.41,0.1,0.08,0.2,74622.6


Now I will join with our other datasets: population, proximity, and socioeconomic. Using an inner join because it still leaves plenty of complete data for modelling.

In [11]:
# Assemble the modelling frame by joining the county-month sales with the
# population, store-count and socioeconomic data.
#   - `monthly_sales`, `pop`, `prox` and `socioeconomic` are the pandas
#     DataFrames created in earlier cells, referenced by variable name
#     (presumably resolved by DuckDB's DataFrame replacement scan -- confirm).
#   - County names are compared case-insensitively via LOWER(); population is
#     additionally matched on year, while store counts and socioeconomic data
#     are matched on county only.
#   - All joins are inner joins, so county-months without a match in every
#     table are dropped.
#   - `socioeconomic.*` also brings in its own `county` column, which shows up
#     as `county_1` in the resulting frame.
df = con.execute(
    """
        SELECT sales.*, 
            pop.population, pop.over21, pop.propOver21, pop.median_age, pop.pct_male, prox.store_count, socioeconomic.*
        FROM monthly_sales sales
        JOIN pop
            ON LOWER(sales.county) = LOWER(pop.county) AND sales.year = pop.year
        JOIN prox
            on LOWER(pop.county) = LOWER(prox.county)
        JOIN socioeconomic
            on LOWER(pop.county) = LOWER(socioeconomic.county)

        
    """
    ).fetchdf()

In [12]:
df.head(2)

Unnamed: 0,year,month,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue,population,over21,propOver21,median_age,pct_male,store_count,county_1,fips,pct_less_than_9th_grade,less_than_9th_grade,pct_unemployed,unemployed,pct_below_poverty,below_poverty,pct_below_150pct_poverty,below_150pct_poverty,pct_families_below_poverty,families_below_poverty,pct_white,white,pct_hispanic,hispanic,pct_foreign_born,foreign_born,pct_black,black,pct_api,api,pct_aian,aian,pct_language_isolation,language_isolation,pct_less_than_high_school,less_than_high_school,pct_at_bachelor_degree,pct_high_school_lower
0,2023,4,TAMA,0.23,0.47,0.06,0.1,0.14,69418.54,16787,12808,0.762971,42.3,0.505927,16,Tama,19171.0,3.2,379.0,4.0,330.0,15.4,2544.0,23.6,3907.0,11.1,498.0,81.5,13845.0,11.3,1919.0,5.0,852.0,1.2,196.0,0.7,113.0,6.8,1151.0,2.6,174.0,8.5,997.0,53.3,11.7
1,2023,4,HAMILTON,0.21,0.41,0.1,0.08,0.2,74622.6,14848,11320,0.762392,42.2,0.503637,16,Hamilton,19079.0,2.8,291.0,3.9,294.0,8.0,1171.0,14.5,2127.0,7.7,296.0,87.4,13023.0,9.7,1438.0,6.7,1001.0,1.2,181.0,2.1,312.0,0.3,40.0,1.7,108.0,7.4,778.0,22.0,10.2


Now encode month to use as a categorical feature:

In [13]:
# One-hot encode the calendar month. The first level is dropped so that it
# acts as the baseline absorbed by the intercept.
months = pd.get_dummies(df['month'], prefix='month', drop_first=True)

# Swap the raw month column for its indicator columns (rows align on the index).
df = df.drop(columns='month').join(months)
df.head()

Unnamed: 0,year,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue,population,over21,propOver21,median_age,pct_male,store_count,county_1,fips,pct_less_than_9th_grade,less_than_9th_grade,pct_unemployed,unemployed,pct_below_poverty,below_poverty,pct_below_150pct_poverty,below_150pct_poverty,pct_families_below_poverty,families_below_poverty,pct_white,white,pct_hispanic,hispanic,pct_foreign_born,foreign_born,pct_black,black,pct_api,api,pct_aian,aian,pct_language_isolation,language_isolation,pct_less_than_high_school,less_than_high_school,pct_at_bachelor_degree,pct_high_school_lower,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2023,TAMA,0.23,0.47,0.06,0.1,0.14,69418.54,16787,12808,0.762971,42.3,0.505927,16,Tama,19171.0,3.2,379.0,4.0,330.0,15.4,2544.0,23.6,3907.0,11.1,498.0,81.5,13845.0,11.3,1919.0,5.0,852.0,1.2,196.0,0.7,113.0,6.8,1151.0,2.6,174.0,8.5,997.0,53.3,11.7,False,False,True,False,False,False,False,False,False,False,False
1,2023,HAMILTON,0.21,0.41,0.1,0.08,0.2,74622.6,14848,11320,0.762392,42.2,0.503637,16,Hamilton,19079.0,2.8,291.0,3.9,294.0,8.0,1171.0,14.5,2127.0,7.7,296.0,87.4,13023.0,9.7,1438.0,6.7,1001.0,1.2,181.0,2.1,312.0,0.3,40.0,1.7,108.0,7.4,778.0,22.0,10.2,False,False,True,False,False,False,False,False,False,False,False
2,2023,IOWA,0.24,0.36,0.09,0.07,0.25,132466.03,16386,12597,0.768766,42.6,0.509032,18,Iowa,19095.0,1.8,210.0,2.4,213.0,9.2,1502.0,15.2,2480.0,6.0,260.0,94.9,15697.0,3.3,552.0,0.9,153.0,0.6,104.0,0.2,36.0,0.1,13.0,0.0,3.0,5.3,610.0,30.9,7.1,False,False,False,False,False,False,False,True,False,False,False
3,2023,ADAMS,0.17,0.47,0.04,0.08,0.24,24869.27,3553,2788,0.784689,46.9,0.502392,3,Adams,19003.0,2.6,68.0,2.9,53.0,9.5,339.0,17.7,629.0,3.6,35.0,95.4,3475.0,0.7,27.0,0.2,9.0,0.4,13.0,0.0,0.0,0.5,20.0,0.3,5.0,7.4,197.0,24.5,10.0,False,False,False,False,False,False,False,True,False,False,False
4,2024,WEBSTER,0.21,0.38,0.1,0.1,0.21,390548.21,36909,29016,0.78615,38.7,0.525129,36,Webster,19187.0,1.8,458.0,3.8,690.0,11.7,3994.0,19.9,6788.0,6.3,532.0,86.9,32035.0,6.0,2217.0,3.3,1212.0,3.2,1163.0,1.2,436.0,0.3,110.0,1.2,193.0,7.7,1907.0,24.9,9.5,False,True,False,False,False,False,False,False,False,False,False


In [14]:
# Coerce revenue to numeric; values that cannot be parsed become NaN and are
# removed by the filter below (NaN > 0 is False).
df = df.assign(revenue=pd.to_numeric(df['revenue'], errors='coerce'))

# Keep only strictly positive revenue, since the response is log-transformed
# later. The copy is taken AFTER filtering so that `df` is an independent
# frame: copying before the filter left `df` as a slice, and the column
# assignments made further down could raise SettingWithCopyWarning.
df = df[df['revenue'] > 0].copy()

# Snapshot of the cleaned modelling data (without the row index).
df.to_csv('../data/dead_model_data.csv', index=False)

## Linear Regression (from scratch)

First, the function to fit the model. It takes the observed X and Y matrices and returns the estimated coefficients (intercept first); predictions are produced by a separate function below.

In [15]:
def fit_lr(X, Y):
    """Fit an ordinary least-squares linear model with an intercept.

    Parameters
    ----------
    X : array-like of shape (n, p) or (n,)
        Feature values, one row per observation. Anything convertible to a
        float array is accepted (ndarray, DataFrame, nested lists, booleans).
    Y : array-like of shape (n,) or (n, m)
        Observed response(s).

    Returns
    -------
    numpy.ndarray
        Estimated coefficients of shape (p + 1,) (or (p + 1, m) for a 2-D Y).
        Element 0 is the intercept; the remaining entries follow the column
        order of X.

    Raises
    ------
    ValueError
        If X contains no observations.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if X.shape[0] == 0:
        raise ValueError("fit_lr needs at least one observation")

    design = np.column_stack([np.ones((X.shape[0], 1)), X])  # add column of ones

    # Solve the least-squares problem on the design matrix itself instead of
    # forming pinv(X'X) X'Y: the normal equations square the condition number,
    # which loses precision when features are strongly correlated. lstsq
    # returns the minimum-norm solution, the same one the pseudo-inverse
    # formula defines for rank-deficient designs.
    B, *_ = np.linalg.lstsq(design, Y, rcond=None)

    return B

In [16]:
def predict_lr(X, B):
    """Return the linear-model predictions for X under coefficients B.

    X is an array of feature values with one row per observation (it must
    support ``.shape`` and ``.astype``); B holds the intercept followed by one
    coefficient per feature column. The result is ``[1, X] @ B``.
    """
    n_obs = X.shape[0]
    intercept_col = np.ones((n_obs, 1))

    # Prepend the constant column so that B[0] acts as the intercept.
    design = np.column_stack((intercept_col, X.astype(float)))

    return design @ B

A cross validation function:

In [17]:
def lr_cross_val(data, features, response, k=5, random_state=None):
    """Return out-of-fold linear-regression predictions from k-fold CV.

    The rows of ``data`` are shuffled and split into ``k`` folds. For each
    fold a model is fitted with ``fit_lr`` on all other rows and used, via
    ``predict_lr``, to predict the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature and response columns.
    features : list of str
        Names of the feature columns.
    response : str
        Name of the response column.
    k : int, default 5
        Number of folds; must satisfy ``2 <= k <= len(data)``.
    random_state : int or None, default None
        Seed for the shuffle. With ``None`` the global NumPy random state is
        used, exactly as before, so results vary between runs unless that
        state was seeded by the caller.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(data)``; entry ``i`` is the prediction for the
        row at position ``i``, made by the model that did not train on it.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows. Such
        values leave at least one fold with an empty training set, which
        previously produced meaningless predictions without any error.
    """
    n = data.shape[0]
    if not 2 <= k <= n:
        raise ValueError(
            f"k must be between 2 and the number of rows ({n}); got {k}"
        )

    # make k folds
    indices = np.arange(n)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)
    fold_size = n // k
    folds = [indices[i * fold_size:(i + 1) * fold_size] for i in range(k - 1)]
    folds.append(indices[(k - 1) * fold_size:])  # last fold gets the remainder

    # Extract the arrays once instead of re-slicing the frame for every fold.
    X_all = data[features].values
    Y_all = data[response].values

    # loop over folds and store their predictions
    predictions = np.zeros(n)
    for test_idx in folds:
        # training rows = everything not in this fold (kept in original row order)
        train_mask = np.ones(n, dtype=bool)
        train_mask[test_idx] = False

        B = fit_lr(X_all[train_mask], Y_all[train_mask])
        predictions[test_idx] = predict_lr(X_all[test_idx], B)
    return predictions



Before fitting, create the response variable (log of revenue per capita) and a stores-per-adult ratio:

- *NOTE:* I first fit with all possible features, then after assessing model fit, feature importance, and multicollinearity (all below), I came back up and changed the features. So this is just the final model here, not all iterations.

In [18]:
# Liquor stores per county resident aged 21 or over.
df["store_ratio"] = df["store_count"].div(df["over21"])

# Response variable: natural log of revenue per capita (revenue divided by
# total county population) -- note it is not the log of raw revenue, despite
# the column name.
df['log_revenue'] = np.log(df['revenue'].div(df['population']))

In [19]:
df.head()

Unnamed: 0,year,county,vodka_ptc,whiskey_ptc,tequila_ptc,rum_ptc,other_ptc,revenue,population,over21,propOver21,median_age,pct_male,store_count,county_1,fips,pct_less_than_9th_grade,less_than_9th_grade,pct_unemployed,unemployed,pct_below_poverty,below_poverty,pct_below_150pct_poverty,below_150pct_poverty,pct_families_below_poverty,families_below_poverty,pct_white,white,pct_hispanic,hispanic,pct_foreign_born,foreign_born,pct_black,black,pct_api,api,pct_aian,aian,pct_language_isolation,language_isolation,pct_less_than_high_school,less_than_high_school,pct_at_bachelor_degree,pct_high_school_lower,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,store_ratio,log_revenue
0,2023,TAMA,0.23,0.47,0.06,0.1,0.14,69418.54,16787,12808,0.762971,42.3,0.505927,16,Tama,19171.0,3.2,379.0,4.0,330.0,15.4,2544.0,23.6,3907.0,11.1,498.0,81.5,13845.0,11.3,1919.0,5.0,852.0,1.2,196.0,0.7,113.0,6.8,1151.0,2.6,174.0,8.5,997.0,53.3,11.7,False,False,True,False,False,False,False,False,False,False,False,0.001249,1.419549
1,2023,HAMILTON,0.21,0.41,0.1,0.08,0.2,74622.6,14848,11320,0.762392,42.2,0.503637,16,Hamilton,19079.0,2.8,291.0,3.9,294.0,8.0,1171.0,14.5,2127.0,7.7,296.0,87.4,13023.0,9.7,1438.0,6.7,1001.0,1.2,181.0,2.1,312.0,0.3,40.0,1.7,108.0,7.4,778.0,22.0,10.2,False,False,True,False,False,False,False,False,False,False,False,0.001413,1.614578
2,2023,IOWA,0.24,0.36,0.09,0.07,0.25,132466.03,16386,12597,0.768766,42.6,0.509032,18,Iowa,19095.0,1.8,210.0,2.4,213.0,9.2,1502.0,15.2,2480.0,6.0,260.0,94.9,15697.0,3.3,552.0,0.9,153.0,0.6,104.0,0.2,36.0,0.1,13.0,0.0,3.0,5.3,610.0,30.9,7.1,False,False,False,False,False,False,False,True,False,False,False,0.001429,2.089899
3,2023,ADAMS,0.17,0.47,0.04,0.08,0.24,24869.27,3553,2788,0.784689,46.9,0.502392,3,Adams,19003.0,2.6,68.0,2.9,53.0,9.5,339.0,17.7,629.0,3.6,35.0,95.4,3475.0,0.7,27.0,0.2,9.0,0.4,13.0,0.0,0.0,0.5,20.0,0.3,5.0,7.4,197.0,24.5,10.0,False,False,False,False,False,False,False,True,False,False,False,0.001076,1.945841
4,2024,WEBSTER,0.21,0.38,0.1,0.1,0.21,390548.21,36909,29016,0.78615,38.7,0.525129,36,Webster,19187.0,1.8,458.0,3.8,690.0,11.7,3994.0,19.9,6788.0,6.3,532.0,86.9,32035.0,6.0,2217.0,3.3,1212.0,3.2,1163.0,1.2,436.0,0.3,110.0,1.2,193.0,7.7,1907.0,24.9,9.5,False,True,False,False,False,False,False,False,False,False,False,0.001241,2.359096


Fit the model:

- *NOTE:* I first fit with all possible features, then after assessing model fit, feature importance, and multicollinearity (all below), I came back up and changed the features. So this is just the final model here, not all iterations.

In [20]:
# Final feature set. 'pct_less_than_9th_grade' and 'pct_less_than_high_school'
# are left out individually; their sum enters as 'pct_high_school_lower'.
#
# NOTE(review): all five category shares are included together with an
# intercept. They are shares of the same total, so they sum to roughly 1 and
# are close to collinear with the constant column -- consider dropping one as
# the baseline.
# NOTE(review): most demographic columns look constant within a county, while
# the folds are drawn at random over county-month rows, so the same county
# presumably appears in both train and test folds -- confirm whether grouped
# folds are needed.
features = [
    # category mix
    'vodka_ptc', 'whiskey_ptc', 'tequila_ptc', 'rum_ptc', 'other_ptc',
    # population
    'propOver21', 'median_age', 'pct_male',
    # socioeconomic
    'pct_at_bachelor_degree',
    'pct_unemployed',
    'pct_below_poverty',
    'pct_high_school_lower',
    'pct_families_below_poverty',
    'pct_white',
    'pct_hispanic',
    'pct_foreign_born',
    'pct_black',
    'pct_api',
    'pct_aian',
    'pct_language_isolation',
    # month indicators kept in the final model
    'month_5', 'month_6', 'month_11',
]
response = 'log_revenue'

# Out-of-fold predictions from 5-fold cross-validation.
Yhat = lr_cross_val(df, features, response, k=5)

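Assess model fit, using the cross-validated predictions (all metrics are on the scale of the response, i.e. log revenue per capita):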

In [21]:
Y = df[response].values
X = df[features].values
n, p = X.shape  # p counts the predictors only; X has no intercept column here

# Yhat holds the out-of-fold predictions, so these are cross-validated metrics.
RSS = np.sum((Y - Yhat)**2)
TSS = np.sum((Y - np.mean(Y))**2)
R2 = 1 - RSS/TSS

# Adjusted R2 divides by the residual degrees of freedom n - p - 1: the model
# estimates p slopes plus the intercept that fit_lr adds. Using n - p, as
# before, understated the penalty by one parameter.
adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MAE = np.mean(np.abs(Y - Yhat))
RMSE = np.sqrt(RSS / n)

print(f"R2 = {R2:.4f}")
print(f"Adjusted R2 = {adj_R2:.4f}")
print(f"MAE = {MAE:.2f}")
print(f"RMSE = {RMSE:.2f}")

R2 = 0.4742
Adjusted R2 = 0.4689
MAE = 0.29
RMSE = 0.37


Fit a model on all the data and look at the estimators:

In [22]:
# Refit on every row (no hold-out) to obtain the coefficients that are
# inspected in the following cells. X and Y stay as pandas objects because
# the later cells use X.columns and X.corr().
X, Y = df[features], df[response]
B_final = fit_lr(X, Y)

In [23]:
# Pair every coefficient with its name; B_final[0] is the intercept.
coef_names = ['intercept', *X.columns]
nonstd_df = pd.DataFrame({'feature': coef_names, 'beta': B_final})

# Drop the intercept row, then order the rest by absolute coefficient size,
# largest first. The original row labels are kept.
nonstd_df = nonstd_df.loc[nonstd_df['feature'] != 'intercept']
_by_abs_beta = nonstd_df['beta'].abs().sort_values(ascending=False).index
nonstd_df = nonstd_df.reindex(_by_abs_beta)

print(nonstd_df)

                       feature      beta
8                     pct_male -9.048813
3                  tequila_ptc  5.999014
4                      rum_ptc  2.979270
5                    other_ptc  2.947500
6                   propOver21 -0.539675
2                  whiskey_ptc  0.507166
1                    vodka_ptc -0.172528
21                     month_5  0.133458
18                     pct_api  0.085897
22                     month_6  0.072987
16            pct_foreign_born -0.061606
17                   pct_black  0.051935
12       pct_high_school_lower -0.043332
15                pct_hispanic  0.038166
19                    pct_aian -0.035129
20      pct_language_isolation  0.028408
11           pct_below_poverty  0.024720
10              pct_unemployed  0.021150
7                   median_age  0.017794
23                    month_11  0.015557
13  pct_families_below_poverty -0.009397
9       pct_at_bachelor_degree -0.002153
14                   pct_white  0.001450


Check standardized coefficients because their magnitudes actually tell us importance:

In [24]:

# Population (ddof=0) standard deviation of each feature and of the response.
_X_numeric = X.apply(pd.to_numeric, errors='coerce').astype(float)
X_std = _X_numeric.std(axis=0, ddof=0).to_numpy()
_Y_numeric = pd.to_numeric(Y, errors='coerce').astype(float)
Y_std = _Y_numeric.std(ddof=0)

# Standardized coefficient = raw slope * sd(feature) / sd(response);
# the intercept (element 0 of B_final) is excluded.
beta_no_intercept = np.ravel(B_final[1:])
std_beta = beta_no_intercept * (X_std / Y_std)

# Rank features by absolute standardized coefficient, largest first.
importance = pd.DataFrame({"feature": X.columns, "std_coef": std_beta})
importance = importance.sort_values("std_coef", key=np.abs, ascending=False)
importance = importance.reset_index(drop=True)

print(importance)


                       feature  std_coef
0        pct_high_school_lower -0.398000
1                 pct_hispanic  0.387989
2             pct_foreign_born -0.377454
3                  tequila_ptc  0.340758
4                    other_ptc  0.271895
5                      rum_ptc  0.232399
6                      pct_api  0.219961
7                    pct_black  0.158557
8            pct_below_poverty  0.157123
9                     pct_male -0.149998
10                  median_age  0.111405
11      pct_language_isolation  0.081662
12                     month_5  0.071419
13                 whiskey_ptc  0.062452
14                    pct_aian -0.049159
15  pct_families_below_poverty -0.048883
16              pct_unemployed  0.045669
17                     month_6  0.039058
18      pct_at_bachelor_degree -0.032172
19                  propOver21 -0.022734
20                   pct_white  0.015613
21                   vodka_ptc -0.014252
22                    month_11  0.008325


Check for collinearity between features:

In [25]:
# Absolute pairwise correlations between the model features.
corr = X.corr().abs()

# Keep only the strict upper triangle so each pair is listed once and the
# diagonal (self-correlation) is excluded, then rank pairs from most to
# least correlated.
_upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr = corr.where(_upper_triangle).stack().sort_values(ascending=False)

print("Most correlated feature pairs:\n")
print(high_corr.head(10))


Most correlated feature pairs:

pct_white          pct_foreign_born              0.908788
                   pct_hispanic                  0.831578
pct_foreign_born   pct_language_isolation        0.823051
pct_hispanic       pct_foreign_born              0.821608
pct_below_poverty  pct_families_below_poverty    0.817360
pct_hispanic       pct_language_isolation        0.792529
pct_white          pct_language_isolation        0.776324
pct_foreign_born   pct_api                       0.723608
pct_black          pct_api                       0.693088
pct_white          pct_api                       0.691832
dtype: float64
