# Exercise 04

## Section 2: Predicting customer spending

We’ll be using the customers.csv data set for this exercise. The data set covers the demographic characteristics of some customers and the amount they spent over the past year at an online retailer. For this exercise, it is recommended to use the sklearn classes for linear regression, ridge, and lasso. The sklearn documentation is linked below.

Linear regression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

Ridge: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html 

Lasso: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

To interact the categorical variables, you will need to dummy-code them and multiply the resulting columns manually. An example is given below.

```python
# Dummy-code sex and race; the new columns are named <prefix>_<value> (e.g. sex_male, race_hispanic)
customerDf = pd.get_dummies(data=customerDf, columns=['sex', 'race'], prefix=['sex','race'])
# Interaction term: element-wise product of the two dummy columns
customerDf["Hispanic_Male"] = np.multiply(customerDf["race_hispanic"],customerDf["sex_male"])
```

In [None]:
import pandas as pd
import numpy as np
import streamlit as st
import statsmodels.api as sm
import altair as alt
from sklearn.linear_model import Ridge, Lasso, LinearRegression

In [None]:
# Load the raw customer records; the file is resolved relative to the working directory.
customer = pd.read_csv(filepath_or_buffer='customers.csv')

### 1. Build Linear Regression Model

Build a linear regression with all the independent variables and two-way interactions between sex and race. Treat the "other" category of both race and sex as the reference category and handle it appropriately.

In [None]:
# Dummy-code sex and race into sex_* / race_* indicator columns.
# dtype=int is deliberate: get_dummies can otherwise produce bool columns,
# which the OLS fit further down does not handle well.
customer = pd.get_dummies(
    customer,
    columns=['sex', 'race'],
    prefix=['sex', 'race'],
    dtype=int,
)

# Two-way sex x race interactions, built as the product of the two indicator
# columns. "other" is the reference level for both factors, so no interaction
# column is created for any combination that involves it.
for race_level in ('hispanic', 'asian', 'black', 'white'):
    for sex_level in ('male', 'female'):
        customer[f'{race_level}_{sex_level}'] = (
            customer[f'race_{race_level}'] * customer[f'sex_{sex_level}']
        )

# Remove the reference-level indicators themselves.
customer = customer.drop(columns=['sex_other', 'race_other'])



In [None]:
# Income tends to be heavily right-skewed, so model its natural log instead.
customer = customer.assign(log_income=np.log(customer["income"]))

In [None]:
# Predictors: every column except the target ("spend") and the raw "income",
# which is represented by "log_income" instead.
xcols = customer.columns.drop(['spend', 'income']).tolist()

# statsmodels does not add an intercept on its own, so add the constant column.
X = sm.add_constant(customer[xcols])
Y = customer['spend']

# Pass the pandas objects directly (not bare numpy arrays) so the summary
# labels each coefficient with its column name rather than x1..xN.
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

### 2. Build a ridge model with different alphas

Build ridge models with various values for alpha. Create a chart showing how the coefficients change with the alpha values.

Ridge Regression (L2) - https://www.youtube.com/watch?v=Q81RR3yKn30&t=85s

Lasso Regression (L1) - fit in the same loop as ridge below, so both models are evaluated at the same alpha values.

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning

# Silence only sklearn's ConvergenceWarning (the coordinate-descent solver can
# emit it for the smallest alphas) instead of every warning, so unrelated
# problems remain visible.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Rebuild X from the raw predictor columns, i.e. without a constant column.
# NOTE(review): combined with fit_intercept=False below, no intercept is
# estimated at all, so every fit is forced through the origin. sklearn's
# default (fit_intercept=True) would estimate an unpenalized intercept.
# Confirm that the no-intercept fit is intended.
X = customer[xcols]

# 100 values of alpha (aka lambda), log-spaced between 10^-5 and 10^2
n_alphas = 100
alphas = np.logspace(-5, 2, n_alphas)

# One coefficient vector per alpha, in the same order as `alphas`
ridge_coefs = [
    Ridge(alpha=a, fit_intercept=False).fit(X, Y).coef_ for a in alphas
]
lasso_coefs = [
    Lasso(alpha=a, fit_intercept=False).fit(X, Y).coef_ for a in alphas
]

In [None]:
# Long-format table for plotting: start with one row per alpha ...
ridges = pd.DataFrame({'alpha': alphas})
ridges['coefs'] = ridge_coefs

# ... attach the predictor names alongside each coefficient vector ...
ridges['labels'] = [xcols for _ in ridge_coefs]

# ... then unpack both list columns together so that every row holds a single
# (alpha, coefficient, label) triple and each predictor plots as its own line.
ridges = ridges.explode(['coefs', 'labels'])

# Coefficient paths; the log-scaled x axis is reversed, so the penalty
# decreases from left to right.
c = (
    alt.Chart(ridges)
    .mark_line()
    .encode(
        x=alt.X('alpha', title='L2 Penalty', scale=alt.Scale(type="log", reverse=True)),
        y=alt.Y('coefs', title='Coefficients'),
        color='labels',
    )
    .properties(
        title='Ridge Coefficients as a function of Regularization',
        height=500,
    )
)

st.altair_chart(c, use_container_width=True)

In [None]:
# Long-format table of the lasso coefficient paths. It is named `lassos`
# (matching `ridges`) so the DataFrame does not reuse the name `lasso`,
# which the other cells use for the fitted Lasso estimator.
lassos = pd.DataFrame()
lassos['alpha'] = alphas
lassos['coefs'] = lasso_coefs

# Give the coefficients a name
lassos['labels'] = [xcols] * len(lasso_coefs)

# Explode them into multiple rows so we can plot them as separate series
lassos = lassos.explode(['coefs', 'labels'])

# Line chart. Lasso applies an L1 penalty, so the x axis is labelled L1
# (the previous "L2 Penalty" title was copied from the ridge chart).
c = alt.Chart(lassos).mark_line().encode(
    alt.X('alpha', title='L1 Penalty', scale=alt.Scale(type="log", reverse=True)),
    alt.Y('coefs', title='Coefficients'),
    color='labels'
).properties(
    title='Lasso Coefficients as a function of Regularization',
    height=500
)

st.altair_chart(c, use_container_width=True)

### 4. Compare the coefficients from linear regression, ridge, and lasso (select an alpha value using your chart)

* Regression
* Ridge with alpha=0.01
* Lasso with alpha=0.0001

In [None]:
# Fit the three models to compare on the same X and Y. None of them fits an
# intercept (fit_intercept=False).

# Unregularized baseline
linear = LinearRegression(fit_intercept=False).fit(X, Y)

# Ridge at alpha=0.01: the coefficient paths appear to have settled by this level
ridge = Ridge(alpha=0.01, fit_intercept=False).fit(X, Y)

# Lasso at alpha=0.0001, chosen for the same reason
lasso = Lasso(alpha=0.0001, fit_intercept=False).fit(X, Y)

# Report each model's coefficients together with its R^2 on the data it was fit to
print(f'linear regression coefficients {linear.coef_}\nR2={linear.score(X,Y)}\n\n')
print(f'ridge regression coefficients {ridge.coef_}\nR2={ridge.score(X,Y)}\n\n')
print(f'lasso regression coefficients {lasso.coef_}\nR2={lasso.score(X,Y)}\n\n')



In [None]:
# Side-by-side coefficient table: one row per predictor, one column per model.
# Each model's column header also carries its R^2 on (X, Y) to four decimals.
out = pd.DataFrame({
    'variable': xcols,
    f'linear ({linear.score(X,Y):.4f})': linear.coef_,
    f'ridge ({ridge.score(X,Y):.4f})': ridge.coef_,
    f'lasso ({lasso.score(X,Y):.4f})': lasso.coef_,
})
out