# Regularized Linear Regression

BUSMGT 7247

## Load libraries and data

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet
)
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib as mpl
import matplotlib.pyplot as plt

# Make this notebook's output stable across runs
np.random.seed(100)

# Do not use scientific notation in pandas tables
pd.options.display.float_format = '{:,.4f}'.format

# Plot formatting
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

### Data Description

When retailers close stores, they usually conduct going-out-of-business (GOB) sales.

In the data set, each row represents a store's going out of business sale.

| Attribute             | Definition                                                                |
| --------------------- | ------------------------------------------------------------------------- |
| StoreType             | Store's retail segment (bookstores or household goods)                    |
| LiquidationRevenue    | Revenue from the GOB sale                                                 |
| InventoryAtTicket     | Retail value of inventory held at the start of the GOB sale               |
| LastYearRevenue       | Store's revenue over the GOB sale dates during the prior year             |                      
| MedianHouseholdIncome | Median household income in the store's ZIP code (from U.S. Census Bureau) |
| MedianAge             | Median age in the store's ZIP code (from U.S. Census Bureau)              |


In [2]:
df = pd.read_csv('data/closings.csv')
df.head()

Unnamed: 0,StoreType,LiquidationRevenue,InventoryAtTicket,LastYearRevenue,MedianHouseholdIncome,MedianAge
0,Bookstore,1214776.2,2195069.8,465237.3,51290,37.8
1,Bookstore,1811896.9,3152301.9,897704.5,46702,33.4
2,Bookstore,1157614.7,2229837.5,477804.8,86804,41.7
3,Bookstore,2037136.2,3857466.8,1036097.7,83544,40.3
4,Bookstore,1326821.1,2345263.8,612793.5,23508,31.9


## Model

In [3]:
# Transform features
df['Bookstore'] = df['StoreType'].apply(lambda x: 1 if x == 'Bookstore' else 0)
df['BookstoreXInventoryAtTicket'] = df['Bookstore'] * df['InventoryAtTicket']

# Add square and cube terms
numvars = ['InventoryAtTicket', 'LastYearRevenue',
          'MedianHouseholdIncome', 'MedianAge']
for v in numvars:
    df[v + 'Sq'] = df[v]**2
    df[v + 'Cu'] = df[v]**3
    
# Fit a model that incorporates all variables
y = df['LiquidationRevenue']

feature_names = list(df.columns[2:])
X = df[feature_names]

# Scale data
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

# Split data into training and test sets
(X_train, X_test,
 y_train, y_test) = train_test_split(X_scale, y, test_size=0.75)

In [4]:
# Fit a linear regresion
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

In [5]:
# Retain coefficients and performance
df_regs = pd.DataFrame(columns=['Intercept'] + feature_names + ['Test R2'])
df_regs.loc['Linear'] = [linreg.intercept_] + list(linreg.coef_) + [r2_score(y_test, y_pred)]

In [6]:
# Set regularization parameter for ridge, lasso, and elasticnet
alpha=2
l1_ratio=0.5

### Ridge Regression

In [7]:
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

df_regs.loc['Ridge'] = [ridge.intercept_] + list(ridge.coef_) + [r2_score(y_test, y_pred)]

### LASSO Regression

In [8]:
lasso = Lasso(alpha=alpha, max_iter=100000)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

df_regs.loc['Lasso'] = [lasso.intercept_] + list(lasso.coef_) + [r2_score(y_test, y_pred)]

### Elastic Net Regression

In [9]:
elastic = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
elastic.fit(X_train, y_train)
y_pred = elastic.predict(X_test)

df_regs.loc['ElasticNet'] = [elastic.intercept_] + list(elastic.coef_) + [r2_score(y_test, y_pred)]

### Compare Regularization Methods

In [10]:
df_regs

Unnamed: 0,Intercept,InventoryAtTicket,LastYearRevenue,MedianHouseholdIncome,MedianAge,Bookstore,BookstoreXInventoryAtTicket,InventoryAtTicketSq,InventoryAtTicketCu,LastYearRevenueSq,LastYearRevenueCu,MedianHouseholdIncomeSq,MedianHouseholdIncomeCu,MedianAgeSq,MedianAgeCu,Test R2
Linear,1743761.284,35348.0193,-246953.5007,58268.457,469147.7064,14585.7644,-44139.9834,926612.9399,-817907.2647,1472339.8287,-949232.6769,130154.0345,-211064.5441,-1132144.4842,648109.2204,0.7625
Ridge,1764206.5118,179118.0645,330080.2278,109552.8269,-26683.2002,-84362.7372,29420.9039,48720.5776,-58982.1445,160216.2384,-92277.3346,-14341.0543,-101472.8931,-19833.1823,-11893.596,0.9175
Lasso,1743993.2892,45379.9223,-241790.3816,60757.102,450276.9896,13279.3897,-42947.4332,897062.385,-797849.6432,1462975.9094,-944056.4457,124501.8914,-207537.0434,-1091159.6461,624791.8015,0.7656
ElasticNet,1773525.0702,84469.7632,103577.7075,12258.2045,-8922.1994,-79634.1117,-58469.623,72085.2462,58621.5331,83557.2852,56460.6165,1617.0178,-5645.7604,-9728.1085,-10390.7233,0.8845
