<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/forward_selection_step1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!git clone https://github.com/pharringtonp19/business-analytics.git

fatal: destination path 'business-analytics' already exists and is not an empty directory.


### **Import Packages**

In [24]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import jax.numpy as jnp
import jax
import seaborn as sb
from functools import partial
from sklearn.model_selection import train_test_split

### **Read In Data Set**

In [25]:
# Read the Brookline data set from the cloned repository and preview its first rows.
brookline_csv_path = '/content/business-analytics/datasets/brookline.csv'
df = pd.read_csv(brookline_csv_path)
df.head()

Unnamed: 0,price,stNumber,stName,size,beacon,baseFloor,buildingStyle,elevators,rooms,bedrooms,fullBathrooms,halfBathrooms,garage
0,174000,150,PLEASANT ST,1060,0,4,MID-RISE,1,4,2,1,1,1.0
1,337000,7,LEVERETT ST,831,0,1,DECKER,0,4,2,1,0,0.0
2,850000,24,EUSTON ST,2246,0,1,ROW-END,0,10,6,3,0,0.0
3,516000,417,WASHINGTON ST,1574,0,2,LOW-RISE,0,6,3,2,0,0.0
4,145000,150,PLEASANT ST,669,0,4,MID-RISE,1,3,1,1,0,1.0


### **Train/Test Split**

In [45]:
# Partition `df` into a training frame and a test frame (test_size=0.3);
# the fixed random_state makes the split repeatable across runs.
df_train, df_test = train_test_split(
  df,
  random_state=42,
  test_size=0.3,
)

### **Helper Functions**

In [46]:
def in_sample_adj_r2(reg_formula, data=None):
  """Fit an OLS model for `reg_formula` and return its adjusted R-squared.

  Args:
    reg_formula: Formula string passed to `smf.ols`, e.g. 'price ~ size'.
    data: DataFrame the model is fitted on. When omitted, the notebook-level
      `df` is used, which matches the original single-argument behaviour.

  Returns:
    The `rsquared_adj` attribute of the fitted results.
  """
  if data is None:
    # Resolve the default at call time so the function follows the current `df`.
    data = df
  return smf.ols(reg_formula, data=data).fit().rsquared_adj

def out_sample_mse(reg_formula, train=None, test=None):
  """Fit an OLS model on training data and return its mean squared error on test data.

  Args:
    reg_formula: Formula string passed to `smf.ols`, e.g. 'price ~ size'. The
      text to the left of '~' is used verbatim (after stripping whitespace) as
      the column name of the observed response in the test frame, so it must
      be a plain column name rather than a transformed expression.
    train: DataFrame the model is fitted on. When omitted, the notebook-level
      `df_train` is used.
    test: DataFrame the model is evaluated on. When omitted, the notebook-level
      `df_test` is used.

  Returns:
    The mean of the squared differences between the observed response and the
    model's predictions on the test frame.
  """
  # Resolve defaults at call time so the function follows the current split.
  if train is None:
    train = df_train
  if test is None:
    test = df_test

  results = smf.ols(reg_formula, data=train).fit()

  # Everything before the first '~' names the response column.
  response_name = reg_formula.split('~')[0].strip()
  squared_errors = (test[response_name] - results.predict(test)) ** 2
  return squared_errors.mean()

def update_regression_formula(regression_formula: str, var: str) -> str:
  """Return `regression_formula` with the term `var` appended.

  Args:
    regression_formula: Formula built so far, e.g. 'price ~ ' or 'price ~ size'.
    var: Term to append, e.g. 'rooms' or 'C(buildingStyle)'.

  Returns:
    The formula with `var` added. The term is appended directly when it is the
    first one (the formula ends with '~', ignoring trailing whitespace, or is
    empty/whitespace-only); otherwise it is joined to the existing terms
    with ' + '.
  """
  stripped = regression_formula.rstrip()
  # An empty formula has no last character to inspect; treat `var` as the first
  # term instead of raising IndexError.
  if not stripped or stripped.endswith('~'):
    return regression_formula + var
  return regression_formula + ' + ' + var

### **Variable Representations**

In [47]:
# Candidate predictors: every column except the street identifiers and the response.
excluded_columns = ['stNumber', 'stName', 'price']
eligible_variables = df.columns.drop(excluded_columns).tolist()

# Variables listed here are wrapped as C(<name>) in the formula term.
categorical_vars = ['buildingStyle']

# Map each candidate column name to the term used for it in a regression formula.
eligible_variables_rep = {}
for var in eligible_variables:
  eligible_variables_rep[var] = 'C(' + var + ')' if var in categorical_vars else var

### **Selection Criteria**

In [52]:
# Build one single-predictor formula per candidate term, then score each formula
# with both criteria (adjusted R-squared and out-of-sample MSE).
regression_formulas = [
  update_regression_formula('price ~ ', term)
  for term in eligible_variables_rep.values()
]
adjs = np.array([in_sample_adj_r2(formula) for formula in regression_formulas])
mses = np.array([out_sample_mse(formula) for formula in regression_formulas])