<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/forward_selection_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!git clone https://github.com/pharringtonp19/business-analytics.git

fatal: destination path 'business-analytics' already exists and is not an empty directory.


### **Import Packages**

In [13]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import numpy as np
from functools import partial
from typing import Callable

### **Read In Data Set**

In [14]:
df = pd.read_csv('/content/business-analytics/datasets/brookline.csv')
df.head()

Unnamed: 0,price,stNumber,stName,size,beacon,baseFloor,buildingStyle,elevators,rooms,bedrooms,fullBathrooms,halfBathrooms,garage
0,174000,150,PLEASANT ST,1060,0,4,MID-RISE,1,4,2,1,1,1.0
1,337000,7,LEVERETT ST,831,0,1,DECKER,0,4,2,1,0,0.0
2,850000,24,EUSTON ST,2246,0,1,ROW-END,0,10,6,3,0,0.0
3,516000,417,WASHINGTON ST,1574,0,2,LOW-RISE,0,6,3,2,0,0.0
4,145000,150,PLEASANT ST,669,0,4,MID-RISE,1,3,1,1,0,1.0


### **Update Regression Formula**

In [15]:
def update_reg_formula(reg_formula: str, initial: bool, var_rep: str) -> str:
  if initial:
    return reg_formula + var_rep
  else:
    return reg_formula + ' + ' + var_rep

### **Get Rsquared**

In [16]:
def get_rsqrs(reg_formula: str, df: pd.DataFrame) -> tuple[float, float]:
  linear_model = smf.ols(reg_formula, data=df)
  results = linear_model.fit()
  return results.rsquared, results.rsquared_adj

### **Objective Function**

In [17]:
def Obj(reg_formula: str, df: pd.DataFrame, initial: bool, var_rep: str):
  reg_formula = update_reg_formula(reg_formula, initial, var_rep)
  return get_rsqrs(reg_formula, df)

### **Solver**

In [18]:
def Solver(obj: Callable, eligible_vars_rep: dict[str, str]) -> str:
  results = np.array(list(map(obj, eligible_vars_rep.values())))
  idx = np.argmax(results[:,0])
  var = list(eligible_vars_rep.keys())[idx]
  return var, results[idx]

### **Step**

In [19]:
def Step(reg_formula: str, initial:bool, eligible_vars_rep: dict[str, str]):
  obj = partial(Obj, reg_formula, df, initial)
  new_var, results = Solver(obj, eligible_vars_rep)
  return new_var, results

### **Update**

In [20]:
def Update(initial, results_history, reg_formula, eligible_vars_rep, results, var):
  reg_formula = update_reg_formula(reg_formula, initial, eligible_vars_rep[var])
  eligible_vars_rep.pop(var)
  results_history['r2_adj'].append(results[0])
  results_history['r2'].append(results[1])
  return results_history, reg_formula, eligible_vars_rep

### **Run**

In [21]:
def Run(dep_var, df, categorical_vars, vars_to_drop):
  eligible_vars = df.columns.drop(vars_to_drop + [dep_var]).tolist()
  eligible_vars_rep = {key: f'C({key})' if key in categorical_vars else key for key in eligible_vars}
  reg_formula = dep_var + ' ~ '
  results_history = {'r2_adj': [],
                    'r2': []}
  for i in range(len(eligible_vars_rep)):
    initial = True if i == 0 else False
    new_var, results = Step(reg_formula, initial, eligible_vars_rep)
    results_history, reg_formula, eligible_vars_rep = Update(initial, results_history, reg_formula, eligible_vars_rep, results, new_var)
  return reg_formula, results_history

### **Run**

In [23]:
Run('price', df, ['buildingStyle'], ['stNumber', 'stName'])

('price ~ size + garage + C(buildingStyle) + fullBathrooms + halfBathrooms + beacon + elevators + baseFloor + rooms + bedrooms',
 {'r2_adj': [0.7490352448530876,
   0.7656970420339096,
   0.7752935438883197,
   0.7849768216069191,
   0.7978334931781431,
   0.8001267627031232,
   0.8036392346490624,
   0.8037253869598965,
   0.8038134531386681,
   0.804005091792453],
  'r2': [0.7488035137772363,
   0.7652639496901645,
   0.7729899362301385,
   0.7825698457293846,
   0.7953795579879619,
   0.7975115988506407,
   0.8008839385964299,
   0.8007849433188462,
   0.8006877068437828,
   0.8006956092898866]})