<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/regression_transformations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the course repository so its datasets/ folder can be read by later cells.
!git clone https://github.com/pharringtonp19/business-analytics.git

Cloning into 'business-analytics'...
remote: Enumerating objects: 1299, done.[K
remote: Counting objects: 100% (354/354), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 1299 (delta 322), reused 254 (delta 254), pack-reused 945 (from 1)[K
Receiving objects: 100% (1299/1299), 21.87 MiB | 10.28 MiB/s, done.
Resolving deltas: 100% (778/778), done.


### **Import Packages**

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import jax.numpy as jnp
import jax
import seaborn as sb
from functools import partial
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### **Read In Data Set**

In [3]:
# Load the Brookline dataset from the repository cloned in the first cell.
df = pd.read_csv(filepath_or_buffer='/content/business-analytics/datasets/brookline.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,price,stNumber,stName,size,beacon,baseFloor,buildingStyle,elevators,rooms,bedrooms,fullBathrooms,halfBathrooms,garage
0,174000,150,PLEASANT ST,1060,0,4,MID-RISE,1,4,2,1,1,1.0
1,337000,7,LEVERETT ST,831,0,1,DECKER,0,4,2,1,0,0.0
2,850000,24,EUSTON ST,2246,0,1,ROW-END,0,10,6,3,0,0.0
3,516000,417,WASHINGTON ST,1574,0,2,LOW-RISE,0,6,3,2,0,0.0
4,145000,150,PLEASANT ST,669,0,4,MID-RISE,1,3,1,1,0,1.0


## **Helper Functions**

In [4]:
def apply_transformations(df, variable, list_of_transformations):
  """Append transformed copies of one column to a DataFrame.

  Treat this as a function defined in another package.

  Each callable in ``list_of_transformations`` is applied element-wise (via
  ``Series.map``) to ``df[variable]``. The resulting column is named after the
  callable's ``__name__``, so callables that share a name (e.g. lambdas)
  overwrite one another in the output frame.

  Args:
    df: Source DataFrame. It is not modified.
    variable: Name of the column of ``df`` to transform.
    list_of_transformations: Sequence of named, single-argument callables.

  Returns:
    A ``(new_df, names)`` pair. ``new_df`` is ``df`` with one additional column
    per transformation, and ``names`` is a tuple of those column names in the
    order the transformations were given. With no transformations, an
    unchanged copy of ``df`` and an empty tuple are returned.
  """
  source_column = df[variable]

  # One (column name, transformed Series) pair per transformation
  results = [(f.__name__, source_column.map(f)) for f in list_of_transformations]

  # Nothing to add: unpacking zero results used to raise a ValueError
  if not results:
    return df.copy(), ()

  # Convert results to a DataFrame with appropriate column names
  transformed_df = pd.DataFrame(dict(results))

  # Concatenate original DataFrame with the transformed DataFrame
  new_df = pd.concat([df, transformed_df], axis=1)

  # Transformed Variable Names
  transformed_variable_names = tuple(name for name, _ in results)
  return new_df, transformed_variable_names


def create_regression_string(dep_var, transformed_variables):
  """Build a formula string of the form ``'y ~ x1 + x2 + ...'``.

  Args:
    dep_var: Name of the dependent (left-hand side) variable.
    transformed_variables: Sequence of regressor names for the right-hand side.

  Returns:
    The formula string. When ``transformed_variables`` is empty the
    intercept-only formula ``'<dep_var> ~ 1'`` is returned.
  """
  terms = list(transformed_variables)
  # An empty right-hand side is not a usable formula, so fall back to the
  # intercept-only model instead of failing on terms[0].
  rhs = " + ".join(terms) if terms else "1"
  return f'{dep_var} ~ {rhs}'

## **Initial Run**

### **Configuration**

In [5]:
# Outcome column and the single regressor whose transformations are explored.
dep_var, rhs_var = 'price', 'fullBathrooms'

### **Create Functions**

In [6]:
# Functions
def B1(x):
  """Identity basis function: returns ``x`` unchanged (the linear term)."""
  return x

def B2(x):
  """Quadratic basis function: returns ``x`` raised to the power 2."""
  squared = pow(x, 2)
  return squared

def B3(x):
  """Hyperbolic-tangent basis function: returns ``np.tanh(x)``."""
  squashed = np.tanh(x)
  return squashed

def B4(x):
  """Sine basis function: returns ``np.sin(x)``."""
  wave = np.sin(x)
  return wave

### **Apply Transformations**

In [8]:
# Basis functions to apply; each one becomes a column named after the function.
transformations = [B1, B2, B3, B4]
new_df, transformed_variables = apply_transformations(
    df=df,
    variable=rhs_var,
    list_of_transformations=transformations,
)

### **Create Regression String**

In [9]:
# Formula with the dependent variable on the left and every transformed column on the right.
regression_string = create_regression_string(
    dep_var=dep_var,
    transformed_variables=transformed_variables,
)

### **Define Linear Model**

In [10]:
# Specify (but do not yet fit) the OLS model described by the formula string.
linear_model = smf.ols(formula=regression_string, data=new_df)

### **Results**

In [11]:
# Fit the model specified in the previous cell.
results = linear_model.fit()
# Display the fitted coefficients (intercept plus one per transformed column).
results.params

Unnamed: 0,0
Intercept,165000.0
B1,1031251.0
B2,-223956.2
B3,-139388.7
B4,-591043.6


## **For Loop**

### **Transformations**

In [12]:
# Functions
def B1(x):
  """Degree-1 polynomial term: returns ``x`` unchanged."""
  return x

def B2(x):
  """Degree-2 polynomial term: returns ``x`` raised to the power 2."""
  return pow(x, 2)

def B3(x):
  """Degree-3 polynomial term; replaces the tanh-based ``B3`` defined earlier in the notebook."""
  return pow(x, 3)

def B4(x):
  """Degree-4 polynomial term; replaces the sine-based ``B4`` defined earlier in the notebook."""
  return pow(x, 4)

def B5(x):
  """Degree-5 polynomial term: returns ``x`` raised to the power 5."""
  return pow(x, 5)

def B6(x):
  """Degree-6 polynomial term: returns ``x`` raised to the power 6."""
  return pow(x, 6)

def B7(x):
  """Degree-7 polynomial term: returns ``x`` raised to the power 7."""
  return pow(x, 7)

### **Configuration**

In [13]:
# Outcome column and the regressor whose polynomial terms are added one at a time.
dep_var, rhs_var = 'price', 'rooms'

### **Run Code**

In [14]:
# Candidate terms, tried cumulatively in this order (first k terms at step k).
list_of_transformations = [B1, B2, B3, B4, B5, B6]

# adj holds the previous step's adjusted R-squared and new_adj the current one.
# Starting adj at -inf guarantees the first model is always fitted.
adj = float('-inf')
signif_coeff = True
new_adj = 0
k=0

# Keep adding terms while the newest coefficient is significant at the 5% level
# and adjusted R-squared has not fallen. The bound on k is required: once every
# candidate is in use, list_of_transformations[:k] stops growing, the same model
# would be refit with new_adj == adj, and the loop would never terminate.
while signif_coeff and (new_adj >= adj) and (k < len(list_of_transformations)):
  k += 1
  new_df, transformed_variables = apply_transformations(df, rhs_var, list_of_transformations[:k])
  regression_string = create_regression_string(dep_var, transformed_variables)
  linear_model = smf.ols(regression_string, data=new_df)
  results = linear_model.fit()
  new_adj = results.rsquared_adj
  # pvalues is indexed by term name, so select the last entry by position with .iloc
  last_pvalue = results.pvalues.iloc[-1]
  signif_coeff = last_pvalue < 0.05
  print(k, last_pvalue, new_adj, adj)
  adj=new_adj

# Note: on exit, results/new_df hold the last model fitted, which is the one
# that failed a criterion unless the loop stopped because the terms ran out.

1 1.5078626542975521e-139 0.4419243178325073 -inf
2 0.017651662587212555 0.4443090600351466 0.4419243178325073
3 2.7572268155311456e-06 0.45499449502747946 0.4443090600351466
4 0.9472447787116507 0.4544920727005862 0.45499449502747946
