<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/causality_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/pharringtonp19/business-analytics.git

Cloning into 'business-analytics'...
remote: Enumerating objects: 1215, done.[K
remote: Counting objects: 100% (832/832), done.[K
remote: Compressing objects: 100% (385/385), done.[K
remote: Total 1215 (delta 582), reused 601 (delta 422), pack-reused 383 (from 1)[K
Receiving objects: 100% (1215/1215), 21.39 MiB | 14.13 MiB/s, done.
Resolving deltas: 100% (710/710), done.


### **Import Packages**

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import jax.numpy as jnp
import jax
import seaborn as sb
from functools import partial

### **Read In Data Set**

In [3]:
df = pd.read_csv('/content/business-analytics/datasets/brookline.csv')
df.head()

Unnamed: 0,price,stNumber,stName,size,beacon,baseFloor,buildingStyle,elevators,rooms,bedrooms,fullBathrooms,halfBathrooms,garage
0,174000,150,PLEASANT ST,1060,0,4,MID-RISE,1,4,2,1,1,1.0
1,337000,7,LEVERETT ST,831,0,1,DECKER,0,4,2,1,0,0.0
2,850000,24,EUSTON ST,2246,0,1,ROW-END,0,10,6,3,0,0.0
3,516000,417,WASHINGTON ST,1574,0,2,LOW-RISE,0,6,3,2,0,0.0
4,145000,150,PLEASANT ST,669,0,4,MID-RISE,1,3,1,1,0,1.0


### **Variables of Interest**

In [71]:
dep_var = 'price'
causal_var = 'size'

### **Classify Variables**

In [72]:
eligible_vars = df.columns.tolist()
eligible_vars.remove(dep_var)
categorical_vars = ['stNumber', 'stName', 'buildingStyle']

### **Dictionary Comprehension**

In [73]:
eligible_vars_rep = {var : var if var not in categorical_vars else f'C({var})' for var in eligible_vars}
eligible_vars_rep

{'stNumber': 'C(stNumber)',
 'stName': 'C(stName)',
 'size': 'size',
 'beacon': 'beacon',
 'baseFloor': 'baseFloor',
 'buildingStyle': 'C(buildingStyle)',
 'elevators': 'elevators',
 'rooms': 'rooms',
 'bedrooms': 'bedrooms',
 'fullBathrooms': 'fullBathrooms',
 'halfBathrooms': 'halfBathrooms',
 'garage': 'garage'}

### **Create Function**

In [74]:
def get_pvalues(reg_formula, var_rep):
  updated_reg_formula = reg_formula + ' + ' + var_rep
  linear_model  = smf.ols(updated_reg_formula, df)
  results = linear_model.fit()
  pvalues_series = results.pvalues
  pvalues_df = pd.DataFrame(list(pvalues_series.items()), columns=['variable', 'pvalue'])
  pvalues_df['base_variable'] = pvalues_df['variable'].map(lambda x: x.split('[')[0] if 'C(' in x else x)
  condition = pvalues_df['variable'] != 'Intercept'
  pvalues_df = pvalues_df[condition]
  pvalues_df = pvalues_df.groupby('base_variable')['pvalue'].min()
  return pvalues_df[var_rep]

### **Check Relationships**

In [86]:
realatedy = np.array(list(map(partial(get_pvalues, dep_var + ' ~ '), eligible_vars_rep.values()))) < 0.05
relatedx =  np.array(list(map(partial(get_pvalues, causal_var + ' ~ '), eligible_vars_rep.values()))) < 0.05

### **Filter**

In [87]:
analysis_df = pd.DataFrame({'RelatedY': realatedy,
                            'RelatedX': relatedx}, index=eligible_vars)

condition = analysis_df['RelatedY'] &analysis_df['RelatedX']
selected_vars = analysis_df[condition].index
print(selected_vars)

Index(['stNumber', 'stName', 'size', 'baseFloor', 'buildingStyle', 'elevators',
       'rooms', 'bedrooms', 'fullBathrooms', 'halfBathrooms', 'garage'],
      dtype='object')
