<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/causality_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!git clone https://github.com/pharringtonp19/business-analytics.git

fatal: destination path 'business-analytics' already exists and is not an empty directory.


### **Import Packages**

In [28]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import jax.numpy as jnp
import jax
import seaborn as sb
from functools import partial

### **Read In Data Set**

In [29]:
# Load the reduced life-expectancy table from the cloned repository and preview
# its first rows.
df = pd.read_csv(
    filepath_or_buffer='/content/business-analytics/datasets/lifeExpectancy_reduced.csv'
)
df.head(5)

Unnamed: 0,Country,Region,Year,Alcohol_consumption,BMI,GDP_per_capita,Population_mln,Schooling,Economy_status_Developed,Life_expectancy
0,Turkiye,Middle East,2015,1.32,27.8,11006,78.53,7.8,0,76.5
1,Spain,European Union,2015,10.35,26.0,25742,46.44,9.7,1,82.8
2,India,Asia,2007,1.57,21.2,1076,1183.21,5.0,0,65.4
3,Guyana,South America,2006,5.68,25.3,4146,0.75,7.9,0,67.0
4,Israel,Middle East,2012,2.89,27.0,33995,7.91,12.8,1,81.7


### **Variables of Interest**

In [30]:
# Outcome column and the regressor whose effect on it is being studied.
dep_var, causal_var = 'Life_expectancy', 'Alcohol_consumption'

### **Classify Variables**

In [31]:
# Columns that have to be treated as categorical when they enter a formula.
categorical_vars = ['Region']
# Candidate controls: every column except the country label, the outcome and
# the causal variable.
eligible_vars = df.drop(columns=['Country', dep_var, causal_var]).columns.tolist()

### **Dictionary Comprehension**

In [32]:
# Map each candidate column to the text used for it inside a regression
# formula: categorical columns are wrapped as C(<name>), all others are used
# unchanged.
eligible_vars_rep = {
    name: (f'C({name})' if name in categorical_vars else name)
    for name in eligible_vars
}
eligible_vars_rep

{'Region': 'C(Region)',
 'Year': 'Year',
 'BMI': 'BMI',
 'GDP_per_capita': 'GDP_per_capita',
 'Population_mln': 'Population_mln',
 'Schooling': 'Schooling',
 'Economy_status_Developed': 'Economy_status_Developed'}

### **Create Function**

In [33]:
def get_pvalues(reg_formula, var_rep, data=None):
  """Return the smallest coefficient p-value attached to one formula term.

  An OLS model is fitted on the formula ``reg_formula + ' + ' + var_rep`` and
  the p-values of the coefficients that belong to ``var_rep`` are collected.
  A coefficient belongs to the term when its name equals ``var_rep`` or is
  ``var_rep`` followed by a bracketed suffix (for example
  ``C(Region)[T.Asia]``). The minimum of the collected p-values is returned.

  Parameters
  ----------
  reg_formula : str
      Start of the formula that ``var_rep`` is appended to, e.g.
      ``'Life_expectancy ~ '``.
  var_rep : str
      The term exactly as it should appear in the formula, e.g. ``'BMI'`` or
      ``'C(Region)'``.
  data : pandas.DataFrame, optional
      Data the model is fitted on. When omitted, the notebook-level ``df`` is
      used, which is what two-argument callers relied on.

  Returns
  -------
  float
      Minimum p-value over the coefficients of ``var_rep``.

  Raises
  ------
  KeyError
      If the fitted model has no coefficient belonging to ``var_rep``.
  """
  if data is None:
    data = df  # fall back to the notebook-level frame for existing callers
  fitted = smf.ols(reg_formula + ' + ' + var_rep, data).fit()
  pvalues = fitted.pvalues

  # Match on the coefficient names directly instead of splitting at the first
  # '[': this also covers level-coded terms that were not wrapped in C(), and
  # terms whose own text contains a '['.
  level_prefix = var_rep + '['
  belongs_to_term = [
      name == var_rep or name.startswith(level_prefix) for name in pvalues.index
  ]
  term_pvalues = pvalues[belongs_to_term]
  if term_pvalues.empty:
    raise KeyError(var_rep)

  # NOTE(review): taking the minimum over the level p-values is not a joint
  # test of the whole term; consider whether a joint F-test is wanted here.
  return term_pvalues.min()

### **Check Relationships**

In [34]:
# For every candidate control, regress the outcome (then the causal variable)
# on that control alone and flag it when its p-value is at most 0.05.
# The spelling `realatedy` is kept because the cell that builds analysis_df reads it.
realatedy = np.array(
    [get_pvalues(dep_var + ' ~ ', rep) for rep in eligible_vars_rep.values()]
) <= 0.05
relatedx = np.array(
    [get_pvalues(causal_var + ' ~ ', rep) for rep in eligible_vars_rep.values()]
) <= 0.05

### **Filter**

In [35]:
# One row per candidate control, with a boolean flag for its relationship to
# the outcome (RelatedY) and to the causal variable (RelatedX).
analysis_df = pd.DataFrame(
    np.column_stack([realatedy, relatedx]),
    index=eligible_vars,
    columns=['RelatedY', 'RelatedX'],
)

analysis_df

Unnamed: 0,RelatedY,RelatedX
Region,True,True
Year,True,False
BMI,True,True
GDP_per_capita,True,True
Population_mln,False,True
Schooling,True,True
Economy_status_Developed,True,True


In [36]:
# Keep only the candidate controls flagged as related to both the outcome and
# the causal variable.
condition = analysis_df[['RelatedY', 'RelatedX']].all(axis=1)
selected_vars = analysis_df.loc[condition].index
print(selected_vars)

Index(['Region', 'BMI', 'GDP_per_capita', 'Schooling',
       'Economy_status_Developed'],
      dtype='object')
