<a href="https://colab.research.google.com/github/pharringtonp19/business-analytics/blob/main/notebooks/regression/causality_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/pharringtonp19/business-analytics.git

fatal: destination path 'business-analytics' already exists and is not an empty directory.


### **Import Packages**

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from functools import partial
import numpy as np

### **Variables to Change**

In [3]:
# File name of the CSV to load from the repository's datasets folder.
dataset = 'lifeExpectancy_reduced.csv'
# Outcome column: the left-hand side of the final regressions.
dep_var = 'Life_expectancy'
# Regressor of interest whose coefficient is compared across models.
causal_var = 'Alcohol_consumption'
# Columns that are never considered as candidate control variables.
vars_to_drop = ['Country']
# Candidate controls that must be wrapped in C(...) in the regression formulas.
categorical_vars = ['Region']

### **Read In Data Set**

In [4]:
# The path is relative to the working directory, i.e. the place where the clone
# cell put the repository (/content by default on Colab), so the notebook also
# runs outside Colab.
df = pd.read_csv(f'business-analytics/datasets/{dataset}')
df.head()

Unnamed: 0,Country,Region,Year,Alcohol_consumption,BMI,GDP_per_capita,Population_mln,Schooling,Economy_status_Developed,Life_expectancy
0,Turkiye,Middle East,2015,1.32,27.8,11006,78.53,7.8,0,76.5
1,Spain,European Union,2015,10.35,26.0,25742,46.44,9.7,1,82.8
2,India,Asia,2007,1.57,21.2,1076,1183.21,5.0,0,65.4
3,Guyana,South America,2006,5.68,25.3,4146,0.75,7.9,0,67.0
4,Israel,Middle East,2012,2.89,27.0,33995,7.91,12.8,1,81.7


### **Classify Variables**

In [5]:
# Candidate controls: every column except the dropped ones, the outcome and the
# causal variable.
eligible_vars = df.drop(columns=[*vars_to_drop, dep_var, causal_var]).columns.tolist()

### **Build Formula Terms (Dictionary Comprehension)**

In [6]:
# Map each candidate column to the term used in a formula: categorical columns
# are wrapped in C(...), all others are used as-is.
eligible_vars_rep = {
    var: f'C({var})' if var in categorical_vars else var
    for var in eligible_vars
}
eligible_vars_rep

{'Region': 'C(Region)',
 'Year': 'Year',
 'BMI': 'BMI',
 'GDP_per_capita': 'GDP_per_capita',
 'Population_mln': 'Population_mln',
 'Schooling': 'Schooling',
 'Economy_status_Developed': 'Economy_status_Developed'}

### **Create Function**

In [7]:
def get_pvalues(reg_formula, var_rep, data=None):
  """Return the p-value of `var_rep` after appending it to `reg_formula`.

  An OLS model is fitted on the extended formula. When the appended term
  expands into several coefficients (one per category level), the smallest
  p-value among those coefficients is returned.

  Parameters
  ----------
  reg_formula : str
      Formula prefix, e.g. ``'y ~ '`` or ``'y ~ x'``.
  var_rep : str
      Term to append, written as it appears in the formula
      (e.g. ``'BMI'`` or ``'C(Region)'``).
  data : pandas.DataFrame, optional
      Data to fit on. Defaults to the notebook-level ``df``.

  Returns
  -------
  float
      The (minimum) p-value associated with `var_rep`.

  Raises
  ------
  KeyError
      If no fitted coefficient maps back to `var_rep`.
  """
  if data is None:
    data = df
  prefix = reg_formula.rstrip()
  # Avoid building "y ~  + x" when the prefix has no right-hand side yet.
  separator = ' ' if prefix.endswith('~') else ' + '
  results = smf.ols(prefix + separator + var_rep, data).fit()
  pvalues = results.pvalues.drop('Intercept', errors='ignore')
  # Level coefficients are labelled "<term>[<level>]". Strip the bracket suffix
  # for every name, not only those containing "C(", so that string/boolean
  # columns that were not wrapped in C() are matched back to their term too.
  min_pvalues = pvalues.groupby(lambda name: name.split('[')[0]).min()
  return min_pvalues[var_rep]

In [14]:
# Spot check of the correlation between the causal variable and Year; uses
# `causal_var` so the check follows the configuration cell.
df[causal_var].corr(df['Year'])

-0.0006105222266049098

### **Check Relationships**

In [13]:
# A variable counts as "related" when its p-value is at or below this level.
significance_level = 0.05

# One regression per candidate variable: first with the outcome on the
# left-hand side, then with the causal variable on the left-hand side.
relatedy = np.array([get_pvalues(dep_var + ' ~ ', rep)
                     for rep in eligible_vars_rep.values()]) <= significance_level
relatedx = np.array([get_pvalues(causal_var + ' ~ ', rep)
                     for rep in eligible_vars_rep.values()]) <= significance_level

analysis_df = pd.DataFrame({'RelatedY': relatedy,
                            'RelatedX': relatedx}, index=eligible_vars)

analysis_df

Unnamed: 0,RelatedY,RelatedX
Region,True,True
Year,True,False
BMI,True,True
GDP_per_capita,True,True
Population_mln,False,True
Schooling,True,True
Economy_status_Developed,True,True


### **Select Variables**

In [16]:
# Keep the variables flagged in both screening columns, in alphabetical order.
condition = analysis_df[['RelatedY', 'RelatedX']].all(axis=1)
selected_vars = analysis_df.loc[condition].index.sort_values()
print(selected_vars)

Index(['BMI', 'Economy_status_Developed', 'GDP_per_capita', 'Region',
       'Schooling'],
      dtype='object')


### **Estimator**

In [10]:
def Estimate(reg_formula):
  """Fit an OLS model for `reg_formula` on the notebook-level `df`.

  Returns the fitted results object produced by `.fit()`.
  """
  return smf.ols(reg_formula, data=df).fit()

### **Compare Linear Models**

In [11]:
# Model 1: outcome on the causal variable alone.
reg_formula1 = f'{dep_var} ~ {causal_var}'
# Model 2: the same formula with every selected variable appended as a term.
reg_formula2 = ' + '.join(
    [reg_formula1, *(eligible_vars_rep[name] for name in selected_vars)]
)
results1 = Estimate(reg_formula1)
results2 = Estimate(reg_formula2)

### **Results**

In [12]:
# Side-by-side table of the two fitted models: causal variable alone versus
# causal variable plus the selected variables.
summary_col([results1, results2])

0,1,2
,Life_expectancy I,Life_expectancy II
Intercept,64.3108,37.1771
,(0.2531),(1.5343)
Alcohol_consumption,0.9428,-0.3951
,(0.0405),(0.0391)
C(Region)[T.Asia],,8.7652
,,(0.3216)
C(Region)[T.Central America and Caribbean],,10.4334
,,(0.3699)
C(Region)[T.European Union],,8.9710
