In [2]:
#Career satisfaction between remote and non-remote workers
#Pick variables
#T-test Loop
#Chi-square
#Transform the character variables into binary
#Perform matching
#Perform a robustness check

#Directory, Libraries and data

In [3]:
# Make the course folder the working directory so the relative "stackoverflow.csv" path used when loading the data resolves.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive mount - confirm the drive is mounted before this cell runs.
%cd /content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching

/content/drive/MyDrive/Business Analyst course/Econometrics and Causal Inference/Matching


In [4]:
!pip install CausalInference

Collecting CausalInference
  Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CausalInference
Successfully installed CausalInference-0.1.3


In [5]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from causalinference import CausalModel

In [6]:
#Load the data
# Read stackoverflow.csv from the current working directory, then preview its first row.
survey_path = "stackoverflow.csv"
df = pd.read_csv(survey_path)
df.head(n=1)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction,Data_scientist,Database_administrator,...,Developer_with_stats_math_background,DevOps,Embedded_developer,Graphic_designer,Graphics_programming,Machine_learning_specialist,Mobile_developer,Quality_assurance_engineer,Systems_administrator,Web_developer
0,United Kingdom,100000.0,20,0,1,5000,Remote,8,0,0,...,0,0,1,0,0,0,0,0,0,0


#Data Analysis

In [7]:
#Picking variables
# Keep the outcome, the treatment and the candidate confounders, selected by name.
# A positional slice (first eight columns) silently picks different columns if the CSV
# layout changes or if this cell is re-run after the dummy-encoding step; selecting by
# name raises a KeyError instead.
analysis_columns = [
    "Country",
    "Salary",
    "YearsCodedJob",
    "OpenSource",
    "Hobby",
    "CompanySizeNumber",
    "Remote",
    "CareerSatisfaction",
]
df = df[analysis_columns]
df.head(1)

Unnamed: 0,Country,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,Remote,CareerSatisfaction
0,United Kingdom,100000.0,20,0,1,5000,Remote,8


In [8]:
#Transforming character variables
# One-hot encode the text columns, dropping the first level of each as the baseline.
# dtype=int pins the dummies to 0/1 integers: the default dummy dtype differs between
# pandas versions (boolean from pandas 2.0), which would change the displayed values
# and the dtype of the arrays built from this frame later on.
df = pd.get_dummies(df, drop_first=True, dtype=int)
df.head(1)

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote
0,100000.0,20,0,1,5000,8,0,0,1,0,1


In [9]:
#Comparing group averages
# Column means for Remote_Remote == 0 versus Remote_Remote == 1
by_remote = df.groupby("Remote_Remote")
by_remote.mean()

Unnamed: 0_level_0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States
Remote_Remote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,70201.175042,7.142857,0.332736,0.76051,2187.325563,7.551106,0.142857,0.096035,0.189878,0.480175
1,87400.737001,10.12,0.443478,0.766957,1712.756522,7.855652,0.069565,0.097391,0.121739,0.662609


In [10]:
#T-test
# Two-sample t-test of CareerSatisfaction: remote (Remote_Remote == 1) vs non-remote (== 0).
# Rows are picked with a boolean mask and only missing outcome values are dropped;
# masking the whole frame with where() + dropna() would also discard rows that merely
# have a gap in some unrelated column, and casts every column to float on the way.
group1 = df.loc[df["Remote_Remote"] == 1, "CareerSatisfaction"].dropna()
group2 = df.loc[df["Remote_Remote"] == 0, "CareerSatisfaction"].dropna()
# NOTE(review): ttest_ind assumes equal variances by default; if the two groups differ
# a lot in size or spread, consider equal_var=False (Welch's test).
stat, p = ss.ttest_ind(group1, group2)
print(p)

3.7011158878454286e-05


In [11]:
#T-test loop
# Variables compared between the two groups with a two-sample t-test.
# OpenSource, Hobby, CompanySizeNumber and the Country_* dummies were previously
# considered for this list and are currently left out.
continous = ["Salary", "YearsCodedJob"]

# Split once, outside the loop: group membership does not depend on the variable tested.
non_remote = df[df["Remote_Remote"] == 0]
remote = df[df["Remote_Remote"] == 1]

#Where to store result (keyed by variable name)
stat = {}
p = {}
#Loop
for x in continous:
    # Drop only the values missing in the tested column itself
    group1 = non_remote[x].dropna()
    group2 = remote[x].dropna()
    stat[x], p[x] = ss.ttest_ind(group1, group2)

# One row per variable, a single "pvalue" column; passing the column name here also
# keeps the cell working when the variable list is empty.
ttests = pd.DataFrame.from_dict(p, orient="index", columns=["pvalue"])
print(ttests)


                     pvalue
Salary         1.057708e-22
YearsCodedJob  3.637316e-30


In [12]:
#Chi-Square Test
# Contingency table of the OpenSource flag against remote status, then a test of independence
tab = pd.crosstab(df["OpenSource"], df["Remote_Remote"])
statistic, p, dof, exp = ss.chi2_contingency(tab)
print(p)

1.5357696955569458e-07


In [13]:
# Zero-row preview: lists the column names of the encoded frame without printing any data.
df.head(0)

Unnamed: 0,Salary,YearsCodedJob,OpenSource,Hobby,CompanySizeNumber,CareerSatisfaction,Country_Germany,Country_India,Country_United Kingdom,Country_United States,Remote_Remote


#Matching

In [21]:
#Isolate y, treat, confounders
# Outcome, treatment indicator and covariate matrix as NumPy arrays for CausalModel.
y = df["CareerSatisfaction"].to_numpy()
# Explicit integer 0/1 indicator, whatever dtype the dummy column carries.
treat = df["Remote_Remote"].to_numpy(dtype=int)
# Force a float matrix: a frame that mixes boolean dummies with numeric columns would
# otherwise convert to an object array, which numeric linear-algebra routines reject.
confounders = (
    df.drop(columns=["Remote_Remote", "CareerSatisfaction"])
    .to_numpy(dtype=float)
)


In [22]:
#Matching
# Estimate the effect of remote work on career satisfaction by matching on the confounders;
# the printed table reports ATE, ATC and ATT with standard errors and 95% intervals.
# In the recorded output none of the three estimates is significant at the 5% level
# (all P>|z| are above 0.18), unlike the raw t-test on CareerSatisfaction.
# NOTE(review): bias_adj=True presumably enables the library's bias-correction step (the
# np.linalg.lstsq warning in the output suggests a regression adjustment) - confirm in the docs.
model=CausalModel(y,treat,confounders)
model.est_via_matching(bias_adj=True)
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.114      0.130      0.881      0.378     -0.140      0.368
           ATC      0.106      0.136      0.778      0.436     -0.161      0.372
           ATT      0.187      0.142      1.321      0.187     -0.090      0.464



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef


In [24]:
# (rows, columns) of the encoded frame that the matching was run on
df.shape

(5594, 11)

#Robustness check


In [26]:
#Remove 1 confounder
# Robustness check: re-estimate with Hobby left out of the covariate set.
# y and treat are reused unchanged from the main matching section, so that cell must run first.
# The covariates are converted to an explicit float matrix so the result does not depend
# on the dtype of the dummy columns (boolean dummies would otherwise yield an object array).
confounders = (
    df.drop(columns=['Remote_Remote', 'CareerSatisfaction', 'Hobby'])
    .to_numpy(dtype=float)
)
#Matching
model = CausalModel(y, treat, confounders)
model.est_via_matching(bias_adj=True)
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.148      0.131      1.128      0.259     -0.109      0.406
           ATC      0.140      0.138      1.013      0.311     -0.131      0.411
           ATT      0.220      0.137      1.602      0.109     -0.049      0.488



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef
