# Gender Wage Gap

In this case study we take a look at the impact of gender on weekly wage.  A few other variables including education level, geographic location, and years of experience.  Note that years of experience has exp1 which appears to be the raw value, and then exp2 and exp3 are transformations (it claims squared and cubic, though I don't really see how that could be true from the values). 

In [2]:
from itertools import chain, combinations
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from interpret import show
from interpret.perf import RegressionPerf
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree

In [3]:
data = pd.read_csv("data/wagedata.csv").drop('Unnamed: 0', axis=1)
data.head()

Unnamed: 0,female,cg,sc,hsg,mw,so,we,ne,exp1,exp2,exp3,wage
0,0,0,0,1,0,0,0,1,33.0,10.89,35.937,11.659091
1,0,1,0,0,0,0,0,1,27.0,7.29,19.683,12.825
2,0,0,1,0,0,0,0,1,13.0,1.69,2.197,5.777027
3,0,1,0,0,0,0,0,1,2.0,0.04,0.008,12.46875
4,1,1,0,0,0,0,0,1,15.0,2.25,3.375,18.525


## Data Centering & Scaling

In [4]:
cols = ['exp1', 'exp2', 'exp3']
clean_data = data.copy()
for col in cols:
    clean_data[f'{col}_c'] = (clean_data[col] - clean_data[col].mean()) / (2*clean_data[col].std())
clean_data.head()

Unnamed: 0,female,cg,sc,hsg,mw,so,we,ne,exp1,exp2,exp3,wage,exp1_c,exp2_c,exp3_c
0,0,0,0,1,0,0,0,1,33.0,10.89,35.937,11.659091,1.137054,1.436279,1.667453
1,0,1,0,0,0,0,0,1,27.0,7.29,19.683,12.825,0.789805,0.81784,0.767773
2,0,0,1,0,0,0,0,1,13.0,1.69,2.197,5.777027,-0.020441,-0.144176,-0.200101
3,0,1,0,0,0,0,0,1,2.0,0.04,0.008,12.46875,-0.657063,-0.427628,-0.321265
4,1,1,0,0,0,0,0,1,15.0,2.25,3.375,18.525,0.095308,-0.047975,-0.134897


### Data Exploration
We want to look at the differences in wages for mean and women.

In [6]:
clean_data.groupby('female').mean().T

female,0,1
cg,0.354839,0.406114
sc,0.301971,0.354336
hsg,0.34319,0.239551
mw,0.284946,0.291329
so,0.235215,0.255147
we,0.221326,0.198378
ne,0.258513,0.255147
exp1,13.580197,13.037118
exp2,2.586588,2.449453
exp3,5.964938,5.599297


In [7]:
clean_data.columns

Index(['female', 'cg', 'sc', 'hsg', 'mw', 'so', 'we', 'ne', 'exp1', 'exp2',
       'exp3', 'wage', 'exp1_c', 'exp2_c', 'exp3_c'],
      dtype='object')

In [8]:
independant_model = 'wage ~ female + cg + sc + hsg + mw + so + we + ne + exp1_c + exp2_c + exp3_c'
results = smf.ols(independant_model, data=clean_data).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     44.87
Date:                Mon, 20 May 2019   Prob (F-statistic):           3.17e-77
Time:                        11:55:17   Log-Likelihood:                -15235.
No. Observations:                3835   AIC:                         3.049e+04
Df Residuals:                    3825   BIC:                         3.055e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.0643      0.173     58.168      0.0

In [9]:
features = ['female', 'cg', 'sc', 'hsg', 'mw', 'so', 'we', 'ne', 'exp1_c']
X_train, X_test, y_train, y_test = train_test_split(clean_data[features], clean_data['wage'], test_size=0.20)
polyfeatures = PolynomialFeatures(degree=2, interaction_only=True)
model = Lasso(alpha=0.1)
model_pipeline = Pipeline([("preproc", polyfeatures),
                           ('model', model)])
model_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('preproc', PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)), ('model', Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [19]:
dir(model_pipeline)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_inverse_transform',
 '_pairwise',
 '_replace_estimator',
 '_set_params',
 '_transform',
 '_validate_names',
 '_validate_steps',
 'classes_',
 'decision_function',
 'fit',
 'fit_predict',
 'fit_transform',
 'get_params',
 'inverse_transform',
 'memory',
 'named_steps',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'steps',
 'transform']

In [10]:
import interpret

In [11]:
poly_perf = RegressionPerf(model_pipeline.predict).explain_perf(X_test.values, y_test.values, name='Poly Regression')
show(poly_perf)

In [14]:
from interpret import show
from interpret.data import Marginal

marginal = Marginal().explain_data(X_train.values, y_train.values, name = 'Train Data')
show(marginal)

In [15]:
lr = LinearRegression(random_state=1)
lr.fit(X_train.values, y_train.values)
lr_perf = RegressionPerf(lr.predict).explain_perf(X_test.values, y_test.values, name='Linear Regression')
show(lr_perf)

In [18]:
local = lr.explain_local(X_test.values, y_test.values, name='Linear Regression')
show(local)

In [17]:
global_ = lr.explain_global()
show(global_)