# Logit Regressions

## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions

In [2]:
import statsmodels.api as sm
from sklearn import linear_model, metrics
import math

In [3]:
# Never truncate columns, display all the data
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

# Temporary files to delete at the end of the experiment
temp_files = []

### Database

In [4]:
# Unpack the zipped database into the data directory
zip_filename = "../data/ee-insee-2015_custom-sqlite.zip"
with zipfile.ZipFile(zip_filename) as archive:
    archive.extractall(path="../data/")

# Path of the SQLite file: the archive path with its suffix swapped
# (presumably the name of the file stored inside the archive -- verify).
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

# Register it so it gets deleted once the experiment is over
temp_files += [eedb]

In [5]:
# Load the whole `eec15_custom` table into a DataFrame.
# Note: `with sqlite3.connect(...)` only wraps a transaction (commit/rollback);
# it does NOT close the connection. Close it explicitly so that no handle
# stays open on the database file, which is deleted at the end of the notebook.
query = "SELECT * FROM eec15_custom"
con = sqlite3.connect(eedb)
try:
    eec15 = pd.read_sql_query(query, con)
finally:
    con.close()

## Experiment

In [6]:
# Create a ("female" x "enfant") interaction variable:
# 1 when both `female_` and `enfants_` are non-zero, 0 otherwise
bool_ = eec15[["enfants_", "female_"]].astype(bool)
eec15["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# Drop data we don't need: keep only the rows with `age60_ == 0`, after which
# the `age60_` column is constant and can be removed.
# `axis` is passed by keyword: newer pandas versions no longer accept it
# positionally in `DataFrame.drop`.
eec15 = eec15[eec15.age60_ == 0]
eec15 = eec15.drop("age60_", axis=1)

# TODO (not finished): Create ("region" x "diplome") interaction variables
# parameters = [x for x in eec15.columns if (x.startswith("dip") or x.startswith("region")) and x.endswith("_")]
# bool_ = eec15[parameters].astype(bool)
# bool_.head()

# Merge region3 and region4 to make them significant
# df["region3"] = np.where(df["region4"] == 1, 1, np.where(df["region3"] == 1, 1, 0))

### Logit Regression

In [7]:
# One regex per category of regressors; a column of `eec15` belongs to a
# category when its name matches the category's pattern.
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
filters = {
    "age": "^age[0-9]{2}_$",
    "diploma": "^dip[0-9]{2}_$",
    "etranger": "^etranger_$",
    "domtom": "^domtom_$",
#     "trim": "^trim$",
    "female": "^female_$",
    "enfants": "^enfants_$",
#     "female_enfants": "^female_enfants_$",
    "region": "^region[1-2]_$"
}

params = {}
for category, pattern in filters.items():
    matched = sorted([col for col in eec15.columns if re.match(pattern, col)])
    # Avoid the dummy variable trap: when a category spans several columns,
    # leave out the last one (in sorted order) as the reference level.
    params[category] = matched if len(matched) == 1 else matched[:-1]

# The region pattern only selects `region1_`/`region2_`, so the columns it
# excludes already act as the reference: put `region2_` back in the model.
params["region"] = params["region"] + ["region2_"]

In [8]:
# One label per distinct value of `trim`, e.g. "t1"
trims = ["t{}".format(value) for value in sorted(eec15["trim"].unique())]

# Per-label containers: full rows, regressors and target (`actop_`)
eec15_, X, y = {}, {}, {}
for label in trims:
    # The character right after the leading "t" identifies the `trim` value
    rows = eec15[eec15.trim == int(label[1])]
    eec15_[label] = rows
    X[label] = rows[sum(params.values(), [])]
    y[label] = rows["actop_"]

#### Train the model using `trim=1` data

In [9]:
# Fit using `sklearn`.
# "sag" is a stochastic solver: pin `random_state` so that re-running the
# notebook yields the same coefficients.
# NOTE(review): `LogisticRegression` is regularised by default (see its `C`
# parameter), so its coefficients are not expected to match an unpenalised
# fit exactly -- confirm whether a larger `C` is wanted here.
reg_logit_sk = linear_model.LogisticRegression(solver="sag", random_state=0).fit(X["t1"], y["t1"])

# Fit using `statsmodels` (an explicit constant column is added for the intercept)
reg_logit_sm = sm.Logit(y["t1"], sm.add_constant(X["t1"])).fit(disp=False)

# Print the (`statsmodels`) model summary
reg_logit_sm.summary()

0,1,2,3
Dep. Variable:,actop_,No. Observations:,72838.0
Model:,Logit,Df Residuals:,72818.0
Method:,MLE,Df Model:,19.0
Date:,"Sat, 04 Feb 2017",Pseudo R-squ.:,0.222
Time:,13:11:17,Log-Likelihood:,-36757.0
converged:,True,LL-Null:,-47245.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-0.6978,0.041,-17.122,0.000,-0.778 -0.618
etranger_,0.6915,0.035,19.827,0.000,0.623 0.760
age15_,1.5972,0.026,62.597,0.000,1.547 1.647
age30_,0.3117,0.029,10.661,0.000,0.254 0.369
age40_,-0.0758,0.028,-2.714,0.007,-0.131 -0.021
dip10_,-1.9053,0.037,-50.911,0.000,-1.979 -1.832
dip11_,-2.3686,0.072,-32.829,0.000,-2.510 -2.227
dip30_,-1.0517,0.095,-11.076,0.000,-1.238 -0.866
dip31_,-1.8161,0.042,-43.478,0.000,-1.898 -1.734


In [10]:
# Put the coefficients of both fits side by side. They should be close; small
# gaps are expected because the two libraries do not fit exactly the same way
# (scikit-learn's `LogisticRegression` is regularised by default).
regressors = list(X["t1"].columns) + ["const"]

# `sklearn` keeps slopes and intercept apart: flatten both, intercept last,
# so that the values line up with `regressors`
sk_values = np.concatenate([np.ravel(reg_logit_sk.coef_), np.ravel(reg_logit_sk.intercept_)])

coeffs = pd.DataFrame(index=regressors)
coeffs["sm"] = reg_logit_sm.params
coeffs["sk"] = pd.Series(sk_values, index=regressors)
coeffs["diff"] = coeffs["sm"].sub(coeffs["sk"])

coeffs.sort_index().T

Unnamed: 0,age15_,age30_,age40_,const,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,domtom_,enfants_,etranger_,female_,region1_,region2_
sm,1.5972,0.3117,-0.0758,-0.6978,-1.9053,-2.3686,-1.0517,-1.8161,-2.1377,-0.5214,-1.3976,-0.9673,0.1498,-0.1829,0.6042,-0.4556,0.6915,0.5176,0.2039,0.2614
sk,1.5948,0.3097,-0.0767,-0.7008,-1.8976,-2.3515,-1.0375,-1.8078,-2.1187,-0.5149,-1.3908,-0.9618,0.1556,-0.1776,0.6052,-0.4551,0.6924,0.5167,0.2029,0.2608
diff,0.0023,0.0021,0.0009,0.0031,-0.0077,-0.0171,-0.0142,-0.0083,-0.019,-0.0065,-0.0069,-0.0056,-0.0058,-0.0053,-0.001,-0.0004,-0.0009,0.0009,0.001,0.0005


In [11]:
# Check the accuracy of the model on the training set
# (parenthesised `print` gives the same output on Python 2 and Python 3)
print("Accuracy: {}".format(reg_logit_sk.score(X["t1"], y["t1"])))

Accuracy: 0.762417968643


#### Apply the model to test data (`trim=2/3/4`)

In [12]:
# Check the accuracy of the model on the training & test sets

for t in trims:
    # Alternative, equivalent way of computing `predicted` (THRESHOLD = 0.5):
    #     predicted = reg_logit_sk.predict_proba(X[t]).T[1]
    #     predicted = np.where(predicted < 0.5, 0, 1)
    predicted = reg_logit_sk.predict(X[t])
    # Parenthesised `print` gives the same output on Python 2 and Python 3
    print("Accuracy ({}): {}".format(t, metrics.accuracy_score(y[t], predicted)))

Accuracy (t1): 0.762417968643
Accuracy (t2): 0.767521534847
Accuracy (t3): 0.760154682639
Accuracy (t4): 0.763421317923


### Marginal Effects

In [13]:
# Empty frame that collects the marginal-effect estimates, one column per method
marginal_effects = pd.DataFrame()

#### By "rule-of-thumb"
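The "rule-of-thumb" divides every regression coefficient by 4 to approximate its marginal effect. The derivative of the logistic function is at most 1/4 (reached at 0), so this gives an upper bound on the magnitude of the marginal effect.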

In [14]:
# Rule of thumb: a quarter of each `statsmodels` coefficient
marginal_effects["rule_of_thumb"] = coeffs["sm"].div(4)

#### Analytically (using the derivative formula)
Recall that:
$$EM(x_1) = \frac{\partial{\mathbb{E}[y|x]}}{\partial{x_1}} = \beta_1 \Lambda'(\beta_0 + \beta_1 x_1 + ... )$$
where
$$ \Lambda(x) = \frac{e^x}{1+e^x} = P(y=1|x)$$
and
$$\Lambda'(x) = \Lambda(x)\left(1-\Lambda(x)\right) = \Lambda(x)\Lambda(-x)$$

**Note**: for each parameter, we calculate the *mean* marginal effect over the entire training dataset

In [15]:
def lambda_(x):
    """Logistic function ``exp(x) / (1 + exp(x))`` of the scalar ``x``.

    The value is computed by only ever exponentiating a non-positive number,
    so ``math.exp`` cannot overflow. The direct formula raises
    ``OverflowError`` for large positive ``x`` (above roughly 709).

    Returns a float in ``[0, 1]``.
    """
    if x >= 0:
        # exp(-x) <= 1 here, so there is no overflow
        return 1.0 / (1.0 + math.exp(-x))
    # x < 0: exp(x) < 1, the direct formula is safe
    exp_x = math.exp(x)
    return exp_x / (1.0 + exp_x)

def dlambda(x):
    """Derivative of ``lambda_`` at ``x``, computed as ``lambda_(x) * lambda_(-x)``."""
    at_x = lambda_(x)
    at_minus_x = lambda_(-x)
    return at_x * at_minus_x

In [16]:
# Linear predictor of every training row: regressors times their `statsmodels`
# coefficients, plus the intercept. The last entry of `coeffs["sm"]` is left
# out of the dot product (it is "const", appended last when `coeffs` was built).
# Despite its name, `y_fitted` is on the linear (pre-logistic) scale.
slopes = coeffs["sm"].iloc[:-1]
intercept = coeffs["sm"]["const"]
y_fitted = X["t1"].dot(slopes) + intercept

# Logistic derivative evaluated at each row's linear predictor
dlambda_y = y_fitted.map(dlambda)

In [17]:
# Analytical marginal effect: each coefficient scaled by the average of the
# logistic derivative over the training rows
mean_dlambda = dlambda_y.mean()
marginal_effects["analytical"] = coeffs["sm"].mul(mean_dlambda)

#### By modifying the dataset ("passage de tout le monde en licence")

In [18]:
# "Brute-force" marginal effects: for every dummy (`pivot`), switch it on for
# all training rows (and switch off the other dummies of the same category),
# then compare the mean predicted probability with the one obtained on the
# unmodified training set.

# Baseline: mean predicted probability of class 1 on the original data.
# It does not depend on the pivot, so it is computed once, outside the loops.
proba_t1 = reg_logit_sk.predict_proba(X["t1"]).T[1].mean()

pivots, effects = [], []
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]

        X_one = X["t1"].copy()
        if non_pivots:
            # Nothing to reset for single-dummy categories
            X_one[non_pivots] = 0
        X_one[pivot] = 1

        proba_one = reg_logit_sk.predict_proba(X_one).T[1].mean()
        pivots.append(pivot)
        effects.append(proba_one - proba_t1)

# Build the Series in one go (explicit dtype, no element-by-element growth)
brute_force = pd.Series(effects, index=pivots, dtype=float)

In [19]:
# Add the brute-force estimates (aligned on the regressor names) and show
# the methods as rows
marginal_effects["brute_force"] = brute_force
marginal_effects.T

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_,const
rule_of_thumb,0.1729,0.3993,0.0779,-0.019,-0.4763,-0.5922,-0.2629,-0.454,-0.5344,-0.1303,-0.3494,-0.2418,0.0374,-0.0457,0.1294,-0.1139,0.051,0.0653,0.151,-0.1744
analytical,0.1148,0.2652,0.0518,-0.0126,-0.3164,-0.3934,-0.1747,-0.3016,-0.355,-0.0866,-0.2321,-0.1606,0.0249,-0.0304,0.086,-0.0757,0.0339,0.0434,0.1003,-0.1159
brute_force,0.1128,0.206,-0.0493,-0.1138,-0.166,-0.2192,-0.0303,-0.154,-0.1935,0.0709,-0.0913,-0.0164,0.2107,0.1407,0.0417,-0.0438,0.0022,0.0119,0.0927,


In [20]:
# Express every estimate relative to the analytical one, row by row
# (`div` returns a new frame, `marginal_effects` itself is left untouched)
ratios = marginal_effects.div(marginal_effects["analytical"], axis=0)
ratios.T

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_,const
rule_of_thumb,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054,1.5054
analytical,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
brute_force,0.9823,0.7767,-0.9528,9.033,0.5246,0.5571,0.1735,0.5105,0.5451,-0.8187,0.3934,0.1018,8.4692,-4.6329,0.4848,0.5788,0.0662,0.2753,0.9236,


## Cleanup

Now that the experiments have concluded, we delete all the "temporary" files.

In [21]:
# Delete every registered temporary file. Files that are already gone are
# skipped so that this cell can safely be run more than once.
for temp in temp_files:
    if os.path.exists(temp):
        os.remove(temp)