# Logit Regressions

---
## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions

In [2]:
import statsmodels.api as sm
from sklearn import linear_model, metrics
import math

In [3]:
# Never truncate columns, display all the data
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

### Database

In [4]:
zip_filename = "../../data/ee-insee-2005_custom-sqlite.zip"
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

if not os.path.exists(eedb):
    with zipfile.ZipFile(zip_filename) as zip_file:
        zip_file.extractall("../../data/")

In [5]:
with sqlite3.connect(eedb) as con:
    query = "SELECT * FROM eec05_custom"
    eec05 = pd.read_sql_query(query, con)

In [6]:
# # Create a ("female" x "enfant") interaction variable
# bool_ = eec05[["enfants_", "female_"]].astype(bool)
# eec05["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# # Drop data we don't need
# eec05 = eec05[eec05.age60_ == 0]
# eec05 = eec05.drop("age60_", 1)

---
## Regression

In [7]:
# Build a list of parameters to include in the model, using regex
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
filters = {
#     "trim": "^trim$",
    "age": "^age$",
    "female": "^female_$",
    
#     "nationalite": "^nat28_[0-9]*_$",
#     "matri": "^matri_[0-9]*_$",
#     "enfants": "^nbenf18_[0-9]*_$",
    
#     "diploma": "^dip_[0-9]*_$",
#     "specialite": "^spe_[0-9]*_$",
    
#     "region": "^reg_[0-9]*_$",
    "unite_urbaine": "^tur5_[0-9]*_",
    
#     "csp_pere": "^cspp_[0-9]*_$",
#     "csp": "^cstot_[0-9]*_$"
}

params = {k: sorted([x for x in eec05.columns if re.match(r, x)]) for (k, r) in filters.items()}

# Avoid the dummy variable trap
params = {k: (v if len(v) == 1 else v[:-1]) for (k, v) in params.items()}
# params["region"] += ["region2_"]

In [8]:
trims = ["t{}".format(x) for x in sorted(eec05["trim"].unique())]
eec05_ = {t: eec05[eec05.trim == int(t[1])] for t in trims}
X = {t: eec05_[t][sum(params.values(), [])] for t in eec05_}
y = {t: eec05_[t]["actop_"] for t in eec05_}

### Train the model using `trim=1` data

In [9]:
# Fit using `sklearn`
reg_logit_sk = linear_model.LogisticRegression(solver="sag").fit(X["t1"], y["t1"])

# Fit using `statsmodels`
reg_logit_sm = sm.Logit(y["t1"], sm.add_constant(X["t1"])).fit(disp=False)

# Print the (`statsmodels`) model summary
reg_logit_sm.summary()

0,1,2,3
Dep. Variable:,actop_,No. Observations:,71800.0
Model:,Logit,Df Residuals:,71793.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 27 Feb 2017",Pseudo R-squ.:,0.06665
Time:,12:09:14,Log-Likelihood:,-46404.0
converged:,True,LL-Null:,-49718.0
,,LLR p-value:,0.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,-1.6437,0.029,-56.382,0.000,-1.701 -1.587
age,0.0301,0.000,71.585,0.000,0.029 0.031
tur5_1_,-0.0229,0.027,-0.854,0.393,-0.076 0.030
tur5_2_,0.1419,0.029,4.966,0.000,0.086 0.198
tur5_3_,0.2440,0.026,9.265,0.000,0.192 0.296
tur5_4_,0.2476,0.026,9.378,0.000,0.196 0.299
female_,0.3989,0.016,25.452,0.000,0.368 0.430


In [10]:
# Compare the coefficients from both fits (they should be the same)
regressors = list(X["t1"].columns) + ["const"]

coeffs = pd.DataFrame(index=regressors)
coeffs["sm"] = reg_logit_sm.params
coeffs["sk"] = pd.Series(np.append(reg_logit_sk.coef_, reg_logit_sk.intercept_), index=regressors)
coeffs["diff"] = coeffs["sm"] - coeffs["sk"]

coeffs.sort_index().transpose()

Unnamed: 0,age,const,female_,tur5_1_,tur5_2_,tur5_3_,tur5_4_
sm,0.0301,-1.6437,0.3989,-0.0229,0.1419,0.244,0.2476
sk,0.0301,-1.6432,0.3988,-0.0233,0.1414,0.2435,0.2471
diff,0.0,-0.0005,0.0001,0.0004,0.0005,0.0005,0.0005


In [11]:
# Check the accuracy of the model on the training set
predicted = np.where(reg_logit_sm.predict() < 0.5, 0, 1)
print "Accuracy: {}".format(metrics.accuracy_score(y["t1"], predicted))

Accuracy: 0.621587743733


### Apply the model to test data (`trim=2/3/4`)

In [12]:
# Check the accuracy of the model on the training & test sets

for t in trims:
#     predicted = np.where(reg_logit_sk.predict_proba(X[t]).T[1] < 0.5, 0, 1)
#     predicted = reg_logit_sk.predict(X[t])
    predicted = np.where(reg_logit_sm.predict(sm.add_constant(X[t])) < 0.5, 0, 1)
    print "Accuracy ({}): {}".format(t, metrics.accuracy_score(y[t], predicted))

# Note: the three methods for computing `predicted` are equivalent (THRESHOLD = 0.5)

Accuracy (t1): 0.621587743733
Accuracy (t2): 0.62790205349
Accuracy (t3): 0.635073655261
Accuracy (t4): 0.63317104088


---
## Marginal Effects

In [13]:
marginal_effects = pd.DataFrame()

### Analytically (using the derivative formula)
Recall that:
$$EM(x_1) = \frac{\partial{\mathbb{E}[y|x]}}{\partial{x_1}} = \beta_1 \Lambda'(\beta_0 + \beta_1 x_1 + ... )$$
where
$$\Lambda(x^T\beta) = \frac{e^{x^T\beta}}{1+e^{x^T\beta}} = P(y=1|x) = \mathbb{E}[y|x]$$
and
$$\Lambda'(x) = \frac{e^x}{(1+e^x)^2} = \Lambda(x)\Lambda(1-x) = \Lambda(x)\Lambda(-x)$$

**Note**: for each parameter, we calculate the *mean* marginal effect over the entire training dataset

In [14]:
def lambda_(x):
    return math.exp(x)/(1+math.exp(x))

def dlambda(x):
    return lambda_(x)*lambda_(-x)

In [15]:
y_fitted = X["t1"].dot(coeffs["sm"][:-1])
y_fitted += coeffs["sm"]["const"]
dlambda_y = y_fitted.map(dlambda)

In [16]:
marginal_effects["analytical"] = coeffs["sm"]*dlambda_y.mean()

### By "rule-of-thumb"
Note (from above) that:
$$EM(x_1) = \beta_1 \Lambda'(\beta_0 + \beta_1 x_1 + ... )$$
and 
$$\max \Lambda'(x) = \Lambda'(0) = \max \frac{e^0}{(1+e^0)^2} = \frac{1}{4}$$
Therefore, we have that:
$$EM(x_1) \approx \frac{\beta_1}{4}$$

The "rule-of-thumb" thus divides all regression coefficients by 4 to approximate the marginal effect.

In [17]:
marginal_effects["rule_of_thumb"] = coeffs["sm"]/4

### By modifying the dataset ("passage de tout le monde en licence")

In [18]:
brute_force = pd.Series()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1

#         proba_one = reg_logit_sk.predict_proba(X_one).T[1].mean()
#         proba_t1 = reg_logit_sk.predict_proba(X["t1"]).T[1].mean()
        proba_one = reg_logit_sm.predict(sm.add_constant(X_one, has_constant="add")).mean()
        proba_t1 = reg_logit_sm.predict(sm.add_constant(X["t1"])).mean()
        brute_force[pivot] = proba_one - proba_t1

In [19]:
marginal_effects["brute_force"] = brute_force
marginal_effects.transpose()

Unnamed: 0,age,tur5_1_,tur5_2_,tur5_3_,tur5_4_,female_,const
analytical,0.0068,-0.0052,0.0323,0.0555,0.0563,0.0907,-0.3738
rule_of_thumb,0.0075,-0.0057,0.0355,0.061,0.0619,0.0997,-0.4109
brute_force,-0.2964,-0.0362,0.0014,0.0246,0.0254,0.0435,


In [20]:
ratios = marginal_effects.copy()
ratios = ratios.div(ratios["analytical"], axis=0)
ratios.transpose()

Unnamed: 0,age,tur5_1_,tur5_2_,tur5_3_,tur5_4_,female_,const
analytical,1.0,1.0,1.0,1.0,1.0,1.0,1.0
rule_of_thumb,1.0993,1.0993,1.0993,1.0993,1.0993,1.0993,1.0993
brute_force,-43.3718,6.9393,0.0438,0.4437,0.4518,0.4797,
