# Logit Regressions

## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions

In [2]:
import statsmodels.api as sm
from sklearn import linear_model, metrics
import math

In [3]:
# Never truncate columns, display all the data
# NOTE(review): recent pandas releases reportedly reject negative widths and
# expect `None` for "no limit" — confirm against the installed pandas version.
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

# Temporary files to delete at the end of the experiment
# (paths are appended as files are created; the Cleanup section removes them)
temp_files = []

### Database

In [4]:
# Archive that holds the SQLite database
zip_filename = "../data/ee-insee-2015_custom-sqlite.zip"

# Unpack every member of the archive into the data directory
archive = zipfile.ZipFile(zip_filename)
with archive:
    archive.extractall("../data/")

# Path of the extracted database, derived from the archive name
# (assumes the archive contains a member with exactly this name — TODO confirm).
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

# Register the extracted file so that it gets deleted at the end
temp_files += [eedb]

In [5]:
# Load the whole `eec15_custom` table into a DataFrame.
# `with sqlite3.connect(...)` only scopes a transaction (commit/rollback); it
# does NOT close the connection. Close it explicitly so the file handle is
# released before the extracted database is deleted in the Cleanup section.
con = sqlite3.connect(eedb)
try:
    query = "SELECT * FROM eec15_custom"
    eec15 = pd.read_sql_query(query, con)
finally:
    con.close()

## Experiment

In [6]:
# Create a ("female" x "enfant") interaction variable:
# 1 when both `enfants_` and `female_` are truthy, 0 otherwise
bool_ = eec15[["enfants_", "female_"]].astype(bool)
eec15["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# Drop data we don't need: keep only the rows where `age60_` is 0, then
# remove the (now constant) column.
# The guard keeps the cell re-runnable once the column has been dropped, and
# `axis` is passed by keyword because newer pandas releases no longer accept
# it positionally.
if "age60_" in eec15.columns:
    eec15 = eec15[eec15.age60_ == 0]
    eec15 = eec15.drop("age60_", axis=1)

# TODO (not finished): Create ("region" x "diplome") interaction variables
# parameters = [x for x in eec15.columns if (x.startswith("dip") or x.startswith("region")) and x.endswith("_")]
# bool_ = eec15[parameters].astype(bool)
# bool_.head()

# Merge region3 and region4 to make them significant
# df["region3"] = np.where(df["region4"] == 1, 1, np.where(df["region3"] == 1, 1, 0))

### Logit Regression

In [7]:
# Build a list of parameters to include in the model, using regex
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
#
# The families are kept in an ordered list of (family, pattern) pairs rather
# than a dict, so that the resulting column order is the same on every Python
# version (plain dicts are unordered on older interpreters).
param_patterns = [
    ("age", "^age[0-9]{2}_$"),
    ("diploma", "^dip[0-9]{2}_$"),
    ("etranger", "^etranger_$"),
    ("domtom", "^domtom_$"),
#     ("trim", "^trim$"),
    ("female", "^female_$"),
    ("enfants", "^enfants_$"),
#     ("female_enfants", "^female_enfants_$"),
    ("region", "^region[1-2]_$"),
]

# For each family, the sorted list of matching columns
param_groups = [
    sorted(column for column in eec15.columns if re.match(pattern, column))
    for (family, pattern) in param_patterns
]

# Avoid the dummy variable trap: when a family matches several columns, the
# last one (in sorted order) is left out and acts as the reference category
param_groups = [
    (group if len(group) == 1 else group[:-1])
    for group in param_groups
]

# Convert to a flat list
params = [column for group in param_groups for column in group]

# The region pattern only matches `region1_` and `region2_`, so the trimming
# above removed `region2_`; add it back explicitly.
params += ["region2_"]

#### Train the model using `trim=1` data

In [8]:
# Training sample: the rows where `trim == 1`
is_t1 = eec15["trim"] == 1
eec15_t1 = eec15[is_t1]

# Regressors selected above, and the `actop_` column used as the target
X_t1, y_t1 = eec15_t1[params], eec15_t1["actop_"]

In [9]:
# Fit using `statsmodels` (the intercept column is added explicitly)
reg_logit_sm = sm.Logit(y_t1, sm.add_constant(X_t1)).fit(disp=False)

# Fit using `sklearn`
# The "sag" solver is stochastic: pin `random_state` so that re-running the
# notebook reproduces the same coefficients.
# NOTE(review): `LogisticRegression` applies an L2 penalty by default (C=1.0),
# whereas the `statsmodels` fit is unpenalised, so small differences between
# the two sets of coefficients are expected — confirm whether that is intended.
reg_logit_sk = linear_model.LogisticRegression(solver="sag", random_state=0).fit(X_t1, y_t1)

In [10]:
# Put the `statsmodels` and `sklearn` estimates side by side
# (they should be the same)
regressors = X_t1.columns.tolist() + ["const"]

# sklearn keeps slopes and intercept in two separate arrays: flatten them into
# one vector laid out like `regressors` (slopes first, intercept last)
sk_estimates = np.concatenate(
    [np.ravel(reg_logit_sk.coef_), np.ravel(reg_logit_sk.intercept_)]
)

coeffs = pd.DataFrame(index=regressors)
coeffs["sm"] = reg_logit_sm.params
coeffs["sk"] = pd.Series(sk_estimates, index=regressors)
coeffs["diff"] = coeffs["sm"].sub(coeffs["sk"])

# One column per regressor, in alphabetical order
coeffs.sort_index().T

Unnamed: 0,age15_,age30_,age40_,const,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,domtom_,enfants_,etranger_,female_,region1_,region2_
sm,1.5972,0.3117,-0.0758,-0.6978,-1.9053,-2.3686,-1.0517,-1.8161,-2.1377,-0.5214,-1.3976,-0.9673,0.1498,-0.1829,0.6042,-0.4556,0.6915,0.5176,0.2039,0.2614
sk,1.595,0.3095,-0.0767,-0.7008,-1.8974,-2.3517,-1.0374,-1.8078,-2.1186,-0.515,-1.3906,-0.9616,0.1555,-0.1774,0.6053,-0.4552,0.6924,0.5166,0.2029,0.2607
diff,0.0022,0.0022,0.0009,0.003,-0.0079,-0.017,-0.0143,-0.0083,-0.0191,-0.0064,-0.007,-0.0057,-0.0057,-0.0055,-0.0011,-0.0004,-0.001,0.0009,0.0009,0.0007


In [11]:
# Check the accuracy of the model on the training set
# (single-argument `print(...)` call: valid on both Python 2 and Python 3,
# unlike the Python 2-only print statement)
print("Accuracy: {}".format(reg_logit_sk.score(X_t1, y_t1)))

Accuracy: 0.762417968643


#### Apply the model to `trim=2` data

In [12]:
# Test sample: the rows where `trim == 2`
is_t2 = eec15["trim"] == 2
eec15_t2 = eec15[is_t2]

# Same regressors and target column as for the training sample
X_t2, y_t2 = eec15_t2[params], eec15_t2["actop_"]

In [13]:
# Check the accuracy of the model on the test set
# `predicted` reuses the index of `X_t2` so that each row stays aligned with
# the observation it was computed from (the default 0..n-1 index would not
# match `y_t2` in any label-based comparison or join).
# NOTE(review): the column labels assume the fitted classes are [0, 1] — confirm.
predicted = pd.DataFrame(
    reg_logit_sk.predict_proba(X_t2),
    columns=["p(y=0)", "p(y=1)"],
    index=X_t2.index,
)
predicted["y"] = reg_logit_sk.predict(X_t2)

# Single-argument `print(...)` call: valid on both Python 2 and Python 3
print("Accuracy: {}".format(metrics.accuracy_score(y_t2, predicted["y"])))

Accuracy: 0.767521534847


### Marginal Effects

In [14]:
# Empty frame; each approach below adds one column of marginal effects to it
marginal_effects = pd.DataFrame()

#### By "rule-of-thumb"
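The "rule-of-thumb" divides every regression coefficient by 4 to approximate its marginal effect: the derivative of the logistic function is at most 1/4 (reached at 0), so this gives an upper bound on the magnitude of the effect.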

In [15]:
# Rule of thumb: a quarter of each `statsmodels` coefficient
quarter = 4
marginal_effects["rule_of_thumb"] = coeffs["sm"].div(quarter)

#### Analytically (using the derivative formula)
Recall that:
$$EM(x_1) = \frac{\partial{\mathbb{E}[y|x]}}{\partial{x_1}} = \beta_1 \Lambda'(\beta_0 + \beta_1 x_1 + ... )$$
where
$$ \Lambda(x) = \frac{e^x}{1+e^x} = P(y=1|x)$$
and
$$\Lambda'(x) = \Lambda(x)\left(1-\Lambda(x)\right) = \Lambda(x)\Lambda(-x)$$

**Note**: for each parameter, we calculate the *mean* marginal effect over the entire training dataset

In [16]:
def lambda_(x):
    """Logistic function: e^x / (1 + e^x).

    Evaluated in a numerically stable way: `math.exp` is only ever called on a
    non-positive argument, so large positive `x` no longer raises
    `OverflowError` (the result simply saturates at 1.0, or at 0.0 for large
    negative `x`).

    Parameters
    ----------
    x : float
        Point at which to evaluate the function.

    Returns
    -------
    float
        Value in the interval [0, 1].
    """
    if x >= 0:
        # Equivalent form 1 / (1 + e^-x): the exponent is <= 0, no overflow
        return 1.0/(1.0+math.exp(-x))
    # x < 0: e^x is below 1, the original formula is safe
    exp_x = math.exp(x)
    return exp_x/(1.0+exp_x)

def dlambda(x):
    """Derivative of the logistic function at `x`, computed as lambda_(x) * lambda_(-x)."""
    at_x = lambda_(x)
    at_minus_x = lambda_(-x)
    return at_x*at_minus_x

In [17]:
# `statsmodels` estimates: one slope per regressor plus the "const" intercept
betas = coeffs["sm"]

# Linear predictor for each training observation: slopes applied to the
# regressors, plus the intercept
y_fitted = X_t1.dot(betas.drop("const")) + betas["const"]

# Derivative of the logistic function evaluated at each linear predictor
dlambda_y = y_fitted.apply(dlambda)

In [18]:
# Mean marginal effect: each coefficient scaled by the average of the
# logistic derivative over the training observations
mean_dlambda = dlambda_y.mean()
marginal_effects["analytical"] = coeffs["sm"].mul(mean_dlambda)

#### By modifying the dataset ("passage de tout le monde en licence")
TODO

In [19]:
# Display the marginal-effect estimates of each approach side by side
marginal_effects

Unnamed: 0,rule_of_thumb,analytical
etranger_,0.1729,0.1148
age15_,0.3993,0.2652
age30_,0.0779,0.0518
age40_,-0.019,-0.0126
dip10_,-0.4763,-0.3164
dip11_,-0.5922,-0.3934
dip30_,-0.2629,-0.1747
dip31_,-0.454,-0.3016
dip33_,-0.5344,-0.355
dip41_,-0.1303,-0.0866


## Cleanup

Now that the experiments have concluded, we delete all the "temporary" files.

In [20]:
# Delete every registered temporary file.
# Files that are already gone are skipped, so the cell can be re-run without
# raising on a path that was removed by a previous execution.
for temp in temp_files:
    if os.path.exists(temp):
        os.remove(temp)