# Boosting

---
## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions
import datetime        # Date/Time functions

In [2]:
from sklearn import ensemble, metrics, tree

In [3]:
# Never truncate columns, display all the data
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

### Database

In [4]:
zip_filename = "../../data/ee-insee-2015_custom-sqlite.zip"
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

if not os.path.exists(eedb):
    with zipfile.ZipFile(zip_filename) as zip_file:
        zip_file.extractall("../../data/")

In [5]:
with sqlite3.connect(eedb) as con:
    query = "SELECT * FROM eec15_custom"
    eec15 = pd.read_sql_query(query, con)

In [6]:
# Create a ("female" x "enfant") interaction variable
bool_ = eec15[["enfants_", "female_"]].astype(bool)
eec15["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# Drop data we don't need
eec15 = eec15[eec15.age60_ == 0]
eec15 = eec15.drop("age60_", 1)

---
## Boosting


In [7]:
# Build a list of parameters to include in the model, using regex
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
filters = {
    "age": "^age[0-9]{2}_$",
    "diploma": "^dip[0-9]{2}_$",
    "etranger": "^etranger_$",
    "domtom": "^domtom_$",
#     "trim": "^trim$",
    "female": "^female_$",
    "enfants": "^enfants_$",
#     "female_enfants": "^female_enfants_$",
    "region": "^region[1-2]_$"
}
params = {k: sorted([x for x in eec15.columns if re.match(r, x)]) for (k, r) in filters.items()}

# Avoid the dummy variable trap
params = {k: (v if len(v) == 1 else v[:-1]) for (k, v) in params.items()}
params["region"] += ["region2_"]

In [8]:
trims = ["t{}".format(x) for x in sorted(eec15["trim"].unique())]
eec15_ = {t: eec15[eec15.trim == int(t[1])] for t in trims}
X = {t: eec15_[t][sum(params.values(), [])] for t in eec15_}
y = {t: eec15_[t]["actop_"] for t in eec15_}

### Train the model using `trim=1` data

In [9]:
simple_tree_sk = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10,random_state=0).fit(X["t1"], y["t1"])
boost_sk = ensemble.AdaBoostClassifier(
    base_estimator=simple_tree_sk,
    n_estimators=100,
    learning_rate=0.1,
    random_state=0
).fit(X["t1"], y["t1"])

### Apply the model to test data (`trim=2/3/4`)

In [10]:
for t in trims:
#     predicted = random_forest_sk.predict_proba(X[t]).T[1]
#     predicted = np.where(predicted < 0.5, 0, 1)
    predicted = boost_sk.predict(X[t])
    print "Accuracy ({}): {}".format(t, metrics.accuracy_score(y[t], predicted))

# Note: the two methods for computing `predicted` are equivalent (THRESHOLD = 0.5)

Accuracy (t1): 0.777410143057
Accuracy (t2): 0.780638214565
Accuracy (t3): 0.772148402291
Accuracy (t4): 0.773828962501


In [11]:
for t in trims:
    predicted_proba = boost_sk.predict_proba(X[t]).T[1]
    print "ROC Area-Under-Curve ({}): {}".format(t, metrics.roc_auc_score(y[t], predicted_proba))

ROC Area-Under-Curve (t1): 0.820594780348
ROC Area-Under-Curve (t2): 0.818967326384
ROC Area-Under-Curve (t3): 0.809561344427
ROC Area-Under-Curve (t4): 0.812219738034


### Accuracy of prediction of `1` and `0`

In [12]:
for actop_ in [0, 1]:
    print "===== actop_ = {} =====".format(actop_)
    for t in trims:
        y_zero = y[t][y[t] == actop_]
        X_zero = X[t][X[t].index.map(lambda x: x in y_zero)]
    
        predicted = boost_sk.predict(X_zero)
        print "Accuracy ({}): {}".format(t, metrics.accuracy_score(y_zero, predicted))
    print

===== actop_ = 0 =====
Accuracy (t1): 0.916532497246
Accuracy (t2): 0.91491410111
Accuracy (t3): 0.912740081079
Accuracy (t4): 0.915471813059

===== actop_ = 1 =====
Accuracy (t1): 0.521221814777
Accuracy (t2): 0.527269791288
Accuracy (t3): 0.503742863185
Accuracy (t4): 0.507765244401



---
## Marginal Effects

###  By modifying the dataset ("passage de tout le monde en licence")

In [13]:
brute_force = pd.DataFrame()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1
        
        # Note mean() on the other dimension as compared to simple tree
        # This makes no difference on the mean(), but gives us the right std()
        proba_one = boost_sk.predict_proba(X_one).T[1]
        proba_t1 = boost_sk.predict_proba(X["t1"]).T[1]
        
        brute_force[pivot] = proba_one - proba_t1    # these are now vectors

In [14]:
marginal_effects = pd.DataFrame()
marginal_effects["brute_force_mean"] = brute_force.mean()
marginal_effects["brute_force_std"] = brute_force.std()
marginal_effects.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
brute_force_mean,0.0136,0.0116,-0.0136,-0.0223,-0.0368,-0.0995,-0.0617,-0.0279,-0.1676,-0.0021,-0.0107,0.0014,0.0251,0.0384,0.0096,-0.0196,0.0032,-0.0231,-0.0066
brute_force_std,0.1632,0.095,0.0846,0.0824,0.1062,0.1815,0.2029,0.1061,0.2283,0.1117,0.0809,0.0753,0.1016,0.1847,0.0536,0.0843,0.051,0.1432,0.1413


---
## Mean Marginal Effects (*bootstrap*)

We use the *bootstrap* method to calculate a confidence interval for our marginal effects. The *bootstrap* method consists in successive **random samplings with replacement** of the original database.

**Note: this is computationally very intensive – be careful or else**

### Computation

In [15]:
n = 500                           # number of iterations
k = eec15.index.size              # number of samples per iteration

In [16]:
index = sum(params.values(), [])
marginal_effects_iterations = pd.DataFrame(index=index)

start = datetime.datetime.now()
print "start: {}".format(start)

for i in range(n):  
    # take a sample of the DataFrame
    eec15_sample = eec15.sample(k, replace=True)

    # separate parameters and output
    X_sample = eec15_sample[index]
    y_sample = eec15_sample["actop_"]

    # build the boost for the sample
    sample_boost_sk = ensemble.AdaBoostClassifier(
        base_estimator=simple_tree_sk,
        n_estimators=100,
        learning_rate=0.1,
        random_state=0
    ).fit(X_sample, y_sample)

    # calculate marginal effects for that sample
    brute_force = pd.Series()
    for category in filters:
        for pivot in params[category]:
            non_pivots = [x for x in params[category] if x != pivot]
        
            X_one = X_sample.copy() 
            X_one[non_pivots] = 0
            X_one[pivot] = 1
        
            proba_one = sample_boost_sk.predict_proba(X_one).T[1].mean()
            proba_base = sample_boost_sk.predict_proba(X_sample).T[1].mean()
            brute_force[pivot] = proba_one - proba_base
    
    # store the results in the dataframe
    marginal_effects_iterations[i] = brute_force

In [17]:
marginal_effects_iterations.to_csv("csv/boosting_{}.csv".format(n))

### Mean/Standard Deviation (across iterations)

In [18]:
marginal_effects_summary = pd.DataFrame()
marginal_effects_summary["mean"] = marginal_effects_iterations.mean(axis=1)
marginal_effects_summary["std"] = marginal_effects_iterations.std(axis=1)

marginal_effects_summary.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
mean,0.0131,0.0143,-0.0132,-0.0202,-0.0342,-0.0787,-0.0495,-0.0228,-0.1112,0.0007,-0.0105,0.0001,0.0215,0.0437,0.0086,-0.0187,0.0021,-0.0167,-0.0041
std,0.0031,0.0027,0.0018,0.0006,0.0021,0.0041,0.0085,0.0023,0.0087,0.002,0.0008,0.0005,0.0011,0.0078,0.0003,0.0024,0.0002,0.0022,0.0019


### 95% Confidence Interval

In [19]:
interval_size = 0.99
alpha = (1 - interval_size)/2
ix = int(round(alpha*n))      # ix has to be an integer

interval = pd.DataFrame(index=["lower", "upper"])
for category in filters:
    for pivot in params[category]:
        me_pivot = marginal_effects_iterations.loc[pivot].sort_values()
        interval[pivot] = [me_pivot.iloc[ix], me_pivot.iloc[-ix]]

interval

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
lower,0.0063,0.0079,-0.0183,-0.0219,-0.0403,-0.0896,-0.0669,-0.0286,-0.1554,-0.0046,-0.0124,-0.001,0.0189,0.0241,0.008,-0.0256,0.0016,-0.0223,-0.0097
upper,0.0219,0.0187,-0.0089,-0.0187,-0.03,-0.0706,-0.0253,-0.0173,-0.0974,0.0051,-0.0084,0.0015,0.0243,0.0652,0.0094,-0.0147,0.0027,-0.0111,-0.0004


---
## Most important parameters
_ranked by **entropy importance**_ (**TODO**: where did Bastien get this?!)

In [20]:
entropy = pd.DataFrame(boost_sk.feature_importances_, index=X["t1"].columns, columns=["entropy"])
#entropy_sorted = entropy.sort_values(by="entropy", ascending=False).transpose()
entropy.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
entropy,0.0623,0.1237,0.0559,0.0389,0.0421,0.0193,0.0126,0.0273,0.019,0.0384,0.0679,0.0667,0.0586,0.0098,0.0583,,0.0516,,


---
# Odds Ratios

The odds ratios formula is:
$$\frac{p/(1-p)}{q/(1-q)} = \frac{p(1-q)}{q(1-p)}$$

where $p$ and $q$ are the probability of being **unemployed**.

In [21]:
odds = pd.Series()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1

        proba_one = boost_sk.predict_proba(X_one).T[1].mean()
        odds[pivot] = proba_one/(1-proba_one)  

In [22]:
odds_ratios = pd.DataFrame(index=odds.index, columns=odds.index)
for col in odds.index:
    for row in odds.index:
        odds_ratios[col][row] = odds[col]/odds[row]

In [23]:
# odds_ratios     # col, row = odds[col]/odds[row]
odds_ratios.T   # row, col = odds[row]/odds[col]  

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
etranger_,1.0,1.0078,1.1153,1.1547,1.2249,1.5879,1.3558,1.1812,2.1495,1.0648,1.1023,1.0498,0.9549,0.9054,1.0161,1.1424,1.0424,1.1586,1.0841
age15_,0.9922,1.0,1.1066,1.1458,1.2154,1.5756,1.3453,1.172,2.1329,1.0565,1.0938,1.0417,0.9475,0.8983,1.0082,1.1335,1.0343,1.1496,1.0757
age30_,0.8967,0.9037,1.0,1.0354,1.0983,1.4238,1.2157,1.0591,1.9274,0.9548,0.9884,0.9413,0.8562,0.8118,0.9111,1.0243,0.9347,1.0389,0.972
age40_,0.866,0.8728,0.9658,1.0,1.0607,1.3751,1.1741,1.0229,1.8615,0.9221,0.9546,0.9091,0.8269,0.784,0.8799,0.9893,0.9027,1.0033,0.9388
dip10_,0.8164,0.8228,0.9105,0.9427,1.0,1.2964,1.1068,0.9643,1.7549,0.8693,0.8999,0.8571,0.7796,0.7391,0.8295,0.9327,0.851,0.9459,0.885
dip11_,0.6298,0.6347,0.7023,0.7272,0.7714,1.0,0.8538,0.7438,1.3537,0.6706,0.6942,0.6611,0.6013,0.5702,0.6399,0.7194,0.6565,0.7296,0.6827
dip30_,0.7376,0.7434,0.8226,0.8517,0.9035,1.1712,1.0,0.8712,1.5855,0.7854,0.8131,0.7743,0.7043,0.6678,0.7495,0.8426,0.7688,0.8546,0.7996
dip31_,0.8466,0.8532,0.9442,0.9776,1.037,1.3444,1.1478,1.0,1.8198,0.9015,0.9333,0.8888,0.8084,0.7665,0.8603,0.9672,0.8825,0.9809,0.9178
dip33_,0.4652,0.4689,0.5188,0.5372,0.5698,0.7387,0.6307,0.5495,1.0,0.4954,0.5128,0.4884,0.4442,0.4212,0.4727,0.5315,0.4849,0.539,0.5043
dip41_,0.9391,0.9465,1.0474,1.0845,1.1504,1.4913,1.2733,1.1093,2.0187,1.0,1.0353,0.9859,0.8968,0.8503,0.9543,1.0729,0.979,1.0881,1.0181


---
## Odds Ratios (2)
**TODO**: add the other Odds Ratios formula

In [24]:
odds_ratios_2 = pd.DataFrame()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1
        proba_one = boost_sk.predict_proba(X_one).T[1].mean()
        odds_one = proba_one/(1-proba_one)
        
        X_zero = X["t1"].copy()
        X_zero[non_pivots] = 0
        X_zero[pivot] = 0
        proba_zero = boost_sk.predict_proba(X_zero).T[1].mean()
        odds_zero = proba_zero/(1-proba_zero)
        
        odds_ratios_2.loc[pivot, "ratio"] = odds_one/odds_zero

odds_ratios_2.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
ratio,1.0608,1.1122,1.0051,0.9707,0.7707,0.5945,0.6963,0.7992,0.4392,0.8866,0.8564,0.8992,0.9886,1.0427,1.1075,0.9082,1.1594,1.0431,0.9779
