# Simple Trees

---
## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions

In [2]:
from sklearn import ensemble, metrics

In [3]:
# Never truncate columns, display all the data
from IPython.display import display, HTML
# Never truncate cell contents when displaying a DataFrame.
# pandas >= 1.0 spells "unlimited width" as None (and recent releases reject
# the legacy -1 sentinel), whereas older releases only accept an integer, so
# try the modern spelling first and fall back to the legacy one.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

### Database

In [4]:
# The SQLite database ships zipped; its path is the archive path with the
# "-sqlite.zip" suffix swapped for ".sqlite".
zip_filename = "../data/ee-insee-2015_custom-sqlite.zip"
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

# Unpack the archive only when the database file is not on disk yet.
if not os.path.exists(eedb):
    zip_file = zipfile.ZipFile(zip_filename)
    try:
        zip_file.extractall("../data/")
    finally:
        zip_file.close()

In [5]:
# Load the whole `eec15_custom` table into a DataFrame.
# `with sqlite3.connect(...)` only wraps a transaction (commit/rollback) and
# leaves the connection open, so close it explicitly once the read is done.
con = sqlite3.connect(eedb)
try:
    query = "SELECT * FROM eec15_custom"
    eec15 = pd.read_sql_query(query, con)
finally:
    con.close()

In [6]:
# Create a ("female" x "enfant") interaction variable:
# 1 when both `female_` and `enfants_` are truthy, 0 otherwise
bool_ = eec15[["enfants_", "female_"]].astype(bool)
eec15["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# Drop data we don't need: keep only the rows with `age60_ == 0`, then remove
# the column itself (it is constant after the filter).
# `axis` is passed by keyword: the positional form `drop(label, 1)` is
# deprecated and no longer accepted by recent pandas releases.
eec15 = eec15[eec15.age60_ == 0]
eec15 = eec15.drop("age60_", axis=1)

---
## Random Forest

In [7]:
# Select the model's explanatory variables: one regular expression per
# category, matched against the column names of `eec15`.
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
filters = {
    "age": "^age[0-9]{2}_$",
    "diploma": "^dip[0-9]{2}_$",
    "etranger": "^etranger_$",
    "domtom": "^domtom_$",
    # "trim" is deliberately left out:           "^trim$"
    "female": "^female_$",
    "enfants": "^enfants_$",
    # "female_enfants" is deliberately left out: "^female_enfants_$"
    "region": "^region[1-2]_$"
}

# Category -> alphabetically sorted list of the matching columns
params = dict(
    (category, sorted(column for column in eec15.columns if re.match(pattern, column)))
    for (category, pattern) in filters.items()
)

# Avoid the dummy variable trap: whenever a category spans several dummy
# columns, leave out the last one (in sorted order) as the reference level
params = dict(
    (category, columns if len(columns) == 1 else columns[:-1])
    for (category, columns) in params.items()
)
# "region2_" is explicitly (re-)appended to the region columns afterwards
params["region"].append("region2_")

In [8]:
# Flat list of every selected explanatory column, category after category
feature_columns = list(column for columns in params.values() for column in columns)

# One label per distinct value of `trim`, in ascending order: "t1", "t2", ...
trims = ["t{}".format(quarter) for quarter in sorted(eec15["trim"].unique())]

# Per-label sub-samples: full rows, feature matrix and target (`actop_`).
# The value compared against `trim` is the single character following the "t".
eec15_ = dict((label, eec15[eec15.trim == int(label[1])]) for label in trims)
X = dict((label, frame[feature_columns]) for (label, frame) in eec15_.items())
y = dict((label, frame["actop_"]) for (label, frame) in eec15_.items())

### Train the model using `trim=1` data

In [9]:
# Create & fit the forest
random_forest_sk = ensemble.RandomForestClassifier(
    criterion='gini',
    n_estimators=100,
    max_features=4,
    max_depth=12,
    random_state=1,
    n_jobs=-1
).fit(X["t1"], y["t1"])

In [10]:
# Check the accuracy of the model on the training set (`t1`).
# `print` is called with a single parenthesised argument so that the cell runs
# unchanged on both Python 2 and Python 3 kernels.
print("Accuracy: {}".format(random_forest_sk.score(X["t1"], y["t1"])))

Accuracy: 0.773977868695


### Apply the model to test data (`trim=2/3/4`)

In [11]:
# Check the accuracy of the model on the training & test sets.
# For every sample, report the accuracy of the predicted classes and the
# ROC AUC of the predicted probability of the second class (column 1 of
# `predict_proba`).
# `print` is called with a single parenthesised argument so that the cell runs
# unchanged on both Python 2 and Python 3 kernels.

for t in trims:
    predicted = random_forest_sk.predict(X[t])
    print("Accuracy ({}): {}".format(t, metrics.accuracy_score(y[t], predicted)))

    predicted_proba = random_forest_sk.predict_proba(X[t]).T[1]
    print("ROC Area-Under-Curve ({}): {}".format(t, metrics.roc_auc_score(y[t], predicted_proba)))

# Note: `predicted` could equivalently be computed by thresholding the
# probabilities (THRESHOLD = 0.5):
#     np.where(random_forest_sk.predict_proba(X[t]).T[1] < 0.5, 0, 1)

Accuracy (t1): 0.773977868695
ROC Area-Under-Curve (t1): 0.81259853703
Accuracy (t2): 0.778372860499
ROC Area-Under-Curve (t2): 0.812627942292
Accuracy (t3): 0.770331172041
ROC Area-Under-Curve (t3): 0.805320741
Accuracy (t4): 0.771684959321
ROC Area-Under-Curve (t4): 0.809385186715


---
## Marginal Effects

In [12]:
# Empty frame that collects the marginal-effect estimates, one column per method
marginal_effects = pd.DataFrame()

### By modifying the dataset ("passage de tout le monde en licence")

In [13]:
# "Brute force" marginal effects: for every selected dummy column (the pivot),
# switch the pivot on and its sibling dummies off for the whole `t1` sample,
# then compare the mean predicted probability of the second class with the
# one obtained on the unmodified sample.

# The baseline does not depend on the pivot, so compute it once up front
# instead of re-predicting the whole sample at every iteration.
proba_t1 = random_forest_sk.predict_proba(X["t1"]).T[1].mean()

pivots = []
effects = []
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]

        # Work on a copy so that the original sample is left untouched
        X_one = X["t1"].copy()
        X_one[non_pivots] = 0
        X_one[pivot] = 1

        proba_one = random_forest_sk.predict_proba(X_one).T[1].mean()
        pivots.append(pivot)
        effects.append(proba_one - proba_t1)

# Build the Series in one go, with an explicit dtype: a bare `pd.Series()`
# without dtype is deprecated in recent pandas releases.
brute_force = pd.Series(effects, index=pivots, dtype="float64")

In [14]:
# File the brute-force estimates under their own column ...
marginal_effects["brute_force"] = brute_force
# ... and show them as a single wide row (one column per variable)
marginal_effects.T

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
brute_force,0.1374,0.151,-0.0951,-0.1232,-0.1474,-0.1697,0.0181,-0.12,-0.1555,0.0552,-0.0563,-0.0212,0.1192,0.1207,0.0334,-0.0798,0.0004,0.0124,0.0972


---
## Most important parameters
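_ranked by **entropy importance**_ (**TODO**: where did Bastien get this?! The forest above is fit with `criterion='gini'`, so the `feature_importances_` shown here are impurity-based (Gini) importances rather than entropy-based ones.)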

In [15]:
# One importance score per feature, labelled with the `t1` column names.
# NOTE(review): the column is called "entropy", but the forest is created with
# criterion='gini' earlier in this notebook - confirm the intended label.
entropy = pd.DataFrame(
    {"entropy": random_forest_sk.feature_importances_},
    index=X["t1"].columns
)
# Most important feature first, displayed as a single wide row
entropy.sort_values(by="entropy", ascending=False).T

Unnamed: 0,age15_,dip60_,enfants_,dip10_,domtom_,etranger_,female_,dip31_,age40_,dip42_,dip11_,dip41_,dip50_,age30_,dip33_,region1_,region2_,dip70_,dip30_
entropy,0.373,0.1113,0.083,0.0579,0.0518,0.0469,0.0403,0.0384,0.0369,0.028,0.0267,0.0258,0.0246,0.0228,0.0151,0.0069,0.0062,0.0029,0.0015
