# Random Forest

---
## Experimental Setup

### Libraries & Settings

In [1]:
import os              # General OS commands
import numpy as np     # NumPy
import pandas as pd    # Python Data Analysis Library
import zipfile         # Compress/decompress ZIP files
import sqlite3         # SQLite3 Database Driver
import re              # Regular Expressions
import datetime        # Date/Time functions

In [2]:
from sklearn import ensemble, metrics

In [3]:
# Never truncate columns, display all the data
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', -1)

# Display floating-point numbers with 4 decimals in `pandas.DataFrame`
pd.options.display.float_format = '{:,.4f}'.format

import matplotlib.pyplot as plt
# Display MatPlotLib stuff inline
%matplotlib inline

### Database

In [4]:
zip_filename = "../../data/ee-insee-2015_custom-sqlite.zip"
eedb = zip_filename.replace("-sqlite.zip", ".sqlite")

if not os.path.exists(eedb):
    with zipfile.ZipFile(zip_filename) as zip_file:
        zip_file.extractall("../../data/")

In [5]:
with sqlite3.connect(eedb) as con:
    query = "SELECT * FROM eec15_custom"
    eec15 = pd.read_sql_query(query, con)

In [6]:
# Create a ("female" x "enfant") interaction variable
bool_ = eec15[["enfants_", "female_"]].astype(bool)
eec15["female_enfants_"] = (bool_.enfants_ & bool_.female_).astype(int)

# Drop data we don't need
eec15 = eec15[eec15.age60_ == 0]
eec15 = eec15.drop("age60_", 1)

---
## Random Forest

In [7]:
# Build a list of parameters to include in the model, using regex
# https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
filters = {
    "age": "^age[0-9]{2}_$",
    "diploma": "^dip[0-9]{2}_$",
    "etranger": "^etranger_$",
    "domtom": "^domtom_$",
#     "trim": "^trim$",
    "female": "^female_$",
    "enfants": "^enfants_$",
#     "female_enfants": "^female_enfants_$",
    "region": "^region[1-2]_$"
}
params = {k: sorted([x for x in eec15.columns if re.match(r, x)]) for (k, r) in filters.items()}

# Avoid the dummy variable trap
params = {k: (v if len(v) == 1 else v[:-1]) for (k, v) in params.items()}
params["region"] += ["region2_"]

In [8]:
trims = ["t{}".format(x) for x in sorted(eec15["trim"].unique())]
eec15_ = {t: eec15[eec15.trim == int(t[1])] for t in trims}
X = {t: eec15_[t][sum(params.values(), [])] for t in eec15_}
y = {t: eec15_[t]["actop_"] for t in eec15_}

### Train the model using `trim=1` data

In [9]:
# Create & fit the forest
random_forest_sk = ensemble.RandomForestClassifier(
    criterion='gini',
    n_estimators=100,
    max_features=4,
    max_depth=12,
    random_state=1,
    n_jobs=-1
).fit(X["t1"], y["t1"])

### Apply the model to test data (`trim=2/3/4`)

In [10]:
for t in trims:
#     predicted = random_forest_sk.predict_proba(X[t]).T[1]
#     predicted = np.where(predicted < 0.5, 0, 1)
    predicted = random_forest_sk.predict(X[t])
    print "Accuracy ({}): {}".format(t, metrics.accuracy_score(y[t], predicted))

# Note: the two methods for computing `predicted` are equivalent (THRESHOLD = 0.5)

Accuracy (t1): 0.773977868695
Accuracy (t2): 0.778372860499
Accuracy (t3): 0.770331172041
Accuracy (t4): 0.771684959321


In [11]:
for t in trims:
    predicted_proba = random_forest_sk.predict_proba(X[t]).T[1]
    print "ROC Area-Under-Curve ({}): {}".format(t, metrics.roc_auc_score(y[t], predicted_proba))

ROC Area-Under-Curve (t1): 0.81259853703
ROC Area-Under-Curve (t2): 0.812627942292
ROC Area-Under-Curve (t3): 0.805320741
ROC Area-Under-Curve (t4): 0.809385186715


### Accuracy of prediction of `1` and `0`

In [12]:
for actop_ in [0, 1]:
    print "===== actop_ = {} =====".format(actop_)
    for t in trims:
        y_zero = y[t][y[t] == actop_]
        X_zero = X[t][X[t].index.map(lambda x: x in y_zero)]
    
        predicted = random_forest_sk.predict(X_zero)
        print "Accuracy ({}): {}".format(t, metrics.accuracy_score(y_zero, predicted))
    print

===== actop_ = 0 =====
Accuracy (t1): 0.929073807304
Accuracy (t2): 0.927836374917
Accuracy (t3): 0.9261868368
Accuracy (t4): 0.92848284415

===== actop_ = 1 =====
Accuracy (t1): 0.488374814699
Accuracy (t2): 0.496346534254
Accuracy (t3): 0.472784943963
Accuracy (t4): 0.477153833579



---
## Marginal Effects

### By modifying the dataset ("passage de tout le monde en licence")

In [13]:
brute_force = pd.DataFrame()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1
        
        # Note mean() on the other dimension as compared to simple tree
        # This makes no difference on the mean(), but gives us the right std()
        proba_one = random_forest_sk.predict_proba(X_one).T[1]
        proba_t1 = random_forest_sk.predict_proba(X["t1"]).T[1]
        
        brute_force[pivot] = proba_one - proba_t1    # these are now vectors

In [14]:
marginal_effects = pd.DataFrame()
marginal_effects["brute_force_mean"] = brute_force.mean()
marginal_effects["brute_force_std"] = brute_force.std()
marginal_effects.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
brute_force_mean,0.1374,0.151,-0.0951,-0.1232,-0.1474,-0.1697,0.0181,-0.12,-0.1555,0.0552,-0.0563,-0.0212,0.1192,0.1207,0.0334,-0.0798,0.0004,0.0124,0.0972
brute_force_std,0.125,0.145,0.1999,0.1844,0.1621,0.206,0.1406,0.1675,0.1914,0.1434,0.1426,0.1567,0.1631,0.1496,0.0623,0.1456,0.0233,0.0618,0.0937


---
## Mean Marginal Effects (*bootstrap*)

We use the *bootstrap* method to calculate a confidence interval for our marginal effects. The *bootstrap* method consists in successive **random samplings with replacement** of the original database.

**Note: this is computationally very intensive – be careful or else**

### Computation

In [15]:
n = 1000                          # number of iterations
k = eec15.index.size              # number of samples per iteration

In [16]:
index = sum(params.values(), [])
marginal_effects_iterations = pd.DataFrame(index=index)

start = datetime.datetime.now()
print "start: {}".format(start)

for i in range(n):  
    # take a sample of the DataFrame
    eec15_sample = eec15.sample(k, replace=True)

    # separate parameters and output
    X_sample = eec15_sample[index]
    y_sample = eec15_sample["actop_"]

    # build the simple tree for the sample
    sample_random_forest_sk = ensemble.RandomForestClassifier(criterion='gini', 
                                   n_estimators=100,
                                   max_features=4,
                                   max_depth=12,
                                   random_state=1,
                                   n_jobs=-1
                               ).fit(X_sample, y_sample)

    # calculate marginal effects for that sample
    brute_force = pd.Series()
    for category in filters:
        for pivot in params[category]:
            non_pivots = [x for x in params[category] if x != pivot]
        
            X_one = X_sample.copy() 
            X_one[non_pivots] = 0
            X_one[pivot] = 1
        
            proba_one = sample_random_forest_sk.predict_proba(X_one).T[1].mean()
            proba_base = sample_random_forest_sk.predict_proba(X_sample).T[1].mean()
            brute_force[pivot] = proba_one - proba_base
    
    # store the results in the dataframe
    marginal_effects_iterations[i] = brute_force

In [17]:
marginal_effects_iterations.to_csv("csv/random_forest_{}.csv".format(n))

### Mean/Standard Deviation (across iterations)

In [18]:
marginal_effects_summary = pd.DataFrame()
marginal_effects_summary["mean"] = marginal_effects_iterations.mean(axis=1)
marginal_effects_summary["std"] = marginal_effects_iterations.std(axis=1)

marginal_effects_summary.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
mean,0.1347,0.1471,-0.0948,-0.1242,-0.1414,-0.1554,0.0194,-0.1189,-0.1493,0.0574,-0.054,-0.0218,0.1182,0.1266,0.0339,-0.0851,0.0002,0.0102,0.1027
std,0.0036,0.0024,0.0019,0.0017,0.0028,0.0059,0.0061,0.0033,0.0075,0.0025,0.0028,0.0024,0.0024,0.0073,0.0008,0.002,0.0003,0.0022,0.0025


### 95% Confidence Interval

In [19]:
interval_size = 0.99
alpha = (1 - interval_size)/2
ix = int(round(alpha*n))      # ix has to be an integer

interval = pd.DataFrame(index=["lower", "upper"])
for category in filters:
    for pivot in params[category]:
        me_pivot = marginal_effects_iterations.loc[pivot].sort_values()
        interval[pivot] = [me_pivot.iloc[ix], me_pivot.iloc[-ix]]

interval

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
lower,0.1258,0.1408,-0.0992,-0.1288,-0.1487,-0.1703,0.0037,-0.1271,-0.1696,0.0511,-0.0618,-0.0275,0.1119,0.1099,0.0322,-0.0897,-0.0005,0.0047,0.0966
upper,0.1435,0.1539,-0.0898,-0.1201,-0.1345,-0.1408,0.0351,-0.1113,-0.1305,0.0638,-0.0465,-0.0157,0.125,0.1471,0.036,-0.0798,0.0009,0.0165,0.1091


---
## Most important parameters
_ranked by **entropy importance**_ (**TODO**: where did Bastien get this?!)

In [20]:
entropy = pd.DataFrame(random_forest_sk.feature_importances_, index=X["t1"].columns, columns=["entropy"])
entropy_sorted = entropy.sort_values(by="entropy", ascending=False).transpose()

---
# Odds Ratios

The odds ratios formula is:
$$\frac{p/(1-p)}{q/(1-q)} = \frac{p(1-q)}{q(1-p)}$$

where $p$ and $q$ are the probability of being **unemployed**.

In [21]:
odds = pd.Series()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1

        proba_one = random_forest_sk.predict_proba(X_one).T[1].mean()
        odds[pivot] = proba_one/(1-proba_one)  

In [22]:
odds_ratios = pd.DataFrame(index=odds.index, columns=odds.index)
for col in odds.index:
    for row in odds.index:
        odds_ratios[col][row] = odds[col]/odds[row]

In [23]:
# odds_ratios     # col, row = odds[col]/odds[row]
odds_ratios.T   # row, col = odds[row]/odds[col]  

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
etranger_,1.0,0.9468,2.7728,3.2303,3.7249,4.2997,1.631,3.1733,3.9199,1.3952,2.2822,1.9386,1.0756,1.0691,1.5281,2.5624,1.7613,1.6715,1.1753
age15_,1.0561,1.0,2.9285,3.4117,3.934,4.5411,1.7226,3.3514,4.14,1.4736,2.4103,2.0475,1.136,1.1292,1.6139,2.7063,1.8602,1.7654,1.2413
age30_,0.3606,0.3415,1.0,1.165,1.3434,1.5507,0.5882,1.1444,1.4137,0.5032,0.8231,0.6992,0.3879,0.3856,0.5511,0.9241,0.6352,0.6028,0.4239
age40_,0.3096,0.2931,0.8584,1.0,1.1531,1.331,0.5049,0.9823,1.2135,0.4319,0.7065,0.6001,0.333,0.331,0.4731,0.7932,0.5452,0.5174,0.3638
dip10_,0.2685,0.2542,0.7444,0.8672,1.0,1.1543,0.4379,0.8519,1.0524,0.3746,0.6127,0.5205,0.2888,0.287,0.4102,0.6879,0.4729,0.4487,0.3155
dip11_,0.2326,0.2202,0.6449,0.7513,0.8663,1.0,0.3793,0.738,0.9117,0.3245,0.5308,0.4509,0.2502,0.2487,0.3554,0.596,0.4096,0.3888,0.2733
dip30_,0.6131,0.5805,1.7,1.9805,2.2838,2.6362,1.0,1.9456,2.4033,0.8554,1.3992,1.1886,0.6595,0.6555,0.9369,1.571,1.0799,1.0248,0.7206
dip31_,0.3151,0.2984,0.8738,1.018,1.1738,1.355,0.514,1.0,1.2353,0.4397,0.7192,0.6109,0.339,0.3369,0.4816,0.8075,0.555,0.5268,0.3704
dip33_,0.2551,0.2415,0.7074,0.8241,0.9503,1.0969,0.4161,0.8095,1.0,0.3559,0.5822,0.4946,0.2744,0.2727,0.3898,0.6537,0.4493,0.4264,0.2998
dip41_,0.7167,0.6786,1.9873,2.3152,2.6697,3.0817,1.169,2.2744,2.8095,1.0,1.6357,1.3895,0.7709,0.7663,1.0952,1.8366,1.2624,1.198,0.8423


---
## Odds Ratios (2)
**TODO**: add the other Odds Ratios formula

In [24]:
odds_ratios_2 = pd.DataFrame()
for category in filters:
    for pivot in params[category]:
        non_pivots = [x for x in params[category] if x != pivot]
        
        X_one = X["t1"].copy() 
        X_one[non_pivots] = 0
        X_one[pivot] = 1
        proba_one = random_forest_sk.predict_proba(X_one).T[1].mean()
        odds_one = proba_one/(1-proba_one)
        
        X_zero = X["t1"].copy()
        X_zero[non_pivots] = 0
        X_zero[pivot] = 0
        proba_zero = random_forest_sk.predict_proba(X_zero).T[1].mean()
        odds_zero = proba_zero/(1-proba_zero)
        
        odds_ratios_2.loc[pivot, "ratio"] = odds_one/odds_zero

odds_ratios_2.transpose()

Unnamed: 0,etranger_,age15_,age30_,age40_,dip10_,dip11_,dip30_,dip31_,dip33_,dip41_,dip42_,dip50_,dip60_,dip70_,female_,enfants_,region1_,region2_,domtom_
ratio,1.8528,2.9195,0.9969,0.8557,0.3612,0.3129,0.8248,0.4239,0.3432,0.9642,0.5895,0.6939,1.2507,1.2583,1.4018,0.6231,1.0663,1.1235,1.6085
