In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Plot theme: seaborn defaults with a CJK-capable font first, then the ggplot
# style sheet on top.
sns.set(font='Noto Sans CJK TC')
plt.style.use('ggplot')  # keep this after sns.set

In [2]:
import statsmodels.api as sm

from sklearn import ensemble  # RandomForestClassifier
from sklearn import model_selection  # train_test_split
from sklearn import metrics  # classification_report

  from pandas.core import datetools


In [3]:
# Load the `fair` dataset that ships with statsmodels and keep only its
# pandas DataFrame.
fair_df = (
    sm.datasets.fair
    .load_pandas()
    .data
)

## Predict Affair or Not – Classification

In [4]:
# scikit-learn naming convention:
# - uppercase `X`: the independent variables, as an (m × n) matrix
# - lowercase `y`: the dependent variable, as an (m × 1) vector
#
# Features are every column except the last one; the target is whether the
# `affairs` value is positive.
feature_columns = fair_df.columns[:-1]
X_affairyn = fair_df[feature_columns]
y_affairyn = fair_df['affairs'].gt(0)

# Preview the features, the target, and how many rows fall in each class.
display(
    X_affairyn.head(),
    y_affairyn.head(),
    y_affairyn.groupby(y_affairyn).count(),
)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0


0    True
1    True
2    True
3    True
4    True
Name: affairs, dtype: bool

affairs
False    4313
True     2053
Name: affairs, dtype: int64

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_affairyn_train, X_affairyn_test, y_affairyn_train, y_affairyn_test = (
    model_selection.train_test_split(
        X_affairyn, y_affairyn, test_size=0.2, random_state=0,
    )
)

# A fully formal workflow splits the data three ways:
#
# - training set: used to fit the model's weights
# - validation set: used to keep overfitting in check
# - testing set: used only to assess the performance
#
# This notebook does not use a validation set.

In [6]:
# m: model
m = ensemble.RandomForestClassifier(n_estimators=10)
print(m)
print()

%time m.fit(X_affairyn_train, y_affairyn_train)
print()

print(metrics.classification_report(
    y_affairyn_test, m.predict(X_affairyn_test)
))
print()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

CPU times: user 73.3 ms, sys: 3.7 ms, total: 77 ms
Wall time: 88.7 ms

             precision    recall  f1-score   support

      False       0.74      0.81      0.77       885
       True       0.45      0.36      0.40       389

avg / total       0.65      0.67      0.66      1274




Hmmm ... not so good: the f1-score, whose range is $[0, 1]$, reaches only about 0.66 on average in the report above, and just 0.40 for the `True` class. Anyway, it's just a random trial. Try adjusting the arguments or predicting something else.

## Dig More

* [Decision tree – Wikipedia](https://en.wikipedia.org/wiki/Decision_tree)
* [Random forest – Wikipedia](https://en.wikipedia.org/wiki/Random_forest)
* [Precision and recall – Wikipedia](https://en.wikipedia.org/wiki/Precision_and_recall)
* [F1 score – Wikipedia](https://en.wikipedia.org/wiki/F1_score)
* [RandomForest – scikit-learn](https://scikit-learn.org/stable/modules/ensemble.html#forest)
* [Scikit-learn Tutorials](https://scikit-learn.org/stable/tutorial/index.html)