In [1]:
%matplotlib inline
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's defaults with the 'Noto Sans CJK TC' font
# (presumably so CJK text renders in figures -- TODO confirm the font is installed).
sns.set(font='Noto Sans CJK TC')
# Switch matplotlib to the ggplot style. Order matters: per the original note
# this call has to follow sns.set (presumably because sns.set rewrites rcParams
# and would otherwise override the style).
mpl.style.use('ggplot')  # must come after sns.set

In [2]:
import statsmodels.api as sm

from sklearn import svm  # LinearSVC, SVC, SVR
from sklearn import neighbors  # KNeighborsClassifier
from sklearn import ensemble  # RandomForestClassifier, RandomForestRegressor
from sklearn import cluster  # KMeans
from sklearn import linear_model  # Ridge, LogisticRegression*, LinearRegression*
from sklearn import metrics  # r2_score, classification_report
from sklearn import model_selection  # train_test_split

# * for fun ... baseline
# ref: http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

  from pandas.core import datetools


In [3]:
# Load the "fair" dataset bundled with statsmodels and keep its DataFrame.
fair_dataset = sm.datasets.fair.load_pandas()
fair_df = fair_dataset.data

# Preview the first and last rows together with the column dtypes.
display(fair_df.head(), fair_df.tail(), fair_df.dtypes)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
6361,5.0,32.0,13.0,2.0,3.0,17.0,4.0,3.0,0.0
6362,4.0,32.0,13.0,1.0,1.0,16.0,5.0,5.0,0.0
6363,5.0,22.0,2.5,0.0,2.0,14.0,3.0,1.0,0.0
6364,5.0,32.0,6.0,1.0,3.0,14.0,3.0,4.0,0.0
6365,4.0,22.0,2.5,0.0,2.0,16.0,2.0,4.0,0.0


rate_marriage      float64
age                float64
yrs_married        float64
children           float64
religious          float64
educ               float64
occupation         float64
occupation_husb    float64
affairs            float64
dtype: object

## Affair or Not – Classification & Clustering

In [4]:
# Features: every column except the last one (affairs).
affairyn_feature_names = fair_df.columns[:-1]
X_affairyn = fair_df[affairyn_feature_names]

# Target: True when the recorded affairs measure is positive, False otherwise.
y_affairyn = fair_df['affairs'].gt(0)

# Preview the features and target, and show the class balance.
display(
    X_affairyn.head(),
    y_affairyn.head(),
    y_affairyn.groupby(y_affairyn).count(),
)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0


0    True
1    True
2    True
3    True
4    True
Name: affairs, dtype: bool

affairs
False    4313
True     2053
Name: affairs, dtype: int64

In [5]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
affairyn_split = model_selection.train_test_split(
    X_affairyn, y_affairyn,
    test_size=0.4, random_state=0,
)
X_affairyn_train, X_affairyn_test, y_affairyn_train, y_affairyn_test = affairyn_split

# Sanity check: full shape → train shape / test shape.
print(f'''\
X_affairyn: {X_affairyn.shape} → {X_affairyn_train.shape} / {X_affairyn_test.shape}
y_affairyn: {y_affairyn.shape} → {y_affairyn_train.shape} / {y_affairyn_test.shape}''')

X_affairyn: (6366, 8) → (3819, 8) / (2547, 8)
y_affairyn: (6366,) → (3819,) / (2547,)


### Classification

In [6]:
for M, d in [
    # https://en.wikipedia.org/wiki/Support_vector_machine
    # http://scikit-learn.org/stable/modules/svm.html#svm-classification
    (svm.LinearSVC, {}),
    (svm.SVC, {}),
    # https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
    # http://scikit-learn.org/stable/modules/neighbors.html#classification
    (neighbors.KNeighborsClassifier, {}),
    # https://en.wikipedia.org/wiki/Random_forest
    # http://scikit-learn.org/stable/modules/ensemble.html#forest
    (ensemble.RandomForestClassifier, {}),
    # https://en.wikipedia.org/wiki/Logistic_regression
    # http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
    (linear_model.LogisticRegression, {}),
]:
    
    m = M(**d)
    print(m)
    print()
    
    %time m.fit(X_affairyn_train, y_affairyn_train)
    print()
    
    print(metrics.classification_report(
        y_affairyn_test, m.predict(X_affairyn_test)
    ))
    print()

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

CPU times: user 285 ms, sys: 3.22 ms, total: 289 ms
Wall time: 292 ms

             precision    recall  f1-score   support

      False       0.89      0.36      0.51      1734
       True       0.40      0.91      0.55       813

avg / total       0.73      0.53      0.52      2547


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

CPU times: user 658 ms, sys: 34.3 ms, total: 692 ms
Wall time: 704 ms

             precision    recall  f1-score   support

      False       0.74      0.90      0.81      1734
       True       0.59      0.32      0.41       813

avg / total       0.69      0.71      0

### Clustering

In [7]:
for M, d in [
    # https://en.wikipedia.org/wiki/K-means_clustering
    # http://scikit-learn.org/stable/modules/clustering.html#k-means
    (cluster.KMeans, dict(n_clusters=2)),
]:
    
    m = M(**d)
    print(m)
    print()
    
    #%time m.fit(X_affairyn_train, y_affairyn_train)
    %time m.fit(X_affairyn_train, y_affairyn_train)
    print()
    
    print(metrics.classification_report(
        y_affairyn_test, m.predict(X_affairyn_test)
    ))
    print()

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

CPU times: user 54.6 ms, sys: 7.7 ms, total: 62.3 ms
Wall time: 76.5 ms

             precision    recall  f1-score   support

      False       0.74      0.71      0.72      1734
       True       0.43      0.47      0.45       813

avg / total       0.64      0.63      0.64      2547




## Years Married? – Regression

In [8]:
# Predictors for the regression: every column except the target (yrs_married)
# and affairs.
yrsmarried_feature_names = [
    'rate_marriage', 'age', 'children',
    'religious', 'educ', 'occupation', 'occupation_husb',
]
X_yrsmarried = fair_df[yrsmarried_feature_names]

# Target: the yrs_married column.
y_yrsmarried = fair_df['yrs_married']

# Preview the features and target, and count the rows per distinct target value.
display(
    X_yrsmarried.head(),
    y_yrsmarried.head(),
    y_yrsmarried.groupby(y_yrsmarried).count(),
)

Unnamed: 0,rate_marriage,age,children,religious,educ,occupation,occupation_husb
0,3.0,32.0,3.0,3.0,17.0,2.0,5.0
1,3.0,27.0,3.0,1.0,14.0,3.0,4.0
2,4.0,22.0,0.0,1.0,16.0,3.0,5.0
3,4.0,37.0,4.0,3.0,16.0,5.0,5.0
4,5.0,27.0,1.0,1.0,14.0,3.0,4.0


0     9.0
1    13.0
2     2.5
3    16.5
4     9.0
Name: yrs_married, dtype: float64

yrs_married
0.5      370
2.5     2034
6.0     1141
9.0      602
13.0     590
16.5     818
23.0     811
Name: yrs_married, dtype: int64

In [9]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
yrsmarried_split = model_selection.train_test_split(
    X_yrsmarried, y_yrsmarried,
    test_size=0.4, random_state=0,
)
X_yrsmarried_train, X_yrsmarried_test, y_yrsmarried_train, y_yrsmarried_test = yrsmarried_split

# Sanity check: full shape → train shape / test shape.
print(f'''\
X_yrsmarried: {X_yrsmarried.shape} → {X_yrsmarried_train.shape} / {X_yrsmarried_test.shape}
y_yrsmarried: {y_yrsmarried.shape} → {y_yrsmarried_train.shape} / {y_yrsmarried_test.shape}''')

X_yrsmarried: (6366, 7) → (3819, 7) / (2547, 7)
y_yrsmarried: (6366,) → (3819,) / (2547,)


In [10]:
for M, d in [
    # https://en.wikipedia.org/wiki/Tikhonov_regularization
    # http://scikit-learn.org/stable/modules/linear_model.html#ridge-regression
    (linear_model.Ridge, {}),
    # https://en.wikipedia.org/wiki/Linear_regression
    # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
    (linear_model.LinearRegression, {}),
]:
    
    m = M(**d)
    print(m)
    print()
    
    %time m.fit(X_yrsmarried_train, y_yrsmarried_train)
    print()
    
    #print(metrics.classification_report(
    #    y_affairyn_test, m.predict(X_affairyn_test)
    #))
    print(f'r2_score: {metrics.r2_score(y_yrsmarried_test, m.predict(X_yrsmarried_test))}')
    print()
    print()

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

CPU times: user 4.21 ms, sys: 4.84 ms, total: 9.05 ms
Wall time: 13.1 ms

r2_score: 0.8640635217215769


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

CPU times: user 3.19 ms, sys: 3.25 ms, total: 6.43 ms
Wall time: 10.5 ms

r2_score: 0.8640638770052516




