In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn_pandas import DataFrameMapper

from risk_learning.config import filenames
from risk_learning.risk_learning import get_classifier_family_name

%matplotlib inline

In [None]:
# Load the churn dataset; the CSV location comes from the project's `filenames` config.
df = pd.read_csv(filenames.fake_churn)
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called directly -- wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

## Split off test set

In [None]:
# Row count for each year, used to pick the cut-off for the time-based split
df.groupby(by='year').size()

In [None]:
# Time-based hold-out: rows with year >= split_year form the test set,
# earlier rows are kept for training and validation.
split_year = 2015
test = df.loc[df['year'].ge(split_year), :]
train_validate = df.loc[df['year'].lt(split_year)]

In [None]:
# Features: every column except the 'churn' label
data = train_validate.loc[:, df.columns != 'churn']

# Binarize the label and flatten the result to a 1-D array
lb = LabelBinarizer()
target = np.ravel(lb.fit_transform(train_validate['churn']))

# Stratified 75/25 train/validation split with a fixed seed
X_train, X_validate, y_train, y_validate = train_test_split(
    data,
    target,
    test_size=0.25,
    stratify=target,
    random_state=42,
)

## Put preprocessing and model selection in a pipeline

In [None]:
# Preprocessing: one transformer per input column
mapper = DataFrameMapper(features=[
    ('gender', LabelBinarizer()),
    (['age'], StandardScaler()),
    ('profession', LabelBinarizer()),
])

# Pipeline: featurize, then logistic regression fitted without an intercept
pipe = Pipeline(steps=[
    ('featurize', mapper),
    ('lr', LogisticRegression(fit_intercept=False, solver='lbfgs')),
])

def clf_model_selection(clf_families, param_grids):
    """Grid-search each classifier family and print its validation score.

    For every (classifier, grid) pair, a pipeline of the shared ``mapper``
    followed by the classifier is built, tuned with 5-fold ``GridSearchCV`` on
    ``X_train``/``y_train``, and scored on ``X_validate``/``y_validate``.
    Results are printed; nothing is returned.

    Relies on the notebook-level names ``mapper``, ``X_train``, ``y_train``,
    ``X_validate`` and ``y_validate``.

    Parameters
    ----------
    clf_families : iterable of estimators
        Unfitted classifiers, one per family to evaluate.
    param_grids : iterable of dict
        One hyperparameter grid per classifier, in the same order. Keys must be
        prefixed with the pipeline step name, i.e.
        ``get_classifier_family_name(clf) + '__<param>'``. Pairs are matched
        with ``zip``, so surplus entries in the longer iterable are ignored.
    """
    for clf_family, param_grid in zip(clf_families, param_grids):
        # The family name doubles as the pipeline step name, which is why the
        # grid keys carry the same prefix.
        clf_family_name = get_classifier_family_name(clf_family)
        print(clf_family_name)
        pipe = Pipeline([
            ('featurize', mapper),
            (clf_family_name, clf_family)
            ])

        # Hyperparameter search
        # NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed
        # in 0.24; drop the argument when running on a newer release.
        clf_select = GridSearchCV(pipe, param_grid, iid=False, cv=5)
        clf_select.fit(X_train, y_train)
        # Use a single formatting style: the original mixed '%0.3f' with
        # str.format, which printed the literal '%0.3f' instead of the score.
        print("Best parameter (CV score={:0.3f}):".format(clf_select.best_score_))
        print(clf_select.best_params_)
        print('Setting pipeline parameter(s)')

        # With the default refit=True, fit() above has already refit the best
        # parameter setting on all of X_train, so no extra step is needed here
        # (the former bare `clf_select.refit` attribute access did nothing).
        print('\nFit coefficients to training set')

        # Score the refit best estimator on the validation data
        print('\nEvaluate on validation set')
        print(clf_select.score(X_validate, y_validate))
        print('\n')

# Candidate classifier families to compare
clf_families = [
    LogisticRegression(fit_intercept=False, solver='lbfgs'),
    GradientBoostingClassifier(),
]

# One grid per family, in the same order as clf_families; each key is prefixed
# with the family name because that name is used as the pipeline step name.
param_grids = [
    {
        '{}__C'.format(get_classifier_family_name(LogisticRegression())):
            np.logspace(start=-4, stop=2, num=20),
    },
    {
        '{}__max_depth'.format(get_classifier_family_name(GradientBoostingClassifier())):
            range(1, 10),
    },
]

clf_model_selection(clf_families, param_grids)