In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn_pandas import DataFrameMapper

from risk_learning.config import filenames

%matplotlib inline

In [None]:
# Load the churn dataset from the path configured in the project's `filenames` module.
df = pd.read_csv(filenames.fake_churn)
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
df.info()

In [None]:
# Preview the first rows of the loaded frame (rendered as the cell's rich output).
df.head()

## Split off test set

In [None]:
# Count the records in each year; used to choose the cutoff for the time-based split.
records_per_year = df.groupby('year').size()
records_per_year

In [None]:
# Time-based hold-out: rows from `split_year` onward become the test set and
# everything earlier is kept for training/validation.
# NOTE(review): originally described as "the last three years" -- confirm
# against the per-year record counts.
split_year = 2015
from_split_onward = df['year'] >= split_year
before_split = df['year'] < split_year
test = df.loc[from_split_onward]
train_validate = df.loc[before_split]

In [None]:
# Features: every column except the 'churn' label.
feature_columns = df.columns[df.columns != 'churn']
data = train_validate[feature_columns]

# Target: the 'churn' column encoded by a LabelBinarizer, flattened to 1-D.
lb = LabelBinarizer()
churn_encoded = lb.fit_transform(train_validate['churn'])
target = churn_encoded.ravel()

# Hold out 25% for validation; stratify on the target and fix the seed so the
# split is reproducible.
split_options = dict(test_size=0.25, random_state=42, stratify=target)
X_train, X_validate, y_train, y_validate = train_test_split(
    data, target, **split_options
)

## Transform features

In [None]:
# Column -> transformer assignments applied by the DataFrameMapper:
# 'gender' and 'profession' are binarized, 'age' is standardized.
# 'age' is given as a one-element list while the other two are bare strings;
# presumably this controls whether the transformer receives 2-D or 1-D input
# (see the sklearn-pandas docs) -- TODO confirm.
feature_transformers = [
    ('gender', LabelBinarizer()),
    (['age'], StandardScaler()),
    ('profession', LabelBinarizer()),
]
mapper = DataFrameMapper(feature_transformers)

## Combine in a pipeline

In [None]:
# Chain feature mapping and an intercept-free logistic regression so that the
# featurization is fit together with the model on whatever data the pipeline
# is fit on.
pipe = Pipeline([
    ('featurize', mapper),
    ('lr', LogisticRegression(solver='lbfgs', fit_intercept=False))
])


# Hyperparameter search
# Grid over the inverse regularization strength C of the 'lr' step:
# 20 log-spaced values from 1e-4 to 1e2.
param_grid = {
    'lr__C': np.logspace(-4, 2, 20),
}
# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises TypeError on current releases. Without it the CV score is
# the plain mean over folds, which is what `iid=False` used to request.
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
print('Setting pipeline parameter(s)')
# The search fits clones of `pipe`, so the selected value is copied back onto
# the original pipeline before refitting it below.
pipe.set_params(**search.best_params_)


# With selected hyperparameter, fit training data
print('\nFit coefficients to training set')
pipe.fit(X_train, y_train)

# Predict on validation data
print('\nEvaluate on validation set')
# Last expression of the cell: displays the pipeline's score on the validation split.
pipe.score(X_validate, y_validate)
