In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from glmnet import LogitNet

In [2]:
# Load the churn training table and split it into predictors (X) and the
# `churn_label` target column (y).
churn_df = pd.read_csv("../data/churn_train.csv")
X = churn_df.drop(['churn_label'], axis=1)
y = churn_df['churn_label']
# A single pre-formatted argument keeps the printed text identical under the
# Python 2 print statement and the Python 3 print function.
print("{} {}".format(X.shape, y.shape))

(572171, 21) (572171,)


In [3]:
# Seed reused by the split below and by the LogitNet estimators in later cells.
RANDOM_SEED = 24

# Hold out 20% of the rows as a test set; the seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [4]:
# Base estimator for the alpha grid searches: LogitNet configured with 5 internal
# splits, ROC AUC scoring, 2 parallel jobs and the shared seed.
log_net = LogitNet(
    n_splits=5,
    scoring='roc_auc',
    n_jobs=2,
    random_state=RANDOM_SEED,
)

In [9]:
%%time
param = {'alpha': [i/10.0 for i in range(1, 10, 2)]}
gsearch = GridSearchCV(estimator=log_net, param_grid=param, scoring='roc_auc', iid=False, cv=5)
gsearch.fit(X_train, y_train)

CPU times: user 28min 53s, sys: 23.9 s, total: 29min 17s
Wall time: 18min 55s


In [13]:
# Per-candidate summary of the coarse grid, ordered from lowest to highest
# mean test score.
coarse_results = pd.DataFrame(gsearch.cv_results_)
coarse_results.sort_values(by='mean_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,65.936394,0.051043,0.898241,0.89826,0.1,{u'alpha': 0.1},5,0.897431,0.898382,0.898295,...,0.89748,0.898268,0.901137,0.897927,0.896861,0.898502,5.09424,0.002478,0.001518,0.000193
1,43.156625,0.049911,0.898293,0.898307,0.3,{u'alpha': 0.3},4,0.897484,0.898388,0.898299,...,0.897549,0.898313,0.901218,0.897979,0.896917,0.898607,1.729083,0.00156,0.001527,0.000204
2,38.073968,0.051277,0.898336,0.898331,0.5,{u'alpha': 0.5},3,0.897614,0.898442,0.898302,...,0.897585,0.898304,0.901246,0.897987,0.89693,0.898649,1.827131,0.007271,0.001518,0.000217
3,37.880411,0.045541,0.898484,0.898478,0.7,{u'alpha': 0.7},2,0.897731,0.89856,0.898344,...,0.897837,0.898536,0.901351,0.898131,0.897157,0.898784,3.019679,0.001844,0.001482,0.000216
4,33.154145,0.045971,0.898718,0.898723,0.9,{u'alpha': 0.9},1,0.897969,0.898847,0.898541,...,0.89799,0.898714,0.901606,0.898418,0.897482,0.89897,5.087631,0.002859,0.001483,0.000186


In [10]:
# Best alpha from the coarse grid together with its cross-validated score.
(gsearch.best_params_, gsearch.best_score_)

({'alpha': 0.9}, 0.89871786864010128)

In [14]:
%%time
param_2 = {'alpha': [0.85, 0.9, 0.95, 1]}
gsearch_2 = GridSearchCV(estimator=log_net, param_grid=param_2, scoring='roc_auc', iid=False, cv=5)
gsearch_2.fit(X_train, y_train)

CPU times: user 17min 36s, sys: 16.5 s, total: 17min 53s
Wall time: 11min 28s


In [16]:
# Per-candidate summary of the refined grid, ordered from lowest to highest
# mean test score.
refined_results = pd.DataFrame(gsearch_2.cv_results_)
refined_results.sort_values(by='mean_test_score')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,34.321648,0.052253,0.898639,0.898653,0.85,{u'alpha': 0.85},4,0.897882,0.898761,0.898476,...,0.897948,0.898659,0.901463,0.898287,0.897423,0.898975,1.158791,0.009323,0.001451,0.000226
1,32.090353,0.044583,0.898718,0.898723,0.9,{u'alpha': 0.9},3,0.897969,0.898847,0.898541,...,0.89799,0.898714,0.901606,0.898418,0.897482,0.89897,1.271328,0.00049,0.001483,0.000186
2,31.84986,0.044192,0.89881,0.898817,0.95,{u'alpha': 0.95},2,0.897969,0.898841,0.898631,...,0.898156,0.898915,0.901682,0.898491,0.897615,0.899073,1.382704,0.000472,0.001473,0.000192
3,31.012679,0.044124,0.898856,0.898863,1.0,{u'alpha': 1},1,0.898092,0.898979,0.898656,...,0.89816,0.898947,0.901677,0.898462,0.897695,0.899124,1.491452,0.000542,0.001443,0.000225


In [18]:
# The refined grid selects alpha = 1, i.e. the lasso penalty performs best.
(gsearch_2.best_params_, gsearch_2.best_score_)

({'alpha': 1}, 0.8988560248351456)

In [19]:
# Final model. alpha is pinned explicitly to the value chosen by the refined
# grid search (1, the lasso) instead of relying on LogitNet's default happening
# to match it; the remaining settings mirror the estimator used in the search.
log_lasso = LogitNet(
    alpha=1,
    n_splits=5,
    scoring='roc_auc',
    n_jobs=2,
    random_state=RANDOM_SEED,
)

In [21]:
# Fit the final lasso model on the training split; the fitted estimator's repr
# is displayed as the cell output.
log_lasso.fit(X=X_train, y=y_train)

LogitNet(alpha=1, cut_point=1.0, fit_intercept=True, lambda_path=None,
     max_iter=100000, min_lambda_ratio=0.0001, n_jobs=2, n_lambda=100,
     n_splits=5, random_state=24, scoring='roc_auc', standardize=True,
     tol=1e-07, verbose=False)

In [22]:
# Every prediction below is made at the lambda the fitted model stores as
# `lambda_max_`, so train and test are evaluated at the same penalty strength.
eval_lambda = log_lasso.lambda_max_

# Hard class labels and the probability of the second class (column 1) for
# the training split ...
train_prediction = log_lasso.predict(X_train, lamb=eval_lambda)
train_predprob = log_lasso.predict_proba(X_train, lamb=eval_lambda)[:, 1]

# ... and for the held-out test split.
test_prediction = log_lasso.predict(X_test, lamb=eval_lambda)
test_predprob = log_lasso.predict_proba(X_test, lamb=eval_lambda)[:, 1]

In [23]:
# Report accuracy (from hard labels) and ROC AUC (from predicted probabilities)
# on both splits. Each print receives one pre-formatted string, so the output is
# identical under the Python 2 print statement and the Python 3 print function.
print("Accuracy (Train) : {}".format(metrics.accuracy_score(y_train, train_prediction)))
print("AUC Score (Train): {}".format(metrics.roc_auc_score(y_train, train_predprob)))
print("Accuracy (Test): {}".format(metrics.accuracy_score(y_test, test_prediction)))
print("AUC Score (Test): {}\n".format(metrics.roc_auc_score(y_test, test_predprob)))

Accuracy (Train) : 0.833456402817
AUC Score (Train): 0.899413845278
Accuracy (Test): 0.832175470791
AUC Score (Test): 0.898028824121



In [24]:
# Show the lambda the fitted model stores as `lambda_best_`.
# NOTE(review): the predictions scored earlier in this notebook were made at
# `lambda_max_`, not at this value; confirm which lambda is meant to be reported.
log_lasso.lambda_best_

array([ 0.02993902])