In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

In [2]:
churn_df = pd.read_csv("../data/churn_train.csv")
X = churn_df.drop(['churn_label'], axis=1)
y = churn_df.churn_label

In [3]:
print X.shape, y.shape

(572171, 21) (572171,)


In [4]:
RANDOM_SEED = 24
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = RANDOM_SEED
)

In [5]:
clf = SGDClassifier(random_state=RANDOM_SEED, n_jobs=2, max_iter=1000, tol=1e-3)

In [6]:
%%time
clf.fit(X_train, y_train)

CPU times: user 3.61 s, sys: 88.9 ms, total: 3.69 s
Wall time: 3.74 s


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=2, penalty='l2', power_t=0.5, random_state=24, shuffle=True,
       tol=0.001, verbose=0, warm_start=False)

In [7]:
test_predprob = clf.decision_function(X_test)
test_prediction = clf.predict(X_test)

In [8]:
metrics.accuracy_score(y_test, test_prediction)

0.81360597719229255

In [9]:
metrics.roc_auc_score(y_test, test_predprob)

0.83468328953719817

In [21]:
param_sgd = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}

In [22]:
gearch_sgd = GridSearchCV(SGDClassifier(random_state=RANDOM_SEED, n_jobs=2, max_iter=1000, tol=1e-3), 
                          param_grid=param_sgd, 
                          scoring='roc_auc', iid=False, cv=5)

In [23]:
%%time
gearch_sgd.fit(X_train, y_train)

CPU times: user 1min 15s, sys: 4.78 s, total: 1min 20s
Wall time: 1min 20s


GridSearchCV(cv=5, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=2, penalty='l2', power_t=0.5, random_state=24, shuffle=True,
       tol=0.001, verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=1,
       param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [29]:
pd.DataFrame(gearch_sgd.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
2,2.140293,0.056083,0.807484,0.807368,0.01,{u'alpha': 0.01},1,0.856608,0.855972,0.761771,...,0.786544,0.786653,0.779031,0.77688,0.853468,0.855763,0.278236,0.006313,0.039662,0.040399
4,2.435297,0.052672,0.801421,0.801858,1.0,{u'alpha': 1},2,0.729676,0.730121,0.839924,...,0.752352,0.755996,0.846806,0.843673,0.83835,0.84028,0.581508,0.002435,0.049922,0.048724
5,2.55096,0.053768,0.751617,0.751477,10.0,{u'alpha': 10},3,0.85245,0.852323,0.772927,...,0.457344,0.457753,0.811578,0.809274,0.863785,0.865691,0.322035,0.001677,0.150595,0.150509
0,2.351426,0.047495,0.725233,0.724047,0.0001,{u'alpha': 0.0001},4,0.804281,0.804324,0.727598,...,0.874738,0.875927,0.75531,0.750201,0.464237,0.461841,0.410069,0.002451,0.139735,0.140686
1,2.207081,0.049898,0.695564,0.69449,0.001,{u'alpha': 0.001},5,0.698101,0.698954,0.747805,...,0.787608,0.787713,0.853502,0.847962,0.390803,0.389156,0.115891,0.002711,0.160643,0.160268
3,2.273541,0.049391,0.571022,0.570288,0.1,{u'alpha': 0.1},6,0.672633,0.674362,0.172338,...,0.874272,0.875334,0.835753,0.832896,0.300113,0.297704,0.543856,0.002436,0.284503,0.285119


In [24]:
gearch_sgd.best_score_, gearch_sgd.best_params_

(0.80748432208955467, {'alpha': 0.01})

In [26]:
%%time
clf_sgdsvm_best = SGDClassifier(random_state=RANDOM_SEED, n_jobs=2, max_iter=1000, tol=1e-3, alpha=0.01)
clf_sgdsvm_best.fit(X_train, y_train)

CPU times: user 2.64 s, sys: 41.2 ms, total: 2.68 s
Wall time: 2.69 s


In [27]:
test_prediction = clf_sgdsvm_best.predict(X_test)
test_predscore = clf_sgdsvm_best.decision_function(X_test)

In [28]:
print metrics.accuracy_score(y_test, test_prediction)
print metrics.roc_auc_score(y_test, test_predscore)

0.631056931883
0.69522159588


In [35]:
param_sgd_2 = {'alpha': [0.005] + [i/100.0 for i in range(1, 10)]}

In [37]:
%%time
gearch_sgd_2 = GridSearchCV(SGDClassifier(random_state=RANDOM_SEED, n_jobs=2, max_iter=1000, tol=1e-3), 
                            param_grid=param_sgd_2, scoring='roc_auc', iid=False, cv=5)
gearch_sgd_2.fit(X_train, y_train)

CPU times: user 2min 3s, sys: 7.79 s, total: 2min 11s
Wall time: 2min 12s


In [38]:
pd.DataFrame(gearch_sgd_2.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,2.282504,0.052692,0.846184,0.846629,0.05,{u'alpha': 0.05},1,0.875251,0.875132,0.813653,...,0.853506,0.855603,0.840899,0.838134,0.847612,0.851072,0.815425,0.007975,0.019934,0.020503
0,2.390374,0.061479,0.831892,0.832399,0.005,{u'alpha': 0.005},2,0.828697,0.828062,0.767993,...,0.866654,0.868563,0.862801,0.859848,0.833316,0.836353,0.174181,0.012401,0.035381,0.03491
7,2.403771,0.052072,0.828768,0.828536,0.07,{u'alpha': 0.07},3,0.882078,0.882549,0.770542,...,0.869113,0.870178,0.799664,0.794718,0.822443,0.824876,0.377092,0.007615,0.041826,0.042879
2,2.501811,0.052426,0.827544,0.82766,0.02,{u'alpha': 0.02},4,0.884433,0.884685,0.753778,...,0.846209,0.846806,0.849209,0.846059,0.80409,0.805824,0.332286,0.005602,0.044823,0.0441
8,2.135673,0.046828,0.819443,0.819239,0.08,{u'alpha': 0.08},5,0.858299,0.858521,0.75229,...,0.852159,0.85293,0.781206,0.776625,0.853264,0.856665,0.29071,0.000656,0.044036,0.045802
3,2.737967,0.064895,0.813526,0.813469,0.03,{u'alpha': 0.03},6,0.843785,0.843441,0.75246,...,0.774897,0.774838,0.848079,0.845289,0.848412,0.852136,0.385949,0.013175,0.041346,0.041764
6,2.386334,0.053128,0.810607,0.810588,0.06,{u'alpha': 0.06},7,0.796406,0.796392,0.772469,...,0.841323,0.841818,0.824823,0.821811,0.818016,0.820622,0.277015,0.010791,0.023902,0.023957
1,2.097056,0.052636,0.807484,0.807368,0.01,{u'alpha': 0.01},8,0.856608,0.855972,0.761771,...,0.786544,0.786653,0.779031,0.77688,0.853468,0.855763,0.230293,0.008201,0.039662,0.040399
9,1.957373,0.047447,0.797917,0.797686,0.09,{u'alpha': 0.09},9,0.822894,0.822215,0.819695,...,0.786285,0.786458,0.725298,0.722064,0.835413,0.838535,0.232796,0.00114,0.039782,0.041415
4,2.416107,0.051099,0.787053,0.786865,0.04,{u'alpha': 0.04},10,0.759244,0.760289,0.784282,...,0.840365,0.840774,0.877502,0.87419,0.673875,0.675571,0.379565,0.003028,0.070162,0.068755


In [39]:
gearch_sgd_2.best_params_, gearch_sgd_2.best_score_

({'alpha': 0.05}, 0.84618415331021912)

In [40]:
%%time
clf_sgdsvm_best_2 = SGDClassifier(random_state=RANDOM_SEED, n_jobs=2, max_iter=1000, tol=1e-3, alpha=0.05)
clf_sgdsvm_best_2.fit(X_train, y_train)

CPU times: user 3.02 s, sys: 57.7 ms, total: 3.08 s
Wall time: 3.11 s


In [43]:
test_prediction_2 = clf_sgdsvm_best_2.predict(X_test)
test_predscore_2 = clf_sgdsvm_best_2.decision_function(X_test)
print metrics.accuracy_score(y_test, test_prediction_2)
print metrics.roc_auc_score(y_test, 1/(1+np.exp(-test_predscore_2)))

0.635120374011
0.701795609277


