In [158]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [159]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the order given, and that number is written
    as the index column.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. Any array-like is accepted (NumPy array,
        list, pandas Series); it is converted with ``np.asarray`` so that a
        Series' own index cannot misalign the values against the 1..n index.
    out_file : str or file-like
        Destination, passed straight to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the index column.
    """
    predicted_labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [160]:
# Column names of the ten site slots and of their matching timestamps
sites = ['site{}'.format(slot) for slot in range(1, 11)]
times = ['time{}'.format(slot) for slot in range(1, 11)]

In [161]:
# Both session files are read the same way: indexed by session_id, with the
# time1..time10 columns parsed as datetimes.
csv_options = {'index_col': 'session_id', 'parse_dates': times}
train_df = pd.read_csv('../../data/train_sessions.csv', **csv_options)
test_df = pd.read_csv('../../data/test_sessions.csv', **csv_options)

# Order the training sessions by their first timestamp
train_df = train_df.sort_values(by='time1')

In [162]:
# Dump the site ids of every session as one space-separated line of integers
# (missing slots become 0), without index or header.
for frame, text_path in ((train_df, 'train_sessions_text.txt'),
                         (test_df, 'test_sessions_text.txt')):
    site_ids = frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [164]:
%%time
# Count unigrams and bigrams of site ids, keeping at most 10000 features.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-digit site ids and the 0 padding are
# presumably dropped here — confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 2), max_features=10000)
# The vocabulary is fitted on the training sessions only ...
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
# ... and reused unchanged to encode the test sessions.
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: user 9.5 s, sys: 296 ms, total: 9.79 s
Wall time: 9.95 s


In [165]:
# Training labels as a plain integer NumPy vector, in the same row order as train_df
y_train = np.asarray(train_df['target'].astype('int'))

In [166]:
# Time-ordered splitter with 10 splits; this same object is passed as `cv`
# to every cross-validation and grid-search call below.
time_split = TimeSeriesSplit(n_splits=10)

In [167]:
# Logistic regression estimator (nothing is fitted here); the fixed
# random_state keeps the liblinear solver reproducible.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [168]:
%%time
# Baseline ROC AUC per time-series fold, using the site n-gram features only.
# NOTE: this call uses n_jobs=-1 (all cores). The author observed hangs with
# n_jobs > 1 in some environments — switch to n_jobs=1 if this cell stalls.
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 952 ms, sys: 473 ms, total: 1.42 s
Wall time: 23.6 s


In [169]:
# Per-fold ROC AUC of the baseline model and their mean
cv_scores, cv_scores.mean()

(array([0.83446098, 0.64246422, 0.8735211 , 0.96202618, 0.83732716,
        0.88826603, 0.92535969, 0.85203452, 0.92850608, 0.91200625]),
 0.8655972220392567)

In [170]:
# Fit the baseline model on the whole training set before predicting on the test set
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [171]:
# Column 1 of predict_proba is the probability of the second class
# (presumably target == 1 — confirm against logit.classes_).
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288 (score noted by the author for this submission)

In [172]:
def add_time_features(df, X_sparse, time_cols=None):
    """Append time-of-day and session-duration features to a sparse matrix.

    Five columns are added to the right of ``X_sparse``, in this order:
    ``morning`` (start hour 7-11), ``day`` (12-18), ``evening`` (19-23),
    ``night`` (0-6) as 0/1 indicators, and the session duration divided
    by 1800 seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Session frame with one row per row of ``X_sparse``. Missing
        timestamps may be ``NaT`` or the ``0`` placeholder left by
        ``fillna(0)``; both are treated as missing.
    X_sparse : scipy sparse matrix
        Existing feature matrix to extend.
    time_cols : sequence of str, optional
        Names of the timestamp columns. The first one is taken as the
        session start. Defaults to the module-level ``times`` list.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` with the five extra columns stacked horizontally.
    """
    cols = times if time_cols is None else list(time_cols)

    # Vectorised hour extraction instead of a Python-level apply per row;
    # to_datetime leaves a datetime64 column as it is.
    hour = pd.to_datetime(df[cols[0]]).dt.hour
    morning = hour.between(7, 11)
    day = hour.between(12, 18)
    evening = hour.between(19, 23)
    night = hour.between(0, 6)

    # Restrict the 0 -> NaT replacement to the timestamp columns only, then
    # make sure every column really is datetime64 before taking min/max.
    stamps = df[cols].replace(0, pd.NaT).apply(pd.to_datetime)
    duration = (stamps.max(axis=1) - stamps.min(axis=1)) / np.timedelta64(1, 's')

    extra = np.column_stack([
        morning.astype('int'),
        day.astype('int'),
        evening.astype('int'),
        night.astype('int'),
        duration / 1800,  # seconds expressed in units of 30 minutes
    ])
    return hstack([X_sparse, extra])

In [173]:
%%time
# fillna(0) turns every missing value in the frames (empty site and time
# slots) into 0 before the time features are appended to the n-gram matrices.
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)
print(X_train_new.shape, X_test_new.shape)

(253561, 10005) (82797, 10005)
CPU times: user 2min 20s, sys: 6.49 s, total: 2min 26s
Wall time: 2min 30s


In [174]:
%%time
# Time-series cross-validation on the matrix extended with time features;
# ROC AUC improves over the baseline run.
# Kept at n_jobs=1 because the author observed hangs with n_jobs > 1 in this
# environment (locally it runs much faster).
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=1)

CPU times: user 52 s, sys: 1.57 s, total: 53.6 s
Wall time: 27.9 s


In [175]:
# Per-fold ROC AUC with the time features added, and their mean
cv_scores, cv_scores.mean()

(array([0.87509075, 0.73611874, 0.9258791 , 0.9783075 , 0.89898831,
        0.94450181, 0.95474931, 0.92582883, 0.948598  , 0.94367186]),
 0.9131734217435374)

In [176]:
# Refit the same estimator on the full training set with the time features
logit.fit(X_train_new, y_train)

# Column 1 of predict_proba, written out as the second submission
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843 (score noted by the author for this submission)

In [177]:
# Tune the regularization parameter C over 10 log-spaced values from 1e-2 to 1e2
c_values = np.logspace(-2, 2, 10)

# Every candidate is scored by ROC AUC with the same time-series split as above
logit_grid_searcher = GridSearchCV(
    estimator=logit, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1
)

In [178]:
%%time
logit_grid_searcher.fit(X_train_new, y_train) # NOTE: much slower in this environment; the author reports about 3min 30s locally

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  9.3min finished


CPU times: user 17min 6s, sys: 29.8 s, total: 17min 36s
Wall time: 9min 24s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [179]:
# Best cross-validated ROC AUC found by the search and the C value that produced it
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9162884163483165, {'C': 0.21544346900318834})

In [180]:
# Predict through the grid searcher, which delegates to the estimator refit
# with the best C (refit=True in the repr above)
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242 (score noted by the author for this submission)

In [None]:
%%time
# Re-score the tuned estimator (best C from the grid search) with the same
# time-series split, to compare against the earlier cross-validation runs.
# NOTE: uses n_jobs=-1; fall back to n_jobs=1 if parallel runs hang in your environment.
cv_scores = cross_val_score(logit_grid_searcher.best_estimator_, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

In [None]:
# Per-fold ROC AUC of the tuned estimator and their mean
cv_scores, cv_scores.mean() # 0.9177259216878799 (mean noted by the author; this cell has no recorded output)

In [None]:
# NOTE(review): apart from the output file name, this repeats the subm3 cell.
# It reuses the name logit_test_pred3 and the same fitted grid searcher, so
# the predictions should match subm3 unless the searcher was refit in
# between — confirm which model produced the 0.94278 score.
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm4.csv') # 0.94278 (score noted by the author for this submission)