In [97]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [98]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [99]:
sites = ['site%s' % i for i in range(1, 11)]
times = ['time%s' % i for i in range(1, 11)]

In [100]:
train_df = pd.read_csv('../../data/train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv('../../data/test_sessions.csv',
                      index_col='session_id', parse_dates=times)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

In [101]:
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', sep=' ', index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', sep=' ', index=None, header=None)

In [102]:
#####################################################

In [103]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)
CPU times: user 10.9 s, sys: 386 ms, total: 11.3 s
Wall time: 10.8 s


In [104]:
# save train targets into a separate vector
y_train = train_df['target'].astype('int').values

In [105]:
# performing time series cross-validation
time_split = TimeSeriesSplit(n_splits=10)

In [106]:
# perform time series cross-validation with logistic regression
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [107]:
%%time
# hangs with n_jobs > 1, and locally this runs much faster
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=1) 

CPU times: user 1min 47s, sys: 16.5 s, total: 2min 3s
Wall time: 31.2 s


In [108]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64670618, 0.87991997, 0.96315292, 0.84221296,
        0.87840646, 0.94475732, 0.85322024, 0.92988416, 0.90752702]),
 0.8677207147367503)

In [109]:
#####################################################

In [110]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [111]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288 -> 0.91382

### Features

In [112]:
features = {}

In [113]:
%time
# hour
def feature_hour(df):
    return df.fillna(0)['time1'].apply(lambda ts: ts.hour)

features['hour'] = {
    'train': feature_hour(train_df),
    'test': feature_hour(test_df),
}

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [114]:
%time
# morning
def feature_morning(series):
    return ((series >= 7) & (series <= 11)).astype('int')

features['morning'] = {
    'train': feature_morning(features['hour']['train']),
    'test': feature_morning(features['hour']['test']),
}

# day
def feature_day(series):
    return ((series >= 12) & (series <= 18)).astype('int')

features['day'] = {
    'train': feature_day(features['hour']['train']),
    'test': feature_day(features['hour']['test']),
}

# evening
def feature_evening(series):
    return ((series >= 19) & (series <= 23)).astype('int')

features['evening'] = {
    'train': feature_evening(features['hour']['train']),
    'test': feature_evening(features['hour']['test']),
}

# night
def feature_night(series):
    return ((series >= 0) & (series <= 6)).astype('int')

features['night'] = {
    'train': feature_night(features['hour']['train']),
    'test': feature_night(features['hour']['test']),
}

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.96 µs


In [115]:
# duration
def feature_duration(df):
    df_times = df[times]
    return (df_times.max(axis=1) - df_times[times].min(axis=1)) / np.timedelta64(1, 's')

features['duration'] = {
    'train': feature_duration(train_df),
    'test': feature_duration(test_df),
}

In [116]:
# sites
def feature_sites(df):
    return df[sites].count(axis=1)

features['sites'] = {
    'train': feature_sites(train_df),
    'test': feature_sites(test_df),
}

In [117]:
# avg_time
def feature_avg_time(duration, sites):
    return duration / sites

features['avg_time'] = {
    'train': feature_avg_time(features['duration']['train'], features['sites']['train']),
    'test': feature_avg_time(features['duration']['test'], features['sites']['test']),
}

In [118]:
# first_time
def feature_first_time(df):
    return (df['time2'] - df['time1']).fillna(0) / np.timedelta64(1, 's')

features['first_time'] = {
    'train': feature_first_time(train_df),
    'test': feature_first_time(test_df),
}

In [119]:
# 2014
def feature_2014(df):
    return (df['time1'].apply(lambda ts: ts.year) == 2014).astype('int')

features['2014'] = {
    'train': feature_2014(train_df),
    'test': feature_2014(test_df),
}

In [120]:
def prepare_set(cols, is_test=False):
    return hstack([
        X_test if is_test else X_train, 
        *[features[col]['test' if is_test else 'train'].values.reshape(-1, 1) for col in cols]]
    )

In [121]:
columns = (
        'hour', # 3.41
        'morning',  # 4.58
        'day',  # 4.61
        'evening',  # 0.15
        'night',  # -
#         'duration',  # 0.06
#         'sites',  # -
#         'avg_time',  # -
#         'first_time',  # 0
#         '2014',  # 0.1
    )

X_train_new = prepare_set(columns)
X_test_new = prepare_set(columns, True)

In [122]:
%%time
# performing time series cross-validation, we see an improvement in ROC AUC
# hangs with n_jobs > 1, and locally this runs much faster
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 1.39 s, sys: 220 ms, total: 1.61 s
Wall time: 24.9 s


In [123]:
cv_scores, cv_scores.mean(), cv_scores.mean() - 0.8689298500287295
# 0.9177585931529093

(array([0.83099787, 0.72182327, 0.95343563, 0.98253007, 0.90059535,
        0.95449905, 0.96034331, 0.93415212, 0.96440056, 0.95672132]),
 0.9159498560297443,
 0.04702000600101486)

In [124]:
#####################################################

In [125]:
logit.fit(X_train_new, y_train)

logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

In [126]:
# tune regularization parameter C
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(
    estimator=logit, param_grid={'C': c_values}, scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1
)

In [127]:
%%time
logit_grid_searcher.fit(X_train_new, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  7.0min finished


CPU times: user 24min 26s, sys: 3min 45s, total: 28min 12s
Wall time: 7min 6s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [128]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9198696582897723, {'C': 0.21544346900318834})

In [129]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242

In [130]:
%%time
# performing time series cross-validation, we see an improvement in ROC AUC
cv_scores = cross_val_score(logit_grid_searcher.best_estimator_, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 1.01 s, sys: 104 ms, total: 1.12 s
Wall time: 12.5 s


In [131]:
cv_scores, cv_scores.mean() # 0.9177259216878799

(array([0.85944617, 0.72962129, 0.94407697, 0.98029395, 0.91054982,
        0.95706374, 0.95547239, 0.93444745, 0.9675228 , 0.96020201]),
 0.9198696582897723)

In [132]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm4.csv') # 0.94278