In [515]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

In [516]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file whose index runs from 1 to N.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row. Any array-like is accepted (ndarray, list,
        pandas Series). The input is converted with ``np.asarray`` so that a
        Series carrying its own index is written positionally instead of
        being re-aligned against the 1..N index.
    out_file : str or file-like
        Destination handed straight to ``DataFrame.to_csv``.
    target : str
        Name of the prediction column.
    index_label : str
        Header of the index column.
    """
    predicted_values = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_values,
                                index=np.arange(1, predicted_values.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [517]:
# Column names site1..site10 and time1..time10 used throughout the notebook.
sites = ['site{}'.format(slot) for slot in range(1, 11)]
times = ['time{}'.format(slot) for slot in range(1, 11)]

In [518]:
# Load both session tables, indexed by session_id, parsing the time1..time10
# columns as datetimes. The training rows are ordered by the time1 column.
train_df = pd.read_csv(
    '../../data/train_sessions.csv', index_col='session_id', parse_dates=times
).sort_values(by='time1')

test_df = pd.read_csv(
    '../../data/test_sessions.csv', index_col='session_id', parse_dates=times
)

In [519]:
# Write the site ids of every session as one space-separated line of integers
# (missing slots become 0), with neither index nor header. These files are
# read back by the TF-IDF vectorizer cell.
train_df.loc[:, sites].fillna(value=0).astype(int).to_csv(
    path_or_buf='train_sessions_text.txt', sep=' ', index=None, header=None)
test_df.loc[:, sites].fillna(value=0).astype(int).to_csv(
    path_or_buf='test_sessions_text.txt', sep=' ', index=None, header=None)

In [520]:
#####################################################

In [521]:
%%time
cv = TfidfVectorizer(ngram_range=(1, 2), max_features=50000, max_df=0.80, min_df=2)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)
CPU times: user 6.31 s, sys: 195 ms, total: 6.51 s
Wall time: 6.51 s


In [522]:
# Target vector as a plain integer array, in the same row order as train_df.
y_train = train_df['target'].values.astype('int')

In [523]:
# Time-ordered splitter with 10 splits. Nothing is evaluated here; the object
# is passed as the `cv` argument of the cross-validation and grid-search calls.
time_split = TimeSeriesSplit(n_splits=10)

In [524]:
# Logistic regression model (liblinear solver, C=1, fixed random_state).
# This cell only constructs the estimator; it is cross-validated and fitted in later cells.
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [525]:
%%time
# Time-series cross-validation of the TF-IDF-only model, scored by ROC AUC.
# NOTE(review): the original note here read "hangs with n_jobs > 1, and locally
# this runs much faster", yet n_jobs=-1 is what is actually passed — confirm which is intended.
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 674 ms, sys: 101 ms, total: 775 ms
Wall time: 4.71 s


In [526]:
# Per-fold ROC AUC scores, followed by their mean and standard deviation.
cv_scores, cv_scores.mean(), cv_scores.std()

(array([0.8002963 , 0.66032929, 0.87432248, 0.93674938, 0.84713484,
        0.8883615 , 0.92449344, 0.87080113, 0.93027344, 0.92019323]),
 0.8652955040273607,
 0.07940757947938765)

In [527]:
#####################################################

In [528]:
# Fit the model on the full TF-IDF training matrix before predicting on the test set.
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [529]:
# Probability of the positive class (second column of predict_proba) for every test session.
logit_test_pred = logit.predict_proba(X_test)[:, 1]
# The trailing numbers are presumably scores obtained for this submission — TODO confirm.
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288 -> 0.91382

### Features

In [530]:
# Registry of hand-crafted features: features[name] -> {'train': ..., 'test': ...}.
features = {}

In [532]:
%time
# start_hour
def feature_start_hour(df):
    """Return the hour (0-23) of each row's ``time1`` timestamp as int64.

    A missing ``time1`` maps to hour 0. Only the ``time1`` column is touched,
    so the rest of the frame is never copied or filled.
    """
    return df['time1'].dt.hour.fillna(0).astype('int64')

# Raw start hour for both splits.
features['start_hour'] = {
    split: feature_start_hour(frame)
    for split, frame in (('train', train_df), ('test', test_df))
}

# The same hours divided by 23, the largest hour value.
features['start_hour_scale'] = {
    split: hours / 23
    for split, hours in features['start_hour'].items()
}

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs


In [533]:
%time
# morning
def feature_morning(series):
    """Flag (1/0) hours in the inclusive range 7..11."""
    in_morning = series.between(7, 11)
    return in_morning.astype('int')

# Morning flag for both splits, derived from the stored start hours.
features['morning'] = {
    split: feature_morning(features['start_hour'][split])
    for split in ('train', 'test')
}

# day
def feature_day(series):
    """Flag (1/0) hours in the inclusive range 12..18."""
    in_daytime = series.between(12, 18)
    return in_daytime.astype('int')

# Daytime flag for both splits, derived from the stored start hours.
features['day'] = {
    split: feature_day(features['start_hour'][split])
    for split in ('train', 'test')
}

# evening
def feature_evening(series):
    """Flag (1/0) hours in the inclusive range 19..23."""
    in_evening = series.between(19, 23)
    return in_evening.astype('int')

# Evening flag for both splits, derived from the stored start hours.
features['evening'] = {
    split: feature_evening(features['start_hour'][split])
    for split in ('train', 'test')
}

# night
def feature_night(series):
    """Flag (1/0) hours in the inclusive range 0..6."""
    in_night = series.between(0, 6)
    return in_night.astype('int')

# Night flag for both splits, derived from the stored start hours.
features['night'] = {
    split: feature_night(features['start_hour'][split])
    for split in ('train', 'test')
}

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [534]:
# duration
def feature_duration(df, unit_seconds=1800):
    """Return each row's time span in multiples of ``unit_seconds``.

    The span is the latest minus the earliest non-missing timestamp among the
    ``times`` columns, converted to seconds and divided by ``unit_seconds``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``times`` columns as datetimes.
    unit_seconds : int or float
        Length of one unit in seconds; the default 1800 expresses the span
        in half-hours.
    """
    session_times = df[times]
    span = session_times.max(axis=1) - session_times.min(axis=1)
    return span / np.timedelta64(1, 's') / unit_seconds

# Session duration for both splits.
features['duration'] = {
    split: feature_duration(frame)
    for split, frame in (('train', train_df), ('test', test_df))
}

In [535]:
# # sites
# def feature_sites(df):
#     return df[sites].count(axis=1)

# features['sites'] = {
#     'train': feature_sites(train_df),
#     'test': feature_sites(test_df),
# }

In [536]:
# avg_time
def feature_avg_time(df):
    """Return the median gap, in seconds, between neighbouring ``times`` columns.

    Despite the feature name, the statistic is the *median* (not the mean) of
    the differences between consecutive ``times`` columns of each row.
    Timestamps are truncated to whole seconds before differencing, and rows
    with no computable gap yield NaN.
    """
    epoch = pd.Timestamp(0)
    # Convert one whole column at a time to epoch seconds (NaT becomes NaN)
    # instead of calling a Python lambda on every single cell.
    epoch_seconds = df[times].apply(
        lambda col: np.trunc((pd.to_datetime(col) - epoch).dt.total_seconds())
    )
    return epoch_seconds.diff(axis=1).median(axis=1, skipna=True)

# Median gap for both splits; rows without any gap (NaN) are set to 0.
features['avg_time'] = {
    split: feature_avg_time(frame).fillna(0)
    for split, frame in (('train', train_df), ('test', test_df))
}


  r = func(a, **kwargs)


In [537]:
# # first_time
# def feature_first_time(df):
#     return (df['time2'] - df['time1']).fillna(0) / np.timedelta64(1, 's')

# features['first_time'] = {
#     'train': feature_first_time(train_df),
#     'test': feature_first_time(test_df),
# }

In [538]:
# 2014
def feature_2014(df):
    """Flag (1/0) rows whose ``time1`` timestamp falls in the year 2014."""
    start_year = df['time1'].dt.year
    return (start_year == 2014).astype('int')

# Year-2014 flag for both splits.
features['2014'] = {
    split: feature_2014(frame)
    for split, frame in (('train', train_df), ('test', test_df))
}

In [539]:
%time
# end_hour
def feature_end_hour(df):
    """Return the hour of the latest non-missing timestamp among the ``times`` columns."""
    last_seen = df[times].max(axis=1)
    return pd.to_datetime(last_seen).dt.hour

# End hour for both splits.
features['end_hour'] = {
    split: feature_end_hour(frame)
    for split, frame in (('train', train_df), ('test', test_df))
}

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [540]:
# One-hot encode the start and end hours.
# Each split is encoded separately, so an hour that occurs in only one of them
# would otherwise give train and test different (or differently ordered)
# columns. Re-indexing onto the fixed hour range 0..23 guarantees that column
# k means "hour k" in both matrices; hours never observed become all-zero columns.
features['hours_start'] = {
    split: pd.get_dummies(hours).reindex(columns=list(range(24)), fill_value=0)
    for split, hours in features['start_hour'].items()
}

features['hours_end'] = {
    split: pd.get_dummies(hours).reindex(columns=list(range(24)), fill_value=0)
    for split, hours in features['end_hour'].items()
}

In [541]:
%time
# day_of_week
def feature_day_of_week(df):
    """Return the weekday of ``time1`` for each row (Monday=0 ... Sunday=6)."""
    return df['time1'].dt.weekday

# Weekday (0..6) divided by 7, for both splits.
features['day_of_week'] = {
    split: feature_day_of_week(frame) / 7
    for split, frame in (('train', train_df), ('test', test_df))
}

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


In [542]:
%time
# month
def feature_month(df):
    """Return the calendar month (1..12) of ``time1`` for each row."""
    return df['time1'].dt.month

# Month (1..12) divided by 12, for both splits.
features['month'] = {
    split: feature_month(frame) / 12
    for split, frame in (('train', train_df), ('test', test_df))
}

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


In [543]:
# return
def feature_return(df):
    """Count repeat visits per row: non-missing ``sites`` values minus distinct ones."""
    visited = df[sites]
    n_visits = visited.notnull().sum(axis=1)
    n_distinct = visited.nunique(axis=1)
    return n_visits - n_distinct

# NOTE: the bare `%time` that used to sit here was removed. As a line magic
# with nothing after it, it timed an empty statement, not the code below.
features['return'] = {
    split: feature_return(frame)
    for split, frame in (('train', train_df), ('test', test_df))
}

# Scale BOTH splits by the training maximum so that a given raw count maps to
# the same feature value in train and test. Previously the test split was
# divided by its own maximum, which puts the two splits on different scales
# whenever their maxima differ.
features['return'] = {
    split: counts / features['return']['train'].max()
    for split, counts in features['return'].items()
}

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [544]:
def prepare_set(cols, is_test=False):
    """Stack the TF-IDF matrix with the named features as extra columns.

    ``cols`` is an iterable of keys into ``features``; each one contributes a
    single column, appended in the given order after the TF-IDF columns.
    ``is_test`` selects ``X_test`` and the 'test' entries instead of
    ``X_train`` and the 'train' entries.
    """
    split = 'test' if is_test else 'train'
    base_matrix = X_test if is_test else X_train
    extra_columns = [features[col][split].values.reshape(-1, 1) for col in cols]
    return hstack([base_matrix] + extra_columns)

In [545]:
# Single-column features appended after the TF-IDF block, in this order.
# Commented-out names are features that are currently switched off.
# NOTE(review): the numbers in the trailing comments look like notes from
# earlier experiments; their meaning is not recorded here — TODO confirm.
# NOTE(review): 'start_hour' (raw hours) and 'avg_time' (raw seconds) enter
# unscaled next to TF-IDF values — confirm this is intended.
columns = (
        'start_hour', # 3.41
#         'end_hour',
        'morning',  # 4.58
        'day',  # 4.61
        'evening',  # 0.15
#         'night',  # -
        'duration',  # 0.06
#         'sites',  # -
        'avg_time',  # -
#         'first_time',  # 0
        '2014',  # 0.1
        'day_of_week',
#         'month',
        'return',
    )

# Final design matrices: TF-IDF columns, then the features listed in
# `columns`, then the start-hour and end-hour dummy columns.
X_train_new = hstack([
    prepare_set(columns),
    features['hours_start']['train'],
    features['hours_end']['train'],
])
X_test_new = hstack([
    prepare_set(columns, True),
    features['hours_start']['test'],
    features['hours_end']['test'],
])

# X_train_new = np.hstack([
#     *[features[col]['train'].values.reshape(-1, 1) for col in columns],
#     features['hours_start']['train'],
#     features['hours_end']['train'],
# ])

In [546]:
%%time
# Time-series cross-validation of the model on the extended feature matrix, scored by ROC AUC.
# NOTE(review): the original note here read "hangs with n_jobs > 1, and locally
# this runs much faster", yet n_jobs=-1 is what is actually passed — confirm which is intended.
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 1.73 s, sys: 256 ms, total: 1.99 s
Wall time: 17.1 s


In [547]:
# Per-fold ROC AUC scores, followed by their mean and standard deviation.
cv_scores, cv_scores.mean(), cv_scores.std()
# The commented values below are presumably the result of an earlier run, kept for comparison — TODO confirm.
# (array([0.87008266, 0.80648914, 0.92857517, 0.96592745, 0.91598991,
#        0.9516962 , 0.94826252, 0.93866035, 0.95200296, 0.95071618]),
# 0.9228402542341649,
# 0.05391040420543547)

(array([0.66578703, 0.84412429, 0.96711052, 0.95650211, 0.92884522,
        0.97999949, 0.86594665, 0.96063525, 0.96848259, 0.98101456]),
 0.9118447710576161,
 0.09359091118748379)

In [548]:
#####################################################

In [549]:
# Refit on the full extended training matrix, then score the test sessions.
logit.fit(X_train_new, y_train)

# Probability of the positive class (second column of predict_proba).
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
# The trailing number is presumably the score obtained for this submission — TODO confirm.
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

In [553]:
# Candidate values for the regularization parameter C: 10 points spaced
# evenly on a log scale from 1e-2 to 1e2.
c_values = np.logspace(-2, 2, 10)

# Grid search over C using the time-ordered splits and ROC AUC scoring.
logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid={'C': c_values},
    scoring='roc_auc',
    cv=time_split,
    n_jobs=-1,
    verbose=1,
)

In [554]:
%%time
# Runs the full grid search (every C candidate on every split).
# Author's note: this took about 3min 30s when run locally.
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.5min finished


CPU times: user 46.9 s, sys: 6.28 s, total: 53.2 s
Wall time: 3min 41s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [555]:
# Best cross-validated ROC AUC and the C value that achieved it.
# The trailing comment presumably holds the values from an earlier run — TODO confirm.
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_  # (0.9249450807614793, {'C': 1.6681005372000592})

(0.9176980723756122, {'C': 12.915496650148826})

In [556]:
# Predict with the grid searcher (constructed with the default refit, so it
# holds the best estimator refitted on all training rows); keep the positive-class probability.
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv')

In [557]:
%%time
# Re-run the time-series cross-validation with the best estimator found by the
# grid search, so per-fold ROC AUC scores are available for inspection.
cv_scores = cross_val_score(logit_grid_searcher.best_estimator_, X_train_new, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)

CPU times: user 1.39 s, sys: 397 ms, total: 1.79 s
Wall time: 31.6 s


In [558]:
# Per-fold ROC AUC scores, their mean and standard deviation.
# The trailing comment presumably holds the mean and std of an earlier run — TODO confirm.
cv_scores, cv_scores.mean(), cv_scores.std() # 0.9249450807614791, 0.04601868425363776

(array([0.66252892, 0.83522061, 0.97240668, 0.96250271, 0.93606407,
        0.97642791, 0.91768815, 0.95889084, 0.97531643, 0.9799344 ]),
 0.9176980723756122,
 0.09465246923113832)

In [559]:
# NOTE(review): this recomputes the same predictions as the earlier cell that
# writes 'subm3.csv'; only the output file name differs.
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
# The trailing number is presumably the score obtained for this submission — TODO confirm.
write_to_submission_file(logit_test_pred3, 'submission1.csv') # 0.94637

In [None]:
# Coefficients of the hand-crafted features only. The TF-IDF columns come first
# in X_train_new, so the slice starts at the TF-IDF width instead of a
# hard-coded 50000, which is only correct while the vocabulary reaches max_features.
logit_grid_searcher.best_estimator_.coef_[0][X_train.shape[1]:]