In [1]:
%%time
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

Wall time: 45.9 s


In [None]:
#https://www.kaggle.com/kashnitsky/correct-time-aware-cross-validation-scheme

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
%%time
times = ['time%s' % i for i in range(1, 11)]
train_df = pd.read_csv('alice\\train_sessions.csv',
                       index_col='session_id', parse_dates=times)
test_df = pd.read_csv('alice\\test_sessions.csv',
                      index_col='session_id', parse_dates=times)
train_df = train_df.sort_values(by='time1')
# Look at the first rows of the training set
train_df.head()

Wall time: 43.3 s


In [4]:
#Transform data into format which can be fed into CountVectorizer
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [5]:
%%time
#Fit CountVectorizer and trasfrom data with it

cv = CountVectorizer(ngram_range=(1, 3), max_features=60000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

Wall time: 48.4 s


In [6]:
#Save train targets into a separate vector.
y_train = train_df['target'].astype('int').values

In [7]:
#We'll be performing time series cross-validation, see sklearn TimeSeriesSplit and this dicussion on StackOverflow
time_split = TimeSeriesSplit(n_splits=10)

In [8]:
#
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [9]:
#Perform time series cross-validation with logistic regression
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [10]:
#Now we'll add some time features: indicators of morning, day, evening and night.

def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [11]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

Wall time: 7min 25s


In [14]:
X_train_new.shape, X_test_new.shape

((253561, 60004), (82797, 60004))

In [16]:
%%time
#add start_month feature
from sklearn.preprocessing import StandardScaler
start_month_train = train_df['time1'].apply(lambda ts:100 * ts.year + ts.month).astype('float64')
start_month_train = StandardScaler().fit_transform(start_month_train.values.reshape(-1,1))
X_train_mod = hstack([X_train_new, start_month_train])
start_month_test = test_df['time1'].apply(lambda ts:100 * ts.year + ts.month).astype('float64')
start_month_test = StandardScaler().fit_transform(start_month_test.values.reshape(-1,1))
X_test_mod = hstack([X_test_new, start_month_test])

Wall time: 6.21 s


In [17]:
%%time
#full_df['duration'] = full_df[times].max(axis=1)-full_df[times].min(axis=1)
#full_df['duration']=full_df['duration'].apply(lambda ts:ts.seconds).astype('float64')
duration_train = train_df[times].max(axis=1)-train_df[times].min(axis=1)
duration_test = test_df[times].max(axis=1)-test_df[times].min(axis=1)
duration_train = duration_train.apply(lambda ts:ts.seconds).astype('float64')
duration_test = duration_test.apply(lambda ts:ts.seconds).astype('float64')
duration_train

Wall time: 11.7 s


In [18]:
type(duration_train) #= StandardScaler().fit_transform(duration_train.values.reshape(-1,1))
duration_train = StandardScaler().fit_transform(duration_train.values.reshape(-1,1))
duration_test = StandardScaler().fit_transform(duration_test.values.reshape(-1,1))

In [23]:
online_day_train = train_df['time1'].apply(lambda ts: ts.dayofweek in [0,1,3,4]).astype('float64')
online_day_test = test_df['time1'].apply(lambda ts: ts.dayofweek in [0,1,3,4]).astype('float64')
online_day_train = online_day_train.values.reshape(-1,1)
online_day_test = online_day_test.values.reshape(-1,1)
online_day_test

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [0.]])

In [24]:
X_train_mod = hstack([X_train_new, start_month_train, duration_train, online_day_train])
X_test_mod = hstack([X_test_new, start_month_test, duration_test, online_day_test])
X_train_mod.shape, X_test_mod.shape

((253561, 60007), (82797, 60007))

In [25]:
%%time
#Performing time series cross-validation, we see an improvement in ROC AUC.
cv_scores = cross_val_score(logit, X_train_mod, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 2min 24s


In [26]:
cv_scores, cv_scores.mean()

(array([0.90831399, 0.81943585, 0.89776853, 0.98654856, 0.91712   ,
        0.95985   , 0.95888751, 0.94545225, 0.92154321, 0.94952074]),
 0.92644406552483)

In [27]:
%%time
logit.fit(X_train_mod, y_train)

Wall time: 1min 15s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
logit_test_pred2 = logit.predict_proba(X_test_mod)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.94187

In [29]:
#Now we tune regularization parameter C.
c_values = np.logspace(-2, 2, 10)
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [30]:
%%time
logit_grid_searcher.fit(X_train_mod, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 33.2min finished


Wall time: 34min 4s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [31]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9287994489003171, {'C': 0.21544346900318834})

In [32]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_mod)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94589