In [70]:
# import libraries
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# a helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


**Read training and test sets, sort train set by session start time.**

In [255]:
train_df = pd.read_csv('~/Downloads/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('~/Downloads/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')
test_df = test_df.sort_values(by='time1')
# Look at the first rows of the training set


**Transform data into format which can be fed into `CountVectorizer`**

In [6]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [208]:
sep_bound = int(np.ceil(train_df.shape[0] * 0.8))

train_df.iloc[range(sep_bound)][sites]\
        .fillna(0).astype('int')\
        .to_csv('inner_train_sessions_text.txt', sep=' ', index=None, header=None)

train_df.iloc[range(sep_bound, train_df.shape[0])][sites]\
        .fillna(0).astype('int')\
        .to_csv('inner_test_sessions_text.txt', sep=' ', index=None, header=None)


In [205]:
train_df.iloc[range(sep_bound)][sites].shape,\
train_df.iloc[range(sep_bound, train_df.shape[0])][sites].shape, \
train_df[sites].shape

((177493, 10), (76068, 10), (253561, 10))

In [203]:
len(np.unique(np.concatenate([train_df['site' + str(i)].values for i in range(1, 11)])))

41602

In [7]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [91]:
!head -5 inner_test_sessions_text.txt

953 953 953 953 953 953 953 953 953 953
953 953 953 953 953 953 953 953 953 953
953 953 953 953 953 953 953 953 953 953
953 953 953 953 953 953 953 953 953 953
953 953 953 953 953 953 953 953 953 953


**Fit `CountVectorizer` and trasfrom data with it.**

In [210]:
%%time
cv = CountVectorizer()
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
CPU times: user 3.44 s, sys: 138 ms, total: 3.57 s
Wall time: 3.67 s


In [211]:
with open('inner_train_sessions_text.txt') as inp_inner_train_file:
    X_inner_train = cv.transform(inp_inner_train_file)
with open('inner_test_sessions_text.txt') as inp_inner_test_file:
    X_inner_test = cv.transform(inp_inner_test_file)
print(X_inner_train.shape, X_inner_test.shape)

(202849, 41592) (50712, 41592)


## I separeated train data set into inner_train and inner_test (inner, because it is used only in my code)

**Save train targets into a separate vector.**

In [9]:
y_train = train_df['target'].astype('int')

In [96]:
y_inner_train = train_df.iloc[range(sep_bound)]['target'].astype('int')
y_inner_test = train_df.iloc[range(sep_bound, train_df.shape[0])]['target'].astype('int')

### train Logistic regression

In [10]:
logit = LogisticRegression(C=1, random_state=17)

In [97]:
%%time
cv_scores = cross_val_score(logit, X_inner_train, y_inner_train, cv=5, scoring='roc_auc')

CPU times: user 22.6 s, sys: 448 ms, total: 23 s
Wall time: 9.01 s


In [98]:
cv_scores

array([0.92487708, 0.88103578, 0.76814638, 0.90003107, 0.86026132])

In [99]:
cv_scores.mean()

np.float64(0.8668703288172275)

In [14]:
%%time
logit.fit(X_train, y_train)

CPU times: user 5.62 s, sys: 87.5 ms, total: 5.71 s
Wall time: 2.21 s


In [15]:
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]

In [16]:
test_pred_logit1

array([3.49637074e-03, 4.94332375e-11, 5.66205296e-12, ...,
       1.00328833e-02, 4.05119249e-04, 1.37432056e-06], shape=(82797,))

In [18]:
## CV 0.885
write_to_submission_file(test_pred_logit1, 'logit_subm1.csv') ## .908 ROC AUC Public LB

### Time Features
 - hour when the session started
 - morning 
 - day
 - eve
 - night

In [19]:
def add_time_features(time1_series, X_sparse):
    hour = time1_series.apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [20]:
test_df.loc[:, 'time1'].fillna(0).apply(lambda ts: ts.hour).head()

session_id
1    11
2    11
3    15
4    10
5    15
Name: time1, dtype: int64

In [21]:
%%time
X_train_with_time = add_time_features(train_df['time1'].fillna(0), X_train)
X_test_with_time = add_time_features(test_df['time1'].fillna(0), X_test)

CPU times: user 1.05 s, sys: 133 ms, total: 1.18 s
Wall time: 1.31 s


In [212]:
%%time
time_feature = train_df['time1'].fillna(0)
X_inner_train_with_time = add_time_features(time_feature.iloc[range(sep_bound)], X_inner_train)
X_inner_test_with_time = add_time_features(time_feature.iloc[range(sep_bound, train_df.shape[0])], X_inner_test)


CPU times: user 1.82 s, sys: 1.13 s, total: 2.95 s
Wall time: 3.37 s


In [213]:
X_train_with_time.shape, X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [214]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=5, scoring='roc_auc')

CPU times: user 25.8 s, sys: 423 ms, total: 26.2 s
Wall time: 9.39 s


In [216]:
cv_scores.mean()

np.float64(0.9306802415000124)

In [26]:
%%time
logit.fit(X_train_with_time, y_train)

CPU times: user 6.22 s, sys: 86.9 ms, total: 6.3 s
Wall time: 2.32 s


In [27]:
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:, 1]

In [28]:
test_pred_logit2

array([9.48623794e-05, 1.51907103e-08, 1.52101138e-10, ...,
       3.19570780e-04, 1.19102031e-05, 9.30722765e-08], shape=(82797,))

In [30]:
## CV .9307
write_to_submission_file(test_pred_logit2, 'logit_subm2.csv') ## ROC AUC 0.93565 Public LB

## Tuning Model by myself

In [49]:
import asyncio
from telegram import Bot

BOT_TOKEN = "8001454425:AAFmtQcTsAmDV9uOPJbwnJDLGAi21LCLrmY"
CHAT_ID = 7620452856

async def send_message(text='✅ Model training has finished!'):
    bot = Bot(token=BOT_TOKEN)
    await bot.send_message(chat_id=CHAT_ID, text=text)


In [122]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

In [229]:
params = {'C': np.linspace(1, 3, 10)}
logit = LogisticRegression(penalty='l2')

In [230]:
logit_search = GridSearchCV(logit, params, n_jobs=-1, cv=5, scoring='roc_auc')

In [231]:
%time logit_search.fit(X_inner_train_with_time, y_inner_train)

CPU times: user 8.18 s, sys: 756 ms, total: 8.94 s
Wall time: 1min 4s


In [124]:
# await send_message(f'best params: {locally_best_logit2.best_params_}')

In [232]:
locally_best_logit2 = logit_search.best_estimator_

In [235]:
locally_best_logit2.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=np.float64(1.2222222222222223))>

In [236]:
locally_best_logit2.fit(X_inner_train_with_time, y_inner_train)
# logit.fit(X_inner_train_with_time, y_inner_train)

In [237]:
%%time
# cv_scores = cross_val_score(logit, X_inner_test_with_time, y_inner_test, cv=5, scoring='roc_auc')
cv_scores = cross_val_score(locally_best_logit2, X_inner_test_with_time, y_inner_test, cv=5, scoring='roc_auc')

CPU times: user 5.33 s, sys: 136 ms, total: 5.47 s
Wall time: 2.16 s


In [238]:
cv_scores.mean()

np.float64(0.9434085343845983)

In [239]:
test_pred_logit2 = locally_best_logit2.predict_proba(X_test_with_time)[:, 1]

In [240]:
test_pred_logit2

array([1.17771192e-04, 1.74380510e-08, 7.09872224e-10, ...,
       3.07854786e-04, 1.30050795e-05, 9.09490920e-08], shape=(82797,))

In [241]:
## CV 0.94 (0.934 in Kaggle which is worse than with C=1)
write_to_submission_file(test_pred_logit2, 'logit_subm3.csv')

## Code chunks from kaggle

In [264]:
df_train = train_df.reset_index()

In [265]:
df_train.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
1,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
2,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
3,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
4,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [266]:
for j in range(1,11):
  if j == 1:
    df = df_train[['session_id','site{i}'.format(i=j),'target']]
    df.columns = ['session_id','site', 'target']
  if j<10:
    df_2 = df_train[['session_id','site{i}'.format(i=j+1), 'target']]
    df_2.columns = ['session_id','site', 'target']
    df = pd.concat([df, df_2], ignore_index=True)

In [267]:
df.head()

Unnamed: 0,session_id,site,target
0,21669,56.0,0
1,54843,56.0,0
2,77292,946.0,0
3,114021,945.0,0
4,146670,947.0,0


In [276]:
df_pivot = df.pivot_table(values='session_id', index='site', columns='target', aggfunc='count', fill_value=0).reset_index()
df_pivot.head()

target,site,0,1
0,1.0,2993,12
1,2.0,304,18
2,3.0,18001,129
3,4.0,1782,26
4,5.0,1321,0


In [281]:
df_pivot[df_pivot[0] == 0].count() 
## there are 190 sites in the training dataset which were visited by Alice only 
## (non of other useres in the dataset visited this sites)

target
site    190
0       190
1       190
dtype: int64