In [5]:
# Import libraries and set desired options

from __future__ import division, print_function
# Disable Anaconda warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [6]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV file in submission format.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row, in test-set order. Lists, NumPy arrays and
        pandas objects are accepted; the values are taken positionally.
    out_file : str, path or writable text buffer
        Destination passed straight to ``DataFrame.to_csv``.
    target : str
        Name of the prediction column.
    index_label : str
        Header of the index column; the index itself runs 1..n.
    """
    # np.asarray makes plain lists work (they have no .shape) and drops any
    # pandas index, so a Series is matched by position instead of being
    # re-aligned against the new 1..n index.
    predicted_labels = np.asarray(predicted_labels)
    row_ids = np.arange(1, predicted_labels.shape[0] + 1)
    predicted_df = pd.DataFrame(predicted_labels,
                                index=row_ids,
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [7]:
# Load the pickled websites dictionary.
# NOTE: pickle.load can execute arbitrary code - only open pickles from a trusted source.
with open(r"data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# One row per website: dictionary values become the index, keys the 'site' column
sites_dict = pd.DataFrame({'site': list(site_dict.keys())},
                          index=list(site_dict.values()))

# 'zone' is whatever follows the last dot; purely numeric endings are IP addresses
last_part = sites_dict['site'].str.rsplit('.', n=1).str[-1]
sites_dict['zone'] = last_part.mask(last_part.str.isnumeric(), 'ip_address')
sites_dict['zone_le'] = LabelEncoder().fit_transform(sites_dict['zone'])

print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site,zone,zone_le
25075,www.abmecatronique.com,com,28
13997,groups.live.com,com,28
42436,majeureliguefootball.wordpress.com,com,28
30911,cdt46.media.tourinsoft.eu,eu,41
8104,www.hdwallpapers.eu,eu,41


In [8]:
# Load both session tables, indexed by session id
train_df = pd.read_csv('data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('data/test_sessions.csv', index_col='session_id')

# time1, ..., time10 are read as text; convert them to datetime in both frames
times = ['time{}'.format(i) for i in range(1, 11)]
for frame in (train_df, test_df):
    frame[times] = frame[times].apply(pd.to_datetime)

# Order the training sessions chronologically by their first timestamp
train_df = train_df.sort_values(by='time1')

# Peek at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [9]:
# site1, ..., site10: missing visits become 0 and the columns become integers
sites = ['site{}'.format(i) for i in range(1, 11)]
for frame in (train_df, test_df):
    frame[sites] = frame[sites].fillna(0).astype('int')

In [10]:
# Target vector, taken before the column is dropped below
y_train = train_df['target']

# Number of training rows: rows [0, idx_split) of full_df are train, the rest are test
idx_split = len(train_df)

# Training features stacked on top of the test set
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [11]:
# Dataframe with indices of visited websites in session
# (columns site1..site10 of the combined train+test frame; 0 marks a missing visit)
full_sites = full_df[sites]

full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [12]:
def get_time_diff(row):
    """Return the gaps, in seconds, between consecutive timestamps of one row.

    Parameters
    ----------
    row : array-like of datetimes
        Timestamps of a single session in visit order; missing visits are
        NaT/NaN.

    Returns
    -------
    list
        ``len(row) - 1`` entries. Entry ``i`` is ``row[i+1] - row[i]`` in
        seconds; from the first missing timestamp onwards the entries stay 0.
    """
    # Index a plain array: ``row[i]`` on a Series with string labels relies on
    # a deprecated positional fallback, and on a Series with integer labels it
    # is label-based, so it could pick the wrong element.
    values = row.to_numpy() if hasattr(row, 'to_numpy') else np.asarray(row)
    time_length = values.shape[0] - 1
    time_diff = [0] * time_length
    i = 0
    while i < time_length and pd.notnull(values[i + 1]):
        time_diff[i] = (values[i + 1] - values[i]) / np.timedelta64(1, 's')
        i += 1
    return time_diff

In [13]:
def get_time_features(df):
    """Build standardised time-based features for every session in ``df``.

    Relies on the notebook-level list ``times`` (names of the timestamp
    columns) in addition to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns listed in ``times``; ``time1`` is
        used as the session start.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns ``hour, day_, month, year, myear,
        minutes, sin_min, cos_min, sin_max, cos_max, dow, weekend, n_null, dt,
        dt_<time> (one per entry of times[1:]), dt_mean, dt_std, dt_var``.
        Every column is z-scored with a ``StandardScaler`` fitted on ``df``
        itself, and remaining NaN values are replaced by the column mean.
    """
    time_df = pd.DataFrame(index=df.index)

    start = df['time1']
    time_df['hour'] = start.dt.hour
    time_df['day_'] = start.dt.day
    time_df['month'] = start.dt.month
    time_df['year'] = start.dt.year
    time_df['myear'] = start.dt.year * 12 + start.dt.month

#     time_df['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
#     time_df['day'] = ((hour >= 12) & (hour <= 18)).astype('int')
#     time_df['evening'] = ((hour >= 19) & (hour <= 23)).astype('int')

    # First and last timestamp of the session (helper columns, dropped below)
    time_df['min'] = start
    time_df['max'] = df[times].max(axis=1)

    # Time of day encoded on the unit circle (1440 minutes per day).
    # NOTE: 'minutes' is overwritten on each pass, so the column that survives
    # holds the minute-of-day of 'max', not of 'min'.
    for px in ['min', 'max']:
        time_df['minutes'] = time_df[px].dt.hour * 60 + time_df[px].dt.minute
        time_df['sin_' + px] = np.sin(2 * np.pi * time_df['minutes'] / 1440.)
        time_df['cos_' + px] = np.cos(2 * np.pi * time_df['minutes'] / 1440.)

    time_df['dow'] = time_df['min'].dt.dayofweek  # Monday == 0
    time_df['weekend'] = (time_df['dow'] > 4).astype('int')
    time_df['n_null'] = df[times].isnull().sum(axis=1)

    # Durations as log(1 + seconds). .dt.total_seconds() is used because
    # astype('timedelta64[s]') keeps a timedelta dtype on recent pandas, and
    # np.log1p cannot be applied to timedeltas.
    session_span = time_df['max'] - time_df['min']
    time_df['dt'] = np.log1p(session_span.dt.total_seconds().abs())

    dt_columns = []
    for time in times[1:]:
        # Missing visits get the full session length as their offset
        elapsed = (df[time] - time_df['min']).fillna(session_span)
        time_df['dt_' + time] = np.log1p(elapsed.dt.total_seconds().abs())
        dt_columns.append('dt_' + time)

    time_df['dt_mean'] = time_df[dt_columns].mean(axis=1)
    time_df['dt_std'] = time_df[dt_columns].std(axis=1)
    time_df['dt_var'] = time_df[dt_columns].var(axis=1)

    # Everything except the two raw datetime helpers is numeric and gets scaled
    time_df = time_df.drop(['min', 'max'], axis=1)
    s_scaler = StandardScaler()
    time_df = pd.DataFrame(s_scaler.fit_transform(time_df),
                           index=time_df.index, columns=time_df.columns)

    # Column-wise mean imputation of whatever is still missing
    return time_df.fillna(time_df.mean())

In [14]:
full_time = get_time_features(full_df[times])

# Base time features kept from get_time_features
ft_columns = ['dow', 'weekend', 'day_', 'month', 'sin_min', 'cos_min', 'dt', 'dt_std', 'dt_mean', 'n_null',
              'sin_max', 'cos_max', 'hour', 'year', 'minutes', 'myear']
full_time = full_time[ft_columns]

# One-hot encode the (standardised) start hour cut into 4 equal-width bins.
# dtype=float keeps full_time.values numeric: recent pandas returns boolean
# dummies by default, and mixing them with the float columns would produce an
# object array.
hours_dum = pd.get_dummies(pd.cut(full_time['hour'], bins=4), dtype=float)
hours_dum.columns = ['bin%s' % i for i in range(1, hours_dum.shape[1] + 1)]

full_time = pd.concat([full_time, hours_dum], axis=1)

# Keep the name list aligned with the columns of full_time: later cells build
# boolean column masks from ft_columns and apply them to the array below.
ft_columns = list(full_time.columns)

print(full_time.agg(['min', 'mean', 'max']))
full_time = full_time.values

               dow       weekend          day_         month       sin_min  \
min  -1.425540e+00 -3.842226e-01 -1.566362e+00 -1.150793e+00 -1.459027e+00   
mean -4.028597e-14  2.643474e-13  1.188026e-13 -3.089523e-13 -8.181688e-15   
max   2.057594e+00  2.602658e+00  1.695464e+00  1.740832e+00  1.690438e+00   

           cos_min            dt        dt_std       dt_mean        n_null  \
min  -8.437691e-01 -1.920105e+00 -1.399868e+00 -1.609976e+00 -2.824378e-01   
mean -2.089893e-15 -9.427659e-15 -2.642982e-15 -2.419324e-15  3.250062e-14   
max   4.849943e+00  2.268596e+00  4.993226e+00  3.111933e+00  4.802319e+00   

           sin_max       cos_max          hour          year       minutes  \
min  -1.451753e+00 -8.401770e-01 -1.700055e+00 -1.871975e+00 -1.655692e+00   
mean -2.703839e-15  1.714818e-15  1.160608e-14 -3.004657e-12  1.552238e-15   
max   1.696008e+00  4.799861e+00  3.392265e+00  5.341951e-01  3.539841e+00   

             myear  (-1.705, -0.427]  (-0.427, 0.846]  (0.846

## TF-IDF

In [31]:
# Translate site ids to site names; ids without a dictionary entry become ''
site_names = sites_dict['site']
full_sites_tf = full_sites.apply(lambda ids: ids.map(site_names)).fillna('')

# One space-separated "document" per session, skipping the empty slots
df_tf_col = full_sites_tf.apply(lambda names: ' '.join(filter(None, names)), axis=1)

In [32]:
# TF-IDF over the per-session site-name strings, fitted on train and test together
# (df_tf_col is built from full_sites, which covers both sets)
vect = TfidfVectorizer(max_df=.7, sublinear_tf=True)
df_tf = vect.fit_transform(df_tf_col)

#### Split

In [33]:
# Forward-chaining cross-validation over the time-sorted training sessions
time_split = TimeSeriesSplit(n_splits=10)

# Plain K-fold splitter for the cells below that pass cv=kfold_split.
# NOTE(review): no definition of kfold_split survives in this notebook, so those
# cells raise NameError on a fresh kernel. n_splits=8 is inferred from the
# "8 out of 8" joblib log of the LogisticRegressionCV fit - confirm the fold
# count and whether shuffling was used.
kfold_split = KFold(n_splits=8)

In [34]:
# X_train = df_tf[:idx_split,:]
# X_test = df_tf[idx_split:,:]


# Design matrices: TF-IDF site features side by side with the time features.
# Rows before idx_split are the training sessions, the remaining rows the test sessions.
X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])

#### GridSearch

In [101]:
# Candidate inverse regularisation strengths for the search
c_values = [0.01, 0.03]
# cw_values = [{0: 0.6, 1: 0.4}, {0: 0.9, 1: 0.1}, {0: 0.8, 1: 0.2}, {0: 0.7, 1: 0.3} , {0: 0.3, 1: 0.7}]

# Cross-validated logistic regression scored by ROC AUC.
# NOTE(review): kfold_split must already exist in the kernel when this cell runs.
lrcv = LogisticRegressionCV(Cs=c_values, scoring='roc_auc', n_jobs=-1, cv=kfold_split,
                            verbose=1, class_weight='balanced', max_iter=3000)

In [102]:
%%time
lrcv.fit(X_train, y_train);
lrcv.score

[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  8.3min remaining: 24.9min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  8.6min finished


CPU times: user 3min 56s, sys: 2min 36s, total: 6min 33s
Wall time: 10min 39s


In [103]:
# Cross-validation scores stored under class label 1.
# The code below treats axis 0 as folds and axis 1 as the candidate C values.
cvr = lrcv.scores_[1]
# Position of the C with the best mean score (not used further in this cell)
idx = cvr.mean(axis=0).argmax()
for i in range(cvr.shape[1]):
    print(f"ROC_AUC [{lrcv.Cs_[i]:>4}]: {cvr[:, i].mean():.4f}+-{cvr[:, i].std():.4f}")
print(f"Best params: {lrcv.C_}")

ROC_AUC [0.01]: 0.9574+-0.0141
ROC_AUC [0.03]: 0.9587+-0.0136
Best params: [0.03]


ROC_AUC [0.01]: 0.9574+-0.0141
ROC_AUC [0.03]: 0.9587+-0.0136
ROC_AUC [ 0.1]: 0.9586+-0.0127
ROC_AUC [ 0.3]: 0.9578+-0.0123
ROC_AUC [ 1.0]: 0.9559+-0.0129
ROC_AUC [ 3.0]: 0.9531+-0.0142
ROC_AUC [10.0]: 0.9488+-0.0167
Best params: [0.1]

#### Evaluating

In [39]:
# Baseline design matrices: TF-IDF features only, without the time features
X_train = df_tf[:idx_split,:]
X_test = df_tf[idx_split:,:]

# X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
# X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])

In [40]:
# Logistic regression settings shared by the evaluation cells below
params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

logit = LogisticRegression(**params)

In [41]:
%%time

# ROC AUC on the time-ordered splits; n_jobs=-1 evaluates the folds in parallel.
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1) # NOTE(review): an earlier note here said this hangs with n_jobs > 1, yet n_jobs=-1 is what is passed - confirm which is intended

CPU times: user 791 ms, sys: 159 ms, total: 950 ms
Wall time: 8.32 s


In [42]:
# Mean and standard deviation of the per-fold ROC AUC values in cv_scores
print(f"ROC_AUC: {cv_scores.mean():.4f}+-{cv_scores.std():.4f}")
# cv_scores

ROC_AUC: 0.8537+-0.0796


In [28]:
# Refit on the whole training matrix and write the test-set scores to a submission file.
# Column 1 of predict_proba is used (the positive class when the labels are 0/1).
logit.fit(X_train, y_train)
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm_not_raw.csv')

### Check columns importance

### Define class_weight

In [109]:
%%time
cv_scores_arr = list()

for cw in [{0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6}]:
    vect = TfidfVectorizer(max_df=0.9, sublinear_tf=True)
    df_tf = vect.fit_transform(df_tf_col)
    params = {'C': 0.1, 'class_weight': cw, 'random_state':17, 'n_jobs':1}
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    print(f"cw {cw}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f}")

cw {0: 0.2, 1: 0.8}: 0.9542+-0.0159
cw {0: 0.3, 1: 0.7}: 0.9537+-0.0162
cw {0: 0.4, 1: 0.6}: 0.9529+-0.0165
CPU times: user 3min 28s, sys: 22.5 s, total: 3min 51s
Wall time: 8min 35s


cw balanced  : 0.9586+-0.0127
cw {0: 0.9, 1: 0.1}: 0.9390+-0.0209
cw {0: 0.8, 1: 0.2}: 0.9451+-0.0188
cw {0: 0.7, 1: 0.3}: 0.9483+-0.0178
cw {0: 0.6, 1: 0.4}: 0.9504+-0.0172
cw {0: 0.5, 1: 0.5}: 0.9518+-0.0168
cw {0: 0.4, 1: 0.6}: 0.9529+-0.0165
cw {0: 0.3, 1: 0.7}: 0.9537+-0.0162
cw {0: 0.2, 1: 0.8}: 0.9542+-0.0159

### Define good tf-idf params

In [30]:
%%time
params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}
cv_scores_arr = list()

for i in [.5, .6, .7, .8, .9, 1]:
    vect = TfidfVectorizer(max_df=i, sublinear_tf=True)
    df_tf = vect.fit_transform(df_tf_col)
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    print(f"max_df {i:<6}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f}")

max_df 0.5   : 0.8736+-0.1068
max_df 0.6   : 0.8742+-0.1057
max_df 0.7   : 0.8742+-0.1057
max_df 0.8   : 0.8736+-0.1061
max_df 0.9   : 0.8738+-0.1067
max_df 1     : 0.8008+-0.0957
CPU times: user 59.3 s, sys: 6.51 s, total: 1min 5s
Wall time: 3min 36s


min_df 0.0001: 0.9581+-0.0129
min_df 0.001: 0.9560+-0.0120
min_df 0.01: 0.9403+-0.0132
min_df 0.1: 0.9110+-0.0209
min_df 0.3: 0.8979+-0.0200

max_df 0.50: 0.9582+-0.0125
max_df 0.60: 0.9583+-0.0126
max_df 0.70: 0.9583+-0.0126
max_df 0.80: 0.9583+-0.0126
max_df 0.85: 0.9583+-0.0126
max_df 0.90: 0.9586+-0.0127
max_df 0.95: 0.9586+-0.0127
max_df 0.98: 0.9586+-0.0127
max_df 1.00: 0.8968+-0.0200

### Define good hour split

In [72]:
# Time features to keep. Names that are not in ft_columns ('morning', 'day',
# 'evening' are not produced by get_time_features at the moment) simply match
# nothing in the isin() test below.
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year', 'morning', 'day', 'evening']

# The mask is sized by the real width of full_time. ft_columns may name only the
# leading columns (the hour-bin dummies are appended after them), and a shorter
# boolean mask would raise IndexError.
mask = np.zeros(full_time.shape[1], dtype='bool')
mask[:len(ft_columns)] = pd.Series(ft_columns).isin(good_cols).values

X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask]])
X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask]])

params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

logit = LogisticRegression(**params)

# Baseline scores that the hour-bin experiment in the next cell compares against
cv_scores = cross_val_score(logit, X_train, y_train, cv=kfold_split,
                            scoring='roc_auc', n_jobs=-1)

print(f"ROC_AUC: {cv_scores.mean():.4f}+-{cv_scores.std():.4f}")

ROC_AUC: 0.9530+-0.0141


In [80]:
%%time

good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year']
mask = pd.Series(ft_columns).isin(good_cols).values

mask_hours = pd.Series(ft_columns).isin(['hour']).values
hours = pd.Series(full_time[:, mask_hours].flatten())

n_cols = len(ft_columns)
cv_scores_arr = []

for i in range(2, 9):
    hours_dum = pd.get_dummies(pd.cut(hours, bins=i)).values
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask], hours_dum[:idx_split, :]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask], hours_dum[idx_split:, :]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    d_cv = cv_scores_ - cv_scores
    n_pos = (d_cv > 0).sum()
    if not(d_cv.mean() > 0 and n_pos > 4):
        print(f"Bins {i:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} EXCLUDE")
    else:
        cv_scores = cv_scores_.copy()
        print(f"Bins {i:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} ADD")

Bins 2         : 0.9465+-0.0194 EXCLUDE
Bins 3         : 0.9418+-0.0187 EXCLUDE
Bins 4         : 0.9583+-0.0126 ADD
Bins 5         : 0.9494+-0.0208 EXCLUDE
Bins 6         : 0.9429+-0.0222 EXCLUDE
Bins 7         : 0.9548+-0.0194 EXCLUDE
Bins 8         : 0.9592+-0.0168 ADD
CPU times: user 1min 29s, sys: 34.6 s, total: 2min 4s
Wall time: 20min 24s


### Delete columns one by one

In [23]:
# Names for all 20 columns of full_time: the 16 base time features followed by
# the four hour-bin dummy columns.
ft_columns = ['dow', 'weekend', 'day_', 'month', 'sin_min', 'cos_min', 'dt', 'dt_std', 'dt_mean', 'n_null',
              'sin_max', 'cos_max', 'hour', 'year', 'minutes', 'myear', 'bin1', 'bin2', 'bin3', 'bin4']  # full_time.columns

In [24]:
# Baseline mean score that the elimination loop in the next cell starts from
cv_scores.mean()

0.8735747277102843

In [25]:
%%time

# Backward elimination of time features, repeated 10 times with a different
# random column order. Each run starts from the scores in cv_scores (taken from
# the kernel state) and drops a column when the mean score improves and at least
# `boundary` folds improve. delete_dict counts how often each column was dropped.

params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

n_cols = len(ft_columns)
cv_scores_all = list()
delete_dict = {col:0 for col in ft_columns}
# 70% of the number of CV splits, rounded down
boundary = time_split.n_splits * 7 // 10

for iter_ in range(10):
    print(f"Iter {iter_}")
    idx_order = list(range(n_cols))
    # Seed with the run number so each run's column order is reproducible
    np.random.seed(iter_)
    np.random.shuffle(idx_order)
    cv_scores_n_ = cv_scores.copy()
    cv_scores_arr = list()
    mask = np.ones(n_cols, dtype='bool')
    
    for n_ in range(-n_cols, 0):
        i = idx_order[n_]
        # Tentatively remove column i; it is restored below if that does not help
        mask[i] = False
        X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask]])
        X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask]])
        logit = LogisticRegression(**params)
        cv_scores_ = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
        cv_scores_arr.append(cv_scores_)

        d_cv = cv_scores_ - cv_scores_n_
        # NOTE: despite its name, n_neg counts the folds whose score went UP
        n_neg = (d_cv > 0).sum()
        if d_cv.mean() > 0 and n_neg >= boundary:
            delete_dict[ft_columns[i]] += 1
            cv_scores_n_ = cv_scores_.copy()
            print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} EXCLUDE  ")
        else:
            mask[i] = True
            print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} OK  ")
    cv_scores_all.append(cv_scores_arr)
    print('*' * 30)

Iter 0
bin3      : 0.8723+-0.1060 OK  
weekend   : 0.8803+-0.1304 EXCLUDE  
bin4      : 0.8800+-0.1307 OK  
dt_mean   : 0.8804+-0.1302 EXCLUDE  
sin_max   : 0.8804+-0.1304 OK  
bin2      : 0.8813+-0.1266 OK  
dt        : 0.8802+-0.1307 OK  
year      : 0.8803+-0.1301 OK  
sin_min   : 0.8803+-0.1304 OK  
day_      : 0.8803+-0.1300 OK  
cos_min   : 0.8803+-0.1301 OK  
minutes   : 0.8801+-0.1309 OK  
n_null    : 0.8812+-0.1328 OK  
dt_std    : 0.8804+-0.1303 OK  
bin1      : 0.8804+-0.1304 OK  
cos_max   : 0.8803+-0.1304 OK  
month     : 0.8803+-0.1303 OK  
dow       : 0.8799+-0.1290 OK  
myear     : 0.8804+-0.1300 OK  
hour      : 0.8806+-0.1304 EXCLUDE  
******************************
Iter 1
month     : 0.8731+-0.1065 OK  
bin1      : 0.8744+-0.1054 OK  
dt        : 0.8736+-0.1061 OK  
sin_max   : 0.8736+-0.1062 OK  
day_      : 0.8744+-0.1061 OK  
minutes   : 0.8736+-0.1062 OK  
sin_min   : 0.8735+-0.1062 OK  
bin2      : 0.8748+-0.0999 OK  
dt_std    : 0.8736+-0.1061 OK  
weekend   : 

Process ForkPoolWorker-508:
Process ForkPoolWorker-507:
Traceback (most recent call last):


KeyboardInterrupt: 

### Add columns one by one

In [None]:
%%time

# Forward selection of time features: columns are switched on one at a time in
# ft_columns order and kept only when the mean score improves and more than half
# of the folds improve. cv_scores (taken from the kernel state) is the baseline
# and is replaced after every accepted column.

params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

n_cols = len(ft_columns)
cv_scores_arr = []
mask = np.zeros(n_cols, dtype='bool')
for i in range(-n_cols, 0):
    # Skip any column whose name contains 'dt_time'
    if 'dt_time' not in ft_columns[i]:
        mask[i] = True
        X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask]])
        X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask]])
        logit = LogisticRegression(**params)
        cv_scores_ = cross_val_score(logit, X_train, y_train, cv=time_split, scoring='roc_auc', n_jobs=-1)
        cv_scores_arr.append(cv_scores_)

        d_cv = cv_scores_ - cv_scores
        n_pos = (d_cv > 0).sum()
        if not(d_cv.mean() > 0 and n_pos > d_cv.shape[0]/2):
            # No reliable gain: switch the column back off
            mask[i] = False
            print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} EXCLUDE  ")
        else:
            cv_scores = cv_scores_.copy()
            print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} ADD  ")

dow       : 0.8544+-0.0874 EXCLUDE
weekend   : 0.8399+-0.0972 EXCLUDE
day_      : 0.8523+-0.0798 EXCLUDE
month     : 0.8552+-0.0807 ADD
sin_min   : 0.9028+-0.0783 ADD
cos_min   : 0.8915+-0.0949 EXCLUDE
dt        : 0.9019+-0.0794 EXCLUDE
dt_std    : 0.9028+-0.0788 EXCLUDE
dt_mean   : 0.9016+-0.0794 EXCLUDE
n_null    : 0.9020+-0.0756 EXCLUDE
morning   : 0.8923+-0.0966 EXCLUDE
day       : 0.9008+-0.0817 EXCLUDE
evening   : 0.9057+-0.0756 ADD
sin_max   : 0.9057+-0.0756 ADD
cos_max   : 0.8854+-0.1113 EXCLUDE
hour      : 0.9007+-0.0837 EXCLUDE
year      : 0.9099+-0.0767 ADD
minutes   : 0.9054+-0.0844 EXCLUDE

In [None]:
# Hand-picked time features.
# NOTE(review): 'morning', 'day' and 'evening' are commented out in
# get_time_features, so they are not columns of full_time at the moment.
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year', 'morning', 'day', 'evening']

TfidfVectorizer df_tf + time  max_df=.7 sublinear_tf=True
ROC_AUC: 0.9168+-0.0542
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time  max_df=.7
ROC_AUC: 0.9148+-0.0504
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time  max_df=.8
ROC_AUC: 0.9125+-0.0538
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time
ROC_AUC: 0.9115+-0.0546
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time (sites name full)
ROC_AUC: 0.9115+-0.0546
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf
ROC_AUC: 0.8575+-0.0753
Best params: {'C': 20, 'class_weight': {0: 0.6, 1: 0.4}}

CountVectorizer df_tf
ROC_AUC: 0.8351+-0.0763
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

#### To submit

In [38]:
# Score the test matrix and write the submission file.
# NOTE(review): logit_grid_searcher is not defined anywhere in this notebook, and
# X_test is whatever the most recently executed cell left behind - confirm both
# before running. The 'pred/' directory must already exist.
logit_test_pred = logit_grid_searcher.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'pred/a_sub_new_df_tfidf_drop_dub.csv')

# Let's look at the original data

User sessions are chosen so that they last no longer than half an hour and contain no more than ten websites. I.e. a session is considered ended either once the user has visited ten websites or once the session has lasted over thirty minutes.

There are some empty values in the table, which means that some sessions contain fewer than ten websites. Replace the empty values with 0 and change the column types to integer. Also load the websites dictionary and check what it looks like:

**Пользовательские сессии выбираются так, чтобы они длились не более получаса и содержали не более десяти веб-сайтов. То есть сеанс считается завершенным, если пользователь посетил десять веб-сайтов или если сеанс длится более тридцати минут.**  

**В таблице есть несколько пустых значений, это означает, что некоторые сеансы содержат менее десяти веб-сайтов. Замените пустые значения на 0 и измените типы столбцов на integer. Также загрузите словарь веб-сайтов и проверьте, как это выглядит:**

In [19]:
# Raw visit log from Alice_log.csv: one row per visit with a timestamp and a site name
df_alice = pd.read_csv('data/train/Alice_log.csv')
df_alice.head(5)

Unnamed: 0,timestamp,site
0,2013-02-12 16:25:10,api.bing.com
1,2013-02-12 16:25:11,api.bing.com
2,2013-02-12 16:32:10,api.bing.com
3,2013-02-12 16:32:11,www.google.fr
4,2013-02-12 16:32:24,www.google.fr


In [9]:
# Raw visit log of one of the other users, for comparison with Alice's log
df_user0010 = pd.read_csv('data/train/other_user_logs/user0010.csv')
df_user0010.head(5)

Unnamed: 0,timestamp,site
0,2013-12-18 10:19:27,ocsp.digicert.com
1,2013-12-18 10:19:28,ocsp.digicert.com
2,2013-12-18 10:19:28,clients1.google.com
3,2013-12-18 10:19:29,gtglobal-ocsp.geotrust.com
4,2013-12-18 10:19:29,clients1.google.com


In [13]:
# Combined train+test sessions, as prepared in the cells above
full_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,945,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,946,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,952,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22


In [18]:
# Column summary (non-null counts, dtypes) of the training sessions with target == 1
train_df[train_df['target']==1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2297 entries, 251175 to 244233
Data columns (total 21 columns):
site1     2297 non-null int32
time1     2297 non-null datetime64[ns]
site2     2297 non-null int32
time2     2294 non-null datetime64[ns]
site3     2297 non-null int32
time3     2287 non-null datetime64[ns]
site4     2297 non-null int32
time4     2286 non-null datetime64[ns]
site5     2297 non-null int32
time5     2280 non-null datetime64[ns]
site6     2297 non-null int32
time6     2273 non-null datetime64[ns]
site7     2297 non-null int32
time7     2269 non-null datetime64[ns]
site8     2297 non-null int32
time8     2263 non-null datetime64[ns]
site9     2297 non-null int32
time9     2262 non-null datetime64[ns]
site10    2297 non-null int32
time10    2258 non-null datetime64[ns]
target    2297 non-null int64
dtypes: datetime64[ns](10), int32(10), int64(1)
memory usage: 305.1 KB


In [60]:
DT = 30  # maximum time, in minutes, between a window's first visit and any later visit in it

df_alice = pd.read_csv('data/train/Alice_log.csv')
df_alice['timestamp'] = pd.to_datetime(df_alice['timestamp'])

# For every visit, attach the following nine visits as timestamp2/site2 ...
# timestamp10/site10, keeping only those that fall within DT minutes of it.
for k in range(2, 11):
    later_ts = df_alice['timestamp'].shift(-(k - 1))
    later_site = df_alice['site'].shift(-(k - 1))

    # total_seconds() measures the whole gap. `.dt.seconds` returns only the
    # seconds component and drops whole days, so a visit one day and five
    # minutes later would wrongly count as five minutes later.
    gap_minutes = (later_ts - df_alice['timestamp']).dt.total_seconds() / 60
    in_window = gap_minutes.between(0, DT)  # False for missing gaps as well

    df_alice['timestamp' + str(k)] = later_ts.where(in_window)
    df_alice['site' + str(k)] = later_site.where(in_window)

# Replace site names by their integer ids from the websites dictionary
to_int = dict(zip(sites_dict['site'], sites_dict.index))
for col in df_alice.columns:
    if 'site' in col:
        df_alice[col] = df_alice[col].map(to_int)

df_alice.head()

Unnamed: 0,timestamp,site,timestamp2,site2,timestamp3,site3,timestamp4,site4,timestamp5,site5,timestamp6,site6,timestamp7,site7,timestamp8,site8,timestamp9,site9,timestamp10,site10
0,2013-02-12 16:25:10,270,2013-02-12 16:25:11,270.0,2013-02-12 16:32:10,270.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0
1,2013-02-12 16:25:11,270,2013-02-12 16:32:10,270.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0
2,2013-02-12 16:32:10,270,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0
3,2013-02-12 16:32:11,21,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0,2013-02-12 16:32:29,37.0
4,2013-02-12 16:32:24,21,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0,2013-02-12 16:32:29,37.0,2013-02-12 16:32:34,7832.0
