In [1]:
# Import libraries and set desired options

from __future__ import division, print_function
# Disable Anaconda warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row, in output order. Any sequence that
        ``np.asarray`` can convert is accepted (list, tuple, Series, ndarray).
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str
        Name of the prediction column.
    index_label : str
        Header of the index column; rows are numbered starting from 1.
    """
    # Coerce first: plain lists have no ``.shape`` and a Series would otherwise
    # be re-aligned on its own index instead of being taken positionally.
    predicted_labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
# Load websites dictionary
# NOTE(review): pickle.load can execute arbitrary code; only use a trusted file here.
with open(r"data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Create dataframe for the dictionary:
# the dict keys become the 'site' column and the dict values become the index.
sites_dict = pd.DataFrame(list(site_dict.keys()), index=list(site_dict.values()), columns=['site'])

# 'zone' is the last dot-separated token of the site name;
# purely numeric tokens are relabelled 'ip_address'.
sites_dict['zone'] = sites_dict['site'].str.split('.').apply(lambda x: x[-1])
sites_dict.loc[sites_dict['zone'].str.isnumeric(), 'zone'] = 'ip_address'
# Integer code for every distinct zone label
sites_dict['zone_le'] = LabelEncoder().fit_transform(sites_dict['zone'])

print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site,zone,zone_le
25075,www.abmecatronique.com,com,28
13997,groups.live.com,com,28
42436,majeureliguefootball.wordpress.com,com,28
30911,cdt46.media.tourinsoft.eu,eu,41
8104,www.hdwallpapers.eu,eu,41


In [4]:
def get_df(path, is_alice=False, max_gap_minutes=30, site_to_id=None):
    """Turn one user's raw visit log into sliding 10-site session rows.

    Parameters
    ----------
    path : str or file-like
        CSV with ``timestamp`` and ``site`` columns, passed to ``pd.read_csv``.
    is_alice : bool
        Written to the ``target`` column as 0/1.
    max_gap_minutes : float
        A later visit is kept in a row only if it happened at most this many
        minutes after the row's first visit (``time1``).
    site_to_id : dict or None
        Mapping from site name to integer id. When None it is built from the
        notebook-level ``sites_dict`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``site1, time1, ..., site10, time10, target, for_folds``.
        One row per visit that survives de-duplication; the row index keeps
        the original row labels of the log.
    """
    log_df = pd.read_csv(path)

    # Drop consecutive duplicate visits; copy so the column assignments below
    # operate on an independent frame rather than a slice of the raw log.
    log_df = log_df.loc[log_df['site'].shift() != log_df['site']].copy()

    log_df['site1'] = log_df['site']
    log_df['time1'] = pd.to_datetime(log_df['timestamp'])
    log_df = log_df.drop(columns=['site', 'timestamp'])

    for pos in range(2, 11):
        site_col = 'site' + str(pos)
        time_col = 'time' + str(pos)
        log_df[site_col] = log_df['site1'].shift(-(pos - 1))
        log_df[time_col] = log_df['time1'].shift(-(pos - 1))

        # total_seconds() keeps the days part of the gap; ``.dt.seconds`` only
        # returns the within-day remainder, which let visits made days later
        # slip through the window check.
        gap_minutes = (log_df[time_col] - log_df['time1']).dt.total_seconds() / 60
        within_window = ~(gap_minutes > max_gap_minutes)
        log_df[site_col] = log_df[site_col].where(within_window)
        log_df[time_col] = log_df[time_col].where(within_window)

    if site_to_id is None:
        site_to_id = dict(zip(sites_dict['site'], sites_dict.index))
    for col in log_df.columns:
        if 'site' in col:
            log_df[col] = log_df[col].map(site_to_id)

    log_df['target'] = int(is_alice)
    # Relative position of the row inside this log (original row label divided
    # by the de-duplicated row count minus one).
    log_df['for_folds'] = log_df.index / (log_df.shape[0] - 1)
    return log_df

In [5]:
# Set to True to rebuild the training set from the raw per-user logs;
# otherwise a previously saved CSV is loaded.
eval_get_df = False

if eval_get_df:
    df_alice = get_df('data/train/Alice_log.csv', is_alice=True)

    df_user_list = list()
    # os.path.join keeps the path valid on every OS (a literal backslash path
    # only resolves on Windows).
    dirname = os.path.join('data', 'train', 'other_user_logs')
    # os.listdir returns entries in arbitrary order; sort for a reproducible build.
    for filename in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, filename)
        df_user = get_df(path)
        df_user_list.append(df_user)

    train_df = pd.concat([df_alice, *df_user_list], axis=0)
    # NOTE(review): the rebuilt set is written to 'new_train_upd.csv' while the
    # else-branch reads 'new_train.csv' — confirm the two names are meant to differ.
    train_df.to_csv('data/new_train_upd.csv')
else:
    train_df = pd.read_csv('data/new_train.csv', index_col=0)

In [6]:
# Read the test set (train_df is already loaded by the previous cell)
test_df = pd.read_csv('data/test_sessions.csv',
                      index_col='session_id')

# Switch time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Order the training rows by the 'for_folds' column, then drop it
# (presumably the relative within-log position written by get_df — confirm
# against the file that was actually cached).
train_df = train_df.sort_values(by='for_folds')
del train_df['for_folds']
train_df = train_df.reset_index(drop=True)

# Look at the first rows of the training set
train_df.head()

Unnamed: 0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,270,2013-02-12 16:25:10,270.0,2013-02-12 16:25:11,270.0,2013-02-12 16:32:10,21.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,...,2013-02-12 16:32:25,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:26,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,1
1,167,2013-11-25 08:03:45,1.0,2013-11-25 08:03:46,66.0,2013-11-25 08:03:50,781.0,2013-11-25 08:03:50,781.0,2013-11-25 08:03:51,...,2013-11-25 08:03:52,781.0,2013-11-25 08:03:53,781.0,2013-11-25 08:03:54,781.0,2013-11-25 08:03:55,270.0,2013-11-25 08:03:58,0
2,820,2013-11-19 13:25:23,820.0,2013-11-19 13:25:25,,NaT,,NaT,8675.0,2013-12-17 13:35:27,...,2014-02-13 13:33:01,820.0,2014-02-13 13:33:04,820.0,2014-02-13 13:33:27,820.0,2014-02-13 13:33:29,1446.0,2014-02-13 13:41:50,0
3,1,2013-04-12 16:52:26,21.0,2013-04-12 16:52:33,21.0,2013-04-12 16:52:45,22.0,2013-04-12 16:52:45,23.0,2013-04-12 16:52:48,...,2013-04-12 16:53:09,23.0,2013-04-12 16:53:09,22.0,2013-04-12 16:53:09,40.0,2013-04-12 16:53:10,21.0,2013-04-12 16:54:10,0
4,1,2013-11-16 10:32:07,21.0,2013-11-16 10:32:21,23.0,2013-11-16 10:32:23,21.0,2013-11-16 10:32:37,23.0,2013-11-16 10:32:37,...,2013-11-16 10:32:38,1.0,2013-11-16 10:33:35,21.0,2013-11-16 10:33:43,21.0,2013-11-16 10:33:44,21.0,2013-11-16 10:33:45,0


In [7]:
# Change site1, ..., site10 columns type to integer and fill NA-values with zeros
# (so id 0 stands for "no site in this slot")
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

In [8]:
# Our target variable
y_train = train_df['target']

# United dataframe of the initial data: training rows first, test rows after them
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# Index to split the training and test data sets:
# rows [0, idx_split) of full_df are train, rows [idx_split, end) are test
idx_split = train_df.shape[0]

In [9]:
# Dataframe with indices of visited websites in session (columns site1..site10)
full_sites = full_df[sites]

full_sites.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
0,270,270,270,21,21,7832,21,7832,30,7832
1,167,1,66,781,781,781,781,781,781,270
2,820,820,0,0,8675,820,820,820,820,1446
3,1,21,21,22,23,21,23,22,40,21
4,1,21,23,21,23,21,1,21,21,21


In [10]:
# Peek at the ten timestamp columns of the first row
full_df[times].head(1)

Unnamed: 0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
0,2013-02-12 16:25:10,2013-02-12 16:25:11,2013-02-12 16:32:10,2013-02-12 16:32:11,2013-02-12 16:32:24,2013-02-12 16:32:25,2013-02-12 16:32:25,2013-02-12 16:32:26,2013-02-12 16:32:27,2013-02-12 16:32:27


In [11]:
def get_time_diff(row):
    """Return the gaps, in seconds, between consecutive timestamps of ``row``.

    The result always has ``len(row) - 1`` entries. Gaps are filled from the
    left until the first missing timestamp is met; every later entry keeps its
    initial value of 0.
    """
    n_gaps = row.shape[0] - 1
    gaps = [0] * n_gaps
    for pos in range(n_gaps):
        # Stop at the first missing timestamp; the remaining gaps stay 0
        if pd.isnull(row[pos + 1]):
            break
        gaps[pos] = (row[pos + 1] - row[pos]) / np.timedelta64(1, 's')
    return gaps

In [45]:
def get_time_features(df):
    """Derive standardized time-based features for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns named in the notebook-level list
        ``times``; ``time1`` is used as the session start.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. Holds calendar parts of the session start,
        part-of-day flags, sine/cosine encodings of the start and end minute
        of day, day of week, a weekend flag, the number of missing timestamps
        and log-scaled offsets of every timestamp from the session start.
        All columns are z-scored with a ``StandardScaler`` fitted on ``df``
        itself, and remaining NaNs are replaced by the column mean.
    """
    time_df = pd.DataFrame(index=df.index)

    hour = df['time1'].dt.hour
    time_df['hour'] = hour
    time_df['day_'] = df['time1'].dt.day
    time_df['month'] = df['time1'].dt.month
    time_df['year'] = df['time1'].dt.year

    # Part-of-day indicators; hours 0-6 have no flag of their own
    time_df['morning'] = ((hour >= 7) & (hour <= 11)).astype('int')
    time_df['day'] = ((hour >= 12) & (hour <= 18)).astype('int')
    time_df['evening'] = ((hour >= 19) & (hour <= 23)).astype('int')

    # Session start and the latest non-null timestamp of the session
    time_df['min'] = df['time1']
    time_df['max'] = df[times].max(axis=1)

    # Cyclic encoding of the minute of day for session start and end.
    # 'minutes' is overwritten on each pass, so after the loop it holds the
    # minute of day of 'max'.
    for px in ['min', 'max']:
        time_df['minutes'] = time_df[px].dt.hour * 60 + time_df[px].dt.minute
        time_df['sin_'+px] = np.sin(2*np.pi*time_df['minutes']/1440.)
        time_df['cos_'+px] = np.cos(2*np.pi*time_df['minutes']/1440.)

    time_df['dow'] = time_df['min'].dt.dayofweek
    time_df['weekend'] = (time_df['dow'] > 4).astype('int')
    time_df['n_null'] = df[times].isnull().sum(axis=1)

    # Offsets from the session start. A missing timestamp is treated as if it
    # coincided with the session end. total_seconds() yields float seconds on
    # every pandas version, whereas astype('timedelta64[s]') keeps a timedelta
    # dtype on pandas 2.x and cannot be passed to log1p.
    time_df['dt'] = time_df['max'] - time_df['min']
    for time in times[1:]:
        dt_ = (df[time] - time_df['min']).fillna(time_df['dt'])
        time_df['dt_' + time] = np.log1p(np.abs(dt_.dt.total_seconds()))
    time_df['dt'] = np.log1p(np.abs(time_df['dt'].dt.total_seconds()))

    dt_columns = ['dt_' + time for time in times[1:]]
    time_df['dt_mean'] = time_df[dt_columns].mean(axis=1)
    time_df['dt_std'] = time_df[dt_columns].std(axis=1)

    # Standardize everything except the two raw datetime helper columns
    s_columns = [col for col in time_df.columns if time_df[col].dtype != '<M8[ns]']

    s_scaler = StandardScaler()
    time_df[s_columns] = s_scaler.fit_transform(time_df[s_columns])

    time_df = time_df.drop(['min', 'max'], axis=1)

    for col in time_df.columns:
        time_df[col] = time_df[col].fillna(time_df[col].mean())

    return time_df

In [82]:
# Build the time features for train + test together
full_time = get_time_features(full_df[times])
# Column order used by the feature-selection cells further down
ft_columns = ['dow', 'weekend', 'day_', 'month', 'sin_min', 'cos_min', 'dt', 'dt_std', 'dt_mean', 'n_null',
              'dt_time10', 'dt_time9', 'dt_time8', 'dt_time7', 'dt_time6', 'dt_time5', 'dt_time4', 'dt_time3', 'dt_time2',
              'morning', 'day', 'evening', 'sin_max', 'cos_max', 'hour', 'year', 'minutes']  # full_time.columns
# Subset of time features that is kept for the model
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year']

full_time = full_time[ft_columns]

# One-hot encode the (already standardized) hour, cut into 4 equal-width bins
hours_dum = pd.get_dummies(pd.cut(full_time['hour'], bins=4))

# From here on full_time only holds good_cols plus the hour dummies
full_time = pd.concat([full_time[good_cols], hours_dum], axis=1)
print(full_time.agg(['min', 'mean', 'max']))
# Plain ndarray, so it can be stacked next to the sparse TF-IDF matrix
full_time = full_time.values

           sin_min       cos_max        n_null            dt          year  \
min  -1.483732e+00 -8.384219e-01 -2.444461e-01 -1.880744e+00 -1.592300e+00   
mean  2.820128e-14 -4.585466e-15  2.882429e-13 -4.949774e-15  1.378052e-11   
max   1.669230e+00  4.904801e+00  5.843342e+00  7.473835e+00  6.280224e-01   

      (-1.69, -0.406]  (-0.406, 0.873]  (0.873, 2.152]  (2.152, 3.431]  
min          0.000000         0.000000        0.000000        0.000000  
mean         0.483626         0.349742        0.146071        0.020561  
max          1.000000         1.000000        1.000000        1.000000  


## TF-IDF

In [85]:
# Turn every session into one whitespace-separated string of site-name tokens.
# NOTE(review): the recorded output of this cell is an AttributeError; the cause
# is not visible from the code here — re-run on a fresh kernel and confirm.
full_sites_tf = full_sites.copy()

# Site ids -> site names; ids without an entry in sites_dict become NaN
for col in full_sites_tf.columns:
    full_sites_tf[col] = full_sites_tf[col].map(sites_dict.site)

full_sites_tf = full_sites_tf.fillna('')
# Join the non-empty site names of a row with '.', ...
df_tf_col = full_sites_tf.apply(lambda x: '.'.join([i for i in x if len(i)>0]), axis=1)
# ... then split on '.' and '-' and re-join with spaces
df_tf_col = df_tf_col.str.split('[.-]').str.join(' ')

AttributeError: str not found

In [114]:
# TF-IDF over the per-session token strings, fitted on train + test together
vect = TfidfVectorizer(max_df=.7, sublinear_tf=True)
df_tf = vect.fit_transform(df_tf_col)

#### Split

In [93]:
# Cross-validation splitters. Every CV call visible in this notebook uses
# kfold_split; time_split is defined but not referenced in the visible cells.
time_split = TimeSeriesSplit(n_splits=10)
kfold_split = KFold(n_splits=8, shuffle=False)

In [115]:
# TF-IDF only variant, kept for reference:
# X_train = df_tf[:idx_split,:]
# X_test = df_tf[idx_split:,:]


# Design matrices: TF-IDF columns followed by the time-feature columns,
# split into train / test at idx_split
X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])

#### GridSearch

In [101]:
# Candidate inverse-regularization strengths for this run
c_values = [0.01, 0.03]
# cw_values = [{0: 0.6, 1: 0.4}, {0: 0.9, 1: 0.1}, {0: 0.8, 1: 0.2}, {0: 0.7, 1: 0.3} , {0: 0.3, 1: 0.7}]

# Logistic regression with built-in CV over c_values, scored by ROC AUC on kfold_split
lrcv = LogisticRegressionCV(Cs=c_values, scoring='roc_auc', n_jobs=-1, cv=kfold_split,
                            verbose=1, class_weight='balanced', max_iter=3000)

In [102]:
%%time
# Fit the CV model on the training matrix
lrcv.fit(X_train, y_train);
# NOTE(review): this only references the bound method, it does not call it
lrcv.score

[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  8.3min remaining: 24.9min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  8.6min finished


CPU times: user 3min 56s, sys: 2min 36s, total: 6min 33s
Wall time: 10min 39s


In [103]:
# Scores recorded for class 1; indexed below as [fold, C candidate]
cvr = lrcv.scores_[1]
# Position of the C with the best mean score (not used further in this cell)
idx = cvr.mean(axis=0).argmax()
# Mean and spread of the fold scores for every C candidate
for i in range(cvr.shape[1]):
    print(f"ROC_AUC [{lrcv.Cs_[i]:>4}]: {cvr[:, i].mean():.4f}+-{cvr[:, i].std():.4f}")
print(f"Best params: {lrcv.C_}")

ROC_AUC [0.01]: 0.9574+-0.0141
ROC_AUC [0.03]: 0.9587+-0.0136
Best params: [0.03]


ROC_AUC [0.01]: 0.9574+-0.0141
ROC_AUC [0.03]: 0.9587+-0.0136
ROC_AUC [ 0.1]: 0.9586+-0.0127
ROC_AUC [ 0.3]: 0.9578+-0.0123
ROC_AUC [ 1.0]: 0.9559+-0.0129
ROC_AUC [ 3.0]: 0.9531+-0.0142
ROC_AUC [10.0]: 0.9488+-0.0167
Best params: [0.1]

#### Evaluating

In [47]:
# TF-IDF only design matrices (no time features); overwrites X_train / X_test
X_train = df_tf[:idx_split,:]
X_test = df_tf[idx_split:,:]

In [116]:
# Model settings reused by the experiment cells below through the global `params`
params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

logit = LogisticRegression(**params)

In [117]:
%%time

# Baseline per-fold ROC AUC; later cells compare their scores against `cv_scores`.
# NOTE(review): the trailing remark mentions hanging with n_jobs > 1, yet the
# call uses n_jobs=-1 — confirm which setting is intended.
cv_scores = cross_val_score(logit, X_train, y_train, cv=kfold_split, 
                            scoring='roc_auc', n_jobs=-1) # hangs with n_jobs > 1, and locally this runs much faster

CPU times: user 10.3 s, sys: 2.52 s, total: 12.8 s
Wall time: 2min 46s


In [118]:
# Mean and standard deviation of the fold scores
print(f"ROC_AUC: {cv_scores.mean():.4f}+-{cv_scores.std():.4f}")
# cv_scores

ROC_AUC: 0.9583+-0.0126


In [119]:
# Fit on the full training matrix and write class-1 probabilities for the test set
logit.fit(X_train, y_train)
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm_raw_07.csv')

### Check columns importance

### Define class_weight

In [109]:
%%time
cv_scores_arr = list()

for cw in [{0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6}]:
    vect = TfidfVectorizer(max_df=0.9, sublinear_tf=True)
    df_tf = vect.fit_transform(df_tf_col)
    params = {'C': 0.1, 'class_weight': cw, 'random_state':17, 'n_jobs':1}
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    print(f"cw {cw}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f}")

cw {0: 0.2, 1: 0.8}: 0.9542+-0.0159
cw {0: 0.3, 1: 0.7}: 0.9537+-0.0162
cw {0: 0.4, 1: 0.6}: 0.9529+-0.0165
CPU times: user 3min 28s, sys: 22.5 s, total: 3min 51s
Wall time: 8min 35s


cw balanced  : 0.9586+-0.0127
cw {0: 0.9, 1: 0.1}: 0.9390+-0.0209
cw {0: 0.8, 1: 0.2}: 0.9451+-0.0188
cw {0: 0.7, 1: 0.3}: 0.9483+-0.0178
cw {0: 0.6, 1: 0.4}: 0.9504+-0.0172
cw {0: 0.5, 1: 0.5}: 0.9518+-0.0168
cw {0: 0.4, 1: 0.6}: 0.9529+-0.0165
cw {0: 0.3, 1: 0.7}: 0.9537+-0.0162
cw {0: 0.2, 1: 0.8}: 0.9542+-0.0159

### Define good tf-idf params

In [91]:
%%time
params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}
cv_scores_arr = list()

for i in [False]:
    vect = TfidfVectorizer(max_df=0.9, sublinear_tf=False)
    df_tf = vect.fit_transform(df_tf_col)
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split,:]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:,:]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    print(f"sublinear_tf {i:<6}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f}")

sublinear_tf 0     : 0.9566+-0.0126
CPU times: user 1min 9s, sys: 7.57 s, total: 1min 17s
Wall time: 3min 57s


min_df 0.0001: 0.9581+-0.0129
min_df 0.001: 0.9560+-0.0120
min_df 0.01: 0.9403+-0.0132
min_df 0.1: 0.9110+-0.0209
min_df 0.3: 0.8979+-0.0200

max_df 0.50: 0.9582+-0.0125
max_df 0.60: 0.9583+-0.0126
max_df 0.70: 0.9583+-0.0126
max_df 0.80: 0.9583+-0.0126
max_df 0.85: 0.9583+-0.0126
max_df 0.90: 0.9586+-0.0127
max_df 0.95: 0.9586+-0.0127
max_df 0.98: 0.9586+-0.0127
max_df 1.00: 0.8968+-0.0200

### Define good hour split

In [72]:
# Baseline for the hour-binning experiment: selected time features plus the
# three part-of-day flags.
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year', 'morning', 'day', 'evening']
# Boolean mask with one entry per ft_columns name
mask = pd.Series(ft_columns).isin(good_cols).values

# NOTE(review): indexing full_time with this mask needs one full_time column per
# ft_columns entry, but the cell that builds full_time reduces it to good_cols
# plus the hour dummies — this cell appears to rely on an earlier kernel state.
X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask]])
X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask]])

params = {'C': 0.1, 'class_weight': 'balanced', 'random_state':17, 'n_jobs':1}

logit = LogisticRegression(**params)

# Per-fold baseline scores; the next cell compares against `cv_scores`
cv_scores = cross_val_score(logit, X_train, y_train, cv=kfold_split, 
                            scoring='roc_auc', n_jobs=-1) # hangs with n_jobs > 1, and locally this runs much faster

print(f"ROC_AUC: {cv_scores.mean():.4f}+-{cv_scores.std():.4f}")

ROC_AUC: 0.9530+-0.0141


In [80]:
%%time

# Try 2..8 equal-width hour bins (one-hot encoded) on top of the selected time
# features. Uses `params` and the baseline `cv_scores` left by earlier cells.
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year']
mask = pd.Series(ft_columns).isin(good_cols).values

# Pull the 'hour' column out of the full_time array by its position in ft_columns
mask_hours = pd.Series(ft_columns).isin(['hour']).values
hours = pd.Series(full_time[:, mask_hours].flatten())

n_cols = len(ft_columns)
cv_scores_arr = []

for i in range(2, 9):
    hours_dum = pd.get_dummies(pd.cut(hours, bins=i)).values
    
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask], hours_dum[:idx_split, :]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask], hours_dum[idx_split:, :]])
    
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    # Accept a binning only if the mean fold score improves AND more than 4
    # folds improve individually; an accepted result becomes the new baseline.
    d_cv = cv_scores_ - cv_scores
    n_pos = (d_cv > 0).sum()
    if not(d_cv.mean() > 0 and n_pos > 4):
        print(f"Bins {i:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} EXCLUDE")
    else:
        cv_scores = cv_scores_.copy()
        print(f"Bins {i:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} ADD")

Bins 2         : 0.9465+-0.0194 EXCLUDE
Bins 3         : 0.9418+-0.0187 EXCLUDE
Bins 4         : 0.9583+-0.0126 ADD
Bins 5         : 0.9494+-0.0208 EXCLUDE
Bins 6         : 0.9429+-0.0222 EXCLUDE
Bins 7         : 0.9548+-0.0194 EXCLUDE
Bins 8         : 0.9592+-0.0168 ADD
CPU times: user 1min 29s, sys: 34.6 s, total: 2min 4s
Wall time: 20min 24s


### Add columns one by one

In [51]:
%%time

# Greedy forward selection over the time features. Uses `params` and the
# baseline `cv_scores` left by earlier cells.
n_cols = len(ft_columns)
cv_scores_arr = []
mask = np.zeros(n_cols, dtype='bool')
# i runs from -n_cols to -1, which addresses the columns front to back
for i in range(-n_cols, 0):
    # Tentatively switch the candidate column on
    mask[i] = True
    X_train = hstack([df_tf[:idx_split,:], full_time[:idx_split, mask]])
    X_test = hstack([df_tf[idx_split:,:], full_time[idx_split:, mask]])
    logit = LogisticRegression(**params)
    cv_scores_ = cross_val_score(logit, X_train, y_train, cv=kfold_split, scoring='roc_auc', n_jobs=-1)
    cv_scores_arr.append(cv_scores_)
    
    # Keep the column only if the mean fold score improves AND more than 4
    # folds improve individually; otherwise switch it off again.
    d_cv = cv_scores_ - cv_scores
    n_pos = (d_cv > 0).sum()
    if not(d_cv.mean() > 0 and n_pos > 4):
        mask[i] = False
        print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} EXCLUDE")
    else:
        cv_scores = cv_scores_.copy()
        print(f"{ft_columns[i]:<10}: {cv_scores_.mean():.4f}+-{cv_scores_.std():.4f} ADD")

dow       : 0.9078+-0.0384 EXCLUDE
weekend   : 0.9046+-0.0509 EXCLUDE
day_      : 0.9058+-0.0344 EXCLUDE
month     : 0.9166+-0.0213 EXCLUDE
sin_min   : 0.9361+-0.0228 ADD
cos_min   : 0.9349+-0.0238 EXCLUDE
dt        : 0.9361+-0.0229 EXCLUDE
dt_std    : 0.9361+-0.0229 EXCLUDE
dt_mean   : 0.9361+-0.0229 EXCLUDE
n_null    : 0.9362+-0.0228 ADD
dt_time10 : 0.9362+-0.0229 ADD
dt_time9  : 0.9362+-0.0229 EXCLUDE
dt_time8  : 0.9362+-0.0229 EXCLUDE
dt_time7  : 0.9362+-0.0229 EXCLUDE
dt_time6  : 0.9362+-0.0229 EXCLUDE
dt_time5  : 0.9362+-0.0229 EXCLUDE
dt_time4  : 0.9362+-0.0229 EXCLUDE
dt_time3  : 0.9362+-0.0229 EXCLUDE
dt_time2  : 0.9362+-0.0229 EXCLUDE
morning   : 0.9401+-0.0195 ADD
day       : 0.9417+-0.0189 ADD
evening   : 0.9418+-0.0188 ADD
sin_max   : 0.9418+-0.0188 ADD
cos_max   : 0.9483+-0.0172 ADD
hour      : 0.9481+-0.0173 EXCLUDE
year      : 0.9530+-0.0141 ADD
minutes   : 0.9525+-0.0148 EXCLUDE
CPU times: user 4min 55s, sys: 1min 50s, total: 6min 45s
Wall time: 1h 2min 52s


dow       : 0.9078+-0.0384 EXCLUDE  
weekend   : 0.9046+-0.0509 EXCLUDE  
day_      : 0.9058+-0.0344 EXCLUDE  
month     : 0.9166+-0.0213 EXCLUDE  
sin_min   : 0.9361+-0.0228 ADD  
cos_min   : 0.9349+-0.0238 EXCLUDE  
dt        : 0.9361+-0.0229 EXCLUDE  
dt_std    : 0.9361+-0.0229 EXCLUDE  
dt_mean   : 0.9361+-0.0229 EXCLUDE  
n_null    : 0.9362+-0.0228 ADD  
dt_time10 : 0.9362+-0.0229 ADD  
dt_time9  : 0.9362+-0.0229 EXCLUDE  
dt_time8  : 0.9362+-0.0229 EXCLUDE  
dt_time7  : 0.9362+-0.0229 EXCLUDE  
dt_time6  : 0.9362+-0.0229 EXCLUDE  
dt_time5  : 0.9362+-0.0229 EXCLUDE  
dt_time4  : 0.9362+-0.0229 EXCLUDE  
dt_time3  : 0.9362+-0.0229 EXCLUDE  
dt_time2  : 0.9362+-0.0229 EXCLUDE  
morning   : 0.9401+-0.0195 ADD  
day       : 0.9417+-0.0189 ADD  
evening   : 0.9418+-0.0188 ADD  
sin_max   : 0.9418+-0.0188 ADD  
cos_max   : 0.9483+-0.0172 ADD  
hour      : 0.9481+-0.0173 EXCLUDE   
year      : 0.9530+-0.0141 ADD  
minutes   : 0.9525+-0.0148 EXCLUDE  
CPU times: user 4min 55s, sys: 1min 50s, total: 6min 45s  
Wall time: 1h 2min 52s   


In [None]:
# Time-feature subset recorded for reuse (this cell has no execution count)
good_cols = ['sin_min', 'cos_max', 'n_null', 'dt', 'year', 'morning', 'day', 'evening']

In [193]:
# List the runs whose fold scores are worse than the current `cv_scores`
# baseline on average and improve on it in fewer than 5 folds.
# NOTE(review): ft_columns[i] assumes cv_scores_arr holds one entry per
# ft_columns name in the same order — confirm against the cell that filled it.
cv_scores_arr_deleting = cv_scores_arr.copy()
for i, cv_scores_ in enumerate(cv_scores_arr_deleting):
    d_cv = cv_scores_ - cv_scores
    n_pos = (d_cv > 0).sum()
    if d_cv.mean() < 0 and n_pos < 5:
        print(f"{ft_columns[i]:<10}: {d_cv.mean():>8.5f} N_pos: {n_pos}")

day_      : -0.00000 N_pos: 2
day       : -0.00000 N_pos: 4
evening   : -0.00000 N_pos: 4
minutes   : -0.00000 N_pos: 4
dow       : -0.00012 N_pos: 0
dt_time3  : -0.00004 N_pos: 1
dt_time4  : -0.00001 N_pos: 2
dt_time6  : -0.00017 N_pos: 2
dt_time7  : -0.00013 N_pos: 3
dt_time10 : -0.00303 N_pos: 3


TfidfVectorizer df_tf + time  max_df=.7 sublinear_tf=True
ROC_AUC: 0.9168+-0.0542
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time  max_df=.7
ROC_AUC: 0.9148+-0.0504
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time  max_df=.8
ROC_AUC: 0.9125+-0.0538
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time
ROC_AUC: 0.9115+-0.0546
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf + time (sites name full)
ROC_AUC: 0.9115+-0.0546
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

TfidfVectorizer df_tf
ROC_AUC: 0.8575+-0.0753
Best params: {'C': 20, 'class_weight': {0: 0.6, 1: 0.4}}

CountVectorizer df_tf
ROC_AUC: 0.8351+-0.0763
Best params: {'C': 10, 'class_weight': {0: 0.6, 1: 0.4}}

#### To submit

In [38]:
# Write class-1 probabilities for the test set to a submission file.
# NOTE(review): `logit_grid_searcher` is not defined in any visible cell; this
# appears to depend on a cell that is no longer part of the notebook.
logit_test_pred = logit_grid_searcher.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'pred/a_sub_new_df_tfidf_drop_dub.csv')

# Let's look at the original data

User sessions are constructed so that they last no longer than half an hour and contain no more than ten websites. I.e. a session is considered ended either when the user has visited ten websites or when the session has lasted over thirty minutes.

There are some empty values in the table, which means that some sessions contain fewer than ten websites. Replace empty values with 0 and change the column types to integer. Also load the websites dictionary and check what it looks like:

**Пользовательские сессии выбираются так, чтобы они не превышали получаса или / и содержали более десяти веб-сайтов. То есть сеанс считается завершенным, если пользователь посетил десять веб-сайтов или если сеанс длится более тридцати минут.**  

**В таблице есть несколько пустых значений, это означает, что некоторые сеансы содержат менее десяти веб-сайтов. Замените пустые значения на 0 и измените типы столбцов на integer. Также загрузите словарь веб-сайтов и проверьте, как это выглядит:**

In [19]:
# Raw visit log of the target user: one row per visit
df_alice = pd.read_csv('data/train/Alice_log.csv')
df_alice.head(5)

Unnamed: 0,timestamp,site
0,2013-02-12 16:25:10,api.bing.com
1,2013-02-12 16:25:11,api.bing.com
2,2013-02-12 16:32:10,api.bing.com
3,2013-02-12 16:32:11,www.google.fr
4,2013-02-12 16:32:24,www.google.fr


In [9]:
# Raw visit log of one of the other users, for comparison
df_user0010 = pd.read_csv('data/train/other_user_logs/user0010.csv')
df_user0010.head(5)

Unnamed: 0,timestamp,site
0,2013-12-18 10:19:27,ocsp.digicert.com
1,2013-12-18 10:19:28,ocsp.digicert.com
2,2013-12-18 10:19:28,clients1.google.com
3,2013-12-18 10:19:29,gtglobal-ocsp.geotrust.com
4,2013-12-18 10:19:29,clients1.google.com


In [13]:
# First rows of the combined frame
full_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,945,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,946,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,952,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22


In [18]:
# Column dtypes and non-null counts for the rows labelled target == 1
train_df[train_df['target']==1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2297 entries, 251175 to 244233
Data columns (total 21 columns):
site1     2297 non-null int32
time1     2297 non-null datetime64[ns]
site2     2297 non-null int32
time2     2294 non-null datetime64[ns]
site3     2297 non-null int32
time3     2287 non-null datetime64[ns]
site4     2297 non-null int32
time4     2286 non-null datetime64[ns]
site5     2297 non-null int32
time5     2280 non-null datetime64[ns]
site6     2297 non-null int32
time6     2273 non-null datetime64[ns]
site7     2297 non-null int32
time7     2269 non-null datetime64[ns]
site8     2297 non-null int32
time8     2263 non-null datetime64[ns]
site9     2297 non-null int32
time9     2262 non-null datetime64[ns]
site10    2297 non-null int32
time10    2258 non-null datetime64[ns]
target    2297 non-null int64
dtypes: datetime64[ns](10), int32(10), int64(1)
memory usage: 305.1 KB


In [60]:
# Prototype of the sliding-window session construction on the target user's log.
DT = 30  # maximum allowed gap, in minutes, between a row's first visit and any later one

df_alice = pd.read_csv('data/train/Alice_log.csv')
df_alice['timestamp'] = pd.to_datetime(df_alice['timestamp'])
for i in range(-1, -10, -1):
    df_alice['timestamp' + str(-i+1)] = df_alice['timestamp'].shift(i)
    df_alice['site' + str(-i+1)] = df_alice['site'].shift(i)

    # total_seconds() keeps the days part of the gap; ``.dt.seconds`` only
    # returns the within-day remainder, so visits made days later would pass
    # the DT check.
    df_alice['dt'] = (df_alice['timestamp' + str(-i+1)] - df_alice['timestamp']).dt.total_seconds() / 60
    df_alice.loc[df_alice['dt']>DT, ['timestamp' + str(-i+1), 'site' + str(-i+1)]] = None

del df_alice['dt']

# Replace site names by their integer ids from sites_dict
to_int = dict(zip(sites_dict['site'], sites_dict.index))
for col in df_alice.columns:
    if 'site' in col:
        df_alice[col] = df_alice[col].map(to_int)

df_alice.head()

Unnamed: 0,timestamp,site,timestamp2,site2,timestamp3,site3,timestamp4,site4,timestamp5,site5,timestamp6,site6,timestamp7,site7,timestamp8,site8,timestamp9,site9,timestamp10,site10
0,2013-02-12 16:25:10,270,2013-02-12 16:25:11,270.0,2013-02-12 16:32:10,270.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0
1,2013-02-12 16:25:11,270,2013-02-12 16:32:10,270.0,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0
2,2013-02-12 16:32:10,270,2013-02-12 16:32:11,21.0,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0
3,2013-02-12 16:32:11,21,2013-02-12 16:32:24,21.0,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0,2013-02-12 16:32:29,37.0
4,2013-02-12 16:32:24,21,2013-02-12 16:32:25,7832.0,2013-02-12 16:32:25,21.0,2013-02-12 16:32:26,7832.0,2013-02-12 16:32:27,30.0,2013-02-12 16:32:27,7832.0,2013-02-12 16:32:27,29.0,2013-02-12 16:32:28,7832.0,2013-02-12 16:32:29,37.0,2013-02-12 16:32:34,7832.0
