### IDEAS

* Bad news in the end: our CV scheme is not perfect. Try to improve it! (hint: is all training set needed for a good prediction?



### HINTS


* keep track of cross-validation improvements for each fold (or at least as many folds as possible)
* take a look at cross-validation std, not only mean
* try to build a CV scheme so that CV improvements correlate with LB improvements (it's very important)
* exploring feature importance might help, sometimes even in detecting overfitting
* spend most of the competition time exploring data and building features

* existing features, try multiplying or dividing two of them
* do not hesitate to convert an existing feature (for example, take a logarithm), etc.

In [1]:
# Import libraries and set desired options
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack

# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

Using TensorFlow backend.


In [199]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

In [37]:
PATH_TO_DATA = 'data/'
PATH_TO_SUBMISSIONS = 'submits/'
path_to_site_dict = os.path.join(PATH_TO_DATA, 'site_dic.pkl')
SEED = 17

In [254]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    subm_path = os.path.join(PATH_TO_SUBMISSIONS, out_file)
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

def train_and_predict(model, X_train, y_train, X_test, all_func=False,
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):

    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)
    
    if all_func:
        site_feature_names = model.named_steps['feats'].get_params()['tfidf__tfidf'].get_feature_names()
        site_feature_names = [' '.join([id2site[int(site_idx)] for site_idx in fn.split(' ')]) for fn in site_feature_names ]
        if new_feature_names:
            all_feature_names = site_feature_names + new_feature_names 
        else: 
            all_feature_names = site_feature_names

        display_html(eli5.show_weights(estimator=model.named_steps['clf'], 
                      feature_names=all_feature_names, top=top_n_features_to_show))

        if new_feature_names:
            print('New feature weights:')

            print(pd.DataFrame({'feature': new_feature_names, 
                            'coef': model.named_steps['clf'].coef_.flatten()[-len(new_feature_names):]}))
    
        test_pred = model.predict_proba(X_test)[:, 1]
        subm_path = os.path.join(PATH_TO_SUBMISSIONS, submission_file_name)
        write_to_submission_file(test_pred, subm_path) 
    
    return cv_scores

In [218]:
times = [f"time{i}" for i in range(1, 11)]
sites = [f"site{i}" for i in range(1, 11)]

df_train = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id', parse_dates=times)
df_test = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id', parse_dates=times)

df_train = df_train.sort_values('time1')

df_train['index'] = np.arange(df_train.shape[0])
df_test['index'] = np.arange(df_train.shape[0], df_train.shape[0] + df_test.shape[0])

In [219]:
with open(path_to_site_dict, 'rb') as f:
    site2id = pickle.load(f)

id2site = {v:k for (k, v) in site2id.items()}
id2site[0] = 'unknown'

In [220]:
df_train.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,site7,time7,site8,time8,site9,time9,site10,time10,target,index
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,,NaT,,NaT,,NaT,,NaT,0,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,,NaT,,NaT,,NaT,,NaT,0,1
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0,2
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0,3
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0,4


In [None]:
top_sites = pd.Series(df_train[sites].fillna(0).values.flatten()).value_counts().sort_values(ascending=False).drop(index=[0])

top_sites_alice = pd.Series(df_train[df_train.target==1][sites].fillna(0).values.flatten()).value_counts().sort_values(ascending=False).drop(index=[0])

In [221]:
def preproc_data(X):
    new_data = pd.DataFrame(index=X.index)
    
    # sites params
    print("Making sites in one row")
    for col in sites:
        new_data[col] = X[col].fillna(0).astype(int)
    new_data['sites'] = new_data[sites].apply(lambda row: ' '.join(row[row != 0].astype(str)), axis=1)
    new_data['sites_text'] = new_data[sites].apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1)
    
    print("Count sites")
    new_data['n_sites'] = X[sites].notna().sum(axis=1)
    new_data['n_unique_sites'] = X[sites].nunique(axis=1) - X['site10'].isna().astype(int)
    new_data['rate_unique_sites'] = new_data['n_unique_sites'] / new_data['n_sites']
    
    for n_top in [5, 20, 50, 100]:
        print("Count top", n_top)
        new_data['top5'] = new_data['n_unique_sites'] / new_data['n_sites']
        
        new_data[f'top{n_top}'] = X[sites].isin(top_sites.index[:n_top]).sum(axis=1)
        new_data[f'utop{n_top}'] = X[sites].apply(lambda row: np.isin(pd.unique(row), top_sites.index[:n_top]).sum(), axis=1)
        new_data[f'top_rate{n_top}'] = new_data[f'top{n_top}'] / new_data['n_sites']
    
        new_data[f'atop{n_top}'] = X[sites].isin(top_sites_alice.index[:n_top]).sum(axis=1)
        new_data[f'uatop{n_top}'] = X[sites].apply(lambda row: np.isin(pd.unique(row), top_sites_alice.index[:n_top]).sum(), axis=1)
        new_data[f'top_arate{n_top}'] = new_data[f'atop{n_top}'] / new_data['n_sites']

    # times params
    print("Eval duration")
    new_data['s_duration'] = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[s]').astype(int)
    new_data['s_duration_log'] = new_data['s_duration'].apply(np.log1p)
    
    print("Eval time diffs")
    tdiff = X[times].diff(axis=1)
    for col in times:
        tdiff[col] = tdiff[col].dt.total_seconds()
        tdiff.loc[tdiff[col] > 1800, col] = np.NaN
    new_data[[f'tdiff_{col}' for col in times[1:]]] = tdiff[times[1:]]
    new_data['tdiff_mean'] = tdiff[times[1:]].mean(axis=1)
    new_data['tdiff_std'] = tdiff[times[1:]].std(axis=1)
    new_data['tdiff_var'] = tdiff[times[1:]].var(axis=1)
    
    print("Eval hour params")
    new_data['hour'] = X['time1'].dt.hour
    new_data['hour_sin'] = new_data['hour'].apply(lambda ts: np.sin(2*np.pi*ts/24.))
    new_data['hour_cos'] = new_data['hour'].apply(lambda ts: np.cos(2*np.pi*ts/24.))
    new_data['hour_sin_17h'] = new_data['hour'].apply(lambda ts: np.sin(2*np.pi*(ts - 7)/17.))
    new_data['hour_cos_17h'] = new_data['hour'].apply(lambda ts: np.cos(2*np.pi*(ts - 7)/17.))
    
    new_data['hour_bin_1.1'] = new_data['hour'].apply(lambda hour: int(hour >= 0 and hour <= 6))
    new_data['hour_bin_1.2'] = new_data['hour'].apply(lambda hour: int(hour >= 7 and hour <= 11)) 
    new_data['hour_bin_1.3'] = new_data['hour'].apply(lambda hour: int(hour >= 12 and hour <= 18)) 
    new_data['hour_bin_1.4'] = new_data['hour'].apply(lambda hour: int(hour >= 19 and hour <= 23))

    new_data['hour_bin_2.1'] = new_data['hour'].apply(lambda hour: int(hour <= 11)) 
    new_data['hour_bin_2.2'] = new_data['hour'].apply(lambda hour: int(hour >= 12 and hour <= 13)) 
    new_data['hour_bin_2.3'] = new_data['hour'].apply(lambda hour: int(hour >= 14 and hour <= 15)) 
    new_data['hour_bin_2.4'] = new_data['hour'].apply(lambda hour: int(hour >= 16 and hour <= 18)) 
    new_data['hour_bin_2.5'] = new_data['hour'].apply(lambda hour: int(hour >= 19 and hour <= 23))
    
    new_data['hour_bin_3.1'] = new_data['hour'].apply(lambda hour: int(hour <= 11)) 
    new_data['hour_bin_3.2'] = new_data['hour'].apply(lambda hour: int(hour >= 12 and hour <= 15))
    new_data['hour_bin_3.3'] = new_data['hour'].apply(lambda hour: int(hour >= 16 and hour <= 18)) 
    new_data['hour_bin_3.4'] = new_data['hour'].apply(lambda hour: int(hour >= 19 and hour <= 23))
    
    new_data['hour_minutes'] = X['time1'].dt.hour * 60 + X['time1'].dt.minute
    new_data['hour_minutes_sin'] = new_data['hour_minutes'].apply(lambda ts: np.sin(2*np.pi*ts/1440.))
    new_data['hour_minutes_cos'] = new_data['hour_minutes'].apply(lambda ts: np.cos(2*np.pi*ts/1440.))
    
    print("Eval other time params")
    new_data['day'] = X['time1'].dt.day
    new_data['week'] = X['time1'].dt.weekday
    new_data['holiday'] = (new_data['week'] >= 5).astype(int)
    new_data['month'] = X['time1'].dt.month
    new_data['quarter'] = X['time1'].dt.quarter

    return new_data

In [222]:
X_train = preproc_data(df_train)
# X_test = preproc_data(df_test)
y_train = df_train['target']

X_train.head()

Making sites in one row
Count sites
Count top 5
Count top 20
Count top 50
Count top 100
Eval duration
Eval time diffs
Eval hour params
Eval other time params


Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,hour_bin_3.3,hour_bin_3.4,hour_minutes,hour_minutes_sin,hour_minutes_cos,day,week,holiday,month,quarter
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,55,0,0,0,0,0,0,0,0,...,0,0,485,0.854912,-0.518773,12,5,1,1,1
54843,56,55,56,55,0,0,0,0,0,0,...,0,0,517,0.774393,-0.632705,12,5,1,1,1
77292,946,946,951,946,946,945,948,784,949,946,...,0,0,530,0.737277,-0.67559,12,5,1,1,1
114021,945,948,949,948,945,946,947,945,946,946,...,0,0,530,0.737277,-0.67559,12,5,1,1,1
146670,947,950,948,947,950,952,946,951,946,947,...,0,0,530,0.737277,-0.67559,12,5,1,1,1


In [None]:
sites_cols_all = ['n_sites', 'n_unique_sites', 'rate_unique_sites', 
                  'top5', 'utop5', 'top_rate5', 'atop5', 'uatop5', 'top_arate5', 
                  'top20', 'utop20', 'top_rate20', 'atop20', 'uatop20', 'top_arate20', 
                  'top50', 'utop50', 'top_rate50', 'atop50', 'uatop50',  'top_arate50', 
                  'top100', 'utop100', 'top_rate100', 'atop100', 'uatop100', 'top_arate100']

times_cols_all = ['s_duration', 's_duration_log',
                  'tdiff_time2', 'tdiff_time3', 'tdiff_time4', 'tdiff_time5', 'tdiff_time6', 'tdiff_time7', 'tdiff_time8', 'tdiff_time9', 'tdiff_time10', 
                  'tdiff_mean', 'tdiff_std', 'tdiff_var', 
                  'hour', 'hour_sin', 'hour_cos', 'hour_sin_17h', 'hour_cos_17h', 
                  'hour_bin_1.1', 'hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4', 
                  'hour_bin_2.1', 'hour_bin_2.2', 'hour_bin_2.3', 'hour_bin_2.4', 'hour_bin_2.5',
                  'hour_bin_3.1', 'hour_bin_3.2', 'hour_bin_3.3', 'hour_bin_3.4',
                  'hour_minutes', 'hour_minutes_sin', 'hour_minutes_cos', 
                  'day', 'week', 'holiday', 'month', 'quarter']

In [265]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class Multiplier(BaseEstimator, TransformerMixin):
    def __init__(self, multipliers):
        self.multipliers = multipliers

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for col, mult in self.multipliers.items():
            X[col] = X[col] * mult
        return X

class PartScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_scale):
        self.cols_to_scale = cols_to_scale
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X[self.cols_to_scale])
        return self

    def transform(self, X):
        X[self.cols_to_scale] = self.scaler.transform(X[self.cols_to_scale])
        return X
    
tfidf_col = 'sites'
sites_cols_use = ['n_sites', 'n_unique_sites', 'rate_unique_sites']
times_cols_use = ['s_duration_log', 'tdiff_mean', 'hour_bin_3.1', 'hour_bin_3.2', 'hour_bin_3.3', 'hour_bin_3.4', 
                  'holiday', 'quarter']
sites_cols_scale = sites_cols_use
times_cols_scale = times_cols_use

new_feature_names = sites_cols_use + times_cols_use

pl_sites_tfidf = Pipeline([
            ('select', Selector(key=tfidf_col)),
            ('tfidf', TfidfVectorizer(ngram_range=(1, 3), max_features=50000, 
                                      smooth_idf=True, binary=True, sublinear_tf=True))
            ])

pl_sites = Pipeline([
            ('select', Selector(key=sites_cols_use)),
            ('scaler', PartScaler(cols_to_scale=sites_cols_scale))
            ])
pl_times = Pipeline([
            ('select', Selector(key=times_cols_use)),
            ('scaler', PartScaler(cols_to_scale=times_cols_scale))
            ])

# pl_mm_sites = Pipeline([
#             ('select', Selector(key=sites_cols_use)),
#             ('scaler', MinMaxScaler())
#             ])
# pl_mm_times = Pipeline([
#             ('select', Selector(key=times_cols_use)),
#             ('scaler', MinMaxScaler())
#             ])

feats_all = FeatureUnion([
                    ('tfidf', pl_sites_tfidf), 
                    ('site', pl_sites),
                    ('time', pl_times),
                    ])

pl_all = Pipeline([
                ('feats', feats_all),
                ('clf', LogisticRegression(C=3.36, random_state=17))
              ])

In [146]:
time_split = TimeSeriesSplit(n_splits=10)

In [None]:
new_params = {
    'feats__site__select__key':,
    'feats__time__select__key':
}
pl_all.set_params(new_params)

In [223]:
cv_scores = train_and_predict(pl_all, X_train, y_train, X_test, new_feature_names=new_feature_names, 
                              cv=time_split, submission_file_name='subm1.csv')

CV scores [0.63069556 0.81052432 0.80932491 0.96650384 0.91013896 0.97732602
 0.91728036 0.95154464 0.97860902 0.97585911]
CV mean: 0.8927806742169867, CV std: 0.10659543941822727


Weight?,Feature
+9.506,www.express.co.uk
+9.419,cid-ed6c3e6a5c6608a4.users.storage.live.com
+7.617,www.tete-en-lair.com
+7.302,www.info-jeunes.net
+7.067,inskin01.wt-eu02.net
+6.193,tru.am
+5.963,vk.com
+5.945,cid-4390f92c7906cc9c.users.storage.live.com
+5.762,www.audienceinsights.net
+5.677,www.melty.fr


New feature weights:
              feature      coef
0             n_sites  0.418412
1      n_unique_sites -0.439911
2   rate_unique_sites  0.281502
3      s_duration_log -0.161264
4          tdiff_mean -0.127971
5        hour_bin_3.1 -0.947009
6        hour_bin_3.2  0.448411
7        hour_bin_3.3  1.087984
8        hour_bin_3.4 -0.709140
9             holiday -0.374594
10            quarter  0.459186


times_cols_use = ['hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4', 
                  's_duration_log', 'week']  
times_cols_scale = times_cols_use  
time_multipliers = {col: 4 for col in ['hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4']}  

CV scores [0.84356453 0.80263008 0.93680874 0.97206808 0.92006707 0.95113146
 0.95939422 0.94596484 0.9654401  0.96336943]  
CV mean: 0.9260438552580255, CV std: 0.0542101245717717  


In [None]:
sites_cols_all = ['n_sites', 'n_unique_sites', 'rate_unique_sites', 
                  'top5', 'utop5', 'top_rate5', 'atop5', 'uatop5', 'top_arate5', 
                  'top20', 'utop20', 'top_rate20', 'atop20', 'uatop20', 'top_arate20', 
                  'top50', 'utop50', 'top_rate50', 'atop50', 'uatop50',  'top_arate50', 
                  'top100', 'utop100', 'top_rate100', 'atop100', 'uatop100', 'top_arate100']

times_cols_all = ['s_duration', 's_duration_log',
                  'tdiff_time2', 'tdiff_time3', 'tdiff_time4', 'tdiff_time5', 'tdiff_time6', 'tdiff_time7', 'tdiff_time8', 'tdiff_time9', 'tdiff_time10', 
                  'tdiff_mean', 'tdiff_std', 'tdiff_var', 
                  'hour', 'hour_sin', 'hour_cos', 'hour_sin_17h', 'hour_cos_17h', 
                  'hour_bin_1.1', 'hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4', 
                  'hour_bin_2.1', 'hour_bin_2.2', 'hour_bin_2.3', 'hour_bin_2.4', 'hour_bin_2.5',
                  'hour_bin_3.1', 'hour_bin_3.2', 'hour_bin_3.3', 'hour_bin_3.4',
                  'hour_minutes', 'hour_minutes_sin', 'hour_minutes_cos', 
                  'day', 'week', 'holiday', 'month', 'quarter']

In [284]:
pl_sites = Pipeline([
            ('select', Selector(key=sites_cols_use)),
            ('scaler', PartScaler(cols_to_scale=sites_cols_scale)),
            ('mult', Multiplier(multipliers=site_multipliers))
            ])
pl_times = Pipeline([
            ('select', Selector(key=times_cols_use)),
            ('scaler', PartScaler(cols_to_scale=times_cols_scale)),
            ('mult', Multiplier(multipliers=time_multipliers)),
            ])

feats_all = FeatureUnion([
                    ('tfidf', pl_sites_tfidf), 
                    ('site', pl_sites),
                    ('time', pl_times),
                    ])

pl_all = Pipeline([
                ('feats', feats_all),
                ('clf', LogisticRegression(C=3.36, random_state=17))
              ])

sites_cols_use = ['utop5']
sites_cols_scale = sites_cols_use
site_multipliers = {}

times_cols_use = ['hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4', 
                  's_duration_log', 'week']
times_cols_scale = times_cols_use
time_multipliers = {col: 4 for col in ['hour_bin_1.2', 'hour_bin_1.3', 'hour_bin_1.4']}

new_feature_names = sites_cols_use + times_cols_use

new_params = {
    'feats__site__select__key': sites_cols_use,
    'feats__site__scaler__cols_to_scale': sites_cols_scale,
    'feats__site__mult__multipliers': site_multipliers,
    'feats__time__select__key': times_cols_use,
    'feats__time__scaler__cols_to_scale': times_cols_scale,
    'feats__time__mult__multipliers': time_multipliers
}

pl_all.set_params(**new_params)

cv_scores = train_and_predict(pl_all, X_train, y_train, X_test, new_feature_names=new_feature_names, 
                              cv=time_split, submission_file_name='subm.csv')

CV scores [0.84348357 0.80194655 0.93669637 0.97185967 0.91910566 0.95066294
 0.95915775 0.94570029 0.96531759 0.96313671]
CV mean: 0.925706709874784, CV std: 0.054298452139029924


In [None]:
CV scores [0.84294169 0.80249486 0.93680474 0.97199644 0.91994177 0.9510159
 0.96012776 0.94628275 0.96576051 0.96389505]
CV mean: 0.9261261481016886, CV std: 0.054442613023800324

CV scores [0.84356453 0.80263008 0.93680874 0.97206808 0.92006707 0.95113146
 0.95939422 0.94596484 0.9654401  0.96336943]
CV mean: 0.9260438552580255, CV std: 0.0542101245717717


In [224]:
# mask = df_train['time1'] >= '2013-09-01'

# cv_scores = train_and_predict(pl_all, X_train.loc[mask], y_train.loc[mask], X_test, 
#                               new_feature_names=new_feature_names, 
#                               cv=time_split, submission_file_name='subm1.csv')

CV scores [0.77155298 0.82195919 0.97311692 0.84137333 0.9638262  0.97091797
 0.90627378 0.94872638 0.97724456 0.97675957]
CV mean: 0.9151750882843354, CV std: 0.0724188226258625


Weight?,Feature
+9.407,cid-ed6c3e6a5c6608a4.users.storage.live.com
+7.517,www.tete-en-lair.com
+7.259,www.info-jeunes.net
+6.133,www.melty.fr
+5.894,cid-4390f92c7906cc9c.users.storage.live.com
+5.837,www.audienceinsights.net
+5.707,dub119.mail.live.com
+5.663,www.purepeople.com
+5.588,www.clermont-filmfest.com
+5.480,vk.com


New feature weights:
              feature      coef
0             n_sites  0.445165
1      n_unique_sites -0.493026
2   rate_unique_sites  0.296914
3      s_duration_log -0.160987
4          tdiff_mean -0.137365
5        hour_bin_3.1 -1.601602
6        hour_bin_3.2  0.868337
7        hour_bin_3.3  1.386172
8        hour_bin_3.4 -0.568917
9             holiday -0.311879
10            quarter  0.471043
