In [95]:
import warnings
import pickle

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from tqdm import tqdm 
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
PATH_TO_DATA = 'data/'

In [2]:
time_cols = ['time%d' % i for i in range(1, 11)]
site_cols = ['site%d' % i for i in range(1, 11)]

train_df = pd.read_csv(PATH_TO_DATA + 'train_sessions.csv', index_col='session_id', parse_dates=time_cols)
test_df = pd.read_csv(PATH_TO_DATA + 'test_sessions.csv', index_col='session_id', parse_dates=time_cols)

with open(PATH_TO_DATA + 'site_dic.pkl', 'rb') as site_file:
     sites_dict = pickle.load(site_file)
        
id_sites_dict = {v: k for k, v in sites_dict.items()}

In [3]:
def split_data(X_data, y_data):
    grouped = train_df[['target']].groupby(by='target')
    
    train_ids = []
    valid_ids = []
    
    for g in tqdm(grouped.groups.keys()):
        train_shape = int(grouped.get_group(g).shape[0] * 0.7)

        ids_to_train = grouped.get_group(g).index[:train_shape]
        ids_to_valid = grouped.get_group(g).index[train_shape:]

        train_ids.extend(ids_to_train)
        valid_ids.extend(ids_to_valid)
        
    train_ids = np.array(train_ids) - 1
    valid_ids = np.array(valid_ids) - 1
        
    return X_data.tocsc()[train_ids], y_data[train_ids], X_data.tocsc()[valid_ids], y_data[valid_ids]

In [4]:
def get_dense_matrix(matrix):
    site_ids = list(id_sites_dict)
    X = matrix.values
    
    i = 0
    data = list()
    col = list()
    rows = list()
    for row in tqdm(X):
        unique, counts = np.unique(row, return_counts=True)
        dic = dict(zip(unique, counts))
        for k in dic:
            if k != 0:
                data.append(dic[k])
                rows.append(i)
                col.append(k-1)
            
        i += 1
    X_sparse = csr_matrix((data, (rows, col)), shape=(X.shape[0], len(site_ids)))
    return X_sparse

In [5]:
def score(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    valid_score = model.predict_proba(X_valid)
    print(roc_auc_score(y_valid, valid_score[:, 1:]))

In [6]:
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [7]:
def make_submission(model, X_train, y_train, X_test, file_name):
    print(X_train.shape)
    print(X_test.shape)
    model.fit(X_train, y_train)
    test_pred_proba = model.predict_proba(X_test)
    write_to_submission_file(test_pred_proba[:, 1:], file_name)

In [8]:
def exptact_time_features(data):

    day_offset = 24
    month_offset = day_offset + 7
    morning_offset = month_offset + 12
    evening_offset = morning_offset + 1
    row_size = evening_offset + 2
    values = []

    for _, row in tqdm(data.iterrows()):
        
        time = row[time_cols[0]]

        r = np.zeros(row_size)
        r[time.hour] += 1
        r[day_offset + time.dayofweek] += 1
        r[month_offset + time.month] += 1
        r[morning_offset] = time.hour < 11
        r[evening_offset] = time.hour > 19
        values.append(r[1:])
        
    return csr_matrix(values)

In [9]:
def unique(data):
    return csr_matrix([[sum(1 for s in np.unique(row.values) if s != 0)] for _, row in tqdm(data.iterrows())])

In [10]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in train_df[site_cols].iterrows()]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in test_df[site_cols].iterrows()]

In [11]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 2)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 37.3 s


In [12]:
X_tmp_train = hstack((X_train_idf, 
                      exptact_time_features(train_df[time_cols]),
                      unique(train_df[site_cols].fillna(0).astype('int'))))

X_tmp_test = hstack((X_test_idf, 
                     exptact_time_features(test_df[time_cols]),
                     unique(test_df[site_cols].fillna(0).astype('int'))))

253561it [00:18, 13547.16it/s]
253561it [00:26, 9605.02it/s]
82797it [00:05, 14269.17it/s]
82797it [00:08, 9822.51it/s] 


In [13]:
X_train, y_train, X_valid, y_valid = split_data(X_tmp_train, train_df['target'].values.astype('int64'))

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 33.37it/s]


((177491, 129384), (177491,), (76070, 129384), (76070,))

In [14]:
%%time
logit_c_values = np.logspace(-4, 2, 10)

skf = StratifiedKFold(n_splits=3, random_state=17)

logit_grid_searcher = LogisticRegressionCV(Cs=logit_c_values, cv=skf, n_jobs=-1)
logit_grid_searcher.fit(X_train, y_train)

Wall time: 54.9 s


In [15]:
logit_mean_cv_scores = next (iter (logit_grid_searcher.scores_.values())).mean(axis=0)
pd.Series(logit_mean_cv_scores, index=logit_grid_searcher.Cs_).sort_values(ascending=False)

100.000000    0.995205
21.544347     0.995183
4.641589      0.994653
1.000000      0.993600
0.215443      0.992687
0.046416      0.991481
0.010000      0.990946
0.002154      0.990946
0.000464      0.990946
0.000100      0.990946
dtype: float64

In [16]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y_train, X_valid, y_valid)

0.9900939394988099
Wall time: 19.9 s


In [17]:
X_train_sparse = get_dense_matrix(train_df[site_cols].fillna(0).astype('int'))
X_train_time_features = exptact_time_features(train_df[time_cols])
X_Train_unique = unique(train_df[site_cols].fillna(0).astype('int'))

100%|███████████████████████████████| 253561/253561 [00:11<00:00, 23033.61it/s]
253561it [00:18, 14029.15it/s]
253561it [00:21, 11798.66it/s]


In [18]:
X_tmp_train = hstack((X_train_sparse, X_train_time_features, X_Train_unique))

In [19]:
X_train, y_train, X_valid, y_valid = split_data(X_tmp_train, train_df['target'].values.astype('int64'))

100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 29.61it/s]


In [20]:
pd.Series(y_train).value_counts()

0    175884
1      1607
dtype: int64

In [21]:
pd.Series(y_valid).value_counts()

0    75380
1      690
dtype: int64

In [22]:
y = train_df['target'].values.astype('int64')
X_train, X_valid, y_train, y_valid = train_test_split(X_tmp_train, y, train_size =0.7, stratify=y)

In [23]:
pd.Series(y_train).value_counts()

0    175884
1      1608
dtype: int64

In [24]:
pd.Series(y_valid).value_counts()

0    75380
1      689
dtype: int64

In [14]:
def score(model, X, y, train_size=0.7, random_states=[1, 13, 42]):
    result = []
    
    for rs in random_states:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=train_size, stratify=y, random_state=rs)
        m = clone(model, safe=True)
        m.fit(X_train, y_train)
        valid_score = m.predict_proba(X_valid)
        result.append(roc_auc_score(y_valid, valid_score[:, 1:]))
        
    return result

In [26]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y_train, X_valid, y_valid)

ValueError: Invalid value for train_size: <76069x48417 sparse matrix of type '<class 'numpy.float64'>'
	with 757097 stored elements in Compressed Sparse Row format>

In [26]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_tmp_train, y)

Wall time: 54.3 s


[0.9869365413592901, 0.9869002280077986, 0.9894256521673834]

In [27]:
tmp = train_df[time_cols].head(3)

In [28]:
tmp

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2014-02-22 11:19:50,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,2014-02-22 11:20:16
3,2013-12-16 16:40:17,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22,2013-12-16 16:40:24


In [16]:
def extract_year_month(data):
    time = time_cols[0]
    values = [row[time].year * 100 + row[time].month for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [30]:
X_train_year_month = extract_year_month(train_df[time_cols])

253561it [00:21, 11980.31it/s]


In [31]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_year_month))

In [32]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min 8s


[0.9870906901885792, 0.9870493707546978, 0.9895351505925853]

In [17]:
def extract_part_of_day(data):
    time = time_cols[0]
    values = [row[time].hour // 6 for _, row in tqdm(data.iterrows())]
    series = pd.Series(values)
    return csr_matrix(pd.get_dummies(series))

In [34]:
X_train_part_of_day = extract_part_of_day(train_df[time_cols])

253561it [00:16, 15815.78it/s]


In [35]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_part_of_day))

In [36]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min


[0.986982616571442, 0.9869237373408691, 0.9892917972259372]

In [18]:
def extract_weekend(data):
    time = time_cols[0]
    values = [[row[time].dayofweek > 4] for _, row in tqdm(data.iterrows())]
    return csr_matrix(values)

In [38]:
X_train_weekend = extract_weekend(train_df[time_cols])

253561it [00:16, 15684.93it/s]


In [39]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_weekend))

In [40]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 1min


[0.9869277422067813, 0.9869012099701138, 0.9894044725880407]

In [41]:
tmp = train_df[time_cols].head(3)

In [42]:
tmp

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2014-02-22 11:19:50,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,2014-02-22 11:20:16
3,2013-12-16 16:40:17,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22,2013-12-16 16:40:24


In [19]:
def extract_duration(data):
    values = []
    time = time_cols[0]

    for _, row in tqdm(data.iterrows()):

        first = row[time]
        last = first

        for t, check in zip(time_cols, row.values == np.datetime64('NaT')):
            if check:
                break
            else:
                last = row[t]

        values.append([np.log1p(last.minute - first.minute)])

    return csr_matrix(np.nan_to_num(values))

In [44]:
X_train_duration = extract_duration(train_df[time_cols])

253561it [01:11, 3566.61it/s]


In [45]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_duration))

In [46]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 56.2 s


[0.9870381359505644, 0.9868416472167529, 0.9895346114760203]

In [20]:
def extract_week(data):
    time = time_cols[0]
    values = []
    
    for _, row in tqdm(data.iterrows()):
        
        r = np.zeros(53)
        r[row[time].week] = 1
        values.append(r)
        
    return csr_matrix(values)

In [48]:
X_train_weeks = extract_week(train_df[time_cols])

253561it [00:17, 14362.88it/s]


In [49]:
X = hstack((X_train_sparse, X_train_time_features, X_Train_unique, X_train_weeks))

In [50]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 50.2 s


[0.9894175365376625, 0.9901336951318929, 0.9910148522762849]

In [51]:
X = hstack((X_train_sparse, 
            X_train_time_features, 
            X_Train_unique, 
            X_train_year_month, 
            X_train_part_of_day, 
            X_train_weekend,
            X_train_duration,
            X_train_weeks))

In [52]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X, y)

Wall time: 53.4 s


[0.9895003968283003, 0.990180992983398, 0.9909503123217787]

In [22]:
train_test_sites_df = pd.concat([train_df[site_cols].fillna(0).astype('int'), test_df[site_cols].fillna(0).astype('int')])
train_test_times_df = pd.concat([train_df[time_cols], test_df[time_cols]])

In [23]:
X_tmp_sparse = get_dense_matrix(train_test_sites_df)
X_tmp_time_features = exptact_time_features(train_test_times_df)
X_tmp_unique = unique(train_test_sites_df)

X_tmp_year_month = extract_year_month(train_test_times_df)
X_tmp_part_of_day = extract_part_of_day(train_test_times_df)
X_tmp_weekend = extract_weekend(train_test_times_df)
X_tmp_duration = extract_duration(train_test_times_df)
X_tmp_weeks = extract_week(train_test_times_df)

100%|███████████████████████████████████████████████████████████████████████| 336358/336358 [00:13<00:00, 25523.40it/s]
336358it [00:23, 14116.79it/s]
336358it [00:36, 9297.70it/s]
336358it [00:26, 12652.57it/s]
336358it [00:20, 16380.67it/s]
336358it [00:22, 15117.17it/s]
336358it [01:24, 3958.92it/s]
336358it [00:21, 15349.42it/s]


In [21]:
def split_train_and_test(data, train_size):
    return data.tocsc()[:train_size], data.tocsc()[train_size:]

In [57]:
X_tmp.shape, X_train.shape, X_test.shape, train_df.shape[0]

NameError: name 'X_tmp' is not defined

In [56]:
X_tmp = hstack((X_tmp_sparse, 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
#                 X_tmp_part_of_day, 
#                 X_tmp_weekend,
#                 X_tmp_duration,
#                 X_tmp_weeks
               ))



In [57]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [24]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(train_df[site_cols].iterrows())]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(test_df[site_cols].iterrows())]

253561it [00:17, 14354.43it/s]
82797it [00:05, 14436.92it/s]


In [25]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 5)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 1min 41s


In [60]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks
               ))

In [61]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [62]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 4min 44s


[0.9902382933726015, 0.9920294503976177, 0.989674647003802]

In [63]:
%%time
score(LogisticRegression(C=15, n_jobs=-1), X_train, y)

Wall time: 4min 14s


[0.990263362292878, 0.9920144128962844, 0.9894890368721073]

In [65]:
%%time
make_submission(LogisticRegression(C=15, n_jobs=-1), X_train, y, X_test, 'i_test.csv') # 0.96002

(253561, 1698311)
(82797, 1698311)
Wall time: 2min 4s


In [64]:
X_sparse = vstack((X_train_idf, X_test_idf))

In [65]:
from sklearn.decomposition import TruncatedSVD

In [66]:
svd = TruncatedSVD(n_components=100)
X_svd = svd.fit_transform(X_sparse)

In [69]:
X_tmp = hstack((X_sparse, X_svd,
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks
               ))

In [70]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [71]:
%%time
score(LogisticRegression(C=21.544347, n_jobs=-1), X_train, y)

Wall time: 6min 26s


[0.9902779376942987, 0.9921437046010904, 0.9899757628595668]

In [72]:
%%time
score(LogisticRegression(C=15, n_jobs=-1), X_train, y)

Wall time: 6min 10s


[0.9902991942902936, 0.9921449753758509, 0.9898079050661939]

In [74]:
%%time
score(LogisticRegression(C=21.544347, solver='lbfgs', n_jobs=-1), X_train, y)

Wall time: 3min 6s


[0.9879016274003685, 0.9884361807288162, 0.9825051090921624]

In [75]:
%%time
score(LogisticRegression(C=15, solver='lbfgs', n_jobs=-1), X_train, y)

Wall time: 3min 6s


[0.9874112623760948, 0.9890079523544183, 0.9820001879206313]

In [76]:
%%time
score(LogisticRegression(C=37.92690190732246, solver='lbfgs', n_jobs=-1), X_train, y)

Wall time: 2min 58s


[0.9869483152799883, 0.98891019896867, 0.9822151606509602]

In [80]:
from sklearn.model_selection import GridSearchCV 

In [81]:
%%time
grid_params = {'C' : np.logspace(-2, 2, 20)}
gridsearcer = GridSearchCV(LogisticRegression(solver='lbfgs', n_jobs=-1), grid_params, cv=3, scoring='roc_auc', n_jobs=-1)
gridsearcer.fit(X_train, y)

NameError: name 'grid_params' is not defined

In [67]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 3), max_features=100000).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 50.8 s


# HERE

In [26]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks
               ))

In [27]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [29]:
%%time
score(LogisticRegression(C=15, n_jobs=-1), X_train, y)

Wall time: 3min 41s


[0.9902807488021023, 0.9919933103335938, 0.9894300036082302]

In [28]:
%%time
make_submission(LogisticRegression(C=15, n_jobs=-1), X_train, y, X_test, 'i_test_tfidf_3_max_100K.csv') # 0.95952

(253561, 1698311)
(82797, 1698311)
Wall time: 2min


# SUBMIT

In [36]:
def extract_alice_top_30(df_sites):
    alice_top_30 = pd.Series(df_sites[:y.shape[0]][y==1].values.flatten()).value_counts()[:30]
    alice_top_30 = alice_top_30.drop(0) 
    return df_sites.apply(lambda row: row.isin(alice_top_30).astype(int), axis=1).max(axis=1).values.reshape(-1,1)

In [37]:
def extract_long(df):
    return ((df.max(axis=1) - df.time1).dt.seconds > 60).astype(int).values.reshape(-1, 1)

In [38]:
def extract_year(df):
    return (df.time1.dt.year % 2013).astype(int).values.reshape(-1, 1)

In [288]:
def exptact_time_features(data):

    day_offset = 24
    month_offset = day_offset + 7
    morning_offset = month_offset + 12
    row_size = morning_offset + 3
    values = []

    for _, row in tqdm(data.iterrows()):
        
        time = row[time_cols[0]]

        r = np.zeros(row_size)
        r[time.hour] += 1
        r[day_offset + time.dayofweek] += 1
        r[month_offset + time.month] += 1
        r[morning_offset] = time.hour < 11
        r[morning_offset + 1] = time.hour > 19
#         r[morning_offset + 2] = (time.hour >= 11) and (time.hour <= 19)
        values.append(r[1:])
        
    return csr_matrix(values)

X_tmp_time_features = exptact_time_features(train_test_times_df)


0it [00:00, ?it/s]
1320it [00:00, 13082.82it/s]
2683it [00:00, 13207.78it/s]
4039it [00:00, 13276.47it/s]
5384it [00:00, 13292.68it/s]
6743it [00:00, 13345.18it/s]
8126it [00:00, 13451.76it/s]
9490it [00:00, 13471.83it/s]
10792it [00:00, 13296.41it/s]
12138it [00:00, 13309.58it/s]
13493it [00:01, 13345.39it/s]
14797it [00:01, 13216.18it/s]
16094it [00:01, 12833.63it/s]
17387it [00:01, 12828.05it/s]
18734it [00:01, 12980.41it/s]
20091it [00:01, 13117.71it/s]
21465it [00:01, 13263.88it/s]
22810it [00:01, 13283.75it/s]
24167it [00:01, 13333.05it/s]
25538it [00:01, 13408.62it/s]
26879it [00:02, 13333.24it/s]
28212it [00:02, 13178.34it/s]
29543it [00:02, 13182.39it/s]
30862it [00:02, 13071.25it/s]
32186it [00:02, 13086.55it/s]
33495it [00:02, 13052.49it/s]
34862it [00:02, 13197.46it/s]
36224it [00:02, 13286.51it/s]
37591it [00:02, 13364.08it/s]
38958it [00:02, 13418.91it/s]
40301it [00:03, 13228.21it/s]
41662it [00:03, 13305.38it/s]
43021it [00:03, 13354.18it/s]
44383it [00:03, 13397.33it/

In [318]:
def get_tdiff(X):
    times = time_cols
    tdiff = X[times].diff(axis=1)
    for col in times:
#         new_data[col] = X[col]
        tdiff[col] = tdiff[col].dt.total_seconds()
        tdiff.loc[tdiff[col] > 1800, col] = np.NaN
    tdiff_cols = [f'tdiff_{col}' for col in times[:-1]]
    
    new_data = np.zeros((X.shape[0], 3))
#     new_data[tdiff_cols] = tdiff[times[:-1]]
    new_data_mean = StandardScaler().fit_transform(tdiff[times[:-1]].mean(axis=1).values.reshape(-1, 1)).flatten()
    new_data_std = StandardScaler().fit_transform(tdiff[times[:-1]].std(axis=1).values.reshape(-1, 1)).flatten()
    new_data_var = StandardScaler().fit_transform(tdiff[times[:-1]].var(axis=1).values.reshape(-1, 1)).flatten()
    return csr_matrix(new_data_mean.reshape(-1, 1)), csr_matrix(new_data_std.reshape(-1, 1)), csr_matrix(new_data_var.reshape(-1, 1))

X_tmp_tdiff_mean, X_tmp_tdiff_std, X_tmp_tdiff_var = get_tdiff(train_test_times_df)
X_tmp_tdiff_mean.shape

(336358, 1)

In [None]:
str_train = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(train_df[site_cols].iterrows())]
str_test = [' '.join([id_sites_dict[idx] for idx in row.values if idx in id_sites_dict]) for _, row in tqdm(test_df[site_cols].iterrows())]

# num_train = [' '.join([str(idx) for idx in row.values if ~np.isnan(idx)]) for _, row in tqdm(train_df[site_cols].iterrows())]
# num_test = [' '.join([str(idx) for idx in row.values if ~np.isnan(idx)]) for _, row in tqdm(test_df[site_cols].iterrows())]

# str_train_ws = [' '.join(row.split('.')) for row in str_train]
# str_test_ws = [' '.join(row.split('.')) for row in str_test]



In [342]:
for col in time_cols:
    train_df[col] = pd.to_datetime(train_df[col])
    test_df[col] = pd.to_datetime(test_df[col])

In [344]:
hour_train = [' '.join([str(idx.hour) for idx in pd.to_datetime(row.values) if idx]) for _, row in tqdm(train_df[time_cols].iterrows())]
hour_test = [' '.join([str(idx.hour) for idx in pd.to_datetime(row.values) if idx]) for _, row in tqdm(test_df[time_cols].iterrows())]









0it [00:00, ?it/s]







209it [00:00, 2071.43it/s]







418it [00:00, 2071.43it/s]







630it [00:00, 2080.26it/s]







844it [00:00, 2092.32it/s]







1055it [00:00, 2092.01it/s]







1267it [00:00, 2094.74it/s]







1479it [00:00, 2096.67it/s]







1687it [00:00, 2086.01it/s]







1890it [00:00, 2063.23it/s]







2089it [00:01, 2035.09it/s]







2287it [00:01, 2012.73it/s]







2485it [00:01, 1961.79it/s]







2696it [00:01, 1998.91it/s]







2902it [00:01, 2011.56it/s]







3110it [00:01, 2030.43it/s]







3321it [00:01, 2048.31it/s]







3537it [00:01, 2078.98it/s]







3750it [00:01, 2088.50it/s]







3959it [00:01, 2087.80it/s]







4171it [00:02, 2091.79it/s]







4381it [00:02, 2076.27it/s]







4589it [00:02, 2052.07it/s]







4801it [00:02, 2066.56it/s]







5017it [00:02, 2088.57it/s]







5229it [00:02, 2092.33it/s]







5439it [00:02, 2088.99it/s]







5651it [00:02, 2092.60it/s]







5863it [00:02, 2095.16it/

96447it [00:46, 2147.94it/s]







96663it [00:46, 2145.80it/s]







96879it [00:46, 2144.30it/s]







97095it [00:46, 2143.26it/s]







97312it [00:47, 2145.49it/s]







97529it [00:47, 2147.06it/s]







97745it [00:47, 2145.18it/s]







97963it [00:47, 2149.80it/s]







98180it [00:47, 2150.07it/s]







98396it [00:47, 2147.29it/s]







98614it [00:47, 2151.27it/s]







98832it [00:47, 2154.08it/s]







99049it [00:47, 2153.07it/s]







99266it [00:47, 2152.38it/s]







99485it [00:48, 2157.79it/s]







99702it [00:48, 2155.67it/s]







99918it [00:48, 2151.19it/s]







100134it [00:48, 2148.07it/s]







100352it [00:48, 2151.83it/s]







100568it [00:48, 2142.13it/s]







100785it [00:48, 2144.70it/s]







101002it [00:48, 2146.51it/s]







101217it [00:48, 2141.80it/s]







101434it [00:48, 2144.48it/s]







101649it [00:49, 2121.40it/s]







101862it [00:49, 1944.43it/s]







102060it [00:49, 1949.79it/s]







102270it [00:49, 1

144379it [01:09, 2137.37it/s]







144594it [01:09, 2135.43it/s]







144813it [01:09, 2145.85it/s]







145031it [01:09, 2150.26it/s]







145247it [01:09, 2147.42it/s]







145465it [01:09, 2151.36it/s]







145681it [01:09, 2148.20it/s]







145898it [01:09, 2148.95it/s]







146115it [01:09, 2149.49it/s]







146332it [01:10, 2149.86it/s]







146548it [01:10, 2147.13it/s]







146763it [01:10, 2142.25it/s]







146979it [01:10, 2141.82it/s]







147198it [01:10, 2150.35it/s]







147414it [01:10, 2147.49it/s]







147631it [01:10, 2148.46it/s]







147848it [01:10, 2149.14it/s]







148063it [01:10, 2143.64it/s]







148278it [01:10, 2139.80it/s]







148495it [01:11, 2143.07it/s]







148714it [01:11, 2151.23it/s]







148930it [01:11, 2148.11it/s]







149145it [01:11, 2136.54it/s]







149361it [01:11, 2137.81it/s]







149575it [01:11, 2126.40it/s]







149792it [01:11, 2133.64it/s]







150009it [01:11, 2138.73it/s]







1

239257it [01:54, 2038.83it/s]







239473it [01:54, 2068.39it/s]







239687it [01:54, 2083.90it/s]







239905it [01:54, 2106.34it/s]







240121it [01:54, 2116.55it/s]







240333it [01:54, 2111.93it/s]







240545it [01:54, 2108.70it/s]







240756it [01:54, 2090.94it/s]







240966it [01:55, 2057.40it/s]







241183it [01:55, 2084.54it/s]







241401it [01:55, 2106.80it/s]







241616it [01:55, 2113.97it/s]







241832it [01:55, 2121.96it/s]







242045it [01:55, 2112.39it/s]







242263it [01:55, 2126.64it/s]







242477it [01:55, 2124.93it/s]







242694it [01:55, 2132.61it/s]







242912it [01:55, 2140.95it/s]







243127it [01:56, 2137.91it/s]







243346it [01:56, 2147.61it/s]







243562it [01:56, 2145.57it/s]







243777it [01:56, 2141.15it/s]







243993it [01:56, 2141.05it/s]







244208it [01:56, 2137.99it/s]







244422it [01:56, 2132.87it/s]







244636it [01:56, 2122.96it/s]







244852it [01:56, 2128.28it/s]







2

82445it [00:39, 1980.82it/s]







82660it [00:39, 2023.57it/s]







82797it [00:39, 2096.49it/s]

In [348]:
str_train_h = list()
for i in range(len(str_train)):
    t_sites = str_train[i].split(' ')
    t_hours = hour_train[i].split(' ')
    res = ' '.join(t_sites[j] + '.' + t_hours[j] for j in range(len(t_sites)))
    str_train_h.append(res)

In [350]:
str_test_h = list()
for i in range(len(str_test)):
    t_sites = str_test[i].split(' ')
    t_hours = hour_test[i].split(' ')
    res = ' '.join(t_sites[j] + '.' + t_hours[j] for j in range(len(t_sites)))
    str_test_h.append(res)

In [375]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 4), max_features=80000, binary=True).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 1min 18s


In [39]:
X_tmp_alice_top_30 = extract_alice_top_30(train_test_sites_df)
X_tmp_long = extract_long(train_test_times_df)
X_tmp_year = extract_year(train_test_times_df)

In [106]:
wscaler = StandardScaler()
X_tmp_weeks_scaled = csr_matrix(wscaler.fit_transform(train_test_times_df['time1'].dt.weekday.values.reshape(-1, 1)))

In [196]:
wscaler = StandardScaler()
X_tmp_nweeks_scaled = csr_matrix(wscaler.fit_transform(train_test_times_df['time1'].dt.week.values.reshape(-1, 1)))
X_tmp_nweeks_scaled = csr_matrix(wscaler.fit_transform(train_test_times_df['time1'].dt.week.values.reshape(-1, 1)))
X_tmp_nweeks_ohe = csr_matrix(pd.get_dummies(train_test_times_df['time1'].dt.week.values))
X_tmp_nweeks_scaled.shape, X_tmp_nweeks_ohe.shape

((336358, 1), (336358, 42))

In [187]:
X_tmp_duration_q4 = csr_matrix(pd.get_dummies(pd.qcut(X_tmp_duration.toarray().flatten(), 10, duplicates='drop')))
X_tmp_duration_4 = csr_matrix(pd.get_dummies(pd.cut(X_tmp_duration.toarray().flatten(), 4, duplicates='drop')))
X_tmp_duration_4.shape, X_tmp_duration_q4.shape

(336358, 4)

In [376]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
#                 X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration * 3,
#                 csr_matrix(StandardScaler().fit_transform(X_tmp_duration.todense())),
#                 X_tmp_duration_4,
                X_tmp_weeks,
                X_tmp_nweeks_ohe,
                X_tmp_nweeks_scaled * 3,
#                 X_tmp_tdiff_mean, 
#                 X_tmp_tdiff_std, 
                X_tmp_tdiff_var,
#                 X_tmp_weeks_scaled
#                 X_tmp_alice_top_30,
#                 X_tmp_long,
#                 X_tmp_year
               ))

In [377]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [373]:
# mask = train_df['time1'] >= "2014-01-01"
# X_train = X_train[np.arange(mask.shape[0])[mask]]
# y = y[np.arange(mask.shape[0])[mask]]

In [378]:
%%time
make_submission(LogisticRegression(C=15, n_jobs=-1), X_train, y, X_test, 'subm_top5.csv') # 

(253561, 80171)
(82797, 80171)
Wall time: 1min 6s


In [75]:
from sklearn.model_selection import cross_val_score

In [76]:
cross_val_score(LogisticRegression(C=15, n_jobs=-1), X_train, y, cv=3, scoring='roc_auc')

array([0.99549224, 0.99565788, 0.99563412])

In [77]:
cross_val_score(LogisticRegression(n_jobs=-1), X_train, y, cv=3)

array([0.99430911, 0.99424995, 0.9943563 ])

In [86]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
                #X_tmp_unique, 
                #X_tmp_year_month, 
                #X_tmp_part_of_day, 
                #X_tmp_weekend,
                #X_tmp_duration,
                #X_tmp_weeks
               ))
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])
print(cross_val_score(LogisticRegression(C=15, n_jobs=-1), X_train, y, cv=3, scoring='roc_auc'))
print(cross_val_score(LogisticRegression(n_jobs=-1), X_train, y, cv=3, scoring='roc_auc'))

[0.98996515 0.98956524 0.98973357]
[0.98097117 0.98324009 0.98291327]


In [85]:
print(cross_val_score(LogisticRegression(C=15, n_jobs=-1), X_train_idf, y, cv=3, scoring='roc_auc'))
print(cross_val_score(LogisticRegression(n_jobs=-1), X_train_idf, y, cv=3, scoring='roc_auc'))

[0.96451302 0.96575294 0.96158993]
[0.94855154 0.95410396 0.94985419]


In [93]:
train_df_str = train_df[site_cols].fillna(0).astype(int).astype(str).apply(lambda row: ' '.join(row), axis=1)
train_df_str[:2].values

array(['718 0 0 0 0 0 0 0 0 0',
       '890 941 3847 941 942 3846 3847 3846 1516 1518'], dtype=object)

In [98]:
tfidf_vec = TfidfVectorizer(ngram_range = (1, 3), max_features=100000)
X_train_sparse = tfidf_vec.fit_transform(train_df_str.values)

In [99]:
print(cross_val_score(LogisticRegression(C=15, n_jobs=-1), X_train_sparse, y, cv=3, scoring='roc_auc'))
print(cross_val_score(LogisticRegression(n_jobs=-1), X_train_sparse, y, cv=3, scoring='roc_auc'))

[0.96121915 0.96207822 0.95527485]
[0.9504246  0.95423162 0.9487498 ]


In [96]:
print(cross_val_score(LogisticRegression(C=15, random_state=17, n_jobs=-1), X_train_sparse, y, cv=3, scoring='roc_auc'))
print(cross_val_score(LogisticRegression(random_state=17, n_jobs=-1), X_train_sparse, y, cv=3, scoring='roc_auc'))

[0.96121915 0.96207822 0.95527485]
[0.9504246  0.95423162 0.9487498 ]


In [110]:
train_df_sorted = train_df.sort_values('time1').reset_index(drop=True)

In [111]:
y_s = train_df_sorted.target

In [112]:
train_df_sorted_str = train_df_sorted[site_cols].fillna(0).astype(int).astype(str).apply(lambda row: ' '.join(row), axis=1)

In [113]:
tfidf_vec_sorted = TfidfVectorizer(ngram_range = (1, 3), max_features=100000)
X_train_sorted_sparse = tfidf_vec_sorted.fit_transform(train_df_sorted_str.values)

In [114]:
print(cross_val_score(LogisticRegression(C=15, random_state=17, n_jobs=-1), X_train_sorted_sparse, y_s, cv=3, scoring='roc_auc'))
print(cross_val_score(LogisticRegression(random_state=17, n_jobs=-1), X_train_sorted_sparse, y_s, cv=3, scoring='roc_auc'))

[0.84151882 0.83717209 0.87342938]
[0.85269461 0.81783706 0.89248218]


In [149]:
def extract_alice_top_30(df_sites):
    alice_top_30 = pd.Series(df_sites[:y.shape[0]][y==1].values.flatten()).value_counts()[:30]
    alice_top_30 = alice_top_30.drop(0) 
    return df_sites.apply(lambda row: row.isin(alice_top_30).astype(int), axis=1).max(axis=1).values.reshape(-1,1)

In [150]:
X_tmp_alice_top_30 = extract_alice_top_30(train_test_sites_df)

In [151]:
def extract_long(df):
    return ((df.max(axis=1) - df.time1).dt.seconds > 60).astype(int).values.reshape(-1, 1)

In [152]:
X_tmp_long = extract_long(train_test_times_df)

In [153]:
def extract_year(df):
    return (df.time1.dt.year % 2013).astype(int).values.reshape(-1, 1)

In [154]:
X_tmp_year = extract_year(train_test_times_df)

In [None]:
X_tmp_alice_top_30 = extract_alice_top_30(train_test_sites_df)
X_tmp_long = extract_long(train_test_times_df)
X_tmp_year = extract_year(train_test_times_df)

In [117]:
%%time
tfidf = TfidfVectorizer(ngram_range = (1, 5)).fit(np.array(str_train))
X_train_idf = tfidf.transform(np.array(str_train))
X_test_idf = tfidf.transform(np.array(str_test))

Wall time: 1min 39s


In [155]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks,
                X_tmp_alice_top_30,
                X_tmp_long,
                X_tmp_year
               ))

In [156]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [158]:
%%time
score(LogisticRegression(C=15, n_jobs=-1), X_train, y)

Wall time: 4min 14s


[0.990220175205182, 0.9920215561907717, 0.9895030539027996]

In [159]:
make_submission(LogisticRegression(C=15, n_jobs=-1), X_train, y, X_test, 'lr_tfidf_5_more.csv') # 0.95999

(253561, 1698314)
(82797, 1698314)


In [160]:
X_tmp = hstack((vstack((X_train_idf, X_test_idf)), 
                #X_tmp_time_features, 
                X_tmp_unique, 
                X_tmp_year_month, 
                X_tmp_part_of_day, 
                X_tmp_weekend,
                X_tmp_duration,
                X_tmp_weeks,
                X_tmp_alice_top_30,
                X_tmp_long,
                X_tmp_year
               ))

In [161]:
y = train_df['target'].values.astype('int64')
X_train, X_test = split_train_and_test(X_tmp, train_df.shape[0])

In [162]:
%%time
score(LogisticRegression(C=15, n_jobs=-1), X_train, y)

Wall time: 3min 48s


[0.9813161934057573, 0.9852840239352352, 0.9826542903473875]