In [1]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [3]:
# helper function for returning cross validation scores
def call_CV(X,y):
    time_split = TimeSeriesSplit(n_splits=10)
    logit = LogisticRegression(C=1, random_state=17)
    cv_scores = cross_val_score(logit, X, y, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)
    return(cv_scores.mean())

In [4]:
train_data = pd.read_csv('../data/Project_Alice/train_sessions.csv',
                       index_col='session_id')
test_data = pd.read_csv('../data/Project_Alice/test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_data[times] = train_data[times].apply(pd.to_datetime)
test_data[times] = test_data[times].apply(pd.to_datetime)

# Sort the data by time
train_data = train_data.sort_values(by='time1')

# Look at the first rows of the training set
train_data.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [5]:
sites = ['site%s' % i for i in range(1, 11)]
train_data[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_data[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [31]:
%%time
tfv = TfidfVectorizer(ngram_range=(1,3),max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = tfv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = tfv.transform(inp_test_file)
X_train.shape, X_test.shape

Wall time: 11.5 s


In [35]:
X_train.shape,X_test.shape

((253561, 50000), (82797, 50000))

In [36]:
y_train = train_data['target'].astype('int')

In [37]:
call_CV(X_train,y_train)

0.8670498229283498

In [38]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [39]:
%%time
X_train_new = add_time_features(train_data.fillna(0), X_train)
X_test_new = add_time_features(test_data.fillna(0), X_test)

Wall time: 1min 23s


In [112]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [40]:
call_CV(X_train_new,y_train)

0.9229082956845186

In [41]:
%%time
spoof_days = [1,3,4,0]
non_spoof_days = [2,5,6]
site1vals = [80,77,76]
# train data
train_data['max_time'] = train_data[times].max(axis=1)
train_data['min_time'] = train_data[times].min(axis=1)
train_data['seconds'] = (train_data['max_time'] - train_data['min_time']) / np.timedelta64(1, 's')
train_data['unique_sites'] = train_data[sites].nunique(axis=1)
train_data['time_spent_on_each_site'] =  train_data['seconds'] / train_data['unique_sites'] 
train_data['day_of_week'] = train_data['min_time'].apply(lambda x: x.dayofweek)
train_data['is_other_weekday'] = [1 if i in spoof_days else 0 for i in train_data['day_of_week']]
train_data['difference_from_std'] = train_data['seconds'] - np.std(train_data['seconds'])

#---------------------------------------------------------------------------------------------------------####

# test data
test_data['max_time'] = test_data[times].max(axis=1)
test_data['min_time'] = test_data[times].min(axis=1)
test_data['seconds'] = (test_data['max_time'] - test_data['min_time']) / np.timedelta64(1, 's')
test_data['unique_sites'] = test_data[sites].nunique(axis=1)
test_data['time_spent_on_each_site'] =  test_data['seconds'] / test_data['unique_sites'] 
test_data['day_of_week'] = test_data['min_time'].apply(lambda x: x.dayofweek)
test_data['is_other_weekday'] = [1 if i in spoof_days else 0 for i in test_data['day_of_week']]
test_data['difference_from_std'] = test_data['seconds'] - np.std(test_data['seconds'])

#----------------------------------------------------------------------------------------------------------##

train_data['time_spent_on_each_site'] = StandardScaler().fit_transform(train_data[['time_spent_on_each_site']])
train_data['seconds'] = StandardScaler().fit_transform(train_data[['seconds']])
train_data['difference_from_std'] = StandardScaler().fit_transform(train_data[['difference_from_std']])

test_data['time_spent_on_each_site'] = StandardScaler().fit_transform(test_data[['time_spent_on_each_site']])
test_data['seconds'] = StandardScaler().fit_transform(test_data[['seconds']])
test_data['difference_from_std'] = StandardScaler().fit_transform(test_data[['difference_from_std']])

Wall time: 26 s


In [53]:
X_train_more_features = hstack([X_train_new,train_data.time_spent_on_each_site.values.reshape(-1,1),
                                train_data.is_other_weekday.values.reshape(-1,1),
                                train_data.difference_from_std.values.reshape(-1,1),
                                ])

X_test_more_features = hstack([X_test_new,test_data.time_spent_on_each_site.values.reshape(-1,1),
                               test_data.is_other_weekday.values.reshape(-1,1),
                               test_data.difference_from_std.values.reshape(-1,1),
                               ])

In [54]:
# bestd cv 0.9203662109521012 , 0.920550085501536 , 0.9206717544989749
call_CV(X_train_more_features,y_train)

0.9205512612275749

In [55]:
logit = LogisticRegression(C=1, random_state=17,solver='liblinear')
logit.fit(X_train_more_features,y_train)
logit_test_pred = logit.predict_proba(X_test_more_features)[:, 1]
write_to_submission_file(logit_test_pred, 'subm14.csv')

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
def call_CV_RF(X,y):
    time_split = TimeSeriesSplit(n_splits=10)
    rf = RandomForestClassifier()
    #param_grid = {'max_depth':np.arange(2,10),
    #             'min_samples_leaf':np.arange(2,10)}
    
    cv_scores = cross_val_score(rf, X, y, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)
    return(cv_scores.mean())

In [64]:
call_CV_RF(X_train_more_features,y_train)

0.7675741721144798