In [1]:
import pickle
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


warnings.filterwarnings('ignore')


In [2]:
# Load the train and test session logs; adjust the paths if the CSVs live elsewhere.
times = ['time{}'.format(i) for i in range(1, 11)]
csv_options = {'index_col': 'session_id', 'parse_dates': times}

train_df = pd.read_csv('train_sessions.csv', **csv_options)
test_df = pd.read_csv('test_sessions.csv', **csv_options)

# Order the training sessions chronologically by their first timestamp.
train_df = train_df.sort_values(by='time1')

# Preview the first rows of the training set.
train_df.head(10)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0
242171,952,2013-01-12 08:50:22,947.0,2013-01-12 08:50:23,953.0,2013-01-12 08:50:23,946.0,2013-01-12 08:50:23,947.0,2013-01-12 08:50:24,...,2013-01-12 08:50:24,953.0,2013-01-12 08:50:24,955.0,2013-01-12 08:50:24,946.0,2013-01-12 08:50:25,947.0,2013-01-12 08:50:25,0
57157,953,2013-01-12 08:50:25,947.0,2013-01-12 08:50:26,946.0,2013-01-12 08:50:26,953.0,2013-01-12 08:50:26,955.0,2013-01-12 08:50:26,...,2013-01-12 08:50:27,953.0,2013-01-12 08:50:27,946.0,2013-01-12 08:50:27,953.0,2013-01-12 08:50:28,1033.0,2013-01-12 08:50:28,0
240201,946,2013-01-12 08:50:28,947.0,2013-01-12 08:50:28,954.0,2013-01-12 08:50:28,953.0,2013-01-12 08:50:29,946.0,2013-01-12 08:50:29,...,2013-01-12 08:50:29,946.0,2013-01-12 08:50:30,956.0,2013-01-12 08:50:30,957.0,2013-01-12 08:50:31,956.0,2013-01-12 08:50:31,0
210686,946,2013-01-12 08:50:31,956.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:33,955.0,2013-01-12 08:50:33,...,2013-01-12 08:50:33,946.0,2013-01-12 08:50:34,946.0,2013-01-12 08:50:35,946.0,2013-01-12 08:50:36,948.0,2013-01-12 08:50:36,0
98804,948,2013-01-12 08:50:37,946.0,2013-01-12 08:50:37,948.0,2013-01-12 08:50:38,784.0,2013-01-12 08:50:49,49.0,2013-01-12 08:50:59,...,2013-01-12 08:51:03,812.0,2013-01-12 08:51:03,982.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:04,0


In [3]:
# Names of the ten site-id columns: site1 .. site10.
sites = ['site{}'.format(i) for i in range(1, 11)]

In [4]:
# Missing site slots become 0 so the id columns can be cast to unsigned 16-bit integers.
for sessions in (train_df, test_df):
    sessions[sites] = sessions[sites].fillna(0).astype(np.uint16)

# Load the site-name -> site-id dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open a trusted site_dic.pkl.
with open('site_dic.pkl', 'rb') as dict_file:
    site_dict = pickle.load(dict_file)

# Tabular view of the dictionary: indexed by site id, one column holding the site name.
sites_dict = pd.DataFrame({'site': list(site_dict.keys())},
                          index=list(site_dict.values()))
print(u'Websites total:', sites_dict.shape[0])
sites_dict.head()

Websites total: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
# Number of training rows; rows from this position onward in full_df are test sessions.
idx_split = len(train_df)
# Stack train (without the label column) on top of test so features are built in one pass.
full_df = pd.concat([train_df.drop(labels=['target'], axis='columns'), test_df])

In [6]:
# Per-session timing summary, sharing full_df's index.
time_df = pd.DataFrame(index=full_df.index)
visit_times = full_df[times]

# First and last timestamp recorded in each session.
time_df['min'] = visit_times.min(axis=1)
time_df['max'] = visit_times.max(axis=1)

# Session length expressed in seconds.
session_span = time_df['max'] - time_df['min']
time_df['seconds'] = session_span / np.timedelta64(1, 's')

time_df.head()

Unnamed: 0_level_0,min,max,seconds
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,0.0
54843,2013-01-12 08:37:23,2013-01-12 09:07:09,1786.0
77292,2013-01-12 08:50:13,2013-01-12 08:50:17,4.0
114021,2013-01-12 08:50:17,2013-01-12 08:50:20,3.0
146670,2013-01-12 08:50:20,2013-01-12 08:50:22,2.0


## year_month

In [7]:
# Session start encoded as YYYYMM (e.g. 201301), then one-hot encoded.
# Uses the vectorised .dt accessor instead of a per-row Python lambda.
year_month = pd.DataFrame(index=full_df.index)
year_month['start_month'] = (
    100 * full_df['time1'].dt.year + full_df['time1'].dt.month
).astype('float64')
year_month_t = pd.get_dummies(year_month['start_month'])


## month

In [8]:
# Calendar month (1-12) of the session start, then one-hot encoded.
# Uses the vectorised .dt accessor instead of a per-row Python lambda.
month = pd.DataFrame(index=full_df.index)
month['month'] = full_df['time1'].dt.month.astype('float64')
month_t = pd.get_dummies(month['month'])


## Week number

In [9]:
# Week number of the session start (Timestamp.week), then one-hot encoded.
number_weeks = pd.DataFrame(index=full_df.index)
session_week = full_df['time1'].map(lambda ts: ts.week)
number_weeks['number_week'] = session_week.astype('float64')
number_weeks_t = pd.get_dummies(number_weeks['number_week'])

## выходные

In [10]:
# Weekend flag: 1 when the session starts on a day with dayofweek > 4 (Saturday/Sunday), else 0.
# Computed with the vectorised .dt accessor instead of a per-row Python lambda.
week_end = pd.DataFrame(index=full_df.index)
week_end['week_end'] = (full_df['time1'].dt.dayofweek > 4).astype('int64')

## Час

In [11]:
# Hour of day at which the session starts, plus its one-hot encoding.
# Uses the vectorised .dt accessor instead of a per-row Python lambda.
hour = pd.DataFrame(index=full_df.index)
hour['start_hour'] = full_df['time1'].dt.hour
hour_t = pd.get_dummies(hour['start_hour'])

## День недели

In [12]:
# Day of week of the session start (pandas convention: Monday=0 .. Sunday=6).
# Uses the vectorised .dt accessor instead of a per-row Python lambda.
day_of_week = pd.DataFrame(index=full_df.index)
day_of_week['day_of_week'] = full_df['time1'].dt.dayofweek

In [13]:
# One-hot encode the day-of-week feature (one indicator column per distinct value).
day_of_week_t = pd.get_dummies(day_of_week['day_of_week'])

In [14]:
def blyat(t):
    """Bucket an hour of the day into one of three coarse day-part codes.

    Returns 0 for hours before 11, 1 for hours after 19,
    and 2 for everything in between (11 through 19 inclusive).
    """
    if t < 11:
        return 0
    return 1 if t > 19 else 2
    

## Часть дня

In [15]:
# Coarse three-way day part of the session start, derived from the start hour via blyat.
part_of_day = pd.DataFrame(index=full_df.index)
part_of_day['part_of_day'] = hour['start_hour'].map(blyat)

In [16]:
# One-hot encode the three-way day-part feature.
part_of_day_t = pd.get_dummies(part_of_day['part_of_day'])

## Часть дня 1

In [17]:
def blyat1(t):
    """Bucket an hour of the day into a six-hour day-part code.

    Returns 0 for [6, 12), 1 for [12, 18), 3 for [18, 24) and 4 for anything else.
    Code 2 is never produced; the values are only used as category labels.
    """
    # (lower bound inclusive, upper bound exclusive, code)
    buckets = ((6, 12, 0), (12, 18, 1), (18, 24, 3))
    for lower, upper, code in buckets:
        if lower <= t < upper:
            return code
    return 4
    

In [18]:
# Six-hour day part of the session start, derived from the start hour via blyat1.
part_of_day1 = pd.DataFrame(index=full_df.index)
part_of_day1['part_of_day1'] = hour['start_hour'].map(blyat1)

In [19]:
# One-hot encode the six-hour day-part feature.
part_of_day_t1 = pd.get_dummies(part_of_day1['part_of_day1'])

## Уникальные значения

In [20]:
# Number of distinct sites visited in each session. Site id 0 is the fill value used
# for empty slots, so it is masked to NaN before counting.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
n_q = pd.DataFrame(index=full_df.index)
n_q['uni'] = full_df[sites].replace(0, np.nan).nunique(axis=1)

## Нагло содрал у  [ЮРИЯ](https://www.kaggle.com/kashnitsky/model-validation-in-a-competition)

In [None]:
# Reverse lookup: site id -> site name.
id2site = dict(zip(site_dict.values(), site_dict.keys()))


def _session_to_text(site_ids):
    """Join the names of a session's known site ids into one space-separated string."""
    return ' '.join(id2site[site_id] for site_id in site_ids if site_id in id2site)


# One text "document" per session; ids that are not in the dictionary are skipped.
str_train = [_session_to_text(row.values) for _, row in train_df[sites].iterrows()]
str_test = [_session_to_text(row.values) for _, row in test_df[sites].iterrows()]

In [None]:
# TF-IDF vectorizer using 1- to 5-gram features, capped at 1,600,000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 5), max_features=1600000)

In [None]:
%%time
# Fit the TF-IDF vocabulary on the training session strings, then apply the same
# fitted vectorizer to the test session strings.
X_train = vectorizer.fit_transform(np.array(str_train))
X_test = vectorizer.transform((str_test ))

In [None]:
# Show the shapes of the TF-IDF matrices for train and test.
print(X_train.shape, X_test.shape)


In [21]:
# Target labels as a 1-D Series: scikit-learn estimators expect y of shape (n_samples,),
# and a single-column DataFrame triggers a column-vector conversion warning on fit.
y = train_df['target']

## Колдуем с признаками

In [22]:
# Standardise the session duration. The scaler is fit on all rows of time_df, which
# holds train and test sessions together. StandardScaler is imported at the top of the notebook.
# NOTE(review): time_dur is not referenced by the feature-stacking cell further down,
# which uses the raw time_df[['seconds']] values — confirm which one is intended.
time_dur = StandardScaler().fit_transform(time_df[['seconds']].values)

In [24]:
 # Inspect the weekend flag for the first idx_split rows (the training sessions).
 week_end[['week_end']].values[:idx_split]

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [26]:
# Dense feature blocks; each one holds the training rows followed by the test rows.
# The TF-IDF matrices (X_train / X_test) are not included in this stack.
# NOTE(review): raw seconds are stacked here, not the standardised time_dur — confirm intent.
feature_blocks = [
    part_of_day_t1.values,
    day_of_week_t.values,
    year_month_t.values,
    number_weeks_t.values,
    time_df[['seconds']].values,
    week_end[['week_end']].values,
]

# Rows before idx_split are training sessions, the remaining rows are test sessions.
X_train_t = np.hstack([block[:idx_split] for block in feature_blocks])
X_test_t = np.hstack([block[idx_split:] for block in feature_blocks])

(253561, 78)

 ### Проверяем

In [29]:
def score(model, X, y, train_size=0.7, random_states=[1, 13, 42]):
    """Evaluate a classifier on several stratified hold-out splits.

    For every seed in ``random_states`` the data is split with ``train_test_split``
    (stratified on ``y``), a fresh clone of ``model`` is fitted on the training part,
    and ROC AUC is computed on the held-out part from the positive-class probability.

    Parameters:
        model: unfitted estimator exposing ``fit`` and ``predict_proba``; it is cloned, not mutated.
        X: feature matrix.
        y: target labels.
        train_size: share of rows used for fitting in each split.
        random_states: seeds, one split per seed.

    Returns:
        list of ROC AUC values, one per seed, in the order of ``random_states``.
    """
    def _auc_for_seed(seed):
        X_fit, X_hold, y_fit, y_hold = train_test_split(
            X, y, train_size=train_size, stratify=y, random_state=seed)
        estimator = clone(model, safe=True)
        estimator.fit(X_fit, y_fit)
        hold_proba = estimator.predict_proba(X_hold)
        # Column 1 onward holds the probability of the positive class.
        return roc_auc_score(y_hold, hold_proba[:, 1:])

    return [_auc_for_seed(seed) for seed in random_states]

In [30]:
# Hold-out ROC AUC for C=21.55 on X_train_t.
# Scores kept from the original cell comment (they differ from the output shown below,
# so they presumably come from another feature set — TODO confirm):
# [0.9889510370484755, 0.9865306924836754, 0.988063920740623]
score(model=LogisticRegression(C=21.55, n_jobs=-1), X=X_train_t, y=y)


[0.9258525358310348, 0.9201725673616521, 0.9173274470789702]

In [31]:
# Hold-out ROC AUC for C=15 on X_train_t.
# Scores kept from the original cell comment. NOTE(review): the identical list is attached
# to several cells with different C values, so it looks copy-pasted — treat it as stale.
# [0.9890169440485576, 0.986253971652481, 0.9879412909762284]
score(model=LogisticRegression(C=15, n_jobs=-1), X=X_train_t, y=y)


[0.9258546730431321, 0.9201912053914738, 0.9173470670711068]

In [33]:
# Hold-out ROC AUC for C=10 on X_train_t.
# Scores kept from the original cell comment. NOTE(review): the identical list is attached
# to several cells with different C values, so it looks copy-pasted — treat it as stale.
# [0.9890169440485576, 0.986253971652481, 0.9879412909762284]
score(model=LogisticRegression(C=10, n_jobs=-1), X=X_train_t, y=y)


[0.9259112417741403, 0.9200752375674907, 0.9172272484145159]

In [34]:
# Hold-out ROC AUC for C=5 on X_train_t.
# Scores kept from the original cell comment. NOTE(review): the identical list is attached
# to several cells with different C values, so it looks copy-pasted — treat it as stale.
# [0.9890169440485576, 0.986253971652481, 0.9879412909762284]
score(model=LogisticRegression(C=5, n_jobs=-1), X=X_train_t, y=y)


[0.9256572023470054, 0.9199441744796852, 0.9170828421917245]

### Отправляем

In [35]:
def make_submission(model, X_train, y_train, X_test, out_file='result____1_test1.csv'):
    """Fit ``model`` on the training data and write test predictions to a CSV file.

    Parameters:
        model: estimator exposing ``fit`` and ``predict_proba``; it is fitted in place.
        X_train: training feature matrix.
        y_train: training labels.
        X_test: test feature matrix with the same columns as ``X_train``.
        out_file: path of the submission CSV. Defaults to the file name that was
            previously hard-coded, so existing calls behave as before.

    Prints the shapes of both matrices as a quick sanity check before fitting.
    """
    print(X_train.shape)
    print(X_test.shape)
    model.fit(X_train, y_train)
    test_pred_proba = model.predict_proba(X_test)
    # Column 1 onward holds the probability of the positive class.
    write_to_submission_file(test_pred_proba[:, 1:], out_file)
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    """Write predictions to a CSV file with a 1-based row id column.

    Parameters:
        predicted_labels: array of predictions, one row per test session.
        out_file: destination path of the CSV file.
        target: name of the prediction column.
        index_label: header of the id column.

    Rows are numbered 1..n in the order given.
    NOTE(review): this assumes the predictions are already in the row order the
    submission expects — confirm against the test file.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [None]:
# Fit with C=19 and write the submission file. The closing parenthesis used to sit
# behind the trailing comment, which made this cell a syntax error.
# NOTE(review): this call and the other submission cell write to the same file name,
# so whichever runs last overwrites the other's output.
make_submission(LogisticRegression(C=19, n_jobs=-1), X_train_t, y, X_test_t)  # result__8 0.95647lb

In [36]:
# Fit with C=12 on the stacked feature matrix and write the submission file.
make_submission(model=LogisticRegression(C=12, n_jobs=-1),
                X_train=X_train_t, y_train=y, X_test=X_test_t)  # result__8 0.95647lb

(253561, 78)
(82797, 78)
