In [1]:
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import os
import pickle
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import math
import re
from collections import Counter
from scipy.sparse import csr_matrix
import itertools

In [2]:
# Directory with the raw capstone data. Built with os.path.join so the notebook
# is not tied to the Windows path separator.
PATH_TO_DATA = os.path.join('..', 'capstone_user_identification')

In [9]:
PATH_TO_EXPS = 'user_identification_exps'

In [5]:
def load_dict(path_to_dict):
    """Load and return the pickled site frequency dictionary stored at ``path_to_dict``.

    The file is opened in binary mode and unpickled as-is, so it should only
    be used on files produced by this project.
    """
    with open(path_to_dict, mode='rb') as pkl_stream:
        return pickle.load(pkl_stream)

Словарь сайтов формата:  
  
{имя сайта : (id сайта, частота встречаемости сайта по всем пользователям)}

In [8]:
# Load the pickled site dictionary of the 3-user sample and display it.
site_freq = load_dict(os.path.join(PATH_TO_DATA, 'site_freq_3users.pkl'))
site_freq

{'google.com': (1, 9),
 'oracle.com': (2, 8),
 'vk.com': (3, 3),
 'meduza.io': (4, 3),
 'mail.google.com': (5, 2),
 'football.kulichki.ru': (6, 2),
 'geo.mozilla.org': (7, 1),
 'accounts.google.com': (8, 1),
 'apis.google.com': (9, 1),
 'plus.google.com': (10, 1),
 'yandex.ru': (11, 1)}

In [14]:
def create_sites_dict(path_to_csv_files):
    """Build the site frequency dictionary from all ``user*.csv`` files in a directory.

    The ``site`` column of every matching file is counted and the counts are
    summed over the files. Sites are then ranked by total frequency in
    descending order (equal counts keep first-seen order) and numbered from 1.

    Returns a dict of the form ``{site_name: (site_id, total_frequency)}``.
    """
    totals = Counter()
    for csv_path in glob(os.path.join(path_to_csv_files, 'user*.csv')):
        # Count the visits of one file, then fold them into the running totals.
        totals.update(Counter(pd.read_csv(csv_path).site))

    # most_common() sorts by count, descending, and keeps insertion order for ties.
    return {site: (rank, freq)
            for rank, (site, freq) in enumerate(totals.most_common(), start=1)}

In [3]:
def prepare_train_set(path_to_csv_files, path_to_dict, session_length=10, window_size=10, time_limit=5*60):
    """Split per-user browsing histories into fixed-length sessions with features.

    Every session row holds ``session_length`` site ids; consecutive sessions
    start ``window_size`` visits apart. A session whose summed time gaps exceed
    ``time_limit`` seconds is cut and the remaining slots are filled with zeros.

    Parameters
    ----------
    path_to_csv_files : str
        Directory searched with the glob ``user*.csv``. Each file is expected
        to provide ``timestamp`` and ``site`` columns; the user id is taken
        from the four digits that follow ``user`` in the file path.
    path_to_dict : str
        Path to the pickled site dictionary; ``site_freq[name][0]`` is used as
        the integer id of a site.
    session_length : int
        Number of site slots per session.
    window_size : int
        Shift, in visits, between the starts of consecutive sessions.
    time_limit : int
        Maximum summed gap, in seconds, allowed inside one session.

    Returns
    -------
    pandas.DataFrame
        Integer frame with columns ``site1..siteN``, ``diff_time1..diff_timeN``
        (N = ``session_length``), ``start_hour``, ``day_of_week``, ``year``,
        ``month``, ``day``, ``time_of_day`` and ``user_id``. The index is named
        ``session_id`` and starts at 1.
    """
    
    paths_to_csv = glob(os.path.join(path_to_csv_files, 'user*.csv'))
    # Load the site dictionary.
    with open(path_to_dict, 'rb') as site_freq_pkl:
        site_freq = pickle.load(site_freq_pkl)
    
    # Helper: build a 2-D array of row positions, one row per session.
    # Row k is idx_shift + window_size*k + [0 .. session_length-1]; rows are
    # generated until the next start would reach end_id, and every position
    # beyond end_id is clamped to end_id (the all-zero sentinel row appended below).
    def make_sess_idxs(end_id, idx_shift):
        temp_idxs = []
        row = [0]
        sess_number = 0
        base = np.array(range(session_length)) + idx_shift
        while True:
            row = base + window_size*sess_number
            sess_number += 1
            temp_idxs.append(row)
            if row[0]+window_size>=end_id: break
        temp_idxs = np.array(temp_idxs)
        temp_idxs[temp_idxs>end_id] = end_id
        return temp_idxs
   
    files_sess_data = []
    for path in tqdm(paths_to_csv):
        # Read the file and convert the timestamp column to datetimes.
        # NOTE(review): newer pandas releases may reject the unit-less 'datetime64'
        # dtype - confirm against the installed version.
        user_data = pd.read_csv(path).astype({'timestamp': 'datetime64'})
        # Replace site names with their ids from the dictionary.
        user_data.site = user_data.site.map(lambda key: site_freq[key][0])
        # Seconds elapsed since the previous visit.
        user_data[['time_diff']] = (user_data.timestamp - user_data.timestamp.shift(1)).apply(lambda x: x.total_seconds())
        # Shift the labels by -1 and assign back; presumably index alignment then
        # makes row k hold the gap to visit k+1 - TODO confirm.
        user_time = user_data.time_diff
        user_time.index -= 1
        user_data.time_diff = user_time
        user_data = user_data.fillna(0).astype({'site': 'int', 'time_diff': 'int'})
        # Append an all-zero sentinel row; the three-element list implies the
        # frame has exactly three columns at this point.
        user_data.loc[user_data.shape[0]] = [0, 0, 0]
        # Build the index array that slices the history into sliding-window sessions.
        cut_idx = 0
        shift_size = 0
        data_idxs = np.zeros((1, session_length))
        i = 0
        while True:
            # Sessions for the part of the history starting at cut_idx + shift_size.
            temp_idxs = make_sess_idxs(user_data.shape[0]-1, cut_idx + shift_size)
            if temp_idxs.shape[0]>0:
                data_idxs = np.vstack((data_idxs, temp_idxs)).astype(int)
                # Drop the all-zero placeholder row created before the loop.
                if data_idxs[0].sum()==0:
                    data_idxs = data_idxs[1:]
            diff_times = user_data.time_diff.values[data_idxs]
            # Sessions whose summed gaps exceed the time limit.
            check_limit = np.where(diff_times.sum(axis=1) > time_limit)[0]
            # Stop once the i-th violating session no longer exists.
            if i+1 > check_limit.shape[0]: break
            # Find the slot j at which the running sum first exceeds the limit.
            time_sum = 0
            for j, time in enumerate(diff_times[check_limit[i]]):
                time_sum += time
                if time_sum > time_limit: break
            # Flat position (within data_idxs) just after slot j.
            cut_idx = check_limit[i]*session_length + j + 1
            # Number of slots left in that session row after the cut.
            shift_size = (session_length - cut_idx%session_length)%session_length
            # Translate the flat position into a row position of user_data.
            cut_idx = data_idxs.ravel()[cut_idx]
            # Insert shift_size all-zero rows at the cut; this appears to pad the
            # rest of the session with zeros - TODO confirm for window_size < session_length.
            empty_rows = pd.DataFrame(0, index=range(shift_size), columns=user_data.columns)
            user_data = pd.concat((user_data.iloc[:cut_idx], empty_rows, user_data.iloc[cut_idx:])).reset_index(drop=True)
            # Keep sessions up to and including the one just cut; later ones are rebuilt.
            data_idxs = data_idxs[:check_limit[i]+1]
            i += 1
    
        # Hour of the first visit of every session (zero timestamps become missing first).
        user_time = user_data.timestamp
        user_time[user_time==0] = np.nan
        user_time = pd.to_datetime(user_time)
        start_hour = user_time.apply(lambda x: x.hour)
        start_hour = start_hour[data_idxs[:, 0]].values[:, np.newaxis]
        # Day of week of the first visit.
        day_of_week = user_time.apply(lambda x: x.weekday())
        day_of_week = day_of_week[data_idxs[:, 0]].values[:, np.newaxis]
        # Year of the first visit.
        year = user_time.apply(lambda x: x.year)
        year = year[data_idxs[:, 0]].values[:, np.newaxis]
        # Month of the first visit.
        month = user_time.apply(lambda x: x.month)
        month = month[data_idxs[:, 0]].values[:, np.newaxis]
        # Day of month of the first visit.
        day = user_time.apply(lambda x: x.day)
        day = day[data_idxs[:, 0]].values[:, np.newaxis]
        # Time-of-day bucket of the first visit: hour < 12 -> 0, 12..16 -> 1, > 16 -> 2.
        time_of_day = user_time.apply(lambda x: x.hour)
        time_of_day[time_of_day<12] = 0
        time_of_day[time_of_day.between(12, 16)] = 1
        time_of_day[time_of_day>16] = 2
        time_of_day = time_of_day[data_idxs[:, 0]].values[:, np.newaxis]
        
        # Gather site ids and time gaps for every session slot.
        sess_data = user_data.site.values[data_idxs]
        diff_times = user_data.time_diff.values[data_idxs]
        
        # Fill the user_id column from the digits in the file name.
        # NOTE(review): '\d' sits in a non-raw string literal; recent Python versions
        # warn about this escape - consider a raw string.
        us_id = int(re.search('user\d{4}', path)[0][-4:])
        user_id = pd.DataFrame({'user_id': [us_id]*(sess_data.shape[0])})
        file_sess_data = pd.DataFrame(np.hstack((sess_data, diff_times, start_hour, day_of_week, year, month, day,
                                                 time_of_day, user_id)))
        file_sess_data.columns = ['site'+str(i) for i in range(1, session_length+1)] + \
                                    ['diff_time'+str(i)for i in range(1, session_length+1)] + \
                                    ['start_hour', 'day_of_week', 'year', 'month', 'day', 'time_of_day'] + ['user_id']
        files_sess_data.append(file_sess_data)
    # Concatenate the sessions of all files into a single table.
    file_sess_data = pd.concat(files_sess_data, ignore_index=True)
    file_sess_data.index.name = 'session_id'
    file_sess_data.index += 1
    
    # NOTE(review): this cast raises if any NaN is left (for example a time feature
    # taken from a padded row) - possibly the source of the ValueError on the
    # all_users run; TODO confirm.
    return file_sess_data.astype(int)

### Создание датафреймов из исходных данных с параметрами swt (session_length, window_size, time_limit) = (15, 5, 30 мин)

In [4]:
# Build sessions for the 3-user sample: 15 sites per session, window shift of 5,
# time limit of 30 minutes expressed in seconds.
train_data_3users = prepare_train_set(os.path.join(PATH_TO_DATA, '3users'), 
                                      os.path.join(PATH_TO_DATA, 'site_freq_3users.pkl'),
                                     session_length=15, window_size=5, time_limit=30*60)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [5]:
# Preview the site-id columns (site1..site15) of the first sessions.
train_data_3users.iloc[:, :15].head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,site11,site12,site13,site14,site15
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,3,2,2,0,0,0,0,0,0,0,0,0,0,0,0
2,7,2,1,8,5,9,10,0,0,0,0,0,0,0,0
3,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0
4,3,2,6,0,0,0,0,0,0,0,0,0,0,0,0
5,6,2,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preview the time-gap columns (diff_time1..diff_time15) of the first sessions.
train_data_3users.iloc[:, 15:30].head()

Unnamed: 0_level_0,diff_time1,diff_time2,diff_time3,diff_time4,diff_time5,diff_time6,diff_time7,diff_time8,diff_time9,diff_time10,diff_time11,diff_time12,diff_time13,diff_time14,diff_time15
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,287,1184,6278,0,0,0,0,0,0,0,0,0,0,0,0
2,186,2,1,2,3,55,3540,0,0,0,0,0,0,0,0
3,2,3,55,0,0,0,0,0,0,0,0,0,0,0,0
4,287,1184,6278,0,0,0,0,0,0,0,0,0,0,0,0
5,186,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Preview the calendar features and the user_id column of the first sessions.
train_data_3users.iloc[:, 30:37].head()

Unnamed: 0_level_0,start_hour,day_of_week,year,month,day,time_of_day,user_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,9,4,2013,11,15,0,1
2,11,4,2013,11,15,0,1
3,12,4,2013,11,15,1,1
4,9,4,2013,11,15,0,2
5,11,4,2013,11,15,0,2


In [8]:
%%time
train_data_10users = prepare_train_set(os.path.join(PATH_TO_DATA, '10users'), 
                                       os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'),
                                     session_length=15, window_size=5, time_limit=30*60)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))


Wall time: 12.1 s


In [9]:
train_data_10users.iloc[:, :15].head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,site11,site12,site13,site14,site15
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,192,574,133,3,133,133,3,133,203,133,415,193,674,254,133
2,133,3,133,203,133,415,193,674,254,133,31,393,3305,217,55
3,415,193,674,254,133,31,393,3305,217,55,55,3,55,55,5
4,31,393,3305,217,55,55,3,55,55,5,293,415,333,897,55
5,55,3,55,55,5,293,415,333,897,55,473,3306,473,55,55


In [9]:
%%time
train_data_150users = prepare_train_set(join(PATH_TO_DATA, '150users'), join(PATH_TO_DATA, 'site_freq_150users.pkl'),
                                     session_length=15, window_size=5, time_limit=30*60)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=150.0), HTML(value='')))


Wall time: 1min 32s


In [20]:
train_data_150users.head()

NameError: name 'train_data_150users' is not defined

In [11]:
%%time
train_data_all_users = prepare_train_set(join(PATH_TO_DATA, 'all_users'), join(PATH_TO_DATA, 'site_freq_all_users.pkl'),
                                     session_length=15, window_size=5, time_limit=30*60)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3370.0), HTML(value='')))




ValueError: Cannot convert non-finite values (NA or inf) to integer

In [12]:
train_data_all_users.isna().any().any()

NameError: name 'train_data_all_users' is not defined

In [16]:
train_data_all_users.head(15)

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,diff_time7,diff_time8,diff_time9,diff_time10,diff_time11,diff_time12,diff_time13,diff_time14,diff_time15,user_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,4,2,2,4,1,38,59,73,73,...,0,46,330,3675,0,0,0,0,0,1
2,1,2,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,70,1,40,35,70,35,70,40,35,70,...,0,0,1,0,0,0,1,1,0,1
4,35,70,40,35,70,35,67,65,67,65,...,0,1,1,0,0,0,0,1,0,1
5,35,67,65,67,65,40,70,35,67,40,...,0,0,1,0,0,0,0,1,0,1
6,40,70,35,67,40,35,65,67,70,35,...,0,0,1,0,0,0,1,4,0,1
7,35,65,67,70,35,40,65,67,67,67,...,0,1,4,0,0,134,0,0,0,1
8,40,65,67,67,67,65,40,35,67,70,...,134,0,0,0,0,235,0,0,0,1
9,65,40,35,67,70,65,40,65,70,67,...,235,0,0,0,0,1,1,0,0,1
10,65,40,65,70,67,35,40,67,65,67,...,1,1,0,0,0,0,1,1,0,1


In [15]:
# Re-declares PATH_TO_EXPS with the same value as the definition near the top of the notebook.
PATH_TO_EXPS = 'user_identification_exps'

In [16]:
# Make sure the output directory exists before writing the session tables.
os.makedirs(PATH_TO_EXPS, exist_ok=True)
# os.path.join is spelled out because a bare `join` is never imported in this notebook.
train_data_3users.to_csv(os.path.join(PATH_TO_EXPS, 'train_data_3users.csv'),
                         index_label='session_id', float_format='%d')
train_data_10users.to_csv(os.path.join(PATH_TO_EXPS, 'train_data_10users.csv'),
                          index_label='session_id', float_format='%d')
train_data_150users.to_csv(os.path.join(PATH_TO_EXPS, 'train_data_150users.csv'),
                           index_label='session_id', float_format='%d')

In [14]:
# Make sure the output directory exists before writing the session table.
os.makedirs(PATH_TO_EXPS, exist_ok=True)
# os.path.join is spelled out because a bare `join` is never imported in this notebook.
train_data_all_users.to_csv(os.path.join(PATH_TO_EXPS, 'train_data_all_users.csv'),
                            index_label='session_id', float_format='%d')

NameError: name 'train_data_all_users' is not defined

#### Создание разреженной матрицы "мешка слов"

In [10]:
def prepare_csr(X):
    """Build a bag-of-sites CSR matrix from a 2-D array of site ids.

    Every cell of ``X`` contributes a one in the column given by its value,
    so repeated ids within a row add up when the matrix is densified. The
    first column (id 0, used in this notebook for empty session slots) is
    removed from the result.
    """
    row_count = X.shape[0]
    row_width = X.shape[1]
    # Flattened ids serve directly as column indices.
    column_ids = X.ravel()
    ones = np.ones(X.size, dtype=int)
    # Row r owns the flat slice [r*row_width, (r+1)*row_width).
    row_bounds = row_width * np.arange(row_count + 1)
    bag = csr_matrix((ones, column_ids, row_bounds), dtype=int)
    return bag[:, 1:]

In [11]:
X_sparse_3users = prepare_csr(train_data_3users.iloc[:, :15].values)
X_sparse_3users

<8x11 sparse matrix of type '<class 'numpy.intc'>'
	with 32 stored elements in Compressed Sparse Row format>

In [12]:
X_sparse_3users.todense()

matrix([[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0],
        [3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [3, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1],
        [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

### Подготовка датасетов с разными параметрами swt

In [16]:
%%time
data_lengths = []

for window_size, session_length, time_limit in tqdm(list(itertools.product([10, 7, 5], [15, 10, 7, 5], [5, 10, 15, 20, 30]))):
    if window_size <= session_length:
        train_data = prepare_train_set(os.path.join(PATH_TO_DATA, '10users'), 
                                       os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'),
                                     session_length, window_size, time_limit*60)
        X_sparse = prepare_csr(train_data.iloc[:, :session_length].values)
        y = train_data.user_id.values
        data_lengths.append(X_sparse.shape[0])
        with open(os.path.join(os.path.join(PATH_TO_EXPS, 'swt_tuning'), 
                                  'X_sparse_10users_s'+str(session_length)+'_w'+str(window_size)+'_t'+str(time_limit)+'.pkl'),
                     'wb') as X_pkl:
            pickle.dump(X_sparse, X_pkl, protocol=2)
        with open(os.path.join(os.path.join(PATH_TO_EXPS, 'swt_tuning'), 
                     'y_10users_s'+str(session_length)+'_w'+str(window_size)+'_t'+str(time_limit)+'.pkl'), 'wb') as y_pkl:
            pickle.dump(y, y_pkl, protocol=2)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))



Wall time: 9min 28s


### Подготовка датасетов на 10 и 150 пользователей с параметрами, аналогичными данным из соревнования Catch Me

In [5]:
%%time
train_data_10users = prepare_train_set(os.path.join(PATH_TO_DATA, '10users'), 
                                       os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'),
                                     session_length=10, window_size=10, time_limit=30*60)
train_data_150users = prepare_train_set(os.path.join(PATH_TO_DATA, '150users'), 
                                        os.path.join(PATH_TO_DATA, 'site_freq_150users.pkl'),
                                     session_length=10, window_size=10, time_limit=30*60)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=150.0), HTML(value='')))


Wall time: 1min 18s


In [6]:
train_data_10users.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,diff_time8,diff_time9,diff_time10,start_hour,day_of_week,year,month,day,time_of_day,user_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,192,574,133,3,133,133,3,133,203,133,...,1,0,0,8,4,2013,11,15,0,31
2,415,193,674,254,133,31,393,3305,217,55,...,3,8,1,8,4,2013,11,15,0,31
3,55,3,55,55,5,293,415,333,897,55,...,0,0,0,8,4,2013,11,15,0,31
4,473,3306,473,55,55,55,55,937,199,123,...,0,0,0,8,4,2013,11,15,0,31
5,342,55,5,3307,258,211,3308,2086,675,2086,...,1,0,1,8,4,2013,11,15,0,31


In [10]:
train_data_10users.to_csv(os.path.join(PATH_TO_EXPS, 
                                        'train_data_10users_for_catch_me.csv'), 
                         index_label='session_id', float_format='%d')
train_data_150users.to_csv(os.path.join(PATH_TO_EXPS, 
                                       'train_data_150users_for_catch_me.csv'), 
                        index_label='session_id', float_format='%d')

## Подготовка данных для Kaggle Catch Me

In [5]:
# Directory with the raw capstone data. Built with os.path.join so the notebook
# is not tied to the Windows path separator.
PATH_TO_DATA = os.path.join('..', 'capstone_user_identification')

In [6]:
# Read the ready-made train and test session tables, indexed by session_id.
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
                       index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
                      index_col='session_id')

In [15]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [38]:
# Parse the ten time columns into datetimes; missing visits show up as NaT.
timestamp = train_df[['time'+str(i) for i in range(1, 11)]].apply(pd.to_datetime)
timestamp

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-02-20 10:02:45,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2014-02-22 11:19:50,2014-02-22 11:19:50,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:51,2014-02-22 11:19:52,2014-02-22 11:19:52,2014-02-22 11:20:15,2014-02-22 11:20:16
3,2013-12-16 16:40:17,2013-12-16 16:40:18,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:19,2013-12-16 16:40:20,2013-12-16 16:40:21,2013-12-16 16:40:22,2013-12-16 16:40:24
4,2014-03-28 10:52:12,2014-03-28 10:52:42,2014-03-28 10:53:12,2014-03-28 10:53:42,2014-03-28 10:54:12,2014-03-28 10:54:42,2014-03-28 10:55:12,2014-03-28 10:55:42,2014-03-28 10:56:12,2014-03-28 10:56:42
5,2014-02-28 10:53:05,2014-02-28 10:55:22,2014-02-28 10:55:22,2014-02-28 10:55:23,2014-02-28 10:55:23,2014-02-28 10:55:59,2014-02-28 10:55:59,2014-02-28 10:55:59,2014-02-28 10:57:06,2014-02-28 10:57:11
...,...,...,...,...,...,...,...,...,...,...
253557,2013-11-25 10:26:54,2013-11-25 10:26:58,2013-11-25 10:27:03,2013-11-25 10:27:04,2013-11-25 10:27:13,2013-11-25 10:27:16,2013-11-25 10:27:28,2013-11-25 10:27:40,2013-11-25 10:27:52,2013-11-25 10:27:53
253558,2013-03-12 16:01:15,2013-03-12 16:01:16,2013-03-12 16:01:16,2013-03-12 16:01:17,2013-03-12 16:01:17,2013-03-12 16:01:17,2013-03-12 16:01:18,2013-03-12 16:01:18,2013-03-12 16:01:18,2013-03-12 16:01:18
253559,2013-09-12 14:05:03,2013-09-12 14:05:10,2013-09-12 14:05:10,2013-09-12 14:06:29,2013-09-12 14:06:30,NaT,NaT,NaT,NaT,NaT
253560,2013-12-19 15:20:22,2013-12-19 15:20:22,2013-12-19 15:20:22,2013-12-19 15:20:22,2013-12-19 15:20:22,2013-12-19 15:20:23,2013-12-19 15:20:23,2013-12-19 15:20:23,2013-12-19 15:20:24,2013-12-19 15:20:24


In [41]:
# Flatten all timestamps row by row and take the difference to the previous
# entry, in seconds; then show the first 15 values.
diff_times = pd.Series(timestamp.values.ravel())
diff_times = (diff_times - diff_times.shift(1)).apply(lambda x: x.total_seconds())
diff_times[:15]

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    0.0
12    1.0
13    0.0
14    0.0
dtype: float64

In [42]:
diff_times.shape

(2535610,)

In [43]:
# Shift the labels down by one, so label k holds the gap between entries k and k+1
# and the first (undefined) difference gets label -1.
diff_times.index -= 1

In [45]:
# Remove the entry labelled -1.
diff_times = diff_times.drop([-1])

In [50]:
# Add a trailing zero so the length is a multiple of 10 again. pd.concat is used
# because Series.append is no longer available in pandas 2.x.
diff_times = pd.concat([diff_times, pd.Series(0)], ignore_index=True)

In [55]:
diff_times.shape[0]/10

253561.0

In [56]:
# Back to a 2-D array with ten values per row.
diff_times = diff_times.values.reshape((int(diff_times.shape[0]/10), 10))

In [58]:
diff_times.shape

(253561, 10)

In [30]:
def prepare_train_set_from_ready(df):
    """Turn ready-made 10-site sessions into the feature table used in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per session with columns ``site1..site10`` (site ids, NaN for
        missing visits) and ``time1..time10`` (values parseable by
        ``pd.to_datetime``, NaN/NaT for missing visits). Other columns are
        ignored.

    Returns
    -------
    pandas.DataFrame
        Integer frame with a fresh 0-based index (the index of ``df`` is not
        carried over) and columns ``site1..site10``, ``diff_time1..diff_time9``
        (seconds between consecutive visits of the same session),
        ``start_hour``, ``day_of_week``, ``year``, ``month``, ``day`` and
        ``time_of_day``. Missing values are replaced with 0.
    """
    site_cols = ['site'+str(i) for i in range(1, 11)]
    time_cols = ['time'+str(i) for i in range(1, 11)]

    sess_data = df[site_cols]
    timestamp = df[time_cols].apply(pd.to_datetime)

    # Seconds between neighbouring visits of the same session. Computed column
    # by column, which avoids Series.append (removed in pandas 2.0) and never
    # mixes timestamps of different sessions. Gaps next to a missing visit are NaN.
    diff_times = np.column_stack([
        (timestamp[later] - timestamp[earlier]).dt.total_seconds().values
        for earlier, later in zip(time_cols[:-1], time_cols[1:])
    ])

    # Calendar features are taken from the first visit of each session.
    user_time = timestamp[time_cols[0]]
    start_hour = user_time.dt.hour
    # Time-of-day bucket: hour < 12 -> 0, 12..16 -> 1, > 16 -> 2 (a missing hour gives 0).
    time_of_day = (start_hour >= 12).astype(int) + (start_hour > 16).astype(int)
    calendar = np.column_stack([
        start_hour.values,
        user_time.dt.weekday.values,
        user_time.dt.year.values,
        user_time.dt.month.values,
        user_time.dt.day.values,
        time_of_day.values,
    ])

    file_sess_data = pd.DataFrame(np.hstack((sess_data.values, diff_times, calendar)))
    file_sess_data.columns = site_cols + \
                             ['diff_time'+str(i) for i in range(1, 10)] + \
                             ['start_hour', 'day_of_week', 'year', 'month', 'day', 'time_of_day']

    return file_sess_data.fillna(0).astype(int)

In [27]:
train_catch_me = prepare_train_set_from_ready(train_df)

In [28]:
train_catch_me.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,diff_time6,diff_time7,diff_time8,diff_time9,start_hour,day_of_week,year,month,day,time_of_day
0,718,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10,3,2014,2,20,0
1,890,941,3847,941,942,3846,3847,3846,1516,1518,...,1,0,23,1,11,5,2014,2,22,0
2,14769,39,14768,14769,37,39,14768,14768,14768,14768,...,1,1,1,2,16,0,2013,12,16,1
3,782,782,782,782,782,782,782,782,782,782,...,30,30,30,30,10,4,2014,3,28,0
4,22,177,175,178,177,178,175,177,177,178,...,0,0,67,5,10,4,2014,2,28,0


In [31]:
test_catch_me = prepare_train_set_from_ready(test_df)

In [32]:
test_catch_me.head()

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,diff_time6,diff_time7,diff_time8,diff_time9,start_hour,day_of_week,year,month,day,time_of_day
0,29,35,22,321,23,2211,6730,21,44582,15336,...,0,0,6,0,11,5,2014,10,4,0
1,782,782,782,782,782,782,782,782,782,782,...,13,6,1,23,11,3,2014,7,3,0
2,55,55,55,55,55,55,55,55,1445,1445,...,1,1,14,3,15,4,2014,12,5,1
3,1023,1022,50,222,202,3374,50,48,48,3374,...,0,0,1,0,10,1,2014,11,4,0
4,301,301,301,66,67,69,70,68,71,167,...,0,0,0,4,15,4,2014,5,16,1


In [34]:
train_catch_me.to_csv(os.path.join(PATH_TO_DATA, 
                                       'train_catch_me.csv'), 
                        index_label='session_id', float_format='%d')
test_catch_me.to_csv(os.path.join(PATH_TO_DATA, 
                                        'test_catch_me.csv'), 
                         index_label='session_id', float_format='%d')

In [4]:
# Built with os.path.join so the path is not tied to the Windows separator.
PATH_TO_ALICE = os.path.join('kaggle_catch_me_train', 'Alice_log')

In [5]:
# Built with os.path.join so the path is not tied to the Windows separator.
PATH_TO_OTHERS = os.path.join('kaggle_catch_me_train', 'other_user_logs')

In [9]:
PATH_TO_DATA = 'kaggle_catch_me_train'

In [7]:
# Site dictionary for the Catch Me logs. The directory is built with
# os.path.join so the path is not tied to the Windows separator.
site_freq_catch_me = create_sites_dict(os.path.join('kaggle_catch_me_train', 'for_dict'))

In [8]:
site_freq_catch_me

{'www.google.fr': (1, 123776),
 'www.google.com': (2, 87619),
 'annotathon.org': (3, 77055),
 'apis.google.com': (4, 58258),
 'www.facebook.com': (5, 54094),
 'www.bing.com': (6, 46405),
 'blast.ncbi.nlm.nih.gov': (7, 43841),
 'www.ncbi.nlm.nih.gov': (8, 38194),
 'clients1.google.com': (9, 36085),
 'mail.google.com': (10, 35178),
 's.youtube.com': (11, 31391),
 'plus.google.com': (12, 30616),
 'safebrowsing-cache.google.com': (13, 27812),
 'accounts.google.com': (14, 25275),
 'twitter.com': (15, 23726),
 'platform.twitter.com': (16, 23495),
 'www.phylogeny.fr': (17, 23026),
 's-static.ak.facebook.com': (18, 22470),
 'www.youtube.com': (19, 20922),
 'static.ak.facebook.com': (20, 19683),
 'login.live.com': (21, 18130),
 'i1.ytimg.com': (22, 17270),
 'docs.google.com': (23, 16890),
 'translate.google.fr': (24, 15920),
 'drive.google.com': (25, 15879),
 'clients1.google.fr': (26, 14211),
 'safebrowsing.clients.google.com': (27, 13539),
 'api.bing.com': (28, 13534),
 'ajax.googleapis.com':

In [10]:
# Persist the Catch Me site dictionary. os.path.join is spelled out because a
# bare `join` is never imported in this notebook.
with open(os.path.join(PATH_TO_DATA, 'site_freq_catch_me.pkl'), 'wb') as site_freq_pkl:
    pickle.dump(site_freq_catch_me, site_freq_pkl, protocol=2)

In [None]:
# prepare_train_set needs a log directory and a dictionary path; calling it
# without arguments raises TypeError.
# NOTE(review): the arguments below are inferred from the surrounding cells
# (PATH_TO_OTHERS, the dictionary pickled to PATH_TO_DATA, and the 10/10/30-minute
# settings used for the other Catch Me style datasets) - confirm before running.
others_train_data = prepare_train_set(PATH_TO_OTHERS,
                                      os.path.join(PATH_TO_DATA, 'site_freq_catch_me.pkl'),
                                      session_length=10, window_size=10, time_limit=30*60)