In [96]:
import pandas as pd
import numpy as np
import glob,os,pickle
from datetime import datetime
from tqdm import tqdm 
from collections import defaultdict

In [100]:
# Взвращаем ИД сайта по его названию
def get_site_id(site,site_dict):
    print ('site:',site)
    return site_dict[site][0]

# Вырезаем ИД пользователя из имени файла
def get_user_id(filename):
    start_pos=(filename.find('\\user'))+5
    end_pos=(filename.find('.csv'))
    return int(filename[start_pos:end_pos])
# Заменяем названия сайтов на ИД, формируем словарь частот сайтов
def get_site_dict(all_user_log):
    # Расчитываем частоту freq для каждого сайта 
    all_user_log['freq'] = all_user_log.groupby('site')['site'].transform('count')
    # Формируем словарь
    site_dict=all_user_log[['site','freq']].drop_duplicates().sort_values(by='freq',ascending=False).reset_index(drop=True)
    site_dict['site_id']=site_dict.index+1
    site_dict.set_index(site_dict.site,inplace=True)
    site_dict['site_freq']=site_dict.apply(lambda row: (row['site_id'],row['freq']), axis=1)
    all_user_log=all_user_log.join(site_dict[['site','site_id']],on=['site'],rsuffix='r')
    site_dict=site_dict.site_freq.to_dict()
    return all_user_log[['timestamp','site_id','user_id']],site_dict

def user_log_to_session(user_log_df,user_id,session_length,window_size):
    res=pd.DataFrame()
    # Добавляем колонки
    for col in range(1,session_length+1):
        shift_index=col-1
        site_col_name='site'+str(col)
        res[site_col_name]=user_log_df.site_id[shift_index:]
        res[site_col_name]=res[site_col_name].shift(-shift_index)
        timestamp_col_name='time'+str(col)
        res[timestamp_col_name]=user_log_df.timestamp[shift_index:]
        res[timestamp_col_name]=res[timestamp_col_name].shift(-shift_index)
    # Удаляем лишние строки
    res['user_id']=user_id
    ind_tosave=[x for x in range(0,len(user_log_df),window_size)]
    return res.iloc[ind_tosave,:]

# Функция возвращает подготовленные данные и словарь сайтов
def prepare_train_set(path_to_csv_files, session_length,window_size):
    # Последовательно читаем все файлы сессий в каталоге и склеиваем их в один DataFrame = sessions_df
    all_files = glob.glob(os.path.join(path_to_csv_files, "*.csv"))
    print('1. Читаем исходные данные')
    user_log=[]
    for filename in all_files:
        df_user=pd.read_csv(filename)
        df_user['user_id']=get_user_id(filename)
        user_log.append(df_user)
    all_user_log = pd.concat(user_log, ignore_index=True)
    print('2. Формируем словарь')
    # Формируем словарь сайтов
    all_user_log,site_dict=get_site_dict(all_user_log)
    print('3. Формируем сессии')
    sessions=pd.DataFrame()
    for user_id in tqdm(all_user_log.user_id.value_counts().index):
        user_df=all_user_log[all_user_log.user_id==user_id]
        user_session=user_log_to_session(user_df,user_id,session_length,window_size)
        sessions=pd.concat([sessions,user_session])
    return sessions,site_dict

def resize_train(path_to_csv_files, session_length,window_size):
    sessions,dict_=prepare_train_set(path_to_csv_files, session_length,window_size)
    with open('train_'+str(window_size)+'_session.pkl', 'wb') as pkl_file:
        pickle.dump(sessions, pkl_file)
    with open('train_'+str(window_size)+'_dict.pkl', 'wb') as pkl_file:
        pickle.dump(dict_, pkl_file)

In [105]:
%%time
for window_size in [3,5,7,10]:
    print ('window_size=',window_size)
    resize_train('tr/',10,window_size)

window_size= 3
1. Читаем исходные данные
2. Формируем словарь
3. Формируем сессии



  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  9.85it/s]


window_size= 5
1. Читаем исходные данные
2. Формируем словарь
3. Формируем сессии



  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  7.86it/s]


window_size= 7
1. Читаем исходные данные
2. Формируем словарь
3. Формируем сессии


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  9.45it/s]


window_size= 10
1. Читаем исходные данные
2. Формируем словарь
3. Формируем сессии


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.09it/s]


Wall time: 1.63 s


In [93]:
pickle.dump

NameError: name 'pickle' is not defined

In [92]:
sessions

Unnamed: 0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
0,142,2014-01-30 13:36:20,111.0,2014-01-30 13:36:20,115.0,2014-01-30 13:36:20,95.0,2014-01-30 13:36:20,158.0,2014-01-30 13:36:20,...,2014-01-30 13:36:20,114.0,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,115.0,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,1
3,95,2014-01-30 13:36:20,158.0,2014-01-30 13:36:20,114.0,2014-01-30 13:36:20,114.0,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,...,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,5.0,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,1
6,114,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,115.0,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,...,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,50.0,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,1
9,95,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,5.0,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,...,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,9.0,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,1
12,4,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,50.0,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,...,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,4.0,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,1
15,4,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,9.0,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,...,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,113.0,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,1
18,8,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,4.0,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,...,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,5.0,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,1
21,9,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,113.0,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,...,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,191.0,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,1
24,50,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,5.0,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,...,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,5.0,2014-01-30 13:42:31,14.0,2014-01-30 13:42:32,39.0,2014-01-30 13:42:32,1
27,209,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,191.0,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,5.0,2014-01-30 13:42:31,...,2014-01-30 13:42:32,39.0,2014-01-30 13:42:32,23.0,2014-01-30 13:42:32,109.0,2014-01-30 13:42:32,5.0,2014-01-30 13:42:32,1


{'0.docs.google.com': (79, 6),
 '0.drive.google.com': (56, 9),
 '0.gravatar.com': (146, 3),
 '0.talkgadget.google.com': (161, 2),
 '1.docs.google.com': (100, 4),
 '1.drive.google.com': (122, 4),
 '1.gravatar.com': (141, 3),
 '2.drive.google.com': (175, 2),
 '8c.img.v4.skyrock.net': (280, 1),
 'a392.idata.over-blog.com': (184, 2),
 'acc.linternaute.com': (151, 3),
 'accounts.google.com': (11, 76),
 'accounts.google.fr': (34, 20),
 'accounts.youtube.com': (40, 15),
 'ad.foxitsoftware.com': (245, 1),
 'af.lygo.com': (312, 1),
 'ajax.googleapis.com': (38, 18),
 'ajax.microsoft.com': (158, 2),
 'annotathon.org': (2, 724),
 'annotathon.univ-mrs.fr': (110, 4),
 'api.bing.com': (29, 25),
 'api.dailymotion.com': (242, 1),
 'api.dmcloud.net': (130, 3),
 'api.facebook.com': (297, 1),
 'api.recaptcha.net': (155, 2),
 'api.recsys.opera.com': (36, 19),
 'api.twitter.com': (102, 4),
 'apis.google.com': (9, 123),
 'apr.lijit.com': (164, 2),
 'archimer.ifremer.fr': (49, 11),
 'assets.pinterest.com': (9

In [78]:
sessions=pd.DataFrame()
session_length=10
window_size=3
for user_id in tqdm(a.user_id.value_counts().index):
    user_df=a[a.user_id==user_id]
    user_session=user_log_to_session(user_df,user_id,session_length,window_size)
    sessions=pd.concat([sessions,user_session])

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.41it/s]


In [79]:
sessions

Unnamed: 0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
0,142,2014-01-30 13:36:20,111.0,2014-01-30 13:36:20,115.0,2014-01-30 13:36:20,95.0,2014-01-30 13:36:20,158.0,2014-01-30 13:36:20,...,2014-01-30 13:36:20,114.0,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,115.0,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,1
3,95,2014-01-30 13:36:20,158.0,2014-01-30 13:36:20,114.0,2014-01-30 13:36:20,114.0,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,...,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,5.0,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,1
6,114,2014-01-30 13:36:21,111.0,2014-01-30 13:36:21,115.0,2014-01-30 13:36:21,95.0,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,...,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,50.0,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,1
9,95,2014-01-30 13:36:21,4.0,2014-01-30 13:36:24,5.0,2014-01-30 13:36:25,4.0,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,...,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,9.0,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,1
12,4,2014-01-30 13:36:32,4.0,2014-01-30 13:36:39,50.0,2014-01-30 13:36:39,4.0,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,...,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,4.0,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,1
15,4,2014-01-30 13:36:42,4.0,2014-01-30 13:36:44,9.0,2014-01-30 13:36:45,8.0,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,...,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,113.0,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,1
18,8,2014-01-30 13:36:47,113.0,2014-01-30 13:36:49,4.0,2014-01-30 13:36:51,9.0,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,...,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,5.0,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,1
21,9,2014-01-30 13:36:51,8.0,2014-01-30 13:36:51,113.0,2014-01-30 13:36:51,50.0,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,...,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,191.0,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,1
24,50,2014-01-30 13:37:41,5.0,2014-01-30 13:42:29,5.0,2014-01-30 13:42:30,209.0,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,...,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,5.0,2014-01-30 13:42:31,14.0,2014-01-30 13:42:32,39.0,2014-01-30 13:42:32,1
27,209,2014-01-30 13:42:30,195.0,2014-01-30 13:42:30,191.0,2014-01-30 13:42:30,53.0,2014-01-30 13:42:31,5.0,2014-01-30 13:42:31,...,2014-01-30 13:42:32,39.0,2014-01-30 13:42:32,23.0,2014-01-30 13:42:32,109.0,2014-01-30 13:42:32,5.0,2014-01-30 13:42:32,1


Unnamed: 0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
2624,116,2013-11-27 13:35:14,142.0,2013-11-27 13:35:17,111.0,2013-11-27 13:35:17,115.0,2013-11-27 13:35:17,95.0,2013-11-27 13:35:17,...,2013-11-27 13:35:17,158.0,2013-11-27 13:35:17,95.0,2013-11-27 13:35:18,111.0,2013-11-27 13:35:18,115.0,2013-11-27 13:35:18,5
2627,115,2013-11-27 13:35:17,95.0,2013-11-27 13:35:17,114.0,2013-11-27 13:35:17,158.0,2013-11-27 13:35:17,95.0,2013-11-27 13:35:18,...,2013-11-27 13:35:18,115.0,2013-11-27 13:35:18,114.0,2013-11-27 13:35:18,5.0,2013-11-27 13:35:22,5.0,2013-11-27 13:35:28,5


In [16]:
window_size=3
session_length=10
max_sesssion_len=200
# Обрабатываем лог юзера и возвращем индексы записей лога, которые должны войти в одну сессию 
def process_user(user_df,window_size,session_length,max_sesssion_len):
    session_coords=[]
    start_index=user_df.index.min()
    max_index=user_df.index.max()
    while start_index<max_index:
        for pad in range(session_length):
            end_index=start_index+session_length-pad-1
            if end_index>max_index:
                end_index=max_index
            time1=datetime.strptime(user_df.loc[start_index,'timestamp'], '%Y-%m-%d %H:%M:%S')
            time2=datetime.strptime(user_df.loc[end_index,'timestamp'], '%Y-%m-%d %H:%M:%S')
            session_len=(time2-time1).total_seconds()
            if session_len<=max_sesssion_len:
                break
        session_coords.append([start_index,end_index])
        start_index+=window_size
    return (session_coords)

In [24]:
session_coords=[]
for user_id in tqdm(a.user_id.value_counts().index):
    user_df=a[a.user_id==user_id]
    session_coords=process_user(user_df,window_size,session_length,max_sesssion_len)
    sessiond_df=log_to_session(session_coords,user_df,user_id)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.19s/it]


In [25]:
sessiond_df

Unnamed: 0,time1,site1,time2,site2,time3,site3,time4,site4,time5,site5,...,site6,time7,site7,time8,site8,time9,site9,time10,site10,user_id
0,,,,,,,,,,,...,,,,,,,,,,5
1,,,,,,,,,,,...,,,,,,,,,,5
2,,,,,,,,,,,...,,,,,,,,,,5
3,,,,,,,,,,,...,,,,,,,,,,5
4,,,,,,,,,,,...,,,,,,,,,,5
5,,,,,,,,,,,...,,,,,,,,,,5
6,,,,,,,,,,,...,,,,,,,,,,5
7,,,,,,,,,,,...,,,,,,,,,,5
8,,,,,,,,,,,...,,,,,,,,,,5
9,,,,,,,,,,,...,,,,,,,,,,5


In [21]:
def log_to_session(session_coords,user_df,user_id):
    sessions_num=len(session_coords)
    sessions=np.array([])
    for coord in session_coords:
        session_log=user_df.iloc[coord[0]:coord[1]+1,:][['timestamp','site_id']].as_matrix()
        if len(session_log) < windows_size:
            ar_pad=np.empty(2*(windows_size-len(session_log)))
            ar_pad.fill(np.nan)
            session_log=np.append(session_log, ar_pad)
        sessions=np.append(sessions, session_log)
        sessions=np.append(sessions, user_id)
    col_name=np.array([['time'+str(x),'site'+str(x)] for x in range(1,windows_size+1)]).reshape([1,2*windows_size])[0]
    col_name=np.append(col_name, ['user_id'])
    sessions=pd.DataFrame(sessions.reshape([sessions_num,1+2*windows_size]),columns=col_name)
    return sessions
    
t=log_to_session(session_coords,a,1)

In [22]:
t

Unnamed: 0,time1,site1,time2,site2,time3,site3,time4,site4,time5,site5,...,site6,time7,site7,time8,site8,time9,site9,time10,site10,user_id
0,2014-01-30 13:36:20,142,2014-01-30 13:36:20,111,2014-01-30 13:36:20,115,2014-01-30 13:36:20,95,2014-01-30 13:36:20,158,...,114,2014-01-30 13:36:21,114,2014-01-30 13:36:21,111,2014-01-30 13:36:21,115,2014-01-30 13:36:21,95,1
1,2014-01-30 13:36:20,95,2014-01-30 13:36:20,158,2014-01-30 13:36:20,114,2014-01-30 13:36:21,114,2014-01-30 13:36:21,111,...,115,2014-01-30 13:36:21,95,2014-01-30 13:36:24,4,2014-01-30 13:36:25,5,2014-01-30 13:36:32,4,1
2,2014-01-30 13:36:21,114,2014-01-30 13:36:21,111,2014-01-30 13:36:21,115,2014-01-30 13:36:21,95,2014-01-30 13:36:24,4,...,5,2014-01-30 13:36:32,4,2014-01-30 13:36:39,4,2014-01-30 13:36:39,50,2014-01-30 13:36:42,4,1
3,2014-01-30 13:36:21,95,2014-01-30 13:36:24,4,2014-01-30 13:36:25,5,2014-01-30 13:36:32,4,2014-01-30 13:36:39,4,...,50,2014-01-30 13:36:42,4,2014-01-30 13:36:44,4,2014-01-30 13:36:45,9,2014-01-30 13:36:47,8,1
4,2014-01-30 13:36:32,4,2014-01-30 13:36:39,4,2014-01-30 13:36:39,50,2014-01-30 13:36:42,4,2014-01-30 13:36:44,4,...,9,2014-01-30 13:36:47,8,2014-01-30 13:36:49,113,2014-01-30 13:36:51,4,2014-01-30 13:36:51,9,1
5,2014-01-30 13:36:42,4,2014-01-30 13:36:44,4,2014-01-30 13:36:45,9,2014-01-30 13:36:47,8,2014-01-30 13:36:49,113,...,4,2014-01-30 13:36:51,9,2014-01-30 13:36:51,8,2014-01-30 13:36:51,113,2014-01-30 13:37:41,50,1
6,2014-01-30 13:36:47,8,2014-01-30 13:36:49,113,2014-01-30 13:36:51,4,2014-01-30 13:36:51,9,2014-01-30 13:36:51,8,...,113,2014-01-30 13:37:41,50,,,,,,,1
7,2014-01-30 13:36:51,9,2014-01-30 13:36:51,8,2014-01-30 13:36:51,113,2014-01-30 13:37:41,50,,,...,,,,,,,,,,1
8,2014-01-30 13:37:41,50,,,,,,,,,...,,,,,,,,,,1
9,2014-01-30 13:42:30,209,2014-01-30 13:42:30,195,2014-01-30 13:42:30,191,2014-01-30 13:42:31,53,2014-01-30 13:42:31,5,...,14,2014-01-30 13:42:32,39,2014-01-30 13:42:32,23,2014-01-30 13:42:32,109,2014-01-30 13:42:32,5,1


In [194]:
col_name=np.array([['time'+str(x),'site'+str(x)] for x in range(1,windows_size+1)]).reshape([1,2*windows_size])[0]

array(['time1', 'site1', 'time2', 'site2', 'time3', 'site3', 'time4',
       'site4', 'time5', 'site5', 'time6', 'site6', 'time7', 'site7',
       'time8', 'site8', 'time9', 'site9', 'time10', 'site10', 'user_id'], 
      dtype='<U7')

In [134]:
p.shape

(9, 2)