In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

In [2]:
PATH_TO_DATA = '..\\capstone_user_identification'

In [3]:
PATH_TO_EXPS = 'user_identification_exps'

In [15]:
def create_features(path_to_train_data, path_to_freq_file, session_length):
    '''Функция, которая создает различные признаки.
    path_to_train_data - путь к .csv-файлу с сессиями пользователей;
    path_to_freq_file - путь к словарю частот сайтов;
    session_length - длинна сессии.
    
    Создаваемые признаки:
    session_timespan - длина сессии, сек;
    unique_sites - кол-во уникальных сайтов в сессии;
    top_sites_time - среднее время проведенное на сайте из топ-30 (усреднение по сайтам из топ-30 в данной сессии);
    is_top_i - наличие i-го сайта из топ-30 в данной сессии;
    facebook_freq - частота посещения facebook в данной сессии.
    
    Функция возвращает датафрейм с построенными признаками, но без исходных данных'''
    data = pd.read_csv(path_to_train_data, index_col=0)
    with open(path_to_freq_file, 'rb') as site_freq_pkl:
        site_freq = pickle.load(site_freq_pkl)
    sites_data = data[['site'+str(i) for i in range(1, session_length)]]
    diff_times = data[['diff_time'+str(i) for i in range(1, session_length)]]
    data[['session_timespan']] = diff_times.sum(axis=1) 
    data[['unique_sites']] = sites_data.apply(np.unique, axis=1).apply(lambda x: x[x!=0].shape[0])
    data[['top_sites_time']] = \
        (diff_times.values*(sites_data<31).values).sum(axis=1)/np.count_nonzero((sites_data<31).values, axis=1)
    for i in range(1, 31):
        data[['is_top'+str(i)]] = (sites_data==i).any(axis=1)
    facebook_id = site_freq['www.facebook.com'][0]
    data[['facebook_freq']] = (sites_data==facebook_id).sum(axis=1)
    
    return data.fillna(0)[['session_timespan', 'unique_sites', 'top_sites_time']+
                          ['is_top'+str(i) for i in range(1, 31)]+['facebook_freq']]

## Признаки для 10 и 150 пользователей с параметрами соревнования Catch Me

In [16]:
%%time
features_10users = create_features(os.path.join(PATH_TO_EXPS, 'train_data_10users_for_catch_me.csv'), 
                                   os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'),
                                  session_length=10)
features_150users = create_features(os.path.join(PATH_TO_EXPS, 'train_data_150users_for_catch_me.csv'), 
                                   os.path.join(PATH_TO_DATA, 'site_freq_150users.pkl'),
                                   session_length=10)

  (diff_times.values*(sites_data<31).values).sum(axis=1)/np.count_nonzero((sites_data<31).values, axis=1)
  (diff_times.values*(sites_data<31).values).sum(axis=1)/np.count_nonzero((sites_data<31).values, axis=1)


Wall time: 3.8 s


In [17]:
features_10users.head()

Unnamed: 0_level_0,session_timespan,unique_sites,top_sites_time,is_top1,is_top2,is_top3,is_top4,is_top5,is_top6,is_top7,...,is_top22,is_top23,is_top24,is_top25,is_top26,is_top27,is_top28,is_top29,is_top30,facebook_freq
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,33,5,0.5,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,284,9,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,258,7,7.0,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1
4,30,5,0.0,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
5,6,9,1.0,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1


In [24]:
features_10users.to_csv(os.path.join(PATH_TO_EXPS, 
                                        'new_features_10users_for_catch_me.csv'), 
                         index_label='session_id', float_format='%d')
features_150users.to_csv(os.path.join(PATH_TO_EXPS, 
                                       'new_features_150users_for_catch_me.csv'), 
                        index_label='session_id', float_format='%d')

In [5]:
data = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_catch_me.csv'), index_col=0)

In [6]:
data.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,...,diff_time6,diff_time7,diff_time8,diff_time9,start_hour,day_of_week,year,month,day,time_of_day
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,718,0,0,0,0,0,0,0,0,0,...,0,0,0,0,10,3,2014,2,20,0
1,890,941,3847,941,942,3846,3847,3846,1516,1518,...,1,0,23,1,11,5,2014,2,22,0
2,14769,39,14768,14769,37,39,14768,14768,14768,14768,...,1,1,1,2,16,0,2013,12,16,1
3,782,782,782,782,782,782,782,782,782,782,...,30,30,30,30,10,4,2014,3,28,0
4,22,177,175,178,177,178,175,177,177,178,...,0,0,67,5,10,4,2014,2,28,0


In [34]:
sites_data = data[['site'+str(i) for i in range(1, 11)]]

In [27]:
top_sites = pd.Series(sites_data.values.ravel()).value_counts().drop([0]).index[:30]

In [31]:
top_sites

Int64Index([ 21,  23, 782,  22,  29, 167, 780, 778,  52, 812,  80, 570,  55,
             39,  37,  30, 786,  35,  76,  33,   3,  77, 616, 733, 677, 229,
             56, 270,  38, 941],
           dtype='int64')

In [35]:
sites_data==top_sites[0]

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
253556,False,False,False,False,False,False,False,False,False,False
253557,False,False,False,False,False,False,False,False,False,True
253558,False,False,False,False,False,False,False,False,False,False
253559,False,False,False,False,False,False,False,False,False,False


In [36]:
sites_data==top_sites[1]

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
253556,False,False,False,False,False,False,False,False,False,False
253557,False,False,False,False,False,True,False,False,True,False
253558,False,False,False,False,False,False,False,False,False,False
253559,False,False,False,False,False,False,False,False,False,False


In [38]:
(sites_data==top_sites[0]) | (sites_data==top_sites[1])

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
253556,False,False,False,False,False,False,False,False,False,False
253557,False,False,False,False,False,True,False,False,True,True
253558,False,False,False,False,False,False,False,False,False,False
253559,False,False,False,False,False,False,False,False,False,False


In [39]:
sites_matrix = (sites_data==top_sites[0])
for i in range(1, 30):
    sites_matrix = sites_matrix | (sites_data==top_sites[i])

In [40]:
sites_matrix

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,False,False,False,False,False,False,False,False,False,False
1,False,True,False,True,False,False,False,False,False,False
2,False,True,False,False,True,True,False,False,False,False
3,True,True,True,True,True,True,True,True,True,True
4,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
253556,False,False,False,False,False,False,False,False,False,False
253557,False,False,False,True,False,True,False,False,True,True
253558,False,False,False,False,False,False,False,False,False,False
253559,True,False,False,True,True,True,False,True,True,True


In [49]:
def create_features_from_ready(path_to_train_data):
    data = pd.read_csv(path_to_train_data, index_col=0)
    sites_data = data[['site'+str(i) for i in range(1, 11)]]
    diff_times = data[['diff_time'+str(i) for i in range(1, 10)]]
    data[['session_timespan']] = diff_times.sum(axis=1) 
    data[['unique_sites']] = sites_data.apply(np.unique, axis=1).apply(lambda x: x[x!=0].shape[0])
    top_sites = pd.Series(sites_data.values.ravel()).value_counts().drop([0]).index[:30]
    sites_matrix = (sites_data==top_sites[0])
    for i in range(1, 30):
        sites_matrix = sites_matrix | (sites_data==top_sites[i])
    data[['top_sites_time']] = \
        (diff_times.values*sites_matrix.values[:, :-1]).sum(axis=1)/np.count_nonzero(sites_matrix.values[:, :-1], axis=1)
    for i in range(1, 31):
        data[['is_top'+str(i)]] = (sites_data==top_sites[i-1]).any(axis=1)
    
    return data.fillna(0).iloc[:, 25:]

In [50]:
train_feats_catch_me = create_features_from_ready(os.path.join(PATH_TO_DATA, 'train_catch_me.csv'))

  (diff_times.values*sites_matrix.values[:, :-1]).sum(axis=1)/np.count_nonzero(sites_matrix.values[:, :-1], axis=1)


In [54]:
test_feats_catch_me = create_features_from_ready(os.path.join(PATH_TO_DATA, 'test_catch_me.csv'))

  (diff_times.values*sites_matrix.values[:, :-1]).sum(axis=1)/np.count_nonzero(sites_matrix.values[:, :-1], axis=1)


In [56]:
train_feats_catch_me.to_csv(os.path.join(PATH_TO_DATA, 
                                        'train_feats_catch_me.csv'), 
                         index_label='session_id', float_format='%d')
test_feats_catch_me.to_csv(os.path.join(PATH_TO_DATA, 
                                       'test_feats_catch_me.csv'), 
                        index_label='session_id', float_format='%d')