In [1]:
import os
os.chdir('../')

In [10]:
import data
import pandas as pd
import numpy as np
from preprocess_utils.last_clickout_indices import find as find_last_clickout_indices
from tqdm.auto import tqdm
tqdm.pandas()
pd.options.display.max_colwidth = -1
pd.options.display.max_columns = None

In [3]:
from extract_features.rnn.reference_price_in_last_clickout import ReferencePriceInLastClickout
from extract_features.rnn.clickout_vector_prices import ClickoutVectorPrices
rp_f = ReferencePriceInLastClickout() #.read_feature(one_hot=False)
vp_f = ClickoutVectorPrices() #.read_feature(one_hot=False)

In [3]:
df = data.train_df('small')

### Complete code at the end

In [5]:
df.tail()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
15932974,ZYNMLE3MV3LK,62728015bec05,1541544470,2,interaction item image,6617798,PT,"Paris, France",desktop,,,,14.0
15932988,ZYNMLE3MV3LK,62728015bec05,1541544491,16,clickout item,6617798,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...,1.0
15932989,ZYNMLE3MV3LK,62728015bec05,1541544540,17,clickout item,2712342,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...,1.0
15932990,ZYNMLE3MV3LK,62728015bec05,1541544967,18,change of sort order,interaction sort button,PT,"Paris, France",desktop,,,,1.0
15932991,ZYNMLE3MV3LK,62728015bec05,1541544973,19,clickout item,,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...,1.0


In [41]:
def reference_price(df):
    df = df.sort_values(['user_id','session_id','timestamp','step']).reset_index()

    # find the last clickout rows
    last_clickout_idxs = find_last_clickout_indices(df)
    clickout_rows = df.loc[last_clickout_idxs, ['user_id','session_id','action_type','impressions','prices']]
    clickout_rows['impression_list'] = clickout_rows.impressions.str.split('|').apply(lambda x: list(map(int, x)))
    clickout_rows['price_list'] = clickout_rows.prices.str.split('|').apply(lambda x: list(map(int, x)))
    # find the interactions with numeric reference
    reference_rows = df[['user_id','session_id','reference','action_type', 'index']]
    reference_rows = reference_rows[df.reference.str.isnumeric() == True].astype({'reference':'int'})
    # skip last clickouts
    reference_rows = reference_rows.loc[~reference_rows.index.isin(last_clickout_idxs)]
    reference_rows = reference_rows.drop('action_type',axis=1)
    # store the resulting series of prices
    price_series = np.ones(reference_rows.shape[0], dtype=float) * (-1)

    # min_price = 999999
    # max_price = -999999

    # iterate over the sorted reference_rows and clickout_rows
    j = 0
    clickout_indices = clickout_rows.index.values
    ckidx = clickout_indices[j]
    next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
    next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']
    k = 0
    for row in tqdm(zip(reference_rows.index, reference_rows.user_id, reference_rows.session_id, 
                        reference_rows.reference)):
        idx = row[0]
        # if the current index is over the last clickout, break
        if idx >= clickout_indices[-1]:
            break
        # find the next clickout index
        while idx > clickout_indices[j]:
            j += 1
            ckidx = clickout_indices[j]
            next_clickout_user_id = clickout_rows.at[ckidx, 'user_id']
            next_clickout_sess_id = clickout_rows.at[ckidx, 'session_id']

        # check if row and next_clickout are in the same session
        if row[1] == next_clickout_user_id and row[2] == next_clickout_sess_id:
            impress = clickout_rows.at[ckidx, 'impression_list']
            row_reference = row[3]
            if row_reference in impress:
                ref_idx = impress.index(row_reference)
                price_list = clickout_rows.at[ckidx, 'price_list']
                ref_price = price_list[ref_idx]
                price_series[k] = ref_price
                # update min and max
                # min_price = min(min_price, min(price_list))
                # max_price = max(max_price, max(price_list))  
        k += 1

    # find max and min prices, expanding the prices as vector and then find max and min
    temp = df['prices'].dropna().str.split('|', expand=True).astype('float')
    max_price = temp.max().max()
    min_price = temp.min().min()
    
    print('ref_price:', max_price, min_price)

    # scale
    mask_na = price_series > 0
    price_series[mask_na] = np.log(price_series[mask_na])
    min_price = np.log(min_price)
    max_price = np.log(max_price)
    print('ref_price log:', max_price, min_price)
    
    price_series[mask_na] = (price_series[mask_na] - min_price) / (max_price - min_price)
    price_series[~mask_na] = 0
    
    reference_rows['price'] = price_series
    return reference_rows.drop(['user_id','session_id','reference'], axis=1).set_index('index')

def vector_price(df):
    # find the clickout interactions
    res_df = df[['user_id','session_id','prices']]
    res_df = res_df[df.action_type == 'clickout item']

    # expand the prices as vector
    expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')
        
    # scale log
    mask_non_zero = expanded_prices > 0
    max_price = max(np.max(expanded_prices[mask_non_zero]))
    min_price = min(np.min(expanded_prices[mask_non_zero]))
    print('vector_price:', max_price, min_price)
    
    max_price = np.log(max_price)
    min_price = np.log(min_price)
    print('vector_price log:', max_price, min_price)
    
    log_prices = np.log(expanded_prices[mask_non_zero])
    log_prices = ((log_prices - min_price) / (max_price - min_price)).fillna(0)

    # add the prices to the resulting df
    for i in range(25):
        res_df['price_{}'.format(i)] = log_prices.loc[:, i]

    return res_df.drop(['user_id','session_id','prices'], axis=1)

In [42]:
ref_price = reference_price(df)

HBox(children=(IntProgress(value=0, max=72350), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


ref_price: 9999.0 5.0
ref_price log: 9.21024036697585 1.6094379124341003


In [43]:
vector_price = vector_price(df)

vector_price: 9999.0 5.0
vector_price log: 9.21024036697585 1.6094379124341003


In [5]:
merged = vp_f.join_to(rp_f.join_to(df)) \
    .drop(['current_filters','platform','city','device'],axis=1)
merged

  mask |= (ar1 == a)


reference_price_in_last_clickout feature read
clickout_vector_prices feature read


Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,impressions,prices,frequence,price,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10,price_11,price_12,price_13,price_14,price_15,price_16,price_17,price_18,price_19,price_20,price_21,price_22,price_23,price_24
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,,,3.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,,,5.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13,00RL8Z82B2Z1,aff3928535f48,1541037543,14,clickout item,109038,3400638|1253714|3367857|5100540|1088584|666916|54833|2922310|9711560|109038|666856|10077318|1431482|129343|6339822|6806806|1041528|109013|3909420|55088|3095758|109018|54885|1257342|2595006,95|66|501|112|95|100|101|72|82|56|56|143|70|25|71|162|73|143|188|118|77|131|143|49|165,1.0,0.000000,0.495562,0.456514,0.675169,0.513264,0.495562,0.501075,0.502145,0.465826,0.479764,0.438964,0.438964,0.539585,0.462810,0.353739,0.464328,0.553041,0.467303,0.539585,0.569109,0.518881,0.473019,0.530138,0.539585,0.424738,0.555021
14,00RL8Z82B2Z1,aff3928535f48,1541038469,15,search for poi,Surry Hills,,,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
15,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,55109|129343|54824|2297972|109014|1257342|1031578|109018|1332971|666916|54833|54885|2237222|10077318|1166793|9132132|1474297|3909420|6622154|55091|8444418|54804|2050977|666936|55088,162|25|150|143|101|49|118|131|18|100|101|143|51|143|123|36|66|188|124|138|94|137|180|112|118,1.0,0.000000,0.553041,0.353739,0.544738,0.539585,0.502145,0.424738,0.518881,0.530138,0.319685,0.501075,0.502145,0.539585,0.428996,0.539585,0.523350,0.392046,0.456514,0.569109,0.524222,0.535748,0.494426,0.534964,0.564414,0.513264,0.518881


In [6]:
merged[merged.session_id == '3599a6f709eab']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,impressions,prices,frequence,price,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10,price_11,price_12,price_13,price_14,price_15,price_16,price_17,price_18,price_19,price_20,price_21,price_22,price_23,price_24
81,02SRUT1NQYH1,3599a6f709eab,1541063730,1,interaction item image,2795374,,,33.0,0.453223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,02SRUT1NQYH1,3599a6f709eab,1541063863,34,interaction item info,2795374,,,1.0,0.453223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,02SRUT1NQYH1,3599a6f709eab,1541063864,35,clickout item,2795374,2795374|5582964|1088390|2781070|1258068|1271962|3184892|148884|3528776|107183|5156744|107048|1907333|3370484|6003326|8436316|9025316|125181|3861490|131257|4415954|107162|3143352|6652864|8118684,64|54|36|121|76|81|92|40|73|52|98|104|56|414|67|111|21|122|55|104|56|64|40|29|44,1.0,0.0,0.453223,0.435086,0.392046,0.521584,0.471618,0.478448,0.492115,0.403192,0.467303,0.431064,0.498903,0.505292,0.438964,0.654505,0.458122,0.512299,0.335602,0.522471,0.437042,0.505292,0.438964,0.453223,0.403192,0.369276,0.413299


In [6]:
def m_vector_prices(df):
    # find the clickout interactions
    res_df = df[['user_id','session_id','prices']]
    res_df = res_df[df.action_type == 'clickout item']

    # expand the prices as vector
    expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')

    # scale log
    mask_non_zero = expanded_prices > 0
    #log_prices = np.log(expanded_prices[mask_non_zero])
    
    # scale min-max
    max_price = np.log(max(np.max(expanded_prices[mask_non_zero])))
    min_price = np.log(min(np.min(expanded_prices[mask_non_zero])))
    
    return max_price, min_price

In [7]:
def m_ref_price(df):
    temp = df['prices'].dropna().str.split('|', expand=True).astype('float')
    max_price = temp.max().max()
    min_price = temp.min().min()
    
    return np.log(max_price), np.log(min_price)

In [8]:
m_vector_prices(df)

(9.210340371976184, 1.6094379124341003)

In [9]:
m_ref_price(df)

(9.210340371976184, 1.6094379124341003)

## Test

In [8]:
merged = vp_f.join_to(rp_f.join_to(df))

reference_price_in_last_clickout feature read
clickout_vector_prices feature read


In [9]:
merged[merged.session_id == '3599a6f709eab']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence,price,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10,price_11,price_12,price_13,price_14,price_15,price_16,price_17,price_18,price_19,price_20,price_21,price_22,price_23,price_24
81,02SRUT1NQYH1,3599a6f709eab,1541063730,1,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,33.0,0.335413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,02SRUT1NQYH1,3599a6f709eab,1541063863,34,interaction item info,2795374,FI,"Krakow, Poland",mobile,,,,1.0,0.335413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,02SRUT1NQYH1,3599a6f709eab,1541063864,35,clickout item,2795374,FI,"Krakow, Poland",mobile,,2795374|5582964|1088390|2781070|1258068|1271962|3184892|148884|3528776|107183|5156744|107048|1907333|3370484|6003326|8436316|9025316|125181|3861490|131257|4415954|107162|3143352|6652864|8118684,64|54|36|121|76|81|92|40|73|52|98|104|56|414|67|111|21|122|55|104|56|64|40|29|44,1.0,0.0,0.335413,0.313061,0.259717,0.419207,0.358023,0.366405,0.383159,0.273578,0.352724,0.308096,0.391471,0.399289,0.317846,0.58104,0.34144,0.407858,0.188804,0.42029,0.315475,0.399289,0.317846,0.335413,0.273578,0.23127,0.286118


In [3]:
full_df = data.full_df()

caching df_full...


  mask |= (ar1 == a)


Done!


In [4]:
full_df.shape

(6694934, 13)

In [5]:
full_df.head(15)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,1
1,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,3
2,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1
3,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1
4,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1
5,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1
6,00RL8Z82B2Z1,aff3928535f48,1541037542,13,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,5
7,00RL8Z82B2Z1,aff3928535f48,1541037543,14,clickout item,109038,AU,"Sydney, Australia",mobile,,3400638|1253714|3367857|5100540|1088584|666916...,95|66|501|112|95|100|101|72|82|56|56|143|70|25...,1
8,00RL8Z82B2Z1,aff3928535f48,1541038469,15,search for poi,Surry Hills,AU,"Sydney, Australia",mobile,,,,1
9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...,1


## Take only target rows (last clickout for each session)

In [65]:
def find_last_clickout_indices(df):
    indices = []
    cur_ses = ''
    cur_user = ''
    for idx,row in tqdm(df[df.action_type == 'clickout item'][['user_id','session_id','action_type']][::-1].iterrows()):
        ruid = row.user_id
        rsid = row.session_id
        if (ruid != cur_user or rsid != cur_ses):
            indices.append(idx)
            cur_user = ruid
            cur_ses = rsid
    return indices[::-1]

In [66]:
%time indices = find_last_clickout_indices(full_df)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

CPU times: user 3min 48s, sys: 1.44 s, total: 3min 50s
Wall time: 3min 50s


In [71]:
indices[:10]

[9, 21, 27, 45, 48, 49, 51, 57, 59, 62]

### Too slow?? Try this one ;)

In [5]:
def find_last_clickout_indices2(df):
    indices = []
    cur_ses = ''
    cur_user = ''
    temp_df = df[df.action_type == 'clickout item'][['user_id','session_id','action_type']]
    for idx in tqdm(temp_df.index.values[::-1]):
        ruid = temp_df.at[idx,'user_id']
        rsid = temp_df.at[idx,'session_id']
        if (ruid != cur_user or rsid != cur_ses):
            indices.append(idx)
            cur_user = ruid
            cur_ses = rsid
    return indices[::-1]

In [6]:
%time indices2 = find_last_clickout_indices2(full_df)

HBox(children=(IntProgress(value=0, max=2115365), HTML(value='')))


CPU times: user 44.4 s, sys: 583 ms, total: 45 s
Wall time: 45.2 s


CRAAAAA

In [7]:
indices2[:10]

[9, 21, 27, 45, 48, 49, 51, 57, 59, 62]

## Expand the impressions

In [9]:
%time base_df = full_df.loc[indices2].copy()

CPU times: user 861 ms, sys: 125 ms, total: 986 ms
Wall time: 996 ms


In [21]:
base_df.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109|129343|54824|2297972|109014|1257342|1031...,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
21,02SRUT1NQYH1,3599a6f709eab,1541063864,35,clickout item,2795374,FI,"Krakow, Poland",mobile,,2795374|5582964|1088390|2781070|1258068|127196...,64|54|36|121|76|81|92|40|73|52|98|104|56|414|6...,1
27,03K8AXBL4BX2,ec139e10b9238,1541100652,7,clickout item,1032816,UK,"London, United Kingdom",desktop,,12693|46363|81657|18448|47687|152913|18417|927...,104|92|100|103|102|104|72|85|81|75|107|86|98|8...,1
45,03P4VFKK12UO,325fafb5fa450,1541107538,55,clickout item,1320460,US,"Bakersfield, USA",desktop,Very Good Rating|5 Star|4 Star|Hotel|Motel|Res...,1306936|56482|2842358|6881276|65685|63259|6539...,178|104|110|94|57|96|46|61|48|35|50|38|59|44|5...,1
48,0473FZ8UNXRS,bcc452f3350eb,1541062532,3,clickout item,3143258,AU,"Legian, Indonesia",desktop,,1258184|3866722|8929970|2315702|116619|1511641...,51|43|69|49|62|50|55|42|87|46|43|114|194|50|19...,1


In [12]:
base_df.shape

(1102556, 13)

#### Bottleneck is coming...

In [22]:
def expand_impressions(df):
    res_df = df.copy()
    res_df.impressions = res_df.impressions.str.split('|')
    res_df = res_df.reset_index()
    
    res_df = pd.DataFrame({
      col:np.repeat(res_df[col].values, res_df.impressions.str.len())
      for col in res_df.columns.drop('impressions')}
    ).assign(**{'impressions':np.concatenate(res_df.impressions.values)})[res_df.columns]

    return res_df.rename(mapper={'impressions':'impression'})

In [24]:
%time expand_impressions(base_df)

CPU times: user 50.8 s, sys: 58.4 s, total: 1min 49s
Wall time: 2min 28s


Unnamed: 0,index,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
0,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
1,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,129343,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
2,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,54824,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
3,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,2297972,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
4,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,109014,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
5,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1257342,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
6,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1031578,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
7,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,109018,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
8,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1332971,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
9,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,666916,162|25|150|143|101|49|118|131|18|100|101|143|5...,1


## Complete code

In [4]:
full_df = data.full_df()

def find_last_clickout_indices(df):
    indices = []
    cur_ses = ''
    cur_user = ''
    temp_df = df[df.action_type == 'clickout item'][['user_id','session_id','action_type']]
    for idx in tqdm(temp_df.index.values[::-1]):
        ruid = temp_df.at[idx,'user_id']
        rsid = temp_df.at[idx,'session_id']
        if (ruid != cur_user or rsid != cur_ses):
            indices.append(idx)
            cur_user = ruid
            cur_ses = rsid
    return indices[::-1]

def expand_impressions(df):
    res_df = df.copy()
    res_df.impressions = res_df.impressions.str.split('|')
    res_df = res_df.reset_index()
    
    res_df = pd.DataFrame({
      col:np.repeat(res_df[col].values, res_df.impressions.str.len())
      for col in res_df.columns.drop('impressions')}
    ).assign(**{'impressions':np.concatenate(res_df.impressions.values)})[res_df.columns]

    return res_df.rename(mapper={'impressions':'impression'})

In [5]:
%time idxs = find_last_clickout_indices(full_df)

HBox(children=(IntProgress(value=0, max=2115365), HTML(value='')))


CPU times: user 43.2 s, sys: 531 ms, total: 43.7 s
Wall time: 43.9 s


In [6]:
%time base_df = full_df.loc[idxs].copy()

CPU times: user 1.1 s, sys: 332 ms, total: 1.43 s
Wall time: 1.43 s


In [7]:
%time base_df = expand_impressions(base_df)

CPU times: user 52.5 s, sys: 1min 8s, total: 2min 1s
Wall time: 2min 44s


In [8]:
base_df.shape

(25045602, 14)

In [9]:
base_df

Unnamed: 0,index,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
0,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,55109,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
1,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,129343,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
2,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,54824,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
3,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,2297972,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
4,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,109014,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
5,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1257342,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
6,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1031578,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
7,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,109018,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
8,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,1332971,162|25|150|143|101|49|118|131|18|100|101|143|5...,1
9,9,00RL8Z82B2Z1,aff3928535f48,1541038485,16,clickout item,1257342,AU,"Sydney, Australia",mobile,,666916,162|25|150|143|101|49|118|131|18|100|101|143|5...,1


### Tot time: 3min 26s

In [4]:
dataset = pd.read_csv('dataset/preprocessed/cluster_recurrent/small/dataset_classification_p6/X_train.csv', index_col=0)

In [5]:
dataset

Unnamed: 0_level_0,user_id,session_id,timestamp,step,reference,platform,city,current_filters,frequence,rp_0,rp_1,rp_2,rp_3,rp_4,rp_5,rp_6,rp_7,rp_8,rp_9,rp_10,rp_11,rp_12,rp_13,rp_14,rp_15,rp_16,rp_17,rp_18,rp_19,rp_20,rp_21,rp_22,rp_23,rp_24,glob_clickout_popularity,price,price_pos_0,price_pos_1,price_pos_2,price_pos_3,price_pos_4,price_pos_5,price_pos_6,price_pos_7,price_pos_8,price_pos_9,price_pos_10,price_pos_11,price_pos_12,price_pos_13,price_pos_14,price_pos_15,price_pos_16,price_pos_17,price_pos_18,price_pos_19,price_pos_20,price_pos_21,price_pos_22,price_pos_23,price_pos_24,impr_c0,impr_c1,impr_c2,impr_c3,impr_c4,impr_c5,impr_c6,impr_c7,impr_c8,impr_c9,impr_c10,impr_c11,impr_c12,impr_c13,impr_c14,impr_c15,impr_c16,impr_c17,impr_c18,impr_c19,impr_c20,impr_c21,impr_c22,impr_c23,impr_c24,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9,price_10,price_11,price_12,price_13,price_14,price_15,price_16,price_17,price_18,price_19,price_20,price_21,price_22,price_23,price_24,duration,satisf_perc_0,satisf_perc_1,satisf_perc_2,satisf_perc_3,satisf_perc_4,satisf_perc_5,satisf_perc_6,satisf_perc_7,satisf_perc_8,satisf_perc_9,satisf_perc_10,satisf_perc_11,satisf_perc_12,satisf_perc_13,satisf_perc_14,satisf_perc_15,satisf_perc_16,satisf_perc_17,satisf_perc_18,satisf_perc_19,satisf_perc_20,satisf_perc_21,satisf_perc_22,satisf_perc_23,satisf_perc_24,impr_pop0,impr_pop1,impr_pop2,impr_pop3,impr_pop4,impr_pop5,impr_pop6,impr_pop7,impr_pop8,impr_pop9,impr_pop10,impr_pop11,impr_pop12,impr_pop13,impr_pop14,impr_pop15,impr_pop16,impr_pop17,impr_pop18,impr_pop19,impr_pop20,impr_pop21,impr_pop22,impr_pop23,impr_pop24,sort_rating,sort_pop,sort_price,mobile,desktop,tablet,clickout item,interaction item rating,interaction item info,interaction item image,interaction item deals,search for item,search for destination,search for poi
orig_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1
-1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27774,0004IOZI7CKF,0146f7cb014ba,1541266717,1,"Valencia, Spain",DE,"Valencia, Spain",0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.302030,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,1,0,0,0,0,0,0,1,0
27775,0004IOZI7CKF,0146f7cb014ba,1541266769,2,3381482,DE,"Valencia, Spain",0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.112735,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.253489,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,1,0,0,1,0,0,0,0,0
27776,0004IOZI7CKF,0146f7cb014ba,1541266796,3,2627602,DE,"Valencia, Spain",0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.462810,0.481065,0.544738,0.539585,0.484875,0.468760,0.474402,0.496688,0.437042,0.465826,0.517965,0.482349,0.492115,0.438964,0.509351,0.538066,0.489755,0.522471,0.474402,0.529312,0.473019,0.483619,0.567372,0.422544,0.471618,0.291254,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.610146,0.659327,0.363205,0.527787,0.350487,0.235242,0.539183,0.414115,0.445001,0.202627,0.464518,0.101313,0.160578,0.574693,0.350487,0.000000,0.385736,0.101313,0.202627,0.374904,0.202627,0.160578,0.350487,0.405254,0.261891,0,0,0,0,0,1,1,0,0,0,0,0,0,0
27777,0004IOZI7CKF,0146f7cb014ba,1541266841,4,7822344,DE,"Valencia, Spain",0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.462810,0.481065,0.544738,0.539585,0.484875,0.468760,0.474402,0.496688,0.437042,0.465826,0.517965,0.482349,0.492115,0.438964,0.509351,0.538066,0.489755,0.522471,0.474402,0.529312,0.473019,0.483619,0.567372,0.422544,0.471618,0.253489,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.610146,0.659327,0.363205,0.527787,0.350487,0.235242,0.539183,0.414115,0.445001,0.202627,0.464518,0.101313,0.160578,0.574693,0.350487,0.000000,0.385736,0.101313,0.202627,0.374904,0.202627,0.160578,0.350487,0.405254,0.261891,0,0,0,0,0,1,1,0,0,0,0,0,0,0
27778,0004IOZI7CKF,0146f7cb014ba,1541266868,5,110985,DE,"Valencia, Spain",0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.470198,0.449830,0.514221,0.490942,0.507340,0.516109,0.470198,0.459707,0.478448,0.484875,0.392046,0.456514,0.484875,0.551024,0.541082,0.517965,0.461270,0.506321,0.454881,0.484875,0.479764,0.481065,0.465826,0.507340,0.494426,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.336556,0.321156,0.430372,0.303940,0.458297,0.350487,0.202627,0.303940,0.160578,0.363205,0.261891,0.385736,0.481734,0.511065,0.492178,0.303940,0.501926,0.546314,0.414115,0.261891,0.395820,0.374904,0.303940,0.437869,0.303940,0,0,0,0,0,1,1,0,0,0,0,0,0,0
-1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-1,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
len(dataset.columns.drop(['user_id','session_id','step','reference','platform','city','current_filters']))

169

In [3]:
full = data.full_df()

caching df_full...


  mask |= (ar1 == a)


Done!


In [4]:
full.loc[[16727760, 18888288]]

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
16727760,HDFSLRD2P5IC,cbe3752713eee,1541624037,5,clickout item,,IT,"Pozzuoli, Italy",mobile,,45927|1258844|21154|4719620|873351|21061|21072|21081|21085|21100|21115|21126|21150|45499|45807|1240467|1542573|21057|1666029|7176920|21097|83964|153182|45930|1473027,93|183|240|99|62|166|179|161|94|191|83|65|94|58|74|130|50|60|96|47|511|71|70|124|55,1.0
18888288,7X4FZTVRCDQA,2a181b2125efe,1541592794,9,clickout item,,IT,"Falcone, Italy",mobile,,4622816|3389774|4743270|4090264|4906084|1390332|1949033|7197516|1668209|1857023|9790044|2520372|346171|103993|6402498|513786|9498644|2875112|7038592|1573641|3983502|8119076|2857132|642391|3214070,75|60|50|60|35|30|69|48|56|75|70|32|70|487|54|82|32|30|78|63|92|61|61|50|40,1.0


In [5]:
full[full.session_id == 'cbe3752713eee']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
16727756,HDFSLRD2P5IC,cbe3752713eee,1541623639,1,clickout item,1258844.0,IT,"Naples, Italy",mobile,,1258844|4719620|896773|21097|1240467|45930|153190|2755116|21154|17292|21072|21085|21100|21126|21153|45881|81395|1495041|1966097|2816732|6982518|7814148|575441|21121|2759590,107|69|99|98|109|108|122|75|222|70|129|95|131|58|49|49|80|39|77|79|39|69|81|125|60,1.0
16727757,HDFSLRD2P5IC,cbe3752713eee,1541623682,2,clickout item,21126.0,IT,"Naples, Italy",mobile,,1258844|4719620|1240467|21097|153190|45930|21154|21061|21072|21085|21100|21115|21126|21150|45499|45807|21057|21081|1666029|7176920|83964|153182|1473027|5723742|9492710,183|99|130|511|141|124|240|166|179|94|191|83|65|94|58|74|60|161|96|47|71|70|55|100|83,1.0
16727758,HDFSLRD2P5IC,cbe3752713eee,1541623722,3,interaction item info,45807.0,IT,"Naples, Italy",mobile,,,,1.0
16727759,HDFSLRD2P5IC,cbe3752713eee,1541623725,4,clickout item,45807.0,IT,"Naples, Italy",mobile,,1258844|4719620|1240467|21097|153190|45930|21154|21061|21072|21085|21100|21115|21126|21150|45499|45807|21057|21081|1666029|7176920|83964|153182|1473027|5723742|9492710,183|99|130|511|141|124|240|166|179|94|191|83|65|94|58|74|60|161|96|47|71|70|55|100|83,1.0
16727760,HDFSLRD2P5IC,cbe3752713eee,1541624037,5,clickout item,,IT,"Pozzuoli, Italy",mobile,,45927|1258844|21154|4719620|873351|21061|21072|21081|21085|21100|21115|21126|21150|45499|45807|1240467|1542573|21057|1666029|7176920|21097|83964|153182|45930|1473027,93|183|240|99|62|166|179|161|94|191|83|65|94|58|74|130|50|60|96|47|511|71|70|124|55,1.0


In [6]:
full[full.session_id == '2a181b2125efe']

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,frequence
5054814,7X4FZTVRCDQA,2a181b2125efe,1541329853,1,search for destination,"Rimini, Italy",IT,"Rimini, Italy",mobile,,,,1.0
6267791,7X4FZTVRCDQA,2a181b2125efe,1541506315,2,search for destination,"Milan, Italy",IT,"Milan, Italy",mobile,,,,1.0
8324291,7X4FZTVRCDQA,2a181b2125efe,1541103586,3,clickout item,94907,IT,"Giardini-Naxos, Italy",mobile,,94907|101278|82654|23607|23610|45848|1203406|2121304|113048|23609|45427|897473|101282|95937|45552|7018352|1716855|113932|6227466|94638|3208754|23600|95925|23602|4440326,82|55|87|187|85|98|184|94|86|111|144|87|223|49|104|61|54|49|105|105|33|330|264|118|136,1.0
18888280,7X4FZTVRCDQA,2a181b2125efe,1541683876,1,clickout item,45643,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55,1.0
18888281,7X4FZTVRCDQA,2a181b2125efe,1541683994,2,clickout item,104016,IT,"Catania, Italy",mobile,,45643|2609026|20210|20238|2774322|20207|20212|20213|20209|20215|20219|1838087|3134547|1277780|104016|1501099|45885|926493|1331335|3894602|3049100|20236|103708|5835954|20220,110|74|84|103|77|98|132|113|59|61|94|45|81|64|92|93|174|93|60|79|71|57|66|104|55,1.0
18888282,7X4FZTVRCDQA,2a181b2125efe,1541684131,3,clickout item,1694719,IT,"Aci Castello, Italy",mobile,,103708|5835954|449296|1223484|3049100|3813262|3492814|20219|1233899|16746|973075|995623|1041214|20239|1346229|6240332|2861186|1152752|693311|1869703|8590236|643926|8185994|3134553|1694719,56|100|59|42|84|69|119|95|69|810|52|51|32|30|60|63|57|39|69|68|64|49|74|62|49,1.0
18888283,7X4FZTVRCDQA,2a181b2125efe,1541592699,4,search for destination,"Giardini-Naxos, Italy",IT,"Giardini-Naxos, Italy",mobile,,,,1.0
18888284,7X4FZTVRCDQA,2a181b2125efe,1541592713,5,interaction item image,101278,IT,"Giardini-Naxos, Italy",mobile,,,,2.0
18888286,7X4FZTVRCDQA,2a181b2125efe,1541592717,7,clickout item,101278,IT,"Giardini-Naxos, Italy",mobile,,101278|94907|101279|897473|965647|1269352|2808272|97214|513601|195131|4279974|4062968|1632921|3520100|7974094|1969143|1842355|5755448|3370994|1033342|2516532|3789928|977641|1714237|3125476,55|999|139|158|800|50|50|1056|62|50|30|50|55|50|48|59|38|50|50|46|39|45|200|43|30,1.0
18888287,7X4FZTVRCDQA,2a181b2125efe,1541592781,8,search for destination,"Falcone, Italy",IT,"Falcone, Italy",mobile,,,,1.0


In [7]:
indices = data.target_indices('full','cluster_recurrent')

In [9]:
print(18888288 in indices)
print(16727760 in indices)

True
True


In [10]:
last_clicks = find_last_clickout_indices(full)

HBox(children=(IntProgress(value=0, max=2115364), HTML(value='')))


