In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw event log and parse the 'timestamp' column into datetimes.
df = pd.read_csv('data/events.csv', low_memory=False).assign(
    timestamp=lambda eventos: pd.to_datetime(eventos['timestamp'])
)

In [3]:
# One row per distinct user; this is the base table the features are joined onto.
user_features = df.drop_duplicates(subset='person')[['person']].copy()

In [4]:
# Boolean "user has at least one event of this type" flags. They are left
# commented out because they did not add much in the experiments tried so far,
# although they may still turn out to be useful.
#users_con_vistas = set(df.loc[df['event'] == 'viewed product', 'person'])
#user_features['vio_productos'] = user_features['person'].isin(users_con_vistas)

#users_con_checkouts = set(df.loc[df['event'] == 'checkout', 'person'])
#user_features['tiene_checkouts'] = user_features['person'].isin(users_con_checkouts)

# There are very few leads, so they are not added for now.
#users_con_leads = set(df.loc[df['event'] == 'lead', 'person'])
#user_features['tiene_leads'] = user_features['person'].isin(users_con_leads)

#users_con_conversions = set(df.loc[df['event'] == 'conversion', 'person'])
#user_features['tiene_conversions'] = user_features['person'].isin(users_con_conversions)

# Index the feature table by user id.
user_features = user_features.set_index('person')

In [5]:
# Preview the first five rows of the per-user feature table.
user_features.head(n=5)

4886f805
ad93850f
0297fc1e
2d681dd8
cccea85e


In [6]:
def agregar_feature(nombre, evento, eventos=None):
    """Count how many times each user triggered a given event.

    Parameters
    ----------
    nombre : str
        Suffix of the output column, which is named ``'cant_' + nombre``.
    evento : str
        Value of the ``'event'`` column whose occurrences are counted.
    eventos : pandas.DataFrame, optional
        Event log with ``'person'`` and ``'event'`` columns. When omitted,
        the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'person'`` with a single column ``'cant_<nombre>'``
        holding the number of matching rows. Only users with at least one
        matching event appear in the index.
    """
    if eventos is None:
        eventos = df
    coincidencias = eventos.loc[eventos['event'] == evento]
    # A direct per-user count always yields the named column, even when no row
    # matches `evento`; value_counts().unstack() produced a frame without
    # columns in that case, so the feature silently went missing downstream.
    conteo = coincidencias.groupby('person')['event'].count()
    return conteo.to_frame('cant_' + nombre)

In [7]:
# Conversions per user; users without any conversion get 0.
conversion = agregar_feature(nombre='conversions', evento='conversion')
user_features = user_features.join(conversion, how='left').fillna(value=0)

In [8]:
# Checkouts per user; users without any checkout get 0.
checkout = agregar_feature(nombre='checkouts', evento='checkout')
user_features = user_features.join(checkout, how='left').fillna(value=0)

In [9]:
# Product views per user; users without any view get 0.
viewed_product = agregar_feature(nombre='viewed_product', evento='viewed product')
user_features = user_features.join(viewed_product, how='left').fillna(value=0)

In [10]:
# Product searches per user; users without any search get 0.
searched_product = agregar_feature(nombre='searched_product', evento='searched products')
user_features = user_features.join(searched_product, how='left').fillna(value=0)

In [11]:
# Summary statistics, one row per feature.
user_features.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cant_conversions,38829.0,0.182621,0.987561,0.0,0.0,0.0,0.0,129.0
cant_checkouts,38829.0,1.682119,2.717728,0.0,1.0,1.0,2.0,197.0
cant_viewed_product,38829.0,32.144119,79.762927,0.0,3.0,10.0,29.0,2355.0
cant_searched_product,38829.0,3.363878,12.897345,0.0,0.0,0.0,2.0,739.0


## Sesiones

In [12]:
# Work on an independent copy of the event log for the session analysis.
sesiones = df.copy(deep=True)

In [13]:
# Order the events chronologically so that, within each user, every row
# follows the event that happened right before it.
sesiones = sesiones.sort_values(by='timestamp')
# Hours elapsed since the same user's previous event. The first event of each
# user has no predecessor, so its gap is set to 0.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on sesiones['diff'] acts on a temporary object and
# leaves the frame unchanged when pandas Copy-on-Write is active.
sesiones['diff'] = (
    sesiones.groupby('person')['timestamp'].diff() / np.timedelta64(1, 'h')
).fillna(0)

In [14]:
# Inspect the first ten per-user time gaps (in hours).
sesiones.loc[:, ['timestamp', 'person', 'diff']].head(n=10)

Unnamed: 0,timestamp,person,diff
2307205,2018-01-01 08:09:31,0f4e2a4b,0.0
1753202,2018-01-01 08:09:31,0f4e2a4b,0.0
1753201,2018-01-01 08:09:31,0f4e2a4b,0.0
1753200,2018-01-01 08:09:44,0f4e2a4b,0.003611
1753207,2018-01-01 08:45:29,0f4e2a4b,0.595833
2307206,2018-01-01 08:45:29,0f4e2a4b,0.0
1753210,2018-01-01 08:45:29,0f4e2a4b,0.0
1753203,2018-01-01 08:45:43,0f4e2a4b,0.003889
1753204,2018-01-01 08:48:57,0f4e2a4b,0.053889
1753213,2018-01-01 08:49:05,0f4e2a4b,0.002222


In [15]:
# A gap of more than one hour since the user's previous event starts a new
# session. 'diff' is already computed per user, so a plain element-wise
# comparison gives the same flags as the former groupby(...).apply(...).
# On recent pandas versions that apply returns a result that also carries the
# group key in its index and can no longer be assigned back as a column.
sesiones['new_session'] = sesiones['diff'] > 1.0
# Running count of session starts per user: the user's first session is 0.
sesiones['session_id'] = sesiones.groupby('person')['new_session'].cumsum()

In [16]:
# Check the session ids assigned to the first events.
sesiones.loc[:, ['timestamp', 'person', 'event', 'diff', 'session_id']].head(n=5)

Unnamed: 0,timestamp,person,event,diff,session_id
2307205,2018-01-01 08:09:31,0f4e2a4b,visited site,0.0,0.0
1753202,2018-01-01 08:09:31,0f4e2a4b,search engine hit,0.0,0.0
1753201,2018-01-01 08:09:31,0f4e2a4b,ad campaign hit,0.0,0.0
1753200,2018-01-01 08:09:44,0f4e2a4b,viewed product,0.003611,0.0
1753207,2018-01-01 08:45:29,0f4e2a4b,ad campaign hit,0.595833,0.0


In [17]:
# Disabled experiment: keep only the 'checkout' events.
#cantidad_eventos_usuario = sesiones.loc[sesiones['event'] == 'checkout']
#sesiones = sesiones.loc[sesiones['event'] == 'checkout']

In [18]:
# Total number of events per user, as a one-column frame indexed by 'person'.
# Counting directly per user gives the same totals as counting per
# (person, session_id, event) and summing the double-unstacked matrix, without
# materialising that very wide, mostly-NaN intermediate frame.
# The cast keeps the float dtype the summed matrix used to produce.
# NOTE(review): assumes 'event' is never null for a whole user; such a user
# would now appear with 0 instead of being absent - confirm against the data.
cantidad_eventos_usuario = (
    sesiones.groupby('person')['event']
    .count()
    .astype('float64')
    .to_frame('total_eventos')
)

In [19]:
# Preview the total number of events of the first users.
cantidad_eventos_usuario.head(n=5)

Unnamed: 0_level_0,total_eventos
person,Unnamed: 1_level_1
0008ed71,6.0
00091926,448.0
00091a7a,10.0
000ba417,206.0
000c79fe,17.0


In [20]:
# Number of distinct session ids each user has.
sesiones_por_usuario = (
    sesiones.groupby('person')['session_id']
    .unique()
    .map(len)
    .to_frame()
    .rename(columns={'session_id': 'total_sesiones'})
)
# Total events and total sessions side by side, one row per user.
promedio_eventos_sesion = cantidad_eventos_usuario.join(sesiones_por_usuario)

In [21]:
# Average number of events per session for each user.
promedio_eventos_sesion['promedio_eventos_por_sesion'] = (
    promedio_eventos_sesion['total_eventos'].div(promedio_eventos_sesion['total_sesiones'])
)

In [22]:
# Attach the session-level features to the per-user feature table.
user_features = user_features.join(
    promedio_eventos_sesion.loc[:, ['total_sesiones', 'promedio_eventos_por_sesion']],
    how='left',
)

In [23]:
# Users missing from the session table get 0 for the session features.
user_features = user_features.fillna(value=0)

In [24]:
# Summary statistics of the final feature set, one row per feature.
user_features.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cant_conversions,38829.0,0.182621,0.987561,0.0,0.0,0.0,0.0,129.0
cant_checkouts,38829.0,1.682119,2.717728,0.0,1.0,1.0,2.0,197.0
cant_viewed_product,38829.0,32.144119,79.762927,0.0,3.0,10.0,29.0,2355.0
cant_searched_product,38829.0,3.363878,12.897345,0.0,0.0,0.0,2.0,739.0
total_sesiones,38829.0,4.883257,9.130344,1.0,1.0,2.0,5.0,254.0
promedio_eventos_por_sesion,38829.0,13.440557,12.868624,1.0,6.0,10.0,16.2,251.0


In [25]:
# Persist the per-user features, keeping the 'person' index as the first CSV column.
user_features.to_csv('data/features_basicas.csv', sep=',', index=True)