In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw event log and parse the timestamp column into datetimes.
df = pd.read_csv('../data/events.csv', low_memory=False)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# One row per distinct person, indexed by person, with no feature columns yet;
# every feature computed below is joined onto this index.
user_features = df[['person']].drop_duplicates().set_index('person')

In [3]:
# Keep only events stamped on or after 2018-05-13 00:00:00.
ultima_quincena = df.loc[df['timestamp'].ge(pd.Timestamp(year=2018, month=5, day=13))]

In [4]:
def agregar_feature(nombre, evento, eventos=None):
    """Count, per person, how many times `evento` occurs in an event log.

    Parameters
    ----------
    nombre : str
        Suffix of the output column; the column is named
        ``'ult_quin_cant_' + nombre``.
    evento : str
        Value of the ``'event'`` column to count.
    eventos : pandas.DataFrame, optional
        Event log with at least ``'person'`` and ``'event'`` columns.
        Defaults to the notebook-level ``ultima_quincena`` frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``person``, with a single integer column holding the
        number of matching events. Persons with no matching event are
        absent. The column is present even when nothing matches, so a
        later ``join`` + ``fillna(0)`` always produces the feature.
    """
    if eventos is None:
        eventos = ultima_quincena

    columna = 'ult_quin_cant_' + nombre
    ocurrencias = eventos.loc[eventos['event'] == evento]
    # A plain groupby count keeps the named column on empty input, unlike
    # value_counts().unstack(), which yields a frame with no columns at all.
    return ocurrencias.groupby('person')['event'].count().to_frame(columna)

In [5]:
# Per-person count of 'conversion' events in the ultima_quincena window;
# persons without any get 0 after the left join.
conversion = agregar_feature(nombre='conversions', evento='conversion')
user_features = user_features.join(conversion, how='left').fillna(value=0)

In [6]:
# Per-person count of 'checkout' events in the ultima_quincena window;
# persons without any get 0 after the left join.
checkout = agregar_feature(nombre='checkouts', evento='checkout')
user_features = user_features.join(checkout, how='left').fillna(value=0)

In [7]:
# Per-person count of 'viewed product' events in the ultima_quincena window;
# persons without any get 0 after the left join.
viewed_product = agregar_feature(nombre='viewed_product', evento='viewed product')
user_features = user_features.join(viewed_product, how='left').fillna(value=0)

In [8]:
# Per-person count of 'searched products' events in the ultima_quincena window;
# persons without any get 0 after the left join.
searched_product = agregar_feature(nombre='searched_product', evento='searched products')
user_features = user_features.join(searched_product, how='left').fillna(value=0)

In [9]:
# Per-person count of 'visited site' events in the ultima_quincena window;
# persons without any get 0 after the left join.
visited_site = agregar_feature(nombre='visitas', evento='visited site')
user_features = user_features.join(visited_site, how='left').fillna(value=0)

In [10]:
# Per-person count of 'lead' events in the ultima_quincena window;
# persons without any get 0 after the left join.
lead = agregar_feature(nombre='leads', evento='lead')
user_features = user_features.join(lead, how='left').fillna(value=0)

In [11]:
# Per-person count of 'brand listing' events in the ultima_quincena window;
# persons without any get 0 after the left join.
brand_listing = agregar_feature(nombre='brand_listings', evento='brand listing')
user_features = user_features.join(brand_listing, how='left').fillna(value=0)

In [12]:
# Per-person count of 'generic listing' events in the ultima_quincena window;
# persons without any get 0 after the left join.
generic_listing = agregar_feature(nombre='generic_listings', evento='generic listing')
user_features = user_features.join(generic_listing, how='left').fillna(value=0)

In [13]:
# Per-person count of 'ad campaign hit' events in the ultima_quincena window;
# persons without any get 0 after the left join.
ads = agregar_feature(nombre='entradas_ads', evento='ad campaign hit')
user_features = user_features.join(ads, how='left').fillna(value=0)

In [14]:
# Per-person count of 'search engine hit' events in the ultima_quincena window;
# persons without any get 0 after the left join.
search_engine = agregar_feature(nombre='entradas_buscador', evento='search engine hit')
user_features = user_features.join(search_engine, how='left').fillna(value=0)

## Sesiones

In [15]:
# Work on an independent copy so the session columns added below do not
# touch ultima_quincena.
sesiones = ultima_quincena.copy(deep=True)

In [16]:
# Order events chronologically so each per-person difference is taken
# between consecutive events.
sesiones.sort_values(by='timestamp', inplace=True)

# Hours elapsed since the same person's previous event. The first event of
# each person has no predecessor (NaN) and is set to 0.
# The result is assigned back explicitly: calling fillna(inplace=True) on
# sesiones['diff'] is a chained in-place update, which warns on pandas 2.x
# and does not modify `sesiones` under Copy-on-Write.
sesiones['diff'] = (
    sesiones.groupby('person')['timestamp'].diff() / np.timedelta64(1, 'h')
).fillna(0)

In [17]:
# Preview the first ten rows of the columns involved in the gap computation.
sesiones.loc[:, ['timestamp', 'person', 'diff']].head(n=10)

Unnamed: 0,timestamp,person,diff
585820,2018-05-13 00:00:20,f2ddab42,0.0
2244557,2018-05-13 00:00:23,b96ab3e6,0.0
764282,2018-05-13 00:00:23,b96ab3e6,0.0
108640,2018-05-13 00:00:39,a515878a,0.0
1107578,2018-05-13 00:00:49,846ee151,0.0
2016700,2018-05-13 00:00:50,846ee151,0.000278
1105067,2018-05-13 00:01:01,846ee151,0.003056
1106237,2018-05-13 00:01:07,846ee151,0.001667
1105086,2018-05-13 00:01:22,846ee151,0.004167
1105068,2018-05-13 00:01:34,846ee151,0.003333


In [18]:
# A gap of more than one hour since the person's previous event starts a new
# session. The comparison is elementwise, so no groupby is needed; the former
# groupby(...).apply(lambda ...) returns a group-key-prefixed index on
# pandas >= 2.0 and can no longer be assigned back as a column.
sesiones['new_session'] = sesiones['diff'] > 1.0

# Running count of session starts within each person gives a per-person
# session number beginning at 0. The boolean flag is cast to int first so the
# cumulative sum has an integer dtype regardless of pandas version.
sesiones['session_id'] = (
    sesiones['new_session'].astype(int).groupby(sesiones['person']).cumsum()
)

In [19]:
# Preview the first rows with the derived session identifier.
sesiones.loc[:, ['timestamp', 'person', 'event', 'diff', 'session_id']].head(n=5)

Unnamed: 0,timestamp,person,event,diff,session_id
585820,2018-05-13 00:00:20,f2ddab42,searched products,0.0,0.0
2244557,2018-05-13 00:00:23,b96ab3e6,visited site,0.0,0.0
764282,2018-05-13 00:00:23,b96ab3e6,searched products,0.0,0.0
108640,2018-05-13 00:00:39,a515878a,conversion,0.0,0.0
1107578,2018-05-13 00:00:49,846ee151,ad campaign hit,0.0,0.0


In [20]:
#cantidad_eventos_usuario = sesiones.loc[sesiones['event'] == 'checkout']
#sesiones = sesiones.loc[sesiones['event'] == 'checkout']

In [21]:
# Total number of (non-null) events per person, as a float column.
# Counting directly per person gives the same totals as counting per
# (person, session_id, event) and summing, without materialising a dense
# persons x (events * sessions) table through two unstack() calls.
cantidad_eventos_usuario = (
    sesiones.dropna(subset=['event'])
    .groupby('person')['event']
    .count()
    .astype(float)
    .to_frame('ult_quin_total_eventos')
)

In [22]:
# Preview the per-person event totals.
cantidad_eventos_usuario.head(n=5)

Unnamed: 0_level_0,ult_quin_total_eventos
person,Unnamed: 1_level_1
0008ed71,6.0
00091926,367.0
000ba417,206.0
000c79fe,17.0
000e4d9e,411.0


In [23]:
# Number of distinct session ids per person (dropna=False matches the
# behaviour of counting the entries returned by unique()).
sesiones_por_usuario = (
    sesiones.groupby('person')['session_id']
    .nunique(dropna=False)
    .to_frame(name='ult_quin_total_sesiones')
)

# Put event totals and session counts side by side, keyed by person.
promedio_eventos_sesion = cantidad_eventos_usuario.join(sesiones_por_usuario, how='left')

In [24]:
# Average number of events per session for each person.
promedio_eventos_sesion['ult_quin_promedio_eventos_por_sesion'] = (
    promedio_eventos_sesion['ult_quin_total_eventos']
    .div(promedio_eventos_sesion['ult_quin_total_sesiones'])
)

In [25]:
# Attach the session count and the events-per-session average to the
# per-person feature table.
user_features = user_features.join(
    promedio_eventos_sesion.loc[
        :, ['ult_quin_total_sesiones', 'ult_quin_promedio_eventos_por_sesion']
    ],
    how='left',
)

In [26]:
# Persons absent from the joined session table get 0 for the session features.
user_features.fillna(value=0, inplace=True)

In [27]:
# Preview the assembled feature table.
user_features.head(n=5)

Unnamed: 0_level_0,ult_quin_cant_conversions,ult_quin_cant_checkouts,ult_quin_cant_viewed_product,ult_quin_cant_searched_product,ult_quin_cant_visitas,ult_quin_cant_leads,ult_quin_cant_brand_listings,ult_quin_cant_generic_listings,ult_quin_cant_entradas_ads,ult_quin_cant_entradas_buscador,ult_quin_total_sesiones,ult_quin_promedio_eventos_por_sesion
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4886f805,0.0,1.0,4.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,9.0
ad93850f,0.0,1.0,20.0,0.0,5.0,0.0,15.0,7.0,10.0,7.0,5.0,13.0
0297fc1e,0.0,1.0,103.0,0.0,25.0,0.0,0.0,2.0,4.0,0.0,23.0,5.869565
2d681dd8,0.0,1.0,13.0,1.0,2.0,0.0,5.0,1.0,1.0,2.0,2.0,13.0
cccea85e,0.0,0.0,525.0,1.0,17.0,0.0,3.0,15.0,10.0,18.0,13.0,45.307692


In [29]:
# Persist the feature table (person index included) as comma-separated text.
user_features.to_csv(path_or_buf='../data/features_basicas_ult_quin.csv', sep=',', index=True)