In [None]:
# install micropip and load required packages
# (micropip is the package installer used inside a Pyodide kernel; the top-level
# `await` below needs a kernel that supports it)
import micropip
# project wheels, served from a local file server on port 8000
await micropip.install('http://localhost:8000/files/obj_pg_wrapper-0.0.1-py3-none-any.whl')
await micropip.install('http://localhost:8000/files/buhtuh-0.0.1-py3-none-any.whl')
await micropip.install('http://localhost:8000/files/objectiv_buhtuh-0.0.1-py3-none-any.whl')
# plotly is requested by package name rather than from the local file server
await micropip.install('plotly')

In [None]:
# import PG wrapper
from obj_pg_wrapper import create_engine
import obj_pg_wrapper

In [None]:
import sys
# presumably makes `import sqlalchemy` resolve to the PG wrapper; keep this before the buhtuh imports
sys.modules['sqlalchemy'] = obj_pg_wrapper
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go

# import Objectiv buh_tuh
from buhtuh.pandasql import BuhTuhDataFrame
from objectiv_buhtuh.util import duplo_basic_features

## Get website production data

In [None]:
## get some data, add database and credentials here
# `create_engine` is the one imported from obj_pg_wrapper; it is pointed at a local HTTP endpoint
engine = create_engine('http://localhost:5000')

In [None]:
## production website data (from sessionized_data + features)
# build the model first, then wrap it in a dataframe that is indexed on event_id
basic_features = duplo_basic_features()
full_df = BuhTuhDataFrame.from_model(
    engine=engine,
    model=basic_features,
    index=['event_id'],
)

## Set the timeframe

In [None]:
# analysis window on `moment`: lower bound inclusive, upper bound exclusive
selector = (
    (full_df['moment'] >= datetime.date(2021, 6, 1))
    & (full_df['moment'] < datetime.date(2021, 10, 11))
)

# dataframe restricted to the analysis window
timeframe_df = full_df[selector]

# preview the latest events in the window
timeframe_df.sort_values(by='moment', ascending=False).head()

## Set the time aggregation 

In [None]:
# choose for which level of time aggregation the rest of the analysis will run
# supports all Postgres datetime template patterns:
# https://www.postgresql.org/docs/9.1/functions-formatting.html#FUNCTIONS-FORMATTING-DATETIME-TABLE
# 'YYYYIW' = 4-digit year followed by the ISO week number

agg_level = 'YYYYIW'

# add the time aggregation as new column to the dataframes, so we can group on this later
# both frames get the column: the timeframe-limited one for the metrics, and the full one
# because "first seen" (new users, retention) is computed on the full dataset
timeframe_df['time_aggregation'] = timeframe_df['moment'].format(agg_level)
full_df['time_aggregation'] = full_df['moment'].format(agg_level)

## Users

In [None]:
# unique users per time_aggregation value
users = (
    timeframe_df
    .groupby('time_aggregation')
    .aggregate({'user_id': 'nunique'})
)

# preview, highest time_aggregation first
users.sort_values(by='time_aggregation', ascending=False).head()

In [None]:
# visualize users: line plot of the first 60 rows in ascending time_aggregation order
(
    users
    .sort_values(by='time_aggregation', ascending=True)
    .head(60)
    .plot(kind='line')
)
plt.show()

## Sessions

In [None]:
# unique sessions per time_aggregation value
sessions = (
    timeframe_df
    .groupby('time_aggregation')
    .aggregate({'session_id': 'nunique'})
)

# preview, highest time_aggregation first
sessions.sort_values(by='time_aggregation', ascending=False).head()

In [None]:
# plot sessions: first 60 rows in ascending time_aggregation order
(
    sessions
    .sort_values('time_aggregation', ascending=True)
    .head(60)
    .plot()
)
plt.show()

## Sessions per user

In [None]:
# line up the session and user counts per time_aggregation value
users_sessions = sessions.merge(
    users,
    how='inner',
    on='time_aggregation',
)

# average number of sessions per user
users_sessions['sessions_per_user_avg'] = (
    users_sessions['session_id_nunique'] / users_sessions['user_id_nunique']
)

# drop the input columns, only the ratio is kept
del users_sessions['session_id_nunique']
del users_sessions['user_id_nunique']

users_sessions.sort_values('time_aggregation', ascending=False).head()

In [None]:
# plot average sessions per user: first 60 rows in ascending time_aggregation order
(
    users_sessions
    .sort_values(by='time_aggregation', ascending=True)
    .head(60)
    .plot()
)
plt.show()

## New users

In [None]:
# lowest time_aggregation value per user, taken from the full dataset (not only the timeframe)
user_first_seen = (
    full_df
    .groupby('user_id')
    .aggregate({'time_aggregation': 'min'})
)

# number of users per "first seen" value, i.e. the new users of each period
new_users = (
    user_first_seen
    .groupby('time_aggregation_min')
    .aggregate({'user_id': 'nunique'})
)

# inner merge with total users: gives both counts side by side and limits to the timerange
new_total_users = users.merge(
    new_users,
    how='inner',
    left_on='time_aggregation',
    right_on='time_aggregation_min',
    suffixes=('_total', '_new'),
)

# NOTE: it would also be good to delete the index column time_aggregation_min, but we have no function for this yet

# share of new users, and share of returning users (total minus new)
new_total_users['new_user_share'] = (
    new_total_users['user_id_nunique_new'] / new_total_users['user_id_nunique_total']
)
new_total_users['returning_user_share'] = (
    (new_total_users['user_id_nunique_total'] - new_total_users['user_id_nunique_new'])
    / new_total_users['user_id_nunique_total']
)

new_total_users.sort_values(by='time_aggregation', ascending=False).head()

In [None]:
# plot new versus total users: first 60 rows in ascending time_aggregation order
(
    new_total_users[['user_id_nunique_new', 'user_id_nunique_total']]
    .sort_values(by='time_aggregation', ascending=True)
    .head(60)
    .plot()
)
plt.show()
# NOTE: it would also be good to delete the index column time_aggregation_min, but we have no function for this yet; now the x-axis is showing twice

In [None]:
# visualize returning users: share of returning users, first 60 rows in ascending time_aggregation order
new_total_users[['returning_user_share']].sort_values(by='time_aggregation', ascending=True).head(60).plot()
# render the figure explicitly, like every other plotting cell in this notebook
plt.show()

## Frequency

In [None]:
# total number of unique users in the timeframe
# NOTE: the empty groupby() has no pandas equivalent and is therefore not very intuitive;
# timeframe_df['user_id'].nunique() should work instead, that is on the list
total_users = timeframe_df.groupby()['user_id'].nunique()

# number of unique sessions for each user
total_sessions_user = (
    timeframe_df
    .groupby(['user_id'])
    .aggregate({'session_id': 'nunique'})
)

# frequency: number of users for each session count
frequency = (
    total_sessions_user
    .groupby(['session_id_nunique'])
    .aggregate({'user_id': 'nunique'})
)

# share of all users per session count
# NOTE(review): [1] selects the single total from total_users - confirm the index label
frequency['share_of_users'] = (
    frequency['user_id_nunique'] / total_users['user_id_nunique'][1]
)

frequency.sort_values(by='session_id_nunique', ascending=True).head()

In [None]:
# bar chart of the user share per session count (first 60 rows, ascending session count)
(
    frequency[['share_of_users']]
    .sort_values(by='session_id_nunique', ascending=True)
    .head(60)
    .plot(kind='bar')
)
plt.show()

## Events

In [None]:
# unique users and number of hits, per time_aggregation value and feature
users_per_event = (
    timeframe_df
    .groupby(['time_aggregation', 'feature'])
    .aggregate({'user_id': 'nunique', 'session_hit_number': 'count'})
)

users_per_event.sort_values(
    by=['time_aggregation', 'user_id_nunique'],
    ascending=False,
).head()

# TODO:
# 1) add feature aggregation magic here, so the features show off much more of what we can do
# 2) add location stack, showing the power of this very soon in the demos

## Conversion

In [None]:
# TODO:
# We can do much better here once we integrate feature selection & aggregation

# NOTE: WE NEED TO UPDATE THIS ONCE THE FIRST NEW EVENT FORMAT DATA COMES IN FOR A CONVERSION
# the goal event that counts as a conversion; here: our subscribe-to-mailing button
conv_selector = (
    timeframe_df['feature']
    == '(WebDocumentContext,#document),(InputContext,keep-me-posted-input),(ButtonContext,subscribe)'
)

# only the conversion events
conversions_df = timeframe_df[conv_selector]

# converting users per time_aggregation value (can easily be aggregated on session_id instead)
conversions = (
    conversions_df
    .groupby('time_aggregation')
    .aggregate({'user_id': 'nunique'})
)

# put converting users next to total users (can easily be done with sessions instead)
conversion_rate = conversions.merge(
    users,
    how='inner',
    on='time_aggregation',
    suffixes=('_converting', '_total'),
)

# conversion rate = converting users / total users
conversion_rate['conversion_rate'] = (
    conversion_rate['user_id_nunique_converting'] / conversion_rate['user_id_nunique_total']
)

conversion_rate.sort_values(by='time_aggregation', ascending=False).head()

In [None]:
# plot the conversion rate: first 60 rows in ascending time_aggregation order
(
    conversion_rate[['conversion_rate']]
    .sort_values(by='time_aggregation', ascending=True)
    .head(60)
    .plot()
)
plt.show()

## Bounce rate

In [None]:
# NOTE: this should be limited to page or screen views instead of all events.
# Do this once we have integrated feature selection.

# the columns needed: period, session and hit number
hits_sessions = timeframe_df[['time_aggregation', 'session_id', 'session_hit_number']]

# number of distinct hits for each session
hits_per_session = (
    hits_sessions
    .groupby(['time_aggregation', 'session_id'])
    .aggregate({'session_hit_number': 'nunique'})
)

# keep the sessions that consist of exactly one hit
hit_selector = hits_per_session['session_hit_number_nunique'] == 1
single_hit_sessions = hits_per_session[hit_selector].to_frame()

# number of single-hit ("bounced") sessions per time_aggregation value
bounced_sessions = (
    single_hit_sessions
    .groupby('time_aggregation')
    .aggregate({'session_id': 'nunique'})
)

# put bounced sessions next to total sessions
bounce_rate = bounced_sessions.merge(
    sessions,
    how='inner',
    on='time_aggregation',
    suffixes=('_bounce', '_total'),
)

# bounce rate = bounced sessions / total sessions
bounce_rate['bounce_rate'] = (
    bounce_rate['session_id_nunique_bounce'] / bounce_rate['session_id_nunique_total']
)

bounce_rate.sort_values(by='time_aggregation', ascending=False).head()

In [None]:
# plot the bounce rate: first 60 rows in ascending time_aggregation order
(
    bounce_rate[['bounce_rate']]
    .sort_values(by='time_aggregation', ascending=True)
    .head(60)
    .plot()
)
plt.show()

## Session duration

In [None]:
# first and last moment of each session, plus the period the session started in
# NOTE: the dict form below is what we want to work, but it hits a bug (on the list):
# session_duration = timeframe_df.groupby(['session_id']).aggregate({'moment':['min','max'],'time_aggregation':'min'})

session_duration = (
    timeframe_df
    .groupby(['session_id'])
    .aggregate(
        ['moment', 'moment', 'time_aggregation'],
        ['min', 'max', 'min'],
    )
)

# session duration = last moment - first moment
session_duration['session_duration'] = (
    session_duration['moment_max'] - session_duration['moment_min']
)

# flag the sessions with a duration of zero
# NOTE: comparing against the string '0' is not very intuitive; on the list to improve
session_duration['session_duration_zero'] = session_duration['session_duration'] == '0'

# average duration and session count, per period and per zero/non-zero flag
avg_session_duration = (
    session_duration
    .groupby(['time_aggregation_min', 'session_duration_zero'])
    .aggregate(
        ['session_duration', 'session_id'],
        ['average', 'count'],
    )
)

# add the total number of sessions of each period
duration_breakdown = avg_session_duration.merge(
    sessions,
    how='inner',
    left_on='time_aggregation_min',
    right_on='time_aggregation',
)

# share of the period's sessions in each group; the total itself is dropped afterwards
duration_breakdown['share_of_sessions'] = (
    duration_breakdown['session_id_count'] / duration_breakdown['session_id_nunique']
)
del duration_breakdown['session_id_nunique']

duration_breakdown.sort_values(by='time_aggregation_min', ascending=False).head(6)

# NOTE: it would also be good to delete the index column time_aggregation, but we have no function for this yet

## Session duration between events

In [None]:
# the two features between which the duration is measured
start_event = '(WebDocumentContext,#document)'
stop_event = '(WebDocumentContext,#document),(SectionContext,footer)'

# keep only the start and stop events
start_stop = timeframe_df[
    (timeframe_df.feature == start_event)
    | (timeframe_df.feature == stop_event)
]

# per session, ordered on moment: attach the previous event and its moment to every row
# (window_lag(n) returns the nth previous value in the partition)
window = start_stop.sort_values('moment').window('session_id')
start_stop['prev_event'] = start_stop.feature.window_lag(window)
start_stop['prev_moment'] = start_stop.moment.window_lag(window)

# create a copy of this df that has the current df's state as base_node
# note: this is a temp fix until we automatically create a new node
start_stop = start_stop.get_df_materialized_model()

# keep the stop events that are directly preceded by a start event
complete = start_stop[
    (start_stop.feature == stop_event)
    & (start_stop.prev_event == start_event)
]

# time between the start event and the stop event
complete['duration'] = complete.moment - complete.prev_moment

# average duration per time_aggregation value
duration_between_events = (
    complete
    .groupby('time_aggregation')
    .aggregate({'duration': 'average'})
)

duration_between_events.sort_values(by='time_aggregation', ascending=False).head()

## Retention

In [None]:
# select all active moments for each user: one row per (user, period) with activity in the timeframe
user_moments = timeframe_df.groupby(['user_id', 'time_aggregation']).aggregate({'moment':'count'})

# merge with first seen df, so every active period of a user carries that user's first period
user_activity = user_moments.merge(user_first_seen, how='inner', on='user_id')

# clean-up and rename columns: the first-seen period becomes the user's cohort
user_activity['new_user_cohort'] = user_activity['time_aggregation_min']
del(user_activity['time_aggregation_min'])
del(user_activity['moment_count'])

# for each new_user_cohort count how many users get back per timeframe
retention_input = user_activity.groupby(['new_user_cohort', 'time_aggregation']).aggregate({'user_id':'nunique'})

# add the size of each new user cohort
cohorts = retention_input.merge(new_users, how='inner', left_on='new_user_cohort', right_on='time_aggregation_min', suffixes=('_active', '_cohort'))

# NOTE: after we can rename/delete an index, remove the time_aggregation_min column here, it's duplicate

# calculate classic retention (so not rolling retention, where users are required to be active each timeframe)
cohorts['retention'] = cohorts['user_id_nunique_active'] / cohorts['user_id_nunique_cohort']

# now switch to Pandas, as the dataset is small enough
cohorts_df = cohorts.to_df().reset_index()

# create typical retention matrix: one row per cohort, one column per number of periods since the cohort period
# NOTE: the integer subtraction below only counts periods when agg_level yields a purely numeric label and
# both labels lie in the same year; with 'YYYYIW', 202201 - 202152 gives 49 rather than 1
cohorts_df = cohorts_df.astype({'new_user_cohort': 'int', 'time_aggregation': 'int'})
cohorts_df['active_in_timeframe'] = cohorts_df.time_aggregation - cohorts_df.new_user_cohort
# keyword arguments: DataFrame.pivot no longer accepts index/columns/values positionally in pandas 2.x
cohorts_df.pivot(index='new_user_cohort', columns='active_in_timeframe', values='retention')

## User timeline

In [None]:
# show the timeline of an individual user's events
# NOTE: we can make this better with feature selection & aggregation

# the specific user we want to replay
# NOTE: the .astype('string') cast is something buhtuh should handle itself; on the list
user_selector = (
    timeframe_df['user_id'].astype('string') == '320db8ee-847c-424b-8291-c65d021575aa'
)

# only this user's events
selected_user_df = timeframe_df[user_selector]

# NOTE: we can apply feature selection and maybe a sankey visual here
# moment and feature of each of this user's events
user_timeline = selected_user_df[['moment', 'feature']]

# the first 30 events in chronological order
user_timeline.sort_values(by='moment', ascending=True).head(30)

# TODO

## WIP Recency

In [None]:
# select all active days for each user: unique sessions per (user_id, day)
# NOTE(review): no 'day' column is added anywhere in this notebook - presumably the
# basic-features model supplies it; confirm before relying on this cell
user_days = timeframe_df.groupby(['user_id', 'day'])['session_id'].nunique()

# copy the 'day' index level into a regular column
user_days['day_copy'] = user_days.index['day']

# WIP, not enabled yet:
# get previous (because of the sorting) day for each user
# window = user_days.sort_values('day').window('user_id')
# user_days['prev_day'] = user_days.day.window_lag(window)

#user_days.head()
# create a copy of this df with as base_node the current df's state
# note: this is a temp fix until we automatically create a new node
#start_stop = start_stop.get_df_materialized_model()

In [None]:
# the parts below first require some next steps in dub_buh_tuh

## WIP Stack

In [None]:
# WIP alternative, selecting a context by name instead of by position:
# timeframe_df.global_contexts.json.get_value('ApplicationContext').head()

# preview of element 0 of the global_contexts JSON column
timeframe_df.global_contexts.json[0].head()

## Conversion funnel

In [None]:
# TODO
# Self-merge is not giving the output we expect.
# Without that, we can not create a sankey that looks like a familiar funnel.
# See example here https://gitlab.com/newrelity/objectiv-taxonomy-prototypes/-/blob/web-analytics/data-science/issue_example_self_merge.ipynb

# showing the sequence of events for converting users

# reuse the df with only conversion events, select the users and their conversion moment
# (several columns are selected with a list, as in the other cells; a bare `a, b` key is a tuple)
converting_users = conversions_df[['user_id', 'moment']]

# for now, we focus on the first conversion event. Later it is nice to also make it possible to see events between first and 2nd conversion, and so on.
converting_users = converting_users.groupby(['user_id'])['moment'].min()
converting_users['first_conversion_moment'] = converting_users['moment_min']
del(converting_users['moment_min'])

# merge with the df that has all user events in the timeframe
# (how=/on= keywords, the same merge form every other cell in this notebook uses)
converting_users_events = timeframe_df.merge(converting_users, how='inner', on='user_id')

# select all events that converting users had up to their first conversion moment
event_selector = (converting_users_events['moment'] <= converting_users_events['first_conversion_moment'])
pre_conversion_events = converting_users_events[event_selector]

# create pairs of from-to events based on session hit number
event_sequence = pre_conversion_events[['session_id', 'session_hit_number', 'feature']]

# self-merge within a session; explicit suffixes tell the "from" and "to" side of each pair apart
# NOTE(review): this pairs every event with every event of the same session; restricting to
# consecutive hit numbers is still open (see the TODO at the top of this cell)
event_pairs = event_sequence.merge(event_sequence, how='inner', on='session_id', suffixes=('_from', '_to'))

event_pairs.head(50)


In [None]:
# load the sankey input from a CSV in the working directory;
# the next cell reads its 'source', 'target' and 'value' columns
df_sank = pd.read_csv('buh.csv')

In [None]:
# node labels: every distinct name that occurs as the source or the target of a link.
# sorted, so the label -> integer code mapping (and with it the node order) is the same on
# every run; iterating a plain set of strings gives no such guarantee
categories = sorted(set(df_sank['source']).union(set(df_sank['target'])))
df_sank['source'] = pd.Categorical(df_sank['source'], categories=categories)
df_sank['target'] = pd.Categorical(df_sank['target'], categories=categories)

# placeholder figure title
text_in_title = 'title'
node = dict(
      pad=15,
      thickness=20,
      line=dict(color="black", width=0.5),
      label=df_sank.source.cat.categories,
      color='blue'
    )
# a sankey link refers to its nodes by integer position, so swap the labels for their category codes
link = pd.concat(
    [df_sank[['source', 'target']].apply(lambda col: col.cat.codes), df_sank['value']],
    axis=1,
).to_dict('list')
fig = go.Figure(go.Sankey(arrangement="fixed", link=link, node=node), {'clickmode': 'event+select'})
# update_layout returns the figure, so as last expression it is also what the cell displays
fig.update_layout(title_text=text_in_title, font_size=10)

## Events flow

In [None]:
# events per session hit number: unique sessions per (session_hit_number, feature)
# timeframe_df is already full_df[selector]; the selector was built on full_df, so it is not applied again
events_per_hit_number = timeframe_df.groupby(['session_hit_number', 'feature'])['session_id'].nunique()

# same sort_values form (by=/ascending=) as the rest of the notebook
events_per_hit_number.sort_values(by='session_hit_number', ascending=True).head()

## Traffic source

In [None]:
# TODO
# For Traffic Source, Geo and Device metrics, we need an easy way to get the source/geo/device data from GlobalContext.
# We can then also blend it into all metrics above as a slicing option.

## Geo 

## Devices