In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import ast

In [2]:
file_path = '2024combined_file.csv'
df = pd.read_csv(file_path)

In [3]:
import pandas as pd
import ast

df = df.sort_values(by=['user_id', 'server_received_time'])
df['server_received_time'] = pd.to_datetime(df['server_received_time'])

df['time_since_last'] = df.groupby('user_id')['server_received_time'].diff()
df['is_new_session'] = df['time_since_last'].isna() | (df['time_since_last'] > pd.Timedelta(minutes=30))
df['session_id'] = df.groupby('user_id')['is_new_session'].cumsum().astype(str) + "_" + df['user_id']

df = df.sort_values(by=['session_id', 'server_received_time'])

df['next_session_time'] = df.groupby('user_id')['server_received_time'].shift(-1)
df['returned'] = (df['next_session_time'] - df['server_received_time']) > pd.Timedelta(days=1)

retention_rates = df.groupby('event_type')['returned'].mean().reset_index()
retention_rates.columns = ['event_type', 'retention_probability']

df = df.merge(retention_rates, on='event_type', how='left')

df['next_action'] = df.groupby('session_id')['event_type'].shift(-1)

df = df.merge(retention_rates, left_on='next_action', right_on='event_type', how='left', suffixes=('', '_next'))

df['y_best'] = df.groupby('event_type')['event_type_next'].transform(lambda x: x.mode()[0] if not x.isna().all() else "session_end")

df = df[['session_id', 'user_id', 'server_received_time', 'event_type', 'event_properties', 'y_best']]

df['y_best'] = df['y_best'].fillna("session_end")

df = df.sort_values(by=['session_id', 'server_received_time'])


In [4]:

def extract_property(event_json, key):
    try:
        parsed = ast.literal_eval(event_json)  
        return parsed.get(key, None)
    except (ValueError, SyntaxError):
        return None

df['event_slug'] = df['event_properties'].apply(lambda x: extract_property(x, 'slug'))
df['event_display_name'] = df['event_properties'].apply(lambda x: extract_property(x, 'displayName'))

df['y_best_detail'] = df.groupby('session_id')['event_slug'].shift(-1)  
df['y_best_display'] = df.groupby('session_id')['event_display_name'].shift(-1) 

df['y_best'] = df['y_best'].astype(str) + "::" + df['y_best_detail'].astype(str)
df['y_best'] = df['y_best'].fillna(df['event_type']) 

generic_actions = ["application-window-opened", "session_start", "session_end", "::None"]
df = df[~df['y_best'].isin(generic_actions)]
df = df[~df['y_best'].str.endswith("::None")]  


In [5]:

df = df[~df['y_best'].str.endswith("widget:render")]
df = df[~df['y_best'].str.endswith("widget:render::None")]  
df.loc[df['event_type'] == df['y_best'], 'y_best'] = "session_end"
df['y_best'] = df['y_best'].str.replace("::nan", "", regex=False)

In [6]:
print("NaN count in y_best:", df['y_best'].isna().sum())
print("Unique values in y_best:\n", df['y_best'].value_counts().head(20))

NaN count in y_best: 0
Unique values in y_best:
 y_best
account-lines::widget:render::documents-and-compliance-table    35558
dashboard:my-book:configurable-table:render::all-policies       10704
account-lines::widget:render::policy-detail-card                 9822
account-lines::widget:render::gl-exposure-history                7924
:all-accounts:layout:render::all-accounts                        7610
account-lines::widget:render::latest-policy-table                6981
account-lines::widget:render::property-locations                 6782
account-lines::widget:render::gl-exposures                       6666
account-lines::widget:render::general-liability                  6506
account-lines::widget:render::general                            5946
dashboard:my-book:configurable-table:render::my-book             5864
account-lines::widget:render::locations                          5807
account-lines::widget:render::one-drive-link                     5780
account-lines::widget:render::loca

In [7]:
df[['event_type', 'y_best']].sample(20)

Unnamed: 0,event_type,y_best
713095,account-lines::configurable-table:render,account-lines::widget:render::gl-exposure-history
544241,account-lines:::view,account:::view::attachments
754733,account-lines::widget:render,account-lines::widget:render::general-liability
610972,account-lines::widget:render,account-lines::widget:render::documents-and-co...
389444,account-lines::widget:render,account-lines::widget:render::documents-and-co...
818495,dashboard:my-book:widget:render,dashboard:my-book:configurable-table:render::n...
228762,submissions:all-account:configurable-table:render,submissions:all-account::view::submission-history
744807,account-lines::widget:render,account-lines::widget:render::gl-exposures
434333,account:::view,account-lines:::view::account-details-header
527665,account-lines::widget:render,account-lines::widget:render::locations-map


In [8]:
random_session_id = df['session_id'].sample(1).values[0]
df[df['session_id'] == random_session_id][['server_received_time', 'event_type', 'y_best']].sort_values(by='server_received_time')



Unnamed: 0,server_received_time,event_type,y_best
629225,2024-08-09 13:04:12.493,action-center:::close-click,::nav-header:action-center-click::all-policies
629226,2024-08-09 13:04:54.348,:all-accounts:configurable-table:render,:all-accounts:widget:render::all-policies-table
629227,2024-08-09 13:04:54.348,:all-accounts:widget:render,:all-accounts:layout:render::all-accounts
629229,2024-08-09 13:04:54.348,:all-accounts::view,account-lines::widget:render::policy-detail-card
629230,2024-08-09 13:04:58.759,account-lines::widget:render,account-lines::widget:render::locations-map
...,...,...,...
629341,2024-08-09 13:54:20.458,dashboard:my-book:configurable-table:render,dashboard:my-book:configurable-table:render::a...
629342,2024-08-09 13:54:20.458,dashboard:my-book:widget:render,dashboard:my-book:configurable-table:render::p...
629343,2024-08-09 13:54:20.458,dashboard:my-book:widget:render,dashboard:my-book:configurable-table:render::m...
629345,2024-08-09 13:54:20.458,dashboard:my-book::view,application-window-opened::recent-actions


In [None]:
import pandas as pd
import numpy as np


df['session_length'] = df.groupby('session_id')['server_received_time'].transform(lambda x: (x.max() - x.min()).total_seconds())
df['time_since_last_session'] = df.groupby('user_id')['server_received_time'].diff().dt.total_seconds()
df['total_past_sessions'] = df.groupby('user_id')['session_id'].transform('nunique')


df['event_category'] = df['event_type'].apply(lambda x: x.split(":")[1] if ":" in x else x)  

def extract_property(event_json, key):
    try:
        parsed = ast.literal_eval(event_json)  
        return parsed.get(key, None)
    except (ValueError, SyntaxError):
        return None

df['event_slug'] = df['event_properties'].apply(lambda x: extract_property(x, 'slug'))
df['event_display_name'] = df['event_properties'].apply(lambda x: extract_property(x, 'displayName'))


df['hour_of_day'] = df['server_received_time'].dt.hour
df['day_of_week'] = df['server_received_time'].dt.dayofweek  
df['is_working_hours'] = df['hour_of_day'].apply(lambda x: 1 if 9 <= x <= 18 else 0)  
features = [
    'session_length', 'time_since_last_session', 'total_past_sessions',
    'event_type', 'event_category', 'event_slug', 'event_display_name',
    'hour_of_day', 'day_of_week', 'is_working_hours'
]

df = df[features + ['y_best']]


In [None]:
df['time_since_last_session'].fillna(0, inplace=True)


df.loc[df['session_length'] == 0, 'session_length'] = 1  
df['total_past_sessions'] = df['total_past_sessions'].fillna(0).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_session'].fillna(0, inplace=True)


In [16]:
df.to_csv("processed_dataset.csv", index=False)


In [None]:
'''
Summary of Feature Engineering and Target Calculation
Target Variable (y_best):

For each event, we identify the next action by shifting the event type within the same session.
We then compute the most frequent next event (mode) for each current event type. If no next event exists, it defaults to "session_end".
Additional event details (like the event slug) are appended to enrich the target label.
Feature Selection:

Session-Level Features:
Session Length: Duration of each session (in seconds).
Time Since Last Session: Time gap between the current event and the previous event for the same user.
Total Past Sessions: Count of unique sessions per user.
Event-Level Features:
Event Type & Category: Raw event type and a simplified category (derived from the event type).
Event Slug & Display Name: Extracted from JSON-like event properties for additional context.
Temporal Features:
Hour of Day & Day of Week: Capturing the time context of each event.
Working Hours Flag: Binary indicator for events occurring during typical work hours (9 AM to 6 PM).
This combination of target creation and comprehensive feature selection helps set the stage for predicting the next best action based on user behavior.
'''