In [2]:

import pandas as pd
import ast
import numpy as np
from datetime import timedelta


file_path = '2024combined_file.csv'
df = pd.read_csv(file_path)

df['server_received_time'] = pd.to_datetime(df['server_received_time'])
df = df.sort_values(by=['user_id', 'server_received_time'])


df['time_since_last'] = df.groupby('user_id')['server_received_time'].diff()
df['is_new_session'] = df['time_since_last'].isna() | (df['time_since_last'] > pd.Timedelta(minutes=30))
df['session_id'] = df.groupby('user_id')['is_new_session'].cumsum().astype(str) + "_" + df['user_id']
df = df.sort_values(by=['session_id', 'server_received_time'])


df['next_session_time'] = df.groupby('user_id')['server_received_time'].shift(-1)
df['returned'] = (df['next_session_time'] - df['server_received_time']) > pd.Timedelta(days=1)
retention_rates = df.groupby('event_type')['returned'].mean().reset_index()
retention_rates.columns = ['event_type', 'retention_probability']
df = df.merge(retention_rates, on='event_type', how='left')


df['next_action'] = df.groupby('session_id')['event_type'].shift(-1)
df = df.merge(retention_rates, left_on='next_action', right_on='event_type', how='left', suffixes=('', '_next'))
df['y_best'] = df.groupby('event_type')['event_type_next'].transform(
    lambda x: x.mode()[0] if not x.isna().all() else "session_end"
)
df = df[['session_id', 'user_id', 'server_received_time', 'event_type', 'event_properties', 'y_best']]
df['y_best'] = df['y_best'].fillna("session_end")
df = df.sort_values(by=['session_id', 'server_received_time'])


def extract_property(event_json, key):
    try:
        parsed = ast.literal_eval(event_json)
        return parsed.get(key, None)
    except (ValueError, SyntaxError):
        return None

df['event_slug'] = df['event_properties'].apply(lambda x: extract_property(x, 'slug'))
df['event_display_name'] = df['event_properties'].apply(lambda x: extract_property(x, 'displayName'))
df['y_best_detail'] = df.groupby('session_id')['event_slug'].shift(-1)
df['y_best_display'] = df.groupby('session_id')['event_display_name'].shift(-1)
df['y_best'] = df['y_best'].astype(str) + "::" + df['y_best_detail'].astype(str)
df['y_best'] = df['y_best'].fillna(df['event_type'])


generic_actions = ["application-window-opened", "session_start", "session_end", "::None"]
df = df[~df['y_best'].isin(generic_actions)]
df = df[~df['y_best'].str.endswith("::None")]
df = df[~df['y_best'].str.endswith("widget:render")]
df = df[~df['y_best'].str.endswith("widget:render::None")]
df.loc[df['event_type'] == df['y_best'], 'y_best'] = "session_end"
df['y_best'] = df['y_best'].str.replace("::nan", "", regex=False)


df['session_length'] = df.groupby('session_id')['server_received_time'].transform(
    lambda x: (x.max() - x.min()).total_seconds()
)
df['time_since_last_session'] = df.groupby('user_id')['server_received_time'].diff().dt.total_seconds()
df['total_past_sessions'] = df.groupby('user_id')['session_id'].transform('nunique')


df['event_category'] = df['event_type'].apply(lambda x: x.split(":")[1] if ":" in x else x)
df['hour_of_day'] = df['server_received_time'].dt.hour
df['day_of_week'] = df['server_received_time'].dt.dayofweek  
df['is_working_hours'] = df['hour_of_day'].apply(lambda x: 1 if 9 <= x <= 18 else 0)


df['server_received_time_numeric'] = df['server_received_time'].astype(np.int64) // 10**9


features = [
    'session_length', 'time_since_last_session', 'total_past_sessions',
    'event_type', 'event_category', 'event_slug', 'event_display_name',
    'hour_of_day', 'day_of_week', 'is_working_hours', 'server_received_time_numeric'
]
target = 'y_best'
df = df[features + [target]]


df['time_since_last_session'].fillna(0, inplace=True)
df.loc[df['session_length'] == 0, 'session_length'] = 1
df['total_past_sessions'] = df['total_past_sessions'].fillna(0).astype(int)


print("Data starts on:", df['server_received_time_numeric'].min())
print("Data ends on:", df['server_received_time_numeric'].max())
print("Total records:", len(df))


X_prepared = df[features]
y_prepared = df[target]


Data starts on: 1711310466
Data ends on: 1735653588
Total records: 330535


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_session'].fillna(0, inplace=True)


In [4]:
df.head(10)

Unnamed: 0,session_length,time_since_last_session,total_past_sessions,event_type,event_category,event_slug,event_display_name,hour_of_day,day_of_week,is_working_hours,server_received_time_numeric,y_best
2,1.0,0.0,1,all-accounts:::view,,,,20,6,0,1711311391,all-accounts::accounts-table:account-click
11,1.0,0.0,4,application-window-opened,application-window-opened,,,19,3,0,1713469110,dashboard:my-book::view
12,1.0,0.0,4,application-window-opened,application-window-opened,,,14,2,1,1715784782,dashboard:my-book::view
15,1.0,0.0,1,agency-dashboard:::view,,,,16,4,1,1729872625,agency-dashboard::widget:render::agency-dashboard
16,1.0,0.0,1,agency-dashboard::layout:render,,agency-dashboard,,16,4,1,1729872625,agency-dashboard:::view::bond-purchase-links
17,1.0,0.0,1,agency-dashboard::widget:render,,bond-purchase-links,Bond Purchase Links,16,4,1,1729872625,agency-dashboard::configurable-table:render::a...
18,1.0,0.0,1,agency-dashboard::widget:render,,agency-bonds-table,Bonds - Agency,16,4,1,1729872625,agency-dashboard::configurable-table:render::a...
19,1.0,0.0,1,agency-dashboard::configurable-table:render,,agency-bonds,,16,4,1,1729872625,agency-dashboard::configurable-table:render::a...
21,1.0,0.0,1,session_start,session_start,,,16,4,1,1729872625,session_end
643,1.0,0.0,24,application-window-opened,application-window-opened,,,20,6,0,1711311511,dashboard:my-book::view


In [6]:
import pandas as pd

# Assuming your new dataframe 'df' already has a column 'server_received_time_numeric'
# which represents the server_received_time in seconds since epoch

# Convert the minimum numeric timestamp back to a datetime
start_date = pd.to_datetime(df['server_received_time_numeric'].min(), unit='s')

# Compute the cutoff date as 28 days from the start date
cutoff_date = start_date + pd.Timedelta(days=28)

# Convert the cutoff datetime back to numeric (seconds since epoch)
cutoff_numeric = int(cutoff_date.timestamp())


df_first_28 = df[df['server_received_time_numeric'] < cutoff_numeric]


print("Data starts on:", pd.to_datetime(df_first_28['server_received_time_numeric'], unit='s').min())
print("Data ends on:", pd.to_datetime(df_first_28['server_received_time_numeric'], unit='s').max())
print("Total days in dataset:", 
      (pd.to_datetime(df_first_28['server_received_time_numeric'], unit='s').max() - 
       pd.to_datetime(df_first_28['server_received_time_numeric'], unit='s').min()).days)
print("Number of records in first 28 days:", len(df_first_28))


Data starts on: 2024-03-24 20:01:06
Data ends on: 2024-04-20 00:37:39
Total days in dataset: 26
Number of records in first 28 days: 450


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

features = [
    'session_length', 'time_since_last_session', 'total_past_sessions',
    'event_type', 'event_category', 'event_slug', 'event_display_name',
    'hour_of_day', 'day_of_week', 'is_working_hours', 'server_received_time_numeric'
]
target = 'y_best'

X = df_first_28[features]
y = df_first_28[target]


numeric_features = [
    'session_length', 'time_since_last_session', 'total_past_sessions',
    'hour_of_day', 'day_of_week', 'is_working_hours', 'server_received_time_numeric'
]
categorical_features = ['event_type', 'event_category', 'event_slug', 'event_display_name']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

clf_pipeline.fit(X_train, y_train)

print("Training accuracy:", clf_pipeline.score(X_train, y_train))
print("Validation accuracy:", clf_pipeline.score(X_val, y_val))


Training accuracy: 1.0
Validation accuracy: 0.9111111111111111


In [8]:
# Example input features (values are illustrative)
sample_input = {
    'session_length': 120,                # e.g., session lasted 120 seconds
    'time_since_last_session': 300,       # e.g., 300 seconds gap since last session
    'total_past_sessions': 5,             # e.g., user has had 5 sessions so far
    'event_type': 'dashboard:my-book::view',  # sample event type
    'event_category': 'dashboard',        # derived from event_type
    'event_slug': 'my-book',              # sample slug extracted from properties
    'event_display_name': 'view',         # sample display name extracted from properties
    'hour_of_day': 10,                    # event occurred at 10 AM
    'day_of_week': 1,                     # e.g., Tuesday (if Monday is 0)
    'is_working_hours': 1,                # within working hours (9 AM-6 PM)
    'server_received_time_numeric': 1711311391  # numeric timestamp (seconds since epoch)
}

# Convert the dictionary to a DataFrame
import pandas as pd
sample_df = pd.DataFrame([sample_input])

# Use the trained pipeline (clf_pipeline) to predict the target for the sample input
predicted_y = clf_pipeline.predict(sample_df)
print("Predicted y_best:", predicted_y)


Predicted y_best: ['application-window-opened']


In [9]:
# Example input features (values are illustrative for a UI-related event)
sample_input_ui = {
    'session_length': 180,                # session lasted 3 minutes
    'time_since_last_session': 45,         # 45 seconds gap since the previous event
    'total_past_sessions': 3,              # user has had 3 sessions so far
    'event_type': 'dashboard:widget:render',   # current event relates to widget rendering on a dashboard
    'event_category': 'dashboard',         # derived from event_type
    'event_slug': 'sidebar',               # the widget identifier, e.g., "sidebar"
    'event_display_name': 'render',        # the action being performed, "render"
    'hour_of_day': 11,                     # event occurred at 11 AM
    'day_of_week': 2,                      # e.g., Wednesday (if Monday is 0)
    'is_working_hours': 1,                 # within working hours
    'server_received_time_numeric': 1711311391  # numeric timestamp (seconds since epoch)
}

# Convert the dictionary to a DataFrame
import pandas as pd
sample_ui_df = pd.DataFrame([sample_input_ui])

# Use the trained pipeline (clf_pipeline) to predict the target for this UI-specific sample input
predicted_y_ui = clf_pipeline.predict(sample_ui_df)
print("Predicted y_best:", predicted_y_ui)


Predicted y_best: ['application-window-opened']


In [10]:
# Example input features for a UI widget render scenario
sample_input_ui2 = {
    'session_length': 300,                # Session lasts 300 seconds (5 minutes)
    'time_since_last_session': 20,         # 20 seconds gap since the last event
    'total_past_sessions': 8,              # User has had 8 sessions so far
    'event_type': 'dashboard:widget:render',  # Current event is a widget render event
    'event_category': 'dashboard',         # Derived from event_type
    'event_slug': 'chart',                 # Indicates that the widget is a chart
    'event_display_name': 'render',        # Display name for the event
    'hour_of_day': 14,                     # Event occurred at 2 PM
    'day_of_week': 4,                      # E.g., Friday (if Monday is 0)
    'is_working_hours': 1,                 # Within working hours
    'server_received_time_numeric': 1713469110  # A sample numeric timestamp (seconds since epoch)
}

# Convert the dictionary to a DataFrame
import pandas as pd
sample_ui_df2 = pd.DataFrame([sample_input_ui2])

# Use the trained pipeline (clf_pipeline) to predict the target for this UI-specific sample input
predicted_y_ui2 = clf_pipeline.predict(sample_ui_df2)
print("Predicted y_best:", predicted_y_ui2)


Predicted y_best: ['account-lines::widget:render']


In [11]:
# Example input features for a profile edit scenario
sample_input_3 = {
    'session_length': 240,               # Session duration of 240 seconds
    'time_since_last_session': 60,       # 60 seconds gap since the previous event
    'total_past_sessions': 10,           # User has had 10 sessions so far
    'event_type': 'profile:edit',        # Current event type indicating a profile edit
    'event_category': 'profile',         # Derived from event_type ("profile")
    'event_slug': 'form',                # Indicates the type of UI element (a form)
    'event_display_name': 'update',      # The display name for the event (updating profile)
    'hour_of_day': 16,                   # Event occurred at 4 PM
    'day_of_week': 3,                    # For example, if Monday is 0, then Thursday is 3
    'is_working_hours': 1,               # Within typical working hours
    'server_received_time_numeric': 1713500000  # Sample numeric timestamp (seconds since epoch)
}

# Convert the dictionary to a DataFrame
import pandas as pd
sample_df_3 = pd.DataFrame([sample_input_3])

# Use the trained pipeline (clf_pipeline) to predict the target for this sample input
predicted_y_best_3 = clf_pipeline.predict(sample_df_3)
print("Predicted y_best for profile edit scenario:", predicted_y_best_3)


Predicted y_best for profile edit scenario: ['application-window-opened']
