In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [57]:
import polars as pl
import polars.selectors as cs
import numpy as np
import yaml

from utils.preprocessing import *
from utils.modelling import TrainValidationSplit
from config.constants import BATCH_SIZE

## Data Preprocessing

We will proceed to process the concatenated dataset using the `preprocessing` module which we wrote.

This module contains some of the useful functions and configs we could use to process the data.

### Read YAML file

We will also read our preprocessing instructions from our config file, which is saved in `preprocessing.yaml`

In [46]:
# Read YAML file
config_file_path = '../config/preprocessing.yaml'
with open(config_file_path) as fstream:
    config = yaml.safe_load(fstream)

### Import Dataset

We will first import the dataset into our notebook

In [4]:
# Process data
file_path = os.path.expanduser(config['pipeline']['input_data'])
df = pl.read_parquet(file_path)
df

$insert_id,$insert_key,$schema,adid,amplitude_attribution_ids,amplitude_event_type,amplitude_id,app,city,client_event_time,client_upload_time,country,data,data_type,device_brand,device_carrier,device_family,device_id,device_manufacturer,device_model,device_type,dma,event_id,event_properties,event_time,event_type,global_user_properties,group_properties,groups,idfa,is_attribution_event,language,library,location_lat,location_lng,os_name,os_version,partner_id,paying,plan,platform,processed_time,region,sample_rate,server_received_time,server_upload_time,session_id,source_id,start_version,user_creation_time,user_id,user_properties,uuid,version_name,__index_level_0__
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64
"""251db963-6623-448e-8665-f542b8…","""None""","""None""","""None""","""None""","""None""","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.385000""","""2024-10-08 11:41:04.857000""","""India""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""None""","""Windows""","""None""","""22216""","""{}""","""2024-10-08 11:41:02.385000""","""session_start""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/2.7.2""","""None""","""None""","""Chrome""","""129""","""None""","""None""","""{}""","""Web""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""None""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662385""","""None""","""None""","""None""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""fc2d4a2a-b05e-4620-b7e9-4e8de5…","""None""",0
"""3a95cdac-174c-4002-8e84-8aeba8…","""None""","""None""","""None""","""None""","""None""","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.386000""","""2024-10-08 11:41:04.857000""","""India""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""None""","""Windows""","""None""","""22215""","""{}""","""2024-10-08 11:41:02.386000""","""session_end""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/2.7.2""","""None""","""None""","""Chrome""","""129""","""None""","""None""","""{}""","""Web""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""None""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662385""","""None""","""None""","""None""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""fa78037e-62ac-4518-b095-582feb…","""None""",1
"""50dbfad3-8e24-456a-ae68-21b09b…","""None""","""None""","""None""","""None""","""None""","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.386000""","""2024-10-08 11:41:04.857000""","""India""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""None""","""Windows""","""None""","""22216""","""{'[Amplitude] Session Replay I…","""2024-10-08 11:41:02.386000""","""session_start""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/2.7.2""","""None""","""None""","""Chrome""","""129""","""None""","""None""","""{}""","""Web""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""None""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""None""","""None""","""None""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""9b8c3e3f-06e8-4e94-9610-9c7429…","""None""",2
"""ea0a4143-2380-48e7-a83e-8ba854…","""None""","""None""","""None""","""None""","""None""","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:03.500000""","""2024-10-08 11:41:04.857000""","""India""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""None""","""Windows""","""None""","""22217""","""{'rowModel': 'server', '[Ampli…","""2024-10-08 11:41:03.500000""",""":all-accounts:configurable-tab…","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/2.7.2""","""None""","""None""","""Chrome""","""129""","""None""","""None""","""{}""","""Web""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""None""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""None""","""None""","""None""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""61e42442-4205-4ec5-894e-b7a4f2…","""None""",3
"""4d89977d-4734-450c-afa1-07e326…","""None""","""None""","""None""","""None""","""None""","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:03.527000""","""2024-10-08 11:41:04.857000""","""India""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""None""","""Windows""","""None""","""22217""","""{'displayName': 'All Policies …","""2024-10-08 11:41:03.527000""",""":all-accounts:widget:render""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/2.7.2""","""None""","""None""","""Chrome""","""129""","""None""","""None""","""{}""","""Web""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""None""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""None""","""None""","""None""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""3bc49813-f18a-4058-9b6d-f09598…","""None""",4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…","""None""","""None""","""None""","""None""","""None""","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:09.888000""","""2024-05-22 09:36:10.832000""","""United States""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""None""","""None""","""Windows""","""Nashville, TN""","""26""","""{'displayName': 'One Drive Lin…","""2024-05-22 09:36:09.888000""","""account-lines::widget:render""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/1.8.0""","""None""","""None""","""Edge""","""125.0.0.0""","""None""","""None""","""{}""","""Web""","""2024-05-22 09:36:11.662000""","""Tennessee""","""None""","""2024-05-22 09:36:10.832000""","""2024-05-22 09:36:10.833000""","""1716368557821""","""None""","""None""","""None""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""c97dfcc3-2359-4c85-8c55-8a324f…","""None""",99995
"""55aec1ce-b876-4f66-b786-fd5706…","""None""","""None""","""None""","""None""","""None""","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:09.917000""","""2024-05-22 09:36:10.832000""","""United States""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""None""","""None""","""Windows""","""Nashville, TN""","""27""","""{'rowModel': 'legacyServer', '…","""2024-05-22 09:36:09.917000""","""account-lines::configurable-ta…","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/1.8.0""","""None""","""None""","""Edge""","""125.0.0.0""","""None""","""None""","""{}""","""Web""","""2024-05-22 09:36:11.662000""","""Tennessee""","""None""","""2024-05-22 09:36:10.832000""","""2024-05-22 09:36:10.833000""","""1716368557821""","""None""","""None""","""None""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""7a24386f-31b8-4a91-b01c-802a4e…","""None""",99996
"""8cb28e22-0dc3-456c-9956-41d52b…","""None""","""None""","""None""","""None""","""None""","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:15.727000""","""2024-05-22 09:36:16.789000""","""United States""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""None""","""None""","""Windows""","""Nashville, TN""","""28""","""{'displayName': 'Attachments',…","""2024-05-22 09:36:15.727000""","""account-lines::widget:render""","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/1.8.0""","""None""","""None""","""Edge""","""125.0.0.0""","""None""","""None""","""{}""","""Web""","""2024-05-22 09:36:17.297000""","""Tennessee""","""None""","""2024-05-22 09:36:16.789000""","""2024-05-22 09:36:16.791000""","""1716368557821""","""None""","""None""","""None""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""58501f05-388a-43aa-adeb-74c9a0…","""None""",99997
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…","""None""","""None""","""None""","""None""","""None""","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:33.812000""","""2024-05-22 09:36:34.881000""","""United States""","""{'path': '/2/httpapi', 'group_…","""event""","""None""","""None""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""None""","""None""","""Windows""","""Nashville, TN""","""29""","""{}""","""2024-05-22 09:36:33.812000""","""::nav-header:action-center-cli…","""None""","""{}""","""{}""","""None""","""None""","""English""","""amplitude-ts/1.8.0""","""None""","""None""","""Edge""","""125.0.0.0""","""None""","""None""","""{}""","""Web""","""2024-05-22 09:36:36.435000""","""Tennessee""","""None""","""2024-05-22 09:36:34.881000""","""2024-05-22 09:36:34.883000""","""1716368557821""","""None""","""None""","""None""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""d227faec-d7e7-4a24-b10b-cbb064…","""None""",99998


### Drop Columns

First, we will drop some of the columns that we have decided to not use for modelling during the EDA phase.

In [None]:
# def _file_sort(dir : list[str]):
#     files = [f for f in dir if 'df' in f]
#     sorted_files = sorted(files, key = lambda x : int(re.findall(r'\d+', x)[0]))
#     return sorted_files


# output_path = os.path.expanduser(config['pipeline']['output_path'])
# df = None

# for file in _file_sort(os.listdir(output_path)):
#     if 'df' not in file:
#         continue
#     df_chunk = pl.read_parquet(output_path + file)
    
#     if df is None:
#         df = df_chunk
#         expected_cols = df_chunk.columns
#     else: 

#         # There might be columns missing from chunk
#         # If so, add columns in and fill all of the values with null
#         missing_cols = set(expected_cols) - set(df_chunk.columns)
#         extra_cols = set(df_chunk.columns) - set(expected_cols)
#         df_chunk = df_chunk.with_columns(pl.lit(None).alias(c) for c in missing_cols)

#         df_chunk = df_chunk.select(expected_cols)
#         df = pl.concat([df, df_chunk], how = 'vertical_relaxed')

# del df_chunk

In [5]:
df = drop_columns(df, config, 0)
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.385000""","""2024-10-08 11:41:04.857000""","""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""22216""","""{}""","""2024-10-08 11:41:02.385000""","""session_start""","""English""","""amplitude-ts/2.7.2""","""Chrome""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662385""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…"
"""3a95cdac-174c-4002-8e84-8aeba8…","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.386000""","""2024-10-08 11:41:04.857000""","""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""22215""","""{}""","""2024-10-08 11:41:02.386000""","""session_end""","""English""","""amplitude-ts/2.7.2""","""Chrome""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662385""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…"
"""50dbfad3-8e24-456a-ae68-21b09b…","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:02.386000""","""2024-10-08 11:41:04.857000""","""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""22216""","""{'[Amplitude] Session Replay I…","""2024-10-08 11:41:02.386000""","""session_start""","""English""","""amplitude-ts/2.7.2""","""Chrome""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…"
"""ea0a4143-2380-48e7-a83e-8ba854…","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:03.500000""","""2024-10-08 11:41:04.857000""","""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""22217""","""{'rowModel': 'server', '[Ampli…","""2024-10-08 11:41:03.500000""",""":all-accounts:configurable-tab…","""English""","""amplitude-ts/2.7.2""","""Chrome""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…"
"""4d89977d-4734-450c-afa1-07e326…","""935023330069""","""591532""","""Mumbai""","""2024-10-08 11:41:03.527000""","""2024-10-08 11:41:04.857000""","""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""","""22217""","""{'displayName': 'All Policies …","""2024-10-08 11:41:03.527000""",""":all-accounts:widget:render""","""English""","""amplitude-ts/2.7.2""","""Chrome""","""2024-10-08 11:41:05.371000""","""Maharashtra""","""2024-10-08 11:41:04.857000""","""2024-10-08 11:41:04.859000""","""1728387662386""","""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:09.888000""","""2024-05-22 09:36:10.832000""","""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""","""26""","""{'displayName': 'One Drive Lin…","""2024-05-22 09:36:09.888000""","""account-lines::widget:render""","""English""","""amplitude-ts/1.8.0""","""Edge""","""2024-05-22 09:36:11.662000""","""Tennessee""","""2024-05-22 09:36:10.832000""","""2024-05-22 09:36:10.833000""","""1716368557821""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…"
"""55aec1ce-b876-4f66-b786-fd5706…","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:09.917000""","""2024-05-22 09:36:10.832000""","""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""","""27""","""{'rowModel': 'legacyServer', '…","""2024-05-22 09:36:09.917000""","""account-lines::configurable-ta…","""English""","""amplitude-ts/1.8.0""","""Edge""","""2024-05-22 09:36:11.662000""","""Tennessee""","""2024-05-22 09:36:10.832000""","""2024-05-22 09:36:10.833000""","""1716368557821""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…"
"""8cb28e22-0dc3-456c-9956-41d52b…","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:15.727000""","""2024-05-22 09:36:16.789000""","""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""","""28""","""{'displayName': 'Attachments',…","""2024-05-22 09:36:15.727000""","""account-lines::widget:render""","""English""","""amplitude-ts/1.8.0""","""Edge""","""2024-05-22 09:36:17.297000""","""Tennessee""","""2024-05-22 09:36:16.789000""","""2024-05-22 09:36:16.791000""","""1716368557821""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…"
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…","""857540480084""","""591532""","""Nashville""","""2024-05-22 09:36:33.812000""","""2024-05-22 09:36:34.881000""","""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""","""29""","""{}""","""2024-05-22 09:36:33.812000""","""::nav-header:action-center-cli…","""English""","""amplitude-ts/1.8.0""","""Edge""","""2024-05-22 09:36:36.435000""","""Tennessee""","""2024-05-22 09:36:34.881000""","""2024-05-22 09:36:34.883000""","""1716368557821""","""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…"


In [None]:
# Expand dict columns
df = batch_preprocess_data(df, expand_dict_columns, config, BATCH_SIZE)
df

### Enforce Types

Next, we will convert the types of each column to our desired type

In [5]:
# Enforce Types
df = custom_enforce_types(df, config)
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""Mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""",22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""English""","""amplitude-ts/2.7.2""","""Chrome""",2024-10-08 11:41:05.371,"""Maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""False""","""https://accounts.google.com/""",
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""Mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""",22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""English""","""amplitude-ts/2.7.2""","""Chrome""",2024-10-08 11:41:05.371,"""Maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""False""","""https://accounts.google.com/""",
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""Mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""",22216,"""{'[Amplitude] Session Replay I…",2024-10-08 11:41:02.386,"""session_start""","""English""","""amplitude-ts/2.7.2""","""Chrome""",2024-10-08 11:41:05.371,"""Maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""False""","""https://accounts.google.com/""",
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""Mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""",22217,"""{'rowModel': 'server', '[Ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""English""","""amplitude-ts/2.7.2""","""Chrome""",2024-10-08 11:41:05.371,"""Maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""False""","""https://accounts.google.com/""",
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""Mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""India""","""Windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""None""",22217,"""{'displayName': 'All Policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""English""","""amplitude-ts/2.7.2""","""Chrome""",2024-10-08 11:41:05.371,"""Maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""False""","""https://accounts.google.com/""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""Nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""",26,"""{'displayName': 'One Drive Lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""English""","""amplitude-ts/1.8.0""","""Edge""",2024-05-22 09:36:11.662,"""Tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""underwriter""","""False""",,"""one-drive-link"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""Nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""",27,"""{'rowModel': 'legacyServer', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""English""","""amplitude-ts/1.8.0""","""Edge""",2024-05-22 09:36:11.662,"""Tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""underwriter""","""False""",,"""property-locations"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""Nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""",28,"""{'displayName': 'Attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""English""","""amplitude-ts/1.8.0""","""Edge""",2024-05-22 09:36:17.297,"""Tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""underwriter""","""False""",,"""attachments"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""Nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""United States""","""Windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""Nashville, TN""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""English""","""amplitude-ts/1.8.0""","""Edge""",2024-05-22 09:36:36.435,"""Tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'EMPTY'…","""underwriter""","""False""",,


### Lowercase All Values

In [6]:
# Lowercase all values
df = lowercase_all_values(df, config)
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""none""",22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""none""",22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""none""",22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""none""",22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…","""none""",22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,


### Replace Null Representations

We will also replace some commmon null representations like `empty` and `none` to null values

In [7]:
# Replace with null
df = replace_with_null(df, config)
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,


In [8]:
# Filter out empty user ids
df = df.filter(pl.col('user_id').is_not_null())
df


$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,


### Create Session Duration Target Column

In [9]:
# Create session duration table
user_session = (df.group_by(['user_id', 'session_id'])
                  .agg([pl.max('client_event_time').name.prefix('max_'),
                        pl.min('client_event_time').name.prefix('min_')]))

# Session time calculation 
user_session = user_session.with_columns((pl.col('max_client_event_time') 
                                          - pl.col('min_client_event_time')).alias('session_duration'))
user_session = user_session.with_columns(pl.col('session_duration').dt.total_seconds().alias('session_seconds'))

# Aggregate data to client_event_datetime level
user_session = user_session.with_columns(pl.col('min_client_event_time').cast(pl.Date()).alias('client_event_datetime'))
user_session_datetime = user_session.group_by('user_id', 'client_event_datetime').agg(pl.sum('session_seconds'))
user_session_datetime

user_id,client_event_datetime,session_seconds
str,date,i64
"""7f90df1c-e78f-487a-858e-cfa2ad…",2024-12-05,2225
"""1f8b074f-2d01-46fc-a7a6-aa3a39…",2024-02-22,27713
"""97ef6826-391f-4bdb-9ec0-cedd69…",2023-09-18,11997
"""a86a6a48-eb31-4691-b9c4-4baa6d…",2024-11-21,245
"""acd72a1c-b1c0-4fa2-b12b-7ea3e7…",2023-12-27,248
…,…,…
"""f6981214-6a85-46d7-a361-6976fd…",2023-09-01,62
"""61a2808e-1f12-4e32-b1be-7a6222…",2024-02-14,7173
"""b6a21727-a12e-469f-afd5-77934d…",2023-12-20,1228
"""034ec23f-b10f-44a0-98cc-6f67d8…",2024-02-07,0


## Feature Engineering

Next, we will add some features which will serve to be useful in the modelling phase

### Group Regions 

We notice in EDA that most regions come from the United States, with a smaller subset in other countries.

For regions in the US, we will group them to four main regions: `northeast`, `midwest`, `south`, `west`

For any regions outside the US, we will classify them as `international`

In [10]:
def group_region_categories(df, input_col, output_col, condense_map_dict):

     # Get dictionary mapping from condense map
    map_dict = {}
    else_val = condense_map_dict.get('else', None)

    for key, val in condense_map_dict.items():
        if key == 'else':
            continue
        map_dict.update({itm : key for itm in val})
    
    print(map_dict)
    # Update values in the column with respective mappings
    df = df.with_columns(pl.col(input_col).replace(map_dict).alias(output_col))

    if else_val is not None:
        df = df.with_columns(pl.when(pl.col(output_col).is_in(condense_map_dict.keys()))
                                .then(pl.col(output_col))
                                .otherwise(pl.lit(else_val))
                                .alias(output_col))
    return df

In [11]:
# Specify region mappings
us_region_mapping = {
    'northeast': ['connecticut', 'maine', 'massachusetts', 'new hampshire', 'new jersey', 'new york', 'pennsylvania', 'rhode island', 'vermont'],
    'midwest': ['illinois', 'indiana', 'iowa', 'kansas', 'michigan', 'minnesota', 'missouri', 'nebraska', 'north dakota', 'ohio', 'south dakota', 'wisconsin'],
    'south': ['alabama', 'arkansas', 'delaware', 'florida', 'georgia', 'kentucky', 'louisiana', 'maryland', 'mississippi', 'north carolina', 'oklahoma', 
              'south carolina', 'tennessee', 'texas', 'virginia', 'west virginia'],
    'west': ['alaska', 'arizona', 'california', 'colorado', 'hawaii', 'idaho', 'montana', 'nevada', 'new mexico', 'oregon', 'utah', 'washington', 'wyoming'],
    'else': 'international'
}


In [12]:
# Encode regions
df = group_region_categories(df, 'region', 'region_grouped', us_region_mapping)
df

{'connecticut': 'northeast', 'maine': 'northeast', 'massachusetts': 'northeast', 'new hampshire': 'northeast', 'new jersey': 'northeast', 'new york': 'northeast', 'pennsylvania': 'northeast', 'rhode island': 'northeast', 'vermont': 'northeast', 'illinois': 'midwest', 'indiana': 'midwest', 'iowa': 'midwest', 'kansas': 'midwest', 'michigan': 'midwest', 'minnesota': 'midwest', 'missouri': 'midwest', 'nebraska': 'midwest', 'north dakota': 'midwest', 'ohio': 'midwest', 'south dakota': 'midwest', 'wisconsin': 'midwest', 'alabama': 'south', 'arkansas': 'south', 'delaware': 'south', 'florida': 'south', 'georgia': 'south', 'kentucky': 'south', 'louisiana': 'south', 'maryland': 'south', 'mississippi': 'south', 'north carolina': 'south', 'oklahoma': 'south', 'south carolina': 'south', 'tennessee': 'south', 'texas': 'south', 'virginia': 'south', 'west virginia': 'south', 'alaska': 'west', 'arizona': 'west', 'california': 'west', 'colorado': 'west', 'hawaii': 'west', 'idaho': 'west', 'montana': 'we

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international"""
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international"""
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international"""
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international"""
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south"""


### Encode Events

Moreover, we will also encode the events into 7 main categories:

- Session & Navigation
- Account & Policy Management
- Dashboard & UI Interactions
- Action Center & Workflow
- Submission & Forms
- Filtering & Searching
- Document & Report Interactions
- Other/System Events

All of the contents of this encoding has been written in the config, and can be applied by using `map_values` 

In [13]:
# Function for grouping the events
def categorize_event(event_type):
    event_type = event_type.lower()  # Ensure input is lowercase
    
    if any(keyword in event_type for keyword in ["session_start", "session_end", "application-window", "nav-header", "dashboard"]):
        return "session & navigation"
    elif any(keyword in event_type for keyword in ["account", "policy", "rating"]):
        return "account & policy management"
    elif any(keyword in event_type for keyword in ["dashboard", "widget", "layout", "insights", "table"]):
        return "dashboard & ui interactions"
    elif any(keyword in event_type for keyword in ["action-center", "task", "workflow", "take-action"]):
        return "action center & workflow"
    elif any(keyword in event_type for keyword in ["submit-click", "form", "create", "definition", "save-click", "submissions"]):
        return "submission & forms"
    elif any(keyword in event_type for keyword in ["filter", "sort", "search", "advanced-filters"]):
        return "filtering & searching"
    elif any(keyword in event_type for keyword in ["document", "report", "download", "csv"]):
        return "document & report interactions"
    else:
        return "other/system events"


In [14]:
# Encode events
df = df.with_columns(pl.col('event_type')
                     .map_elements(categorize_event, return_dtype = pl.String)
                     .alias('event_category'))
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped,event_category
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,str
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation"""
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation"""
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation"""
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management"""
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management"""
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management"""
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management"""
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south""","""session & navigation"""


### Datetime Feature Engineering

There is a lot of potential for feature engineering for the datetime columns.

We will split these datetime columns into their own components, while also calculating the relative time to indicate a notion of distance 

In [15]:
# Extract hours from datetime columns
datetime_cols = ["client_event_time", "client_upload_time", "event_time", 
                "processed_time", "server_received_time", "server_upload_time"]

time_to_hour_map = {
    'client_event_time': 'client_event_hour',
    'client_upload_time': 'client_upload_hour',
    'event_time': 'event_hour',
    'processed_time': 'processed_hour',
    'server_received_time': 'server_received_hour',
    'server_upload_time': 'server_upload_hour'
}

df = df.with_columns(pl.col(c).dt.hour().alias(time_to_hour_map[c]) for c in datetime_cols)
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped,event_category,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,str,i8,i8,i8,i8,i8,i8
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management""",9,9,9,9,9,9
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management""",9,9,9,9,9,9
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management""",9,9,9,9,9,9
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south""","""session & navigation""",9,9,9,9,9,9


In [16]:
# Extract relative time between start and end of the same evetns
df = df.with_columns([
    (pl.col("client_upload_time") - pl.col("client_event_time")).dt.total_seconds().alias("time_to_server"),
    (pl.col("server_upload_time") - pl.col("server_received_time")).dt.total_seconds().alias("server_to_process"),
    (pl.col("processed_time") - pl.col("server_upload_time")).dt.total_seconds().alias("processing_time")
])
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped,event_category,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour,time_to_server,server_to_process,processing_time
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,str,i8,i8,i8,i8,i8,i8,i64,i64,i64
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management""",9,9,9,9,9,9,1,0,0
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south""","""session & navigation""",9,9,9,9,9,9,1,0,1


### Retention

There is a lot of information that we can extract from the first user visit. In particular, we can use the time between the first visits and subsequent later visits in order to infer whether the customer has an interest in the product.

We will choose to measure retention as whether the user has returned within 7 days of the last session. This indicator will be marked on a session level.

In [17]:
# Get dense rank of datetime
df = df.with_columns(pl.col('client_event_time').cast(pl.Date).alias('client_event_datetime'))

session_visit_checkpoints = (df.with_columns(pl.col('client_event_datetime')
                                                                   .rank(method = 'dense')
                                                                   .over('user_id')
                                                                   .alias('rank')))


# Self-join with previous rank to get next datetime
session_visit_checkpoints = (session_visit_checkpoints.with_columns(
                                (pl.col('rank') - 1).alias('prev_rank')))

session_visit_checkpoints = (session_visit_checkpoints.group_by('user_id', 'client_event_datetime')
                                .agg(pl.min('client_event_time'), pl.first('rank'), pl.first('prev_rank')))

session_visit_checkpoints = (session_visit_checkpoints.join(
                             session_visit_checkpoints.select('user_id', 'prev_rank', 'client_event_datetime'), 
                             left_on = ['user_id', 'rank'], 
                             right_on = ['user_id', 'prev_rank'],
                             how = 'left'))

# Calculate wait time between event datetimes
session_visit_checkpoints = session_visit_checkpoints.with_columns(
                            (pl.col('client_event_datetime_right') - pl.col('client_event_datetime'))
                            .dt.total_days().alias('wait_time'))

# Get indicator of whether this wait time is under 7 days (so we have retained the user)
session_visit_checkpoints = session_visit_checkpoints.with_columns(
                            pl.col('wait_time').le(7).cast(pl.Int8)
                            .alias('returned_within_7_days')
                            )

session_visit_checkpoints = session_visit_checkpoints.with_columns(pl.col('returned_within_7_days').fill_null(0))

# Join indicator to table
session_visit_checkpoints = session_visit_checkpoints.drop(['client_event_time', 'rank', 'prev_rank', 
                                                            'client_event_datetime_right', 'wait_time'])
session_visit_checkpoints

user_id,client_event_datetime,returned_within_7_days
str,date,i8
"""9d56981b-77d2-47ae-a62a-6ffb8c…",2024-09-26,1
"""62aa7244-86b5-41b9-8d24-099d0d…",2023-09-27,1
"""52513398-2ace-4653-a5fe-6cf994…",2023-07-06,1
"""e4dca041-9221-4508-96c4-6d98e2…",2024-04-17,1
"""10ffa9f6-3a4a-4b04-87c5-581ce6…",2023-12-28,1
…,…,…
"""dd375096-79d0-49c3-a48d-d8e084…",2023-10-19,1
"""b993171f-94d0-49b5-90c7-17502c…",2024-03-26,1
"""4291f20e-74f1-463f-8434-d9880c…",2024-03-05,1
"""18af8498-bd27-4557-84ca-32a38e…",2024-05-07,1


### Extract List-valued Columns

Next, we will extract values from list-valued columns, which can contain multiple values.

We will perform an encoding where each new indicator column would be a value from the list, and the value would represent whether the value is in that list.

Note that we will choose values from the list that are relevant to our modelling later on.

Some of the list-valued columns that we will process are `roles` and `referrer_user`

In [18]:
# Extract roles from roles, which can contain multiple roles
df = df.with_columns([
        pl.col('roles').str.contains('underwriter').alias('uw').cast(pl.Int8),
        pl.col('roles').str.contains('admin').alias('admin').cast(pl.Int8),
        pl.col('roles').str.contains('manager').alias('manager').cast(pl.Int8),
        pl.col('roles').str.contains('broker').alias('broker').cast(pl.Int8),
    ])
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped,event_category,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour,time_to_server,server_to_process,processing_time,client_event_datetime,uw,admin,manager,broker
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,str,i8,i8,i8,i8,i8,i8,i64,i64,i64,date,i8,i8,i8,i8
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management""",9,9,9,9,9,9,1,0,0,2024-05-22,1,0,0,0
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south""","""session & navigation""",9,9,9,9,9,9,1,0,1,2024-05-22,1,0,0,0


In [19]:
# Extract brand from referrer, which can contain multiple referrers
df = df.with_columns([
        pl.col('referrer').str.contains('google').alias('google').cast(pl.Int32),
        pl.col('referrer').str.contains('microsoft|teams|office').alias('microsoft').cast(pl.Int32),
        pl.col('referrer').str.contains('federato').alias('federato').cast(pl.Int32),
        pl.col('referrer').str.contains('portal').alias('uw-portal').cast(pl.Int32),
    ])
df

$insert_id,amplitude_id,app,city,client_event_time,client_upload_time,country,device_family,device_id,dma,event_id,event_properties,event_time,event_type,language,library,os_name,processed_time,region,server_received_time,server_upload_time,session_id,user_id,user_properties,roles,isInternalUser,referrer,slug,region_grouped,event_category,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour,time_to_server,server_to_process,processing_time,client_event_datetime,uw,admin,manager,broker,google,microsoft,federato,uw-portal
str,i64,i64,str,datetime[μs],datetime[μs],str,str,str,str,i64,str,datetime[μs],str,str,str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,str,i8,i8,i8,i8,i8,i8,i64,i64,i64,date,i8,i8,i8,i8,i32,i32,i32,i32
"""251db963-6623-448e-8665-f542b8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{}""",2024-10-08 11:41:02.385,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
"""3a95cdac-174c-4002-8e84-8aeba8…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22215,"""{}""",2024-10-08 11:41:02.386,"""session_end""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
"""50dbfad3-8e24-456a-ae68-21b09b…",935023330069,591532,"""mumbai""",2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22216,"""{'[amplitude] session replay i…",2024-10-08 11:41:02.386,"""session_start""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
"""ea0a4143-2380-48e7-a83e-8ba854…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'rowmodel': 'server', '[ampli…",2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0,1,0,0,0
"""4d89977d-4734-450c-afa1-07e326…",935023330069,591532,"""mumbai""",2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""","""8bd8b6ab-370f-4b56-b38f-ad221c…",,22217,"""{'displayname': 'all policies …",2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""english""","""amplitude-ts/2.7.2""","""chrome""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""{'roles': ['underwriter'], 'tr…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0,1,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""102ec3ca-a467-41a9-aa19-94f5b0…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",26,"""{'displayname': 'one drive lin…",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0,,,,
"""55aec1ce-b876-4f66-b786-fd5706…",857540480084,591532,"""nashville""",2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",27,"""{'rowmodel': 'legacyserver', '…",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0,,,,
"""8cb28e22-0dc3-456c-9956-41d52b…",857540480084,591532,"""nashville""",2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",28,"""{'displayname': 'attachments',…",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management""",9,9,9,9,9,9,1,0,0,2024-05-22,1,0,0,0,,,,
"""eac4d6a2-2ff0-4e59-80af-aa1b3f…",857540480084,591532,"""nashville""",2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""a8b357fb-b4fb-45d2-babd-42b470…","""nashville, tn""",29,"""{}""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""english""","""amplitude-ts/1.8.0""","""edge""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""{'initial_utm_medium': 'empty'…","""underwriter""","""false""",,,"""south""","""session & navigation""",9,9,9,9,9,9,1,0,1,2024-05-22,1,0,0,0,,,,


### Drop More Columns

Having done most of our aggregation and feature engineering, we can now drop them from our data

In [20]:
# Drop pre-aggregate columns
df = drop_columns(df, config, 1)
df

client_event_time,client_upload_time,country,device_family,dma,event_time,event_type,library,processed_time,region,server_received_time,server_upload_time,session_id,user_id,roles,isInternalUser,referrer,slug,region_grouped,event_category,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour,time_to_server,server_to_process,processing_time,client_event_datetime,uw,admin,manager,broker,google,microsoft,federato,uw-portal
datetime[μs],datetime[μs],str,str,str,datetime[μs],str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,str,str,i8,i8,i8,i8,i8,i8,i64,i64,i64,date,i8,i8,i8,i8,i32,i32,i32,i32
2024-10-08 11:41:02.385,2024-10-08 11:41:04.857,"""india""","""windows""",,2024-10-08 11:41:02.385,"""session_start""","""amplitude-ts/2.7.2""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""",,2024-10-08 11:41:02.386,"""session_end""","""amplitude-ts/2.7.2""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662385,"""8038ea38-8ddf-4a1a-825d-8287c0…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
2024-10-08 11:41:02.386,2024-10-08 11:41:04.857,"""india""","""windows""",,2024-10-08 11:41:02.386,"""session_start""","""amplitude-ts/2.7.2""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""session & navigation""",11,11,11,11,11,11,2,0,0,2024-10-08,1,0,0,0,1,0,0,0
2024-10-08 11:41:03.500,2024-10-08 11:41:04.857,"""india""","""windows""",,2024-10-08 11:41:03.500,""":all-accounts:configurable-tab…","""amplitude-ts/2.7.2""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0,1,0,0,0
2024-10-08 11:41:03.527,2024-10-08 11:41:04.857,"""india""","""windows""",,2024-10-08 11:41:03.527,""":all-accounts:widget:render""","""amplitude-ts/2.7.2""",2024-10-08 11:41:05.371,"""maharashtra""",2024-10-08 11:41:04.857,2024-10-08 11:41:04.859,1728387662386,"""8038ea38-8ddf-4a1a-825d-8287c0…","""underwriter""","""false""","""https://accounts.google.com/""",,"""international""","""account & policy management""",11,11,11,11,11,11,1,0,0,2024-10-08,1,0,0,0,1,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-05-22 09:36:09.888,2024-05-22 09:36:10.832,"""united states""","""windows""","""nashville, tn""",2024-05-22 09:36:09.888,"""account-lines::widget:render""","""amplitude-ts/1.8.0""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""underwriter""","""false""",,"""one-drive-link""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0,,,,
2024-05-22 09:36:09.917,2024-05-22 09:36:10.832,"""united states""","""windows""","""nashville, tn""",2024-05-22 09:36:09.917,"""account-lines::configurable-ta…","""amplitude-ts/1.8.0""",2024-05-22 09:36:11.662,"""tennessee""",2024-05-22 09:36:10.832,2024-05-22 09:36:10.833,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""underwriter""","""false""",,"""property-locations""","""south""","""account & policy management""",9,9,9,9,9,9,0,0,0,2024-05-22,1,0,0,0,,,,
2024-05-22 09:36:15.727,2024-05-22 09:36:16.789,"""united states""","""windows""","""nashville, tn""",2024-05-22 09:36:15.727,"""account-lines::widget:render""","""amplitude-ts/1.8.0""",2024-05-22 09:36:17.297,"""tennessee""",2024-05-22 09:36:16.789,2024-05-22 09:36:16.791,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""underwriter""","""false""",,"""attachments""","""south""","""account & policy management""",9,9,9,9,9,9,1,0,0,2024-05-22,1,0,0,0,,,,
2024-05-22 09:36:33.812,2024-05-22 09:36:34.881,"""united states""","""windows""","""nashville, tn""",2024-05-22 09:36:33.812,"""::nav-header:action-center-cli…","""amplitude-ts/1.8.0""",2024-05-22 09:36:36.435,"""tennessee""",2024-05-22 09:36:34.881,2024-05-22 09:36:34.883,1716368557821,"""9aecc15d-64a4-4190-80f5-b5b842…","""underwriter""","""false""",,,"""south""","""session & navigation""",9,9,9,9,9,9,1,0,1,2024-05-22,1,0,0,0,,,,


### Apply One-Hot Encoding

For the rest of our categorical columns with low dimensionality, we will one-hot encode them using our preprocessing functions

In [None]:
df = encode_categorical(df, config)
df

client_event_time,client_upload_time,country,device_family_android,device_family_apple ipad,device_family_apple iphone,device_family_chrome os,device_family_chromium os,device_family_google nexus phone,device_family_ios,device_family_k,device_family_linux,device_family_mac,device_family_mac os x,device_family_ubuntu,device_family_windows,dma,event_time,event_type,library,processed_time,region,server_received_time,server_upload_time,session_id,user_id,roles,isInternalUser,referrer,slug,region_grouped_international,region_grouped_midwest,region_grouped_northeast,region_grouped_south,region_grouped_west,event_category_account & policy management,event_category_action center & workflow,event_category_dashboard & ui interactions,event_category_document & report interactions,event_category_filtering & searching,event_category_null,event_category_other/system events,event_category_session & navigation,event_category_submission & forms,client_event_hour,client_upload_hour,event_hour,processed_hour,server_received_hour,server_upload_hour,time_to_server,server_to_process,processing_time,client_event_datetime,uw,admin,manager,broker,google,microsoft,federato,uw-portal
datetime[μs],datetime[μs],str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,str,datetime[μs],str,str,datetime[μs],str,datetime[μs],datetime[μs],i64,str,str,str,str,str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i8,i8,i8,i8,i8,i64,i64,i64,date,i8,i8,i8,i8,i32,i32,i32,i32
2025-01-06 21:10:35.099,2025-01-06 21:10:37.723,"""united states""",0,0,0,0,0,0,0,0,0,0,1,0,0,"""greenville-spartanburg, sc""",2025-01-06 21:10:35.099,"""account-lines:::view""","""amplitude-ts/2.7.2""",2025-01-06 21:10:39.275,"""south carolina""",2025-01-06 21:10:37.723,2025-01-06 21:10:37.726,1736197836023,"""57ac3fc1-363b-4f0d-8802-e9cc93…","""underwriter,manager,nf-underwr…","""false""",,,0,0,0,1,0,1,0,0,0,0,0,0,0,0,21,21,21,21,21,21,2,0,1,2025-01-06,1,0,1,0,,,,
2024-12-04 18:30:34.023,2024-12-04 18:30:35.093,"""united states""",0,0,0,0,0,0,0,0,0,0,0,0,1,"""phoenix, az""",2024-12-04 18:30:34.023,"""dashboard:my-book:configurable…","""amplitude-ts/2.7.2""",2024-12-04 18:30:36.149,"""arizona""",2024-12-04 18:30:35.093,2024-12-04 18:30:35.095,1733337033485,"""6e1b78be-1556-4893-9515-ee6af0…","""underwriter""","""false""","""https://login.microsoftonline.…","""actions-v2""",0,0,0,0,1,0,0,0,0,0,0,0,1,0,18,18,18,18,18,18,1,0,1,2024-12-04,1,0,0,0,0,1,0,0
2024-05-29 18:36:29.237,2024-05-29 18:36:30.054,"""united states""",0,0,0,0,0,0,0,0,0,0,0,0,1,"""hartford & new haven, ct""",2024-05-29 18:36:29.237,"""dashboard:my-book:configurable…","""amplitude-ts/1.8.0""",2024-05-29 18:36:31.362,"""connecticut""",2024-05-29 18:36:30.054,2024-05-29 18:36:30.057,1717007547069,"""ffefb3fc-7c7f-41fa-9ce7-fcf12e…","""underwriter""","""false""",,"""recent-actions""",0,0,1,0,0,0,0,0,0,0,0,0,1,0,18,18,18,18,18,18,0,0,1,2024-05-29,1,0,0,0,,,,
2024-10-02 14:37:15.293,2024-10-02 14:37:16.564,"""united states""",0,0,0,0,0,0,0,0,0,0,1,0,0,"""greenville-spartanburg, sc""",2024-10-02 14:37:15.293,"""account-lines::widget:render""","""amplitude-ts/2.7.2""",2024-10-02 14:37:17.514,"""south carolina""",2024-10-02 14:37:16.564,2024-10-02 14:37:16.566,1727877842052,"""0f1c0974-bc16-43a7-a0ec-d4d2d5…","""ua,underwriter""","""false""",,"""documents-and-compliance-table""",0,0,0,1,0,1,0,0,0,0,0,0,0,0,14,14,14,14,14,14,1,0,0,2024-10-02,1,0,0,0,,,,
2024-09-26 12:56:45.148,2024-09-26 12:56:46.277,"""india""",0,0,0,0,0,0,0,0,0,0,0,0,1,,2024-09-26 12:56:45.148,"""submissions:all-policy:configu…","""amplitude-ts/2.7.2""",2024-09-26 12:56:48.662,"""maharashtra""",2024-09-26 12:56:46.277,2024-09-26 12:56:46.279,1727353043590,"""b6071cc4-c091-4de4-82b1-722eff…","""underwriter""","""false""",,"""submission-history""",1,0,0,0,0,1,0,0,0,0,0,0,0,0,12,12,12,12,12,12,1,0,2,2024-09-26,1,0,0,0,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-05-31 20:12:51.490,2024-05-31 20:12:52.821,"""united states""",0,0,0,0,0,0,0,0,0,0,1,0,0,"""houston, tx""",2024-05-31 20:12:51.490,"""dashboard:my-book:widget:rende…","""amplitude-ts/2.7.2""",2024-05-31 20:12:54.999,"""texas""",2024-05-31 20:12:52.821,2024-05-31 20:12:52.823,1717186371457,"""6cd4ff43-3226-4bbf-8203-5d0147…","""underwriter""","""false""",,"""all-policies-table""",0,0,0,1,0,0,0,0,0,0,0,0,1,0,20,20,20,20,20,20,1,0,2,2024-05-31,1,0,0,0,,,,
2024-12-09 07:00:53.729,2024-12-09 07:00:54.838,"""india""",0,0,0,0,0,0,0,0,0,0,0,0,1,,2024-12-09 07:00:53.729,"""action-center:::view""","""amplitude-ts/2.7.2""",2024-12-09 07:00:55.595,"""maharashtra""",2024-12-09 07:00:54.838,2024-12-09 07:00:54.841,1733723220473,"""6066f244-0707-4e85-9fc5-70e6fe…","""underwriter""","""false""",,,1,0,0,0,0,0,1,0,0,0,0,0,0,0,7,7,7,7,7,7,1,0,0,2024-12-09,1,0,0,0,,,,
2024-11-07 21:03:41.308,2024-11-07 21:03:42.046,"""united states""",0,0,0,0,0,0,0,0,0,0,0,0,1,"""birmingham, al""",2024-11-07 21:03:41.308,"""session_start""","""amplitude-ts/2.7.2""",2024-11-07 21:03:43.472,"""alabama""",2024-11-07 21:03:42.046,2024-11-07 21:03:42.048,1731013421308,"""5a0098e1-6893-4a5a-884b-079420…","""underwriter""","""false""",,,0,0,0,1,0,0,0,0,0,0,0,0,1,0,21,21,21,21,21,21,0,0,1,2024-11-07,1,0,0,0,,,,
2024-07-12 04:15:23.073,2024-07-12 04:15:26.662,"""united states""",0,0,0,0,0,0,0,0,0,0,1,0,0,"""san francisco-oakland-san jose…",2024-07-12 04:15:23.073,"""application-window-opened""","""amplitude-ts/2.7.2""",2024-07-12 04:15:28.472,"""california""",2024-07-12 04:15:26.662,2024-07-12 04:15:26.663,1720757642034,"""93e50a3c-501a-4ecb-bf5f-e88c17…","""admin""","""true""",,,0,0,0,0,1,0,0,0,0,0,0,0,1,0,4,4,4,4,4,4,3,0,1,2024-07-12,0,1,0,0,,,,


### Aggregate Data by Client Event Time Level

In [76]:
# Get columns
time_cols = ['client_upload_time', 'server_received_time', 'processed_time', 'server_upload_time', 'client_event_time', 'event_time']

one_hot_cols = [col for col in df.columns if set(df[col].drop_nulls().to_list()) <= {0, 1}]

numeric_cols = df.select(cs.numeric().exclude(one_hot_cols + ['session_id'])).columns

In [77]:
# Define aggregation scheme
agg_scheme = []

# For user_id, take the first session, assuming 1 user per session
agg_scheme.append(pl.first('user_id').name.suffix('_first'))

# Get max for all time columns
agg_scheme.extend([pl.max(time_col).name.suffix('_max') for time_col in time_cols])

# Get max for all one-hot cols, that is, keep 1 when we see it
agg_scheme.extend([pl.max(c).name.suffix('_max') for c in one_hot_cols])

# Use mean for other numeric columns
agg_scheme.extend([pl.mean(c).name.suffix('_mean') for c in numeric_cols])



In [78]:
def aggregate_by_datetime(df):
    df_datetime = df.group_by('user_id', 'client_event_datetime').agg(agg_scheme)
    df_datetime = df_datetime.drop('user_id')
    df_datetime = df_datetime.with_columns(pl.col('user_id_first').alias('user_id'))
    return df_datetime

In [85]:
# Aggregate data by datetime
df_datetime = df.group_by('user_id', 'client_event_datetime').agg(agg_scheme)
df_datetime = df_datetime.drop('user_id')
df_datetime = df_datetime.with_columns(pl.col('user_id_first').alias('user_id'))
df_datetime

client_event_datetime,user_id_first,client_upload_time_max,server_received_time_max,processed_time_max,server_upload_time_max,client_event_time_max,event_time_max,device_family_android_max,device_family_apple ipad_max,device_family_apple iphone_max,device_family_chrome os_max,device_family_chromium os_max,device_family_google nexus phone_max,device_family_ios_max,device_family_k_max,device_family_linux_max,device_family_mac_max,device_family_mac os x_max,device_family_ubuntu_max,device_family_windows_max,region_grouped_international_max,region_grouped_midwest_max,region_grouped_northeast_max,region_grouped_south_max,region_grouped_west_max,event_category_account & policy management_max,event_category_action center & workflow_max,event_category_dashboard & ui interactions_max,event_category_document & report interactions_max,event_category_filtering & searching_max,event_category_null_max,event_category_other/system events_max,event_category_session & navigation_max,event_category_submission & forms_max,uw_max,admin_max,manager_max,broker_max,google_max,microsoft_max,federato_max,uw-portal_max,client_event_hour_mean,client_upload_hour_mean,event_hour_mean,processed_hour_mean,server_received_hour_mean,server_upload_hour_mean,time_to_server_mean,server_to_process_mean,processing_time_mean,user_id
date,str,datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs],u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i8,i8,i8,i32,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2024-09-17,"""23948c43-0a67-46af-9ac4-7c2934…",2024-09-18 11:27:28.995,2024-09-18 11:27:28.995,2024-09-18 11:27:29.569,2024-09-18 11:27:28.997,2024-09-17 22:12:46.224,2024-09-17 22:12:46.224,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,14.294521,14.369863,14.294521,14.369863,14.369863,14.369863,871.828767,0.0,0.527397,"""23948c43-0a67-46af-9ac4-7c2934…"
2024-03-06,"""88e62b40-bf38-4d53-9728-30184d…",2024-03-28 14:57:26.215,2024-03-28 14:57:26.215,2024-03-28 14:57:27.041,2024-03-28 14:57:26.219,2024-03-06 19:31:44.074,2024-03-06 19:31:44.074,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,15.578947,14.0,15.578947,14.0,14.0,14.0,1.8971e6,0.0,0.552632,"""88e62b40-bf38-4d53-9728-30184d…"
2024-09-27,"""897ef84b-cbc9-4c72-8bff-d35bfd…",2024-09-30 04:01:46.455,2024-09-30 04:01:46.455,2024-09-30 04:01:49.351,2024-09-30 04:01:46.456,2024-09-27 12:11:36.696,2024-09-27 12:11:36.696,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,7.386207,7.372414,7.386207,7.372414,7.372414,7.372414,857.77931,0.0,0.6,"""897ef84b-cbc9-4c72-8bff-d35bfd…"
2024-10-28,"""2960eb27-039a-4489-94eb-670e48…",2024-10-28 16:08:49.330,2024-10-28 16:08:49.330,2024-10-28 16:08:59.950,2024-10-28 16:08:49.333,2024-10-28 16:08:46.632,2024-10-28 16:08:46.632,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,,,,,16.0,16.0,16.0,16.0,16.0,16.0,2.482759,0.0,6.862069,"""2960eb27-039a-4489-94eb-670e48…"
2024-10-10,"""9cadb195-5205-414d-bac7-3d96d2…",2024-10-14 04:11:13.537,2024-10-14 04:11:13.537,2024-10-14 04:11:16.172,2024-10-14 04:11:13.539,2024-10-10 14:20:44.939,2024-10-10 14:20:44.939,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,9.45781,9.48833,9.45781,9.495512,9.48833,9.48833,733.199282,0.0,5.971275,"""9cadb195-5205-414d-bac7-3d96d2…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-04-22,"""abe2a8ce-bdc7-4dfb-be98-2d8802…",2024-04-22 14:56:53.858,2024-04-22 14:56:53.858,2024-04-22 14:56:54.939,2024-04-22 14:56:53.860,2024-04-22 14:56:51.832,2024-04-22 14:56:51.832,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,,,,,14.0,14.0,14.0,14.0,14.0,14.0,3.866667,0.0,1.766667,"""abe2a8ce-bdc7-4dfb-be98-2d8802…"
2023-10-16,"""27f310fe-0840-488c-a98f-d5083d…",2024-03-23 17:58:59.514,2024-03-23 17:58:59.514,2024-03-23 17:59:03.850,2024-03-23 17:58:59.516,2023-10-16 22:48:48.570,2023-10-16 22:48:48.570,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,18.946996,17.0,18.946996,17.0,17.0,17.0,1.3732e7,0.0,0.713781,"""27f310fe-0840-488c-a98f-d5083d…"
2024-02-19,"""dc56ed4b-87e2-4c13-85b0-5e11f1…",2024-03-27 04:25:41.584,2024-03-27 04:25:41.584,2024-03-27 04:25:42.425,2024-03-27 04:25:41.586,2024-02-19 14:37:33.535,2024-02-19 14:37:33.535,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,9.378151,4.0,9.378151,4.0,4.0,4.0,3.1772e6,0.0,0.369748,"""dc56ed4b-87e2-4c13-85b0-5e11f1…"
2024-01-29,"""332de40e-e936-4ae1-9619-719e2e…",2024-03-27 04:09:59.893,2024-03-27 04:09:59.893,2024-03-27 04:10:01.616,2024-03-27 04:09:59.899,2024-01-29 21:51:18.081,2024-01-29 21:51:18.081,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,,,,,17.7,4.0,17.7,4.0,4.0,4.0,4961122.3,0.0,4.6,"""332de40e-e936-4ae1-9619-719e2e…"


### Join Retention Indicator and Time Usage Target

In [86]:
# Perform join
df_datetime = df_datetime.join(session_visit_checkpoints, on = ['user_id', 'client_event_datetime'], how = 'left')
df_datetime = df_datetime.join(user_session_datetime, on = ['user_id', 'client_event_datetime'], how = 'left')
df_datetime

client_event_datetime,user_id_first,client_upload_time_max,server_received_time_max,processed_time_max,server_upload_time_max,client_event_time_max,event_time_max,device_family_android_max,device_family_apple ipad_max,device_family_apple iphone_max,device_family_chrome os_max,device_family_chromium os_max,device_family_google nexus phone_max,device_family_ios_max,device_family_k_max,device_family_linux_max,device_family_mac_max,device_family_mac os x_max,device_family_ubuntu_max,device_family_windows_max,region_grouped_international_max,region_grouped_midwest_max,region_grouped_northeast_max,region_grouped_south_max,region_grouped_west_max,event_category_account & policy management_max,event_category_action center & workflow_max,event_category_dashboard & ui interactions_max,event_category_document & report interactions_max,event_category_filtering & searching_max,event_category_null_max,event_category_other/system events_max,event_category_session & navigation_max,event_category_submission & forms_max,uw_max,admin_max,manager_max,broker_max,google_max,microsoft_max,federato_max,uw-portal_max,client_event_hour_mean,client_upload_hour_mean,event_hour_mean,processed_hour_mean,server_received_hour_mean,server_upload_hour_mean,time_to_server_mean,server_to_process_mean,processing_time_mean,user_id,returned_within_7_days,session_seconds
date,str,datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs],u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i8,i8,i8,i32,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i8,i64
2024-09-17,"""23948c43-0a67-46af-9ac4-7c2934…",2024-09-18 11:27:28.995,2024-09-18 11:27:28.995,2024-09-18 11:27:29.569,2024-09-18 11:27:28.997,2024-09-17 22:12:46.224,2024-09-17 22:12:46.224,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,14.294521,14.369863,14.294521,14.369863,14.369863,14.369863,871.828767,0.0,0.527397,"""23948c43-0a67-46af-9ac4-7c2934…",1,437
2024-03-06,"""88e62b40-bf38-4d53-9728-30184d…",2024-03-28 14:57:26.215,2024-03-28 14:57:26.215,2024-03-28 14:57:27.041,2024-03-28 14:57:26.219,2024-03-06 19:31:44.074,2024-03-06 19:31:44.074,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,15.578947,14.0,15.578947,14.0,14.0,14.0,1.8971e6,0.0,0.552632,"""88e62b40-bf38-4d53-9728-30184d…",1,2699
2024-09-27,"""897ef84b-cbc9-4c72-8bff-d35bfd…",2024-09-30 04:01:46.455,2024-09-30 04:01:46.455,2024-09-30 04:01:49.351,2024-09-30 04:01:46.456,2024-09-27 12:11:36.696,2024-09-27 12:11:36.696,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,,,,,7.386207,7.372414,7.386207,7.372414,7.372414,7.372414,857.77931,0.0,0.6,"""897ef84b-cbc9-4c72-8bff-d35bfd…",1,7104
2024-10-28,"""2960eb27-039a-4489-94eb-670e48…",2024-10-28 16:08:49.330,2024-10-28 16:08:49.330,2024-10-28 16:08:59.950,2024-10-28 16:08:49.333,2024-10-28 16:08:46.632,2024-10-28 16:08:46.632,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,,,,,16.0,16.0,16.0,16.0,16.0,16.0,2.482759,0.0,6.862069,"""2960eb27-039a-4489-94eb-670e48…",0,126
2024-10-10,"""9cadb195-5205-414d-bac7-3d96d2…",2024-10-14 04:11:13.537,2024-10-14 04:11:13.537,2024-10-14 04:11:16.172,2024-10-14 04:11:13.539,2024-10-10 14:20:44.939,2024-10-10 14:20:44.939,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,9.45781,9.48833,9.45781,9.495512,9.48833,9.48833,733.199282,0.0,5.971275,"""9cadb195-5205-414d-bac7-3d96d2…",1,16858
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-04-22,"""abe2a8ce-bdc7-4dfb-be98-2d8802…",2024-04-22 14:56:53.858,2024-04-22 14:56:53.858,2024-04-22 14:56:54.939,2024-04-22 14:56:53.860,2024-04-22 14:56:51.832,2024-04-22 14:56:51.832,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,,,,,14.0,14.0,14.0,14.0,14.0,14.0,3.866667,0.0,1.766667,"""abe2a8ce-bdc7-4dfb-be98-2d8802…",1,647
2023-10-16,"""27f310fe-0840-488c-a98f-d5083d…",2024-03-23 17:58:59.514,2024-03-23 17:58:59.514,2024-03-23 17:59:03.850,2024-03-23 17:58:59.516,2023-10-16 22:48:48.570,2023-10-16 22:48:48.570,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,18.946996,17.0,18.946996,17.0,17.0,17.0,1.3732e7,0.0,0.713781,"""27f310fe-0840-488c-a98f-d5083d…",1,9262
2024-02-19,"""dc56ed4b-87e2-4c13-85b0-5e11f1…",2024-03-27 04:25:41.584,2024-03-27 04:25:41.584,2024-03-27 04:25:42.425,2024-03-27 04:25:41.586,2024-02-19 14:37:33.535,2024-02-19 14:37:33.535,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,9.378151,4.0,9.378151,4.0,4.0,4.0,3.1772e6,0.0,0.369748,"""dc56ed4b-87e2-4c13-85b0-5e11f1…",1,20182
2024-01-29,"""332de40e-e936-4ae1-9619-719e2e…",2024-03-27 04:09:59.893,2024-03-27 04:09:59.893,2024-03-27 04:10:01.616,2024-03-27 04:09:59.899,2024-01-29 21:51:18.081,2024-01-29 21:51:18.081,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,,,,,17.7,4.0,17.7,4.0,4.0,4.0,4961122.3,0.0,4.6,"""332de40e-e936-4ae1-9619-719e2e…",1,2878


### Drop Remaining Columns

In [87]:
# Drop remaining columns
df_datetime = drop_columns(df_datetime, config, 2)
df_datetime

user_id_first,device_family_linux_max,device_family_mac os x_max,device_family_windows_max,region_grouped_international_max,region_grouped_midwest_max,region_grouped_northeast_max,region_grouped_south_max,region_grouped_west_max,event_category_account & policy management_max,event_category_action center & workflow_max,event_category_dashboard & ui interactions_max,event_category_other/system events_max,event_category_session & navigation_max,event_category_submission & forms_max,uw_max,admin_max,manager_max,broker_max,google_max,microsoft_max,client_event_hour_mean,client_upload_hour_mean,event_hour_mean,server_received_hour_mean,server_upload_hour_mean,time_to_server_mean,server_to_process_mean,processing_time_mean,user_id,returned_within_7_days,session_seconds
str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i8,i8,i8,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,str,i8,i64
"""23948c43-0a67-46af-9ac4-7c2934…",0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,,,14.294521,14.369863,14.294521,14.369863,14.369863,871.828767,0.0,0.527397,"""23948c43-0a67-46af-9ac4-7c2934…",1,437
"""88e62b40-bf38-4d53-9728-30184d…",0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,,,15.578947,14.0,15.578947,14.0,14.0,1.8971e6,0.0,0.552632,"""88e62b40-bf38-4d53-9728-30184d…",1,2699
"""897ef84b-cbc9-4c72-8bff-d35bfd…",0,0,1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,,,7.386207,7.372414,7.386207,7.372414,7.372414,857.77931,0.0,0.6,"""897ef84b-cbc9-4c72-8bff-d35bfd…",1,7104
"""2960eb27-039a-4489-94eb-670e48…",0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,,,16.0,16.0,16.0,16.0,16.0,2.482759,0.0,6.862069,"""2960eb27-039a-4489-94eb-670e48…",0,126
"""9cadb195-5205-414d-bac7-3d96d2…",0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,0,0,0,0,9.45781,9.48833,9.45781,9.48833,9.48833,733.199282,0.0,5.971275,"""9cadb195-5205-414d-bac7-3d96d2…",1,16858
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""abe2a8ce-bdc7-4dfb-be98-2d8802…",0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,,,14.0,14.0,14.0,14.0,14.0,3.866667,0.0,1.766667,"""abe2a8ce-bdc7-4dfb-be98-2d8802…",1,647
"""27f310fe-0840-488c-a98f-d5083d…",0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,18.946996,17.0,18.946996,17.0,17.0,1.3732e7,0.0,0.713781,"""27f310fe-0840-488c-a98f-d5083d…",1,9262
"""dc56ed4b-87e2-4c13-85b0-5e11f1…",0,0,1,0,0,0,1,0,1,0,0,0,1,1,1,0,0,0,0,0,9.378151,4.0,9.378151,4.0,4.0,3.1772e6,0.0,0.369748,"""dc56ed4b-87e2-4c13-85b0-5e11f1…",1,20182
"""332de40e-e936-4ae1-9619-719e2e…",0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,,,17.7,4.0,17.7,4.0,4.0,4961122.3,0.0,4.6,"""332de40e-e936-4ae1-9619-719e2e…",1,2878


### Impute One-Hot Columns

For the rest of the one-hot valued columns, we will fill it all with zero

In [88]:
# Impute one-hot columns
columns_to_fill = ['uw_max', 'admin_max', 'manager_max', 
                   'broker_max', 'google_max', 'microsoft_max']
df_datetime = df_datetime.with_columns(pl.col(c).fill_null(0) for c in columns_to_fill)
df_datetime

user_id_first,device_family_linux_max,device_family_mac os x_max,device_family_windows_max,region_grouped_international_max,region_grouped_midwest_max,region_grouped_northeast_max,region_grouped_south_max,region_grouped_west_max,event_category_account & policy management_max,event_category_action center & workflow_max,event_category_dashboard & ui interactions_max,event_category_other/system events_max,event_category_session & navigation_max,event_category_submission & forms_max,uw_max,admin_max,manager_max,broker_max,google_max,microsoft_max,client_event_hour_mean,client_upload_hour_mean,event_hour_mean,server_received_hour_mean,server_upload_hour_mean,time_to_server_mean,server_to_process_mean,processing_time_mean,user_id,returned_within_7_days,session_seconds
str,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i8,i8,i8,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,str,i8,i64
"""23948c43-0a67-46af-9ac4-7c2934…",0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,14.294521,14.369863,14.294521,14.369863,14.369863,871.828767,0.0,0.527397,"""23948c43-0a67-46af-9ac4-7c2934…",1,437
"""88e62b40-bf38-4d53-9728-30184d…",0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,15.578947,14.0,15.578947,14.0,14.0,1.8971e6,0.0,0.552632,"""88e62b40-bf38-4d53-9728-30184d…",1,2699
"""897ef84b-cbc9-4c72-8bff-d35bfd…",0,0,1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,7.386207,7.372414,7.386207,7.372414,7.372414,857.77931,0.0,0.6,"""897ef84b-cbc9-4c72-8bff-d35bfd…",1,7104
"""2960eb27-039a-4489-94eb-670e48…",0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,16.0,16.0,16.0,16.0,16.0,2.482759,0.0,6.862069,"""2960eb27-039a-4489-94eb-670e48…",0,126
"""9cadb195-5205-414d-bac7-3d96d2…",0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,0,0,0,0,9.45781,9.48833,9.45781,9.48833,9.48833,733.199282,0.0,5.971275,"""9cadb195-5205-414d-bac7-3d96d2…",1,16858
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""abe2a8ce-bdc7-4dfb-be98-2d8802…",0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,14.0,14.0,14.0,14.0,14.0,3.866667,0.0,1.766667,"""abe2a8ce-bdc7-4dfb-be98-2d8802…",1,647
"""27f310fe-0840-488c-a98f-d5083d…",0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,0,18.946996,17.0,18.946996,17.0,17.0,1.3732e7,0.0,0.713781,"""27f310fe-0840-488c-a98f-d5083d…",1,9262
"""dc56ed4b-87e2-4c13-85b0-5e11f1…",0,0,1,0,0,0,1,0,1,0,0,0,1,1,1,0,0,0,0,0,9.378151,4.0,9.378151,4.0,4.0,3.1772e6,0.0,0.369748,"""dc56ed4b-87e2-4c13-85b0-5e11f1…",1,20182
"""332de40e-e936-4ae1-9619-719e2e…",0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,17.7,4.0,17.7,4.0,4.0,4961122.3,0.0,4.6,"""332de40e-e936-4ae1-9619-719e2e…",1,2878


In [89]:
# export parquet
df_datetime.write_parquet(config['pipeline']['output_data'])