*User Events Assessment*

First, I created a random dataset named user_events with the following columns:
event_id (string)
user_id (string)
event_name (string) - possible values: 'PageView', 'Download', 'Install', 'Purchase'
platform (string) - possible values: 'ios', 'android'
device_type (string)
timestamp (timestamp)

In [20]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker (source of the UUIDs, placeholder words and timestamps)
fake = Faker()

# Define possible values for event_name and platform
event_names = ['PageView', 'Download', 'Install', 'Purchase']
platforms = ['ios', 'android']

# Number of rows to generate. Every column list below is built with this
# length, so changing it here keeps all columns the same size.
num_events = 100  # Adjust as needed

# Generate random data, one list per column. Each value is drawn
# independently of the others in its row: every row gets its own freshly
# generated user_id, event_name is picked uniformly from event_names, and
# device_type is an arbitrary Faker word rather than a real device category.
data = {
    'event_id': [fake.uuid4() for _ in range(num_events)],
    'user_id': [fake.uuid4() for _ in range(num_events)],
    'event_name': [random.choice(event_names) for _ in range(num_events)],
    'platform': [random.choice(platforms) for _ in range(num_events)],
    'device_type': [fake.word() for _ in range(num_events)],
    'timestamp': [fake.date_time_this_year() for _ in range(num_events)],
}

# Create DataFrame (dict keys become the column names)
df_user_events = pd.DataFrame(data)

# Display the first rows of the DataFrame
print(df_user_events.head())

                               event_id                               user_id  \
0  3324a8a0-8580-4c93-ba57-4690933fa44f  f6705b97-feac-4c85-8b11-0a5634188843   
1  46599b55-f5b3-4345-b7f1-4da20944facf  74c24938-3fd1-4d38-bfb8-ef406168ec09   
2  6af7d5ba-5c1e-4018-9e22-651fdfb8c1d0  87c4ec59-d7bc-4da8-b147-4104bfba865b   
3  c895213f-1ce5-4523-bcdc-80ab49ab9aea  9f223767-b39f-47f3-8886-24d00312859d   
4  30718df7-0feb-44fa-9dfa-010f121a3aaa  9b51699c-2520-46c6-86e3-3c909fee8164   

  event_name platform device_type                  timestamp  
0   PageView      ios      remain 2025-02-06 12:47:08.745290  
1   PageView  android     himself 2025-02-03 11:37:28.081030  
2   Purchase  android       write 2025-01-23 23:56:52.120980  
3    Install      ios          PM 2025-01-30 04:24:40.706377  
4   Purchase      ios       range 2025-01-08 10:14:08.798914  


In [None]:
# Write the generated events to disk as comma-separated UTF-8 text,
# leaving out the DataFrame index column.
df_user_events.to_csv(
    'user_events.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [22]:
# Bare expression as the last statement of the cell: the notebook renders
# the DataFrame as the cell output.
df_user_events


Unnamed: 0,event_id,user_id,event_name,platform,device_type,timestamp
0,3324a8a0-8580-4c93-ba57-4690933fa44f,f6705b97-feac-4c85-8b11-0a5634188843,PageView,ios,remain,2025-02-06 12:47:08.745290
1,46599b55-f5b3-4345-b7f1-4da20944facf,74c24938-3fd1-4d38-bfb8-ef406168ec09,PageView,android,himself,2025-02-03 11:37:28.081030
2,6af7d5ba-5c1e-4018-9e22-651fdfb8c1d0,87c4ec59-d7bc-4da8-b147-4104bfba865b,Purchase,android,write,2025-01-23 23:56:52.120980
3,c895213f-1ce5-4523-bcdc-80ab49ab9aea,9f223767-b39f-47f3-8886-24d00312859d,Install,ios,PM,2025-01-30 04:24:40.706377
4,30718df7-0feb-44fa-9dfa-010f121a3aaa,9b51699c-2520-46c6-86e3-3c909fee8164,Purchase,ios,range,2025-01-08 10:14:08.798914
...,...,...,...,...,...,...
95,a0e712d3-f8d2-4399-818c-928b5bca9963,53ba56c1-4206-4304-b323-97b215987a3d,Purchase,android,exist,2025-01-21 14:49:20.437354
96,e3dc997f-e685-4c14-9c46-77dfcb523175,d43a5964-a59d-4d9e-b646-c6f3bc23423e,Purchase,ios,much,2025-02-15 01:00:10.810872
97,da5b513e-3558-4562-9418-b5ffc4568583,a1089aff-ca47-4925-aeb3-04967aea965f,Install,android,follow,2025-02-04 06:34:50.363219
98,207eb58b-f74d-4437-9663-3050b0215b30,8790d165-d964-4d18-aaeb-4efa1f238666,Install,android,hour,2025-01-02 06:15:14.241510


Then, to make the dataset more realistic, I modeled the user event steps as a funnel in which each step is reached by a realistic percentage of the users from the previous step, and I increased the number of users and events.

In [23]:
from faker import Faker
import random
from datetime import timedelta
import pandas as pd

# Faker instance used for the UUIDs and the initial PageView timestamps
fake = Faker()

# Allowed values for the categorical columns
event_names = ['PageView', 'Download', 'Install']
platforms = ['ios', 'android']
device_types = ['Phone', 'Tablet']

# How many distinct users to simulate
num_users = 1000  # Adjust as needed

# Flat list of event records (one dict per event, in generation order)
events = []

for _ in range(num_users):
    user_id = fake.uuid4()  # One ID shared by all events of this user

    # Step 1: every user starts with a PageView on a random platform/device
    page_view = {
        'event_id': fake.uuid4(),
        'user_id': user_id,
        'event_name': 'PageView',
        'platform': random.choice(platforms),
        'device_type': random.choice(device_types),
        'timestamp': fake.date_time_this_year(),
    }
    events.append(page_view)

    # Step 2: 80% of users continue to Download; the rest stop here
    if random.random() >= 0.8:
        continue

    download = {
        'event_id': fake.uuid4(),
        'user_id': user_id,
        'event_name': 'Download',
        'platform': page_view['platform'],  # Same platform as PageView
        'device_type': page_view['device_type'],  # Same device type
        # 1 to 72 whole hours after the PageView
        'timestamp': page_view['timestamp'] + timedelta(hours=random.randint(1, 72)),
    }
    events.append(download)

    # Step 3: 90% of downloaders continue to Install
    if random.random() >= 0.9:
        continue

    events.append({
        'event_id': fake.uuid4(),
        'user_id': user_id,
        'event_name': 'Install',
        'platform': download['platform'],
        'device_type': download['device_type'],
        # 1 to 72 whole hours after the Download
        'timestamp': download['timestamp'] + timedelta(hours=random.randint(1, 72)),
    })

# Build the DataFrame from the list of event dicts
df_user_events = pd.DataFrame(events)

# Persist the dataset as CSV without the index column (optional)
df_user_events.to_csv("user_events1.csv", index=False)

# Show the first few rows
print(df_user_events.head())


                               event_id                               user_id  \
0  d24efe9d-c04e-4eae-b1ef-42f030df35a7  019d87ce-756d-4a56-909f-dbaf0a5b5eb3   
1  0c234e14-a01b-4d8c-bf70-a64cb5997e1e  019d87ce-756d-4a56-909f-dbaf0a5b5eb3   
2  923aa092-a566-42ef-8e0a-77665f9e4bb0  019d87ce-756d-4a56-909f-dbaf0a5b5eb3   
3  724935ea-012d-4a2c-85ff-15e0e790308e  98cedc91-5230-4b66-b7f2-edbffe8d8833   
4  051392b2-e239-4a4a-9dad-4730977d6776  3ca4b9f4-b223-4080-8de7-fce445b39789   

  event_name platform device_type                  timestamp  
0   PageView      ios       Phone 2025-02-18 16:14:59.866918  
1   Download      ios       Phone 2025-02-19 18:14:59.866918  
2    Install      ios       Phone 2025-02-19 23:14:59.866918  
3   PageView      ios       Phone 2025-01-11 15:41:35.254317  
4   PageView      ios       Phone 2025-01-18 22:12:25.750521  


In [None]:
# Write the funnel dataset to 'user_events1.csv' as comma-separated UTF-8
# text, leaving out the DataFrame index column.
df_user_events.to_csv(
    'user_events1.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [24]:
# Bare expression as the last statement of the cell: the notebook renders
# the DataFrame as the cell output.
df_user_events

Unnamed: 0,event_id,user_id,event_name,platform,device_type,timestamp
0,d24efe9d-c04e-4eae-b1ef-42f030df35a7,019d87ce-756d-4a56-909f-dbaf0a5b5eb3,PageView,ios,Phone,2025-02-18 16:14:59.866918
1,0c234e14-a01b-4d8c-bf70-a64cb5997e1e,019d87ce-756d-4a56-909f-dbaf0a5b5eb3,Download,ios,Phone,2025-02-19 18:14:59.866918
2,923aa092-a566-42ef-8e0a-77665f9e4bb0,019d87ce-756d-4a56-909f-dbaf0a5b5eb3,Install,ios,Phone,2025-02-19 23:14:59.866918
3,724935ea-012d-4a2c-85ff-15e0e790308e,98cedc91-5230-4b66-b7f2-edbffe8d8833,PageView,ios,Phone,2025-01-11 15:41:35.254317
4,051392b2-e239-4a4a-9dad-4730977d6776,3ca4b9f4-b223-4080-8de7-fce445b39789,PageView,ios,Phone,2025-01-18 22:12:25.750521
...,...,...,...,...,...,...
2503,8c3e66c6-b3ca-4169-bad5-c48fc7f0c852,7dec483a-0bd8-4b2c-ad10-2917d8b234c5,Install,android,Tablet,2025-02-15 22:06:05.192405
2504,3bc77700-4ae7-4e26-b8d7-8ce1eabbec52,c5f9cde9-5f20-428b-bd2c-b97497c84afa,PageView,ios,Tablet,2025-01-07 00:33:37.787415
2505,f4606926-9627-4d5d-b9cb-937c1085ba82,8ee7d879-8235-41a4-b77c-6634e15a447a,PageView,ios,Phone,2025-01-07 11:58:41.221669
2506,be3978a4-90fd-4860-88e1-755cb5da99c6,8ee7d879-8235-41a4-b77c-6634e15a447a,Download,ios,Phone,2025-01-07 22:58:41.221669


Then I added a HardPaywall event after Install and just before Purchase. I think the number of users who see the app's hard paywall is also important. Some apps show a survey when users first open the app, and the hard paywall appears right after the survey.

In [6]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import timedelta

# Initialize Faker (source of the UUIDs and the initial PageView timestamps)
fake = Faker()

# Define possible values
event_names = ['PageView', 'Download', 'Install', 'HardPaywall', 'Purchase']
platforms = ['ios', 'android']
device_types = ['Phone', 'Tablet']

# Funnel steps that follow the initial PageView, in order. Each entry is
# (event_name, probability that a user who reached the previous step also
# reaches this one). A user who fails a step generates no further events.
funnel_steps = [
    ('Download', 0.8),
    ('Install', 0.9),
    ('HardPaywall', 0.9),
    ('Purchase', 0.1),
]

# Each step happens a whole number of hours (1..max) after the previous one
max_step_delay_hours = 72

# Define the number of users
num_users = 1000  # Adjust as needed


def make_event(user_id, event_name, platform, device_type, timestamp):
    """Return one event record (a dict) with a freshly generated event_id.

    The keys match the columns of the user_events dataset: event_id,
    user_id, event_name, platform, device_type, timestamp.
    """
    return {
        'event_id': fake.uuid4(),
        'user_id': user_id,
        'event_name': event_name,
        'platform': platform,
        'device_type': device_type,
        'timestamp': timestamp,
    }


# Create a list to store events
events = []

for _ in range(num_users):
    user_id = fake.uuid4()  # Unique user

    # Step 1: PageView (100% of users). Platform and device type are chosen
    # once here and reused for every later event of the same user.
    # NOTE(review): date_time_this_year() presumably returns a moment between
    # the start of the year and now, so the delays added below could produce
    # events dated after the generation time - confirm whether that matters.
    timestamp = fake.date_time_this_year()
    platform = random.choice(platforms)
    device_type = random.choice(device_types)

    events.append(make_event(user_id, 'PageView', platform, device_type, timestamp))

    # Steps 2-5: walk down the funnel until the user drops off
    for event_name, conversion_rate in funnel_steps:
        if random.random() >= conversion_rate:
            break  # User did not convert; no later steps either

        # datetime is immutable, so this rebinds `timestamp` to a new object
        # and the previously appended event keeps its own value.
        timestamp += timedelta(hours=random.randint(1, max_step_delay_hours))

        events.append(make_event(user_id, event_name, platform, device_type, timestamp))

# Create DataFrame (one row per event, columns taken from the dict keys)
df_user_events = pd.DataFrame(events)

# Save the dataset to a CSV file (optional)
df_user_events.to_csv("user_events2.csv", index=False)

# Display the first few rows
print(df_user_events.head())

                               event_id                               user_id  \
0  37df1a74-ba66-4981-bf6a-79e062fdecc1  19d73092-e8ae-4930-b557-8b7b2ee7c8c4   
1  9193f83f-f763-40ab-ba16-0aa4dbaa87d8  19d73092-e8ae-4930-b557-8b7b2ee7c8c4   
2  00da9935-d469-4f80-a56e-520e7485205b  19d73092-e8ae-4930-b557-8b7b2ee7c8c4   
3  b04927b3-8f1a-4288-aaa5-65e2a75d4e62  19d73092-e8ae-4930-b557-8b7b2ee7c8c4   
4  a861954c-534c-4ae4-8afb-35156ce6b68a  c08749d7-e319-4f2f-b824-6155650ec5a0   

    event_name platform device_type                  timestamp  
0     PageView      ios       Phone 2025-01-29 07:39:16.192379  
1     Download      ios       Phone 2025-01-31 19:39:16.192379  
2      Install      ios       Phone 2025-02-03 03:39:16.192379  
3  HardPaywall      ios       Phone 2025-02-04 12:39:16.192379  
4     PageView      ios      Tablet 2025-01-25 00:32:27.921122  


In [7]:
# Write the five-step funnel dataset to 'user_events2.csv' as
# comma-separated UTF-8 text, leaving out the DataFrame index column.
df_user_events.to_csv(
    'user_events2.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [8]:
# Count the rows per event_name, i.e. how many events were generated for
# each funnel step.
df_user_events["event_name"].value_counts()

event_name
PageView       1000
Download        804
Install         727
HardPaywall     634
Purchase         68
Name: count, dtype: int64