In [1]:
# --- Standard Library ---

import os
import sys
import json

# --- Google Cloud Auth + APIs ---

from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core.exceptions import GoogleAPICallError, RetryError

from gspread_dataframe import set_with_dataframe

# --- Data & Visualization ---

import pandas as pd
import numpy as np
import openpyxl

# --- Local Modules ---

from modules.utilities import (
    pull_and_append,
    rebuild_data_json_from_backups,
    upload_named_dataframes_to_bq
)

from modules.flattening import (
    flatten_extract_params, 
    flatten_row,
    flatten_nested_column
)

from modules.cleaning import (
    apply_value_maps,
    safe_select_and_rename
)
# --- Lists and Maps ---

from modules.lists_and_maps import (
    df_column_names_map, 
    columns_to_drop,
    map_of_maps
    )

In [2]:
# --- Filesystem locations (relative to the notebook's working directory) ---
SERVICE_ACCOUNT_KEY = './keys/key.json'  # service-account key file used to build credentials
DATA_PATH = './data/data.json'           # merged event data read back by pd.read_json below
BACKUP_PATH = './backup/'                # handed to pull_and_append as backup_path

# --- Google Cloud identifiers ---
PROJECT_ID = "emojioracle-342f1"
DATASET_ID = "analytics_481352676"

# --- Credentials + BigQuery client ---
# OAuth scopes requested for the service-account credentials.
SCOPES = [
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive",
]
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY, scopes=SCOPES)
bq_client = bigquery.Client(project=PROJECT_ID, credentials=credentials)

In [3]:
# --- Main Execution ---
# Run the project helper that pulls/merges events, then wrap its result in a DataFrame.
pulled_events = pull_and_append(
    credentials=credentials,
    project_id=PROJECT_ID,
    dataset_id=DATASET_ID,
    data_path=DATA_PATH,
    backup_path=BACKUP_PATH,
)
raw_data = pd.DataFrame(pulled_events)

n_rows, n_cols = raw_data.shape
print(f"Data loaded with {n_rows} rows and {n_cols} columns.")

Loaded existing data.
Latest event_date in merged data: 20250707
Backup already exists: events_20250510
Backup already exists: events_20250511
Backup already exists: events_20250512
Backup already exists: events_20250516
Backup already exists: events_20250517
Backup already exists: events_20250518
Backup already exists: events_20250519
Backup already exists: events_20250520
Backup already exists: events_20250522
Backup already exists: events_20250528
Backup already exists: events_20250529
Backup already exists: events_20250530
Backup already exists: events_20250531
Backup already exists: events_20250602
Backup already exists: events_20250603
Backup already exists: events_20250604
Backup already exists: events_20250605
Backup already exists: events_20250607
Backup already exists: events_20250608
Backup already exists: events_20250609
Backup already exists: events_20250610
Backup already exists: events_20250612
Backup already exists: events_20250613
Backup already exists: events_20250617

In [4]:
# Rebuild data JSON from backups if necessary 

# rebuild_data_json_from_backups(BACKUP_PATH, DATA_PATH)

In [3]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Read the merged event file at DATA_PATH into the working DataFrame.
df = pd.read_json(path_or_buf=DATA_PATH)

In [5]:
# --- Flatten the DataFrame ---
# Each row (as a Series) is passed through the project's flatten_row helper and the
# results are reassembled into a new DataFrame. For details refer to the flattening module.
flattened_rows = []
for _, row in df.iterrows():
    flattened_rows.append(flatten_row(row))
df = pd.DataFrame(flattened_rows)

In [6]:
# Replace the literal '.' separator in flattened column names with '__'.
# regex=False is required: with regex matching (the default before pandas 2.0) the
# pattern '.' matches ANY character and every column name would be destroyed.
df.columns = df.columns.str.replace('.', '__', regex=False)

In [7]:
# --- Date and Time Cleanup and Transformation ---
# The incoming event_date is discarded and rebuilt from event_timestamp further down,
# in case the two do not agree.
df = df.drop(columns=['event_date'])

# Event and previous-event timestamps are parsed as microseconds since epoch, in UTC.
event_ts = pd.to_datetime(df['event_timestamp'], unit='us', utc=True)
previous_ts = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)

# Seconds elapsed between this event and the previous one.
df['time_delta'] = (event_ts - previous_ts).dt.total_seconds()

df['event_datetime'] = event_ts
df['event_previous_datetime'] = previous_ts
df['event_first_touch_datetime'] = pd.to_datetime(df['user_first_touch_timestamp'], unit='us', utc=True)
# First-open time is parsed as milliseconds (unit='ms'), unlike the columns above.
df['user__first_open_datetime'] = pd.to_datetime(df['user__first_open_time'], unit='ms', utc=True)

# Split every *_datetime column into a midnight-normalized date and a time-of-day.
# NOTE: this overwrites the raw 'user__first_open_time' values with time objects, so the
# cell cannot be re-run on its own output.
for prefix in ('event', 'event_previous', 'event_first_touch', 'user__first_open'):
    moment = df[f'{prefix}_datetime']
    df[f'{prefix}_date'] = moment.dt.normalize()
    df[f'{prefix}_time'] = moment.dt.time

# Unit conversions into new, explicitly named columns.
df['device__time_zone_offset_hours'] = df['device__time_zone_offset_seconds'].div(3600)          # seconds -> hours
df['event_params__engagement_time_seconds'] = df['event_params__engagement_time_msec'].div(1000)  # ms -> seconds
df['event_server_delay_seconds'] = df['event_server_timestamp_offset'].div(1000)                  # ms -> seconds
df['event_params__time_spent_seconds'] = df['event_params__time_spent']                           # copy under a clearer name





In [8]:
# --- Add Time-Based Features ---

WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _daytime_label(hour):
    """Return the (Turkish) part-of-day label for an hour of the day."""
    if hour < 6 or hour > 22:
        return 'Gece'
    if hour < 11:
        return 'Sabah'
    if hour < 14:
        return 'Öğle'
    if hour < 17:
        return 'Öğleden Sonra'
    return 'Akşam'


def _week_part_label(day_name):
    """Return the (Turkish) weekend/weekday label for an English weekday name."""
    if day_name in ['Saturday', 'Sunday']:
        return 'Hafta Sonu'
    return 'Hafta İçi'


# Weekday name as an ordered categorical (Monday first).
# NOTE(review): the weekday comes from event_datetime, while the hour below comes from the
# offset-adjusted ts_local_time, so the two can refer to different calendar days -- confirm
# this is intended.
df['ts_weekday'] = pd.Categorical(
    df['event_datetime'].dt.day_name(),
    categories=WEEKDAY_ORDER,
    ordered=True,
)

# Local time = event time shifted by the device offset (a missing offset counts as 0 hours).
tz_shift = pd.to_timedelta(df['device__time_zone_offset_hours'].fillna(0), unit='h')
df['ts_local_time'] = df['event_datetime'] + tz_shift
df['ts_hour'] = df['ts_local_time'].dt.hour

df['ts_daytime_named'] = df['ts_hour'].apply(_daytime_label)      # part of day
df['ts_is_weekend'] = df['ts_weekday'].apply(_week_part_label)    # weekend vs weekday
df['ts_weekday'] = df['ts_weekday'].astype(str)                   # back to plain strings

In [9]:
# --- Session Definition and Duration Calculation ---
#
# Sessions are inferred from the gaps between a user's consecutive events:
#   1. sort events by user and timestamp;
#   2. compute the time difference to the user's previous event;
#   3. start a new session whenever that gap exceeds SESSION_TIMEOUT (6 minutes)
#      or there is no previous event;
#   4. number the sessions with a running count of those session starts.

SESSION_TIMEOUT = pd.Timedelta(minutes=6)

# Per-user chronological order and gap to the previous event.
df_sorted = df.sort_values(by=['user_pseudo_id', 'event_datetime'])
df_sorted['time_diff'] = df_sorted.groupby('user_pseudo_id')['event_datetime'].diff()

# A row opens a new session when it has no predecessor or the gap is too long.
starts_new_session = df_sorted['time_diff'].isna() | (df_sorted['time_diff'] > SESSION_TIMEOUT)
df_sorted['inferred_session_id'] = starts_new_session.cumsum()

# Copy the ids back onto the original frame (aligned on the row index).
df['inferred_session_id'] = df_sorted['inferred_session_id']

# Session boundaries, and the duration as the span between first and last event.
session_events = df.groupby(['user_pseudo_id', 'inferred_session_id'])['event_datetime']
session_start = session_events.transform('min')
session_end = session_events.transform('max')

df['session_duration_seconds'] = (session_end - session_start).dt.total_seconds()
df['session_start_time'] = session_start
df['session_end_time'] = session_end



In [10]:
# Carry the last seen character name and current tier forward within each session.
# (Only the two columns in cols_to_fill are filled.)

session_keys = ['user_pseudo_id', 'inferred_session_id']
cols_to_fill = [
    'event_params__character_name',
    'event_params__current_tier',
]

# Step 1: chronological order inside every user-session group.
df_sorted = df.sort_values(by=session_keys + ['event_datetime'])

# Step 2: forward-fill per group, then write back (aligned on the row index).
filled_context = df_sorted.groupby(session_keys)[cols_to_fill].ffill()
df_sorted[cols_to_fill] = filled_context
df[cols_to_fill] = df_sorted[cols_to_fill]

In [11]:
# --- Question Index Clean-up ---
# Questions per tier:
#   Tier 1: 16 (12 for character 't')
#   Tier 2: 12
#   Tier 3: 12
#   Tier 4: 10
# The question index is derived as (questions in tier + 1) - current_qi.

name_col = 'event_params__character_name'
tier_col = 'event_params__current_tier'
qi_col = 'event_params__current_qi'
index_col = 'event_params__current_question_index'

# Start from an all-missing nullable integer column.
df[index_col] = pd.Series([pd.NA] * len(df), dtype="Int64")

# Coerce tier and qi to nullable integers (unparseable values become missing).
for numeric_col in (tier_col, qi_col):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').astype("Int64")

tier = df[tier_col]
is_t = df[name_col] == 't'
notna_mask = df[name_col].notna() & tier.notna() & df[qi_col].notna()

# (row selector, value the qi is subtracted from)
countdown_rules = [
    (notna_mask & (tier == 1) & is_t, 13),     # Tier 1, character 't'
    (notna_mask & (tier == 1) & ~is_t, 17),    # Tier 1, everyone else
    (notna_mask & tier.isin([2, 3]), 13),      # Tier 2 & 3
    (notna_mask & (tier == 4), 11),            # Tier 4
]
for rows, base in countdown_rules:
    df.loc[rows, index_col] = base - df.loc[rows, qi_col]

# Hiccups: complete rows whose tier is outside 1-4.
problems_mask = notna_mask & ~tier.isin([1, 2, 3, 4])
if problems_mask.any():
    print("Something wrong in:")
    print(df.loc[problems_mask, [name_col, tier_col, qi_col]])


In [12]:
# Cumulative question index: the per-tier index plus the number of questions in all
# earlier tiers (character 't' has a different offset than the other characters).

cum_col = 'cumulative_question_index'
df[cum_col] = df['event_params__current_question_index']

tier = df['event_params__current_tier']
is_t = df['event_params__character_name'].eq('t')
is_not_t = df['event_params__character_name'].ne('t')

# tier -> (offset for 't', offset for every other character)
tier_offsets = {
    2: (12, 16),
    3: (24, 28),
    4: (36, 40),
}
for tier_no, (t_offset, other_offset) in tier_offsets.items():
    in_tier = tier == tier_no
    df.loc[in_tier & is_t, cum_col] += t_offset
    df.loc[in_tier & is_not_t, cum_col] += other_offset

# Rows without a tier get no cumulative index.
df.loc[tier.isna(), cum_col] = pd.NA




In [13]:
# Break 'maze_hand_*' values of event_params__mini_game_ri into separate columns.
# Example value: 'maze_hand_WomanHandTwo_maze_level_3'

col = 'event_params__mini_game_ri'

# Rows whose value starts with 'maze_hand' (missing values never match).
mask = df[col].str.startswith('maze_hand', na=False)

# Underscore-separated tokens of the matching rows, one token per column.
maze_tokens = df.loc[mask, col].str.split('_', expand=True)

# Token 2 carries gender and hand (e.g. 'WomanHandTwo'); token 5 is taken as the level.
hand_fields = maze_tokens[2].str.extract(r'(?P<Gender>Woman|Man)Hand(?P<Hand>\w+)')

maze_columns = {
    'maze_gender': hand_fields['Gender'],
    'maze_hand': hand_fields['Hand'],
    'maze_level': maze_tokens[5],
}
for target_col, values in maze_columns.items():
    df.loc[mask, target_col] = values

In [14]:
# Split event_params__mini_game_ri 'buff_*' values into columns.
# e.g. 'buff_IncreaseXEnergy_gift_True_gold_False'
#
# Token layout after splitting on '_':
#   0='buff', 1=<buff type>, 2='gift', 3=<True|False>, 4='gold', 5=<True|False>

# Column to process
col = 'event_params__mini_game_ri'

# Rows whose value starts with 'buff' (missing values never match)
mask = df[col].str.startswith('buff', na=False)

# Underscore-separated tokens of the matching rows
parts = df.loc[mask, col].str.split('_', expand=True)

# The buff type is token 1. Token 2 is the literal label 'gift', so reading the type
# from it (as this cell previously did) stored 'gift' for every row.
df.loc[mask, 'buff_type'] = parts[1]

# Gift / gold flags: the tokens that follow the 'gift' and 'gold' labels, as booleans.
df.loc[mask, 'buff_gift'] = parts[3].str.lower() == 'true'
df.loc[mask, 'buff_gold'] = parts[5].str.lower() == 'true'

In [15]:
# Break 'earned_buff_*' values of event_params__mini_game_ri into a column.
# Example value: 'earned_buff_GiveXCharacter'

col = 'event_params__mini_game_ri'

# Rows whose value starts with 'earned_buff' (missing values never match).
mask = df[col].str.startswith('earned_buff', na=False)

# Tokens: 0='earned', 1='buff', 2=<buff type>.
earned_tokens = df.loc[mask, col].str.split('_', expand=True)
earned_type = earned_tokens[2].str.extract(r'(?P<BuffType>\w+)')

df.loc[mask, 'earned_buff_type'] = earned_type['BuffType']


In [16]:
# Move doll purchases out of event_params__spent_to into their own column.
# Example value: 'erjohndoll'

col = 'event_params__spent_to'

# Rows whose value contains 'doll' (missing values never match).
mask = df[col].str.contains('doll', na=False)

# Everything before the first 'doll' is the doll's name.
name_and_rest = df.loc[mask, col].str.split('doll', expand=True)
doll_names = name_and_rest[0].str.strip()
df.loc[mask, 'doll_name'] = doll_names

# Collapse the original value to the generic category label.
df.loc[mask, col] = 'Doll'



In [18]:
# Move crystal-ball purchases out of event_params__spent_to into their own column.
# Matched values: cauldron_item, aliginn_item, coffee_item

col = 'event_params__spent_to'

crystal_pattern = 'cauldron_item|aliginn_item|coffee_item'
mask = df[col].str.contains(crystal_pattern, na=False)

# The token before the first underscore is the item name (e.g. 'cauldron').
item_tokens = df.loc[mask, col].str.split('_', expand=True)
df.loc[mask, 'spent_in_crystal'] = item_tokens[0].str.strip()

# Collapse the original value to the generic category label.
df.loc[mask, col] = 'Crystal Ball'


In [20]:
# Copy permanent shop items from event_params__spent_to into shop_permanent_item.
# Matched values: dreamcatcher, catcollar, library1, library2, bugspray, schedule,
# crystal, horseshoe

col = 'event_params__spent_to'

permanent_pattern = 'dreamcatcher|catcollar|library1|library2|bugspray|schedule|crystal|horseshoe'

# Rows whose value contains any of the item names (missing values never match).
mask = df[col].str.contains(permanent_pattern, na=False)

# Keep the first matching item name in its own column.
matched_item = df.loc[mask, col].str.extract(f'({permanent_pattern})', expand=False)
df.loc[mask, 'shop_permanent_item'] = matched_item

# Collapse the original value to the generic category label.
df.loc[mask, col] = 'Permanent Item'


In [21]:
# Copy consumable shop items from event_params__spent_to into shop_consumable_item.
# Matched values: potion, ıncense (dotless i), amulet, incense

col = 'event_params__spent_to'

consumable_pattern = 'potion|ıncense|amulet|incense'

# Rows whose value contains any of the item names (missing values never match).
mask = df[col].str.contains(consumable_pattern, na=False)

# Keep the first matching item name in its own column.
matched_item = df.loc[mask, col].str.extract(f'({consumable_pattern})', expand=False)
df.loc[mask, 'shop_consumable_item'] = matched_item

# Collapse the original value to the generic category label.
df.loc[mask, col] = 'Consumable Item'

In [22]:
# Copy the remaining event_params__spent_to values for board spending into board_item:
# everything spent on the board that is not already one of the category labels
# ['Doll', 'Crystal Ball', 'Permanent Item', 'Consumable Item'].

col = 'event_params__spent_to'

known_categories = ['Doll', 'Crystal Ball', 'Permanent Item', 'Consumable Item']
spent_on_board = df['event_params__where_its_spent'].isin(['board', 'board_item'])
mask = spent_on_board & ~df[col].isin(known_categories)

# Preserve the original value, then collapse it to the generic category label.
df.loc[mask, 'board_item'] = df.loc[mask, col]
df.loc[mask, col] = 'Board Item'

In [23]:
# Remove every column listed in columns_to_drop (imported from modules.lists_and_maps).
df = df.drop(columns_to_drop, axis='columns')

In [24]:
# Explicit right/wrong label for questions (vectorized instead of a row-wise apply):
#   'Doğru'  - a question_completed event with no answered_wrong value
#   'Yanlış' - answered_wrong equals 1
#   <NA>     - everything else

answered_wrong = df['event_params__answered_wrong']

is_correct = df['event_name'].eq('question_completed') & answered_wrong.isna()
# `& notna()` keeps the mask free of missing values even for nullable dtypes.
is_wrong = answered_wrong.eq(1) & answered_wrong.notna()

df['question_correct_incorrect'] = pd.Series(pd.NA, index=df.index, dtype='object')
df.loc[is_wrong, 'question_correct_incorrect'] = 'Yanlış'
df.loc[is_correct, 'question_correct_incorrect'] = 'Doğru'


In [29]:
# Apply the value maps in map_of_maps through the project helper from modules.cleaning.
# NOTE(review): keep_unmapped=True presumably leaves values without a mapping unchanged --
# confirm against the helper's implementation.
df = apply_value_maps(df, map_of_maps, keep_unmapped=True)

In [28]:
# Create an addressable question label: '<character> - T: <tier> - Q: <question index>'.
# The label is only kept where all three parts are present. Without this guard,
# astype(str) turns a missing tier/index into literal text (e.g. '<NA>') inside the address.
address_parts_present = (
    df['event_params__character_name'].notna()
    & df['event_params__current_tier'].notna()
    & df['event_params__current_question_index'].notna()
)
df['question_address'] = (
    df['event_params__character_name']
    + ' - T: ' + df['event_params__current_tier'].astype(str)
    + ' - Q: ' + df['event_params__current_question_index'].astype(str)
).where(address_parts_present)

In [31]:
# Create user_metrics: one row per user with activity span, counts and retention flags.

# Make sure event_datetime is a datetime column (unparseable values become NaT).
df['event_datetime'] = pd.to_datetime(df['event_datetime'], errors='coerce')

# Group by user and calculate user-level metrics
user_metrics = (
    df.groupby('user_pseudo_id')
    .agg(
        first_seen=('event_datetime', 'min'),
        last_seen=('event_datetime', 'max'),
        total_sessions=('inferred_session_id', 'nunique'),
        total_events=('event_name', 'count'),
    )
    .reset_index()
)

# User lifetime in whole days (rounded up); nullable int type for BQ/LS compatibility
lifetime_seconds = (user_metrics['last_seen'] - user_metrics['first_seen']).dt.total_seconds()
user_metrics['lifetime_days'] = np.ceil(lifetime_seconds / 86400).astype('Int64')

# Churn flag based on the 80th percentile of days since the user's last event, measured
# against the most recent event in the data set.
# (Previously the threshold was applied to lifetime_days, which flagged the users with
# the LONGEST activity span as churned and left reference_date unused.)
reference_date = df['event_datetime'].max()
days_since_last_seen = (reference_date - user_metrics['last_seen']).dt.total_seconds() / 86400

threshold = days_since_last_seen.quantile(0.80)
user_metrics['is_churned'] = days_since_last_seen > threshold

# Retention buckets (for visualization or filtering in LS).
# Bins are right-closed: (-1, 0], (0, 1], (1, 3], (3, 7], (7, 14], (14, 30], (30, 90], (90, inf).
user_metrics['retention_bucket'] = pd.cut(
    user_metrics['lifetime_days'],
    bins=[-1, 0, 1, 3, 7, 14, 30, 90, float('inf')],
    labels=[
        '0_0d',
        '1_1d',
        '2_1-3d',
        '3_4-7d',
        '4_8-14d',
        '5_15-30d',
        '6_31-90d',
        '7_90+d',
    ],
)

# Active/returning user flags
user_metrics['is_retained_1d'] = user_metrics['lifetime_days'] >= 1
user_metrics['is_retained_7d'] = user_metrics['lifetime_days'] >= 7
user_metrics['is_retained_30d'] = user_metrics['lifetime_days'] >= 30

In [34]:
# Frequency of each doll name (missing values are not counted).
df.loc[:, 'doll_name'].value_counts()

doll_name
T            21
C Jay         3
Lady Dodo     2
joe           2
Sinnct        1
D Lion        1
Army          1
ER John       1
Mustafa       1
Name: count, dtype: int64

In [33]:
# Select and rename columns according to df_column_names_map using the project helper
# from modules.cleaning. NOTE(review): how columns missing from the frame are handled is
# decided inside the helper -- confirm there.
df = safe_select_and_rename(df, df_column_names_map)

In [62]:
# Preview the first rows that carry a question index.
df.loc[df['current_question_index'].notna()].head()

Unnamed: 0,event_name,event_batch_id,user_pseudo_id,stream_identifier,platform,is_active_user,event_index_in_batch,event_datetime,event_date,event_time,previous_event_date,previous_event_time,first_touch_date,first_touch_time,time_since_previous_event,weekday,local_time,hour_of_day,time_of_day,is_weekend,time_zone_offset_hours,server_delay_seconds,session_id,session_number,firebase_event_origin,engaged_session_event,session_was_engaged,user_session_count,user_session_id,inferred_session_id,session_duration_seconds,session_start_time,session_end_time,screen_id,screen_class,menu_name,entrances_count,device_type,mobile_brand,mobile_model,device_hardware_model,operating_system,os_version,advertising_id,device_language,ad_tracking_limited,city,country,continent,region,subcontinent,metro_area,app_id,app_version,install_store,firebase_app_id,app_install_source,consent_analytics_storage,consent_ads_storage,ad_platform,ad_shown_in,ad_unit_id,ad_network,ad_format,conversion_event,firebase_error,fatal_error,raw_timestamp,engagement_time_seconds,time_spent_on_activity_seconds,maze_gender,maze_hand,maze_level,buff_type,buff_gift,buff_gold,earned_buff_type,original_qi,question_address,character_name,current_tier,current_question_index,cumulative_question_index,event_params__mini_game_ri,mini_game_name,answered_incorrectly,where_currency_was_earned,currency_name,currency_earned,how_currency_was_earned,currency_spent,where_currency_was_spent,spent_on,doll_name,spent_in_crystal,permanent_item,consumable_item,board_item,user_first_open_time,user_first_open_date,previous_first_open_count
918,Question Started,41,190d35a9fa00e1dbd9b484cb88e372ef,10359646141,ANDROID,True,4,2025-04-05 15:11:40.392000+00:00,2025-04-05 00:00:00+00:00,15:11:40.392000,2025-04-04 00:00:00+00:00,11:08:48.258000,2025-04-04 00:00:00+00:00,07:55:36.277000,100972.134,Cumartesi,2025-04-05 18:11:40.392000+00:00,18,Akşam,Hafta Sonu,3.0,591.592,1743865858,3.0,app,1.0,,3.0,1743866000.0,100,89.708012,2025-04-05 15:10:58.907000+00:00,2025-04-05 15:12:28.615012+00:00,8.710409e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,1.0,,,,,,,,,,,,,15,Mi - T: 1 - Q: 2,Mi,1,2,2,,,,,,,,,,,,,,,,08:00:00,2025-04-04 00:00:00+00:00,
919,Question Started,41,190d35a9fa00e1dbd9b484cb88e372ef,10359646141,ANDROID,True,9,2025-04-05 15:11:54.658001+00:00,2025-04-05 00:00:00+00:00,15:11:54.658001,2025-04-05 00:00:00+00:00,15:11:40.392001,2025-04-04 00:00:00+00:00,07:55:36.277000,14.266,Cumartesi,2025-04-05 18:11:54.658001+00:00,18,Akşam,Hafta Sonu,3.0,591.592,1743865858,3.0,app,1.0,,3.0,1743866000.0,100,89.708012,2025-04-05 15:10:58.907000+00:00,2025-04-05 15:12:28.615012+00:00,8.710409e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,1.0,,,,,,,,,,,,,14,Mi - T: 1 - Q: 3,Mi,1,3,3,,,,,,,,,,,,,,,,08:00:00,2025-04-04 00:00:00+00:00,
920,Question Started,41,190d35a9fa00e1dbd9b484cb88e372ef,10359646141,ANDROID,True,12,2025-04-05 15:12:19.735002+00:00,2025-04-05 00:00:00+00:00,15:12:19.735002,2025-04-05 00:00:00+00:00,15:11:54.658002,2025-04-04 00:00:00+00:00,07:55:36.277000,25.077,Cumartesi,2025-04-05 18:12:19.735002+00:00,18,Akşam,Hafta Sonu,3.0,591.592,1743865858,3.0,app,1.0,,3.0,1743866000.0,100,89.708012,2025-04-05 15:10:58.907000+00:00,2025-04-05 15:12:28.615012+00:00,8.710409e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,1.0,,,,,,,,,,,,,13,Mi - T: 1 - Q: 4,Mi,1,4,4,,,,,,,,,,,,,,,,08:00:00,2025-04-04 00:00:00+00:00,
921,Question Started,2390,ddf2acb862daf1f5e1e9dff579ec87d7,10359646141,ANDROID,True,42,2025-04-05 15:56:20.589008+00:00,2025-04-05 00:00:00+00:00,15:56:20.589008,2025-04-05 00:00:00+00:00,15:54:29.291008,2025-04-04 00:00:00+00:00,21:10:44.888000,111.298,Cumartesi,2025-04-05 18:56:20.589008+00:00,18,Akşam,Hafta Sonu,3.0,701.732,1743868238,6.0,app,1.0,,6.0,1743868000.0,270,1242.749,2025-04-05 15:50:38.231000+00:00,2025-04-05 16:11:20.980000+00:00,-1.397333e+18,UnityPlayerActivity,,,mobile,Samsung,SM-A525F,SM-A525F,Android,Android 14,,tr-tr,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,1.0,,,,,,,,,,,,,12,Mo - T: 3 - Q: 1,Mo,3,1,29,,,,,,,,,,,,,,,,22:00:00,2025-04-04 00:00:00+00:00,
922,Question Started,2451,ddf2acb862daf1f5e1e9dff579ec87d7,10359646141,ANDROID,True,9,2025-04-05 16:01:58.282001+00:00,2025-04-05 00:00:00+00:00,16:01:58.282001,2025-04-05 00:00:00+00:00,16:01:13.850001,2025-04-04 00:00:00+00:00,21:10:44.888000,44.432,Cumartesi,2025-04-05 19:01:58.282001+00:00,19,Akşam,Hafta Sonu,3.0,701.733,1743868238,6.0,app,1.0,,6.0,1743868000.0,270,1242.749,2025-04-05 15:50:38.231000+00:00,2025-04-05 16:11:20.980000+00:00,-1.397333e+18,UnityPlayerActivity,,,mobile,Samsung,SM-A525F,SM-A525F,Android,Android 14,,tr-tr,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,1.0,,,,,,,,,,,,,15,ER John - T: 1 - Q: 2,ER John,1,2,2,,,,,,,,,,,,,,,,22:00:00,2025-04-04 00:00:00+00:00,


In [29]:
# Inspect the column names of the cleaned frame.
df.columns

Index(['event_name', 'event_batch_id', 'user_pseudo_id', 'stream_identifier',
       'platform', 'is_active_user', 'event_index_in_batch', 'event_datetime',
       'event_date', 'event_time',
       ...
       'where_currency_was_spent', 'spent_on', 'doll_name', 'spent_in_crystal',
       'permanent_item', 'consumable_item', 'board_item',
       'user_first_open_time', 'user_first_open_date',
       'previous_first_open_count'],
      dtype='object', length=101)

In [61]:
# Write both result frames to CSV, without the row index.
csv_exports = (
    ('./data/cleaned_data.csv', df),
    ('./data/user_metrics.csv', user_metrics),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)

In [30]:
# Distinct event names, in order of first appearance.
pd.unique(df['event_name'])

array(['Ad Clicked', 'App Removed', 'First Open', 'Menu Closed',
       'Menu Opened', 'Screen Viewed', 'Ad Impression', 'Session Started',
       'App Data Cleared', 'User Engagement', 'Question Started',
       'Mini-game Started', 'Question Completed', 'Mini-game Completed',
       'Earned Virtual Currency', 'Spent Virtual Currency',
       'Mini-game Failed', 'App Exception'], dtype=object)

In [31]:
# --- Upload Data to BigQuery ---

# Fully qualified id of the main table (defined here but not passed to the upload call).
main_table_id = ".".join([PROJECT_ID, DATASET_ID, "clean_data"])

# Frames to upload, keyed by the name handed to the upload helper.
frames_to_upload = {
    "MainCleanData": df,
    "UserMetrics": user_metrics,
}

upload_named_dataframes_to_bq(
    dataframes=frames_to_upload,
    dataset_id=DATASET_ID,
    project_id=PROJECT_ID,
    bq_client=bq_client,
)


DataFrame 'MainCleanData' uploaded successfully to emojioracle-342f1.analytics_481352676.MainCleanData
DataFrame 'UserMetrics' uploaded successfully to emojioracle-342f1.analytics_481352676.UserMetrics


In [None]:
# Keep a timezone-aware copy, then strip the timezone from every tz-aware datetime column
# of df in place before writing the workbook.
dftz = df.copy()

tz_aware_columns = df.select_dtypes(include=['datetimetz']).columns
for col in tz_aware_columns:
    df[col] = df[col].dt.tz_localize(None)

df.to_excel('./data/cleaned_data.xlsx', index=False)

In [30]:
# Per-column dtype and non-null summary, with deep (object-aware) memory accounting.
df.info(max_cols=1000, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24587 entries, 0 to 24586
Data columns (total 101 columns):
 #    Column                          Non-Null Count  Dtype              
---   ------                          --------------  -----              
 0    event_name                      24587 non-null  object             
 1    event_batch_id                  24587 non-null  int64              
 2    user_pseudo_id                  24587 non-null  object             
 3    stream_identifier               24587 non-null  int64              
 4    platform                        24587 non-null  object             
 5    is_active_user                  24587 non-null  bool               
 6    event_index_in_batch            24587 non-null  int64              
 7    event_datetime                  24587 non-null  datetime64[ns, UTC]
 8    event_date                      24587 non-null  datetime64[ns, UTC]
 9    event_time                      24587 non-null  object             
 1

In [66]:
# Number of events per app version.
df.loc[:, 'app_version'].value_counts()

app_version
0.1.0    24587
Name: count, dtype: int64

In [56]:
# Distinct stream identifiers present in the data.
pd.unique(df['stream_identifier'])

array([10359646141])

In [51]:
# Scratch check: list the renamed ad-related column names.
dict(
    event_params__ad_platform='ad_platform',
    event_params__ad_shown_where='ad_shown_in',
    event_params__ad_unit_id='ad_unit_id',
    event_params__ad_network='ad_network',
    event_params__ad_format='ad_format',
    event_params__firebase_conversion='conversion_event',
).values()

dict_values(['ad_platform', 'ad_shown_in', 'ad_unit_id', 'ad_network', 'ad_format', 'conversion_event'])

In [32]:
# Last rows of the 'User Engagement' events.
df.loc[df['event_name'].eq('User Engagement')].tail()

Unnamed: 0,event_name,event_batch_id,user_pseudo_id,stream_identifier,platform,is_active_user,event_index_in_batch,event_datetime,event_date,event_time,previous_event_date,previous_event_time,first_touch_date,first_touch_time,time_since_previous_event,weekday,local_time,hour_of_day,time_of_day,is_weekend,time_zone_offset_hours,server_delay_seconds,session_id,session_number,firebase_event_origin,engaged_session_event,session_was_engaged,user_session_count,user_session_id,inferred_session_id,session_duration_seconds,session_start_time,session_end_time,screen_id,screen_class,menu_name,entrances_count,device_type,mobile_brand,mobile_model,device_hardware_model,operating_system,os_version,advertising_id,device_language,ad_tracking_limited,city,country,continent,region,subcontinent,metro_area,app_id,app_version,install_store,firebase_app_id,app_install_source,consent_analytics_storage,consent_ads_storage,ad_platform,ad_shown_in,ad_unit_id,ad_network,ad_format,conversion_event,firebase_error,fatal_error,raw_timestamp,engagement_time_seconds,time_spent_on_activity_seconds,maze_gender,maze_hand,maze_level,buff_type,buff_gift,buff_gold,earned_buff_type,original_qi,question_address,character_name,current_tier,current_question_index,cumulative_question_index,mini_game_round_index,mini_game_name,answered_incorrectly,question_correct_incorrect,where_currency_was_earned,currency_name,currency_earned,how_currency_was_earned,currency_spent,where_currency_was_spent,spent_on,doll_name,spent_in_crystal,permanent_item,consumable_item,board_item,user_first_open_time,user_first_open_date,previous_first_open_count
23457,User Engagement,1588,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,5,2025-06-24 16:28:25.462004+00:00,2025-06-24 00:00:00+00:00,16:28:25.462004,2025-06-24 00:00:00+00:00,16:27:44.801004,2025-04-09 00:00:00+00:00,09:32:18.415000,40.661,Salı,2025-06-24 19:28:25.462004+00:00,19,Akşam,Hafta İçi,3.0,1665.043,1750780551,11.0,auto,1.0,,11.0,1750781000.0,198,3328.381011,2025-06-24 15:55:51.063000+00:00,2025-06-24 16:51:19.444011+00:00,5.431278e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,,,,,22.268,,,,,,,,,,C Jay - T: 2 - Q: 4,C Jay,2.0,4.0,,,,,,,,,,,,,,,,,,10:00:00,2025-04-09 00:00:00+00:00,
23549,User Engagement,1715,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,120,2025-06-24 16:48:46.272120+00:00,2025-06-24 00:00:00+00:00,16:48:46.272120,2025-06-24 00:00:00+00:00,16:28:25.462120,2025-04-09 00:00:00+00:00,09:32:18.415000,1220.81,Salı,2025-06-24 19:48:46.272120+00:00,19,Akşam,Hafta İçi,3.0,1665.044,1750780551,11.0,auto,1.0,,11.0,1750781000.0,198,3328.381011,2025-06-24 15:55:51.063000+00:00,2025-06-24 16:51:19.444011+00:00,-5.806733e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,,,,,1021.262,,,,,,,,,,C Jay - T: 3 - Q: 8,C Jay,3.0,8.0,,,,,,,,,,,,,,,,,,10:00:00,2025-04-09 00:00:00+00:00,
23553,User Engagement,1715,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,126,2025-06-24 16:49:45.278126+00:00,2025-06-24 00:00:00+00:00,16:49:45.278126,2025-06-24 00:00:00+00:00,16:48:46.272126,2025-04-09 00:00:00+00:00,09:32:18.415000,59.006,Salı,2025-06-24 19:49:45.278126+00:00,19,Akşam,Hafta İçi,3.0,1665.044,1750780551,11.0,auto,1.0,,11.0,1750781000.0,198,3328.381011,2025-06-24 15:55:51.063000+00:00,2025-06-24 16:51:19.444011+00:00,-3.135578e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,,,,,55.665,,,,,,,,,,C Jay - T: 3 - Q: 10,C Jay,3.0,10.0,,,,,,,,,,,,,,,,,,10:00:00,2025-04-09 00:00:00+00:00,
23563,User Engagement,1727,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,12,2025-06-24 16:51:19.444011+00:00,2025-06-24 00:00:00+00:00,16:51:19.444011,2025-06-24 00:00:00+00:00,16:49:45.278011,2025-04-09 00:00:00+00:00,09:32:18.415000,94.166,Salı,2025-06-24 19:51:19.444011+00:00,19,Akşam,Hafta İçi,3.0,1665.045,1750780551,11.0,auto,1.0,,11.0,1750781000.0,198,3328.381011,2025-06-24 15:55:51.063000+00:00,2025-06-24 16:51:19.444011+00:00,-3.135578e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,,,,,72.907,,,,,,,,,,C Jay - T: 3 - Q: 12,C Jay,3.0,12.0,,,,,,,,,,,,,,,,,,10:00:00,2025-04-09 00:00:00+00:00,
23564,User Engagement,1728,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,1,2025-06-24 17:20:43.536000+00:00,2025-06-24 00:00:00+00:00,17:20:43.536000,2025-06-24 00:00:00+00:00,16:51:19.444000,2025-04-09 00:00:00+00:00,09:32:18.415000,1764.092,Salı,2025-06-24 20:20:43.536000+00:00,20,Akşam,Hafta İçi,3.0,1665.046,1750780551,11.0,auto,1.0,,11.0,1750781000.0,199,0.0,2025-06-24 17:20:43.536000+00:00,2025-06-24 17:20:43.536000+00:00,-3.135578e+18,UnityPlayerActivity,,,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,Yes,Yes,,,,,,,,,,3.166,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10:00:00,2025-04-09 00:00:00+00:00,


In [106]:
# Distinct event names, in order of first appearance.
pd.unique(df['event_name'])

array(['Ad Clicked', 'App Removed', 'First Open', 'Menu Closed',
       'Menu Opened', 'Screen Viewed', 'Ad Impression', 'Session Started',
       'App Data Cleared', 'User Engagement', 'Question Started',
       'Mini-game Started', 'Question Completed', 'Mini-game Completed',
       'Earned Virtual Currency', 'Spent Virtual Currency',
       'Mini-game Failed', 'App Exception'], dtype=object)