In [1]:
print("Starting data update process...")

# --- Standard Library ---
import os
import sys
import json
# --- Google Cloud Auth + APIs ---
from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core.exceptions import GoogleAPICallError, RetryError
from gspread_dataframe import set_with_dataframe
# --- Data & Visualization ---
import pandas as pd
import numpy as np
import openpyxl
# --- Local Modules ---
from modules.utilities import (
    pull_and_append,
    rebuild_data_json_from_backups,
    upload_named_dataframes_to_bq,
    convert_bool_to_int
)
from modules.flattening import (
    flatten_extract_params, 
    flatten_row,
    flatten_nested_column
)
from modules.cleaning import (
    apply_value_maps,
    safe_select_and_rename
)
# --- Lists and Maps ---
from modules.lists_and_maps import (
    df_column_names_map, 
    columns_to_drop,
    map_of_maps,
    df_splits,
    df_filters
    )
print("Imports completed successfully.")

Starting data update process...
Imports completed successfully.


In [2]:
# --- Configuration: file locations and BigQuery identifiers ---
SERVICE_ACCOUNT_KEY = './keys/key.json'
DATA_PATH = './data/data.json'
BACKUP_PATH = './backup/'
PROJECT_ID = "emojioracle-342f1"
DATASET_ID = "analytics_481352676"

# Stop right away when the service-account key file is missing: nothing below
# can authenticate without it.
key_is_present = os.path.exists(SERVICE_ACCOUNT_KEY)
if not key_is_present:
    print(f"Service account key not found at {SERVICE_ACCOUNT_KEY}. Please check the path, or download a new json key file.")
    sys.exit(1)
print("Paths set up successfully.")

# --- Credentials and BigQuery client ---
# OAuth scopes requested for the service account (BigQuery, Sheets, Drive).
SCOPES = [
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive"
]
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY, scopes=SCOPES)
bq_client = bigquery.Client(project=PROJECT_ID, credentials=credentials)
print("BigQuery client initialized successfully.")

Paths set up successfully.
BigQuery client initialized successfully.


In [None]:
# --- Main Execution ---
# Call the project helper pull_and_append with the credentials, BigQuery
# identifiers and local data/backup paths, and wrap whatever it returns in a
# DataFrame.
pull_kwargs = {
    'credentials': credentials,
    'project_id': PROJECT_ID,
    'dataset_id': DATASET_ID,
    'data_path': DATA_PATH,
    'backup_path': BACKUP_PATH,
}
raw_data = pd.DataFrame(pull_and_append(**pull_kwargs))
print(f"Data loaded with {len(raw_data)} rows and {len(raw_data.columns)} columns.")

In [3]:
# Read the JSON file at DATA_PATH into the working DataFrame.
df = pd.read_json(DATA_PATH)
n_rows, n_cols = df.shape
print(f"Data loaded into DataFrame with {n_rows} rows and {n_cols} columns.")

Data loaded into DataFrame with 31284 rows and 30 columns.


In [4]:
# --- Flatten the DataFrame ---
# Every row is passed through flatten_row (imported from modules.flattening) and
# the results are reassembled into a new DataFrame.
# NOTE(review): the earlier comment pointed to ./modules/flattening_json.py for
# the flattening details - confirm which module path is current.
flattened_rows = [flatten_row(record) for _, record in df.iterrows()]
df = pd.DataFrame(flattened_rows)
n_rows, n_cols = df.shape
print(f"Data flattened to {n_rows} rows and {n_cols} columns.")

Data flattened to 31284 rows and 92 columns.


In [5]:
# Replace the '.' separators in the flattened column names with '__'.
# regex=False makes the dot a literal character on every pandas version: the
# default of `regex` changed between pandas 1.x and 2.x, and as a regular
# expression '.' would match any character.
df.columns = df.columns.str.replace('.', '__', regex=False)
print(f"Column names updated to use '__' instead of '.' - now {df.shape[1]} columns.")

Column names updated to use '__' instead of '.' - now 92 columns.


In [6]:
# --- Date and Time Cleanup and Transformation ---
# Drop any existing event_date column; it is re-derived from event_timestamp
# below so the two can never disagree.
df = df.drop(columns=['event_date'], errors='ignore')

# Parse the microsecond epoch timestamps once, as UTC-aware datetimes.
event_ts = pd.to_datetime(df['event_timestamp'], unit='us', utc=True)
previous_ts = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)

# Seconds elapsed between this event and the previous one.
df['time_delta'] = (event_ts - previous_ts).dt.total_seconds()

df['event_datetime'] = event_ts
df['event_previous_datetime'] = previous_ts
df['event_first_touch_datetime'] = pd.to_datetime(df['user_first_touch_timestamp'], unit='us', utc=True)
# user__first_open_time is parsed as a millisecond epoch value.
df['user__first_open_datetime'] = pd.to_datetime(df['user__first_open_time'], unit='ms', utc=True)

# Split every *_datetime column into a midnight-normalised date and a
# time-of-day column. For 'user__first_open' this overwrites the raw
# user__first_open_time value with the time of day.
for prefix in ('event', 'event_previous', 'event_first_touch', 'user__first_open'):
    stamps = df[f'{prefix}_datetime']
    df[f'{prefix}_date'] = stamps.dt.normalize()
    df[f'{prefix}_time'] = stamps.dt.time

# Unit conversions.
df['device__time_zone_offset_hours'] = df['device__time_zone_offset_seconds'] / 3600  # seconds -> hours
df['event_params__engagement_time_seconds'] = df['event_params__engagement_time_msec'] / 1000  # ms -> seconds
df['event_server_delay_seconds'] = df['event_server_timestamp_offset'] / 1000  # ms -> seconds
df['event_params__time_spent_seconds'] = df['event_params__time_spent']  # copied under a clearer name
print("Date and time cleanup and transformation completed successfully.")

Date and time cleanup and transformation completed successfully.


In [7]:
# --- Add Time-Based Features ---
WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday',
                 'Sunday']


def _daytime_label(hour):
    """Return the day-part label for an hour of the day (0-23)."""
    if hour < 6 or hour > 22:
        return 'Gece'
    if hour < 11:
        return 'Sabah'
    if hour < 14:
        return 'Öğle'
    if hour < 17:
        return 'Öğleden Sonra'
    return 'Akşam'


def _week_part_label(day_name):
    """Return the weekend / weekday label for an English weekday name."""
    return 'Hafta Sonu' if day_name in ['Saturday', 'Sunday'] else 'Hafta İçi'


# Weekday name taken from the UTC event_datetime, held as an ordered categorical
# for the steps below.
weekday_names = df['event_datetime'].dt.day_name()
df['ts_weekday'] = pd.Categorical(weekday_names, categories=WEEKDAY_ORDER, ordered=True)

# Shift the UTC timestamp by the device offset (a missing offset counts as 0 hours)
# to approximate the local wall-clock time, then take its hour.
offset_hours = df['device__time_zone_offset_hours'].fillna(0)
df['ts_local_time'] = df['event_datetime'] + pd.to_timedelta(offset_hours, unit='h')
df['ts_hour'] = df['ts_local_time'].dt.hour

df['ts_daytime_named'] = df['ts_hour'].apply(_daytime_label)  # time group of day
df['ts_is_weekend'] = df['ts_weekday'].apply(_week_part_label)

# Store the weekday as a plain string for consistency.
df['ts_weekday'] = df['ts_weekday'].astype(str)
print("Time-based features added successfully.")

Time-based features added successfully.


In [8]:
# --- Session Definition and Duration Calculation ---
# Sessions are inferred from the gaps between consecutive events of a user:
#   1. sort events by user and timestamp;
#   2. take the time difference to the previous event of the same user;
#   3. start a new session on a user's first event or after a gap longer
#      than SESSION_TIMEOUT (6 minutes);
#   4. number the sessions with a running count of those session starts.
SESSION_TIMEOUT = pd.Timedelta(minutes=6)

df_sorted = df.sort_values(by=['user_pseudo_id', 'event_datetime'])
df_sorted['time_diff'] = df_sorted.groupby('user_pseudo_id')['event_datetime'].diff()

# A missing time_diff marks the first event of a user.
starts_new_session = (df_sorted['time_diff'] > SESSION_TIMEOUT) | df_sorted['time_diff'].isna()
df_sorted['inferred_session_id'] = starts_new_session.cumsum()

# Carry the ids back to the original (unsorted) row order.
df['inferred_session_id'] = df_sorted['inferred_session_id'].loc[df.index]

# Event timestamps grouped per (user, session); reused for all session-level values.
session_events = df.groupby(['user_pseudo_id', 'inferred_session_id'])['event_datetime']


def _span_seconds(timestamps):
    """Seconds between the earliest and the latest timestamp of one session."""
    return (timestamps.max() - timestamps.min()).total_seconds()


# Session duration in seconds, minutes and hours.
df['session_duration_seconds'] = session_events.transform(_span_seconds).round(3)
df['session_duration_minutes'] = (df['session_duration_seconds'] / 60).round(2)
df['session_duration_hours'] = (df['session_duration_seconds'] / 3600).round(3)

# Session start and end times.
df['session_start_time'] = session_events.transform('min')
df['session_end_time'] = session_events.transform('max')
print(f"Session IDs assigned and durations calculated for {df['inferred_session_id'].nunique()} unique sessions.")

Session IDs assigned and durations calculated for 381 unique sessions.


In [9]:
# Carry the last seen character name, tier and question index forward to the
# following events of the same user session.
session_keys = ['user_pseudo_id', 'inferred_session_id']
cols_to_fill = [
    'event_params__character_name',
    'event_params__current_tier',
    'event_params__current_qi',
]

# Chronological order inside each session, so "forward" means "later in time".
df_sorted = df.sort_values(by=session_keys + ['event_datetime'])

# Forward-fill per session, then write the filled values back by index label.
filled_values = df_sorted.groupby(session_keys)[cols_to_fill].ffill()
df_sorted[cols_to_fill] = filled_values
df.loc[df_sorted.index, cols_to_fill] = df_sorted[cols_to_fill]
print(f"Character names, tiers, and question indices forward-filled for {df['inferred_session_id'].nunique()} unique sessions.")

Character names, tiers, and question indices forward-filled for 381 unique sessions.


In [10]:
# --- Question Index Clean-up ---
# Questions per tier:
#   Tier 1: 16 questions (12 for character 't')
#   Tier 2: 12 questions
#   Tier 3: 12 questions
#   Tier 4: 10 questions
# The index is computed as (questions in tier + 1) - current_qi.
# NOTE(review): this presumably means current_qi counts down within a tier - confirm.
name_col = 'event_params__character_name'
tier_col = 'event_params__current_tier'
qi_col = 'event_params__current_qi'
index_col = 'event_params__current_question_index'

df[index_col] = pd.NA
for numeric_col in (tier_col, qi_col):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').astype("Int64")

# Only rows that have a character name, a tier and a qi value get an index.
notna_mask = df[name_col].notna() & df[tier_col].notna() & df[qi_col].notna()

tier_1_mask = notna_mask & (df[tier_col] == 1)
t_char_mask = tier_1_mask & (df[name_col] == 't')
tier_2_3_mask = notna_mask & df[tier_col].isin([2, 3])
tier_4_mask = notna_mask & (df[tier_col] == 4)

# (row selector, questions in tier + 1); the selectors do not overlap.
index_rules = [
    (t_char_mask, 13),                 # tier 1, character 't'
    (~t_char_mask & tier_1_mask, 17),  # tier 1, every other character
    (tier_2_3_mask, 13),               # tiers 2 and 3
    (tier_4_mask, 11),                 # tier 4
]
for rule_mask, questions_plus_one in index_rules:
    df.loc[rule_mask, index_col] = questions_plus_one - df.loc[rule_mask, qi_col]

# Report complete rows whose tier is outside 1-4; they keep a missing index.
problems_mask = notna_mask & ~df[tier_col].isin([1, 2, 3, 4])
if problems_mask.any():
    print("Something wrong in:")
    print(df.loc[problems_mask, [name_col, tier_col, qi_col]])
print(f"Question index cleaned up for {df['event_params__current_question_index'].notna().sum()} rows.")

Question index cleaned up for 30052 rows.


In [11]:
# Cumulative question index: the in-tier index plus the number of questions in
# all earlier tiers. Character 't' has 12 tier-1 questions instead of 16, so its
# offsets are 4 lower.
cumulative_col = 'cumulative_question_index'
df[cumulative_col] = pd.to_numeric(df['event_params__current_question_index'].copy(), errors='coerce')

tier_values = df['event_params__current_tier']
# tier -> (offset for character 't', offset for every other character)
tier_offsets = {
    2: (12, 16),
    3: (24, 28),
    4: (36, 40),
}
for tier_number, (t_offset, other_offset) in tier_offsets.items():
    in_tier = tier_values == tier_number
    df.loc[in_tier & (df['event_params__character_name'] == 't'), cumulative_col] += t_offset
    df.loc[in_tier & (df['event_params__character_name'] != 't'), cumulative_col] += other_offset

# Rows without a tier get no cumulative index.
df.loc[tier_values.isna(), cumulative_col] = pd.NA
print(f"Cumulative question index calculated for {df['cumulative_question_index'].notna().sum()} rows.")

Cumulative question index calculated for 30052 rows.


In [12]:
"""
TODO mini_game_ri
"""
# Split 'event_params__mini_game_ri' maze_hand_* values into columns.
# Example value: 'maze_hand_WomanHandTwo_maze_level_3'
#   token 2 -> '<Gender>Hand<Hand>' (e.g. 'WomanHandTwo')
#   token 5 -> level (e.g. '3')
col = 'event_params__mini_game_ri'
# Rows whose value starts with 'maze_hand' (missing values never match).
mask = df[col].str.startswith('maze_hand', na=False)
# Tokenise on '_' and pick tokens by position with .str[i]. Unlike
# split(expand=True) followed by parts[i], this gives NaN instead of raising
# KeyError when no row matches or a value has fewer tokens than expected.
tokens = df.loc[mask, col].str.split('_')
gender_hand = tokens.str[2].str.extract(r'(?P<Gender>Woman|Man)Hand(?P<Hand>\w+)')
levels = tokens.str[5]
# Write the extracted pieces back onto the matching rows only.
df.loc[mask, 'maze_gender'] = gender_hand['Gender']
df.loc[mask, 'maze_hand'] = gender_hand['Hand']
df.loc[mask, 'maze_level'] = levels
print(f"Extracted maze hand data for {mask.sum()} rows.")

Extracted maze hand data for 112 rows.


In [13]:
# Split 'event_params__mini_game_ri' buff_* values into columns.
# Example value: 'buff_IncreaseXEnergy_gift_True_gold_False'
#   token 0 'buff' | token 1 <buff type> | token 2 'gift' | token 3 <gift flag>
#   token 4 'gold' | token 5 <gold flag>
col = 'event_params__mini_game_ri'
# Rows whose value starts with 'buff' ('earned_buff_*' values do not match).
mask = df[col].str.startswith('buff', na=False)
# Tokenise on '_'; .str[i] yields NaN for missing tokens instead of raising
# KeyError when no row matches or a value is shorter than expected.
tokens = df.loc[mask, col].str.split('_')
# The buff type is token 1. Token 2 is the literal word 'gift', so reading the
# type from token 2 labelled every buff as 'gift'.
buff_type = tokens.str[1].str.extract(r'(?P<BuffType>\w+)', expand=False)
buff_gift = tokens.str[3].str.extract(r'(?P<BuffGift>\w+)', expand=False)
buff_gold = tokens.str[5].str.extract(r'(?P<BuffGold>\w+)', expand=False)
# Write the extracted pieces back; the flags become booleans (True only for 'true').
df.loc[mask, 'buff_type'] = buff_type
df.loc[mask, 'buff_gift'] = buff_gift.str.lower() == 'true'
df.loc[mask, 'buff_gold'] = buff_gold.str.lower() == 'true'
print(f"Extracted buff data for {mask.sum()} rows.")

Extracted buff data for 167 rows.


In [14]:
# Split 'event_params__mini_game_ri' earned_buff_* values into a column.
# Example value: 'earned_buff_GiveXCharacter' -> token 2 is the buff type.
col = 'event_params__mini_game_ri'
# Rows whose value starts with 'earned_buff' (missing values never match).
mask = df[col].str.startswith('earned_buff', na=False)
# Tokenise on '_'; .str[2] yields NaN instead of raising KeyError when no row
# matches or a value has fewer than three tokens.
tokens = df.loc[mask, col].str.split('_')
buff_type = tokens.str[2].str.extract(r'(?P<BuffType>\w+)', expand=False)
# Write the buff type back onto the matching rows only.
df.loc[mask, 'earned_buff_type'] = buff_type
print(f"Extracted earned buff data for {mask.sum()} rows.")

Extracted earned buff data for 30 rows.


In [15]:
"""
END mini_game_ri
"""
# Split 'event_params__spent_to' doll values into a name column.
# Example value: 'erjohndoll' -> doll_name 'erjohn'.
col = 'event_params__spent_to'
# Rows whose value contains the string 'doll' (missing values never match).
mask = df[col].str.contains('doll', na=False)
# Keep the text before the first 'doll'. Using .str[0] on the split lists
# (rather than expand=True and parts[0]) also works when no row matches.
doll_names = df.loc[mask, col].str.split('doll').str[0].str.strip()
df.loc[mask, 'doll_name'] = doll_names
# Collapse the original value into the generic 'Doll' category.
df.loc[mask, col] = 'Doll'
print(f"Extracted doll data for {mask.sum()} rows.")

Extracted doll data for 44 rows.


In [16]:
# Split 'event_params__spent_to' crystal-ball values into a column.
# Handled values: cauldron_item, aliginn_item, coffee_item.
col = 'event_params__spent_to'
# Rows whose value contains one of the handled names (missing values never match).
mask = df[col].str.contains('cauldron_item|aliginn_item|coffee_item', na=False)
# Keep the text before the first '_'. Using .str[0] on the split lists
# (rather than expand=True and parts[0]) also works when no row matches.
crystal_names = df.loc[mask, col].str.split('_').str[0].str.strip()
df.loc[mask, 'spent_in_crystal'] = crystal_names
# Collapse the original value into the generic 'Crystal Ball' category.
df.loc[mask, col] = 'Crystal Ball'
print(f"Extracted crystal ball data for {mask.sum()} rows.")

Extracted crystal ball data for 228 rows.


In [17]:
# Copy permanent shop item names found in 'event_params__spent_to' into
# 'shop_permanent_item', then collapse the original value into one category.
# Recognised names: dreamcatcher, catcollar, library1, library2, bugspray,
# schedule, crystal, horseshoe.
col = 'event_params__spent_to'
permanent_items = ['dreamcatcher', 'catcollar', 'library1', 'library2',
                   'bugspray', 'schedule', 'crystal', 'horseshoe']
item_alternation = '|'.join(permanent_items)
# Rows whose value contains any recognised name (missing values never match).
mask = df[col].str.contains(item_alternation, na=False)
# The first recognised name found in each matching value.
matched_item = df.loc[mask, col].str.extract(f'({item_alternation})', expand=False)
df.loc[mask, 'shop_permanent_item'] = matched_item
# Collapse the original value into the generic 'Permanent Item' category.
df.loc[mask, col] = 'Permanent Item'
print(f"Extracted permanent shop item data for {mask.sum()} rows.")

Extracted permanent shop item data for 92 rows.


In [18]:
# Copy consumable shop item names found in 'event_params__spent_to' into
# 'shop_consumable_item', then collapse the original value into one category.
# Recognised names: potion, ıncense (spelled with a dotless i), amulet, incense.
col = 'event_params__spent_to'
consumable_items = ['potion', 'ıncense', 'amulet', 'incense']
item_alternation = '|'.join(consumable_items)
# Rows whose value contains any recognised name (missing values never match).
mask = df[col].str.contains(item_alternation, na=False)
# The first recognised name found in each matching value.
matched_item = df.loc[mask, col].str.extract(f'({item_alternation})', expand=False)
df.loc[mask, 'shop_consumable_item'] = matched_item
# Collapse the original value into the generic 'Consumable Item' category.
df.loc[mask, col] = 'Consumable Item'
print(f"Extracted consumable shop item data for {mask.sum()} rows.")

Extracted consumable shop item data for 166 rows.


In [19]:
# Move the remaining 'event_params__spent_to' values of board spending into
# 'board_item': everything spent on the board that is not already one of the
# categories 'Doll', 'Crystal Ball', 'Permanent Item' or 'Consumable Item'.
col = 'event_params__spent_to'
known_categories = ['Doll', 'Crystal Ball', 'Permanent Item', 'Consumable Item']
board_locations = ['board', 'board_item']

not_yet_categorised = ~df[col].isin(known_categories)
spent_on_board = df['event_params__where_its_spent'].isin(board_locations)
mask = not_yet_categorised & spent_on_board

# Keep the original value as the board item, then collapse it into 'Board Item'.
df.loc[mask, 'board_item'] = df.loc[mask, col]
df.loc[mask, col] = 'Board Item'
print(f"Extracted board item data for {mask.sum()} rows.")

Extracted board item data for 376 rows.


In [20]:
# Remove the columns listed in columns_to_drop (imported from
# modules.lists_and_maps). Every listed column must exist in df, otherwise
# drop raises.
n_dropped = len(columns_to_drop)
df = df.drop(labels=columns_to_drop, axis='columns')
print(f"Dropped {n_dropped} columns: {columns_to_drop}.")

Dropped 30 columns: ['event_timestamp', 'event_previous_timestamp', 'user_first_touch_timestamp', 'device__time_zone_offset_seconds', 'event_params__engagement_time_msec', 'event_previous_datetime', 'event_params__time_spent', 'event_first_touch_datetime', 'user__first_open_datetime', 'event_value_in_usd', 'user_id', 'batch_page_id', 'batch_ordering_id', 'privacy_info__uses_transient_token', 'user_ltv', 'device__mobile_marketing_name', 'device__vendor_id', 'device__browser', 'device__browser_version', 'device__web_info', 'event_dimensions', 'traffic_source__name', 'traffic_source__medium', 'traffic_source__source', 'ecommerce', 'event_server_timestamp_offset', 'event_params__update_with_analytics', 'event_params__system_app_update', 'collected_traffic_source', 'event_params__system_app'].


In [21]:
# Capitalise the lone 'key' value of 'event_params__spent_to' so it matches the
# other category labels.
spent_to_col = 'event_params__spent_to'
is_key = df[spent_to_col] == 'key'
df.loc[is_key, spent_to_col] = 'Key'

# Run the project helper apply_value_maps with map_of_maps and keep_unmapped=True.
print("Applying value maps to the DataFrame...")
df = apply_value_maps(df, map_of_maps, keep_unmapped=True)
n_columns = df.shape[1]
print(f"Value maps applied. DataFrame now has {n_columns} columns.")

Applying value maps to the DataFrame...
Value maps applied. DataFrame now has 102 columns.


In [22]:
# Build an addressable question label: '<character> - T: <tier> - Q: <index>'.
character_name = df['event_params__character_name']
current_tier = df['event_params__current_tier']
question_index = df['event_params__current_question_index']

# astype(str) turns a missing tier or index into literal text such as '<NA>',
# so a row with a character name but no tier/index would otherwise receive a
# bogus address. Only rows where all three parts are present keep one.
has_all_parts = character_name.notna() & current_tier.notna() & question_index.notna()
address = character_name + ' - T: ' + current_tier.astype(str) + ' - Q: ' + question_index.astype(str)
df['question_address'] = address.where(has_all_parts)
print(f"Question address created for {df['question_address'].notna().sum()} rows.")

Question address created for 30052 rows.


In [23]:
# --- User-level metrics ---
# Builds one row per user_pseudo_id (first/last seen, session and event counts,
# retention / activity flags, status label) and merges it back onto df.
df['event_datetime'] = pd.to_datetime(df['event_datetime'], errors='coerce')
# Group by user and calculate user-level metrics
user_metrics = df.groupby('user_pseudo_id').agg(
    first_seen=('event_datetime', 'min'),
    last_seen=('event_datetime', 'max'),
    total_sessions=('inferred_session_id', pd.Series.nunique),
    total_events=('event_name', 'count')
).reset_index()
# Reference date: the latest timestamp in the data (not the wall-clock date)
reference_date = df['event_datetime'].max()
# Lifetime: whole days between first and last event
user_metrics['lifetime_days'] = (user_metrics['last_seen'] - user_metrics['first_seen']).dt.days
# Days since last activity, relative to the reference date
user_metrics['days_since_last_seen'] = (reference_date - user_metrics['last_seen']).dt.days
# Churn: hasn't been seen for 14+ days
user_metrics['is_churned'] = user_metrics['days_since_last_seen'] > 14
# Retention flags: the user was still seen at least N days after the first event.
# The 1-day flag was previously `>= 0`, which is true for every user because
# lifetime_days is never negative.
user_metrics['is_retained_1d'] = user_metrics['lifetime_days'] >= 1
user_metrics['is_retained_7d'] = user_metrics['lifetime_days'] >= 7
user_metrics['is_retained_30d'] = user_metrics['lifetime_days'] >= 30
# Active days: number of distinct calendar days (UTC) with at least one event
days_active = df.groupby('user_pseudo_id')['event_datetime'].apply(
    lambda x: x.dt.normalize().nunique()
).reset_index(name='active_days')
user_metrics = user_metrics.merge(days_active, on='user_pseudo_id', how='left')
# Active user flags based on active days (the "1d" flag needs a second active day)
user_metrics['is_active_1d'] = user_metrics['active_days'] >= 2
user_metrics['is_active_7d'] = user_metrics['active_days'] >= 7
user_metrics['is_active_30d'] = user_metrics['active_days'] >= 30
# Seen at most one whole day before the reference date
user_metrics['is_active_yesterday'] = user_metrics['days_since_last_seen'] <= 1
# User status: np.select takes the first matching condition.
# NOTE(review): 'Yeni' can only be reached by users last seen 8-14 days ago,
# because the 'Aktif' test comes first - confirm this ordering is intended.
conditions = [
    user_metrics['days_since_last_seen'] > 14,         # churned first
    user_metrics['days_since_last_seen'] <= 7,         # then active
    user_metrics['lifetime_days'] <= 1                 # then new
]
labels = ['Bırakmış', 'Aktif', 'Yeni']
user_metrics['user_status'] = np.select(conditions, labels, default='dormant')
# Drop metric columns left on df by an earlier run of this cell, so that
# re-running it does not produce *_x / *_y duplicates in the merge.
stale_metric_cols = [
    metric_col for metric_col in user_metrics.columns
    if metric_col != 'user_pseudo_id' and metric_col in df.columns
]
df = df.drop(columns=stale_metric_cols).merge(user_metrics, on='user_pseudo_id', how='left')
print(f"User metrics calculated for {user_metrics.shape[0]} users.")

User metrics calculated for 41 users.


In [None]:
# Calculate app-version-level KPIs: one record per app_info__version value.
# Relies on reference_date and the per-user columns (first_seen, last_seen,
# lifetime_days, days_since_last_seen, active_days) already present on df.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


kpis_per_version = []

for version, group in df.groupby('app_info__version'):
    # Drop duplicates to get one row per user
    users = group[['user_pseudo_id', 'first_seen', 'last_seen', 'lifetime_days',
                   'days_since_last_seen', 'active_days']].drop_duplicates()

    # User-level flags. The 1-day retention flag was previously `>= 0`, which is
    # true for every user and pinned retention_1d at 100%.
    users['is_churned'] = users['days_since_last_seen'] > 14
    users['is_retained_1d'] = users['lifetime_days'] >= 1
    users['is_retained_7d'] = users['lifetime_days'] >= 7
    users['is_retained_30d'] = users['lifetime_days'] >= 30
    users['is_active_1d'] = users['active_days'] >= 2
    users['is_active_7d'] = users['active_days'] >= 7
    users['is_active_30d'] = users['active_days'] >= 30
    users['is_active_yesterday'] = users['days_since_last_seen'] <= 1

    # User status (first matching condition wins)
    users['user_status'] = np.select(
        [
            users['days_since_last_seen'] > 14,
            users['days_since_last_seen'] <= 7,
            users['lifetime_days'] <= 1
        ],
        ['Bırakmış', 'Aktif', 'Yeni'],
        default='dormant'
    )

    # Counts shared by several KPIs, computed once per version.
    total_users = users.shape[0]
    total_sessions = group['inferred_session_id'].nunique()
    ads_shown = group['event_name'].str.contains('Ad Impression', case=False, na=False).sum()
    active_yesterday_users = int(users['is_active_yesterday'].sum())

    # Aggregate KPIs. Ratios use _safe_ratio so an empty denominator (for
    # example no user active yesterday) gives NaN instead of inf.
    kpis_per_version.append({
        'app_version': version,
        'report_days': (reference_date - group['event_datetime'].min()).days,
        'report_date': reference_date.normalize(),
        'report_timestamp': reference_date,              # for versioning/debugging/audit
        'total_users': total_users,
        'total_sessions': total_sessions,
        'sessions_per_player': _safe_ratio(total_sessions, total_users),
        'total_events': group.shape[0],
        'churn_rate': users['is_churned'].mean(),
        'retention_1d': users['is_retained_1d'].mean(),
        'retention_7d': users['is_retained_7d'].mean(),
        'retention_30d': users['is_retained_30d'].mean(),
        'active_1d': users['is_active_1d'].mean(),
        'active_7d': users['is_active_7d'].mean(),
        'active_30d': users['is_active_30d'].mean(),
        'active_yesterday': users['is_active_yesterday'].mean(),
        'ads_shown': ads_shown,
        'ads_per_session': _safe_ratio(ads_shown, total_sessions),
        'ads_per_player': _safe_ratio(ads_shown, total_users),
        'ads_per_active_player': _safe_ratio(ads_shown, active_yesterday_users),
        'new_users': (users['lifetime_days'] <= 1).sum(),
        'returning_players': ((users['lifetime_days'] > 1) & (users['days_since_last_seen'] <= 7)).sum(),
        'churned_players': (users['is_churned']).sum(),
    })

# Combine all KPIs into a DataFrame
kpis_df = pd.DataFrame(kpis_per_version)

In [27]:
# Run the project helper convert_bool_to_int over both frames (boolean columns
# are meant to become 0/1 integers).
print("Converting boolean columns to integers...")
df, user_metrics = (convert_bool_to_int(frame) for frame in (df, user_metrics))
n_df_columns = df.shape[1]
n_metric_columns = user_metrics.shape[1]
print(f"Boolean columns converted. DataFrame now has {n_df_columns} columns and user_metrics has {n_metric_columns} columns.")

Converting boolean columns to integers...
Boolean columns converted. DataFrame now has 119 columns and user_metrics has 17 columns.


In [29]:
# Select and rename the columns of df with the project helper
# safe_select_and_rename, driven by df_column_names_map.
print("Renaming and selecting columns according to df_column_names_map...")
df = safe_select_and_rename(df, df_column_names_map)
n_columns = df.shape[1]
print(f"DataFrame columns renamed and selected according to the map. Now has {n_columns} columns.")

Renaming and selecting columns according to df_column_names_map...
DataFrame columns renamed and selected according to the map. Now has 103 columns.


In [None]:
# Write one CSV per entry of df_splits (name -> list of columns). When
# df_filters has an entry for the same name, it is called with the column
# subset and its result is used to select the rows to keep.
print("Saving DataFrame splits to CSV files...")
for split_name, split_columns in df_splits.items():
    split_frame = df[split_columns].copy()
    if split_name in df_filters:
        row_filter = df_filters[split_name]
        split_frame = split_frame[row_filter(split_frame)]
    split_frame.to_csv(f'./data/{split_name}.csv', index=False)
print("Data cleaning and transformation completed successfully.")

In [None]:
# Append the KPI rows of this run to the KPI history file, skipping any
# (app_version, report_date) pair that the file already contains.
print("Saving aggregated dataframes to CSV files...")
KPI_PATH = './data/kpis_df.csv'
file_exists = os.path.exists(KPI_PATH)
# Load existing KPI data if exists
if file_exists:
    # Read app_version as text: without this, a version such as '1.2' is
    # inferred as a float and the merge on app_version fails or never matches.
    existing_kpis = pd.read_csv(KPI_PATH, dtype={'app_version': str})
    # Parse as UTC-aware so the key has the same timezone as this run's
    # report_date (derived from the UTC event timestamps).
    existing_kpis['report_date'] = pd.to_datetime(existing_kpis['report_date'], utc=True)
else:
    existing_kpis = pd.DataFrame()
# Rows produced by this run
kpis_df = pd.DataFrame(kpis_per_version)
if not kpis_df.empty:
    kpis_df['app_version'] = kpis_df['app_version'].astype(str)
# Filter out rows that are already stored (only when both sides have rows;
# an empty kpis_df has no key columns to merge on).
if not existing_kpis.empty and not kpis_df.empty:
    stored_keys = existing_kpis[['app_version', 'report_date']].drop_duplicates()
    kpis_df = kpis_df.merge(
        stored_keys,
        on=['app_version', 'report_date'],
        how='left',
        indicator=True
    ).query('_merge == "left_only"').drop(columns=['_merge'])
# Append only new rows; the header is written only when the file is created
if not kpis_df.empty:
    kpis_df.to_csv(
        KPI_PATH,
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False
    )
print("Cleaned data saved to CSV files successfully.")

In [None]:
"""

print("Saving seperate dataframes to CSV files...")
df.to_csv('./data/cleaned_data.csv', index=False, chunksize=100000)
kpis_df.to_csv('./data/kpis_df.csv', index=False)
print("Cleaned data saved to CSV files successfully.")

"""