In [1]:
# --- Standard Library ---
import os
import sys
import json

# --- Google Cloud Auth + APIs ---
from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core.exceptions import GoogleAPICallError, RetryError

import gspread
from gspread_dataframe import set_with_dataframe

# --- Data & Visualization ---
import pandas as pd
import pandas_gbq
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# --- Local Modules ---
from modules.pull_and_backup import pull_and_append
from modules.flattening import flatten_extract_params, flatten_row, flatten_nested_column
from modules.column_order import column_order

In [2]:
# --- Configuration ---
SERVICE_ACCOUNT_KEY = './keys/key.json'
DATA_PATH = './data/data.json'
PROJECT_ID = "emojioracle-342f1"
DATASET_ID = "analytics_481352676"
BACKUP_PATH = './backup/'
SCOPES = [
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive"
]
# --- BigQuery Client ---
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_KEY,
    scopes = SCOPES
)
bq_client = bigquery.Client(credentials = credentials, project = PROJECT_ID)

In [3]:
# will pull what is missing from BigQuery and merge into data_path
df = pd.DataFrame(pull_and_append(credentials = credentials, project_id = PROJECT_ID, dataset_id = DATASET_ID, data_path = DATA_PATH, backup_path = BACKUP_PATH))

Loaded existing data.
Latest event_date in merged data: 20250510
Backup already exists: events_20250405
Backup already exists: events_20250406
Backup already exists: events_20250407
Backup already exists: events_20250408
Backup already exists: events_20250409
Backup already exists: events_20250410
Backup already exists: events_20250411
Backup already exists: events_20250412
Backup already exists: events_20250413
Backup already exists: events_20250414
Backup already exists: events_20250415
Backup already exists: events_20250416
Backup already exists: events_20250417
Backup already exists: events_20250418
Backup already exists: events_20250420
Backup already exists: events_20250421
Backup already exists: events_20250422
Backup already exists: events_20250423
Backup already exists: events_20250424
Backup already exists: events_20250425
Backup already exists: events_20250427
Backup already exists: events_20250428
Backup already exists: events_20250429
Backup already exists: events_20250430

In [4]:
# load the data from the JSON file
df = pd.read_json(DATA_PATH)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16988 entries, 0 to 16987
Data columns (total 30 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   event_date                         16988 non-null  int64  
 1   event_timestamp                    16988 non-null  int64  
 2   event_name                         16988 non-null  object 
 3   event_params                       16988 non-null  object 
 4   event_previous_timestamp           16693 non-null  float64
 5   event_value_in_usd                 0 non-null      float64
 6   event_bundle_sequence_id           16988 non-null  int64  
 7   event_server_timestamp_offset      16988 non-null  int64  
 8   user_id                            0 non-null      float64
 9   user_pseudo_id                     16988 non-null  object 
 10  privacy_info                       16988 non-null  object 
 11  user_properties                    16988 non-null  obj

In [6]:
pd.set_option('display.max_columns', None) # uncomment to see all of the cols in pandas dataframes

In [7]:
# flattenning

df = pd.DataFrame([flatten_row(row) for _, row in df.iterrows()]) # for wtfs refer to ./modules/flattening_json.py

In [8]:
# cleaning & preprocessing

## dates and times
df = df.drop(columns=['event_date']) # built in case event_date may not be the same as the one in the event_timestamp

# convert unix time to ape-friendly datetime format
df['event_datetime'] = pd.to_datetime(df['event_timestamp'], unit='us', utc=True) 
df['event_previous_datetime'] = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)
df['event_first_touch_datetime'] = pd.to_datetime(df['user_first_touch_timestamp'], unit='us', utc=True)
df['user.first_open_datetime'] = pd.to_datetime(df['user.first_open_time'], unit='ms', utc=True)


df['event_date'] = df['event_datetime'].dt.normalize()
df['event_time'] = df['event_datetime'].dt.time

df['event_previous_date'] = df['event_previous_datetime'].dt.date
df['event_previous_time'] = df['event_previous_datetime'].dt.time

df['event_first_touch_date'] = df['event_first_touch_datetime'].dt.date
df['event_first_touch_time'] = df['event_first_touch_datetime'].dt.time

df['user.first_open_date'] = df['user.first_open_datetime'].dt.date
df['user.first_open_time'] = df['user.first_open_datetime'].dt.time

df['device.time_zone_offset_hours'] = df['device.time_zone_offset_seconds'] / 3600 # seconds to hours
df['event_params.engagement_time_seconds'] = df['event_params.engagement_time_msec'] / 1000 # ms to seconds
df['event_server_delay_seconds'] = df['event_server_timestamp_offset'] / 1000 # ms to seconds 
df['event_params.time_spent_seconds'] = df['event_params.time_spent'] # just renaming for clarity

df = df.drop(columns=['event_timestamp',
                      'event_previous_timestamp', 
                      'user_first_touch_timestamp', 
                      'event_server_timestamp_offset', 
                      'device.time_zone_offset_seconds', 
                      'event_params.engagement_time_msec',
                      'event_previous_datetime',
                      'event_params.time_spent',
                      'event_first_touch_datetime',
                      'user.first_open_datetime'
                    ])



In [9]:
## time series

df['ts_weekday'] = df['event_datetime'].dt.day_name() # weekday name
df['ts_weekday'] = pd.Categorical(df['ts_weekday'], categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True) # order the weekdays

df['ts_local_time'] = df['event_datetime'] + pd.to_timedelta(df['device.time_zone_offset_hours'].fillna(0), unit='h') # local time
df['ts_hour'] = df['ts_local_time'].dt.hour # local hour
df['ts_daytime_named'] = df['ts_hour'].apply(lambda x: 
                                             'night' if (x < 6 or x > 22) else 
                                             'morning' if x < 11 else 
                                             'noon' if x < 14 else 
                                             'afternoon' if x < 17 else 
                                             'evening') # time group of day
df['ts_is_weekend'] = df['ts_weekday'].isin(['Saturday', 'Sunday']) # is weekend

In [None]:
df.info()

In [10]:
# Revise Question Indices as df['event_params.current_question_index'] 
"""
Tier 1: 16 Questions, Except t: 12
Tier 2: 12 Questions
Tier 3: 12 Questions
Tier 4: 10 Questions
"""

df['event_params.current_question_index'] = pd.Series([pd.NA] * len(df), dtype="Int64")

df['event_params.current_tier'] = pd.to_numeric(df['event_params.current_tier'], errors='coerce').astype("Int64")
df['event_params.current_qi'] = pd.to_numeric(df['event_params.current_qi'], errors='coerce').astype("Int64")

notna_mask = df['event_params.character_name'].notna()

# Tier 1
tier_1_mask = notna_mask & (df['event_params.current_tier'] == 1)
t_char_mask = tier_1_mask & (df['event_params.character_name'] == 't')

df.loc[t_char_mask, 'event_params.current_question_index'] = 13 - df.loc[t_char_mask, 'event_params.current_qi']
df.loc[~t_char_mask & tier_1_mask, 'event_params.current_question_index'] = 17 - df.loc[(~t_char_mask) & tier_1_mask, 'event_params.current_qi']

# Tier 2 & 3
tier_2_3_mask = notna_mask & df['event_params.current_tier'].isin([2, 3])
df.loc[tier_2_3_mask, 'event_params.current_question_index'] = 13 - df.loc[tier_2_3_mask, 'event_params.current_qi']

# Tier 4
tier_4_mask = notna_mask & (df['event_params.current_tier'] == 4)
df.loc[tier_4_mask, 'event_params.current_question_index'] = 11 - df.loc[tier_4_mask, 'event_params.current_qi']

# Hiccups
problems_mask = notna_mask & ~df['event_params.current_tier'].isin([1, 2, 3, 4])
if df[problems_mask].shape[0] > 0:
    print("Something wrong in:")
    print(df.loc[problems_mask, ['event_params.character_name', 'event_params.current_tier', 'event_params.current_qi']])


In [11]:
# I reordered the columns to make it easier to follow, in column_order.py

df = df[column_order]

df.columns = df.columns.str.replace('.', '__') # replace dots with dashes in column names bc of bigquerys nagging

In [None]:
df.columns

In [12]:
df.tail(5)

Unnamed: 0,event_name,event_date,event_time,event_previous_date,event_previous_time,event_first_touch_date,event_first_touch_time,event_bundle_sequence_id,user_id,user_pseudo_id,user__first_open_date,user__first_open_time,user__ga_session_id,user__ga_session_number,app_info__id,app_info__firebase_app_id,app_info__version,app_info__install_store,app_info__install_source,device__advertising_id,device__vendor_id,device__category,device__mobile_brand_name,device__mobile_model_name,device__mobile_marketing_name,device__mobile_os_hardware_model,device__operating_system,device__operating_system_version,device__language,device__is_limited_ad_tracking,device__browser,device__browser_version,device__web_info,device__time_zone_offset_hours,geo__city,geo__country,geo__continent,geo__region,geo__sub_continent,geo__metro,traffic_source__name,traffic_source__medium,traffic_source__source,collected_traffic_source,event_params__ga_session_id,event_params__firebase_screen_id,event_params__ad_unit_id,event_params__ad_format,event_params__ad_network,event_params__ad_platform,event_params__ad_shown_where,event_params__answered_wrong,event_params__character_name,event_params__current_qi,event_params__current_question_index,event_params__current_tier,event_params__earned_amount,event_params__engaged_session_event,event_params__engagement_time_seconds,event_params__entrances,event_params__firebase_conversion,event_params__firebase_error,event_params__firebase_event_origin,event_params__firebase_screen_class,event_params__ga_session_number,event_params__how_its_earned,event_params__menu_name,event_params__mini_game_name,event_params__mini_game_ri,event_params__previous_first_open_count,event_params__session_engaged,event_params__spent_amount,event_params__spent_to,event_params__system_app,event_params__system_app_update,event_params__time_spent_seconds,event_params__update_with_analytics,event_params__where_its_earned,event_params__where_its_spent,event_params__currency_name,batch_event_index,batch_ordering_id,batch_page_id,privacy_info__ads_storage,privacy_info__analytics_storage,privacy_info__uses_transient_token,event_dimensions,event_server_delay_seconds,event_value_in_usd,ecommerce,is_active_user,platform,stream_id,user_ltv,event_datetime,ts_weekday,ts_is_weekend,ts_local_time,ts_hour,ts_daytime_named
16983,menu_opened,2025-05-08 00:00:00+00:00,13:01:01.125001,2025-05-08,12:00:07.961001,2025-04-09,09:32:18.415000,1426,,6bef18e3d26a07b8a20c26ff7eb26d82,2025-04-09,10:00:00,1746706000.0,8.0,com.TestCompany.TestApp,1:332513280181:android:e9c9cb25732e9477377efc,0.1.0,,com.google.android.packageinstaller,,,mobile,Xiaomi,MI 9,,MI 9,Android,Android 11,en-us,No,,,,3.0,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),(direct),(none),(direct),,1746705591,-4.992188e+18,,,,,,,,,,,,1.0,,,,,app,UnityPlayerActivity,8.0,,board_menu,,,,,,,,,,,,,,2,,,Yes,Yes,No,,948.681,,,True,ANDROID,10359646141,,2025-05-08 13:01:01.125001+00:00,Thursday,False,2025-05-08 16:01:01.125001+00:00,16,afternoon
16984,menu_closed,2025-05-08 00:00:00+00:00,13:01:02.979002,2025-05-08,12:00:10.703002,2025-04-09,09:32:18.415000,1426,,6bef18e3d26a07b8a20c26ff7eb26d82,2025-04-09,10:00:00,1746706000.0,8.0,com.TestCompany.TestApp,1:332513280181:android:e9c9cb25732e9477377efc,0.1.0,,com.google.android.packageinstaller,,,mobile,Xiaomi,MI 9,,MI 9,Android,Android 11,en-us,No,,,,3.0,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),(direct),(none),(direct),,1746705591,-4.992188e+18,,,,,,,,,,,,1.0,,,,,app,UnityPlayerActivity,8.0,,board_menu,,,,,,,,,1.854248,,,,,3,,,Yes,Yes,No,,948.681,,,True,ANDROID,10359646141,,2025-05-08 13:01:02.979002+00:00,Thursday,False,2025-05-08 16:01:02.979002+00:00,16,afternoon
16985,earn_virtual_currency,2025-05-08 00:00:00+00:00,13:01:05.757003,2025-05-06,16:22:37.214003,2025-04-09,09:32:18.415000,1426,,6bef18e3d26a07b8a20c26ff7eb26d82,2025-04-09,10:00:00,1746706000.0,8.0,com.TestCompany.TestApp,1:332513280181:android:e9c9cb25732e9477377efc,0.1.0,,com.google.android.packageinstaller,,,mobile,Xiaomi,MI 9,,MI 9,Android,Android 11,en-us,No,,,,3.0,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),(direct),(none),(direct),,1746705591,-4.992188e+18,,,,,,,,,,,10.0,1.0,,,,,app,UnityPlayerActivity,8.0,normal,,,,,,,,,,,,question,,gold,4,,,Yes,Yes,No,,948.681,,,True,ANDROID,10359646141,,2025-05-08 13:01:05.757003+00:00,Thursday,False,2025-05-08 16:01:05.757003+00:00,16,afternoon
16986,user_engagement,2025-05-08 00:00:00+00:00,13:01:09.648000,2025-05-08,12:32:38.737000,2025-04-09,09:32:18.415000,1427,,6bef18e3d26a07b8a20c26ff7eb26d82,2025-04-09,10:00:00,1746706000.0,8.0,com.TestCompany.TestApp,1:332513280181:android:e9c9cb25732e9477377efc,0.1.0,,com.google.android.packageinstaller,,,mobile,Xiaomi,MI 9,,MI 9,Android,Android 11,en-us,No,,,,3.0,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),(direct),(none),(direct),,1746705591,-4.992188e+18,,,,,,,,,,,,1.0,19.323,,,,auto,UnityPlayerActivity,8.0,,,,,,,,,,,,,,,,1,,,Yes,Yes,No,,1613.881,,,True,ANDROID,10359646141,,2025-05-08 13:01:09.648000+00:00,Thursday,False,2025-05-08 16:01:09.648000+00:00,16,afternoon
16987,app_remove,2025-05-10 00:00:00+00:00,03:53:21.501000,NaT,NaT,2025-04-14,19:32:39.117000,284,,ce1a4e02e4f8de31648cc7c0ff82c787,2025-04-14,20:00:00,1744659000.0,1.0,com.TestCompany.TestApp,1:332513280181:android:e9c9cb25732e9477377efc,0.1.0,,com.google.android.packageinstaller,,,mobile,Samsung,SM-A045F,Galaxy A04,SM-A045F,Android,Android 14,en-us,No,,,,3.0,Istanbul,Türkiye,Asia,Istanbul,Western Asia,(not set),(direct),(none),(direct),,1744659160,,,,,,,,,,,,,,,,,,auto,,1.0,,,,,,,,,,,,,,,,1,,,Yes,Yes,No,,86.979,,,False,ANDROID,10359646141,,2025-05-10 03:53:21.501000+00:00,Saturday,True,2025-05-10 06:53:21.501000+00:00,6,morning


In [13]:
df.to_parquet('./data/cleaned_data.parquet', index=False)

In [14]:
# Write to BigQuery 
table_id = f"{PROJECT_ID}.{DATASET_ID}.clean_data"

try:
    # Load data into BigQuery
    job = bq_client.load_table_from_dataframe(df, table_id, job_config = bigquery.LoadJobConfig(
        write_disposition = "WRITE_TRUNCATE"
    ))

    job.result()  # Wait for the job to finish

    print(f"Cleaned data written to {table_id}. Ready to use in Looker.")

except GoogleAPICallError as api_error:
    print(f"API error occurred: {api_error}")
except RetryError as retry_error:
    print(f"Retry error occurred: {retry_error}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Cleaned data written to emojioracle-342f1.analytics_481352676.clean_data. Ready to use in Looker.
