In [1]:
# --- Standard Library ---

import os
import sys
import json

# --- Google Cloud Auth + APIs ---

from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core.exceptions import GoogleAPICallError, RetryError

from gspread_dataframe import set_with_dataframe

# --- Data & Visualization ---

import pandas as pd
import openpyxl

# --- Local Modules ---

from modules.utilities import (
    pull_and_append,
#    rebuild_data_json_from_backups,
    upload_named_dataframes_to_bq
)

from modules.flattening import (
    flatten_extract_params, 
    flatten_row,
    flatten_nested_column
)

from modules.cleaning import (
    apply_value_maps,
    safe_select_and_rename
)
# --- Lists and Maps ---

from modules.lists_and_maps import (
    df_column_names_map, 
    columns_to_drop,
    map_of_maps,
    event_name_map,
    event_params__mini_game_ri_map,
    event_params__menu_name_map,
    event_params__character_name_map,
    event_params__mini_game_name_map,
    event_params__where_its_earned_map,
    event_params__currency_name_map,
    event_params__how_its_earned_map,
    event_params__where_its_spent_map,
    )

In [2]:
# --- Paths and identifiers ---
# Local files used by this notebook.
SERVICE_ACCOUNT_KEY = './keys/key.json'
DATA_PATH = './data/data.json'
BACKUP_PATH = './backup/'

# BigQuery project / dataset the notebook reads from and uploads to.
PROJECT_ID = "emojioracle-342f1"
DATASET_ID = "analytics_481352676"

# --- BigQuery client ---
# OAuth scopes requested for the service-account credentials.
SCOPES = [
    "https://www.googleapis.com/auth/bigquery",
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive",
]
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_KEY, scopes=SCOPES)
bq_client = bigquery.Client(project=PROJECT_ID, credentials=credentials)

In [3]:
# --- Main Execution ---

# Gather the arguments for the pull in one place, then wrap whatever
# pull_and_append returns in a DataFrame.
pull_kwargs = dict(
    credentials=credentials,
    project_id=PROJECT_ID,
    dataset_id=DATASET_ID,
    data_path=DATA_PATH,
    backup_path=BACKUP_PATH,
)
raw_data = pd.DataFrame(pull_and_append(**pull_kwargs))

print(f"Data loaded with {len(raw_data)} rows and {len(raw_data.columns)} columns.")

Loaded existing data.
Latest event_date in merged data: 20250610
Backup already exists: events_20250413
Backup already exists: events_20250414
Backup already exists: events_20250415
Backup already exists: events_20250416
Backup already exists: events_20250417
Backup already exists: events_20250418
Backup already exists: events_20250420
Backup already exists: events_20250421
Backup already exists: events_20250422
Backup already exists: events_20250423
Backup already exists: events_20250424
Backup already exists: events_20250425
Backup already exists: events_20250427
Backup already exists: events_20250428
Backup already exists: events_20250429
Backup already exists: events_20250430
Backup already exists: events_20250506
Backup already exists: events_20250507
Backup already exists: events_20250508
Backup already exists: events_20250510
Backup already exists: events_20250511
Backup already exists: events_20250512
Backup already exists: events_20250516
Backup already exists: events_20250517

# **IF EDITING, START HERE**

In [3]:
# Load the JSON data into a DataFrame.
# This re-reads DATA_PATH from disk instead of reusing raw_data from the pull cell,
# so work can restart from here (the imports and config cells must still have run).
df = pd.read_json(DATA_PATH)

In [4]:
# Show every column when a DataFrame is displayed; comment this line out to
# restore pandas' default column limit.
pd.set_option('display.max_columns', None)

In [5]:
# --- Flatten the DataFrame ---
# Every row is passed through flatten_row (imported from modules.flattening at the top)
# and the results are collected into a new DataFrame.
# NOTE(review): assumes flatten_row returns one flat record per input row — confirm.
# NOTE(review): the previous comment pointed to ./modules/flattening_json.py, while the
# import uses modules.flattening — confirm which file name is current.
df = pd.DataFrame([flatten_row(row) for _, row in df.iterrows()])

In [6]:
# Replace the literal '.' separator in the flattened column names with '__'.
# regex=False pins literal matching: '.' is a regex wildcard, and the default of
# the `regex` argument is not the same across pandas versions.
df.columns = df.columns.str.replace('.', '__', regex=False)

In [7]:
# --- Date and Time Cleanup and Transformation ---
# The incoming event_date is discarded and rebuilt further down from event_timestamp,
# so the date can never disagree with the timestamp.
df = df.drop(columns=['event_date'])

# Parse the two microsecond timestamps once and reuse them.
event_ts = pd.to_datetime(df['event_timestamp'], unit='us', utc=True)
previous_ts = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)

# Seconds elapsed since the previous event.
df['time_delta'] = (event_ts - previous_ts).dt.total_seconds()

# UTC datetimes for every timestamp column.
df['event_datetime'] = event_ts
df['event_previous_datetime'] = previous_ts
df['event_first_touch_datetime'] = pd.to_datetime(df['user_first_touch_timestamp'], unit='us', utc=True)
df['user__first_open_datetime'] = pd.to_datetime(df['user__first_open_time'], unit='ms', utc=True)

# Split each <prefix>_datetime into <prefix>_date (normalised to midnight) and <prefix>_time.
# NOTE: for 'user__first_open' this overwrites user__first_open_time (read above as a
# millisecond value) with time-of-day objects, so re-running the cell needs a freshly loaded df.
for prefix in ('event', 'event_previous', 'event_first_touch', 'user__first_open'):
    stamp = df[f'{prefix}_datetime']
    df[f'{prefix}_date'] = stamp.dt.normalize()
    df[f'{prefix}_time'] = stamp.dt.time

# Unit conversions.
df['device__time_zone_offset_hours'] = df['device__time_zone_offset_seconds'] / 3600  # seconds -> hours
df['event_params__engagement_time_seconds'] = df['event_params__engagement_time_msec'] / 1000  # ms -> seconds
df['event_server_delay_seconds'] = df['event_server_timestamp_offset'] / 1000  # ms -> seconds
df['event_params__time_spent_seconds'] = df['event_params__time_spent']  # plain copy under a clearer name





In [8]:
# --- Add Time-Based Features ---

WEEKDAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _daytime_label(hour):
    """Name the part of the day for an hour value (anything not matched falls through to 'Evening')."""
    if hour < 6 or hour > 22:
        return 'Night'
    if hour < 11:
        return 'Morning'
    if hour < 14:
        return 'Noon'
    if hour < 17:
        return 'Afternoon'
    return 'Evening'


def _weekend_label(day_name):
    """Return 'Weekend' for Saturday/Sunday and 'Weekday' for anything else."""
    if day_name in ['Saturday', 'Sunday']:
        return 'Weekend'
    return 'Weekday'


# Ordered weekday name, taken from the UTC event_datetime.
# NOTE(review): weekday/weekend come from UTC while ts_hour below is offset-adjusted;
# near midnight the two can refer to different days — confirm this is intended.
df['ts_weekday'] = pd.Categorical(
    df['event_datetime'].dt.day_name(),
    categories=WEEKDAY_ORDER,
    ordered=True,
)

# Device-local clock: UTC time shifted by the device offset (missing offset counts as 0 h).
local_offset = pd.to_timedelta(df['device__time_zone_offset_hours'].fillna(0), unit='h')
df['ts_local_time'] = df['event_datetime'] + local_offset
df['ts_hour'] = df['ts_local_time'].dt.hour

df['ts_daytime_named'] = df['ts_hour'].apply(_daytime_label)
df['ts_is_weekend'] = df['ts_weekday'].apply(_weekend_label)

In [9]:
# --- Question Index Clean-up ---
# Number of questions per tier:
#   Tier 1: 16 (12 for character 't')
#   Tier 2: 12
#   Tier 3: 12
#   Tier 4: 10
# The question index is computed as (number of questions + 1) - current_qi.

name_col = 'event_params__character_name'
tier_col = 'event_params__current_tier'
qi_col = 'event_params__current_qi'
index_col = 'event_params__current_question_index'

# Start with an all-missing nullable integer column.
df[index_col] = pd.Series([pd.NA] * len(df), dtype="Int64")

# Coerce tier and qi to nullable integers; unparsable values become <NA>.
for numeric_col in (tier_col, qi_col):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce').astype("Int64")

# Row selectors; only rows that carry a character name are considered.
notna_mask = df[name_col].notna()
tier_1_mask = notna_mask & (df[tier_col] == 1)
t_char_mask = tier_1_mask & (df[name_col] == 't')
tier_2_3_mask = notna_mask & df[tier_col].isin([2, 3])
tier_4_mask = notna_mask & (df[tier_col] == 4)

# (row selector, number of questions + 1) — the selectors do not overlap.
offset_table = (
    (t_char_mask, 13),                # tier 1, character 't'
    (tier_1_mask & ~t_char_mask, 17), # tier 1, every other character
    (tier_2_3_mask, 13),              # tiers 2 and 3
    (tier_4_mask, 11),                # tier 4
)
for rows, offset in offset_table:
    df.loc[rows, index_col] = offset - df.loc[rows, qi_col]

# Hiccups: named characters whose tier is not one of 1-4.
problems_mask = notna_mask & ~df[tier_col].isin([1, 2, 3, 4])
if problems_mask.any():
    print("Something wrong in:")
    print(df.loc[problems_mask, ['event_params__character_name', 'event_params__current_tier', 'event_params__current_qi']])


In [10]:
# --- Session Definition and Duration Calculation ---
#
# Sessions are inferred from the gaps between a user's consecutive events:
#   1. order the events by user and timestamp,
#   2. measure the gap to the same user's previous event,
#   3. open a new session on a user's first event or when the gap exceeds the timeout,
#   4. number the sessions with a running count of those session starts.

# A gap longer than this starts a new session.
SESSION_TIMEOUT = pd.Timedelta(minutes=6)

# Events in per-user chronological order, with the gap to the previous event.
df_sorted = df.sort_values(by=['user_pseudo_id', 'event_datetime'])
df_sorted['time_diff'] = df_sorted.groupby('user_pseudo_id')['event_datetime'].diff()

# A missing gap marks a user's first event; both cases open a new session.
is_session_start = df_sorted['time_diff'].isna() | (df_sorted['time_diff'] > SESSION_TIMEOUT)
df_sorted['inferred_session_id'] = is_session_start.cumsum()

# Copy the ids back onto the unsorted frame (the assignment aligns on the index).
df['inferred_session_id'] = df_sorted['inferred_session_id']



In [11]:
# Carry character name, tier and question index forward within each user session,
# so later events in a session inherit the last value seen.

session_keys = ['user_pseudo_id', 'inferred_session_id']
cols_to_fill = [
    'event_params__character_name',
    'event_params__current_tier',
    'event_params__current_question_index'
]

# Step 1: chronological order inside every session
df_sorted = df.sort_values(by=session_keys + ['event_datetime'])

# Step 2: forward-fill per (user, session) group
filled = df_sorted.groupby(session_keys)[cols_to_fill].ffill()
df_sorted[cols_to_fill] = filled

# Step 3: write the filled values back (the assignment aligns on the index)
df[cols_to_fill] = df_sorted[cols_to_fill]

In [12]:
# Break 'maze_hand_*' values of event_params__mini_game_ri into separate columns,
# e.g. 'maze_hand_WomanHandTwo_maze_level_3' -> gender 'Woman', hand 'Two', level '3'.
# NOTE(review): assumes at least one matching row with six '_'-separated parts;
# otherwise tokens[2] / tokens[5] may not exist — confirm against the data.

col = 'event_params__mini_game_ri'

# Rows whose value starts with 'maze_hand' (missing values never match)
mask = df[col].str.startswith('maze_hand', na=False)

# One column per '_'-separated token
tokens = df.loc[mask, col].str.split('_', expand=True)

# Token 2 holds '<Gender>Hand<Hand>'; token 5 holds the level
hand_info = tokens[2].str.extract(r'(?P<Gender>Woman|Man)Hand(?P<Hand>\w+)')

new_columns = (
    ('maze_gender', hand_info['Gender']),
    ('maze_hand', hand_info['Hand']),
    ('maze_level', tokens[5]),
)
for target, values in new_columns:
    df.loc[mask, target] = values

In [13]:
# Split event_params__mini_game_ri 'buff_*' values into columns.
# e.g. 'buff_IncreaseXEnergy_gift_True_gold_False'
#
# Token layout after splitting on '_':
#   0 = 'buff', 1 = <buff type>, 2 = 'gift', 3 = <gift flag>, 4 = 'gold', 5 = <gold flag>
# The buff type is therefore token 1. Token 2 is the literal word 'gift'; reading the
# type from token 2 labelled every row 'gift'.
# NOTE(review): assumes buff type names contain no '_' themselves — confirm.

# Column to process
col = 'event_params__mini_game_ri'

# Filter rows starting with 'buff' ('earned_buff_*' values do not start with 'buff')
mask = df[col].str.startswith('buff', na=False)

# Split the matching rows by underscore
parts = df.loc[mask, col].str.split('_', expand=True)

# Extract the buff type (token 1)
buff_type = parts[1].str.extract(r'(?P<BuffType>\w+)')

# Extract the gift and gold flags (tokens 3 and 5)
buff_gift = parts[3].str.extract(r'(?P<BuffGift>\w+)')
buff_gold = parts[5].str.extract(r'(?P<BuffGold>\w+)')

# Create new columns with extracted data; the flags become booleans ('true' in any casing -> True)
df.loc[mask, 'buff_type'] = buff_type['BuffType']
df.loc[mask, 'buff_gift'] = buff_gift['BuffGift'].str.lower() == 'true'
df.loc[mask, 'buff_gold'] = buff_gold['BuffGold'].str.lower() == 'true'

In [14]:
# Pull the buff type out of 'earned_buff_*' values of event_params__mini_game_ri,
# e.g. 'earned_buff_GiveXCharacter' -> 'GiveXCharacter'.

col = 'event_params__mini_game_ri'

# Rows whose value starts with 'earned_buff' (missing values never match)
mask = df[col].str.startswith('earned_buff', na=False)

# Tokens: 0 = 'earned', 1 = 'buff', 2 = <buff type>
tokens = df.loc[mask, col].str.split('_', expand=True)

# Keep the leading run of word characters of token 2 as the buff type
df.loc[mask, 'earned_buff_type'] = tokens[2].str.extract(r'(?P<BuffType>\w+)')['BuffType']


In [15]:
# Move doll purchases out of event_params__spent_to,
# e.g. 'erjohndoll' -> doll_name 'erjohn', spent_to 'Doll'.

col = 'event_params__spent_to'

# Rows whose value contains 'doll' (missing values never match)
mask = df[col].str.contains('doll', na=False)

# Everything before the first 'doll' is the doll's name
name_and_rest = df.loc[mask, col].str.split('doll', expand=True)
df.loc[mask, 'doll_name'] = name_and_rest[0].str.strip()

# Collapse the original value into the category label
df.loc[mask, col] = 'Doll'



In [16]:
# Move crystal-ball purchases out of event_params__spent_to.
# Matched values: cauldron_item, aliginn_item, coffee_item

col = 'event_params__spent_to'

# Rows whose value contains one of the listed items (missing values never match)
mask = df[col].str.contains('cauldron_item|aliginn_item|coffee_item', na=False)

# The text before the first '_' names what the crystals were spent on
tokens = df.loc[mask, col].str.split('_', expand=True)
df.loc[mask, 'spent_in_crystal'] = tokens[0].str.strip()

# Collapse the original value into the category label
df.loc[mask, col] = 'Crystal Ball'


In [17]:
# Move permanent shop items out of event_params__spent_to into shop_permanent_item.
# Matched values: dreamcatcher, catcollar, library1, library2, bugspray, schedule

col = 'event_params__spent_to'

# Alternation shared by the row filter and the extraction below
permanent_items = 'dreamcatcher|catcollar|library1|library2|bugspray|schedule'

# Rows whose value contains one of the items (missing values never match)
mask = df[col].str.contains(permanent_items, na=False)

# Keep the matched item name in its own column
matched = df.loc[mask, col].str.extract(rf'({permanent_items})')
df.loc[mask, 'shop_permanent_item'] = matched[0]

# Collapse the original value into the category label
df.loc[mask, col] = 'Permanent Item'


In [18]:
# Move consumable shop items out of event_params__spent_to into shop_consumable_item.
# Matched values: potion, ıncense, amulet, incense
# ('ıncense' with a dotless i is matched as its own spelling and stored as-is.)

col = 'event_params__spent_to'

# Alternation shared by the row filter and the extraction below
consumable_items = 'potion|ıncense|amulet|incense'

# Rows whose value contains one of the items (missing values never match)
mask = df[col].str.contains(consumable_items, na=False)

# Keep the matched item name in its own column
matched = df.loc[mask, col].str.extract(rf'({consumable_items})')
df.loc[mask, 'shop_consumable_item'] = matched[0]

# Collapse the original value into the category label
df.loc[mask, col] = 'Consumable Item'

In [None]:
# Move the remaining board purchases out of event_params__spent_to into board_item:
# rows spent on the board whose spent_to is not already one of the category labels.

col = 'event_params__spent_to'

known_categories = ['Doll', 'Crystal Ball', 'Permanent Item', 'Consumable Item']
board_sources = ['board', 'board_item']

# Not yet categorised AND spent on the board
is_uncategorised = ~df[col].isin(known_categories)
spent_on_board = df['event_params__where_its_spent'].isin(board_sources)
mask = is_uncategorised & spent_on_board

# Keep the original value in its own column
df.loc[mask, 'board_item'] = df.loc[mask, col]

# Collapse the original value into the category label
df.loc[mask, col] = 'Board Item'

In [20]:
# Remove the columns listed in columns_to_drop (imported from modules.lists_and_maps).
df = df.drop(columns=columns_to_drop)

In [21]:
# Preview the last rows of the frame after the column drop.
df.tail()

Unnamed: 0,event_name,event_bundle_sequence_id,user_pseudo_id,stream_id,platform,is_active_user,batch_event_index,event_params__ga_session_id,event_params__firebase_screen_id,event_params__ga_session_number,event_params__ad_platform,event_params__firebase_screen_class,event_params__ad_shown_where,event_params__ad_unit_id,event_params__engaged_session_event,event_params__firebase_event_origin,user__first_open_time,user__ga_session_number,user__ga_session_id,privacy_info__analytics_storage,privacy_info__ads_storage,device__category,device__mobile_brand_name,device__mobile_model_name,device__mobile_os_hardware_model,device__operating_system,device__operating_system_version,device__advertising_id,device__language,device__is_limited_ad_tracking,geo__city,geo__country,geo__continent,geo__region,geo__sub_continent,geo__metro,app_info__id,app_info__version,app_info__install_store,app_info__firebase_app_id,app_info__install_source,event_params__firebase_conversion,event_params__previous_first_open_count,event_params__menu_name,event_params__entrances,event_params__ad_network,event_params__ad_format,event_params__session_engaged,event_params__current_qi,event_params__character_name,event_params__current_tier,event_params__mini_game_ri,event_params__mini_game_name,event_params__answered_wrong,event_params__where_its_earned,event_params__currency_name,event_params__earned_amount,event_params__how_its_earned,event_params__spent_amount,event_params__where_its_spent,event_params__spent_to,event_params__firebase_error,event_params__fatal,event_params__timestamp,time_delta,event_datetime,event_date,event_time,event_previous_date,event_previous_time,event_first_touch_date,event_first_touch_time,user__first_open_date,device__time_zone_offset_hours,event_params__engagement_time_seconds,event_server_delay_seconds,event_params__time_spent_seconds,ts_weekday,ts_local_time,ts_hour,ts_daytime_named,ts_is_weekend,event_params__current_question_index,inferred_session_id,maze_gender,maze_hand,
maze_level,buff_type,buff_gift,buff_gold,earned_buff_type,doll_name,spent_in_crystal,shop_permanent_item,shop_consumable_item,board_item
22144,question_started,967,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,11,1749570901,-5.152938e+18,7.0,,UnityPlayerActivity,,,1.0,app,10:00:00,7.0,1749571000.0,Yes,Yes,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,1.0,,,,,,,11,mryogurt,1,,,,,,,,,,Board Item,,,,13.196,2025-06-10 16:17:43.577002+00:00,2025-06-10 00:00:00+00:00,16:17:43.577002,2025-06-10 00:00:00+00:00,16:17:30.381002,2025-04-09 00:00:00+00:00,09:32:18.415000,2025-04-09 00:00:00+00:00,3.0,,849.954,,Tuesday,2025-06-10 19:17:43.577002+00:00,19,Evening,Weekday,6,180,,,,,,,,,,,,
22145,question_started,967,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,14,1749570901,-5.152938e+18,7.0,,UnityPlayerActivity,,,1.0,app,10:00:00,7.0,1749571000.0,Yes,Yes,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,1.0,,,,,,,10,mryogurt,1,,,,,,,,,,Board Item,,,,14.957,2025-06-10 16:17:58.534003+00:00,2025-06-10 00:00:00+00:00,16:17:58.534003,2025-06-10 00:00:00+00:00,16:17:43.577003,2025-04-09 00:00:00+00:00,09:32:18.415000,2025-04-09 00:00:00+00:00,3.0,,849.954,,Tuesday,2025-06-10 19:17:58.534003+00:00,19,Evening,Weekday,7,180,,,,,,,,,,,,
22146,question_started,967,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,18,1749570901,-5.152938e+18,7.0,,UnityPlayerActivity,,,1.0,app,10:00:00,7.0,1749571000.0,Yes,Yes,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,1.0,,,,,,,9,mryogurt,1,,,,,,,,,,Board Item,,,,18.902,2025-06-10 16:18:17.436004+00:00,2025-06-10 00:00:00+00:00,16:18:17.436004,2025-06-10 00:00:00+00:00,16:17:58.534004,2025-04-09 00:00:00+00:00,09:32:18.415000,2025-04-09 00:00:00+00:00,3.0,,849.954,,Tuesday,2025-06-10 19:18:17.436004+00:00,19,Evening,Weekday,8,180,,,,,,,,,,,,
22147,question_started,967,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,27,1749570901,-5.152938e+18,7.0,,UnityPlayerActivity,,,1.0,app,10:00:00,7.0,1749571000.0,Yes,Yes,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,1.0,,,,,,,8,billy,1,,,,,,,,,,Board Item,,,,139.997,2025-06-10 16:20:37.433005+00:00,2025-06-10 00:00:00+00:00,16:20:37.433005,2025-06-10 00:00:00+00:00,16:18:17.436005,2025-04-09 00:00:00+00:00,09:32:18.415000,2025-04-09 00:00:00+00:00,3.0,,849.954,,Tuesday,2025-06-10 19:20:37.433005+00:00,19,Evening,Weekday,9,180,,,,,,,,,,,,
22148,question_started,967,9ebba72a8b854826a91973f63bc39231,10359646141,ANDROID,True,37,1749570901,-5.152938e+18,7.0,,UnityPlayerActivity,,,1.0,app,10:00:00,7.0,1749571000.0,Yes,Yes,mobile,Xiaomi,MI 9,MI 9,Android,Android 11,,en-us,No,Ankara,Türkiye,Asia,Ankara,Western Asia,(not set),com.TestCompany.TestApp,0.1.0,,1:332513280181:android:e9c9cb25732e9477377efc,com.google.android.packageinstaller,1.0,,,,,,,7,billy,1,,,,,,,,,,Board Item,,,,65.104,2025-06-10 16:21:42.537006+00:00,2025-06-10 00:00:00+00:00,16:21:42.537006,2025-06-10 00:00:00+00:00,16:20:37.433006,2025-04-09 00:00:00+00:00,09:32:18.415000,2025-04-09 00:00:00+00:00,3.0,,849.954,,Tuesday,2025-06-10 19:21:42.537006+00:00,19,Evening,Weekday,10,180,,,,,,,,,,,,


In [28]:
# List the current column names.
# NOTE(review): the saved output of this cell shows the display names produced by the
# rename step further down, so it was last run after that step, not in file order.
print(list(df.columns))

['Event Type', 'Event Batch ID', 'User Pseudo ID', 'Stream Identifier', 'Platform', 'Is Active User', 'Event Index in Batch', 'Event Date & Time', 'Event Date', 'Event Time', 'Previous Event Date', 'Previous Event Time', 'First Touch Date', 'First Touch Time', 'Time Since Previous Event', 'Weekday', 'Local Time', 'Hour of Day', 'Time of Day', 'Is Weekend', 'Time Zone Offset Hours', 'Server Delay Seconds', 'Session ID', 'Session Number', 'Firebase Event Origin', 'Engaged Session Event', 'Session Was Engaged', 'User Session Count', 'User Session ID', 'Inferred Session ID', 'Screen ID', 'Screen Class', 'Menu Name', 'Entrances Count', 'Device Type', 'Mobile Brand', 'Mobile Model', 'Device Hardware Model', 'Operating System', 'OS Version', 'Advertising ID', 'Device Language', 'Ad Tracking Limited', 'City', 'Country', 'Continent', 'Region', 'Subcontinent', 'Metro Area', 'App ID', 'App Version', 'Install Store', 'Firebase App ID', 'App Install Source', 'Consent: Analytics Storage', 'Consent: 

In [23]:
# Show the unique values of the event_name column.
df['event_name'].unique().tolist()

['ad_clicked',
 'app_remove',
 'first_open',
 'menu_closed',
 'menu_opened',
 'screen_view',
 'ad_impression',
 'session_start',
 'app_clear_data',
 'user_engagement',
 'question_started',
 'mini_game_started',
 'question_completed',
 'mini_game_completed',
 'earn_virtual_currency',
 'spend_virtual_currency',
 'mini_game_failed',
 'app_exception']

In [24]:
# Apply the value maps collected in map_of_maps to df.
# NOTE(review): apply_value_maps lives in modules.cleaning and is not shown here;
# keep_unmapped=True presumably leaves values without a mapping unchanged — confirm.
df = apply_value_maps(df, map_of_maps, keep_unmapped=True)

In [25]:
# Select and rename columns according to df_column_names_map.
# The saved column listing in this notebook shows display names such as 'Event Type'
# and 'Inferred Session ID' after this step, so cells below presumably need those
# names rather than the raw snake_case ones — confirm against df_column_names_map.
df = safe_select_and_rename(df, df_column_names_map)

In [29]:
# --- Upload Data to BigQuery ---

# Fully qualified table name.
# NOTE(review): main_table_id is not passed to the upload call below; the saved output
# reports the upload going to a table named after the dict key ('MainCleanData').
main_table_id = f"{PROJECT_ID}.{DATASET_ID}.clean_data"

# Name -> DataFrame pairs handed to the uploader.
frames_to_upload = {"MainCleanData": df}

upload_named_dataframes_to_bq(
    dataframes=frames_to_upload,
    project_id=PROJECT_ID,
    dataset_id=DATASET_ID,
    bq_client=bq_client,
)


DataFrame 'MainCleanData' uploaded successfully to emojioracle-342f1.analytics_481352676.MainCleanData


In [26]:
# Keep a copy that still carries the timezone information.
dftz = df.copy()

# Strip the timezone from every tz-aware datetime column of df before the export
# (presumably because the Excel writer does not accept tz-aware datetimes — confirm).
tz_aware_columns = df.select_dtypes(include=['datetimetz']).columns
for col in tz_aware_columns:
    df[col] = df[col].dt.tz_localize(None)

# Write the cleaned frame to Excel without the index column.
df.to_excel('./data/cleaned_data.xlsx', index=False)

In [None]:
# Number of events per inferred session.
# The rename step earlier in the notebook changes 'inferred_session_id' to
# 'Inferred Session ID' (see the saved column listing), so the raw name raises a
# KeyError when the notebook runs top to bottom. Use whichever name df has.
session_col = 'Inferred Session ID' if 'Inferred Session ID' in df.columns else 'inferred_session_id'
df[session_col].value_counts()

In [None]:
# Summarise the final frame (columns, dtypes, non-null counts).
df.info()