In [1]:
import os
import sys
import json
from datetime import timedelta

from google.cloud import bigquery
from google.oauth2 import service_account

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from modules.pull_data import pull_and_append
from modules.flattening_json import flatten_extract_params, flatten_row, flatten_nested_column

In [2]:
# Local JSON file that pull_and_append keeps up to date with the exported events.
data_path = "./data/data.json"

# Daily export tables are addressed as project_id.dataset_id.events_YYYYMMDD
project_id = "emojioracle-342f1"
dataset_id = "analytics_481352676"

# Service account key from Google Cloud IAM (path to the JSON key file).
bigquery_key = './keys/colab.json'
# Build the credentials object that is handed to pull_and_append below.
credentials = service_account.Credentials.from_service_account_file(bigquery_key)

In [3]:
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames; comment out to restore pandas' default truncation

In [4]:
# Pull whatever is missing from BigQuery and merge it into data_path.
# Per the recorded output below, this loads the existing file, reports the latest
# event date it contains, fetches the following daily table(s) and appends the new rows.
pull_and_append(credentials, project_id, dataset_id, data_path)

Loaded existing data.
Latest event date in existing data: 20250410
Fetching table: events_20250411
Updated data.json with 32 new rows.


In [5]:
# Read the JSON file maintained by pull_and_append into a DataFrame named df.
df = pd.read_json(data_path)

In [6]:
# Flattening: flatten_row is applied to every row of df and the results are
# collected into a new frame that replaces df.
# Implementation details live in ./modules/flattening_json.py.
# NOTE: df is overwritten in place, so re-running this cell feeds the already
# flattened frame back into flatten_row; re-run the load cell first.

df = pd.DataFrame(list(map(flatten_row, (record for _, record in df.iterrows()))))

In [9]:
# Number of rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

np.int64(0)

In [7]:
# Column names, non-null counts and dtypes of the flattened frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13294 entries, 0 to 13293
Data columns (total 90 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   event_date                              13294 non-null  int64  
 1   event_timestamp                         13294 non-null  int64  
 2   event_name                              13294 non-null  object 
 3   event_previous_timestamp                13083 non-null  float64
 4   event_value_in_usd                      0 non-null      float64
 5   event_bundle_sequence_id                13294 non-null  int64  
 6   event_server_timestamp_offset           13294 non-null  int64  
 7   user_id                                 0 non-null      float64
 8   user_pseudo_id                          13294 non-null  object 
 9   user_first_touch_timestamp              13294 non-null  int64  
 10  stream_id                               13294 non-null  in

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Total number of populated (non-null) cells: per-column non-null counts, summed.
non_null_count = df.count().sum()

print(non_null_count)

In [None]:
# cleaning & preprocessing

## dates and times

# event_timestamp is microseconds since the Unix epoch; parse it as a tz-aware UTC datetime.
df['event_datetime'] = pd.to_datetime(df['event_timestamp'], unit='us', utc=True)

# NOTE: this replaces the exported integer event_date (YYYYMMDD) with the UTC calendar date
# derived from event_timestamp.
df['event_date'] = df['event_datetime'].dt.date
df['event_time'] = df['event_datetime'].dt.time

# event_previous_timestamp has missing values (it is a float64 column); those rows become NaT.
df['event_previous_datetime'] = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)

df['event_previous_date'] = df['event_previous_datetime'].dt.date
df['event_previous_time'] = df['event_previous_datetime'].dt.time

# event_server_timestamp_offset is the offset between collection time and upload time.
# The GA4 BigQuery export expresses it in microseconds, like the other timestamps above,
# so it must be parsed with unit='us'; unit='ms' inflated the delay by a factor of 1000.
df['event_server_delay'] = pd.to_timedelta(df['event_server_timestamp_offset'], unit='us')
df['event_server_datetime'] = df['event_datetime'] + df['event_server_delay']
