<a href="https://colab.research.google.com/github/nocsaren/emoji-oracle-analytics/blob/main/eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio --quiet # this is for interactive graphs

In [None]:
import sys
import pandas as pd
import plotly.express as px
import gradio as gr
import matplotlib.pyplot as plt

from datetime import timedelta

from google.colab import drive
from google.cloud import bigquery
from google.oauth2 import service_account

In [None]:
# (re)mount Google Drive; the key file, data and helper modules used by this notebook all live under MyDrive
drive.mount(
    '/content/drive',
    force_remount=True,
)

# build BigQuery credentials from the service-account key (created in Google Cloud IAM) stored on Drive
credentials = service_account.Credentials.from_service_account_file(
    '/content/drive/MyDrive/emoji-oracle-analytics/keys/colab.json'
)

# never truncate long cell contents when a frame is displayed
pd.set_option(
    'display.max_colwidth',
    None,
)

In [None]:
# custom helper functions are stored in ./modules on Drive; make that folder importable.
# The membership check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
modules_dir = '/content/drive/MyDrive/emoji-oracle-analytics/modules'
if modules_dir not in sys.path:
    sys.path.append(modules_dir)

from update_data import pull_and_append, flatten_extract_params, flatten_row

In [None]:
# BigQuery source (project + dataset) and the JSON file on Drive that is read back into `df` in the next cell
project_id = "emojioracle-342f1"
dataset_id = "analytics_481352676"
data_path = "/content/drive/MyDrive/emoji-oracle-analytics/data/data.json"

# helper imported from update_data (./modules); per the original note it pulls what is missing from BigQuery
# NOTE(review): presumably it appends the new rows to the file at data_path — implementation not visible here, confirm
pull_and_append(credentials, project_id, dataset_id, data_path)

In [None]:
# load the JSON file at data_path into a DataFrame
# NOTE(review): read_json runs with its default dtype/date inference here — if any ID-like
# string columns must stay strings, confirm they are not being coerced
df = pd.read_json(data_path)

In [None]:
# cleaning & preprocessing

## dates and times

# event timestamps are integer microseconds since the Unix epoch; convert to tz-aware (UTC) datetimes
df['event_datetime'] = pd.to_datetime(df['event_timestamp'], unit='us', utc=True)

# NOTE: these are plain python date / time objects (object dtype), handy for grouping and display
df['event_date'] = df['event_datetime'].dt.date
df['event_time'] = df['event_datetime'].dt.time

# same conversion for the timestamp of the previous event; missing values become NaT
df['event_previous_datetime'] = pd.to_datetime(df['event_previous_timestamp'], unit='us', utc=True)

df['event_previous_date'] = df['event_previous_datetime'].dt.date
df['event_previous_time'] = df['event_previous_datetime'].dt.time

# Offset between the time the event was collected on the client and the time it was uploaded.
# The GA4 BigQuery export schema documents this field in MICROseconds (like the other
# timestamp fields), so it must be converted with unit='us'; unit='ms' inflated every delay
# by a factor of 1000 and shifted event_server_datetime accordingly.
# NOTE(review): assumes pull_and_append stores the raw export value unchanged — confirm.
df['event_server_delay'] = pd.to_timedelta(df['event_server_timestamp_offset'], unit='us')
df['event_server_datetime'] = df['event_datetime'] + df['event_server_delay']



In [None]:
## unnesting

# pass every row (as a Series) through flatten_row from ./modules and rebuild the frame from the results
# NOTE(review): `df` is overwritten, so re-running this cell feeds already-flattened rows back into flatten_row
df = pd.DataFrame(list(map(flatten_row, (record for _, record in df.iterrows()))))

In [None]:
# Function to generate the chart
def plot_distribution(column_name):
    """Build a matplotlib figure showing the distribution of one column of ``df``.

    Missing values are dropped first. Categorical-like columns (object, string,
    boolean or category dtype) are drawn as a bar chart of value counts; every
    other dtype is drawn as a 20-bin histogram.

    Parameters
    ----------
    column_name : hashable or None
        Label of a column in the module-level ``df``. ``None`` (for example a
        dropdown with nothing selected) produces no figure.

    Returns
    -------
    matplotlib.figure.Figure or None
        The finished figure, already released from pyplot's figure registry,
        or ``None`` when no column was given.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    # Nothing selected: return None instead of failing on df[None].
    if column_name is None:
        return None

    data = df[column_name].dropna()  # Drop NaN values

    # Strings, booleans and categories have no meaningful bins, so they get a
    # bar chart just like object columns do.
    is_categorical = (
        pd.api.types.is_object_dtype(data)
        or pd.api.types.is_string_dtype(data)
        or pd.api.types.is_bool_dtype(data)
        or isinstance(data.dtype, pd.CategoricalDtype)
    )

    fig, ax = plt.subplots(figsize=(12, 12))  # size is width x height in inches
    try:
        if data.empty:
            # Column holds only missing values: show an explicit note rather
            # than letting the plotting call fail on empty input.
            ax.text(0.5, 0.5, 'No non-missing values', ha='center', va='center',
                    transform=ax.transAxes)
            y_label = 'Count' if is_categorical else 'Frequency'
        elif is_categorical:
            # bar chart for categorical columns
            try:
                value_counts = data.value_counts()
            except TypeError:
                # Unhashable entries (e.g. lists / dicts) cannot be counted
                # directly; count their string representation instead.
                value_counts = data.astype(str).value_counts()
            value_counts.plot(kind='bar', ax=ax, color='skyblue')
            # Style the tick labels of THIS axes; plt.xticks would act on
            # whatever pyplot currently considers the active figure.
            plt.setp(ax.get_xticklabels(), rotation=90, ha='right', fontsize=9)
            y_label = 'Count'
        else:
            # histogram for numerical columns
            ax.hist(data, bins=20, color='skyblue', edgecolor='black')
            y_label = 'Frequency'

        ax.set_title(f'Distribution of {column_name}', fontsize=16)
        ax.set_xlabel(column_name, fontsize=14)
        ax.set_ylabel(y_label, fontsize=14)
        fig.subplots_adjust(bottom=0.2)  # leave room for long x tick labels
    finally:
        # pyplot keeps every figure it creates alive until it is closed, so
        # each call used to leak one figure. Closing only unregisters it from
        # pyplot; the Figure object itself can still be rendered by the caller.
        plt.close(fig)

    return fig

# every column of the current `df` becomes a choice in the dropdown
column_names = df.columns.tolist()

# gr interface: the column dropdown in the first row, the resulting chart in the second
with gr.Blocks() as demo:
    with gr.Row():
        column_dropdown = gr.Dropdown(choices=column_names, label="Select Column", elem_id="column_dropdown")
    with gr.Row():
        chart_output = gr.Plot(elem_id="chart_output")

    # whenever the dropdown value changes, call plot_distribution with it and show the returned figure
    column_dropdown.change(fn=plot_distribution, inputs=column_dropdown, outputs=chart_output)

# NOTE(review): Gradio documents debug=True as blocking the main thread (used in Colab to surface errors
# in the cell output) — cells below will presumably not run on "Run all" until this cell is interrupted; confirm intended
demo.launch(debug=True)

In [None]:
# summary of the current `df`: column names, non-null counts, dtypes and memory usage
df.info()