# Exploratory data analysis (EDA)

Use this notebook to explore and visualize the data.

In [1]:
import os
import ibis
import ibis.selectors as s
import plotly.express as px

from python_analytics_accelerator.dag.config import (
    DATA_DIR,
    RAW_DATA_DIR,
    BRONZE,
    SILVER,
    GOLD,
)
from python_analytics_accelerator.dag.resources import Catalog

# Interactive mode: ibis evaluates an expression when it is displayed, so a
# bare table/column expression at the end of a cell renders its result.
ibis.options.interactive = True

# Use Plotly's "plotly_dark" template as the default for Plotly Express figures.
px.defaults.template = "plotly_dark"

In [2]:
# Instantiate the project's Catalog (imported from dag.resources); every
# table lookup below goes through this object.
catalog = Catalog()

In [3]:
# Show the groups the catalog exposes.
catalog.list_groups()

['bronze', 'gold', 'silver']

In [4]:
# Show the tables available under the GOLD group.
catalog.list_tables(GOLD)

['gold_gh_commits',
 'gold_gh_forks',
 'gold_gh_issues',
 'gold_gh_prs',
 'gold_gh_stars',
 'gold_gh_watchers',
 'gold_pypi_downloads']

In [5]:
# Look up the gold PyPI downloads table, then cache it so the exploratory
# cells that follow all work from the same cached table `t`.
pypi_downloads = catalog.table("gold_pypi_downloads")
t = pypi_downloads.cache()

# Display the cached table.
t

In [None]:
# Number of rows in `t`.
t.count()

In [None]:
# Sum of the `count` column (presumably total downloads across all rows —
# confirm against the table's definition).
t["count"].sum()

In [None]:
# Earliest value in the `date` column.
t["date"].min()

In [None]:
# Truncate each row's `date` to day resolution, then total `count` per day.
# `ibis._` is the deferred reference to the table at each step of the chain.
t = (
    t.mutate(date=ibis._["date"].cast("timestamp").truncate("D"))
    .group_by("date")
    .agg(downloads=ibis._["count"].sum())
)

# Display with the most recent day first; `t` itself is not reordered.
t.order_by(ibis.desc("date"))

In [None]:
# Trailing row-based window ordered by day: the current row plus the 28 rows
# before it.
# NOTE(review): that spans 29 rows; if a 28-day window is intended,
# `preceding` should be 27 — confirm.
trailing_window = ibis.window(order_by="date", preceding=28, following=0)
rolling_sum = t["downloads"].sum().over(trailing_window)

# Keep only the day (renamed to `timestamp`) and its rolling download total.
t = t.select(timestamp="date", rolling_downloads=rolling_sum)

# Display the result.
t

In [None]:
# Line chart of the rolling download total over time.
# Plotly Express connects points in the order the rows arrive, and `t`
# carries no guaranteed row order, so sort by `timestamp` before plotting
# to keep the line from zig-zagging.
px.line(
    t.order_by("timestamp"),
    x="timestamp",
    y="rolling_downloads",
)

In [None]:
# Convert `t` to a Polars DataFrame and display it.
t.to_polars()

In [None]:
# Number of rows in the current `t`.
t.count()

In [None]:
# Column names and types of the current `t`.
t.schema()