# Preparation

In [None]:
from voxrow.web.domain.value_objects import Settings

In [None]:
# Instantiate the project settings object; later cells read the Cloudflare R2
# credentials (settings.cloudflare_r2) and the cron secret (settings.cron_secret) from it.
settings: Settings = Settings()

# DuckDB

In [None]:
# Load the magic_duckdb IPython extension; the %dql / %%dql magics used in this notebook come from it.
%load_ext magic_duckdb
# NOTE(review): `-t relation` presumably selects a DuckDB relation as the default result type — confirm against the magic_duckdb docs.
%dql -t relation
# Smoke test that the magic works ("Hanya Tes" is Indonesian for "Just a test").
%dql SELECT 'Hanya Tes' ini;

In [None]:
from os import environ

In [None]:
# Publish the R2 key pair under the standard AWS environment variable names so that
# environment-based credential lookup (used by the DuckDB secret created next) can find it.
environ.update(
    AWS_ACCESS_KEY_ID=settings.cloudflare_r2.aws_access_key_id.get_secret_value(),
    AWS_SECRET_ACCESS_KEY=settings.cloudflare_r2.aws_secret_access_key.get_secret_value(),
)

In [None]:
%%dql -j
-- Create (or replace) the R2 secret that DuckDB uses for r2:// paths.
-- The key pair is not written here: PROVIDER credential_chain with CHAIN env
-- resolves it from environment variables.
-- The ENDPOINT value is a template placeholder filled from the Python settings object.
-- NOTE(review): assumes the -j flag enables Jinja rendering of the cell — confirm.
CREATE OR REPLACE
SECRET (
    TYPE r2,
    PROVIDER credential_chain,
    CHAIN env,
    ENDPOINT '{{ settings.cloudflare_r2.endpoint_url.host }}'
);

# API

## Preparation

In [None]:
from fastapi.testclient import TestClient

from voxrow.web.entrypoints.fastapi import app

In [None]:
# Test client bound to the FastAPI app; the API-call cells further down issue their
# requests through it.
client: TestClient = TestClient(app)

## /idx/stock-summary

### Query

In [None]:
%%dql
-- Read one daily stock-summary file directly from the R2 bucket.
-- NOTE(review): presumably relies on the R2 secret having been created earlier in the session — confirm.
SELECT
    *
FROM
    'r2://datalake/idx.co.id/GetStockSummary/2020-01-02.json.gz'
;

In [None]:
import boto3
import duckdb
from pandas import DataFrame

from voxrow.data.domain.value_objects import Boto3Credential

In [None]:
credential: Boto3Credential = settings.cloudflare_r2

# List every object under the stock-summary prefix and register the listing with DuckDB
# as `list_idx_stock_summary` (one row per listed object, columns as returned by the
# list_objects_v2 response entries, e.g. "Key").
duckdb.register(
    "list_idx_stock_summary",
    DataFrame(
        content
        for page in (
            boto3.client(
                service_name=credential.service_name,
                endpoint_url=str(credential.endpoint_url),
                aws_access_key_id=credential.aws_access_key_id.get_secret_value(),
                aws_secret_access_key=credential.aws_secret_access_key.get_secret_value(),
                region_name=credential.region_name,
            )
            # The paginator follows continuation tokens, so listings larger than one
            # response page are fully covered.
            .get_paginator("list_objects_v2")
            .paginate(
                Bucket="datalake",
                Prefix="idx.co.id/GetStockSummary/",
            )
        )
        # A page without matching objects carries no "Contents" key at all; default to an
        # empty list instead of failing with KeyError on an empty prefix.
        for content in page.get("Contents", [])
    ),
)

In [None]:
%%dql
-- Count the listed objects per calendar month (label formatted as year-month).
-- REGEXP_EXTRACT with a name list returns a struct, so "date"."year" and "date"."month"
-- are the two capture groups taken from the date part of each object key.
-- NOTE(review): the dots in the pattern are unescaped and therefore match any character,
-- not only a hyphen or a literal dot. Keys that do not match presumably yield empty
-- fields and would be grouped together — confirm.
SELECT
    "date"."year" || '-' || "date"."month" AS "month",
    COUNT(*) AS files
FROM
    list_idx_stock_summary,
    LATERAL (
        SELECT
            REGEXP_EXTRACT(
                "Key",
                '.*/(\d{4}).(\d{2}).\d{2}.json.gz',
                ['year', 'month']
            ) AS "date"
    )
GROUP BY
    1
ORDER BY
    1
;

In [None]:
%%dql
-- Per year: the number of distinct months that have at least one listed object,
-- and the total number of listed objects.
-- REGEXP_EXTRACT with a name list returns a struct, so "date"."year" and "date"."month"
-- are the two capture groups taken from the date part of each object key.
-- NOTE(review): the dots in the pattern are unescaped and therefore match any character,
-- not only a hyphen or a literal dot — confirm this is acceptable for the key layout.
SELECT
    "date"."year",
    COUNT(
        DISTINCT
        "date"."year" || '-' || "date"."month"
    ) AS months,
    COUNT(*) AS files
FROM
    list_idx_stock_summary,
    LATERAL (
        SELECT
            REGEXP_EXTRACT(
                "Key",
                '.*/(\d{4}).(\d{2}).\d{2}.json.gz',
                ['year', 'month']
            ) AS "date"
    )
GROUP BY
    1
ORDER BY
    1
;

### API Call

In [None]:
import random
from datetime import date, timedelta
from time import sleep

from httpx import Response
from pydantic import validate_call

In [None]:
@validate_call
def extract_stock_summary_idx(
    settings: Settings,
    start_date: date,
    end_date: date | None = None,
) -> None:
    """Call ``/idx/stock-summary`` once per weekday in an inclusive date range.

    Requests are sent through the module-level ``client`` and authenticated with
    ``settings.cron_secret`` as a Bearer token. The status code of every call is
    printed.

    Args:
        settings: Project settings; only ``cron_secret`` is read here.
        start_date: First date of the range, or the only date when ``end_date``
            is omitted.
        end_date: Last date of the range, inclusive. ``None`` means "only
            ``start_date``". If it is earlier than ``start_date`` the two bounds
            are swapped.

    Raises:
        httpx.HTTPStatusError: As soon as a call returns an error status (via
            ``Response.raise_for_status``); the remaining dates are not requested.
    """
    # Normalise the arguments to one ordered, inclusive range.
    if end_date is None:
        end_date = start_date
    elif start_date > end_date:
        start_date, end_date = end_date, start_date

    # date.weekday() is locale-independent (Monday == 0 ... Sunday == 6), unlike
    # strftime("%a"), whose output changes with the process locale.
    weekdays: list[date] = [
        day
        for day in (
            start_date + timedelta(days=offset)
            for offset in range((end_date - start_date).days + 1)
        )
        if day.weekday() < 5
    ]

    headers: dict[str, str] = {
        "Authorization": f"Bearer {settings.cron_secret.get_secret_value()}",
    }

    for position, day in enumerate(weekdays):
        # Pause only between two actual requests: not before the first one, not after
        # the last one, and not for skipped weekend days.
        if position:
            sleep(random.uniform(3.0, 5.0))

        resp: Response = client.get(
            "/idx/stock-summary",
            headers=headers,
            params=dict(date=str(day)),
            timeout=60 * 1.5,
        )

        print(f"{day} | status_code: {resp.status_code}")
        resp.raise_for_status()

In [None]:
# Oldest date is 2020-01-02
year: int = 2020

extract_stock_summary_idx(
    settings=settings,
    start_date=date(year, 1, 1),
    end_date=date(year, 12, 31),
)