In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

# Register tqdm's progress-bar helpers on pandas objects (e.g. `progress_apply`).
tqdm.pandas()

In [2]:
from clickhouse_driver import Client as Clickhouse

def click_query(q, **kw):
    """Run `q` against the ClickHouse server on localhost and return the result.

    Args:
        q: SQL text; may contain `%(name)s` placeholders.
        **kw: Values for the placeholders, handed to the driver as `params`.

    Returns:
        Whatever `Client.query_dataframe` returns for the query.

    A new client is created for every call and is always disconnected before
    returning (also when the query raises), so repeated calls from a
    long-lived kernel do not leave connections behind.
    """
    click = Clickhouse("localhost")
    try:
        return click.query_dataframe(q, params=kw)
    finally:
        click.disconnect()

In [3]:
%%time
params = [
    ('2024-10-15', '2024-11-30', 'telegram.org', 'web_connectivity'),
    ('2024-10-15', '2024-11-30', None, 'telegram'),
    ('2023-10-15', '2023-11-30', 'telegram.org', 'web_connectivity'),
    ('2023-10-15', '2023-11-30', None, 'telegram'),
]
for st, et, domain, tn in params:
    sq = {
        "measurement_start_day": st,
        "measurement_end_day": et,
        "cc_list": ['KE'],
        "test_name": tn,
    }
    fn = f"{st[:4]}-KE-{tn}"
    q = """
    WITH multiIf(
        dns_failure IS NOT NULL, tuple('dns', dns_failure),
        tcp_failure IS NOT NULL, tuple('tcp', tcp_failure),
        tls_failure IS NOT NULL, tuple('tls', tls_failure),
        http_failure IS NOT NULL, tuple('https', http_failure),
        tuple('ok', '')
    ) as failure
    SELECT 
    report_id,
    input,
    test_name,
    test_version,
    measurement_uid,
    probe_cc,
    probe_asn,
    probe_as_org_name,
    probe_as_cc,
    network_type,
    measurement_start_time,
    hostname,
    ip,
    port,
    ip_asn,
    ip_as_org_name,
    resolver_ip,
    resolver_cc,
    resolver_asn,
    resolver_as_org_name,
    resolver_as_cc,
    dns_engine,
    dns_failure,
    dns_answer,
    tcp_success,
    tcp_failure,
    tls_handshake_time,
    tls_handshake_read_count,
    tls_handshake_write_count,
    tls_handshake_read_bytes,
    tls_handshake_write_bytes,
    tls_handshake_last_operation,
    tls_cipher_suite IS NOT NULL as tls_success,
    tls_is_certificate_valid,
    tls_end_entity_certificate_subject,
    tls_end_entity_certificate_subject_common_name,
    tls_end_entity_certificate_issuer,
    tls_end_entity_certificate_issuer_common_name,
    tls_end_entity_certificate_san_list,
    tls_end_entity_certificate_not_valid_after,
    tls_end_entity_certificate_not_valid_before,
    tls_certificate_chain_length,
    tls_failure,
    http_request_url,
    http_failure,
    http_runtime,
    probe_analysis,
    failure.1 as failure_class,
    IF(failure_class = 'ok', 'ok', concat(failure_class, '.', failure_str)) as failure_str_full,
    IF(startsWith(failure.2, 'unknown_failure'), 'unknown_failure', failure.2) as failure_str,
    failure.2 as failure_str_raw
    FROM obs_web
    WHERE measurement_start_time > %(measurement_start_day)s
    AND measurement_start_time < %(measurement_end_day)s
    AND probe_cc IN %(cc_list)s
    AND test_name IN %(test_name)s
    """
    if domain is not None:
        q += 'AND hostname IN %(domain)s'
        sq['domain'] = domain
        fn += f'-{domain}'
    fn += '.csv'
    df_dump = click_query(q, **sq)
    df_dump.to_csv(fn, index=False)

CPU times: user 32.8 s, sys: 990 ms, total: 33.8 s
Wall time: 1min 9s
