In [1]:
import numpy as np
import pandas as pd
import altair as alt

from tqdm import tqdm

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Show every column, and up to 200 rows, when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200
# Lift Altair's default cap on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
def get_explorer_url(e):
    """
    Return the OONI Explorer URL of a measurement.

    `e` is any mapping-like record (dict, DataFrame row, ...) that supports
    `e["measurement_uid"]`.
    """
    measurement_uid = e["measurement_uid"]
    return 'https://explorer.ooni.org/m/{}'.format(measurement_uid)

def print_explorer_url(e):
    """Print the OONI Explorer URL of the measurement record `e` to stdout."""
    explorer_url = get_explorer_url(e)
    print(explorer_url)

In [3]:
# Colour palette lookup: "<family><shade>" -> hex colour string.
# Each family (gray, blue, indigo, violet, fuchsia, pink, red, orange, yellow,
# lime, green, teal, cyan) has ten shades numbered 0 through 9.
OONI_COLOR_SCHEME = {"gray0": "#f8f9fa", "gray1": "#f1f3f5", "gray2": "#e9ecef", "gray3": "#dee2e6", "gray4": "#ced4da", "gray5": "#adb5bd", "gray6": "#868e96", "gray7": "#495057", "gray8": "#343a40", "gray9": "#212529", "blue0": "#e7f5ff", "blue1": "#c9e8ff", "blue2": "#8dd5f8", "blue3": "#5db8fe", "blue4": "#37a6ed", "blue5": "#0588cb", "blue6": "#0f77b8", "blue7": "#056aa6", "blue8": "#005f9c", "blue9": "#005a99", "indigo0": "#edf2ff", "indigo1": "#dbe4ff", "indigo2": "#bac8ff", "indigo3": "#91a7ff", "indigo4": "#748ffc", "indigo5": "#5c7cfa", "indigo6": "#4c6ef5", "indigo7": "#4263eb", "indigo8": "#3b5bdb", "indigo9": "#364fc7", "violet0": "#f3f0ff", "violet1": "#e5dbff", "violet2": "#d0bfff", "violet3": "#b197fc", "violet4": "#9775fa", "violet5": "#845ef7", "violet6": "#7950f2", "violet7": "#7048e8", "violet8": "#6741d9", "violet9": "#5f3dc4", "fuchsia0": "#f8f0fc", "fuchsia1": "#f3d9fa", "fuchsia2": "#eebefa", "fuchsia3": "#e599f7", "fuchsia4": "#da77f2", "fuchsia5": "#cc5de8", "fuchsia6": "#be4bdb", "fuchsia7": "#ae3ec9", "fuchsia8": "#9c36b5", "fuchsia9": "#862e9c", "pink0": "#fff0f6", "pink1": "#ffdeeb", "pink2": "#fcc2d7", "pink3": "#faa2c1", "pink4": "#f783ac", "pink5": "#f06595", "pink6": "#e64980", "pink7": "#d6336c", "pink8": "#c2255c", "pink9": "#a61e4d", "red0": "#fff5f5", "red1": "#ffe3e3", "red2": "#ffc9c9", "red3": "#ffa8a8", "red4": "#ff8787", "red5": "#ff6b6b", "red6": "#fa5252", "red7": "#f03e3e", "red8": "#e03131", "red9": "#c92a2a", "orange0": "#fff4e6", "orange1": "#ffe8cc", "orange2": "#ffd8a8", "orange3": "#ffc078", "orange4": "#ffa94d", "orange5": "#ff922b", "orange6": "#fd7e14", "orange7": "#f76707", "orange8": "#e8590c", "orange9": "#d9480f", "yellow0": "#fff9db", "yellow1": "#fff3bf", "yellow2": "#ffec99", "yellow3": "#ffe066", "yellow4": "#ffd43b", "yellow5": "#fcc419", "yellow6": "#fab005", "yellow7": "#f59f00", "yellow8": "#f08c00", "yellow9": "#e67700", "lime0": "#f4fce3", "lime1": "#e9fac8", "lime2": "#d8f5a2", "lime3": 
"#c0eb75", "lime4": "#a9e34b", "lime5": "#94d82d", "lime6": "#82c91e", "lime7": "#74b816", "lime8": "#66a80f", "lime9": "#5c940d", "green0": "#ebfbee", "green1": "#d3f9d8", "green2": "#b2f2bb", "green3": "#8ce99a", "green4": "#69db7c", "green5": "#51cf66", "green6": "#40c057", "green7": "#37b24d", "green8": "#2f9e44", "green9": "#2b8a3e", "teal0": "#e6fcf5", "teal1": "#c3fae8", "teal2": "#96f2d7", "teal3": "#63e6be", "teal4": "#38d9a9", "teal5": "#20c997", "teal6": "#12b886", "teal7": "#0ca678", "teal8": "#099268", "teal9": "#087f5b", "cyan0": "#e3fafc", "cyan1": "#c5f6fa", "cyan2": "#99e9f2", "cyan3": "#66d9e8", "cyan4": "#3bc9db", "cyan5": "#22b8cf", "cyan6": "#15aabf", "cyan7": "#1098ad", "cyan8": "#0c8599", "cyan9": "#0b7285"}

In [4]:
# Colours handed out, in this order, to the non-"ok" outcomes of a chart.
OONI_NOK_COLORS = [
    OONI_COLOR_SCHEME[shade_name]
    for shade_name in ('red8', 'yellow6', 'gray5', 'blue6', 'orange6')
]
# Colour reserved for the "ok" outcome.
OONI_OK_COLOR = OONI_COLOR_SCHEME['green6']

In [5]:
def make_ooni_color_scale(all_failures):
    """
    Given the failure strings shown in a chart, build an OONI compatible
    Altair colour scale for them.

    "ok" is always the first entry of the scale domain and is mapped to
    OONI_OK_COLOR. Every other string in `all_failures` is mapped, in input
    order, to the OONI_NOK_COLORS entry at the same position.

    Args:
        all_failures: iterable of failure strings; it may or may not contain
            "ok". The argument itself is never modified.

    Returns:
        An `alt.Scale` whose `domain` and `range` lists line up one-to-one.

    Raises:
        Exception: if there are more non-"ok" strings than OONI_NOK_COLORS
            has colours.
    """
    # Filter into a new list instead of calling .remove() on the argument, so
    # the caller's list is left untouched and any iterable is accepted.
    failures = [failure_str for failure_str in all_failures if failure_str != 'ok']
    if len(failures) > len(OONI_NOK_COLORS):
        raise Exception(f"too many failure strings for current color scheme: {failures}")

    color_domain = ["ok", *failures]
    color_range = [OONI_OK_COLOR, *OONI_NOK_COLORS[:len(failures)]]
    return alt.Scale(domain=color_domain, range=color_range)

## Load the source datasets
* `2025-05-01-TZ.csv.gz` contains all measurements from 2025-05-01 to 2025-06-01 coming from Tanzania for `twitter.com` and `x.com`
* `2025-06-01-TZ.csv.gz` contains all measurements from 2025-06-01 to 2025-07-07 coming from Tanzania for `twitter.com` and `x.com`


These CSV files were generated by running the following query on our observation table researcher database:

```
WITH multiIf(
    dns_failure IS NOT NULL, tuple('dns', dns_failure),
    tcp_failure IS NOT NULL, tuple('tcp', tcp_failure),
    tls_failure IS NOT NULL, tuple('tls', tls_failure),
    http_failure IS NOT NULL, tuple('https', http_failure),
    tuple('ok', '')
) as failure
SELECT 
report_id,
input,
test_name,
test_version,
measurement_uid,
probe_cc,
probe_asn,
probe_as_org_name,
probe_as_cc,
network_type,
measurement_start_time,
hostname,
ip,
port,
ip_asn,
ip_as_org_name,
resolver_ip,
resolver_cc,
resolver_asn,
resolver_as_org_name,
resolver_as_cc,
dns_engine,
dns_failure,
dns_answer,
tcp_success,
tcp_failure,
tls_handshake_time,
tls_handshake_read_count,
tls_handshake_write_count,
tls_handshake_read_bytes,
tls_handshake_write_bytes,
tls_handshake_last_operation,
tls_cipher_suite IS NOT NULL as tls_success,
tls_is_certificate_valid,
tls_end_entity_certificate_subject,
tls_end_entity_certificate_subject_common_name,
tls_end_entity_certificate_issuer,
tls_end_entity_certificate_issuer_common_name,
tls_end_entity_certificate_san_list,
tls_end_entity_certificate_not_valid_after,
tls_end_entity_certificate_not_valid_before,
tls_certificate_chain_length,
tls_failure,
http_request_url,
http_failure,
http_runtime,
probe_analysis,
failure.1 as failure_class,
IF(failure_class = 'ok', 'ok', concat(failure_class, '.', failure_str)) as failure_str_full,
IF(startsWith(failure.2, 'unknown_failure'), 'unknown_failure', failure.2) as failure_str,
failure.2 as failure_str_raw
FROM obs_web
WHERE measurement_start_time > %(measurement_start_day)s
AND measurement_start_time < %(measurement_end_day)s
AND probe_cc IN %(cc_list)s
AND test_name IN %(test_name)s
```

The code used for extracting these fields from the raw JSON measurements can be found here: https://github.com/ooni/data. 

If you require access to this database, please contact us at contact@ooni.org.

In [None]:
# Columns that pandas should parse as datetimes while reading the CSVs.
date_columns = [
    'measurement_start_time',
    'tls_end_entity_certificate_not_valid_after',
    'tls_end_entity_certificate_not_valid_before',
]
# Read these failure columns with an explicit object dtype
# (presumably to avoid dtype guessing on mostly-empty columns -- TODO confirm).
column_types = {
    'dns_failure': np.dtype('O'),
    'tls_failure': np.dtype('O'),
}

# One frame per monthly export, read with identical options.
df_05, df_06 = (
    pd.read_csv(csv_path, parse_dates=date_columns, dtype=column_types)
    for csv_path in ("2025-05-01-TZ.csv.gz", "2025-06-01-TZ.csv.gz")
)

# Stack both exports into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_05, df_06], ignore_index=True)

In [None]:
# First and last day covered by the two source datasets described under
# "Load the source datasets"; interpolated into chart titles below.
MEASUREMENT_START_DAY = '2025-05-01'
MEASUREMENT_END_DAY = '2025-07-07'

In [None]:
def calculate_grouped_with_totals(df_raw, freq='D', dropna=True):
    """
    Count observations per time bucket, network, target and outcome, and
    express each outcome as a share of its group.

    Observations are bucketed on `measurement_start_time` with frequency
    `freq` and grouped by probe network (`probe_as_org_name`, `probe_asn`),
    `hostname`, `ip`, `network_type`, `resolver_asn` and `failure_str_full`.

    Args:
        df_raw: DataFrame of observations. It must contain the columns named
            above plus `measurement_uid`, with `measurement_start_time` as a
            datetime column. It is not modified.
        freq: pandas frequency string for the time bucket (default 'D',
            daily).
        dropna: forwarded to `groupby`. With the default `True`, any
            observation that has a missing value in one of the grouping
            columns (for example no `ip`) is silently left out of the
            counts. Pass `False` to keep such observations as their own
            groups.

    Returns:
        DataFrame with one row per group and outcome: the grouping columns,
        `obs_count` (number of non-null `measurement_uid` values),
        `total_count` (sum of `obs_count` over all outcomes of the same
        group) and `percentage` (`obs_count / total_count * 100`).
    """
    time_col = 'measurement_start_time'
    dimension_cols = [
        'probe_as_org_name',
        'probe_asn',
        'hostname',
        'ip',
        'failure_str_full',
        'network_type',
        'resolver_asn',
    ]

    df_grouped = (
        df_raw
        .groupby([pd.Grouper(freq=freq, key=time_col), *dimension_cols], dropna=dropna)['measurement_uid']
        .count()
        .reset_index(name='obs_count')
    )

    # Totals are taken over every grouping column except the outcome itself.
    group_cols = [time_col] + [col for col in dimension_cols if col != 'failure_str_full']
    # transform('sum') broadcasts each group's total back onto its rows, which
    # avoids building a second frame and merging it back in.
    total_count = df_grouped.groupby(group_cols, dropna=dropna)['obs_count'].transform('sum')

    df_with_totals = df_grouped.assign(total_count=total_count)
    df_with_totals['percentage'] = (df_with_totals['obs_count'] / df_with_totals['total_count']) * 100
    return df_with_totals

In [None]:
def plot_no_disaggregate(df_msmts, start_time='2025-05-01', save=False, title='Overall accessibility of twitter.com and x.com (web_connectivity)', fname_extra=''):
    """
    Display one chart of outcomes over time, aggregated over all networks.

    `obs_count` is summed per time bucket and `failure_str_full`, converted
    into a percentage of the bucket total and drawn as stacked bars. Circles
    sized by `total_count` are layered on top, and the OONI logo is placed
    below the chart.

    Args:
        df_msmts: DataFrame with `measurement_start_time`,
            `failure_str_full` and `obs_count` columns. It is not modified.
        start_time: only buckets at or after this time are plotted.
        save: accepted for interface compatibility; currently ignored, the
            chart is always displayed inline.
        title: chart title; " in Tanzania" is appended to it.
        fname_extra: accepted for interface compatibility; currently ignored.
    """
    # Collapse every other dimension: one row per (time bucket, outcome).
    df_all = df_msmts.groupby(['measurement_start_time', 'failure_str_full'])['obs_count'].sum().reset_index()
    df_all['total_count'] = df_all.groupby('measurement_start_time')['obs_count'].transform('sum')
    df_all['percentage'] = (df_all['obs_count'] / df_all['total_count']) * 100

    left_offset = -30
    top_offset = -20
    ooni_logo = alt.Chart(
        {"values": [{"url": "https://raw.githubusercontent.com/ooni/design-system/refs/heads/master/svgs/logos/OONI-HorizontalMonochrome.svg"}]}
    ).mark_image(opacity=0.5).encode(
        x=alt.value(left_offset), x2=alt.value(left_offset+80),  # pixels from left
        y=alt.value(top_offset), y2=alt.value(top_offset+40),  # pixels from top
        url="url:N"
    )

    # Buckets are stamped at the start of their period, so the comparison has
    # to be inclusive or the bucket that begins exactly at start_time is lost.
    df_sel = df_all[df_all['measurement_start_time'] >= start_time]

    base_msmts = alt.Chart(df_sel)

    color_scale = make_ooni_color_scale(list(df_sel['failure_str_full'].unique()))
    tooltip = (
        alt.Tooltip("measurement_start_time:T", format="%Y-%m-%dT%H"),
        'failure_str_full', 'percentage', 'obs_count',
        'total_count'
    )

    bar_chart = base_msmts.mark_bar().encode(
        x=alt.X('measurement_start_time:T', axis=alt.Axis(format="%Y-%m-%dT%H")),
        y='percentage',
        color=alt.Color('failure_str_full',
            scale=color_scale,
            legend=alt.Legend(
                title=None,
                orient='none',
                legendY=-20,
                labelOpacity=1,
                direction='horizontal',
                titleAnchor='start')
        ),
        tooltip=tooltip
    )

    # NOTE(review): the bars (percentage) and the circles (total_count) are
    # layered without an independent y scale, so they appear to share a single
    # y axis -- confirm this is intended.
    count_chart = base_msmts.mark_circle().encode(
        x='measurement_start_time',
        y='total_count',
        size=alt.Size('total_count'),
        tooltip=tooltip
    )

    chart = alt.vconcat((bar_chart + count_chart).resolve_legend(color='independent').properties(
        width=1500,
        height=300,
        title=f'{title} in Tanzania'
    ), ooni_logo).configure_concat(
        spacing=-30
    ).configure_view(
        strokeOpacity=0
    )
    display(chart)

In [None]:
def plot_all_charts_wc(df_msmts, start_time='2025-05-01', save=False, min_observations=10):
    """
    Display one web_connectivity chart per probe ASN.

    For every `probe_asn` in `df_msmts`, the rows at or after `start_time`
    are drawn as bars of `obs_count` over time, coloured by
    `failure_str_full`, with the OONI logo placed below the chart.

    Args:
        df_msmts: DataFrame with `measurement_start_time`, `probe_asn`,
            `probe_as_org_name`, `failure_str_full`, `obs_count`,
            `total_count` and `percentage` columns. It is not modified.
        start_time: only rows whose time bucket is at or after this time are
            plotted.
        save: accepted for interface compatibility; currently ignored, every
            chart is displayed inline and nothing is written to disk.
        min_observations: networks with fewer observations than this in the
            selected period are skipped.
    """
    left_offset = -30
    top_offset = -20
    ooni_logo = alt.Chart(
        {"values": [{"url": "https://raw.githubusercontent.com/ooni/design-system/refs/heads/master/svgs/logos/OONI-HorizontalMonochrome.svg"}]}
    ).mark_image(opacity=0.5).encode(
        x=alt.value(left_offset), x2=alt.value(left_offset+80),  # pixels from left
        y=alt.value(top_offset), y2=alt.value(top_offset+40),  # pixels from top
        url="url:N"
    )

    # Buckets are stamped at the start of their period, so the comparison has
    # to be inclusive or the bucket that begins exactly at start_time is lost.
    df_recent = df_msmts[df_msmts['measurement_start_time'] >= start_time]

    # One chart per ASN: charts are filtered on probe_asn alone, so an ASN seen
    # with several org names would otherwise be drawn once per name with the
    # same data. The first org name encountered is used in the title.
    networks = df_recent[['probe_asn', 'probe_as_org_name']].drop_duplicates(subset='probe_asn')

    for probe_asn, probe_as_org_name in networks.itertuples(index=False, name=None):
        df_sel = df_recent[df_recent['probe_asn'] == probe_asn]

        # Threshold on obs_count: total_count is repeated on every outcome row
        # of a group, so summing it would count the same observations twice.
        if df_sel['obs_count'].sum() < min_observations:
            continue

        color_scale = make_ooni_color_scale(list(df_sel['failure_str_full'].unique()))
        bar_chart = alt.Chart(df_sel).mark_bar().encode(
            x=alt.X('measurement_start_time:T', axis=alt.Axis(format="%Y-%m-%dT%H"), title='measurement start time'),
            y=alt.Y('obs_count', title='observation count'),
            color=alt.Color('failure_str_full',
                scale=color_scale,
                legend=alt.Legend(
                    title=None,
                    orient='right',
                    legendY=-20,
                    labelOpacity=1,
                    direction='vertical',
                    titleAnchor='start')
            ),
            tooltip=(
                alt.Tooltip("measurement_start_time:T", format="%Y-%m-%dT%H"),
                'probe_as_org_name',
                'failure_str_full', 'percentage', 'obs_count',
                'total_count'
            )
        )

        chart = alt.vconcat(bar_chart.resolve_legend(color='independent').properties(
            width=1000,
            height=300,
            title=f'Accessibility of twitter (web_connectivity) in Tanzania on {probe_as_org_name} (AS{probe_asn})'
        ), ooni_logo).configure_concat(
            spacing=-30
        ).configure_view(
            strokeOpacity=0
        )
        display(chart)

In [None]:
# Aggregate the raw observations into per-bucket counts, totals and percentages.
df_wc_totals = calculate_grouped_with_totals(df)

In [None]:
# Total observations per probe network, largest first, plus a display label
# of the form "AS<asn>: <org name>" for the chart below.
cnt_by_asn = (
    df_wc_totals
    .groupby(['probe_asn', 'probe_as_org_name'])['obs_count']
    .sum()
    .reset_index()
    .sort_values(by='obs_count', ascending=False)
    # Vectorized string concatenation instead of a Python-level apply per row.
    .assign(
        net_name=lambda counts: (
            'AS' + counts['probe_asn'].astype(str)
            + ': ' + counts['probe_as_org_name'].astype(str)
        )
    )
)

In [None]:
# Horizontal bars: one per network, ordered by observation count.
base = alt.Chart(cnt_by_asn)

bar = base.mark_bar(color='#0588cb').encode(
    x=alt.X('obs_count'),
    y=alt.Y('net_name').sort('-x'),
    text='obs_count',
)

# Numeric label drawn just to the right of each bar.
text = bar.mark_text(align="left", baseline="middle", dx=3).encode(text="obs_count:Q")

# Last expression of the cell, so the layered chart is rendered as its output.
alt.layer(bar, text).properties(
    title=f'Observations for twitter.com or x.com in Tanzania {MEASUREMENT_START_DAY} - {MEASUREMENT_END_DAY}'
)

In [None]:
# Produce the per-network (probe ASN) web_connectivity charts.
plot_all_charts_wc(df_wc_totals, save=True)