In [1]:
import pandas as pd
from IPython.display import Image
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sample_uids = [
 '20240101113125.808243_BG_webconnectivity_ba4cf234d47b73ec',
 '20240220050354.524776_US_webconnectivity_a690552818804fc3',
 '20240224232654.007984_RU_webconnectivity_9164a3df2d35316c',
 '20240226225131.261075_VE_webconnectivity_a3e31c999fc8b161',
 '20240226225231.612531_VE_webconnectivity_47d3561e542337d8',
 '20240226225245.422977_VE_webconnectivity_37dcdd51581cbd13',
 '20240227160142.829208_US_webconnectivity_2c9b945991901f8d',
 '20240311005741.675686_SE_webconnectivity_2a4a84928bd17ab0',
 '20240316062349.575036_US_webconnectivity_b63efd0ed694e005',
 '20240328194436.111684_TR_webconnectivity_5414b98b9c9f873b',
 '20240409035157.511121_BR_webconnectivity_85b1de4e410314f7',
 '20240410115514.842786_BR_webconnectivity_6d224ea8b6b3ca7c',
 '20240410124541.400148_BR_webconnectivity_ac16eb05475f622a',
 '20240410134516.191956_BR_webconnectivity_a947a71df23203f3',
 '20240410135012.070725_BR_webconnectivity_d3e9bce608599d5e',
 '20240413230148.929136_KH_webconnectivity_33ae9436d7b6779c',
 '20240415141511.755790_BR_webconnectivity_07547042a9307621',
 '20240416214307.425888_BR_webconnectivity_c9d912510d61bc0b',
 '20240418071953.744226_BR_webconnectivity_2c6b65eeff39f34f',
 '20240506163355.336279_VN_webconnectivity_8de9ce661982414d',
 '20240508114544.444840_BR_webconnectivity_379718729eeed45b',
 '20240514091759.626287_NL_webconnectivity_38962f675df0f676',
 '20240519182127.049517_US_webconnectivity_6e12ec842e98674a',
 '20240522113817.495473_NL_webconnectivity_e81ed947814f9d2a',
 '20240524142956.125292_JO_webconnectivity_5cd31e0f03494bd2',
 '20240627074955.407639_RU_webconnectivity_99e23c2b9680c1cb',
 '20240704234509.514733_NL_webconnectivity_6f924d2e43209ae1',
 '20240705015016.667807_AE_webconnectivity_392604bce72a44d5',
 '20240722041303.591874_MM_webconnectivity_98e6671dd7dce61a',
 '20240722041334.778473_MM_webconnectivity_93a628687303be86',
 '20240722042950.433988_MM_webconnectivity_d388cdc8bad1f15c',
 '20240728114019.011456_VE_webconnectivity_1f0ba3819685e272',
 '20240810175938.750056_VE_webconnectivity_016721e1ebfdc2ed',
 '20240810182504.634031_VE_webconnectivity_853b19341a35d2f0',
 '20240810182604.928599_VE_webconnectivity_f42bd5020088c498',
 '20240815043926.030750_VE_webconnectivity_f034e025b6db08ea',
 '20240815092817.050941_DE_webconnectivity_661003d3498d802f',
 '20240815172949.387668_VE_webconnectivity_427addcd5d1ea806',
 '20240818234433.791006_VE_webconnectivity_8fac03865551a8eb',
 '20240820163242.249906_JO_webconnectivity_a826f460299fa996']

In [3]:
from pathlib import Path
from oonidata.dataclient import load_measurement
from oonidata.apiclient import get_measurement_dict_by_uid

from oonipipeline.netinfo import NetinfoDB
from oonipipeline.transforms.observations import measurement_to_observations

In [82]:
netinfodb = NetinfoDB(datadir=Path("datadir"), download=False)
# 20231129041111.939896_NP_webconnectivity_32105af63774eabf
#raw_msmt = get_measurement_dict_by_uid("20240804233508.181041_TR_webconnectivity_9a7272e520f3d836")
def make_obs_ctrl(measurement_uid):
    raw_msmt = get_measurement_dict_by_uid(measurement_uid)
    msmt = load_measurement(msmt=raw_msmt)
    web_observations, web_control_observations = measurement_to_observations(
        msmt, netinfodb=netinfodb
    )
    return web_observations, web_control_observations

In [79]:
from collections import Counter
def most_frequent(asns):
    if not asns:
        return 0
    count = Counter(asns)
    most_common = count.most_common(1)
    return most_common[0][0]

CLOUD_PROVIDERS_ASNS = [
    13335,  # Cloudflare: https://www.peeringdb.com/net/4224
    209242, #	Cloudflare London, LLC	
    20940,  # Akamai: https://www.peeringdb.com/net/2
    9002,  # Akamai RETN
    16625, # Akamai Technologies, Inc.	
    63949, # Akamai Technologies, Inc.	
    16509, #	Amazon.com, Inc.
    14618, #	Amazon.com, Inc.	
    15169, #	Google LLC
    396982,  # Google Cloud: https://www.peeringdb.com/net/30878
    54113, #	Fastly, Inc
    8075, # Microsoft Corporation
    8068, #	Microsoft Corporation
]
def is_cloud_provider(as_number):
    return as_number in CLOUD_PROVIDERS_ASNS

def dns_feature_vector(dns_answers, w_ctrl):
    ctrl_map = {}
    for ctrl in w_ctrl:
        ctrl_map[ctrl.ip] = ctrl

    unreachable_count = 0
    for answer in dns_answers:
        if ctrl_map.get(answer.ip):
            if ctrl_map[answer.ip].tcp_success == False:
                unreachable_count += 1
    dns_answer_vector = {
        "answer_count": len(dns_answers),
        "answer_cloud_provider": 1 if any([is_cloud_provider(x.ip_asn) for x in dns_answers]) else 0,
        "unreachable_answer_count": unreachable_count,
        "top_answer_asn": most_frequent(map(lambda x: x.ip_asn, dns_answers))
    }
    return dns_answer_vector

In [None]:
obs, ctrl = make_obs_ctrl("20240316062349.575036_US_webconnectivity_b63efd0ed694e005")

In [86]:
from pprint import pprint
#pprint(obs[0])

In [80]:
from scipy.spatial import distance
netinfodb = NetinfoDB(datadir=Path("datadir"), download=False)
def compute_dns_distance(measurement_uid):
    web_observations, web_control_observations = make_obs_ctrl(measurement_uid)
    exp_dns_answers = list(filter(lambda x: x.dns_answer is not None, web_observations))
    if len(exp_dns_answers) == 0:
        return None
    exp_vector = list(dns_feature_vector(web_observations, web_control_observations).values())
    ctrl_dns_answers = list(filter(lambda x: x.dns_failure is not None or x.dns_success is True, web_control_observations))
    ctrl_vector = list(dns_feature_vector(
        ctrl_dns_answers,
        web_control_observations
    ).values())
    try:
        d = distance.cosine(exp_vector, ctrl_vector)
        return d
    except:
        print(exp_vector, ctrl_vector)
        return None

In [81]:
distance_map = {
}
for uid in sample_uids:
    print(uid)
    d = compute_dns_distance(uid)
    print(d)
    distance_map[uid] = d

20240101113125.808243_BG_webconnectivity_ba4cf234d47b73ec
2.511132244364944e-07
20240220050354.524776_US_webconnectivity_a690552818804fc3
2.0243516862805677e-09
20240224232654.007984_RU_webconnectivity_9164a3df2d35316c
2.811796573354286e-09
20240226225131.261075_VE_webconnectivity_a3e31c999fc8b161
5.679824388593602e-10
20240226225231.612531_VE_webconnectivity_47d3561e542337d8
2.811796573354286e-09
20240226225245.422977_VE_webconnectivity_37dcdd51581cbd13
1.1247181741502743e-08
20240227160142.829208_US_webconnectivity_2c9b945991901f8d
None
20240311005741.675686_SE_webconnectivity_2a4a84928bd17ab0
1.834545071410787e-09
20240316062349.575036_US_webconnectivity_b63efd0ed694e005
[3, 0, 0, None] [2, 0, 0, 60781]
None
20240328194436.111684_TR_webconnectivity_5414b98b9c9f873b
1.3396325160286437e-09
20240409035157.511121_BR_webconnectivity_85b1de4e410314f7
0.0
20240410115514.842786_BR_webconnectivity_6d224ea8b6b3ca7c
2.8117966843765885e-09
20240410124541.400148_BR_webconnectivity_ac16eb05475f62

In [88]:
compute_dns_distance("20240729235930.015020_IR_webconnectivity_eda3016ef01b4d3b")

3.9001535001581544e-08