In [9]:
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()
from bokeh.plotting import figure, show

In [10]:
# Let pandas render up to 500 columns and 100 rows before truncating a frame.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)
# Turn off Altair's max-rows check on chart data; kept as the last expression
# so the cell still displays the call's return value.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [11]:
from clickhouse_driver import Client as Clickhouse
from uuid import uuid4
from pathlib import Path

def click_query(q, params=None):
    """Run a query against the ClickHouse server on ``localhost``.

    Args:
        q: SQL query text.
        params: Optional query parameters, forwarded unchanged to
            ``Client.query_dataframe``.

    Returns:
        The value returned by ``Client.query_dataframe`` for ``q``.
    """
    click = Clickhouse("localhost")
    try:
        return click.query_dataframe(q, params=params)
    finally:
        # A fresh client is created on every call, so close its connection
        # explicitly instead of leaving it open until garbage collection.
        click.disconnect()

def click_query_fsn(q, params=None):
    """Run a query against the ClickHouse server at ``backend-fsn.ooni.org``.

    Args:
        q: SQL query text.
        params: Optional query parameters, forwarded unchanged to
            ``Client.query_dataframe``.

    Returns:
        The value returned by ``Client.query_dataframe`` for ``q``.
    """
    click = Clickhouse("backend-fsn.ooni.org")
    try:
        return click.query_dataframe(q, params=params)
    finally:
        # A fresh client is created on every call, so close its connection
        # explicitly instead of leaving it open until garbage collection.
        click.disconnect()

In [12]:
def get_explorer_url(e):
    """Build the OONI Explorer URL for a measurement.

    Args:
        e: Mapping (e.g. a dict or a DataFrame row) that provides a
            ``'measurement_uid'`` entry.

    Returns:
        The ``https://explorer.ooni.org/m/<measurement_uid>`` URL as a string.
    """
    # Double quotes outside, single quotes inside: reusing the same quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    return f"https://explorer.ooni.org/m/{e['measurement_uid']}"
def print_explorer_url(e):
    """Print the OONI Explorer URL of measurement ``e`` to stdout."""
    url = get_explorer_url(e)
    print(url)

# Unusual combinations
Try to find easy unusual combinations in software_name, platform, like (ooniprobe-android, ios) 

Possible field combinations with inconsistencies: 
- **software_name, platform** ooniprobe-android on ios? ooniprobe-desktop in android?
    - (*android, !android) 
    - (*ios, !ios)
    - (*desktop, !(windows | macos | linux))
- **platform, architecture** ios or android on amd64?
    - (*ios, !arm)
    - (*android, !arm) with the caveat that there are x86 android devices, just not too many. Keep an eye out for high volume anomalies  
- **software_name, software_version** a non existent release version for android?
- **software_version, engine_version** an engine version that wasn't released with that software?

## What to do after finding an anomaly

- look at the country codes
- look at the test results
- look at the dates of these measurements

## (*android, !android)

In [13]:
# Count fastpath rows per (software_name, platform) where software_name
# contains 'android' but the non-empty platform does not; smallest groups first.
click_query_fsn("""SELECT software_name, platform, COUNT(*) AS total 
FROM fastpath 
where platform <> '' and software_name like '%android%' and platform not like '%android%'
GROUP BY software_name, platform 
ORDER BY total ASC;""")

Unnamed: 0,software_name,platform,total
0,ooniprobe-android,linux,1
1,ooniprobe-android,React OS,6
2,ooniprobe-android,macos,161
3,ooniprobe-android-unattended,macos,550


The most remarkable ones in the previous experiment are the ooniprobe-android - macos combinations, but those might be due to development machines. Let's find out.

In [14]:
# Break the ooniprobe-android / ooniprobe-android-unattended rows reported with
# platform 'macos' down by country (probe_cc) and ASN (probe_asn), largest first.
click_query_fsn(
    """
    select software_name, platform, probe_cc, probe_asn, count(*) as total
    from fastpath 
    where (software_name = 'ooniprobe-android' or software_name = 'ooniprobe-android-unattended') and (platform = 'macos')
    group by (software_name, platform, probe_cc, probe_asn)
    order by total desc; 
    """
)

Unnamed: 0,software_name,platform,probe_cc,probe_asn,total
0,ooniprobe-android-unattended,macos,CA,13335,320
1,ooniprobe-android-unattended,macos,US,13335,224
2,ooniprobe-android,macos,CA,13335,117
3,ooniprobe-android,macos,US,13335,37
4,ooniprobe-android,macos,PT,12353,7
5,ooniprobe-android-unattended,macos,HN,13335,5
6,ooniprobe-android-unattended,macos,NG,13335,1


After talking to Norbel I discovered that **most of these were just Norbel's and Sergio's machines**.

The US/CA ASNs correspond to cloudflare, and Norbel always works with a cloudflare VPN activated

The PT ASN corresponds to Sergio

What about the **React OS** we saw before? maybe it's related to the `ooniprobe-react-os` software name?

In [15]:
# List the individual rows whose platform contains 'React' and whose
# software_name contains 'android', newest measurement_start_time first.
click_query_fsn(
    """
    select software_name, platform, probe_cc, probe_asn, architecture, measurement_start_time as st 
    from fastpath
    where platform like '%React%' and software_name like '%android%'
    order by st desc
    """
)

Unnamed: 0,software_name,platform,probe_cc,probe_asn,architecture,st
0,ooniprobe-android,React OS,CN,56040,amd64,2024-02-14 15:18:08
1,ooniprobe-android,React OS,CN,9808,amd64,2024-01-18 06:08:23
2,ooniprobe-android,React OS,CN,9808,amd64,2024-01-18 05:24:46
3,ooniprobe-android,React OS,CN,9808,amd64,2024-01-18 05:20:54
4,ooniprobe-android,React OS,CN,56040,amd64,2023-12-27 04:24:58
5,ooniprobe-android,React OS,CN,56040,amd64,2023-12-27 02:36:10


These are all from China, and quite old

56040, 9808 = China mobile ISP
 

## (*ios, !ios)

In [16]:
# Count rows per (software_name, platform) where software_name contains 'ios'
# (debug/dev names excluded) but the non-empty platform does not contain 'ios'.
click_query_fsn(
    """
    select software_name, platform, count(*) as total
    from fastpath
    where software_name like '%ios%' and software_name not like '%debug%' and software_name not like '%dev%' and platform not like '%ios%' and platform <> ''
    group by (platform, software_name)
    order by total asc
    """
)

Unnamed: 0,software_name,platform,total


Nothing strange it seems

## (*desktop, !(windows | macos | linux))

In [17]:
# Count rows per (software_name, platform) where software_name contains
# 'desktop' (debug/dev names excluded) but the non-empty platform contains
# none of 'linux', 'windows' or 'macos'.
click_query_fsn(
    """
    select software_name, platform, count(*) as total
    from fastpath
    where software_name like '%desktop%' and software_name not like '%debug%' and software_name not like '%dev%' and platform not like '%linux%' and platform not like '%windows%' and platform not like '%macos%' and platform <> ''
    group by (platform, software_name)
    order by total asc
    """
)

Unnamed: 0,software_name,platform,total


## Conclusions (software_name, platform)
- No issues detected for (*ios, !ios)
- No issues detected for (*desktop, !(macos | windows | linux))
- Some issues detected for (*android, !android) but it was just development measurements

## *IOS, !arm

Something we can check is comparing the platform against the architecture. For example, there are no x86 ios devices

In [18]:
# Count rows per (software_name, platform, architecture) where platform
# contains 'ios' and the non-empty architecture does not contain 'arm'.
click_query_fsn(
    """
    select software_name, platform, architecture, count(*) as total 
    from fastpath
    where platform like '%ios%' and architecture <> '' and architecture not like '%arm%' 
    group by (software_name, platform, architecture) 
    order by total asc   
    """
)

Unnamed: 0,software_name,platform,architecture,total
0,ooniprobe-ios,ios,amd64,2334


There seem to be some measurements reporting a non-arm architecture on iOS, let's see

In [19]:
# Split the platform 'ios' / architecture 'amd64' rows by country (probe_cc)
# and ASN (probe_asn), smallest groups first.
click_query_fsn(
    """
    select software_name, platform, architecture, probe_cc, probe_asn, count(*) as total 
    from fastpath 
    where platform = 'ios' and architecture = 'amd64' 
    group by software_name, platform, architecture, probe_cc, probe_asn 
    order by total asc;
    """
)

Unnamed: 0,software_name,platform,architecture,probe_cc,probe_asn,total
0,ooniprobe-ios,ios,amd64,CM,36912,7
1,ooniprobe-ios,ios,amd64,US,395337,31
2,ooniprobe-ios,ios,amd64,US,54112,40
3,ooniprobe-ios,ios,amd64,US,395336,117
4,ooniprobe-ios,ios,amd64,US,8075,667
5,ooniprobe-ios,ios,amd64,CM,15964,1472


ASNs to name:
- 395337, 54112, 395336: MacStadium, Inc. is a company in the technology industry that provides managed hosting and cloud computing services for businesses.
- 8075: microsoft

All of these come from either datacenter ASNs or Cameroon, so it's probably Norbel. 
However, **it's strange that he works on an M2 mac and the app reports amd64**

In [20]:
# Ten most recent rows (by measurement_start_time) with platform 'ios' and
# architecture 'amd64'.
click_query_fsn(
    """
    select software_name, platform, architecture, probe_cc, probe_asn, measurement_start_time as st 
    from fastpath 
    where platform = 'ios' and architecture = 'amd64' 
    order by st desc 
    limit 10;
    """
)

Unnamed: 0,software_name,platform,architecture,probe_cc,probe_asn,st
0,ooniprobe-ios,ios,amd64,US,8075,2024-04-23 02:33:47
1,ooniprobe-ios,ios,amd64,US,8075,2024-04-23 02:33:11
2,ooniprobe-ios,ios,amd64,US,8075,2024-04-21 02:34:30
3,ooniprobe-ios,ios,amd64,US,8075,2024-04-21 02:33:46
4,ooniprobe-ios,ios,amd64,US,8075,2024-04-20 02:27:20
5,ooniprobe-ios,ios,amd64,US,8075,2024-04-20 02:26:43
6,ooniprobe-ios,ios,amd64,US,8075,2024-04-18 02:32:34
7,ooniprobe-ios,ios,amd64,US,8075,2024-04-18 02:31:52
8,ooniprobe-ios,ios,amd64,US,8075,2024-04-16 02:28:01
9,ooniprobe-ios,ios,amd64,US,8075,2024-04-16 02:27:23


These measurements are a bit old, so they might come from an old computer from Norbel?

## *android, !arm
Remember that there are some amd64 android devices; we just want to see if there's some volume anomaly here

In [26]:
# Totals per (platform, architecture) for rows whose platform contains
# 'android' and whose non-empty architecture does not contain 'arm';
# debug/dev software names are excluded.
android_noarm_df = click_query_fsn(
    """
    select platform, architecture, count(*) as total 
    from fastpath
    where platform like '%android%' and architecture not like '%arm%' and platform <> '' and architecture <> '' and software_name not like '%debug%' and software_name not like '%dev%'
    group by platform, architecture
    order by total asc;
    """
)
# Bare expression so the frame is rendered as the cell output.
android_noarm_df

Unnamed: 0,platform,architecture,total
0,android,386,1253990
1,android,amd64,4350163


There are many measurements from non-arm android devices.

Let's see the distribution by software_name

In [22]:
# Totals per (software_name, platform, architecture) for rows whose platform
# contains 'android' and whose non-empty architecture does not contain 'arm';
# debug/dev software names are excluded.
click_query_fsn(
    """
    select software_name, platform, architecture, count(*) as total 
    from fastpath
    where platform like '%android%' and architecture not like '%arm%' and platform <> '' and architecture <> '' and software_name not like '%debug%' and software_name not like '%dev%'
    group by software_name, platform, architecture
    order by total asc;
    """
)

Unnamed: 0,software_name,platform,architecture,total
0,news-media-scan-android-unattended,android,386,71
1,news-media-scan-android-unattended,android,amd64,100
2,ooniprobe-android-experimental,android,386,102
3,news-media-scan-android,android,386,334
4,news-media-scan-android,android,amd64,731
5,ooniprobe-android,android,386,9606
6,ooniprobe-android,android,amd64,75782
7,ooniprobe-android-unattended,android,386,1243744
8,ooniprobe-android-unattended,android,amd64,4273495


Last 5 measurements with platform = android, architecture = amd64. These are quite recent

In [23]:
# Five most recent rows (by measurement_start_time) with platform 'android'
# and architecture 'amd64'.
click_query_fsn(
    """
       select software_name, platform, architecture, measurement_start_time
    from fastpath
    where platform = 'android' and architecture = 'amd64'              
    order by measurement_start_time desc limit 5
    """
)

Unnamed: 0,software_name,platform,architecture,measurement_start_time
0,ooniprobe-android-unattended,android,amd64,2025-04-08 12:59:33
1,ooniprobe-android-unattended,android,amd64,2025-04-08 12:59:28
2,ooniprobe-android-unattended,android,amd64,2025-04-08 12:59:13
3,ooniprobe-android-unattended,android,amd64,2025-04-08 12:59:07
4,ooniprobe-android-unattended,android,amd64,2025-04-08 12:58:52


non-arm android distribution by country

In [24]:
# Totals per (software_name, platform, architecture, probe_cc) for platform
# 'android' rows with a non-empty, non-arm architecture, largest groups first.
# NOTE(review): this filter has no debug/dev software_name exclusion, so
# debug builds are counted here — confirm whether that is intended.
click_query_fsn(
    """
    select software_name, platform, architecture, probe_cc, count(*) as total
    from fastpath
    where platform = 'android' and architecture not like '%arm%' and architecture <> ''               
    group by software_name, platform, architecture, probe_cc
    order by total desc
    """
)

Unnamed: 0,software_name,platform,architecture,probe_cc,total
0,ooniprobe-android-unattended,android,amd64,US,3441340
1,ooniprobe-android-unattended,android,386,CA,877851
2,ooniprobe-android-unattended,android,386,US,322348
3,ooniprobe-android-unattended,android,amd64,IT,274433
4,ooniprobe-android-unattended,android,amd64,DE,219807
...,...,...,...,...,...
188,ooniprobe-android-unattended,android,386,UA,1
189,ooniprobe-android-unattended,android,amd64,UY,1
190,ooniprobe-android,android,amd64,HU,1
191,ooniprobe-android-debug,android,amd64,CN,1


Most of these measurements come from the US. Let's look at the ASNs to check if they come from consumer ISPs

In [25]:
# Totals per (software_name, platform, architecture, probe_cc, probe_asn) for
# US (probe_cc = 'US') platform 'android' rows with a non-empty, non-arm
# architecture, largest groups first.
# NOTE(review): this filter has no debug/dev software_name exclusion, so
# debug builds are counted here — confirm whether that is intended.
click_query_fsn(
    """
    select software_name, platform, architecture, probe_cc, probe_asn, count(*) as total
    from fastpath
    where platform = 'android' and architecture not like '%arm%' and architecture <> '' and probe_cc = 'US'               
    group by software_name, platform, architecture, probe_cc, probe_asn
    order by total desc
    """
)

Unnamed: 0,software_name,platform,architecture,probe_cc,probe_asn,total
0,ooniprobe-android-unattended,android,amd64,US,10796,1866690
1,ooniprobe-android-unattended,android,amd64,US,7922,488155
2,ooniprobe-android-unattended,android,amd64,US,6128,370198
3,ooniprobe-android-unattended,android,amd64,US,5650,186091
4,ooniprobe-android-unattended,android,386,US,21928,121460
...,...,...,...,...,...,...
212,ooniprobe-android,android,amd64,US,30036,1
213,ooniprobe-android,android,amd64,US,63018,1
214,ooniprobe-android,android,amd64,US,63949,1
215,ooniprobe-android-unattended,android,386,US,400861,1


- 10796: Charter Communications Inc. is a telecommunications company operating in the cable television, internet service provider, and telephone industries.
- 7922: Comcast Cable Communications, LLC is a telecommunications company providing cable television, internet, phone, and wireless services in the United States.
- 6128: Cablevision Systems Corp. is a telecommunications company providing cable television, internet, and phone services to residential and business customers in the United States.

Most of the ASNs are consumer ASNs, so maybe this is the expected behaviour?

## Conclusions (platform, architecture)
- Nothing weird on IOS, !arm
- High volume of measurements for (android, !arm), especially coming from the US and consumer ASNs, so probably legit traffic. Still might be worth checking which devices are these