In [2]:
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()
from bokeh.plotting import figure, show

In [3]:
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
from clickhouse_driver import Client as Clickhouse
from uuid import uuid4
from pathlib import Path

def click_query(q, params=None):
    click = Clickhouse("localhost")
    return click.query_dataframe(q, params=params)

def click_query_fsn(q, params=None):
    click = Clickhouse("backend-fsn.ooni.org")
    return click.query_dataframe(q, params=params)

In [5]:
def get_explorer_url(e):
    return f'https://explorer.ooni.org/m/{e['measurement_uid']}'
def print_explorer_url(e):
    print(get_explorer_url(e))

# Strange software names 
We will look into measurements with non-mainstream software names, like these ones: 
- ooniprobe-android
- ooniprobe-ios
- ooniprobe-desktop
- ooniprobe-cli
- miniooni
- iThena-ooniprobe
- news-media-scan

In [6]:
ok_software_names = ['ooniprobe-android%', 'ooniprobe-ios%', 'ooniprobe-desktop%', 'ooniprobe-cli%', 'miniooni%', 'iThena-ooniprobe%', 'news-media-scan%']

notin = " and ".join(f"software_name not like '{s}'" for s in ok_software_names)
nodev = "software_name not like '%dev%' and software_name not like '%debug%'"

In [7]:

q = f"""select software_name, count(*) as total from fastpath where {notin} and {nodev} group by software_name order by total asc"""
q.replace("\n", "")

click_query_fsn(q)

Unnamed: 0,software_name,total
0,ooniprobe_ios,1
1,Vladhog Security Monitoring Service,3
2,ooni-probe-example,3
3,antani,3
4,pyminiooni,3
5,oonimkall,4
6,my-iiios,6
7,x,6
8,surfshark,6
9,TestIt,8


## Highlighted examples
- **vladhog***: [OK] Seems like a security service that uses ooniprobe to run network tests
- **murakami**: [OK] https://github.com/m-lab/murakami/tree/main it's a project similar to ooni, but it seems like a superset of ooni
- **MySorgenia**: [???] Seems like an italian app to manage some services? https://areaclienti.sorgenia.it/login?forwardURL=%2Fprivate%2Fhome
- **ooniprobe-react-os**: [!!!] the most weird one, it has a very high volume from China and we don't know who to attribute it to
- **onimkall**: [OK] I think it's ours
- **dismantle**: it has very old measurements (2023) and comes mostly from Italy, maybe it was a testing thing


### MySorgenia
It's supposed to be an mobile app, so its traffic should come from android or ios

In [8]:
click_query_fsn(
    """
    select software_name, platform, count(*) as total 
    from fastpath 
    where software_name='MySorgenia' and platform <> '' 
    group by software_name, platform
    order by total desc
    """
)

Unnamed: 0,software_name,platform,total
0,MySorgenia,android,2816


All of its measurements do come from android, which is expected 

In [9]:
click_query_fsn(
    """
    select software_name, platform, probe_cc, count(*) as total 
    from fastpath 
    where software_name='MySorgenia' and platform <> '' 
    group by software_name, platform, probe_cc
    order by total desc
    """
)

Unnamed: 0,software_name,platform,probe_cc,total
0,MySorgenia,android,IT,2725
1,MySorgenia,android,FR,60
2,MySorgenia,android,IN,9
3,MySorgenia,android,BG,6
4,MySorgenia,android,MD,4
5,MySorgenia,android,ES,2
6,MySorgenia,android,DE,2
7,MySorgenia,android,CH,2
8,MySorgenia,android,GB,1
9,MySorgenia,android,DK,1


In [11]:
click_query_fsn(
    """
    select software_name, platform, probe_cc, measurement_start_time 
    from fastpath 
    where software_name='MySorgenia' and platform <> '' 
    order by total desc
    """
)

ServerException: Code: 47.
DB::Exception: Missing columns: 'total' while processing query: 'SELECT software_name, platform, probe_cc, measurement_start_time FROM fastpath WHERE (software_name = 'MySorgenia') AND (platform != '') ORDER BY total DESC', required columns: 'software_name' 'total' 'platform' 'probe_cc' 'measurement_start_time', maybe you meant: ['software_name','platform','probe_cc','measurement_start_time']. Stack trace:

0. DB::Exception::Exception(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, int, bool) @ 0x8fe3e9a in /usr/bin/clickhouse
1. DB::TreeRewriterResult::collectUsedColumns(std::__1::shared_ptr<DB::IAST> const&, bool) @ 0x10734e43 in /usr/bin/clickhouse
2. DB::TreeRewriter::analyzeSelect(std::__1::shared_ptr<DB::IAST>&, DB::TreeRewriterResult&&, DB::SelectQueryOptions const&, std::__1::vector<DB::TableWithColumnNamesAndTypes, std::__1::allocator<DB::TableWithColumnNamesAndTypes> > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::shared_ptr<DB::TableJoin>) const @ 0x107382d0 in /usr/bin/clickhouse
3. ? @ 0x1040b324 in /usr/bin/clickhouse
4. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr<DB::IAST> const&, std::__1::shared_ptr<DB::Context const>, std::__1::shared_ptr<DB::IBlockInputStream> const&, std::__1::optional<DB::Pipe>, std::__1::shared_ptr<DB::IStorage> const&, DB::SelectQueryOptions const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&, std::__1::shared_ptr<DB::StorageInMemoryMetadata const> const&) @ 0x10407898 in /usr/bin/clickhouse
5. DB::InterpreterSelectQuery::InterpreterSelectQuery(std::__1::shared_ptr<DB::IAST> const&, std::__1::shared_ptr<DB::Context const>, DB::SelectQueryOptions const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) @ 0x10405f9e in /usr/bin/clickhouse
6. DB::InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(std::__1::shared_ptr<DB::IAST> const&, std::__1::shared_ptr<DB::Context const>, DB::SelectQueryOptions const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) @ 0x105e0bba in /usr/bin/clickhouse
7. DB::InterpreterFactory::get(std::__1::shared_ptr<DB::IAST>&, std::__1::shared_ptr<DB::Context>, DB::SelectQueryOptions const&) @ 0x101e5c97 in /usr/bin/clickhouse
8. ? @ 0x107a53e6 in /usr/bin/clickhouse
9. DB::executeQuery(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::shared_ptr<DB::Context>, bool, DB::QueryProcessingStage::Enum, bool) @ 0x107a3d23 in /usr/bin/clickhouse
10. DB::TCPHandler::runImpl() @ 0x1104343e in /usr/bin/clickhouse
11. DB::TCPHandler::run() @ 0x110563d9 in /usr/bin/clickhouse
12. Poco::Net::TCPServerConnection::start() @ 0x13bce56f in /usr/bin/clickhouse
13. Poco::Net::TCPServerDispatcher::run() @ 0x13bcfffa in /usr/bin/clickhouse
14. Poco::PooledThread::run() @ 0x13d02279 in /usr/bin/clickhouse
15. Poco::ThreadImpl::runnableEntry(void*) @ 0x13cfe50a in /usr/bin/clickhouse
16. start_thread @ 0x7ea7 in /usr/lib/x86_64-linux-gnu/libpthread-2.31.so
17. __clone @ 0xfba2f in /usr/lib/x86_64-linux-gnu/libc-2.31.so


Almost the entire traffic comes from Italy, which makes sense

### Dismantle

In [None]:
click_query_fsn(
    """
    select software_name, platform, probe_cc, architecture, count(*) as total 
    from fastpath 
    where software_name='dismantle'
    group by software_name, platform, probe_cc, architecture
    order by total desc
    """
)

Doesn't provide platform and architecture!

In [None]:
click_query_fsn(
    """
    select software_name, probe_cc, measurement_start_time as st 
    from fastpath 
    where software_name='dismantle'
    order by st desc
    """
)

And they are very old, the first one comes from 2023-02-08 16:34:36