This notebook searches the RIPE Atlas measurement metadata archive (available from [ftp.ripe.net](ftp://ftp.ripe.net/atlas/measurements)) for UDP and TCP traceroute measurements.

In [2]:
# Some jupyter magic to set up the environment correctly
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Silence every FutureWarning for the rest of the session so that notices
# about upcoming API changes do not clutter the cell outputs.
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)

# things we need, things to make us go
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import collections
import warnings
import hashlib
import requests
import os.path
import time
import calendar
import itertools
import json

In [26]:
# One row of measurement metadata, as produced by tm_generator().
TraceMeta = collections.namedtuple(
    "TraceMeta",
    ("msm_id", "af", "proto", "target",
     "start_epoch", "stop_epoch", "interval", "probes"))

def tm_generator(first_start=0, last_start=None, skip_lines = 0,
                 path="data_cache/all-measurements-fjson.txt"):
    """Yield a TraceMeta for each matching measurement in a metadata dump.

    The input file is read as one JSON object per line. A record is yielded
    when its type id is 2 or 4 (presumably the traceroute measurement types --
    confirm against the RIPE Atlas metadata schema), its status id is 2 or 4,
    a protocol can be determined for it, and its start time lies in
    [first_start, last_start).

    Parameters
    ----------
    first_start : int
        Inclusive lower bound on the record's 'start_time' (epoch seconds).
    last_start : int or None
        Exclusive upper bound on 'start_time'. None (the default) means
        "now", evaluated when iteration starts rather than when the function
        was defined, so a long-lived kernel does not use a stale bound.
    skip_lines : int
        Number of leading lines of the file to ignore.
    path : str
        Location of the line-delimited JSON metadata file.

    Yields
    ------
    TraceMeta
        proto is 'TCP' when the record has a truthy 'proto_tcp', otherwise
        the record's 'protocol' value. stop_epoch is the record's 'stop_time'
        for status id 4 and the current time for status id 2.

    Raises
    ------
    KeyError
        If an otherwise matching record lacks one of the fields read here.
    """
    current_time = calendar.timegm(time.gmtime())
    if last_start is None:
        last_start = current_time

    with open(path) as fjf:
        for num, line in enumerate(fjf):
            if num < skip_lines:
                continue

            # tolerate blank lines (e.g. a trailing newline) in the dump
            if not line.strip():
                continue

            mm = json.loads(line)

            if mm['type']['id'] not in (2, 4):
                continue

            status_id = mm['status']['id']
            if status_id not in (2, 4):
                continue

            # A truthy proto_tcp flag wins; otherwise fall back to the
            # explicit protocol field. Records offering neither are skipped
            # instead of raising KeyError.
            if mm.get('proto_tcp'):
                proto = 'TCP'
            elif 'protocol' in mm:
                proto = mm['protocol']
            else:
                continue

            if not (first_start <= mm['start_time'] < last_start):
                continue

            # Status 4 records carry their own stop time; status 2 records
            # are treated as still running, so "now" stands in as the stop.
            if status_id == 4:
                stop_time = mm['stop_time']
            else:
                stop_time = current_time

            yield TraceMeta(mm['msm_id'], mm['af'], proto, mm['dst_addr'],
                            mm['start_time'], stop_time, mm['interval'],
                            mm['participant_count'])

# get ALL the traceroutes! (takes about five minutes on Forclaz)
tmgen = tm_generator()
%time tmdf = pd.DataFrame([m for m in tmgen], columns=TraceMeta._fields)
tmgen.close() # generator wraps a file, close it.

# estimate sample count
tmdf['samples'] = (tmdf['probes'] * (tmdf['stop_epoch'] - tmdf['start_epoch'])) / tmdf['interval']

# cast timestamps
tmdf['start'] = pd.to_datetime(tmdf['start_epoch'] * 1e9)
tmdf['stop'] = pd.to_datetime(tmdf['stop_epoch'] * 1e9)
tmdf['duration'] = tmdf['stop'] - tmdf['start']

# count msms
tmdf['n'] = 1

# index by msm
tmdf.index = pd.Index(tmdf['msm_id'])
del(tmdf['msm_id'])

CPU times: user 4min 43s, sys: 4.64 s, total: 4min 48s
Wall time: 4min 48s


NameError: name 'msm_id' is not defined

In [95]:
# Drop every measurement that stopped before 1 Jan 2015, then split the
# remainder by transport protocol (UDP and TCP only).
stopped_2015_or_later = tmdf["stop"] >= "2015-01-01"
tmdf_udp = tmdf[(tmdf['proto'] == 'UDP') & stopped_2015_or_later]
tmdf_tcp = tmdf[(tmdf['proto'] == 'TCP') & stopped_2015_or_later]

In [96]:
def _per_target(msms, samples_col, maxprobes_col, msms_col):
    """Summarise a measurement frame per target address.

    Groups `msms` by its 'target' column and returns a frame indexed by
    target with three columns: the summed 'samples', the largest 'probes'
    value, and the summed 'n' counter, stored under the given column names.
    """
    by_target = msms.groupby('target')
    return pd.DataFrame(data = {samples_col: by_target['samples'].sum(),
                                maxprobes_col: by_target['probes'].max(),
                                msms_col: by_target['n'].sum()})

udp_targets = _per_target(tmdf_udp, 'udp_samples', 'udp_maxprobes', 'udp_msms')
tcp_targets = _per_target(tmdf_tcp, 'tcp_samples', 'tcp_maxprobes', 'tcp_msms')

# inner join on the target index: keep only targets present in both frames
common_targets = pd.merge(udp_targets, tcp_targets, how='inner',
                          left_index=True, right_index=True)


In [102]:
# index values (measurement ids) of the UDP rows with at least 100 probes
tmdf_udp.loc[tmdf_udp['probes'].ge(100)].index.values

array([1842149, 1873371, 1873380, ..., 3020847, 3021090, 3102318])