### Dataprep / Pathspider

Prepare FJSON-formatted, merged Pathspider ECN plugin observations in a single file for further per-target analysis.

In [5]:
import pandas as pd
import itertools
import json
import matplotlib.pyplot as plt

%matplotlib inline

def gen_ndjson_ecn_flows(filename, limit = 0):
    """
    Iterate over the objects in an NDJSON file, yielding every entry of
    each object's 'flow_results' (the Pathspider ECN flow results).

    Parameters
    ----------
    filename : str
        Path of the NDJSON file (one JSON document per line).
    limit : int, optional
        Maximum number of input lines to read; 0 (the default) reads the
        whole file. Lines that fail to parse still count towards the limit.

    Yields
    ------
    object
        The elements of 'flow_results', one at a time, in file order.

    Notes
    -----
    A line that is not valid JSON, or that has no iterable 'flow_results'
    entry, is reported with print() and skipped (best effort).
    """
    with open(filename) as infile:
        for n, line in enumerate(infile):
            try:
                # Materialise the flows here so every parse/lookup error is
                # handled in one place, before anything is yielded.
                flows = list(json.loads(line)['flow_results'])
            except Exception as e:
                print(repr(e))
            else:
                # Yield outside the try body: an exception thrown into the
                # generator by its consumer must propagate, not be printed
                # and swallowed as if it were a bad input line.
                for flow in flows:
                    yield flow

            if limit and n >= limit - 1:
                break
def gen_ndjson_ecn_observations(filename, limit = 0):
    """
    Iterate over the objects in an NDJSON file, yielding each full
    Pathspider ECN observation (the decoded JSON document of one line).

    Parameters
    ----------
    filename : str
        Path of the NDJSON file (one JSON document per line).
    limit : int, optional
        Maximum number of input lines to read; 0 (the default) reads the
        whole file. Lines that fail to parse still count towards the limit.

    Yields
    ------
    object
        The decoded JSON value of each parseable line, in file order.

    Notes
    -----
    A line that cannot be decoded is reported with print() and skipped
    (best effort).
    """
    with open(filename) as infile:
        for n, line in enumerate(infile):
            try:
                observation = json.loads(line)
            except Exception as e:
                print(repr(e))
            else:
                # Yield outside the try body so that exceptions thrown into
                # the generator by its consumer are not mistaken for (and
                # swallowed as) a parse failure.
                yield observation

            if limit and n >= limit - 1:
                break

In [21]:
# Maps a condition string (as looked up in an observation's 'conditions')
# to the name of the boolean column that records whether it was present.
condition_map = {
    "ecn.connectivity.works": "all_conn",
    "ecn.connectivity.broken": "ecn0_conn",
    "ecn.connectivity.transient": "ecn1_conn",
    "ecn.connectivity.offline": "no_conn",
    "ecn.negotiated": "nego",
    "ecn.ect_zero.seen": "ect0",
    "ecn.ect_one.seen": "ect1",
    "ecn.ce.seen": "ce",
}
                                            
def extract_ecn_conditions(obsgen):
    """
    Flatten observations into one dict per observation.

    Each yielded dict holds, in this order:
      * 'sip', 'dip', 'host', 'rank' -- copied from the observation's
        'sip', 'dip', 'hostname' and 'rank' entries;
      * 'ecn1_oct' -- the 'oct_rev' of the second flow result when the
        observation has exactly two flow results and the second one
        carries that key, otherwise 0;
      * one boolean per column named in condition_map, True when the
        corresponding condition string is in the observation's
        'conditions'.
    """
    for observation in obsgen:
        row = {
            'sip': observation['sip'],
            'dip': observation['dip'],
            'host': observation['hostname'],
            'rank': observation['rank'],
        }

        flows = observation['flow_results']
        has_second_flow_octets = len(flows) == 2 and 'oct_rev' in flows[1]
        row['ecn1_oct'] = flows[1]['oct_rev'] if has_second_flow_octets else 0

        # Every condition column starts out False ...
        row.update(dict.fromkeys(condition_map.values(), False))

        # ... and is switched on for each known condition that was observed.
        observed = set(observation['conditions'])
        for condition, column in condition_map.items():
            if condition in observed:
                row[column] = True

        yield row

def condition_dataframe(filename, limit = 0):
    """
    Read observations from the NDJSON file ``filename`` and return a
    pandas DataFrame with one row per observation, as produced by
    extract_ecn_conditions().

    ``limit`` is passed through to gen_ndjson_ecn_observations()
    (0, the default, means no limit).
    """
    observations = gen_ndjson_ecn_observations(filename, limit)
    rows = extract_ecn_conditions(observations)
    return pd.DataFrame(rows)

In [34]:
df = {}
%time df{'ams'} = condition_dataframe('udQlWFuQrA-do-ams3-2048-0')
%time df{'lon'} = condition_dataframe('udQlWFuQrA-do-lon1-2048-0')
%time df{'fra'} = condition_dataframe('udQlWFuQrA-do-fra1-2048-0')
%time df{'nyc'} = condition_dataframe('udQlWFuQrA-do-nyc2-2048-0')
%time df{'sfo'} = condition_dataframe('udQlWFuQrA-do-sfo1-2048-0')
%time df{'sgp'} = condition_dataframe('udQlWFuQrA-do-sgp1-2048-0')

CPU times: user 26 s, sys: 797 ms, total: 26.8 s
Wall time: 26.8 s
CPU times: user 30.6 s, sys: 1.28 s, total: 31.9 s
Wall time: 32 s
CPU times: user 30.1 s, sys: 1.25 s, total: 31.3 s
Wall time: 31.4 s
CPU times: user 29.2 s, sys: 1.16 s, total: 30.4 s
Wall time: 30.4 s
CPU times: user 27.9 s, sys: 1.12 s, total: 29 s
Wall time: 29.1 s
CPU times: user 28.6 s, sys: 1.17 s, total: 29.8 s
Wall time: 29.8 s


In [36]:
# NOTE(review): this rebinds `df`, replacing the dict built by the loading
# cell. df_ams0 ... df_sgp0 are not defined in any cell visible here --
# confirm they exist on a fresh kernel (Restart & Run All).
df = dict(
    ams=df_ams0,
    fra=df_fra0,
    lon=df_lon0,
    nyc=df_nyc0,
    sfo=df_sfo0,
    sgp=df_sgp0,
)

False    764620
True       1200
dtype: int64