### Preparing Pathspider ECN data for analysis

Prepare FJSON formatted Pathspider ECN plugin merged observations in a single file for further per-target analysis

In [1]:
import pandas as pd
import itertools
import json
import matplotlib.pyplot as plt

%matplotlib inline

def gen_ndjson_ecn_flows(filename, limit = 0):
    """
    Iterate over objects in an NDJSON file,
    returning only Pathspider ECN flow results
    """
    with open(filename) as f:
        for n, line in enumerate(f):
            try:
                j = json.loads(line)
                for f in j['flow_results']:
                    yield f
            except Exception as e:
                print(repr(e))

            if limit and n >= limit - 1:
                break

def gen_ndjson_ecn_observations(filename, limit = 0):
    """
    Iterate over objects in an NDJSON file,
    returning full Pathspider ECN opservations
    """
    with open(filename) as f:
        for n, line in enumerate(f):
            try:
                j = json.loads(line)
                yield j
            except Exception as e:
                print(repr(e))  

            if limit and n >= limit - 1:
                break

Map conditions produced by PathSpider to column names, and load conditions into dataframes.

In [2]:
condition_map = {"ecn.connectivity.works": "all_conn",
                "ecn.connectivity.broken": "ecn0_conn",
                "ecn.connectivity.transient": "ecn1_conn",
                "ecn.connectivity.offline": "no_conn",
                "ecn.negotiated": "nego",
                "ecn.ect_zero.seen": "ect0",
                "ecn.ect_one.seen": "ect1",
                "ecn.ce.seen": "ce"}
                                            
def extract_ecn_conditions(obsgen):
    for obs in obsgen:
        d = { 'sip': obs['sip'],
              'dip': obs['dip'],
              'host': obs['hostname'],
              'rank': obs['rank']}
        
        if len(obs['flow_results']) == 2 and 'oct_rev' in obs['flow_results'][1]:
            d['ecn1_oct'] = obs['flow_results'][1]['oct_rev'] 
        else:
            d['ecn1_oct'] = 0
        
        for col in condition_map.values():
             d[col] = False
        
        condset = set(obs['conditions'])
        for cond in condition_map:
            if cond in condset:
                d[condition_map[cond]] = True
        
        yield d

def condition_dataframe(filename, limit = 0):
    df = pd.DataFrame(extract_ecn_conditions(gen_ndjson_ecn_observations(filename, limit)))
    
    df['ip6'] = df["ip6"] = df['dip'].apply(lambda x: ":" in x)
    
    return df

In [3]:
df = {'ams': condition_dataframe('udQlWFuQrA-do-ams3-2048-0'),
      'lon': condition_dataframe('udQlWFuQrA-do-lon1-2048-0'),
      'fra': condition_dataframe('udQlWFuQrA-do-fra1-2048-0'),
      'nyc': condition_dataframe('udQlWFuQrA-do-nyc2-2048-0'),
      'sfo': condition_dataframe('udQlWFuQrA-do-sfo1-2048-0'),
      'sgp': condition_dataframe('udQlWFuQrA-do-sgp1-2048-0') }

### Analysis

First the easy stuff. Targets negotiating ECN, and marking ECT

In [4]:
def df_counts(df):
    out = {'n': len(df)}
    
    for oc in ('no_conn', 'ecn0_conn', 'ecn1_conn', 'all_conn', 'nego', 'nego-ect0-ect1', 'ect0+nego', 'ect1+nego', 'ce+nego', 'ect0-nego', 'ect1-nego', 'ce-nego'):
        if '+' in oc:
            col = oc.split("+")
            r = df[col[0]]
            for ncol in col[1:]:
                r = r & df[ncol]
        elif '-' in oc:
            col = oc.split("-")
            r = df[col[0]] 
            for ncol in col[1:]:
                r = r & ~df[ncol]
        else:
            r = df[oc]
        
        try:
            out[oc] = r.value_counts()[True]
        except KeyError:
            out[oc] = 0
    
    return out

def print_df_counts(dc):
    print("%d/%d (%5.3f%%) did not connect" % (dc['no_conn'], dc['n'], dc['no_conn']*100/dc['n']))
    print("%d/%d (%5.3f%%) negotiated ECN" % (dc['nego'], dc['n'], dc['nego']*100/dc['n']))
    print("%d/%d (%5.3f%%) and marked ECT0" % (dc['ect0+nego'], dc['nego'], dc['ect0+nego']*100/dc['n']))
    print("%d/%d (%5.3f%%) and marked ECT1" % (dc['ect1+nego'], dc['nego'], dc['ect1+nego']*100/dc['n']))
    print("%d/%d (%5.3f%%) and marked CE" % (dc['ect1+nego'], dc['nego'], dc['ect1+nego']*100/dc['n']))

def print_latex_table_v4v6(df):
    
    dc4 = df_counts(df[~df['ip6']])
    dc6 = df_counts(df[df['ip6']])
    
    print("\multicolumn{2}{c|}{IPv4} & \multicolumn{2}{c|}{IPv6} &\\\\")
    print("\multicolumn{2}{c|}{$n=$%d} & \multicolumn{2}{c|}{$n=$%d} &\\\\" % (dc4['n'], dc6['n']))
    print("            hosts & pct      &    hosts & pct      & description \\\\")
    print("\\hline")
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Completely failed to connect \\\\" %
                    (dc4['no_conn'], dc4['no_conn']*100/dc4['n'], dc6['no_conn'], dc6['no_conn']*100/dc6['n']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Failed to connect w/ECN \\\\" %
                    (dc4['ecn0_conn'], dc4['ecn0_conn']*100/dc4['n'], dc6['ecn0_conn'], dc6['ecn0_conn']*100/dc6['n']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Transient connection failure \\\\" %
                    (dc4['ecn1_conn'], dc4['ecn1_conn']*100/dc4['n'], dc6['ecn1_conn'], dc6['ecn1_conn']*100/dc6['n']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & No connection failure, of which: \\\\" %
                    (dc4['all_conn'], dc4['all_conn']*100/dc4['n'], dc6['all_conn'], dc6['all_conn']*100/dc6['n']))
    print("\\hline")
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{3mm} Negotiated ECN, of which: \\\\" %
                    (dc4['nego'], dc4['nego']*100/dc4['all_conn'], dc6['nego'], dc6['nego']*100/dc6['all_conn']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Marked ECT0 \\\\" %
                    (dc4['ect0+nego'], dc4['ect0+nego']*100/dc4['nego'], dc6['ect0+nego'], dc6['ect0+nego']*100/dc6['nego']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Marked ECT1 \\\\" %
                    (dc4['ect1+nego'], dc4['ect1+nego']*100/dc4['nego'], dc6['ect1+nego'], dc6['ect1+nego']*100/dc6['nego']))

    
def print_latex_table_pam15(df):
    dc4 = df_counts(df[~df['ip6']])
    dc6 = df_counts(df[df['ip6']])
    
    print("\multicolumn{2}{c|}{IPv4} & \multicolumn{2}{c|}{IPv6} &\\\\")
    print("\multicolumn{2}{c|}{$n=$%d} & \multicolumn{2}{c|}{$n=$%d} &\\\\" % (dc4['n'], dc6['n']))
    print("            hosts & pct      &    hosts & pct      & description \\\\")
    print("\\hline")
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Completely failed to connect \\\\" %
                    (dc4['no_conn'], dc4['no_conn']*100/dc4['n'], dc6['no_conn'], dc6['no_conn']*100/dc6['n']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Capable of negotiating ECN, of which: \\\\" %
                    (dc4['nego'], dc4['nego']*100/dc4['n'], dc6['nego'], dc6['nego']*100/dc6['all_conn']))    
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Never mark ECT \\\\" %
                    (dc4['nego-ect0-ect1'], dc4['nego-ect0-ect1']*100/dc4['nego'], dc6['nego-ect0-ect1'], dc6['nego-ect0-ect1']*100/dc6['nego']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Mark ECT1 \\\\" %
                    (dc4['ect1+nego'], dc4['ect1+nego']*100/dc4['nego'], dc6['ect1+nego'], dc6['ect1+nego']*100/dc6['nego']))
    print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Failed to connect w/ECN \\\\" %
                    (dc4['ecn0_conn'], dc4['ecn0_conn']*100/dc4['n'], dc6['ecn0_conn'], dc6['ecn0_conn']*100/dc6['n']))
#     print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & Transient connection failure \\\\" %
#                     (dc4['ecn1_conn'], dc4['ecn1_conn']*100/dc4['n'], dc6['ecn1_conn'], dc6['ecn1_conn']*100/dc6['n']))
#     print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & No connection failure, of which: \\\\" %
#                     (dc4['all_conn'], dc4['all_conn']*100/dc4['n'], dc6['all_conn'], dc6['all_conn']*100/dc6['n']))
#     print("\\hline")
#     print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Marked ECT0 \\\\" %
#                     (dc4['ect0+nego'], dc4['ect0+nego']*100/dc4['nego'], dc6['ect0+nego'], dc6['ect0+nego']*100/dc6['nego']))
#     print("           %6d & %6.3f\\%% &   %6d & %6.3f\\%% & \hspace{6mm} Marked ECT1 \\\\" %
#                     (dc4['ect1+nego'], dc4['ect1+nego']*100/dc4['nego'], dc6['ect1+nego'], dc6['ect1+nego']*100/dc6['nego']))
    

In [5]:
print_latex_table_pam15(df['ams'])

\multicolumn{2}{c|}{IPv4} & \multicolumn{2}{c|}{IPv6} &\\
\multicolumn{2}{c|}{$n=$675289} & \multicolumn{2}{c|}{$n=$90531} &\\
            hosts & pct      &    hosts & pct      & description \\
\hline
            12583 &  1.863\% &     3621 &  4.000\% & Completely failed to connect \\
           498866 & 73.874\% &    82722 & 95.232\% & Capable of negotiating ECN, of which: \\
            15000 &  3.007\% &     6622 &  8.005\% & \hspace{6mm} Never mark ECT \\
               30 &  0.006\% &       16 &  0.019\% & \hspace{6mm} Mark ECT1 \\
             1851 &  0.274\% &       23 &  0.025\% & Failed to connect w/ECN \\


### Site and Path Dependency

In [37]:
import requests
def canid_prefix_asn(addr,port=8377):
    res = requests.get("http://localhost:%u/prefix.json?addr=%s" % (port, str(addr)))
    j = res.json()
    return {'addr': addr,
            'prefix': j['Prefix'],
            'asn': j['ASN'],
            'cc': j['CountryCode']}

In [38]:
site_df = pd.read_table("ecn-site-dependent-17jan10.txt",header=None,names=["ip"])
site_df['ip6'] = site_df['ip'].apply(lambda x: ":" in x)
site_df['asn'] = site_df['ip'].apply(lambda a: canid_prefix_asn(a)['asn'])
site_df['cc'] = site_df['ip'].apply(lambda a: canid_prefix_asn(a)['cc'])

In [39]:
path_df = pd.read_table("ecn-path-dependent-17jan10.txt",header=None,names=["ip"])
path_df['ip6'] = path_df['ip'].apply(lambda x: ":" in x)
path_df['asn'] = path_df['ip'].apply(lambda a: canid_prefix_asn(a)['asn'])
path_df['cc'] = path_df['ip'].apply(lambda a: canid_prefix_asn(a)['cc'])

In [40]:
len(path_df[~path_df['ip6']])

194

In [41]:
len(path_df[path_df['ip6']])

2

In [46]:
site_df['asn'].value_counts()

4134      283
4837       66
62468      34
17841      32
4808       31
4766       29
40065      27
38283      18
38197      18
197988     17
132422     17
17816      17
4812       16
4713       15
4847       14
58543      13
44066      10
17623      10
131279     10
4538        9
133779      8
13768       8
9318        7
9808        7
9316        7
53587       6
3462        6
4611        6
23724       6
9488        5
         ... 
23570       1
23568       1
23565       1
9186        1
16339       1
5089        1
45977       1
7018        1
17213       1
133948      1
15107       1
197695      1
17511       1
7564        1
17676       1
9595        1
5483        1
7524        1
3390        1
9531        1
134420      1
3320        1
17564       1
9459        1
17638       1
15557       1
3249        1
9389        1
17574       1
6147        1
Name: asn, dtype: int64