# Analysis of pathspider tfo data

This notebook takes an fjson file from `pathspider tfo` and analyzes it for TFO support as well as apparent TFO connection dependency.

First, needful things:

In [18]:
import json
import pandas as pd
import matplotlib as plt

%matplotlib inline

def gen_fjson(filename):
    """
    Iterate over objects in an FJSON file.

    The file is read line by line; every line is decoded as one JSON
    document and yielded in file order. Lines that are not valid JSON
    (including blank lines) are skipped.

    :param filename: path of the file to read
    :returns: generator over the decoded JSON values
    """
    with open(filename) as f:
        for line in f:
            try:
                record = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass: skip the
                # malformed line, but let every other exception propagate.
                continue
            # Yield outside the try block so that the GeneratorExit raised at
            # this point when the generator is closed early is not swallowed.
            yield record

def rejoin_tfo_df(tfo_rdf, config_column='config'):
    """
    Split raw results by configuration and re-join them per destination.

    Rows where ``config_column`` equals 0 and rows where it equals 1 are
    separated, each half is indexed by ``dip``, and the halves are
    inner-joined on that index.

    :param tfo_rdf: raw result frame; must contain ``config_column``,
        ``connstate``, ``dip``, ``host``, ``rank``, ``fwd_rst``, ``rev_rst``
        and the seven ``tfo_*`` columns selected below
    :param config_column: name of the column holding the 0/1 configuration
    :returns: tuple ``(joined, unmatched)``; ``joined`` has the ``*_t0``
        columns, ``host``, ``rank``, the ``*_t1`` columns and the ``tfo_*``
        columns; ``unmatched`` holds the config-0 rows whose ``dip`` has no
        config-1 row
    """
    tfo_fields = ['tfo_seq', 'tfo_ack', 'tfo_dlen',
                  'tfo_synclen', 'tfo_synkind', 'tfo_ackclen', 'tfo_ackkind']
    wanted = [config_column, 'connstate', 'dip', 'host', 'rank',
              'fwd_rst', 'rev_rst'] + tfo_fields
    trimmed = tfo_rdf.loc[:, wanted]

    # Config 0 half: keep connection state, host, rank and the RST flags.
    side0 = (trimmed[trimmed[config_column] == 0]
             .set_index('dip')
             [['connstate', 'host', 'rank', 'fwd_rst', 'rev_rst']]
             .rename(columns={'connstate': 'conn_t0',
                              'fwd_rst': 'fwd_rst_t0',
                              'rev_rst': 'rev_rst_t0'}))

    # Config 1 half: keep connection state, the RST flags and the tfo_* fields.
    side1 = (trimmed[trimmed[config_column] == 1]
             .set_index('dip')
             [['connstate', 'fwd_rst', 'rev_rst'] + tfo_fields]
             .rename(columns={'connstate': 'conn_t1',
                              'fwd_rst': 'fwd_rst_t1',
                              'rev_rst': 'rev_rst_t1'}))

    joined = side0.join(side1, how="inner")
    # Config-0 rows for which no config-1 row with the same dip exists.
    unmatched = side0.loc[side0.index.difference(side1.index)]

    return (joined, unmatched)
            
import requests
import ipaddress

def canid_prefix_asn(addr, base_url="http://localhost:8081/prefix.json", timeout=10):
    """
    Look up prefix and ASN for an address via an HTTP ``prefix.json`` endpoint.

    The endpoint is queried with ``?addr=<addr>`` and is expected to answer
    with a JSON object containing ``Prefix`` and ``ASN`` keys.

    :param addr: address to look up (converted with ``str``)
    :param base_url: endpoint URL without query string; defaults to the
        local service on port 8081
    :param timeout: seconds to wait for the server before giving up
    :returns: dict with keys ``addr``, ``prefix`` and ``asn``
    :raises requests.HTTPError: if the server answers with an error status
    :raises KeyError: if the reply lacks ``Prefix`` or ``ASN``
    """
    # Let requests build and encode the query string, and never wait forever
    # on an unresponsive service.
    res = requests.get(base_url, params={'addr': str(addr)}, timeout=timeout)
    res.raise_for_status()
    j = res.json()
    return {'addr': addr,
            'prefix': j['Prefix'],
            'asn': j['ASN'] }
            

def ripestat_prefix_asn(addr, timeout=30):
    """
    Look up prefix and ASN for an address via RIPEstat ``prefix-overview``.

    ``prefix`` is taken from ``data['resource']`` and ``asn`` from the first
    entry of ``data['asns']``. Either value is ``None`` when the reply does
    not carry it, e.g. when the ``asns`` list is empty.

    :param addr: address to look up (converted with ``str``)
    :param timeout: seconds to wait for the server before giving up
    :returns: dict with keys ``addr``, ``prefix`` and ``asn``
    :raises requests.HTTPError: if the server answers with an error status
    """
    res = requests.get("https://stat.ripe.net/data/prefix-overview/data.json",
                       params={'resource': str(addr)}, timeout=timeout)
    res.raise_for_status()
    data = res.json()['data']
    prefix = None
    asn = None
    try:
        prefix = data['resource']
        asn = data['asns'][0]['asn']
    except (KeyError, IndexError):
        # IndexError: 'asns' is present but empty; keep whatever was
        # extracted so far and leave the rest as None.
        pass
    return {'addr':   addr,
            'prefix': prefix,
            'asn':    asn}

def prefix_asn_df(df, prefix_cache):
    """
    Add ``prefix`` and ``asn`` columns to a frame indexed by IP address.

    Each distinct address in the index is resolved once: first against
    ``prefix_cache`` (the most specific cached prefix containing the address
    wins), otherwise through ``canid_prefix_asn``, whose answer is then
    stored in the cache under its prefix.

    :param df: frame whose index labels are IP address strings
    :param prefix_cache: dict mapping ``ipaddress`` network objects to row
        dicts with ``addr``/``prefix``/``asn`` keys; updated in place
    :returns: ``df`` left-joined with the ``prefix`` and ``asn`` columns
    """
    rows = []

    # Resolve every distinct address once. Duplicate labels on both sides of
    # the final join would otherwise multiply rows (k occurrences -> k*k).
    for addr in df.index.unique():

        naddr = ipaddress.ip_network(addr)

        # check prefix cache: longest-prefix match, so the answer does not
        # depend on the order in which prefixes were inserted
        best = None
        for pfx in prefix_cache:
            if pfx.overlaps(naddr) and (best is None or pfx.prefixlen > best.prefixlen):
                best = pfx

        if best is not None:
            # cache hit: copy so the cached entry keeps its own 'addr'
            row = prefix_cache[best].copy()
            row['addr'] = addr
        else:
            # or go to a local canid cache of ripestat
            row = canid_prefix_asn(addr)
            prefix_cache[ipaddress.ip_network(row['prefix'])] = row

        rows.append(row)

    # now augment the input frame; naming the columns keeps this working when
    # df (and therefore rows) is empty
    odf = pd.DataFrame(rows, columns=['addr', 'prefix', 'asn']).set_index('addr')
    return df.join(odf)

def select_ip4(df):
    """
    Return the rows of ``df`` whose index label contains no ``':'``.

    A boolean mask is used instead of label lookup so that every row is
    returned exactly once even when index labels repeat.
    """
    is_v4 = pd.Series([':' not in s for s in df.index.values], dtype=bool).to_numpy()
    return df.loc[is_v4]

def select_ip6(df):
    """
    Return the rows of ``df`` whose index label contains a ``':'``.

    A boolean mask is used instead of label lookup so that every row is
    returned exactly once even when index labels repeat.
    """
    is_v6 = pd.Series([':' in s for s in df.index.values], dtype=bool).to_numpy()
    return df.loc[is_v6]

Load the data into a Pandas dataframe, then select interesting columns, split based on TFO state, and join on destination address.

In [18]:
#resolute_rdf = pd.DataFrame(gen_fjson("tfo-full-resolute-20170116.ndjson"))

In [26]:
#%time (resolute_jdf, resolute_xdf) = rejoin_tfo_df(pd.DataFrame(resolute_rdf))

CPU times: user 7.8 s, sys: 497 ms, total: 8.3 s
Wall time: 8.3 s


In [19]:
# Merge two runs, only add missing rows from second
#%time (tfo_a_jdf, tfo_a_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run4.fjson")))
#%time (tfo_b_jdf, tfo_b_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run4b.fjson")))

#tfo_jdf = tfo_a_jdf.append(tfo_b_jdf).reset_index().drop_duplicates(subset='dip', keep='first').set_index('dip')
#tfo_xdf = tfo_a_xdf.append(tfo_b_xdf).reset_index().drop_duplicates(subset='dip', keep='first').set_index('dip')

#%time (tfo5_jdf, tfo5_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run5.fjson")))

# Load "1m-run6.fjson"; for this file the 0/1 configuration flag is read from
# the 'tfostate' column instead of the default 'config' column.
%time (tfo6_jdf, tfo6_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run6.fjson")),config_column='tfostate')

# Load the "resolute" file using the default 'config' column. Each call
# returns (joined frame, config-0 rows with no config-1 counterpart).
%time (resolute_jdf, resolute_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("tfo-full-resolute-20170116.ndjson")))

CPU times: user 39 s, sys: 2.43 s, total: 41.5 s
Wall time: 41.6 s


Now run the joined data frames through a sieve to classify them as follows:

- TFO works: TFO cookie was received, SYN-ACK acknowledges data on SYN.
- TFO data not acked: TFO cookie received, SYN-ACK acknowledges SYN only.
- TFO data failure: TFO cookie received, no ACK (or RST) of SYN with data.
- TFO not negotiated: TFO cookie not received, but no TFO-linked connection impairment.
- TFO connection failure: Presence of TFO option causes connection failure (RST or drop).
- Connection failure: Neither vanilla TCP nor TFO connection attempt succeeded.

In [3]:
def tfo_sieve(tfo_jdf, tfo_xdf, prefix_cache):
    """
    Classify joined TFO results, print a summary and return the subsets.

    :param tfo_jdf: joined frame with ``conn_t1``, ``tfo_seq``, ``tfo_ack``,
        ``tfo_dlen``, ``tfo_synclen``, ``tfo_ackclen`` and ``tfo_ackkind``
        columns, indexed by address
    :param tfo_xdf: frame of addresses without a TFO-side row; needs a
        ``conn_t0`` column
    :param prefix_cache: cache dict handed through to ``prefix_asn_df``
    :returns: dict of frames keyed ``cookie``, ``oddcookie``, ``expcookie``,
        ``twocookie``, ``works``, ``dna``, ``dfail``, ``nope``, ``tcfail``
        and ``xcfail``

    The subsets are not mutually exclusive: for example a row with
    ``tfo_dlen == 0`` whose ACK covers only the SYN matches both ``works``
    and ``dna``.
    """
    def pct(part, whole):
        # Percentage of `whole`; 0.0 rather than ZeroDivisionError when the
        # denominator is empty (e.g. an address family with no cookies).
        return 100 * part / whole if whole else 0.0

    # Cookie available on SYN, but not ACK
    tfo_rscookie = tfo_jdf[(tfo_jdf['tfo_synclen'] > 0) & (tfo_jdf['tfo_ackclen'] == 0)]

    # Cookie available on ACK, but not SYN (probably retry 254)
    tfo_racookie = tfo_jdf[(tfo_jdf['tfo_synclen'] == 0) & (tfo_jdf['tfo_ackclen'] > 0)]

    # Cookie available on both SYN and ACK
    tfo_rascookie = tfo_jdf[(tfo_jdf['tfo_synclen'] > 0) & (tfo_jdf['tfo_ackclen'] > 0)]

    # Add ASN information to cookie table, we'll use it later.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    tfo_cookie = prefix_asn_df(pd.concat([tfo_rscookie, tfo_racookie, tfo_rascookie]),
                               prefix_cache)

    # (special cases... SYN cookie, but not eight bytes long)
    tfo_oddcookie = tfo_cookie[(tfo_cookie['tfo_synclen'] > 0) & (tfo_cookie['tfo_synclen'] != 8)]

    # (special cases... ACK cookie of kind 254: retry)
    tfo_expcookie = tfo_cookie[(tfo_cookie['tfo_ackclen'] > 0) & (tfo_cookie['tfo_ackkind'] == 254)]

    # (special cases... ACK cookie even with SYN cookie)
    tfo_twocookie = tfo_cookie[(tfo_cookie['tfo_synclen'] > 0) & (tfo_cookie['tfo_ackclen'] > 0)]

    # Bytes acknowledged beyond the SYN itself
    acked = tfo_cookie['tfo_ack'] - tfo_cookie['tfo_seq'] - 1

    # TFO works: data sent and ACKed
    tfo_works = tfo_cookie[acked == tfo_cookie['tfo_dlen']]

    # TFO data not acked: data sent, but ACK only ACKs SYN
    tfo_dna = tfo_cookie[acked == 0]

    # TFO data failed: data seen, but no ACK seen
    tfo_dfail = tfo_cookie[tfo_cookie['tfo_ack'] == 0]

    # No cookie available
    # NOTE(review): this tests the SYN cookie length only, so rows with a
    # cookie on the ACK but not the SYN are in both tfo_cookie and
    # tfo_nocookie -- confirm whether that overlap is intended.
    tfo_nocookie = tfo_jdf[tfo_jdf['tfo_synclen'] == 0]

    # TFO connection failures (where TFO attempted)
    tfo_cfail = tfo_nocookie[~tfo_nocookie['conn_t1']]

    # TFO not negotiated
    tfo_nope = tfo_nocookie[tfo_nocookie['conn_t1']]

    # Complete connection failures
    total_cfail = tfo_xdf[~tfo_xdf['conn_t0']]

    # TFO connection failures (either TFO not attempted or not seen)
    xtfo_cfail = tfo_xdf[tfo_xdf['conn_t0']]

    # summarize
    ct_total = len(tfo_jdf) + len(tfo_xdf)
    ct_totalfail = len(total_cfail)
    ct_tfocfail = len(tfo_cfail) + len(xtfo_cfail)
    ct_tfonope = len(tfo_nope)

    ct_tfodfail = len(tfo_dfail)
    ct_tfodna = len(tfo_dna)
    ct_tfoworks = len(tfo_works)
    ct_tfonego = len(tfo_cookie)

    ct_oddcookie = len(tfo_oddcookie)
    ct_expcookie = len(tfo_expcookie)
    ct_twocookie = len(tfo_twocookie)

    # 15169 is the ASN the summary below labels "Google properties"
    ct_tfogoog = len(tfo_cookie[tfo_cookie['asn'] == 15169])
    ct_tfongoog = len(tfo_cookie[tfo_cookie['asn'] != 15169])

    print("Of %6u tested IP addresses:" % (ct_total,))
    print("   %6u (%6.3f%%) completely failed to connect." % (ct_totalfail, pct(ct_totalfail, ct_total)))
    print("   %6u (%6.3f%%) may have TFO-dependent failure." % (ct_tfocfail, pct(ct_tfocfail, ct_total)))
    print("   %6u (%6.3f%%) did not negotiate TFO." % (ct_tfonope, pct(ct_tfonope, ct_total)))
    print("   %6u (%6.3f%%) negotiated TFO, of which:" % (ct_tfonego, pct(ct_tfonego, ct_total)))
    print(" - - - - - - - -")
    print("   %6u (%6.3f%% / %6.3f%%) responded with a type-254 cookie" %
                  (ct_expcookie, pct(ct_expcookie, ct_tfonego), pct(ct_expcookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) responded with a non-8-byte cookie" %
                  (ct_oddcookie, pct(ct_oddcookie, ct_tfonego), pct(ct_oddcookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) properly ACKed data on SYN" %
                  (ct_tfoworks, pct(ct_tfoworks, ct_tfonego), pct(ct_tfoworks, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) returned a cookie while ACKing data on SYN" %
                  (ct_twocookie, pct(ct_twocookie, ct_tfonego), pct(ct_twocookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) did not ACK data on SYN" %
                  (ct_tfodna, pct(ct_tfodna, ct_tfonego), pct(ct_tfodna, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) failed with data on SYN" %
                  (ct_tfodfail, pct(ct_tfodfail, ct_tfonego), pct(ct_tfodfail, ct_total)))
    print(" - - - - - - - -")
    print("   %6u (%6.3f%% / %6.3f%%) are Google properties" %
                  (ct_tfogoog, pct(ct_tfogoog, ct_tfonego), pct(ct_tfogoog, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) are not Google properties" %
                  (ct_tfongoog, pct(ct_tfongoog, ct_tfonego), pct(ct_tfongoog, ct_total)))

    return {'cookie': tfo_cookie,
            'oddcookie': tfo_oddcookie,
            'expcookie': tfo_expcookie,
            'twocookie': tfo_twocookie,
            'works': tfo_works,
            'dna': tfo_dna,
            'dfail': tfo_dfail,
            'nope': tfo_nope,
            'tcfail': total_cfail,
            'xcfail': xtfo_cfail}

In [4]:
# Classify the "resolute" run three times -- all addresses, IPv4 only, IPv6
# only -- sharing one prefix cache so prefixes resolved in the first pass are
# reused by the other two. Each call prints its own summary.
prefix_cache = {}
print('All addresses:')
sieve6all = tfo_sieve(resolute_jdf, resolute_xdf, prefix_cache)
print('IPv4 only:')
sieve6v4 = tfo_sieve(select_ip4(resolute_jdf), select_ip4(resolute_xdf), prefix_cache)
print('IPv6 only:')
sieve6v6 = tfo_sieve(select_ip6(resolute_jdf), select_ip6(resolute_xdf), prefix_cache)

All addresses:
Of 939680 tested IP addresses:
    29839 ( 3.175%) completely failed to connect.
      177 ( 0.019%) may have TFO-dependent failure.
   908464 (96.678%) did not negotiate TFO.
      866 ( 0.092%) negotiated TFO, of which:
 - - - - - - - -
       31 ( 3.580% /  0.003%) responded with a type-254 cookie
       12 ( 1.386% /  0.001%) responded with a non-8-byte cookie
      830 (95.843% /  0.088%) properly ACKed data on SYN
       33 ( 3.811% /  0.004%) returned a cookie while ACKing data on SYN
       68 ( 7.852% /  0.007%) did not ACK data on SYN
        0 ( 0.000% /  0.000%) failed with data on SYN
 - - - - - - - -
      690 (79.677% /  0.073%) are Google properties
      176 (20.323% /  0.019%) are not Google properties
IPv4 only:
Of 843966 tested IP addresses:
    25333 ( 3.002%) completely failed to connect.
      168 ( 0.020%) may have TFO-dependent failure.
   817479 (96.862%) did not negotiate TFO.
      809 ( 0.096%) negotiated TFO, of which:
 - - - - - - - -
     

In [21]:
# Same three-way classification for the run-6 frames (tfo6_jdf / tfo6_xdf).
# NOTE(review): this cell rebinds prefix_cache, sieve6all, sieve6v4 and
# sieve6v6, replacing the results computed from resolute_jdf; cells that read
# sieve6all afterwards see whichever of the two cells ran last -- consider
# distinct names per run.
prefix_cache = {}
print('All addresses:')
sieve6all = tfo_sieve(tfo6_jdf, tfo6_xdf, prefix_cache)
print('IPv4 only:')
sieve6v4 = tfo_sieve(select_ip4(tfo6_jdf), select_ip4(tfo6_xdf), prefix_cache)
print('IPv6 only:')
sieve6v6 = tfo_sieve(select_ip6(tfo6_jdf), select_ip6(tfo6_xdf), prefix_cache)

All addresses:
Of 673230 tested IP addresses:
    18777 ( 2.789%) completely failed to connect.
      211 ( 0.031%) may have TFO-dependent failure.
   653681 (97.096%) did not negotiate TFO.
      578 ( 0.086%) negotiated TFO, of which:
 - - - - - - - -
       15 ( 2.595% /  0.002%) responded with a type-254 cookie
       11 ( 1.903% /  0.002%) responded with a non-8-byte cookie
      563 (97.405% /  0.084%) properly ACKed data on SYN
       16 ( 2.768% /  0.002%) returned a cookie while ACKing data on SYN
       32 ( 5.536% /  0.005%) did not ACK data on SYN
        0 ( 0.000% /  0.000%) failed with data on SYN
 - - - - - - - -
      485 (83.910% /  0.072%) are Google properties
       93 (16.090% /  0.014%) are not Google properties
IPv4 only:
Of 620560 tested IP addresses:
    15818 ( 2.549%) completely failed to connect.
      208 ( 0.034%) may have TFO-dependent failure.
   604023 (97.335%) did not negotiate TFO.
      528 ( 0.085%) negotiated TFO, of which:
 - - - - - - - -
     

Dump all TFO-supporting websites, and apparently-TFO-connection-failing websites, to CSV, so we can check them from multiple vantage points.

In [5]:
# Write header-less CSV rows of (index, port, host, rank) for the addresses
# that returned a TFO cookie; port is a constant 80 added on a copy so the
# sieve result itself is not modified.
with open("tfo_cookie7.csv", "w") as file:
    tfo_cookie_csv = sieve6all['cookie'].copy()
    tfo_cookie_csv['port'] = 80
    file.write(tfo_cookie_csv.loc[:,["port","host","rank"]].to_csv(header=False))

# Same layout for the failure sets: 'tcfail' (no config-1 row and conn_t0
# False) followed by 'xcfail' (no config-1 row but conn_t0 True), written
# back to back into one file.
# NOTE(review): the failures tfo_sieve derives from the joined frame
# (conn_t1 False) are not part of its return value and so are not exported
# here -- confirm that is intended.
with open("tfo_cfail7.csv", "w") as file:
    tfo_cfail_csv = sieve6all['tcfail'].copy()
    tfo_cfail_csv['port'] = 80
    file.write(tfo_cfail_csv.loc[:,["port","host","rank"]].to_csv(header=False))
    xtfo_cfail_csv = sieve6all['xcfail'].copy()
    xtfo_cfail_csv['port'] = 80
    file.write(xtfo_cfail_csv.loc[:,["port","host","rank"]].to_csv(header=False))

What about odd cookies?

In [6]:
# Cookie rows whose SYN cookie length is non-zero but not 8 bytes.
sieve6all['oddcookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195.53.217.36,True,mapfre.es,53336,False,False,True,True,False,4212357000.0,4212357000.0,35.0,6.0,34.0,6.0,34.0,30846,195.53.217.0/24
195.25.232.194,True,macif.fr,62072,False,False,True,True,False,528768600.0,528768600.0,34.0,6.0,34.0,6.0,34.0,3215,195.25.0.0/16
57.74.24.130,True,weg.net,144757,False,False,True,True,False,3856047000.0,3856047000.0,33.0,6.0,34.0,6.0,34.0,51964,57.74.0.0/17
195.235.248.195,True,fundacionmapfre.org,158909,False,False,True,True,False,867961300.0,867961300.0,45.0,6.0,34.0,6.0,34.0,30846,195.235.248.0/24
212.170.33.244,True,verti.es,164578,False,False,True,True,False,1104188000.0,1104188000.0,34.0,6.0,34.0,6.0,34.0,30846,212.170.33.0/24
195.53.217.193,True,mapfre.com,180053,False,True,True,True,False,598635400.0,598635400.0,36.0,6.0,34.0,6.0,34.0,30846,195.53.217.0/24
195.235.248.34,True,jubilacionypension.com,279894,False,False,True,True,False,2507622000.0,2507622000.0,48.0,6.0,34.0,6.0,34.0,30846,195.235.248.0/24
195.235.248.46,True,mapfretecuidamos.com,288347,False,True,True,True,False,1324109000.0,1324109000.0,46.0,6.0,34.0,6.0,34.0,30846,195.235.248.0/24
195.53.217.43,True,mapfre.net,609528,False,False,True,True,False,1829123000.0,1829123000.0,36.0,6.0,34.0,6.0,34.0,30846,195.53.217.0/24
212.170.33.198,True,mapfre-warranty.com,713074,False,False,True,True,False,3608641000.0,3608641000.0,45.0,6.0,34.0,6.0,34.0,30846,212.170.33.0/24


Experimental ones?

In [7]:
# Cookie rows whose ACK carried a cookie with option kind 254.
sieve6all['expcookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
78.140.190.102,True,r7mediar.com,3023,False,False,True,False,False,938617400.0,938617400.0,0.0,0.0,254.0,8.0,254.0,35415,78.140.184.0/21
163.172.40.207,True,meduza.io,7839,False,False,True,True,False,2700552000.0,2700552000.0,0.0,0.0,254.0,8.0,254.0,12876,163.172.0.0/16
163.172.40.199,True,meduza.io,7838,False,False,True,True,False,272847100.0,272847100.0,0.0,0.0,254.0,8.0,254.0,12876,163.172.0.0/16
5.199.134.80,True,kedem.ru,60041,False,False,True,True,False,286203500.0,286203500.0,0.0,0.0,254.0,8.0,254.0,24961,5.199.128.0/20
89.108.106.190,True,fanat1k.ru,62387,False,False,True,True,False,2492193000.0,2492193000.0,0.0,0.0,254.0,8.0,254.0,43146,89.108.106.0/24
203.162.59.2,True,vieclam24h.vn,63067,True,False,True,True,False,474682900.0,474682900.0,0.0,0.0,254.0,8.0,254.0,7643,203.162.59.0/24
144.76.82.156,True,samba.org,80131,False,False,True,True,False,4001043000.0,4001043000.0,0.0,0.0,254.0,8.0,254.0,24940,144.76.0.0/16
2a01:4f8:192:486::443:2,True,samba.org,80132,False,False,True,True,False,477206500.0,477206500.0,0.0,0.0,254.0,8.0,254.0,24940,2a01:4f8::/29
185.22.172.173,True,animetosho.org,164005,False,False,True,True,False,3015407000.0,3015407000.0,0.0,0.0,254.0,8.0,254.0,43317,185.22.172.0/22
2a00:1838:20:2::2c65:425e,True,animetosho.org,164006,False,False,True,True,False,4176094000.0,4176094000.0,0.0,0.0,254.0,8.0,254.0,43317,2a00:1838::/32


Double cookies?

In [8]:
# Cookie rows with a non-zero cookie length on both the SYN and the ACK.
sieve6all['twocookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195.53.217.36,True,mapfre.es,53336,False,False,True,True,False,4212357000.0,4212357000.0,35.0,6.0,34.0,6.0,34.0,30846,195.53.217.0/24
195.25.232.194,True,macif.fr,62072,False,False,True,True,False,528768600.0,528768600.0,34.0,6.0,34.0,6.0,34.0,3215,195.25.0.0/16
173.194.44.87,True,china-cdn88nmbwacdnln8hq8qwe.com,96387,False,False,True,True,False,3827056000.0,3827056000.0,58.0,8.0,34.0,8.0,34.0,15169,173.194.44.0/24
195.216.236.10,True,www.inbox.lt,119706,False,False,True,True,False,3174944000.0,3174944000.0,38.0,8.0,34.0,8.0,34.0,12993,195.216.236.0/24
178.33.126.200,True,tigerrr.com,138435,False,False,True,True,False,1519086000.0,1519086000.0,37.0,8.0,34.0,8.0,34.0,16276,178.32.0.0/15
2001:41d0:b:6c8::1,True,tigerrr.com,138436,False,False,True,True,False,4103058000.0,4103058000.0,37.0,8.0,34.0,8.0,34.0,16276,2001:41d0::/32
57.74.24.130,True,weg.net,144757,False,False,True,True,False,3856047000.0,3856047000.0,33.0,6.0,34.0,6.0,34.0,51964,57.74.0.0/17
195.235.248.195,True,fundacionmapfre.org,158909,False,False,True,True,False,867961300.0,867961300.0,45.0,6.0,34.0,6.0,34.0,30846,195.235.248.0/24
212.170.33.244,True,verti.es,164578,False,False,True,True,False,1104188000.0,1104188000.0,34.0,6.0,34.0,6.0,34.0,30846,212.170.33.0/24
195.53.217.193,True,mapfre.com,180053,False,True,True,True,False,598635400.0,598635400.0,36.0,6.0,34.0,6.0,34.0,30846,195.53.217.0/24


### Comparison with Andreas' results from June

In [9]:
# Parse the pipe-delimited results table. The separator is a regular
# expression (hence engine="python") that also swallows the whitespace around
# each '|'; it is a raw string so that "\s" and "\|" are not treated as
# (invalid) string escapes.
# The two "Unnamed" columns are dropped -- presumably the empty fields created
# by the leading and trailing '|' on every line; verify against the file.
andreas_df = (
    pd.read_table("andreas_results.txt", delimiter=r"\s*\|\s*", engine="python")
    .set_index('ip')
    .drop("Unnamed: 0", axis=1)
    .drop("Unnamed: 13", axis=1)
)

In [10]:
# Number of rows in the comparison table (indexed by 'ip').
len(andreas_df)

362

In [11]:
# Number of comparison rows whose 'google' column equals 1.
len(andreas_df[andreas_df['google'] == 1])

304

In [12]:
# Domain set intersection
# Build hostname sets for the comparison:
#   jan_tfoset - hosts in the current sieve's 'cookie' table
#   jun_tfoset - hosts in the comparison table (andreas_df)
#   jan_allset - every host in the joined "resolute" frame
# NOTE(review): jan_tfoset comes from sieve6all while jan_allset comes from
# resolute_jdf; make sure sieve6all was computed from resolute_jdf when this
# cell runs.
jan_tfoset = set(sieve6all['cookie']['host'].unique())
jun_tfoset = set(andreas_df['host'].unique())
jan_allset = set(resolute_jdf['host'].unique())

In [15]:
# Display the hostnames taken from the comparison table.
jun_tfoset

{'abema.tv',
 'allabout-japan.com',
 'anonanalytics.com',
 'ato.mx',
 'd35classicapp.com',
 'digitalworldads.com',
 'dudeplaygames.com',
 'gamez4tops.com',
 'mobilepinbox.com',
 'notifuse.com',
 'number1hazard.com',
 'shipment.co',
 'shop3stockoff.com',
 'takusuta.com',
 'ussto.com',
 'w20sweeplnd.com',
 'widevine.com',
 'www.2016cbseresultsnic.in',
 'www.20speed.co',
 'www.2ality.com',
 'www.ad-2.me',
 'www.ad4screen.com',
 'www.adbuddiz.com',
 'www.addressreport.com',
 'www.admob.com',
 'www.adyoulike.com',
 'www.aladdin.ie',
 'www.almadia.pl',
 'www.alvexo.com',
 'www.ambev.com.br',
 'www.aporta.org.mx',
 'www.arbor.io',
 'www.arrow-arrow.com',
 'www.auntbertha.com',
 'www.bablic.com',
 'www.balagannna.com',
 'www.balatarin.com',
 'www.battlefieldbr.com',
 'www.bbfdirect.com',
 'www.beatstage.com',
 'www.beautyscenery.com',
 'www.blockly-games.appspot.com',
 'www.blogspot.com',
 'www.blogspot.com.cy',
 'www.blossom.io',
 'www.boxc.com',
 'www.bq.com',
 'www.bqreaders.com',
 'www.buy