# Analysis of pathspider tfo data

This notebook takes an fjson file from `pathspider tfo` and analyzes it for TFO support as well as apparent TFO connection dependency.

First, needful things:

In [1]:
import json
import pandas as pd
import matplotlib as plt

%matplotlib inline

def gen_fjson(filename):
    """
    Iterate over objects in an FJSON file.

    Every line of the file is parsed as one standalone JSON document.
    Blank lines and lines that are not valid JSON are skipped, so a
    truncated or partially written file still yields every record that
    does parse.

    :param filename: path of the FJSON file to read.
    :returns: generator yielding one decoded JSON value per parseable line.
    """
    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass. Only parse
                # errors are ignored; the yield stays outside the try so that
                # GeneratorExit / KeyboardInterrupt are never swallowed.
                continue
            yield record

def rejoin_tfo_df(tfo_rdf):
    """
    Split a raw result frame by 'tfostate' and re-join it per destination.

    Rows with tfostate == 0 and rows with tfostate == 1 are each indexed by
    'dip' and given suffixed column names ('_t0' / '_t1'), then combined.
    (Presumably state 0 is the attempt without TFO and state 1 the attempt
    with TFO; confirm against the pathspider output format.)

    :param tfo_rdf: raw frame containing at least the columns selected below.
    :returns: tuple (tfo_jdf, tfo_xdf) where tfo_jdf is the inner join of the
              state-0 and state-1 rows on 'dip', and tfo_xdf holds the
              state-0 rows whose 'dip' has no state-1 row.
    """
    tfo_cols = ['tfo_seq', 'tfo_ack', 'tfo_dlen',
                'tfo_synclen', 'tfo_synkind', 'tfo_ackclen', 'tfo_ackkind']

    wanted = tfo_rdf.loc[:, ['tfostate', 'connstate', 'dip', 'host', 'rank',
                             'fwd_rst', 'rev_rst'] + tfo_cols]

    # State-0 rows: keep connection outcome plus host/rank metadata.
    state0 = wanted[wanted['tfostate'] == 0].set_index('dip')
    state0 = state0.loc[:, ['connstate', 'host', 'rank', 'fwd_rst', 'rev_rst']]
    state0 = state0.rename(columns={'connstate': 'conn_t0',
                                    'fwd_rst': 'fwd_rst_t0',
                                    'rev_rst': 'rev_rst_t0'})

    # State-1 rows: keep connection outcome plus the TFO observation columns.
    state1 = wanted[wanted['tfostate'] == 1].set_index('dip')
    state1 = state1.loc[:, ['connstate', 'fwd_rst', 'rev_rst'] + tfo_cols]
    state1 = state1.rename(columns={'connstate': 'conn_t1',
                                    'fwd_rst': 'fwd_rst_t1',
                                    'rev_rst': 'rev_rst_t1'})

    joined = state0.join(state1, how="inner")
    only_state0 = state0.loc[state0.index.difference(state1.index)]

    return (joined, only_state0)
            
import requests
import ipaddress

def ripestat_prefix_asn(addr, timeout=30):
    """
    Look up the covering prefix and origin ASN of an address via RIPEstat.

    Queries the RIPEstat "prefix-overview" data call and extracts the
    'resource' field and the first entry of 'asns' from the reply.

    :param addr: address (or prefix) to look up; converted with str().
    :param timeout: seconds to wait for the HTTP request before giving up,
                    so a stalled connection cannot hang the notebook forever.
    :returns: dict with keys 'addr' (the input, unchanged), 'prefix' and
              'asn'. 'prefix' and/or 'asn' are None when the reply lacks
              them (e.g. an empty 'asns' list, presumably for address
              space that is not announced).
    """
    res = requests.get("https://stat.ripe.net/data/prefix-overview/data.json",
                       params={'resource': str(addr)},
                       timeout=timeout)
    data = res.json()['data']
    prefix = None
    asn = None
    try:
        prefix = data['resource']
        asn = data['asns'][0]['asn']
    except (KeyError, IndexError):
        # Missing field or empty 'asns' list: keep whatever was found so far.
        pass
    return {'addr':   addr,
            'prefix': prefix,
            'asn':    asn}

def prefix_asn_df(df, prefix_cache):
    
    rows = []
    
    for addr in df.index.values:
        
        naddr = ipaddress.ip_network(addr)
        row = None
        
        # check prefix cache
        for pfx in prefix_cache:
            if pfx.overlaps(naddr):
                # cache hit, exit
                row = prefix_cache[pfx].copy()
                row['addr'] = addr
                #print("cached:   "+repr(row))

        # or go to ripestat
        if not row:
            row = ripestat_prefix_asn(addr)
            #print("ripestat: "+repr(row))
            prefix_cache[ipaddress.ip_network(row['prefix'])] = row
        
        rows.append(row)
    
    # now augment the input frame
    odf = pd.DataFrame(rows)
    odf.index = odf['addr']
    del(odf['addr'])
    return df.join(odf)

def select_ip4(df):
    """
    Return the rows of df whose index label does not contain ':'.

    Index labels are expected to be IP address strings, so a label without
    a colon is taken to be IPv4. Rows are picked by position, which keeps
    each row exactly once even when the same address appears several times
    in the index.
    """
    return df.iloc[[pos for pos, label in enumerate(df.index) if ':' not in label]]

def select_ip6(df):
    """
    Return the rows of df whose index label contains ':'.

    Index labels are expected to be IP address strings, so a label with a
    colon is taken to be IPv6. Rows are picked by position, which keeps
    each row exactly once even when the same address appears several times
    in the index.
    """
    return df.iloc[[pos for pos, label in enumerate(df.index) if ':' in label]]

Load the data into a Pandas dataframe, then select interesting columns, split based on TFO state, and join on destination address.

In [2]:
# Alternative inputs from earlier runs are kept below, commented out.
# Merge two runs, only add missing rows from second
#%time (tfo_a_jdf, tfo_a_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run4.fjson")))
#%time (tfo_b_jdf, tfo_b_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run4b.fjson")))

#tfo_jdf = tfo_a_jdf.append(tfo_b_jdf).reset_index().drop_duplicates(subset='dip', keep='first').set_index('dip')
#tfo_xdf = tfo_a_xdf.append(tfo_b_xdf).reset_index().drop_duplicates(subset='dip', keep='first').set_index('dip')

#%time (tfo5_jdf, tfo5_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run5.fjson")))

# Active input: load run 6 and split/join it per destination address.
# tfo6_jdf / tfo6_xdf are the frames the sieve cells below operate on.
%time (tfo6_jdf, tfo6_xdf) = rejoin_tfo_df(pd.DataFrame(gen_fjson("1m-run6.fjson")))

CPU times: user 39.4 s, sys: 4.29 s, total: 43.6 s
Wall time: 44.2 s


Now run the joined data frames through a sieve to classify them as follows:

- TFO works: TFO cookie was received, SYN-ACK acknowledges data on SYN.
- TFO data not acked: TFO cookie received, SYN-ACK acknowledges SYN only.
- TFO data failure: TFO cookie received, no ACK (or RST) of SYN with data.
- TFO not negotiated: TFO cookie not received, but no TFO-linked connection impairment.
- TFO connection failure: Presence of TFO option causes connection failure (RST or drop).
- Connection failure: Neither vanilla TCP nor TFO connection attempt succeeded.

In [3]:
def tfo_sieve(tfo_jdf, tfo_xdf, prefix_cache):
    """
    Classify joined TFO measurements, print a summary and return the subsets.

    :param tfo_jdf: joined frame (one row per address that has both a
                    '_t0' and a '_t1' observation), as returned first by
                    rejoin_tfo_df().
    :param tfo_xdf: frame of addresses that only have a '_t0' observation,
                    as returned second by rejoin_tfo_df().
    :param prefix_cache: prefix -> ASN cache dict handed to prefix_asn_df();
                         updated in place.
    :returns: dict of DataFrames keyed by category:
              'cookie' (any TFO cookie seen, with 'prefix'/'asn' columns),
              'oddcookie', 'expcookie', 'twocookie', 'works', 'dna',
              'dfail' (all subsets of 'cookie'), 'nope', 'cfail'
              (subsets of the no-SYN-cookie rows), 'tcfail' and 'xcfail'
              (subsets of tfo_xdf).
    """
    def pct(count, total):
        # Percentage of total; 0.0 for an empty total so that a subset with
        # no rows (or no TFO hosts) prints zeros instead of raising
        # ZeroDivisionError.
        return 100 * count / total if total else 0.0

    # Cookie available on SYN, but not ACK
    tfo_rscookie = tfo_jdf[(tfo_jdf['tfo_synclen'] > 0) & (tfo_jdf['tfo_ackclen'] == 0)]

    # Cookie available on ACK, but not SYN (probably retry 254)
    tfo_racookie = tfo_jdf[(tfo_jdf['tfo_synclen'] == 0) & (tfo_jdf['tfo_ackclen'] > 0)]

    # Cookie available on both SYN and ACK
    tfo_rascookie = tfo_jdf[(tfo_jdf['tfo_synclen'] > 0) & (tfo_jdf['tfo_ackclen'] > 0)]

    # Add ASN information to cookie table, we'll use it later.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # the row order (SYN-only, ACK-only, both) is unchanged.
    tfo_cookie = prefix_asn_df(pd.concat([tfo_rscookie, tfo_racookie, tfo_rascookie]),
                               prefix_cache)

    # (special cases... SYN cookie, but not eight bytes long)
    tfo_oddcookie = tfo_cookie[(tfo_cookie['tfo_synclen'] > 0) & (tfo_cookie['tfo_synclen'] != 8)]

    # (special cases... ACK cookie of kind 254: retry)
    tfo_expcookie = tfo_cookie[(tfo_cookie['tfo_ackclen'] > 0) & (tfo_cookie['tfo_ackkind'] == 254)]

    # (special cases... ACK cookie even with SYN cookie)
    tfo_twocookie = tfo_cookie[(tfo_cookie['tfo_synclen'] > 0) & (tfo_cookie['tfo_ackclen'] > 0)]

    # Number of bytes beyond the SYN itself that the recorded ACK covers.
    acked_data = tfo_cookie['tfo_ack'] - tfo_cookie['tfo_seq'] - 1

    # TFO works: data sent and ACKed
    # NOTE(review): rows with tfo_dlen == 0 satisfy both this test and the
    # "not acked" test below, so 'works' and 'dna' can overlap -- confirm
    # whether that is intended.
    tfo_works = tfo_cookie[acked_data == tfo_cookie['tfo_dlen']]

    # TFO data not acked: data sent, but ACK only ACKs SYN
    tfo_dna = tfo_cookie[acked_data == 0]

    # TFO data failed: data seen, but no ACK seen
    tfo_dfail = tfo_cookie[tfo_cookie['tfo_ack'] == 0]

    # No cookie available
    # NOTE(review): this only tests the SYN cookie length, so rows with an
    # ACK-only cookie (tfo_racookie) are counted here as well as in
    # tfo_cookie -- confirm whether they should be excluded.
    tfo_nocookie = tfo_jdf[tfo_jdf['tfo_synclen'] == 0]

    # TFO connection failures (where TFO attempted)
    tfo_cfail = tfo_nocookie[~tfo_nocookie['conn_t1']]

    # TFO not negotiated
    tfo_nope = tfo_nocookie[tfo_nocookie['conn_t1']]

    # Complete connection failures
    total_cfail = tfo_xdf[~tfo_xdf['conn_t0']]

    # TFO connection failures (either TFO not attempted or not seen)
    xtfo_cfail = tfo_xdf[tfo_xdf['conn_t0']]

    # summarize
    ct_total = len(tfo_jdf) + len(tfo_xdf)
    ct_totalfail = len(total_cfail)
    ct_tfocfail = len(tfo_cfail) + len(xtfo_cfail)
    ct_tfonope = len(tfo_nope)

    ct_tfodfail = len(tfo_dfail)
    ct_tfodna = len(tfo_dna)
    ct_tfoworks = len(tfo_works)
    ct_tfonego = len(tfo_cookie)

    ct_oddcookie = len(tfo_oddcookie)
    ct_expcookie = len(tfo_expcookie)
    ct_twocookie = len(tfo_twocookie)

    # AS15169 is the ASN the report below labels as "Google properties".
    ct_tfogoog = len(tfo_cookie[tfo_cookie['asn'] == 15169])
    ct_tfongoog = len(tfo_cookie[tfo_cookie['asn'] != 15169])

    print("Of %6u tested IP addresses:" % (ct_total,))
    print("   %6u (%6.3f%%) completely failed to connect." % (ct_totalfail, pct(ct_totalfail, ct_total)))
    print("   %6u (%6.3f%%) may have TFO-dependent failure." % (ct_tfocfail, pct(ct_tfocfail, ct_total)))
    print("   %6u (%6.3f%%) did not negotiate TFO." % (ct_tfonope, pct(ct_tfonope, ct_total)))
    print("   %6u (%6.3f%%) negotiated TFO, of which:" % (ct_tfonego, pct(ct_tfonego, ct_total)))
    print(" - - - - - - - -")
    print("   %6u (%6.3f%% / %6.3f%%) responded with a type-254 cookie" %
                  (ct_expcookie, pct(ct_expcookie, ct_tfonego), pct(ct_expcookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) responded with a non-8-byte cookie" %
                  (ct_oddcookie, pct(ct_oddcookie, ct_tfonego), pct(ct_oddcookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) properly ACKed data on SYN" %
                  (ct_tfoworks, pct(ct_tfoworks, ct_tfonego), pct(ct_tfoworks, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) returned a cookie while ACKing data on SYN" %
                  (ct_twocookie, pct(ct_twocookie, ct_tfonego), pct(ct_twocookie, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) did not ACK data on SYN" %
                  (ct_tfodna, pct(ct_tfodna, ct_tfonego), pct(ct_tfodna, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) failed with data on SYN" %
                  (ct_tfodfail, pct(ct_tfodfail, ct_tfonego), pct(ct_tfodfail, ct_total)))
    print(" - - - - - - - -")
    print("   %6u (%6.3f%% / %6.3f%%) are Google properties" %
                  (ct_tfogoog, pct(ct_tfogoog, ct_tfonego), pct(ct_tfogoog, ct_total)))
    print("   %6u (%6.3f%% / %6.3f%%) are not Google properties" %
                  (ct_tfongoog, pct(ct_tfongoog, ct_tfonego), pct(ct_tfongoog, ct_total)))

    return {'cookie': tfo_cookie,
            'oddcookie': tfo_oddcookie,
            'expcookie': tfo_expcookie,
            'twocookie': tfo_twocookie,
            'works': tfo_works,
            'dna': tfo_dna,
            'dfail': tfo_dfail,
            'nope': tfo_nope,
            # 'cfail' is counted in the summary above, so expose it too.
            'cfail': tfo_cfail,
            'tcfail': total_cfail,
            'xcfail': xtfo_cfail}

In [4]:
# One cache dict is shared by all three calls: prefix_asn_df() fills it in
# place, so addresses resolved for the first call are not looked up again.
prefix_cache = {}
print('All addresses:')
sieve6all = tfo_sieve(tfo6_jdf, tfo6_xdf, prefix_cache)
# Same classification restricted to index labels without ':' (IPv4).
print('IPv4 only:')
sieve6v4 = tfo_sieve(select_ip4(tfo6_jdf), select_ip4(tfo6_xdf), prefix_cache)
# Same classification restricted to index labels containing ':' (IPv6).
print('IPv6 only:')
sieve6v6 = tfo_sieve(select_ip6(tfo6_jdf), select_ip6(tfo6_xdf), prefix_cache)

All addresses:
Of 673230 tested IP addresses:
    18777 ( 2.789%) completely failed to connect.
      211 ( 0.031%) may have TFO-dependent failure.
   653681 (97.096%) did not negotiate TFO.
      578 ( 0.086%) negotiated TFO, of which:
 - - - - - - - -
       15 ( 2.595% /  0.002%) responded with a type-254 cookie
       11 ( 1.903% /  0.002%) responded with a non-8-byte cookie
      563 (97.405% /  0.084%) properly ACKed data on SYN
       16 ( 2.768% /  0.002%) returned a cookie while ACKing data on SYN
       32 ( 5.536% /  0.005%) did not ACK data on SYN
        0 ( 0.000% /  0.000%) failed with data on SYN
 - - - - - - - -
      485 (83.910% /  0.072%) are Google properties
       93 (16.090% /  0.014%) are not Google properties
IPv4 only:
Of 620560 tested IP addresses:
    15818 ( 2.549%) completely failed to connect.
      208 ( 0.034%) may have TFO-dependent failure.
   604023 (97.335%) did not negotiate TFO.
      528 ( 0.085%) negotiated TFO, of which:
 - - - - - - - -
     

Dump all TFO-supporting websites, and apparently-TFO-connection-failing websites, to CSV, so we can check them from multiple vantage points.

In [5]:
# Hosts for which a TFO cookie was seen: written as header-less CSV rows of
# index (address), port, host, rank.
with open("tfo_cookie6.csv", "w") as file:
    tfo_cookie_csv = sieve6all['cookie'].copy()
    tfo_cookie_csv['port'] = 80  # constant port column added to every row
    file.write(tfo_cookie_csv.loc[:,["port","host","rank"]].to_csv(header=False))

# Connection failures: 'tcfail' rows followed by 'xcfail' rows, same layout.
# NOTE(review): in tfo_sieve() 'tcfail' is the set of complete connection
# failures (conn_t0 false), not TFO-specific ones -- confirm this is the
# set intended for the "TFO-connection-failing" export.
with open("tfo_cfail6.csv", "w") as file:
    tfo_cfail_csv = sieve6all['tcfail'].copy()
    tfo_cfail_csv['port'] = 80
    file.write(tfo_cfail_csv.loc[:,["port","host","rank"]].to_csv(header=False))
    xtfo_cfail_csv = sieve6all['xcfail'].copy()
    xtfo_cfail_csv['port'] = 80
    file.write(xtfo_cfail_csv.loc[:,["port","host","rank"]].to_csv(header=False))

What about odd cookies?

In [6]:
sieve6all['oddcookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195.53.217.36,True,www.mapfre.es,27440,False,False,True,True,False,1874712566,1874712567,39,6,34,6,34,30846,195.53.217.0/24
195.25.232.194,True,www.macif.fr,36623,False,False,True,True,False,1457388550,1457388551,38,6,34,6,34,3215,195.25.0.0/16
195.235.248.195,True,www.fundacionmapfre.org,103702,False,False,True,True,False,581667038,581667039,49,6,34,6,34,30846,195.235.248.0/24
195.53.217.193,True,www.mapfre.com,107231,False,False,True,True,False,655997387,655997388,40,6,34,6,34,30846,195.53.217.0/24
212.170.33.117,True,www.verti.es,123356,False,False,True,True,False,4158400847,4158400848,38,6,34,6,34,30846,212.170.33.0/24
195.235.248.46,True,www.mapfretecuidamos.com,202247,False,False,True,True,False,2274067134,2274067135,50,6,34,6,34,30846,195.235.248.0/24
195.235.248.34,True,www.jubilacionypension.com,251608,False,False,True,True,False,2512689423,2512689424,52,6,34,6,34,30846,195.235.248.0/24
195.53.217.43,True,www.mapfre.net,534578,False,False,True,True,False,1125819701,1125819702,40,6,34,6,34,30846,195.53.217.0/24
195.235.248.190,True,www.mapfregrupo.com,885747,False,False,True,True,False,1268297577,1268297578,45,6,34,6,34,30846,195.235.248.0/24
212.170.33.198,True,www.mapfre-warranty.com,894866,False,False,True,True,False,875244314,875244315,49,6,34,6,34,30846,212.170.33.0/24


Experimental ones?

In [7]:
sieve6all['expcookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5.199.134.80,True,www.kedem.ru,27550,False,False,True,True,False,2721245025,2721245026,0,0,254,8,254,24961,5.199.128.0/20
144.76.82.156,True,www.samba.org,40158,False,False,True,True,False,721180658,721180659,0,0,254,8,254,24940,144.76.0.0/16
89.108.106.190,True,www.fanat1k.ru,168736,False,False,True,True,False,2434491587,2434491588,0,0,254,8,254,43146,89.108.106.0/24
192.99.188.166,True,www.yorukaze.me,185618,False,False,True,True,False,997985756,997985757,0,0,254,8,254,16276,192.99.0.0/16
128.199.73.182,True,www.imoney.ph,266050,False,False,True,False,False,1947324689,1947324690,0,0,254,8,254,133165,128.199.64.0/18
151.80.139.61,True,www.soniaperonaci.it,267373,False,False,True,True,False,584685926,584685927,0,0,254,8,254,16276,151.80.0.0/16
222.255.27.187,True,kenh14cdn.com,296319,False,False,True,False,False,2958237200,2958237201,0,0,254,8,254,45899,222.255.27.0/24
188.117.132.67,True,www.trzepak.pl,353956,False,False,True,False,False,280045895,280045896,0,0,254,8,254,31242,188.117.128.0/18
109.168.105.231,True,www.stgy.it,408563,True,False,True,True,False,2460130360,2460130361,0,0,254,8,254,5602,109.168.105.0/24
61.147.80.168,True,www.ceve-market.org,474355,False,False,True,False,False,1925627199,1925627200,0,0,254,8,254,23650,61.147.80.0/24


Double cookies?

In [8]:
sieve6all['twocookie']

Unnamed: 0_level_0,conn_t0,host,rank,fwd_rst_t0,rev_rst_t0,conn_t1,fwd_rst_t1,rev_rst_t1,tfo_seq,tfo_ack,tfo_dlen,tfo_synclen,tfo_synkind,tfo_ackclen,tfo_ackkind,asn,prefix
dip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195.53.217.36,True,www.mapfre.es,27440,False,False,True,True,False,1874712566,1874712567,39,6,34,6,34,30846,195.53.217.0/24
195.25.232.194,True,www.macif.fr,36623,False,False,True,True,False,1457388550,1457388551,38,6,34,6,34,3215,195.25.0.0/16
195.235.248.195,True,www.fundacionmapfre.org,103702,False,False,True,True,False,581667038,581667039,49,6,34,6,34,30846,195.235.248.0/24
195.53.217.193,True,www.mapfre.com,107231,False,False,True,True,False,655997387,655997388,40,6,34,6,34,30846,195.53.217.0/24
212.170.33.117,True,www.verti.es,123356,False,False,True,True,False,4158400847,4158400848,38,6,34,6,34,30846,212.170.33.0/24
195.235.248.46,True,www.mapfretecuidamos.com,202247,False,False,True,True,False,2274067134,2274067135,50,6,34,6,34,30846,195.235.248.0/24
195.235.248.34,True,www.jubilacionypension.com,251608,False,False,True,True,False,2512689423,2512689424,52,6,34,6,34,30846,195.235.248.0/24
178.33.126.200,True,www.tigerrr.com,253603,False,False,True,True,False,2403091124,2403091166,41,8,34,8,34,16276,178.32.0.0/15
2001:41d0:b:6c8::1,True,www.tigerrr.com,253603,False,False,True,True,False,2311219388,2311219430,41,8,34,8,34,16276,2001:41d0::/32
195.216.236.200,True,www.inbox.eu,274706,False,False,True,True,False,274553123,274553124,38,8,34,8,34,12993,195.216.236.0/24


### Comparison with Andreas' results from June

In [9]:
# Load the pipe-separated results table, indexed by the 'ip' column.
# The delimiter is a regex, so it is written as a raw string: "\s" and "\|"
# are not valid Python string escapes and trigger a warning otherwise.
# The "Unnamed: 0" / "Unnamed: 13" columns dropped here are presumably
# artifacts of leading and trailing '|' characters on each line.
andreas_df = (
    pd.read_table("andreas_results.txt", delimiter=r"\s*\|\s*", engine="python")
    .set_index('ip')
    .drop("Unnamed: 0", axis=1)
    .drop("Unnamed: 13", axis=1)
)

In [10]:
len(andreas_df)

362

In [11]:
len(andreas_df[andreas_df['google'] == 1])

304

In [12]:
# Domain set intersection
sep_tfoset = set(sieve6all['cookie']['host'].unique())
jun_tfoset = set(andreas_df['host'].unique())
sep_allset = set(tfo6_jdf['host'].unique())