# Tools for inspecting individual beacons

See "Performance Bucket TimeSeries" and ../ReadMe for more information, including the required tools.

## Setup

* Install [Jupyter](http://jupyter.org/install)
* Install [gcloud SDK](https://cloud.google.com/sdk/downloads)
* Install the google-cloud-bigquery package:

 + `pip install --upgrade google-cloud-bigquery`
 
* Permissions:

 + Join: `discussion@measurement-lab.net` (See https://www.measurementlab.net/data/docs/bq/quickstart/ )
 + Authenticate: `gcloud auth application-default login`
 + Set default project: `gcloud config set project mlab-sandbox`
 
* Start Jupyter

 + `jupyter notebook`
 
## References

* Matplotlib - https://matplotlib.org/contents.html
* Pandas - https://pandas.pydata.org/pandas-docs/stable/api.html 
* BigQuery - https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators 

In [1]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections
import pickle
# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

# Some matplotlib features are version dependent.
# Compare the version numerically: comparing the raw strings is lexicographic
# and would rank '10.0.0' below '2.1.2'.
_mpl_version = tuple(int(part)
                     for part in matplotlib.__version__.split('.')[:3]
                     if part.isdigit())
if _mpl_version < (2, 1, 2):
    raise ImportError('matplotlib >= 2.1.2 is required, found %s'
                      % matplotlib.__version__)
print("Done")

Done


# Global flags

Invoke cells individually, or reorder them, to change the defaults.

In [2]:
# Most recent partition_date to include (inclusive); becomes the default
# {enddate} substitution in the query templates below
EndDate = '2018-05-13'

In [3]:
# Skip slow/expensive queries
DoQueries=False

In [4]:
# Force queries
DoQueries=True

In [5]:
# Enable interactive figures that pan and zoom
interactive = True

In [6]:
# Render figures inline in the browser so that they are saved with the
# notebook (github etc).
interactive = False

In [7]:
# Disable plots for the paper
DoPaper = False

In [8]:
# Enable plots for the paper
DoPaper = True

In [9]:
# Enable the experimental comparison plots (see the debugging cells near the end)
DoExp = True

In [10]:
# Disable the experimental comparison plots (see the debugging cells near the end)
DoExp = False

In [11]:
def setupmatplotlib(force=None):
    global interactive
    if force == 'inline':
        %matplotlib inline
        return
    elif force == 'interactive':
        %matplotlib
        return
    elif force is not None:
        print 'Unknown option, using default'
    if interactive:
        print 'default interactive'
        %matplotlib
        return
    else:
        print 'default inline'
        %matplotlib inline
        return
setupmatplotlib()

default inline


In [12]:
# Depends on: pip install --upgrade google-cloud-bigquery
# TODO: Consider automatically inserting #standardSQL
from google.cloud import bigquery

def expand_query(query, **kwargs):
    """Expand nested {parameter} substitutions in a query template.

    Substitution values come from the global DefaultArgs, overridden by any
    keyword arguments.  A value may itself contain {parameter} references;
    these are resolved by formatting the template repeatedly.

    Stashes forensic output in globals:
      DebugQuery    - the fully expanded query, for pasting into BQ
      NumberedQuery - the same text with 1-based line numbers, to line up
                      with the [line:column] positions in BQ error messages

    Returns the expanded query string.
    Raises KeyError when a {parameter} has no value, and ValueError when
    substitutions are still present after the maximum nesting depth.
    """
    global DebugQuery  # For pasting into BQ, after the fact
    global NumberedQuery  # For grokking BQ error line numbers.

    # Only allow argument substitution 4 levels deep, because
    # accidental infinite recursion risks crashing the notebook.
    max_depth = 4

    args = DefaultArgs.copy()
    args.update(kwargs)
    for _ in range(max_depth):
        query = query.format(**args)
    if '{' in query:
        # A bare string is not a valid exception object; raise a real one.
        raise ValueError("Unexpanded substitutions")

    # Leave crumbs if we need a postmortem
    DebugQuery = query
    NumberedQuery = "".join(
        "%3d %s\n" % (lineno, text)
        for lineno, text in enumerate(query.split('\n'), 1))

    return query

def run_query(query, project='mlab-sandbox', otherindex=None, timeindex='partition_date', **kwargs):
    """Run a query template on BigQuery and return the rows as a DataFrame.

    Accepts nested {parameter} substitutions, passed as extra keyword
    arguments and expanded by expand_query().

    Index selection, first match wins:
      otherindex             - result column used verbatim as the index
      timeindex='test_time'  - that column is converted to a DatetimeIndex,
                               treating the values as seconds (the original
                               code scaled them by 1e9 for that purpose)
      timeindex=<column>     - that column is handed to pd.DatetimeIndex
      timeindex=None         - no index: a raw DataFrame

    Stashes forensic output in globals (via expand_query); the numbered
    query is printed when fetching the results fails, and the error is
    re-raised.
    """
    query = expand_query(query, **kwargs)

    # do the work
    client = bigquery.Client(project=project)
    job = client.query(query)  # All errors are delayed

    # Marshal the results column by column, catching async errors
    results = collections.defaultdict(list)
    try:
        for row in job.result(timeout=300):
            for key in row.keys():
                results[key].append(row.get(key))
    except Exception:
        print(NumberedQuery)
        raise

    if otherindex:
        return pd.DataFrame(results, index=results[otherindex])
    # Default is timeindex='partition_date', but 'test_time' is common
    if timeindex == 'test_time':
        print("Index by test_time")
        # results[timeindex] is a plain list: `list * 1000000000` would repeat
        # the list a billion times rather than scale seconds to nanoseconds.
        return pd.DataFrame(results,
                            index=pd.to_datetime(results[timeindex], unit='s'))

    if timeindex:
        return pd.DataFrame(results, index=pd.DatetimeIndex(results[timeindex]))
    # set timeindex=None to force a raw DataFrame
    return pd.DataFrame(results)

def write_query_table(query, otable,
                      project='mlab-sandbox', dataset_id='mattmathis',
                      **kwargs):
    """Run a query template and store its result in a BigQuery table.

    Accepts nested {parameter} substitutions, passed as extra keyword
    arguments and expanded by expand_query().

    The destination is table `otable` in dataset `dataset_id` of `project`;
    it is written with WRITE_TRUNCATE.

    Stashes forensic output in globals (via expand_query); the numbered
    query is printed when the job fails, and the error is re-raised.
    Returns None.
    """
    # Imported locally: the notebook only imports `time` in a much later
    # cell, so the polling loop would otherwise depend on cell order.
    import time

    query = expand_query(query, **kwargs)

    # do the work
    client = bigquery.Client(project=project)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = client.dataset(dataset_id).table(otable)
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job = client.query(query, location='US', job_config=job_config)

    # Wait for the job, catching async errors
    try:
        job.result()  # Query errors surface here
        while not job.done():
            print('tick')
            time.sleep(5)
        assert job.state == 'DONE'
    except Exception:
        print("Query Errored")
        print(NumberedQuery)
        raise
    print("Query completed")
    return
# Not tested, can read a table with
#    iterator = client.list_rows(
#        table_ref, selected_fields=[bigquery.SchemaField('my_col', 'INT64')]) 
    
# Disabled smoke test for write_query_table: change to `if True:` to copy
# the master annotations table into the table test_results2
if False:
    testQ="""
    SELECT *
    FROM `mattmathis.new_master_annotations`
    """
    write_query_table(testQ, otable='test_results2')

print "Done"

Done


In [13]:
def unlog(x, pos):
    """Format a log10-scaled tick position as its linear value.

    x is the tick location, interpreted as a base-10 exponent.  pos is the
    second argument a matplotlib FuncFormatter supplies; it is not used.
    Whole numbers are printed without decimals, anything else with one
    decimal place.
    """
    linear = math.pow(10, x)
    fractional_part, integer_part = math.modf(linear)
    if fractional_part > 0:
        return '%.1f' % linear
    return '%d' % integer_part

logFormatter = matplotlib.ticker.FuncFormatter(unlog)
print "Done"

Done


In [14]:
# Automatically save or restore pickle data
def AutoPickle(data, name, save=True):
    """Save fresh data to <name>.pickle, or fall back to the saved copy.

    data - the freshly computed object.  It counts as valid when len(data)
           is at least 2; None, objects without a length, and empty or
           one-element containers are treated as "no fresh data".
    name - file name without the '.pickle' suffix
    save - when False, valid data is returned without touching the file

    Returns `data` when it is valid, otherwise the object loaded from
    <name>.pickle.  Errors while writing or reading the file propagate to
    the caller; a failed save no longer silently returns the stale copy.

    NOTE: pickle.load executes whatever the file contains; only point this
    at pickle files written by this notebook.
    """
    fname = name + '.pickle'

    # Only the validity check is allowed to fail quietly.
    try:
        valid = len(data) >= 2
    except TypeError:
        valid = False

    if valid:
        if save:
            print("Saving Data")
            with open(fname, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        else:
            print("Data seems valid, but save != True")
        return data

    print("Loading prior data")
    with open(fname, 'rb') as f:
        return pickle.load(f)
print 'Done'

Done


## Query templates for most of the plots below
Unidirectional with selected percentiles

In [15]:
# All of these queries yield timeseries of multidimensional histograms of 'value'

# Query the list of beacons
BeaconQ ="""
SELECT
  clientIP {beacon_fields}
FROM
  `mattmathis.new_master_annotations`
WHERE
  clientIP NOT IN (
    '45.56.98.222',
    '64.9.225.99',
    '64.9.225.190' ) # exclude eb, etc
    {beacon_where}
"""

# Query relevant fields of the beacons of interest
# This yields one row per test
# with download rate (Mb/s) in column 'value'
DownloadQ="""
SELECT
  partition_date,
  web100_log_entry.connection_spec.remote_ip AS clientIP,
  connection_spec.data_direction AS direction,
  web100_log_entry.connection_spec.local_ip AS local_ip,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 5),
         "UNK") AS server_site,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 3),
         "UNK") AS server_metro,
  web100_log_entry.log_time AS test_time,
  web100_log_entry.snap.duration AS duration,
  web100_log_entry.snap.HCDataOctetsOut AS bytes_transfered,
  {more_data}
  SAFE_DIVIDE(web100_log_entry.snap.HCDataOctetsOut, web100_log_entry.snap.duration) * 8 AS value
FROM
  `measurement-lab.release.ndt_all`
WHERE
  connection_spec.data_direction = 1
  AND web100_log_entry.snap.duration > 10000
  AND web100_log_entry.snap.HCDataOctetsOut > 0
  AND web100_log_entry.snap.HCDataOctetsOut < 1625000000
  {data_where}
  AND partition_date <= '{enddate}' 
"""

# Query relevant fields of the beacons of interest
# This yields one row per test
# with upload rate (Mb/s) in column 'value'
UploadQ="""
SELECT
  partition_date,
  web100_log_entry.connection_spec.remote_ip AS clientIP,
  connection_spec.data_direction AS direction,
  web100_log_entry.connection_spec.local_ip AS local_ip,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 5),
         "UNK") AS server_site,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 3),
         "UNK") AS server_metro,
  web100_log_entry.log_time AS test_time,
  web100_log_entry.snap.duration AS duration,
  web100_log_entry.snap.HCDataOctetsIn AS bytes_transfered,
  {more_data}
  SAFE_DIVIDE(web100_log_entry.snap.HCDataOctetsIn, web100_log_entry.snap.duration) * 8 AS value
FROM
  `measurement-lab.release.ndt_all`
WHERE
  connection_spec.data_direction = 0
  AND web100_log_entry.snap.duration > 10000
  AND web100_log_entry.snap.HCDataOctetsIn > 0
  AND web100_log_entry.snap.HCDataOctetsIn < 1625000000
  {data_where}
  AND partition_date <= '{enddate}' 
"""

# Query relevant fields of the beacons of interest
# This yields one row per test
# with RTT in column 'value'
RTTQ="""
SELECT
  partition_date,
  web100_log_entry.connection_spec.remote_ip AS clientIP,
  connection_spec.data_direction AS direction,
  web100_log_entry.connection_spec.local_ip AS local_ip,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 5),
         "UNK") AS server_site,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 3),
         "UNK") AS server_metro,
  web100_log_entry.log_time AS test_time,
  web100_log_entry.snap.duration AS duration,
  web100_log_entry.snap.HCDataOctetsOut AS bytes_transfered, 
  SAFE_DIVIDE(web100_log_entry.snap.SumRTT, web100_log_entry.snap.CountRTT) AS value
FROM
  `measurement-lab.release.ndt_all`
WHERE
  connection_spec.data_direction = 1
  AND web100_log_entry.snap.duration > 10000
  AND web100_log_entry.snap.HCDataOctetsOut > 0
  AND web100_log_entry.snap.HCDataOctetsOut < 1625000000
  {data_where}
  AND partition_date <= '{enddate}' 
"""

# Query relevant fields of the beacons of interest
# This yields one row per test
# with MinRTT in column 'value'
MinRTTQ="""
SELECT
  partition_date,
  web100_log_entry.connection_spec.remote_ip AS clientIP,
  connection_spec.data_direction AS direction,
  web100_log_entry.connection_spec.local_ip AS local_ip,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 5),
         "UNK") AS server_site,
  IFNULL(SUBSTR(connection_spec.server_hostname, -25, 3),
         "UNK") AS server_metro,
  web100_log_entry.log_time AS test_time,
  web100_log_entry.snap.duration AS duration,
  web100_log_entry.snap.HCDataOctetsOut AS bytes_transfered, 
  web100_log_entry.snap.MinRTT AS value
FROM
  `measurement-lab.release.ndt_all`
WHERE
  connection_spec.data_direction = 1
  AND web100_log_entry.snap.duration > 10000
  AND web100_log_entry.snap.HCDataOctetsOut > 0
  AND web100_log_entry.snap.HCDataOctetsOut < 1625000000
  {data_where}
  AND partition_date <= '{enddate}' 
"""

# Joinclause
joinQ = """
    SELECT *
    FROM ( {data} )
    INNER JOIN ( {beacons} )
    USING ( clientIP )
"""

# Aggregate test statistics by partition_date and server_site
mainQ="""
#standardSQL
SELECT
  UNIX_DATE(partition_date) * 86400 AS partition_time,
  partition_date,
  server_site,
  ANY_VALUE(server_metro) AS server_metro,
  COUNTIF(value < 1.0) AS LT001,
  COUNTIF(value < 2.0) AS LT002,
  COUNTIF(value < 4.0) AS LT004,
  COUNTIF(value < 8.0) AS LT008,
  COUNTIF(value < 16.0) AS LT016,
  COUNTIF(value < 32.0) AS LT032,
  COUNTIF(value < 64.0) AS LT064,
  COUNTIF(value < 128.0) AS LT128,
  COUNTIF(value < 256.0) AS LT256,
  COUNTIF(value < 512.0) AS LT512,
  COUNT(*) AS count
FROM ( {joinclause} )
GROUP BY
  partition_date,
  server_site
ORDER BY
  partition_date
"""

global EndDate # prevent irrelevant changes
# NOTE(review): a `global` statement at module level has no effect.

# Default values for optional parameters.
# expand_query() starts from a copy of this dict and overlays the caller's
# keyword arguments, so placeholders such as {beacons}, {data} and
# {joinclause} fall back to the values given here.
DefaultArgs = {
    'beacons':BeaconQ,
    'beacon_fields':'',
    'beacon_where':'',
    'data':DownloadQ,
    'more_data':'',
    'data_where':'',
    'enddate':EndDate,
    'joinclause':joinQ
}

# Useful debugging queries
CountQ="""
#standardSQL
SELECT
    count(*) AS count
FROM ( {counted} )
"""

LimitQ="""
#standardSQL
SELECT
    *
FROM ( {counted} ) LIMIT 10
"""

print "Done"

Done


In [16]:
# Smoke test for the BigQuery credentials: count the rows of the master
# beacon table.  The count should be about 1.55M.
if True:
    print(run_query(CountQ, counted=BeaconQ, timeindex=None))

     count
0  1555329


# Global Fleet Information

The first cell below was generated by MiscSmallQueries, which extracts the data from the sites spreadsheet.

The second cell below was manually generated to help tell some stories.

In [17]:
# Sites with missing region fields ['']
contenents = {'Europe': ['vie', 'bcn', 'ham', 'trn', 'svg', 'dub', 'lis', 'fra', 'ath', 'beg', 'lba', 'prg', 'par', 'lhr', 'lca', 'mil', 'ams', 'lju', 'bru', 'arn', 'mad'], 'Oceania': ['wlg', 'syd', 'akl'], 'South_America': ['fln', 'bog'], 'Africa': ['acc', 'jnb', 'tnr', 'nbo', 'tun', 'los', 'mpm'], 'Asia': ['tpe', 'mnl', 'hnd', 'bkk', 'sin', 'bom'], 'North_America': ['yqm', 'den', 'yvr', 'mia', 'iad', 'atl', 'lga', 'nuq', 'ywg', 'yyc', 'lax', 'yul', 'sea', 'sjc', 'ord', 'yyz', 'dfw']}
# Sites with missing transit fields ['', 'lga0t', 'iad0t', 'iad1t']
transits = {'Serbian Open eXchange': ['beg01'], 'Level 3': ['fra04', 'mad02'], 'go6': ['lju01'], 'Tata': ['bom02', 'mia03', 'ord03', 'lax02', 'sin01', 'nuq04', 'lga03', 'dfw02', 'atl03', 'sea02', 'iad03'], 'Internap': ['atl06', 'ams07', 'sjc01', 'lga07', 'dfw06', 'sea06'], 'Hurricane Electric': ['yul02', 'yul01', 'yyz02', 'yyc01', 'yyz01', 'yvr01', 'yyc02', 'yqm01', 'ywg01'], 'PHOpenIX': ['mnl01'], 'Biglobe': ['hnd02'], 'ISC': ['nuq02'], 'Vodafone': ['ams05', 'arn02', 'lis02', 'bru01', 'mil05', 'lhr04', 'par05', 'prg05', 'fra03'], 'Ghana IXP': ['acc02', 'acc01'], 'CAT Telecom': ['bkk01'], 'Victoria University of Wellington': ['wlg01'], 'AARNET': ['syd01'], 'Telia': ['mad03', 'arn01', 'arn04', 'lhr02', 'ams03', 'par01', 'fra01', 'mil02', 'ham01', 'prg03', 'bru03', 'par03'], 'RNP (ASN 1916) and FAPESC (ASN 52950).': ['fln01'], 'Tinet': ['mad01', 'mil01'], 'Altibox': ['svg01'], 'Zayo': ['dfw05', 'mia05', 'ord05', 'lga06', 'den04', 'nuq06', 'lax05'], 'Vocus': ['syd02'], 'Leaseweb': ['ams06'], 'XO': ['dfw04', 'atl05', 'den03', 'nuq05', 'sea05', 'iad01'], 'WIDE': ['hnd01'], 'REANNZ': ['akl01', 'wlg02'], 'CyNet': ['lca01'], 'Ubuntunet': ['nbo01'], 'RTR': ['vie01'], 'Nigeria IXP': ['los01'], 'Cogent': ['ord02', 'mia02', 'dfw01', 'atl02', 'sea01', 'nuq03', 'lax01', 'iad02', 'lga02'], 'CATNIX / Orange': ['bcn01'], 'GTT': ['ams04', 'lhr03', 'mia04', 'prg04', 'arn05', 'mad04', 'par04', 'lax03', 'ord04', 'fra02', 'sea03', 'atl04', 'mil03', 'lga04', 'den01', 'iad04', 'bru04'], 'Airtel': ['bom01'], 'aql': ['lba01'], 'National Chi Nan University': ['tpe01'], 'ATI': ['tun01'], 'Voxel': ['lga01'], 'Google': ['nuq01'], 'Internap / Voxel': ['ams02'], 'Telecom Malagasy': ['tnr01'], 'Topix': ['trn01'], 'GRNET': ['ath01', 'ath02', 'ath03'], 'Tenet': ['jnb01'], 'HEanet': ['dub01'], 'Level3': ['lhr01', 'arn03', 'ams01', 'lhr05', 'mia01', 'lga05', 'lga1t', 'mil04', 'prg01', 'lax04', 'lis01', 'par02', 'atl01', 'bru02', 'dfw03', 'prg02', 'den02', 'sea04', 'iad05', 'ord01'], 'Telefonica': 
['bog01'], 'Morenet': ['mpm01']}
cities = {'': '', 'prg': 'Prague', 'yyc': 'Calgary', 'vie': 'Vienna', 'bru': 'Brussells', 'nuq': 'San Jose', 'lba': 'Leeds', 'jnb': 'Johannesburg', 'yul': 'Montreal', 'mnl': 'Quezon City', 'sea': 'Seattle', 'ord': 'Chicago', 'arn': 'Stockholm', 'tpe': 'Taipei', 'fra': 'Frankfurt', 'ham': 'Hamburg', 'yvr': 'Vancouver', 'mia': 'Miami', 'iad': 'Washington', 'dfw': 'Dallas', 'los': 'Lagos', 'lis': 'Lisbon', 'sjc': 'San Jose', 'sin': 'Changi', 'lhr': 'London', 'yqm': 'Moncton', 'lga': 'New York', 'syd': 'Sydney', 'akl': 'Auckland', 'par': 'Paris', 'bkk': 'Bangkok', 'mpm': 'Maputo', 'lax': 'Los Angeles', 'mad': 'Madrid', 'tun': 'Tunis', 'lca': 'Nicosia', 'ams': 'Amsterdam', 'yyz': 'Toronto', 'mil': 'Milan', 'acc': 'Accra', 'bcn': 'Barcelona', 'hnd': 'Tokyo', 'tnr': 'Antananarivo', 'svg': 'Sola', 'atl': 'Atlanta', 'trn': 'Turin', 'beg': 'Belgrade', 'ath': 'Athens', 'ywg': 'Winnipeg', 'bom': 'Mumbai', 'den': 'Denver', 'fln': 'Florian\xc3\xb3polis', 'nbo': 'Nairobi', 'wlg': 'Wellington', 'lju': 'Ljubljana', 'dub': 'Dublin', 'bog': 'Bogota'}

In [18]:
# Manual metro sets
USAmetros = ['lga', 'den', 'iad', 'dfw', 'ord', 'lax', 'sea', 'nuq']


# Default beacon timeseries plotter

In [19]:
# Base series plotting code
def new_plot_beacon_rates(pdata, ofile=None, title=None, ztime=None, xlim=None, figsize=(16, 20)):
    """Scatter-plot the per-test rates of a beacon as a time series.

    pdata   - DataFrame with a 'rate' column, plus 'test_time' when ztime is
              given.  The rates are plotted against the frame's index
              (assumed to be a DatetimeIndex, since date locators are used).
    ofile   - when set, save the figure to this path; the axis is then
              labelled with years only and the title is omitted
    title   - figure title, shown only when not writing to ofile
    ztime   - optional hour of day, compared against
              (test_time % 86400) / 3600; tests at or after it are drawn
              with 'x', earlier ones with '+'
    xlim    - optional x-axis limits, passed to set_xlim
    figsize - figure size handed to plt.subplots
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, sharex=True, figsize=figsize)

    # make it pretty
    ax.xaxis.set_major_locator(matplotlib.dates.YearLocator())
    ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())
    if ofile:
        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y'))
        ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter(''))
    else:
        ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y   '))
        ax.tick_params(axis='x', labelrotation=90)
        ax.xaxis.set_minor_formatter(matplotlib.dates.DateFormatter('%m'))

    if xlim is not None:
        ax.set_xlim(xlim)

    rate = pdata['rate']
    # `is not None` so that ztime=0 (midnight) is honoured
    if ztime is not None:
        hour = (pdata['test_time'] % 86400) / 3600.0
        # >= so that tests falling exactly on ztime are not dropped
        ax.plot(rate[hour >= ztime], marker='x', linestyle='None', label='')
        ax.plot(rate[hour < ztime], marker='+', linestyle='None', label='')
    else:
        ax.plot(rate, marker='.', linestyle='None', label='')

    if title and not ofile:
        # y is a figure-relative position and must be a number, not a string
        fig.suptitle(title, y=0.97, fontsize=14)
    if ofile:
        fig.savefig(ofile, dpi=100)
    plt.show()
# tester needed
print 'Done'

Done


# Testing area
Paste experimental queries and plotting code below.  When done, move them to the proper section.

In [20]:
# CHANGE LOG for experimental  (this duplicates another cell)

# Query beacons of interest
# Time series of single beacons, for illustration 

# Beacon selectors
first1000beacons="""
  ORDER BY series_start ASC LIMIT 1000
"""

greedybeacons="""
  ORDER BY series_download_bytes DESC LIMIT 100
"""

longestbeacons="""
    ORDER BY series_elapsed_days DESC LIMIT 100
"""

selectedBeaconQ ="""
SELECT
  clientIP {beacon_fields}
FROM
  `mattmathis.new_master_annotations`
WHERE
  clientIP IN ( {roguesgallery} )
"""

beacon_fields=", series_start, series_start_asc, series_count, series_download_bytes, series_elapsed_days"

seriesQ="""
#standardSQL
SELECT
  UNIX_DATE(partition_date) * 86400 AS partition_time,
  partition_date,
  server_site,
  server_metro,
  clientIP,
  test_time,
  duration,
  bytes_transfered,
  value AS rate
  {beacon_fields}
FROM ( {joinclause} )
ORDER BY test_time ASC
"""

In [21]:
# Focus on Interconnection Beacons
# Running to lga between 2013-01-01 and 2014-08-01
# To selected access ISPs
if True:
#    bw = 'ORDER BY series_start ASC LIMIT 10'
    # Keep beacons whose series starts before and ends after the study window.
    # NB: the lines starting with '#' inside this string are part of the
    # query text (SQL comments), not Python comments.
    beacon_where = """
        AND series_start_asc < '2013-01-01'
        AND series_end_asc > '2014-08-01'
#        AND REGEXP_CONTAINS(clientIP, '^24.')
#        ORDER BY series_count DESC
#        LIMIT 10
        """
    # Restrict the tests to servers in the listed metros; {metros} is filled
    # in from tmpargs below
    data_where = """
        AND SUBSTR(connection_spec.server_hostname, -25, 3) in ( {metros} )
        """
    tmpargs = {
        'data':DownloadQ,
        'beacon_fields':beacon_fields,
        'beacon_where':beacon_where,
        'data_where':data_where,
        'metros':" 'lga' "
    }
    # timeindex=None returns a raw DataFrame; the index is set from
    # test_time right after
    InterconnectionBeacons = run_query(seriesQ, timeindex=None, **tmpargs)
    InterconnectionBeacons.index = pd.DatetimeIndex(
        InterconnectionBeacons['test_time']*1000000000) # NB: test_time
print 'Done', len(InterconnectionBeacons)

Done 823881


In [22]:
# print InterconnectionBeacons[0:2]
# print NumberedQuery

In [None]:
# Plot a handful of the interconnection-study beacons, one figure each
if True:
    setupmatplotlib('inline')
    xlim = pd.DatetimeIndex(['2013-01-01', '2014-08-01'])
    tmp = pd.DataFrame(InterconnectionBeacons, copy=True)
    clients = set(tmp['clientIP'])
    print("Clients:  %d" % len(clients))

    ptmp = tmp.pivot_table(index=tmp.index, columns='clientIP',
                           values=['rate', 'test_time'])
    print(ptmp)

    # A set has no defined order, so which 11 beacons get plotted is arbitrary
    for i, c in enumerate(clients):
        if i > 10:
            break
        print('lga beacon: %s' % c)
        new_plot_beacon_rates(tmp[tmp['clientIP'] == c],
                              title='Lga beacon: ' + c,
                              xlim=xlim, ztime=12.0, figsize=(16, 3))
print('Done')

# Search for generically interesting beacons
(High rate, frequent or early tests).

In [None]:
# Fetch the test series of the 100 beacons with the most downloaded bytes
if DoQueries:
    GreedyBeacons = run_query(
        seriesQ, beacon_where=greedybeacons, beacon_fields=beacon_fields)
    # Re-index by test time (test_time scaled by 1e9 for DatetimeIndex)
    GreedyBeacons.index = pd.DatetimeIndex(GreedyBeacons['test_time'] * 1000000000)
    print(GreedyBeacons.head(3))
    print('Query Done')

In [None]:
if True:
    # Force inline because interactive opens seperate windows for each
    %matplotlib inline
    tmp = pd.DataFrame(GreedyBeacons, copy=True)
    
    pdata = tmp.pivot_table(index=tmp.index, columns='clientIP', values='rate')
    clients=list(pdata)
    for c, n in zip(clients, range(len(clients))):
        t = GreedyBeacons[GreedyBeacons['clientIP']== c]
        print "   '%s', # Greedy %d"%(c, n)
        new_plot_beacon_rates(t, title="Client "+c, figsize=(16, 3))
print 'Done'

In [None]:
# Display the first 1000 Beacons
if DoQueries:
    First1000Beacons = run_query(seriesQ,
                             beacon_fields=beacon_fields, beacon_where=first1000beacons)
    First1000Beacons.index=pd.DatetimeIndex(First1000Beacons['test_time']*1000000000)
if True:
    # Force inline because interactive opens seperate windows for each
    %matplotlib inline
    # Force copying to prevent corrupting raw_data[] when re-executing bad code
    pdata = First1000Beacons.pivot_table(index=tmp.index, columns='clientIP', values='rate')
    clients=list(pdata)

    for c, n in zip(clients, range(len(clients))):
        print "   '%s', # First %d"%(c, n)
        plot_beacon_rates(First1000Beacons[c], title="Client "+c, figsize=(16, 3))
    print 'Done'

In [None]:
# Display the 100 longest beacons
if DoQueries:
    LongestBeacons =  run_query(seriesQ, timeindex='test_time',
                             beacon_fields=beacon_fields, beacon_where=longestbeacons)
if True:
    # Force inline because interactive opens seperate windows for each
    %matplotlib inline
    tmp = pd.DataFrame(LongestBeacons, copy=True)
    tmp.index = pd.DatetimeIndex(tmp['test_time']*1000000000)
    pdata = tmp.pivot_table(index=tmp.index, columns='clientIP', values='rate')
    print len(list(pdata))
    for c, n in zip(list(pdata), range(len(pdata))):
        print "   '%s', # Longest %d"%(c, n)
        plot_beacon_rates(pdata, title="Client "+c, clients=[c], figsize=(16, 3))

In [None]:


if True:
    # Force inline because interactive opens separate windows for each
    setupmatplotlib('inline')

    # Take the beacon list straight from the clientIP column.  A pivot_table
    # built with columns= but no index= comes back transposed (the value
    # names are the rows), so it cannot be indexed by 'test_time'.
    clients = sorted(set(InterconnectionBeacons['clientIP']))
    # clients=['24.113.136.215']
    print(clients)
    for n, c in enumerate(clients):
        print("   '%s', # First %d" % (c, n))
        tmp = InterconnectionBeacons[InterconnectionBeacons['clientIP'] == c]
        new_plot_beacon_rates(tmp, title="Client " + c, figsize=(16, 3))

In [None]:
# SAVE slice on arbitrary predicates
# debugging code
# Test queries
# NOTE(review): query_global_cohort and plot_geo_rates are not defined in
# the part of the notebook shown here -- presumably they come from a sibling
# notebook; confirm before enabling either branch.

# Only the last assignment takes effect; the first is kept for reference.
predicate = 'REGEXP_CONTAINS(ClientType, "BT")'  # not it
predicate = 'connection_spec.websockets IS TRUE' # Big shift 2017-05-01
if False:
    args = {
        'clist':['North_America'],
        'predicate':predicate,
#        'cohort':'2010-06-01',
#        'coend':'2018-01-01',
        'more_data':'connection_spec.websockets AS websockets, '
    }
    comparison = {
        'NA_all':query_global_cohort(pmode=0, **args)['North_America'],
        'NA_withoutPred':query_global_cohort(pmode=-1, **args)['North_America'],
        'NA_OnlyPred':query_global_cohort(pmode=-2, **args)['North_America']}

import time
# NOTE(review): `comparison` is only assigned inside the disabled
# `if False:` block above, so enabling DoExp alone ends in a NameError.
if DoExp:
    setupmatplotlib('interactive')
    time.sleep(1)
    plot_geo_rates(comparison)

print 'Done'

In [None]:
# debugging code
# Test queries
# Beacon Timeseries
# (The line above was bare text inside the code cell, which made the whole
# cell a SyntaxError.)
# NOTE(review): query_global_cohort and plot_geo_rates are not defined in
# the part of the notebook shown here -- confirm where they come from
# before enabling either branch.

# Only the last assignment takes effect; the first is kept for reference.
predicate = 'REGEXP_CONTAINS(ClientType, "BT")'  # not it
predicate = 'connection_spec.websockets IS TRUE' # Big shift 2017-05-01
if False:
    args = {
        'clist':['North_America'],
        'predicate':predicate,
#        'cohort':'2010-06-01',
#        'coend':'2018-01-01',
        'more_data':'connection_spec.websockets AS websockets, '
    }
    comparison = {
        'NA_all':query_global_cohort(pmode=0, **args)['North_America'],
        'NA_withoutPred':query_global_cohort(pmode=-1, **args)['North_America'],
        'NA_OnlyPred':query_global_cohort(pmode=-2, **args)['North_America']}

# NOTE(review): `comparison` is only assigned inside the disabled
# `if False:` block above.
if DoExp:
    setupmatplotlib('interactive')
    plot_geo_rates(comparison)

print('Done')

# Time series of individual beacons

In [None]:
# Detailed plot of manually selected beacons
# Note that this predates new master beacons, and many are excluded now
roguesgallery = [
 '144.130.155.1', # Greedy 23 - Strong ~2 MB/s Max + outliers UP
 '163.7.137.201', # Greedy 25 ~ 850 Mb/s max but only 2 days
 '163.7.137.243', # Greedy 26 - 850 Mb/s max, 5 days
 '195.143.162.141', # Greedy 35 - Varying flattop (w/ outliers)
 '204.246.122.65', # Greedy 42 - Varying flattop (w/ outliers) ~ 3 years
 '208.77.130.154', # Greedy 48 - stripes
 '23.228.128.99', # Greedy 52 - stripes
 '45.79.140.244', # Greedy 56 - 850 for ~ 1 day, then 400 Mb/s
 '45.79.155.9', # Greedy 57 - Many tests at low rates, two spikes one to 800+
 '77.95.64.13', # Greedy 90 - several days at ~700 Mb/s, downward shift
 '121.54.32.106', # Longest 7 (and many like it) thin veil w/ gaps July 2010, 2012 2016-2017, upward slope
 '217.72.93.226', # Longest 64 - long staircase
 '84.1.111.194', # Longest 93 - long staircase
 '93.99.142.1', # Longest 99 - 3 rate steps but decreasing test volumes
 '121.54.32.102', # First 42 - Another BT?
 # (Through)
 '121.54.32.108', # First 46
 '84.235.73.18', # First 819 - NAT!  Very uniform noisy mostly under 2 MB/s
 '84.235.73.19', # First 820 - NAT!  Very uniform noisy mostly under 2 MB/s
 '84.235.73.20', # First 821 - NAT!  Very uniform noisy mostly under 2 MB/s
 '84.235.73.21', # First 822 - NAT!  Very uniform, flat and noisy mostly under 2 MB/s
 '84.235.75.18', # First 823 - NAT!  Very uniform noisy mostly under 2 MB/s
 '84.235.75.21', # First 824 - NAT!  Very uniform noisy mostly under 2 MB/s
]

if True:
    # str(list)[1:-1] drops the surrounding brackets, leaving the quoted,
    # comma-separated addresses for the IN ( {roguesgallery} ) clause
    CuratedBeacons = run_query(seriesQ, beacons=selectedBeaconQ, timeindex=None,
                             beacon_fields=beacon_fields, roguesgallery=str(roguesgallery)[1:-1])
    # Re-index by test time (test_time scaled by 1e9 for DatetimeIndex)
    CuratedBeacons.index = pd.DatetimeIndex(CuratedBeacons['test_time']*1000000000)
print 'Done'

# print CuratedBeacons
# Quick check: plot a single curated beacon
if True:
    tmp = CuratedBeacons[CuratedBeacons['clientIP'] == '84.235.73.20']
    new_plot_beacon_rates(tmp, title='Test of 84.235.73.20', figsize=(16, 3))

In [None]:
# Display seleceted (Curated) beaconsnew_plot_beacon_rates

if True:
    # Force inline because interactive opens seperate windows for each
    %matplotlib inline
    tmp = pd.DataFrame(CuratedBeacons, copy=True)
    pdata = tmp.pivot_table(index=tmp.index, columns='clientIP', values='rate')
    print len(list(pdata))
    for c, n in zip(list(pdata), range(len(pdata))):
        print "   '%s', # Selected %d"%(c, n)
        new_plot_beacon_rates(pdata, title="Client "+c, clients=[c], figsize=(16, 3))
print "Done"

In [None]:
# Paper Figures

PaperDir = '../paper/'
PaperDir = './'
DoPaper = False
if True:
    ofile = None
    if DoPaper:
        ofile = PaperDir+'Selected.png'
        print 'Formating to:', ofile
    %matplotlib
    tmp = pd.DataFrame(CuratedBeacons, copy=True)
    # pdata = tmp.pivot_table(index=tmp.index, columns='clientIP', values='rate')
    new_plot_beacon_rates(tmp[tmp['clientIP']=='217.72.93.226'],
                  ofile=ofile, figsize=(6, 2))