In [1]:
# Enables figures to load inline in the browser.
# %matplotlib inline

In [2]:
# Enables figures to load outside of browser.
%matplotlib 

Using matplotlib backend: Qt5Agg


In [5]:
import os
import math
import csv
import pandas as pd
import numpy as np
import matplotlib.dates as dates
import matplotlib.pyplot as plt
import matplotlib.ticker
import datetime
import collections

# Some matplotlib features are version dependent.
# Compare numeric components instead of raw strings: as strings, '10.0.0'
# sorts before '2.1.2' and would wrongly fail the check.
# Non-numeric components (e.g. the '0rc1' in '3.1.0rc1') are ignored.
_mpl_version = tuple(
    int(part) for part in matplotlib.__version__.split('.')[:3] if part.isdigit()
)
assert _mpl_version >= (2, 1, 2), (
    "matplotlib >= 2.1.2 is required, found %s" % matplotlib.__version__
)

In [6]:
# Depends on: pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery

def run_query(query, project='mlab-sandbox', **kwargs):
    """Run a BigQuery query and return its result rows as a DataFrame.

    Args:
        query: Query text. It is always passed through ``str.format`` before
            being sent, so ``{name}`` placeholders are filled from ``kwargs``
            and literal braces in the SQL must be doubled (``{{`` / ``}}``).
        project: Project name handed to ``bigquery.Client``.
        **kwargs: Values for the ``{name}`` placeholders in ``query``.

    Returns:
        A pandas DataFrame with one column per result field and one row per
        result row. When the result object exposes a schema, columns follow
        the schema order and are present even if the query matched no rows.

    Raises:
        KeyError, IndexError: from ``str.format`` when a placeholder has no
            matching keyword argument.
        Errors raised by the BigQuery client are propagated unchanged.
    """
    query = query.format(**kwargs)
    client = bigquery.Client(project=project)
    job = client.query(query)
    # 300 is the timeout value handed to job.result().
    rows = job.result(timeout=300)

    # Seed the columns from the result schema so an empty result still yields
    # a DataFrame with the expected column names. If no schema is exposed,
    # columns are discovered from the rows, as before.
    schema = getattr(rows, 'schema', None) or ()
    results = collections.OrderedDict((field.name, []) for field in schema)
    for row in rows:
        for key in row.keys():
            results.setdefault(key, []).append(row.get(key))

    # Pass the column list explicitly so the order does not depend on how the
    # installed pandas version orders dict keys.
    return pd.DataFrame(results, columns=list(results))

## The following query shows the first 1000 beacons

In [None]:
# Fetch the first 1000 rows of the master annotations table, ordered by
# series_start, and keep them in First1k.
Q="""
SELECT 
  clientIP, series_start_asc, series_elapsed_days, series_interval_hours, series_count, series_uploads, series_downloads,  series_end_asc,  series_metro_details
FROM `mlab-sandbox.mattmathis.master_annotations`
ORDER BY series_start ASC
LIMIT 1000
"""
First1k = run_query(Q)
# Show only the first 12 rows. A single parenthesised argument prints the
# same on Python 2 and also parses on Python 3.
print(First1k[0:12])

# BEWARE there are OAM addresses in the Master file

In [None]:
# Check for known OAM addresses in master beacons.
Q="""
SELECT
  *
FROM
    `mattmathis.master_annotations`
WHERE
    clientIP IN ( '45.56.98.222','64.9.225.99','64.9.225.190' )
"""
# A single parenthesised argument prints the same on Python 2 and also parses
# on Python 3.
print(run_query(Q))

## Rows with unreasonable OctetsOut
Roughly 800 rows. Are they parsing errors, or errors in the raw data?
Looking through these, many also have unreasonable durations or other unreasonable fields.

In [None]:
# List NDT rows whose HCDataOctetsOut is negative or above 1625000000,
# ordered by that value.
Q="""
#standardSQL
SELECT
    test_id as ID,
    partition_date as pd,
    connection_spec.client_ip as client,
    connection_spec.server_hostname as server,
    web100_log_entry.snap.HCDataOctetsOut AS out,
    web100_log_entry.snap.Duration AS duration
FROM `measurement-lab.stable.ndt_all` 
WHERE
    web100_log_entry.snap.HCDataOctetsOut < 0 OR
    web100_log_entry.snap.HCDataOctetsOut > 1625000000
ORDER BY
  out
"""
# A single parenthesised argument prints the same on Python 2 and also parses
# on Python 3.
print(run_query(Q))

In [12]:
# What was the last test?  How often does it change?
# Reports the largest web100_log_entry.log_time among rows with
# partition_date after 2018-05-22.
Q="""
SELECT
    MAX(web100_log_entry.log_time) AS last_test
FROM
    `measurement-lab.release.ndt_all`
WHERE
    partition_date > '2018-05-22'
"""
# A single parenthesised argument prints the same on Python 2 and also parses
# on Python 3.
print(run_query(Q))

    last_test
0  1527200478


# Extract region to metro map
(Actually a dictionary)

Manually save the site spreadsheet to /tmp/M-Lab_Sites.csv, run the cell below and paste the output elsewhere.

In [None]:
# Build lookup tables from the site spreadsheet saved at /tmp/M-Lab_Sites.csv
# and print them as "name = value" lines for pasting elsewhere.
# `csv` and `collections` are provided by the notebook's import cell.
regions = collections.defaultdict(set)   # 'Region' value -> set of metros
transits = collections.defaultdict(set)  # 'Transit provider' value -> set of sites
cities = {}                              # metro -> 'City' value

with open('/tmp/M-Lab_Sites.csv', 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        site = row['Site'].lower()
        # The first three characters of the site name serve as the metro key.
        metro = site[0:3]

        regions[row['Region']].add(metro)
        transits[row['Transit provider']].add(site)
        # A later row for the same metro overwrites the city stored earlier.
        cities[metro] = row['City']

# Sets have no defined order, so keys and members are sorted to make the
# printed output stable from run to run.
# Each print builds a single string so it behaves the same on Python 2 and 3.
sregion = {}
for region in sorted(regions):
    if region == '':
        # Rows with an empty Region are reported but left out of the map.
        print("# Sites with missing region fields %s" % (sorted(regions[region]),))
        continue
    # Spaces in region names are replaced with underscores for the map keys.
    rname = region.replace(' ', '_')
    sregion[rname] = sorted(regions[region])
# The "contenents" spelling is kept as-is: the output is pasted elsewhere and
# may be referenced under that name.
print("contenents = %s" % (sregion,))

stransit = {}
for transit in sorted(transits):
    if transit == '':
        # Rows with an empty transit provider are reported but left out of the map.
        print("# Sites with missing transit fields %s" % (sorted(transits[transit]),))
        continue
    stransit[transit] = sorted(transits[transit])
print("transits = %s" % (stransit,))

print("cities = %s" % (cities,))