In [1]:
import os
from datetime import datetime, timedelta

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py

from plotly.graph_objs import *
from moztelemetry import get_pings_properties, get_one_ping_per_client
from moztelemetry.dataset import Dataset

%matplotlib inline

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.


In [2]:
# `sc` is not defined anywhere in this notebook; presumably it is the SparkContext
# injected by the hosting environment (TODO confirm). Displayed for reference only.
sc.defaultParallelism

160

In [3]:
# Update channels to analyze; each one is queried separately in the main loop below.
# We build Aurora nightly too.
nightly_build_channels = ["nightly", "aurora"]

In [4]:
# Passed as the `sample` argument of Dataset.records() below; presumably the fraction
# of matching pings to read, with 1.0 meaning no down-sampling (TODO confirm).
sample_size = 1.0

The FX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS probe was introduced and made opt-out in the September 8th, 2016 build (and was uplifted to all channels).

Let's make sure we only examine builds from that point and after.

In [5]:
# Date of the first build carrying the probe, and the widest window we want to examine.
probe_available = datetime(2016, 9, 8)
days_to_look_back = 180

# Start from whichever is later: the probe's first build or the edge of the look-back window.
lookback_limit = datetime.today() - timedelta(days=days_to_look_back)
start_date = max(probe_available, lookback_limit).strftime("%Y%m%d")
start_date

'20160908'

In [6]:
# Upper bound of the build-ID window: today's date as a YYYYMMDD string.
end_date = "{:%Y%m%d}".format(datetime.today())
end_date

'20170117'

In [7]:
# Ping fields handed to get_pings_properties(); every helper below reads pings by these paths.
properties = [
    "clientId",
    "payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS",
    "payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_MS",
    "environment/system/os/name",
    "application/buildId",
    "environment/settings/e10sEnabled",
]

In [17]:
def windows_only(p):
    """Filter predicate: True when the ping reports "Windows_NT" as its OS name."""
    os_name = p["environment/system/os/name"]
    return os_name == "Windows_NT"

def e10s_enabled_only(p):
    """Filter predicate: hand back the ping's e10sEnabled setting unchanged.

    The raw value (not a coerced bool) is returned, so a missing/None setting
    is falsy and the ping is dropped when this is used with `filter`.
    """
    e10s_flag = p["environment/settings/e10sEnabled"]
    return e10s_flag

def long_spinners_keyed_by_build_and_client(ping):
    """Turn a ping into a ((build prefix, client ID), (long hist, short hist)) pair.

    The build prefix is the first eight characters of the build ID
    (presumably its YYYYMMDD date part). Either histogram may be None when
    the ping did not carry it.
    """
    build_prefix = ping["application/buildId"][:8]
    key = (build_prefix, ping["clientId"])
    histograms = (
        ping["payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS"],
        ping["payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_MS"],
    )
    return (key, histograms)

def none_is_subsumed(x, y):
    """Reducer that sums two (long histogram, short histogram) pairs bucket by bucket.

    Each element of `x` and `y` is either a pandas Series or None. A None
    histogram is treated as an empty one, so it contributes nothing to the sum.
    Buckets present on only one side are kept as-is (the other side counts as 0).

    Returns a (long sum, short sum) tuple of Series; both are empty Series when
    the corresponding histogram is None on both sides.
    """
    def _or_empty(hist):
        # An explicit dtype keeps the sums float64 on every pandas version:
        # a bare pd.Series() is deprecated and defaults to object dtype on newer releases.
        if hist is None:
            return pd.Series(dtype="float64")
        return hist

    long_x = _or_empty(x[0])
    long_y = _or_empty(y[0])
    short_x = _or_empty(x[1])
    short_y = _or_empty(y[1])

    return (long_x.add(long_y, fill_value=0.0), short_x.add(short_y, fill_value=0.0))

def bucket_by_long_severity_per_client(spinner_pair):
    """Classify one (build, client) entry by its worst long-spinner bucket.

    `spinner_pair` is ((build ID, client ID), (long histogram, short histogram)).
    Returns (build ID, severity), where `severity` is a Series of zeros over the
    bucket labels with a single 1 on the bucket chosen for this client:

    * "unaffected" when the long histogram is None or empty;
    * otherwise the highest bucket whose lower bound has a positive count at or
      above it.

    A non-empty histogram with no positive counts matches nothing, so every
    entry of `severity` stays 0.
    """
    build_id = spinner_pair[0][0]
    long_hist = spinner_pair[1][0]

    # (lower bound in ms, label), ordered worst-first so the first match wins.
    worst_first = [
        (64000, "64000ms+"),
        (27856, "27856ms - 63999ms"),
        (12124, "12124ms - 27855ms"),
        (5277, "5277ms - 12123ms"),
        (2297, "2297ms - 5276ms"),
        (1000, "1000ms - 2296ms"),
        (0, "0ms - 999ms"),
    ]
    labels = ["unaffected"] + [label for _, label in reversed(worst_first)]
    severity = pd.Series(0, index=labels)

    if long_hist is None or long_hist.empty:
        severity["unaffected"] = 1
        return (build_id, severity)

    for lower_bound, label in worst_first:
        if long_hist[long_hist.index >= lower_bound].sum() > 0:
            severity[label] = 1
            break

    return (build_id, severity)

def bucket_by_short_severity_per_client(spinner_pair):
    """Classify one (build, client) entry by its worst short-spinner bucket.

    `spinner_pair` is ((build ID, client ID), (long histogram, short histogram)).
    Returns (build ID, severity), where `severity` is a Series of zeros over the
    bucket labels with a single 1 on the bucket chosen for this client:

    * "unaffected" when either histogram is None or empty;
    * "not short" when the long histogram has a positive count at 1000ms or above;
    * otherwise the highest short bucket whose lower bound has a positive count
      at or above it.

    A non-empty short histogram with no positive counts matches nothing, so
    every entry of `severity` stays 0.
    """
    build_id = spinner_pair[0][0]
    long_hist = spinner_pair[1][0]
    short_hist = spinner_pair[1][1]

    # (lower bound in ms, label), ordered worst-first so the first match wins.
    worst_first = [
        (800, "800ms+"),
        (400, "400ms - 799ms"),
        (200, "200ms - 399ms"),
        (100, "100ms - 199ms"),
        (50, "50ms - 99ms"),
        (0, "0ms - 49ms"),
    ]
    labels = ["unaffected", "not short"] + [label for _, label in reversed(worst_first)]
    severity = pd.Series(0, index=labels)

    missing = any(h is None or h.empty for h in (short_hist, long_hist))
    if missing:
        severity["unaffected"] = 1
        return (build_id, severity)

    # Clients that saw any spinner of a second or more are tallied separately.
    if long_hist[long_hist.index >= 1000].sum() > 0:
        severity["not short"] = 1
        return (build_id, severity)

    for lower_bound, label in worst_first:
        if short_hist[short_hist.index >= lower_bound].sum() > 0:
            severity[label] = 1
            break

    return (build_id, severity)

def to_percentages(build_severities):
    """Scale a build's per-bucket client counts so they sum to 1.

    `build_severities` is (build ID, Series of counts). The counts are divided
    by their total; when the total is not positive they are returned untouched.
    """
    build_id = build_severities[0]
    counts = build_severities[1]
    client_total = counts.sum()
    scaled = counts / client_total if client_total > 0 else counts
    return (build_id, scaled)

In [18]:
def severity_percentages_by_build(grouped_spinners, bucket_by_severity):
    """Bucket every (build, client) entry, total the buckets per build, and normalise.

    `bucket_by_severity` maps one `grouped_spinners` entry to (build ID, severity Series).
    Returns a list of (build ID, per-bucket fraction Series) sorted by build ID.
    """
    collected = grouped_spinners \
        .map(bucket_by_severity) \
        .reduceByKey(lambda x, y: x + y) \
        .repartition(200) \
        .map(to_percentages) \
        .collect()
    return sorted(collected, key=lambda result: result[0])


build_results = {}

for build_type in nightly_build_channels:
    # Main pings for this channel whose build ID falls inside [start_date, end_date].
    pings = Dataset.from_source("telemetry") \
        .where(docType='main') \
        .where(appBuildId=lambda b: (b.startswith(start_date) or b > start_date) and (b.startswith(end_date) or b < end_date)) \
        .where(appUpdateChannel=build_type) \
        .records(sc, sample=sample_size)

    ping_props = get_pings_properties(pings, properties)
    windows_pings_only = ping_props.filter(windows_only)
    e10s_enabled_on_windows_pings_only = windows_pings_only.filter(e10s_enabled_only)

    # One entry per (build, client) holding the summed long and short histograms.
    grouped_spinners = e10s_enabled_on_windows_pings_only \
        .repartition(200) \
        .map(long_spinners_keyed_by_build_and_client) \
        .reduceByKey(none_is_subsumed)

    final_result_long = severity_percentages_by_build(grouped_spinners, bucket_by_long_severity_per_client)
    final_result_short = severity_percentages_by_build(grouped_spinners, bucket_by_short_severity_per_client)

    # Sanity check on the second build in sorted order: the share of clients landing in any
    # short bucket (positions 2 and up) should match the share in the long "0ms - 999ms"
    # bucket (position 1). Guarded so that a channel with fewer than two builds cannot raise
    # IndexError and lose the results computed above.
    if len(final_result_long) < 2 or len(final_result_short) < 2:
        print("Sanity check skipped for %s: fewer than two builds" % build_type)
    else:
        long_fractions = final_result_long[1][1]
        short_fractions = final_result_short[1][1]
        # .iloc makes the positional access explicit; the Series are indexed by label.
        if round(short_fractions.iloc[2:].sum(), 3) == round(long_fractions.iloc[1], 3):
            print("All is well with the world")
        else:
            print("WARNING: short and long spinner buckets disagree for %s" % build_type)

    final_construction = {
        'long': final_result_long,
        'short': final_result_short,
    }

    build_results[build_type] = final_construction

All is well with the world
All is well with the world


In [19]:
# Write one JSON file per channel. `json` is the ujson alias imported in the first cell.
output_dir = "./output"

# Create the target directory up front so a fresh checkout does not fail on open().
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for result_key, results in build_results.items():
    filename = os.path.join(output_dir, "severities_by_build_id_%s.json" % result_key)
    results_json = json.dumps(results, ensure_ascii=False)

    with open(filename, 'w') as f:
        f.write(results_json)

For now, we're focusing in on our Windows users, so let's filter out anything that's not Windows.

To make it easier to group by both build ID and client ID, we'll map the histograms to a tuple of (build ID, client ID), and then reduce the histograms on that key to accumulate all pings that came in for the same client and the same build.

Note that if the client pinged but didn't have the histogram, the value in the key-value pair will be `None`.

The histogram for the spinner looks like this:

```
 0        1.0
 1000     0.0
 2297     0.0
 5277     0.0
 12124    0.0
 27856    0.0
 64000    0.0
 dtype: float64,
```

And we've combined all of the client histograms by build ID. What that means is that `grouped_spinners` is keyed on (build ID, client ID) and the value is the sum of all of the spinner histograms (or `None` if the client never saw a spinner for that build). Next, we'll look at each spinner histogram sum, and bucket based on where we see values > 0.

We'll just use a pandas `Series` for the bucket structure.

In [None]:

#final_result_long