### TLS 1.3 Experiment Beta 51

In [None]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython

from __future__ import division
from moztelemetry.spark import get_one_ping_per_client, get_pings_properties
from moztelemetry import Dataset
from montecarlino import grouped_permutation_test

%pylab inline
IPython.core.pylabtools.figsize(16, 7)
import warnings; warnings.simplefilter('ignore')

In [None]:
# Display the default parallelism of `sc` (presumably the SparkContext
# injected by the notebook environment -- it is not defined in this file).
sc.defaultParallelism

In [None]:
def chi2_distance(xs, ys, eps=1e-10, normalize=True):
    """Return the chi-squared distance between two collapsed histograms.

    Each input is first collapsed with ``sum(axis=0)``.  When ``normalize``
    is true, every collapsed histogram is divided by its own total before
    the comparison.  ``eps`` is added to each denominator so that bins which
    are empty on both sides do not cause a division by zero.
    """
    left = xs.sum(axis=0)
    right = ys.sum(axis=0)

    if normalize:
        left = left/left.sum()
        right = right/right.sum()

    # One term per bin: squared difference over the (smoothed) bin total.
    terms = []
    for p, q in zip(left, right):
        terms.append(((p - q) ** 2) / (p + q + eps))

    return 0.5 * np.sum(terms)

def median_diff(xs, ys):
    """Return the median of ``xs`` minus the median of ``ys``."""
    median_of_xs = np.median(xs)
    median_of_ys = np.median(ys)
    return median_of_xs - median_of_ys

def compare_histogram(histogram, branches, treatment, control):
    """Plot two branches' histograms side by side and print a p-value.

    Parameters:
        histogram: label used for the plot title and x-axis, and in the
            printed summary.
        branches: sequence whose first two items name the treatment and
            control branches (used for the legend).
        treatment, control: per-branch collections of histograms.  They must
            support ``.sum()`` yielding an object with ``.sum()``, ``.max()``
            and ``.plot()`` -- presumably pandas Series of per-ping histogram
            Series; TODO confirm against get_pings_properties output.

    The p-value comes from ``grouped_permutation_test`` using
    ``chi2_distance`` as the statistic with 100 permutations.
    """
    pvalue = grouped_permutation_test(chi2_distance, [treatment, control], num_samples=100)

    # Collapse every branch into a single histogram expressed in percent.
    eTotal = treatment.sum()
    nTotal = control.sum()

    eTotal = 100*eTotal/eTotal.sum()
    nTotal = 100*nTotal/nTotal.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)

    # Two y-axes sharing one x-axis so the branches can be drawn as
    # adjacent bars (position=0 / position=1) with the same y-limits.
    ax = fig.add_subplot(1, 1, 1)
    ax2 = ax.twinx()
    width = 0.4
    ylim = max(eTotal.max(), nTotal.max())

    eTotal.plot(kind="bar", alpha=0.5, color="yellow", label=branches[0], ax=ax, width=width, position=0, ylim=(0, ylim + 1))
    nTotal.plot(kind="bar", alpha=0.5, color="blue", label=branches[1], ax=ax2, width=width, position=1, grid=False, ylim=ax.get_ylim())

    # Both legend entries use the same "<branch> (<n> samples)" format; the
    # first one previously lacked its closing parenthesis.
    legend_labels = [
        "{} ({} samples)".format(branches[0], len(treatment)),
        "{} ({} samples)".format(branches[1], len(control)),
    ]
    ax.legend(ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0],
              legend_labels)

    plt.title(histogram)
    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    # Parenthesised single-argument print behaves identically on Python 2
    # and is also valid Python 3.
    print("The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue))
    
def normalize_uptime_hour(frame):
    """Return the metric columns of ``frame`` divided by uptime, times 60.

    Rows whose uptime is not strictly positive are discarded, the experiment
    branch column is removed, every remaining column is divided row-wise by
    the uptime column and multiplied by 60, and finally the uptime column
    itself is removed.  (Presumably uptime is in minutes, making the result
    a per-hour rate -- verify against the telemetry schema.)
    """
    uptime_column = "payload/simpleMeasurements/uptime"
    branch_column = "environment/addons/activeExperiment/branch"

    positive = frame[frame[uptime_column] > 0]
    positive = positive.drop(branch_column, axis=1)

    uptime = positive[uptime_column]
    per_hour = 60 * positive.apply(lambda column: column/uptime)  # Metric per hour
    return per_hour.drop(uptime_column, axis=1)
    
def compare_count_histograms(pings, branches, *histograms_names):
    """Compare uptime-normalised count metrics between two branches.

    The requested properties (plus uptime and experiment branch) are
    extracted from ``pings`` into a DataFrame, split by ``branches[0]`` and
    ``branches[1]``, normalised with ``normalize_uptime_hour`` and then each
    remaining column is handed to ``compare_scalars``.
    """
    branch_column = "environment/addons/activeExperiment/branch"
    properties = histograms_names + ("payload/simpleMeasurements/uptime", branch_column)

    frame = pd.DataFrame(get_pings_properties(pings, properties).collect())

    first = normalize_uptime_hour(frame[frame[branch_column] == branches[0]])
    second = normalize_uptime_hour(frame[frame[branch_column] == branches[1]])

    # Per-process breakdown columns and the branch column are not metrics.
    skipped_suffixes = ("_parent", "_children")
    for histogram in first.columns:
        if histogram == branch_column or histogram.endswith(skipped_suffixes):
            continue
        label = histogram + " per hour"
        compare_scalars(label, first[histogram].dropna(), second[histogram].dropna())

        
def compare_histograms(pings, branches, *histogram_names):
    """Plot and test each requested histogram for two experiment branches.

    The histograms (with their ``_parent`` / ``_children`` per-process
    variants, requested via ``with_processes=True``) are extracted from
    ``pings`` into a DataFrame and split by ``branches[0]`` and
    ``branches[1]``.  For every base histogram, ``compare_histogram`` is
    called for the aggregate, the parent-process and the child-process data,
    depending on which per-process variants have data in the first branch.
    """
    branch_column = "environment/addons/activeExperiment/branch"
    frame = pd.DataFrame(get_pings_properties(pings, histogram_names + (branch_column,), with_processes=True).collect())
    first = frame[frame[branch_column] == branches[0]]
    second = frame[frame[branch_column] == branches[1]]

    for histogram in second.columns:
        # Only iterate over base histograms; the per-process columns are
        # looked up explicitly below.
        if histogram == branch_column or histogram.endswith("_parent") or histogram.endswith("_children"):
            continue

        parent_column = histogram + "_parent"
        children_column = histogram + "_children"

        has_children = np.sum(first[children_column].notnull()) > 0
        has_parent = np.sum(first[parent_column].notnull()) > 0

        if has_children and has_parent:
            compare_histogram(histogram + " (parent + children)", branches, first[histogram].dropna(), second[histogram].dropna())

        # Compare like with like: the per-process data of the first branch
        # against the same per-process data of the second branch (not the
        # second branch's aggregate histogram).
        if has_parent:
            compare_histogram(histogram + " (parent)", branches, first[parent_column].dropna(), second[parent_column].dropna())

        if has_children:
            compare_histogram(histogram + " (children)", branches, first[children_column].dropna(), second[children_column].dropna())
                    

                
def compare_scalars(metric, *groups):
    """Print the median difference between two groups and its p-value.

    Nothing is printed when either of the first two groups is empty.  The
    p-value comes from ``grouped_permutation_test`` with ``median_diff`` as
    the statistic and 10000 permutations.
    """
    if groups[0].empty or groups[1].empty:
        return

    difference = median_diff(*groups)
    first_median = np.median(groups[0])
    second_median = np.median(groups[1])
    print("Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(
        metric, difference, first_median, second_median))

    pvalue = grouped_permutation_test(median_diff, groups, num_samples=10000)
    print("The probablity of this effect being purely by chance is {:.2f}.".format(pvalue))

def compare_enabled_control_histograms(pings, *histograms):
    """Run ``compare_histograms`` on the 'enabled' and 'control' branches."""
    branches = ['enabled', 'control']
    return compare_histograms(pings, branches, *histograms)

def compare_enabled_disabled_histograms(pings, *histograms):
    """Run ``compare_histograms`` on the 'enabled' and 'disabled' branches."""
    branches = ['enabled', 'disabled']
    return compare_histograms(pings, branches, *histograms)

def compare_enabled_control_count_histograms(pings, *histograms):
    """Run ``compare_count_histograms`` on the 'enabled' and 'control' branches."""
    branches = ['enabled', 'control']
    return compare_count_histograms(pings, branches, *histograms)

def compare_enabled_disabled_count_histograms(pings, *histograms):
    """Run ``compare_count_histograms`` on the 'enabled' and 'disabled' branches."""
    branches = ['enabled', 'disabled']
    return compare_count_histograms(pings, branches, *histograms)


#### Get treatment and control partitions

This experiment didn't have a control, but it was sampled for 10% of the population. We'll take a 10% sample of the population that wasn't in the experiment (from our initial filtered set that recreates the experimental eligibility criteria) and use that as our control group. Note that there may still be some confounding factors due to the way samples are taken for telemetry experiments, but for the purpose of this analysis, taking from the population of people who aren't in the experiment but could qualify is probably fine.

In [None]:
def is_in_tls_experiment(ping):
    """Return True when the ping's active experiment is the TLS 1.3 one.

    Pings that lack the ``environment -> addons -> activeExperiment -> id``
    path (missing key, or a ``None``/non-mapping value along the way) are
    reported as not being in the experiment.
    """
    try:
        experiment = ping["environment"]["addons"]["activeExperiment"]
        return experiment["id"] == "tls13-compat-ff51@experiments.mozilla.org"
    except (KeyError, TypeError):
        # Only lookup failures mean "not in the experiment"; anything else
        # (e.g. KeyboardInterrupt) should propagate.
        return False

def is_not_in_tls_experiment(ping):
    """Return True for pings eligible as control: not in the TLS 1.3 experiment.

    A ping without the ``environment -> addons -> activeExperiment`` path is
    rejected (False).  A ping that has the path but whose experiment carries
    no usable ``id`` counts as not being in the experiment (True).
    """
    try:
        experiment = ping["environment"]["addons"]["activeExperiment"]
    except (KeyError, TypeError):
        # No experiment section at all: exclude the ping.
        return False
    try:
        return experiment["id"] != "tls13-compat-ff51@experiments.mozilla.org"
    except (KeyError, TypeError):
        # Experiment section present but without an id (empty dict or None).
        return True

In [None]:
def tag_control(ping):
    """Relabel ``ping`` as belonging to the experiment's 'control' branch.

    The existing ``activeExperiment`` entry is preserved under
    ``oldExperiment`` and then replaced by a synthetic entry for the TLS 1.3
    experiment with branch 'control'.  The ping is modified in place and
    returned.
    """
    addons = ping["environment"]["addons"]
    addons["oldExperiment"] = addons["activeExperiment"]
    synthetic_experiment = {
        u'branch': u'control',
        u'id': u'tls13-compat-ff51@experiments.mozilla.org'
    }
    addons["activeExperiment"] = synthetic_experiment
    return ping

Reproduce the experiment's eligibility criteria as closely as possible, so that the population we draw the control sample from matches the population the experiment drew from.

In [None]:
# Base population: 'main' pings from Firefox on the beta channel, version 51,
# with a build id and submission date inside the windows below.
# NOTE: version, build id and submission date are all compared as strings,
# i.e. lexicographically.
# `sample=0.02` presumably keeps a 2% sample of the matching records -- confirm
# against the moztelemetry Dataset.records documentation.
all_pings = (Dataset.from_source('telemetry')
                .where(docType='main')
                .where(appName='Firefox')
                .where(appUpdateChannel='beta')
                .where(appVersion=lambda x: x >= "51." and x < "52.")
                .where(appBuildId=lambda x: x >= "20170105155013")
                .where(submissionDate=lambda x: x >= '20170112' and x < '20170131')
                .records(sc, sample=0.02))

In [None]:
# Pings whose active experiment is the TLS 1.3 experiment.
experimental = all_pings.filter(is_in_tls_experiment)

In [None]:
# Simulated control group: take a 0.1 fraction of the pings that are not in
# the experiment (first argument False presumably means "without
# replacement") and relabel them as branch 'control' via tag_control.
control = all_pings.filter(is_not_in_tls_experiment).sample(False, 0.1).map(tag_control)

In [None]:
# Combined data set used by every comparison below; persisted because it is
# reused across many cells.
pings = experimental.union(control).persist()

How many pings do we have in each branch?

In [None]:
# Count pings per experiment branch (None when the experiment entry has no
# 'branch' key).
pings.map(lambda x: (x["environment"]["addons"]["activeExperiment"].get("branch", None), 1))\
     .countByKey()

## Histogram Comparisons

#### Note: any "missing" histograms generally mean no results were found for that particular histogram in our sample

#### SSL Histograms

In [None]:
def filter_histogram(p):
    """Return ``(branch, non-zero buckets)`` for SSL_VERSION_FALLBACK_INAPPROPRIATE.

    ``p`` is a flattened ping-properties mapping; the histogram value must
    support boolean-mask indexing (presumably a pandas Series -- confirm
    against get_pings_properties).
    """
    histogram = p["payload/histograms/SSL_VERSION_FALLBACK_INAPPROPRIATE"]
    non_zero = histogram != 0.0
    return (p["environment/addons/activeExperiment/branch"], histogram[non_zero])

# For every ping that reported SSL_VERSION_FALLBACK_INAPPROPRIATE, collect
# its branch together with the histogram's non-zero buckets.
get_pings_properties(pings,
                     ["payload/histograms/SSL_VERSION_FALLBACK_INAPPROPRIATE",
                      "environment/addons/activeExperiment/branch"]) \
    .filter(lambda p: p["payload/histograms/SSL_VERSION_FALLBACK_INAPPROPRIATE"] is not None) \
    .map(filter_histogram) \
    .collect()

There was a single instance of SSL_VERSION_FALLBACK_INAPPROPRIATE in the entire sample

In [None]:
def add_ssl_handshake_sum(p):
    """Attach the total SSL_HANDSHAKE_VERSION count to the ping.

    The bucket counts under
    ``payload -> histograms -> SSL_HANDSHAKE_VERSION -> values`` are summed
    and stored at the top-level key ``SSL_HANDSHAKE_VERSION_sum``.  Pings
    that do not carry the histogram are returned unchanged.  The ping is
    modified in place and returned.
    """
    try:
        buckets = p["payload"]["histograms"]["SSL_HANDSHAKE_VERSION"]["values"]
        # `.values()` works on both Python 2 and 3 (unlike `.iteritems()`).
        p["SSL_HANDSHAKE_VERSION_sum"] = sum(buckets.values())
    except (KeyError, TypeError, AttributeError):
        # Histogram missing or malformed: best effort, leave the ping as is.
        pass
    return p

    
# Compare the uptime-normalised handshake totals between 'enabled' and 'control'.
pings_with_sum = pings.map(add_ssl_handshake_sum)
compare_enabled_control_count_histograms(pings_with_sum, "SSL_HANDSHAKE_VERSION_sum")

This is very likely a result of the confounding factors in experiment eligibility -- running a similar comparison between the recent GPU experiment on nightly (gpu-process-nightly53@experiments.mozilla.org, which should not have an effect on SSL handshakes) and a "simulated" sample yields "-43.64, (174.55, 218.18)" with a similarly sized sample.

In [None]:
def add_http_pageload_is_ssl_sum(p):
    """Copy the HTTP_PAGELOAD_IS_SSL histogram's ``sum`` to a top-level key.

    The value at ``payload -> histograms -> HTTP_PAGELOAD_IS_SSL -> sum`` is
    stored under ``HTTP_PAGELOAD_IS_SSL_sum``.  Pings that do not carry the
    histogram are returned unchanged.  The ping is modified in place and
    returned.
    """
    try:
        p["HTTP_PAGELOAD_IS_SSL_sum"] = p["payload"]["histograms"]["HTTP_PAGELOAD_IS_SSL"]["sum"]
    except (KeyError, TypeError):
        # Histogram missing or None: best effort, leave the ping as is.
        pass
    return p
    
# Compare the uptime-normalised HTTP_PAGELOAD_IS_SSL sums between 'enabled' and 'control'.
pings_with_sum = pings.map(add_http_pageload_is_ssl_sum)
compare_enabled_control_count_histograms(pings_with_sum, "HTTP_PAGELOAD_IS_SSL_sum")

Similarly, the GPU experiment's results were -1.67, (7.06, 8.73).

In [None]:
def add_http_pageload_is_not_ssl_sum(p):
    """Copy bucket ``"0"`` of HTTP_PAGELOAD_IS_SSL to a top-level key.

    The value at ``payload -> histograms -> HTTP_PAGELOAD_IS_SSL -> values
    -> "0"`` (presumably the count of non-SSL page loads) is stored under
    ``HTTP_PAGELOAD_IS_NOT_SSL_sum``.  Pings that do not carry the histogram
    or the bucket are returned unchanged.  The ping is modified in place and
    returned.
    """
    try:
        p["HTTP_PAGELOAD_IS_NOT_SSL_sum"] = p["payload"]["histograms"]["HTTP_PAGELOAD_IS_SSL"]["values"]["0"]
    except (KeyError, TypeError):
        # Histogram or bucket missing: best effort, leave the ping as is.
        pass
    return p
    
# Compare the uptime-normalised bucket-"0" counts of HTTP_PAGELOAD_IS_SSL
# between 'enabled' and 'control'.
pings_with_sum = pings.map(add_http_pageload_is_not_ssl_sum)
compare_enabled_control_count_histograms(pings_with_sum, "HTTP_PAGELOAD_IS_NOT_SSL_sum")

In [None]:
# Distribution comparison ('enabled' vs 'control') for SSL_TIME_UNTIL_READY.
compare_enabled_control_histograms(pings, "payload/histograms/SSL_TIME_UNTIL_READY")

In [None]:
# Distribution comparison ('enabled' vs 'control') for SSL_TIME_UNTIL_HANDSHAKE_FINISHED.
compare_enabled_control_histograms(pings, "payload/histograms/SSL_TIME_UNTIL_HANDSHAKE_FINISHED")

In [None]:
# Distribution comparison ('enabled' vs 'control') for SSL_BYTES_BEFORE_CERT_CALLBACK.
compare_enabled_control_histograms(pings, "payload/histograms/SSL_BYTES_BEFORE_CERT_CALLBACK")

#### SSL Intolerance

In [None]:
# Distribution comparison ('enabled' vs 'control') for the PRE/POST
# intolerance-reason histograms of TLS 1.3, 1.2, 1.1 and 1.0.
compare_enabled_control_histograms(pings,
                   "payload/histograms/SSL_TLS13_INTOLERANCE_REASON_PRE",
                   "payload/histograms/SSL_TLS13_INTOLERANCE_REASON_POST",
                   "payload/histograms/SSL_TLS12_INTOLERANCE_REASON_PRE",
                   "payload/histograms/SSL_TLS12_INTOLERANCE_REASON_POST",
                   "payload/histograms/SSL_TLS11_INTOLERANCE_REASON_PRE",
                   "payload/histograms/SSL_TLS11_INTOLERANCE_REASON_POST",
                   "payload/histograms/SSL_TLS10_INTOLERANCE_REASON_PRE",
                   "payload/histograms/SSL_TLS10_INTOLERANCE_REASON_POST")

We shouldn't have any SSL_TLS13_INTOLERANCE_REASON_PRE/SSL_TLS13_INTOLERANCE_REASON_POST results in the control -- taking a closer look below

In [None]:
def reasons(p):
    """Classify why a control ping may carry TLS 1.3 experiment data.

    Returns a ``(reason, p)`` pair where ``reason`` is:

    * the third field of the first ``EXPERIMENT_TERMINATION`` log entry whose
      fourth field is the TLS 1.3 experiment id, if there is one;
    * ``"addonNoExperimentTag"`` if ``payload/info/addons`` mentions the
      (URL-encoded) experiment id;
    * ``"other"`` otherwise.
    """
    experiment_id = "tls13-compat-ff51@experiments.mozilla.org"

    # A ping without a log (None) is treated like one with an empty log
    # instead of raising while iterating.
    for message in p["payload/log"] or []:
        try:
            if message[0] == "EXPERIMENT_TERMINATION" and message[3] == experiment_id:
                return (message[2], p)
        except (IndexError, KeyError, TypeError):
            # Malformed or too-short log entry: skip it.
            continue

    addons = p["payload/info/addons"]
    if addons and "tls13-compat-ff51%40experiments.mozilla.org" in addons:
        return ("addonNoExperimentTag", p)
    return ("other", p)

def filter_for_histogram(p, histogram):
    """Return a truthy value when ``p[histogram]`` has a positive total.

    ``None`` histograms, and values that cannot be summed and compared with
    zero, yield ``False``.
    """
    values = p[histogram]
    if values is None:
        return False
    try:
        return values.sum() > 0
    except (AttributeError, TypeError):
        # Not a summable histogram object.
        return False

def filter_for_ssl_tls13_intolerance_reason_pre(p):
    """Apply ``filter_for_histogram`` to SSL_TLS13_INTOLERANCE_REASON_PRE."""
    histogram = "payload/histograms/SSL_TLS13_INTOLERANCE_REASON_PRE"
    return filter_for_histogram(p, histogram)

# Among control pings with a non-empty SSL_TLS13_INTOLERANCE_REASON_PRE
# histogram, count how many fall under each explanation from reasons().
get_pings_properties(pings, ["payload/histograms/SSL_TLS13_INTOLERANCE_REASON_PRE",
                             "environment/addons/activeExperiment/branch",
                             "payload/log",
                             "payload/info/addons",
                             "environment/addons/oldExperiment"]) \
    .filter(lambda p: p["environment/addons/activeExperiment/branch"] == "control") \
    .filter(filter_for_ssl_tls13_intolerance_reason_pre) \
    .map(reasons) \
    .countByKey()

The most common reason we see this histogram unexpectedly is that the client expired out of the experiment in the course of the subsession. Another common reason is that the client has the addon installed but appears to have no active experiment -- we'll need to investigate why this might happen.
We have a single instance where neither of these conditions is true. It's possible this user manually enabled the tls13 preference in about:config (that user's data is below).

In [None]:
# Show the first control ping with a non-empty
# SSL_TLS13_INTOLERANCE_REASON_PRE histogram that reasons() classifies as
# "other".  The histogram filter must be the one-argument wrapper:
# filter_for_histogram itself takes (p, histogram) and cannot be called by
# .filter() with a single element.
get_pings_properties(pings, ["payload/histograms/SSL_TLS13_INTOLERANCE_REASON_PRE",
                             "environment/addons/activeExperiment/branch",
                             "payload/log",
                             "payload/info/addons",
                             "environment/addons/oldExperiment"]) \
    .filter(lambda p: p["environment/addons/activeExperiment/branch"] == "control") \
    .filter(filter_for_ssl_tls13_intolerance_reason_pre) \
    .map(reasons) \
    .filter(lambda p: p[0] == "other") \
    .first()

Now we do the same for SSL_TLS13_INTOLERANCE_REASON_POST

In [None]:
def filter_for_ssl_tls13_intolerance_reason_post(p):
    """Apply ``filter_for_histogram`` to SSL_TLS13_INTOLERANCE_REASON_POST."""
    histogram = "payload/histograms/SSL_TLS13_INTOLERANCE_REASON_POST"
    return filter_for_histogram(p, histogram)

# Among control pings with a non-empty SSL_TLS13_INTOLERANCE_REASON_POST
# histogram, count how many fall under each explanation from reasons().
get_pings_properties(pings, ["payload/histograms/SSL_TLS13_INTOLERANCE_REASON_POST",
                             "environment/addons/activeExperiment/branch",
                             "payload/log",
                             "payload/info/addons",
                             "environment/addons/oldExperiment"]) \
    .filter(lambda p: p["environment/addons/activeExperiment/branch"] == "control") \
    .filter(filter_for_ssl_tls13_intolerance_reason_post) \
    .map(reasons) \
    .countByKey()

Similar story here, and it appears the single client who doesn't fall into the two explanations is the same one from above

In [None]:
# Show the first control ping with a non-empty
# SSL_TLS13_INTOLERANCE_REASON_POST histogram that reasons() classifies as
# "other".
get_pings_properties(pings, ["payload/histograms/SSL_TLS13_INTOLERANCE_REASON_POST",
                             "environment/addons/activeExperiment/branch",
                             "payload/log",
                             "payload/info/addons",
                             "environment/addons/oldExperiment"]) \
    .filter(lambda p: p["environment/addons/activeExperiment/branch"] == "control") \
    .filter(filter_for_ssl_tls13_intolerance_reason_post) \
    .map(reasons) \
    .filter(lambda p: p[0] == "other") \
    .first()

#### Crash counts

In [None]:
# Compare uptime-normalised crash/abort counters between 'enabled' and
# 'control'.
# NOTE(review): the third key used to read
# "SUBPROCESS_ABNORMAL_ABORT/plucontentginhang", which looks like "content"
# typed into the middle of "pluginhang"; it is taken to mean "content",
# matching the plugin/content/gmplugin keys used for the other histograms --
# confirm against the histogram definition.
compare_enabled_control_count_histograms(pings,
                   "payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/pluginhang",
                   "payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin",
                   "payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/content",
                   "payload/keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/gmplugin",
                   "payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/plugin",
                   "payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/content",
                   "payload/keyedHistograms/SUBPROCESS_CRASHES_WITH_DUMP/gmplugin",
                   "payload/keyedHistograms/PROCESS_CRASH_SUBMIT_ATTEMPT/main-crash",
                   "payload/keyedHistograms/PROCESS_CRASH_SUBMIT_ATTEMPT/content-crash",
                   "payload/keyedHistograms/PROCESS_CRASH_SUBMIT_ATTEMPT/plugin-crash"
                  )

The conclusion from this comparison is that crashes are too rare of an event for our sample size. See the query at https://sql.telemetry.mozilla.org/queries/2637 for a comparison using the Crash Aggregates dataset, which includes all crashes.