In [7]:
import datetime as dt
import operator
import pandas as pd
import ujson as json

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from __future__ import division

%pylab inline

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.




Populating the interactive namespace from numpy and matplotlib


### Prepare dataset

In [8]:
yesterday = (dt.datetime.now() - dt.timedelta(days=1)).strftime("%Y%m%d")

In [9]:
pings = get_pings(sc, app="Firefox", channel="release", submission_date=yesterday, fraction=1, schema="v4")

In [10]:
subset = get_pings_properties(pings, ["clientId",
                                      "environment/system/os/name",
                                      "environment/system/os/version",
                                      "environment/system/cpu/count",
                                      "environment/system/memoryMB",
                                      "environment/settings/telemetryEnabled",
                                      "payload/simpleMeasurements/firstPaint",
                                      "payload/simpleMeasurements/AMI_startup_begin",
                                      "payload/simpleMeasurements/shutdownDuration",
                                      "payload/addonDetails"]).\
        filter(lambda p: p["environment/settings/telemetryEnabled"])

In [11]:
def add_startup(ping):
    AMI_startup = ping.pop("payload/simpleMeasurements/AMI_startup_begin")
    firstPaint = ping.pop("payload/simpleMeasurements/firstPaint")
    ping["startup"] = firstPaint - AMI_startup if firstPaint and AMI_startup else None
    return ping
    
subset = subset.map(add_startup)

In [12]:
def filter_outliers(ping):
    startup = ping["startup"]
    shutdown = ping["payload/simpleMeasurements/shutdownDuration"]
    os = ping["environment/system/os/name"]
    version = ping["environment/system/os/version"]
    cpucount = ping["environment/system/cpu/count"]
    memsize = int(round(ping["environment/system/memoryMB"] / 1000.0))
    
    # Let's remove machines with older configurations or with suspect startup times
    if not startup or not version.startswith("6") or os != "Windows_NT" \
       or cpucount < 2 or memsize < 2 or startup > 60000 or startup <= 0 \
       or not shutdown or shutdown <= 0:
        return False
    
    return True

filtered = get_one_ping_per_client(subset.filter(filter_outliers))

In [13]:
def clean(s):
    try:
        s = s.decode('ascii').strip()
        return s if len(s) > 0 else None
    except:
        return None

def extract_addon_names(ping):
    addons = ping["payload/addonDetails"].get("XPI", {})
    addon_names = set()
    
    for addon, desc in addons.iteritems():
        name = clean(desc.get("name", None))
        if name is not None:
            addon_names.add(name)
            
    return addon_names

addon_counts = pd.Series(filtered.flatMap(extract_addon_names).countByValue())
addon_counts = (addon_counts/addon_counts.sum()).to_dict()
del addon_counts["Default"]

In [15]:
top_addons = pd.DataFrame(sorted(addon_counts.items(), key=operator.itemgetter(1), reverse=True)[:250], columns=["addon", "freq"])
top_addons.to_csv("addons.csv", index=False)

In [16]:
vectorized = pd.DataFrame(filtered.map(vectorize).collect(), columns = list(top_addons["addon"]) +
                          ["startup", "shutdown", "cpucount", "memsize"])
vectorized.to_csv("vectorized.csv", index=False)

ValueError: Shape of passed values is (1, 559960), indices imply (254, 559960)

### Fit model

In [11]:
!mkdir -p ./output

In [5]:
import readline
!pip install --upgrade --user rpy2

Requirement already up-to-date: rpy2 in /home/hadoop/anaconda2/lib/python2.7/site-packages
Requirement already up-to-date: six in /home/hadoop/anaconda2/lib/python2.7/site-packages (from rpy2)
Requirement already up-to-date: singledispatch in /home/hadoop/anaconda2/lib/python2.7/site-packages (from rpy2)
[33mYou are using pip version 8.1.1, however version 8.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [6]:
%load_ext rpy2.ipython

In [13]:
%%R -i yesterday

is.installed <- function(mypkg){
  is.element(mypkg, installed.packages()[,1])
}

if (!is.installed("dplyr"))
  install.packages("dplyr", repos="http://cran.rstudio.com/", quiet=TRUE)

if (!is.installed("caret"))
  install.packages("caret", repos="http://cran.rstudio.com/", quiet=TRUE)

library(caret)
library(plyr)
library(dplyr)

select <- dplyr::select
    
addon_plot <- function(df) {
  ggplot(df, aes(factor(addon, levels=rev(unique(addon))), Estimate)) +
    geom_point() +
    geom_errorbar(width=.1, aes(ymin=Estimate-Error, ymax=Estimate+Error)) +
    coord_flip() +
    scale_y_continuous(name="Startup time overhead in ms") + scale_x_discrete(name ="Add-on") +
    theme_bw()
}

extract <- function(model) {
  coefs <- data.frame(coef(summary(model)))
  coefs %>%
    mutate(addon = gsub("`", "", row.names(coefs))) %>%
    select(Estimate, Error=Std..Error, t=t.value, Pr=Pr...t.., addon) %>%
    arrange(-Estimate) %>% filter(Estimate > 0, Pr < 0.01)
}

extract_log <- function(model) {
  coefs <- data.frame(coef(summary(model)))
  coefs %>%
    mutate(addon = gsub("`", "", row.names(coefs))) %>%
    select(Estimate, Error=Std..Error, t=t.value, Pr=Pr...t.., addon) %>%
    arrange(-Estimate) %>% filter(Estimate > 0, addon != "(Intercept)", Pr < 0.01) %>%
    mutate(Estimate = (exp(Estimate) - 1)*100)
}

predict_metric <- function(df, freq, metric, prefix, log.transform=c(FALSE, TRUE)) {
  if (log.transform)
    df[[metric]] <- log(df[[metric]])

  # Partition the dataset into training and test set
  set.seed(42)
  data_partition <- createDataPartition(y = df[[metric]], p = 0.80, list = F)
  training <- df[data_partition,]
  testing <- df[-data_partition,]

  # Create model
  model <- lm(as.formula(paste(metric, "~.")), data=training)

  # Evaluate model
  prediction_train <- predict(model, training)
  cat("R2 on training set: ", R2(prediction_train, training[[metric]]), "\n")
  cat("RMSE on training set: ", RMSE(prediction_train, training[[metric]]), "\n")

  prediction_test <- predict(model, testing)
  cat("R2 on test set: ", R2(prediction_test, testing[[metric]]), "\n")
  cat("RMSE on test set: ", RMSE(prediction_test, testing[[metric]]), "\n")

  # Retrain on whole dataset
  model <- lm(as.formula(paste(metric, "~.")), data=df)

  # Pretty print results
  if (log.transform)
    result <- extract_log(model)
  else
    result <- extract(model)

  # addon_plot(result)
  result <- data.frame(lapply(result, function(x){sapply(x, toString)}))
  result <- left_join(result, freq) %>% select(-Pr)
  result <- result[, c("addon", "freq", "Estimate", "Error", "t")]

  base <- basename(prefix)
  path <- dirname(prefix)
  write.csv(result, file=paste(path, "/", metric, "_", base, ".csv", sep=""), row.names=FALSE, quote=FALSE)
  return(result)
}
      
args <- commandArgs(trailingOnly = TRUE)
addons <- read.csv("vectorized.csv", check.names=F) %>% select(-cpucount, -memsize)
addons_freq <- read.csv("addons.csv", col.names = c("addon", "freq"))
      
# Remove linear combinations
cmbs <- findLinearCombos(addons)$remove
if (!is.null(cmbs))
    addons <- addons[, -cmbs]
    
# Predict!
predict_metric(addons %>% select(-shutdown), addons_freq, "startup", paste("./output/addon_summary_", yesterday, sep=""))
#predict_metric(addons %>% select(-startup), addons_freq, "shutdown", paste("./output/addon_summary_", yesterday, sep=""), TRUE)



  res = super(Function, self).__call__(*new_args, **new_kwargs)


  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)
Attaching package: ‘dplyr’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    filter, lag


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    intersect, setdiff, setequal, union


  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)


R2 on training set:  0.03146126 
RMSE on training set:  6744.175 
R2 on test set:  0.02986745 
RMSE on test set:  6779.447 
                                                    addon         freq
1                                    GBBD Banco do Brasil 0.0011268326
2                                         Yandex Elements 0.0124129975
3                                             (Intercept)           NA
4                            GBBD Caixa Economica Federal 0.0011669704
5                                                  Zotero 0.0005485504
6                                      McAfee SiteAdvisor 0.0008622202
7                                   Avast Online Security 0.0210456055
8                                Mozilla Firefox Hotfixer 0.0005351712
9                                    AVG Security Toolbar 0.0026580167
10                                   Hola Better Internet 0.0010599362
11                                 avast! Online Security 0.0008919519
12           Avira Searc

In [14]:
%%R -i yesterday

predict_metric(addons %>% select(-startup), addons_freq, "shutdown", paste("./output/addon_summary_", yesterday, sep=""), TRUE)

R2 on training set:  0.01992712 
RMSE on training set:  0.9449842 
R2 on test set:  0.01867965 
RMSE on test set:  0.9454737 
                                                   addon         freq
1                                      Firefox Migration 0.0022596115
2                                        Yandex Elements 0.0124129975
3                                        Session Manager 0.0011149399
4                                                 Zotero 0.0005485504
5  Firefox HTTP authentication from sub-resources Hotfix 0.0067847809
6                                            IDS_SS_NAME 0.0010197984
7                                   Hola Better Internet 0.0010599362
8                     Microsoft .NET Framework Assistant 0.0126463916
9                                            Default Tab 0.0006436920
10                              Mozilla Firefox Hotfixer 0.0005351712
11                                 SaveFrom.net - helper 0.0025138178
12                                

In [15]:
!gzip ./output/{startup*,shutdown*}.csv

### Dashboard configuration

In [16]:
startup_config = {
  "sort-options": {
      "values": ["Estimate (ms)", "Add-on", "Frequency"],
      "selected": "Estimate (ms)"
    },
  "filter-options": [
      {"id": "Limit",
       "values": [10, 50, 100, 200, 500],
       "selected": 10
      }
    ],
  "title": ["Add-ons startup correlations", "Correlations between startup time and add-ons"],
  "description": ["A linear regression model is fit using the add-ons as predictors for the startup time. The job is run weekly on all the data collected on Monday for the release channel on Windows.",
                  "http://robertovitillo.com/2014/10/07/using-ml-to-correlate-add-ons-to-performance-bottlenecks/"],
  "primary-key": ["Add-on"],
  "header": ["Add-on", "Frequency", "Estimate (ms)", "Error (ms)", "t-statistic"],
  "field-description": ["The name of the add-on", "The fraction of pings that contained the add-on", "The add-on coefficient expresses the effect of the addon on startup time wrt the average startup time without any add-ons", "The standard error of the coefficient", "The value of the associated t-statistic for the coefficient"],
  "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis-2/Addon%20analysis/data/startup_addon_summary"
}

shutdown_config = {
  "sort-options": {
      "values": ["Estimate", "Add-on", "Frequency"],
      "selected": "Estimate"
    },
  "filter-options": [
      {"id": "Limit",
       "values": [10, 50, 100, 200, 500],
       "selected": 10
      }
    ],
  "title": ["Add-ons shutdown influence", "Correlations between shutdown time and add-ons"],
  "description": ["A linear regression model is fit using the add-ons as predictors for the logarithm of the shutdown time. The job is run weekly on all the data collected on Monday for the release channel on Windows.",
                  "http://robertovitillo.com/2014/10/16/correlating-add-ons-to-slow-shutdown-times/"],
  "primary-key": ["Add-on"],
  "header": ["Add-on", "Frequency", "Estimate", "Error (ms)", "t-statistic"],
  "field-description": ["The name of the add-on", "The fraction of pings that contained the add-on", "The add-on coefficient expresses the change in percentage wrt the average shutdown time with no add-ons", "The standard error of the coefficient", "The value of the associated t-statistic for the coefficient"],
  "url-prefix": "https://s3-us-west-2.amazonaws.com/telemetry-public-analysis-2/Addon%20analysis/data/shutdown_addon_summary"
}

with open('./output/startup.json', 'w') as f:
    json.dump(startup_config, f)
              
with open('./output/shutdown.json', 'w') as f:
    json.dump(shutdown_config, f)