### Update orphaning

In [None]:
import datetime as dt
import re
import urllib2
import ujson as json
from os import environ
from collections import defaultdict

%pylab inline

In [None]:
# Get the time when this job started.

startTime = dt.datetime.now()
print "Start: " + str(startTime.strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
# Display the default parallelism reported by `sc` (presumably the
# SparkContext injected by the notebook environment -- TODO confirm).
sc.defaultParallelism

In [None]:
# Manual-run overrides. Both remain None for a scheduled run, in which case
# the date is read from the environment in a later cell.
runManually = todayEnvStr = None

# Uncomment the following two lines and adjust |todayEnvStr| as necessary to run manually
# runManually = True
# todayEnvStr = "20170212"

# Update channel the SQL queries are restricted to.
channelToProcess = "release"
# Lowest major version accepted by the SQL version filters.
minVersion = 42
# Subtracted from the latest major version to get the oldest up to date one.
upToDateReleases = 2
# Look-back window, in weeks, applied to subsession start dates.
weeksOfSubsessionData = 12
# Threshold on the accumulated update ping count.
minUpdatePingCount = 4
# Threshold on the accumulated subsession time, in hours and in seconds.
minSubsessionHours = 2
minSubsessionSeconds = 60 * 60 * minSubsessionHours

In [None]:
if runManually is None:
    todayEnvStr = environ.get("date", None)

assert (todayEnvStr is not None), "The date environment parameter is missing."
today = dt.datetime.strptime(todayEnvStr, "%Y%m%d").date()

# MON = 0, SAT = 5, SUN = 6 -> SUN = 0, MON = 1, SAT = 6
dayIndex = (today.weekday() + 1) % 7
# Filename used to save the report's JSON
reportFilename = (today - datetime.timedelta(dayIndex)).strftime("%Y%m%d")
# Maximum report date which is the previous Saturday
maxReportDate = today - datetime.timedelta(7 + dayIndex - 6)
# Suffix of the longitudinal datasource name to use
longitudinalSuffix = maxReportDate.strftime("%Y%m%d")
# String used in the SQL queries to limit records to the maximum report date.
# Since the queries use less than this is the day after the previous Saturday.
maxReportDateSQL = (maxReportDate + dt.timedelta(days=1)).strftime("%Y-%m-%d")
# The Sunday prior to the last Saturday
minReportDate = maxReportDate - dt.timedelta(days=6)
# String used in the SQL queries to limit records to the minimum report date
# Since the queries use greater than this is six days prior to the previous Saturday.
minReportDateSQL = minReportDate.strftime("%Y-%m-%d")
# Date used to limit records to the number of weeks specified by
# weeksOfSubsessionData prior to the maximum report date
minSubsessionDate = maxReportDate - dt.timedelta(weeks=weeksOfSubsessionData)
# Date used to compute the latest version from firefox_history_major_releases.json
latestVerDateStr = (maxReportDate - dt.timedelta(days=7)).strftime("%Y-%m-%d")

print "maxReportDate     : " + maxReportDate.strftime("%Y%m%d")
print "maxReportDateSQL  : " + maxReportDateSQL
print "minReportDate     : " + minReportDate.strftime("%Y%m%d")
print "minReportDateSQL  : " + minReportDateSQL
print "minSubsessionDate : " + minSubsessionDate.strftime("%Y%m%d")
print "reportFilename    : " + reportFilename
print "latestVerDateStr  : " + latestVerDateStr

In [None]:
def latestVersionOnDate(date, majorReleases):
    """Return the highest major version released on or before ``date``.

    ``majorReleases`` maps version strings to release dates. Dates are
    compared as strings, so ``date`` and the mapping values are expected to
    share one sortable format (the initial value uses "YYYY-MM-DD").
    Returns 0 when no release qualifies.
    """
    newestDate = u"1900-01-01"
    newestMajor = 0
    for versionStr, released in majorReleases.iteritems():
        major = int(versionStr.split(".")[0])
        # Skip releases after the cut-off or older than the best one so far.
        if released > date or released < newestDate:
            continue
        if major < newestMajor:
            continue
        newestDate, newestMajor = released, major

    return newestMajor

majorReleasesJSON = urllib2.urlopen("https://product-details.mozilla.org/1.0/firefox_history_major_releases.json").read()
majorReleases = json.loads(majorReleasesJSON)
latestVersion = latestVersionOnDate(latestVerDateStr, majorReleases)
earliestUpToDateVersion = str(latestVersion - upToDateReleases)

print "Latest Version: " + str(latestVersion)

In [None]:
# Parameters of this run; the dictionary is written to the report JSON
# under the "reportDetails" key.
reportDetailsDict = defaultdict(int, {
    "latestVersion": latestVersion,
    "upToDateReleases": upToDateReleases,
    "minReportDate": minReportDate.strftime("%Y-%m-%d"),
    "maxReportDate": maxReportDate.strftime("%Y-%m-%d"),
    "weeksOfSubsessionData": weeksOfSubsessionData,
    "minSubsessionDate": minSubsessionDate.strftime("%Y-%m-%d"),
    "minSubsessionHours": minSubsessionHours,
    "minSubsessionSeconds": minSubsessionSeconds,
    "minUpdatePingCount": minUpdatePingCount,
})
reportDetailsDict

In [None]:
# Reading the parquet files directly is as fast as using
# 'FROM longitudinal_vYYYYMMDD' and it allows the query to go further back
# in time.

#longitudinalFromSQL = ("FROM longitudinal_v{} ").format(longitudinalSuffix)
longitudinalFromSQL = "FROM parquet.`s3://telemetry-parquet/longitudinal/v%s` " % longitudinalSuffix
longitudinalFromSQL

In [None]:
# SQL predicate accepting release-style build versions (two or three digit
# major followed by ".0...") plus the literal 50.1.0. Raw strings keep the
# backslashes of the RLIKE pattern as written.
buildVersionWhereSQL = (
    r"(build.version[0] RLIKE '^[0-9]{2,3}\.0[\.0-9]*$' "
    r"OR build.version[0] = '50.1.0')"
)
buildVersionWhereSQL

In [None]:
# Same version predicate as used for build.version[0], expressed against an
# appVersion column. Raw strings keep the RLIKE backslashes as written.
appVersionWhereSQL = (
    r"(appVersion RLIKE '^[0-9]{2,3}\.0[\.0-9]*$' "
    r"OR appVersion = '50.1.0')"
)
appVersionWhereSQL

In [None]:
# WHERE clauses shared by the queries: Firefox only, first subsession start
# date within [minReportDateSQL, maxReportDateSQL), and the selected channel.
commonWhereSQL = " AND ".join([
    "build.application_name[0] = 'Firefox'",
    "DATEDIFF(SUBSTR(subsession_start_date[0], 0, 10), '{}') >= 0".format(minReportDateSQL),
    "DATEDIFF(SUBSTR(subsession_start_date[0], 0, 10), '{}') < 0".format(maxReportDateSQL),
    "settings.update.channel[0] = '{}'".format(channelToProcess),
])
commonWhereSQL

In [None]:
# Summary query: counts records per version bucket (up to date, out of
# date, too low, too high, missing). Named placeholders are used so each
# bound is spelled out once; version bounds are compared as strings in SQL.
summarySQL = (
    "SELECT "
    "COUNT(CASE WHEN build.version[0] >= '{upToDate}.' AND build.version[0] < '{tooHigh}.' THEN 1 END) AS versionUpToDate, "
    "COUNT(CASE WHEN build.version[0] < '{upToDate}.' AND build.version[0] >= '{tooLow}.' THEN 1 END) AS versionOutOfDate, "
    "COUNT(CASE WHEN build.version[0] < '{tooLow}.' THEN 1 END) AS versionTooLow, "
    "COUNT(CASE WHEN build.version[0] > '{tooHigh}.' THEN 1 END) AS versionTooHigh, "
    "COUNT(CASE WHEN NOT build.version[0] > '0' THEN 1 END) AS versionMissing "
    "{fromSQL} "
    "WHERE "
    "{common} AND "
    "{buildFilter}"
).format(upToDate=latestVersion - upToDateReleases,
         tooHigh=latestVersion + 1,
         tooLow=minVersion,
         fromSQL=longitudinalFromSQL,
         common=commonWhereSQL,
         buildFilter=buildVersionWhereSQL)
summarySQL

In [None]:
# Submit the summary query through sqlContext (presumably the Spark
# SQLContext provided by the notebook environment).
summaryDF = sqlContext.sql(summarySQL)

In [None]:
# Copy the single summary row into a dictionary for the report JSON.
summaryHead = summaryDF.head()
summaryDict = defaultdict(int, (
    (columnName, summaryHead[columnName])
    for columnName in ("versionUpToDate",
                       "versionOutOfDate",
                       "versionTooLow",
                       "versionTooHigh",
                       "versionMissing")
))
summaryDict

In [None]:
# Only the columns and records that are actually used are queried, for
# speed. Adding update_state_code_partial_stage and
# update_state_code_complete_stage increased the time it takes this
# notebook to run by 50 seconds when using 4 clusters.

# Creating a temporary table of the filtered data and joining it with the
# original datasource to include other columns doesn't appear to speed up
# the process, but it doesn't appear to slow it down either, so all
# columns of interest are in this query.

outOfDateDetailsSQL = (
    "SELECT " +
    ", ".join([
        "client_id",
        "build.version",
        "session_length",
        "subsession_start_date",
        "subsession_length",
        "update_check_code_notify",
        "update_check_extended_error_notify",
        "update_check_no_update_notify",
        "update_not_pref_update_enabled_notify",
        "update_not_pref_update_auto_notify",
        "update_ping_count_notify",
        "update_unable_to_apply_notify",
        "update_download_code_partial",
        "update_download_code_complete",
        "update_state_code_partial_stage",
        "update_state_code_complete_stage",
        "update_state_code_unknown_stage",
        "update_state_code_partial_startup",
        "update_state_code_complete_startup",
        "update_state_code_unknown_startup",
        "update_status_error_code_complete_startup",
        "update_status_error_code_partial_startup",
        "update_status_error_code_unknown_startup",
        "update_status_error_code_complete_stage",
        "update_status_error_code_partial_stage",
        "update_status_error_code_unknown_stage",
    ]) +
    # The FROM fragment already ends with a space, hence "{}WHERE".
    " {}WHERE {} AND {} AND build.version[0] < '{}.' AND build.version[0] >= '{}.'"
).format(longitudinalFromSQL,
         commonWhereSQL,
         buildVersionWhereSQL,
         str(latestVersion - upToDateReleases),
         str(minVersion))
outOfDateDetailsSQL

In [None]:
# Submit the out of date details query through sqlContext.
outOfDateDetailsDF = sqlContext.sql(outOfDateDetailsSQL)

In [None]:
# Create the RDD used to further restrict which clients are out of date
# to focus on clients that are of concern and potentially of concern.
# It is cached because the mapping cells below start from it.

outOfDateDetailsRDD = outOfDateDetailsDF.rdd.cache()

The next several cells are to find the clients that are "out of date, potentially<br/>
of concern" so they can be excluded from the "out of date, of concern" clients.

In [None]:
# Create an RDD of out of date telemetry pings that have and don't have
# a previous telemetry ping with a version that is up to date along
# with a dictionary of the count of True and False.

def hasOutOfDateMaxVersion(d):
    """Flag pings whose version history never reached an up to date release.

    Returns (False, ping) as soon as any entry of ping.version is a valid
    release version (it equals "50.1.0" or matches the module-level regex
    ``p``) whose major version is at least ``earliestUpToDateVersion``.
    Returns (True, ping) when no entry qualifies.
    """
    ping = d
    # Compare major versions numerically. Comparing the raw strings orders
    # them lexicographically, which ranks "100.0" below e.g. "98".
    earliestMajor = int(earliestUpToDateVersion)
    for version in ping.version:
        if version == "50.1.0" or p.match(version):
            if int(version.split(".")[0]) >= earliestMajor:
                return (False, ping)

    return (True, ping)

# Pattern for valid release versions. 50.1.0 does not match it and is
# handled separately by hasOutOfDateMaxVersion.
p = re.compile(r'^[0-9]{2,3}\.0[\.0-9]*$')

# Tag every out of date ping and tally the True / False flags.
hasOutOfDateMaxVersionRDD = (outOfDateDetailsRDD
                             .map(hasOutOfDateMaxVersion)
                             .cache())
hasOutOfDateMaxVersionDict = hasOutOfDateMaxVersionRDD.countByKey()
hasOutOfDateMaxVersionDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
hasOutOfDateMaxVersionTrueRDD = (hasOutOfDateMaxVersionRDD
                                 .filter(lambda keyed: keyed[0] == True)
                                 .map(lambda keyed: keyed[1])
                                 .cache())

In [None]:
# Create an RDD of out of date telemetry pings that have and have not
# sent an update telemetry ping for any version of Firefox along with a
# dictionary of the count of True and False.

def hasUpdatePingMapper(d):
    """Pair the ping with True when it carries update ping data.

    True requires update_ping_count_notify to be present together with at
    least one of update_check_code_notify / update_check_no_update_notify.
    """
    ping = d
    hasCount = ping.update_ping_count_notify is not None
    hasCheck = not (ping.update_check_code_notify is None and
                    ping.update_check_no_update_notify is None)
    return (hasCount and hasCheck, ping)

# Tag the remaining pings with hasUpdatePingMapper and tally the flags.
hasUpdatePingRDD = (hasOutOfDateMaxVersionTrueRDD
                    .map(hasUpdatePingMapper)
                    .cache())
hasUpdatePingDict = hasUpdatePingRDD.countByKey()
hasUpdatePingDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
hasUpdatePingTrueRDD = (hasUpdatePingRDD
                        .filter(lambda keyed: keyed[0] == True)
                        .map(lambda keyed: keyed[1])
                        .cache())

In [None]:
# Create an RDD of out of date telemetry pings that have and have not
# run this version of Firefox for at least the number of seconds
# specified in minSubsessionSeconds along with a dictionary of the
# count of True and False.

def hasMinSubsessionLengthMapper(d):
    """Pair the ping with True when enough subsession time was accumulated.

    Sums subsession_length over the leading entries that share the first
    entry's version until minSubsessionSeconds is reached. Returns
    (False, ping) as soon as an entry started before minSubsessionDate, or
    when the entries run out before the threshold is met.
    """
    ping = d
    seconds = 0
    index = 0
    currentVersion = ping.version[0]
    while (seconds < minSubsessionSeconds and
           index < len(ping.subsession_start_date) and
           index < len(ping.version) and
           ping.version[index] == currentVersion):
        try:
            date = dt.datetime.strptime(ping.subsession_start_date[index][:10],
                                        "%Y-%m-%d").date()
            if date < minSubsessionDate:
                return (False, ping)

            seconds += ping.subsession_length[index]
        except Exception:
            # Best effort: an entry with a missing or malformed date or
            # length is skipped. Exception (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        index += 1

    return (seconds >= minSubsessionSeconds, ping)

# Tag the remaining pings with hasMinSubsessionLengthMapper and tally the flags.
hasMinSubsessionLengthRDD = (hasUpdatePingTrueRDD
                             .map(hasMinSubsessionLengthMapper)
                             .cache())
hasMinSubsessionLengthDict = hasMinSubsessionLengthRDD.countByKey()
hasMinSubsessionLengthDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
hasMinSubsessionLengthTrueRDD = (hasMinSubsessionLengthRDD
                                 .filter(lambda keyed: keyed[0] == True)
                                 .map(lambda keyed: keyed[1])
                                 .cache())

In [None]:
# Create an RDD of out of date telemetry pings that have and have not
# sent the minimum number of update pings as specified by
# minUpdatePingCount for this version of Firefox along with a
# dictionary of the count of True and False.

def hasMinUpdatePingCountMapper(d):
    """Pair the ping with True when enough update pings were accumulated.

    Walks the leading entries that share the first entry's version and adds
    an entry's update_ping_count_notify to the total when that entry also
    has a positive update check code or a positive "no update" count.
    Each entry is counted at most once. Returns (False, ping) as soon as a
    counted entry started before minSubsessionDate, or when the total stays
    below minUpdatePingCount.
    """
    ping = d
    index = 0
    updatePingCountTotal = 0
    currentVersion = ping.version[0]
    while (updatePingCountTotal < minUpdatePingCount and
           index < len(ping.update_ping_count_notify) and
           index < len(ping.version) and
           ping.version[index] == currentVersion):

        pingCount = ping.update_ping_count_notify[index]
        # Is this an update ping or just a placeholder for the telemetry ping?
        if pingCount > 0:
            try:
                date = dt.datetime.strptime(ping.subsession_start_date[index][:10],
                                            "%Y-%m-%d").date()
            except Exception:
                # Missing or malformed start date: skip this entry.
                index += 1
                continue

            if date < minSubsessionDate:
                return (False, ping)

            # Is there also a valid update check code or no update telemetry ping?
            # The previous inner `for` loop advanced `index` and added to the
            # total once per positive code, because its `continue` applied to
            # the `for` and not to the enclosing `while`.
            counted = (ping.update_check_code_notify is not None and
                       len(ping.update_check_code_notify) > index and
                       any(codeValue > 0
                           for codeValue in ping.update_check_code_notify[index]))
            if not counted:
                counted = (ping.update_check_no_update_notify is not None and
                           len(ping.update_check_no_update_notify) > index and
                           ping.update_check_no_update_notify[index] > 0)
            if counted:
                updatePingCountTotal += pingCount

        index += 1

    return (updatePingCountTotal >= minUpdatePingCount, ping)

# Tag the remaining pings with hasMinUpdatePingCountMapper and tally the flags.
hasMinUpdatePingCountRDD = (hasMinSubsessionLengthTrueRDD
                            .map(hasMinUpdatePingCountMapper)
                            .cache())
hasMinUpdatePingCountDict = hasMinUpdatePingCountRDD.countByKey()
hasMinUpdatePingCountDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
hasMinUpdatePingCountTrueRDD = (hasMinUpdatePingCountRDD
                                .filter(lambda keyed: keyed[0] == True)
                                .map(lambda keyed: keyed[1])
                                .cache())

In [None]:
# Create an RDD of out of date telemetry pings that are supported and
# are not supported based on whether they have not received or have
# received the unsupported update xml for the last update check along
# with a dictionary of the count of True and False.

def isSupportedMapper(d):
    """Pair the ping with False when an update check hit code bucket 28.

    Walks the leading entries that share the first entry's version and, for
    every entry with update pings, inspects bucket 28 of
    update_check_code_notify. Returns (True, ping) when no such hit exists.

    NOTE(review): bucket 28 is presumably the "unsupported" check code,
    going by the function name -- confirm against the histogram definition.
    NOTE(review): the running total below is never incremented, so the
    minUpdatePingCount bound only ends the walk when that threshold is <= 0.
    """
    ping = d
    countedPings = 0
    newestVersion = ping.version[0]
    position = 0
    while True:
        if not countedPings < minUpdatePingCount:
            break
        if position >= len(ping.update_ping_count_notify):
            break
        if position >= len(ping.version):
            break
        if ping.version[position] != newestVersion:
            break

        checkCodes = ping.update_check_code_notify
        # Entries without update pings are placeholders and are skipped.
        if (ping.update_ping_count_notify[position] > 0 and
                checkCodes is not None and
                len(checkCodes) > position and
                checkCodes[position][28] > 0):
            return (False, ping)

        position += 1

    return (True, ping)

# Tag the remaining pings with isSupportedMapper and tally the flags.
isSupportedRDD = (hasMinUpdatePingCountTrueRDD
                  .map(isSupportedMapper)
                  .cache())
isSupportedDict = isSupportedRDD.countByKey()
isSupportedDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
isSupportedTrueRDD = (isSupportedRDD
                      .filter(lambda keyed: keyed[0] == True)
                      .map(lambda keyed: keyed[1])
                      .cache())

In [None]:
# Create an RDD of out of date telemetry pings that have and don't have
# the ability to apply an update along with a dictionary of the count 
# of True and False.

def isAbleToApplyMapper(d):
    """Pair the ping with False when it reported being unable to apply updates.

    Only the first entry that has update pings, among the leading entries
    sharing the first entry's version, is consulted: a positive
    update_unable_to_apply_notify value there yields (False, ping), anything
    else (including a missing histogram) yields (True, ping).

    Raises ValueError when no such entry exists.
    """
    ping = d
    newestVersion = ping.version[0]
    for position, version in enumerate(ping.version):
        if position >= len(ping.update_ping_count_notify) or version != newestVersion:
            break
        if ping.update_ping_count_notify[position] > 0:
            unable = (ping.update_unable_to_apply_notify is not None and
                      ping.update_unable_to_apply_notify[position] > 0)
            return (not unable, ping)

    raise ValueError("Missing update unable to apply value!")

# Tag the remaining pings with isAbleToApplyMapper and tally the flags.
isAbleToApplyRDD = (isSupportedTrueRDD
                    .map(isAbleToApplyMapper)
                    .cache())
isAbleToApplyDict = isAbleToApplyRDD.countByKey()
isAbleToApplyDict

In [None]:
# Keep only the pings flagged True and drop the flag again.
isAbleToApplyTrueRDD = (isAbleToApplyRDD
                        .filter(lambda keyed: keyed[0] == True)
                        .map(lambda keyed: keyed[1])
                        .cache())

In [None]:
# Create an RDD of out of date telemetry pings that have and don't have
# the application.update.enabled preference set to True / False along
# with a dictionary of the count of True and False.

def hasUpdateEnabledMapper(d):
    """Pair the ping with False when the update-enabled preference is off.

    Only the first entry that has update pings, among the leading entries
    sharing the first entry's version, is consulted. A positive
    update_not_pref_update_enabled_notify value there means the preference
    is false; a value of 0 or a missing histogram means it is true.

    Raises ValueError when no such entry exists.
    """
    ping = d
    newestVersion = ping.version[0]
    for position, version in enumerate(ping.version):
        if position >= len(ping.update_ping_count_notify) or version != newestVersion:
            break
        if ping.update_ping_count_notify[position] > 0:
            disabled = (ping.update_not_pref_update_enabled_notify is not None and
                        ping.update_not_pref_update_enabled_notify[position] > 0)
            return (not disabled, ping)

    raise ValueError("Missing update enabled value!")

# Tag the remaining pings with hasUpdateEnabledMapper and tally the flags.
hasUpdateEnabledRDD = (isAbleToApplyTrueRDD
                       .map(hasUpdateEnabledMapper)
                       .cache())
hasUpdateEnabledDict = hasUpdateEnabledRDD.countByKey()
hasUpdateEnabledDict

The next several cells categorize the clients that are "out of date, of concern".

In [None]:
# Create a reference to the dictionary which will be written to the
# JSON that populates the web page data. This way the reference in the
# web page never changes. A reference is all that is needed since the
# dictionary is not modified.
# Both names refer to the same dictionary object; no copy is made.

ofConcernDict = hasUpdateEnabledDict

In [None]:
# The generic name ofConcernTrueRDD lets new filtering steps be added
# upstream without having to touch the consumers of this RDD.

ofConcernTrueRDD = (hasUpdateEnabledRDD
                    .filter(lambda keyed: keyed[0] == True)
                    .map(lambda keyed: keyed[1])
                    .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry ping client
# versions along with a dictionary of the count of each version.

def byVersionMapper(d):
    """Key the ping by the first entry of its version list."""
    return (d.version[0], d)

# Key the of concern pings by version and count the pings per version.
ofConcernByVersionRDD = ofConcernTrueRDD.map(byVersionMapper)
ofConcernByVersionDict = ofConcernByVersionRDD.countByKey()
ofConcernByVersionDict

In [None]:
# Create an RDD of out of date, of concern telemetry ping update check
# codes along with a dictionary of the count of each update check code.

def checkCodeNotifyMapper(d):
    """Key the ping by its first positive update check code bucket.

    Walks the leading entries that share the first entry's version. For an
    entry with update pings the key is the index of the first positive
    bucket in update_check_code_notify, or 0 when instead
    update_check_no_update_notify is positive. The key is -1 when no entry
    yields either.
    """
    ping = d
    newestVersion = ping.version[0]
    for position, version in enumerate(ping.version):
        if position >= len(ping.update_ping_count_notify) or version != newestVersion:
            break
        # Entries without update pings are placeholders and are skipped.
        if not ping.update_ping_count_notify[position] > 0:
            continue

        if ping.update_check_code_notify is not None:
            for bucket, hits in enumerate(ping.update_check_code_notify[position]):
                if hits > 0:
                    return (bucket, ping)

        if (ping.update_check_no_update_notify is not None and
                ping.update_check_no_update_notify[position] > 0):
            return (0, ping)

    return (-1, ping)

# Key the of concern pings by update check code and count the pings per code.
checkCodeNotifyOfConcernRDD = ofConcernTrueRDD.map(checkCodeNotifyMapper)
checkCodeNotifyOfConcernDict = checkCodeNotifyOfConcernRDD.countByKey()
checkCodeNotifyOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry pings that had a
# general failure for the update check. The general failure codes are:
# CHK_GENERAL_ERROR_PROMPT: 22
# CHK_GENERAL_ERROR_SILENT: 23

# Keep the pings keyed with one of the two general error codes (22, 23)
# and drop the key again.
checkCodeNotifyGeneralErrorOfConcernRDD = (checkCodeNotifyOfConcernRDD
                                           .filter(lambda keyed: keyed[0] in (22, 23))
                                           .map(lambda keyed: keyed[1])
                                           .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry ping update check
# extended error values for the clients that had a general failure for
# the update check along with a dictionary of the count of the error
# values.

def checkExErrorNotifyMapper(d):
    """Key the ping by the first positive extended update check error.

    For each entry with update pings, the keys of
    update_check_extended_error_notify are scanned for a positive value at
    that entry. On a hit for an entry whose version equals ping.version[0]
    the key is the integer parsed from the key name after its first 17
    characters (a 4-character remainder loses its first character too).
    A hit for an entry with another version yields -1; no hit yields -2.

    NOTE(review): the 17-character prefix presumably is the histogram name
    prefix of the keyed histogram -- confirm against the data.
    """
    ping = d
    newestVersion = ping.version[0]
    for position, version in enumerate(ping.version):
        if not ping.update_ping_count_notify[position] > 0:
            continue
        extendedErrors = ping.update_check_extended_error_notify
        if extendedErrors is None:
            continue
        for keyName in extendedErrors:
            if not extendedErrors[keyName][position] > 0:
                continue
            if version != newestVersion:
                return (-1, ping)
            suffix = keyName[17:]
            if len(suffix) == 4:
                suffix = suffix[1:]
            return (int(suffix), ping)

    return (-2, ping)
# Key the general error pings by extended error value and count per value.
checkExErrorNotifyOfConcernRDD = checkCodeNotifyGeneralErrorOfConcernRDD.map(checkExErrorNotifyMapper)
checkExErrorNotifyOfConcernDict = checkExErrorNotifyOfConcernRDD.countByKey()
checkExErrorNotifyOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry ping update
# download codes along with a dictionary of the count of the codes.

def downloadCodeMapper(d):
    """Key the ping by its first positive update download code bucket.

    Entries of ping.version are visited in order; for each entry the partial
    and then the complete download histogram (when not None) is scanned.
    The key is the bucket index when the hit belongs to an entry whose
    version equals ping.version[0], -1 when it belongs to an entry with a
    different version, and -2 when no bucket is positive.
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_download_code_partial",
                      "update_download_code_complete")
    for position, version in enumerate(ping.version):
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is None:
                continue
            for bucket, hits in enumerate(histogram[position]):
                if hits > 0:
                    return (bucket if version == newestVersion else -1, ping)

    return (-2, ping)

# Key the of concern pings by download code and count the pings per code.
downloadCodeOfConcernRDD = ofConcernTrueRDD.map(downloadCodeMapper)
downloadCodeOfConcernDict = downloadCodeOfConcernRDD.countByKey()
downloadCodeOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry ping staged update
# state codes along with a dictionary of the count of the codes.

def stateCodeStageMapper(d):
    """Key the ping by its first positive staged update state code bucket.

    Entries of ping.version are visited in order; for each entry the
    partial, complete and unknown stage histograms (when not None) are
    scanned in that order. The key is the bucket index when the hit belongs
    to an entry whose version equals ping.version[0], -1 when it belongs to
    an entry with a different version, and -2 when no bucket is positive.
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_state_code_partial_stage",
                      "update_state_code_complete_stage",
                      "update_state_code_unknown_stage")
    for position, version in enumerate(ping.version):
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is None:
                continue
            for bucket, hits in enumerate(histogram[position]):
                if hits > 0:
                    return (bucket if version == newestVersion else -1, ping)

    return (-2, ping)

# Key the of concern pings by staged state code and count the pings per code.
stateCodeStageOfConcernRDD = ofConcernTrueRDD.map(stateCodeStageMapper)
stateCodeStageOfConcernDict = stateCodeStageOfConcernRDD.countByKey()
stateCodeStageOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry pings that failed
# to stage an update.
# STATE_FAILED: 12

# Keep the pings keyed with state code 12 and drop the key again.
stateCodeStageFailedOfConcernRDD = (stateCodeStageOfConcernRDD
                                    .filter(lambda keyed: keyed[0] == 12)
                                    .map(lambda keyed: keyed[1])
                                    .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry ping staged update
# state failure codes along with a dictionary of the count of the codes.

def stateFailureCodeStageMapper(d):
    """Key the ping by its first positive staged update failure code bucket.

    Entries of ping.version are visited in order; for each entry the
    partial, complete and unknown stage error code histograms (when not
    None) are scanned in that order. The key is the bucket index when the
    hit belongs to an entry whose version equals ping.version[0], -1 when
    it belongs to an entry with a different version, and -2 when no bucket
    is positive.
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_status_error_code_partial_stage",
                      "update_status_error_code_complete_stage",
                      "update_status_error_code_unknown_stage")
    for position, version in enumerate(ping.version):
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is None:
                continue
            for bucket, hits in enumerate(histogram[position]):
                if hits > 0:
                    return (bucket if version == newestVersion else -1, ping)

    return (-2, ping)

# Key the pings that failed to stage by failure code and count per code.
stateFailureCodeStageOfConcernRDD = stateCodeStageFailedOfConcernRDD.map(stateFailureCodeStageMapper)
stateFailureCodeStageOfConcernDict = stateFailureCodeStageOfConcernRDD.countByKey()
stateFailureCodeStageOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry ping startup
# update state codes along with a dictionary of the count of the codes.

def stateCodeStartupMapper(d):
    """Key the ping by its first positive startup update state code bucket.

    Entries of ping.version are visited in order; for each entry the
    partial, complete and unknown startup histograms (when not None) are
    scanned in that order. The key is the bucket index when the hit belongs
    to an entry whose version equals ping.version[0], -1 when it belongs to
    an entry with a different version, and -2 when no bucket is positive.
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_state_code_partial_startup",
                      "update_state_code_complete_startup",
                      "update_state_code_unknown_startup")
    for position, version in enumerate(ping.version):
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is None:
                continue
            for bucket, hits in enumerate(histogram[position]):
                if hits > 0:
                    return (bucket if version == newestVersion else -1, ping)

    return (-2, ping)

# Key the of concern pings by startup state code and count the pings per code.
stateCodeStartupOfConcernRDD = ofConcernTrueRDD.map(stateCodeStartupMapper)
stateCodeStartupOfConcernDict = stateCodeStartupOfConcernRDD.countByKey()
stateCodeStartupOfConcernDict

In [None]:
# Keep the pings keyed with startup state code 12 and drop the key again.
ofConcernStateCodeStartupFailedRDD = (stateCodeStartupOfConcernRDD
                                      .filter(lambda keyed: keyed[0] == 12)
                                      .map(lambda keyed: keyed[1])
                                      .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry ping startup
# update state failure codes along with a dictionary of the count of the
# codes.

def stateFailureCodeStartupMapper(d):
    """Key the ping by its first positive startup update failure code bucket.

    Entries of ping.version are visited in order; for each entry the
    partial, complete and unknown startup error code histograms (when not
    None) are scanned in that order. The key is the bucket index when the
    hit belongs to an entry whose version equals ping.version[0], -1 when
    it belongs to an entry with a different version, and -2 when no bucket
    is positive.
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_status_error_code_partial_startup",
                      "update_status_error_code_complete_startup",
                      "update_status_error_code_unknown_startup")
    for position, version in enumerate(ping.version):
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is None:
                continue
            for bucket, hits in enumerate(histogram[position]):
                if hits > 0:
                    return (bucket if version == newestVersion else -1, ping)

    return (-2, ping)

# Key the pings that failed at startup by failure code and count per code.
stateFailureCodeStartupOfConcernRDD = ofConcernStateCodeStartupFailedRDD.map(stateFailureCodeStartupMapper)
stateFailureCodeStartupOfConcernDict = stateFailureCodeStartupOfConcernRDD.countByKey()
stateFailureCodeStartupOfConcernDict

In [None]:
# Create an RDD of out of date, of concern telemetry pings that have
# and have not received only no updates available during the update
# check for their current version of Firefox.

def hasOnlyNoUpdateFoundMapper(d):
    """Pair the ping with True when every update check found no update.

    Returns (False, ping) when update_check_no_update_notify is missing, or
    when a leading entry (sharing the first entry's version) has update
    pings but a "no update" count of exactly 0, i.e. the check returned
    something other than no update found. Otherwise returns (True, ping).
    """
    ping = d
    noUpdateCounts = ping.update_check_no_update_notify
    if noUpdateCounts is None:
        return (False, ping)

    newestVersion = ping.version[0]
    for position, version in enumerate(ping.version):
        # Only entries for the current version are considered.
        if version != newestVersion:
            break
        if (ping.update_ping_count_notify[position] > 0 and
                noUpdateCounts[position] == 0):
            return (False, ping)

    return (True, ping)

# Tag the of concern pings with hasOnlyNoUpdateFoundMapper and tally the flags.
hasOnlyNoUpdateFoundRDD = ofConcernTrueRDD.map(hasOnlyNoUpdateFoundMapper)
hasOnlyNoUpdateFoundDict = hasOnlyNoUpdateFoundRDD.countByKey()
hasOnlyNoUpdateFoundDict

In [None]:
# Keep only the pings flagged False and drop the flag again.
hasOnlyNoUpdateFoundFalseRDD = (hasOnlyNoUpdateFoundRDD
                                .filter(lambda keyed: keyed[0] == False)
                                .map(lambda keyed: keyed[1])
                                .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry pings that have and
# don't have any update download pings for their current version of
# Firefox.

def hasNoDownloadCodeMapper(d):
    """Pair the ping with True when it has no update download codes.

    Only the leading entries sharing the first entry's version are
    examined. Returns (False, ping) as soon as the partial or complete
    download histogram of such an entry has a positive bucket, otherwise
    (True, ping).
    """
    ping = d
    newestVersion = ping.version[0]
    histogramNames = ("update_download_code_partial",
                      "update_download_code_complete")
    for position, version in enumerate(ping.version):
        if version != newestVersion:
            break
        for histogramName in histogramNames:
            histogram = getattr(ping, histogramName)
            if histogram is not None and any(hits > 0 for hits in histogram[position]):
                return (False, ping)

    return (True, ping)

# Tag the remaining pings with hasNoDownloadCodeMapper and tally the flags.
hasNoDownloadCodeRDD = hasOnlyNoUpdateFoundFalseRDD.map(hasNoDownloadCodeMapper)
hasNoDownloadCodeDict = hasNoDownloadCodeRDD.countByKey()
hasNoDownloadCodeDict

In [None]:
# Keep only the pings flagged False and drop the flag again.
hasNoDownloadCodeFalseRDD = (hasNoDownloadCodeRDD
                             .filter(lambda keyed: keyed[0] == False)
                             .map(lambda keyed: keyed[1])
                             .cache())

In [None]:
# Create an RDD of out of date, of concern telemetry pings that have and
# don't have an update failure state for their current version of
# Firefox.

def hasUpdateApplyFailureMapper(d):
    """Pair the ping with True when applying an update failed at startup.

    Only the leading entries sharing the first entry's version are
    examined. Returns (True, ping) as soon as bucket 12 (STATE_FAILED) of
    the partial or the complete startup state code histogram is positive
    for such an entry, otherwise (False, ping).
    """
    ping = d
    currentVersion = ping.version[0]
    for index, version in enumerate(ping.version):
        if currentVersion != version:
            return (False, ping)

        # The complete histogram used to be tested twice, so failures of
        # partial updates were never detected; check both kinds once.
        for stateCodes in (ping.update_state_code_partial_startup,
                           ping.update_state_code_complete_startup):
            # STATE_FAILED: 12
            if stateCodes is not None and stateCodes[index][12] > 0:
                return (True, ping)

    return (False, ping)

# Tag the remaining pings with hasUpdateApplyFailureMapper and tally the flags.
hasUpdateApplyFailureRDD = hasNoDownloadCodeFalseRDD.map(hasUpdateApplyFailureMapper)
hasUpdateApplyFailureDict = hasUpdateApplyFailureRDD.countByKey()
hasUpdateApplyFailureDict

In [None]:
# Keep only the pings flagged False and drop the flag again.
hasUpdateApplyFailureFalseRDD = (hasUpdateApplyFailureRDD
                                 .filter(lambda keyed: keyed[0] == False)
                                 .map(lambda keyed: keyed[1])
                                 .cache())

In [None]:
# Create a reference to the dictionary which will be written to the
# JSON that populates the web page data. This way the reference in the
# web page never changes. A reference is all that is needed since the
# dictionary is not modified.
# Both names refer to the same dictionary object; no copy is made.

ofConcernCategorizedDict = hasUpdateApplyFailureDict

In [None]:
# Gather every dictionary computed above under a stable key and serialize
# the collection to the JSON text that makes up the report.

results = {
    "reportDetails": reportDetailsDict,
    "summary": summaryDict,
    "hasOutOfDateMaxVersion": hasOutOfDateMaxVersionDict,
    "hasUpdatePing": hasUpdatePingDict,
    "hasMinSubsessionLength": hasMinSubsessionLengthDict,
    "hasMinUpdatePingCount": hasMinUpdatePingCountDict,
    "isSupported": isSupportedDict,
    "isAbleToApply": isAbleToApplyDict,
    "hasUpdateEnabled": hasUpdateEnabledDict,
    "ofConcern": ofConcernDict,
    "hasOnlyNoUpdateFound": hasOnlyNoUpdateFoundDict,
    "hasNoDownloadCode": hasNoDownloadCodeDict,
    "hasUpdateApplyFailure": hasUpdateApplyFailureDict,
    "ofConcernCategorized": ofConcernCategorizedDict,
    "ofConcernByVersion": ofConcernByVersionDict,
    "checkCodeNotifyOfConcern": checkCodeNotifyOfConcernDict,
    "checkExErrorNotifyOfConcern": checkExErrorNotifyOfConcernDict,
    "downloadCodeOfConcern": downloadCodeOfConcernDict,
    "stateCodeStageOfConcern": stateCodeStageOfConcernDict,
    "stateFailureCodeStageOfConcern": stateFailureCodeStageOfConcernDict,
    "stateCodeStartupOfConcern": stateCodeStartupOfConcernDict,
    "stateFailureCodeStartupOfConcern": stateFailureCodeStartupOfConcernDict,
}
resultsJSON = json.dumps(results, ensure_ascii=False)
resultsJSON

In [None]:
# Save the output to be uploaded automatically once the job completes.
# The file will be stored at:
# https://analysis-output.telemetry.mozilla.org/SPARKJOBNAME/data/FILENAME

filename = "./output/" + reportFilename + ".json"
if runManually is None:
    with open(filename, 'w') as f:
        f.write(final_results_json)

print "Filename: " + filename

In [None]:
# Get the time when this job ended.

endTime = dt.datetime.now()
print "End: " + str(endTime.strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
# Get the elapsed time it took to run this job.

elapsedTime = endTime - startTime
print "Elapsed Seconds: " + str(int(elapsedTime.total_seconds()))