In [1]:
import datetime as dt
import pandas as pd
import ujson as json

from moztelemetry import get_pings, get_pings_properties

%pylab inline

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/bucket-whitelist.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


Create a set of pings from "saved-session" to build a set of core client data.

In [2]:
# Release channel whose pings are analyzed.
update_channel = "nightly"

# The submission window is pinned to fixed dates. The rolling alternative would be
# start = now - dt.timedelta(30) and end = now - dt.timedelta(1).
now = dt.datetime.now()
start = dt.datetime(2016, 2, 23)
end = dt.datetime(2016, 2, 28)

# Both window bounds are passed to get_pings as YYYYMMDD strings.
submission_window = (start.strftime("%Y%m%d"), end.strftime("%Y%m%d"))
build_id_range = ("20100101000000", "99999999999999")

# NOTE(review): `sc` is not defined in this notebook -- presumably the SparkContext
# injected by the hosting environment; confirm before running elsewhere.
pings = get_pings(sc,
                  app="Fennec",
                  channel=update_channel,
                  submission_date=submission_window,
                  build_id=build_id_range,
                  fraction=1)

# Only these ping properties are extracted; later cells index pings by these exact paths.
ping_properties = [
    "meta/clientId",
    "meta/documentId",
    "meta/submissionDate",
    "creationDate",
    "application/version",
    "environment/system/os/version",
    "environment/profile/creationDate",
    "environment/settings/locale",
    "environment/settings/defaultSearchEngine",
]
subset = get_pings_properties(pings, ping_properties)

Take the set of pings, make sure we have actual clientIds and remove duplicate pings.

In [3]:
def dedupe_pings(rdd):
    """Drop pings without a client id and keep one ping per client/document pair.

    Args:
        rdd: RDD-like collection of ping dicts carrying "meta/clientId" and
            "meta/documentId".

    Returns:
        A collection of ping dicts in which each clientId + documentId
        combination appears once.
    """
    def has_client_id(ping):
        return ping["meta/clientId"] is not None

    def key_by_client_and_document(ping):
        # The concatenated ids form the de-duplication key.
        dedupe_key = ping["meta/clientId"] + ping["meta/documentId"]
        return (dedupe_key, ping)

    def keep_left(kept, _duplicate):
        # Any ping sharing the key is a duplicate; retain the one already held.
        return kept

    def drop_key(keyed_ping):
        return keyed_ping[1]

    with_client_id = rdd.filter(has_client_id)
    keyed = with_client_id.map(key_by_client_and_document)
    unique = keyed.reduceByKey(keep_left)
    return unique.map(drop_key)

# Replace the extracted-property RDD with its de-duplicated form, then show one sample ping.
subset = dedupe_pings(subset)
print(subset.first())

{'meta/documentId': u'1ef07f3f-c51c-4dea-93b5-f6b7ee82fc16', 'meta/submissionDate': u'20160226', 'environment/system/os/version': 22, 'application/version': u'47.0a1', 'environment/profile/creationDate': 16570, 'meta/clientId': u'7746e9c7-4baf-463f-97d7-e1d5cf608816', 'creationDate': u'2016-02-26T12:21:38.090Z', 'environment/settings/defaultSearchEngine': None, 'environment/settings/locale': u'en-US'}


Reduce the set of pings to one ping per client environment, using all of the data fields to define the environment. We can have multiple pings/rows per day per client, as changes happen during the day.

In [4]:
def safe_str(obj):
    """Return ``obj`` converted to unicode text, with ``None`` mapped to an empty string."""
    return unicode("") if obj is None else unicode(obj)

# Make a key out of any data item we want to monitor over time.
def build_client_history_key(p):
    """Build the grouping key that identifies one client environment on one day.

    The key combines the client id, the submission date and every monitored
    environment field. Fields are joined with a separator character so that
    adjacent values cannot run together: without it, os version 21 followed by
    profile day 16570 and os version 2 followed by profile day 116570 would
    both contribute "2116570" and be merged as the same environment.

    Args:
        p: ping dict holding the "meta/..." , "application/version" and
            "environment/..." properties read below.

    Returns:
        A text key; two pings get the same key only when all listed fields match.

    Raises:
        TypeError: if "meta/clientId" or "meta/submissionDate" is None.
    """
    # We ignore creationDate, but include submissionDate. This means we'll have one entry per
    # client per day at minimum.
    fields = [
        p["meta/clientId"],
        p["meta/submissionDate"],
        safe_str(p["application/version"]),
        safe_str(p["environment/system/os/version"]),
        safe_str(p["environment/profile/creationDate"]),
        safe_str(p["environment/settings/locale"]),
        safe_str(p["environment/settings/defaultSearchEngine"]),
    ]

    # ASCII "unit separator": a control character, so it should not occur in field values.
    return u"\x1f".join(fields)

# Reduce the set of pings to a unique list of environment history changes.
# Multiple changes can happen per day
def reduce_by_client_history(rdd):
    """Keep a single ping for each distinct client-history key.

    Args:
        rdd: RDD-like collection of ping dicts accepted by
            build_client_history_key.

    Returns:
        A collection of ping dicts with one ping per key.
    """
    def attach_history_key(ping):
        return (build_client_history_key(ping), ping)

    def keep_left(kept, _other):
        # Pings sharing a key describe the same environment; one is enough.
        return kept

    def strip_key(keyed_ping):
        return keyed_ping[1]

    keyed_pings = rdd.map(attach_history_key)
    one_per_key = keyed_pings.reduceByKey(keep_left)
    return one_per_key.map(strip_key)

# Collapse the de-duplicated pings to one ping per distinct client-history key.
reduced = reduce_by_client_history(subset)
# Last expression of the cell: its value (three sample pings) is shown as the cell output.
reduced.take(3)

[{'application/version': u'47.0a1',
  'creationDate': u'2016-02-22T16:28:03.273Z',
  'environment/profile/creationDate': 16836,
  'environment/settings/defaultSearchEngine': u'baidu',
  'environment/settings/locale': u'zh-CN',
  'environment/system/os/version': 21,
  'meta/clientId': u'fcdc38b3-684a-418a-8069-6ed755681516',
  'meta/documentId': u'7142b6fd-3398-4854-bb40-2bb060f86377',
  'meta/submissionDate': u'20160223'},
 {'application/version': u'46.0a1',
  'creationDate': u'2016-02-24T11:47:01.241Z',
  'environment/profile/creationDate': None,
  'environment/settings/defaultSearchEngine': u'google',
  'environment/settings/locale': u'pl',
  'environment/system/os/version': 21,
  'meta/clientId': u'44585ab0-60d8-4e64-859d-827dfbc52d9c',
  'meta/documentId': u'4f5384e0-045b-4e17-9b78-d89675beb5f8',
  'meta/submissionDate': u'20160224'},
 {'application/version': u'47.0a1',
  'creationDate': u'2016-02-25T21:47:53.462Z',
  'environment/profile/creationDate': 16847,
  'environment/settin

Transform and sanitize the pings into arrays.

In [7]:
def transform(ping):
    """Flatten one ping dict into the ordered row used for the output table.

    Args:
        ping: dict keyed by the property paths extracted from the ping
            ("meta/clientId", "creationDate", "environment/...", ...).

    Returns:
        A list of nine values, in this order: client id, profile creation
        date (datetime.date or None), submission date (datetime.date),
        ping creation time (datetime.datetime or None), application version,
        OS version (int or None), locale, default search engine, and the
        experiments list serialized as JSON text.

    Raises:
        ValueError: if a present date or OS version value cannot be parsed.
    """
    clientId = ping["meta/clientId"] # Should not be None since we filter those out

    # The profile creation value is treated as a count of days since 1970-01-01.
    profileDate = None
    profileDaynum = ping["environment/profile/creationDate"]
    if profileDaynum is not None:
        profileDate = dt.date(1970, 1, 1) + dt.timedelta(int(profileDaynum))

    creationDate = ping["creationDate"]
    if creationDate is not None:
        # This is only accurate because we know the creation date is always in 'Z' (zulu) time.
        creationDate = dt.datetime.strptime(creationDate, "%Y-%m-%dT%H:%M:%S.%fZ")

    # Added via the ingestion process so should not be None.
    submissionDate = dt.datetime.strptime(ping["meta/submissionDate"], "%Y%m%d").date()

    version = ping["application/version"]

    # Environment fields can be missing; keep None rather than letting int(None)
    # raise TypeError and abort the whole job for one incomplete ping.
    os_version = ping["environment/system/os/version"]
    if os_version is not None:
        os_version = int(os_version)

    locale = ping["environment/settings/locale"]
    defaultSearch = ping["environment/settings/defaultSearchEngine"]

    # Experiments will be delivered via the "core" ping, so let's just pad it for now.
    experiments = []

    return [clientId, profileDate, submissionDate, creationDate, version, os_version, locale, defaultSearch, json.dumps(experiments)]

# Turn each reduced ping into its flat output row, then show one sample row.
transformed = reduced.map(transform)
print(transformed.first())

[u'6720b638-501b-4899-a883-c6d2125443aa', datetime.date(2016, 2, 28), datetime.date(2016, 2, 28), datetime.datetime(2016, 2, 28, 9, 28, 44, 789000), u'47.0a1', 21, u'zh-CN', u'baidu', '[]']


Output the data to CSV or Parquet.

In [8]:
grouped = pd.DataFrame(transformed.collect(), columns=["clientid", "profiledate", "submissiondate", "creationdate", "appversion", "osversion", "locale", "defaultsearch", "experiments"])
!mkdir -p ./output
grouped.to_csv("./output/android-clients-" + update_channel + "-" + end.strftime("%Y%m%d") + ".csv", index=False, encoding="utf-8")

#s3_output = "s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mfinkle/android_clients"
#s3_output += "/v1/channel=" + update_channel + "/end_date=" + end.strftime("%Y%m%d") 
#grouped = sqlContext.createDataFrame(transformed, ["clientid", "profiledate", "submissiondate", "version", "osversion", "memory"])
#grouped.saveAsParquetFile(s3_output)
