In [None]:
import datetime as dt
import pandas as pd
import ujson as json

from moztelemetry import get_pings, get_pings_properties

%pylab inline

Create a set of pings from "saved-session" to build a set of core client data.

In [None]:
# Analysis parameters: release channel and the submission-date window.
update_channel = "nightly"
now = dt.datetime.now()
# Fixed window; a rolling alternative would be now - dt.timedelta(30) .. now - dt.timedelta(1).
start = dt.datetime(2016, 2, 23)
end = dt.datetime(2016, 2, 28)

# Both window bounds are passed to get_pings as YYYYMMDD strings.
submission_window = (start.strftime("%Y%m%d"), end.strftime("%Y%m%d"))
build_window = ("20100101000000", "99999999999999")

# `sc` is expected to be supplied by the notebook environment
# (presumably the SparkContext -- TODO confirm).
pings = get_pings(sc,
                  app="Fennec",
                  channel=update_channel,
                  submission_date=submission_window,
                  build_id=build_window,
                  fraction=1)

# Ping fields kept for each record; downstream cells index the result by these paths.
ping_properties = [
    "meta/clientId",
    "meta/documentId",
    "meta/submissionDate",
    "creationDate",
    "application/version",
    "environment/system/os/version",
    "environment/profile/creationDate",
    "environment/settings/locale",
    "environment/settings/defaultSearchEngine",
]
subset = get_pings_properties(pings, ping_properties)

Take the set of pings, keep only those that have an actual clientId, and remove duplicate pings so that each unique ping is kept exactly once.

In [None]:
def dedupe_pings(rdd):
    """Drop pings without a client id and keep one ping per unique identity.

    A ping's identity is the tuple
    (meta/clientId, meta/documentId, creationDate). A tuple is used rather
    than a concatenated string so that:

    * a ping whose documentId or creationDate is None no longer raises a
      TypeError while the key is being built, and
    * different field values can never collapse into the same key
      (e.g. "ab" + "c" versus "a" + "bc").

    Args:
        rdd: an object exposing filter / map / reduceByKey whose elements
            can be indexed by the three property paths above.

    Returns:
        The result of the filter/map/reduceByKey/map chain: one ping per
        distinct identity. Which duplicate survives depends on the order in
        which reduceByKey presents the values.
    """
    return rdd.filter(lambda p: p["meta/clientId"] is not None)\
              .map(lambda p: ((p["meta/clientId"], p["meta/documentId"], p["creationDate"]), p))\
              .reduceByKey(lambda x, y: x)\
              .map(lambda x: x[1])

# Swap the raw ping set for its de-duplicated form, then peek at one record as a sanity check.
subset = dedupe_pings(subset)
print(subset.first())

Transform and sanitize the pings into arrays.

In [None]:
def transform(ping):
    """Convert one ping into a flat, typed row.

    Args:
        ping: mapping indexed by the property paths read below.

    Returns:
        A list
        [clientId, profileDate, submissionDate, creationDate,
         version, os_version, locale, defaultSearch]
        where profileDate and submissionDate are datetime.date values,
        creationDate is a naive datetime.datetime, and os_version is an int.
        profileDate, creationDate and os_version are None when the source
        value is missing; os_version is also None when it is not a valid
        integer.

    Raises:
        ValueError: if creationDate or meta/submissionDate is present but
            does not match the expected format.
    """
    # Should not be None since we filter those out.
    clientId = ping["meta/clientId"]

    # The profile creation date is given as a day count from 1970-01-01.
    profileDate = None
    profileDaynum = ping["environment/profile/creationDate"]
    if profileDaynum is not None:
        profileDate = dt.date(1970, 1, 1) + dt.timedelta(int(profileDaynum))

    creationDate = ping["creationDate"]
    if creationDate is not None:
        # This is only accurate because we know the creation date is always in 'Z' (zulu) time.
        creationDate = dt.datetime.strptime(creationDate, "%Y-%m-%dT%H:%M:%S.%fZ")

    # Added via the ingestion process so should not be None.
    submissionDate = dt.datetime.strptime(ping["meta/submissionDate"], "%Y%m%d").date()

    version = ping["application/version"]

    # A missing or non-numeric OS version must not abort the whole job;
    # treat it like the other optional fields and emit None instead.
    try:
        os_version = int(ping["environment/system/os/version"])
    except (TypeError, ValueError):
        os_version = None

    locale = ping["environment/settings/locale"]
    defaultSearch = ping["environment/settings/defaultSearchEngine"]

    return [clientId, profileDate, submissionDate, creationDate, version, os_version, locale, defaultSearch]

# Run transform over every ping in subset and show one resulting row as a sanity check.
transformed = subset.map(transform)
print(transformed.first())

Output the data as Parquet on S3. A CSV export is left commented out as an alternative.

In [None]:
# Alternative (disabled): collect the rows locally and write them out as CSV.
#grouped = pd.DataFrame(transformed.collect(), columns=["clientid", "profiledate", "submissiondate", "creationdate", "appversion", "osversion", "locale", "defaultsearch"])
#!mkdir -p ./output
#grouped.to_csv("./output/android-clients-" + update_channel + "-" + end.strftime("%Y%m%d") + ".csv", index=False, encoding="utf-8")

# Column names, in the same order as the fields of each row returned by transform.
column_names = ["clientid", "profiledate", "submissiondate", "creationdate",
                "appversion", "osversion", "locale", "defaultsearch"]

# Destination path, split by channel and by the end date of the submission window.
s3_output = "".join([
    "s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mfinkle/android_clients",
    "/v1/channel=", update_channel,
    "/end_date=", end.strftime("%Y%m%d"),
])

# `sqlContext` is expected to be supplied by the notebook environment.
# NOTE(review): saveAsParquetFile is deprecated in newer Spark releases in favour of
# DataFrame.write.parquet -- confirm the cluster's Spark version before switching.
grouped = sqlContext.createDataFrame(transformed, column_names)
grouped.saveAsParquetFile(s3_output)
