In [None]:
import datetime as dt
import pandas as pd
import ujson as json

from moztelemetry import get_pings, get_pings_properties

%pylab inline

Create a set of pings from "saved-session" to build a set of core client data.

In [None]:
# Channel and submission window for the ping query.
update_channel = "nightly"
now = dt.datetime.now()
start = dt.datetime(2016, 1, 23)  # now - dt.timedelta(30)
end = dt.datetime(2016, 2, 24)  # now - dt.timedelta(1)

# The submission window is passed to get_pings as a (first, last) pair of
# YYYYMMDD strings.
submission_range = (start.strftime("%Y%m%d"), end.strftime("%Y%m%d"))

# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext injected by the hosting environment -- confirm.
pings = get_pings(
    sc,
    app="Fennec",
    channel=update_channel,
    submission_date=submission_range,
    build_id=("20100101000000", "99999999999999"),
    fraction=1
)

# Only these ping fields are carried into the rest of the notebook.
ping_properties = [
    "meta/clientId",
    "meta/documentId",
    "meta/submissionDate",
    "application/version",
    "environment/profile/creationDate",
    "environment/system/os/version",
    "environment/system/memoryMB",
]
subset = get_pings_properties(pings, ping_properties)

Take the set of pings, make sure we have actual clientIds and remove duplicate pings.

In [None]:
def dedupe_pings(rdd):
    """Drop pings without a client id and collapse duplicate submissions.

    Pings whose "meta/clientId" is None are filtered out. The remaining pings
    are keyed by the concatenation of "meta/clientId" and "meta/documentId";
    for each key a single ping is kept (whichever one the reduce sees first).

    Args:
        rdd: an RDD-like object of ping dicts exposing ``filter``, ``map`` and
            ``reduceByKey``.

    Returns:
        An RDD-like object holding one ping dict per distinct key.
    """
    def has_client_id(ping):
        return ping["meta/clientId"] is not None

    def key_by_client_and_document(ping):
        return (ping["meta/clientId"] + ping["meta/documentId"], ping)

    def keep_first(kept, _duplicate):
        return kept

    def drop_key(pair):
        return pair[1]

    identified = rdd.filter(has_client_id)
    keyed = identified.map(key_by_client_and_document)
    unique = keyed.reduceByKey(keep_first)
    return unique.map(drop_key)

# Replace the raw subset with its de-duplicated form, then show one ping as a
# sanity check. print() with a single argument behaves the same on Python 2
# and Python 3, unlike the bare print statement.
subset = dedupe_pings(subset)
print(subset.first())

Reduce the set of pings to one ping per client, using the newest ping as determined by submission date.

In [None]:
def sort_by_submission(x, y):
    """Return whichever of two pings has the later "meta/submissionDate".

    The submission dates are compared as integers. ``x`` is returned only when
    its date is strictly greater; on a tie ``y`` is returned.
    """
    x_is_newer = int(x["meta/submissionDate"]) > int(y["meta/submissionDate"])
    return x if x_is_newer else y

def reduce_by_client(rdd):
    """Collapse an RDD of pings to a single ping per "meta/clientId".

    Pings are keyed by client id and merged pairwise with
    ``sort_by_submission``, so the ping with the latest submission date is the
    one kept for each client.
    """
    by_client = rdd.map(lambda ping: (ping["meta/clientId"], ping))
    newest_per_client = by_client.reduceByKey(sort_by_submission)
    return newest_per_client.map(lambda pair: pair[1])
    
# Keep only the newest ping for each client, then display a small sample of
# the result as the cell output.
reduced = reduce_by_client(subset)
reduced.take(5)

Transform and sanitize the pings into arrays.

In [None]:
def transform(ping):
    """Flatten one ping dict into a row for the output table.

    Args:
        ping: dict holding the "meta/..." / "application/..." /
            "environment/..." properties selected earlier in the notebook.

    Returns:
        A list ``[clientId, profileDate, submissionDate, version, os_version,
        memory]`` where:
          * profileDate is the profile creation day (a day count from
            1970-01-01) formatted as YYYYMMDD, or None when it is missing;
          * os_version and memory are ints, or 0 when the source value is
            missing or cannot be converted to an integer.
    """
    def to_int(value, default=0):
        # A missing (None) or non-numeric value falls back to the default
        # rather than raising, so one malformed ping cannot abort the whole
        # job when this function is mapped over the RDD.
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    clientId = ping["meta/clientId"]  # Should not be None since we filter those out

    profileDate = None
    profileDaynum = ping["environment/profile/creationDate"]
    if profileDaynum is not None:
        # creationDate is a day offset from the Unix epoch; convert to YYYYMMDD.
        profileDate = (dt.date(1970, 1, 1) + dt.timedelta(int(profileDaynum))).strftime("%Y%m%d")

    submissionDate = ping["meta/submissionDate"]  # Added via the ingestion process so should not be None

    version = ping["application/version"]
    # The OS version used to be converted with a bare int(), which raised on
    # None; it now gets the same 0 fallback as memory.
    os_version = to_int(ping["environment/system/os/version"])
    memory = to_int(ping["environment/system/memoryMB"])

    return [clientId, profileDate, submissionDate, version, os_version, memory]

# Turn every per-client ping into a flat row and show one row as a sanity
# check. print() with a single argument behaves the same on Python 2 and
# Python 3, unlike the bare print statement.
transformed = reduced.map(transform)
print(transformed.first())

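Output the data as a Parquet dataset on S3, partitioned by channel and end date. (The earlier local CSV export is kept below, commented out, for reference.)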

In [None]:
# Legacy local CSV export, kept commented out for reference:
#grouped = pd.DataFrame(transformed.collect(), columns=["clientid", "profiledate", "submissiondate", "version", "osversion", "memory"])
#!mkdir -p ./output
#grouped.to_csv("./output/android-clients-" + update_channel + "-" + end.strftime("%Y%m%d") + ".csv", index=False)

# Destination path encodes the dataset version, the channel and the end date.
s3_output = (
    "s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mfinkle/android_clients"
    + "/v1/channel=" + update_channel
    + "/end_date=" + end.strftime("%Y%m%d")
)

# Column names, in the same order as the lists produced by transform().
column_names = ["clientid", "profiledate", "submissiondate", "version", "osversion", "memory"]
grouped = sqlContext.createDataFrame(transformed, column_names)
grouped.saveAsParquetFile(s3_output)
