In [None]:
# Clone the repository that provides the `known_modules` directory and the
# `firefox_modules.txt` file read by later cells.
!git clone https://github.com/marco-c/missing_symbols.git

In [None]:
import os
from datetime import datetime, timedelta
from pyspark.sql import functions
import boto3

In [None]:
# Lower-cased names of modules whose symbols are expected to be missing.
# Each directory entry contributes its name minus the last four characters
# (presumably a four-character extension such as ".txt" -- TODO confirm).
known_modules = {
    entry[:-4].lower()
    for entry in os.listdir('missing_symbols/known_modules')
}

In [None]:
# Size of the reporting window, in days.
num_days = 3
# The `num_days` most recent complete UTC days, newest first: yesterday,
# the day before yesterday, and so on.
days = [
    datetime.utcnow().date() - timedelta(days=offset + 1)
    for offset in range(num_days)
]

In [None]:
# One S3 location per day in `days`, selected through the `crash_date=` part
# of the path.
crash_date_paths = [
    's3://telemetry-parquet/socorro_crash/v2/crash_date=' + day.strftime('%Y%m%d')
    for day in days
]
# Read all selected days as a single parquet-backed DataFrame.
dataset = SQLContext(sc).read.load(crash_date_paths, 'parquet')

In [None]:
# Build a list of (filename, [(version, crash_report_count), ...]) for Firefox
# modules flagged as missing symbols, sorted by total count (largest first).
modules = (
    dataset
    .filter(dataset['product'] == 'Firefox')
    # One row per (crash report uuid, module) pair.
    .select('uuid', functions.explode(dataset['json_dump']['modules']).alias('module'))
    # Count a given module at most once per crash report.
    .dropDuplicates(['uuid', 'module'])
    .select('module')
    .rdd
    .map(lambda row: row['module'])
    # Keep modules flagged as missing symbols that are not in `known_modules`.
    .filter(lambda mod: mod['missing_symbols'] and mod['filename'].lower() not in known_modules)
    # Count occurrences per (filename, version).
    .map(lambda mod: ((mod['filename'], mod['version']), 1))
    .reduceByKey(lambda left, right: left + right)
    # Re-key by filename and gather the per-version counts into one list.
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(lambda left, right: left + right)
    # Most frequently seen modules (summed over all versions) first.
    .sortBy(lambda kv: sum(n for _, n in kv[1]), ascending=False)
    .collect()
)

In [None]:
# Number of distinct module filenames collected by the pipeline above.
len(modules)

In [None]:
# Flatten `modules` into (name, version, count) triples, keeping only the
# (module, version) pairs seen in more than 2000 crash reports.
top_missing = []
for module_name, per_version in modules:
    for module_version, report_count in per_version:
        if report_count > 2000:
            top_missing.append((module_name, module_version, report_count))
# Highest counts first; the sort is stable, so ties keep their original order.
top_missing.sort(key=lambda entry: entry[2], reverse=True)

In [None]:
# Display the first 50 modules with their crash-report count summed over all
# versions.
[
    (module_name, sum(n for _, n in per_version))
    for module_name, per_version in modules[:50]
]

In [None]:
# Load the module names listed in firefox_modules.txt, one per line. These are
# the names highlighted in red in the e-mail report.
# splitlines() replaces split('\n')[:-1] so that the last entry is kept when
# the file has no trailing newline, and so that '\r\n' line endings do not
# leave a stray '\r' on each name.
with open('missing_symbols/firefox_modules.txt', 'r') as f:
    firefox_modules = f.read().splitlines()

In [None]:
import boto3

# HTML-escaping helper: `html.escape` on Python 3, `cgi.escape` on Python 2,
# so the cell runs on either interpreter.
try:
    from html import escape
except ImportError:
    from cgi import escape

subject = 'Weekly report of modules with missing symbols in crash reports'

# Opening of the HTML table; one <tr> per entry of `top_missing` follows.
body = """
<table style="border-collapse:collapse;">
  <tr>
  <th style="border: 1px solid black;">Name</th>
  <th style="border: 1px solid black;">Version</th>
  <th style="border: 1px solid black;"># of crash reports (*)</th>
</tr>
"""

# Set copy of the list so the per-row membership test is O(1).
firefox_module_names = set(firefox_modules)

rows = []
for name, version, count in top_missing:
    # Names and versions come from crash-report data, so escape them before
    # embedding them in HTML. '%s' % value also copes with a None value.
    safe_name = escape('%s' % name)
    safe_version = escape('%s' % version)
    if name in firefox_module_names:
        # Modules listed in firefox_modules.txt are highlighted in red.
        safe_name = '<span style="color:red;">%s</span>' % safe_name
    rows.append(
        '<tr>'
        '<td style="border: 1px solid black;">%s</td>'
        '<td style="border: 1px solid black;">%s</td>'
        '<td style="border: 1px solid black;">%d</td>'
        '</tr>' % (safe_name, safe_version, count)
    )
body += ''.join(rows)
body += '</table>'

# The footnote takes the window length from `num_days` instead of repeating
# the literal. The threshold wording matches the `count > 2000` filter used
# to build `top_missing`; keep the two in sync if the threshold changes.
body += """
\n\n(*) The number of crash reports refers to the past %d days.
Only modules with more than 2000 crash reports are shown in this list.

If you see modules that shouldn't be in this list as it's expected not
to have their symbols, either contact mcastelluccio@mozilla.com or open
a PR to add them to https://github.com/marco-c/missing_symbols/tree/master/known_modules.
""" % num_days

# Send the report through Amazon SES; the cell's output is the MessageId
# returned by the service.
ses = boto3.client('ses')
ses.send_email(
    Source='telemetry-alerts@mozilla.com',
    Destination={
        'ToAddresses': ['mcastelluccio@mozilla.com', 'release-mgmt@mozilla.com', 'stability@mozilla.org'],
        'CcAddresses': [],
    },
    Message={
        'Subject': {'Data': subject, 'Charset': 'UTF-8'},
        'Body': {'Html': {'Data': body, 'Charset': 'UTF-8'}}
    }
)['MessageId']