In [1]:
# Drop any previous checkout and clone a fresh copy of the crashcorrelations
# repository into the current working directory.
!rm -rf crashcorrelations
!git clone https://github.com/marco-c/crashcorrelations
# Install a pinned google-cloud-bigquery through the Databricks library utility,
# then restart the Python process.
# NOTE(review): the restart is presumably what makes the freshly installed
# version importable in the following cells - confirm against Databricks docs.
dbutils.library.installPyPI("google-cloud-bigquery", "1.20.0")
dbutils.library.restartPython()

In [2]:
# Download the pinned stemming 1.0.1 source archive and unpack it locally.
!pip download stemming==1.0.1
!tar xf stemming-1.0.1.tar.gz
# Register the porter2 module with Spark via addPyFile.
# NOTE(review): `sc` is presumably the notebook-provided SparkContext, in which
# case this ships porter2.py to the executors - confirm.
sc.addPyFile('stemming-1.0.1/stemming/porter2.py')

In [3]:
import hashlib
from collections import defaultdict
from datetime import datetime

import os
import sys
# Make the current directory and the crashcorrelations directory inside it
# importable, so the `from crashcorrelations import ...` below can resolve.
sys.path += [os.path.abspath("."), os.path.abspath("crashcorrelations")]

from crashcorrelations import download_data, utils, crash_deviations, comments

In [4]:
# Print the current UTC time (presumably to mark when the heavy processing
# below starts - the matching print after it allows the run time to be read off).
print(datetime.utcnow())

In [5]:
# Channels to process, and for each one whatever download_data.get_versions
# returns for it.
channels = ['release', 'beta', 'nightly', 'esr']
channel_to_versions = {
    name: download_data.get_versions(name)
    for name in channels
}

In [6]:
# For every channel, ask download_data.get_top for 200 entries over the last
# 5 days, restricted to that channel's versions.
signatures = {
    name: download_data.get_top(200, versions=channel_to_versions[name], days=5)
    for name in channels
}

In [7]:
def _is_overrepresented_addon(result, group_total, reference_total):
    """Return True when `result` is a single-item 'Addon' entry whose share of
    the signature's crashes is larger than its share of the reference crashes.

    result: one entry of results[signature]; reads its 'item', 'count_group'
        and 'count_reference' keys.
    group_total: total count for the signature (total_groups[signature]).
    reference_total: total count for the reference set (total_reference).
    """
    item = result['item']

    # Only results describing exactly one item are considered.
    if len(item) != 1:
        return False

    (key,) = item.keys()
    if 'Addon' not in key:
        return False

    group_share = float(result['count_group']) / group_total
    reference_share = float(result['count_reference']) / reference_total
    return group_share > reference_share


OUTPUT_DIR = 'top-signatures-correlations_output'

# Start every run from an empty output directory.
utils.rmdir(OUTPUT_DIR)
utils.mkdir(OUTPUT_DIR)

# Per-channel reference totals, plus the date of this run.
totals = {
    'date': str(utils.utc_today()),
}
# channel -> list of signatures that have at least one over-represented add-on.
addon_related_signatures = defaultdict(list)

for channel in channels:
    print(channel)

    channel_dir = OUTPUT_DIR + '/' + channel
    utils.mkdir(channel_dir)

    # Deviations are computed on the last 5 days of crashes.
    dataset = crash_deviations.get_telemetry_crashes(spark, versions=channel_to_versions[channel], days=5)
    results, total_reference, total_groups = crash_deviations.find_deviations(sc, dataset, signatures=signatures[channel])

    totals[channel] = total_reference

    # Top words are optional: if loading the 30-day dataset or extracting the
    # words fails, report the failure and carry on without them. Catching
    # Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit
    # still stop the notebook.
    try:
        comments_dataset = crash_deviations.get_telemetry_crashes(spark, versions=channel_to_versions[channel], days=30)
        top_words = comments.get_top_words(comments_dataset, signatures[channel])
    except Exception as error:
        print('Could not compute top words for {}: {!r}'.format(channel, error))
        top_words = {}

    for signature in signatures[channel]:
        if signature not in results:
            continue

        group_total = total_groups[signature]

        addons = [
            result for result in results[signature]
            if _is_overrepresented_addon(result, group_total, total_reference)
        ]

        if addons:
            addon_related_signatures[channel].append({
                'signature': signature,
                'addons': addons,
                'total': group_total,
            })

        res = {
            'total': group_total,
            'results': results[signature],
        }

        if signature in top_words:
            res['top_words'] = top_words[signature]

        # One file per signature, named after the SHA-1 of the signature text.
        signature_hash = hashlib.sha1(signature.encode('utf-8')).hexdigest()
        utils.write_json(channel_dir + '/' + signature_hash + '.json.gz', res)

utils.write_json(OUTPUT_DIR + '/all.json.gz', totals)
utils.write_json(OUTPUT_DIR + '/addon_related_signatures.json.gz', addon_related_signatures)

In [8]:
# Print the current UTC time (presumably to mark when the processing above
# finished, so it can be compared with the timestamp printed before it).
print(datetime.utcnow())

In [9]:
# Publish the local 'top-signatures-correlations_output' directory.
# Will be uploaded under https://analysis-output.telemetry.mozilla.org/top-signatures-correlations/data/
# NOTE(review): remove_results presumably clears what was published by the
# previous run before the new files are uploaded - confirm in utils.
utils.remove_results('top-signatures-correlations')
utils.upload_results('top-signatures-correlations', 'top-signatures-correlations_output')