From c26665f2cf1a6f152e61479b85668347a2a45185 Mon Sep 17 00:00:00 2001
From: Benjamin Smedberg
Date: Fri, 1 Mar 2013 09:53:00 -0500
Subject: [PATCH] UDF, pig, and postprocessor to determine which DLLs are most commonly missing symbols.

---
 .../socorro/pig/eval/MissingSymbols.java | 169 ++++++++++++++++++
 src/main/pig/MissingSymbols-post.py      |  50 ++++++
 src/main/pig/MissingSymbols.pig          |  44 +++++
 3 files changed, 263 insertions(+)
 create mode 100644 src/main/java/com/mozilla/socorro/pig/eval/MissingSymbols.java
 create mode 100644 src/main/pig/MissingSymbols-post.py
 create mode 100644 src/main/pig/MissingSymbols.pig

diff --git a/src/main/java/com/mozilla/socorro/pig/eval/MissingSymbols.java b/src/main/java/com/mozilla/socorro/pig/eval/MissingSymbols.java
new file mode 100644
index 0000000..b73c777
--- /dev/null
+++ b/src/main/java/com/mozilla/socorro/pig/eval/MissingSymbols.java
@@ -0,0 +1,169 @@
+/**
+ * Copyright 2013 Mozilla Foundation .
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.mozilla.socorro.pig.eval;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pig.EvalFunc;
+
+import org.apache.pig.data.DataType;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+import org.apache.pig.impl.logicalLayer.schema.SchemaUtil;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+/**
+ * Pig UDF that scans a pipe-delimited crash dump and emits one
+ * (pdbname, id) tuple for every module owning at least one stack frame
+ * line whose fourth field is empty, which is taken to mean that no
+ * symbols were available for that frame.
+ */
+public class MissingSymbols extends EvalFunc<DataBag>
+{
+    private static final Pattern newlinePattern = Pattern.compile("\n");
+    private static final Pattern pipePattern = Pattern.compile("\\|");
+
+    private static final BagFactory bagFactory = BagFactory.getInstance();
+    private static final TupleFactory tupleFactory = TupleFactory.getInstance();
+
+    // A "Module" line is only usable when it has at least this many fields:
+    // the fields at indexes 1, 3 and 4 are read from it.
+    private static final int MODULE_MIN_FIELDS = 5;
+
+    // A stack frame line has exactly this many fields.
+    private static final int FRAME_FIELDS = 7;
+
+    /**
+     * What is known about one module listed in the dump. Static because it
+     * never touches the enclosing UDF instance.
+     */
+    private static class ModuleData
+    {
+        public final String pdbname_;
+        public final String id_;
+        // Set once the module has been added to the output bag, so that
+        // each module is emitted at most once per dump.
+        public boolean reported_;
+
+        public ModuleData(String pdbname, String id) {
+            pdbname_ = pdbname;
+            id_ = id;
+            reported_ = false;
+        }
+    }
+
+    /**
+     * Finds the modules of one crash dump that lack symbols.
+     *
+     * @param input a tuple holding exactly one field, the dump as a String
+     * @return a bag of (pdbname, id) tuples, one per module that owns at
+     *         least one frame line with an empty fourth field (the bag
+     *         may be empty); null if the input is null, does not have
+     *         exactly one field, or that field is not a String
+     * @throws IOException if a tuple field cannot be read or written
+     */
+    @Override
+    public DataBag exec(Tuple input) throws IOException
+    {
+        if (input == null || input.size() != 1) {
+            return null;
+        }
+
+        Object field = input.get(0);
+        if (!(field instanceof String)) {
+            return null;
+        }
+
+        String dump = (String) field;
+
+        DataBag db = bagFactory.newDefaultBag();
+
+        // maps the module name (field 1 of a "Module" line) to its data
+        Map<String, ModuleData> modulemap =
+            new HashMap<String, ModuleData>(100);
+
+        for (String dumpline : newlinePattern.split(dump)) {
+            String[] splits = pipePattern.split(dumpline, -1);
+            if (splits.length == 0 || splits[0].length() == 0) {
+                continue;
+            }
+
+            if (splits[0].equals("Module")) {
+                // Skip truncated module lines instead of failing the whole
+                // record with an ArrayIndexOutOfBoundsException.
+                if (splits.length >= MODULE_MIN_FIELDS &&
+                    !modulemap.containsKey(splits[1])) {
+                    modulemap.put(splits[1],
+                                  new ModuleData(splits[3], splits[4]));
+                }
+                continue;
+            }
+
+            if (!Character.isDigit(splits[0].charAt(0)) ||
+                splits.length != FRAME_FIELDS) {
+                continue;
+            }
+
+            if (splits[3].length() == 0) {
+                // This DLL doesn't have symbols, at least in this particular
+                // location
+                ModuleData md = modulemap.get(splits[2]);
+                if (md != null && !md.reported_) {
+                    md.reported_ = true;
+
+                    Tuple t = tupleFactory.newTuple(2);
+                    t.set(0, md.pdbname_);
+                    t.set(1, md.id_);
+                    db.add(t);
+                }
+            }
+        }
+        return db;
+    }
+
+    /**
+     * Describes the output as a bag named "modules" whose entries carry two
+     * chararray fields, "pdbname" and "id".
+     *
+     * @param input schema of the UDF input (not used)
+     * @return the output schema, or null if it could not be built
+     */
+    @Override
+    public Schema outputSchema(Schema input) {
+        try {
+            Schema bagSchema = new Schema();
+            bagSchema.add(
+                new Schema.FieldSchema("pdbname", DataType.CHARARRAY));
+            bagSchema.add(
+                new Schema.FieldSchema("id", DataType.CHARARRAY));
+
+            return new Schema(
+                new Schema.FieldSchema("modules", bagSchema, DataType.BAG));
+        }
+        catch (Exception e) {
+            // Deliberately best effort: report no schema instead of
+            // propagating the failure.
+            return null;
+        }
+    }
+}
diff --git a/src/main/pig/MissingSymbols-post.py b/src/main/pig/MissingSymbols-post.py
new file mode 100644
index 0000000..87234f6
--- /dev/null
+++ b/src/main/pig/MissingSymbols-post.py
@@ -0,0 +1,50 @@
+# Post-processes the tab-separated (pdbname, id, count) rows read from stdin:
+# drops every row whose .sym file already exists in one of the symbol
+# directories and writes the remaining rows to stdout, highest count first.
+
+import csv
+import sys
+import os
+
+items = []
+
+
+symboldirs = [
+    'symbols_ffx',
+    'symbols_adobe',
+    'symbols_os'
+]
+pattern = '/mnt/netapp/breakpad/%(symboldir)s/%(pdbname)s/%(id)s/%(symname)s'
+
+for t in csv.reader(sys.stdin, dialect='excel-tab'):
+    if not t:
+        # blank input line, nothing to unpack
+        continue
+
+    # debug_id rather than "id" so the builtin is not shadowed
+    pdb, debug_id, c = t
+
+    if pdb.endswith('.pdb'):
+        symname = pdb[:-4] + '.sym'
+    else:
+        symname = pdb + '.sym'
+
+    found = False
+    for symboldir in symboldirs:
+        path = pattern % {'symboldir': symboldir,
+                          'pdbname': pdb,
+                          'id': debug_id,
+                          'symname': symname}
+        if os.path.exists(path):
+            found = True
+            break
+    if found:
+        continue
+
+    items.append(t)
+
+items.sort(key=lambda i: int(i[2]), reverse=True)
+
+w = csv.writer(sys.stdout, dialect='excel-tab')
+for t in items:
+    w.writerow(t)
diff --git a/src/main/pig/MissingSymbols.pig b/src/main/pig/MissingSymbols.pig
new file mode 100644
index 0000000..269f54d
--- /dev/null
+++ b/src/main/pig/MissingSymbols.pig
@@ -0,0 +1,44 @@
+-- Counts, per (pdbname, id), the Windows crash reports between the start_date
+-- and end_date parameters in which MissingSymbols flagged the module, and
+-- stores the pairs that were seen more than twice.
+
+REGISTER 'socorro-toolbox-0.1-SNAPSHOT.jar';
+REGISTER 'akela-0.4-SNAPSHOT.jar';
+
+SET pig.logfile MissingSymbols.log;
+SET mapred.compress.map.output true;
+SET mapred.map.output.compression.codec org.apache.hadoop.io.compress.SnappyCodec;
+
+DEFINE JsonMap com.mozilla.pig.eval.json.JsonMap();
+DEFINE MissingSymbols com.mozilla.socorro.pig.eval.MissingSymbols();
+
+raw = LOAD 'hbase://crash_reports'
+  USING com.mozilla.pig.load.HBaseMultiScanLoader('$start_date', '$end_date',
+     'yyMMdd', 'processed_data:json', 'true')
+  AS (k:bytearray, processed_json:chararray);
+
+processed = FILTER raw BY processed_json IS NOT NULL;
+
+genmap = FOREACH processed GENERATE
+  JsonMap(processed_json) as processed_data:map[];
+
+limited = FILTER genmap BY
+  processed_data#'os_name' == 'Windows NT';
+
+functed = FOREACH limited GENERATE
+  MissingSymbols(processed_data#'dump') AS modules;
+
+flattened = FOREACH functed GENERATE
+  flatten(modules);
+
+grouped = GROUP flattened BY (pdbname, id);
+
+totals = FOREACH grouped GENERATE
+  group.pdbname,
+  group.id,
+  COUNT(flattened) AS c;
+
+relevant = FILTER totals BY c > 2;
+
+STORE relevant INTO 'MissingSymbols-windows-$start_date-$end_date'
+  USING PigStorage();