Skip to content

Commit

Permalink
fix user_story diff parsing, PY3 updates, code formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
Kyle Lahnakoski committed Mar 31, 2018
1 parent ba3c2e9 commit 103f251
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 174 deletions.
7 changes: 4 additions & 3 deletions bzETL/bz_etl.py
Expand Up @@ -106,12 +106,13 @@ def get_records_from_bugzilla(db, param, please_stop):
sorted = jx.sort(db_results, [
"bug_id",
"_merge_order",
{"field": "modified_ts", "sort": -1},
"modified_by"
{"modified_ts": "desc"},
"modified_by",
{"id": "desc"}
])

process = BugHistoryParser(param, alias_config, output_queue)
for s in sorted:
for i, s in enumerate(sorted):
process.processRow(s)
process.processRow(wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))

Expand Down
10 changes: 7 additions & 3 deletions bzETL/extract_bugzilla.py
Expand Up @@ -12,6 +12,7 @@
from __future__ import unicode_literals

from bzETL.parse_bug_history import MAX_TIME
from bzETL.transform_bugzilla import NUMERIC_FIELDS
from jx_python import jx
from mo_dots.datas import Data
from mo_logs import Log
Expand Down Expand Up @@ -104,6 +105,8 @@ def get_screened_whiteboard(db):


def get_bugs_table_columns(db, schema_name):
global bugs_columns

if not bugs_columns:
columns = db.query("""
SELECT
Expand Down Expand Up @@ -134,7 +137,7 @@ def get_bugs_table_columns(db, schema_name):
)
""", {"schema": schema_name})
globals()["bugs_columns"] = columns
bugs_columns = columns


def get_private_bugs_for_delete(db, param):
Expand Down Expand Up @@ -231,13 +234,13 @@ def get_bugs(db, param):

#TODO: CF_LAST_RESOLVED IS IN PDT, FIX IT
def lower(col):
if col.column_type.startswith("varchar"):
if col.column_type.startswith("varchar") or col.column_type.endswith('text'):
return "lower(" + db.quote_column(col.column_name) + ") " + db.quote_column(col.column_name)
else:
return db.quote_column(col.column_name)

param.bugs_columns = jx.select(bugs_columns, "column_name")
param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
param.bugs_columns_SQL = SQL(",\n".join(lower(c) for c in bugs_columns))
param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
{"exists": "m.bug_id"},
Expand Down Expand Up @@ -584,6 +587,7 @@ def get_new_activities(db, param):

output = db.query("""
SELECT
a.id,
a.bug_id,
UNIX_TIMESTAMP(bug_when)*1000 AS modified_ts,
lower(login_name) AS modified_by,
Expand Down
241 changes: 165 additions & 76 deletions bzETL/parse_bug_history.py

Large diffs are not rendered by default.

24 changes: 15 additions & 9 deletions bzETL/transform_bugzilla.py
Expand Up @@ -14,20 +14,23 @@
import re
from datetime import date

from mo_dots import unwraplist, listwrap
from mo_future import text_type, long

from jx_python import jx
from mo_dots import listwrap
from mo_future import text_type, long
from mo_json import json2value, value2json
from mo_logs import Log
from pyLibrary import convert
from pyLibrary.env import elasticsearch

USE_ATTACHMENTS_DOT = True
USE_ATTACHMENTS_DOT = True # REMOVE THIS, ASSUME False

DIFF_FIELDS = ["cf_user_story"]
MULTI_FIELDS = ["cc", "blocked", "dependson", "dupe_by", "dupe_of", "flags", "keywords", "bug_group", "see_also"]
NUMERIC_FIELDS=[ "blocked", "dependson", "dupe_by", "dupe_of",
NUMERIC_FIELDS=[
"blocked",
"dependson",
"dupe_by",
"dupe_of",
"votes",
"estimated_time",
"remaining_time",
Expand All @@ -36,6 +39,8 @@

]

NULL_VALUES = ['--', '---']

# Used to reformat incoming dates into the expected form.
# Example match: "2012/01/01 00:00:00.000"
DATE_PATTERN_STRICT = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3}")
Expand All @@ -44,7 +49,8 @@
DATE_PATTERN_RELAXED = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2}")


#WE ARE RENAMING THE ATTACHMENTS FIELDS TO CAUSE LESS PROBLEMS IN ES QUERIES
# WE ARE RENAMING THE ATTACHMENTS FIELDS TO CAUSE LESS PROBLEMS IN ES QUERIES
# TODO: REMOVE THIS OLD FORMAT
def rename_attachments(bug_version):
if bug_version.attachments == None: return bug_version
if not USE_ATTACHMENTS_DOT:
Expand Down Expand Up @@ -83,10 +89,10 @@ def normalize(bug, old_school=False):
bug.changes=json2value(json)
bug.changes = jx.sort(bug.changes, ["attach_id", "field_name"])

#bug IS CONVERTED TO A 'CLEAN' COPY
bug = elasticsearch.scrub(bug)
# bug.attachments = coalesce(bug.attachments, []) # ATTACHMENTS MUST EXIST

for k, v in list(bug.items()):
if v in NULL_VALUES:
bug[k] = None

for f in NUMERIC_FIELDS:
v = bug[f]
Expand Down
26 changes: 14 additions & 12 deletions tests/resources/config/test_one.json
Expand Up @@ -15,31 +15,33 @@
// 6810, //ok
// 9622, //minor email diff
// 10575, //ok
11040, //alias analysis problem
// 11040, //alias analysis problem
// 12911, //alias analysis problem
// 13534, // (REVIEW MOVES TO OTHER PERSON)
// 67742, //alias analysis problem
// 96421, //minor email diff
// 123203,//expiry only
// 178960,//minor email
// 248970, // another cutoff review request
// 367518,//ok
// 457765,//ok
// 458397,//minor email
// 471427,//component rename, changes not compact
// 471427,//minor email
// 544327,//extra history
// 547727,//extra history
// 643420,//ok
// 692436,//minor email
// 726635,//alias problem
// 813650,//ERROR in blocked
// 1165765 VERY LONG short_desc
// 1007019 does not have bug_status, or component, or product
// 372836 (REVIEW FLAGS TEST)
// 13534 (REVIEW MOVES TO OTHER PERSON)
// 393845 added blocking1.9+ twice
// 671185 *many* review requests
// 937428 whitespace after comma in user story, complex diff
// 248970 another cutoff review request
// 248971 another cutoff review request

// NOT VERIFIED
// 248971, // another cutoff review request
// 372836, // (REVIEW FLAGS TEST)
// 393845, // added blocking1.9+ twice
// 671185, // *many* review requests
937428, // whitespace after comma in user story, complex diff
// 1007019, // does not have bug_status, or component, or product
// 1165765, // VERY LONG short_desc
],
"temp_dir": "tests/resources",
"errors": "tests/results/errors",
Expand All @@ -62,7 +64,7 @@
},
"reference": {
"filename": "tests/resources/reference/public_bugs.json",
"max_timestamp": 1372867005000
"max_timestamp": 1372867005000 // MAX TIME IN THE REFERENCE
},
"bugzilla": {
"$ref": "file://~/private.json#bugzilla-dev",
Expand Down
31 changes: 0 additions & 31 deletions tests/resources/mySQL/README.md

This file was deleted.

Binary file removed tests/resources/mySQL/timezone_2011n_posix.zip
Binary file not shown.
1 change: 1 addition & 0 deletions tests/util/compare_es.py
Expand Up @@ -14,6 +14,7 @@

from datetime import datetime

from mo_future import long
from mo_logs import Log

import jx_elasticsearch
Expand Down
2 changes: 1 addition & 1 deletion vendor/jx_elasticsearch/es14/expressions.py
Expand Up @@ -44,7 +44,7 @@


class Ruby(Expression):
__slots__ = ("miss", "type", "expr", "many")
__slots__ = ("miss", "data_type", "expr", "many")

def __init__(self, type, expr, frum, miss=None, many=False):
self.miss = coalesce(miss, FALSE) # Expression that will return true/false to indicate missing result
Expand Down
127 changes: 90 additions & 37 deletions vendor/mo_logs/strings.py
Expand Up @@ -20,11 +20,12 @@
from collections import Mapping
from datetime import datetime as builtin_datetime
from datetime import timedelta, date
from itertools import zip_longest
from json.encoder import encode_basestring

import sys

from mo_dots import coalesce, wrap, get_module
from mo_dots import coalesce, wrap, get_module, Data
from mo_future import text_type, xrange, binary_type, round as _round, PY3, get_function_name
from mo_logs.convert import datetime2unix, datetime2string, value2json, milli2datetime, unix2datetime
from mo_logs.url import value2url_param
Expand Down Expand Up @@ -729,7 +730,7 @@ def edit_distance(s1, s2):
DIFF_PREFIX = re.compile(r"@@ -(\d+(?:\s*,\d+)?) \+(\d+(?:\s*,\d+)?) @@")


def apply_diff(text, diff, reverse=False):
def apply_diff(text, diff, reverse=False, verify=True):
"""
SOME EXAMPLES OF diff
#@@ -1 +1 @@
Expand All @@ -750,45 +751,97 @@ def apply_diff(text, diff, reverse=False):
+
+Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40
"""

output = text
if not diff:
return text
if diff[0].strip() == "":
return text
return output

matches = DIFF_PREFIX.match(diff[0].strip())
if not matches:
if not _Log:
_late_import()
start_of_hunk = 0
while True:
if start_of_hunk>=len(diff):
break
header = diff[start_of_hunk]
start_of_hunk += 1
if not header.strip():
continue

_Log.error("Can not handle {{diff}}\n", diff= diff[0])

remove = [int(i.strip()) for i in matches.group(1).split(",")]
if len(remove) == 1:
remove = [remove[0], 1] # DEFAULT 1
add = [int(i.strip()) for i in matches.group(2).split(",")]
if len(add) == 1:
add = [add[0], 1]

# UNUSUAL CASE WHERE @@ -x +x, n @@ AND FIRST LINE HAS NOT CHANGED
half = int(len(diff[1]) / 2)
first_half = diff[1][:half]
last_half = diff[1][half:half * 2]
if remove[1] == 1 and add[0] == remove[0] and first_half[1:] == last_half[1:]:
diff[1] = first_half
diff.insert(2, last_half)

if not reverse:
if remove[1] != 0:
text = text[:remove[0] - 1] + text[remove[0] + remove[1] - 1:]
text = text[:add[0] - 1] + [d[1:] for d in diff[1 + remove[1]:1 + remove[1] + add[1]]] + text[add[0] - 1:]
text = apply_diff(text, diff[add[1] + remove[1] + 1:], reverse=reverse)
else:
text = apply_diff(text, diff[add[1] + remove[1] + 1:], reverse=reverse)
if add[1] != 0:
text = text[:add[0] - 1] + text[add[0] + add[1] - 1:]
text = text[:remove[0] - 1] + [d[1:] for d in diff[1:1 + remove[1]]] + text[remove[0] - 1:]
matches = DIFF_PREFIX.match(header.strip())
if not matches:
if not _Log:
_late_import()

_Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff)

remove = tuple(int(i.strip()) for i in matches.group(1).split(",")) # EXPECTING start_line, length TO REMOVE
remove = Data(start=remove[0], length=1 if len(remove) == 1 else remove[1]) # ASSUME FIRST LINE
add = tuple(int(i.strip()) for i in matches.group(2).split(",")) # EXPECTING start_line, length TO ADD
add = Data(start=add[0], length=1 if len(add) == 1 else add[1])

return text
if remove.start == 0 and remove.length == 0:
remove.start = add.start
if add.start == 0 and add.length == 0:
add.start = remove.start

if remove.start != add.start:
if not _Log:
_late_import()
_Log.warning("Do not know how to handle")

def repair_hunk(diff):
# THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST
# ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE
# EXAMPLE: -kward has the details.+kward has the details.
# DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF
problem_line = diff[start_of_hunk + remove.length - 1]
if reverse:
if add.length == 0:
return diff
first_added_line = output[add.start - 1]
if problem_line.endswith('+' + first_added_line):
split_point = len(problem_line) - len(first_added_line) - 1
else:
return diff
else:
if remove.length == 0:
return diff
last_removed_line = output[remove.start - 1]
if problem_line.startswith('-' + last_removed_line + "+"):
split_point = len(last_removed_line) + 1
else:
return diff

new_diff = (
diff[:start_of_hunk + remove.length - 1] +
[problem_line[:split_point], problem_line[split_point:]] +
diff[start_of_hunk + remove.length:]
)
return new_diff

diff = repair_hunk(diff)
if reverse:
new_output = (
output[:add.start - 1] +
[d[1:] for d in diff[start_of_hunk:start_of_hunk + remove.length]] +
output[add.start + add.length - 1:]
)
else:
# APPLYING DIFF FORWARD REQUIRES WE APPLY THE HUNKS IN REVERSE TO GET THE LINE NUMBERS RIGHT?
new_output = (
output[:remove.start-1] +
[d[1:] for d in diff[start_of_hunk + remove.length :start_of_hunk + remove.length + add.length ]] +
output[remove.start + remove.length - 1:]
)
start_of_hunk += remove.length + add.length
output = new_output

if verify:
original = apply_diff(output, diff, not reverse, False)
if any(t!=o for t, o in zip_longest(text, original)):
if not _Log:
_late_import()
_Log.error("logical verification check failed")

return output


def unicode2utf8(value):
Expand Down
3 changes: 2 additions & 1 deletion vendor/pyLibrary/sql/mysql.py
Expand Up @@ -279,7 +279,8 @@ def query(self, sql, param=None, stream=False, row_tuples=False):

return result
except Exception as e:
if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
e = Except.wrap(e)
if "InterfaceError" in e:
Log.error("Did you close the db connection?", e)
Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

Expand Down

0 comments on commit 103f251

Please sign in to comment.