Skip to content

Commit

Permalink
fix user_story diff parsing, PY3 updates, code formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
Kyle Lahnakoski committed Mar 31, 2018
1 parent ba3c2e9 commit 103f251
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 174 deletions.
7 changes: 4 additions & 3 deletions bzETL/bz_etl.py
Expand Up @@ -106,12 +106,13 @@ def get_records_from_bugzilla(db, param, please_stop):
sorted = jx.sort(db_results, [
"bug_id",
"_merge_order",
{"field": "modified_ts", "sort": -1},
"modified_by"
{"modified_ts": "desc"},
"modified_by",
{"id": "desc"}
])

process = BugHistoryParser(param, alias_config, output_queue)
for s in sorted:
for i, s in enumerate(sorted):
process.processRow(s)
process.processRow(wrap({"bug_id": parse_bug_history.STOP_BUG, "_merge_order": 1}))

Expand Down
10 changes: 7 additions & 3 deletions bzETL/extract_bugzilla.py
Expand Up @@ -12,6 +12,7 @@
from __future__ import unicode_literals

from bzETL.parse_bug_history import MAX_TIME
from bzETL.transform_bugzilla import NUMERIC_FIELDS
from jx_python import jx
from mo_dots.datas import Data
from mo_logs import Log
Expand Down Expand Up @@ -104,6 +105,8 @@ def get_screened_whiteboard(db):


def get_bugs_table_columns(db, schema_name):
global bugs_columns

if not bugs_columns:
columns = db.query("""
SELECT
Expand Down Expand Up @@ -134,7 +137,7 @@ def get_bugs_table_columns(db, schema_name):
)
""", {"schema": schema_name})
globals()["bugs_columns"] = columns
bugs_columns = columns


def get_private_bugs_for_delete(db, param):
Expand Down Expand Up @@ -231,13 +234,13 @@ def get_bugs(db, param):

#TODO: CF_LAST_RESOLVED IS IN PDT, FIX IT
def lower(col):
if col.column_type.startswith("varchar"):
if col.column_type.startswith("varchar") or col.column_type.endswith('text'):
return "lower(" + db.quote_column(col.column_name) + ") " + db.quote_column(col.column_name)
else:
return db.quote_column(col.column_name)

param.bugs_columns = jx.select(bugs_columns, "column_name")
param.bugs_columns_SQL = SQL(",\n".join([lower(c) for c in bugs_columns]))
param.bugs_columns_SQL = SQL(",\n".join(lower(c) for c in bugs_columns))
param.bug_filter = esfilter2sqlwhere(db, {"terms": {"b.bug_id": param.bug_list}})
param.screened_whiteboard = esfilter2sqlwhere(db, {"and": [
{"exists": "m.bug_id"},
Expand Down Expand Up @@ -584,6 +587,7 @@ def get_new_activities(db, param):

output = db.query("""
SELECT
a.id,
a.bug_id,
UNIX_TIMESTAMP(bug_when)*1000 AS modified_ts,
lower(login_name) AS modified_by,
Expand Down
241 changes: 165 additions & 76 deletions bzETL/parse_bug_history.py

Large diffs are not rendered by default.

24 changes: 15 additions & 9 deletions bzETL/transform_bugzilla.py
Expand Up @@ -14,20 +14,23 @@
import re
from datetime import date

from mo_dots import unwraplist, listwrap
from mo_future import text_type, long

from jx_python import jx
from mo_dots import listwrap
from mo_future import text_type, long
from mo_json import json2value, value2json
from mo_logs import Log
from pyLibrary import convert
from pyLibrary.env import elasticsearch

USE_ATTACHMENTS_DOT = True
USE_ATTACHMENTS_DOT = True # REMOVE THIS, ASSUME False

DIFF_FIELDS = ["cf_user_story"]
MULTI_FIELDS = ["cc", "blocked", "dependson", "dupe_by", "dupe_of", "flags", "keywords", "bug_group", "see_also"]
NUMERIC_FIELDS=[ "blocked", "dependson", "dupe_by", "dupe_of",
NUMERIC_FIELDS=[
"blocked",
"dependson",
"dupe_by",
"dupe_of",
"votes",
"estimated_time",
"remaining_time",
Expand All @@ -36,6 +39,8 @@

]

NULL_VALUES = ['--', '---']

# Used to reformat incoming dates into the expected form.
# Example match: "2012/01/01 00:00:00.000"
DATE_PATTERN_STRICT = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{3}")
Expand All @@ -44,7 +49,8 @@
DATE_PATTERN_RELAXED = re.compile("^[0-9]{4}[\\/-][0-9]{2}[\\/-][0-9]{2}")


#WE ARE RENAMING THE ATTACHMENTS FIELDS TO CAUSE LESS PROBLEMS IN ES QUERIES
# WE ARE RENAMING THE ATTACHMENTS FIELDS TO CAUSE LESS PROBLEMS IN ES QUERIES
# TODO: REMOVE THIS OLD FORMAT
def rename_attachments(bug_version):
if bug_version.attachments == None: return bug_version
if not USE_ATTACHMENTS_DOT:
Expand Down Expand Up @@ -83,10 +89,10 @@ def normalize(bug, old_school=False):
bug.changes=json2value(json)
bug.changes = jx.sort(bug.changes, ["attach_id", "field_name"])

#bug IS CONVERTED TO A 'CLEAN' COPY
bug = elasticsearch.scrub(bug)
# bug.attachments = coalesce(bug.attachments, []) # ATTACHMENTS MUST EXIST

for k, v in list(bug.items()):
if v in NULL_VALUES:
bug[k] = None

for f in NUMERIC_FIELDS:
v = bug[f]
Expand Down
26 changes: 14 additions & 12 deletions tests/resources/config/test_one.json
Expand Up @@ -15,31 +15,33 @@
// 6810, //ok
// 9622, //minor email diff
// 10575, //ok
11040, //alias analysis problem
// 11040, //alias analysis problem
// 12911, //alias analysis problem
// 13534, // (REVIEW MOVES TO OTHER PERSON)
// 67742, //alias analysis problem
// 96421, //minor email diff
// 123203,//expiry only
// 178960,//minor email
// 248970, // another cutoff review request
// 367518,//ok
// 457765,//ok
// 458397,//minor email
// 471427,//component rename, changes not compact
// 471427,//minor email
// 544327,//extra history
// 547727,//extra history
// 643420,//ok
// 692436,//minor email
// 726635,//alias problem
// 813650,//ERROR in blocked
// 1165765 VERY LONG short_desc
// 1007019 does not have bug_status, or component, or product
// 372836 (REVIEW FLAGS TEST)
// 13534 (REVIEW MOVES TO OTHER PERSON)
// 393845 added blocking1.9+ twice
// 671185 *many* review requests
// 937428 whitespace after comma in user story, complex diff
// 248970 another cutoff review request
// 248971 another cutoff review request

// NOT VERIFIED
// 248971, // another cutoff review request
// 372836, // (REVIEW FLAGS TEST)
// 393845, // added blocking1.9+ twice
// 671185, // *many* review requests
937428, // whitespace after comma in user story, complex diff
// 1007019, // does not have bug_status, or component, or product
// 1165765, // VERY LONG short_desc
],
"temp_dir": "tests/resources",
"errors": "tests/results/errors",
Expand All @@ -62,7 +64,7 @@
},
"reference": {
"filename": "tests/resources/reference/public_bugs.json",
"max_timestamp": 1372867005000
"max_timestamp": 1372867005000 // MAX TIME IN THE REFERENCE
},
"bugzilla": {
"$ref": "file://~/private.json#bugzilla-dev",
Expand Down
31 changes: 0 additions & 31 deletions tests/resources/mySQL/README.md

This file was deleted.

Binary file removed tests/resources/mySQL/timezone_2011n_posix.zip
Binary file not shown.
1 change: 1 addition & 0 deletions tests/util/compare_es.py
Expand Up @@ -14,6 +14,7 @@

from datetime import datetime

from mo_future import long
from mo_logs import Log

import jx_elasticsearch
Expand Down
2 changes: 1 addition & 1 deletion vendor/jx_elasticsearch/es14/expressions.py
Expand Up @@ -44,7 +44,7 @@


class Ruby(Expression):
__slots__ = ("miss", "type", "expr", "many")
__slots__ = ("miss", "data_type", "expr", "many")

def __init__(self, type, expr, frum, miss=None, many=False):
self.miss = coalesce(miss, FALSE) # Expression that will return true/false to indicate missing result
Expand Down
127 changes: 90 additions & 37 deletions vendor/mo_logs/strings.py
Expand Up @@ -20,11 +20,12 @@
from collections import Mapping
from datetime import datetime as builtin_datetime
from datetime import timedelta, date
from itertools import zip_longest
from json.encoder import encode_basestring

import sys

from mo_dots import coalesce, wrap, get_module
from mo_dots import coalesce, wrap, get_module, Data
from mo_future import text_type, xrange, binary_type, round as _round, PY3, get_function_name
from mo_logs.convert import datetime2unix, datetime2string, value2json, milli2datetime, unix2datetime
from mo_logs.url import value2url_param
Expand Down Expand Up @@ -729,7 +730,7 @@ def edit_distance(s1, s2):
DIFF_PREFIX = re.compile(r"@@ -(\d+(?:\s*,\d+)?) \+(\d+(?:\s*,\d+)?) @@")


def apply_diff(text, diff, reverse=False):
def apply_diff(text, diff, reverse=False, verify=True):
"""
SOME EXAMPLES OF diff
#@@ -1 +1 @@
Expand All @@ -750,45 +751,97 @@ def apply_diff(text, diff, reverse=False):
+
+Content Team Engagement & Tasks : https://appreview.etherpad.mozilla.org/40
"""

output = text
if not diff:
return text
if diff[0].strip() == "":
return text
return output

matches = DIFF_PREFIX.match(diff[0].strip())
if not matches:
if not _Log:
_late_import()
start_of_hunk = 0
while True:
if start_of_hunk>=len(diff):
break
header = diff[start_of_hunk]
start_of_hunk += 1
if not header.strip():
continue

_Log.error("Can not handle {{diff}}\n", diff= diff[0])

remove = [int(i.strip()) for i in matches.group(1).split(",")]
if len(remove) == 1:
remove = [remove[0], 1] # DEFAULT 1
add = [int(i.strip()) for i in matches.group(2).split(",")]
if len(add) == 1:
add = [add[0], 1]

# UNUSUAL CASE WHERE @@ -x +x, n @@ AND FIRST LINE HAS NOT CHANGED
half = int(len(diff[1]) / 2)
first_half = diff[1][:half]
last_half = diff[1][half:half * 2]
if remove[1] == 1 and add[0] == remove[0] and first_half[1:] == last_half[1:]:
diff[1] = first_half
diff.insert(2, last_half)

if not reverse:
if remove[1] != 0:
text = text[:remove[0] - 1] + text[remove[0] + remove[1] - 1:]
text = text[:add[0] - 1] + [d[1:] for d in diff[1 + remove[1]:1 + remove[1] + add[1]]] + text[add[0] - 1:]
text = apply_diff(text, diff[add[1] + remove[1] + 1:], reverse=reverse)
else:
text = apply_diff(text, diff[add[1] + remove[1] + 1:], reverse=reverse)
if add[1] != 0:
text = text[:add[0] - 1] + text[add[0] + add[1] - 1:]
text = text[:remove[0] - 1] + [d[1:] for d in diff[1:1 + remove[1]]] + text[remove[0] - 1:]
matches = DIFF_PREFIX.match(header.strip())
if not matches:
if not _Log:
_late_import()

_Log.error("Can not handle \n---\n{{diff}}\n---\n", diff=diff)

remove = tuple(int(i.strip()) for i in matches.group(1).split(",")) # EXPECTING start_line, length TO REMOVE
remove = Data(start=remove[0], length=1 if len(remove) == 1 else remove[1]) # ASSUME FIRST LINE
add = tuple(int(i.strip()) for i in matches.group(2).split(",")) # EXPECTING start_line, length TO ADD
add = Data(start=add[0], length=1 if len(add) == 1 else add[1])

return text
if remove.start == 0 and remove.length == 0:
remove.start = add.start
if add.start == 0 and add.length == 0:
add.start = remove.start

if remove.start != add.start:
if not _Log:
_late_import()
_Log.warning("Do not know how to handle")

def repair_hunk(diff):
# THE LAST DELETED LINE MAY MISS A "\n" MEANING THE FIRST
# ADDED LINE WILL BE APPENDED TO THE LAST DELETED LINE
# EXAMPLE: -kward has the details.+kward has the details.
# DETECT THIS PROBLEM FOR THIS HUNK AND FIX THE DIFF
problem_line = diff[start_of_hunk + remove.length - 1]
if reverse:
if add.length == 0:
return diff
first_added_line = output[add.start - 1]
if problem_line.endswith('+' + first_added_line):
split_point = len(problem_line) - len(first_added_line) - 1
else:
return diff
else:
if remove.length == 0:
return diff
last_removed_line = output[remove.start - 1]
if problem_line.startswith('-' + last_removed_line + "+"):
split_point = len(last_removed_line) + 1
else:
return diff

new_diff = (
diff[:start_of_hunk + remove.length - 1] +
[problem_line[:split_point], problem_line[split_point:]] +
diff[start_of_hunk + remove.length:]
)
return new_diff

diff = repair_hunk(diff)
if reverse:
new_output = (
output[:add.start - 1] +
[d[1:] for d in diff[start_of_hunk:start_of_hunk + remove.length]] +
output[add.start + add.length - 1:]
)
else:
# APPLYING DIFF FORWARD REQUIRES WE APPLY THE HUNKS IN REVERSE TO GET THE LINE NUMBERS RIGHT?
new_output = (
output[:remove.start-1] +
[d[1:] for d in diff[start_of_hunk + remove.length :start_of_hunk + remove.length + add.length ]] +
output[remove.start + remove.length - 1:]
)
start_of_hunk += remove.length + add.length
output = new_output

if verify:
original = apply_diff(output, diff, not reverse, False)
if any(t!=o for t, o in zip_longest(text, original)):
if not _Log:
_late_import()
_Log.error("logical verification check failed")

return output


def unicode2utf8(value):
Expand Down
3 changes: 2 additions & 1 deletion vendor/pyLibrary/sql/mysql.py
Expand Up @@ -279,7 +279,8 @@ def query(self, sql, param=None, stream=False, row_tuples=False):

return result
except Exception as e:
if isinstance(e, InterfaceError) or e.message.find("InterfaceError") >= 0:
e = Except.wrap(e)
if "InterfaceError" in e:
Log.error("Did you close the db connection?", e)
Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

Expand Down

0 comments on commit 103f251

Please sign in to comment.