From 83e407580c35cb7391bd087ddb27dad7e56ed6b3 Mon Sep 17 00:00:00 2001 From: ayush-1506 Date: Wed, 8 May 2019 01:00:29 +0530 Subject: [PATCH 1/2] use re.compile for feature cleanups --- bugbug/feature_cleanup.py | 43 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/bugbug/feature_cleanup.py b/bugbug/feature_cleanup.py index 6519b7b6d0..dff893456a 100644 --- a/bugbug/feature_cleanup.py +++ b/bugbug/feature_cleanup.py @@ -7,28 +7,30 @@ def url(text): - text = re.sub( - r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+", - "__CODE_REFERENCE_URL__", - text, + pattern_reference_url = re.compile( + r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+" + ) + pattern_url = re.compile(r"http\S+") + return pattern_url.sub( + "__URL__", pattern_reference_url.sub("__CODE_REFERENCE_URL__", text) ) - return re.sub(r"http\S+", "__URL__", text) def fileref(text): - return re.sub( - r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b", - "__FILE_REFERENCE__", - text, + pattern = re.compile( + r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b" ) + return pattern.sub("__FILE_REFERENCE__", text) def responses(text): - return re.sub(">[^\n]+", " ", text) + pattern = re.compile(">[^\n]+") + return pattern.sub(" ", text) def hex(text): - return re.sub(r"\b0[xX][0-9a-fA-F]+\b", "__HEX_NUMBER__", text) + pattern = re.compile(r"\b0[xX][0-9a-fA-F]+\b") + return pattern.sub("__HEX_NUMBER__", text) FIREFOX_DLLS_MATCH = "|".join( @@ -131,8 +133,8 @@ def hex(text): def dll(text): - regex = fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b" - return re.sub(regex, "__DLL_NAME__", text) + regex = re.compile(fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b") + return regex.sub("__DLL_NAME__", text) def synonyms(text): @@ -159,19 +161,16 @@ def synonyms(text): ] for synonym_group, synonym_list in synonyms: - text = re.sub( - "|".join(fr"\b{synonym}\b" for synonym in synonym_list), - synonym_group, - text, - flags=re.IGNORECASE, + pattern = re.compile( + "|".join(fr"\b{synonym}\b" for synonym in synonym_list), flags=re.IGNORECASE ) + text = pattern.sub(synonym_group, text) return text def crash(text): - return re.sub( - r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b", - "__CRASH_STATS_LINK__", - text, + pattern = re.compile( + r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b" ) + return pattern.sub("__CRASH_STATS_LINK__", text) From ee0a53b32bf56e66f3834afe10c2f28de0a3ba19 Mon Sep 17 00:00:00 2001 From: ayush-1506 Date: Thu, 9 May 2019 13:24:58 +0100 Subject: [PATCH 2/2] make cleanup functions classes --- bugbug/feature_cleanup.py | 348 ++++++++++++++++++---------------- bugbug/models/assignee.py | 6 +- bugbug/models/backout.py | 6 +- bugbug/models/component.py | 6 +- bugbug/models/defect.py | 6 +- bugbug/models/devdocneeded.py | 6 +- bugbug/models/qaneeded.py | 6 +- bugbug/models/tracking.py | 12 +- bugbug/models/uplift.py | 6 +- tests/test_feature_cleanup.py | 14 +- 10 files changed, 219 insertions(+), 197 deletions(-) diff --git a/bugbug/feature_cleanup.py b/bugbug/feature_cleanup.py index dff893456a..e5cbbf40d2 100644 --- a/bugbug/feature_cleanup.py +++ b/bugbug/feature_cleanup.py @@ -6,171 +6,193 @@ import re -def url(text): - pattern_reference_url = re.compile( - r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+" - ) - pattern_url = re.compile(r"http\S+") - return pattern_url.sub( - "__URL__", pattern_reference_url.sub("__CODE_REFERENCE_URL__", text) - ) - - -def fileref(text): - pattern = re.compile( - r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b" - ) - return pattern.sub("__FILE_REFERENCE__", text) - - -def responses(text): - pattern = re.compile(">[^\n]+") - return pattern.sub(" ", text) - - -def hex(text): - pattern = re.compile(r"\b0[xX][0-9a-fA-F]+\b") - return pattern.sub("__HEX_NUMBER__", text) - - -FIREFOX_DLLS_MATCH = "|".join( - [ - "libmozwayland.so", - "libssl3.so", - "libnssdbm3.so", - "liblgpllibs.so", - "libmozavutil.so", - "libxul.so", - "libmozgtk.so", - "libnssckbi.so", - "libclearkey.dylib", - "libmozsqlite3.so", - "libplc4.so", - "libsmime3.so", - "libclearkey.so", - "libnssutil3.so", - "libnss3.so", - "libplds4.so", - "libfreeblpriv3.so", - "libsoftokn3.so", - "libmozgtk.so", - "libmozavcodec.so", - "libnspr4.so", - "IA2Marshal.dll", - "lgpllibs.dll", - "libEGL.dll", - "libGLESv2.dll", - "libmozsandbox.so", - "AccessibleHandler.dll", - "AccessibleMarshal.dll", - "api-ms-win-core-console-l1-1-0.dll", - "api-ms-win-core-datetime-l1-1-0.dll", - "api-ms-win-core-debug-l1-1-0.dll", - "api-ms-win-core-errorhandling-l1-1-0.dll", - "api-ms-win-core-file-l1-1-0.dll", - "api-ms-win-core-file-l1-2-0.dll", - "api-ms-win-core-file-l2-1-0.dll", - "api-ms-win-core-handle-l1-1-0.dll", - "api-ms-win-core-heap-l1-1-0.dll", - "api-ms-win-core-interlocked-l1-1-0.dll", - "api-ms-win-core-libraryloader-l1-1-0.dll", - "api-ms-win-core-localization-l1-2-0.dll", - "api-ms-win-core-memory-l1-1-0.dll", - "api-ms-win-core-namedpipe-l1-1-0.dll", - "api-ms-win-core-processenvironment-l1-1-0.dll", - "api-ms-win-core-processthreads-l1-1-0.dll", - "api-ms-win-core-processthreads-l1-1-1.dll", - "api-ms-win-core-profile-l1-1-0.dll", - "api-ms-win-core-rtlsupport-l1-1-0.dll", - "api-ms-win-core-string-l1-1-0.dll", - "api-ms-win-core-synch-l1-1-0.dll", - "api-ms-win-core-synch-l1-2-0.dll", - "api-ms-win-core-sysinfo-l1-1-0.dll", - "api-ms-win-core-timezone-l1-1-0.dll", - "api-ms-win-core-util-l1-1-0.dll", - "api-ms-win-crt-conio-l1-1-0.dll", - "api-ms-win-crt-convert-l1-1-0.dll", - "api-ms-win-crt-environment-l1-1-0.dll", - "api-ms-win-crt-filesystem-l1-1-0.dll", - "api-ms-win-crt-heap-l1-1-0.dll", - "api-ms-win-crt-locale-l1-1-0.dll", - "api-ms-win-crt-math-l1-1-0.dll", - "api-ms-win-crt-multibyte-l1-1-0.dll", - "api-ms-win-crt-private-l1-1-0.dll", - "api-ms-win-crt-process-l1-1-0.dll", - "api-ms-win-crt-runtime-l1-1-0.dll", - "api-ms-win-crt-stdio-l1-1-0.dll", - "api-ms-win-crt-string-l1-1-0.dll", - "api-ms-win-crt-time-l1-1-0.dll", - "api-ms-win-crt-utility-l1-1-0.dll", - "d3dcompiler_47.dll", - "freebl3.dll", - "mozavcodec.dll", - "mozavutil.dll", - "mozglue.dll", - "msvcp140.dll", - "nss3.dll", - "nssckbi.dll", - "nssdbm3.dll", - "qipcap64.dll", - "softokn3.dll", - "ucrtbase.dll", - "vcruntime140.dll", - "xul.dll", - "clearkey.dll", - "libfreebl3.dylib", - "liblgpllibs.dylib", - "libmozavcodec.dylib", - "libmozavutil.dylib", - "libmozglue.dylib", - "libnss3.dylib", - "libnssckbi.dylib", - "libnssdbm3.dylib", - "libplugin_child_interpose.dylib", - "libsoftokn3.dylib", - ] -).replace(".", r"\.") - - -def dll(text): - regex = re.compile(fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b") - return regex.sub("__DLL_NAME__", text) - - -def synonyms(text): - synonyms = [ - ("safemode", ["safemode", "safe mode"]), - ("str", ["str", "steps to reproduce", "repro steps"]), - ("uaf", ["uaf", "use after free", "use-after-free"]), - ("asan", ["asan", "address sanitizer", "addresssanitizer"]), - ( - "permafailure", +class url(object): + def __init__(self): + self.reference_url = re.compile( + r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+" + ) + self.url = re.compile(r"http\S+") + + def __call__(self, text): + return self.url.sub( + "__URL__", self.reference_url.sub("__CODE_REFERENCE_URL__", text) + ) + + +class fileref(object): + def __init__(self): + self.pattern = re.compile( + r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b" + ) + + def __call__(self, text): + return self.pattern.sub("__FILE_REFERENCE__", text) + + +class responses(object): + def __init__(self): + self.pattern = re.compile(">[^\n]+") + + def __call__(self, text): + return self.pattern.sub(" ", text) + + +class hex(object): + def __init__(self): + self.pattern = re.compile(r"\b0[xX][0-9a-fA-F]+\b") + + def __call__(self, text): + return self.pattern.sub("__HEX_NUMBER__", text) + + +class dll(object): + def __init__(self): + FIREFOX_DLLS_MATCH = "|".join( [ - "permafailure", - "permafailing", - "permafail", - "perma failure", - "perma failing", - "perma fail", - "perma-failure", - "perma-failing", - "perma-fail", - ], - ), - ("spec", ["spec", "specification"]), - ] - - for synonym_group, synonym_list in synonyms: - pattern = re.compile( - "|".join(fr"\b{synonym}\b" for synonym in synonym_list), flags=re.IGNORECASE + "libmozwayland.so", + "libssl3.so", + "libnssdbm3.so", + "liblgpllibs.so", + "libmozavutil.so", + "libxul.so", + "libmozgtk.so", + "libnssckbi.so", + "libclearkey.dylib", + "libmozsqlite3.so", + "libplc4.so", + "libsmime3.so", + "libclearkey.so", + "libnssutil3.so", + "libnss3.so", + "libplds4.so", + "libfreeblpriv3.so", + "libsoftokn3.so", + "libmozgtk.so", + "libmozavcodec.so", + "libnspr4.so", + "IA2Marshal.dll", + "lgpllibs.dll", + "libEGL.dll", + "libGLESv2.dll", + "libmozsandbox.so", + "AccessibleHandler.dll", + "AccessibleMarshal.dll", + "api-ms-win-core-console-l1-1-0.dll", + "api-ms-win-core-datetime-l1-1-0.dll", + "api-ms-win-core-debug-l1-1-0.dll", + "api-ms-win-core-errorhandling-l1-1-0.dll", + "api-ms-win-core-file-l1-1-0.dll", + "api-ms-win-core-file-l1-2-0.dll", + "api-ms-win-core-file-l2-1-0.dll", + "api-ms-win-core-handle-l1-1-0.dll", + "api-ms-win-core-heap-l1-1-0.dll", + "api-ms-win-core-interlocked-l1-1-0.dll", + "api-ms-win-core-libraryloader-l1-1-0.dll", + "api-ms-win-core-localization-l1-2-0.dll", + "api-ms-win-core-memory-l1-1-0.dll", + "api-ms-win-core-namedpipe-l1-1-0.dll", + "api-ms-win-core-processenvironment-l1-1-0.dll", + "api-ms-win-core-processthreads-l1-1-0.dll", + "api-ms-win-core-processthreads-l1-1-1.dll", + "api-ms-win-core-profile-l1-1-0.dll", + "api-ms-win-core-rtlsupport-l1-1-0.dll", + "api-ms-win-core-string-l1-1-0.dll", + "api-ms-win-core-synch-l1-1-0.dll", + "api-ms-win-core-synch-l1-2-0.dll", + "api-ms-win-core-sysinfo-l1-1-0.dll", + "api-ms-win-core-timezone-l1-1-0.dll", + "api-ms-win-core-util-l1-1-0.dll", + "api-ms-win-crt-conio-l1-1-0.dll", + "api-ms-win-crt-convert-l1-1-0.dll", + "api-ms-win-crt-environment-l1-1-0.dll", + "api-ms-win-crt-filesystem-l1-1-0.dll", + "api-ms-win-crt-heap-l1-1-0.dll", + "api-ms-win-crt-locale-l1-1-0.dll", + "api-ms-win-crt-math-l1-1-0.dll", + "api-ms-win-crt-multibyte-l1-1-0.dll", + "api-ms-win-crt-private-l1-1-0.dll", + "api-ms-win-crt-process-l1-1-0.dll", + "api-ms-win-crt-runtime-l1-1-0.dll", + "api-ms-win-crt-stdio-l1-1-0.dll", + "api-ms-win-crt-string-l1-1-0.dll", + "api-ms-win-crt-time-l1-1-0.dll", + "api-ms-win-crt-utility-l1-1-0.dll", + "d3dcompiler_47.dll", + "freebl3.dll", + "mozavcodec.dll", + "mozavutil.dll", + "mozglue.dll", + "msvcp140.dll", + "nss3.dll", + "nssckbi.dll", + "nssdbm3.dll", + "qipcap64.dll", + "softokn3.dll", + "ucrtbase.dll", + "vcruntime140.dll", + "xul.dll", + "clearkey.dll", + "libfreebl3.dylib", + "liblgpllibs.dylib", + "libmozavcodec.dylib", + "libmozavutil.dylib", + "libmozglue.dylib", + "libnss3.dylib", + "libnssckbi.dylib", + "libnssdbm3.dylib", + "libplugin_child_interpose.dylib", + "libsoftokn3.dylib", + ] + ).replace(".", r"\.") + self.pattern = re.compile( + fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b" ) - text = pattern.sub(synonym_group, text) - return text + def __call__(self, text): + return self.pattern.sub("__DLL_NAME__", text) + +class synonyms(object): + def __init__(self): + synonyms = [ + ("safemode", ["safemode", "safe mode"]), + ("str", ["str", "steps to reproduce", "repro steps"]), + ("uaf", ["uaf", "use after free", "use-after-free"]), + ("asan", ["asan", "address sanitizer", "addresssanitizer"]), + ( + "permafailure", + [ + "permafailure", + "permafailing", + "permafail", + "perma failure", + "perma failing", + "perma fail", + "perma-failure", + "perma-failing", + "perma-fail", + ], + ), + ("spec", ["spec", "specification"]), + ] + self.pattern = {} + for synonym_group, synonym_list in synonyms: + self.pattern[synonym_group] = re.compile( + "|".join(fr"\b{synonym}\b" for synonym in synonym_list), + flags=re.IGNORECASE, + ) + + def __call__(self, text): + for synonym_group in self.pattern: + text = self.pattern[synonym_group].sub(synonym_group, text) + return text + + +class crash(object): + def __init__(self): + self.pattern = re.compile( + r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b" + ) -def crash(text): - pattern = re.compile( - r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b" - ) - return pattern.sub("__CRASH_STATS_LINK__", text) + def __call__(self, text): + return self.pattern.sub("__CRASH_STATS_LINK__", text) diff --git a/bugbug/models/assignee.py b/bugbug/models/assignee.py index 3f3c62b038..54ef900461 100644 --- a/bugbug/models/assignee.py +++ b/bugbug/models/assignee.py @@ -48,9 +48,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/backout.py b/bugbug/models/backout.py index f75962aa4a..dbf04e040c 100644 --- a/bugbug/models/backout.py +++ b/bugbug/models/backout.py @@ -41,9 +41,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/component.py b/bugbug/models/component.py index 46cf9052ee..38a37db23b 100644 --- a/bugbug/models/component.py +++ b/bugbug/models/component.py @@ -77,9 +77,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/defect.py b/bugbug/models/defect.py index a156df59cd..6d5726bca6 100644 --- a/bugbug/models/defect.py +++ b/bugbug/models/defect.py @@ -47,9 +47,9 @@ def __init__(self, lemmatization=False, historical=False): feature_extractors.append(bug_features.had_severity_enhancement()) cleanup_functions = [ - feature_cleanup.url, - feature_cleanup.fileref, - feature_cleanup.synonyms, + feature_cleanup.url(), + feature_cleanup.fileref(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/devdocneeded.py b/bugbug/models/devdocneeded.py index 13d77f71ae..d6fde01343 100644 --- a/bugbug/models/devdocneeded.py +++ b/bugbug/models/devdocneeded.py @@ -41,9 +41,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py index 1a3098e469..2bbf12ca46 100644 --- a/bugbug/models/qaneeded.py +++ b/bugbug/models/qaneeded.py @@ -36,9 +36,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py index d35d292f31..9af7230be7 100644 --- a/bugbug/models/tracking.py +++ b/bugbug/models/tracking.py @@ -47,12 +47,12 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.url, - feature_cleanup.fileref, - feature_cleanup.hex, - feature_cleanup.dll, - feature_cleanup.synonyms, - feature_cleanup.crash, + feature_cleanup.url(), + feature_cleanup.fileref(), + feature_cleanup.hex(), + feature_cleanup.dll(), + feature_cleanup.synonyms(), + feature_cleanup.crash(), ] self.extraction_pipeline = Pipeline( diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py index 984fde5f44..0dc7d0086f 100644 --- a/bugbug/models/uplift.py +++ b/bugbug/models/uplift.py @@ -36,9 +36,9 @@ def __init__(self, lemmatization=False): ] cleanup_functions = [ - feature_cleanup.fileref, - feature_cleanup.url, - feature_cleanup.synonyms, + feature_cleanup.fileref(), + feature_cleanup.url(), + feature_cleanup.synonyms(), ] self.extraction_pipeline = Pipeline( diff --git a/tests/test_feature_cleanup.py b/tests/test_feature_cleanup.py index f4791747eb..edfbe593c0 100644 --- a/tests/test_feature_cleanup.py +++ b/tests/test_feature_cleanup.py @@ -26,7 +26,7 @@ def test_url(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.url(orig_text) == cleaned_text + assert feature_cleanup.url()(orig_text) == cleaned_text def test_fileref(): @@ -37,7 +37,7 @@ def test_fileref(): ) ] for orig_text, cleaned_text in tests: - assert feature_cleanup.fileref(orig_text) == cleaned_text + assert feature_cleanup.fileref()(orig_text) == cleaned_text def test_responses(): @@ -57,7 +57,7 @@ def test_responses(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.responses(orig_text) == cleaned_text + assert feature_cleanup.responses()(orig_text) == cleaned_text def test_hex(): @@ -72,7 +72,7 @@ def test_hex(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.hex(orig_text) == cleaned_text + assert feature_cleanup.hex()(orig_text) == cleaned_text def test_dll(): @@ -100,7 +100,7 @@ def test_dll(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.dll(orig_text) == cleaned_text + assert feature_cleanup.dll()(orig_text) == cleaned_text def test_synonyms(): @@ -118,7 +118,7 @@ def test_synonyms(): ("found via address sanitizer or asan", "found via asan or asan"), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.synonyms(orig_text) == cleaned_text + assert feature_cleanup.synonyms()(orig_text) == cleaned_text def test_crash(): @@ -133,4 +133,4 @@ def test_crash(): ), ] for orig_text, cleaned_text in tests: - assert feature_cleanup.crash(orig_text) == cleaned_text + assert feature_cleanup.crash()(orig_text) == cleaned_text