diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py
index 3703cc1658..eb1a66a84f 100644
--- a/bugbug/models/__init__.py
+++ b/bugbug/models/__init__.py
@@ -38,6 +38,7 @@
     "uplift": "bugbug.models.uplift.UpliftModel",
     "worksforme": "bugbug.models.worksforme.WorksForMeModel",
     "fenixcomponent": "bugbug.models.fenixcomponent.FenixComponentModel",
+    "componentspecific": "bugbug.models.component_specific.ComponentSpecificModel",
 }
 
 
diff --git a/bugbug/models/component_specific.py b/bugbug/models/component_specific.py
new file mode 100644
index 0000000000..f604310f59
--- /dev/null
+++ b/bugbug/models/component_specific.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import logging
+from datetime import datetime, timezone
+
+import dateutil.parser
+import xgboost
+from dateutil.relativedelta import relativedelta
+from sklearn.compose import ColumnTransformer
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+
+from bugbug import bug_features, bugzilla, feature_cleanup, utils
+from bugbug.model import BugModel
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class ComponentSpecificModel(BugModel):
+    """Binary classifier for bugs moved out of, or into, one product::component.
+
+    Label 1 marks a bug whose most recent such move took it out of
+    ``product::component``; label 0 marks a bug whose most recent such move
+    brought it into ``product::component``.
+    """
+
+    def __init__(self, lemmatization=False, product="Firefox", component="General"):
+        """Set up the feature extraction pipeline and the classifier.
+
+        Args:
+            lemmatization: Forwarded unchanged to ``BugModel.__init__``.
+            product: Product that owns the component the model targets.
+            component: Name of the component the model targets.
+        """
+        BugModel.__init__(self, lemmatization)
+
+        self.product = product
+        self.component = component
+
+        feature_extractors = [
+            bug_features.HasSTR(),
+            bug_features.Severity(),
+            bug_features.Keywords(),
+            bug_features.HasCrashSignature(),
+            bug_features.HasURL(),
+            bug_features.HasW3CURL(),
+            bug_features.HasGithubURL(),
+            bug_features.Whiteboard(),
+            bug_features.Patches(),
+            bug_features.Landings(),
+        ]
+
+        cleanup_functions = [
+            feature_cleanup.fileref(),
+            feature_cleanup.url(),
+            feature_cleanup.synonyms(),
+        ]
+
+        self.extraction_pipeline = Pipeline(
+            [
+                (
+                    "bug_extractor",
+                    bug_features.BugExtractor(
+                        feature_extractors, cleanup_functions, rollback=True
+                    ),
+                ),
+            ]
+        )
+
+        self.clf = Pipeline(
+            [
+                (
+                    "union",
+                    ColumnTransformer(
+                        [
+                            ("data", DictVectorizer(), "data"),
+                            ("title", self.text_vectorizer(min_df=0.0001), "title"),
+                            (
+                                "comments",
+                                self.text_vectorizer(min_df=0.0001),
+                                "comments",
+                            ),
+                        ]
+                    ),
+                ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),
+                ),
+            ]
+        )
+
+    def _get_move_label(self, bug_data):
+        """Classify the latest move of a bug relative to the target component.
+
+        The history is walked from the newest entry to the oldest, starting
+        from the bug's current product and component, so the value of a field
+        that an entry left untouched is still known. A move is therefore
+        detected even when only the product or only the component changed.
+
+        Args:
+            bug_data: Bug dictionary providing "product", "component"
+                and "history" (entries that each hold a "changes" list).
+
+        Returns:
+            1 if the most recent move took the bug out of
+            ``self.product::self.component``, 0 if it brought the bug into
+            it, None if the history holds no such move.
+        """
+        target = (self.product, self.component)
+        location = {
+            "product": bug_data["product"],
+            "component": bug_data["component"],
+        }
+
+        for history in reversed(bug_data["history"]):
+            after = dict(location)
+            before = dict(location)
+            for change in history["changes"]:
+                field = change["field_name"]
+                if field in location:
+                    after[field] = change["added"]
+                    before[field] = change["removed"]
+
+            # Entries are visited newest first, so the state before this
+            # entry is the state after the next entry to be examined.
+            location = before
+
+            was_target = (before["product"], before["component"]) == target
+            is_target = (after["product"], after["component"]) == target
+            if was_target != is_target:
+                return 1 if was_target else 0
+
+        return None
+
+    def get_labels(self):
+        """Build the training labels from ``bugzilla.get_bugs()``.
+
+        Bugs created more than three years ago are skipped; the others are
+        labelled only if their history shows a move out of or into
+        ``self.product::self.component``.
+
+        Returns:
+            A tuple ``(classes, [0, 1])`` where ``classes`` maps a bug ID to 1
+            (moved out) or 0 (moved in).
+        """
+        classes = {}
+
+        # The cutoff is the same for every bug: compute it only once.
+        cutoff = datetime.now(timezone.utc) - relativedelta(years=3)
+
+        for bug_data in bugzilla.get_bugs():
+            if dateutil.parser.parse(bug_data["creation_time"]) < cutoff:
+                continue
+
+            label = self._get_move_label(bug_data)
+            if label is not None:
+                classes[bug_data["id"]] = label
+
+        logger.info(
+            "%d bugs were moved out of %s::%s",
+            sum(label == 1 for label in classes.values()),
+            self.product,
+            self.component,
+        )
+        logger.info(
+            "%d bugs were moved in %s::%s",
+            sum(label == 0 for label in classes.values()),
+            self.product,
+            self.component,
+        )
+
+        return classes, [0, 1]
+
+    def get_feature_names(self):
+        """Return the feature names produced by the "union" step of ``self.clf``."""
+        return self.clf.named_steps["union"].get_feature_names_out()
diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py index a2024761cf..23ad1fd6dd 100644 --- a/http_service/bugbug_http/models.py +++ b/http_service/bugbug_http/models.py @@ -28,6 +28,7 @@ MODELS_NAMES = [ "defectenhancementtask", "component", + "componentspecific", "invalidcompatibilityreport", "needsdiagnosis", "regression", diff --git a/infra/data-pipeline.yml b/infra/data-pipeline.yml index a2c1df0140..f1b4024216 100644 --- a/infra/data-pipeline.yml +++ b/infra/data-pipeline.yml @@ -661,6 +661,41 @@ tasks: owner: bugbug-team@mozilla.com source: ${repository}/raw/master/data-pipeline.yml + - ID: train-componentspecific + created: { $fromNow: "" } + deadline: { $fromNow: "3 days" } + expires: { $fromNow: "1 year" } + provisionerId: proj-bugbug + workerType: compute-small + dependencies: + - bugs-retrieval + payload: + maxRunTime: 25200 + image: mozilla/bugbug-base:${version} + command: + - bugbug-train + - componentspecific + + artifacts: + public/componentspecificmodel.tar.zst: + path: /componentspecificmodel.tar.zst + type: file + public/metrics.json: + path: /metrics.json + type: file + + routes: + - notify.email.bugbug-team@mozilla.com.on-failed + - index.project.bugbug.train_componentspecific.${version} + - index.project.bugbug.train_componentspecific.per_version.${version}.${year}.${month}.${day}.${hour}.${minute}.${second} + - index.project.bugbug.train_componentspecific.per_date.${year}.${month}.${day}.${hour}.${minute}.${second}.${version} + - index.project.bugbug.train_componentspecific.latest + metadata: + name: bugbug train componentspecific model + description: bugbug train componentspecific model + owner: bugbug-team@mozilla.com + source: ${repository}/raw/master/data-pipeline.yml + - ID: train-defectenhancementtask created: { $fromNow: "" } deadline: { $fromNow: "3 days" } @@ -1374,6 +1409,7 @@ tasks: workerType: batch dependencies: - train-component + - train-componentspecific - train-defectenhancementtask - 
train-regression - train-regressor @@ -1416,6 +1452,7 @@ tasks: dependencies: - train-defectenhancementtask - train-component + - train-componentspecific - train-regression - train-stepstoreproduce - train-spambug