Use ColumnTransformer instead of FeatureUnion

Fixes marco-c#72.
marxmit7 committed on Jan 17, 2019 · commit 6faa0f2
1 parent 3d6672c
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 147 deletions.
diff --git a/bugbug/models/bug.py b/bugbug/models/bug.py
@@ -4,15 +4,14 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug import labels
 from bugbug.model import Model
-from bugbug.utils import DictSelector
 
 
 class BugModel(Model):
@@ -42,36 +41,17 @@ def __init__(self, lemmatization=False):
             bug_features.cleanup_synonyms,
         ]
 
-        self.data_vectorizer = DictVectorizer()
-        self.title_vectorizer = self.text_vectorizer(stop_words='english')
-        self.first_comment_vectorizer = self.text_vectorizer(stop_words='english')
-        self.comments_vectorizer = self.text_vectorizer(stop_words='english')
-
         self.extraction_pipeline = Pipeline([
             ('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
-            ('union', FeatureUnion(
-                transformer_list=[
-                    ('data', Pipeline([
-                        ('selector', DictSelector(key='data')),
-                        ('vect', self.data_vectorizer),
-                    ])),
-
-                    ('title', Pipeline([
-                        ('selector', DictSelector(key='title')),
-                        ('tfidf', self.title_vectorizer),
-                    ])),
-
-                    ('first_comment', Pipeline([
-                        ('selector', DictSelector(key='first_comment')),
-                        ('tfidf', self.first_comment_vectorizer),
-                    ])),
-
-                    ('comments', Pipeline([
-                        ('selector', DictSelector(key='comments')),
-                        ('tfidf', self.comments_vectorizer),
-                    ])),
-                ],
-            )),
+            ('union', ColumnTransformer([
+                ('data', DictVectorizer(), 'data'),
+
+                ('title', self.text_vectorizer(stop_words='english'), 'title'),
+
+                ('first_comment', self.text_vectorizer(stop_words='english'), 'first_comment'),
+
+                ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
+            ])),
         ])
 
         self.clf = xgboost.XGBClassifier(n_jobs=16)
@@ -127,10 +107,7 @@ def get_labels(self):
         return self.get_bugbug_labels('bug')
 
     def get_feature_names(self):
-        return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
-               ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
-               ['first_comment_' + name for name in self.first_comment_vectorizer.get_feature_names()] +\
-               ['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
+        return self.extraction_pipeline.named_steps['union'].get_feature_names()
 
     def overwrite_classes(self, bugs, classes, probabilities):
         for i, bug in enumerate(bugs):

diff --git a/bugbug/models/component.py b/bugbug/models/component.py
@@ -6,13 +6,12 @@
 from collections import Counter
 
 import xgboost
-from sklearn.pipeline import FeatureUnion
+from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug.model import Model
-from bugbug.utils import DictSelector
 
 
 class ComponentModel(Model):
@@ -43,36 +42,19 @@ def __init__(self, lemmatization=False):
             bug_features.cleanup_synonyms,
         ]
 
-        self.title_vectorizer = self.text_vectorizer(stop_words='english')
-        self.first_comment_vectorizer = self.text_vectorizer(stop_words='english')
-
         self.extraction_pipeline = Pipeline([
             ('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
-            ('union', FeatureUnion(
-                transformer_list=[
-                    # TODO: Re-enable when we'll support bug snapshotting (#5).
-                    # ('data', Pipeline([
-                    #     ('selector', DictSelector(key='data')),
-                    #     ('vect', self.data_vectorizer),
-                    # ])),
-
-                    ('title', Pipeline([
-                        ('selector', DictSelector(key='title')),
-                        ('tfidf', self.title_vectorizer),
-                    ])),
-
-                    # TODO: Re-enable when we'll support bug snapshotting (#5).
-                    # ('comments', Pipeline([
-                    #     ('selector', DictSelector(key='comments')),
-                    #     ('tfidf', self.comments_vectorizer),
-                    # ])),
-
-                    ('first_comment', Pipeline([
-                        ('selector', DictSelector(key='first_comment')),
-                        ('tfidf', self.first_comment_vectorizer),
-                    ])),
-                ],
-            )),
+            ('union', ColumnTransformer([
+                # TODO: Re-enable when we'll support bug snapshotting (#5).
+                # ('data', DictVectorizer(), 'data'),
+
+                ('title', self.text_vectorizer(stop_words='english'), 'title'),
+
+                # TODO: Re-enable when we'll support bug snapshotting (#5).
+                # ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
+
+                ('first_comment', self.text_vectorizer(stop_words='english'), 'first_comment'),
+            ])),
         ])
 
         self.clf = xgboost.XGBClassifier(n_jobs=16)
@@ -169,5 +151,4 @@ def get_labels(self):
         return {bug_id: component for bug_id, component in classes.items() if component in top_components}
 
     def get_feature_names(self):
-        return ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
-               ['first_comment_' + name for name in self.first_comment_vectorizer.get_feature_names()]
+        return self.extraction_pipeline.named_steps['union'].get_feature_names()
diff --git a/bugbug/models/qaneeded.py b/bugbug/models/qaneeded.py
@@ -4,14 +4,13 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug.model import Model
-from bugbug.utils import DictSelector
 
 
 class QANeededModel(Model):
@@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
             bug_features.cleanup_synonyms,
         ]
 
-        self.data_vectorizer = DictVectorizer()
-        self.title_vectorizer = self.text_vectorizer(stop_words='english')
-        self.comments_vectorizer = self.text_vectorizer(stop_words='english')
-
         self.extraction_pipeline = Pipeline([
             ('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
-            ('union', FeatureUnion(
-                transformer_list=[
-                    ('data', Pipeline([
-                        ('selector', DictSelector(key='data')),
-                        ('vect', self.data_vectorizer),
-                    ])),
-
-                    ('title', Pipeline([
-                        ('selector', DictSelector(key='title')),
-                        ('tfidf', self.title_vectorizer),
-                    ])),
-
-                    ('comments', Pipeline([
-                        ('selector', DictSelector(key='comments')),
-                        ('tfidf', self.comments_vectorizer),
-                    ])),
-                ],
-            )),
+            ('union', ColumnTransformer([
+                ('data', DictVectorizer(), 'data'),
+
+                ('title', self.text_vectorizer(stop_words='english'), 'title'),
+
+                ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
+            ])),
         ])
 
         self.clf = xgboost.XGBClassifier(n_jobs=16)
@@ -90,6 +74,4 @@ def get_labels(self):
         return classes
 
     def get_feature_names(self):
-        return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
-               ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
-               ['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
+        return self.extraction_pipeline.named_steps['union'].get_feature_names()
diff --git a/bugbug/models/tracking.py b/bugbug/models/tracking.py
@@ -4,14 +4,13 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug.model import Model
-from bugbug.utils import DictSelector
 
 
 class TrackingModel(Model):
@@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
             bug_features.cleanup_synonyms,
         ]
 
-        self.data_vectorizer = DictVectorizer()
-        self.title_vectorizer = self.text_vectorizer(stop_words='english')
-        self.comments_vectorizer = self.text_vectorizer(stop_words='english')
-
         self.extraction_pipeline = Pipeline([
             ('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions, rollback=True, rollback_when=self.rollback)),
-            ('union', FeatureUnion(
-                transformer_list=[
-                    ('data', Pipeline([
-                        ('selector', DictSelector(key='data')),
-                        ('vect', self.data_vectorizer),
-                    ])),
-
-                    ('title', Pipeline([
-                        ('selector', DictSelector(key='title')),
-                        ('tfidf', self.title_vectorizer),
-                    ])),
-
-                    ('comments', Pipeline([
-                        ('selector', DictSelector(key='comments')),
-                        ('tfidf', self.comments_vectorizer),
-                    ])),
-                ],
-            )),
+            ('union', ColumnTransformer([
+                ('data', DictVectorizer(), 'data'),
+
+                ('title', self.text_vectorizer(stop_words='english'), 'title'),
+
+                ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
+            ])),
         ])
 
         self.clf = xgboost.XGBClassifier(n_jobs=16)
@@ -92,6 +76,4 @@ def get_labels(self):
         return classes
 
     def get_feature_names(self):
-        return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
-               ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
-               ['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
+        return self.extraction_pipeline.named_steps['union'].get_feature_names()
diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py
@@ -4,14 +4,13 @@
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import xgboost
+from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
 
 from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug.model import Model
-from bugbug.utils import DictSelector
 
 
 class UpliftModel(Model):
@@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
             bug_features.cleanup_synonyms,
         ]
 
-        self.data_vectorizer = DictVectorizer()
-        self.title_vectorizer = self.text_vectorizer(stop_words='english')
-        self.comments_vectorizer = self.text_vectorizer(stop_words='english')
-
         self.extraction_pipeline = Pipeline([
             ('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
-            ('union', FeatureUnion(
-                transformer_list=[
-                    ('data', Pipeline([
-                        ('selector', DictSelector(key='data')),
-                        ('vect', self.data_vectorizer),
-                    ])),
-
-                    ('title', Pipeline([
-                        ('selector', DictSelector(key='title')),
-                        ('tfidf', self.title_vectorizer),
-                    ])),
-
-                    ('comments', Pipeline([
-                        ('selector', DictSelector(key='comments')),
-                        ('tfidf', self.comments_vectorizer),
-                    ])),
-                ],
-            )),
+            ('union', ColumnTransformer([
+                ('data', DictVectorizer(), 'data'),
+
+                ('title', self.text_vectorizer(stop_words='english'), 'title'),
+
+                ('comments', self.text_vectorizer(stop_words='english'), 'comments'),
+            ])),
         ])
 
         self.clf = xgboost.XGBClassifier(n_jobs=16)
@@ -88,6 +72,4 @@ def get_labels(self):
         return classes
 
     def get_feature_names(self):
-        return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
-               ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
-               ['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
+        return self.extraction_pipeline.named_steps['union'].get_feature_names()