Skip to content

Commit

Permalink
Use ColumnTransformer instead of FeatureUnion
Browse files (browse the repository at this point in the history)
Fixes marco-c#72.
  • Loading branch information
marco-c committed Jan 17, 2019
1 parent 3d6672c commit 6faa0f2
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 147 deletions.
45 changes: 11 additions & 34 deletions bugbug/models/bug.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.

import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from bugbug import bug_features
from bugbug import bugzilla
from bugbug import labels
from bugbug.model import Model
from bugbug.utils import DictSelector


class BugModel(Model):
Expand Down Expand Up @@ -42,36 +41,17 @@ def __init__(self, lemmatization=False):
bug_features.cleanup_synonyms,
]

self.data_vectorizer = DictVectorizer()
self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.first_comment_vectorizer = self.text_vectorizer(stop_words='english')
self.comments_vectorizer = self.text_vectorizer(stop_words='english')

self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', DictSelector(key='data')),
('vect', self.data_vectorizer),
])),

('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),

('first_comment', Pipeline([
('selector', DictSelector(key='first_comment')),
('tfidf', self.first_comment_vectorizer),
])),

('comments', Pipeline([
('selector', DictSelector(key='comments')),
('tfidf', self.comments_vectorizer),
])),
],
)),
('union', ColumnTransformer([
('data', DictVectorizer(), 'data'),

('title', self.text_vectorizer(stop_words='english'), 'title'),

('first_comment', self.text_vectorizer(stop_words='english'), 'first_comment'),

('comments', self.text_vectorizer(stop_words='english'), 'comments'),
])),
])

self.clf = xgboost.XGBClassifier(n_jobs=16)
Expand Down Expand Up @@ -127,10 +107,7 @@ def get_labels(self):
return self.get_bugbug_labels('bug')

def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
['first_comment_' + name for name in self.first_comment_vectorizer.get_feature_names()] +\
['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
return self.extraction_pipeline.named_steps['union'].get_feature_names()

def overwrite_classes(self, bugs, classes, probabilities):
for i, bug in enumerate(bugs):
Expand Down
45 changes: 13 additions & 32 deletions bugbug/models/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@
from collections import Counter

import xgboost
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from bugbug import bug_features
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector


class ComponentModel(Model):
Expand Down Expand Up @@ -43,36 +42,19 @@ def __init__(self, lemmatization=False):
bug_features.cleanup_synonyms,
]

self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.first_comment_vectorizer = self.text_vectorizer(stop_words='english')

self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
('union', FeatureUnion(
transformer_list=[
# TODO: Re-enable when we'll support bug snapshotting (#5).
# ('data', Pipeline([
# ('selector', DictSelector(key='data')),
# ('vect', self.data_vectorizer),
# ])),

('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),

# TODO: Re-enable when we'll support bug snapshotting (#5).
# ('comments', Pipeline([
# ('selector', DictSelector(key='comments')),
# ('tfidf', self.comments_vectorizer),
# ])),

('first_comment', Pipeline([
('selector', DictSelector(key='first_comment')),
('tfidf', self.first_comment_vectorizer),
])),
],
)),
('union', ColumnTransformer([
# TODO: Re-enable once bug snapshotting is supported (#5).
# ('data', DictVectorizer(), 'data'),

('title', self.text_vectorizer(stop_words='english'), 'title'),

# TODO: Re-enable once bug snapshotting is supported (#5).
# ('comments', self.text_vectorizer(stop_words='english'), 'comments'),

('first_comment', self.text_vectorizer(stop_words='english'), 'first_comment'),
])),
])

self.clf = xgboost.XGBClassifier(n_jobs=16)
Expand Down Expand Up @@ -169,5 +151,4 @@ def get_labels(self):
return {bug_id: component for bug_id, component in classes.items() if component in top_components}

def get_feature_names(self):
return ['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
['first_comment_' + name for name in self.first_comment_vectorizer.get_feature_names()]
return self.extraction_pipeline.named_steps['union'].get_feature_names()
36 changes: 9 additions & 27 deletions bugbug/models/qaneeded.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.

import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from bugbug import bug_features
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector


class QANeededModel(Model):
Expand Down Expand Up @@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
bug_features.cleanup_synonyms,
]

self.data_vectorizer = DictVectorizer()
self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.comments_vectorizer = self.text_vectorizer(stop_words='english')

self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', DictSelector(key='data')),
('vect', self.data_vectorizer),
])),

('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),

('comments', Pipeline([
('selector', DictSelector(key='comments')),
('tfidf', self.comments_vectorizer),
])),
],
)),
('union', ColumnTransformer([
('data', DictVectorizer(), 'data'),

('title', self.text_vectorizer(stop_words='english'), 'title'),

('comments', self.text_vectorizer(stop_words='english'), 'comments'),
])),
])

self.clf = xgboost.XGBClassifier(n_jobs=16)
Expand All @@ -90,6 +74,4 @@ def get_labels(self):
return classes

def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
return self.extraction_pipeline.named_steps['union'].get_feature_names()
36 changes: 9 additions & 27 deletions bugbug/models/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.

import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from bugbug import bug_features
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector


class TrackingModel(Model):
Expand Down Expand Up @@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
bug_features.cleanup_synonyms,
]

self.data_vectorizer = DictVectorizer()
self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.comments_vectorizer = self.text_vectorizer(stop_words='english')

self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions, rollback=True, rollback_when=self.rollback)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', DictSelector(key='data')),
('vect', self.data_vectorizer),
])),

('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),

('comments', Pipeline([
('selector', DictSelector(key='comments')),
('tfidf', self.comments_vectorizer),
])),
],
)),
('union', ColumnTransformer([
('data', DictVectorizer(), 'data'),

('title', self.text_vectorizer(stop_words='english'), 'title'),

('comments', self.text_vectorizer(stop_words='english'), 'comments'),
])),
])

self.clf = xgboost.XGBClassifier(n_jobs=16)
Expand Down Expand Up @@ -92,6 +76,4 @@ def get_labels(self):
return classes

def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
return self.extraction_pipeline.named_steps['union'].get_feature_names()
36 changes: 9 additions & 27 deletions bugbug/models/uplift.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.

import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

from bugbug import bug_features
from bugbug import bugzilla
from bugbug.model import Model
from bugbug.utils import DictSelector


class UpliftModel(Model):
Expand Down Expand Up @@ -40,30 +39,15 @@ def __init__(self, lemmatization=False):
bug_features.cleanup_synonyms,
]

self.data_vectorizer = DictVectorizer()
self.title_vectorizer = self.text_vectorizer(stop_words='english')
self.comments_vectorizer = self.text_vectorizer(stop_words='english')

self.extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(feature_extractors, cleanup_functions)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', DictSelector(key='data')),
('vect', self.data_vectorizer),
])),

('title', Pipeline([
('selector', DictSelector(key='title')),
('tfidf', self.title_vectorizer),
])),

('comments', Pipeline([
('selector', DictSelector(key='comments')),
('tfidf', self.comments_vectorizer),
])),
],
)),
('union', ColumnTransformer([
('data', DictVectorizer(), 'data'),

('title', self.text_vectorizer(stop_words='english'), 'title'),

('comments', self.text_vectorizer(stop_words='english'), 'comments'),
])),
])

self.clf = xgboost.XGBClassifier(n_jobs=16)
Expand All @@ -88,6 +72,4 @@ def get_labels(self):
return classes

def get_feature_names(self):
return ['data_' + name for name in self.data_vectorizer.get_feature_names()] +\
['title_' + name for name in self.title_vectorizer.get_feature_names()] +\
['comments_' + name for name in self.comments_vectorizer.get_feature_names()]
return self.extraction_pipeline.named_steps['union'].get_feature_names()

0 comments on commit 6faa0f2

Please sign in to comment.