diff --git a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py index 8620271..9e9cf55 100644 --- a/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/meal_price_outlier_classifier.py @@ -28,6 +28,11 @@ class MealPriceOutlierClassifier(TransformerMixin): HOTEL_REGEX = r'hote(?:(?:ls?)|is)' CLUSTER_KEYS = ['mean', 'std'] + COLS = ['applicant_id', + 'category', + 'net_value', + 'recipient', + 'recipient_id'] def fit(self, X): _X = X[self.__applicable_rows(X)] @@ -49,7 +54,7 @@ def transform(self, X=None): pass def predict(self, X): - _X = X.copy() + _X = X[self.COLS].copy() companies = _X[self.__applicable_rows(_X)] \ .groupby('recipient_id').apply(self.__company_stats) \ .reset_index() @@ -68,7 +73,7 @@ def predict(self, X): _X = pd.merge(_X, known_thresholds, how='left') if 'cnpj_threshold' in _X.columns: _X.loc[_X['cnpj_threshold'].notnull(), - 'threshold'] = _X['cnpj_threshold'] + 'threshold'] = _X['cnpj_threshold'] _X['y'] = 1 is_outlier = self.__applicable_rows(_X) & \ _X['threshold'].notnull() & \ diff --git a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py index af60a39..0c74925 100644 --- a/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py @@ -12,15 +12,30 @@ class MonthlySubquotaLimitClassifier(TransformerMixin): issue_date : datetime column Date when the expense was made. + month : int column + The quota month matching the expense request. + net_value : float column The value of the expense. + + subquota_number : category column + A number to classify a category of expenses. + + year : int column + The quota year matching the expense request. """ KEYS = ['applicant_id', 'month', 'year'] + COLS = ['applicant_id', + 'issue_date', + 'month', + 'net_value', + 'subquota_number', + 'year'] def fit(self, X): self.X = X - self._X = self.X.copy() + self._X = self.X[self.COLS].copy() self.__create_columns() return self diff --git a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py index dd7529f..6bd5c86 100644 --- a/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py +++ b/rosie/chamber_of_deputies/classifiers/traveled_speeds_classifier.py @@ -36,6 +36,12 @@ class TraveledSpeedsClassifier(TransformerMixin): """ AGG_KEYS = ['applicant_id', 'issue_date'] + COLS = ['applicant_id', + 'category', + 'is_party_expense', + 'issue_date', + 'latitude', + 'longitude'] def __init__(self, contamination=.001): if contamination in [0, 1]: @@ -57,7 +63,7 @@ def transform(self, X=None): def predict(self, X): check_is_fitted(self, ['polynomial', '_polynomial_fn']) - _X = X.copy() + _X = X[self.COLS].copy() _X = self.__aggregate_dataset(_X) _X = self.__classify_dataset(_X) _X = pd.merge(X, _X, how='left', left_on=self.AGG_KEYS, right_on=self.AGG_KEYS)