This repository has been archived by the owner on Mar 1, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 4 changed files with 91 additions and 76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,69 +1,9 @@ | ||
import os.path | ||
|
||
import numpy as np | ||
from sklearn.externals import joblib | ||
|
||
from rosie.chamber_of_deputies.dataset import Dataset | ||
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier | ||
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier | ||
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier | ||
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier | ||
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier | ||
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier | ||
|
||
|
||
class ChamberOfDeputies:
    """Run every suspicion classifier over the Chamber of Deputies dataset
    and persist the resulting irregularities as a compressed CSV
    (``irregularities.xz``) under ``data_path``.
    """

    # Maps each classifier class to the output column that holds its verdict.
    CLASSIFIERS = {
        MealPriceOutlierClassifier: 'meal_price_outlier',
        MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
        TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
        InvalidCnpjCpfClassifier: 'invalid_cnpj_cpf',
        ElectionExpensesClassifier: 'election_expenses',
        IrregularCompaniesClassifier: 'irregular_companies_classifier'
    }

    # Columns copied from the dataset to uniquely identify each record.
    DATASET_KEYS = ['applicant_id', 'year', 'document_id']

    def __init__(self, dataset, data_path):
        """
        :param dataset: (pandas.DataFrame) reimbursement records; must
            contain the DATASET_KEYS columns.
        :param data_path: (str) directory used both to cache trained models
            and to write the output file.
        """
        self.dataset = dataset
        self.data_path = data_path
        self.irregularities = self.dataset[self.DATASET_KEYS].copy()

    def run_classifiers(self):
        """Load/fit each classifier, predict, and save irregularities.xz."""
        for classifier, irregularity in self.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, irregularity)

        output = os.path.join(self.data_path, 'irregularities.xz')
        self.irregularities.to_csv(output, compression='xz',
                                   encoding='utf-8', index=False)

    def load_trained_model(self, classifier):
        """Return a fitted model, using a joblib pickle cache when possible.

        :param classifier: a class from CLASSIFIERS (fit/transform/predict).
        """
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # Palliative since this classifier outputs a model too large to be
        # loaded with joblib; always re-fit it instead of caching.
        # (Compare the class name directly rather than the derived filename.)
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, irregularity):
        """Store the model's predictions in the `irregularity` column.

        Integer predictions follow scikit-learn's outlier convention:
        -1 marks a suspicious record (True) and 1 a regular one (False).
        Non-integer predictions are stored as-is.
        """
        model.transform(self.dataset)
        y = model.predict(self.dataset)
        self.irregularities[irregularity] = y
        # np.int was deprecated and then removed in NumPy 1.24; check for
        # any integer dtype instead.
        if np.issubdtype(y.dtype, np.integer):
            self.irregularities.loc[y == 1, irregularity] = False
            self.irregularities.loc[y == -1, irregularity] = True
from rosie.chamber_of_deputies import settings | ||
from rosie.chamber_of_deputies.adapter import Adapter | ||
from rosie.core import Core | ||
|
||
|
||
def main(target_directory='/tmp/serenata-data'):
    """Entry point: run the full Rosie pipeline for the Chamber of Deputies.

    The pipeline is now driven by the generic Core with a module-specific
    Adapter; the stale direct calls to `Dataset`/`ChamberOfDeputies` are
    removed — those names are no longer imported here and raised NameError.

    :param target_directory: (str) directory where datasets are collected
        and the suspicions output is written.
    """
    adapter = Adapter(target_directory)
    core = Core(settings, adapter)
    core()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from rosie.chamber_of_deputies.classifiers.election_expenses_classifier import ElectionExpensesClassifier | ||
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier | ||
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier | ||
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier | ||
from rosie.chamber_of_deputies.classifiers.irregular_companies_classifier import IrregularCompaniesClassifier | ||
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier | ||
|
||
|
||
# Human-readable suspicion name (snake_case) mapped to its classifier class.
CLASSIFIERS = {
    'meal_price_outlier': MealPriceOutlierClassifier,
    'over_monthly_subquota_limit': MonthlySubquotaLimitClassifier,
    'suspicious_traveled_speed_day': TraveledSpeedsClassifier,
    'invalid_cnpj_cpf': InvalidCnpjCpfClassifier,
    'election_expenses': ElectionExpensesClassifier,
    'irregular_companies_classifier': IrregularCompaniesClassifier
}

# Columns that uniquely identify a record in the main dataset.
# The core pipeline reads this constant as UNIQUE_IDS; defining only
# DATASET_KEYS caused an AttributeError there.
UNIQUE_IDS = ('applicant_id', 'year', 'document_id')

# Kept for backward compatibility with code still reading DATASET_KEYS.
DATASET_KEYS = UNIQUE_IDS

# Column holding the total net value of each transaction.
VALUE = 'total_net_value'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,66 @@ | ||
import os.path | ||
class Core:
    """
    This is Rosie's core object: it implements a generic pipeline to collect
    data, clean and normalize it, analyze the data and output a dataset with
    suspicions. Its initialization takes a settings module and an adapter.

    The settings module should have three constants:
    * CLASSIFIERS (dict) with pairs of human readable name (snake case) for
      each classifier and the object (class) of the classifiers.
    * UNIQUE_IDS (str or iterable) with the column(s) that should be taken as
      unique identifiers of the main dataset of each module.
    * VALUE (str) with the column that should be taken as the total net value
      of the transaction represented by each row of the dataset.

    The adapter should be an object with:
    * A `dataset` property with the main dataset to be analyzed;
    * A `path` property with the path to the datasets (where the output will
      be saved).
    """

    def __init__(self, settings, adapter, data_path=None):
        """
        :param settings: module with CLASSIFIERS, UNIQUE_IDS and VALUE.
        :param adapter: object exposing `dataset` and `path`.
        :param data_path: deprecated and ignored; kept (with a default) so
            callers passing a third argument keep working — the path always
            comes from `adapter.path`. Note the two-argument call
            `Core(settings, adapter)` in this commit would break a required
            third parameter.
        """
        self.settings = settings
        self.dataset = adapter.dataset
        self.data_path = adapter.path

        # Normalize a single column name into a one-element tuple.
        # (The original read the nonexistent settings.UNIQUE_IDENTIFIERS
        # here; UNIQUE_IDS is the documented constant.)
        if isinstance(settings.UNIQUE_IDS, str):
            self.settings.UNIQUE_IDS = (self.settings.UNIQUE_IDS,)

        # pandas treats a tuple as a single (MultiIndex) key, so convert to
        # a list before selecting the identifier columns.
        self.suspicions = self.dataset[list(self.settings.UNIQUE_IDS)].copy()

    def __call__(self):
        """Run every classifier and write suspicions.xz to the data path."""
        for name, classifier in self.settings.CLASSIFIERS.items():
            model = self.load_trained_model(classifier)
            self.predict(model, name)

        output = os.path.join(self.data_path, 'suspicions.xz')
        # The original used parenthesized keyword syntax here, which is a
        # SyntaxError; a dict is what **-unpacking needs.
        kwargs = dict(compression='xz', encoding='utf-8', index=False)
        self.suspicions.to_csv(output, **kwargs)

    def load_trained_model(self, classifier):
        """Return a fitted model, using a joblib pickle cache when possible.

        :param classifier: a class from settings.CLASSIFIERS implementing
            fit/transform/predict.
        """
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative: this outputs a model too large for joblib
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)
        elif os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)

        return model

    def predict(self, model, name):
        """Store the model's predictions in the `name` suspicion column.

        Integer predictions follow scikit-learn's outlier convention:
        -1 marks a suspicious record (True) and 1 a regular one (False).
        Non-integer predictions are stored as-is.
        """
        model.transform(self.dataset)
        prediction = model.predict(self.dataset)
        # Fixes the original's undefined `suspicion` and the misspellings
        # `prediciton`/`suspitions`, which raised NameError/AttributeError.
        self.suspicions[name] = prediction
        # np.int was removed in NumPy 1.24; accept any integer dtype.
        if np.issubdtype(prediction.dtype, np.integer):
            self.suspicions.loc[prediction == 1, name] = False
            self.suspicions.loc[prediction == -1, name] = True