Skip to content
This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

Commit

Permalink
Merge pull request #19 from datasciencebr/jtemporal-invalid-cnpj-cpf
Browse files Browse the repository at this point in the history
Adds invalid CNPJ CPF Classifier
  • Loading branch information
cuducos authored Jan 11, 2017
2 parents 5eb9e0f + 0ca8333 commit df3b969
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 1 deletion.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
git+https://github.com/datasciencebr/serenata-toolbox.git#egg=serenata-toolbox
geopy>=1.11.0
pycpfcnpj==1.0.2
scikit-learn>=0.17
scipy>=0.18
geopy>=1.11.0
2 changes: 2 additions & 0 deletions rosie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from sklearn.externals import joblib

from rosie.dataset import Dataset
from rosie.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier
from rosie.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.traveled_speeds_classifier import TraveledSpeedsClassifier
Expand All @@ -14,6 +15,7 @@ class Rosie:
MealPriceOutlierClassifier: 'meal_price_outlier',
MonthlySubquotaLimitClassifier: 'over_monthly_subquota_limit',
TraveledSpeedsClassifier: 'suspicious_traveled_speed_day',
InvalidCnpjCpfClassifier: 'invalid_cnpj_cpf',
}
DATASET_KEYS = ['applicant_id', 'year', 'document_id']

Expand Down
22 changes: 22 additions & 0 deletions rosie/invalid_cnpj_cpf_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import math

import numpy as np
from pycpfcnpj import cpfcnpj
from sklearn.base import TransformerMixin


class InvalidCnpjCpfClassifier(TransformerMixin):
    """Flag rows whose ``cnpj_cpf`` number fails CNPJ/CPF validation.

    Only rows whose ``document_type`` is 0 or 1 are checked; rows with any
    other ``document_type`` are never flagged, whatever their ``cnpj_cpf``
    holds. The classifier keeps no learned state: ``fit`` and ``transform``
    exist only to satisfy the scikit-learn style interface.
    """

    def fit(self, X):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X=None):
        """Do nothing and return ``self``; no transformation is applied."""
        return self

    def predict(self, X):
        """Return a boolean array with one entry per row of ``X``.

        ``X`` must provide the ``cnpj_cpf`` and ``document_type`` columns.
        An entry is ``True`` when the row is considered invalid. ``X``
        itself is left untouched; a working copy is kept in ``self._X``.
        """
        self._X = X.copy()
        # The builtin ``str`` replaces ``np.str``, a deprecated alias that
        # was removed in NumPy 1.24. The cast guarantees the validator
        # always receives text, including for missing values.
        self._X['cnpj_cpf'] = self._X['cnpj_cpf'].astype(str)
        return np.r_[self._X.apply(self.__is_invalid, axis=1)]

    def __is_invalid(self, row):
        """Tell whether a single row holds an invalid CNPJ/CPF.

        ``and`` short-circuits, so the validator is only called for rows
        whose ``document_type`` is 0 or 1.
        """
        return (row['document_type'] in [0, 1]) and (not cpfcnpj.validate(row['cnpj_cpf']))
10 changes: 10 additions & 0 deletions tests/fixtures/invalid_cnpj_cpf_classifier.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
cnpj_cpf,document_type
22472225000183,0
22472225000180,0
,0
,2
22472225000183,2
22472225000180,2
57725723501,0
11111111111,0
22472225000180,3
41 changes: 41 additions & 0 deletions tests/test_invalid_cnpj_cpf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from unittest import TestCase

import numpy as np
import pandas as pd

from rosie.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier


class TestInvalidCnpjCpfClassifier(TestCase):
    """Check ``InvalidCnpjCpfClassifier.predict`` against the CSV fixture.

    Each test inspects the prediction for one fixture row; the index used
    in the test is the zero-based position of that row in the CSV file.
    """

    def setUp(self):
        # ``cnpj_cpf`` is read as text so the numbers are kept as written
        # instead of being parsed as numeric values. The builtin ``str``
        # replaces ``np.str``, a deprecated alias removed in NumPy 1.24.
        self.dataset = pd.read_csv('tests/fixtures/invalid_cnpj_cpf_classifier.csv',
                                   dtype={'cnpj_cpf': str})
        self.subject = InvalidCnpjCpfClassifier()

    def test_is_valid_cnpj(self):
        """Row 0: document_type 0 with a CNPJ expected to validate."""
        self.assertEqual(self.subject.predict(self.dataset)[0], False)

    def test_is_invalid_cnpj(self):
        """Row 1: document_type 0 with a CNPJ expected to fail validation."""
        self.assertEqual(self.subject.predict(self.dataset)[1], True)

    def test_is_none(self):
        """Row 2: document_type 0 with no number at all is flagged."""
        self.assertEqual(self.subject.predict(self.dataset)[2], True)

    def test_none_cnpj_cpf_abroad_is_valid(self):
        """Row 3: document_type 2 with no number is not flagged."""
        self.assertEqual(self.subject.predict(self.dataset)[3], False)

    def test_valid_cnpj_cpf_abroad_is_valid(self):
        """Row 4: document_type 2 with the valid CNPJ is not flagged."""
        self.assertEqual(self.subject.predict(self.dataset)[4], False)

    def test_invalid_cnpj_cpf_abroad_is_valid(self):
        """Row 5: document_type 2 with the invalid CNPJ is not flagged."""
        self.assertEqual(self.subject.predict(self.dataset)[5], False)

    def test_is_valid_cpf(self):
        """Row 6: document_type 0 with a CPF expected to validate."""
        self.assertEqual(self.subject.predict(self.dataset)[6], False)

    def test_is_invalid_cpf(self):
        """Row 7: document_type 0 with a CPF expected to fail validation."""
        self.assertEqual(self.subject.predict(self.dataset)[7], True)

    def test_invalid_document_type(self):
        """Row 8: document_type 3 is outside the checked types, so not flagged."""
        self.assertEqual(self.subject.predict(self.dataset)[8], False)

0 comments on commit df3b969

Please sign in to comment.