In [1]:
#! pip install corels --user
#! pip install pathlib --user
#! pip install fuzzymatcher --user

In [2]:
%%time
import xlrd 
import os
import pandas as pd
import numpy as np
from corels import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


CPU times: user 1.46 s, sys: 1.49 s, total: 2.94 s
Wall time: 2.61 s


### Exemplo da Documentação
- https://github.com/corels/pycorels

In [3]:

# Train split proportion
train_proportion = 0.8

# Load the dataset
X, y, features, prediction = load_from_csv("compas.csv")

# A maximum cardinality of 3 makes CORELS search all rule antecedents, with up to 3 features combined together
c = CorelsClassifier(max_card=3, n_iter=1000000, verbosity=["progress","rulelist"])

# Generate train and test sets
train_split = int(train_proportion * X.shape[0])

X_train = X[:train_split]
y_train = y[:train_split]

X_test = X[train_split:]
y_test = y[train_split:]

# Fit the model. Features is a list of the feature names
c.fit(X_train, y_train, features=features, prediction_name=prediction)

# Score the model on the test set
a = c.score(X_test, y_test)

RULELIST:
if [Juvenile-Crimes=0 && not Age=18-22 && not Prior-Crimes>3]:
  Recidivate-Within-Two-Years = False
else 
  Recidivate-Within-Two-Years = True


In [4]:
print("Test Accuracy: " + str(a))

# Print the rulelist
print(c.rl())

Test Accuracy: 0.6625086625086625
RULELIST:
if [Juvenile-Crimes=0 && not Age=18-22 && not Prior-Crimes>3]:
  Recidivate-Within-Two-Years = False
else 
  Recidivate-Within-Two-Years = True


#### Regras para classificar "Crimonoso não recorrente nos próximos dois anos"
1) Se o número de crimes juvenis é zero
2) Idade não está na categoria 18-22 (idade é maior que 23 anos)           
3) Número de crimes prévios<3

#### Regras para classificar "Crimonoso recorrente nos próximos dois anos"
1) Se o número de crimes juvenis é maior que zero
2) Idade é maior que 23 anos
3) Número de crimes prévios>3


## Utilizando base de dados de TEDs Sicoob
![Sicoob](sicoob.jpg)


In [5]:
%%time

ted = pd.read_sas("pfa_transcao_ted2020x.sas7bdat")
ted.tail()

CPU times: user 7.36 s, sys: 165 ms, total: 7.52 s
Wall time: 7.55 s


In [6]:
%%time
ted.head()

CPU times: user 4.4 ms, sys: 39 µs, total: 4.43 ms
Wall time: 3.27 ms


Unnamed: 0,NR_CPF_CNPJ,NR_CPR_CNA,VR_TRN,DT_MVM_DWM,TIPO,TEMPO_CONTA,HORA,MES,QUIN,WEK,HOUR,DQTD5,DQTD15,DQTD30,DQTD60,QTD_MES,TED_RENDA,RENDA_TOTAL,TED_RENDA2,FRAUDE,TRAT_FPSM,FATOR_SORTEIO,CORTE_TEDRENDA_0,CORTE_TEDRENDA_1,CORTE_TEDRENDA_2,CORTE_TEDRENDA_3,CORTE_TEDRENDA_4,CORTE_TEDRENDA_5,CORTE_TEDRENDA_6,CORTE_TEDRENDA_7,CORTE_TEDRENDA_8,CORTE_TEDRENDA_9,VALOR_SOBRE_MES,DVALOR,DRENDA,DHIST_MES,CRENDA,BRENDA,LOG_RENDA,SQR_RENDA
0,b'00010069160',5004.0,1000.0,2020-02-01,1.0,7.0,3.0,2.0,1.0,3.0,11.0,0.0,0.0,0.0,0.0,3.0,0.0,6337.15,0.1578,0.0,0.0,b'00',1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,7.0,8.754184,40159.470122
1,b'00010069160',5004.0,1300.0,2020-02-01,1.0,7.0,3.0,2.0,1.0,6.0,11.0,0.0,0.0,0.0,0.0,3.0,0.0,6337.15,0.20514,0.0,0.0,b'00',1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,7.0,8.754184,40159.470122
2,b'00010074910',3034.0,40000.0,2020-09-01,1.0,1.0,4.0,9.0,3.0,4.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,4395.25,9.100734,0.0,0.0,b'00',1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,4.0,7.0,8.38828,19318.222562
3,b'00020015607',3027.0,10000.0,2020-01-01,1.0,12.0,4.0,1.0,1.0,6.0,14.0,0.0,0.0,0.0,0.0,2.0,0.0,7124.81,1.403546,0.0,0.0,b'00',1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,5.0,7.0,8.871338,50762.917536
4,b'00020015607',3027.0,30000.0,2020-01-01,1.0,12.0,5.0,1.0,1.0,2.0,15.0,0.0,0.0,0.0,0.0,2.0,0.0,7124.81,4.210639,0.0,0.0,b'00',1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,5.0,7.0,8.871338,50762.917536


In [7]:
%%time
ted.columns

CPU times: user 16 µs, sys: 1 µs, total: 17 µs
Wall time: 29.8 µs


Index(['NR_CPF_CNPJ', 'NR_CPR_CNA', 'VR_TRN', 'DT_MVM_DWM', 'TIPO', 'TEMPO_CONTA', 'HORA', 'MES', 'QUIN', 'WEK', 'HOUR', 'DQTD5', 'DQTD15', 'DQTD30', 'DQTD60', 'QTD_MES', 'TED_RENDA', 'RENDA_TOTAL', 'TED_RENDA2', 'FRAUDE', 'TRAT_FPSM', 'FATOR_SORTEIO', 'CORTE_TEDRENDA_0', 'CORTE_TEDRENDA_1', 'CORTE_TEDRENDA_2', 'CORTE_TEDRENDA_3', 'CORTE_TEDRENDA_4', 'CORTE_TEDRENDA_5', 'CORTE_TEDRENDA_6', 'CORTE_TEDRENDA_7', 'CORTE_TEDRENDA_8', 'CORTE_TEDRENDA_9', 'VALOR_SOBRE_MES', 'DVALOR', 'DRENDA', 'DHIST_MES', 'CRENDA', 'BRENDA', 'LOG_RENDA', 'SQR_RENDA'], dtype='object')

#### Python Tools for Record Linking and Fuzzy Matching
- https://pbpython.com/record-linking.html


![Fuzzy Matching](https://pbpython.com/images/pbpython_record_linkage.jpg)



##### CPFs dos Amigos para usar de exemplo
- Gustavo:05602581197
- Aline:03807114190
- Leonardo:02079173103
- Marcus:68533977204
- Roberta: 14537003705

In [8]:
%%time

ted1 = ted[ted['TRAT_FPSM']==1].copy()

ted1.drop('TRAT_FPSM',axis=1,inplace=True)


CPU times: user 60.2 ms, sys: 35.7 ms, total: 95.8 ms
Wall time: 94.7 ms


In [9]:
%%time
ted1.columns

CPU times: user 13 µs, sys: 2 µs, total: 15 µs
Wall time: 23.8 µs


Index(['NR_CPF_CNPJ', 'NR_CPR_CNA', 'VR_TRN', 'DT_MVM_DWM', 'TIPO', 'TEMPO_CONTA', 'HORA', 'MES', 'QUIN', 'WEK', 'HOUR', 'DQTD5', 'DQTD15', 'DQTD30', 'DQTD60', 'QTD_MES', 'TED_RENDA', 'RENDA_TOTAL', 'TED_RENDA2', 'FRAUDE', 'FATOR_SORTEIO', 'CORTE_TEDRENDA_0', 'CORTE_TEDRENDA_1', 'CORTE_TEDRENDA_2', 'CORTE_TEDRENDA_3', 'CORTE_TEDRENDA_4', 'CORTE_TEDRENDA_5', 'CORTE_TEDRENDA_6', 'CORTE_TEDRENDA_7', 'CORTE_TEDRENDA_8', 'CORTE_TEDRENDA_9', 'VALOR_SOBRE_MES', 'DVALOR', 'DRENDA', 'DHIST_MES', 'CRENDA', 'BRENDA', 'LOG_RENDA', 'SQR_RENDA'], dtype='object')

In [10]:
%%time

ted1[['NR_CPF_CNPJ','FRAUDE','TIPO']].groupby(['FRAUDE','TIPO']).count()

CPU times: user 12.4 ms, sys: 35 µs, total: 12.4 ms
Wall time: 11.2 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,NR_CPF_CNPJ
FRAUDE,TIPO,Unnamed: 2_level_1
0.0,0.0,12036
0.0,1.0,6225
1.0,0.0,253
1.0,1.0,463


In [31]:
%%time
ted[['NR_CPF_CNPJ','FRAUDE']].groupby(['FRAUDE']).count()

CPU times: user 26 ms, sys: 14 µs, total: 26 ms
Wall time: 23.2 ms


Unnamed: 0_level_0,NR_CPF_CNPJ
FRAUDE,Unnamed: 1_level_1
0.0,116385
1.0,716


In [12]:
%%time

ted.describe().transpose()

CPU times: user 537 ms, sys: 32 ms, total: 569 ms
Wall time: 565 ms


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NR_CPR_CNA,117101.0,3537.724,607.7395,3001.0,3069.0,3210.0,4119.0,5004.0
VR_TRN,117101.0,11037.87,84880.06,500.0,1000.0,2000.0,5460.0,11500000.0
TIPO,117101.0,0.3391602,0.4734263,0.0,0.0,0.0,1.0,1.0
TEMPO_CONTA,117101.0,5.03146,5.177243,0.0,2.0,3.0,6.0,42.0
HORA,117101.0,3.908771,1.073079,1.0,3.0,4.0,5.0,7.0
MES,117101.0,5.284122,2.589537,1.0,3.0,5.0,8.0,9.0
QUIN,117101.0,2.085789,0.8209478,1.0,1.0,2.0,3.0,3.0
WEK,117101.0,4.017481,1.491906,1.0,3.0,4.0,5.0,7.0
HOUR,117101.0,12.6949,3.057157,0.0,10.0,13.0,15.0,23.0
DQTD5,117101.0,0.1450287,0.3521312,0.0,0.0,0.0,0.0,1.0


## Categorizando variávaeis contínuas

In [13]:
import numpy as np
   
cat_dia = np.arange(start=0, stop=30,step=5) 
cat_var = np.arange(start=0, stop=100000,step=10000) 
cat_per = np.arange(start=0, stop=20,step=4)/5

In [14]:
ted['CAT_VR_TRN']     =pd.cut(ted['VR_TRN']          , bins=cat_var)
ted['CAT_VALOR_MES']  =pd.cut(ted['VALOR_SOBRE_MES'] , bins=cat_per)
ted['CAT_QTD_MES']    =pd.cut(ted['QTD_MES']         , bins=cat_dia)

In [15]:
#%%time
#lista = ['CAT_VR_TRN','CAT_RENDA_TOTAL','CAT_VALOR_MES','CAT_TED_RENDA','CAT_QTD_MES']
#for item in lista:
#    print(ted[ted['FRAUDE']==1][['NR_CPF_CNPJ',item]].groupby(item).count())    

In [16]:
import re
lista = ted.columns
r = re.compile(".*TIPO|.*DQTD|.*HORA|.*QUIN|.*WEK|.*CAT|.*CORTE|CORTE_TEDRENDA")
newlist = list(filter(r.match, lista))


In [17]:
newlist

['TIPO',
 'HORA',
 'QUIN',
 'WEK',
 'DQTD5',
 'DQTD15',
 'DQTD30',
 'DQTD60',
 'CORTE_TEDRENDA_0',
 'CORTE_TEDRENDA_1',
 'CORTE_TEDRENDA_2',
 'CORTE_TEDRENDA_3',
 'CORTE_TEDRENDA_4',
 'CORTE_TEDRENDA_5',
 'CORTE_TEDRENDA_6',
 'CORTE_TEDRENDA_7',
 'CORTE_TEDRENDA_8',
 'CORTE_TEDRENDA_9',
 'CAT_VR_TRN',
 'CAT_VALOR_MES',
 'CAT_QTD_MES']

In [18]:
ted2 = pd.get_dummies(ted[newlist])

In [19]:
ted2.columns.values

array(['TIPO', 'HORA', 'QUIN', 'WEK', 'DQTD5', 'DQTD15', 'DQTD30',
       'DQTD60', 'CORTE_TEDRENDA_0', 'CORTE_TEDRENDA_1',
       'CORTE_TEDRENDA_2', 'CORTE_TEDRENDA_3', 'CORTE_TEDRENDA_4',
       'CORTE_TEDRENDA_5', 'CORTE_TEDRENDA_6', 'CORTE_TEDRENDA_7',
       'CORTE_TEDRENDA_8', 'CORTE_TEDRENDA_9', 'CAT_VR_TRN_(0, 10000]',
       'CAT_VR_TRN_(10000, 20000]', 'CAT_VR_TRN_(20000, 30000]',
       'CAT_VR_TRN_(30000, 40000]', 'CAT_VR_TRN_(40000, 50000]',
       'CAT_VR_TRN_(50000, 60000]', 'CAT_VR_TRN_(60000, 70000]',
       'CAT_VR_TRN_(70000, 80000]', 'CAT_VR_TRN_(80000, 90000]',
       'CAT_VALOR_MES_(0.0, 0.1]', 'CAT_VALOR_MES_(0.1, 0.2]',
       'CAT_VALOR_MES_(0.2, 0.3]', 'CAT_VALOR_MES_(0.3, 0.4]',
       'CAT_VALOR_MES_(0.4, 0.5]', 'CAT_VALOR_MES_(0.5, 0.6]',
       'CAT_VALOR_MES_(0.6, 0.7]', 'CAT_VALOR_MES_(0.7, 0.8]',
       'CAT_VALOR_MES_(0.8, 0.9]', 'CAT_QTD_MES_(0, 5]',
       'CAT_QTD_MES_(5, 10]', 'CAT_QTD_MES_(10, 15]',
       'CAT_QTD_MES_(15, 20]', 'CAT_QTD_MES_(20

In [20]:
ted2.drop(['HORA','QUIN','WEK','CORTE_TEDRENDA_7','CORTE_TEDRENDA_8','CORTE_TEDRENDA_9'],axis=1, inplace=True)

## Training Optimal Rules


![elseif](elseif.jpeg)

##### Importação das blibliotecas relevantes para ELM


In [21]:
from sklearn.model_selection import train_test_split

# Load the dataset

X, y = ted2 , ted['FRAUDE']

# Generate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# A maximum cardinality of 5 makes CORELS search all rule antecedents, with up to 5 features combined together
c = CorelsClassifier(max_card=5, n_iter=10000, verbosity=["progress","rulelist"], min_support=0.10)

In [22]:
# Fit the model. Features is a list of the feature names

c.fit(X_train, y_train, prediction_name='prediction')

RULELIST:
prediction = False


CorelsClassifier ({'c': 0.01, 'n_iter': 10000, 'map_type': 'prefix', 'policy': 'lower_bound', 'verbosity': ['progress', 'rulelist'], 'ablation': 0, 'max_card': 5, 'min_support': 0.1})
RULELIST:
prediction = False
All features: (['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'feature10', 'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'feature16', 'feature17', 'feature18', 'feature19', 'feature20', 'feature21', 'feature22', 'feature23', 'feature24', 'feature25', 'feature26', 'feature27', 'feature28', 'feature29', 'feature30', 'feature31', 'feature32', 'feature33', 'feature34', 'feature35'])

In [23]:
# Score the model on the test set
a = c.score(X_test, y_test)

In [24]:
patrick = c.predict(X_test)

In [28]:
patrick

array([False, False, False, ..., False, False, False])

In [29]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print('Accuracy for {0}: {1}'.format(accuracy_score(y_test, c.predict(X_test))))
print('F1 score for {0}:'.format(f1_score(y_test, c.predict(X_test))))
print('Recall for {0}:'.format(recall_score(y_test, c.predict(X_test))))
print('Precision for {0}:'.format(precision_score(y_test, c.predict(X_test))))  

IndexError: tuple index out of range

In [30]:
# Print the rulelist
print(c.rl())

RULELIST:
prediction = False


In [None]:
#mlp = []
#for h in [(50,50,50), (50,100,50), (120,), (50,50,50,50)]:
#    for a in ['tanh', 'relu','logistic']:
#        for p in [0.25 , 0.5, 0.75]:
#            for alp in [0.0001, 0.05, 0.25]:
#                for l_rate in ['constant','adaptive','invscaling']:
#                    for l_init in [0.0001, 0.025 , 0.05, 0.15]:
#                        mlp.append(MLPClassifier(random_state=666, solver='sgd', batch_size=min(25000, 25000), 
#                                            hidden_layer_sizes =h, activation=a, power_t=p, alpha=alp,
#                                            learning_rate=l_rate , learning_rate_init=l_init))      

In [None]:
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    print('Accuracy for {0}: {1}'.format(name,accuracy_score(y_test, clf.predict(X_test))))
    print('F1 score for {0}:'.format(name,f1_score(y_test, clf.predict(X_test))))
    print('Recall for {0}:'.format(name,recall_score(y_test, clf.predict(X_test))))
    print('Precision for {0}:'.format(name,precision_score(y_test, clf.predict(X_test)))) 