In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import time

# Para tratar as palavras dos textos
import unicodedata # Conjunto de caracteres de uniformidade unicode
import re # Regular Expression

# Para possibilitar processamento paralelo
from multiprocessing import  Pool

pd.set_option('display.max_colwidth', None)

### Dataset de Teste - SICLI - MG

In [2]:
import pandas as pd
arquivo = 'cli_end_amostra_minas_gerais_202007030927.zip'
#arquivo = 'cli_end_amostra.csv'
sep = '\|\|'

df = pd.read_csv(arquivo, 
                 sep=sep,
                 compression='zip',
                 engine='python',
                 header=None)
df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,8480186189848,37550,0,"""37550000""",POUSO ALEGRE,SAO GERALDO,R JOAO SABINO 52,,MG,BRASIL,N INF,NAO INFORMADO,O,"""2""",S,"""201612""",,2016-12-16


In [3]:
df_sicli = df.copy()

In [4]:
df_sicli = df_sicli.rename(columns={0:"nu_pessoa_p17",
                        1:"nu_cep",
                        2:"nu_cep_complemento",
                        3:"nu_cep_full",
                        4:"no_localidade",
                        5:"no_bairro",
                        6:"de_localizacao",
                        7:"sg_posicao_dtrme",
                        8:"sg_uf",
                        9:"no_pais",
                        10:"sg_tipo_ed_imovel",
                        11:"ed_unidade_ocpco",
                        12:"ic_origem_cdsto",
                        13:"ic_validacao",
                        14:"ic_comprovacao",
                        15:"nu_ano_mes_inicio",
                        16:"nu_ano_mes_fim",
                        17:"dt_apuracao_endereco"})
df_sicli.head(1)

Unnamed: 0,nu_pessoa_p17,nu_cep,nu_cep_complemento,nu_cep_full,no_localidade,no_bairro,de_localizacao,sg_posicao_dtrme,sg_uf,no_pais,sg_tipo_ed_imovel,ed_unidade_ocpco,ic_origem_cdsto,ic_validacao,ic_comprovacao,nu_ano_mes_inicio,nu_ano_mes_fim,dt_apuracao_endereco
0,8480186189848,37550,0,"""37550000""",POUSO ALEGRE,SAO GERALDO,R JOAO SABINO 52,,MG,BRASIL,N INF,NAO INFORMADO,O,"""2""",S,"""201612""",,2016-12-16


In [5]:
features_sicli = ['nu_pessoa_p17','no_localidade','no_bairro','de_localizacao','sg_uf','nu_cep_full']

df_sicli = df_sicli[features_sicli].rename(columns={'nu_pessoa_p17':'nu_pessoa_p17', 
                                                    'no_localidade':'log_no', 
                                                    'no_bairro':'bai_no', 
                                                    'de_localizacao':'log_no_abrev', 
                                                    'sg_uf':'ufe_sg', 
                                                    'nu_cep_full':'cep'})

df_sicli.head(2)

Unnamed: 0,nu_pessoa_p17,log_no,bai_no,log_no_abrev,ufe_sg,cep
0,8480186189848,POUSO ALEGRE,SAO GERALDO,R JOAO SABINO 52,MG,"""37550000"""
1,9840064467984,BELO HORIZONTE,SANTO ANDRE,SAO CLEMENTE,MG,"""31230460"""


In [6]:
# paralelizar essa tarefa
df_sicli = df_sicli.replace(to_replace=r'\"', value='', regex=True)

df_sicli['cep'] = df_sicli.cep.astype(int)

df_sicli.shape

(4400000, 6)

In [7]:
df_sicli['cep_3d']   = (df_sicli['cep']/100000).astype(int) #(int(df_sicli['cep'])/100000).astype(int)
df_sicli['label']    = (df_sicli['cep']/1000).astype(int)
df_sicli['id_sicli'] = [id for id in range(len(df_sicli))]
df_sicli.head(1)

Unnamed: 0,nu_pessoa_p17,log_no,bai_no,log_no_abrev,ufe_sg,cep,cep_3d,label,id_sicli
0,8480186189848,POUSO ALEGRE,SAO GERALDO,R JOAO SABINO 52,MG,37550000,375,37550,0


In [8]:
from sklearn.model_selection import train_test_split

features_sicli = ['id_sicli','nu_pessoa_p17','log_no','bai_no','log_no_abrev','ufe_sg','cep','cep_3d','label']

X_train, X_test, y_train, y_test = train_test_split(df_sicli[features_sicli], 
                                                    df_sicli[['ufe_sg','cep_3d']], 
                                                    test_size=0.002, 
                                                    random_state=42)

df_sicli = X_test

del X_train, X_test, y_train, y_test

df_sicli.shape

(8800, 9)

In [9]:
df_sicli.head(1)

Unnamed: 0,id_sicli,nu_pessoa_p17,log_no,bai_no,log_no_abrev,ufe_sg,cep,cep_3d,label
72055,72055,9640151053964,BUENO BRANDAO,PARQUE RENASCE,R OPREVIO DE SOUSA FERRAZ 112,MG,37578000,375,37578


In [10]:
# =============================================================================
# Aplica o conjunto unicode NFKD Compatibility Decomposition
# https://unicode.org/reports/tr15/ 
# =============================================================================

def norma(old):
    new = ''.join(ch for ch in unicodedata.normalize('NFKD', 
                    str(old)) if not unicodedata.combining(ch))
    return new

#print(norma('são áéíóú'))

In [11]:
# Para tratar as palavras dos textos
import unicodedata # Conjunto de caracteres de uniformidade unicode
import re # Regular Expression

# Para possibilitar processamento paralelo
from multiprocessing import  Pool

import numpy as np

# https://github.com/perinm/PI-2020.1/blob/master/Proj%20Final/Data%20Processing.ipynb
def parallelize_dataframe(df, func, n_cores=8):
    """
    Function that parallelizes any function applied to a dataframe
    Input:
    df      - Dataframe
    func    - Function to be aplied to portions of Dataframe
    n_cores - Number of CPU cores to be used on the application of func
    
    """
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [12]:
# https://github.com/perinm/PI-2020.1/blob/master/Proj%20Final/Data%20Processing.ipynb

# Processo que cria a feature end_txt a partir dos campos que compõem um endereço
def preprocessing_df_txt(df):
    #return df[features_txt].apply(lambda x: " ".join(x.astype(str)), axis=1)
    return df[['log_no','bai_no','log_no_abrev']].apply(lambda x: " ".join(x.astype(str)), 
                                                                       axis=1)

In [13]:
# https://github.com/perinm/PI-2020.1/blob/master/Proj%20Final/Data%20Processing.ipynb

#Paralelização do processo que cria a nova feature end_txt
df_sicli['end_txt'] = parallelize_dataframe(df_sicli[['log_no','bai_no','log_no_abrev']],
                                            preprocessing_df_txt)
df_sicli['end_txt'] = [re.sub(r'nan', ' ', x) for x in df_sicli['end_txt'].str.lower()]
df_sicli['end_txt'] = [re.sub(r'\W', ' ', x) for x in df_sicli['end_txt'].str.lower()]
df_sicli['end_txt'] = [re.sub(r'\d', ' ', x) for x in df_sicli['end_txt'].str.lower()]
df_sicli['end_txt'] = [norma(x) for x in df_sicli['end_txt'].str.lower()]
df_sicli.shape

(8800, 10)

In [14]:
def load_stoplist():
    stoplist = open("stopwords.txt", "r")
    stoplist = stoplist.read()
    stoplist = set(stoplist.splitlines())
    return stoplist

stopwords = load_stoplist()

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# inicializar TFIDF
vec = TfidfVectorizer(max_features=4000, stop_words=stopwords)
# criar TFIDF
tfidf_test = vec.fit_transform(df_sicli.end_txt)

tfidf_test.shape
#tfidf_df.shape

(8800, 4000)

In [16]:
from scipy.sparse import csr_matrix #para transformar DataFrame em Sparse DataFrame otimizado

sp_vec = csr_matrix(tfidf_test)
tfidf_sicli = pd.DataFrame.sparse.from_spmatrix(sp_vec)

# amostra
#tfidf_df.iloc[:20, 3000:3050]
tfidf_sicli.shape

(8800, 4000)

In [17]:
tfidf_sicli['id_bow'] = [id for id in range(len(tfidf_sicli))]

In [18]:
tfidf_sicli.shape

(8800, 4001)

In [19]:
tfidf_sicli.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3991,3992,3993,3994,3995,3996,3997,3998,3999,id_bow
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [20]:
df_sicli.head(5)

Unnamed: 0,id_sicli,nu_pessoa_p17,log_no,bai_no,log_no_abrev,ufe_sg,cep,cep_3d,label,end_txt
72055,72055,9640151053964,BUENO BRANDAO,PARQUE RENASCE,R OPREVIO DE SOUSA FERRAZ 112,MG,37578000,375,37578,bueno brandao parque renasce r oprevio de sousa ferraz
4153064,4153064,7810048381781,CONTAGEM,,JOSE O FONTES 1117 ELDORADO,MG,32310520,323,32310,contagem jose o fontes eldorado
3685111,3685111,7460226316746,UBERLANDIA,TAIAMAN,AV JOSE P DEFENSOR 603 CS 68,MG,38415198,384,38415,uberlandia taiaman av jose p defensor cs
3079447,3079447,3430038794343,UNAI,CENTRO,PRACA JK SN,MG,38610000,386,38610,unai centro praca jk sn
294630,294630,50002536005,PEDRO LEOPOLDO,CENTRO,CEL JUVENTINO DIAS 466,MG,33600000,336,33600,pedro leopoldo centro cel juventino dias


In [21]:
df_sicli.reset_index(inplace=True)
df_sicli = df_sicli.drop(columns='index')

In [22]:
df_sicli.head(1)

Unnamed: 0,id_sicli,nu_pessoa_p17,log_no,bai_no,log_no_abrev,ufe_sg,cep,cep_3d,label,end_txt
0,72055,9640151053964,BUENO BRANDAO,PARQUE RENASCE,R OPREVIO DE SOUSA FERRAZ 112,MG,37578000,375,37578,bueno brandao parque renasce r oprevio de sousa ferraz


### Concatenação

In [23]:
tfidf_sicli = pd.concat([df_sicli[['id_sicli','nu_pessoa_p17','ufe_sg','cep_3d','label']], 
                         tfidf_sicli], axis=1, join='inner')
tfidf_sicli.shape

(8800, 4006)

### Cópia para posterior comparação

In [24]:
## 
predict_df = df_sicli.copy()

In [25]:
# Trocando NaN por 0.0
tfidf_sicli = tfidf_sicli.fillna(value=0.0)
tfidf_sicli.shape

(8800, 4006)

In [26]:
tfidf_sicli.head(1)

Unnamed: 0,id_sicli,nu_pessoa_p17,ufe_sg,cep_3d,label,0,1,2,3,4,...,3991,3992,3993,3994,3995,3996,3997,3998,3999,id_bow
0,72055,9640151053964,MG,375,37578,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [27]:
# ENCODER
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(tfidf_sicli['ufe_sg'])
tfidf_sicli['uf'] = label_encoder.transform(tfidf_sicli['ufe_sg'])

tfidf_sicli.head(1)

Unnamed: 0,id_sicli,nu_pessoa_p17,ufe_sg,cep_3d,label,0,1,2,3,4,...,3992,3993,3994,3995,3996,3997,3998,3999,id_bow,uf
0,72055,9640151053964,MG,375,37578,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [29]:
# Elimina a feature cep_3d criada exclusivamente 
# para amostragem estratificada proporcional por ufe_sg e cep
tfidf_sicli = tfidf_sicli.drop(columns=['cep_3d'])
tfidf_sicli = tfidf_sicli.drop(columns=['ufe_sg'])
tfidf_sicli = tfidf_sicli.drop(columns=['id_bow'])
tfidf_sicli = tfidf_sicli.drop(columns=['id_sicli'])
tfidf_sicli = tfidf_sicli.drop(columns=['nu_pessoa_p17'])
predict_df = predict_df.drop(columns='label')

del df_sicli, df

tfidf_sicli.head(1)

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,3991,3992,3993,3994,3995,3996,3997,3998,3999,uf
0,37578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [30]:
tfidf_sicli.shape

(8800, 4002)

In [31]:
from joblib import load
predict_cep_model = load('cep_model_ncr.dat')
print('Modelo carregado: ', predict_cep_model)

Modelo carregado:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [32]:
y_test = tfidf_sicli.label
X_test = tfidf_sicli

In [33]:
cep = predict_cep_model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score
acuracia = accuracy_score(y_test, cep)
print("Acurácia: %.2f%%" % (acuracia * 100.0))

Acurácia: 23.34%


In [35]:
X_sicli = X_test
X_sicli['cep_inferido'] = cep
X_sicli[['label','cep_inferido']]

Unnamed: 0,label,cep_inferido
0,37578,37561
1,32310,32215
2,38415,38410
3,38610,38606
4,33600,33805
...,...,...
8795,30295,30295
8796,38400,38410
8797,30140,30170
8798,35179,35164


In [47]:
features_sicli = ['nu_pessoa_p17','log_no','bai_no','log_no_abrev','end_txt','uf','cep','label','cep_inferido']
resultado = pd.concat([predict_df, X_sicli], axis=1, sort=False)
resultado[features_sicli]

Unnamed: 0,nu_pessoa_p17,log_no,bai_no,log_no_abrev,end_txt,uf,cep,label,cep_inferido
0,9640151053964,BUENO BRANDAO,PARQUE RENASCE,R OPREVIO DE SOUSA FERRAZ 112,bueno brandao parque renasce r oprevio de sousa ferraz,0,37578000,37578,37561
1,7810048381781,CONTAGEM,,JOSE O FONTES 1117 ELDORADO,contagem jose o fontes eldorado,0,32310520,32310,32215
2,7460226316746,UBERLANDIA,TAIAMAN,AV JOSE P DEFENSOR 603 CS 68,uberlandia taiaman av jose p defensor cs,0,38415198,38415,38410
3,3430038794343,UNAI,CENTRO,PRACA JK SN,unai centro praca jk sn,0,38610000,38610,38606
4,50002536005,PEDRO LEOPOLDO,CENTRO,CEL JUVENTINO DIAS 466,pedro leopoldo centro cel juventino dias,0,33600000,33600,33805
...,...,...,...,...,...,...,...,...,...
8795,3840182988384,BELO HORIZONTE,TAQUARIL,R DOM HENRIQUE 145 CS A,belo horizonte taquaril r dom henrique cs a,0,30295240,30295,30295
8796,6850109302685,UBERLANDIA,APARECIDA,R ITUMBIARA 1124 AP 1201,uberlandia aparecida r itumbiara ap,0,38400617,38400,38410
8797,920071094092,BELO HORIZONTE,LOURDES,AIMORES 2441 AP 202,belo horizonte lourdes aimores ap,0,30140072,30140,30170
8798,4120118740412,SANTANA DO PAR,CIDADE NOVA,R QUARENTA E NOVE 205 AP1,santana do par cidade nova r quarenta e nove ap,0,35179000,35179,35164
