In [1]:
import pickle
from joblib import dump, load
import re
import string
from nltk.corpus import stopwords
# sklearn.__version__ == 0.23.2

In [2]:
def clean_doc(doc=None, string_=True):
    ''' # turn a doc into clean tokens

    doc     -- text to tokenise; None or an empty/blank string yields no tokens
    string_ -- when True (default) the tokens are returned joined by single
               spaces, otherwise the list of tokens itself is returned

    Steps: split on white space, strip ASCII punctuation from every token,
    drop tokens made only of digits, lower-case, drop Portuguese stop words
    and drop tokens with fewer than two characters.
    '''
    # split into tokens by white space (a missing/empty doc has no tokens)
    tokens = doc.split() if doc else []
    if tokens:
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in tokens]
        # drop purely numeric tokens and lower-case what remains
        # (tokens mixing letters and digits are kept)
        tokens = [word.lower() for word in tokens if not word.isdigit()]
        # filter out stop words
        stop_words = set(stopwords.words('portuguese'))
        # NOTE(review): tokens come from split() and never contain white
        # space, so this entry cannot match any token -- confirm whether
        # 'br' was the intended stop word.
        stop_words.update(['<br />'])
        # filter out stop words and short tokens in a single pass
        tokens = [w for w in tokens if w not in stop_words and len(w) > 1]
    return " ".join(tokens) if string_ else tokens
    
def load_doc(filename, encoding=None):
    ''' # load doc into memory

    filename -- path of the text file to read
    encoding -- optional text encoding forwarded to open(); the default None
                keeps the platform's default encoding

    Returns the whole file content as a single string.
    '''
    # the context manager closes the handle even when read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

def predict_ml_classe(lc, text, l_vocab, pipe):
    ''' # predict the class of a description with a fitted pipeline

    lc      -- value placed in front of the text tokens when, after str()
               conversion and punctuation removal, it is in the vocabulary
    text    -- free-text description, tokenised with clean_doc
    l_vocab -- vocabulary used to filter tokens: a collection of tokens, or
               the raw white-space separated vocabulary text
    pipe    -- object exposing predict(); it receives the list of tokens

    Returns the first element of the result of pipe.predict.
    '''
    # With a raw string, `in` would be a substring test (and '' would always
    # match); turn it into a set so membership means "is a vocabulary token".
    if isinstance(l_vocab, str):
        l_vocab = set(l_vocab.split())
    # keep only the cleaned tokens known to the vocabulary
    tokens = [w for w in clean_doc(text, False) if w in l_vocab]
    # strip everything except word characters and white space from lc
    lc_clean = re.sub(r'[^\w\s]', '', str(lc))
    # an empty lc contributes nothing; a known one leads the token list
    if lc_clean and lc_clean in l_vocab:
        tokens.insert(0, lc_clean)
    # NOTE(review): the token list is handed to predict() as-is and only the
    # first prediction is returned -- confirm the pipeline expects a list of
    # tokens rather than a list of documents, and decide what should happen
    # when no token survives the filtering.
    return pipe.predict(tokens)[0]

In [3]:
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# come from a trusted source.
with open(r"../best_models/ML_model.pickle", 'rb') as handle:
    loaded_pipe = pickle.load(handle)
# vocabulary: keep it as a set of tokens so that `token in loaded_vocab` is an
# exact-match lookup instead of a substring search over the whole file text
# (assumes vocab.txt holds white-space separated tokens -- TODO confirm)
loaded_vocab = set(load_doc(r'../best_models/vocab.txt').split())

In [4]:
# IT service description - class 25
texto = 'desenvolvimento de sistemas'
cd = ''  # empty lc: only the description tokens reach the model
classe = predict_ml_classe(lc=cd, text=texto, l_vocab=loaded_vocab, pipe=loaded_pipe)
print(f'{texto}: classe {classe}')

desenvolvimento de sistemas: classe 25


In [5]:
# Hotel/lodging service description - class 40
texto = 'hospedagem'
cd = ''  # empty lc: only the description tokens reach the model
classe = predict_ml_classe(lc=cd, text=texto, l_vocab=loaded_vocab, pipe=loaded_pipe)
print(f'{texto}: classe {classe}')

hospedagem: classe 40


In [6]:
# Civil construction work service description
# (CNAE groups 412, 432, 433, 439) - class 90
texto = 'obra de construção civil'
cd = ''  # empty lc: only the description tokens reach the model
classe = predict_ml_classe(lc=cd, text=texto, l_vocab=loaded_vocab, pipe=loaded_pipe)
print(f'{texto}: classe {classe}')

obra de construção civil: classe 90
