<a href="https://colab.research.google.com/github/lpoggetto/trabalho_nlp_fiap/blob/main/trabalho_nlp_fiap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Clear the working folder before anything else runs.
# WARNING: this irreversibly deletes every non-hidden entry under /content,
# including files produced by earlier runs of this notebook.
# NOTE(review): /content is presumably the Colab working directory (the notebook
# carries an "Open in Colab" badge) — do not run this cell on a local machine.
!rm -rf /content/*

# Pacotes utilizados em NLP/feature engineering
import pandas as pd
import numpy as np
from collections import defaultdict
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor

import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
!python -m spacy download pt_core_news_sm

import re
from nltk.downloader import download
from nltk.stem import RSLPStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
download('stopwords')

# pacote ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Remote CSV with the classified complaint tickets; fields are separated by ';'.
url = 'https://dados-ml-pln.s3.sa-east-1.amazonaws.com/tickets_reclamacoes_classificados.csv'
# `df` is the raw frame that every later cell reads from (and adds columns to).
df = pd.read_csv(url, sep = ';')

In [3]:
df.shape

(21072, 4)

In [4]:
df.head()

Unnamed: 0,id_reclamacao,data_abertura,categoria,descricao_reclamacao
0,3229299,2019-05-01T12:00:00-05:00,Hipotecas / Empréstimos,"Bom dia, meu nome é xxxx xxxx e agradeço se vo..."
1,3199379,2019-04-02T12:00:00-05:00,Cartão de crédito / Cartão pré-pago,Atualizei meu cartão xxxx xxxx em xx/xx/2018 e...
2,3233499,2019-05-06T12:00:00-05:00,Cartão de crédito / Cartão pré-pago,O cartão Chase foi relatado em xx/xx/2019. No ...
3,3180294,2019-03-14T12:00:00-05:00,Cartão de crédito / Cartão pré-pago,"Em xx/xx/2018, enquanto tentava reservar um ti..."
4,3224980,2019-04-27T12:00:00-05:00,Serviços de conta bancária,"Meu neto me dê cheque por {$ 1600,00} Eu depos..."


In [5]:
df['categoria'].value_counts()

Unnamed: 0_level_0,count
categoria,Unnamed: 1_level_1
Serviços de conta bancária,5161
Cartão de crédito / Cartão pré-pago,5006
Roubo / Relatório de disputa,4822
Hipotecas / Empréstimos,3850
Outros,2233


In [6]:
df.isnull().sum()

Unnamed: 0,0
id_reclamacao,0
data_abertura,0
categoria,0
descricao_reclamacao,0


### Criando funções para limpeza/normalização dos dados


In [13]:
# funcoes de limpeza

# removendo urls
def remove_urls(texto):
    """Return ``texto`` with URL-like fragments deleted.

    A fragment is any ``http``, ``www`` or ``https`` prefix together with the
    run of non-whitespace characters that follows it. Surrounding whitespace
    is left untouched.
    """
    url_regex = r'http\S+|www\S+|https\S+'
    return re.sub(url_regex, '', texto, flags=re.MULTILINE)

# removendo espacos extras
def remove_espaco_extra(texto):
    """Collapse every run of whitespace in ``texto`` into a single space.

    Leading and trailing whitespace disappears as well, because the text is
    split on whitespace and re-joined with single spaces.
    """
    pieces = texto.split()
    return ' '.join(pieces)

# funcoes de normalizacao

# todos os caracteres para caixa baixa
def to_lowercase(texto):
    """Return a lower-cased copy of ``texto``."""
    lowered = texto.lower()
    return lowered

def combined_regex(texto):
    """Normalize ``texto``: drop URLs and punctuation, mask dates and numbers.

    Steps, in the order they are applied:

    1. URL-like fragments (``http...``, ``www...``, ``https...``) are removed.
    2. Dates written as ``dd/mm/yyyy`` become the placeholder ``[DATA]``.
    3. In the remaining text, every character that is neither a word
       character nor whitespace is removed.
    4. Every run of digits becomes the placeholder ``[NUMERO]``.

    The order matters: if punctuation were stripped or digits masked first,
    the ``/`` separators and the digits of a date would already be gone and
    the date pattern could never match.

    Args:
        texto: Raw input string.

    Returns:
        The normalized string. Whitespace is not collapsed here.
    """
    # URLs go first, while "://", "." and "/" are still present in the text.
    texto = re.sub(r'http\S+|www\S+|https\S+', '', texto, flags=re.MULTILINE)

    def _clean_segment(segment):
        # Strip special characters, then mask whatever digit runs remain.
        segment = re.sub(r'[^\w\s]', '', segment)
        return re.sub(r'\d+', '[NUMERO]', segment)

    # Splitting on the date pattern keeps the dates out of the punctuation and
    # number passes; re-joining with the placeholder puts '[DATA]' exactly
    # where each date used to be (brackets intact).
    segments = re.split(r'\d{2}/\d{2}/\d{4}', texto)
    return '[DATA]'.join(_clean_segment(segment) for segment in segments)

# removendo stopwords
stopwords_pt = set(stopwords.words('portuguese'))

def remove_stopwords(texto):
    """Drop every whitespace-separated word of ``texto`` found in ``stopwords_pt``.

    The comparison is exact (case-sensitive); the surviving words are joined
    back with single spaces.
    """
    kept = []
    for candidate in texto.split():
        if candidate in stopwords_pt:
            continue
        kept.append(candidate)
    return " ".join(kept)

# Lematizacao
nlp = spacy.load("pt_core_news_sm")

def lemmatize(texto):
    """Replace each token of ``texto`` by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    ``lemma_`` of every resulting token is joined with single spaces.

    Args:
        texto: Input string.

    Returns:
        A string with one lemma per token, separated by spaces.
    """
    # This function used to be defined twice in a row with identical bodies;
    # a single definition is kept.
    doc = nlp(texto)
    return " ".join(token.lemma_ for token in doc)

def ngrams_calc(texto, n=2):
    """Count the word n-grams of ``texto``.

    Args:
        texto: Input string; words are obtained by splitting on whitespace.
        n: Number of consecutive words per n-gram (default 2, i.e. bigrams).

    Returns:
        A plain ``dict`` mapping each n-gram (words joined by one space) to the
        number of times it occurs, in order of first appearance.
    """
    words = texto.split()
    counts = {}
    last_start = len(words) - n + 1
    for start in range(last_start):
        key = ' '.join(words[start:start + n])
        counts[key] = counts.get(key, 0) + 1
    return counts


def preprocessamento(texto):
    """Run the text-cleaning pipeline on a single string.

    The steps run in this order: whitespace normalization, lower-casing,
    stop-word removal, lemmatization. ``combined_regex`` is intentionally not
    part of this version of the pipeline.
    """
    # texto = combined_regex(texto)
    steps = (remove_espaco_extra, to_lowercase, remove_stopwords, lemmatize)
    for step in steps:
        texto = step(texto)
    return texto

def preprocessamento_ngrams(texto):
    """Run the text-cleaning pipeline and return the cleaned text.

    The steps are whitespace normalization, lower-casing, stop-word removal
    and lemmatization, in that order.

    This version used to compute bigram counts with ``ngrams_calc`` and then
    throw them away; that dead computation has been removed. The return value
    is unchanged: the cleaned string only.

    NOTE(review): despite its name, this definition does not return n-grams.
    The redefinition of ``preprocessamento_ngrams`` in a later cell returns a
    ``(texto, ngrams)`` tuple instead; callers that need the counts with this
    version should call ``ngrams_calc`` on the returned text.

    Args:
        texto: Raw input string.

    Returns:
        The cleaned, lemmatized string.
    """
    # texto = combined_regex(texto)
    texto = remove_espaco_extra(texto)
    texto = to_lowercase(texto)
    texto = remove_stopwords(texto)
    texto = lemmatize(texto)
    return texto


In [28]:
# Initialize spaCy once with disabled components we don't need.
# This rebinds the global `nlp`, which an earlier cell loaded without disabling anything.
nlp = spacy.load("pt_core_news_sm", disable=["parser", "ner"])

# Pre-compile all regex patterns for faster execution
# URL-like fragment: an 'http' / 'www' / 'https' prefix plus the non-whitespace run after it.
# (re.MULTILINE only changes how ^ and $ behave; this pattern uses neither.)
URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
# Any single character that is neither a word character nor whitespace.
SPECIAL_CHARS_PATTERN = re.compile(r'[^\w\s]')
# One or more consecutive digits.
NUMBERS_PATTERN = re.compile(r'\d+')
# Dates written as two digits / two digits / four digits.
DATES_PATTERN = re.compile(r'\d{2}/\d{2}/\d{4}')
# One or more consecutive whitespace characters.
WHITESPACE_PATTERN = re.compile(r'\s+')

# Convert stopwords to a frozen set for faster lookups
# NOTE(review): STOP_WORDS is the spaCy Portuguese list imported at the top of the notebook,
# whereas the earlier cell builds `stopwords_pt` from NLTK's Portuguese list — the two
# stop-word sets are not guaranteed to be the same; confirm which one is intended.
STOPWORDS_PT = frozenset(STOP_WORDS)

def remove_urls(texto):
    """Delete every match of the pre-compiled ``URL_PATTERN`` from ``texto``."""
    without_urls = re.sub(URL_PATTERN, '', texto)
    return without_urls

def remove_espaco_extra(texto):
    """Replace each match of ``WHITESPACE_PATTERN`` with one space, then trim both ends."""
    collapsed = re.sub(WHITESPACE_PATTERN, ' ', texto)
    return collapsed.strip()

def combined_regex(texto):
    """Normalize ``texto`` with the pre-compiled module-level patterns.

    Steps, in the order they are applied:

    1. Matches of ``URL_PATTERN`` are removed.
    2. Matches of ``DATES_PATTERN`` become the placeholder ``[DATA]``.
    3. In the remaining text, matches of ``SPECIAL_CHARS_PATTERN`` are removed.
    4. Matches of ``NUMBERS_PATTERN`` become the placeholder ``[NUMERO]``.

    The order matters: if special characters were stripped or digits masked
    first, the ``/`` separators and the digits of a date would already be gone
    and ``DATES_PATTERN`` could never match.

    Args:
        texto: Raw input string.

    Returns:
        The normalized string. Whitespace is not collapsed here.
    """
    # URLs go first, while "://", "." and "/" are still present in the text.
    texto = URL_PATTERN.sub('', texto)

    def _clean_segment(segment):
        # Strip special characters, then mask whatever digit runs remain.
        segment = SPECIAL_CHARS_PATTERN.sub('', segment)
        return NUMBERS_PATTERN.sub('[NUMERO]', segment)

    # Splitting on dates keeps them out of the punctuation and number passes;
    # re-joining with the placeholder puts '[DATA]' where each date used to be.
    segments = DATES_PATTERN.split(texto)
    return '[DATA]'.join(_clean_segment(segment) for segment in segments)

@lru_cache(maxsize=10000)
def cached_lemmatize(texto):
    """Lemmatize ``texto`` with the global ``nlp`` pipeline, memoizing by input string.

    Results for up to 10000 distinct inputs are kept by ``lru_cache``, so a
    repeated text is not sent through ``nlp`` again.
    """
    lemmas = [token.lemma_ for token in nlp(texto)]
    return " ".join(lemmas)

def remove_stopwords(texto):
    """Keep only the whitespace-separated words of ``texto`` that are not in ``STOPWORDS_PT``.

    Membership is tested word by word (exact, case-sensitive match); the
    surviving words are joined with single spaces.
    """
    kept = [candidate for candidate in texto.split() if candidate not in STOPWORDS_PT]
    return " ".join(kept)

def preprocessamento(texto):
    """Clean a single text.

    Order of operations: ``combined_regex`` -> ``remove_espaco_extra`` ->
    lower-casing -> ``remove_stopwords`` -> ``cached_lemmatize``.
    """
    normalized = remove_espaco_extra(combined_regex(texto))
    without_stopwords = remove_stopwords(normalized.lower())
    return cached_lemmatize(without_stopwords)

def ngrams_calc(texto, n=2):
    """Count the word n-grams of ``texto``.

    Args:
        texto: Input string; words are obtained by splitting on whitespace.
        n: Number of consecutive words per n-gram (default 2, i.e. bigrams).

    Returns:
        A plain ``dict`` mapping each n-gram (words joined by one space) to the
        number of times it occurs. Empty when the text has fewer than ``n``
        words.
    """
    tokens = texto.split()
    ngrams = defaultdict(int)
    # Accumulate explicitly. Feeding (ngram, 1) pairs to dict.update() would
    # overwrite the stored value each time, leaving every count at 1 no matter
    # how often the n-gram repeats.
    for i in range(len(tokens) - n + 1):
        ngrams[' '.join(tokens[i:i + n])] += 1
    return dict(ngrams)

def preprocessamento_ngrams(texto):
    """Clean a single text and count its bigrams.

    The cleaning steps are ``combined_regex`` -> ``remove_espaco_extra`` ->
    lower-casing -> ``remove_stopwords`` -> ``cached_lemmatize``.

    Returns:
        A ``(cleaned_text, bigram_counts)`` tuple, where ``bigram_counts`` is
        the result of ``ngrams_calc(cleaned_text, n=2)``.
    """
    normalized = remove_espaco_extra(combined_regex(texto))
    without_stopwords = remove_stopwords(normalized.lower())
    cleaned = cached_lemmatize(without_stopwords)
    bigram_counts = ngrams_calc(cleaned, n=2)
    return cleaned, bigram_counts

def _map_in_batches(func, texts, batch_size):
    """Apply ``func`` to every item of ``texts``, one worker task per batch.

    Items are grouped into consecutive slices of ``batch_size`` elements; each
    slice is handled by a single thread-pool task. Results come back in input
    order.
    """
    items = list(texts)
    # Values below 1 (or None) fall back to one item per task, so any argument
    # that was accepted while the parameter was ignored still works.
    size = max(1, int(batch_size or 1))
    batches = [items[start:start + size] for start in range(0, len(items), size)]

    def _run_batch(batch):
        return [func(item) for item in batch]

    results = []
    with ThreadPoolExecutor() as executor:
        # executor.map yields batch results in submission order, which keeps
        # the flattened output aligned with the input.
        for partial in executor.map(_run_batch, batches):
            results.extend(partial)
    return results


def batch_preprocess(texts, batch_size=1000):
    """Apply ``preprocessamento`` to every text using a thread pool.

    Args:
        texts: Iterable of strings.
        batch_size: Number of texts handled by each worker task. The parameter
            used to be accepted but ignored (one task per text); it is now
            honoured.

    Returns:
        A list with one cleaned string per input text, in input order.

    NOTE(review): whether threads actually speed this up depends on how much
    of the per-text work releases the GIL — worth timing against a plain loop.
    """
    return _map_in_batches(preprocessamento, texts, batch_size)


def batch_preprocess_ngrams(texts, batch_size=1000):
    """Apply ``preprocessamento_ngrams`` to every text using a thread pool.

    Args:
        texts: Iterable of strings.
        batch_size: Number of texts handled by each worker task (previously
            ignored).

    Returns:
        A list with one ``preprocessamento_ngrams`` result per input text, in
        input order.
    """
    return _map_in_batches(preprocessamento_ngrams, texts, batch_size)

In [16]:
# For regular preprocessing
# NOTE(review): this cell's execution count (16) is lower than that of the cell defining
# batch_preprocess (28), so the stored outputs were not produced by a top-to-bottom run.
df['descricao_reclamacao_teste'] = batch_preprocess(df['descricao_reclamacao'].tolist())

# For ngrams preprocessing
# Each element of this column is whatever preprocessamento_ngrams returns; with the
# definition in the cell directly above, that is a (text, ngram-counts dict) tuple.
df['descricao_reclamacao_teste_ngrams'] = batch_preprocess_ngrams(df['descricao_reclamacao'].tolist())

In [29]:
# 2. Encode the categories (target variable)
le = LabelEncoder()
df['categoria_encoded'] = le.fit_transform(df['categoria'])

# 3. Split the data
X = df['descricao_reclamacao_teste']  # preprocessed texts
y = df['categoria_encoded']  # numerically encoded categories

# 75/25 split, stratified on the label so both sides keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y
                                                    )

# 4. Build the classification pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # TF-IDF vectorization
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))  # classifier
])

# 5. Train the model
model.fit(X_train, y_train)

# 6. Evaluate the model
y_pred = model.predict(X_test)
# le.classes_ supplies the original category names for the report rows.
print(classification_report(y_test, y_pred, target_names=le.classes_))



                                     precision    recall  f1-score   support

Cartão de crédito / Cartão pré-pago       0.86      0.89      0.88      1252
            Hipotecas / Empréstimos       0.88      0.89      0.88       962
                             Outros       0.91      0.61      0.73       558
       Roubo / Relatório de disputa       0.85      0.84      0.85      1206
         Serviços de conta bancária       0.83      0.92      0.87      1290

                           accuracy                           0.86      5268
                          macro avg       0.87      0.83      0.84      5268
                       weighted avg       0.86      0.86      0.85      5268



In [27]:
# 7. Example prediction for a new text
novo_texto = "Roubaram o meu celular e invadiram minha conta, levaram 1000 reais e gostaria de reaver o valor perdido visto que não a transação não foi autorizada"
# The new text must go through the same preprocessing as the training texts.
novo_texto_processado = preprocessamento(novo_texto)
categoria_predita = model.predict([novo_texto_processado])
# Map the numeric prediction back to its category name.
# NOTE(review): this cell's execution count (27) precedes the preprocessing and training
# cells (28, 29), so the stored output may come from an earlier model — re-run to confirm.
print(f"Categoria prevista: {le.inverse_transform(categoria_predita)[0]}")

Categoria prevista: Serviços de conta bancária


In [31]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline

# # 1. Carregar um modelo de embeddings (ex: BERT em português)
# embedder = SentenceTransformer('neuralmind/bert-base-portuguese-cased')

# # 2. Gerar embeddings para os textos pré-processados
# X_embeddings = embedder.encode(df['descricao_reclamacao_teste'].tolist(),
#                               show_progress_bar=True)

# # 3. Codificar as categorias
# le = LabelEncoder()
# y = le.fit_transform(df['categoria'])

# # 4. Dividir dados
# X_train, X_test, y_train, y_test = train_test_split(
#     X_embeddings, y, test_size=0.2, random_state=42, stratify=y
# )

# # 5. Treinar classificador tradicional nos embeddings
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

# # 6. Avaliar
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred, target_names=le.classes_))



Batches:   0%|          | 0/659 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [34]:
# NOTE(review): this import sits in the middle of the notebook; consider moving it to the
# import cell at the top so a fresh run surfaces the dependency early.
from transformers import pipeline

# Load a zero-shot classification model
# (presumably multilingual, judging by the checkpoint name — TODO confirm Portuguese coverage)
classifier = pipeline("zero-shot-classification",
                      model="joeddav/xlm-roberta-large-xnli")

# Candidate labels: the distinct categories present in the dataset
candidate_labels = list(df['categoria'].unique())

# Classification example
result = classifier(
    "Roubaram o meu celular e invadiram minha conta, levaram 1000 reais e gostaria de reaver o valor perdido visto que não a transação não foi autorizada",
    candidate_labels=candidate_labels,
    multi_label=False
)

# The first label/score pair is reported as the prediction.
print(f"Categoria prevista: {result['labels'][0]} (confiança: {result['scores'][0]:.2f})")

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Categoria prevista: Roubo / Relatório de disputa (confiança: 0.78)
