In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importando bibliotecas

In [2]:
!pip install gensim==4.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import gensim
import nltk

# Carregando dataset

In [4]:
df = pd.read_csv('/content/drive/Shareddrives/Aprendizado por reforço/NLP/Dataset/dataset_projeto_final.csv')
df

Unnamed: 0,text,label
0,Impossibilitado de arcar com as custas process...,Justiça gratuita
1,"Com efeito , faz jus o Autor à concessão da gr...",Justiça gratuita
2,"Como é cediço , de acordo com o referido diplo...",Justiça gratuita
3,"Por tais razões , e por imperiosa necessidade ...",Justiça gratuita
4,O Autor entabulou com o Banco Réu o Contrato d...,Financiamento veículo
...,...,...
64168,"Neste sentido , no que se refere à Tarifa de C...",Jurisprudência
64169,"Como também , tal entendimento foi pacificado ...",Jurisprudência
64170,“ A COBRANÇA A TÍTULO DE COMISSÃO DO CORRESPON...,Jurisprudência
64171,Assim já decidiu o STJ : RECURSO ESPECIAL REPE...,Jurisprudência


In [5]:
# As classes 'Notificação de inscrição' e 'Inscrição indevida' são parecidas, então optamos por juntá-las em uma só
df['label'] = df['label'].replace(['Notificação de inscrição'], 'Inscrição indevida')

#Pré-processamento

In [6]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
top = stopwords.words('portuguese')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (top)]))
symbols = "!\"#$%&()*+-,./:´~;<=>?[\]^`{|}~\n..."
df["text"] = df["text"].str.translate({ord(symbol):"" for symbol in symbols})
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

In [8]:
df['text_clean'] = df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [9]:
possible_labels = df.label.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

label_dict
df['label'] = df.label.replace(label_dict)

#Separando o dataset em treino e teste

In [10]:
X_train, X_test, y_train, y_test = train_test_split (df['text_clean'], df['label'] , test_size=0.2, random_state=17, stratify = df['label'])

# Treinando o word2vec

In [11]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=300,
                                   window=20,
                                   min_count=3,
                                   sg = 1)

##  Convertendo o texto nos embeddings

In [12]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [13]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

# Treinando modelo

In [14]:
# Arvore de decisão
dec_tree = RandomForestClassifier()
dec_tree.fit(X_train_vect_avg, y_train.values.ravel())

RandomForestClassifier()

# Testando no dataset de teste

In [15]:
y_pred = dec_tree.predict(X_test_vect_avg)

In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       544
           1       0.81      0.35      0.49       112
           2       0.55      0.84      0.67      1769
           3       0.73      0.25      0.37       341
           4       0.83      0.74      0.79       325
           5       0.94      0.88      0.91       730
           6       0.70      0.62      0.66       294
           7       0.75      0.91      0.82      1856
           8       0.80      0.68      0.73       785
           9       0.85      0.81      0.83       922
          10       0.61      0.54      0.57       147
          11       0.72      0.31      0.44       380
          12       0.72      0.55      0.63       689
          13       0.89      0.63      0.74       156
          14       0.90      0.87      0.89       973
          15       0.99      0.99      0.99       566
          16       0.75      0.17      0.27        18
          17       0.79    