In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd
import re
import string
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
import time

In [2]:
def get_data(nome_arquivo,shuffle):
    """Read a UTF-8 CSV into a DataFrame, optionally in random row order.

    Parameters
    ----------
    nome_arquivo : path of the CSV file; it must contain a ``texto`` column.
    shuffle : when truthy, rows are reordered with ``sample(frac=1)``
        (no random seed is set here, so the order changes between runs).

    Returns
    -------
    The DataFrame, with every ``texto`` value passed through ``str``.
    """
    frame = pd.read_csv(nome_arquivo, encoding="utf-8")
    frame = frame.sample(frac=1) if shuffle else frame
    # Force plain strings so the text-cleaning helpers can call str methods.
    frame['texto'] = frame['texto'].apply(str)
    return frame

In [3]:
stop = set(stopwords.words("portuguese"))

def remove_stopwords(text):
    """Lower-case ``text`` and drop every whitespace-separated token in ``stop``.

    The surviving tokens are re-joined with single spaces.
    """
    lowered = (token.lower() for token in text.split())
    return " ".join(token for token in lowered if token not in stop)

def remove_URL(text):
    """Delete every ``http://``/``https://`` or ``www.`` URL from ``text``.

    A URL runs up to the next whitespace; surrounding spaces are left as-is.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

# Deletion table for remove_punct, built once instead of on every call.
# The hyphen is not in this set; remove_hifen handles it separately.
_PUNCT_TABLE = str.maketrans("", "", '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~º')

def remove_punct(text):
    """Return ``text`` with punctuation characters (and ``º``) deleted.

    Characters are removed, not replaced by spaces, so ``"a,b"`` becomes
    ``"ab"``. Hyphens are kept.
    """
    return text.translate(_PUNCT_TABLE)

def remove_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    return ''.join(ch for ch in text if not ch.isdigit())

def remove_hifen(text):
    """Replace each hyphen in ``text`` with a single space."""
    return text.replace('-', ' ')

In [4]:
# Same URL shape that remove_URL strips. The previous pattern lacked the ":"
# after the scheme ("https?//"), so it could never match a real URL.
pattern = re.compile(r"https?://\S+|www\.\S+")
def pat(df_t):
    """Debug helper: show the first text in ``df_t.texto`` that contains a URL.

    For each URL found in that text, prints the original text, the URL, and
    the text with all URLs removed. Texts without a URL are skipped, and the
    scan stops after the first text that has one. Returns None.
    """
    for t in df_t.texto:
        matches = pattern.findall(t)
        if not matches:
            continue
        for match in matches:
            print(t)
            print(match)
            print(pattern.sub(r"", t))
        # Only the first text with a URL is shown.
        break

In [5]:
def make_test(df_t):
    """Clean the ``texto`` column of ``df_t`` in place; returns None.

    Steps run in this order: URL removal, punctuation removal, hyphen -> space,
    stop-word removal (which also lower-cases). ``remove_numbers`` is not part
    of the pipeline, so digits are kept.
    """
    for step in (remove_URL, remove_punct, remove_hifen, remove_stopwords):
        df_t["texto"] = df_t.texto.map(step)

In [6]:
def counter_word(text_col):
    """Count whitespace-separated words across all texts of ``text_col``.

    ``text_col`` must expose ``.values`` yielding strings (e.g. a pandas
    Series). Returns a ``collections.Counter`` mapping word -> occurrences.
    """
    tally = Counter()
    for document in text_col.values:
        tally.update(document.split())
    return tally

In [7]:
def data_split(df,size):
    """Split ``df`` by row position into a leading and a trailing part.

    ``size`` is the fraction of rows that goes to the first part; the row
    count is truncated with ``int``. Returns ``(first_part, remainder)``.
    No shuffling happens here.
    """
    cut = int(df.shape[0]*size)
    return df[:cut], df[cut:]

In [8]:
def data_to_numpy(df, size=0.8):
    """Split ``df`` by position and return texts/labels as NumPy arrays.

    Previously the ``df`` argument was ignored and the function read the
    notebook globals ``train_df``/``val_df``. It now derives both parts from
    ``df`` itself, using the same positional rule as ``data_split``.

    Parameters
    ----------
    df : DataFrame with ``texto`` and ``valor`` columns.
    size : fraction of rows used for training. The default 0.8 matches the
        ``data_split(df, 0.8)`` call made in this notebook.

    Returns
    -------
    ``(train_sentences, train_labels, val_sentences, val_labels)``
    """
    train_size = int(df.shape[0]*size)
    train_part, val_part = df[:train_size], df[train_size:]
    return (
        train_part.texto.to_numpy(),
        train_part.valor.to_numpy(),
        val_part.texto.to_numpy(),
        val_part.valor.to_numpy(),
    )

In [28]:
def prepare(teste):
    """Clean a single text with the same steps ``make_test`` applies per row.

    Order: URL removal, punctuation removal, hyphen -> space, stop-word
    removal. Returns the cleaned string.
    """
    for step in (remove_URL, remove_punct, remove_hifen, remove_stopwords):
        teste = step(teste)
    return teste

def predict(teste):
    """Run the global ``model`` on ``teste`` and print the first predicted class.

    The previous version printed "Eletronica" for any non-zero class index
    and "Elétrica" for index 0, which is wrong for a model trained on
    labels 1..8. The class index is now mapped to its discipline name.

    Parameters
    ----------
    teste : input converted with ``np.array`` and passed to ``model.predict``.
        NOTE(review): presumably padded token sequences, as used for
        training - confirm against the caller.

    Returns
    -------
    ``(predictions, p1)``: the raw model output and the per-row argmax list.
    """
    # Label -> discipline, as listed next to the per-class counts in this notebook.
    disciplinas = {
        1: "Eletronica",
        2: "Direito",
        3: "Elétrica",
        4: "Odontologia",
        5: "Computação",
        6: "Geografia",
        7: "Ambiental",
        8: "Mecanica",
    }
    predictions = model.predict(np.array(teste))
    p1 = [np.argmax(element) for element in predictions]
    # Index 0 (or anything unmapped) has no discipline assigned.
    print("Disciplina:", disciplinas.get(int(p1[0]), f"desconhecida ({p1[0]})"))
    return predictions,p1

def tokenization(df):
    """Fit the global ``tokenizer`` and encode the train/validation texts.

    NOTE(review): ``df`` is never read. The function works on the notebook
    globals ``tokenizer``, ``train_sentences`` and ``val_sentences``, so those
    must exist before it is called.

    The tokenizer is fitted on the training sentences only.

    Returns
    -------
    ``(train_sequences, val_sequences, word_index)``
    """
    tokenizer.fit_on_texts(train_sentences)
    vocabulary = tokenizer.word_index
    encoded_train, encoded_val = (
        tokenizer.texts_to_sequences(texts)
        for texts in (train_sentences, val_sentences)
    )
    return encoded_train, encoded_val, vocabulary

def decode(sequence):
    """Turn a sequence of token ids back into a space-separated string.

    Ids missing from the global ``reverse_word_index`` are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in sequence)
    return " ".join(words)

def precision(test_labels,predictions):
    """Percentage of positions where the prediction equals the label.

    Despite its name this is plain accuracy: ``correct / len(test_labels) * 100``.
    Pairs are formed with ``zip``, so extra items in the longer input are
    ignored when counting matches.

    Returns 0.0 when ``test_labels`` is empty (previously ZeroDivisionError).
    """
    counter = len(test_labels)
    if counter == 0:
        return 0.0
    hits = sum(1 for i, j in zip(predictions, test_labels) if i == j)
    return hits/counter*100

In [10]:
# Load the dataset with shuffling enabled (the second argument is the truthy
# shuffle flag). get_data sets no random seed, so row order differs per run.
df = get_data('train.csv',1)

In [11]:
# (rows, columns) of the loaded frame.
df.shape

(1420, 2)

In [12]:
# Preview the first rows: `valor` is the class label, `texto` the raw text.
df.head()

Unnamed: 0,valor,texto
21,1,Muitos métodos foram desenvolvidos para o agru...
372,2,A Constituição Federal do Brasil garante a tod...
1283,8,"Atualmente, as empresas estão inseridas num am..."
176,1,Fora do ramo principal das atividades de engen...
1387,8,O trabalho consiste em uma pesquisa realizada ...


In [13]:
# Number of samples per class label in the `valor` column, printed in label order:
# 1 electronics, 2 law, 3 electrical, 4 dentistry, 5 computing,
# 6 geography, 7 environmental, 8 mechanical
for class_label in range(1, 9):
    print((df.valor==class_label).sum())

200
198
200
103
120
199
200
200


In [14]:
# Print the first text that contains a URL (debug aid), then clean the
# `texto` column. make_test mutates `df` in place, so re-running this cell
# applies the cleaning to already-cleaned text.
pat(df)
make_test(df)

In [15]:
# Word frequencies over the whole cleaned dataset (training and validation rows).
counter = counter_word(df.texto)
# Vocabulary size; reused below for the Tokenizer and the Embedding layer.
num_unique_words = len(counter)
counter.most_common(5)

[('trabalho', 1484),
 ('sistema', 928),
 ('estudo', 784),
 ('análise', 671),
 ('ser', 662)]

In [16]:
# Positional 80/20 split: the first 80% of rows train, the rest validate.
train_df, val_df = data_split(df,0.8)

print(len(train_df))
print(len(val_df))

1136
284


In [17]:
# Extract texts and labels of both splits as NumPy arrays.
train_sentences, train_labels, val_sentences, val_labels = data_to_numpy(df)
train_sentences.shape, val_sentences.shape

((1136,), (284,))

In [18]:
# Tokenizer limited to the vocabulary size counted above; words it has not
# seen during fitting are encoded with the "<OOV>" token.
tokenizer = Tokenizer(num_words = num_unique_words,oov_token="<OOV>")
# tokenization() reads the global tokenizer / train_sentences / val_sentences.
train_sequences, val_sequences, word_index = tokenization(df)

In [49]:
# Every sequence is forced to exactly this many tokens.
max_length = 500

# padding="post" appends zeros to short sequences; truncating="post" cuts
# long ones at the end.
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding = "post",truncating = "post")
val_padded = pad_sequences(val_sequences, maxlen = max_length, padding = "post", truncating = "post")
train_padded.shape, val_padded.shape

((1136, 500), (284, 500))

In [50]:
# Inverse vocabulary (token id -> word), used by decode().
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [51]:
# Text classifier: embedding -> average over the sequence -> dense stack -> softmax.
model = keras.models.Sequential()
# One 32-dimensional vector per token id; input is max_length ids per sample.
model.add(layers.Embedding(num_unique_words, 32, input_length = max_length))

#model.add(layers.LSTM(32,dropout = 0.1))
# Collapse the sequence axis by averaging the token embeddings.
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(128, activation = "relu"))
model.add(layers.Dense(720, activation = "relu"))
model.add(layers.Dense(720, activation = "relu"))
model.add(layers.Dense(128, activation = "relu"))
# 9 outputs: the class labels counted earlier run 1..8, so output index 0
# has no class assigned. NOTE(review): confirm no label 0 exists in the data.
model.add(layers.Dense(9, activation = "softmax"))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 32)           716288    
_________________________________________________________________
global_average_pooling1d_6 ( (None, 32)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 128)               4224      
_________________________________________________________________
dense_30 (Dense)             (None, 720)               92880     
_________________________________________________________________
dense_31 (Dense)             (None, 720)               519120    
_________________________________________________________________
dense_32 (Dense)             (None, 128)               92288     
_________________________________________________________________
dense_33 (Dense)             (None, 9)                

In [54]:
# Integer class labels with softmax outputs: sparse categorical cross-entropy
# on probabilities (from_logits=False).
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optim = keras.optimizers.Adamax(learning_rate=0.001)
metrics = ["accuracy"]

# Pass the loss object built above. The previous loss='rmsprop' named an
# optimizer, not a loss function, and left `loss` unused.
model.compile(loss=loss, optimizer = optim, metrics = metrics)

In [55]:
# Time the training run with a wall-clock performance counter.
start = time.perf_counter()
# 50 epochs, evaluated on the validation split after each one; verbose=2
# prints one summary line per epoch.
model.fit(train_padded,train_labels, epochs = 50, validation_data=(val_padded,val_labels), verbose=2)
finish = time.perf_counter()
print(f'\nFinished in {round(finish-start, 2)} second(s)')

Epoch 1/50
36/36 - 0s - loss: 1.1778 - accuracy: 0.7324 - val_loss: 5.1504 - val_accuracy: 0.3345
Epoch 2/50
36/36 - 0s - loss: 0.1940 - accuracy: 0.9648 - val_loss: 5.2104 - val_accuracy: 0.3592
Epoch 3/50
36/36 - 0s - loss: 0.1583 - accuracy: 0.9727 - val_loss: 5.1724 - val_accuracy: 0.3380
Epoch 4/50
36/36 - 0s - loss: 0.1488 - accuracy: 0.9745 - val_loss: 5.2649 - val_accuracy: 0.3556
Epoch 5/50
36/36 - 0s - loss: 0.1380 - accuracy: 0.9771 - val_loss: 5.2917 - val_accuracy: 0.3451
Epoch 6/50
36/36 - 0s - loss: 0.1329 - accuracy: 0.9745 - val_loss: 5.3533 - val_accuracy: 0.3521
Epoch 7/50
36/36 - 0s - loss: 0.1250 - accuracy: 0.9780 - val_loss: 5.4020 - val_accuracy: 0.3345
Epoch 8/50
36/36 - 0s - loss: 0.1229 - accuracy: 0.9798 - val_loss: 5.5014 - val_accuracy: 0.3556
Epoch 9/50
36/36 - 0s - loss: 0.1190 - accuracy: 0.9798 - val_loss: 5.5283 - val_accuracy: 0.3451
Epoch 10/50
36/36 - 0s - loss: 0.1174 - accuracy: 0.9789 - val_loss: 5.5909 - val_accuracy: 0.3451
Epoch 11/50
36/36 -

In [24]:
# Predicted class index (argmax over the softmax outputs) for every validation sample.
predictions = [np.argmax(element) for element in model.predict(val_padded)]

In [25]:
# Spot-check one validation sample, then print all true labels followed by
# all predicted labels for a side-by-side look.
print("Label: ",val_labels[64])
print("Resultado: ",predictions[64],'\n')
print(val_labels,'\n')
print(predictions)

Label:  4
Resultado:  4 

[3 2 5 8 6 7 7 3 3 6 8 1 6 8 1 8 8 6 4 1 2 8 3 6 3 7 3 8 8 3 3 7 4 7 7 2 7
 4 1 2 7 3 5 3 5 2 7 7 2 4 6 8 8 4 6 5 3 8 1 4 7 5 1 1 4 2 2 7 3 3 7 7 6 5
 7 3 6 7 4 1 6 3 7 6 4 1 6 1 2 1 2 7 7 6 3 7 5 1 6 1 2 2 7 4 5 2 6 3 7 1 3
 6 2 3 3 1 7 7 5 4 7 1 2 1 8 1 2 3 8 4 2 6 3 6 8 1 3 4 7 7 7 2 6 5 6 7 2 1
 5 6 7 8 8 3 8 3 7 1 4 6 8 8 3 3 1 8 1 2 8 3 2 6 5 3 4 7 6 5 1 2 3 7 8 6 7
 8 3 4 1 1 6 3 2 3 8 6 6 6 1 7 1 5 6 6 2 8 5 7 1 8 3 1 1 2 5 7 3 2 7 2 7 8
 2 3 3 6 8 6 2 8 1 3 3 8 6 8 7 6 6 2 6 3 8 1 5 6 1 5 7 1 1 1 2 1 1 3 6 7 8
 1 3 1 1 7 6 8 4 2 2 1 4 7 5 2 3 7 8 3 8 3 6 6 3 7] 

[7, 6, 1, 3, 6, 7, 1, 8, 3, 6, 8, 8, 6, 3, 3, 3, 8, 6, 4, 1, 6, 6, 3, 6, 3, 7, 5, 8, 3, 3, 3, 7, 4, 1, 1, 2, 7, 4, 3, 6, 1, 1, 1, 1, 1, 8, 1, 7, 6, 4, 6, 8, 3, 4, 6, 5, 3, 8, 1, 4, 7, 1, 1, 1, 4, 2, 6, 7, 3, 1, 3, 1, 6, 1, 3, 3, 1, 1, 4, 3, 1, 3, 7, 3, 4, 8, 6, 1, 6, 1, 2, 1, 8, 6, 1, 1, 1, 1, 6, 1, 6, 6, 5, 4, 1, 2, 6, 3, 1, 1, 3, 8, 6, 3, 3, 1, 7, 5, 1, 4, 1, 1, 6, 3, 6, 1, 6, 3, 8, 4, 2, 2

In [30]:
# Percentage of validation samples whose predicted label equals the true label.
r = precision(val_labels,predictions)
print(r)

57.04225352112676
