In [201]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np

In [202]:
def get_data(nome_arquivo,shuffle):
    ds = pd.read_csv(nome_arquivo,encoding="utf-8")
    if shuffle:
        ds = ds.sample(frac=1)
    ds['texto'] = ds['texto'].apply(str)
    return ds

In [203]:
df = get_data("train.csv",1)

In [204]:
df.shape

(800, 2)

In [205]:
df.head()

Unnamed: 0,valor,texto
614,0,O objetivo do projeto é aprimorar a automação ...
150,1,Esta monografia tem como objetivo principal co...
532,0,O presente trabalho ressalta a importância do ...
667,0,Este trabalho apresenta um estudo de estabilid...
339,1,Este projeto tem como objetivo o desenvolvimen...


In [206]:
df.texto

614    O objetivo do projeto é aprimorar a automação ...
150    Esta monografia tem como objetivo principal co...
532    O presente trabalho ressalta a importância do ...
667    Este trabalho apresenta um estudo de estabilid...
339    Este projeto tem como objetivo o desenvolvimen...
                             ...                        
715    Este trabalho apresenta um estudo da operação ...
722    Por muito tempo a principal fonte de energia e...
706    O desenvolvimento da geração distribuída nos ú...
131    Este trabalho aborda o desenvolvimento de um s...
363    O crescente desenvolvimento tecnológico somado...
Name: texto, Length: 800, dtype: object

In [207]:
print((df.valor==1).sum())#eletronica
print((df.valor==0).sum())#eletrica

400
400


In [208]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("portuguese"))

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)

def remove_punct(text):
    translator = str.maketrans("","",'!""#$%&\'()*+,./:;<=>?@[\\]^_`{|}~º')
    translator = str.maketrans("","",'!""#$%&\'()*+,./:;<=>?@[\\]^_`{|}~º')
    return text.translate(translator)

def remove_numbers(text):
    result = ''.join([i for i in text if not i.isdigit()])
    return result

def remove_hifen(text):
    translator = str.maketrans('-',' ')
    return text.translate(translator)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mateus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [209]:
pattern = re.compile(r"https?//(\S+|www)\.\S+")
def pat(df_t):
    for t in df_t.texto:
        matches = pattern.findall(t)
        for match in  matches:
            print(t)
            print(match)
            print(pattern.sub(r"",t))        
        if len(matches)> 0:
            break
pat(df)            

In [210]:
def make_test(df_t):
    df_t["texto"] = df_t.texto.map(remove_URL)
    df_t["texto"] = df_t.texto.map(remove_punct)
    df_t["texto"] = df_t.texto.map(remove_hifen)
    df_t["texto"] = df_t.texto.map(remove_numbers)
    df_t["texto"] = df_t.texto.map(remove_stopwords)

make_test(df)

In [211]:
from collections import Counter

def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(df.texto)

In [212]:
len(counter)

11626

In [213]:
counter.most_common(5)

[('sistema', 1047),
 ('trabalho', 911),
 ('energia', 656),
 ('projeto', 451),
 ('elétrica', 391)]

In [214]:
num_unique_words = len(counter)

In [215]:
#train_size = int(df.shape[0]*0.8)

#train_df = df[:train_size]
#val_df = df[train_size:]

def data_split(df,size):
    train_size = int(df.shape[0]*size)
    train_df = df[:train_size]
    val_df = df[train_size:]
    return train_df, val_df

train_df, val_df = data_split(df,0.8)

In [216]:
print(len(train_df))
print(len(val_df))

640
160


In [217]:
def data_to_numpy(df):    
    train_sentences = train_df.texto.to_numpy()
    train_labels = train_df.valor.to_numpy()
    val_sentences = val_df.texto.to_numpy()
    val_labels = val_df.valor.to_numpy()
    return train_sentences, train_labels, val_sentences, val_labels

train_sentences, train_labels, val_sentences, val_labels = data_to_numpy(df)

In [218]:
train_sentences.shape, val_sentences.shape

((640,), (160,))

In [219]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = num_unique_words,oov_token="<OOV>")
def tokenization(df):      
    tokenizer.fit_on_texts(train_sentences)
    word_index = tokenizer.word_index
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    val_sequences = tokenizer.texts_to_sequences(val_sentences)
    return train_sequences, val_sequences, word_index

train_sequences, val_sequences, word_index = tokenization(df)

In [220]:
#word_index = tokenizer.word_index

In [221]:
#train_sequences = tokenizer.texts_to_sequences(train_sentences)
#val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [222]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 600

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding = "post",truncating = "post")
val_padded = pad_sequences(val_sequences, maxlen = max_length, padding = "post", truncating = "post")
train_padded.shape, val_padded.shape

((640, 600), (160, 600))

In [223]:
reverse_word_index = dict([(idx,word) for (word, idx) in word_index.items()])

In [224]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [225]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[1152, 715, 4, 7, 408, 506, 620, 171, 181, 433, 1541, 716, 302, 1152, 238, 302, 95, 1719, 237, 566, 677, 335, 16, 2649, 2635, 224, 254, 195, 717, 2650, 217, 142, 319, 124, 111, 121, 239, 1542, 590, 414, 925, 3177, 266, 1153, 542, 24, 102, 1379, 135, 223, 1380, 448, 19, 72, 1067, 254, 44, 2651, 225, 1543, 620, 1381, 6, 254, 195, 105, 44, 614, 4013, 990, 245, 5678, 271, 351, 991, 1720, 5, 176, 1068, 144, 293, 263, 87, 30, 8, 718, 112, 464, 181, 433, 1938, 80, 506, 5679, 104, 1700, 277, 2652, 796, 992, 752, 719, 75, 567, 1154, 720, 331, 4014, 224, 1155, 181, 16, 1939, 1156, 1721, 1258, 1382, 5, 209, 753, 197, 56, 296, 5680, 28, 254, 16, 1939, 352, 1152, 177, 926, 14, 74, 752, 5681, 5682, 278, 172, 1722, 127, 72, 433, 562, 5683, 308, 51, 1723, 861, 506]
medidor inteligente energia elétrica aplicado indústria projetado proposto motores indução trifásicos dividido módulos medidor possui módulos sensores fazem aquisição tensões correntes fase forma invasiva facilitando instalação módulo princ

In [226]:
from tensorflow.keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length = max_length))

#model.add(layers.LSTM(256,dropout = 0.1))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(128, activation = "relu"))
model.add(layers.Dense(128, activation = "relu"))
model.add(layers.Dense(24, activation = "softmax"))
#model.add(layers.Dense(1, activation = "sigmoid"))
#model.add(layers.Softmax())
model.add(layers.Dense(2, activation = "sigmoid"))


model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 600, 32)           372032    
_________________________________________________________________
global_average_pooling1d_13  (None, 32)                0         
_________________________________________________________________
dense_56 (Dense)             (None, 128)               4224      
_________________________________________________________________
dense_57 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_58 (Dense)             (None, 24)                3096      
_________________________________________________________________
dense_59 (Dense)             (None, 2)                 50        
Total params: 395,914
Trainable params: 395,914
Non-trainable params: 0
_______________________________________________

In [227]:
#loss = keras.losses.BinaryCrossentropy(from_logits=False)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer = optim, metrics = metrics)

In [229]:
import time
start = time.perf_counter()
model.fit(train_padded,train_labels, epochs = 30, validation_data=(val_padded,val_labels), verbose=2)
finish = time.perf_counter()
print(f'\nFinished in {round(finish-start, 2)} second(s)')

Epoch 1/30
20/20 - 0s - loss: 0.1289 - accuracy: 0.9969 - val_loss: 0.6099 - val_accuracy: 0.7188
Epoch 2/30
20/20 - 0s - loss: 0.1257 - accuracy: 0.9969 - val_loss: 0.7143 - val_accuracy: 0.6750
Epoch 3/30
20/20 - 0s - loss: 0.1195 - accuracy: 0.9984 - val_loss: 0.6129 - val_accuracy: 0.7250
Epoch 4/30
20/20 - 0s - loss: 0.1152 - accuracy: 0.9984 - val_loss: 0.6445 - val_accuracy: 0.7000
Epoch 5/30
20/20 - 0s - loss: 0.1119 - accuracy: 0.9984 - val_loss: 0.6274 - val_accuracy: 0.7188
Epoch 6/30
20/20 - 0s - loss: 0.1089 - accuracy: 0.9984 - val_loss: 0.6344 - val_accuracy: 0.7188
Epoch 7/30
20/20 - 0s - loss: 0.1059 - accuracy: 0.9984 - val_loss: 0.6405 - val_accuracy: 0.7125
Epoch 8/30
20/20 - 0s - loss: 0.1032 - accuracy: 0.9984 - val_loss: 0.6423 - val_accuracy: 0.7188
Epoch 9/30
20/20 - 0s - loss: 0.1005 - accuracy: 0.9984 - val_loss: 0.6457 - val_accuracy: 0.7188
Epoch 10/30
20/20 - 0s - loss: 0.0980 - accuracy: 0.9984 - val_loss: 0.6531 - val_accuracy: 0.7125
Epoch 11/30
20/20 -

In [232]:
predictions = model.predict(val_padded)
print([np.argmax(element) for element in predictions])

#predictions = [1 if p > 0.5 else 0 for p in predictions]

[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]


In [231]:
index = 7
#print(val_sentences[index])
#print(val_padded[index],'\n')
print("Label: ",val_labels[index])
print("Resultado: ",predictions[index],'\n')
print(val_labels,'\n')
print(predictions)

Label:  0
Resultado:  [0.81427556 0.21116403] 

[1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 1 1 1 1 0 1
 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 0
 0 1 0 1 1 0 1 1 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 0
 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1 0 1 0 0
 1 1 0 1 0 1 0 0 0 0 1 1] 

[[0.40413007 0.6037802 ]
 [0.19212398 0.80185044]
 [0.7853347  0.24062052]
 [0.19178507 0.8021738 ]
 [0.8142788  0.21116054]
 [0.8142973  0.2111434 ]
 [0.8136343  0.21182048]
 [0.81427556 0.21116403]
 [0.1917347  0.802222  ]
 [0.81428707 0.21115255]
 [0.7295759  0.29618824]
 [0.21803439 0.7772325 ]
 [0.8142562  0.21118376]
 [0.8142985  0.21114245]
 [0.19778606 0.7964537 ]
 [0.8066536  0.21896389]
 [0.19400662 0.80005485]
 [0.8142562  0.21118316]
 [0.19227326 0.8017081 ]
 [0.3060376  0.69473714]
 [0.223876   0.7717085 ]
 [0.19185176 0.8021102 ]
 [0.75682527 0.2692087 ]
 [0.80715156 0.21845564]
 [0.2675447  0.730656  ]

In [176]:
df_t = get_data("eval.csv",0)

In [177]:
df_t.shape

(8, 2)

In [178]:
df_t.head()

Unnamed: 0,valor,texto
0,1,Este projeto visa uma nova abordagem para um s...
1,1,"Nesta monografia, é apresentada a proposta de ..."
2,1,O objetivo da monografia foi estudar como as t...
3,1,As células de carga são instrumentos versáteis...
4,0,"Na atualidade, a produção termelétrica partici..."


In [179]:
pat(df_t)
make_test(df_t)

In [180]:
df_t.texto

0    projeto visa nova abordagem sistema vigilância...
1    nesta monografia apresentada proposta equipame...
2    objetivo monografia estudar técnicas controle ...
3    células carga instrumentos versáteis usados mo...
4    atualidade produção termelétrica participa mar...
5    neste trabalho apresentado projeto instalação ...
6    trabalho intitulado “estimação preço demanda e...
7    trabalho apresenta estudo sobre impactos causa...
Name: texto, dtype: object

In [181]:
test_sentences = df_t.texto.to_numpy()
test_labels = df_t.valor.to_numpy()

In [182]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [183]:
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding = "post",truncating = "post")

In [233]:
predictions_t = model.predict(test_padded)
print(predictions_t)
#predictions_t = [1 if p > 0.5 else 0 for p in predictions_t]

[[0.1946227  0.79946756]
 [0.7966277  0.22917894]
 [0.7381285  0.28774518]
 [0.1925447  0.8014491 ]
 [0.73600364 0.28985548]
 [0.8139216  0.21152365]
 [0.2916277  0.7081614 ]
 [0.8142838  0.2111558 ]]


In [236]:
index = 6
print(test_sentences[2])#[index])

print(test_labels)#[index])
#print(predictions_t)#[index])
print([np.argmax(element) for element in predictions_t])
def precision_t():
    counter = len(test_labels)
    list_c = [i for i,j in zip(predictions_t,test_labels) if i == j]
    return counter,len(list_c)

objetivo monografia estudar técnicas controle linear utilizadas determinar controladores largamente conhecidos avr pss impactam margem estabilidade transitória sistema máquina versus barramento infinito omib one machine versus infinite bus adota modelo eixo gerador síncrono pólos salientes seguida estudam modelos avr pss apresentados norma ieee standard ieee standard adotando modelo pssa single input daquela norma equações diferenciais sistema numericamente integradas sistema então simulado sob perturbação resposta dinâmica analisada averiguadas ocorrências bifurcações hopf sistema função parâmetros controladores através parametrização autovalores análise comportamento local sistema equilíbrio traçam diagramas bifurcação sistema baseando teoria regiões estabilidade sistemas dinâmicos lineares simulações utilizadas desenvolver método força bruta mfb estimar região estabilidade sistema malha aberta controlado avr controlado avr pss saturadores excitação traçam regiões estabilidade varian