In [147]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, LSTM

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report



In [148]:
import nltk
from nltk.corpus import stopwords 

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/abner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/abner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [149]:
print(tf.__version__)

2.0.0


In [150]:
STOPWORDS = set(stopwords.words('english'))

In [151]:
embedding_dim = 100
trunc_type = 'post'
padding_type = 'post'
training_portion = .8

In [169]:
def csv_to_list (file_name):
    """
    Converte o CSV em duas lista distintas:
    articles e labels
    """
    articles = []
    labels = []

    with open(file_name, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)
        for row in reader:
            labels.append(row[1])
            article = (row[3])
            for word in STOPWORDS:
                token = ' ' + word + ' '
                article = article.replace(token, ' ')
                article = article.replace(' ', ' ')
            articles.append(article)
    
    return labels, articles

In [154]:
labels = csv_to_list('data_training.csv')[0]
category = csv_to_list('data_training.csv')[1]

print(len(category))
print(len(labels))

11413
11413


In [155]:
# Calcula a mediana do tamanho dos textos no conjunto e torna esse valor o tamanho máximo dos textos.
text_len = []
for i in category:
    text_len.append(len(i))

max_length = np.median(text_len)
max_length = max_length.astype(np.int64)

In [156]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(category)

word_index = tokenizer.word_index
vocab_size=len(word_index)

sequences = tokenizer.texts_to_sequences(category)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(padded.shape)
print(vocab_size)

(11413, 428)
31734


In [157]:
train_size = int(len(category) * training_portion)

training_sequences = padded[0:train_size]
training_labels = labels[0:train_size]

validation_sequences = padded[train_size:]
validation_labels = labels[train_size:]

In [167]:
label_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n')
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(training_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

training_label_seq = tf.keras.utils.to_categorical(training_label_seq)
validation_label_seq = tf.keras.utils.to_categorical(validation_label_seq)

In [168]:
print(training_sequences.shape)
print(training_label_seq.shape)
print(validation_sequences.shape)
print(validation_label_seq.shape)

(9130, 428)
(9130, 91)
(2283, 428)
(2283, 92)


In [160]:
test_labels = csv_to_list('data_test.csv')[0]
test_category = csv_to_list('data_test.csv')[1]

print(len(test_category))
print(len(test_labels))

4024
4024


In [161]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(test_category)

test_sequences = tokenizer.texts_to_sequences(test_category)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(test_padded.shape)

(4024, 428)


In [162]:
label_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n')
label_tokenizer.fit_on_texts(test_labels)

test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

print(test_label_seq.shape)


(4024, 1)


In [171]:
test_label_seq

array([[66],
       [66],
       [66],
       ...,
       [21],
       [21],
       [21]])

In [163]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(training_vocab_size+1, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(92, activation='softmax')
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 428, 100)          3179800   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_6 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_7 (Dense)              (None, 92)                9292      
Total params: 3,369,992
Trainable params: 3,369,992
Non-trainable params: 0
_________________________________________________________________


In [164]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy']) # https://keras.io/api/models/model_training_apis/ ; https://www.tensorflow.org/api_docs/python/tf/keras/metrics

In [165]:
num_epochs = 1

history = model.fit(training_sequences, training_label_seq, epochs=num_epochs, validation_data=(validation_sequences, validation_label_seq), verbose=2)

Train on 9130 samples, validate on 2283 samples
9130/9130 - 556s - loss: 1.7731 - accuracy: 0.5039 - val_loss: 8.6979 - val_accuracy: 0.0000e+00


In [None]:
plt.title('Accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.plot(history.history['accuracy'], label='train accuracy')
plt.xlabel('Epoch')


plt.legend()
plt.show()

In [None]:
plt.title('loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.plot(history.history['accuracy'], label='train loss')
plt.xlabel('Epoch')


plt.legend()
plt.show()

In [None]:
test_eval = model.evaluate(test_padded, test_label_seq, verbose=2)

In [None]:
y_pred = model.predict(test_padded)
y_pred_bool = np.argmax(y_pred, axis=1)

print(classification_report(test_label_seq, y_pred_bool)) #https://scikit-learn.org/stable/modules/model_evaluation.html


In [None]:
# Print f1, precision, and recall scores
print(precision_score(test_label_seq, y_pred_bool , average="macro"))
print(recall_score(test_label_seq, y_pred_bool , average="macro"))
print(f1_score(test_label_seq, y_pred_bool , average="macro"))
print(accuracy_score(test_label_seq, y_pred_bool ))

