In [1]:
import csv
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, LSTM

In [2]:
# Hyperparameters shared by the tokenisation and model cells.
max_length = 200        # every article is padded / truncated to this many tokens
embedding_dim = 100     # width of each word vector
padding_type = trunc_type = 'post'   # pad and truncate at the end of a sequence

# Settings that are currently disabled, kept for reference:
#vocab_size = 5000
#oov_tok = '<OOV>'
#training_portion = .8

In [3]:
def csv_to_list(file_name, encoding=None):
    """
    Read a CSV file and split it into two parallel lists.

    The first row is treated as a header and skipped. For every following
    non-empty row, column index 1 is collected as the label and column
    index 3 as the article text.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    tuple of (list, list)
        ``(labels, articles)`` -- labels first, articles second.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than four columns.
    """
    articles = []
    labels = []

    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing line breaks are parsed correctly.
    with open(file_name, 'r', newline='', encoding=encoding) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to collect.
                continue
            labels.append(row[1])
            articles.append(row[3])

    return labels, articles

In [4]:
# Parse the test CSV a single time and unpack both lists; csv_to_list
# returns the labels first and the article texts second.
test_labels, test_category = csv_to_list('data_test.csv')

print(len(test_category))
print(len(test_labels))

4024
4024


In [5]:
# Parse the training CSV a single time and unpack both lists; csv_to_list
# returns the labels first and the article texts second.
training_labels, training_category = csv_to_list('data_training.csv')

print(len(training_category))
print(len(training_labels))

11413
11413


In [6]:
# Build the word vocabulary from the training articles.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_category)

# word -> integer id mapping and its size, reused when the embedding
# matrix is built.
training_word_index = tokenizer.word_index
training_vocab_size = len(training_word_index)

# Turn each article into a list of word ids, then pad / truncate every
# list to max_length so the result is a rectangular array.
training_sequences = tokenizer.texts_to_sequences(training_category)
# NOTE: the name is spelled "traning_padded" on purpose; the training cell
# reads it under that spelling.
traning_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

print(traning_padded.shape)

(11413, 200)


In [7]:
# Encode the test articles with the tokenizer that was already fitted on the
# training articles. Fitting a fresh Tokenizer on the test texts would assign
# different integer ids to the same words, so the ids would no longer line up
# with the rows of the embedding matrix built from training_word_index (and
# could exceed the embedding layer's input dimension).
# Words that do not occur in the training vocabulary are dropped here because
# the tokenizer has no OOV token configured.
test_sequences = tokenizer.texts_to_sequences(test_category)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

print(test_padded.shape)

(4024, 200)


In [8]:
# Map every label string to an integer id. The filter list below contains no
# '-', so hyphens inside label names are not stripped.
label_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n')
label_tokenizer.fit_on_texts(training_labels)

# One id list per label, stacked into an array for model.fit.
training_label_ids = label_tokenizer.texts_to_sequences(training_labels)
training_label_seq = np.array(training_label_ids)

print(training_label_seq.shape)

(11413, 1)


In [9]:
# Encode the test labels with the label tokenizer fitted on the training
# labels. Refitting on the test labels would renumber the classes, so the
# same label could get a different integer id in the test set than the one
# the model was trained on.
# NOTE(review): a test label that never occurs in the training labels is
# encoded as an empty list, which would make the array ragged -- confirm
# that the test labels are a subset of the training labels.
test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

print(test_label_seq.shape)


(4024, 1)


In [10]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
# Each line of the file is "<word> <v1> <v2> ...". The file is read as
# UTF-8 explicitly (assumed encoding of the GloVe distribution) so that
# parsing does not depend on the platform's default encoding.
embeddings_index = {}
with open('glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the GloVe vector of the word whose tokenizer id is i.
# There is one extra row because word_index ids start at 1; row 0 stays
# all-zero, as do the rows of words that have no GloVe vector.
embeddings_matrix = np.zeros((training_vocab_size + 1, embedding_dim))
for word, i in training_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [11]:
# Number of rows in the embedding matrix (training vocabulary size + 1).
print(embeddings_matrix.shape[0])

31798


In [12]:
# Frozen GloVe embedding -> bidirectional LSTM -> dense classifier head.
model = Sequential([
    # Embedding weights come from embeddings_matrix and are not trained.
    layers.Embedding(
        training_vocab_size + 1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,
    ),
    layers.Bidirectional(layers.LSTM(embedding_dim)),
    layers.Dense(embedding_dim, activation='relu'),
    # NOTE(review): 92 output units -- presumably the number of label ids
    # plus the unused id 0; confirm against the label tokenizer.
    layers.Dense(92, activation='softmax'),
])
# Labels are integer ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 100)          3179800   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               160800    
_________________________________________________________________
dense (Dense)                (None, 100)               20100     
_________________________________________________________________
dense_1 (Dense)              (None, 92)                9292      
Total params: 3,369,992
Trainable params: 190,192
Non-trainable params: 3,179,800
_________________________________________________________________


In [13]:
# Train on the padded training sequences; verbose=2 prints one line per epoch.
num_epochs = 1
history = model.fit(
    x=traning_padded,
    y=training_label_seq,
    epochs=num_epochs,
    verbose=2,
)

Train on 11413 samples
11413/11413 - 542s - loss: 1.9224 - accuracy: 0.5207
