In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, LSTM

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [None]:
import nltk
from nltk.corpus import stopwords 

# Request the NLTK 'punkt' and 'stopwords' resources by name.
# NOTE(review): none of the cells visible here use these resources or the
# `stopwords` import — confirm they are needed later in the notebook.
nltk.download('punkt')
nltk.download('stopwords')

In [67]:
# Read the training CSV, keeping only the "topico" (label) and "texto" (text)
# columns, then preview the first five rows.
df = pd.read_csv(
    "data_training.csv",
    sep=",",
    usecols=["topico", "texto"],
)
df.head()

Unnamed: 0,topico,texto
0,coconut-oil,us scientists say tropical oils health risk au...
1,coconut-oil,vegetable oils may tighten despite seed surplu...
2,coconut-oil,corrected philippines criticises ec for oil le...
3,coconut-oil,coconut oil contract to change dutch traders r...
4,alum,feb daily ave unwrought aluminium output tonne...


In [68]:
# Show the distinct values present in the "topico" column.
df["topico"].unique()

array(['coconut-oil', 'alum', 'rye', 'money-fx', 'copper', 'potato',
       'rubber', 'dlr', 'iron-steel', 'soy-meal', 'sunseed', 'rapeseed',
       'retail', 'silver', 'copra-cake', 'interest', 'platinum',
       'palmkernel', 'nkr', 'nzdlr', 'oat', 'acq', 'palladium', 'unknown',
       'groundnut', 'livestock', 'groundnut-oil', 'oilseed', 'dfl',
       'wheat', 'rice', 'cotton', 'ship', 'gnp', 'lin-oil',
       'money-supply', 'sun-meal', 'l-cattle', 'rape-oil', 'earn',
       'nat-gas', 'hog', 'castor-oil', 'income', 'gas', 'sugar',
       'veg-oil', 'sorghum', 'lei', 'fuel', 'sun-oil', 'soy-oil', 'tea',
       'propane', 'soybean', 'grain', 'naphtha', 'lead', 'wpi', 'crude',
       'lumber', 'strategic-metal', 'coffee', 'rand', 'ipi', 'heat',
       'bop', 'barley', 'cpu', 'jet', 'palm-oil', 'dmk', 'pet-chem',
       'jobs', 'tin', 'zinc', 'orange', 'corn', 'cotton-oil', 'nickel',
       'reserves', 'cpi', 'coconut', 'housing', 'trade', 'carcass',
       'cocoa', 'instal-debt', 'ye

In [None]:
#len(df.topico.unique())

In [None]:
#df.info()

In [None]:
#labels = df['topico'].tolist()

In [None]:
#pd.set_option('display.max_rows', df.shape[0]+1)
#print(df)

In [None]:
# Split the texts and their topics into training and validation sets:
# 33% of the rows are held out for validation, and the fixed random_state
# makes the split repeatable.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
texts, topics = df["texto"], df["topico"]
x_train, x_val, y_train, y_val = train_test_split(
    texts, topics, test_size=0.33, random_state=53
)



In [None]:
# Preprocessing and model settings shared by the cells below.
EMBEDDING_DIMENSION = 64               # Embedding output size; also reused as the LSTM/Dense width
VOCABULARY_SIZE = None                 # None keeps every word
OOV_TOK = "<OOV>"                      # passed to the text Tokenizer as oov_token
TRUNCATE_TYPE = PADDING_TYPE = "post"  # used for truncating= and padding= in pad_sequences

In [None]:
# Compute the median text length over the training and validation texts and
# use it as the fixed sequence length for padding/truncation.
# Length is measured in whitespace-separated words rather than characters,
# because the `maxlen` given to pad_sequences counts tokens per sequence.
text_len = [len(text.split()) for text in list(x_train) + list(x_val)]

# np.median returns a float; convert it to a plain int so it can be used as maxlen.
MAX_LENGTH = int(np.median(text_len))

In [None]:
# Reference: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
#
# Build the word vocabulary from the training texts only, so the validation
# split does not leak into it. Words that appear only in validation texts are
# mapped to OOV_TOK when those texts are converted to sequences.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(list(x_train))

In [None]:
# Word -> integer id mapping learned by the fitted tokenizer.
word_index = tokenizer.word_index

# Convert each split's texts into lists of word ids (training first, then validation).
x_train_sequences, x_val_sequences = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_val)
)
print('Vocabulary size:', len(word_index))


In [None]:
# Pad/truncate every sequence to MAX_LENGTH ids, using the same settings for
# the training and validation splits.
pad_options = dict(maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNCATE_TYPE)
x_train_pad = pad_sequences(x_train_sequences, **pad_options)
x_val_pad = pad_sequences(x_val_sequences, **pad_options)


In [None]:
# Encode the topic labels as integer ids with ONE tokenizer shared by both
# splits, so a given topic gets the same id in training and in validation.
# The tokenizer is fitted on the labels of both splits so that a topic that
# occurs only in the validation split still receives an id (this only defines
# the set of classes; no text features are involved).
# '-' is absent from the filter list, so hyphenated topics such as
# 'coconut-oil' are not split at the hyphen.
label_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n')
label_tokenizer.fit_on_texts(list(y_train) + list(y_val))

# texts_to_sequences returns one list of ids per label; wrap them in arrays
# for use as the training and validation targets.
training_label_seq = np.array(label_tokenizer.texts_to_sequences(y_train))
val_label_seq = np.array(label_tokenizer.texts_to_sequences(y_val))

In [None]:
# Text classifier: word embedding -> bidirectional LSTM -> dense ReLU layer ->
# softmax over the output classes.
model = Sequential()
# One embedding row per word id, plus one extra row for id 0.
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIMENSION))
model.add(layers.Bidirectional(LSTM(EMBEDDING_DIMENSION)))
model.add(Dense(EMBEDDING_DIMENSION, activation='relu'))
# NOTE(review): 92 output units is hard-coded; it must be larger than the
# highest integer label id produced by the label tokenizer — confirm against
# the number of distinct topics.
model.add(Dense(92, activation='softmax'))
model.summary()

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric.
# References: https://keras.io/api/models/model_training_apis/ ; https://www.tensorflow.org/api_docs/python/tf/keras/metrics
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Number of passes over the training data.
num_epochs = 1

# Train on the padded training sequences and evaluate on the validation pair
# after each epoch; verbose=2 selects the per-epoch log format.
validation_pair = (x_val_pad, val_label_seq)
history = model.fit(
    x_train_pad,
    training_label_seq,
    validation_data=validation_pair,
    epochs=num_epochs,
    verbose=2,
)