### Variação de Hiperparâmetros (Biblioteca Hyperas/Hyperopt)

In [1]:
import os
import re
import json
import string
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics

from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils

from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Every detected GPU is configured, and a machine without GPUs (empty list) is
# simply skipped instead of failing with IndexError on physical_devices[0].
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

# Override the private default-locale lookup so it always reports en_US / utf8.
# NOTE(review): presumably a workaround for a non-UTF-8 default encoding on the
# author's machine - confirm before removing.
import _locale 
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

---

In [2]:
def data():
    """
    Data providing function for the hyperparameter search.

    Loads the Suno news JSON dump, keeps the articles of the four main topics,
    normalises their titles, splits them 80/20 into a training and a hold-out
    set, turns the titles into padded integer sequences with TextVectorization
    and builds an embedding matrix from the pre-trained word-vector file.

    The four arrays handed back are x_train, y_train, x_test and y_test; the
    hold-out split is what comes back as the test set.

    The locals num_tokens, embedding_dim, embedding_matrix and class_names are
    read by create_model without being passed to it, so their names must stay
    unchanged.
    NOTE(review): presumably hyperas inlines this body into the script it
    generates, which is what makes those locals visible there - confirm.
    """

    # 1. Load the dataset
    json_data_path = '../crawlers/suno/data/results-full-suno-2020.json'

    with open(json_data_path, 'r', encoding='utf8') as json_file:
        # Not named `data`, so the function's own name is not shadowed
        raw_records = json.load(json_file)

    df_suno = pd.DataFrame(raw_records)

    # Parse the dates, use them as index and order the frame chronologically
    df_suno['date'] = pd.to_datetime(df_suno['date'], format='%d/%m/%Y %H:%M')
    df_suno = df_suno.set_index('date').sort_index()

    # Convert text columns to string
    df_suno['title'] = df_suno['title'].astype('string')
    df_suno['full_text'] = df_suno['full_text'].astype('string')

    # Keep the main columns and the main topics. The explicit copy makes the
    # column assignments below write to an independent frame instead of a
    # slice of the unfiltered one (avoids SettingWithCopyWarning).
    main_topics = ['Mercado', 'Economia', 'Internacional', 'Negócios']
    df_suno = df_suno[['title', 'full_text', 'topic']]
    df_suno = df_suno[df_suno['topic'].isin(main_topics)].copy()

    # 2. Preprocessing

    # Emoji patterns (compiled once and reused for every title)
    emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
    "]+", flags=re.UNICODE)

    def remove_emojis(sentence):
        """Remove emojis from a text message."""
        return emoji_pattern.sub(r'', sentence)

    def remove_valores(sentence):
        """Replace every purely numeric token of the sentence with '<NUM>'."""
        tokens = ['<NUM>' if token.isdigit() else token for token in sentence.split()]
        return ' '.join(tokens)

    punctuation_table = str.maketrans('', '', string.punctuation)

    # Replacements for meaningful symbols, applied in this order: '-feira' has
    # to be handled before the generic '-' rule.
    symbol_replacements = [
        ('-feira', ''),
        ('+', 'mais '),
        ('-', 'menos '),
        ('%', ' por cento'),
        ('R$', ''),
        ('U$', ''),
        ('US$', ''),
        ('S&P 500', 'spx'),
    ]

    def clean_title(title):
        """Normalise one title; the steps run in the same order as before."""
        for symbol, replacement in symbol_replacements:
            title = title.replace(symbol, replacement)
        # Lower-case, then drop ASCII punctuation and emojis
        title = str(title).lower()
        title = title.translate(punctuation_table)
        title = remove_emojis(title)
        # Unnecessary line breaks
        title = title.replace('\n', ' ')
        # Straight and curly double quotes
        title = title.replace('\"', '').replace('“', '').replace('”', '')
        # Numeric values become a placeholder token
        title = remove_valores(title)
        # Unnecessary surrounding spaces
        return title.strip()

    # One pass over the titles instead of one pass per cleaning step
    df_suno['title'] = df_suno['title'].map(clean_title)

    # Turn the topics into integers
    label_encoder = preprocessing.LabelEncoder()
    df_suno['topic_number'] = label_encoder.fit_transform(df_suno['topic'])
    class_names = label_encoder.classes_

    # Extract titles and labels as independent arrays, so the in-place shuffle
    # below does not scramble the columns of the frame they came from
    samples = np.array(df_suno['title'].values)
    labels = np.array(df_suno['topic_number'].values)

    # Shuffle both arrays with the same seed so samples and labels stay aligned
    seed = 1337
    rng = np.random.RandomState(seed)
    rng.shuffle(samples)
    rng = np.random.RandomState(seed)
    rng.shuffle(labels)

    # Split Train/Test sets (80% Train / 20% Test --> Used for final metrics)
    validation_split = 0.2
    num_validation_samples = int(validation_split * len(samples))
    # Slice from an explicit boundary: with a hold-out size of zero,
    # samples[:-0] would leave the training set empty.
    split_at = len(samples) - num_validation_samples
    train_samples = samples[:split_at]
    val_samples = samples[split_at:]
    train_labels = labels[:split_at]
    val_labels = labels[split_at:]

    # Convert to number vector
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

    MAX_SEQUENCE_LENGTH = 200

    vectorizer = TextVectorization(max_tokens=20000,
                                   output_sequence_length=MAX_SEQUENCE_LENGTH)
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(32)
    vectorizer.adapt(text_ds)

    print (text_ds)

    # Define the vocabulary. get_vocabulary() yields bytes on some TensorFlow
    # releases and str on others; decode only when needed so both work.
    voc = [w.decode('utf-8') if isinstance(w, bytes) else w
           for w in vectorizer.get_vocabulary()]
    word_index = dict(zip(voc, range(len(voc))))

    # Load the pre-trained word vectors (one "word v1 v2 ..." entry per line)
    path_to_glove_file = '../../data/full_text_financial_news_vectors.txt'

    embeddings_index = {}
    with open(path_to_glove_file, encoding='utf8') as f:
        for line in f:
            fields = line.split(maxsplit=1)
            if len(fields) != 2:
                # Blank or value-less line: nothing to store
                continue
            word, coefs = fields
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    print("Found %s word vectors." % len(embeddings_index))

    # Map words to vectors
    num_tokens = len(voc) + 2
    embedding_dim = 300
    hits = 0
    misses = 0

    # Prepare embedding matrix. Rows of words without a usable pre-trained
    # vector stay all-zeros; this includes the rows for "padding" and "OOV".
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # A vector of the wrong length (e.g. parsed from a header line) counts
        # as a miss instead of being broadcast into the row or raising.
        if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    # Apply transformations to train/valid. sets
    x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
    x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

    y_train = np.array(train_labels)
    y_val = np.array(val_labels)

    # The hold-out split is handed back as the test set
    x_test = x_val
    y_test = y_val

    return x_train, y_train, x_test, y_test

In [3]:
def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function for the hyperas search.

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.

    Architecture built here: frozen Embedding initialised from embedding_matrix,
    bidirectional LSTM returning sequences, Dropout, bidirectional LSTM, Dense,
    Dropout and a softmax Dense with one unit per entry of class_names.

    The reported loss is the negated highest 'val_accuracy' of the fit history,
    so minimising it maximises validation accuracy.

    x_train and y_train are used for fitting, with 20% of them set aside for
    validation; x_test and y_test are accepted but not used in this body.

    num_tokens, embedding_dim, embedding_matrix and class_names are read here
    but are neither parameters nor defined in this function.
    NOTE(review): presumably they come from the data function's body once
    hyperas assembles its script - confirm.
    """
    
    # Pre-trained vectors are loaded as constant weights and kept frozen
    embedding_layer = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    # Variable-length integer sequences in, class probabilities out
    int_sequences_input = keras.Input(shape=(None,))
    embedded_sequences = embedding_layer(int_sequences_input)
    # NOTE(review): both dropout rates are sampled from the whole 0..1 interval;
    # rates close to 1 drop almost every activation - consider a narrower range.
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM({{choice([32, 64])}},  return_sequences=True))(embedded_sequences)
    x = tf.keras.layers.Dropout({{uniform(0, 1)}})(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM({{choice([32, 64])}}))(x)
    x = tf.keras.layers.Dense({{choice([8, 16])}}, activation={{choice(['relu', 'tanh'])}})(x)
    x = tf.keras.layers.Dropout({{uniform(0, 1)}})(x)
    preds = tf.keras.layers.Dense(len(class_names), activation='softmax')(x)

    model = keras.Model(int_sequences_input, preds)
    
    # Compile and Fit
    # The sparse loss matches the integer-encoded labels
    model.compile(
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
        optimizer={{choice(['adam', 'rmsprop'])}}
    )
    
    # 80% from X_train for training / 20% for Validation
    # NOTE(review): no epochs argument is passed, so the Keras default applies;
    # the recorded output shows a single epoch per trial.
    result = model.fit(x_train, 
                       y_train,
                       batch_size=32,
                       verbose=2,
                       validation_split=0.2)  
    
    # Get the highest validation accuracy of the training epochs
    validation_acc = np.amax(result.history['val_accuracy']) 
    print('Best validation acc of epoch:', validation_acc)
    # Negated because the search minimises the reported loss
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [None]:
# Run the hyperparameter search: 10 evaluations of create_model, suggested by TPE
search_trials = Trials()
best_run, best_model = optim.minimize(
    model=create_model,
    data=data,
    algo=tpe.suggest,
    max_evals=10,
    trials=search_trials,
    notebook_name="5. Classificação de Textos com LSTM e Hyperopt",
)

# Rebuild the data set so the winning model can be scored on the test split
X_train, Y_train, X_test, Y_test = data()

print("Evalutation of best performing model:")
evaluation_result = best_model.evaluate(X_test, Y_test)
print(evaluation_result)

print("Best performing model chosen hyper-parameters:")
print(best_run)

>>> Imports:
#coding=utf-8

try:
    import os
except:
    pass

try:
    import re
except:
    pass

try:
    import json
except:
    pass

try:
    import string
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import seaborn as sns
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from sklearn import preprocessing
except:
    pass

try:
    from sklearn import metrics
except:
    pass

try:
    from keras.layers.core import Dense, Dropout, Activation
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.utils import np_utils
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform
except:
    pass

try:
    from tensorflow import keras
exce

<BatchDataset shapes: (None,), types: tf.string>
Found 149078 word vectors.
Converted 9492 words (226 misses)
338/338 - 11s - loss: 1.1164 - accuracy: 0.5710 - val_loss: 0.8394 - val_accuracy: 0.6964                              

Best validation acc of epoch:                                                                                          
0.696408748626709                                                                                                      
338/338 - 12s - loss: 1.3203 - accuracy: 0.4823 - val_loss: 1.2382 - val_accuracy: 0.5165                              

Best validation acc of epoch:                                                                                          
0.5164753794670105                                                                                                     
 20%|█████████▊                                       | 2/10 [00:33<02:12, 16.59s/trial, best loss: -0.696408748626709]

In [None]:
# Model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the table.
best_model.summary()

In [None]:
# Show the learning rate of the best model's optimizer, using the canonical
# attribute name.
# NOTE(review): `lr` is presumably only a legacy alias of `learning_rate` in
# tf.keras - confirm against the installed version.
print(best_model.optimizer.learning_rate)

In [None]:
# Topic names used as row labels of the report.
# NOTE(review): presumably in the same order as the integer labels produced by
# the label encoder - verify.
classes = ['Economia', 'Internacional', 'Mercado', 'Negócios']

# Softmax outputs for the test titles, then the highest-scoring class per row
predicted = best_model.predict(X_test)
Y_predicted = predicted.argmax(axis=1)

# Compute both metrics first, then print them in the original order
report = metrics.classification_report(Y_test, Y_predicted, target_names=classes)
accuracy = metrics.accuracy_score(Y_test, Y_predicted)

print(report)
print('Acurácia: {}'.format(accuracy))

---