<h2>Text</h2>

In [81]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.utils import resample
import re
import matplotlib.pyplot as plt
from pathlib import Path

In [4]:
def load_dataset(filepath, delim=',', top_words=None, pad_len=None):
    """Load a sentiment CSV and encode it as padded id sequences plus one-hot labels.

    The CSV must provide a ``text`` column and a ``sentiment`` column. Each
    text is converted with ``str()`` and split on whitespace; no lowercasing
    or punctuation stripping happens here.

    Vocabulary handling:
      * If ``word_to_id.npy`` exists in the current working directory it is
        loaded and used as-is. NOTE: in that case ``top_words`` and the
        contents of ``filepath`` do not influence the vocabulary, so delete
        the file when either changes.
      * Otherwise the vocabulary is built from word frequencies and saved to
        that file. The most frequent word receives the highest id and ids
        count down to 1. Id 0 is used for words missing from the vocabulary.

    Args:
        filepath: Path of the CSV file, passed to ``pandas.read_csv``.
        delim: Column delimiter of the CSV file.
        top_words: Keep only this many most frequent words. A falsy value
            keeps every word. A value larger than the number of distinct
            words keeps them all.
        pad_len: Length every sequence is padded/truncated to. A falsy value
            uses the length of the longest comment in the file.

    Returns:
        A tuple ``(X, y)``: ``X`` is the result of ``pad_sequences`` (padding
        appended at the end), ``y`` is ``to_categorical`` applied to
        ``(sentiment + 1) / 2``.
    """
    df = pd.read_csv(filepath, delimiter=delim)

    # str() keeps NaN / numeric cells from breaking .split().
    tokenized = [str(comment).split() for comment in df['text'].tolist()]
    sentiment = df['sentiment'].tolist()

    file_name = "word_to_id.npy"
    if Path(file_name).is_file():
        # The dict is stored as a pickled 0-d object array. NumPy >= 1.16.3
        # refuses to load it unless allow_pickle=True is passed explicitly.
        # SECURITY: unpickling executes arbitrary code; only load a cache file
        # this notebook produced itself.
        word_to_id = np.load(file_name, allow_pickle=True).item()
    else:
        word_freq = Counter()
        for tokens in tokenized:
            word_freq.update(tokens)

        # most_common(None) returns every word; a falsy top_words means "all".
        most_common_words = word_freq.most_common(top_words or None)

        # Size the id range from what was actually returned, so a top_words
        # larger than the vocabulary no longer raises IndexError.
        vocab_size = len(most_common_words)
        word_to_id = {
            word: vocab_size - rank
            for rank, (word, _) in enumerate(most_common_words)
        }

        np.save(file_name, word_to_id)

    # Encode every comment; unknown words map to 0.
    X_data = [[word_to_id.get(word, 0) for word in tokens] for tokens in tokenized]

    max_len = max((len(tokens) for tokens in tokenized), default=0)
    padding_size = pad_len if pad_len else max_len

    # assumes sentiment values are -1 / 1 so they map to classes 0 / 1 -- TODO confirm
    labels = to_categorical([(s + 1) / 2 for s in sentiment])
    return pad_sequences(X_data, maxlen=padding_size, padding='post'), labels

In [88]:
def transform_comment(comment, word_to_id, pad_len=None):
    """Encode a single raw comment as a batch of one id sequence.

    Every run of characters other than ASCII letters is treated as a word
    separator, the text is lowercased, and each resulting word is looked up
    in ``word_to_id``. Words missing from the mapping become 0.

    Args:
        comment: The raw comment string.
        word_to_id: Mapping from word to integer id.
        pad_len: If truthy, the sequence is passed through ``pad_sequences``
            with this length and padding appended at the end.

    Returns:
        The ``pad_sequences`` result when ``pad_len`` is truthy, otherwise a
        NumPy array built from a one-element list holding the id sequence.
    """
    # The quantifier must be '+', not '*': a pattern that can match the empty
    # string makes Python >= 3.7 split between every character, which turned
    # each letter into its own "word" and mapped nearly everything to 0.
    words = re.sub("[^a-zA-Z]+", " ", comment).lower().split()

    encoded = [word_to_id.get(word, 0) for word in words]
    data = [encoded]

    if pad_len:
        return pad_sequences(data, pad_len, padding='post')
    return np.array(data)

In [2]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map using pyplot.

    Args:
        cm: 2-D array of counts, indexed as ``cm[row, col]``.
        classes: Sequence of class labels used for both axes' tick labels.
        normalize: When True, each row is divided by its row sum and the
            cell annotations are formatted with two decimals; otherwise the
            cells are formatted as integers.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the heat map.

    The function draws through the ``plt`` state machine and returns nothing;
    it does not call ``plt.show()``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells above half of the maximum get white text, the rest black.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > cutoff else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('Verdadeiro')
    plt.xlabel('Predito')

In [None]:
def model_training(model, train, test, validation, epochs, callbacks, file):
    """Fit a model, report test accuracy, plot the accuracy curves and save weights.

    Args:
        model: Object exposing ``fit``, ``evaluate`` and ``save_weights``
            (presumably a compiled Keras model with an accuracy metric --
            verify against caller).
        train: Pair ``(X_train, y_train)``.
        test: Pair ``(X_test, y_test)`` used only for the final evaluation.
        validation: Pair ``(X_val, y_val)`` passed to ``fit`` as validation data.
        epochs: Number of training epochs.
        callbacks: Callbacks forwarded to ``model.fit``.
        file: Path prefix; weights are written to ``file + ".hdf5"``.

    Returns:
        The same ``model`` object after training.
    """
    X_train, y_train = train
    X_test, y_test = test
    X_val, y_val = validation

    # Honour the caller's epoch count (it used to be hard-coded to 10) and
    # pass validation data as the tuple Keras documents.
    fit_result = model.fit(X_train, y_train, epochs=epochs,
                           validation_data=(X_val, y_val), callbacks=callbacks)
    result = model.evaluate(X_test, y_test)

    # result[1] is the first metric after the loss.
    print('Accuracy (test set): {}'.format(result[1]))

    # The history key depends on the Keras version / metric spelling:
    # older releases record 'acc', newer ones 'accuracy'.
    history = fit_result.history
    acc_key = 'acc' if 'acc' in history else 'accuracy'

    plt.plot(history[acc_key])
    plt.plot(history['val_' + acc_key])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    # The second curve is the validation split, not the test set.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    model.save_weights(file + ".hdf5")
    return model