In [20]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

from collections import Counter

In [5]:
# The reviews CSV is expected in the directory the notebook is run from.
datasetPath = os.path.join(
    os.getcwd(),
    'Musical_instruments_reviews.csv',
)
df = pd.read_csv(filepath_or_buffer=datasetPath)

In [6]:
# Copy the review text (features) and the 'overall' column (targets) out of
# the DataFrame into standalone NumPy arrays.
xnp, ynp = (np.array(df[column]) for column in ('reviewText', 'overall'))

In [8]:
# Show the first review as a quick sanity check of the loaded text.
sample_text = df.loc[0, 'reviewText']
sample_text

"Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,"

In [10]:
# Make sure every review is a Python string; any non-string entry (such as a
# NaN standing in for a missing review) is converted with str().
xnp = np.array([str(text) for text in df['reviewText']], dtype=object)

# Binarise the rating: overall > 3 is positive (1), everything else negative (0).
# The labels are recomputed from df instead of from ynp so that re-running this
# cell does not threshold the already-binarised labels (which would turn every
# label into 0).
# NOTE(review): the previous comment said "overall > 2" while the code used > 3;
# the threshold is kept at 3 -- confirm which one is intended.
ynp = (np.array(df['overall']) > 3).astype(int)

In [14]:
# Hold out 20% of the reviews as a test set, keeping the class balance of ynp
# in both parts (stratified split, fixed seed). No validation set is created here.
X_train, X_test, y_train, y_test = train_test_split(
    xnp,
    ynp,
    test_size=0.2,
    random_state=42,
    stratify=ynp,
)

In [15]:
def preprocess(X_batch, y_batch):
    """Clean and tokenise a batch of review strings.

    Steps applied to X_batch:
      1. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a space;
      2. replace every character that is not an ASCII letter or an apostrophe
         with a space;
      3. split on whitespace into words;
      4. convert the ragged result to a dense tensor, padding shorter reviews
         with the token ``b"<pad>"``.

    Returns the padded token tensor together with y_batch, which is passed
    through unchanged.
    """
    without_breaks = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [16]:
batch_size = 100
# Number of batches per epoch, computed with ceiling division: a final partial
# batch is counted, but no extra batch is added when len(X_train) is an exact
# multiple of batch_size (the previous `// batch_size + 1` over-counted by one
# in that case).
n_batches = (len(X_train) + batch_size - 1) // batch_size

In [17]:
# Build the training pipeline: slice the (text, label) tensors into individual
# examples, group them into batches of batch_size, then clean/tokenise each
# batch with preprocess().
trainDataset = (
    tf.data.Dataset
    .from_tensor_slices(
        (tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train))
    )
    .batch(batch_size=batch_size)
    .map(preprocess)
)

In [21]:
# Word-frequency counter from which the vocabulary is built; it starts empty
# and is filled from the training batches.
vocabulary = Counter()

In [22]:

# Count every token (padding token included) of every review in the training set.
for X_batch, _ in trainDataset:
    for review_tokens in X_batch.numpy():
        vocabulary.update(review_tokens.tolist())

In [23]:
# Keep only the vocab_size most frequent tokens, most frequent first.
vocab_size = 15000
truncated_vocabulary = [
    word for word, _ in vocabulary.most_common(vocab_size)
]


In [24]:
# Build a lookup table that maps each word to its ID, i.e. its index in the
# truncated vocabulary. Words that are not in the vocabulary are assigned to
# one of 1,000 out-of-vocabulary (oov) buckets.
num_oov_buckets = 1000

words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [25]:
def encode_words(X_batch, y_batch):
    """Replace each token in X_batch by its ID from the lookup `table`.

    y_batch is returned unchanged alongside the encoded batch.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch


In [26]:
# Encode tokens as integer IDs and prefetch one batch ahead.
trainDataset = (
    trainDataset
    .map(encode_words)
    .prefetch(1)
)

In [27]:
# Sentiment classifier: word embeddings -> two stacked GRU layers -> a single
# sigmoid unit that outputs the probability of a positive review.
embed_size = 128

model = keras.models.Sequential()
# One embedding row per vocabulary word plus one per out-of-vocabulary bucket;
# input_shape=[None] accepts sequences of any length.
model.add(
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        input_shape=[None],
    )
)
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

history = model.fit(trainDataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
