In [1]:
from keras.datasets import imdb

# Load the IMDB review dataset. num_words=10000 caps the vocabulary, so the
# word indices in the returned sequences fit the 10,000-wide encoding used below.

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [4]:
import numpy as np

#Codificación one-hot: vector de tamaño 10.000 donde los indices de las palabras de la reseña se establecen en 1 y el resto en 0

def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode integer sequences.

    Args:
        sequences: sized iterable of integer index collections, one per sample.
        dimension: width of each encoded row; every index must be below it.

    Returns:
        A float array of shape ``(len(sequences), dimension)`` where row ``i``
        holds 1.0 at every index listed in ``sequences[i]`` and 0.0 elsewhere.
    """
    encoded = np.zeros((len(sequences), dimension))
    # Each ``row`` is a view into ``encoded``, so writing to it fills the matrix.
    for row, word_ids in zip(encoded, sequences):
        row[word_ids] = 1.
    return encoded
 
# Encode both splits with the same multi-hot scheme (train first, then test).
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

In [5]:
# Convert the label arrays to float32 for training and evaluation.
y_train, y_test = (
    np.asarray(labels).astype('float32')
    for labels in (train_labels, test_labels)
)

In [6]:
from keras import models
from keras import layers

# Architecture: vectorized text (10,000 features)
#   -> Dense(16, relu) -> Dense(16, relu) -> Dense(1, sigmoid)
# The single sigmoid unit outputs a probability in [0, 1].
model = models.Sequential([
    layers.Input(shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [7]:
# Binary classification setup: cross-entropy loss on the sigmoid output,
# with accuracy tracked as the reported metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# Hold out the first 10,000 training samples for validation and train on the
# rest. The slices must be `[:10000]` (a range), not `[10000]` (one sample):
# indexing a single row would give validation data of the wrong shape.
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]

In [9]:
# Train on the reduced training set and report loss/metrics on the held-out
# validation samples after every epoch.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=20,
    validation_data=(x_val, y_val),
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.7075 - loss: 0.6043 - val_accuracy: 0.8535 - val_loss: 0.4164
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8948 - loss: 0.3569 - val_accuracy: 0.8863 - val_loss: 0.3226
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9214 - loss: 0.2557 - val_accuracy: 0.8858 - val_loss: 0.2929
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9353 - loss: 0.2079 - val_accuracy: 0.8861 - val_loss: 0.2850
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9516 - loss: 0.1655 - val_accuracy: 0.8873 - val_loss: 0.2782
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9575 - loss: 0.1433 - val_accuracy: 0.8861 - val_loss: 0.2827
Epoch 7/20
[1m30/30[0m [32m━━━━━━━━━

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit.
history_dict = history.history
loss_values, val_loss_values = history_dict['loss'], history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)  # 1-based epoch numbers for the x axis

# Blue dots for the training curve, solid blue line for the validation curve.
for series, fmt, legend_label in (
    (loss_values, 'bo', 'Training loss'),
    (val_loss_values, 'b', 'Validation loss'),
):
    plt.plot(epochs, series, fmt, label=legend_label)

plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
 plt.clf()  # start from an empty figure
 # The history keys follow the metric name given to compile(): 'accuracy'
 # for metrics=['accuracy'], 'acc' for metrics=['acc']. Accept either so the
 # cell does not fail with a KeyError.
 acc_key = 'accuracy' if 'accuracy' in history_dict else 'acc'
 acc_values = history_dict[acc_key]
 val_acc_values = history_dict[f'val_{acc_key}']

 plt.plot(epochs, acc_values, 'bo', label='Training acc')
 plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
 plt.title('Training and validation accuracy')
 plt.xlabel('Epochs')
 plt.ylabel('Accuracy')  # this figure shows accuracy, not loss
 plt.legend()
 plt.show()

In [32]:
# How to decode an IMDB review back into text
import random

# randrange excludes its upper bound; randint(0, len(train_data)) could return
# len(train_data) and raise an IndexError below.
index = random.randrange(len(train_data))

word_index = imdb.get_word_index()  # dictionary mapping each word to a unique integer index
reverse_word_index = {value: key for key, value in word_index.items()}

# Encoded reviews are shifted by 3 because the first indices are reserved for
# special tokens; indices without a matching word decode to an empty string.
decoded_review = ' '.join(reverse_word_index.get(i - 3, '') for i in train_data[index])

print(x_train[index])
print(decoded_review)
print(train_labels[index])

[0. 1. 1. ... 0. 0. 0.]
 i just finished watching this film and wow was that bad actually the only thing that kept me watching was that it was so  bad it was kind of entertaining the action of the characters is hilarious from the hyper dramatic way they fall to  to their incredibly bad acting were the bad guys all just pulled off the street or were they actually actors to incredibly bad delivery of lines to their inexplicable actions if you are going to try and shoot someone through a  as they enter obviously the thing to do is shoot directly at the  this film must break some record for worst written and delivered lines br br the camera work was also really bad you can hardly see what's going on in the fight scenes due to switching camera angles and  br br i would have voted 1 except that i do like chiba and sidekick sue  and i was entertained by a couple of scenes 1 breaking of a  arm so the bone pops out of the skin that's gotta hurt 2 a drug  eating a brown  animal a monkey by  away

In [36]:
def text_to_sequence(text, word_index=None, dimension=10000, index_from=3):
    """Encode free text as a multi-hot vector over the IMDB vocabulary.

    The text is lower-cased and split on whitespace. Every word found in
    ``word_index`` sets position ``index + index_from`` of the result to 1.0.
    Unknown words, and words whose shifted position does not fit in the
    vector, are ignored.

    Args:
        text: review text to encode.
        word_index: mapping from word to integer index. Defaults to
            ``imdb.get_word_index()``; pass it explicitly to avoid reloading
            the dictionary on every call.
        dimension: length of the returned vector.
        index_from: offset added to every word index (the training sequences
            are shifted by this amount).

    Returns:
        A 1-D float array of length ``dimension``.
    """
    if word_index is None:
        word_index = imdb.get_word_index()

    # NOTE(review): the training vectors also have positions 1 and 2 set (see
    # the printed x_train sample); this encoder never sets them. Confirm
    # whether that mismatch matters for predictions.
    vector = np.zeros(dimension)
    for word in text.lower().split():
        index = word_index.get(word)
        if index is None:
            continue
        position = index + index_from
        # Bound the shifted position rather than the raw index: checking only
        # `index < dimension` lets the last `index_from` indices write past the
        # end of the vector.
        if 0 <= position < dimension:
            vector[position] = 1.0

    return vector


# Read a review from the user and encode it the same way as the training data.
text_review = input("Escribe una reseña: ")
input_vector = text_to_sequence(text_review)

print(input_vector)

# The model expects a batch, so add a leading axis of size 1.
prediction = model.predict(np.expand_dims(input_vector, axis=0))

# prediction[0] is the output row for the single review; above 0.5 counts as positive.
sentiment = "positiva" if prediction[0] > 0.5 else "negativa"
print(f"Reseña {sentiment}: {prediction[0]}")

Escribe una reseña:  es una puta mierda


[0. 0. 0. ... 0. 0. 0.]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Reseña positiva: [0.5229162]


In [22]:
# Validación K-fold
import tensorflow
import numpy as np
from keras import models, layers
from keras.datasets import imdb
from keras.utils import to_categorical


def vectorize_sequences(sequences, dimension=10000):
    """Return a multi-hot matrix for a collection of index sequences.

    Row ``n`` of the ``(len(sequences), dimension)`` float result has 1.0 at
    each index that appears in the ``n``-th sequence and 0.0 everywhere else.
    """
    shape = (len(sequences), dimension)
    one_hot = np.zeros(shape)
    for row_idx, word_ids in enumerate(sequences):
        # Fancy indexing sets every listed column of this row at once.
        one_hot[row_idx, word_ids] = 1.0
    return one_hot


# Definir el modelo
def build_model():
    """Build and compile a fresh binary classifier for 10,000-wide inputs.

    The network is Dense(16, relu) -> Dense(16, relu) -> Dense(1, sigmoid),
    compiled with RMSprop, binary cross-entropy and the 'acc' metric. A new,
    untrained model is returned on every call so folds do not share weights.

    Returns:
        The compiled Sequential model.
    """
    model = models.Sequential()
    # Declare the input with an explicit Input layer instead of passing
    # `input_shape` to the first Dense layer, which makes Keras emit a
    # warning when the layer is constructed.
    model.add(layers.Input(shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    return model

# Load and prepare the data. Only the training split is used for
# cross-validation; the test split is loaded but left untouched.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

all_data = vectorize_sequences(train_data)
all_labels = np.array(train_labels).astype('float32')

k = 4  # number of folds
fold_size = len(all_data) // k

# Cut the samples and labels into k consecutive, equally sized folds.
folds_data, folds_labels = [], []
for fold in range(k):
    print(f"Creado fold: #{fold + 1}")

    fold_slice = slice(fold_size * fold, fold_size * (fold + 1))
    folds_data.append(all_data[fold_slice])
    folds_labels.append(all_labels[fold_slice])

# Training hyperparameters shared by every fold.
num_epochs = 20
batch_size = 512

all_scores = []  # validation accuracy of each fold

# Run K-fold cross-validation: each fold is held out once while a fresh model
# is trained on the remaining folds.
for fold in range(k):
    print(f"Analizando fold: #{fold + 1}")

    # Held-out fold used for validation.
    fold_test_data = folds_data[fold]
    fold_test_labels = folds_labels[fold]

    # Every other fold, in order, stacked into one training set.
    fold_training_data = np.concatenate(folds_data[:fold] + folds_data[fold + 1:])
    fold_training_labels = np.concatenate(folds_labels[:fold] + folds_labels[fold + 1:])

    # New model per fold so no weights carry over between folds.
    model = build_model()

    history = model.fit(
        x=fold_training_data,
        y=fold_training_labels,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_data=(fold_test_data, fold_test_labels),
    )

    # Score the trained model on the held-out fold and record its accuracy.
    val_loss, val_acc = model.evaluate(fold_test_data, fold_test_labels, verbose=0)
    print(f"Fold #{fold + 1}: {val_acc}")
    all_scores.append(val_acc)

# Summarize the fold scores: mean and standard deviation.
print(f"Media del k-fold: {np.mean(all_scores)}")
print(f"Desviación típica del k-fold: {np.std(all_scores)}")

Creado fold: #1
Creado fold: #2
Creado fold: #3
Creado fold: #4
Analizando fold: #1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - acc: 0.6967 - loss: 0.5888 - val_acc: 0.8715 - val_loss: 0.3582
Epoch 2/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9020 - loss: 0.3023 - val_acc: 0.8845 - val_loss: 0.2932
Epoch 3/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9258 - loss: 0.2235 - val_acc: 0.8765 - val_loss: 0.2992
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9356 - loss: 0.1846 - val_acc: 0.8915 - val_loss: 0.2730
Epoch 5/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9495 - loss: 0.1530 - val_acc: 0.8904 - val_loss: 0.2796
Epoch 6/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9593 - loss: 0.1276 - val_acc: 0.8667 - val_loss: 0.3399
Epoch 7/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - acc: 0.9625 -