In [1]:
import random
import io
import pickle

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout

ModuleNotFoundError: No module named 'tensorflow'

### Dataset
lineas de dialogo de peliculas

In [3]:
dataset = pd.read_csv(
    "movie_lines.tsv", 
    sep='\t', 
    encoding='ISO-8859-2',
    names = ['lineID', 'characterID ', 'movieID', 'character', 'line']
)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293368 entries, 0 to 293367
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   lineID        293368 non-null  object
 1   characterID   289083 non-null  object
 2   movieID       289083 non-null  object
 3   character     289040 non-null  object
 4   line          288823 non-null  object
dtypes: object(5)
memory usage: 11.2+ MB


In [5]:
print("Cantidad de lineas de dialogo:", dataset.shape[0])

Cantidad de lineas de dialogo: 293368


In [6]:
dataset.head()

Unnamed: 0,lineID,characterID,movieID,character,line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [7]:
# achico el dataset porque se come toda la RAM cuando convierto a ohe y crashea
dataset = dataset[0:20000]

In [8]:
lines = dataset.line.dropna()

### Corpus

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence 
from keras.preprocessing.sequence import pad_sequences

In [10]:
corpus = ' '.join(lines)
corpus



In [11]:
tokens = text_to_word_sequence(corpus)
tokens[:20]

['they',
 'do',
 'not',
 'they',
 'do',
 'to',
 'i',
 'hope',
 'so',
 'she',
 'okay',
 "let's",
 'go',
 'wow',
 'okay',
 "you're",
 'gonna',
 'need',
 'to',
 'learn']

In [12]:
print("Cantidad de tokens en el corpus:", len(tokens))

Cantidad de tokens en el corpus: 199218


In [13]:
train_len = 4

In [14]:
text_sequences = []
for i in range(train_len, len(tokens)):
  seq = tokens[i-train_len:i]
  text_sequences.append(seq)

In [15]:
text_sequences[:5]

[['they', 'do', 'not', 'they'],
 ['do', 'not', 'they', 'do'],
 ['not', 'they', 'do', 'to'],
 ['they', 'do', 'to', 'i'],
 ['do', 'to', 'i', 'hope']]

In [16]:
tok = Tokenizer() 
tok.fit_on_texts(text_sequences) 

sequences = tok.texts_to_sequences(text_sequences)

sequences[:20]

[[53, 20, 24, 53],
 [20, 24, 53, 20],
 [24, 53, 20, 4],
 [53, 20, 4, 2],
 [20, 4, 2, 358],
 [4, 2, 358, 39],
 [2, 358, 39, 78],
 [358, 39, 78, 97],
 [39, 78, 97, 144],
 [78, 97, 144, 47],
 [97, 144, 47, 1802],
 [144, 47, 1802, 97],
 [47, 1802, 97, 40],
 [1802, 97, 40, 111],
 [97, 40, 111, 119],
 [40, 111, 119, 4],
 [111, 119, 4, 632],
 [119, 4, 632, 49],
 [4, 632, 49, 4],
 [632, 49, 4, 633]]

In [17]:
print("Cantidad de rows del dataset:", len(sequences))

Cantidad de rows del dataset: 199214


### Set de entrenamiento

In [18]:
ex = np.array([[1,2,3,4],[5,6,7,8]])
ex

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [19]:
input = ex[:,:-1] # todos los rows, menos la ultima col
target = ex[:, -1] # última col de cada row

print("Input:", input)
print("Target:", target)

Input: [[1 2 3]
 [5 6 7]]
Target: [4 8]


In [20]:
arr_sequences = np.array(sequences)
x_data = arr_sequences[:,:-1]
y_data_int = arr_sequences[:,-1]

print(x_data.shape)
print(y_data_int.shape)

(199214, 3)
(199214,)


In [21]:
tok.index_word

{1: 'you',
 2: 'i',
 3: 'the',
 4: 'to',
 5: 'a',
 6: 'it',
 7: 'and',
 8: 'of',
 9: 'me',
 10: 'that',
 11: 'in',
 12: 'is',
 13: 'what',
 14: 'this',
 15: 'for',
 16: 'my',
 17: "i'm",
 18: 'your',
 19: "don't",
 20: 'do',
 21: 'have',
 22: 'he',
 23: 'know',
 24: 'not',
 25: 'we',
 26: 'on',
 27: 'was',
 28: 'be',
 29: 'no',
 30: 'are',
 31: "it's",
 32: 'but',
 33: 'with',
 34: 'just',
 35: 'all',
 36: 'like',
 37: 'get',
 38: 'about',
 39: 'so',
 40: "you're",
 41: 'him',
 42: 'here',
 43: 'if',
 44: 'out',
 45: 'got',
 46: 'up',
 47: 'go',
 48: 'want',
 49: 'how',
 50: 'one',
 51: 'can',
 52: 'think',
 53: 'they',
 54: "that's",
 55: 'at',
 56: 'now',
 57: 'right',
 58: 'there',
 59: 'yes',
 60: 'did',
 61: 'well',
 62: 'why',
 63: 'her',
 64: 'see',
 65: 'good',
 66: 'as',
 67: 'his',
 68: 'going',
 69: 'who',
 70: 'tell',
 71: 'yeah',
 72: 'will',
 73: 'oh',
 74: 'some',
 75: 'man',
 76: 'been',
 77: 'when',
 78: 'she',
 79: "i'll",
 80: "can't",
 81: "he's",
 82: 'were',
 83: 

In [22]:
vocab_size = len(tok.word_counts)
vocab_size

12406

In [None]:
y_data = to_categorical(y_data_int, num_classes=y_data_int.shape[0])

In [1]:
y_data_int_offset = y_data_int - 1
y_data = to_categorical(y_data_int_offset, num_classes=vocab_size) 
y_data.shape

NameError: ignored

In [None]:
input_seq_len = x_data.shape[1] 
input_seq_len

In [None]:
output_size = vocab_size
output_size

In [None]:
model = Sequential()

# Embedding:
# input_seq_len = 3 --> ingreso 3 palabras
# input_dim = vocab_size --> 1628 palabras distintas
# output_dim = 5 --> crear embeddings de tamaño 3 (tamaño variable y ajustable)
model.add(Embedding(input_dim=vocab_size+1, output_dim=5, input_length=input_seq_len))

model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64)) # La última capa LSTM no lleva return_sequences
model.add(Dense(32, activation='relu'))

# Predicción de clasificación con softmax
# La salida vuelve al espacio de 1628 palabras posibles
model.add(Dense(vocab_size, activation='softmax'))

# Clasificación multiple categórica --> loss = categorical_crossentropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
hist = model.fit(x_data, y_data, epochs=50, validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Entrenamiento
epoch_count = range(1, len(hist.history['accuracy']) + 1)
sns.lineplot(x=epoch_count,  y=hist.history['accuracy'], label='train')
sns.lineplot(x=epoch_count,  y=hist.history['val_accuracy'], label='valid')
plt.show()

In [None]:
import sys
!{sys.executable} -m pip install gradio --quiet

In [None]:
import gradio as gr

def model_response(human_text):

    # Encodeamos
    encoded = tok.texts_to_sequences([human_text])[0]
    # Si tienen distinto largo
    encoded = pad_sequences([encoded], maxlen=3, padding='pre')
    
    # Predicción softmax
    y_hat = model.predict(encoded).argmax(axis=-1)

    # Debemos buscar en el vocabulario la palabra
    # que corresopnde al indice (y_hat) predicho por le modelo
    out_word = ''
    for word, index in tok.word_index.items():
        if index == y_hat:
            out_word = word
            break

    # Agrego la palabra a la frase predicha
    return human_text + ' ' + out_word

iface = gr.Interface(
    fn=model_response,
    inputs=["textbox"],
    outputs="text",
    layout="vertical")

iface.launch(debug=True)

In [None]:
def generate_seq(model, tokenizer, seed_text, max_length, n_words):
    """
        Exec model sequence prediction

        Args:
            model (keras): modelo entrenado
            tokenizer (keras tokenizer): tonenizer utilizado en el preprocesamiento
            seed_text (string): texto de entrada (input_seq)
            max_length (int): máxima longitud de la sequencia de entrada
            n_words (int): números de palabras a agregar a la sequencia de entrada
        returns:
            output_text (string): sentencia con las "n_words" agregadas
    """
    output_text = seed_text
	# generate a fixed number of words
    for _ in range(n_words):
		# Encodeamos
        encoded = tokenizer.texts_to_sequences([output_text])[0]
		# Si tienen distinto largo
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		
		# Predicción softmax
        y_hat = model.predict(encoded).argmax(axis=-1)
		# Vamos concatenando las predicciones
        out_word = ''

        # Debemos buscar en el vocabulario la palabra
        # que corresopnde al indice (y_hat) predicho por le modelo
        for word, index in tokenizer.word_index.items():
            if index == y_hat:
                out_word = word
                break

		# Agrego las palabras a la frase predicha
        output_text += ' ' + out_word
    return output_text

In [None]:
input_text='hey jude don\'t'

generate_seq(model, tok, input_text, max_length=3, n_words=2)