In [52]:
import os
import pandas as pd
import numpy as np
import pickle
from copy import deepcopy

from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

# Carga de datos

In [21]:
# Relative directory that holds the raw tweet datasets.
tweets_path = '../datasets'
# File name of the tweet CSV; joined with tweets_path when the data is loaded.
neil_degrasse_csv = 'datasets_241_800048_NeildeGrasseTysonTweets.csv'

In [22]:
# Read the tweet CSV and keep only the rows whose 'text' column is present.
csv_path = os.path.join(tweets_path, neil_degrasse_csv)
df = pd.read_csv(csv_path).dropna(subset=['text'])

In [23]:
# Concatenate every tweet into one corpus string, each tweet preceded by a
# single space. str.join builds the result in one pass instead of growing the
# string with repeated `+=` inside a Python loop.
text = ''.join(' ' + t for t in df['text'])

In [24]:
# Peek at the first 1000 characters of the corpus as a sanity check.
text[:1000]

' Moon’s shadow landfalls Oregon, crosses USA at 1800mph, exits SCarolina. Behold ‘Muuurica’s Eclipse.pic.twitter.com/fIMCnEyyQy @huggy_panda  Oink, oink.   : - ) Future headlines from the Multiverse: Nov 9, 2016: “Trump: How I Got Hillary Elected while Dismantling the Republican Party.” Awww. That’s the nicest thing anybody has said to me in a long while.https://twitter.com/ayeshatron/status/784441432652320769\xa0… If ComicCon people ruled the world, international conflicts would be resolved entirely by plastic  light saber fights in bars On Pluto, with its 248-year orbit around the Sun, birthdays are incompatible with human physiology. @ivychat Maybe I‘m floating in an atmospheric balloon in Saturn’s atmosphere. The urge to want some bit of information to be true often clouds our ability to assess why that information may be false. Evidence that internet Cats are rapidly achieving cosmic consciousness, soon to become our Overlords:https://www.youtube.com/watch?v=LJSH6Ru1xRk&feature=s

# Preparación del train y test set

In [25]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# integer id assigned to each character is deterministic.
characters = sorted(set(text))
# Two-way lookup tables between an integer id and its character.
n_to_char = dict(enumerate(characters))
char_to_n = {symbol: index for index, symbol in n_to_char.items()}

In [26]:
# Slide a fixed-size window over the corpus one character at a time: each
# window of `seq_length` character ids is an input sample and the id of the
# character that immediately follows it is the target.
seq_length = 100
length = len(text)
X, Y = [], []
for start in range(length - seq_length):
    end = start + seq_length
    X.append([char_to_n[ch] for ch in text[start:end]])
    Y.append(char_to_n[text[end]])

# Formateo de los datos para la entrada en la red

In [27]:
# Arrange the windows as (samples, timesteps, 1): one scalar feature per step.
n_samples = len(X)
X_modified = np.asarray(X).reshape(n_samples, seq_length, 1)
# Scale the character ids into [0, 1) by dividing by the vocabulary size.
X_modified = X_modified / float(len(characters))
# One-hot encode the target character ids.
Y_modified = np_utils.to_categorical(Y)

# Entrenamiento de un modelo sencillo durante 1 época

In [32]:
# Number of distinct characters in the corpus.
vocab_size = len(characters)

In [30]:
# Two stacked 100-unit LSTM layers, each followed by 20% dropout, topped by a
# softmax layer with one output per column of the one-hot targets.
n_timesteps = X_modified.shape[1]
n_features = X_modified.shape[2]
n_classes = Y_modified.shape[1]
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(100, input_shape=(n_timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 100)          40800     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 106)               10706     
Total params: 131,906
Trainable params: 131,906
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Train for a single epoch with 5% of the samples held out for validation,
# and keep only the recorded metrics (the `.history` attribute of the result).
training_run = model.fit(
    X_modified,
    Y_modified,
    batch_size=128,
    epochs=1,
    validation_split=0.05,
    shuffle=True,
)
history = training_run.history

Train on 267682 samples, validate on 14089 samples
Epoch 1/1


In [41]:
# Persist the 1-epoch network and its training history.
model.save('../models/simple_model_1e.h5')
# Use a context manager so the history file is flushed and closed right away
# instead of leaving an anonymous file object open.
with open('../models/simple_model_1e_history.p', 'wb') as history_file:
    pickle.dump(history, history_file)

# Entrenamiento de un modelo sencillo durante 10 épocas

In [42]:
# Fresh, untrained copy of the same architecture for the 10-epoch run: two
# stacked 100-unit LSTM layers with 20% dropout and a softmax output layer.
n_timesteps = X_modified.shape[1]
n_features = X_modified.shape[2]
n_classes = Y_modified.shape[1]
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(100, input_shape=(n_timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100, 100)          40800     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 106)               10706     
Total params: 131,906
Trainable params: 131,906
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Train for ten epochs with 5% of the samples held out for validation,
# and keep only the recorded metrics (the `.history` attribute of the result).
training_run = model.fit(
    X_modified,
    Y_modified,
    batch_size=128,
    epochs=10,
    validation_split=0.05,
    shuffle=True,
)
history = training_run.history

Train on 267682 samples, validate on 14089 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
# Persist the 10-epoch network and its training history.
model.save('../models/simple_model_10e.h5')
# Use a context manager so the history file is flushed and closed right away
# instead of leaving an anonymous file object open.
with open('../models/simple_model_10e_history.p', 'wb') as history_file:
    pickle.dump(history, history_file)

# Generación de texto

In [46]:
 def generate_text(model, string_id, n_chars=400):
     """Extend a seed sequence of character ids with greedily decoded text.

     At every step the current window is scaled the same way as the training
     inputs (divided by the vocabulary size), fed to ``model.predict`` and the
     highest-scoring character id is appended. The window then slides forward
     by one position so its length stays constant.

     Args:
         model: Object exposing ``predict(x, verbose=0)``; it is called with an
             array of shape ``(1, len(string_id), 1)`` and the argmax of its
             output is used as the next character id.
         string_id: Seed as a sequence of integer character ids that are keys
             of the module-level ``n_to_char``. It is copied, not modified.
         n_chars: Number of characters to generate. Defaults to 400, the value
             that was previously hard-coded.

     Returns:
         str: The decoded seed followed by the ``n_chars`` generated characters.
     """
     # Shallow copy is enough for a flat sequence of ints and keeps the
     # caller's seed untouched.
     window = list(string_id)
     generated = [n_to_char[value] for value in window]
     # Loop-invariant normalisation factor (vocabulary size).
     scale = float(len(characters))

     for _ in range(n_chars):
         x = np.reshape(window, (1, len(window), 1)) / scale

         # Greedy decoding: always take the single most likely character.
         pred_index = int(np.argmax(model.predict(x, verbose=0)))
         generated.append(n_to_char[pred_index])

         # Slide the window: add the prediction, drop the oldest id.
         window.append(pred_index)
         window = window[1:]

     return ''.join(generated)

In [49]:
# Reload both trained networks from the .h5 files written by the training cells.
model_1e = load_model('../models/simple_model_1e.h5')
model_10e = load_model('../models/simple_model_10e.h5')

In [59]:
# Generate text with the 1-epoch model, seeded with the training window at index 20.
model_1e_results = generate_text(model_1e, X[20])
print(model_1e_results)

alls Oregon, crosses USA at 1800mph, exits SCarolina. Behold ‘Muuurica’s Eclipse.pic.twitter.com/fIM     00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


In [60]:
# Generate text with the 10-epoch model from the same seed window (index 20).
model_10e_results = generate_text(model_10e, X[20])
print(model_10e_results)

alls Oregon, crosses USA at 1800mph, exits SCarolina. Behold ‘Muuurica’s Eclipse.pic.twitter.com/fIMmmmm Co the aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe the wou aoe 


# Conclusiones

El presente ejercicio tenía como objetivo entrenar de primera mano redes neuronales y, más que conseguir un modelo perfecto, enfrentarse a los retos que surgen ante resultados como los mostrados anteriormente.

También, otro objetivo era el de presentar una representación del texto distinta a la vista en los problemas de clasificación tradicionales (spam, sentiment, ...) a la hora de trabajar con textos y redes neuronales.

Dada una arquitectura de red, a mayor número de épocas el modelo aprende poco a poco la estructura del lenguaje. Modelos más complejos serán capaces, teóricamente, de aprender un mayor número de estructuras del lenguaje y de mayor complejidad. De hecho, está probado que los modelos basados en Deep Learning son capaces de aprender reglas morfológicas, reglas sintácticas o la semántica de un corpus. Está probado también que distintas partes de la red (capas) aprenden aspectos diferentes (por ejemplo, las primeras capas extraerían información morfológica y las capas más profundas serían capaces de aprender la semántica).

Este tipo de modelos, como los que se presentan en las múltiples demos que existen de GPT-2, son verdaderamente complejos de entrenar. Entre otras, algunas limitaciones claras son (se espera que el alumno las haya podido intuir durante la realización del ejercicio):
- Disponibilidad de datos
- Formato de los datos
- ¿Con qué datos / chunks entrenamos? ¿Predecimos un carácter? ¿Varios? ¿La longitud importa?
- Diferencia entre entrenar un modelo de NLG para Twitter y entrenarlo para otro caso de uso (un teclado predictivo, un generador de resúmenes, etc.)
- La complejidad en la validación
- Necesidad de GPUs (o incluso TPUs) para entrenar de manera eficiente estos modelos
- Una vez que se tiene un modelo con un alto rendimiento, ¿es realmente un buen modelo o un loro que repite todo lo que ha aprendido?
