# NLG (Natural Language Generator)

### Librerías y funciones necesarias

In [None]:
import sys
import pickle
import heapq
import pandas as pd
import pickle

import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt
%matplotlib inline

from pylab import rcParams
rcParams['figure.figsize'] = 12, 5


import warnings
warnings.filterwarnings('ignore')

# Sets the value of the specified option
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
import tensorflow  as tf
tf.random.set_seed(42)

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.utils import np_utils

Using TensorFlow backend.


In [None]:
# Reports basic quality checks for one column of a DataFrame
def check_column(df, col_name):
    """Print the distinct-value count and the missing-value count/percentage of a column.

    Args:
        df: DataFrame that contains the column.
        col_name: Label of the column to inspect.

    Returns:
        None. Two summary lines are written to stdout.
    """
    column = df[col_name]
    missing_mask = column.isnull()

    # unique() reports a missing value as one more distinct value
    values_unique = len(column.unique())
    values_nan = missing_mask.sum()
    # Share of missing rows over the whole frame, as a percentage rounded to 4 decimals
    portmissing = round((values_nan / len(df)) * 100, 4)

    summary_lines = (
        f'{col_name} consta de: {values_unique} valores distintos de un total de {len(df)}',
        f'{col_name} consta de: {values_nan} valores ausentes, {portmissing}%',
    )
    for line in summary_lines:
        print(line)

In [None]:
# Returns the vocabulary size (number of distinct tokens) of a DataFrame text column
def lenCorpus(df, col_name):
    """Count the distinct space-separated tokens found in a text column.

    Each value is stripped and split on single spaces; every piece is stripped
    again before being added to the vocabulary.

    Args:
        df: DataFrame that contains the column.
        col_name: Label of the text column. Values are expected to be ``str``
            (``str`` methods are called on each one).

    Returns:
        int: number of distinct non-empty tokens.
    """
    vocabulary = set()
    for text in df[col_name]:
        for piece in text.strip().split(' '):
            token = piece.strip()
            # Consecutive spaces make split(' ') emit '' pieces; they are not
            # words, so they must not be counted as a vocabulary entry.
            if token:
                vocabulary.add(token)
    return len(vocabulary)

### Carga de datos

In [None]:
# Mount Google Drive at /content/drive/ (Colab asks for an authorization code)
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [None]:
# Read the dataset with the tweets of astrophysicist Neil deGrasse Tyson
# from the mounted Drive folder and report its (rows, columns) shape
dfNGTyson = pd.read_csv('/content/drive/My Drive/practica_nlp/NeildeGrasseTysonTweets.csv')
print(f'Dimensiones del dataset son: {dfNGTyson.shape}')

Dimensiones del dataset son: (2428, 7)


In [None]:
# Display the first rows
dfNGTyson.head()

Unnamed: 0.1,Unnamed: 0,date,id,link,retweet,text,author
0,0,Aug 21,767371694834978816,/neiltyson/status/767371694834978817,False,"Moon’s shadow landfalls Oregon, crosses USA at...",deGrasseTyson
1,1,Oct 9,785186636946636800,/neiltyson/status/785186636946636800,False,"@huggy_panda Oink, oink. : - )",deGrasseTyson
2,2,Oct 9,785131023923314688,/neiltyson/status/785131023923314688,False,"Future headlines from the Multiverse: Nov 9, 2...",deGrasseTyson
3,3,Oct 7,784443331568930816,/neiltyson/status/784443331568930817,False,Awww. That’s the nicest thing anybody has said...,deGrasseTyson
4,4,Oct 6,784089429120020480,/neiltyson/status/784089429120020481,False,"If ComicCon people ruled the world, internatio...",deGrasseTyson


In [None]:
# Analyse the 'text' column: distinct/missing counts, then the most repeated tweets
check_column(dfNGTyson, 'text')

dfNGTyson['text'].value_counts().head()

text consta de: 2391 valores distintos de un total de 2428
text consta de: 0 valores ausentes, 0.0%


In 5-billion yrs the Sun will expand & engulf our orbit as the charred ember that was once Earth vaporizes. Have a nice day.     5
Just to settle it once and for all: Which came first the Chicken or the Egg? The Egg -- laid by a bird that was not a Chicken    3
How to exaggerate: Prettiest in New York: Miss New York. Prettiest in USA: Miss America. Prettiest on Earth: Miss Universe       3
Why do aliens always disembark via ramp? Do they have problems with stairs? Or are flying saucers just handicap-accessible?      3
When Earth's oil runs out & price of other fuels rises then naughty children will thank Santa for giving them a lump-of-coal     2
Name: text, dtype: int64

In [None]:
# Compute the vocabulary size of the 'text' column
lenCorpus(dfNGTyson, 'text')


13232

In [None]:
# Build a new DataFrame holding only the 'text' column.
# .copy() makes it an independent frame: without it pandas treats dfnlg as derived
# from dfNGTyson, and the column assignments done later by preproccess_df are
# chained assignments on that selection (SettingWithCopyWarning, which the global
# warnings.filterwarnings('ignore') hides in this notebook).
dfnlg = dfNGTyson[['text']].copy()
print(f'Dimensiones del dataset son: {dfnlg.shape}')

Dimensiones del dataset son: (2428, 1)


### Preprocesado

In [None]:
# Text clean-up applied in place to one column of a DataFrame
def preproccess_df(df, col_name):
    """Normalise the text of ``df[col_name]`` and return the same DataFrame.

    Every value is lower-cased and stripped of leading/trailing whitespace, and
    then the literal replacements listed below are applied one after another,
    in the order shown. Because lower-casing happens first, the entity patterns
    are matched against lower-cased text.

    Args:
        df: DataFrame to clean. The column is overwritten on this object.
        col_name: Label of the text column. Values are expected to be ``str``
            (``str`` methods are called on each one).

    Returns:
        The same ``df`` object that was passed in.
    """
    # (old, new) pairs, applied top to bottom
    replacements = (
        ('|', ' '),           # pipe -> space
        ('&#34;', "'"),       # double quote (numeric entity) -> single quote
        ('&quot;', "'"),      # double quote (named entity) -> single quote
        ('&#60;', "<"),       # less-than
        ('&lt;',  "<"),
        ('&#62;', ">"),       # greater-than
        ('&gt;',  ">"),
        ('&amp;',  "&"),      # ampersand
        ('&#8203;', ""),      # zero-width space -> removed
        ('&#8211;', "-"),     # en dash
        ('&#8212;', "-"),     # em dash
        ('&#8216;', "''"),    # left single quotation mark
        ('&#8217;', "''"),    # right single quotation mark
        ('&#8220;', "''"),    # left double quotation mark
        ('&#8221;', "''"),    # right double quotation mark
        ('&#8226;', ""),      # bullet -> removed
        ('&#8230;', "."),     # ellipsis
        ('&#8482;', "TM"),    # trade mark sign
    )

    def clean(text):
        # Lower-case and trim first, then run the ordered replacements
        cleaned = text.lower().strip()
        for old, new in replacements:
            cleaned = cleaned.replace(old, new)
        return cleaned

    df[col_name] = df[col_name].map(clean)
    return df

In [None]:
# Preprocess the 'text' column of dfnlg
dfnlg = preproccess_df(dfnlg, 'text')

In [None]:
# Drop the duplicated rows.
# inplace=True is deliberately not used: with it drop_duplicates returns None,
# so the de-duplicated frame is reassigned instead.
dfnlg = dfnlg.drop_duplicates()
print(f'Dimensiones del dataset son: {dfnlg.shape}')

Dimensiones del dataset son: (2389, 1)


In [None]:
# Compute the vocabulary size of the 'text' column of dfnlg
lenCorpus(dfnlg, 'text')


11998

`Observamos que, al eliminar los registros duplicados, disminuyen tanto las dimensiones del dataframe dfnlg como el tamaño del vocabulario.`

In [None]:
# Build the corpus: every tweet of the 'text' column followed by a newline (\n),
# all concatenated into a single string
dfnlg_tweets = dfnlg['text'].tolist()
tweets = ''.join(tweet + '\n' for tweet in dfnlg_tweets)

print('corpus length: {}'.format(len(tweets)))

corpus length: 277225


In [None]:
tweets[:1000]

'moon’s shadow landfalls oregon, crosses usa at 1800mph, exits scarolina. behold ‘muuurica’s eclipse.pic.twitter.com/fimcneyyqy\n@huggy_panda  oink, oink.   : - )\nfuture headlines from the multiverse: nov 9, 2016: “trump: how i got hillary elected while dismantling the republican party.”\nawww. that’s the nicest thing anybody has said to me in a long while.https://twitter.com/ayeshatron/status/784441432652320769\xa0…\nif comiccon people ruled the world, international conflicts would be resolved entirely by plastic  light saber fights in bars\non pluto, with its 248-year orbit around the sun, birthdays are incompatible with human physiology.\n@ivychat maybe i‘m floating in an atmospheric balloon in saturn’s atmosphere.\nthe urge to want some bit of information to be true often clouds our ability to assess why that information may be false.\nevidence that internet cats are rapidly achieving cosmic consciousness, soon to become our overlords:https://www.youtube.com/watch?v=ljsh6ru1xrk&fe

In [None]:
# Build an unordered collection of unique elements.
set(tweets)

{'\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\xa0',
 '¢',
 'ä',
 'é',
 'ö',
 'ú',
 'ī',
 '–',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '…',
 '⁰'}

In [None]:
# Keep only the first 99900 characters of the corpus
tweets = tweets[:99900]

In [None]:
# Lookup tables used to encode the network input and decode its output
chars = sorted(set(tweets))                                        # distinct characters of the corpus, sorted
indices_char = dict(enumerate(chars))                              # index -> character
char_indices = {ch: idx for idx, ch in indices_char.items()}       # character -> index

print('unique chars: {}'.format(len(chars)))

unique chars: 70


In [None]:
# Print the character -> index dictionary
print(char_indices)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, ';': 27, '=': 28, '?': 29, '@': 30, '[': 31, ']': 32, '_': 33, 'a': 34, 'b': 35, 'c': 36, 'd': 37, 'e': 38, 'f': 39, 'g': 40, 'h': 41, 'i': 42, 'j': 43, 'k': 44, 'l': 45, 'm': 46, 'n': 47, 'o': 48, 'p': 49, 'q': 50, 'r': 51, 's': 52, 't': 53, 'u': 54, 'v': 55, 'w': 56, 'x': 57, 'y': 58, 'z': 59, '\xa0': 60, 'ö': 61, 'ú': 62, '–': 63, '—': 64, '‘': 65, '’': 66, '“': 67, '”': 68, '…': 69}


In [None]:
# Print the index -> character dictionary
print(indices_char)


{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '$', 6: '%', 7: '&', 8: "'", 9: '(', 10: ')', 11: '*', 12: ',', 13: '-', 14: '.', 15: '/', 16: '0', 17: '1', 18: '2', 19: '3', 20: '4', 21: '5', 22: '6', 23: '7', 24: '8', 25: '9', 26: ':', 27: ';', 28: '=', 29: '?', 30: '@', 31: '[', 32: ']', 33: '_', 34: 'a', 35: 'b', 36: 'c', 37: 'd', 38: 'e', 39: 'f', 40: 'g', 41: 'h', 42: 'i', 43: 'j', 44: 'k', 45: 'l', 46: 'm', 47: 'n', 48: 'o', 49: 'p', 50: 'q', 51: 'r', 52: 's', 53: 't', 54: 'u', 55: 'v', 56: 'w', 57: 'x', 58: 'y', 59: 'z', 60: '\xa0', 61: 'ö', 62: 'ú', 63: '–', 64: '—', 65: '‘', 66: '’', 67: '“', 68: '”', 69: '…'}


In [None]:
# Training samples: every window of seq_length consecutive characters (as indices)
# is an input, and the character that follows the window is its label.
seq_length = 100
length = len(tweets)

# Encode the corpus once; windows and labels are then plain slices of it
encoded = [char_indices[char] for char in tweets]
X = [encoded[start:start + seq_length] for start in range(length - seq_length)]
Y = encoded[seq_length:]

In [None]:
# Number of samples and labels. The f prefix is required: without it {len(X)} is a
# set literal passed as a second argument, and the output shows up as "{99800}".
print(f'Dimensiones X: {len(X)}')
print(f'Dimensiones Y: {len(Y)}')

Dimensiones X:  {99800}
Dimensiones Y:  {99800}


In [None]:
# Network input: one row per sample, seq_length time steps, 1 feature per step,
# with the character indices divided by the number of distinct characters
vocab_size = float(len(chars))
X_modified = np.reshape(X, (len(X), seq_length, 1)) / vocab_size
# Network target: categorical (one-hot) encoding of the label indices
Y_modified = np_utils.to_categorical(Y)

In [None]:
# Shapes of the arrays fed to the network. The f prefix is required: without it the
# braces build a set literal and the shape is printed wrapped in "{...}".
print(f'Dimensiones X_modified: {X_modified.shape}')
print(f'Dimensiones Y_modified: {Y_modified.shape}')

Dimensiones X_modified:  {(99800, 100, 1)}
Dimensiones Y_modified:  {(99800, 70)}


### Models 


In [None]:
# Model 1: two stacked LSTM layers of 400 units, each followed by 20% dropout,
# and a softmax Dense layer with one output per column of Y_modified.
model_1 = Sequential()
# return_sequences=True: the first LSTM outputs the whole sequence, which the next LSTM consumes
model_1.add(LSTM(400, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model_1.add(Dropout(0.2))
model_1.add(LSTM(400))
model_1.add(Dropout(0.2))
model_1.add(Dense(Y_modified.shape[1], activation='softmax'))

# Compile the network
model_1.compile(loss='categorical_crossentropy', optimizer='adam')
# Fit the network for 20 epochs, holding out 5% of the samples for validation.
# NOTE(review): Keras documents that validation_split takes the last samples before
# shuffling; the windows overlap, so validation text is close to training text - confirm
# this is acceptable. Only the .history dict of per-epoch metrics is kept.
history_1 = model_1.fit(X_modified, Y_modified, validation_split=0.05, batch_size=128, epochs=20, shuffle=True).history

# Print the layer-by-layer architecture
model_1.summary()

Train on 94810 samples, validate on 4990 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 400)          643200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 400)               1281600   
_________________________________________________________________
dropout_2 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 70

In [None]:
# Model 2: two stacked LSTM layers of 700 units, each followed by 20% dropout,
# and a softmax Dense layer with one output per column of Y_modified.
model_2 = Sequential()
# return_sequences=True: the first LSTM outputs the whole sequence, which the next LSTM consumes
model_2.add(LSTM(700, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model_2.add(Dropout(0.2))
model_2.add(LSTM(700))
model_2.add(Dropout(0.2))
model_2.add(Dense(Y_modified.shape[1], activation='softmax'))

# Compile the network
model_2.compile(loss='categorical_crossentropy', optimizer='adam')
# Fit the network for 20 epochs, holding out 5% of the samples for validation.
# NOTE(review): Keras documents that validation_split takes the last samples before
# shuffling; the windows overlap, so validation text is close to training text - confirm
# this is acceptable. Only the .history dict of per-epoch metrics is kept.
history_2 = model_2.fit(X_modified, Y_modified, validation_split=0.05, batch_size=128, epochs=20, shuffle=True).history

# Print the layer-by-layer architecture
model_2.summary()

Train on 94810 samples, validate on 4990 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 700)          1965600   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 700)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 700)               3922800   
_________________________________________________________________
dropout_4 (Dropout)          (None, 700)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 70

In [None]:
# Model 3: three stacked LSTM layers of 400 units, each followed by 20% dropout,
# and a softmax Dense layer with one output per column of Y_modified.
model_3 = Sequential()
# return_sequences=True on the first two LSTMs: each outputs the whole sequence for the next LSTM
model_3.add(LSTM(400, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model_3.add(Dropout(0.2))
model_3.add(LSTM(400, return_sequences=True))
model_3.add(Dropout(0.2))
model_3.add(LSTM(400))
model_3.add(Dropout(0.2))
model_3.add(Dense(Y_modified.shape[1], activation='softmax'))

# Compile the network
model_3.compile(loss='categorical_crossentropy', optimizer='adam')
# Fit the network for 20 epochs, holding out 5% of the samples for validation.
# NOTE(review): Keras documents that validation_split takes the last samples before
# shuffling; the windows overlap, so validation text is close to training text - confirm
# this is acceptable. Only the .history dict of per-epoch metrics is kept.
history_3 = model_3.fit(X_modified, Y_modified, validation_split=0.05, batch_size=128, epochs=20, shuffle=True).history

# Print the layer-by-layer architecture
model_3.summary()

Train on 94810 samples, validate on 4990 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100, 400)          643200    
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100, 400)          1281600   
_________________________________________________________________
dropout_6 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 40

### Generate text

In [None]:
def generate_text(model, string_id, n_chars=400):
    """Generate text one character at a time, starting from a seed sequence.

    At every step the current window of character indices is fed to the model,
    the index with the highest predicted score is taken (argmax), its character
    is appended to the output, and the window slides one position forward.

    Reads the module-level ``indices_char`` (index -> character) and ``chars``
    (list of distinct characters) lookup objects.

    Args:
        model: Object exposing ``predict(x, verbose=0)`` that returns the scores
            for the next character given an array of shape (1, window length, 1).
        string_id: Sequence of character indices used as the seed. It is copied,
            so the caller's sequence is left untouched.
        n_chars: Number of characters to generate (default 400).

    Returns:
        str: the seed decoded to text followed by the ``n_chars`` generated characters.
    """
    # A shallow copy is enough for a flat sequence of integers and avoids relying on
    # deepcopy, which this notebook only imports in a cell that comes after this one.
    window = list(string_id)
    generated = [indices_char[value] for value in window]
    vocab_size = float(len(chars))

    for _ in range(n_chars):
        # Same encoding as the training input: (1, window length, 1), indices scaled
        # by the number of distinct characters
        x = np.reshape(window, (1, len(window), 1)) / vocab_size

        pred_index = int(np.argmax(model.predict(x, verbose=0)))
        generated.append(indices_char[pred_index])

        # Slide the window: drop the oldest index, append the predicted one
        window = window[1:] + [pred_index]

    return ''.join(generated)

In [None]:
from copy import deepcopy
model_1_results = generate_text(model_1, X[5])

In [None]:
model_2_results = generate_text(model_2, X[5])

In [None]:
model_3_results = generate_text(model_3, X[5])

### Show the results

In [None]:
# Decode the seed sequence X[5] (the one passed to generate_text) back into text
t = ''.join(indices_char.get(ch) for ch in X[5])
print(t)

s shadow landfalls oregon, crosses usa at 1800mph, exits scarolina. behold ‘muuurica’s eclipse.pic.t


In [None]:
print(model_1_results)

s shadow landfalls oregon, crosses usa at 1800mph, exits scarolina. behold ‘muuurica’s eclipse.pic.twitter.com/nzspsprrr
would be coes of the world ons that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that in the universe in the world ont that i


In [None]:
print(model_2_results)

s shadow landfalls oregon, crosses usa at 1800mph, exits scarolina. behold ‘muuurica’s eclipse.pic.twitter.com/zgq9hojkna
as the cosmos will make to anl of the snuthern hemisphere. the conmints was a painng colsert to mever lake to all pelple in the world. then a domn what wou can wan cioc to al axtrophysicist. and you are nnt boonsers then you’re still the touthern hemisphere.
the clicks of piople love their canling arpund the sun. bankie the lumber of people who are intolerant of inter tese th


In [None]:
print(model_3_results)

s shadow landfalls oregon, crosses usa at 1800mph, exits scarolina. behold ‘muuurica’s eclipse.pic.twitter.com/zghjoooozm
in the eilm #gravity wart to darth, in the eirst planet of the eruator than we dan heed a person hand you whe sruth.
i wonder if woud what dound be coml if dome in the sniverse in the sniverse to all the sun. bonk the sup will earth still live in the sky — they would have soace aliens landed in a ninlion with a comstrly and not on the sun will expand & onace aliens to ce rear


### Conclusiones




```
Epoch 20/20
Model 1  94810/94810 [==============================] - 213s 2ms/step - loss: 1.8154 - val_loss: 2.5978
Model 2  94810/94810 [==============================] - 223s 2ms/step - loss: 1.0490 - val_loss: 3.0984
Model 3  94810/94810 [==============================] - 323s 3ms/step - loss: 1.3818 - val_loss: 2.5797
```

El modelo con menor pérdida en training es el modelo 2, pero en validation su pérdida es la mayor de todos ellos (señal de sobreajuste). Por tanto, según estos datos y teniendo en cuenta que sólo se ha entrenado durante 20 épocas, el mejor modelo es el 3 [loss: 1.3818 - val_loss: 2.5797], que obtiene la menor pérdida en validation.

Por otro lado, si nos fijamos en el texto generado, aunque contiene errores ortográficos y gramaticales, su contenido está relacionado con el universo.

