<a href="https://colab.research.google.com/github/mvenouziou/text_generator/blob/main/poetry_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# tensorflow modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# for loading pre-trained BERT embeddings
!pip install -q tensorflow-text  # need to install at each Google Colab session
import tensorflow_text as text  
import tensorflow_hub as hub

# general modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import string

In [5]:
# Load CSV and Split into train/test/validation sets
def load_and_split_csv(select_column=False, data_url=None,
                       test_size=0.3, valid_size=0.3, random_state=42):
    """Load the poem CSV and split it into train/test/validation frames.

    The split happens in two stages: ``test_size`` of the rows are first
    held out from training, then ``valid_size`` of that held-out portion is
    set aside for validation. With the defaults this is roughly
    70% train / 21% test / 9% validation of the full dataset.

    Args:
        select_column: name of a single column to keep. Any falsy value
            (the default ``False``) keeps every column.
        data_url: path or URL handed to ``pd.read_csv``. ``None`` (default)
            loads the Robert Frost collection from the project's GitHub repo.
        test_size: fraction of all rows held out from the training set.
        valid_size: fraction of the held-out rows used for validation.
        random_state: seed used for both splits so results are reproducible.

    Returns:
        Tuple ``(train_df, test_df, valid_df)`` of DataFrames.
    """
    if data_url is None:
        data_url = ('https://raw.githubusercontent.com/mvenouziou'
                    '/text_generator/main/robert_frost_collection.csv')

    df = pd.read_csv(data_url)

    # double brackets keep the result a one-column DataFrame (not a Series)
    if select_column:
        df = df[[select_column]]

    # Only the frame itself needs splitting; passing it once yields the same
    # rows as passing it twice and discarding the duplicate outputs.
    train_df, holdout_df = train_test_split(
        df, test_size=test_size, random_state=random_state)

    test_df, valid_df = train_test_split(
        holdout_df, test_size=valid_size, random_state=random_state)

    return train_df, test_df, valid_df

In [11]:
# Load the dataset with every column kept and split it into the three frames,
# then preview the first rows of the training split as this cell's output.
train_df, test_df, valid_df = load_and_split_csv(select_column=False)
train_df.head()

Unnamed: 0,Name,Content,Collection,Year of Publication
33,A Late Walk,"When I go up through the mowing field,\nThe he...",A boy's Will,1913.0
55,My Butterfly,"Thine emulous fond flowers are dead, too,\r\nA...",A boy's Will,1913.0
69,Good Hours,I had for my winter evening walk- \nNo one at ...,North of Boston,1914.0
28,Meeting and Passing,As I went down the hill along the wall\nThere ...,Mountain Interval,1916.0
40,Asking for Roses,"A house that lacks, seemingly, mistress and ma...",A boy's Will,1913.0


In [6]:
# clean dataset
def clean_text(df, column='Content', dropna=True):
    """Initial text prep for one column of ``df``, applied in place.

    Steps, in order:
      1. optionally drop rows whose ``column`` value is missing;
      2. replace every line break (``\\n`` or ``\\r\\n``) with the token
         `` EOL `` so line endings survive whitespace tokenization;
      3. pad every character in ``string.punctuation`` with a space on each
         side so punctuation becomes its own token;
      4. collapse the runs of spaces created by steps 2-3 into one space.

    Args:
        df: DataFrame holding the text; it is modified in place.
        column: name of the text column to clean.
        dropna: when True, rows with a missing value in ``column`` are
            removed from ``df``.

    Returns:
        None. All changes are reflected in the (mutable) ``df``.
    """
    # drop missing values; inplace so the caller's frame really loses the rows
    # (a bare df.dropna() returns a new frame and leaves df untouched)
    if dropna:
        df.dropna(subset=[column], inplace=True)

    # .str.replace works on substrings inside each cell, whereas
    # Series.replace only matches whole cell values
    cleaned = df[column]

    # add 'EOL' (end of line): poetry breaks without separating punctuation
    cleaned = cleaned.str.replace('\r\n', '\n', regex=False)
    cleaned = cleaned.str.replace('\n', ' EOL ', regex=False)

    # separate punctuation from words with extra spacing
    # regex=False: many punctuation characters are regex metacharacters
    for punc in string.punctuation:
        cleaned = cleaned.str.replace(punc, ' ' + punc + ' ', regex=False)

    # remove any excess spaces created above, whatever the run length
    cleaned = cleaned.str.replace(' {2,}', ' ', regex=True)

    df[column] = cleaned

    return None  # changes reflected in (mutable) dataframe

In [8]:
def create_Xy_df(text):
    """Break a single text into every possible (X, y) left/right word split.

    The text is split on whitespace. For each split point ``i`` from 1 to
    ``n_words - 1``, the first ``i`` words (re-joined with single spaces)
    become the model input ``X`` and the remaining words become the intended
    prediction ``y``.

    Args:
        text: the string to split.

    Returns:
        DataFrame with columns ``['X', 'y']`` and one row per split point,
        indexed by the split position ``i``. Texts with fewer than two words
        yield an empty DataFrame with the same columns.
    """
    # split sentence into words
    words = text.split()
    split_points = range(1, len(words))

    # group words into all possible left/right splits; collect them first and
    # build the frame once instead of enlarging it one row at a time
    records = [(' '.join(words[:i]), ' '.join(words[i:]))
               for i in split_points]

    if not records:
        return pd.DataFrame(columns=['X', 'y'])

    return pd.DataFrame(records, columns=['X', 'y'], index=list(split_points))
    

def split_x_y(df, column=None):
    """Break the full dataset's text into model inputs X and outputs y.

    ``create_Xy_df`` is applied to every text in the chosen column and the
    per-text results are stacked into one frame.

    Args:
        df: DataFrame containing the text to split.
        column: name of the text column. ``None`` (default) uses the first
            column of ``df`` by position, so a multi-column frame should
            pass the text column explicitly (e.g. ``column='Content'``).

    Returns:
        Tuple ``(X, y)`` of Series. Index labels are the per-text split
        positions produced by ``create_Xy_df`` and therefore repeat across
        texts.
    """
    # .iloc[:, 0] is an explicit positional lookup; indexing a row Series
    # with the integer key [0] relies on deprecated positional fallback
    texts = df.iloc[:, 0] if column is None else df[column]

    # create X and y data for model by applying 'create_Xy_df' to each text;
    # concatenate once at the end instead of re-copying inside the loop
    frames = [create_Xy_df(entry) for entry in texts]
    frames = [frame for frame in frames if not frame.empty]

    if frames:
        full_df = pd.concat(frames)
    else:
        # nothing to split: keep the expected columns so X/y lookups work
        full_df = pd.DataFrame(columns=['X', 'y'])

    # separate out X and y components
    return full_df['X'], full_df['y']

In [9]:
# create BERT encoding layer
def BERT_encoding_layer(text_input):
    """Encode text with the pre-trained BERT model from TensorFlow Hub.

    The input is first run through the matching BERT preprocessing layer and
    then through the (frozen) BERT encoder. Only the encoder's
    ``"pooled_output"`` entry is returned; per the original notes it has
    shape [batch_size, 768]. The encoder also exposes ``"sequence_output"``
    (noted as [batch_size, seq_length, 768]), which is not returned here.

    Args:
        text_input: tensor of text passed straight to the preprocessing layer.

    Returns:
        The ``"pooled_output"`` tensor of the BERT encoder.
    """
    # formats raw text into the inputs the BERT encoder expects
    bert_preprocessor = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2")
    preprocessed = bert_preprocessor(text_input)

    # pre-trained encoder; trainable=False keeps its weights frozen
    bert_encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
        trainable=False)
    encoder_outputs = bert_encoder(preprocessed)

    return encoder_outputs["pooled_output"]

In [10]:
def create_model():
    """Build, summarize and compile the recurrent text model.

    The network stacks a bidirectional LSTM (64 units), a second LSTM
    (64 units) and a 10-unit Dense layer, all returning full sequences.
    It prints the model summary, then compiles with sparse categorical
    cross-entropy on logits, the "sgd" optimizer and an accuracy metric.

    NOTE(review): ``encoded_inputs`` and ``inputs`` are not defined inside
    this function. Unless they exist as globals from a cell not shown here,
    calling it raises NameError. ``text_input`` is created but never used.
    Presumably it was meant to feed the encoder and be the model's input --
    TODO confirm.

    Returns:
        The compiled ``tf.keras.Model``.
    """

    # text input layer
    # NOTE(review): declared as int32 scalars, while the BERT preprocessing
    # cell elsewhere in this notebook uses a tf.string input -- confirm dtype.
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.int32)

    # RNN network
    # NOTE(review): `encoded_inputs` is not defined in this scope (see docstring)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True))(encoded_inputs)
    x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
    # 10 output units -- presumably a placeholder size; TODO confirm
    outputs = tf.keras.layers.Dense(10)(x)
    # NOTE(review): `inputs` is not defined in this scope (see docstring)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.summary()

    # compiler
    # from_logits=True matches the Dense layer above, which has no activation
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer="sgd",
        metrics=["accuracy"],
        )
    
    return model

In [None]:
#Pre-Processor

def pre-process_text():

    # load data
    train_df, test_df, valid_df = load_and_split_csv(select_column=False)
    
    # prep text
    clean_text(train_df, column='Content', dropna=True)
    # clean_text(test_df, column='Content', dropna=True)
    # clean_text(test_df, column='Content', dropna=True)

    X_train, y_train = split_x_y(train_df)
    # X_test, y_test = split_x_y(test_df)
    # X_valid, y_valid = split_x_y(valid_df)

    # word embedding / BERT encoder
    # text input layer
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

    # word embedding (BERT model)
    encoded_inputs = tf.data.Dataset.map(lambda x: BERT_encoding_layer()(x))