<a href="https://colab.research.google.com/github/mvenouziou/text_generator/blob/main/poetry_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# tensorflow modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# for loading pre-trained BERT embeddings
!pip install -q tensorflow-text  # need to install at each Google Colab session
import tensorflow_text as text  
import tensorflow_hub as hub

# general modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import string

[?25l[K     |                                | 10kB 19.3MB/s eta 0:00:01[K     |▏                               | 20kB 24.0MB/s eta 0:00:01[K     |▎                               | 30kB 10.6MB/s eta 0:00:01[K     |▍                               | 40kB 9.5MB/s eta 0:00:01[K     |▌                               | 51kB 4.6MB/s eta 0:00:01[K     |▋                               | 61kB 4.8MB/s eta 0:00:01[K     |▊                               | 71kB 5.3MB/s eta 0:00:01[K     |▉                               | 81kB 5.6MB/s eta 0:00:01[K     |▉                               | 92kB 5.5MB/s eta 0:00:01[K     |█                               | 102kB 5.9MB/s eta 0:00:01[K     |█                               | 112kB 5.9MB/s eta 0:00:01[K     |█▏                              | 122kB 5.9MB/s eta 0:00:01[K     |█▎                              | 133kB 5.9MB/s eta 0:00:01[K     |█▍                              | 143kB 5.9MB/s eta 0:00:01[K     |█▌                      

In [97]:
# Load CSV and Split into train/test/validation sets
def load_and_split_csv(select_column='Content'):
    """ Loads data into train/test/validation sets """

    data_URL = 'https://raw.githubusercontent.com/mvenouziou' \
                + '/text_generator/main/robert_frost_collection.csv'

    df = pd.read_csv(data_URL)

    if select_column:
        df = df[[select_column]]
    
    train_df, test_df = \
        train_test_split(df, test_size=0.3, random_state=42)

    valid_df, test_df = \
        train_test_split(test_df, test_size=0.5, random_state=10)


    return train_df, test_df, valid_df

In [99]:
# clean dataset
def clean_text(df, column='Content', dropna=True):
    """ initial text data prep """
    
    # drop missing values
    if dropna:
        df = df.dropna()

    # add 'EOL' (end of line), poetry breaks without a separating punctiation)
    df[column] = df[column].apply(lambda x: x.replace('\r\n', ' EOL '))
    df[column] = df[column].apply(lambda x: x.replace('\n', ' EOL '))

    # vocab to pad punctuation with spaces
    spaced_punctuation = \
        list(map(lambda x: ' '.join(['', x, '']), string.punctuation))

    # separate punctuation from words with extra spacing
    for punc, space_punc in zip(string.punctuation, spaced_punctuation):
        df[column] = df[column].apply(lambda x: x.replace(punc, space_punc))

    # remove any excess spaces created above
    for _ in range(3):
        df[column] = df[column].apply(lambda x: x.replace('  ', ' '))
        
    #return df  # changes reflected in (mutable) dataframe

In [10]:
def create_Xy_df(text):
    """ Breaks a single text input into model inputs X and model outputs y. 
    Left side of text becomes X, the right side is our intended prediction, y
    """

    # split sentence into words
    split_text = text.split()

    # group words into all possible left/right splits
    # stores result in dataframe 'Xy_df'
    Xy_df = pd.DataFrame(columns=['X', 'y'])
    for i in range(1, len(split_text)):
        x = split_text[: i]
        y = split_text[i: ]
    
        # recombine to form X and y halves
        X_section = ' '.join(x)
        y_section = ' '.join(y)

        Xy_df.loc[i] = [X_section, y_section]
        
    return Xy_df
    

def split_x_y(df):
    """ Breaks full dataset's text into model inputs X and model outputs y """
    
    # initialize empty dataframe
    full_df = pd.DataFrame(columns=['X', 'y'])

    # create X and y data for model
    # by applying 'create_Xy_df' to each row of input df
    for i in range(len(df)):
        Xy_df = create_Xy_df(df.iloc[i][0])

        full_df = pd.concat([full_df, Xy_df])

    # separate out X and y components
    X = full_df['X']
    y = full_df['y']

    return X, y

In [11]:
# apply BERT encoding
def BERT_encoding(text_input):
    """ Applies BERT word embeddings model """
    
    # preprocessor: formats input text for use in BERT encoder
    preprocessor = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2")
    encoder_inputs = preprocessor(text_input)

    # initialize encoder
    encoder = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
        trainable=False)
    
    # apply encoder
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"]  # creates [batch_size, 768].
    #  sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

    return pooled_output  #, sequence_output

In [12]:
def create_model():

    # text input layer
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.int32)

    # RNN netowrk
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True))(encoded_inputs)
    x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
    outputs = tf.keras.layers.Dense(10)(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.summary()

    # compiler
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer="sgd",
        metrics=["accuracy"],
        )
    
    return model

In [85]:
#Pre-Processor

def preprocess_stage1():

    # load data
    train_df, test_df, valid_df = load_and_split_csv(select_column='Content')
    
    # prep text
    clean_text(train_df, column='Content', dropna=True)
    clean_text(test_df, column='Content', dropna=True)
    clean_text(valid_df, column='Content', dropna=True)

    X_train, y_train = split_x_y(train_df)
    X_test, y_test = split_x_y(test_df)
    X_valid, y_valid = split_x_y(valid_df)

    return X_train, y_train, X_test, y_test, X_valid, y_valid


def preprocess_stage2(X_train, y_train, X_test, y_test, X_valid, y_valid):
    # word embedding (BERT encoding)
    encoded_train_X = BERT_encoding(X_train)
    encoded_train_y = BERT_encoding(y_train)
    
    encoded_test_X = BERT_encoding(X_test)
    encoded_test_y = BERT_encoding(y_test)
    
    encoded_valid_X = BERT_encoding(X_valid)
    encoded_valid_y = BERT_encoding(y_valid)

    return encoded_train_X, encoded_train_y, encoded_test_X, encoded_test_y,
            encoded_valid_X, encoded_valid_y

In [102]:
X_train, y_train, X_test, y_test, X_valid, y_valid = preprocess_stage1()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

ValueError: ignored

In [38]:
encoded_train_X, encoded_train_y, encoded_test_X, encoded_test_y, \
            encoded_valid_X, encoded_valid_y = preprocess_stage2(X_train, y_train, X_test, y_test, X_valid, y_valid)

1                  Late Walk
2                       Walk
1                  Butterfly
1                      Hours
1                and Passing
              ...           
2                         Us
1    Pauper Witch of Grafton
2           Witch of Grafton
3                 of Grafton
4                    Grafton
Name: y, Length: 151, dtype: object