## Introduction & Table of Contents

This notebook builds a poem generator. 21,500 poems were scraped or collected from the internet, mainly from poemhunter.com and a Kaggle dataset of poems. The poems were cleaned and tokenized at the character level, then further pre-processed into the format required as input to an LSTM model (a type of recurrent neural network). The trained model was then used to create a poetry generator: given a seed of at least 100 characters, it generates a 500-character continuation of the poem (the length can be changed).

2. <a href='#scraping'>Web Scraping</a>

    2.1. <a href='#poemhunter'>Web Scraping Poemhunter.com</a>
    
    2.2. <a href='#180poems'>Web Scraping poetry 180</a>
    
    
3. <a href='#clean'>Cleaning the Data</a>


4. <a href='#100'>Create 100 sequence chunks</a>


5. <a href='#1024'>Create batches of 1024 100 sequence chunks</a>
    

6. <a href='#lstm'>LSTM Model</a>


7. <a href='#generator'>Poetry Generator</a>


## Web Scraping 
<a id='scraping'></a>

### Web scraping poemhunter.com
<a id='poemhunter'></a>

In [None]:
# import beautiful soup and other modules
from bs4 import BeautifulSoup
import re
import time
import os
import pandas as pd
import numpy as np
import pickle
import requests

In [None]:
# Landing page that lists every poem topic on poemhunter.com.
url = 'https://www.poemhunter.com/poem-topics/'

# Fetch the page once as a connectivity check before scraping.
response = requests.get(url)

# Display the response object; check that its status is 200 before continuing.
response

In [None]:
#function to get url for each topic.
def get_urls1(website):
    """Return the href of every anchor on `website` whose href matches "poems/".

    Parameters
    ----------
    website : str
        URL of the page to download and scan.

    Returns
    -------
    list
        The matching href values, in the order BeautifulSoup returns them.
    """
    html = requests.get(website).text
    parsed = BeautifulSoup(html, 'html.parser')
    topic_anchors = parsed.find_all('a', attrs={'href': re.compile("poems/")})
    return [anchor.get('href') for anchor in topic_anchors]

#url_list contains all urls to the different poem topics.
url_list = get_urls1('https://www.poemhunter.com/poem-topics/')

In [None]:
# get url in each topic (page 1 of 2 only)
def get_urls2(element):
    """Collect the links found on the first page of each topic.

    Parameters
    ----------
    element : str or iterable of str
        Topic path(s) to visit. Each path is appended to
        'http://poemhunter.com' to build the page URL. A single string is
        treated as one topic path.

    Returns
    -------
    list
        The href of every anchor whose href contains the topic path, for all
        topics, in visiting order.
    """
    # Accept one path as well as a list of paths, so the argument is actually
    # honoured (the previous version ignored it and read the global url_list).
    topics = [element] if isinstance(element, str) else element

    list_of_urls = []
    for topic in topics:
        page = requests.get('http://poemhunter.com' + f'{topic}')
        soup = BeautifulSoup(page.text, 'html.parser')

        # re.escape keeps characters such as '.' or '+' in the path literal.
        topic_pattern = re.compile(re.escape(f"{topic}"))
        for link in soup.find_all('a', attrs={'href': topic_pattern}):
            list_of_urls.append(link.get('href'))
    return list_of_urls

# url_list[0] and url_list[-12:] are not relevant urls, so they are sliced off.
url_list2 = get_urls2(url_list[1:-12])

In [None]:
#keep only the hrefs in url_list2 that do not contain "https:" - these are the relevant urls for page 1.
regex = re.compile("https:")
filtered = list(filter(lambda href: regex.search(href) is None, url_list2))

In [None]:
#save urls from page 1 of 2 into a pkl file
# The with-block closes the file even if pickling fails part-way.
with open("url_page1.pkl", "wb") as pickling_on:
    pickle.dump(filtered, pickling_on)

In [None]:
#get url in each topic (page 2)
def get_urls3(element):
    """Collect the links found on the second page of each topic.

    Parameters
    ----------
    element : str or iterable of str
        Topic path(s) to visit. 'page-2' is appended to each path. A single
        string is treated as one topic path.

    Returns
    -------
    list
        The href of every anchor whose href contains "<topic path>page-2",
        for all topics, in visiting order.
    """
    # Honour the argument (the previous version ignored it and read the
    # global url_list instead).
    topics = [element] if isinstance(element, str) else element

    list_of_urls = []
    for topic in topics:
        # NOTE(review): if the topic paths start with '/', as the URL built in
        # get_urls2 suggests, this produces a double slash - confirm the site
        # accepts it.
        page = requests.get('http://poemhunter.com/' + f'{topic}' + 'page-2')
        soup = BeautifulSoup(page.text, 'html.parser')

        # re.escape keeps regex metacharacters in the path literal.
        page2_pattern = re.compile(re.escape(f"{topic}" + "page-2"))
        for link in soup.find_all('a', attrs={'href': page2_pattern}):
            list_of_urls.append(link.get('href'))
    return list_of_urls

url_list3 = get_urls3(url_list[1:-12])

# Drop every href that contains "https:"; the rest are the page-2 urls.
regex = re.compile("https:")
filtered3 = [i for i in url_list3 if not regex.search(i)]

# The with-block closes the file even if pickling fails part-way.
with open("url_page2.pkl", "wb") as pickling_on:
    pickle.dump(filtered3, pickling_on)


In [None]:
#function to get poems!
def get_poems(url, paths=None):
    """Download each poem page and return the text of its second <p> element.

    Parameters
    ----------
    url : str
        Base URL that every path is appended to.
    paths : iterable of str, optional
        Poem paths to fetch. Defaults to the notebook-level `filtered` list,
        which is what this function always used before.

    Returns
    -------
    list of str
        One entry per page that had a second <p> element, in visiting order.
    """
    if paths is None:
        paths = filtered

    list_of_poems = []
    for element in paths:
        page = requests.get(f'{url}' + f'{element}')
        soup = BeautifulSoup(page.text, 'html.parser')

        paragraphs = soup.find_all('p')
        # A page with fewer than two <p> tags used to raise IndexError and
        # abort the whole scrape; report it and move on instead.
        if len(paragraphs) < 2:
            print(f'{element} has no poem paragraph and was skipped.')
            continue

        list_of_poems.append(paragraphs[1].get_text())
        print(f'{element} has been added!')
    return list_of_poems

poems = get_poems('http://poemhunter.com/')


### Web Scraping poetry 180
<a id='180poems'></a>

Code similar to the above, though much simpler, was used to scrape Poetry 180.

## Cleaning the Data
<a id='clean'></a>

In [2]:
# importing modules
import pickle
import pandas as pd
import numpy as np
import re

In [None]:
# function to aggregate my poems scraped from poemhunter.com and another website
def master_poem(list_of_pkl):
    """Load every pickle file in `list_of_pkl` and concatenate their contents.

    Parameters
    ----------
    list_of_pkl : iterable of str
        Paths of pickle files, each holding an iterable of items.

    Returns
    -------
    list
        All items from all files, in file order.
    """
    empty_list = []
    for element in list_of_pkl:
        # pickle.load can execute arbitrary code - only load trusted files.
        # The with-block closes each file as soon as it has been read.
        with open(element, "rb") as pickle_in:
            empty_list.extend(pickle.load(pickle_in))
    return empty_list

In [None]:
# aggregating my poems: poems1.pkl ... poems32.pkl followed by poetry_180_1.pkl ... poetry_180_4.pkl.
poemhunter_files = [f'poems{number}.pkl' for number in range(1, 33)]
poetry_180_files = [f'poetry_180_{number}.pkl' for number in range(1, 5)]
master = master_poem(poemhunter_files + poetry_180_files)

#getting poems from kaggle dataset.
df = pd.read_csv('kaggle_poem_dataset.csv')

#creating list of kaggle poems from the 'Content' column.
contentlist = list(df['Content'])

In [None]:
# creating dataframe of poems from poemhunter.com
# `flatten_master` was referenced here without ever being defined, which is a
# NameError on a fresh kernel. Build it from `master`, unpacking one level of
# nesting so that every row holds a single poem whether the pickles stored
# plain poems or lists of poems.
# NOTE(review): confirm this matches how the original flatten_master was built.
flatten_master = []
for item in master:
    if isinstance(item, (list, tuple)):
        flatten_master.extend(item)
    else:
        flatten_master.append(item)
master_df = pd.DataFrame(flatten_master, columns = ['poem'])
#creating dataframe of kaggle poems
kaggle = pd.DataFrame(contentlist, columns = ['poem'])

In [None]:
# creating master dataframe of all poems
# reset_index() is called without drop=True, so the old row labels are kept in
# a new 'index' column.
master_df = pd.concat([master_df, kaggle], axis=0).reset_index()

In [None]:
#function to clean poems by removing non-english characters
def clean_poems(poem):
    """Return `poem` with every character outside the allowed set removed.

    The allowed set is: space, newline, the letters A-Z and a-z, and the
    punctuation characters listed in the pattern below. Everything else is
    deleted - this includes digits, tabs and accented letters.
    """
    disallowed_chars = '[^ \nA-Za-z!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
    cleaned = re.sub(disallowed_chars, '', poem)
    return cleaned

In [None]:
#create new column of clean poems.
# apply() runs clean_poems on every entry of the 'poem' column.
master_df['poem_clean'] = master_df['poem'].apply(clean_poems)

In [None]:
#drop duplicates and drop index column.
# Calling drop_duplicates(inplace=True) on the selected column never removed
# any row from master_df itself, so deduplicate the frame on that column.
# NOTE: this can reduce the number of poems; cells that index poems by
# position should rely on len() rather than a fixed count.
master_df = master_df.drop_duplicates(subset='poem_clean')
# errors='ignore' keeps this cell re-runnable once the column is already gone.
master_df = master_df.drop('index', axis=1, errors='ignore').reset_index(drop=True)

In [None]:
# Summary statistics for the cleaned poem column.
(master_df['poem_clean']).describe()

In [None]:
# create pickle file of master df of clean poems.
# Later cells reopen this same file name ("new_master2.pkl").
with open('new_master2.pkl', 'wb') as handle:
    pickle.dump(master_df, handle)

## Create 100 sequence chunks
<a id='100'></a>

The following script was edited and re-run for each group of 500 poems, creating 43 sets of dataX and dataY files.

In [None]:
# importing modules
import sys
import keras
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

import numpy as np
import pickle

#load pickle file of cleaned data.
with open("new_master2.pkl", "rb") as pickle_in:
    df = pickle.load(pickle_in)

#create list of clean poems.
documents = list(df['poem_clean'])

#tokenize the poems on the character level
tk = Tokenizer(num_words= None, char_level=True, oov_token = 'UNK')
#create dictionary with characters keys and their corresponding numerical values as values.
tk.fit_on_texts(documents)


#set sequence length to 100.
seq_length = 100
#the window moves forward 3 characters at a time.
step = 3

#which group of 500 poems this run processes (1-based). It is also the suffix
#of the output files, so only this one number changes between runs.
batch_number = 43
poems_per_batch = 500
first_poem = (batch_number - 1) * poems_per_batch
#never read past the end of the poem list.
last_poem = min(first_poem + poems_per_batch, len(documents))

#create feature set- dataX, and target set - dataY
dataX = []
dataY = []

for k in range(first_poem, last_poem):
    #encode the poem once, not once per window.
    #presumably, because a single string is passed, each character is treated
    #as its own text and comes back as its own short list - the stored format
    #is left exactly as before.
    sequences = tk.texts_to_sequences(documents[k])

    #break up poems into 100 character chunks with a moving window of 3 characters.
    for i in range(0, len(documents[k]) - seq_length, step):
        #append the 100 characters into dataX- features
        dataX.append(sequences[i:i + seq_length])
        #append the 101st, or the next character into dataY-target
        dataY.append(sequences[i + seq_length])

#save the sequences and targets to pickle files.
with open(f"dataX_{batch_number}.pkl", "wb") as pickle_out:
    pickle.dump(dataX, pickle_out)

with open(f"dataY_{batch_number}.pkl", "wb") as pickle_out:
    pickle.dump(dataY, pickle_out)


## Create batches of 1,024 100 sequence chunks

<a id='1024'></a>

In [None]:
import os
import pickle

#create a list of files that will be read in a loop.
#These are the files created in the 100 sequence chunks portion.
pkl_files = [f'dataX_{n}.pkl' for n in range(1, 44)]

#number of sequences stored in every batch file.
batch_size = 1024

#make sure the output folder exists before writing into it.
os.makedirs("batch_files3", exist_ok=True)

#Since some file lengths are not divisible by 1024, remaining sequences will be stored in rem_slice
rem_slice = []
count = 0

#NOTE(review): the training cell loads batch_files3/X_{n}.pkl starting at n=1,
#while these files are named dickinson_and_5000_X_{count}.pkl starting at 0.
#Confirm which naming is intended.

#loop through pkl_files and load them.
for element in pkl_files:
    with open(element, "rb") as pickle_in:
        dataX = pickle.load(pickle_in)

    #sequences left over from the previous file go in front of this file's
    #sequences, so the batch that straddles two files is written too.
    dataX = rem_slice + dataX

    #number of sequences that fit into complete batches.
    full_length = len(dataX) - len(dataX) % batch_size

    #loop over 1024 sequences.
    for y in range(0, full_length, batch_size):
        #write each 1024 sequence batch to its own file.
        with open(f"batch_files3/dickinson_and_5000_X_{count}.pkl", "wb") as pickle_out:
            pickle.dump(dataX[y:y + batch_size], pickle_out)
        #counter keeps track of how many files are created and the naming of the files
        count += 1

    #the rem_slice collects remaining sequences (fewer than 1024). Whatever is
    #left after the last file is not written.
    rem_slice = dataX[full_length:]
                             

In [None]:
#same procedure, but for the targets.

#the targets live in the dataY files written by the 100 sequence chunks portion.
target_files = [f'dataY_{n}.pkl' for n in range(1, 44)]

#number of targets stored in every batch file.
batch_size = 1024

rem_slice = []
count = 0

#loop through the target files and load them.
for element in target_files:
    with open(element, "rb") as pickle_in:
        dataY = pickle.load(pickle_in)

    #targets left over from the previous file go in front of this file's
    #targets, so the batch that straddles two files is written too.
    dataY = rem_slice + dataY

    #number of targets that fit into complete batches.
    full_length = len(dataY) - len(dataY) % batch_size

    #loop over 1024 targets.
    for y in range(0, full_length, batch_size):
        #write to a _Y_ file: writing to the _X_ name would overwrite the
        #feature batches.
        with open(f"batch_files3/dickinson_and_5000_Y_{count}.pkl", "wb") as pickle_out:
            pickle.dump(dataY[y:y + batch_size], pickle_out)
        #counter keeps track of how many files are created and the naming of the files
        count += 1

    #the rem_slice collects remaining targets (fewer than 1024). Whatever is
    #left after the last file is not written.
    rem_slice = dataY[full_length:]

## LSTM Model
<a id='lstm'></a>

In [None]:
# keras imports
import sys
import keras
from keras.utils import np_utils
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Input, Embedding, Bidirectional, LSTM
#CuDNNLSTM can be used if using GPU to run code.
#from keras.layers import CuDNNLSTM

import numpy as np
import pickle

# function to load pkl files of 1024 100 character sequence chunks created previously
def get_input(pklfile):
    """Load and return the object pickled in batch_files3/<pklfile>.pkl.

    Parameters
    ----------
    pklfile : str
        File name without folder or extension, e.g. 'X_2'.
    """
    # The with-block closes the file after reading; this function is called
    # thousands of times in the training loop.
    with open(f"batch_files3/{pklfile}.pkl", "rb") as pickle_in:
        return pickle.load(pickle_in)

#function to one hot encode sequences.
def preprocess_input(X_or_Y, num_classes=80):
    """One-hot encode integer-coded data with np_utils.to_categorical.

    Parameters
    ----------
    X_or_Y : array-like of int
        Integer class indices (features or targets).
    num_classes : int, optional
        Width of the one-hot axis. Defaults to 80, the value this notebook
        uses everywhere.

    Returns
    -------
    The array returned by np_utils.to_categorical.
    """
    return np_utils.to_categorical(X_or_Y, num_classes=num_classes)

#need to get the shape of X and y, features and targets.
X= preprocess_input(get_input('X_2'))
y= preprocess_input(get_input('Y_2'))

#instantiate model
model = Sequential()
#256 memory units
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2])))
#add dropout for regularization
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam')

#fraction of every batch file held out for validation. Both callbacks below
#monitor 'val_loss', which only exists when fit() is given validation data.
validation_split = 0.1

#counter keeps track of how many batches were processed.
counter = 0
for n in range(1, 7507):
    X = preprocess_input(get_input(f'X_{n}'))
    y = preprocess_input(get_input(f'Y_{n}'))

    # early stopping prevents overfitting
    es = EarlyStopping(monitor='val_loss', mode = 'min', verbose=1)
    callbacks = [es, ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

    model.fit(X, y, epochs=20, batch_size=1024,
              validation_split=validation_split, callbacks=callbacks)
    counter += 1
    #save model for every 750 batches it processes
    if counter % 750 == 0:
        model.save(f"model_weights_{counter}")


## Poetry Generator
<a id ='generator'></a>

In [None]:
# keras imports
import sys
import keras
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Input, Embedding, Bidirectional, LSTM
#import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'

import numpy as np
import pickle

In [None]:
# load master dataframe of poems.
# pickle.load can execute arbitrary code - only load trusted files.
# The with-block closes the file once the dataframe has been read.
with open("new_master2.pkl", "rb") as pickle_in:
    df = pickle.load(pickle_in)

In [None]:
#Getting a list of the clean poems.
list1 = []
documents = [poem for poem in df['poem_clean']]

#character-level tokenizer; fitting it builds the dictionary that maps each character to a numerical value
tk = Tokenizer(char_level=True, oov_token='UNK', num_words=None)
tk.fit_on_texts(documents)

#see the dictionary
tk.word_index


In [None]:
#functions to get the shape of targets and features.
def get_input(pklfile):
    """Load and return the object pickled at the path `pklfile`.

    Unlike the loader in the training section, this one takes the full path,
    including folder and extension.
    """
    # The with-block closes the file once it has been read.
    with open(pklfile, "rb") as pickle_in:
        return pickle.load(pickle_in)

def preprocess_input(X_or_Y, num_classes=80):
    """One-hot encode integer-coded data with np_utils.to_categorical.

    Parameters
    ----------
    X_or_Y : array-like of int
        Integer class indices (features or targets).
    num_classes : int, optional
        Width of the one-hot axis. Defaults to 80, the value this notebook
        uses everywhere.

    Returns
    -------
    The array returned by np_utils.to_categorical.
    """
    return np_utils.to_categorical(X_or_Y, num_classes=num_classes)

# One batch of features and targets is loaded only to read the input and output shapes.
X= preprocess_input(get_input('batch_files3/X_1.pkl'))
y= preprocess_input(get_input('batch_files3/Y_1.pkl'))

#instantiate model
model = Sequential()
#use the same hyperparameters from our LSTM model.
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
#load the weights from saved model. we are using the model trained with 6000 1024 100 character chunks.
model.load_weights("model_weights_6000")
#compile once, after the weights are in place (the model used to be compiled
#twice with identical settings).
model.compile(loss = 'categorical_crossentropy', optimizer='adam')



In [None]:
#reverse lookup of tk.word_index: numerical value -> character.
inverted_dict = {number: char for char, number in tk.word_index.items()}

In [None]:
#function to set the temperature of poetry generation.
def sample(preds, temperature=1.0):
    """Draw one class index from `preds` after rescaling it by `temperature`.

    Parameters
    ----------
    preds : array-like of float
        Non-negative scores, one per class (e.g. a softmax output).
    temperature : float, optional
        Values below 1 sharpen the distribution and values above 1 flatten
        it. A temperature of exactly 0 returns the highest-scoring class
        without sampling.

    Returns
    -------
    int
        Index of the chosen class.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by zero below would produce NaNs; the limit of temperature -> 0
    # is a greedy pick.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) is -inf, which is the right value here (that class gets
    # probability 0), so silence the divide warning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest value is 0. The result is mathematically the same,
    # but np.exp can no longer underflow every entry to 0 at low temperatures,
    # which used to produce 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
#create the seed for the poetry generator. to_transform can be edited.

to_transform = "anyones hand. \
i found the newborn sparrow next to \
the tumbled nest on the grass. bravely\
  \
opening it"

#the generation cell reshapes the seed to exactly 100 time steps.
seed_length = 100
a = list(to_transform.lower()[0:seed_length])

#fail here with a clear message instead of later at the reshape.
if len(a) < seed_length:
    raise ValueError(
        f"The seed must contain at least {seed_length} characters, but it has {len(a)}."
    )

#characters the tokenizer has never seen are mapped to its out-of-vocabulary
#entry instead of raising a KeyError.
unknown_index = tk.word_index[tk.oov_token]
pattern = [tk.word_index.get(element, unknown_index) for element in a]

len(pattern)

In [None]:
#generate characters for different temperature ranges.
for temperature in [.1, .3, .5, .7]:
    print('----- temperature:', temperature)

    #start every temperature from the same seed. Working on a copy also keeps
    #`pattern` unchanged, so this cell can be re-run.
    window = list(pattern)

    for i in range(500):
        #one-hot encode the current 100 characters and reshape to fit into model.
        x = np_utils.to_categorical(window, num_classes=80)
        x = np.reshape(x, (1, 100, 80))

        prediction = model.predict(x, verbose=0)[0]

        index = sample(prediction, temperature)
        #index = np.argmax(prediction)

        #a sampled index that has no character in the dictionary prints nothing
        #instead of raising a KeyError.
        sys.stdout.write(inverted_dict.get(index, ''))

        #slide the window: add the new character and drop the oldest one.
        window.append(index)
        window = window[1:]

    #end the generated text so the next header starts on its own line.
    sys.stdout.write('\n')