In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


https://github.com/mwitiderrick/TensorFlow-GLOVE-LSTM

In [2]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, LSTM, Bidirectional,SpatialDropout1D


In [3]:
train_df = pd.read_csv('drive/My Drive/train_data.csv')
val_df = pd.read_csv('drive/My Drive/val_data.csv')
test_df = pd.read_csv('drive/My Drive/test_data.csv')

In [None]:
train_df.columns

Index(['Unnamed: 0', 'author', 'title', 'poetry_foundation_id', 'raw_content',
       'clean_content', 'author_poem_count', 'author_poem_index',
       'author_poem_pct'],
      dtype='object')

In [4]:
X_train = train_df['clean_content']
X_val = val_df['clean_content']
X_test = test_df['clean_content']

In [5]:
#Get cleaned input sequence
t=[]
for i in X_train:
  t.append(i.strip('!"#$%&()*+,-.–—/:;<=>?@[\\]^_`{|}~\t\r\n'))       
v=[]
for i in X_val:
  v.append(i.strip('!"#$%&()*+,-.–—/:;<=>?@[\\]^_`{|}~\t\r\n'))
e=[]
for i in X_test:
  e.append(i.strip('!"#$%&()*+,-.–—/:;<=>?@[\\]^_`{|}~\t\r\n'))

x_train=[]
for i in range(len(t)):
  x_train.append(' '.join(t[i].split('\r\n')))
x_val=[]
for i in range(len(v)):
  x_val.append(' '.join(v[i].split('\r\n')))
x_test=[]
for i in range(len(e)):
  x_test.append(' '.join(e[i].split('\r\n')))
print(len(x_train),len(x_val),len(x_test))

x_train = [i.split(' ') for i in x_train]
x_test = [i.split(' ') for i in x_test]
xv=[]
for i in x_val:
  xv.append(i.split(' '))
x_val=xv

457 127 59


In [6]:
#check input
print(x_train[4])
print(x_val[5])
print(x_test[2])

['sometim', 'someth', 'like', 'second', 'wash', 'base', 'street', 'th', 'father', 'two', 'assistants', 'ar', 'given', 'permiss', 'go', 'on', 'woman', 'ask', '“why', 'did', 'come', 'first', 'place', 'to', 'citadel', 'dampness”', '', 'som', 'day', 'wors', 'others', 'even', "can't", 'believ', 'them', 'but', 'never', 'concern', 'mine', 'reason', 'patient', '', 's', 'scroll', 'never', 'blast', 'us', 'into', 'marmor', 'mean', 'fist', 'it', 'kudo', 'princ', 'journey', 'here', 'to', 'negoti', 'releas', 'believ', 'it', '', "you'r", 'right', 'ballad', 'retreating', 'back', 'atmosphere', 'they', "won't", 'come', 'round', 'again', 'mak', 'peac']
['mani', 'time', 'low', 'foot', 'stagger', 'solder', 'mouth', 'tell', 'tri', 'stir', 'aw', 'rivet', 'tri', 'lift', 'hasp', 'steel', '', 'strok', 'cool', 'forehead', 'hot', 'often', 'lift', 'care', 'listless', 'hair', 'handl', 'adamantin', 'fingers', 'nev', 'thimbl', 'shall', 'wear', 'buzz', 'dull', 'fli', 'chamber', 'window', 'brave', 'shine', 'sun', 'frec

In [7]:
#Check distribution of sequence length
l = [len(i) for i in x_train]
l=pd.DataFrame(l)
l.describe()


Unnamed: 0,0
count,457.0
mean,219.286652
std,634.764432
min,6.0
25%,60.0
50%,80.0
75%,155.0
max,9313.0


In [8]:
#(Optional) Splitting input sequence
author_train=[]
poem_train=[]
for k in range(len(x_train)):
  x = x_train[k]
  if len(x) > 150:  #Can choose different length
    chunks, chunk_size = len(x), 150
    a=[x[i:i+int(chunk_size)] for i in range(0, int(chunks), int(chunk_size)) ]
    for j in a:
      poem_train.append(j)
      author_train.append(train_df['author'][k])
  else:
    poem_train.append(x)
    author_train.append(train_df['author'][k])

author_val=[]
poem_val=[]
for k in range(len(x_val)):
  x = x_val[k]
  if len(x) > 150:
    chunks, chunk_size = len(x), 150
    a=[x[i:i+int(chunk_size)] for i in range(0, int(chunks), int(chunk_size)) ]
    for j in a:
      poem_val.append(j)
      author_val.append(val_df['author'][k])
  else:
    poem_val.append(x)
    author_val.append(val_df['author'][k])

author_test=[]
poem_test=[]
for k in range(len(x_test)):
  x = x_test[k]
  if len(x) > 150:
    chunks, chunk_size = len(x), 150
    a=[x[i:i+int(chunk_size)] for i in range(0, int(chunks), int(chunk_size)) ]
    for j in a:
      poem_test.append(j)
      author_test.append(test_df['author'][k])
  else:
    poem_test.append(x)
    author_test.append(test_df['author'][k])

print(len(author_train)==len(poem_train))
print(len(author_val)==len(poem_val))    
print(len(author_test)==len(poem_test))

True
True
True


In [9]:
#Check distribution
from collections import Counter
Counter(author_train)

Counter({'Alfred, Lord Tennyson': 151,
         'Emily Dickinson': 40,
         'John Ashbery': 57,
         'John Donne': 56,
         'Kay Ryan': 29,
         'Percy sshe Shelley': 110,
         'Rae Armantrout': 46,
         'Walt Whitman': 128,
         'William Butler Yeats': 46,
         'William Shakespeare': 109,
         'William Wordsworth': 106,
         'Yusef Komunyakaa': 37})

In [10]:
# Tokenize the dataset
MAX_NB_WORDS = 50000
oov_token = "<UNK>"
padding_type = "post"
trunction_type='post'
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token=oov_token,filters='!"#$%&()*+,-–./:;<=>?@[\\]^_`{|}~\t\n\r')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 15297 unique tokens.


In [11]:
X_train_sequences = tokenizer.texts_to_sequences(poem_train)
X_train_padded = pad_sequences(X_train_sequences, padding=padding_type, 
                       truncating=trunction_type)
X_val_sequences = tokenizer.texts_to_sequences(poem_val)
X_val_padded = pad_sequences(X_val_sequences, padding=padding_type,
                       truncating=trunction_type)

X_test_sequences = tokenizer.texts_to_sequences(poem_test)
X_test_padded = pad_sequences(X_test_sequences, padding=padding_type, 
                       truncating=trunction_type)

In [12]:
X_train_padded.shape

(915, 150)

In [15]:
#Encode label to numbers  (Use 'SparseCategoricalCrossentropy' if using numbers)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(author_train)
y_train=le.transform(author_train)
y_val=le.transform(author_val)
y_test=le.transform(author_test)

In [16]:
#(Optional)One hot encoding   (Use 'categorical_crossentropy' if using one hot)
from keras.utils import to_categorical
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)
