In [1]:
# Attach Google Drive to the Colab filesystem; later cells read their
# CSV and GloVe inputs through this mount.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Reference (TensorFlow + GloVe + LSTM example): https://github.com/mwitiderrick/TensorFlow-GLOVE-LSTM

In [30]:
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tensorflow.keras import layers
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from sklearn.model_selection import train_test_split


In [4]:
# Read the train / validation / test splits from the mounted Drive folder.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/My Drive/train_data.csv',
        'drive/My Drive/val_data.csv',
        'drive/My Drive/test_data.csv',
    )
)

In [8]:
# Show the column labels of the training frame (last expression -> cell output).
train_df.columns

Index(['Unnamed: 0', 'author', 'title', 'poetry_foundation_id', 'raw_content',
       'clean_content', 'author_poem_count', 'author_poem_index',
       'author_poem_pct'],
      dtype='object')

In [7]:
# Model inputs: the 'clean_content' column of each split.
X_train, X_val, X_test = (
    split_df['clean_content'] for split_df in (train_df, val_df, test_df)
)

In [11]:
# Tokenisation / padding settings shared by the cells below.
vocab_size, max_length = 1000, 200   # num_words passed to the Tokenizer, padded sequence length
oov_token = "<UNK>"                  # passed to the Tokenizer as its oov_token
padding_type = trunction_type = "post"   # pad and truncate at the end of each sequence

# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_token,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r',
)
tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping produced by the fit.
word_index = tokenizer.word_index

In [13]:
def _encode_and_pad(texts):
    """Convert texts to integer sequences and pad/truncate them to max_length.

    Returns a (sequences, padded) pair: the raw output of
    tokenizer.texts_to_sequences and the result of pad_sequences on it.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunction_type,
    )
    return sequences, padded


# Apply the same fitted tokenizer and padding settings to every split.
X_train_sequences, X_train_padded = _encode_and_pad(X_train)
X_val_sequences, X_val_padded = _encode_and_pad(X_val)
X_test_sequences, X_test_padded = _encode_and_pad(X_test)

In [14]:
from sklearn import preprocessing

# Fit the author -> integer mapping on the training split only, then apply
# that same mapping to the training and validation labels.
le = preprocessing.LabelEncoder().fit(train_df['author'])
y_train, y_val = (
    le.transform(split_df['author']) for split_df in (train_df, val_df)
)

In [15]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: the first field is the word, the
# remaining fields are the vector components (stored as float32).
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open('drive/My Drive/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [16]:
# Assemble the weight matrix for the embedding layer: one row per tokenizer
# index, one column per GloVe vector component.
#
# The width is read from the loaded vectors rather than reusing max_length
# (the padded sequence length); the two only coincide for the 200-d GloVe file.
glove_dim = len(next(iter(embeddings_index.values())))

# len(word_index) + 1 rows: index 0 is reserved by the Keras Tokenizer and is
# never assigned to a word, so row 0 stays all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, glove_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [17]:
# Frozen embedding layer initialised from the pre-built GloVe matrix.
# Both dimensions are taken from the matrix itself so the layer always agrees
# with its weights; max_length is only the length of each input sequence and
# must not double as the embedding width.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [18]:
# Despite its name, embedding_dim is used here as the LSTM unit count.
embedding_dim = 200

# Frozen embedding -> bidirectional LSTM -> softmax over the encoded authors.
model = Sequential([
    embedding_layer,
    layers.Bidirectional(layers.LSTM(embedding_dim, dropout=0.2)),
    # Labels are integers starting at 0, so the output width is max label + 1.
    layers.Dense(max(y_train) + 1, activation='softmax'),
])

# Sparse loss: targets are integer class ids, not one-hot vectors.
model.compile(
    loss='SparseCategoricalCrossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 200)          3060200   
_________________________________________________________________
bidirectional (Bidirectional (None, 400)               641600    
_________________________________________________________________
dense (Dense)                (None, 12)                4812      
Total params: 3,706,612
Trainable params: 646,412
Non-trainable params: 3,060,200
_________________________________________________________________


In [22]:
# Upper bound on training epochs; early stopping normally ends training sooner.
num_epochs = 100
from keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 5 consecutive epochs.
# NOTE(review): restore_best_weights is left at its default, so the model keeps
# the weights from the last epoch run rather than the best one — confirm that
# this is intended.
callback = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# `callbacks` is documented as a list of Callback instances, so wrap the
# single callback instead of passing it bare.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(X_val_padded, y_val),
    callbacks=[callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
