In [1]:
from numpy import array
from numpy import asarray
from numpy import zeros
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from pathlib import Path
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

%load_ext autoreload
%autoreload 2

In [2]:
#TODO: Download the dataset from https://www.kaggle.com/c/word2vec-nlp-tutorial/data
#TODO: Unzip the dataset and place it in the same folder as this notebook
#TODO: Change the path to the dataset below
# dataset_path = Path(os.getcwd()) / 'data' / 'labeledTrainData.tsv'

# Download and extract the dataset
The dataset is downloaded from the provided URL and extracted into the 'Dataset' folder

In [3]:
import urllib.request
import tarfile
import zipfile

url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

dataset_folder = './Dataset'
if not os.path.exists(dataset_folder):
    os.mkdir(dataset_folder)

dataset_path = os.path.join(dataset_folder, 'dependency_treebank.zip')

if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)
    print("Successful download")    
    with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
        print("Extracting dataset")
        zip_ref.extractall(dataset_folder+'/dependency_treebank')
    os.remove(dataset_path)



Successful download
Extracting dataset


# Split the data
The data (199 samples in total) is split into a train, validation and test set:
- 100 train samples
- 50 validation samples
- 49 test samples 
The sets are stored in data frames.

In [4]:
# Define split size
TRAIN_SPLIT = 100
VAL_SPLIT = 150

# Define file iterator
def file_iterator():
    data_dir = Path('dependency_treebank')
    for data_file in filter(lambda f: os.path.isfile(data_dir/f) and f.endswith('.dp'), os.listdir(data_dir)):
        yield data_dir/data_file

# Create train, val and test set
train_set = []
test_set = []
val_set = []
file_counter = 0

# Iterate over files and perform split
for file in file_iterator():
    file_counter += 1
    if file_counter <= TRAIN_SPLIT:
        train_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    elif file_counter <= VAL_SPLIT:
        val_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    else:
        test_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))

# Check for correct split
assert len(train_set) == 100
assert len(val_set) == 50
assert len(test_set) == 49

# Change to pandas dataframe
train_frame = pd.concat(train_set)
test_frame = pd.concat(test_set)
val_frame = pd.concat(val_set)

# Check for correct transformation
assert sum([e.shape[0] for e in train_set]) == train_frame.shape[0]
assert sum([e.shape[0] for e in test_set]) == test_frame.shape[0]
assert sum([e.shape[0] for e in val_set]) == val_frame.shape[0]

Check the structure of the constructed data frames

In [5]:
train_frame.head()

Unnamed: 0,token,pos
0,In,IN
1,reference,NN
2,to,TO
3,your,PRP$
4,Oct.,NNP


In [6]:
test_frame.head()

Unnamed: 0,token,pos
0,For,IN
1,six,CD
2,years,NNS
3,",",","
4,T.,NNP


In [7]:
val_frame.head()

Unnamed: 0,token,pos
0,Lord,NNP
1,Chilver,NNP
2,",",","
3,63-year-old,JJ
4,chairman,NN


# Tokenization and padding
The data frames are now tokenized and additional padded to have a unit input length 

In [8]:
# merge train_frame['token'].values and val_frame['token'].values
new_train_frame = pd.concat([train_frame, val_frame], ignore_index=True)

# create tokenizer and fit on training and val set
tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_train_frame['token'].values)

# Routine to tokenize and pad data to unit length
def tokenize_and_pad(data_frame, tokenizer):
    vocab_size = len(tokenizer.word_index) + 1
    print('Vocabulary size: %d' % vocab_size)
    encoded_doc = tokenizer.texts_to_sequences(data_frame['token'].values)
    max_length = max([len(s.split()) for s in data_frame['token'].values]) # Changed max length to max length of a sentence
    padded_docs = pad_sequences(encoded_doc, maxlen=max_length, padding='post')
    return padded_docs, vocab_size, tokenizer

# train set
train_padded_docs, train_vocab_size, train_tokenizer = tokenize_and_pad(train_frame, tokenizer)

# val set
val_padded_docs, val_vocab_size, val_tokenizer = tokenize_and_pad(val_frame, tokenizer)

# test set
test_padded_docs, test_vocab_size, test_tokenizer = tokenize_and_pad(test_frame, tokenizer)

Vocabulary size: 8424
Vocabulary size: 8424
Vocabulary size: 8424


# Create embedding matrix
First, the pretrained glove-embedding has been downloaded from https://nlp.stanford.edu/projects/glove/ and converted to a dictionary format (100-dimensional embeddings).

In [9]:
embeddings_index = dict()
with open('glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = asarray(values[1:], dtype='float32')

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


Define a function to create an embedding matrix from a given vocabulary, using the previously loaded glove-embeddings

In [10]:
def create_embedding_matrix(tokenizer, vocab_size):
    # load embedding into memory, skip first
    embedding_matrix = zeros((vocab_size, 100))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # print("Not in Vocab", word)
            pass
    return embedding_matrix

# Define and train the model
## Encode the POS-labels
We also need to convert the POS labels to a numerical representation (one-hot-encoding)

In [11]:
def encode_labels(data_frame):
    # combined_frame = pd.concat([data_frame, train_val_frame], ignore_index=True)
    encoder = LabelEncoder()
    encoder.fit(new_train_frame['pos'].values)
    encoded_Y = encoder.transform(data_frame['pos'].values)
    # convert integers to dummy variables (i.e. one hot encoded) -> labels
    labels = np_utils.to_categorical(encoded_Y)
    return labels, encoder


## Define the structure and layers of the used model
Here, a sequential model is used, that receives tokens using an Embedding layer (embedding with the loaded glove-embeddings).  
The recurrent structure is implemented using a LSTM layer with 128-units

In [12]:
def create_model(vocab_size, embedding_matrix, plot_model=False):
    # define model
    model = Sequential()
    
    # Input as Embeddings
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=1, trainable=False))
    # A Bidirectional recurrent layer (LSTM units)
    model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128), input_shape=(None, 50)))
    # Dense layer to fit output to label-vector-size
    model.add(Dense(45, activation='softmax'))

    #TODO: Put in train function
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()
    if plot_model:
        keras.utils.plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
    return model

# run the model
def train_model(model, padded_docs, labels):
    # fit the model
    model.fit(padded_docs, labels, epochs=20, verbose=1)

    # evaluate the model
    loss, accuracy = model.evaluate(padded_docs, labels, verbose=1)
    print('Accuracy: %f' % (accuracy * 100))

def inference(model, tokenizer, vocab_size, test_padded_docs, test_frame):
    # predict the model
    yhat = model.predict(test_padded_docs, verbose=1)
    # map predicted labels to words
    predicted_labels = []
    for i in yhat:
        for word, index in tokenizer.word_index.items():
            if index == i:
                predicted_labels.append(word)
                break
    # map actual labels to words
    actual_labels = []
    for i in test_frame['pos'].values:
        for word, index in tokenizer.word_index.items():
            if index == i:
                actual_labels.append(word)
                break
    # create confusion matrix
    confusion_matrix = pd.crosstab(pd.Series(actual_labels), pd.Series(predicted_labels), rownames=['Actual'], colnames=['Predicted'])
    print(confusion_matrix)


Train routine using a bidirectional LSTM layern with 128 units  
Evaluate the models training progress using the validation set

In [13]:
train_embedding_matrix = create_embedding_matrix(train_tokenizer, train_vocab_size)
train_labels, train_encoder = encode_labels(train_frame)
model = create_model(train_vocab_size, train_embedding_matrix)

# fit the model
model.fit(train_padded_docs, train_labels, epochs=20, verbose=1)

# evaluate the model
loss, accuracy = model.evaluate(train_padded_docs, train_labels, verbose=1)
print('Accuracy: %f' % (accuracy * 100))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 100)            842400    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              234496    
 l)                                                              
                                                                 
 dense (Dense)               (None, 45)                11565     
                                                                 
Total params: 1,088,461
Trainable params: 246,061
Non-trainable params: 842,400
_________________________________________________________________
Epoch 1/20


2022-12-09 14:40:51.709668: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 85.522443


In [14]:
val_labels, val_encoder = encode_labels(val_frame)
loss, accuracy = model.evaluate(val_padded_docs, val_labels, verbose=1)
print('Accuracy: %f' % (accuracy * 100))



[0.52779221534729, 0.8139940500259399]

TEST AREA

In [29]:
# import numpy as np
# # predict model test
# predictions = model.predict(test_padded_docs, verbose=1)
# # map predicted labels to words
# predicted_labels = []
# for i in predictions:
#     for word, index in test_tokenizer.word_index.items():
#         if index == np.argmax(i):
#             predicted_labels.append(word)
#             break
# # map actual labels to words
# actual_labels = []
# for i in test_frame['pos'].values:
#     for word, index in test_tokenizer.word_index.items():
#         if index == i:
#             actual_labels.append(word)
#             break
# # create confusion matrix
# confusion_matrix = pd.crosstab(pd.Series(actual_labels), pd.Series(predicted_labels), rownames=['Actual'], colnames=['Predicted'])

44

In [34]:
# import numpy as np
# np.argmax(val_labels[0])
# np.argmax(predictions[0])

35

In [None]:
model = Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(16, activation = 'relu', input_dim = 243))
# Adding the output layer
model.add(Dense(units = 1))
model.compile(optimizer = 'adam',loss = 'mean_squared_error')

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=40) # Model stop training after 40 epoch where validation loss didnt decrease
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) #You save model weight at the epoch where validation loss is minimal
train = model.fit((train_X, train_label, batch_size=batch_size),epochs=1000,verbose=1,validation_data=(valid_X, valid_label),callbacks=[es,mc])#you can run for 1000 epoch btw model will stop after 40 epoch without better validation loss