## Exercise - DL Tutorial 09

### Student names: Anastasia Karsten, Pavlo Mospan

Submit you solution by 14 June to manuel.milling@informatik.uni-augsburg.de AND maurice.gerczuk@informatik.uni-augsburg.de

In [1]:
import numpy as np
import re
import pandas
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical


w2v_embedding_file = "data/embeddings/word2vec-40k-wiki-news-300d.vec"
ewe_embedding_file = "data/embeddings/ewe-40k-300d.vec"

train_tsv = "data/isear/train.tsv"
val_tsv = "data/isear/val.tsv"
test_tsv = "data/isear/test.tsv"

oov_id=1
pad_id=0
seq_length=128

lr=0.005

2. Load word2vec embedding matrix and create word-index-dictionary.

In [2]:
import pandas

def read_embedding_matrix(embedding_file):

    # Read data as a float to save the embedings
    data = np.genfromtxt(embedding_file, delimiter=" ")
    data=np.delete(data,0,1)

    zeros = np.zeros(data.shape[1])
    means = np.mean(data, axis = 0)

    matrix = np.insert(data, 0, zeros, axis=0)
    matrix = np.insert(matrix, 1, means, axis=0) 

    # Read data as string to save in dict
    data = np.genfromtxt(embedding_file, dtype=str, delimiter=" ")
    dictionary = {}
    dictionary["0"] = 0
    dictionary["NaN"] = 1
    for i in range(data.shape[0]):
        dictionary[str(data[i][0])] = (i + 2)  

    return matrix, dictionary
    

w2v_emb_matrix, w2v_word2idx = read_embedding_matrix(w2v_embedding_file)
print(f"w2v_emb_matrix:\t\t{w2v_emb_matrix.shape}")
print(f"w2v_word2idx shape:\t{len(w2v_word2idx)}")

w2v_emb_matrix:		(40001, 300)
w2v_word2idx shape:	40000


3. Prepare data:
- Load the sentences from the tsv files.
- Unify sentences (lower case, remove punctuation, etc.).
- Split sentences into words.
- Cut and zero pad sentences.
- Map words to indices.
- Map string labels to integers.

In [3]:


def read_tsv(tsv, word2idx, oov_id=1, pad_id=0, seq_length=128):
    data = np.genfromtxt(tsv, dtype=str, case_sensitive="lower", delimiter=("\t"), deletechars=".")

    # Get the last word, assign it an int and one-hot encode everything
    y = [i[1] for i in data]
    y = np.asarray(y)
    y = pandas.factorize(y)[0]
    y = to_categorical(y)

    # Get the sentences
    x = [i[0] for i in data]
    
    # Split sentences into words and get rid of punctuation
    hold = []
    hold.append([[re.split(' |-|\\.|\\!|\\?|\\(|\\) ', item) for item in x]])

    # Unpack the senteces
    hold = hold[0]
    hold = hold[0]

    for sentence in hold:
        # Get rid of empty words from the bracket removal
        for word in sentence:
            if word == '' or word == ' ':
                sentence.remove(word) 

        # Lowercase everything
        for k in range(len(sentence)):
            sentence[k] = sentence[k].lower()

            # Look up the word in dict
            if sentence[k] in word2idx:
                sentence[k] = word2idx[sentence[k]]
            else:
                sentence[k] = oov_id
        # Make als sentences seq_lenght long
        if len(sentence) < seq_length:
            for f in range(seq_length - len(sentence)):
                sentence.append(pad_id)
        elif len(sentence) > seq_length:
            for f in range(seq_length, len(sentence)):
                sentence.pop()

    x = np.array(hold)

    return x, y


train_X, train_y = read_tsv(train_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
val_X, val_y = read_tsv(val_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
test_X, test_y = read_tsv(test_tsv, w2v_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)

print(f"x_train shape:\t\t{train_X.shape}")
print(f"x_validation shape:\t{val_X.shape}")
print(f"x_test shape:\t\t{test_X.shape}")
print(f"y_train shape:\t\t{train_y.shape}")
print(f"y_validation shape:\t{val_y.shape}")
print(f"y_test shape:\t\t{test_y.shape}")

x_train shape:		(5976, 128)
x_validation shape:	(752, 128)
x_test shape:		(736, 128)
y_train shape:		(5976, 7)
y_validation shape:	(752, 7)
y_test shape:		(736, 7)


4. Initialise, train  and evaluate model.

In [4]:
model = keras.Sequential()
model.add(Embedding(40001, 300, weights=[w2v_emb_matrix], input_length=seq_length, trainable=False))
model.add(Bidirectional(LSTM(64, dropout=0.5, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.5)))
model.add(Dense(7, activation="softmax"))

model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss="categorical_crossentropy", metrics="accuracy")
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 300)          12000300  
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 128)          186880    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 7)                 903       
Total params: 12,286,899
Trainable params: 286,599
Non-trainable params: 12,000,300
_________________________________________________________________


In [5]:
model.fit(train_X, train_y, validation_data=(val_X, val_y), batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f0fded7fa60>

In [6]:
model.evaluate(test_X, test_y)



[4.243688583374023, 0.05298912897706032]

5. EWE embeddings.

In [7]:
ewe_emb_matrix, ewe_word2idx = read_embedding_matrix(ewe_embedding_file)
print(f"ewe_emb_matrix:\t\t{ewe_emb_matrix.shape}")
print(f"ewe_word2idx shape:\t{len(ewe_word2idx)}")

train_X, train_y = read_tsv(train_tsv, ewe_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
val_X, val_y = read_tsv(val_tsv, ewe_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)
test_X, test_y = read_tsv(test_tsv, ewe_word2idx, oov_id=oov_id, pad_id=pad_id, seq_length=seq_length)

print(f"x_train shape:\t\t{train_X.shape}")
print(f"x_validation shape:\t{val_X.shape}")
print(f"x_test shape:\t\t{test_X.shape}")
print(f"y_train shape:\t\t{train_y.shape}")
print(f"y_validation shape:\t{val_y.shape}")
print(f"y_test shape:\t\t{test_y.shape}")

ewe_emb_matrix:		(40002, 300)
ewe_word2idx shape:	40001
x_train shape:		(5976, 128)
x_validation shape:	(752, 128)
x_test shape:		(736, 128)
y_train shape:		(5976, 7)
y_validation shape:	(752, 7)
y_test shape:		(736, 7)


6. Custom word embeddings.

In [8]:
model = keras.Sequential()
model.add(Embedding(40001, 300, mask_zero=True, input_length=seq_length, embeddings_initializer="random_normal"))
model.add(Bidirectional(LSTM(64, dropout=0.5, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.5,)))
model.add(Dense(7, activation="softmax"))

model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss="categorical_crossentropy", metrics="accuracy")
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          12000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128, 128)          186880    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
Total params: 12,286,599
Trainable params: 12,286,599
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.fit(train_X, train_y, validation_data=(val_X, val_y), batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f0e9ef65be0>

In [10]:
model.evaluate(test_X, test_y)



[6.560274124145508, 0.05298912897706032]