## A4 - POS LSTM Classifier

Author: Austin Blodgett

Adaptation to colab: Nitin Venkateswaran


### Follow the steps to use this notebook for your A4. 
**NOTE**: It is best to use your Georgetown Google accounts.
##### 1. Save a copy of this notebook starter template in your Google Drive (File -> Save a copy in drive)
##### 2. Upload a copy of all 3 tsv files in **pos-data** directory (available in a4.zip) to your Google Drive in the folder location **A4/pos-data/**; you will need to create the folder 'A4' at the root location in your Drive, followed by the subfolder 'pos-data'
##### 3. You are all set!


###Import libraries and mount Google Drive





In [None]:
# !wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip -o glove.6B.zip
!pip install transformers
!pip install tensorflow==2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, ht

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import tensorflow as tf

from collections import Counter

from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed
from keras.activations import softmax

import numpy as np
from keras import backend as K

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_file = '/content/drive/My Drive/ENLP_Project/My_try/german/de_falko-merlin_train.tsv'
dev_file = '/content/drive/My Drive/ENLP_Project/My_try/german/de_falko-merlin_dev.tsv'
test_file = '/content/drive/My Drive/ENLP_Project/My_try/german/de_falko-merlin_dev.tsv'
UNK = '[UNK]'
PAD = '[PAD]'

### Implement this function if you want to transform the input text, e.g. normalizing case


In [None]:
# TODO
def transform_text_sequence(seq):
    '''
    Implement this function if you want to transform the input text,
    for example normalizing case.
    '''
    return seq


###Helper Functions (no need to implement)



In [None]:
def get_vocabulary_and_data(data_file, max_vocab_size=None):
    vocab = Counter()
    # every comment line: initial statement is on the top 
    #pos_vocab = {'<s>','</s>'}
    label_vocab = {'<s>'}
    vocab[UNK] = 1
    vocab[PAD] = 1
    data = []
    gold_labels = []
    with open(data_file, 'r', encoding='utf8') as f:
        sent = []
        sent_label = []
        for line in f:
            if line.strip():
                tok, label = line.strip().split('\t')[0], line.strip().split('\t')[1]
                sent.append(tok)
                sent_label.append(label)
                vocab[tok]+=1
                #vocab['<s>'] += 1
                #vocab['</s>'] += 1
                label_vocab.add(label)
            elif sent:
                #sent.append('</s>')
                #sent_pos.append('</s>')
                sent = transform_text_sequence(sent)
                data.append(sent)
                gold_labels.append(sent_label)
                #sent = ['<s>']
                sent=[]
                #sent_pos = ['<s>']
                sent_label=[]
    vocab = sorted(vocab.keys(), key = lambda k: vocab[k], reverse=True)
    if max_vocab_size:
        vocab = vocab[:max_vocab_size-2]
    vocab = [UNK, PAD] + vocab
    label_vocab.remove('<s>')
    return {k:v for v,k in enumerate(vocab)}, list(label_vocab), data, gold_labels


def vectorize_sequence(seq, vocab):
    seq = [tok if tok in vocab else UNK for tok in seq]
    return [vocab[tok] for tok in seq]


def unvectorize_sequence(seq, vocab):
    translate = sorted(vocab.keys(),key=lambda k:vocab[k])
    return [translate[i] for i in seq]


def one_hot_encode_label(label, label_set):
    vec = [1.0 if l==label else 0.0 for l in label_set]
    return np.array(vec)

def clean(seqs, vocab, unk):
    for i,seq in enumerate(seqs):
        for j,tok in enumerate(seq):
            if tok>=len(vocab):
                seq[j] = unk

def binary_encode_label(label, label_set):
    vec = [1.0 if label==label_set[0] else 0.0]
    return np.array(vec)

def batch_generator(data, labels, vocab, label_set, batch_size=1):
    while True:
        batch_x = []
        batch_y = []
        for sent, sent_label in zip(data,labels):
            batch_x.append(vectorize_sequence(sent, vocab))
            # print([one_hot_encode_label(label, label_set) for label in sent_pos])
            batch_y.append([binary_encode_label(label, label_set) for label in sent_label])
            if len(batch_x) >= batch_size:
                clean(batch_x, vocab, vocab[UNK])
                # Pad Sequences in batch to same length
                batch_x = pad_sequences(batch_x, vocab[PAD])
                batch_y = pad_sequences(batch_y, binary_encode_label(PAD, label_set))
                yield np.array(batch_x), np.array(batch_y)
                batch_x = []
                batch_y = []


def describe_data(data, gold_labels, label_set, generator):
    batch_x, batch_y = [], []
    for bx, by in generator:
        batch_x = bx
        batch_y = by
        break
    print('Data example:',data[0])
    print('Label:',gold_labels[0])
    print('Label count:', len(label_set),label_set)
    print('Data size', len(data))
    print('Batch input shape:', batch_x.shape)
    print('Batch output shape:', batch_y.shape)


def pad_sequences(batch_x, pad_value):
    ''' This function should take a batch of sequences of different lengths
        and pad them with the pad_value token so that they are all the same length.

        Assume that batch_x is a list of lists.
    '''
    pad_length = len(max(batch_x, key=lambda x: len(x)))
    for i, x in enumerate(batch_x):
        if len(x) < pad_length:
            batch_x[i] = x + ([pad_value] * (pad_length - len(x)))

    return batch_x

###Change these arguments for the main procedure call as needed for your experiments

In [None]:
epochs = 5 # number of epochs
learning_rate = 0.01 # learning rate
dropout = 0.3 # dropout rate
early_stopping = -1 # early stopping criteria
embedding_size = 100 # embedding dimension size
hidden_size = 10 # hidden layer size
batch_size = 32 # batch size

###Check the GPU is available

In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  device_name = '/cpu:0'
  print(
      '\n\n This notebook is not '
      'configured to use a GPU.  You can change this in Notebook Settings. Defaulting to:' + device_name)
else:
  print ('GPU Device found: ' + device_name)

GPU Device found: /device:GPU:0


In [None]:
from keras import Model
class BERT_Wrapper(Model):

  def __init__(self):
    super(BERT_Wrapper, self).__init__()
    self.encoder = TFBertModel.from_pretrained("bert-base-multilingual-cased", trainable=False)
    self.dense = Dense(hidden_size)

  def call(self, inputs, **kwargs):
      outputs = self.encoder(inputs)
      last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
      output = self.dense(last_hidden_states)
      return output

In [None]:
vocab, labels, train_data, train_labels = get_vocabulary_and_data(train_file)
_, _, dev_data, dev_labels = get_vocabulary_and_data(dev_file)
_, _, test_data, test_labels = get_vocabulary_and_data(test_file)
print(train_data[:3], train_labels[:3])
print(dev_data[:3], dev_labels[:3])
print(test_data[:3], test_labels[:3])
describe_data(train_data, train_labels, labels,
              batch_generator(train_data, train_labels, vocab, labels, batch_size))

glove_file = '/content/drive/My Drive/ENLP_Project/My_try/german/german_glove.txt' # Change as necessary
def load_pretrained_embeddings(glove_file, vocab):
    embedding_matrix = np.zeros((len(vocab)+2, embedding_size))
    with open(glove_file, encoding='utf8') as f:
        for line in f:
          #word,*vector=line.split()
          word,coefs=line.split(maxsplit=1)
          if word in vocab: 
            coefs = np.fromstring(coefs, "f", sep=" ")
            embedding_matrix[vocab[word]]=coefs
            # Each line will be a word and a list of floats, separated by spaces.
            # If the word is in your vocabulary, create a numpy array from the list of floats.
            # Assign the array to the correct row of embedding_matrix.
    embedding_matrix[vocab[UNK]] = np.random.randn(embedding_size)
    return embedding_matrix

# TOM:
# def load_pretrained_embeddings(glove_file, vocab):
#     embedding_matrix = np.zeros((len(vocab), embedding_size))
#     with open(glove_file, encoding='utf8') as f:
#         for line in f:
#             # Each line will be a word and a list of floats, separated by spaces.
#             # If the word is in your vocabulary, create a numpy array from the list of floats.
#             # Assign the array to the correct row of embedding_matrix.
#            line = line.split()
#            if line[0] in vocab:
#              embedding_matrix[vocab[line[0]]] = np.asarray(line[1:])

#     embedding_matrix[vocab[UNK]] = np.random.randn(embedding_size)
#     return embedding_matrix

base_line = False
use_bert = False
from keras.initializers import Constant
from transformers import BertTokenizer, TFBertLMHeadModel, BertConfig, TFBertModel
with tf.device(device_name):
    if base_line: # baseline--True
        embedding_layer = Embedding(input_dim=len(vocab),output_dim=embedding_size)
    elif use_bert: # bert--true
        embedding_layer = BERT_Wrapper()
    else: # glove--true
        embedding_size = 300
        embedding_layer = Embedding(input_dim=len(vocab)+2
        ,output_dim=embedding_size
        ,embeddings_initializer=Constant(load_pretrained_embeddings(glove_file, vocab))
        ,trainable=False)
    # Implement your model here! ----------------------------------------------------------------------
    # Use the variables batch_size, hidden_size, embedding_size, dropout, epochs
    pos_tagger = Sequential(name="my_sequential")
    # (class) Embedding(input_dim, output_dim
    # pos_tagger.add(Embedding(input_dim=len(vocab),output_dim=embedding_size))
    pos_tagger.add(embedding_layer)
    
    pos_tagger.add(Bidirectional(LSTM(units=hidden_size,return_sequences=True),name="layer3"))
    # pos_tagger.add(tf.keras.layers.Dropout(dropout))
    
    pos_tagger.add(TimeDistributed(Dense(units=1,activation='sigmoid'))) # i or c
    
    # ------------------------------------------------------------------------------------------------

    # Define your optimizer and learning rate
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)

    pos_tagger.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    # pos_tagger.summary()

    for i in range(epochs):
        print('Epoch',i+1,'/',epochs)
        # Training
        print(vocab)
        print(labels)
        pos_tagger.fit(batch_generator(train_data, train_labels, vocab, labels, batch_size),
                                  epochs=1, steps_per_epoch=len(train_data)/batch_size)
        # Evaluation
        loss, acc, prec, recall = pos_tagger.evaluate(batch_generator(dev_data, dev_labels, vocab, labels),
                                                  steps=len(dev_data))
        print('Dev Loss:', loss, 'Dev Acc:', acc, 'Dev Precision:', prec, 'Dev Recall:', recall)


[['In', 'einem', 'vollkommenen', 'Welt', 'würde', 'es', 'sein', ',', 'dass', 'die', 'finanzielle', 'Entlohnung', 'eines', 'Menschen', 'dem', 'Beitrag', 'entspricht', ',', 'den', 'er', '/', 'sie', 'für', 'die', 'Gesellschaft', 'geleistet', 'hat', '.'], ['Dennoch', 'ist', 'das', 'sowohl', 'unmöglich', 'als', 'auch', 'unrealistisch', ',', 'weil', 'keine', 'Gesellschaft', 'den', 'relativen', 'Wert', 'jedes', 'Berufs', 'richtig', 'entscheiden', 'könnte', '.'], ['Auf', 'dem', 'ersten', 'Blick', 'sollte', 'man', 'glauben', ',', 'dass', 'die', 'Behauptung', 'dieses', 'Themas', 'selbstverständlich', 'ist', '.']] [['c', 'i', 'c', 'c', 'i', 'c', 'i', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'i', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'i', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c']]
[['4', 'Die', 'meisten', 'Universitätsabschlüsse', 'sind'

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ('self',)


Epoch 1 / 5
{'[UNK]': 8280, '[PAD]': 8281, ',': 2, '.': 3, 'die': 4, 'und': 5, 'der': 6, 'ist': 7, 'in': 8, 'zu': 9, 'ich': 10, 'nicht': 11, 'dass': 12, 'es': 13, 'man': 14, 'eine': 15, 'Ich': 16, 'für': 17, 'ein': 18, 'sie': 19, 'das': 20, 'auch': 21, 'haben': 22, 'sind': 23, 'mit': 24, 'von': 25, 'den': 26, 'Frauen': 27, '?': 28, 'oder': 29, 'als': 30, 'kann': 31, 'sich': 32, 'aber': 33, '\\"': 34, 'hat': 35, 'sehr': 36, 'habe': 37, 'wie': 38, 'wenn': 39, 'gibt': 40, 'auf': 41, 'Die': 42, 'Es': 43, 'um': 44, 'Stadt': 45, 'nur': 46, 'im': 47, 'wir': 48, 'weil': 49, 'viele': 50, 'werden': 51, 'Sie': 52, 'so': 53, 'viel': 54, 'mehr': 55, 'an': 56, 'muss': 57, 'sein': 58, 'einen': 59, 'können': 60, 'mich': 61, 'nach': 62, 'machen': 63, 'dem': 64, 'X': 65, 'des': 66, 'diese': 67, 'Feminismus': 68, 'Gesellschaft': 69, 'Leute': 70, 'Studenten': 71, 'immer': 72, 'was': 73, 'wird': 74, 'du': 75, 'Das': 76, 'möchte': 77, 'noch': 78, 'meine': 79, 'gut': 80, 'mir': 81, 'Wenn': 82, 'war': 83, 'Ar

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Dev Loss: 1.8302733550967787e-08 Dev Acc: 0.7871412634849548 Dev Precision: 0.26305097341537476 Dev Recall: 0.21359063684940338
Epoch 2 / 5
{'[UNK]': 8280, '[PAD]': 8281, ',': 2, '.': 3, 'die': 4, 'und': 5, 'der': 6, 'ist': 7, 'in': 8, 'zu': 9, 'ich': 10, 'nicht': 11, 'dass': 12, 'es': 13, 'man': 14, 'eine': 15, 'Ich': 16, 'für': 17, 'ein': 18, 'sie': 19, 'das': 20, 'auch': 21, 'haben': 22, 'sind': 23, 'mit': 24, 'von': 25, 'den': 26, 'Frauen': 27, '?': 28, 'oder': 29, 'als': 30, 'kann': 31, 'sich': 32, 'aber': 33, '\\"': 34, 'hat': 35, 'sehr': 36, 'habe': 37, 'wie': 38, 'wenn': 39, 'gibt': 40, 'auf': 41, 'Die': 42, 'Es': 43, 'um': 44, 'Stadt': 45, 'nur': 46, 'im': 47, 'wir': 48, 'weil': 49, 'viele': 50, 'werden': 51, 'Sie': 52, 'so': 53, 'v

In [None]:
# Generate predictions for the test data
predictions = pos_tagger.predict(batch_generator(test_data,test_labels,vocab,labels),steps=len(test_data))
predictions = predictions.numpy()
#print(probabilities)
tags=[]
for sentence in predictions:
  #print(sentence)
  tag=[]
  for word in sentence: # for each sentence
    if word==1.0:
      tag.append('i')
    else:
      tag.append('c')
  tags.append(tag)
print(tags)
testloss, testacc, testprecision, testrecall = pos_tagger.evaluate(batch_generator(train_data, train_labels, vocab, labels),
                                                  steps=len(test_data))
print('test Loss:', testloss, 'test Acc:', testacc, 'test Precision:', testprecision, 'test Recall:', testrecall)

[['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'