In [3]:
import numpy as np
import pandas as pd
import re
import sklearn

In [18]:
train_headers = ["index", "word", "ner_tag"]
train_df = pd.read_csv("./data/train", sep=' ', header=None, quoting=3)
train_df.columns = train_headers

dev_df = pd.read_csv("./data/dev", sep=' ', header=None, quoting=3)
dev_df.columns = train_headers

test_headers = ["index", "word"]
test_df = pd.read_csv("./data/test", sep=' ', header=None, engine='python', error_bad_lines=False, quoting=3)
test_df.columns = test_headers

In [19]:
train_df["word"] = train_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
dev_df["word"] = dev_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
test_df["word"] = test_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)

In [20]:
word_frequency = {}
for row in train_df.iterrows():
    if row[1]["word"] in word_frequency:
        word_frequency[row[1]["word"]] += 1
    else:
        word_frequency[row[1]["word"]] = 1

In [21]:
threshold = 2
unknown_words = [] 
for key, value in word_frequency.items():
    if value < threshold:
        unknown_words.append(key)

In [22]:
train_df["word"] = train_df["word"].apply(lambda x: "<unk>" if x in unknown_words else x)

In [25]:
train_df["word"] = train_df["word"].astype(str)
train_vocab_size = len(np.unique(train_df["word"]))
all_train_words = np.unique(train_df["word"])

In [27]:
dev_df["word"] = dev_df["word"].apply(lambda x: "<unk>" if x in unknown_words else ("<unk>" if x not in all_train_words else x))
dev_df["word"] = dev_df["word"].astype(str)

In [223]:
def format_data(df):
    train_formatted = []
    first = df.iloc[0]
    x = [first["word"]]
    y = [first["ner_tag"]]
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            train_formatted.append([x, y])
            x, y = [], []
            x.append(row[1]["word"])
            sentence_y.append(row[1]["ner_tag"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([x, y])
        else:
            x.append(row[1]["word"])
            y.append(row[1]["ner_tag"])
    return train_formatted

In [31]:
train_formatted = format_data(train_df)
dev_formatted = format_data(dev_df)

In [32]:
word_map = {"<pad>":0}
for i, word in enumerate(set(train_df["word"])):
    word_map[word] = i+1

In [37]:
tag_map = {"<pad>":-1}
for i, word in enumerate(set(train_df["ner_tag"])):
    tag_map[word] = i

In [40]:
tag_map_without_pad = {}
for i, word in enumerate(set(train_df["ner_tag"])):
    tag_map_without_pad[word] = i

In [49]:
max_len_sentence = max([len(s[0]) for s in train_formatted])

In [51]:
def pad_sentences(sentences_formatted):
    train_padded = []
    for sentence in sentences_formatted:
        word_lst = sentence[0]
        ner_lst = sentence[1]
        mapped_word_lst, mapped_ner_lst = [], []
        for word in word_lst:
            mapped_word_lst.append(word_map[word])
        for ner in ner_lst:
            mapped_ner_lst.append(tag_map[ner])

        word_cnt = len(mapped_word_lst)
        diff_ = max_len_sentence - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        mapped_ner_lst = mapped_ner_lst + [-1] * diff_

        train_padded.append([mapped_word_lst, mapped_ner_lst])
    return train_padded

In [53]:
train_padded = pad_sentences(train_formatted)
dev_padded = pad_sentences(dev_formatted)

### Task 1: Simple Bidirectional LSTM Model:

In [56]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn

In [57]:
class BLSTM(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        
        lstm_hidden_dim = 256
        lstm_num_layers = 1
        linear_output_dim =128
        output_dim = 10
        
        self.embeddings = nn.Embedding(vocab_size, 100)
        self.lstm = nn.LSTM(input_size=100, hidden_size=256,
                          num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear1 = nn.Linear(512, 128)
        self.linear2 = nn.Linear(128, 9)
        self.elu = nn.ELU()
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        lstm_out, self.hidden = self.lstm(embeds.view(len(inputs), 1, -1))
        lstm_out_dropped = self.dropout(lstm_out)
        out = self.linear1(lstm_out_dropped.view(len(inputs), -1))
        linear_out_dropped = self.dropout(out)
        elu_out = self.elu(linear_out_dropped)
        l2_out = self.linear2(elu_out)
        log_probs = F.log_softmax(l2_out, dim=1)
        return log_probs


In [59]:
# EMBEDDING_DIM = 100
# VOCAB_SIZE = train_vocab_size+1
# n_epochs = 20
# trainloader = torch.utils.data.DataLoader(train_padded, batch_size=50)
# devloader = torch.utils.data.DataLoader(dev_padded, batch_size=50)
# blstm = BLSTM(VOCAB_SIZE)

# criterion = nn.CrossEntropyLoss(ignore_index=-1, size_average=True)
# optimizer = torch.optim.SGD(blstm.parameters(), lr=0.1, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# test_loss_min = 10000

# for epoch in range(n_epochs):
#     scheduler.step()
#     print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))
#     train_loss = 0
#     test_loss = 0
#     blstm.train()
#     for data, target in trainloader:
#         optimizer.zero_grad()
#         output = blstm(torch.cat(data,dim=0))
#         loss = criterion(output, torch.cat(target,dim=0))
#         loss.backward()
#         optimizer.step()
#         train_loss += loss
#     with torch.no_grad():
#         for data, target in devloader:
#             output = blstm(torch.cat(data,dim=0))
#             loss = criterion(output, torch.cat(target,dim=0))
#             test_loss += loss
#     train_loss = train_loss/len(trainloader.dataset)
#     test_loss = test_loss/len(devloader.dataset)
#     print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
#         epoch+1, 
#         train_loss,
#         test_loss
#         ))    
#     if test_loss <= test_loss_min:
#         torch.save(blstm.state_dict(), 'blstm1.pt')
#         test_loss_min = test_loss
# print('All done.')

Epoch-0 lr: 0.1
Epoch: 1 	Training Loss: 0.013217 	Test Loss: 0.012173
Epoch-1 lr: 0.1
Epoch: 2 	Training Loss: 0.011612 	Test Loss: 0.011400
Epoch-2 lr: 0.1
Epoch: 3 	Training Loss: 0.010564 	Test Loss: 0.010077
Epoch-3 lr: 0.1
Epoch: 4 	Training Loss: 0.009544 	Test Loss: 0.009327
Epoch-4 lr: 0.1
Epoch: 5 	Training Loss: 0.008793 	Test Loss: 0.008687
Epoch-5 lr: 0.1
Epoch: 6 	Training Loss: 0.008140 	Test Loss: 0.008190
Epoch-6 lr: 0.1
Epoch: 7 	Training Loss: 0.007589 	Test Loss: 0.007709
Epoch-7 lr: 0.1
Epoch: 8 	Training Loss: 0.007064 	Test Loss: 0.007310
Epoch-8 lr: 0.1
Epoch: 9 	Training Loss: 0.006607 	Test Loss: 0.007015
Epoch-9 lr: 0.05
Epoch: 10 	Training Loss: 0.005940 	Test Loss: 0.006579
Epoch-10 lr: 0.05
Epoch: 11 	Training Loss: 0.005729 	Test Loss: 0.006494
Epoch-11 lr: 0.05
Epoch: 12 	Training Loss: 0.005554 	Test Loss: 0.006363
Epoch-12 lr: 0.05
Epoch: 13 	Training Loss: 0.005381 	Test Loss: 0.006306
Epoch-13 lr: 0.05
Epoch: 14 	Training Loss: 0.005244 	Test Loss: 0

KeyboardInterrupt: 

In [162]:
def pad_test_sentences(sentences_formatted):
    test_padded = []
    for sentence in sentences_formatted:
        mapped_word_lst = []
        for word in sentence:
            mapped_word_lst.append(word_map[word])

        word_cnt = len(mapped_word_lst)
        diff_ = max_len_sentence - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_

        test_padded.append(mapped_word_lst)
    return test_padded

In [225]:
def format_data_test(df):
    test_formatted = []
    first = df.iloc[0]
    x = [first["word_formatted"]]
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            test_formatted.append(x)
            x = []
            x.append(row[1]["word_formatted"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([x])
        else:
            x.append(row[1]["word_formatted"])
    
    return test_formatted

In [60]:
test_df["word_formatted"] = test_df["word"].apply(lambda x: "<unk>" if x in unknown_words else ("<unk>" if x not in all_train_words else x))
test_df["word_formatted"] = test_df["word_formatted"].astype(str)

test_formatted = format_data_test(test_df)
test_padded = pad_test_sentences(test_formatted)

In [61]:
blstm.load_state_dict(torch.load('blstm1.pt'))

<All keys matched successfully>

In [208]:
def predict_test(model, dataloader, data_test):
    prediction_list = []
    with torch.no_grad():
        for data in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    overall_pred = []
    for i, sentence in enumerate(data_test):
        non_padded_pred = len(np.nonzero(sentence)[0])
        pred_i = prediction_list[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    convert_overall_pred = []
    for sentence in overall_pred:
        for idx in sentence:
            convert_overall_pred.append(list(tag_map_without_pad.keys())[idx])
    return convert_overall_pred

In [209]:
def predict_dev(model, dataloader, data_dev):
    prediction_list = []
    with torch.no_grad():
        for data, target in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    overall_pred = []
    for i, sentence in enumerate(data_dev):
        actual_sentence = sentence[0]
        non_padded_pred = len(np.nonzero(actual_sentence)[0])
        pred_i = prediction_list[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    convert_overall_pred = []
    for sentence in overall_pred:
        for idx in sentence:
            convert_overall_pred.append(list(tag_map_without_pad.keys())[idx])
    return convert_overall_pred

In [194]:
devloader = torch.utils.data.DataLoader(dev_padded, batch_size=1)
predictions_dev = predict_dev(blstm, devloader, dev_padded)

In [221]:
y_true = np.array(dev_df["ner_tag"])

In [82]:
def write_results(name, y_true, y_pred, df):
    with open(name, 'w') as f:
        for row in df.iloc[0:].iterrows():
            f.write(str(row[1]["index"]))
            f.write(" ")
            f.write(row[1]["word"])
            f.write(" ")
            f.write(y_true[row[0]])
            f.write(" ")
            f.write(y_pred[row[0]])
            f.write("\n")

In [101]:
def write_results_test(name, y_pred, df):
    with open(name, 'w') as f:
        for row in df.iloc[0:].iterrows():
            f.write(str(row[1]["index"]))
            f.write(" ")
            f.write(row[1]["word"])
            f.write(" ")
            f.write(y_pred[row[0]])
            f.write("\n")

In [102]:
write_results("dev1.out", y_true, predictions_dev, dev_df)

In [206]:
testloader = torch.utils.data.DataLoader(test_padded, batch_size=1)
predictions_test = predict_test(blstm, testloader, test_padded)

In [104]:
write_results_test("test1.out", predictions_test, test_df)

IndexError: list index out of range

### Task 2: Using GloVe Word Embeddings:

In [115]:
with open("./glove.6B.100d","r",encoding="UTF-8") as f:
    word2vec={}
    for word_embedding in f:
        word_split = word_embedding.split()
        word = word_split[0]
        word2vec[word] = np.array(word_split[1:], dtype=np.float64)

In [116]:
word_map_2 = {}
for i, word in enumerate(set(train_df["word"]).union(set(dev_df["word"]))):
    word_map_2[word] = i+1
word_map_2["<unk>"] = i+1

In [130]:
EMBEDDING_DIM = 101
VOCAB_SIZE = len(word_map_2)+1

In [131]:
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, idx in word_map.items():
    if word in word2vec:
        word_embedding = word2vec[word]
        embedding_matrix[idx,:] = np.concatenate((word_embedding, [0]))
    elif word.lower() in word2vec: 
        word_embedding = word2vec[word.lower()]
        embedding_matrix[idx,:] = np.concatenate((word_embedding, [1]))
    else:
        continue

embedding_blstm2 = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM) 
embedding_blstm2.load_state_dict({"weight": torch.tensor(embedding_matrix)})

<All keys matched successfully>

In [228]:
def format_data_glove(df):
    train_formatted = []
    first = df.iloc[0]
    x = [first["word"]]
    y = [first["ner_tag"]]
    
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            train_formatted.append([x, y])
            x, y = [], []
            x.append(row[1]["word"])
            y.append(row[1]["ner_tag"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([x, y])
        else:
            x.append(row[1]["word"])
            y.append(row[1]["ner_tag"])
    return train_formatted

In [229]:
train_formatted_glove = format_data_glove(train_df)
dev_formatted_glove = format_data_glove(dev_df)

In [None]:
def format_data_test_glove(df):
    test_formatted = []
    first = df.iloc[0]
    x = [first_word["word_formatted"]]
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            test_formatted.append(x)
            x = []
            x.append(row[1]["word_formatted"])
            if row[0] == (df.shape[0]-1):
                test_formatted.append(x)
        else:
            x.append(row[1]["word_formatted"])
    return test_formatted

In [231]:
def pad_glove(sentences_formatted):
    train_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        word_lst = sentence[0]
        ner_lst = sentence[1]
        mapped_word_lst, mapped_ner_lst = [], []
        cnt += len(word_lst)
        for word in word_lst:
            mapped_word_lst.append(word_map_2[word])
        for ner in ner_lst:
            mapped_ner_lst.append(ner_map[ner])
        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        mapped_ner_lst = mapped_ner_lst + [-1] * diff_
        train_padded.append([mapped_word_lst, mapped_ner_lst])
    return train_padded

In [232]:
train_padded_glove = pad_glove(train_formatted_glove)
dev_padded_glove = pad_glove(dev_formatted_glove)

In [233]:
def pad_test_glove(sentences_formatted):
    test_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        mapped_word_lst = []
        cnt += len(sentence)
        for word in sentence:
            mapped_word_lst.append(word_map_2[word])
        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        test_padded.append(mapped_word_lst)
    return test_padded

In [None]:
test_df["word_formatted"] = test_df["word"].apply(lambda x: "<unk>" if x not in word_map_2 else x)
test_df["word_formatted"] = test_df["word_formatted"].astype(str)
test_formatted_glove = format_data_test(test_df)

In [217]:
test_padded_glove = pad_test_sentences_glove(test_formatted_glove)

In [234]:
class BLSTM_2(nn.Module):
    
    def __init__(self, embeddings):
        super().__init__()
        
        lstm_hidden_dim = 256
        lstm_num_layers = 1
        linear_output_dim =128
        output_dim = 10
        
        self.embeddings = embeddings
        self.lstm = nn.LSTM(input_size=101, hidden_size=256,
                          num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear1 = nn.Linear(512, 128)
        self.elu = nn.ELU()
        self.linear2 = nn.Linear(128, 9)
        
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        lstm_out, self.hidden = self.lstm(embeds.view(len(inputs), 1, -1))
        lstm_out_dropped = self.dropout(lstm_out)
        out = self.linear1(lstm_out_dropped.view(len(inputs), -1))
        elu_out = self.elu(out)
        l2_out = self.linear2(elu_out)
        log_probs = F.log_softmax(l2_out, dim=1)
        return log_probs


In [None]:
# EMBEDDING_DIM = 101
# VOCAB_SIZE = len(word_map_2)
# n_epochs = 20
# trainloader = torch.utils.data.DataLoader(train_padded_glove, batch_size=16)
# devloader = torch.utils.data.DataLoader(dev_padded_glove, batch_size=16)
# blstm2 = BLSTM_2(embedding_blstm2)
# criterion = nn.CrossEntropyLoss(ignore_index=-1, size_average=True)
# optimizer = torch.optim.SGD(blstm2.parameters(), lr=0.1, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# test_loss_min = 10000

# for epoch in range(n_epochs):
#     scheduler.step()
#     print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))
#     train_loss = 0
#     test_loss = 0
#     blstm2.train()
#     for data, target in trainloader:
#         optimizer.zero_grad()
#         output = blstm2(torch.cat(data,dim=0))
#         loss = criterion(output, torch.cat(target,dim=0))
#         loss.backward()
#         optimizer.step()
#         train_loss += loss 
#     with torch.no_grad():
#         for data, target in devloader:
#             output = blstm2(torch.cat(data,dim=0))
#             loss = criterion(output, torch.cat(target,dim=0))
#             test_loss += loss
#     train_loss = train_loss/len(trainloader.dataset)
#     test_loss = test_loss/len(devloader.dataset) 
#     print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
#         epoch+1, 
#         train_loss,
#         test_loss
#         ))   
#     if test_loss <= test_loss_min:
#         torch.save(blstm2.state_dict(), 'blstm2.pt')
#         test_loss_min = test_loss
# print('All done.')

Epoch-0 lr: 0.1
Epoch: 1 	Training Loss: 0.020680 	Test Loss: 0.018366
Epoch-1 lr: 0.1
Epoch: 2 	Training Loss: 0.014163 	Test Loss: 0.018013
Epoch-2 lr: 0.1
Epoch: 3 	Training Loss: 0.013037 	Test Loss: 0.017807
Epoch-3 lr: 0.1
Epoch: 4 	Training Loss: 0.012327 	Test Loss: 0.017678
Epoch-4 lr: 0.1
Epoch: 5 	Training Loss: 0.011875 	Test Loss: 0.017604
Epoch-5 lr: 0.1
Epoch: 6 	Training Loss: 0.011515 	Test Loss: 0.017511
Epoch-6 lr: 0.1
Epoch: 7 	Training Loss: 0.011281 	Test Loss: 0.017460
Epoch-7 lr: 0.1
Epoch: 8 	Training Loss: 0.011062 	Test Loss: 0.017408
Epoch-8 lr: 0.1


In [143]:
blstm2 = BLSTM_2(embedding_blstm2)
blstm2.load_state_dict(torch.load('blstm2.pt'))

<All keys matched successfully>

In [199]:
devloader = torch.utils.data.DataLoader(dev_padded_glove, batch_size=1)
predictions_dev_glove = predict_dev(blstm2, devloader, dev_padded_glove)

In [222]:
y_true = np.array(dev_df["ner_tag"])

In [201]:
write_results("dev2.out", y_true, predictions_dev_glove, dev_df)

In [218]:
testloader = torch.utils.data.DataLoader(test_padded_glove, batch_size=1)
predictions_test_glove = predict_test(blstm2, testloader, test_padded_glove)

In [220]:
write_results_test("test2.out", predictions_test_glove, test_df)

IndexError: list index out of range