In [4]:
# First run this cell
import csv
from tqdm import tqdm
import string
import codecs
import torch
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

In [5]:
# Labels for each text
start_year = 1700
nb_decades = 20
def linear_decades(t):
    return start_year + 10*t

In [6]:
# Run this cell to init mappings from characters to IDs and back again,
# from words to IDs and back again, and from labels to IDs and back again

UNKNOWN = '<unk>'  # Unknown char or unknown word
CHARS = [UNKNOWN, '’', '—'] + list(string.punctuation) + list(string.ascii_letters) + list(string.digits)
char_to_id = {c:i for i,c in enumerate(CHARS)}
PADDING_WORD = '<pad>'
id_to_label = ['{}'.format(linear_decades(d)) for d in range(nb_decades)]

def label_to_id(label):
    return int(label)

print(CHARS)
print(id_to_label)

['<unk>', '’', '—', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['1700', '1710', '1720', '1730', '1740', '1750', '1760', '1770', '1780', '1790', '1800', '1810', '1820', '1830', '1840', '1850', '1860', '1870', '1880', '1890']


In [7]:
def load_glove_embeddings(embedding_file,
                          padding_word=PADDING_WORD, 
                          unknown_word=UNKNOWN):
    """
    Reads Glove embeddings from a file.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).
    """
    word_to_id = {}  # Dictionary to store word-to-ID mapping
    word_to_id[padding_word] = 0
    word_to_id[unknown_word] = 1
    embeddings = []
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            word = data[0]
            vec = [float(x) for x in data[1:]]
            embeddings.append(vec)
            word_to_id[word] = len(word_to_id)
    D = len(embeddings[0])

    embeddings.insert(word_to_id[padding_word], [0]*D)  # <PAD> has an embedding of just zeros
    embeddings.insert(word_to_id[unknown_word], [-1]*D)      # <UNK> has an embedding of just minus-ones

    return D, word_to_id, embeddings

In [8]:
####### A MODIFIER ########

class HistDataset(Dataset):
    """
    A class loading a dataset from a CSV file to be used as an input 
    to PyTorch DataLoader.

    The CSV file has 2 fields: chunk of text and label.

    Datapoints are sentences + associated labels for each word. If the 
    words have not been seen before (i.e, they are not found in the 
    'word_to_id' dict), they will be mapped to the unknown word '<UNK>'.
    """

    def __init__(self, filename, word_to_id):
        reader = csv.reader(codecs.open(filename, encoding='ascii',
                                        errors='ignore'), delimiter=';')

        self.passages = []
        self.labels = []

        for row in reader:
            if row:
                self.passages.append(row[0].strip().split()) #### TOKENIZATION SHOULD BE CHANGED
                self.labels.append(label_to_id(row[1].strip()))
        

    def __len__(self):
        return len(self.passages)

    def __getitem__(self, idx):
        return self.passages[idx], self.labels[idx]

In [9]:
# Let's check out some of these data structures
dim, word_to_id, embeddings = load_glove_embeddings('../glove.6B.50d.txt')
print("The embedding for the word 'good' looks like this:")
print(embeddings[word_to_id['good']])
print()

The embedding for the word 'good' looks like this:
[-0.35586, 0.5213, -0.6107, -0.30131, 0.94862, -0.31539, -0.59831, 0.12188, -0.031943, 0.55695, -0.10621, 0.63399, -0.4734, -0.075895, 0.38247, 0.081569, 0.82214, 0.2222, -0.0083764, -0.7662, -0.56253, 0.61759, 0.20292, -0.048598, 0.87815, -1.6549, -0.77418, 0.15435, 0.94823, -0.3952, 3.7302, 0.82855, -0.14104, 0.016395, 0.21115, -0.036085, -0.15587, 0.86583, 0.26309, -0.71015, -0.03677, 0.0018282, -0.17704, 0.27032, 0.11026, 0.14133, -0.057322, 0.27207, 0.31305, 0.92771]



In [10]:
# Read the data we are going to use for testing the model
test_set = HistDataset('test.csv', word_to_id)
print("There are", len(test_set), "data points in the test set")

for dp in range(2):
    sentence, label = test_set[dp]
    print("Data point", dp, "is", sentence)
    print("It has the label", label)

There are 2 data points in the test set
Data point 0 is ['This', 'is', 'an', 'example', 'of', 'training', 'dataset.', 'You', 'would', 'have', 'this', 'kinds', 'of', 'passages', '!']
It has the label 1850
Data point 1 is ['In', 'the', 'CSV', 'file,', 'the', 'datapoint', 'is', 'a', 'row', 'of', 'words,', 'and', 'the', 'last', 'one', 'is', 'the', 'label', 'of', 'the', 'passage.']
It has the label 1730


In [11]:
# Run this cell. The function below will take care of the case of
# sequences of unequal lengths.

def pad_sequence(batch, padding_word=PADDING_WORD):
    batch_data, batch_labels = zip(*batch)
    max_len = max(map(len, batch_data))
    padded_data = [[b[i] if i < len(b) else padding_word for i in range(max_len)] for b in batch_data]
    return padded_data, batch_labels

In [12]:
# This is how it works
x = [([1,2,3],1750), ([4,5],1890), ([6,7,8,9],1900)]
pad_sequence(x)

([[1, 2, 3, '<pad>'], [4, 5, '<pad>', '<pad>'], [6, 7, 8, 9]],
 (1750, 1890, 1900))

In [None]:
class NERClassifier(nn.Module):

    def __init__(self, word_embeddings,  # Pre-trained word embeddings
                 char_to_id,             # Mapping from chars to ids
                 word_to_id,             # Mapping from words to ids
                 char_emb_size=16,
                 char_hidden_size=25,    # Hidden size of the character-level biRNN
                 word_hidden_size=100,   # Hidden size of the word-level biRNN
                 class_size=nb_decades,
                 padding_word=PADDING_WORD,
                 unknown_word=UNKNOWN,
                 char_bidirectional=True,
                 word_bidirectional=True,
                 device='cpu'
            ):

        super(NERClassifier, self).__init__()
        self.padding_word = padding_word
        self.unknown_word = unknown_word
        self.char_to_id = char_to_id
        self.word_to_id = word_to_id
        self.char_emb_size = char_emb_size
        self.char_hidden_size = char_hidden_size
        self.word_hidden_size = word_hidden_size
        self.class_size = class_size
        self.char_bidirectional = char_bidirectional
        self.word_bidirectional = word_bidirectional

        # Create an embedding tensor for the words and import the Glove
        # embeddings. The embeddings are frozen (i.e., they will not be
        # updated during training).
        vocabulary_size = len(word_embeddings)
        self.word_emb_size = len(word_embeddings[0])
        
        self.word_emb = nn.Embedding(vocabulary_size, self.word_emb_size)
        self.word_emb.weight = nn.Parameter(torch.tensor(word_embeddings, dtype=torch.float), 
                                            requires_grad=False)

        # Create an embedding tensor for character embeddings. These embeddings
        # are learnt from scratch (i.e., they are not frozen).
        if self.char_emb_size > 0:
            self.char_emb = nn.Embedding(len(char_to_id), char_emb_size)
            self.char_birnn = nn.GRU(
                self.char_emb_size, 
                self.char_hidden_size, 
                bidirectional=char_bidirectional,
                batch_first=True
            )
        else:
            self.char_hidden_size = 0

        multiplier = 2 if self.char_bidirectional else 1
        self.word_birnn = nn.GRU(
            self.word_emb_size + multiplier * self.char_hidden_size, # input size
            self.word_hidden_size,
            bidirectional=word_bidirectional,
            batch_first=True
        )

        # Binary classification - 0 if not part of the name, 1 if a name
        multiplier = 2 if self.word_bidirectional else 1
        self.final_pred = nn.Linear(multiplier * self.word_hidden_size, self.class_size)


    def forward(self, x):
        """
        Performs a forward pass of a NER classifier
        Takes as input a 2D list `x` of dimensionality (B, T),
        where B is the batch size;
              T is the max sentence length in the batch (shorter sentences
              are already padded with the special token <PAD>)
              
        Returns logits, i.e. the output of the last linear layer before applying softmax.

        :param      x:    A batch of sentences
        :type       x:    list of strings
        """

        # First find all word IDs of all words in all sentences in the batch
        # and the character IDs of all characters in all words in all sentences
        word_ids = [[word_to_id[word] if word in word_to_id.keys() else 1 for word in sentence.split()] for sentence in x]
        char_ids = [[[char_to_id[char] if char in char_to_id.keys() else 0 for char in word] for word in sentence.split()] for sentence in x]
        max_word_len = max(len(word) for sentence in char_ids for word in sentence)
        char_ids = [[[word[i] if i < len(word) else self.unknown_word for i in range(max_word_len)] for word in sentence] for sentence in char_ids]
        
        # The 'to(self.device)' below is necessary for making sure that 
        # the model and the data are on the same device (CPU or CUDA).
        word_tensor = torch.tensor(word_ids).to(self.device)  
        char_tensor = torch.tensor(char_ids).to(self.device)
        
        # YOUR CODE HERE
        
        #Dataset parameters
        batch_size = x.shape[0]
        len_sentence = x.shape[1]
        
        #Embedding layer
        E_w = self.word_emb(word_tensor)
        E_c = self.char_emb(char_tensor)
        E_c = E_c.reshape(batch_size*len_sentence,max_word_len,self.char_emb_size)
        
        #RNN
        _,hidden_char = self.char_birnn(E_c)
        H_c = torch.cat([hidden_char[0],hidden_char[1]],dim=1)
        H_c = H_c.reshape(batch_size,len_sentence,2*self.char_hidden_size)
        
        E_final = torch.cat([E_w,H_c],dim=2)
        outputs,_ = self.word_birnn(E_final)
        output = torch.cat([outputs[0],outputs[1]],dim = 1)
        
        final_pred = self.final_pred(output)
        
        return final_pred
        #return torch.zeros((len(x), len(x[0]), 2), requires_grad=True)

In [None]:
# ================== Hyper-parameters ==================== #

learning_rate = 0.001
epochs = 5

# ======================= Training ======================= #

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print( "Running on", device )

dim, word_to_id, embeddings = load_glove_embeddings('glove.6B.50d.txt')
training_set = NERDataset('ner_training.csv', word_to_id)
training_loader = DataLoader(training_set, batch_size=128, collate_fn=pad_sequence)

ner = NERClassifier(embeddings, char_to_id, word_to_id, device=device).to(device)