In [None]:
import torch
import time
import copy
import numpy as np
from datetime import date
from nltk import word_tokenize
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [None]:
#Uncomment if running on Google Colab
#!pip install pyconll
import pyconll

***
Helper functions
***

In [None]:
def vectorize(tokenized_sentences, token_vocab, label_vocab, max_sentence_len, 
              max_token_len, tok_format=None):
    """Turn word-tokenized sentences into a zero-padded tensor of character ids.

    Args:
        tokenized_sentences: sequence of sentences, each a sequence of tokens.
            Tokens are plain strings, or (when ``tok_format == 'pyconll'``)
            objects exposing ``.form`` and ``.upos``.
        token_vocab: mapping character -> integer id; characters missing from
            the mapping get id 0.
        label_vocab: mapping tag -> integer id, must contain the fallback key
            ``'X'``. Only read when ``tok_format == 'pyconll'``.
        max_sentence_len: tokens beyond this position are ignored.
        max_token_len: characters beyond this position are ignored.
        tok_format: ``'pyconll'`` to also build a labels tensor, anything else
            for plain string tokens.

    Returns:
        ``data_tensor`` of shape (num_sentences, max_sentence_len, max_token_len + 2),
        dtype long. With ``tok_format == 'pyconll'`` a tuple
        ``(data_tensor, labels_tensor)`` where ``labels_tensor`` has shape
        (num_sentences, max_sentence_len).
    """
    with_labels = tok_format == 'pyconll'
    #One extra (zero) column before and after the characters marks the beginning
    #and the end of the token
    data_tensor = torch.zeros((len(tokenized_sentences), max_sentence_len, max_token_len+2), 
                              dtype=torch.long)
    labels_tensor = None
    if with_labels:
        labels_tensor = torch.zeros((len(tokenized_sentences), max_sentence_len), dtype=torch.long)

    for i, sent in enumerate(tokenized_sentences):
        for j, tok in enumerate(sent[:max_sentence_len]):
            #Keep the token object intact: its .upos is still needed below
            form = tok.form if with_labels else tok
            for k, ch in enumerate(form.lower()[:max_token_len]):
                data_tensor[i, j, k+1] = token_vocab.get(ch, 0)
            if with_labels:
                upos = tok.upos
                #Tags that are not in the vocabulary (including None) fall back to 'X'
                labels_tensor[i, j] = label_vocab[upos] if upos in label_vocab else label_vocab['X']
    if with_labels:
        return data_tensor, labels_tensor
    return data_tensor

def train_model(model, 
                train_dataset, 
                test_dataset, 
                loss_fun=nn.functional.cross_entropy,
                lr=5e-3,
                num_epoch=10,
                batch_sz=64, 
                dev='cuda'):
    """Train ``model`` with Adam and keep the snapshot with the lowest validation loss.

    Args:
        model: torch module to train (moved to the selected device in place).
        train_dataset: dataset yielding ``(samples, labels)`` used for training.
        test_dataset: dataset yielding ``(samples, labels)`` used for validation.
        loss_fun: callable ``loss_fun(pred, labels)``. The per-batch values are
            averaged weighted by batch size, which assumes a mean-reduced loss.
        lr: initial learning rate.
        num_epoch: number of passes over ``train_dataset``.
        batch_sz: batch size for both loaders.
        dev: device to use when CUDA is available; otherwise 'cpu' is used.

    Returns:
        ``(best_loss, best_model)``: the lowest mean validation loss seen and a
        deep copy of the model taken at that epoch.

    Raises:
        ValueError: if ``test_dataset`` is empty.
    """
    if len(test_dataset) == 0:
        raise ValueError("test_dataset is empty, cannot compute a validation loss")

    device = dev if torch.cuda.is_available() else 'cpu'
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    #The `verbose` flag of the scheduler is deprecated; learning rate changes are
    #reported explicitly after each scheduler step instead
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt)
    #Incomplete last training batch is dropped, so a train set smaller than
    #batch_sz yields no training steps
    data_loader_train = torch.utils.data.DataLoader(train_dataset, 
                                              batch_size=batch_sz, shuffle=True, drop_last=True)
    #Validation must see every sample in every epoch, otherwise the losses of
    #different epochs are not comparable
    data_loader_val = torch.utils.data.DataLoader(test_dataset, 
                                                 batch_size=batch_sz, shuffle=False, drop_last=False)
    model.to(device)
    best_loss = float('inf')
    best_model = None
    for epoch in range(num_epoch):
        st = time.perf_counter()
        model.train()
        for samples, labels in data_loader_train:
            samples, labels = samples.to(device), labels.to(device)
            pred = model.forward(samples)
            loss_val = loss_fun(pred, labels)
            model.zero_grad()
            loss_val.backward()
            opt.step()
        
        model.eval()
        with torch.no_grad():
            total_loss = 0.0
            num_samples = 0
            for samples, labels in data_loader_val:
                samples, labels = samples.to(device), labels.to(device)
                pred = model.forward(samples)
                total_loss += float(loss_fun(pred, labels)) * len(samples)
                num_samples += len(samples)
        mean_loss = total_loss/num_samples
        print(f"Epoch {epoch} loss {mean_loss}, time {time.perf_counter()-st}.")
        if mean_loss < best_loss:
            #Remember the loss as well, otherwise every epoch looks like an improvement
            best_loss = mean_loss
            best_model = copy.deepcopy(model)

        lrs_before = [group['lr'] for group in opt.param_groups]
        scheduler.step(mean_loss)
        for group_idx, (old_lr, group) in enumerate(zip(lrs_before, opt.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch}: learning rate of group {group_idx} reduced to {group['lr']:.4e}.")
    return best_loss, best_model
    
def predict(model, dataset, predict_limit=None, batch_sz=64):
    """Run ``model`` over ``dataset`` and collect labels and raw model outputs.

    Samples are processed in dataset order and none are dropped, so row ``i``
    of both returned arrays corresponds to ``dataset[i]``.

    Args:
        model: torch module; moved to CUDA when available and put in eval mode.
        dataset: dataset yielding ``(samples, labels)`` pairs.
        predict_limit: if set, stop after the first batch that brings the number
            of processed samples to at least this value.
        batch_sz: batch size; capped at ``len(dataset)``.

    Returns:
        ``(labels, predictions)`` as numpy arrays concatenated over batches.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("dataset is empty, nothing to predict")
    #Progress bar is a convenience only; work without it when tqdm is not installed
    try:
        from tqdm import tqdm as progress_bar
    except ImportError:
        progress_bar = None

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    batch_sz = max(1, min(batch_sz, len(dataset)))
    
    #No shuffling and no dropped batches at inference time: callers rely on
    #getting one prediction per sample, in the original order
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_sz, shuffle=False, drop_last=False)
    batches = data_loader
    if progress_bar is not None:
        batches = progress_bar(data_loader, total=len(data_loader))
    model.to(device)
    model.eval() 
    out_labels = []
    predicted = []
    with torch.no_grad():
        num_predict = 0
        for samples, labels in batches:
            out_labels.append(labels.cpu().numpy())
            samples = samples.to(device)
            pred = model.forward(samples)
            predicted.append(pred.detach().cpu().numpy())
            num_predict += len(samples)
            if predict_limit and num_predict >= predict_limit:
                break
    return np.concatenate(out_labels), np.concatenate(predicted)    

***
<font size=5>
Get the train and test data. Tokenize it into character tokens. Vectorize it.
</font>

***

In [None]:
#Optional one-off download of the corpus. The snippet is wrapped in a string literal,
#so nothing here is executed; remove the surrounding quotes to run it.
#NOTE(review): the file saved as '...-test.conllu' is fetched from the 'train-b' URL,
#i.e. it looks like a second part of the training split - confirm this is intended.
'''import wget
out = './data/ru_syntagrus-ud-train.conllu'
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-a.conllu"
wget.download(url, out)
out = './data/ru_syntagrus-ud-test.conllu'
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train-b.conllu"
wget.download(url, out)'''

In [None]:
#Directory with the .conllu corpora; later cells append file names directly,
#so the trailing slash is required
data_dir = './data/'
#Directory with the saved model weights (.pth files); trailing slash required as well
model_dir = './models/'
#If run on Google Colab uncomment
#from google.colab import drive
#drive.mount('/content/drive')
#data_dir = '/content/drive/MyDrive/nlp_data/'

In [None]:
#Parse both CoNLL-U files (train first, then test) from data_dir
train_data, test_data = (
    pyconll.load_from_file(data_dir + corpus_file)
    for corpus_file in ('ru_syntagrus-ud-train.conllu', 'ru_syntagrus-ud-test.conllu')
)

In [None]:
#Show the word forms of the second training sentence as a sanity check
sample_sentence = train_data[1]
print(' '.join(tok.form for tok in sample_sentence))

In [None]:
#Tensor dimensions are taken from the longest sentence / longest word form of the train set
MAX_SENT_LEN = max(map(len, train_data))
MAX_TOKEN_LEN = max(len(tok.form) for sent in train_data for tok in sent)
print(f"The longest sentence has {MAX_SENT_LEN} tokens")
print(f"The longest token has {MAX_TOKEN_LEN} characters")

In [None]:
def _sentence_text(sent):
    """Join the word forms of a sentence with single spaces and lowercase the result."""
    return ' '.join(tok.form for tok in sent).lower()

train_texts = list(map(_sentence_text, train_data))
test_texts = list(map(_sentence_text, test_data))

In [None]:
#Build a character-level vocabulary from the (already lowercased) training texts
vect = CountVectorizer(lowercase=False, analyzer='char')
vect.fit(train_texts)
#The padding symbol must have id 0: give '<PAD>' id 0 and move the character
#that used to own id 0 to a new id at the end of the vocabulary
last_keyval = len(vect.vocabulary_)
zerok = next(ch for ch, idx in vect.vocabulary_.items() if idx == 0)
vect.vocabulary_.update({'<PAD>': 0, zerok: last_keyval})
print(f"Vocabulary has {len(vect.vocabulary_)} unique tokens")

In [None]:
#Collect every UPOS tag that occurs in the training data
TAGS = sorted({token.upos for sent in train_data for token in sent if token.upos})
#Put the "tag unknown" ('X') first so that it has id 0, the value the label tensors
#are padded with. 'X' is placed explicitly instead of assuming it sorts last, so this
#also works when 'X' is missing from the data or another tag sorts after it.
TAGS = ['X'] + [tag for tag in TAGS if tag != 'X']
label2id = {label: idx for idx, label in enumerate(TAGS)}
print(f"There are total of {len(TAGS)} unique tags")

In [None]:
#Both corpora are vectorized with exactly the same vocabularies and dimensions
vectorize_kwargs = dict(token_vocab=vect.vocabulary_,
                        label_vocab=label2id,
                        max_sentence_len=MAX_SENT_LEN,
                        max_token_len=MAX_TOKEN_LEN,
                        tok_format='pyconll')
train_tensor, train_labels_tensor = vectorize(train_data, **vectorize_kwargs)
test_tensor, test_labels_tensor = vectorize(test_data, **vectorize_kwargs)

In [None]:
#Wrap (samples, labels) pairs into datasets so they can be fed to the model in batches
train_dataset, test_dataset = (
    torch.utils.data.TensorDataset(samples, sample_labels)
    for samples, sample_labels in ((train_tensor, train_labels_tensor),
                                   (test_tensor, test_labels_tensor))
)

***
<font size=5>
    Model architecture
</font>

***


In [None]:
#Residual stack of 1D convolution blocks
class StackedConv1d(torch.nn.Module):
    """Stack of ``num_layers`` blocks, each Conv1d -> Dropout -> LeakyReLU.

    Every block keeps the number of channels (``num_features``) and its output
    is added to its input (residual connection).
    """
    def __init__(self, num_features, num_layers=1, kernel_size=3, dropout_probab=0.5):
        super().__init__()
        half_kernel = kernel_size // 2
        blocks = [
            nn.Sequential(
                nn.Conv1d(num_features, num_features, kernel_size, padding=half_kernel),
                nn.Dropout(dropout_probab),
                nn.LeakyReLU(),
            )
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply the blocks one after another, adding each block's output to its input."""
        out = x
        for block in self.layers:
            out = out + block(out)
        return out

#Context-free POS tagger: every token is classified from its own characters only
class TokenPOSTaggerNet(nn.Module):
    """Character-level tagger that ignores the sentence context.

    Input:  tensor of character ids, BatchSize x SentLen x TokenLen.
    Output: scores of shape BatchSize x NumLabels x SentLen.
    Extra keyword arguments are passed on to StackedConv1d.
    """
    def __init__(self, vocab_size, num_labels, emb_size=32, **kwargs):
        super().__init__()
        self.char_embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.backbone = StackedConv1d(emb_size, **kwargs)
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        self.out = nn.Linear(emb_size, num_labels)
        self.num_labels = num_labels
    
    def forward(self, tokens):
        n_sent, n_tok, n_char = tokens.shape
        #All tokens of the batch become independent samples: (BatchSize * SentLen) x TokenLen
        chars = tokens.view(n_sent * n_tok, n_char)

        #Embed the characters and move the embedding axis to position 1, where the
        #convolutions expect the features: (BatchSize * SentLen) x EmbSize x TokenLen
        char_emb = self.char_embeddings(chars).permute(0, 2, 1)
        char_feats = self.backbone(char_emb)

        #Max over the characters gives one vector per token: (BatchSize * SentLen) x EmbSize
        tok_feats = self.global_pooling(char_feats).squeeze(-1)

        #Score the labels and fold the tokens back into sentences
        scores = self.out(tok_feats).view(n_sent, n_tok, self.num_labels)
        #The label axis goes to position 1, the layout expected by the loss function
        return scores.permute(0, 2, 1)
    
#POS tagger net that predicts POS of tokens considering context
class ContextPOSTaggerNet(nn.Module):
    """Character-level tagger that also looks at the neighbouring tokens.

    A first convolution stack runs over the characters of each token, max pooling
    turns the result into one vector per token, and a second convolution stack
    runs over the tokens of each sentence.

    Input:  tensor of character ids, BatchSize x SentLen x TokenLen.
    Output: scores of shape BatchSize x NumLabels x SentLen.
    Extra keyword arguments are passed on to both StackedConv1d stacks.
    """
    def __init__(self, vocab_size, num_labels, emb_size=32, **kwargs):
        super().__init__()
        self.char_embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.token_backbone = StackedConv1d(emb_size, **kwargs)
        self.global_pooling = nn.AdaptiveMaxPool1d(1)
        self.context_backbone = StackedConv1d(emb_size, **kwargs)
        self.out = nn.Linear(emb_size, num_labels)
        self.num_labels = num_labels
        self.emb_size = emb_size
    
    def forward(self, tokens):
        batch_sz, sent_len, token_len = tokens.shape
        #Collapse it into a 2D matrix (BatchSize * SentLen) x TokenLen so that it can
        #be fed into the embeddings
        flat_view = tokens.reshape(batch_sz*sent_len, token_len)
        
        #Initial char embeddings: (BatchSize * SentLen) x TokenLen x EmbSize
        emb = self.char_embeddings(flat_view)
        #Convolutions expect NUM_SAMPLES x NUM_FEATURES x ...(other dimensions)...,
        #hence change the order to (BatchSize * SentLen) x EmbSize x TokenLen
        emb = emb.permute(0, 2, 1)
        
        #Pass it through the character-level convolution layers
        features = self.token_backbone(emb) 
        #Max pooling turns the character features of a token into a token embedding
        features = self.global_pooling(features).squeeze(-1) #(BatchSize * SentLen) x EmbSize
        
        #Get context features
        features = features.reshape(batch_sz, sent_len, self.emb_size).permute(0, 2, 1)
        features = self.context_backbone(features) # BatchSize x EmbSize x SentLen
        #reshape (not view): after a permute the memory layout is not guaranteed to be
        #compatible with view, while reshape copies the data when it has to
        features = features.permute(0, 2, 1).reshape(batch_sz*sent_len, self.emb_size)
        #predict token labels
        pred = self.out(features) # (BatchSize * SentLen) x NumLabels 
        #Reshape it back into sentences and move the label axis to position 1,
        #the layout expected by the loss function
        pred = pred.reshape(batch_sz, sent_len, self.num_labels).permute(0, 2, 1)

        return pred
#Class that tags POS
class POSTagger:
    """Callable wrapper that turns word-tokenized sentences into lists of POS tags.

    Args:
        model: trained tagger network, passed to ``predict``.
        char2id: mapping character -> id used to vectorize the tokens.
        tags: sequence of tag names indexed by label id.
        max_sentence_len: sentence length the model input is padded/cut to.
        max_token_len: token length the model input is padded/cut to.
    """
    def __init__(self, model, char2id, tags, max_sentence_len, max_token_len):
        self.model = model
        self.char2id = char2id
        self.tags = tags
        self.max_sent_len = max_sentence_len
        self.max_tok_len = max_token_len
        
    def __call__(self, tokenized_sentences):
        """Tag every sentence.

        Args:
            tokenized_sentences: list of sentences, each a list of word tokens (str).

        Returns:
            One list of tag names per input sentence, in input order. Sentences
            longer than ``max_sentence_len`` only get tags for their first
            ``max_sentence_len`` tokens.

        Raises:
            RuntimeError: if ``predict`` did not return a prediction for every sentence.
        """
        num_sent = len(tokenized_sentences)
        if num_sent == 0:
            return []
        char_tokenized = vectorize(tokenized_sentences, self.char2id, None, self.max_sent_len, 
                                  self.max_tok_len)
        #The sentence index is passed in place of a label, so that every prediction can
        #be matched with its sentence regardless of the order predict returns them in
        sent_ids = torch.arange(num_sent)
        dataset = torch.utils.data.TensorDataset(char_tokenized, sent_ids)
        returned_ids, pred = predict(self.model, dataset) #pred: num_sent x num_labels x max_sent_len
        pred = pred.argmax(1) #num_sent x max_sent_len
        row_of = {int(sent_id): row for row, sent_id in enumerate(returned_ids)}
        if len(row_of) != num_sent:
            raise RuntimeError(f"predict returned predictions for {len(row_of)} of {num_sent} sentences")
        
        out_tags = []
        for i, sent in enumerate(tokenized_sentences): 
            sent_labels = pred[row_of[i], :len(sent)]
            out_tags.append([self.tags[label] for label in sent_labels])
        return out_tags

***
First train a model that tags separate tokens without considering the sentence context.
***

In [None]:
#Hyper-parameters shared by both taggers; they are also part of the weights file names
emb_sz = 64 #size of the character embeddings (and of every convolution layer)
dropout_p=0.3 #dropout probability inside the convolution blocks

In [None]:
token_tagger_model = TokenPOSTaggerNet(len(vect.vocabulary_), len(label2id), num_layers=3, emb_size=emb_sz, dropout_probab=dropout_p)
#numel() counts the elements of a parameter tensor; np.product was removed in NumPy 2.0
print('Number of params in the model: ', sum(t.numel() for t in token_tagger_model.parameters()))

In [None]:
#loss, token_tagger_model = train_model(token_tagger_model, train_dataset, test_dataset, 
                               #       num_epoch=10)

In [None]:
#The weights file name encodes the hyper-parameters the model was trained with
token_weights_path = model_dir+'token_pos-'+'emb'+str(emb_sz)+'-p'+str(dropout_p)+'.pth'
#To store freshly trained weights: torch.save(token_tagger_model.state_dict(), token_weights_path)
token_tagger_model.load_state_dict(torch.load(token_weights_path, map_location=torch.device('cpu')))

In [None]:
#Evaluate the context-free tagger on the test set: loss plus per-tag precision/recall
labels, predicted_labels = predict(token_tagger_model, test_dataset)
token_loss = nn.functional.cross_entropy(torch.tensor(predicted_labels), torch.tensor(labels))
print(f"Loss on test data {token_loss}")
true_ids = labels.ravel()
predicted_ids = predicted_labels.argmax(axis=1).ravel()
print(classification_report(true_ids, predicted_ids,
                            labels=list(label2id.values()),
                            target_names=list(label2id.keys()),
                            zero_division=0))

***
Now train a model that tags tokens considering the sentence context.
***

In [None]:
context_tagger_model = ContextPOSTaggerNet(len(vect.vocabulary_), len(label2id), num_layers=3, emb_size=emb_sz, dropout_probab=dropout_p)
#numel() counts the elements of a parameter tensor; np.product was removed in NumPy 2.0
print('Number of params in the model: ', sum(t.numel() for t in context_tagger_model.parameters()))

In [None]:
#Train the context-aware tagger; the name is rebound to the model returned by train_model
context_loss, context_tagger_model = train_model(
    model=context_tagger_model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    num_epoch=10,
)

In [None]:
#The weights file name encodes the hyper-parameters the model was trained with
context_weights_path = model_dir+'context_pos-'+'emb'+str(emb_sz)+'-p'+str(dropout_p)+'.pth'
#To store freshly trained weights: torch.save(context_tagger_model.state_dict(), context_weights_path)
context_tagger_model.load_state_dict(torch.load(context_weights_path, map_location=torch.device('cpu')))

In [None]:
#Evaluate the context-aware tagger on the test set: loss plus per-tag precision/recall
labels, predicted_labels = predict(context_tagger_model, test_dataset)
context_loss = nn.functional.cross_entropy(torch.tensor(predicted_labels), torch.tensor(labels))
#Report the loss computed in this cell (context_loss), not the token model's token_loss
print(f"Loss on test data {context_loss}")
print(classification_report(labels.reshape(-1), predicted_labels.argmax(1).reshape(-1), 
                      labels=list(label2id.values()), 
                      target_names = list(label2id.keys()), 
                      zero_division=0))

***
Test the taggers on previously unseen data.
***

In [None]:
#Hand-written Russian sentences for a qualitative check of the taggers; several of them
#repeat the same word form in different roles or use made-up words
test_corpus = [
    'Мама мыла раму.',
    'Косил косой косой косой.',
    'Глокая куздра штеко будланула бокра и куздрячит бокрёнка.',
    'Сяпала Калуша с Калушатами по напушке.',
    'Пирожки поставлены в печь, мама любит печь.',
    'Ведро дало течь, вода стала течь.',
    'Три да три, будет дырка.',
    'Три да три, будет шесть.',
    'Сорок сорок'
]
#Split every sentence into word tokens.
#NOTE(review): word_tokenize is called without a language argument - confirm the
#default tokenizer model is adequate for Russian text
test_corpus_tokens = [word_tokenize(sent) for sent in test_corpus]

In [None]:
#Tag the sample sentences with the context-free model
tokTagger = POSTagger(model=token_tagger_model,
                      char2id=vect.vocabulary_,
                      tags=TAGS,
                      max_sentence_len=MAX_SENT_LEN,
                      max_token_len=MAX_TOKEN_LEN)
tok_tags = tokTagger(test_corpus_tokens)

In [None]:
#One line per sentence: list of (token, tag) pairs
for i, sent in enumerate(test_corpus_tokens):
    print(list(zip(sent, tok_tags[i])))

In [None]:
#Tag the same sentences with the context-aware model
contextTagger = POSTagger(model=context_tagger_model,
                          char2id=vect.vocabulary_,
                          tags=TAGS,
                          max_sentence_len=MAX_SENT_LEN,
                          max_token_len=MAX_TOKEN_LEN)
context_tags = contextTagger(test_corpus_tokens)

In [None]:
#One line per sentence: list of (token, tag) pairs
for i, sent in enumerate(test_corpus_tokens):
    print(list(zip(sent, context_tags[i])))