In [1]:
from conllu import parse_incr
for sent in parse_incr(open("sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    print(" ".join(tok["upos"] for tok in sent))

for sent in parse_incr(open("sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    print(" ".join(tok["form"] for tok in sent))


DET NOUN AUX VERB DET NOUN CCONJ ADP DET NOUN ADP DET NOUN PUNCT VERB DET NOUN ADP DET NOUN ADP PUNCT NUM NOUN ADP NOUN ADJ ADP DET NOUN ADP PROPN PUNCT PUNCT
NOUN ADJ DET NOUN ADP DET NOUN ADP NUM NOUN NUM ADP NUM NOUN PUNCT
ADV ADP DET NOUN PUNCT PRON VERB DET NOUN VERB ADP DET ADJ NOUN PUNCT
DET NOUN ADP DET NOUN VERB DET NOUN ADP PRON VERB DET NOUN PUNCT ADP NUM NOUN PUNCT NOUN PROPN PUNCT
NOUN PROPN VERB NUM NOUN PUNCT
PRON AUX DET NOUN ADP DET ADJ PROPN PROPN PUNCT ADV ADJ ADP DET NOUN PUNCT NOUN ADP DET NOUN ADP ADP NUM PUNCT NOUN ADP PRON PRON AUX VERB ADP VERB DET NOUN ADP PROPN ADP VERB VERB DET NOUN ADJ ADP PROPN PUNCT
ADP VERB DET NOUN PUNCT PROPN PROPN PUNCT NOUN ADP DET NOUN ADP NOUN ADP PROPN PUNCT
DET ADJ NOUN PUNCT ADP DET NOUN ADP PRON PUNCT DET NOUN PROPN PUNCT AUX VERB ADP DET NOUN DET NOUN ADP NOUN PUNCT DET NOUN ADP DET NOUN PUNCT
DET ADJ NOUN AUX VERB SCONJ PRON PRON VERB CCONJ ADV ADP DET NUM NOUN PUNCT NOUN NUM NOUN PUNCT ADP DET NOUN ADP DET NOUN ADJ ADJ ADP D

In [2]:
import torch.nn as nn
import torch

class BOWClassifier(nn.Module):
    def __init__(self, d_embed, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(d_embed, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        averaged = torch.mean(embedded, dim=1) # dim 0 is batch
        return self.decision(self.dropout(averaged))


In [10]:
from collections import defaultdict
import torch 

def pad_tensor(X, max_len):
    res = torch.full((len(X), max_len), 0) # padding
    for (i, row) in enumerate(X) :
        x_len = min(max_len, len(X[i]))
        res[i,:x_len] = torch.LongTensor(X[i][:x_len])
    return res

wordvocab = defaultdict(lambda : len(wordvocab))
wordvocab["<PAD>"]; wordvocab["<UNK>"] # Special token IDs

sentences = []
for sent in parse_incr(open("sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    sentences.append([wordvocab[tok["form"]] for tok in sent])

max_len = max(len(s) for s in sentences)
max_len = 10
padded_sentences = pad_tensor(sentences, max_len)

print(padded_sentences)


uposvocab = defaultdict(lambda: len(uposvocab))
uposvocab["<PAD>"]; uposvocab["<UNK>"]  # Special token IDs

upos_sentences = []
for sent in parse_incr(open("sequoia-ud.parseme.frsemcor.simple.small", encoding='UTF-8')):
    upos_sentences.append([uposvocab[tok["upos"]] for tok in sent])

padded_upos_sentences = pad_tensor(upos_sentences, max_len)

print(padded_upos_sentences)


tensor([[  2,   3,   4,   5,   6,   7,   8,   9,  10,  11],
        [ 32,  33,   6,  34,  35,  20,  36,  12,  37,  38],
        [ 42,  40,  20,  43,  15,  44,  45,  10,  46,  47],
        [ 52,  53,  12,  20,  54,  55,  10,  56,  12,  57],
        [ 64,  65,  66,  67,  68,  31,   0,   0,   0,   0],
        [ 69,  70,  20,  71,  12,  72,  73,  74,  75,  15],
        [ 98,  99, 100, 101,  15, 102, 103,  15, 104,  12],
        [ 52, 108, 109,  15,  40,  20, 110,  12, 111,  15],
        [124, 125, 126, 127, 128, 129,  44, 130, 131,   8],
        [157, 158, 127, 159,  40, 160, 161,  28, 162,  15],
        [169, 170, 171, 172, 173,  40, 174, 175, 176,  15],
        [124, 193,  12,  20, 194, 195,  10, 196,   9,  20],
        [203,   6, 204,  15,  44, 171, 205, 206, 207, 208],
        [ 22, 214,  15, 185,  12, 215, 216,  22,   0,   0],
        [217, 218, 219, 220,  76, 174, 221,  15, 173, 119],
        [230, 161,  20, 231,  12,  20, 232,   2, 233,   8],
        [ 22, 244, 171, 245, 173,  94, 2

In [52]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset





tds = TensorDataset(padded_sentences, padded_upos_sentences)
dataloader = DataLoader(tds, batch_size=2, shuffle=True)

class GRUClassifier(nn.Module):
    def __init__(self, d_embed, d_hidden, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.gru = nn.GRU(d_embed, d_hidden, batch_first=True, bias=False)
        self.dropout = nn.Dropout(0.1)
        self.decision = nn.Linear(d_hidden, d_out)
    
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        hidden, _ = self.gru(embedded)
        hidden = hidden.contiguous().view(-1, hidden.size(-1))  # Flatten hidden states
        return self.decision(self.dropout(hidden))


res = GRUClassifier(10, 20, len(wordvocab), len(uposvocab))(padded_sentences)
print(res.shape)
print(res)

torch.Size([300, 15])
tensor([[-0.1445,  0.1108, -0.2580,  ..., -0.1363, -0.2051, -0.0924],
        [-0.0880,  0.1836, -0.4074,  ..., -0.0267, -0.3332,  0.1166],
        [-0.0565,  0.0232, -0.3631,  ..., -0.0447, -0.2186,  0.1038],
        ...,
        [-0.1693,  0.0580,  0.0689,  ..., -0.1117, -0.0036,  0.0155],
        [-0.0692, -0.1959, -0.0450,  ..., -0.0759, -0.0883,  0.0386],
        [-0.1118, -0.2656, -0.0656,  ..., -0.0065, -0.0535, -0.1043]],
       grad_fn=<AddmmBackward0>)


In [53]:
def fit(model, dataloader, epochs=10):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    for epoch in range(epochs):
        model.train()
        for words, upos in dataloader:
            optimizer.zero_grad()
            output = model(words)
            output = output.view(-1, output.size(-1))  # Reshape output to (batch_size * sequence_length, num_classes)
            upos = upos.view(-1)  # Flatten target tensor to (batch_size * sequence_length)
            loss = loss_fn(output, upos)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

fit(GRUClassifier(10, 2, len(wordvocab), len(uposvocab)), dataloader)

NameError: name 'optim' is not defined