In [1]:
import sys
sys.path.append("src")

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from collections import Counter, defaultdict

In [3]:
data_path = "data/conll/train.txt"

In [4]:
with open(data_path, "r") as f:
    raw_data = f.readlines()

In [5]:
print(type(raw_data))
print(len(raw_data))

<class 'list'>
219554


In [6]:
raw_data[:20]

['-DOCSTART- -X- -X- O\n',
 '\n',
 'EU NNP B-NP B-ORG\n',
 'rejects VBZ B-VP O\n',
 'German JJ B-NP B-MISC\n',
 'call NN I-NP O\n',
 'to TO B-VP O\n',
 'boycott VB I-VP O\n',
 'British JJ B-NP B-MISC\n',
 'lamb NN I-NP O\n',
 '. . O O\n',
 '\n',
 'Peter NNP B-NP B-PER\n',
 'Blackburn NNP I-NP I-PER\n',
 '\n',
 'BRUSSELS NNP B-NP B-LOC\n',
 '1996-08-22 CD I-NP O\n',
 '\n',
 'The DT B-NP O\n',
 'European NNP I-NP B-ORG\n']

In [7]:
data = []
data_dict = None
idx = 0

for line in raw_data:
    if line.startswith("-DOCSTART-"):
        continue
    
    if line.strip() == "":
        if data_dict:
            data.append(data_dict)
        data_dict = None
    else:
        if data_dict is None:
            data_dict = {
                "id": idx,
                "tokens": [],
                "ner_outputs": []
            }
            idx += 1
        l = line.split()
        data_dict["tokens"].append(f"{l[0]}_{l[1]}")
        data_dict["ner_outputs"].append(l[3])

In [8]:
data[:3]

[{'id': 0,
  'tokens': ['EU_NNP',
   'rejects_VBZ',
   'German_JJ',
   'call_NN',
   'to_TO',
   'boycott_VB',
   'British_JJ',
   'lamb_NN',
   '._.'],
  'ner_outputs': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']},
 {'id': 1,
  'tokens': ['Peter_NNP', 'Blackburn_NNP'],
  'ner_outputs': ['B-PER', 'I-PER']},
 {'id': 2,
  'tokens': ['BRUSSELS_NNP', '1996-08-22_CD'],
  'ner_outputs': ['B-LOC', 'O']}]

# build vocab

In [9]:
sentences = []

for i in data:
    tokens = i["tokens"]
    sentences.extend(tokens)

print(len(sentences))
print(len(set(sentences)))

203621
26774


In [10]:
ner_outputs = []

for i in data:
    ner = i["ner_outputs"]
    ner_outputs.extend(ner)

print(len(ner_outputs))
print(len(set(ner_outputs)))
print(set(ner_outputs))

203621
9
{'B-ORG', 'B-PER', 'B-MISC', 'I-MISC', 'O', 'I-LOC', 'I-PER', 'I-ORG', 'B-LOC'}


In [11]:
token_counter  = Counter(token for token in sentences)

In [12]:
sorted_tokens = sorted(token_counter.keys(), key=lambda x: (-token_counter[x], x))

In [13]:
vocab = {token: idx for idx, token in enumerate(sorted_tokens)}

In [14]:
def build_vocab(sentence, specials=None, special_first=True):

    token_counter = Counter(token for token in sentence)
    sorted_tokens = sorted(token_counter.keys(), key=lambda x: (-token_counter[x], x))

    if specials:
        if special_first:
            sorted_tokens = specials + sorted_tokens
        else:
            sorted_tokens = sorted_tokens + specials

    vocab = {token: idx for idx, token in enumerate(sorted_tokens)}

    return vocab

In [15]:
vocab = build_vocab(sentences, specials=["<unk>", "<pad>"])

In [16]:
ner_outputs = build_vocab(ner_outputs)

In [17]:
vocab

{'<unk>': 0,
 '<pad>': 1,
 '._.': 2,
 ',_,': 3,
 'the_DT': 4,
 'of_IN': 5,
 'to_TO': 6,
 'in_IN': 7,
 'a_DT': 8,
 '(_(': 9,
 ')_)': 10,
 'and_CC': 11,
 '"_"': 12,
 'on_IN': 13,
 'said_VBD': 14,
 'for_IN': 15,
 '1_CD': 16,
 "'s_POS": 17,
 '-_:': 18,
 'The_DT': 19,
 'was_VBD': 20,
 '2_CD': 21,
 '0_CD': 22,
 '3_CD': 23,
 'at_IN': 24,
 'with_IN': 25,
 'from_IN': 26,
 'by_IN': 27,
 ':_:': 28,
 'is_VBZ': 29,
 'he_PRP': 30,
 '4_CD': 31,
 'has_VBZ': 32,
 'had_VBD': 33,
 'it_PRP': 34,
 'as_IN': 35,
 'his_PRP$': 36,
 'not_RB': 37,
 'were_VBD': 38,
 'be_VB': 39,
 'that_IN': 40,
 'an_DT': 41,
 'after_IN': 42,
 'who_WP': 43,
 'will_MD': 44,
 '5_CD': 45,
 'but_CC': 46,
 'U.S._NNP': 47,
 'been_VBN': 48,
 '$_$': 49,
 '--_:': 50,
 'first_JJ': 51,
 'two_CD': 52,
 'are_VBP': 53,
 'their_PRP$': 54,
 'have_VBP': 55,
 '6_CD': 56,
 'which_WDT': 57,
 'would_MD': 58,
 'I_PRP': 59,
 'its_PRP$': 60,
 'they_PRP': 61,
 'percent_NN': 62,
 'beat_VB': 63,
 'year_NN': 64,
 'Thursday_NNP': 65,
 'this_DT': 66,
 'million

In [18]:
ner_outputs

{'O': 0,
 'B-LOC': 1,
 'B-PER': 2,
 'B-ORG': 3,
 'I-PER': 4,
 'I-ORG': 5,
 'B-MISC': 6,
 'I-LOC': 7,
 'I-MISC': 8}

In [19]:
def encode(tokens, vocab):
    return [vocab[token] for token in tokens]

In [20]:
for i in range(len(data)):
    list_token = encode(data[i]["tokens"], vocab)
    list_output = encode(data[i]["ner_outputs"], ner_outputs)
    data[i]["tokens"] = list_token
    data[i]["ner_outputs"] = list_output

In [21]:
data

[{'id': 0,
  'tokens': [1062, 25226, 264, 1488, 6, 4637, 289, 7953, 2],
  'ner_outputs': [3, 0, 6, 0, 0, 0, 6, 0, 0]},
 {'id': 1, 'tokens': [693, 4270], 'ner_outputs': [2, 4]},
 {'id': 2, 'tokens': [1341, 131], 'ner_outputs': [1, 0]},
 {'id': 3,
  'tokens': [19,
   464,
   449,
   14,
   13,
   65,
   34,
   11429,
   25,
   264,
   4593,
   6,
   2584,
   6,
   25689,
   289,
   7953,
   389,
   4121,
   4688,
   475,
   1856,
   1965,
   613,
   278,
   39,
   12627,
   6,
   2014,
   2],
  'ner_outputs': [0,
   3,
   5,
   0,
   0,
   0,
   0,
   0,
   0,
   6,
   0,
   0,
   0,
   0,
   0,
   6,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]},
 {'id': 4,
  'tokens': [112,
   17,
   4109,
   6,
   4,
   464,
   452,
   17,
   3086,
   1021,
   10951,
   11013,
   14,
   13,
   69,
   2584,
   243,
   1018,
   25644,
   26,
   503,
   121,
   110,
   120,
   389,
   4,
   2690,
   4593,
   20,
   22039,
   2],
  'ner_outputs': [1,
   0,
   0,
   0

# From SRC

In [22]:
from ner.data import NERDataset

In [23]:
data = NERDataset(data_path="data/conll/train.txt")

In [24]:
data.data

[{'id': 0,
  'tokens': ['EU_NNP',
   'rejects_VBZ',
   'German_JJ',
   'call_NN',
   'to_TO',
   'boycott_VB',
   'British_JJ',
   'lamb_NN',
   '._.'],
  'ner_outputs': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']},
 {'id': 1,
  'tokens': ['Peter_NNP', 'Blackburn_NNP'],
  'ner_outputs': ['B-PER', 'I-PER']},
 {'id': 2,
  'tokens': ['BRUSSELS_NNP', '1996-08-22_CD'],
  'ner_outputs': ['B-LOC', 'O']},
 {'id': 3,
  'tokens': ['The_DT',
   'European_NNP',
   'Commission_NNP',
   'said_VBD',
   'on_IN',
   'Thursday_NNP',
   'it_PRP',
   'disagreed_VBD',
   'with_IN',
   'German_JJ',
   'advice_NN',
   'to_TO',
   'consumers_NNS',
   'to_TO',
   'shun_VB',
   'British_JJ',
   'lamb_NN',
   'until_IN',
   'scientists_NNS',
   'determine_VBP',
   'whether_IN',
   'mad_JJ',
   'cow_NN',
   'disease_NN',
   'can_MD',
   'be_VB',
   'transmitted_VBN',
   'to_TO',
   'sheep_NN',
   '._.'],
  'ner_outputs': ['O',
   'B-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',

In [25]:
data[0]

(tensor([ 1062, 25226,   264,  1488,     6,  4637,   289,  7953,     2]),
 tensor([5, 2, 8, 2, 2, 2, 8, 2, 2]))

In [26]:
data[1]

(tensor([ 693, 4270]), tensor([4, 6]))

In [27]:
loader = DataLoader(
    dataset=data,
    batch_size=32,
    shuffle=True,
    num_workers=11,
    collate_fn=pad_sequence
)

In [28]:
next(iter(loader))

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/nurgoni/anaconda3/envs/ner/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/nurgoni/anaconda3/envs/ner/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/nurgoni/anaconda3/envs/ner/lib/python3.8/site-packages/torch/nn/utils/rnn.py", line 399, in pad_sequence
    return torch._C._nn.pad_sequence(sequences, batch_first, padding_value)
TypeError: expected Tensor as element 0 in argument 0, but got tuple
