In [1]:
import json
import nltk
from tqdm import tqdm

from nltk.corpus import semcor

from data_loader import NO_SENSE

In [2]:
# Fetch the SemCor corpus into the local nltk_data directory; the call reports
# "already up-to-date" and returns True when the package is already installed.
nltk.download('semcor')

[nltk_data] Downloading package semcor to
[nltk_data]     /Users/urisherman/nltk_data...
[nltk_data]   Package semcor is already up-to-date!


True

In [3]:
def parse(tagged_sentence):
    """Flatten one sense-tagged sentence into parallel token and label lists.

    Args:
        tagged_sentence: iterable of chunks. A chunk is either a plain list of
            tokens (no sense annotation) or an ``nltk.tree.Tree`` whose label
            carries the sense, as a WordNet ``Lemma`` or as a plain string.

    Returns:
        A ``(sentence, labels)`` tuple of equal-length lists: the tokens in
        order and, for each token, either ``NO_SENSE`` or the sense label of
        the tree it came from (every leaf of a tree gets that tree's label).

    Raises:
        ValueError: if a non-empty tree has a label that is neither a
            ``Lemma`` nor a ``str``.
    """
    sentence = []
    labels = []

    def append(chunk, sense):
        # Keep both lists aligned: one label per token.
        for s in chunk:
            sentence.append(s)
            labels.append(sense)

    for chunk in tagged_sentence:
        # Tree must be tested before list: nltk's Tree is itself a list
        # subclass, so an isinstance check against list would also match it.
        if isinstance(chunk, nltk.tree.Tree):
            labeled_words = chunk.leaves()
            if not labeled_words:
                # An empty tree contributes no tokens; its label is not inspected.
                continue

            label = chunk.label()
            if isinstance(label, nltk.corpus.reader.wordnet.Lemma):
                # Lemma labels (seen with nltk 3.4.5): use the synset name as the sense id.
                str_lbl = label.synset().name()  # + '_' + label.name()
            elif isinstance(label, str):
                str_lbl = label
            else:
                raise ValueError(f'Illegal chunk label {label}')
            append(labeled_words, str_lbl)
        elif isinstance(chunk, list):
            append(chunk, NO_SENSE)
        # Chunks of any other type are skipped.
    return sentence, labels

In [4]:
# Flatten every SemCor sentence into two parallel lists:
# sentences[k] holds the tokens, senses[k] the per-token sense labels.
sentences = []
senses = []

for tagged_sent in tqdm(semcor.tagged_sents(tag='sem')):
    tokens, token_senses = parse(tagged_sent)
    sentences.append(tokens)
    senses.append(token_senses)

100%|██████████| 37176/37176 [00:50<00:00, 729.38it/s] 


In [7]:
import os

import numpy as np

# Fixed seed so the train/test split is the same on every Restart & Run All.
SPLIT_SEED = 0
TRAIN_SIZE = 30000
DATA_DIR = './data'


def write_split(split_name, indices):
    """Write the examples selected by `indices` as two parallel JSONL files.

    Line k of `sentences.<split_name>.jsonl` and line k of
    `senses.<split_name>.jsonl` (both under DATA_DIR) describe the same example.

    Args:
        split_name: tag inserted into the file names, e.g. 'train' or 'test'.
        indices: iterable of positions into the global `sentences` / `senses` lists.
    """
    sent_path = f'{DATA_DIR}/sentences.{split_name}.jsonl'
    sense_path = f'{DATA_DIR}/senses.{split_name}.jsonl'
    with open(sent_path, 'w') as sent_out, open(sense_path, 'w') as sense_out:
        for i in indices:
            json.dump(sentences[i], sent_out)
            sent_out.write('\n')

            json.dump(senses[i], sense_out)
            sense_out.write('\n')


# A local RandomState keeps the shuffle reproducible without touching numpy's global RNG.
sent_idxs = np.random.RandomState(SPLIT_SEED).permutation(len(sentences))

# open(..., 'w') does not create missing parent directories.
os.makedirs(DATA_DIR, exist_ok=True)

# The first TRAIN_SIZE shuffled examples form the train split, the remainder the test split.
write_split('train', sent_idxs[:TRAIN_SIZE])
write_split('test', sent_idxs[TRAIN_SIZE:])

In [None]:
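# NOTE: disabled cell. It writes the full, unsplit corpus to sentences.jsonl / senses.jsonl
# in the working directory. The preview cells below read those two files, so they must
# already exist (or this cell must be re-enabled) before the previews can run.
# with open('sentences.jsonl', 'w') as sent_out: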
#     with open('senses.jsonl', 'w') as sense_out:
#         for i, ts in enumerate(tqdm(semcor.tagged_sents(tag='sem'))):
#             sentence, sense = parse(ts)
#             json.dump(sentence, sent_out)
#             sent_out.write('\n')
            
#             json.dump(sense, sense_out)
#             sense_out.write('\n')

In [20]:
from itertools import islice

# Peek at the first three records of the sense-label file.
with open('senses.jsonl', 'r') as data:
    # islice simply stops at end-of-file, so a file with fewer than three lines
    # prints what it has instead of raising StopIteration.
    for line in islice(data, 3):
        print(json.loads(line))

['no_sense', 'group.n.01', 'group.n.01', 'group.n.01', 'group.n.01', 'state.v.01', 'friday.n.01', 'no_sense', 'probe.n.01', 'no_sense', 'atlanta.n.01', 'no_sense', 'late.s.03', 'primary.n.01', 'primary.n.01', 'produce.v.04', 'no_sense', 'no_sense', 'evidence.n.01', 'no_sense', 'no_sense', 'no_sense', 'abnormality.n.04', 'happen.v.01', 'happen.v.01', 'no_sense']
['no_sense', 'jury.n.01', 'far.r.02', 'state.v.01', 'no_sense', 'term.n.02', 'end.n.02', 'presentment.n.01', 'no_sense', 'no_sense', 'group.n.01', 'group.n.01', 'group.n.01', 'no_sense', 'no_sense', 'own.v.01', 'overall.s.02', 'mission.n.03', 'no_sense', 'no_sense', 'election.n.01', 'no_sense', 'no_sense', 'deserve.v.01', 'no_sense', 'praise.n.01', 'no_sense', 'thanks.n.01', 'no_sense', 'no_sense', 'location.n.01', 'location.n.01', 'location.n.01', 'no_sense', 'no_sense', 'no_sense', 'manner.n.01', 'no_sense', 'no_sense', 'no_sense', 'election.n.01', 'no_sense', 'conduct.v.01', 'no_sense']
['no_sense', 'september.n.01', 'october

In [21]:
from itertools import islice

# Peek at the first three records of the token file.
with open('sentences.jsonl', 'r') as data:
    # islice simply stops at end-of-file, so a file with fewer than three lines
    # prints what it has instead of raising StopIteration.
    for line in islice(data, 3):
        print(json.loads(line))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term', 'end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September', 'October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']


In [54]:
import torch.nn.functional as F
import numpy as np
import torch

# F.pad reads the pad tuple as (left, right) amounts for the last dimension.
# Here (0, 5) appends five elements on the right of a 1-D tensor. The default
# mode is constant padding with value 0, so the new entries are zeros rather
# than copies of the last value.
F.pad(torch.tensor([1,1]), (0,5))

tensor([1, 1, 0, 0, 0, 0, 0])