In [1]:
from itertools import chain
from collections import namedtuple, OrderedDict


# Immutable record with two fields: the words of a sentence and its tags.
Sentence = namedtuple("Sentence", ["words", "tags"])

In [2]:
# Two toy sentences keyed by id; each holds a tuple of words and a
# same-length tuple of tags.
sentences = dict(
    s0=Sentence(words=("See", "Spot", "run"), tags=("VERB", "NOUN", "VERB")),
    s1=Sentence(words=("Spot", "ran"), tags=("NOUN", "VERB")),
)

In [3]:
# Total number of words across all sentences: flatten the per-sentence word
# tuples into one stream and count its items.
sum(1 for _ in chain.from_iterable(s.words for s in sentences.values()))

5

In [None]:
class Dataset(namedtuple("_Dataset", "sentences keys vocab X tagset Y training_set testing_set N stream")):
    """Tagged corpus built from a tag file and a data file, with a train/test split.

    Fields:
        sentences: dict copy of the mapping returned by ``read_data``.
        keys: tuple of the sentence keys, in the mapping's iteration order.
        vocab: frozenset of every word appearing in any sentence.
        X: tuple of per-sentence word sequences, ordered like ``keys``.
        tagset: whatever ``read_tags`` returned for ``tagfile``.
        Y: tuple of per-sentence tag sequences, ordered like ``keys``.
        training_set: ``Subset`` built from the first part of the shuffled keys.
        testing_set: ``Subset`` built from the remaining shuffled keys.
        N: total number of words over all sentences.
        stream: zero-argument callable returning an iterator over
            (word, tag) pairs of the whole corpus.

    ``len()`` and iteration are overridden, so they act on the sentence
    mapping rather than on the ten tuple fields.

    NOTE(review): ``read_tags``, ``read_data``, ``Subset`` and ``random`` are
    not defined in the visible part of the notebook -- confirm they are in
    scope before this cell runs.
    """

    def __new__(cls, tagfile, datafile, train_test_split=0.8, seed=112890):
        """Load the corpus and split its sentence keys into train and test parts.

        Args:
            tagfile: passed unchanged to ``read_tags``.
            datafile: passed unchanged to ``read_data``.
            train_test_split: fraction of the sentence keys assigned to the
                training subset (the cut index is truncated toward zero).
            seed: when not ``None``, seeds the module-level ``random``
                generator before shuffling, which makes the split repeatable.
        """
        tagset = read_tags(tagfile)
        sentences = read_data(datafile)

        keys = tuple(sentences.keys())
        vocab = frozenset(chain.from_iterable(s.words for s in sentences.values()))
        X = tuple(sentences[key].words for key in keys)
        Y = tuple(sentences[key].tags for key in keys)
        token_total = sum(
            1 for _ in chain.from_iterable(s.words for s in sentences.values())
        )

        # Shuffle a mutable copy of the keys; this uses (and, when seeded,
        # resets) the global random state.
        shuffled = list(keys)
        if seed is not None:
            random.seed(seed)
        random.shuffle(shuffled)
        cut = int(train_test_split * len(shuffled))
        training_set = Subset(sentences, shuffled[:cut])
        testing_set = Subset(sentences, shuffled[cut:])

        # Materialise the (word, tag) pairs once; the field exposes the tuple's
        # bound __iter__, so each call yields a fresh iterator.
        pairs = tuple(zip(chain.from_iterable(X), chain.from_iterable(Y)))

        return super().__new__(
            cls,
            sentences=dict(sentences),
            keys=keys,
            vocab=vocab,
            X=X,
            tagset=tagset,
            Y=Y,
            training_set=training_set,
            testing_set=testing_set,
            N=token_total,
            stream=pairs.__iter__,
        )

    def __len__(self):
        """Return the number of sentences (not the number of tuple fields)."""
        return len(self.sentences)

    def __iter__(self):
        """Iterate over ``(key, sentence)`` items of the sentence mapping."""
        return iter(self.sentences.items())


In [5]:
# Map each integer 1..10 to its reciprocal, so larger keys get smaller values.
d = dict((i, 1 / i) for i in range(1, 11))

In [6]:
# Keys of d ordered by their associated value, smallest value first.
sorted(d, key=lambda k: d[k])

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [7]:
from collections import Counter, defaultdict

In [8]:
# Tally, for every tag character, how often it is paired with each word
# character. The two strings are aligned position by position.
tag_chars = "19074027964031294378091340137408931"
word_chars = "ABCDEFGABCDEFGABCDEFGABCDEFGABCDEFG"

count = defaultdict(Counter)
for tag, word in zip(tag_chars, word_chars):
    count[tag][word] += 1
count

defaultdict(collections.Counter,
            {'1': Counter({'A': 1, 'G': 2, 'B': 1, 'F': 1}),
             '9': Counter({'B': 3, 'A': 1, 'E': 1}),
             '0': Counter({'C': 2, 'F': 1, 'E': 2, 'G': 1}),
             '7': Counter({'D': 1, 'A': 2, 'E': 1}),
             '4': Counter({'E': 1, 'D': 2, 'C': 1, 'B': 1}),
             '2': Counter({'G': 1, 'A': 1}),
             '6': Counter({'C': 1}),
             '3': Counter({'F': 2, 'D': 1, 'C': 1, 'G': 1}),
             '8': Counter({'F': 1, 'D': 1})})

In [12]:
w = 'B'
# For each word pick the tag with the highest count for it. Ties, including
# words never seen such as 'H', resolve to the first tag in count's key order.
# The comprehension's own `w` does not rebind the module-level `w` above.
{w: max(count, key=lambda tag: count[tag][w]) for w in "ABCDEFGH"}

{'A': '7',
 'B': '9',
 'C': '0',
 'D': '4',
 'E': '0',
 'F': '3',
 'G': '1',
 'H': '1'}

In [13]:
# Counter tallies how many times each element occurs in the list.
Counter([1, 2, 3, 8, 2, 1, 8, 3, 1])

Counter({1: 3, 2: 2, 3: 2, 8: 2})

In [14]:
# Same tally as counting one flat list: the nested lists are flattened lazily
# before counting.
Counter(chain.from_iterable([[1, 2, 3], [8, 2, 1], [8, 3, 1]]))

Counter({1: 3, 2: 2, 3: 2, 8: 2})