In [1]:
import os

from collections import Counter

In [2]:
DIR = "topicclass"
PREFIX = "topicclass"
TRAIN_FILE = "{}/{}_train.txt".format(DIR, PREFIX)
VALID_FILE = "{}/{}_valid.txt".format(DIR, PREFIX)
TEST_FILE = "{}/{}_test.txt".format(DIR, PREFIX)

print("Train: {}".format(TRAIN_FILE))
print("Validation: {}".format(VALID_FILE))
print("Test: {}".format(TEST_FILE))

Train: topicclass/topicclass_train.txt
Validation: topicclass/topicclass_valid.txt
Test: topicclass/topicclass_test.txt


**Function to convert file to text, and then to labels and texts**

In [3]:
def get_text(filename):
    with open(filename, "r") as f:
        return f.readlines()

In [4]:
def split_cols(lines):
    labels = []
    texts = []
    for line in lines:
        tokens = line.split("|||")
        labels.append(tokens[0].strip())
        texts.append(tokens[1].strip())
    return labels, texts

**Get labels and texts**

In [5]:
train_labels, train_texts = split_cols(get_text(TRAIN_FILE))
valid_labels, valid_texts = split_cols(get_text(VALID_FILE))
test_labels, test_texts = split_cols(get_text(TEST_FILE))

print("Training size: {}".format(len(train_labels)))
print("Validation size: {}".format(len(valid_labels)))
print("Test size: {}".format(len(test_labels)))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 6609: character maps to <undefined>

**Remove top-k words**

Removes top k words from dataset if `TOP_K` is specified. Note top k is determined from `train_dataset` only

In [None]:
TOP_K = 20
most_common_k_words = []

# Removes top-k words
if TOP_K > 0:
    
    counter = Counter()
    for line in train_texts:
        for w in line.split():
            counter[w] += 1
    
    most_common_k_words = counter.most_common(TOP_K)

print(most_common_k_words)


**Vocabulary and Label Sizes**

In [17]:
train_labels_set = set(train_labels)
valid_labels_set = set(train_labels)
train_labels_set = set(train_labels)
labels_set = train_labels_set.union(valid_labels_set, train_labels_set)
print("Total label set size: {}".format(len(labels_set)))
print(labels_set)

Total label set size: 16


In [23]:
def get_vocab_set(lines):
    s = set()
    for line in lines:
        for w in line.split():
            s.add(w)
    return s

train_vocab_set = get_vocab_set(train_texts)
valid_vocab_set = get_vocab_set(valid_texts)
test_vocab_set = get_vocab_set(test_texts)

total_vocab_set = train_vocab_set.union(valid_vocab_set, test_vocab_set)

print("Train vocab set size: {}".format(len(train_vocab_set)))
print("Valid vocab set size: {}".format(len(valid_vocab_set)))
print("Test vocab set size: {}".format(len(test_vocab_set)))
print("Total vocab set size: {}".format(len(total_vocab_set)))

# Count unseen words
unseen_valid = set()
unseen_test = set()
for w in valid_vocab_set:
    if w not in train_vocab_set:
        unseen_valid.add(w)
for w in test_vocab_set:
    if w not in train_vocab_set:
        unseen_test.add(w)

print("Unseen validation vocab: {}".format(len(unseen_valid)))
print("Unseen test vocab: {}".format(len(unseen_test)))

print(unseen_valid)


Train vocab set size: 137959
Valid vocab set size: 4446
Test vocab set size: 4548
Total vocab set size: 138378
Unseen validation vocab: 184
Unseen test vocab: 235
{'Pasig', 'Roxxi', 'Orevada', 'Dohrn', 'Lieberstein', '၁၂၇၇', 'Eichmanns', 'Asuma', 'Tut', 'jackrabbits', 'Associação', 'Demelza', 'gradations', 'Markowitz', 'mince', 'Nosek', 'Klea', 'Benchmark', 'RCTA', 'betrayer', 'Leaning', 'gammarus', 'Gapel', 'Bolan', 'Zhengyan', 'Devoted', 'Tagaung', '胡正言', 'Ranh', 'inclinata', 'Oriented', 'Olufunmilayo', 'Brasileira', 'electrocyclic', 'Homarus', 'Torv', 'Darlings', 'Stuckle', 'Strela', 'HNTB', 'sympathizing', 'aflatoxins', 'cyclopentenone', 'Bosses', 'Estell', 'palmetto', 'protic', 'Vagaland', 'Sarutobi', 'underplay', 'Dunder', 'starboard', 'Peavey', 'Champa', 'Lligwy', 'တရုတ', 'Akimichi', 'Perséphone', 'cyclopentenones', 'Heatherwick', 'cyclization', 'Careful', 'promisingly', '奈良', 'blowholes', '154th', 'Pinkner', 'Gaceta', 'Fled', 'Rainn', 'Overtoom', 'Krasinski', 'ဂို', 'Refueling'