In [1]:
import os
import re
from os import listdir

import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# tokenizer will split a long text into a list of english words.
# It is torchtext's 'basic_english' tokenizer: a callable that takes one string
# and returns a list of token strings. The vocabulary printed further down
# shows punctuation marks such as ',' and '.' as separate tokens.
# NOTE(review): presumably it also lower-cases its input -- confirm against the
# torchtext documentation.
tokenizer = get_tokenizer('basic_english')

def read_files(datapath='../data_train/'):
    """
    Return a list of strings, one for each line in each .txt file in 'datapath'.

    Files are read in sorted file-name order so the result is reproducible
    (listdir returns entries in arbitrary order). 'datapath' may be given with
    or without a trailing path separator.

    Every line is stripped of leading and trailing whitespace; blank lines are
    kept as empty strings. Entries whose name does not end in ".txt" are ignored.
    """
    # Find all txt files in directory, in a deterministic order
    files = sorted(f for f in listdir(datapath) if f.endswith(".txt"))

    # Stores each line of each book in a list
    lines = []
    for f_name in files:
        # os.path.join only inserts a separator when 'datapath' lacks one, so
        # both '../data_train' and '../data_train/' resolve to the same file.
        # NOTE(review): the platform default text encoding is used, as before;
        # pass an explicit encoding here if the books are known to be UTF-8.
        with open(os.path.join(datapath, f_name)) as f:
            # Iterating the file object yields the same lines as readlines()
            lines.extend(line.strip() for line in f)
    return lines

# The patterns are raw strings so that `\w` and `\s` reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.

# Match any word containing a digit
no_digits = r'\w*[0-9]+\w*'
# Match any word containing an uppercase ASCII letter. This is used to drop
# names, but it equally matches capitalised sentence-initial words and the
# pronoun "I".
no_names = r'\w*[A-Z]+\w*'
# Match any run of one or more whitespace characters
no_spaces = r'\s+'

# Load the raw lines of every split: training, validation and test, in that order
books_train, books_val, books_test = map(
    read_files,
    ('../data_train/', '../data_val/', '../data_test/'),
)

def tokenize(lines):
    """
    Tokenize every line and return all tokens as one flat list, in order.
    """
    return [token for line in lines for token in tokenizer(line)]

def yield_tokens(lines):
    """
    Yield one token list per line, for building the vocabulary.

    Before tokenizing, words containing a digit and words containing an
    uppercase letter are replaced by a space, then whitespace runs are
    collapsed to a single space.
    """
    for raw_line in lines:
        cleaned = raw_line
        # Order matters: whitespace is collapsed last, after the removals
        for pattern in (no_digits, no_names, no_spaces):
            cleaned = re.sub(pattern, ' ', cleaned)
        yield tokenizer(cleaned)

def count_freqs(data, vocab):
    """
    Count occurrences of each word in vocabulary in the data.

    Args:
        data: iterable of tokens.
        vocab: object supporting len(vocab) and vocab[token] -> integer index.
            Indices are assumed to lie in [0, len(vocab)).

    Returns:
        1-D tensor of dtype torch.int and length len(vocab); entry i is the
        number of tokens in 'data' that 'vocab' maps to index i.
    """
    # Resolve every token to its index once, then count with a single
    # vectorized call rather than one in-place tensor update per token.
    # The explicit dtype keeps an empty 'data' valid input for bincount.
    indices = torch.tensor([vocab[w] for w in data], dtype=torch.long)
    # bincount returns int64; cast to keep the original torch.int result dtype
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

# Flat token lists for each split. Nothing is filtered here: the digit and
# uppercase filtering is applied only when the vocabulary is built below.
list_words_train, list_words_val, list_words_test = (
    tokenize(books) for books in (books_train, books_val, books_test)
)
n_words_train = len(list_words_train)

# vocab associates an index to each token kept from the filtered training
# text; min_freq=100 is the frequency threshold and "<unk>" is a special token
vocab = build_vocab_from_iterator(
    yield_tokens(books_train),
    min_freq=100,
    specials=["<unk>"],
)
# NOTE(review): yield_tokens blanks out every word containing an uppercase
# letter, so the pronoun "I" presumably never reaches the vocabulary; the
# disabled line below looks like an attempt to add it back -- confirm intent.
# vocab.append_token("i")

# Any token missing from the vocabulary is looked up as "<unk>"
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)

print("Total number of words in the training dataset:     ", n_words_train)
print("Total number of words in the validation dataset:   ", len(list_words_val))
print("Number of distinct words in the training dataset:  ", len(set(list_words_train)))
print("Number of distinct words kept (vocabulary size):   ", vocab_size)

# Per-index occurrence counts over the unfiltered training tokens. Their sum
# is printed as a check: the recorded run shows it equal to the token total.
freqs = count_freqs(list_words_train, vocab)
print(freqs.sum())
# weights[i] = (n_words_train - freqs[i]) / n_words_train, stored as float32
weights = ((n_words_train - freqs) / n_words_train).float()

Total number of words in the dataset:    2684706
Total number of words in the dataset:    49526
Number of distinct words in the dataset: 52105
Number of distinct words kept:           1879
tensor(2684706)


In [3]:
# Persist the tokenized splits, the vocabulary and the weights
generated_path = '../generated/'
# torch.save does not create missing parent directories, so make sure the
# output directory exists (a no-op when it is already there)
os.makedirs(generated_path, exist_ok=True)

torch.save(list_words_train, generated_path + 'books_train.pt')
torch.save(list_words_val,   generated_path + 'books_val.pt')
torch.save(list_words_test,  generated_path + 'books_test.pt')

torch.save(vocab,   generated_path + 'vocab.pt')
torch.save(weights, generated_path + "weight.pt")

In [4]:
# Show the training counts next to the vocabulary tokens, both in index order.
# freqs has len(vocab) entries (see count_freqs), so the [:vocab_size] slice
# selects all of them.
# NOTE(review): the token list is wrapped in an extra pair of brackets and is
# printed in full, while the recorded output elides most of the counts tensor
# with "...". Consider printing a short slice of (token, count) pairs instead.
print(freqs[:vocab_size],[vocab.lookup_tokens(range(vocab_size))] )

tensor([456903, 182537, 151278,  ...,    100,    182,    100],
       dtype=torch.int32) [['<unk>', ',', 'the', '.', 'and', 'of', 'to', 'a', 'in', 'that', 'he', 'was', 'his', 'it', 'with', 'had', 'is', 'not', 'as', 'on', 'him', 'for', 'at', 'you', 'be', 'her', 's', 'which', '!', 'all', '?', 'have', 'from', 'but', 'this', 'by', 'they', 'said', 'are', 'she', 'one', 'were', 'who', 'so', 'there', 'or', 'me', 'them', 'an', 'my', 'will', 'man', 'we', 'up', 'their', 'out', 'been', 'when', 'no', 'would', 'what', 'into', 'if', 'more', 'very', 'could', 'did', 'men', 'has', 'do', 'then', 'some', 'king', 'other', 'time', 'about', 'should', 'went', 'himself', 'came', 'now', 'only', 'your', 'like', 'two', 'little', 'before', 'over', 'made', 'than', 'see', 'may', 'down', 'old', 'us', 'know', 'can', 'good', 'where', 't', '(', ')', 'must', 'great', 'our', 'people', 'go', 'again', 'come', 'its', 'these', 'after', 'any', 'without', 'day', 'upon', 'eyes', '—', 'first', 'way', 'back', 'away', 'am', 'same',

In [5]:
# Every weight as a plain Python float, in vocabulary-index order
print(weights.tolist())
# Sum of (1 - weight) over the vocabulary, i.e. the sum of freqs / n_words_train
print(sum(1 - w for w in weights.tolist()))

[0.8298126459121704, 0.9320085644721985, 0.9436519145965576, 0.9539141058921814, 0.9693489670753479, 0.9755425453186035, 0.9766220450401306, 0.9816628098487854, 0.9845506548881531, 0.9884337186813354, 0.9861560463905334, 0.9891809225082397, 0.9901263117790222, 0.9901836514472961, 0.9922293424606323, 0.9924911856651306, 0.9925634264945984, 0.9941550493240356, 0.9938194155693054, 0.9941502213478088, 0.994612455368042, 0.9942947030067444, 0.9941006302833557, 0.9940581917762756, 0.9950627684593201, 0.9952702522277832, 0.9952329993247986, 0.9955585598945618, 0.9956017732620239, 0.9956271052360535, 0.9961493015289307, 0.996198832988739, 0.9961243271827698, 0.9950642585754395, 0.9957298636436462, 0.9964841604232788, 0.9957179427146912, 0.9967232942581177, 0.9967221617698669, 0.9958822131156921, 0.9964476823806763, 0.9969385862350464, 0.9968100786209106, 0.9968916773796082, 0.9964953064918518, 0.9973658323287964, 0.9975509643554688, 0.9975870847702026, 0.9976053237915039, 0.997441828250885, 0.

In [6]:
# Print the Vocab object itself. The recorded output shows only `Vocab()`, so
# this does not reveal the contents; see len(vocab) / vocab.lookup_tokens above.
print(vocab)

Vocab()
