In [13]:
import os
from general_utils import get_logger
from data_utils import get_trimmed_glove_vectors,  load_vocab, \
        get_processing_word

In [21]:
class Config():
    def __init__(self, load=True):
        """Initialize hyperparameters and load vocabs

        Args:
            load_embeddings: (bool) if True, load embeddings into
                np array, else None

        """
        # directory for training outputs
        if not os.path.exists(self.dir_output):
            os.makedirs(self.dir_output)

        # create instance of logger
        self.logger = get_logger(self.path_log)

        # load if requested (default)
        if load:
            self.load()
            
    def load(self):
        """Loads vocabulary, processing functions and embeddings

        Supposes that build_data.py has been run successfully and that
        the corresponding files have been created (vocab and trimmed GloVe
        vectors)

        """
        # 1. vocabulary
        self.vocab_words = load_vocab(self.filename_words)
        #self.vocab_tags  = load_vocab(self.filename_tags)
        #self.vocab_chars = load_vocab(self.filename_chars)

        self.nwords     = len(self.vocab_words)
        #self.nchars     = len(self.vocab_chars)
        #self.ntags      = len(self.vocab_tags)

        # 2. get processing functions that map str -> id
        #self.processing_word = get_processing_word(self.vocab_words,
        #        self.vocab_chars, lowercase=True, chars=self.use_chars)
        #self.processing_tag  = get_processing_word(self.vocab_tags,
        #        lowercase=False, allow_unk=False)

        # 3. get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                if self.use_pretrained else None)
        
        
    dir_output = "results/large_lstm_test/"
    dir_model  = dir_output + "model.weights/"
    path_log   = dir_output + "log.txt"
        
    filename_words = "../../data/vocab.txt"
    
    filename_trimmed = "../../data/embedding_vectors.npy.npz"
    
    use_pretrained = True

## Test

In [22]:
config = Config(load=True)

In [17]:
config.nwords

1863820

In [16]:
config.vocab_words

{'': 8,
 'дёда': 1340893,
 'дитмансдорф': 51726,
 'рандеву»': 105292,
 'импредикативный': 1106054,
 'καθηγητής': 1674480,
 'земля…': 1036726,
 'сосканировать': 533094,
 'норбекова': 216937,
 'отомо': 1797091,
 'келлеру': 826991,
 'кувандык': 1230511,
 'bangombe': 76203,
 'таллирдиевой': 173509,
 'kamon': 463229,
 'diskless': 1444169,
 'xcix': 1048366,
 'пренебрегающий': 378547,
 'малороссиян»': 1540109,
 'фастак': 1452245,
 'fhm': 1365623,
 'моле́': 1674758,
 'тварью': 1054235,
 'creativity»': 721495,
 'нигматзянов': 806706,
 'attribution': 485320,
 'кальеро': 571121,
 'folks»': 1636578,
 'арлекино': 1717978,
 'freezer': 130432,
 'invicta': 442940,
 'бенник': 1788054,
 'трутнями': 1852498,
 'фижака': 625211,
 'алваладе': 707501,
 'шерстоткацкой': 1290759,
 'aoû': 532803,
 'причетниками': 1226255,
 'pyatyhatka': 1476867,
 'çамрǎк': 1574610,
 'anka': 323125,
 'бачинскайте': 172164,
 'синхараджа': 114839,
 'tudjman': 405184,
 'мбаби': 1375010,
 'юсеньги': 944612,
 'титулярника': 925480,
 