In [1]:
# primitive
import sys
import os
import pickle
import itertools
from tqdm import tqdm
from joblib import Parallel, delayed
from pprint import pprint
import itertools
from collections import Counter
from time import time

# data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# text
import MeCab
import spacy
import gensim
from gensim.models import KeyedVectors

# nn
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.vocab import Vocab

# **
# handmade libs
# *
src = '../../src'
if src not in sys.path:
    sys.path.append(src)  # make the project's own modules importable

# constants
import const
from const import *

# Build the constants table from the `const` module itself instead of scanning
# locals(): the notebook namespace also holds every upper-case variable defined
# by other cells, so a locals() scan gives a different table when this cell is
# re-run later in the session.
# The name selection mirrors what `from const import *` exports.
exported = getattr(const, '__all__', None)
if exported is None:
    exported = [name for name in vars(const) if not name.startswith('_')]
constants = {name: getattr(const, name) for name in exported if name.isupper()}
pprint(constants)

# modules
from my_tokenizer import get_tokenizer
from livedoor_dataset import LivedoorDataset
from sudachi_tokenizer import SudachiTokenizer

{'DEVICE': 'cpu',
 'DIR_BIN': '/tmp/work/livedoor/bin',
 'DIR_DATA': '/tmp/work/livedoor/data',
 'DIR_LOG': '/tmp/work/livedoor/log',
 'DIR_MECAB_DIC': '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
 'DIR_MODEL': '/tmp/work/livedoor/model',
 'ROOT': '/tmp/work/livedoor',
 'SAMPLE_SENT': 'ワンマンライブに行きたい。',
 'SEED': 123,
 'TOKENIZER': 'mecab'}


# Preprocess for training

In [2]:
def load_pickle(path):
    '''Load and return the object stored in the pickle file at ``path``.

    Args:
      path: path of the pickle file to read
    Returns:
      the unpickled object
    '''
    # NOTE: pickle.load can execute arbitrary code while unpickling;
    # only use this on files produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


file_train = os.path.join(DIR_BIN, 'train_subset.pkl')
train_dataset = load_pickle(file_train)

file_test = os.path.join(DIR_BIN, 'test_subset.pkl')
test_dataset = load_pickle(file_test)

In [4]:
from collections import Counter
from torchtext.vocab import Vocab

def create_vocab(corpus):
    '''Build a Vocab from an already tokenized corpus.

    Args:
      corpus: iterable of tokenized texts; each item is fed to
        Counter.update, so its elements are counted as tokens
    Returns:
      vocab: mapping between indices and words, built from the token
        counts with min_freq=1
    '''

    token_counts = Counter()
    progress = tqdm(corpus)  # progress bar over the texts of the corpus
    for tokens in progress:
        token_counts.update(tokens)

    vocab = Vocab(token_counts, min_freq=1)
    return vocab

In [6]:
%%time
ENGINE = 'sudachi'
DICT = 'core'
EMBEDDING = 'chive_mc90'
# Cache file for the vocabulary; the name encodes the settings above.
file_vocab = os.path.join(DIR_BIN, f'vocab.{ENGINE}.{DICT}.{EMBEDDING}.pkl')

if os.path.isfile(file_vocab):
    print(f'file exists: {file_vocab}')
    # Load the cached vocabulary so that `vocab` is defined on this path too;
    # otherwise a fresh kernel with an existing cache never binds `vocab`.
    # NOTE: pickle.load can execute arbitrary code; only load trusted caches.
    with open(file_vocab, 'rb') as f:
        vocab = pickle.load(f)
else:
    # NOTE(review): DICT and EMBEDDING are only used in the cache file name
    # here; confirm that SudachiTokenizer() defaults match them.
    tokenizer = SudachiTokenizer()
    corpus = (row[1] for row in train_dataset)  # keep only the text of each training row
    corpus = tokenizer.tokenized_corpus(corpus)  # generator of token sequences
    vocab = create_vocab(corpus)

    print(f'create: {file_vocab}')
    with open(file_vocab, 'wb') as f:
        pickle.dump(vocab, f)
# Wall time: 27min 36s (when the vocabulary is built from scratch)

5893it [27:36,  3.56it/s]


create: /tmp/work/livedoor/bin/vocab.sudachi.core.chive_mc90.pkl
CPU times: user 28min 32s, sys: 20.2 s, total: 28min 53s
Wall time: 27min 36s
