In [1]:
# primitive
import sys
import os
import pickle
import itertools
from tqdm import tqdm
from joblib import Parallel, delayed
from pprint import pprint
import itertools
from collections import Counter
from time import time

# data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# text
import MeCab
import spacy
import gensim
from gensim.models import KeyedVectors

# nn
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.vocab import Vocab

# ---
# project-local modules
# ---
# Make the project's source directory importable (appended only once).
src = '../../src'
if src not in sys.path:
    sys.path.append(src)

# Project constants: star-import them, then collect and show every
# upper-case name currently in scope.
from const import *
constants = dict(filter(lambda item: item[0].isupper(), locals().items()))
pprint(constants)

# modules
from my_tokenizer import get_tokenizer
from livedoor_dataset import LivedoorDataset
from sudachi_tokenizer import SudachiTokenizer

{'DEVICE': 'cpu',
 'DIR_BIN': '/tmp/work/livedoor/bin',
 'DIR_DATA': '/tmp/work/livedoor/data',
 'DIR_LOG': '/tmp/work/livedoor/log',
 'DIR_MECAB_DIC': '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
 'DIR_MODEL': '/tmp/work/livedoor/model',
 'ROOT': '/tmp/work/livedoor',
 'SAMPLE_SENT': 'ワンマンライブに行きたい。',
 'SEED': 123,
 'TOKENIZER': 'mecab'}


# Preprocess for training

In [2]:
# Components of the artifact file names used by this notebook.
ENGINE, DICT, DATASET = 'sudachi', 'core', 'livedoor'
EMBEDDING, VERSION = 'chive_mc90', 'intersection'

# Vocabulary: unpickle the vocab object stored under DIR_BIN.
file_vocab = os.path.join(
    DIR_BIN,
    f'{ENGINE}.{DICT}.{DATASET}.{EMBEDDING}.{VERSION}.vocab.pkl',
)
with open(file_vocab, 'rb') as f:
    vocab = pickle.load(f)

# Pretrained word embeddings, loaded as gensim KeyedVectors.
vectors = KeyedVectors.load(
    '/data/chive_v1.2mc90/chive-1.2-mc90_gensim/chive-1.2-mc90.kv'
)

In [3]:
# Build the embedding matrix used by the model.
def create_vectors(vocab, vectors, n_special=2):
    '''Build the word-embedding matrix used for training and inference.

    Row ``i`` of the result is the vector of the token ``vocab.itos[i]``.
    The first ``n_special`` rows are all zeros (presumably reserved for
    special tokens such as unknown/padding -- confirm against how the
    vocab was built); every remaining token is looked up in ``vectors``.

    Args:
      vocab: vocabulary exposing ``itos`` (index -> token) and ``len()``,
        e.g. a ``torchtext.vocab.Vocab`` holding the tokens shared by the
        training corpus and the pretrained embedding.
      vectors: pretrained embedding indexable by token (``vectors[token]``),
        e.g. a gensim ``KeyedVectors``.
      n_special: number of leading vocab entries that get zero vectors
        instead of a lookup. Defaults to 2.
    Returns:
      numpy.ndarray of shape ``(n_special + number_of_looked_up_tokens, dim)``.
    Raises:
      Whatever ``vectors[token]`` raises for a token it does not contain
      (``KeyError`` for dict-like objects).
    '''
    words = [vocab.itos[i] for i in range(len(vocab))]
    pretrained_words = words[n_special:]

    if pretrained_words:
        emb_pretrained = np.array([vectors[w] for w in pretrained_words])
        # Take the width from the data instead of assuming 300 dimensions.
        dim = emb_pretrained.shape[1]
    else:
        # Nothing to look up: fall back to the embedding's declared size
        # (300 if it declares none) so the concatenation below still works.
        dim = getattr(vectors, 'vector_size', 300)
        emb_pretrained = np.zeros((0, dim))

    # Zero rows for the leading reserved entries.
    emb_special = np.zeros((n_special, dim))
    emb_for_model = np.concatenate((emb_special, emb_pretrained), axis=0)

    return emb_for_model

In [4]:
%%time
# Cache guard for the embedding matrix: reuse the pickled file when present,
# otherwise build it from the vocab and pretrained vectors and store it.
file_embedding = os.path.join(DIR_BIN, f'{ENGINE}.{DICT}.{DATASET}.{EMBEDDING}.{VERSION}.embedding.pkl')
if os.path.isfile(file_embedding):
    # Report the path that was actually checked.
    print(f'file exists: {file_embedding}')
    # Load the cached matrix so `emb_for_model` is defined on both branches.
    # NOTE: pickle.load can execute arbitrary code; only load files this
    # project wrote itself.
    with open(file_embedding, 'rb') as f:
        emb_for_model = pickle.load(f)
else:
    emb_for_model = create_vectors(vocab, vectors)
    with open(file_embedding, 'wb') as f:
        pickle.dump(emb_for_model, f)

CPU times: user 575 ms, sys: 173 ms, total: 748 ms
Wall time: 2.74 s
