In [1]:
import spacy
import gensim
import os
import pandas as pd
from gensim.models import KeyedVectors
from joblib import Parallel, delayed
import itertools
import numpy as np
import pickle

construct dataset

In [2]:
def get_files(dir_path):
    """Return the names of the regular files located directly in ``dir_path``.

    Sub-directories are left out, and so is any entry named ``LICENSE.txt``.
    The returned names carry no directory prefix and follow the order in
    which ``os.listdir`` reports them.
    """
    excluded = ['LICENSE.txt']
    names = []
    for entry in os.listdir(dir_path):
        if entry in excluded:
            continue
        if os.path.isfile(os.path.join(dir_path, entry)):
            names.append(entry)
    return names

def get_lines(file_path, encoding='utf-8'):
    """Read a text file and return its non-empty lines, whitespace-stripped.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding
            (assumes the corpus files are UTF-8 -- pass another encoding
            otherwise).

    Returns:
        list[str]: Every line with leading/trailing whitespace removed; lines
        that are empty after stripping are dropped.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the handle directly instead of materialising readlines().
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]

# Build one DataFrame row per article file found under the corpus root.
# Each immediate sub-directory of `p` is treated as one media category.
p = '/data/livedoor/text/'
medium = [d for d in os.listdir(p) if os.path.isdir(os.path.join(p, d))]

# Rows are collected as tuples and converted to a DataFrame once, after the loop.
dataset = []

for media in medium:
    dir_path = os.path.join(p, media)
    files = get_files(dir_path)
    
    for file in files:
        
        file_path = os.path.join(dir_path, file)
        lines = get_lines(file_path)
        
        # The first three non-empty lines are taken as url, timestamp and title;
        # all remaining lines are concatenated (without a separator) into the text.
        # NOTE(review): a file with fewer than three non-empty lines raises IndexError here.
        url, timestamp, title, text = lines[0], lines[1], lines[2], ''.join(lines[3:])
        dataset.append((media, url, timestamp, title, text))

dataset = pd.DataFrame(dataset, columns=['media', 'url', 'timestamp', 'title', 'text'])
# Drop the row with index label 6031; the index is left with a gap (not reset).
# NOTE(review): the reason for excluding this row is not recorded, and which
# article gets label 6031 depends on the os.listdir() iteration order above,
# which is not guaranteed -- confirm which article is meant to be excluded.
dataset = dataset[dataset.index!=6031]
# Bare expression so the notebook renders the first rows.
dataset.head()

Unnamed: 0,media,url,timestamp,title,text
0,sports-watch,http://news.livedoor.com/article/detail/5985384/,2011-11-02T09:00:00+0900,石川遼、“異例の交際発表”めぐる水面下の攻防とは？,先月24日深夜、プロゴルファー・石川遼が、マネジメント会社「ケーアイ企画」を通し、一般女性と...
1,sports-watch,http://news.livedoor.com/article/detail/6819620/,2012-08-03T10:00:00+0900,柔道界の現状に、古賀氏「日本の場合は先生の圧力が強い」,ロンドン五輪における柔道男子は、ここまで金メダルを獲得できずにいる。2日も男子100kg級に...
2,sports-watch,http://news.livedoor.com/article/detail/5571206/,2011-05-20T02:00:00+0900,【Sports Watch】女性店員の契約選手侮辱ツイートでアディダスが謝罪,ヴァンフォーレ甲府のFWで、アディダス契約選手の一人＝ハーフナー・マイクが、18日に入籍を発...
3,sports-watch,http://news.livedoor.com/article/detail/4970500/,2010-08-27T07:25:00+0900,【Sports Watch】スエマエの五輪後、前田は「人間不信になった感じ」,2008年、北京五輪の舞台で世界ランキング一位のペアから金星を挙げ、一躍その名を知らしめた女...
4,sports-watch,http://news.livedoor.com/article/detail/6158040/,2011-12-30T09:00:00+0900,天皇杯で敗れたチームのファン（？）、チケットを燃やす,29日に行なわれた第91回天皇杯全日本サッカー選手権大会は、J2のFC東京と京都サンガが決勝...


tokenize

In [3]:
# Load the 'ja_ginza' spaCy pipeline once; tokenize() below reads this global.
nlp = spacy.load('ja_ginza')

def split_list(ls, step):
    """Lazily yield consecutive slices of ``ls``, each at most ``step`` items long.

    The last slice is shorter when ``len(ls)`` is not a multiple of ``step``;
    an empty ``ls`` yields nothing.
    """
    starts = range(0, len(ls), step)
    yield from (ls[start:start + step] for start in starts)

def tokenize(batch):
    """Tokenise every text in ``batch`` with the module-level ``nlp`` pipeline.

    Returns one list per input text, in input order, holding ``str(token)``
    for each token produced by ``nlp(text)``.
    """
    tokenized = []
    for text in batch:
        doc = nlp(text)
        tokenized.append([str(token) for token in doc])
    return tokenized

In [4]:
%%time
# Split the article texts into batches of 100 so that each joblib task
# tokenises a whole batch instead of a single document.
ls = list(split_list(dataset['text'].to_list(), 100))
tasks = [delayed(tokenize)(batch) for batch in ls]
r = Parallel(n_jobs=-1)(tasks)
# Flatten the per-batch results back into one token list per article.
r = [tokens for batch_tokens in r for tokens in batch_tokens]
dataset.loc[:, 'tokens'] = r
# >>> CPU times: user 1.9 s, sys: 334 ms, total: 2.24 s
# >>> Wall time: 1min 43s

# dataset.loc[:, 'tokens'] = dataset['text'].apply(lambda x: [str(t) for t in nlp(x)])
# >>> CPU times: user 23min 23s, sys: 10.5 s, total: 23min 33s
# >>> Wall time: 23min 34s

# dataset.loc[:, 'tokens'] = [[t.lemma_ for t in doc] for doc in nlp.pipe(dataset['text'], n_threads=-1)]
# >>> CPU times: user 19min 56s, sys: 1min 44s, total: 21min 41s
# >>> Wall time: 21min 41s

CPU times: user 1.88 s, sys: 336 ms, total: 2.22 s
Wall time: 1min 52s


embedding

In [5]:
%%time
# Load the pre-trained word vectors from the .txt file below. The recorded run
# of this cell took about 12.5 minutes of wall time.
wv = KeyedVectors.load_word2vec_format('/data/chive/chive-1.1-mc5-20200318.txt')
# Bare expression so the notebook displays the loaded object's repr.
wv

CPU times: user 12min 21s, sys: 10.8 s, total: 12min 32s
Wall time: 12min 32s


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f7bf936c290>

In [None]:
%%time
def embed(tokens, vectors=None):
    """Look up the embedding vector of every in-vocabulary token.

    Args:
        tokens: Iterable of token strings.
        vectors: Object indexed by token (``vectors[token]``) that raises
            ``KeyError`` for unknown tokens. Defaults to the module-level
            ``wv``.

    Returns:
        list[list]: One list of vector components per token found in
        ``vectors``, in token order. Tokens missing from the vocabulary are
        skipped, so the result may be shorter than ``tokens``.
    """
    if vectors is None:
        vectors = wv
    embeddings = []
    for t in tokens:
        try:
            vector = vectors[t]
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so
            # that genuine failures (and KeyboardInterrupt) are not hidden.
            continue
        embeddings.append(list(vector))
    return embeddings

# Vectors are kept as plain lists because parquet cannot write np.array values.
token_embeddings = dataset['tokens'].apply(embed)
dataset.loc[:, 'embeddings'] = token_embeddings
dataset.head()

In [None]:
%%time
# Kernel dies
# NOTE(review): per the note above, running this gzip-compressed parquet export
# kills the kernel, so the cell has no recorded execution; the pickle cell below
# is the export that was actually run. Presumably the nested-list 'embeddings'
# column exhausts memory -- TODO confirm before re-enabling.
dataset.to_parquet('./data/word_embeddings.parquet.gz', compression='gzip', engine='fastparquet')

In [10]:
%%time
# pickle
# Serialise the whole DataFrame, list-valued columns included, to a pickle file.
# NOTE(review): loading a pickle can execute arbitrary code; only unpickle this
# file from a trusted location.
with open('./data/word_embeddings.pickle', 'wb') as f:
    pickle.dump(dataset, f)

In [None]:
# csv
# corpus.to_csv('./data/word_embeddings.csv', index=False)
# CSV was not adopted as a storage format because it cannot preserve list-valued columns.