In [1]:
# primitive
import sys
import os
import pickle
import itertools
from tqdm import tqdm
from joblib import Parallel, delayed
from pprint import pprint
import itertools
from collections import Counter
from time import time

# data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# text
import MeCab
import spacy
import gensim
from gensim.models import KeyedVectors

# nn
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.vocab import Vocab

# --- project-local ("handmade") modules ---
# Make the repository's source directory importable from this notebook.
src = '../../src'
if src not in sys.path:
    sys.path.append(src)

# constants
# NOTE(review): the star import is kept on purpose -- later cells use bare
# names such as DIR_DATA and DIR_BIN, which presumably come from `const`.
from const import *

# Collect every ALL-CAPS name currently in the namespace and show it.
# The namespace is snapshotted with list(...) first: iterating the live dict
# returned by locals() inside a comprehension can raise
# "RuntimeError: dictionary changed size during iteration"
# (documented for Python 3.12+ under tracing, see PEP 709).
constants = {k: v for k, v in list(locals().items()) if k.isupper()}
pprint(constants)

# modules
from my_tokenizer import get_tokenizer
from livedoor_dataset import LivedoorDataset

{'DEVICE': 'cuda',
 'DIR_BIN': '/tmp/work/livedoor/bin',
 'DIR_DATA': '/tmp/work/livedoor/data',
 'DIR_LOG': '/tmp/work/livedoor/log',
 'DIR_MECAB_DIC': '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
 'DIR_MODEL': '/tmp/work/livedoor/model',
 'ROOT': '/tmp/work/livedoor',
 'SAMPLE_SENT': 'ワンマンライブに行きたい。',
 'SEED': 123,
 'TOKENIZER': 'mecab'}


# Preprocess for training

In [2]:
# Load the corpus CSV stored under DIR_DATA.
ld_df = pd.read_csv(
    os.path.join(DIR_DATA, 'livedoor&text=text.csv')
)

# Overview: frame dimensions, the first rows, and the number of rows per
# `media` value (sorted by value, shown as a single shaded row).
print(ld_df.shape)
display(ld_df.head())
display(
    ld_df['media']
    .value_counts()
    .sort_index()
    .to_frame()
    .T
    .style.background_gradient('Blues', axis=1)
)

(7366, 2)


Unnamed: 0,media,text
0,3,前回の「プロに聞く“合コンの極意”（前編）　合コンアナリスト水谷麻衣に聞く、合コンの勝ちパタ...
1,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...
2,3,こんにちは、「ビズリーチ年収1000万円研究所」所長の佐藤和男です。この研究所では、年収10...
3,3,6月7日、表参道のカフェバー「MERCER CAFE TERRACE HOUSE」でHenn...
4,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...


Unnamed: 0,0,1,2,3,4,5,6,7,8
media,870,870,863,511,870,842,770,900,870


In [3]:
# Wrap the loaded frame in the project's dataset class.
dataset = LivedoorDataset(ld_df)

# Train/test split: TEST_RATE of the samples (rounded down) go to the test
# subset, everything else to the training subset.
TEST_RATE = 0.20
n = len(dataset)
n_test = int(n * TEST_RATE)  # truncation equals floor here: both factors are non-negative
n_train = n - n_test

# The generator is seeded with a fixed value so the split is the same on every run.
train_dataset, test_dataset = random_split(
    dataset,
    [n_train, n_test],
    generator=torch.Generator().manual_seed(12345),
)

def get_subset_label_balance(dataset):
    """Count how often each label occurs in ``dataset``.

    Args:
        dataset: An iterable of 2-item samples; the first item of each
            sample is taken as its label, the second is ignored.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per distinct label
        (sorted by label) holding that label's count. Because the frame is
        built from unlabeled pairs, the row is labeled ``1`` and the column
        axis is named ``0``.
    """
    label_counts = Counter(label for label, _ in dataset)
    # One (label, count) pair per row: column 0 = label, column 1 = count.
    pairs_frame = pd.DataFrame(list(label_counts.items()))
    by_label = pairs_frame.set_index(0).sort_index()
    return by_label.T

# Show the per-label sample counts of the training subset, then of the test
# subset, each as a shaded one-row table.
for split_subset in (train_dataset, test_dataset):
    balance = get_subset_label_balance(split_subset)
    display(balance.style.background_gradient('Blues', axis=1))

0,0.1,1,2,3,4,5,6,7,8
1,698,687,690,392,675,688,632,732,699


0,0.1,1,2,3,4,5,6,7,8
1,172,183,173,119,195,154,138,168,171


In [4]:
splits = ['train', 'test']
datasets = [train_dataset, test_dataset]

# Make sure the output directory exists; without it open(..., 'wb') fails.
os.makedirs(DIR_BIN, exist_ok=True)

# Pickle each subset once to DIR_BIN/<split>_subset.pkl.
# The loop variable is called `subset` (not `dataset`) so that it does not
# overwrite the full `dataset` object created when the data was split.
# NOTE(review): an existing file is never rewritten, so a pickle left over from
# an earlier run with a different split is silently kept -- delete it to refresh.
for split, subset in zip(splits, datasets):
    file = os.path.join(DIR_BIN, f'{split}_subset.pkl')
    if os.path.isfile(file):
        print(f'file exists: {file}')
        continue
    print(f'create: {file}')
    with open(file, 'wb') as f:
        pickle.dump(subset, f)

create: /tmp/work/livedoor/bin/train_subset.pkl
create: /tmp/work/livedoor/bin/test_subset.pkl
