- 文章の長さ
- 語の出現頻度

In [1]:
# primitive
import sys
import os
import pickle
import itertools
from joblib import Parallel, delayed
from pprint import pprint
import itertools
from collections import Counter
from time import time

# data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# text
import MeCab
import spacy
import gensim
from gensim.models import KeyedVectors

# ml
# from sklearn.model_selection import 

# nn
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.vocab import Vocab

# ---
# Project-local libraries
# ---
# Make the project's source directory importable. The membership check keeps
# re-runs of this cell from appending the same path to sys.path again.
src = '../../src'
if src not in sys.path: sys.path.append(src)

# constants
# The star-import brings every name exported by `const` into this namespace.
from const import *
# Collect every all-uppercase name currently in scope (not only those that came
# from `const`) and print them so the active configuration is visible.
constants = {k: v for k, v in locals().items() if k.isupper()}
pprint(constants)

# modules
from my_tokenizer import get_tokenizer
from livedoor_dataset import LivedoorDataset

/tmp/work/livedoor
{'DEVICE': 'cuda',
 'DIR_BIN': '/tmp/work/livedoor/bin',
 'DIR_DATA': '/tmp/work/livedoor/data',
 'DIR_LOG': '/tmp/work/livedoor/log',
 'DIR_MECAB_DIC': '/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
 'DIR_MODEL': '/tmp/work/livedoor/model',
 'ROOT': '/tmp/work/livedoor',
 'SAMPLE_SENT': 'ワンマンライブに行きたい。',
 'TOKENIZER': 'mecab'}


Functions

In [2]:
# Read the corpus CSV from the configured data directory.
ld_df = pd.read_csv(
    os.path.join(DIR_DATA, 'livedoor&text=text.csv')
)

# Overview: dimensions, first rows, and number of articles per media label.
print(ld_df.shape)
display(ld_df.head())
display(
    ld_df.media.value_counts()
    .sort_index()
    .to_frame()
    .T
    .style.background_gradient('Blues', axis=1)
)

(7366, 2)


Unnamed: 0,media,text
0,3,前回の「プロに聞く“合コンの極意”（前編）　合コンアナリスト水谷麻衣に聞く、合コンの勝ちパタ...
1,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...
2,3,こんにちは、「ビズリーチ年収1000万円研究所」所長の佐藤和男です。この研究所では、年収10...
3,3,6月7日、表参道のカフェバー「MERCER CAFE TERRACE HOUSE」でHenn...
4,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...


Unnamed: 0,0,1,2,3,4,5,6,7,8
media,870,870,863,511,870,842,770,900,870


In [6]:
# Build the tokenizer named by the project config (TOKENIZER) rather than
# repeating the backend name here, then sanity-check it on the sample sentence.
tokenizer = get_tokenizer(TOKENIZER)
print('output :', tokenizer(SAMPLE_SENT))

output : ['ワンマンライブ', 'に', '行き', 'たい', '。']


ランダム

In [62]:
# Random baseline: give every article a label drawn uniformly from the labels
# that actually occur in the data, so the number of classes is not hard-coded.
# A fixed seed keeps the reported accuracy reproducible across kernel restarts.
baseline_rng = np.random.default_rng(0)
media_labels = np.sort(ld_df.media.unique())
ld_df.loc[:, 'pred'] = baseline_rng.choice(media_labels, size=len(ld_df))

# Share of articles whose random label equals the true one
# (expected value is 1 / number of labels).
acc = (ld_df.media == ld_df.pred).mean()
print('accuracy', acc)

# Confusion table: rows are true labels, columns are the random predictions.
cross = pd.crosstab(ld_df.media, ld_df.pred)
display(cross.style.background_gradient('Blues', axis=None))

accuracy 0.10969318490361119


pred,0,1,2,3,4,5,6,7,8
media,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,95,97,103,84,97,93,89,112,100
1,95,97,103,98,93,82,98,104,100
2,109,87,92,91,104,92,102,95,91
3,62,45,47,62,62,59,61,59,54
4,100,78,95,98,100,104,102,97,96
5,95,89,91,115,87,97,88,92,88
6,72,80,83,93,90,83,96,79,94
7,93,92,105,111,116,100,100,82,101
8,92,100,96,94,110,109,95,87,87


文長

In [109]:
# Tokenize every article once and store the tokens as plain strings.
ld_df.loc[:, 'tokens'] = ld_df['text'].apply(lambda x: [str(t) for t in tokenizer(x)])

# Mean token count per media label, longest class first. The column is named
# explicitly instead of renaming the auto-generated '<lambda>' column: that
# rename silently does nothing if the generated name differs, and the sort on
# 'length' then fails.
summary = (
    ld_df['tokens'].str.len()
    .groupby(ld_df['media'])
    .mean()
    .to_frame('length')
    .sort_values('length', ascending=False)
)

# Midpoint between each class mean and the next longer one; the top row has no
# longer neighbour and therefore shows NaN.
(summary + summary.shift(1)) / 2

Unnamed: 0_level_0,length
media,Unnamed: 1_level_1
1,
3,828.257589
0,786.175405
4,731.385632
5,682.621251
8,657.987343
2,525.435499
6,385.335395
7,360.395202


In [107]:
def predict_by_len(tokens, length_summary=None):
    """Predict a media label from the number of tokens in a document.

    The lookup table is a one-column DataFrame of mean token counts, indexed by
    label and ordered from the longest class to the shortest. The midpoint
    between a row and the row above it is that row's upper cut-off. The top
    row has no row above it, so its cut-off is NaN and it catches every
    document that is longer than all other cut-offs.

    Parameters
    ----------
    tokens : sized collection
        Tokens of one document; only ``len(tokens)`` is used.
    length_summary : pandas.DataFrame, optional
        The table described above. Defaults to the notebook-level ``summary``.

    Returns
    -------
    The index label of the first row, scanning from the bottom of the table
    upwards, whose cut-off is NaN or strictly greater than ``len(tokens)``.

    Raises
    ------
    ValueError
        If the table has no rows.
    """
    table = summary if length_summary is None else length_summary
    if len(table) == 0:
        raise ValueError('length summary is empty; cannot predict a label')

    n_tokens = len(tokens)
    # Reverse the midpoints so the scan starts at the shortest class.
    cutoffs = ((table + table.shift(1)) / 2).iloc[::-1, 0]
    for label, cutoff in cutoffs.items():
        if np.isnan(cutoff) or n_tokens < cutoff:
            return label
    # Defensive fallback: the top row's cut-off is NaN, so the loop normally
    # returns before reaching this point.
    return cutoffs.index[-1]

# Smoke check: classify a dummy document that is 700 tokens long.
predict_by_len([*range(700)])

In [113]:
# Apply the length-based predictor to every tokenized article.
ld_df.loc[:, 'pred_by_len'] = ld_df['tokens'].apply(predict_by_len)
ld_df.head()

Unnamed: 0,media,text,tokens,pred,pred_by_len
0,3,前回の「プロに聞く“合コンの極意”（前編）　合コンアナリスト水谷麻衣に聞く、合コンの勝ちパタ...,"[前回, の, 「, プロ, に, 聞く, “, 合コン, の, 極意, ”, （, 前編,...",8,1
1,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...,"[「, 3年, で, 転職, は, 早, すぎる, ？, 」, 「, 将来, が, 見え, ...",0,6
2,3,こんにちは、「ビズリーチ年収1000万円研究所」所長の佐藤和男です。この研究所では、年収10...,"[こんにちは, 、, 「, ビズリーチ, 年収, 1000万円, 研究所, 」, 所長, の...",8,8
3,3,6月7日、表参道のカフェバー「MERCER CAFE TERRACE HOUSE」でHenn...,"[6月7日, 、, 表参道, の, カフェバー, 「, MERCER, CAFE, TERR...",5,2
4,3,「3年で転職は早すぎる？」「将来が見えない」「仕事が面白くない」・・・若手社会人の悩みは尽き...,"[「, 3年, で, 転職, は, 早, すぎる, ？, 」, 「, 将来, が, 見え, ...",7,2


In [115]:
# Accuracy of the length-based predictor: share of rows where the predicted
# label equals the true media label.
is_hit = ld_df.media == ld_df.pred_by_len
acc = is_hit.mean()
print('accuracy', acc)

# Confusion table: rows are true labels, columns are predicted labels.
cross = pd.crosstab(ld_df.media, ld_df.pred_by_len)
styled_cross = cross.style.background_gradient('Blues', axis=None)
display(styled_cross)

accuracy 0.21069780070594624


pred_by_len,0,1,2,3,4,5,6,7,8
media,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,35,282,181,31,30,27,34,107,143
1,65,503,35,54,62,23,2,12,114
2,23,41,185,5,10,6,60,472,61
3,22,199,98,13,23,8,14,67,67
4,43,259,219,30,41,19,23,94,142
5,35,197,210,33,47,24,22,120,154
6,2,6,223,2,0,2,92,399,44
7,5,7,196,3,2,7,64,538,78
8,55,195,196,42,34,14,29,184,121


語の出現頻度

In [4]:
# Tokenize the `text` column directly. Unpacking whole rows from iterrows()
# only works while the frame has exactly two columns; earlier cells add more
# (pred, tokens, pred_by_len), after which that unpacking raises ValueError.
texts = [tokenizer(str(t)) for t in ld_df['text']]

# Corpus-wide token counts, most frequent first.
freq = Counter(itertools.chain.from_iterable(texts)).most_common()
freq[:10]

[('、', 203844),
 ('の', 196481),
 ('。', 134637),
 ('に', 126605),
 ('を', 123117),
 ('が', 112796),
 ('は', 109134),
 ('て', 99219),
 ('で', 81866),
 ('た', 81702)]