## 形態素解析まで

In [36]:
from janome.tokenizer import Tokenizer
sample_str = "手をつないだら行ってみよう"

In [24]:
t = Tokenizer()
res = t.tokenize(sample_str)
res

[<janome.tokenizer.Token at 0x11643d9b0>,
 <janome.tokenizer.Token at 0x11643d5c0>,
 <janome.tokenizer.Token at 0x11643d780>,
 <janome.tokenizer.Token at 0x11643d4a8>,
 <janome.tokenizer.Token at 0x11643d6d8>,
 <janome.tokenizer.Token at 0x11643d470>,
 <janome.tokenizer.Token at 0x11643d6a0>,
 <janome.tokenizer.Token at 0x11643d978>]

In [25]:
for r in res:
    print(r)

手	名詞,一般,*,*,*,*,手,テ,テ
を	助詞,格助詞,一般,*,*,*,を,ヲ,ヲ
つない	動詞,自立,*,*,五段・ガ行,連用タ接続,つなぐ,ツナイ,ツナイ
だら	助動詞,*,*,*,特殊・タ,仮定形,だ,ダラ,ダラ
行っ	動詞,自立,*,*,五段・カ行促音便,連用タ接続,行く,イッ,イッ
て	助詞,接続助詞,*,*,*,*,て,テ,テ
みよ	動詞,非自立,*,*,一段,未然ウ接続,みる,ミヨ,ミヨ
う	助動詞,*,*,*,不変化型,基本形,う,ウ,ウ


## 品詞情報の取得と特定品詞の単語のみの抽出

In [26]:
res[0].part_of_speech

'名詞,一般,*,*'

In [27]:
[f.part_of_speech.split(",")[0] for f in res]

['名詞', '助詞', '動詞', '助動詞', '動詞', '助詞', '動詞', '助動詞']

In [28]:
# 1つの品詞だけ
[token.surface for token in res if token.part_of_speech.split(",")[0] == u"名詞"]

['手']

In [29]:
# 複数の品詞
target = [u"名詞", u"動詞"]
[token.surface for token in res if token.part_of_speech.split(",")[0] in target]

['手', 'つない', '行っ', 'みよ']

## 関数化

In [30]:
# type of function
def parser(value_str, tag=u"名詞"):
    t = Tokenizer()
    res = t.tokenize(value_str)
    if isinstance(tag, list):
        return [token.surface for token in res if token.part_of_speech.split(",")[0] in tag]
    else:
        return [token.surface for token in res if token.part_of_speech.split(",")[0] == tag]
    
# type of class's function
class JanomeParser:
    def __init__(self):
        self.t = Tokenizer()
        
    def parser(self, value_str, tag=u"名詞"):
        res = self.t.tokenize(value_str)
        if isinstance(tag, list):
            return [token.surface for token in res if token.part_of_speech.split(",")[0] in tag]
        else:
            return [token.surface for token in res if token.part_of_speech.split(",")[0] == tag]

In [31]:
parser(sample_str)

['手']

In [32]:
t = JanomeParser()
t.parser(sample_str)

['手']

## ユーザー辞書の作成

In [91]:
!pip install pandas

[33mYou are using pip version 19.0.2, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [115]:
words = ["果てない思い", "ウルトラソウル"]
create_user_dic(words, "B'z_dic")

In [113]:
# utils
import pandas as pd

def create_user_dic(words, dic_name):
    """
      TASK: 指定された単語群からJanome指定のユーザー辞書.csvファイルを作成する
      words: []string -> 辞書に登録したい単語群
      dic_name: string -> __.csvに該当するファイル名
      return void
    """
    df = words_to_df(words)
    df = to_janome_csv_style(df)
    save_df_to_csv(df, dic_name)

def words_to_df(words):
    """
      TASK: 単語リストをpandas.DataFrame形式に変換する
      wrods: []string -> 対象の単語群
      return pandas.DataFrame
    """
    to_df_list = [[w] for w in words]
    return pd.DataFrame(words, columns=["単語"])

def to_janome_csv_style(df):
    """
      TASK: 読み込まれたdfをjanomeが指定するユーザー辞書.csv形式に変換する
      df: pandas.DataFrame -> 単語リストから生成されたdf
      return pandas.DataFrame
    """
    return df.assign(
    a=df.pipe(lambda df: -1),
    b=df.pipe(lambda df: -1),
    c=df.pipe(lambda df: 1000),
    d=df.pipe(lambda df: "名詞"),
    e=df.pipe(lambda df: "一般"),
    f=df.pipe(lambda df: "*"),
    g=df.pipe(lambda df: "*"),
    h=df.pipe(lambda df: "*"),
    i=df.pipe(lambda df: "*"),
    j=df.pipe(lambda df: df["単語"]),
    k=df.pipe(lambda df: "*"),
    l=df.pipe(lambda df: "*"),
)

def save_df_to_csv(df, file_name):
    """
      TASK: dfを.csv形式で保存する
      df: pandas.DataFrame -> to_janome_csv_style()の戻り値のdf
      file_name: ファイルの保存名(拡張子は含まない)
      return void
    """
    df.to_csv(f"{file_name}.csv", header=False, index=False, encoding="cp932")

## ユーザー辞書の読み込み

In [116]:
!ls

B'z_dic.csv         janome_sample.ipynb


In [117]:
USER_DIC_PATH = "B'z_dic.csv"

In [118]:
def parser(value_str, dic_path, tag=u"名詞"):
    
    t = Tokenizer(USER_DIC_PATH, udic_enc="cp932")
    res = t.tokenize(value_str)
    if isinstance(tag, list):
        return [token.surface for token in res if token.part_of_speech.split(",")[0] in tag]
    else:
        return [token.surface for token in res if token.part_of_speech.split(",")[0] == tag]

In [119]:
parser("果てない思い")

['果てない思い']

In [120]:
parser("ウルトラソウル")

['ウルトラソウル']

## ベンチマーク

In [83]:
import time
TRY_NUM = 10000

In [84]:
# type of function
results = [common_exec(sample_str, lambda x: parser(x)) for _ in range(0, TRY_NUM)]
print("[info] average time: {0:.5f}s".format(calc_average(results)))

[info] average time: 0.08182s


In [85]:
# type of class's function
# instanceを作るところから記録
start_at = time.time()
j_p = JanomeParser()
end_at = time.time()
elapsed_time_create_instance = end_at - start_at

results = [common_exec(sample_str, lambda x: j_p.parser(x)) for _ in range(0, TRY_NUM)]
print("[info] average time: {0:.5f}s".format(calc_average(results) + elapsed_time_create_instance))

[info] average time: 0.08891s


In [71]:
# for benchmark utils function
def common_exec(value_str, func):
    """
      TASK: 対象の関数の実行にかかった時間を算出
      value_str: string -> 対象の文章
      func: func(string)[]string -> 形態素解析を行う関数
      return []float
    """
    # 時間記録のために現在時刻を取得
    start_at = time.time()
    
    # 無名関数として渡されてきた対象の関数を実行
    func(value_str)
    end_at = time.time()
    
    # 終了時間との差分を算出
    return end_at - start_at

def calc_average(results):
    """
      TASK: []int, []float 配列から平均値を算出
      results: []int, []float -> 記録地をまとめた配列
      return float
    """
    return sum(results) / len(results)