## Setup

必要なパッケージのインストール

In [None]:
!pip install pandas sudachipy sudachidict_core

## Download JCLdic (Full CSV)

CSVフォーマットのJCLdicをダウンロード

In [None]:
import urllib.request

# Fetch the zipped JCLdic CSV and save it under the same file name in the
# current working directory.
url = (
    'https://s3-ap-northeast-1.amazonaws.com/'
    'chakki.jcl.jp/public/jcl_full_mecab.csv.zip'
)
urllib.request.urlretrieve(url, filename='jcl_full_mecab.csv.zip')

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the current working directory.
with zipfile.ZipFile('jcl_full_mecab.csv.zip', mode='r') as file:
  file.extractall(path=None)

## Fix MeCab CSV Column

`genkei` カラムに 「`,`」（カンマ）が入っている行があり、そのままでは列数がずれて後処理の変換時にエラーとなるため、`genkei` 内のカンマを取り除いて修正

In [None]:
# NOTE(review): the unzip step presumably extracts `jcl_full_mecab.csv`; confirm that a file
# with this `_backup` name actually exists before running this cell.
input_csv_file_name = 'jcl_full_mecab_backup.csv'
output_csv_file_name = 'jcl_full_mecab_backup_fix.csv'
fixed_count = 0

# Copy the CSV line by line, repairing rows that split into more than the 13 expected columns.
# Both files are opened as UTF-8 inside one `with` statement so they are closed even if the
# loop raises, and so reading does not depend on the platform's default encoding.
with open(input_csv_file_name, encoding='UTF-8') as input_file, \
     open(output_csv_file_name, 'w', encoding='UTF-8') as output_file:
  for line in input_file:
    cols = line.split(',')
    if len(cols) > 13:
      # Surplus columns are treated as commas inside `genkei` (index 10): keep the first
      # ten columns and the last two, and glue everything in between back together
      # without the commas. The last column still carries the line terminator.
      fixed_cols = cols[:10] + [''.join(cols[10:-2])] + cols[-2:]
      fixed_line = ','.join(fixed_cols)
      print(f"[x]{line}")
      print(f"[o]{fixed_line}")
      output_file.write(fixed_line)
      fixed_count += 1
    else:
      output_file.write(line)

print(f"fixed count: {fixed_count}")

## Convert MeCab format to Sudachi format

Sudachiフォーマットへの変換

In [None]:
import pandas as pd
import unicodedata

# Load the repaired MeCab CSV. The file has no header row, so the 13 column names are
# assigned right after reading.
# na_filter=False keeps every field exactly as written in the file: with the default
# setting pandas converts values such as "NA", "null" or "nan" to NaN, which would turn a
# surface form spelled that way into the string 'nan' (or an empty field) further down.
mecab_dic = pd.read_csv(output_csv_file_name, header=None, encoding='UTF-8',
                        na_filter=False)
mecab_dic.columns = ["hyoso", "left_id", "right_id", "cost", "hinshi",
              "hinshi_sai1", "hinshi_sai2", "hinshi_sai3", "katuyo1",
              "katuyo2", "genkei", "yomi", "hatuon"]

mecab_dic.head()

In [None]:
sudachi_dic = mecab_dic.copy()

# Trie headword: the surface form lower-cased and then NFKC-normalized.
sudachi_dic["midashi_trie"] = sudachi_dic["hyoso"].map(
    lambda x: unicodedata.normalize("NFKC", str(x).lower()))
# The display headword and the normalized form both reuse the trie headword.
sudachi_dic["midashi_hyoji"] = sudachi_dic["midashi_trie"]
sudachi_dic["seiki"] = sudachi_dic["midashi_trie"]
# The remaining fields are not derived from the MeCab data and are filled with "*".
sudachi_dic["zisyo_id"] = "*"
sudachi_dic["bunkatu"] = "*"
sudachi_dic["bunkatu_a"] = "*"
sudachi_dic["bunkatu_b"] = "*"
sudachi_dic["mishiyou"] = "*"

# Keep only the 18 output columns, in output order. Plain column selection is used instead
# of reindex() so that a misspelled column name raises a KeyError rather than silently
# producing an all-NaN column. "bunkatu" sits between "zisyo_id" and "bunkatu_a"; the
# MeCab-only columns (hyoso, genkei, hatuon) are simply not selected.
sudachi_dic = sudachi_dic[
    ["midashi_trie", "left_id", "right_id", "cost", "midashi_hyoji", "hinshi",
     "hinshi_sai1", "hinshi_sai2", "hinshi_sai3", "katuyo1", "katuyo2",
     "yomi", "seiki", "zisyo_id", "bunkatu", "bunkatu_a", "bunkatu_b",
     "mishiyou"]]

# Replace the "組織" sub-category with "一般".
# NOTE(review): presumably needed because the target part-of-speech set has no "組織"
# entry — confirm against the system dictionary.
sudachi_dic = sudachi_dic.replace({'hinshi_sai2': {'組織': '一般'}})

sudachi_dic.head()

In [None]:
# Write the converted entries as UTF-8 CSV, without a header row or the index column.
sudachi_dic.to_csv(
    'sudachi_dic.csv',
    index=False,
    header=False,
    encoding="UTF-8",
)

## Build Sudachi User Dictionary

Sudachiのユーザー辞書を作成  
※ `system.dic` へのパスは環境に依存するため必要に応じて要修正  

In [None]:
!sudachipy ubuild -s .venv/lib/python3.7/site-packages/sudachidict_core/resources/system.dic sudachi_dic.csv