# Word and Sentence Tokenization for Japanese Text
Sample Code

In [1]:
# List the contents of the notebook's current working directory.
!ls ./

### Sentence Tokenization using konoha
##### Reference
https://github.com/himkt/konoha

In [2]:
# Install konoha together with its SentenceTokenizer extra.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install 'konoha[SentenceTokenizer]'

In [3]:
from konoha import SentenceTokenizer

# Sample passage; its last sentence wraps a quotation that itself contains a "。".
sentence = "私は猫だ。名前なんてものはない。だが，「かわいい。それで十分だろう」。"

# Build a SentenceTokenizer with its default settings and print whatever
# tokenize() returns for the sample passage.
tokenizer = SentenceTokenizer()
print(tokenizer.tokenize(sentence))


### Word Tokenization using fugashi
##### Reference
https://pypi.org/project/fugashi/

In [4]:
# Install fugashi with its UniDic extra, then download the UniDic dictionary data.
# The requirement is quoted so the shell cannot treat the square brackets as a glob
# pattern, and %pip installs into the environment of the running kernel.
%pip install 'fugashi[unidic]'
!python -m unidic download

In [5]:
from fugashi import Tagger

# The option string is handed to the Tagger as-is
# (presumably '-Owakati' selects space-separated output — confirm against the fugashi docs).
tagger = Tagger('-Owakati')
text = "麩菓子は、麩を主材料とした日本の菓子。"
# Last expression of the cell, so the notebook displays the parse result.
tagger.parse(text)

In [5]:
# Install MeCab, its headers, the UTF-8 IPA dictionary and the tools the NEologd
# installer needs. apt-get is used because apt's command-line interface is not
# intended for scripted (non-interactive) use.
!sudo apt-get -y install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file
# Clone only when the checkout is missing, so re-running this cell does not fail
# on an already existing directory.
!test -d mecab-ipadic-neologd || git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
# -n and -y are passed to the NEologd installer unchanged
# (presumably "newest dictionary" and "answer yes" — confirm in its README).
!./mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y
# %pip installs the Python binding into the environment of the running kernel.
%pip install mecab-python3

In [6]:
# Print MeCab's dictionary directory (as reported by mecab-config) with
# "/mecab-ipadic-neologd" appended.
!echo `mecab-config --dicdir`"/mecab-ipadic-neologd"
# List the hard-coded dictionary path that the Tagger cells pass via -d,
# to confirm that it exists on this machine.
!ls /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd

In [7]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install swig

In [8]:
import pprint as pp
from MeCab import Tagger
from typing import Text
import os
from typing import Iterable

# Build a MeCab tagger on the NEologd dictionary: -d names the dictionary directory;
# -r /dev/null presumably makes MeCab ignore any system mecabrc (confirm against
# the MeCab option list).
tagger = Tagger('-r /dev/null -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')
# parseToNode returns the head of a linked list of nodes.
parsed = tagger.parseToNode('サザエさんは走った')
# Step past the head node (presumably the begin-of-sentence marker — confirm)
# and pretty-print the surface text of the node that follows it.
parsed = parsed.next
pp.pprint(parsed.surface.strip())

In [9]:
def _tokenize(sentence: Text) -> Iterable[Text]:
    """Lazily yield the surface form of every token MeCab finds in ``sentence``.

    A new tagger bound to the NEologd dictionary is created on each call.
    Surfaces are whitespace-stripped, and nodes whose stripped surface is
    empty are skipped.

    Args:
        sentence: Text to split into tokens.

    Yields:
        The non-empty, stripped surface string of each node, in order.
    """
    mecab_tagger = Tagger('-r /dev/null -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')

    # Throwaway parse whose result is discarded. NOTE(review): presumably a
    # warm-up for bindings whose first parseToNode call is unreliable —
    # confirm before removing.
    warmup_node = mecab_tagger.parseToNode('サザエさんは走った')
    warmup_node = warmup_node.next

    node = mecab_tagger.parseToNode(sentence)
    while node:
        surface = node.surface.strip()
        if not surface:
            # Nothing to emit for this node; just advance.
            node = node.next
            continue
        yield surface
        node = node.next



In [18]:
def _tokenize_2(sentence: Text) -> Iterable[Text]:
    """Yield the surface form of each token in ``sentence`` using MeCab's default setup.

    Unlike ``_tokenize``, the tagger is created without options, so it uses
    whatever dictionary the MeCab installation selects by default.

    Args:
        sentence: Text to split into tokens.

    Yields:
        The whitespace-stripped surface string of each node, skipping nodes
        whose stripped surface is empty.
    """
    # Imported locally: the notebook's import cell only binds ``Tagger``
    # (``from MeCab import Tagger``), so the bare ``MeCab`` name would raise
    # NameError here when the notebook is run top to bottom.
    import MeCab

    tagger = MeCab.Tagger()
    # Throwaway parse whose result is discarded. NOTE(review): presumably a
    # warm-up for bindings whose first parseToNode call is unreliable —
    # confirm before removing.
    parsed = tagger.parseToNode('サザエさんは走った')
    parsed = parsed.next
    parsed = tagger.parseToNode(sentence)
    while parsed:
        token = parsed.surface.strip()
        if token:
            yield token
        parsed = parsed.next



In [10]:
def _lemmatize(sentence: Text) -> Iterable[Text]:
    """Yield the base (dictionary) form of each token in ``sentence``.

    A new tagger bound to the NEologd dictionary is created on each call.
    NEologd uses the IPA-dictionary feature layout, a comma-separated string:

        0 part of speech
        1-3 part-of-speech subcategories 1 to 3
        4 conjugation type
        5 conjugation form
        6 base form
        7 reading
        8 pronunciation

    Words missing from the dictionary carry only the first seven fields with
    ``*`` as base form. Counting from the end of the list therefore picks the
    wrong field for them, so the base form is read from index 6 and the
    token's surface is used whenever no base form is available.

    Args:
        sentence: Text to lemmatize.

    Yields:
        The base form of each token, or its surface when the base form is
        missing or ``*``. The BOS/EOS marker nodes are skipped.
    """
    base_form_index = 6
    tagger = Tagger('-r /dev/null -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')
    # Throwaway parse whose result is discarded. NOTE(review): presumably a
    # warm-up for bindings whose first parseToNode call is unreliable —
    # confirm before removing.
    parsed = tagger.parseToNode('サザエさんは走った')
    parsed = parsed.next
    parsed = tagger.parseToNode(sentence)
    while parsed:
        features = parsed.feature.split(',')
        if features[0] != 'BOS/EOS':
            if len(features) > base_form_index:
                base_form = features[base_form_index]
            else:
                base_form = '*'
            # '*' means "no value" in the feature string; emit the surface instead.
            yield base_form if base_form != '*' else parsed.surface
        parsed = parsed.next

In [20]:
def _lemmatize_2(sentence: Text) -> Iterable[Text]:
    """Yield the third-from-last feature field of each token in ``sentence``.

    The tagger is created without options, so it uses whatever dictionary the
    MeCab installation selects by default. Each node's ``feature`` string is
    split on commas and the third field from the end is emitted.

    NOTE(review): that field is the base form only for dictionaries whose
    entries have the nine-field IPA layout (part of speech, three
    subcategories, conjugation type, conjugation form, base form, reading,
    pronunciation). Confirm which dictionary the default tagger loads.

    Args:
        sentence: Text to lemmatize.

    Yields:
        ``features[-3]`` for every node except the BOS/EOS markers.
    """
    # Imported locally: the notebook's import cell only binds ``Tagger``
    # (``from MeCab import Tagger``), so the bare ``MeCab`` name would raise
    # NameError here when the notebook is run top to bottom.
    import MeCab

    tagger = MeCab.Tagger()
    # Throwaway parse whose result is discarded. NOTE(review): presumably a
    # warm-up for bindings whose first parseToNode call is unreliable —
    # confirm before removing.
    parsed = tagger.parseToNode('サザエさんは走った')
    parsed = parsed.next
    parsed = tagger.parseToNode(sentence)
    while parsed:
        features = parsed.feature.split(',')
        if features[0] != 'BOS/EOS':
            yield features[-3]
        parsed = parsed.next

In [11]:
# Tokenize a sample sentence with the NEologd-backed tokenizer and print the
# tokens separated by single spaces. `tokens` is a generator, so it is
# exhausted once joined.
sentence = "サザエさんは走った"
tokens = _tokenize(sentence)
tokenized = ' '.join(tokens)
print(tokenized)

In [19]:
# Tokenize the same sample sentence with the default-dictionary tokenizer and
# print the tokens separated by single spaces. `tokens` is a generator, so it
# is exhausted once joined.
sentence = "サザエさんは走った"
tokens = _tokenize_2(sentence)
tokenized = ' '.join(tokens)
print(tokenized)

In [12]:
# Lemmatize the sample sentence and print the results separated by single
# spaces. Despite their names, `tokens` and `tokenized` hold the values
# yielded by _lemmatize here.
sentence = "サザエさんは走った"
tokens = _lemmatize(sentence)
tokenized = ' '.join(tokens)
print(tokenized)

In [13]:
import pprint
# The same sentence repeated three times.
text = "麩を主材料とした日本の菓子。麩を主材料とした日本の菓子。麩を主材料とした日本の菓子。"
# NOTE(review): this rebinds `pp`. The earlier import cell bound it to the
# pprint module (`import pprint as pp`); from here on it is a PrettyPrinter instance.
pp = pprint.PrettyPrinter()
# `tagger` is whichever Tagger an earlier cell left in the kernel namespace.
pp.pprint(tagger.parse(text))

### Word Tokenization using MeCab
##### Reference
https://colab.research.google.com/drive/1k44d8t_jW24dCH5sv0oaWflJtj_rfRvV#scrollTo=gb-sYfHxi7qR

In [1]:
# Install SWIG, MeCab, its development headers and the UTF-8 IPA dictionary.
# Every apt-get call passes -y because a notebook cell cannot answer the
# interactive confirmation prompt.
!apt-get -q -y install swig
!apt-get -y install mecab
!apt-get -y install libmecab-dev
!apt-get -y install mecab-ipadic-utf8
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install mecab-python3
%pip install unidic-lite

In [2]:
import sys
import MeCab

# A Tagger created without options, i.e. with the installation's default settings.
mecab = MeCab.Tagger()

# Parse a sample sentence and print the text that parse() returns.
text = mecab.parse("すもももももももものうち")
print(text)

In [3]:
# Example passage mixing Japanese text with Latin letters, an emoji, full-width
# digits and an emoticon. The trailing backslashes join the three source lines
# into a single string with no newlines in it.
例文 = """どうも、藤本唯美です。XYZ社でデータサイエンティストとして一生懸命働いています。\
場所は東京都の新宿で、最高の街です! 😊 \
なんとラーメン一杯が７００円で食べれます（令和元年１２月１３日現在）。 (^_^)"""

# Parse the passage with the tagger created in the previous cell and print the result.
text = mecab.parse(例文)
print(text)

In [16]:
# str.find returns the 0-based index at which the substring first occurs.
txt = "Hello, welcome to my world."

x = txt.find("welcome")

print(x)

In [22]:
# The same sentence repeated three times.
text = "麩を主材料とした日本の菓子。麩を主材料とした日本の菓子。麩を主材料とした日本の菓子。"

# Index of the first occurrence of each substring, looked up in one pass.
i_loc, i_loc_1, i_loc_2 = [text.find(needle) for needle in ("主材料", "麩を", "とした")]

# One value per line: the three positions, then how many times "主材料" occurs.
print(i_loc, i_loc_1, i_loc_2, text.count("主材料"), sep="\n")

In [24]:
text = "私は猫だ。名前なんてものはない。だが，「かわいい。それで十分だろう」。"
# "猫" and "名前" never appear back to back in the text ("猫だ。名前…"),
# so find() reports -1 (not found).
i_loc = text.find("猫名前")
print(i_loc)

In [18]:
#defining string and substring
str1 = "This dress looks good; you have good taste in clothes."
substr = "good"

#occurrence of word 'good' in whole string
count1 = str1.count(substr)
print(count1)

#occurrence of word 'good' from index 0 to 25
count2 = str1.count(substr,0,25)
print(count2)

In [1]:
# Explicit %pip magic: it does not rely on IPython's automagic being enabled
# and installs into the environment of the running kernel.
%pip install -U ginza ja_ginza

/bin/bash: nvs: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `nvs'
/bin/bash: nvsudo: line 1: syntax error: unexpected end of file
/bin/bash: error importing function definition for `nvsudo'
Collecting ginza
  Downloading ginza-5.1.2-py3-none-any.whl (20 kB)
Collecting ja_ginza
  Downloading ja_ginza-5.1.2-py3-none-any.whl (59.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 59.1/59.1 MB 20.1 MB/s eta 0:00:00
Collecting SudachiPy<0.7.0,>=0.6.2
  Downloading SudachiPy-0.6.6-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 42.0 MB/s eta 0:00:00
Collecting plac>=1.3.3
  Downloading plac-1.3.5-py2.py3-none-any.whl (22 kB)
Collecting SudachiDict-core

In [None]:
# Command-line usage (run in a terminal rather than in this notebook):
#   ginzame
# then enter a sentence to analyze (presumably read from standard input — confirm), e.g.:
#   銀座でランチをご一緒しましょう。   ("Let's have lunch together in Ginza.")