In [1]:
# Install spacy japanese 
# ! pip install spacy[ja]

Collecting sudachipy>=0.4.5; extra == "ja"
  Downloading SudachiPy-0.4.7.tar.gz (67 kB)
[K     |████████████████████████████████| 67 kB 777 kB/s eta 0:00:01
[?25hCollecting sudachidict-core>=20200330; extra == "ja"
  Downloading SudachiDict-core-20200330.tar.gz (6.4 kB)
Collecting dartsclone~=0.9.0
  Downloading dartsclone-0.9.0-cp37-cp37m-macosx_10_13_x86_64.whl (113 kB)
[K     |████████████████████████████████| 113 kB 1.8 MB/s eta 0:00:01
Collecting Cython
  Using cached Cython-0.29.20-cp37-cp37m-macosx_10_9_x86_64.whl (1.9 MB)
Building wheels for collected packages: sudachipy, sudachidict-core
  Building wheel for sudachipy (setup.py) ... [?25ldone
[?25h  Created wheel for sudachipy: filename=SudachiPy-0.4.7-cp37-cp37m-macosx_10_14_x86_64.whl size=278026 sha256=70e0e21755f8d0b45c2e732c7b148a900227bbaa580c37713bc96695304912c3
  Stored in directory: /Users/pema/Library/Caches/pip/wheels/9e/cf/9e/7ee168332d82f39009e06942ccd7a559207a7ca5605156a732
  Building wheel for sudachidict-c

# NOTES
1. for word segmentation and part-of-speech tagging, Spacy uses <b>SudachiPy</b>  <br><br>
2. The vectors were trained using <b>FastText</b> with the same settings as FastText’s word vectors (CBOW, 300 dimensions, character n-grams of length 5)<br><br>
3. <b>Prodigy</b> v1.10 comes with a new annotation interface for tasks like relation extraction and coreference resolution, full-featured audio and video annotation <br><br>


### Using Default Ja lib if you haven't downloaded the model

In [79]:
"""
Default model doesn't support NER. Didn't give similarity as well.
"""
from spacy.lang.ja import Japanese
ja_nlp = Japanese()

text1 = "私は日本語がわかりません" #Watashi wa nihongo ga wakarimasen - i dont know japanese language
doc = ja_nlp(text1)

# Look up the hash for the word "cake"-"ケーキ"
cake_hash = doc.vocab.strings["ケーキ"]
print(cake_hash)

print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

print()

15184338029493711698
Noun phrases: []
Verbs: ['わかる']



### Using downloaded 'ja_core_news_lg'  model


In [98]:
"""
To download the model: 
! python -m spacy download ja_core_news_lg
"""

import spacy
from spacy import displacy


ja_nlp = spacy.load("ja_core_news_lg")

In [82]:
text1 = "私は日本語がわかりません" #Watashi wa nihongo ga wakarimasen - i dont know japanese language
text2 = "私はケーキが好きです。" #Watashi wa kēki ga sukidesu. - i like cakes.
text3 = "私はクッキーが好きです。" #Watashi wa kukkī ga sukidesu. - i like cookies.
text4 = "私はクッキーが好きではありません" #Watashi wa kukkī ga sukide wa arimasen. -  i don't like cookies

doc1 = ja_nlp(text1)
doc2 = ja_nlp(text2)
doc3 = ja_nlp(text3)
doc4 = ja_nlp(text4)



In [83]:
cake_hash = doc1.vocab.strings["ケーキ"]
print(cake_hash)

# Look up the cat_hash to get the string
cake_string = doc1.vocab.strings[cake_hash]
print(cake_string)

15184338029493711698
ケーキ


# word Tokenize

In [84]:
words = [token.text for token in doc1]

print (words)

['私', 'は', '日本', '語', 'が', 'わかり', 'ま', 'せん']


# Sentence Tokenize

In [86]:
test = "私はケーキが好きです。私はクッキーが好きです。"
test_doc = ja_nlp(test)

sent_tokenize = (list(test_doc.sents))
for sent in sent_tokenize:
    print (sent)

私はケーキが好きです。
私はクッキーが好きです。


# Analyzing text

In [95]:
print("Noun phrases:", [chunk.text for chunk in doc1.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc1 if token.pos_ == "VERB"])
print()

print("Find named entities, phrases and concepts")
for entity in doc1.ents:
    print(entity.text, entity.label_)
    
    
print()
print("Linguistic annotations")
for token in doc:
    print(token.text, token.pos_, token.dep_)

print()
print("Linguistic annotations")
for token in doc1:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Noun phrases: ['私', '日本語', 'ません']
Verbs: ['わかる']

Find named entities, phrases and concepts
日本語 LANGUAGE

Linguistic annotations
私 PRON 
は ADP 
日本 PROPN 
語 NOUN 
が SCONJ 
わかり VERB 
ま INTJ 
せん NOUN 

Linguistic annotations
私 私 PRON 代名詞 nsubj x True False
は は ADP 助詞-係助詞 case x True True
日本 日本 PROPN 名詞-固有名詞-地名-国 compound xx True False
語 語 NOUN 名詞-普通名詞-一般 nsubj x True False
が が SCONJ 助詞-接続助詞 case x True True
わかり わかる VERB 動詞-一般 acl xxx True False
ま ま INTJ 感動詞-フィラー dep x True True
せん せん NOUN 名詞-普通名詞-一般 ROOT xx True False


# Dependency Tree

In [103]:
displacy.render(doc1, style="dep")

In [113]:
new_doc1 = ja_nlp("大和さんは佐藤健の父")
new_doc2 = ja_nlp("佐藤健のお母さんはさくらさん")
displacy.render([new_doc1,new_doc2], style="dep")

# Document Categorization


In [112]:
try:
    for ent in doc3.cats:
        print(ent.text, ent)
except Exception as e:
    print (e)
    

# POS TAG

In [104]:
for w in doc1:
    print (w,"==", w.pos_)

私 == PRON
は == ADP
日本 == PROPN
語 == NOUN
が == SCONJ
わかり == VERB
ま == INTJ
せん == NOUN


# NER

In [107]:
try:
    for ent in doc1.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
except Exception as e:
    print (e)
    
displacy.render(doc1, style="ent")

日本語 2 5 LANGUAGE


# Word Similarity

In [89]:
token1 = doc2[2]
token2 = doc3[2]
word_sim_score = token1.similarity(token2)
print("word similarity between '%s' and '%s' is '%e'"%(token1,token2,word_sim_score))

word similarity between 'ケーキ' and 'クッキー' is '6.558005e-01'


# Doc Similarity

In [90]:
sim_score = doc1.similarity(doc2)
print("Similarity between doc1_downloaded and doc2_downloaded:", sim_score)


Similarity between doc1_downloaded and doc2_downloaded: 0.7712370757550278


# similarity between the token and doc

In [91]:
tokenDoc_sim_score= doc1.similarity(token1)
print ("similarity between the token '%s' and doc '%s' is %e"%(token1,doc1,tokenDoc_sim_score))

similarity between the token 'ケーキ' and doc '私は日本語がわかりません' is 3.423057e-01
