# spaCy 功能测试

## 安装套件

In [None]:
# 安装套件及预先训练的模型
!conda install -c conda-forge spacy
!python -m spacy download zh_core_web_sm# 小型的中文模型
!python -m spacy download en_core_web_sm# 小型的英文模型

^C

[x] No compatible package found for 'zh_core_web_sm#' (spaCy v3.1.2)



2021-09-03 11:14:42.526294: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-09-03 11:14:42.526347: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## 载入相关套件及预先训练的模型

In [1]:
# 载入相关套件
import spacy

In [2]:
# Load the pretrained small English pipeline ("en_core_web_sm") into `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Tokenize an English sentence, then print each token's text,
# coarse part-of-speech tag and dependency label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for word, pos, dep in ((tok.text, tok.pos_, tok.dep_) for tok in doc):
    print(word, pos, dep)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [4]:
# Print the detailed attributes of every token in `doc`, in this order:
# text, lemma, coarse POS, fine-grained tag, dependency label,
# shape, is_alpha flag, is_stop flag.
for tok in doc:
    fields = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*fields)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [3]:
# Parse a sentence and draw its dependency diagram inline.
from spacy import displacy

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)
displacy.render(doc, style="dep")  # "dep" selects the dependency visualizer

In [4]:
# Export the dependency diagram of `doc` to an SVG file on disk.
from pathlib import Path

# With jupyter=False the markup is returned as a string (it is written to disk below)
# rather than being displayed in the notebook.
svg = spacy.displacy.render(doc, style='dep', jupyter=False)
output_path = Path("imag1.svg")
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left the file handle unclosed. It still returns the number
# of characters written, so the cell's displayed result is unchanged.
output_path.write_text(svg, encoding="utf-8")

8866

In [5]:
# Highlight the named entities found in a sentence.
from spacy import displacy

text = (
    "When Sebastian Thrun started working on self-driving cars "
    "at Google in 2007, few people outside of the company took him seriously."
)
doc = nlp(text)
displacy.render(doc, style="ent")  # "ent" selects the entity visualizer

In [5]:
# Tokenize a Traditional Chinese sentence with the small Chinese pipeline and
# print each token's text, coarse POS tag and dependency label.
import spacy

nlp = spacy.load("zh_core_web_sm")
doc = nlp("清華大學位於新竹")
rows = [(tok.text, tok.pos_, tok.dep_) for tok in doc]
for row in rows:
    print(*row)

清華 NOUN nsubj
大 ADV advmod
學位 ADV dep
於 ADP case
新竹 PROPN ROOT


In [6]:
# Tokenize a Simplified Chinese sentence with the small Chinese pipeline and
# print each token's text, coarse POS tag and dependency label.
import spacy

nlp = spacy.load("zh_core_web_sm")
doc = nlp("清华大学位于北京")
for word, pos, dep in ((tok.text, tok.pos_, tok.dep_) for tok in doc):
    print(word, pos, dep)

清华 PROPN compound:nn
大学 NOUN nsubj
位于 VERB ROOT
北京 PROPN dobj


In [10]:
# Render a dependency-parse diagram for the current `doc`
# (the Chinese sentence when the cells above are run in order).
from spacy import displacy

displacy.render(doc, style="dep")

In [11]:
# Show the dependency relation of each token in a Simplified Chinese sentence
# (text, coarse POS tag, dependency label per line).
nlp = spacy.load("zh_core_web_sm")
doc = nlp("清华大学位于北京")
for tok in doc:
    attrs = (tok.text, tok.pos_, tok.dep_)
    print(*attrs)

清华 PROPN compound:nn
大学 NOUN nsubj
位于 VERB ROOT
北京 PROPN dobj


In [15]:
# Tokenize with the medium English pipeline and report, for each token:
# whether it has a vector, the vector's norm, and whether it is
# out of vocabulary (OOV).
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")

report = [(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov) for tok in tokens]
for row in report:
    print(*row)

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


In [16]:
# Similarity comparison with the medium English pipeline.
nlp = spacy.load("en_core_web_md")

# The two sentences under test
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Sentence-level similarity
sentence_score = doc1.similarity(doc2)
print(doc1, "<->", doc2, sentence_score)

# Keyword-level similarity within the first sentence
french_fries = doc1[2:4]  # the span "salty fries"
burgers = doc1[5]         # the token "hamburgers"
keyword_score = french_fries.similarity(burgers)
print(french_fries, "<->", burgers, keyword_score)

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.77994864211694
salty fries <-> hamburgers 0.7304624
