In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the cells below call nlp(...) to parse text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the pipeline on a sample sentence; `doc` is reused by the tagging cells that follow.
doc = nlp("The quick brown fox jumped over the lazy doc's back.")

In [3]:
# POS tagging (noun, plural noun, superlative, adjective, verb, ...)
# token.pos_
# token.tag_ (more fine-grained)
# spacy.explain(tag) (returns a description of the tag)

In [4]:
print(doc.text) # as a plain string

The quick brown fox jumped over the lazy doc's back.


In [5]:
# Show the token at index 4 with its coarse POS, its fine-grained tag,
# and the description of that tag.
print(
    doc[4].text, ' - ',
    doc[4].pos_, ' - ',
    doc[4].tag_, ' - ',
    spacy.explain(doc[4].tag_),
)

jumped  -  VERB  -  VBD  -  verb, past tense


In [6]:
# Look up the description of a tag directly from its string name.
spacy.explain('VBD')

'verb, past tense'

In [7]:
# Print one aligned row per token: text, coarse POS, fine-grained tag, tag description.
# The nested {15} / {8} / {10} inside the format specs are the column widths.
for token in doc:
    print(f'{token.text:{15}} {token.pos_:{8}} {token.tag_:{10}} {spacy.explain(token.tag_)}')

The             DET      DT         determiner
quick           ADJ      JJ         adjective (English), other noun-modifier (Chinese)
brown           ADJ      JJ         adjective (English), other noun-modifier (Chinese)
fox             NOUN     NN         noun, singular or mass
jumped          VERB     VBD        verb, past tense
over            ADP      IN         conjunction, subordinating or preposition
the             DET      DT         determiner
lazy            ADJ      JJ         adjective (English), other noun-modifier (Chinese)
doc             NOUN     NN         noun, singular or mass
's              AUX      VBZ        verb, 3rd person singular present
back            ADV      RB         adverb
.               PUNCT    .          punctuation mark, sentence closer


In [12]:
# "I read books on NLP" -> is "read" tagged as a past- or present-tense verb?
doc = nlp('I read books on NLP')
r = doc[1]  # token at index 1: "read"
r

read

In [11]:
# "read" is tagged as a present-tense verb that is not 3rd-person singular.
print(
    r.text, ' - ',
    r.pos_, ' - ',
    r.tag_, ' - ',
    spacy.explain(r.tag_),
)

read  -  VERB  -  VBP  -  verb, non-3rd person singular present


In [13]:
# Same check on a progressive sentence; the token at index 2 is "reading".
doc = nlp('I am reading a book')
r = doc[2]
r

reading

In [14]:
# "reading" -> progressive form (tagged as gerund / present participle).
print(
    r.text, ' - ',
    r.pos_, ' - ',
    r.tag_, ' - ',
    spacy.explain(r.tag_),
)

reading  -  VERB  -  VBG  -  verb, gerund or present participle


### 빈도수

In [15]:
# Turning words into numeric IDs makes it possible to measure frequencies.
doc = nlp("The quick brown fox jumped over the lazy doc's back.")

In [16]:
# Count tokens per POS attribute; the result maps integer POS IDs to occurrence counts.
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 2, 100: 1, 85: 1, 87: 1, 86: 1, 97: 1}

In [18]:
doc.vocab[92].text # decode POS ID 92 to its name -> it occurs twice in the sentence above.

'NOUN'

In [22]:
# View the counts as (ID, count) pairs, the form the sorted loop below iterates over.
POS_counts.items()

dict_items([(90, 2), (84, 3), (92, 2), (100, 1), (85, 1), (87, 1), (86, 1), (97, 1)])

In [19]:
# sorted() orders the (ID, count) pairs by ID; each ID is decoded to its name via doc.vocab.
for key, value in sorted(POS_counts.items()):
    print(f'{key}. {doc.vocab[key].text:{5}}: {value}')

84. ADJ  : 3
85. ADP  : 1
86. ADV  : 1
87. AUX  : 1
90. DET  : 2
92. NOUN : 2
97. PUNCT: 1
100. VERB : 1


In [20]:
# Same kind of count, but over the fine-grained TAG attribute instead of POS.
TAG_counts = doc.count_by(spacy.attrs.TAG)

# Keys are large integer IDs here, so sorting orders the rows by ID, not by tag name.
for key, value in sorted(TAG_counts.items()):
    print(f'{key}. {doc.vocab[key].text:{5}}: {value}')

164681854541413346. RB   : 1
1292078113972184607. IN   : 1
10554686591937588953. JJ   : 3
12646065887601541794. .    : 1
13927759927860985106. VBZ  : 1
15267657372422890137. DT   : 2
15308085513773655218. NN   : 2
17109001835818727656. VBD  : 1
