# spaCy Training: Word Tagging

Documentation : https://spacy.io/usage#installation

In [1]:
# Download spacy library
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.4-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp312-cp312-win_amd64.

In [4]:
# Download pre-trained model (if need to run in local)
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 880.9 kB/s eta 0:00:15
      --------------------------------------- 0.2/12.8 MB 1.4 MB/s eta 0:00:09
     - -------------------------------------- 0.4/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.5/12.8 MB 2.2 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.8/12.8 MB 2.6 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/12.8 MB 2.8 MB/s eta 0:00:05
     --- ------------------------------------ 1.2/12.8 MB 2.9 MB/s eta 0:00:05
     ---- ----------------------------------- 1

In [5]:
# Bring in spaCy and load the pre-trained English pipeline
import spacy

MODEL_NAME = 'en_core_web_sm'  # the package fetched by the download cell
nlp = spacy.load(MODEL_NAME)

In [20]:
# Run the example sentence through the pipeline

# The raw text that will be processed
sentence = "President Jokowi will complete his duties as the President of the Republic of Indonesia on October 20, 2024"

# Calling the pipeline on the text gives the object kept in `doc`,
# which the following cells iterate over token by token
doc = nlp(sentence)

In [21]:
# Print just the text of every token, one per line
for tok in doc:
    print(tok.text)

President
Jokowi
will
complete
his
duties
as
the
President
of
the
Republic
of
Indonesia
on
October
20
,
2024


In [29]:
# Print the language code reported by each token, one per line
for tok in doc:
    lang_code = tok.lang_
    print(lang_code)

en
en
en
en
en
en
en
en
en
en
en
en
en
en
en
en
en
en
en


Note that, unfortunately, the Indonesian language is still not supported as of 27 February 2024. Check the available languages at: https://spacy.io/usage/models#languages

See the documentation for all the useful attributes and methods available on a token: https://spacy.io/usage/linguistic-features

In [23]:
# Show, for every token, three tab-separated columns:
#   its text, its `idx` value (0, 10, 17, ... in the output below),
#   and its coarse part-of-speech tag (`pos_`)
row_template = "{0}\t{1}\t{2} "
for tok in doc:
    row = row_template.format(tok.text, tok.idx, tok.pos_)
    print(row)

President	0	PROPN 
Jokowi	10	PROPN 
will	17	AUX 
complete	22	VERB 
his	31	PRON 
duties	35	NOUN 
as	42	ADP 
the	45	DET 
President	49	PROPN 
of	59	ADP 
the	62	DET 
Republic	66	PROPN 
of	75	ADP 
Indonesia	78	PROPN 
on	88	ADP 
October	91	PROPN 
20	99	NUM 
,	101	PUNCT 
2024	103	NUM 


In [30]:
# Visualise the named entities found in the sentence
from spacy import displacy

# style='ent' selects the entity view; jupyter=True asks displaCy to render in the notebook
displacy.render(doc, style='ent', jupyter=True)

In [27]:
# Visualise the dependency tree of the sentence
# (relies on `displacy` imported in the named-entity cell)
dep_options = {'distance': 100}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)