In [1]:
import spacy
from spacy.language import Language

In [2]:
mynlp = spacy.load("en_core_web_sm")    # load the trained English pipeline (the language model) by name

In [3]:
text1 = "Tom is playing football in the city Delhi"

In [4]:
doc1 = mynlp(text1)       # calling the pipeline on a string tokenizes it and returns a Doc object

In [5]:
type(doc1)

spacy.tokens.doc.Doc

In [6]:
mynlp.pipeline    # lists every component (step/layer) of the loaded pipeline as (name, component) pairs

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2299dd255b0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2299dd24110>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2299cab70d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2299dd9f250>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2299dda9010>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2299cab7370>)]

In [7]:
mynlp = spacy.load("en_core_web_sm", exclude=["parser","ner"])  # to leave out a step/layer, list its name in exclude when loading

In [8]:
mynlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2299caba870>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2299f7fc590>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2299f81b510>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2299f81a390>)]

In [9]:
mynlp.pipe_names

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']

In [10]:
mynlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'attribute_ruler': [],
  'lemmatizer': []},
 'attrs': {'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},
  'token.tag': {'assigns': ['tagger'], 'requires': []},
  'doc.tensor': {'assigns': ['tok2vec'], 'requires': []}}}

In [11]:
mynlp.analyze_pipes(pretty=True)

[1m

#   Component         Assigns       Requires   Scores      Retokenizes
-   ---------------   -----------   --------   ---------   -----------
0   tok2vec           doc.tensor                           False      
                                                                      
1   tagger            token.tag                tag_acc     False      
                                                                      
2   attribute_ruler                                        False      
                                                                      
3   lemmatizer        token.lemma              lemma_acc   False      

[38;5;2m✔ No problems found.[0m


{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'attribute_ruler': [],
  'lemmatizer': []},
 'attrs': {'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},
  'token.tag': {'assigns': ['tagger'], 'requires': []},
  'doc.tensor': {'assigns': ['tok2vec'], 'requires': []}}}

In [12]:
# We can also create our own custom NLP pipelines

In [13]:
mycustomnlp = spacy.blank("en")

In [14]:
mycustomnlp.pipeline    # a blank pipeline has no components yet, so this is an empty list

[]

In [15]:
doc2 = mycustomnlp(text1)

In [16]:
type(doc2)

spacy.tokens.doc.Doc

In [17]:
mycustomnlp.add_pipe("tagger")      # add the tagger component (tagging step/layer) to the custom pipeline

<spacy.pipeline.tagger.Tagger at 0x2299e994770>

In [18]:
mycustomnlp.pipeline

[('tagger', <spacy.pipeline.tagger.Tagger at 0x2299e994770>)]

In [19]:
# Tagging, lemmatizing, NER, etc. are known as the components of an NLP pipeline. A component can also be thought of as a function, as we saw in the practical examples above. The pre-built components may not cover everything our use case needs, so we can create our own component/function in plain Python.

In [20]:
# For example, we want to create a component that recognizes whether the given text contains a digit

In [21]:
text2 = 'My roll number is 71'

In [24]:
@Language.component("mydigitcomp")    # decorator: registers the function below as a pipeline component named "mydigitcomp"
def mydigit(doc=None):
    """Print every whitespace-separated word that consists only of digits.

    spaCy calls a pipeline component with the Doc being processed and
    expects the Doc back so the next component can continue with it.

    Args:
        doc: The Doc handed over by the pipeline. When omitted (a direct
            ``mydigit()`` call), the notebook-level ``text2`` is scanned
            instead, which keeps the earlier standalone usage working.

    Returns:
        The same ``doc`` that was passed in (``None`` for a direct call
        without an argument).
    """
    # Scan the text actually being processed rather than always the global text2.
    text = text2 if doc is None else doc.text
    for word in text.split():
        if word.isdigit():
            print(word)
    # A component must hand the Doc back to the pipeline.
    return doc

In [None]:
mydigit()

In [None]:
# Now we need to add our new custom component to the NLP pipeline

In [23]:
from spacy.language import Language

In [25]:
mynlp.add_pipe("mydigitcomp",first = True)

<function __main__.mydigit()>

In [26]:
mynlp.pipeline

[('mydigitcomp', <function __main__.mydigit()>),
 ('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2299caba870>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2299f7fc590>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2299f81b510>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2299f81a390>)]

In [None]:
# Adding custom attributes

In [None]:
dir(doc2)     # Lists the attributes and methods available on the Doc. spaCy may not provide some operation we want to perform on our text, which is why we may need to create our own custom attributes.

In [27]:
doc2

Tom is playing football in the city Delhi

In [28]:
text3 = "I am doing my career as a job in LW"

In [30]:
mynlp = spacy.load("en_core_web_sm") 

In [31]:
doc3 = mynlp(text3)

In [32]:
doc3[9].tag_

'NNP'

In [33]:
spacy.explain('NNP')

'noun, proper singular'

In [35]:
doc3[9].is_company    # Suppose we want a new attribute called is_company. spaCy does not know it yet, so this lookup intentionally raises AttributeError.

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'is_company'

In [36]:
# This process of adding/creating a custom attribute is known as a token extension. Hence, we are registering a new token extension attribute.

In [37]:
from spacy.tokens import Token

In [39]:
# Register the custom token attribute; by default it is False on every token.
# The guard keeps this cell re-runnable: registering a name that already
# exists raises an error unless it is forced.
if not Token.has_extension("is_company"):
    Token.set_extension("is_company", default=False)

In [40]:
doc3[9].is_company = True    # We want the new attribute to be True on the token at index 9, but assigning it directly on the token still raises AttributeError (shown on purpose).

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'is_company'

In [41]:
# It still says that there is no attribute called is_company. That is because a custom extension attribute is not placed directly on the object: spaCy keeps custom extensions under the special `._` attribute (if you run dir(doc3) you will see an attribute named `_`), so the new attribute has to be accessed through `._`.

In [47]:
doc3[9]._.is_company = True    # Custom extension attributes are read and written through ._ , so this is how we set our new attribute on the token at index 9.

In [48]:
# Show every token of doc3 next to the value of its custom is_company flag.
for token in doc3:
    print(token, " : ", token._.is_company)

I  :  False
am  :  False
doing  :  False
my  :  False
career  :  False
as  :  False
a  :  False
job  :  False
in  :  False
LW  :  True
