<a href="https://colab.research.google.com/github/mr-alamdari/NLP-POS-NER/blob/main/NLP_POS_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#POS 
##Part Of Speech

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'this is a sentence about a cat which I have been looking for it, but i discoverd that it had flown over the river')

In [4]:
for token in doc:
  print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:<{10}} {spacy.explain(token.tag_)}")

this       DET        DT         determiner
is         AUX        VBZ        verb, 3rd person singular present
a          DET        DT         determiner
sentence   NOUN       NN         noun, singular or mass
about      ADP        IN         conjunction, subordinating or preposition
a          DET        DT         determiner
cat        NOUN       NN         noun, singular or mass
which      DET        WDT        wh-determiner
I          PRON       PRP        pronoun, personal
have       AUX        VBP        verb, non-3rd person singular present
been       AUX        VBN        verb, past participle
looking    VERB       VBG        verb, gerund or present participle
for        ADP        IN         conjunction, subordinating or preposition
it         PRON       PRP        pronoun, personal
,          PUNCT      ,          punctuation mark, comma
but        CCONJ      CC         conjunction, coordinating
i          PRON       PRP        pronoun, personal
discoverd  VERB       VBP    

In [5]:
pos_counts = doc.count_by(spacy.attrs.POS)

In [6]:
pos_counts

{85: 3, 87: 4, 89: 1, 90: 5, 92: 3, 95: 4, 97: 1, 98: 1, 100: 3}

In [7]:
doc.vocab[85].text

'ADP'

In [8]:
for k, v in sorted(pos_counts.items()):
  print(f'{k}. {doc.vocab[k].text:{10}} {v}')

85. ADP        3
87. AUX        4
89. CCONJ      1
90. DET        5
92. NOUN       3
95. PRON       4
97. PUNCT      1
98. SCONJ      1
100. VERB       3


In [9]:
tag_counts = doc.count_by(spacy.attrs.TAG)

In [10]:
for k, v in sorted(tag_counts.items()):
  print(f'{k}. {doc.vocab[k].text:{10}} {v}')

1292078113972184607. IN         4
1534113631682161808. VBG        1
2593208677638477497. ,          1
3822385049556375858. VBN        2
9188597074677201817. VBP        2
13656873538139661788. PRP        4
13927759927860985106. VBZ        1
15267657372422890137. DT         4
15308085513773655218. NN         3
17109001835818727656. VBD        1
17202369883303991778. WDT        1
17571114184892886314. CC         1


In [11]:
spacy.displacy.render(doc, style='dep', jupyter=True)

In [12]:
options = {'distance': 100, 'compact': 'True', 'color':'#blue', 'bg':'#094379', 'font':'Times'}

In [13]:
spacy.displacy.render(doc, style='dep', jupyter=True, options = options)

In [14]:
doc1 = nlp(u'this is a sentence about a cat. which I have been looking for it. but i discoverd that. it had flown over the river')

In [15]:
for sentence in doc1.sents:
  print(sentence)

this is a sentence about a cat.
which I have been looking for it.
but i discoverd that.
it had flown over the river


In [16]:
spans = list(doc1.sents)

In [17]:
# spacy.displacy.serve(spans, style='dep', options={'distance': 100})

#NER (Named Entity Recognition)

In [18]:
def show_ents(doc):
  if doc.ents:
    for ent in doc.ents:
      print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')
  else:
    print('No Entity')

In [19]:
doc1 = nlp(u'Hi, how are you?')

In [20]:
show_ents(doc1)

No Entity


In [123]:
doc2 = nlp('Hello EEE ESI, go to L.A in the U.S . tomorrow to get 20 dollars. and buy Harry potter book, it was $20 million at first')

In [124]:
show_ents(doc2)

Hello EEE ESI - ORG - Companies, agencies, institutions, etc.
L.A - GPE - Countries, cities, states
U.S - GPE - Countries, cities, states
tomorrow - DATE - Absolute or relative dates or periods
20 dollars - MONEY - Monetary values, including unit
Harry potter book - PERSON - People, including fictional
$20 million - MONEY - Monetary values, including unit
first - ORDINAL - "first", "second", etc.


In [125]:
org = doc2.vocab.strings[u'ORG']

In [126]:
new_ent = spacy.tokens.Span(doc2, 1, 2, label=org)

In [127]:
# doc2.ents = list(doc2.ents) + [new_ent]

In [128]:
doc1 = nlp(u'This is a book-shelf',
          u'I like this bookshelf')

In [129]:
show_ents(doc1)

No Entity


In [130]:
matcher = spacy.matcher.PhraseMatcher(nlp.vocab)

In [131]:
phrase_list = ['book-shelf', 'bookshelf']

In [132]:
phrase_patterns = [nlp(t) for t in phrase_list]

In [133]:
matcher.add('NewProducts', None, *phrase_patterns)

In [134]:
found_matches = matcher(doc1)

In [135]:
found_matches

[(8673304197534513844, 3, 6)]

In [136]:
PROD = doc1.vocab.strings[u'PRODUCT']

In [137]:
new_ents = [spacy.tokens.Span(doc1, match[1], match[2], label=PROD) for match in found_matches]

In [138]:
new_ents

[book-shelf]

In [139]:
# doc1.ents = list(doc1.ents) + new_ents

In [140]:
spacy.displacy.render(doc2, style='ent', jupyter=True)

In [141]:
for sent in doc2.sents:
  spacy.displacy.render(sent, style='ent', jupyter=True)

In [142]:
colors = {'ORG': '#540294'}
options = {'ents': ['PRODUCT', 'ORG'], 'colors':colors}

In [143]:
spacy.displacy.render(doc2, style='ent', jupyter=True, options=options)

#Sentence Segmentation

In [145]:
for cent in doc2.sents:
  print(cent, type(cent))

Hello EEE ESI, go to L.A in the U.S . <class 'spacy.tokens.span.Span'>
tomorrow to get 20 dollars. <class 'spacy.tokens.span.Span'>
and buy Harry potter book, it was $20 million at first <class 'spacy.tokens.span.Span'>


###add a segmentation rule

In [147]:

def set_custom_boundaries(doc):
  for token in doc[:-1]:
    if token.text == ':':
      doc[token.i+1].is_sent_start = True
  return doc

In [149]:
nlp.add_pipe(set_custom_boundaries, before='parser')

In [150]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [151]:
doc3 = nlp(u'Hey You: you are welcome: To this page')

In [152]:
for sent in doc3.sents:
  print(sent)

Hey You:
you are welcome:
To this page


###change a segmentation rule


In [153]:
nlp = spacy.load('en_core_web_sm')

In [154]:
t_str = u'this is a sentence. So what \n\nThis is \nan another one'

In [155]:
print(t_str)

this is a sentence. So what 

This is 
an another one


In [156]:
doc = nlp(t_str)

In [157]:
for sent in doc.sents:
  print(sent)

this is a sentence.
So what 

This is 
an another one


In [159]:
def split_on_newsigns(doc, new_sign='\n'):
  start = 0
  seen_newline = False

  for word in doc:
    if seen_newline:
      yield doc[start: word.i]
      start = word.i
      seen_newline = False
    elif word.text.startswith(new_sign):
      seen_newline = True
  yield doc[start:]

In [160]:
sbd = spacy.pipeline.SentenceSegmenter(nlp.vocab, strategy=split_on_newsigns)

In [161]:
nlp.add_pipe(sbd)

In [162]:
doc = nlp(t_str)

In [164]:
for sent in doc.sents:
  print(sent)

this is a sentence. So what 


This is 

an another one
