# **spaCy for Entity Extraction**

NER -> Named Entity Recognition

![alt text](https://miro.medium.com/max/1400/1*qQggIPMugLcy-ndJ8X_aAA.png)

In [None]:
# NER
# Named Entity Recognition

# NLP
# Natural language Processing

In [None]:
# !pip install spacy

In [None]:
# Download the en_core_web_sm model package that a later cell imports.
!python -m spacy download en_core_web_sm -q


[38;5;2m[+] Download and installation successful[0m

2023-06-09 22:24:36.457214: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-06-09 22:24:36.457260: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.5.tar.gz


In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English pipeline once; every later cell reuses `nlp`.
nlp = en_core_web_sm.load()

# Parse a short sentence and show each token with its IOB flag and entity type.
doc = nlp('I spent $25 today')
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(I, 'O', ''), (spent, 'O', ''), ($, 'O', ''), (25, 'B', 'MONEY'), (today, 'B', 'DATE')]


In [None]:
# A longer news sentence: multi-token entities show up as a 'B' token followed by 'I' tokens.
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
print([(token, token.ent_iob_, token.ent_type_) for token in doc])


[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'O', ''), (record, 'O', ''), ($, 'B', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [None]:
# Highlight the entities of the sentence parsed above.
# `doc` is already a parsed Doc, so it is rendered directly instead of being re-parsed.
displacy.render(doc, jupyter=True, style='ent')


In [None]:
# Draw the dependency tree of the same sentence; 'distance' sets the spacing between words.
# `doc` is already a parsed Doc, so it is rendered directly instead of being re-parsed.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

# **How a sentence is written has a big impact on the result**
For example, when a word begins with a capital letter, spaCy is more likely to identify it correctly as an entity.

In [None]:
# Baseline sentence for the comparisons in the following cells.
doc = nlp('Miss. Smritika  book a flight from Mumbai to Indore in india with $3500 at 9am in the morning')
# Render the parsed Doc directly; re-parsing its text would only repeat the same work.
displacy.render(doc, jupyter=True, style='ent')


In [None]:
# Variant: 'Indore' lower-cased to 'indore'.
doc = nlp('Miss. Smritika  book a flight from Mumbai to indore in india with $3500 at 9am in the morning')
# Render the parsed Doc directly; re-parsing its text would only repeat the same work.
displacy.render(doc, jupyter=True, style='ent')


In [None]:
# Variant: 'in india' removed.
doc = nlp('Miss. Smritika  book a flight from Mumbai to Indore with $3500 at 9am in the morning')
# Render the parsed Doc directly; re-parsing its text would only repeat the same work.
displacy.render(doc, jupyter=True, style='ent')


In [None]:
# Variant: 'Miss.' removed, '$3500' written as '3500 dollar', and ' and scored 100 marks' appended.
doc = nlp('Smritika  book a flight from Mumbai to Indore with 3500 dollar at 9am in the morning and scored 100 marks')
# Render the parsed Doc directly; re-parsing its text would only repeat the same work.
displacy.render(doc, jupyter=True, style='ent')


In [None]:
# Variant: surname 'Sadhukhan' added after the first name (no 'Miss.' prefix).
doc = nlp('Smritika Sadhukhan book a flight from Mumbai to Indore in india with $3500 at 9am in the morning')
# Render the parsed Doc directly; re-parsing its text would only repeat the same work.
displacy.render(doc, jupyter=True, style='ent')


# **Using spaCy on Web-Scraped Data**

In [None]:
# html5lib is the parser BeautifulSoup is asked to use in the scraping cell.
# The explicit %pip magic installs into the running kernel's environment and does not depend on automagic.
%pip install html5lib

Collecting html5libNote: you may need to restart the kernel to use updated packages.
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)

Installing collected packages: html5lib
Successfully installed html5lib-1.1


In [None]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never silently analysed as if it were the article.
    requests.RequestException
        For connection failures and timeouts.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable page content.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the Lorem Ipsum page as plain text.
# NOTE(review): the name `ny_bb` does not describe this URL — consider renaming.
ny_bb = url_to_string('https://www.lipsum.com/')
# Run the spaCy pipeline over the whole page text.
article = nlp(ny_bb)
# Number of entities found in the page.
len(article.ents)

119

In [None]:
# Page text with leading/trailing whitespace removed.
# NOTE(review): `article1` is not referenced by any other cell visible here — confirm it is still needed.
article1= str(article).strip()

## Frequency of NER found

In [None]:
# Tally how many entities carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 51,
         'ORG': 23,
         'NORP': 7,
         'GPE': 9,
         'DATE': 10,
         'PRODUCT': 2,
         'LANGUAGE': 2,
         'CARDINAL': 11,
         'WORK_OF_ART': 1,
         'ORDINAL': 2,
         'TIME': 1})

## Most Common NER

In [None]:
# The three entity texts that occur most often in the page.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Lorem Ipsum', 19), ('Latin', 4), ('45', 4)]

# **Visualising a single sentence**

In [None]:
# Print one sentence per line. Runs of spaces are collapsed to a single space;
# deleting them outright would glue neighbouring words together.
for sent in article.sents:
    print(re.sub(r' {2,}', ' ', sent.text))

 Lorem Ipsum - All the facts - Lipsum generatorՀայերեն Shqip ‫العربية Български Català 中文简体 Hrvatski Česky Dansk Nederlands English Eesti Filipino Suomi Français ქართული Deutsch Ελληνικά ‫עברית हिन्दी Magyar Indonesia Italiano Latviski Lietuviškai македонски Melayu Norsk
Polski Português Româna Pyccкий Српски Slovenčina Slovenščina Español Svenska ไทย
Türkçe Українська Tiếng ViệtLorem Ipsum "Neque porro quisquam est qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit..."
"There is no one who loves pain itself, who seeks after it and wants to have it, simply because it is pain..." 
What is Lorem Ipsum?
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.
It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.
It was po

In [None]:
# Highlight every entity in the scraped page.
# `article` is already a parsed Doc, so the pipeline is not run a second time.
displacy.render(article, jupyter=True, style='ent')


In [None]:
# Draw the dependency parse of the scraped page; 'distance' sets the spacing between words.
# `article` is already a parsed Doc, so the pipeline is not run a second time.
displacy.render(article, style='dep', jupyter=True, options={'distance': 70})

## **Words with their lemma, POS tag and entity type**

In [None]:
# All tokens of the already-parsed article; there is no need to run the pipeline again.
list1 = list(article)

# Is the fourth token a stop word?
list1[3].is_stop

False

In [None]:
# Print every token that spaCy flags as a stop word, one per line.
for token in list1:
    if not token.is_stop:
        continue
    print(token)

All
the
There
is
no
one
who
itself
who
after
it
and
to
have
it
because
it
is
What
is
is
of
the
and
has
been
the
's
ever
since
the
when
an
a
of
and
it
to
make
a
It
has
not
only
five
but
also
the
into
It
was
in
the
with
the
of
and
more
with
of
Why
do
we
it
It
is
a
that
a
will
be
by
the
of
a
when
at
its
The
of
using
is
that
it
has
a
more
or
less
of
as
to
using
here
here
it
Many
and
now
as
their
and
a
for
will
many
still
in
their
Various
have
over
the
sometimes
by
sometimes
on
and
the
Where
does
it
from
to
is
not
It
has
in
a
of
from
it
over
a
at
in
up
one
of
the
more
from
a
and
through
the
of
the
in
the
from
and
of
The
of
and
by
in
This
is
a
on
the
of
very
during
the
The
first
of
from
a
in
of
used
since
the
is
below
for
those
and
from
by
are
also
in
their
by
from
the
by
Where
can
I
get
some
There
are
many
of
of
but
the
have
in
some
by
or
which
do
n't
even
If
you
are
to
a
of
you
to
be
there
is
n't
anything
in
the
of
All
the
on
the
to
as
this
the
first
on
the
It
a
of
over
with
a
of
to
which


In [None]:

# (token, POS tag, lemma, entity type) for every token that is neither a stop word
# nor punctuation. The already-parsed `article` is iterated directly.
[
    (token, token.pos_, token.lemma_, token.ent_type_)
    for token in article
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[( , 'SPACE', ' ', ''),
 (Lorem, 'PROPN', 'Lorem', 'PERSON'),
 (Ipsum, 'PROPN', 'Ipsum', 'PERSON'),
 (facts, 'NOUN', 'fact', ''),
 (Lipsum, 'PROPN', 'Lipsum', ''),
 (generator, 'NOUN', 'generator', ''),
 (             , 'SPACE', '             ', ''),
 (Հայերեն, 'PROPN', 'Հայերեն', ''),
 (Shqip, 'PROPN', 'Shqip', ''),
 (‫العربية, 'PROPN', '\u202bالعربية', ''),
 (Български, 'PROPN', 'Български', 'ORG'),
 (Català, 'PROPN', 'Català', 'ORG'),
 (中文简体, 'ADJ', '中文简体', ''),
 (Hrvatski, 'PROPN', 'Hrvatski', 'PERSON'),
 (Česky, 'PROPN', 'Česky', 'PERSON'),
 (Dansk, 'PROPN', 'Dansk', 'PERSON'),
 (Nederlands, 'PROPN', 'Nederlands', ''),
 (English, 'PROPN', 'English', ''),
 (Eesti, 'PROPN', 'Eesti', ''),
 (Filipino, 'PROPN', 'Filipino', 'NORP'),
 (Suomi, 'PROPN', 'Suomi', ''),
 (Français, 'PROPN', 'Français', ''),
 (ქართული, 'NOUN', 'ქართული', ''),
 (Deutsch, 'PROPN', 'Deutsch', 'PERSON'),
 (Ελληνικά, 'PROPN', 'Ελληνικά', 'PERSON'),
 (‫עברית, 'PROPN', '\u202bעברית', ''),
 (हिन्दी, 'NOUN', 'हिन्दी', 

##  **ALL THE RELEVANT WORDS WITH ENTITY TYPE**

In [None]:
# Print each distinct entity text with its label. Building a dict keeps one entry per
# text; when the same text occurs again, the later label replaces the earlier one.
# Runs of spaces are collapsed to a single space, since deleting them would glue words together.
for text, label in {ent.text: ent.label_ for ent in article.ents}.items():
    print((re.sub(r' {2,}', ' ', text), label))


('Lorem Ipsum - All', 'PERSON')
('Български Català', 'ORG')
('Hrvatski Česky Dansk', 'PERSON')
('Filipino', 'NORP')
('Deutsch Ελληνικά', 'PERSON')
('Magyar', 'GPE')
('Indonesia', 'GPE')
('Српски', 'GPE')
('Svenska', 'GPE')
('Türkçe Українська', 'PERSON')
('Lorem Ipsum', 'PERSON')
('qui dolorem', 'PERSON')
('consectetur', 'GPE')
('adipisci velit', 'PERSON')
('the 1500s', 'DATE')
('only five centuries', 'DATE')
('the 1960s', 'DATE')
('Letraset', 'ORG')
('Aldus', 'ORG')
('PageMaker', 'PRODUCT')
('English', 'LANGUAGE')
('the years', 'DATE')
('Latin', 'NORP')
('45', 'CARDINAL')
('BC', 'ORG')
('2000 years old', 'DATE')
('Richard McClintock', 'PERSON')
('Hampden-Sydney College', 'ORG')
('Virginia', 'GPE')
('one', 'CARDINAL')
('1.10.32', 'CARDINAL')
('The Extremes of Good and Evil', 'WORK_OF_ART')
('Cicero', 'ORG')
('Renaissance', 'ORG')
('first', 'ORDINAL')
('Lorem', 'PERSON')
('1914', 'DATE')
('H. Rackham', 'PERSON')
('200', 'CARDINAL')
('Loremipsum', 'PERSON')
('PayPal', 'ORG')
('16UQLq1HZ3

In [None]:
import nlp_preprocessing

ModuleNotFoundError: No module named 'tensorflow.python.eager.polymorphic_function'

In [None]:
# %pip installs into the environment of the running kernel; `!pip` may target a different Python.
%pip install nlp-preprocessing

Collecting nlp-preprocessing
  Downloading nlp_preprocessing-0.2.0-py3-none-any.whl (19 kB)
Collecting deepdish
  Downloading deepdish-0.3.7-py2.py3-none-any.whl (37 kB)
Installing collected packages: deepdish, nlp-preprocessing
Successfully installed deepdish-0.3.7 nlp-preprocessing-0.2.0
