# Using spaCy for NLP tasks

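I installed spaCy with CUDA 11.x GPU support and the transformer-based English pipeline (`en_core_web_trf`) using the following commands: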

```bash
pip install -U pip setuptools wheel
pip install -U 'spacy[cuda11x]'
python -m spacy download en_core_web_trf
```

In [2]:
import spacy

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without a working CUDA setup.
# This has to happen before the pipeline is loaded.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_trf")

In [3]:
# Run the loaded pipeline on a short biography; the resulting Doc is reused by
# the cells below.
doc = nlp(
    'Leonard Simon Nimoy was born on March 26, 1931, in an Irish section '
    'of West End of Boston, Massachusetts, to Jewish immigrants from Iziaslav, Ukraine. '
    'His mother, Dora (née Spinner; 1904–1987), was a homemaker, and his father, '
    'Max Nimoy (1901–1987), owned a barbershop in the Mattapan section of Boston. '
    'Leonard Simon Nimoy was an American actor, famed for playing Spock in the Star Trek '
    'franchise for almost 50 years.'
)

# Show text, part-of-speech tag and dependency label for the first 10 tokens.
for tok in doc[:10]:
    print(tok.text, tok.pos_, tok.dep_)

Leonard PROPN compound
Simon PROPN compound
Nimoy PROPN nsubjpass
was AUX auxpass
born VERB ROOT
on ADP prep
March PROPN pobj
26 NUM nummod
, PUNCT punct
1931 NUM nummod


In [4]:
# doc.sents - an iterator over the sentences in the Doc object.
# Sentences are numbered from 1. The counter is deliberately not called `id`:
# at notebook top level that name would shadow the built-in id() for every
# later cell.
for sent_no, sent in enumerate(doc.sents, start=1):
  print(f'Sentence {sent_no}: {sent}')

Sentence 1: Leonard Simon Nimoy was born on March 26, 1931, in an Irish section of West End of Boston, Massachusetts, to Jewish immigrants from Iziaslav, Ukraine.
Sentence 2: His mother, Dora (née Spinner; 1904–1987), was a homemaker, and his father, Max Nimoy (1901–1987), owned a barbershop in the Mattapan section of Boston.
Sentence 3: Leonard Simon Nimoy was an American actor, famed for playing Spock in the Star Trek franchise for almost 50 years.


In [5]:
# One row per token: (index, text, lemma, POS, fine-grained tag, dependency label).
token_details = [
  (idx, token.text, token.lemma_, token.pos_, token.tag_, token.dep_)
  for idx, token in enumerate(doc)
]

In [6]:
from tabulate import tabulate

# Only the first 25 token rows are shown to keep the table short.
print(
    tabulate(
        token_details[:25],
        headers=['ID', 'TEXT', 'LEMMA', 'POS', 'TAG', 'DEP'],
    )
)

  ID  TEXT           LEMMA          POS    TAG    DEP
----  -------------  -------------  -----  -----  ---------
   0  Leonard        Leonard        PROPN  NNP    compound
   1  Simon          Simon          PROPN  NNP    compound
   2  Nimoy          Nimoy          PROPN  NNP    nsubjpass
   3  was            be             AUX    VBD    auxpass
   4  born           bear           VERB   VBN    ROOT
   5  on             on             ADP    IN     prep
   6  March          March          PROPN  NNP    pobj
   7  26             26             NUM    CD     nummod
   8  ,              ,              PUNCT  ,      punct
   9  1931           1931           NUM    CD     nummod
  10  ,              ,              PUNCT  ,      punct
  11  in             in             ADP    IN     prep
  12  an             an             DET    DT     det
  13  Irish          irish          ADJ    JJ     amod
  14  section        section        NOUN   NN     pobj
  15  of             of             ADP 

In [7]:
from spacy import displacy

# Draw one dependency diagram per sentence instead of one for the whole Doc.
for sentence in doc.sents:
  displacy.render(
    sentence,
    style="dep",
    jupyter=True,
    options={'distance': 100},
  )

In [8]:
# One row per named entity: (text, start character offset, end character offset, label).
ner_details = [
  (ent.text, ent.start_char, ent.end_char, ent.label_)
  for ent in doc.ents
]

In [9]:
import pandas as pd

# pandas is used here only to render the entity rows as a nicely formatted table
pd.DataFrame(
    data=ner_details,
    columns=['TEXT', 'START', 'END', 'LABEL'],
)

Unnamed: 0,TEXT,START,END,LABEL
0,Leonard Simon Nimoy,0,19,PERSON
1,"March 26, 1931",32,46,DATE
2,Irish,54,59,NORP
3,West End,71,79,LOC
4,Boston,83,89,GPE
5,Massachusetts,91,104,GPE
6,Jewish,109,115,NORP
7,Iziaslav,132,140,GPE
8,Ukraine,142,149,GPE
9,Dora,163,167,PERSON


In [10]:
from spacy import displacy

# Let's use displacy's entity visualizer (style='ent') to display the named
# entities found in the whole Doc.
displacy.render(doc, style='ent', jupyter=True)

In [11]:
# Look up the human-readable description of the entity label 'NORP'.
spacy.explain('NORP')

'Nationalities or religious or political groups'

## WordNet

In [15]:
import nltk
# Download the WordNet corpus into the local nltk_data directory
# (reported as already up-to-date when it is present).
nltk.download('wordnet')

# Next we import the WordNet corpus reader from nltk under the short alias `wn`.
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/michael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# List every synset (word sense) WordNet has for the word "star".
wn.synsets("star")

[Synset('star.n.01'),
 Synset('ace.n.03'),
 Synset('star.n.03'),
 Synset('star.n.04'),
 Synset('star.n.05'),
 Synset('headliner.n.01'),
 Synset('asterisk.n.01'),
 Synset('star_topology.n.01'),
 Synset('star.v.01'),
 Synset('star.v.02'),
 Synset('star.v.03'),
 Synset('leading.s.01')]

In [17]:
# or look up all definitions, lexical information and synonyms of a specific synset
# (senses are numbered from 1 in the printed list)
for i, sense in enumerate(wn.synsets("star"), start=1):
  print(i, sense.name(), ": ", sense.lexname(), ", ", sense.definition(), ", ", sense.lemma_names())

1 star.n.01 :  noun.object ,  (astronomy) a celestial body of hot gases that radiates energy derived from thermonuclear reactions in the interior ,  ['star']
2 ace.n.03 :  noun.person ,  someone who is dazzlingly skilled in any field ,  ['ace', 'adept', 'champion', 'sensation', 'maven', 'mavin', 'virtuoso', 'genius', 'hotshot', 'star', 'superstar', 'whiz', 'whizz', 'wizard', 'wiz']
3 star.n.03 :  noun.object ,  any celestial body visible (as a point of light) from the Earth at night ,  ['star']
4 star.n.04 :  noun.person ,  an actor who plays a principal role ,  ['star', 'principal', 'lead']
5 star.n.05 :  noun.shape ,  a plane figure with 5 or more points; often used as an emblem ,  ['star']
6 headliner.n.01 :  noun.person ,  a performer who receives prominent billing ,  ['headliner', 'star']
7 asterisk.n.01 :  noun.communication ,  a star-shaped character * used in printing ,  ['asterisk', 'star']
8 star_topology.n.01 :  noun.cognition ,  the topology of a network whose components ar

In [18]:
def hypernyms(s):
  """Return the direct hypernyms of synset s."""
  return s.hypernyms()

# Follow the hypernym relation transitively, starting from the sense star.n.03.
star = wn.synset("star.n.03")
list(star.closure(hypernyms))

[Synset('celestial_body.n.01'),
 Synset('natural_object.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [19]:
#simplified lesk algorithm
def lesk(sentence, ambiguous_word):
  """Pick the WordNet sense of ambiguous_word that best fits sentence.

  Simplified Lesk: every sense is scored by how many distinct words its
  signature (definition words plus lemma names) shares with the sentence,
  and the highest-scoring sense wins. Ties keep the sense that WordNet
  lists first.

  Words are compared case-insensitively and with leading/trailing
  punctuation removed, so "Star," in the sentence or "(astronomy)" in a
  definition can still match.

  Args:
    sentence: the context as a plain string; it is split on whitespace.
    ambiguous_word: the word to disambiguate, looked up with wn.synsets().

  Returns:
    The best-matching Synset. When no sense shares a word with the
    sentence, the first sense listed by WordNet is returned (conventionally
    the most frequent one) rather than a non-Synset placeholder, so callers
    can always call .definition() on the result. If WordNet has no synsets
    for the word at all, the empty string is returned, as before.
  """
  import string  # local import keeps this cell self-contained

  def normalize(words):
    # lower-case, strip surrounding punctuation, drop tokens that end up empty
    cleaned = (word.lower().strip(string.punctuation) for word in words)
    return {word for word in cleaned if word}

  senses = wn.synsets(ambiguous_word)
  if not senses:
    return ""

  #the context is composed of all the single words in the sentence
  context = normalize(sentence.split())

  #default to the first listed sense; it is replaced by any sense with overlap
  lesk_sense = senses[0]
  max_overlaps = 0

  for sense in senses:
    #signature: the definition words plus the lemmas that share this sense
    signature = normalize(sense.definition().split() + sense.lemma_names())
    overlaps = len(signature & context)

    if overlaps > max_overlaps:
      #the correct sense is the one with the highest overlap
      lesk_sense = sense
      max_overlaps = overlaps

  return lesk_sense

# First example: the context words point to the "actor" reading of "star".
ambiguous_word = 'star'
sentence1 = "The astronomer loves the star who plays the lead role"

answer1 = lesk(sentence1, ambiguous_word)

# Show the chosen synset, then its definition.
print(answer1)
print(answer1.definition())

Synset('star.n.04')
an actor who plays a principal role


In [20]:
# Second example: same ambiguous word, different context sentence.
ambiguous_word = 'star'
sentence2 = "The astronomer loves the star that twinkles in the sky so bright"

answer2 = lesk(sentence2, ambiguous_word)

# Show the chosen synset, then its definition.
print(answer2)
print(answer2.definition())

Synset('star.n.01')
(astronomy) a celestial body of hot gases that radiates energy derived from thermonuclear reactions in the interior
