In [0]:
### Information Reterival
## Steps: tokenization, POS tagging, Entity Detection, Relation Detection

In [0]:
import nltk
import re
from nltk.tokenize import sent_tokenize

In [0]:
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('ieer')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package ieer to /root/nltk_data...
[nltk_data]   Unzipping corpora/ieer.zip.


True

In [0]:
# for entity detection we use ne_chunking
nltk.download('treebank')
sent = nltk.corpus.treebank.tagged_sents()[0]
print(sent)
print(nltk.ne_chunk(sent, binary=True))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
(S
  (NE Pierre/NNP Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


In [0]:
print(nltk.ne_chunk(sent))
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [0]:
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(ex)))
print(ne_tree)


(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [0]:
ne_tree.draw

<bound method Tree.draw of Tree('S', [Tree('GPE', [('European', 'JJ')]), ('authorities', 'NNS'), ('fined', 'VBD'), Tree('PERSON', [('Google', 'NNP')]), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')])>

In [0]:
### Information extraction

In [0]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [0]:
doc.text.draw()

In [0]:
from nltk.corpus import wordnet as wn

In [0]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [0]:
wn.synsets('dog', pos=wn.VERB)

[Synset('chase.v.01')]

In [0]:
wn.synset('dog.n.01').definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [0]:
wn.synset('dog.n.01').examples()

['the dog barked all night']


In [0]:
wn.synset('dog.n.01').hypernyms()

[Synset('canine.n.02'), Synset('domestic_animal.n.01')]

In [0]:
wn.synset('dog.n.01').root_hypernyms()

[Synset('entity.n.01')]

In [0]:
wn.synset('dog.n.01').hyponyms()

[Synset('basenji.n.01'),
 Synset('corgi.n.01'),
 Synset('cur.n.01'),
 Synset('dalmatian.n.02'),
 Synset('great_pyrenees.n.01'),
 Synset('griffon.n.02'),
 Synset('hunting_dog.n.01'),
 Synset('lapdog.n.01'),
 Synset('leonberg.n.01'),
 Synset('mexican_hairless.n.01'),
 Synset('newfoundland.n.01'),
 Synset('pooch.n.01'),
 Synset('poodle.n.01'),
 Synset('pug.n.01'),
 Synset('puppy.n.01'),
 Synset('spitz.n.01'),
 Synset('toy_dog.n.01'),
 Synset('working_dog.n.01')]

In [0]:
wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))

[Synset('carnivore.n.01')]

In [0]:
lion = wn.synset('lion.n.01')
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

In [0]:
lion.path_similarity(cat)

0.25

In [0]:
lion.path_similarity(dog)

0.16666666666666666

In [0]:
## lch and wup similarities for different relations in the taxonomy tree

In [0]:
import spacy
nlp = spacy.load('en')

In [0]:
doc = nlp(ex)

In [0]:
for x in doc.ents:
    print(x, x.label_)

European NORP
Google ORG
$5.1 billion MONEY
Wednesday DATE


In [0]:
x

practices

In [0]:
x[0].dep_

'aux'

In [0]:
x = list(x.head.lefts)

In [0]:
for x in doc:
    print(x, x.ent_iob_, x.ent_type_)

In [0]:
import spacy


sentence = " Mohamed was born in Cairo. Ahmed was born in Alex.  Ahmed in ZewailCity is in Giza."

spacy_nlp = spacy.load('en')
document = spacy_nlp((sentence))

import spacy

nlp = spacy.load("en_core_web_sm")


persons=[]
gpes=[]
for element in document.ents:
    print("", (element.label_, element))
    if element.label_ =='GPE':
     gpes.append(element)
    if element.label_ =='PERSON':  
     persons.append(element)
ins =['in']*len(persons)   
print (list(zip(persons,ins,gpes)))
print([(X, X.ent_iob_, X.ent_type_) for X in document])

 ('PERSON', Mohamed)
 ('GPE', Cairo)
 ('PERSON', Ahmed)
 ('GPE', Alex)
 ('PERSON', Ahmed)
 ('ORG', ZewailCity)
 ('PERSON', Giza)
[(Mohamed, 'in', Cairo), (Ahmed, 'in', Alex)]
[( , 'O', ''), (Mohamed, 'B', 'PERSON'), (was, 'O', ''), (born, 'O', ''), (in, 'O', ''), (Cairo, 'B', 'GPE'), (., 'O', ''), (Ahmed, 'B', 'PERSON'), (was, 'O', ''), (born, 'O', ''), (in, 'O', ''), (Alex, 'B', 'GPE'), (., 'O', ''), ( , 'O', ''), (Ahmed, 'B', 'PERSON'), (in, 'O', ''), (ZewailCity, 'B', 'ORG'), (is, 'O', ''), (in, 'O', ''), (Giza, 'B', 'PERSON'), (., 'O', '')]


In [0]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import nltk
nltk.download('punkt')

import re
from nltk.tokenize import sent_tokenize
print(nltk.sent_tokenize(sentence))
pattern = [{"POS": {"REGEX": "N"}}]

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("PLoc", pattern)

matches = matcher(document)
print(matches)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[' Mohamed was born in Cairo.', 'Ahmed was born in Alex.', 'Ahmed in ZewailCity is in Giza.']
[]


In [0]:
pe_loc=[]
for sent in nltk.sent_tokenize(sentence):
  doc=nlp(sent)
  childs={}
  for token in doc:
     #print([(token.dep_,token,child,child.tag_) for child in token.children])

     if token.dep_ in ['ROOT','nsubj']:
        usefull_c=[child for child in token.children if child.tag_ in ['NNP','IN']]
        childs[token.dep_]=usefull_c
     elif token.dep_=='prep' :
              usefull_c=[child for child in token.children if child.tag_ in ['NNP']]
              childs[token.dep_]=usefull_c

     else :
      pass
    
  try :  
   pe_loc.append(childs['nsubj']+childs['prep']) 
  except :
    pass
  pe_loc.append(childs['ROOT']+childs['prep'])
  print (childs.keys())
    #print(list(token.children))
    #print(token.head.pos_)
    #print(token.text, token.dep_,"hi"+ token.head.text, token.head.pos_,
    #print([(token,child,child.tag_) for child in token.children])
print(pe_loc)  

dict_keys(['ROOT', 'prep'])
dict_keys(['ROOT', 'prep'])
dict_keys(['nsubj', 'prep', 'ROOT'])
[[Mohamed, in, Cairo], [Ahmed, in, Alex], [in, Giza], [Ahmed, in, Giza]]


In [0]:
ex = 'Mohamed was born in Cairo. Ahmed was born in Alex. Ahmed in ZewailCity which is in Giza. '
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(ex)))
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
doc = nltk.corpus.reader.ieer.IEERDocument(text = ne_tree,  headline='my text')
for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', doc, corpus = 'ieer', pattern = IN):
    print(nltk.sem.rtuple(rel))


[PER: 'Ahmed/NNP'] 'in/IN' [ORG: 'ZewailCity/NNP']
