<a href="https://colab.research.google.com/github/lwachowiak/Evaluating-Crowdsourced-Annotations/blob/main/KBs_for_Annotation_Verification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Yago and DBpedia

How to query Yago and DBpedia to learn more about extracted terms, find relations between them, and verify their correctness 

In [None]:
!pip install SPARQLWrapper

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

DBpedia

In [4]:
def get_description_dbpedia(query):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql") 
    sparql.setReturnFormat(JSON)
    sparql.setQuery(query)  
    return sparql.query().convert()

In [5]:
# What city was C. Ronaldo born in? Funchal
query="""SELECT *
WHERE
{
  ?athlete  rdfs:label      "Cristiano Ronaldo"@en ;
            dbo:birthPlace  ?place .
  ?place    a               dbo:City ;
            rdfs:label      ?cityName .
  FILTER ( LANG ( ?cityName ) = 'en' )
}"""
get_description_dbpedia(query)

{'head': {'link': [], 'vars': ['athlete', 'place', 'cityName']},
 'results': {'bindings': [{'athlete': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Cristiano_Ronaldo'},
    'cityName': {'type': 'literal', 'value': 'Funchal', 'xml:lang': 'en'},
    'place': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Funchal'}}],
  'distinct': False,
  'ordered': True}}

Yago

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://linkeddata1.calcul.u-psud.fr/sparql")
sparql.setQuery("""
    select  *
where {
        <http://yago-knowledge.org/resource/Elvis_Presley> ?property ?valueOrObject .
      } 
LIMIT 100""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

results

In [None]:
# asking for relation between to things
# Here: What is the relation between Albert Einstein and Alfred Kleiner?
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://linkeddata1.calcul.u-psud.fr/sparql")
sparql.setQuery("""
SELECT * WHERE { <http://yago-knowledge.org/resource/Albert_Einstein> ?relation <http://yago-knowledge.org/resource/Alfred_Kleiner> }""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

results

{'head': {'link': [], 'vars': ['relation']},
 'results': {'bindings': [{'relation': {'type': 'uri',
     'value': 'http://yago-knowledge.org/resource/linksTo'}},
   {'relation': {'type': 'uri',
     'value': 'http://yago-knowledge.org/resource/hasAcademicAdvisor'}}],
  'distinct': False,
  'ordered': True}}

In [None]:
# Often, no relations are found, e.g.: 
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://linkeddata1.calcul.u-psud.fr/sparql")
sparql.setQuery("""
SELECT * WHERE { <http://yago-knowledge.org/resource/Giraffe> ?relation <http://yago-knowledge.org/resource/yago:Northern_Giraffe> }""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

results

{'head': {'link': [], 'vars': ['relation']},
 'results': {'bindings': [], 'distinct': False, 'ordered': True}}

# ConceptNet 

How to use the ConceptNet to see how related 2 terms (relatedness score) and to extract a list of all existing relations between the terms

In [None]:
import requests

In [None]:
obj = requests.get('http://api.conceptnet.io/c/en/vw_beetle').json()
obj.keys() # all interesting data is saved in edges

dict_keys(['@context', '@id', 'edges', 'version'])

In [None]:
# number of relations found
len(obj['edges'])

2

In [None]:
# one relation with 2 entities
# the interesting info is saved in "end", "rel", "start"
# there are 34 possible relations https://github.com/commonsense/conceptnet5/wiki/Relations
# here we can learn that: a VW Beetle is a VW
obj['edges'][0]

{'@id': '/a/[/r/IsA/,/c/en/vw_beetle/n/,/c/en/vw/n/]',
 '@type': 'Edge',
 'dataset': '/d/opencyc',
 'end': {'@id': '/c/en/vw/n',
  '@type': 'Node',
  'label': 'vw',
  'language': 'en',
  'sense_label': 'n',
  'term': '/c/en/vw'},
 'license': 'cc:by/4.0',
 'rel': {'@id': '/r/IsA', '@type': 'Relation', 'label': 'IsA'},
 'sources': [{'@id': '/s/resource/opencyc/2012',
   '@type': 'Source',
   'contributor': '/s/resource/opencyc/2012'}],
 'start': {'@id': '/c/en/vw_beetle/n',
  '@type': 'Node',
  'label': 'vw beetle',
  'language': 'en',
  'sense_label': 'n',
  'term': '/c/en/vw_beetle'},
 'surfaceText': None,
 'weight': 1.0}

In [None]:
# check if any relations exist between 2 given concepts 
# What is the relation between giraffe and animal? "Is a"
obj = requests.get("https://api.conceptnet.io/query?node=/c/en/giraffe&other=/c/en/animal").json()
print(obj["edges"][0]["rel"]["label"])
obj["edges"][0]

IsA


{'@id': '/a/[/r/IsA/,/c/en/giraffe/n/,/c/en/animal/]',
 '@type': 'Edge',
 'dataset': '/d/wiktionary/de',
 'end': {'@id': '/c/en/animal',
  '@type': 'Node',
  'label': 'animal',
  'language': 'en',
  'term': '/c/en/animal'},
 'license': 'cc:by-sa/4.0',
 'rel': {'@id': '/r/IsA', '@type': 'Relation', 'label': 'IsA'},
 'sources': [{'@id': '/and/[/s/process/wikiparsec/2/,/s/resource/wiktionary/de/]',
   '@type': 'Source',
   'contributor': '/s/resource/wiktionary/de',
   'process': '/s/process/wikiparsec/2'}],
 'start': {'@id': '/c/en/giraffe/n',
  '@type': 'Node',
  'label': 'giraffe',
  'language': 'en',
  'sense_label': 'n',
  'term': '/c/en/giraffe'},
 'surfaceText': None,
 'weight': 1.0}

In [None]:
# What is the relation between seat and train? "At location"
obj = requests.get("https://api.conceptnet.io/query?node=/c/en/seat&other=/c/en/train").json()
obj["edges"]

[{'@id': '/a/[/r/AtLocation/,/c/en/seat/,/c/en/train/]',
  '@type': 'Edge',
  'dataset': '/d/conceptnet/4/en',
  'end': {'@id': '/c/en/train',
   '@type': 'Node',
   'label': 'a train',
   'language': 'en',
   'term': '/c/en/train'},
  'license': 'cc:by/4.0',
  'rel': {'@id': '/r/AtLocation', '@type': 'Relation', 'label': 'AtLocation'},
  'sources': [{'@id': '/and/[/s/activity/omcs/omcs1_possibly_free_text/,/s/contributor/omcs/bedume/]',
    '@type': 'Source',
    'activity': '/s/activity/omcs/omcs1_possibly_free_text',
    'contributor': '/s/contributor/omcs/bedume'}],
  'start': {'@id': '/c/en/seat',
   '@type': 'Node',
   'label': 'a seat',
   'language': 'en',
   'term': '/c/en/seat'},
  'surfaceText': 'You are likely to find [[a seat]] in [[a train]]',
  'weight': 1.0},
 {'@id': '/a/[/r/RelatedTo/,/c/en/seat/,/c/en/train/]',
  '@type': 'Edge',
  'dataset': '/d/verbosity',
  'end': {'@id': '/c/en/train',
   '@type': 'Node',
   'label': 'train',
   'language': 'en',
   'term': '/c/e

In [None]:
# If no relation found, go with relatedness score 
# What is the relatedness score between food and giraffe?
obj = requests.get("https://api.conceptnet.io/relatedness?node1=/c/en/giraffe&node2=/c/en/food").json()
obj["value"]

0.036

In [None]:
obj = requests.get("https://api.conceptnet.io/relatedness?node1=/c/en/giraffe&node2=/c/en/mammal").json()
obj["value"]

0.404

In [None]:
obj = requests.get("https://api.conceptnet.io/relatedness?node1=/c/en/giraffe&node2=/c/en/lion").json()
obj["value"]

0.36

In [None]:
obj = requests.get("https://api.conceptnet.io/relatedness?node1=/c/en/giraffe&node2=/c/en/africa").json()
obj["value"]

0.326

In [None]:
obj = requests.get("https://api.conceptnet.io/relatedness?node1=/c/en/vw_beetle&node2=/c/en/car").json()
obj["value"]

0.592

# Term/NER/Noun Extraction

## NER 

An example of using the neural Stanford NLP library stanza for NER

In [None]:
!pip install stanza
import stanza

In [164]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp("Max Mustermann teaches at Stanford University, he lives in the Bay Area and loves giraffes and tower bells.")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-05 15:38:29 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-05-05 15:38:29 INFO: Use device: cpu
2022-05-05 15:38:29 INFO: Loading: tokenize
2022-05-05 15:38:29 INFO: Loading: ner
2022-05-05 15:38:30 INFO: Done loading processors!


In [165]:
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents], sep='\n')

entity: Max Mustermann	type: PERSON
entity: Stanford University	type: ORG
entity: the Bay Area	type: LOC


## All Nouns

Here, we use the stanza dependency parser to extract all nouns & noun compounds

In [166]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp('Max Mustermann teaches at Stanford University, he lives in the Bay Area and loves giraffes and towers bells.')
print(*[f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-05-05 15:38:32 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-05-05 15:38:32 INFO: Use device: cpu
2022-05-05 15:38:32 INFO: Loading: tokenize
2022-05-05 15:38:32 INFO: Loading: pos
2022-05-05 15:38:32 INFO: Loading: lemma
2022-05-05 15:38:33 INFO: Loading: depparse
2022-05-05 15:38:33 INFO: Done loading processors!


id: 1	word: Max	head id: 3	head: teaches	deprel: nsubj
id: 2	word: Mustermann	head id: 1	head: Max	deprel: flat
id: 3	word: teaches	head id: 0	head: root	deprel: root
id: 4	word: at	head id: 6	head: University	deprel: case
id: 5	word: Stanford	head id: 6	head: University	deprel: compound
id: 6	word: University	head id: 3	head: teaches	deprel: obl
id: 7	word: ,	head id: 9	head: lives	deprel: punct
id: 8	word: he	head id: 9	head: lives	deprel: nsubj
id: 9	word: lives	head id: 3	head: teaches	deprel: parataxis
id: 10	word: in	head id: 13	head: Area	deprel: case
id: 11	word: the	head id: 13	head: Area	deprel: det
id: 12	word: Bay	head id: 13	head: Area	deprel: compound
id: 13	word: Area	head id: 9	head: lives	deprel: obl
id: 14	word: and	head id: 15	head: loves	deprel: cc
id: 15	word: loves	head id: 9	head: lives	deprel: conj
id: 16	word: giraffes	head id: 15	head: loves	deprel: obj
id: 17	word: and	head id: 19	head: bells	deprel: cc
id: 18	word: towers	head id: 19	head: bells	deprel: comp

In [167]:
doc

[
  [
    {
      "id": 1,
      "text": "Max",
      "lemma": "Max",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 3,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 3
    },
    {
      "id": 2,
      "text": "Mustermann",
      "lemma": "Mustermann",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 1,
      "deprel": "flat",
      "start_char": 4,
      "end_char": 14
    },
    {
      "id": 3,
      "text": "teaches",
      "lemma": "teach",
      "upos": "VERB",
      "xpos": "VBZ",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "head": 0,
      "deprel": "root",
      "start_char": 15,
      "end_char": 22
    },
    {
      "id": 4,
      "text": "at",
      "lemma": "at",
      "upos": "ADP",
      "xpos": "IN",
      "head": 6,
      "deprel": "case",
      "start_char": 23,
      "end_char": 25
    },
    {
      "id": 5,
      "text": "Stanford",

In [168]:
list_of_nouns=[]
for sent in doc.sentences:
  for w in sent.words:
    if w.upos in ["NOUN", "PROPN"]:
      noun=w.text
      # add additional word if compound 
      if w.deprel=="compound":
        for w2 in sent.words:
          if w2.id==w.head:
            noun+=" "+w2.text
      list_of_nouns.append(noun)
  
list_of_nouns

['Max',
 'Mustermann',
 'Stanford University',
 'University',
 'Bay Area',
 'Area',
 'giraffes',
 'towers bells',
 'bells']

## Term Extraction

Lastly, we use automatic term extraction provided by the ELG Text2TCS library. This extracts all domain specific terms independent from their wordclass.

In [170]:
!pip install elg



In [171]:
from elg import Service
service = Service.from_id(8122)
result = service(request_input="Max Mustermann teaches at Stanford University, he lives in the Bay Area and loves giraffes and tower bells.", request_type="text")

Calling:
	[8122] Text to Terminological Concept System
with request:
	type: text - content: Max Mustermann teaches at Stanford University, he lives in the Bay Area and loves giraffes and tower bells. - mimeType: text/plain

Progress: 44.4%
Progress: 44.4%
Progress: 100.0%
Progress: 100.0%


In [172]:
for c in result["annotations"]:
  print(c)

c1: teaches
c2: Stanford University
c3: Bay Area
c4: giraffes
c5: tower bells
