# Language technology final project
##### Ilse Kerkhove, Marieke Schelhaas & Nikki van Gurp





In [1]:
!python3 -m spacy download nl_core_news_lg

Collecting nl-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_lg-3.8.0/nl_core_news_lg-3.8.0-py3-none-any.whl (568.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.1/568.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nl-core-news-lg
Successfully installed nl-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('nl_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy

nlp = spacy.load("nl_core_news_lg")

In [3]:
import requests
import re
import time

In [31]:
# Find the most likely entity in the question and return its Wikidata id.
def get_entity(sentence):
    """Look up the Wikidata id for the last named entity in *sentence*.

    Args:
        sentence: Parsed question. Only its ``ents`` sequence is read, and
            the ``text`` of the last entity is used as the search string
            (presumably a spaCy ``Doc``; the caller passes
            ``nlp(question)``).

    Returns:
        The ``id`` of the first ``wbsearchentities`` hit, or ``None`` when
        the sentence has no entities or the response holds no results.
    """
    if not sentence.ents:
        return None

    url = 'https://www.wikidata.org/w/api.php'
    headers = {'User-Agent': 'nikkivgurp'}
    params = {
        'action': 'wbsearchentities',
        'language': 'nl',
        'uselang': 'nl',
        'format': 'json',
        # Only the last recognised entity is searched; this matches the
        # previous behaviour, where the lookup ran after the entity loop.
        'search': sentence.ents[-1].text,
    }

    time.sleep(1)  # throttle so Wikidata does not treat us as a bot
    # The User-Agent header used to be built but never sent with the request.
    response = requests.get(url, params=params, headers=headers)
    try:
        return response.json()['search'][0]['id']
    except (KeyError, IndexError):
        # No 'search' key or an empty hit list: nothing was found.
        return None

# Find candidate property terms in the question with spaCy.
def find_candidates(sentence):
    """Collect words and phrases that may name a Wikidata property.

    Args:
        sentence: The question, either as a raw string (it is then parsed
            with the module-level ``nlp`` pipeline) or as an already parsed
            document.

    Returns:
        A set of strings: lemmas of tokens with a selected dependency
        label, lemmas of verbs other than "zijn", and for every noun chunk
        its root text, the text of the root's head, the chunk lemma and
        the chunk text.
    """
    # Parse the argument itself. The previous version parsed the global
    # `question`, which only worked when the caller happened to use that name.
    doc = nlp(sentence) if isinstance(sentence, str) else sentence

    # NOTE(review): these look like English-style dependency labels; the
    # Dutch model may emit different ones (e.g. `obj`, `obl`) -- confirm
    # against the pipeline's label scheme.
    wanted_deps = ("attr", "dobj", "pobj", "nsubj", "ROOT")

    candidates = set()
    for token in doc:
        is_content_verb = token.pos_ == "VERB" and token.lemma_ != "zijn"
        if token.dep_ in wanted_deps or is_content_verb:
            candidates.add(token.lemma_)

    for chunk in doc.noun_chunks:
        candidates.update((
            chunk.root.text,
            chunk.root.head.text,
            chunk.lemma_,
            chunk.text,
        ))

    print("+++ ", candidates)  # debug trace of the collected candidates
    return candidates

def get_property(candidates, entity_type):
    """Search Wikidata for each candidate term and collect the top hit's id.

    Args:
        candidates: Collection of search strings (the caller passes the set
            returned by ``find_candidates``).
        entity_type: Value sent as the ``type`` parameter of
            ``wbsearchentities``; the caller uses ``'property'`` and
            ``'item'``.

    Returns:
        A list with the ``id`` of the first hit for every candidate that
        produced one, in iteration order, or ``None`` when *candidates* is
        empty.
    """
    if not candidates:
        return None

    url = 'https://www.wikidata.org/w/api.php'
    headers = {'User-Agent': 'nikkivgurp'}
    base_params = {
        'action': 'wbsearchentities',
        'language': 'nl',
        'uselang': 'nl',
        'format': 'json',
        'type': entity_type,
    }

    properties = []
    for candidate in candidates:
        time.sleep(1)  # throttle so Wikidata does not treat us as a bot
        try:
            result = requests.get(
                url,
                params={**base_params, 'search': candidate},
                headers=headers,
            ).json()
            properties.append(result['search'][0]['id'])
        except (KeyError, IndexError, ValueError, requests.RequestException):
            # Best effort: a candidate without hits, an undecodable body or
            # a failed request is skipped. Unlike the former bare `except`,
            # this no longer swallows KeyboardInterrupt or programming errors.
            continue
    return properties


# Query Wikidata with SPARQL.
def get_wikidata(query):
    """Send *query* to the Wikidata SPARQL endpoint and return the parsed JSON.

    Args:
        query: SPARQL query text.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Whatever ``raise_for_status`` raises for an unsuccessful HTTP status.
    """
    endpoint = 'https://query.wikidata.org/sparql'
    request_headers = {'Accept': 'application/json', 'User-Agent': 'nikkivgurp'}
    request_params = {'query': query, 'format': 'json'}

    # Pause before each call so Wikidata does not treat us as a bot.
    time.sleep(1)
    response = requests.get(endpoint, params=request_params, headers=request_headers)
    response.raise_for_status()
    return response.json()

# get all possible properties when given an entity
def get_all_p(entity):
    """Return the set of direct property ids (``P<digits>``) used on *entity*.

    Args:
        entity: Wikidata entity id, inserted after ``wd:`` in the query.

    Returns:
        A set with the first ``P<digits>`` match found in each returned
        ``prop`` value; rows without a match are ignored.
    """
    sparql = ''.join((
        'SELECT ?prop WHERE { wd:',
        entity,
        ' ?prop ?val . FILTER STRSTARTS(STR(?prop), "http://www.wikidata.org/prop/direct/P")}',
    ))
    rows = get_wikidata(sparql)['results']['bindings']
    # Pull the property id out of each property URI.
    matches = (re.search(r'P\d+', row['prop']['value']) for row in rows)
    return {match.group() for match in matches if match}


# Question-answering function and its private helpers.
def _ask_wikidata(query):
    """Run a SPARQL ASK query and return True only when the answer is true.

    A failed request or a malformed response counts as "not confirmed", so
    one bad lookup does not abort the whole question.
    """
    try:
        return get_wikidata(query)['boolean'] is True
    except (KeyError, TypeError, ValueError, requests.RequestException):
        return False


def _answer_yes_no(entity, candidate_ids):
    """Return ``['Ja']`` when any ASK query about *entity* succeeds, else ``['Nee']``."""
    # Simple form: does the entity have any value for the candidate property?
    for candidate in candidate_ids:
        if _ask_wikidata(f'ASK WHERE {{ wd:{entity} wdt:{candidate} ?val. }}'):
            return ['Ja']

    # Harder form: is the candidate the value of any property of the entity?
    entity_props = get_all_p(entity)
    for candidate in candidate_ids:
        for prop in entity_props:
            if _ask_wikidata(f'ASK WHERE {{ wd:{entity} wdt:{prop} wd:{candidate} . }}'):
                return ['Ja']
    return ['Nee']


def _answer_open(entity, properties):
    """Collect the Dutch labels of every value of *properties* on *entity*."""
    answers = []
    for prop in properties:
        query = '''SELECT ?answer ?answerLabel WHERE {
                       wd:''' + entity + ''' wdt:''' + prop + ''' ?answer .
                       SERVICE wikibase:label {bd:serviceParam wikibase:language "nl" .
                       }
                      }'''
        try:
            wikidata = get_wikidata(query)
        except (ValueError, requests.RequestException):
            # Same best-effort policy as the yes/no branch: skip a property
            # whose request fails instead of aborting the whole question.
            continue
        for row in wikidata.get('results', {}).get('bindings', []):
            if 'answerLabel' in row:
                answers.append(row['answerLabel']['value'])
    return answers


def QA(question):
    """Answer a Dutch question from Wikidata and print the result.

    The question is parsed with the module-level ``nlp`` pipeline. An entity
    id and candidate property ids are looked up on Wikidata. A question whose
    first token is tagged VERB or AUX is treated as a yes/no question and
    answered with 'Ja' or 'Nee'; any other question is answered with the
    label of the last value found.

    Args:
        question: The question text.

    Returns:
        None. The question and the answer (or 'Niks gevonden') are printed,
        separated by a tab.
    """
    sentence = nlp(question)
    # Extract the candidates once and reuse them for both lookups.
    candidates = find_candidates(question)
    # get_property returns None for an empty candidate set; normalise to a
    # list so the concatenation below cannot fail.
    properties = get_property(candidates, 'property') or []
    entity = get_entity(sentence)

    # A question that starts with a verb is a yes/no question. The length
    # check avoids indexing into an empty parse.
    is_yes_no = len(sentence) > 0 and sentence[0].pos_ in ('VERB', 'AUX')

    answers = []
    if is_yes_no:
        item_ids = get_property(candidates, 'item') or []
        candidate_ids = item_ids + properties
        if candidate_ids and entity:
            answers = _answer_yes_no(entity, candidate_ids)
    elif properties and entity:
        # Other questions for which both an entity and a property were found.
        answers = _answer_open(entity, properties)

    if answers:
        print(question, '\t', answers[-1])
    else:
        print(question, '\t', 'Niks gevonden')


# Test sets used for debugging. Each group of questions is collected in its
# own list; the loop at the bottom runs one of those lists through QA.

## General question set
q1 = 'Welke talen spreekt Arjen Lubach?'
q2 = 'Wanneer is Jan Smit geboren?'
q3 = 'Wanneer is Rembrandt van Rijn overleden?'
q4 = 'Hoe heet het kind van Michiel de Ruyter?'
question_list1 = [q1, q2, q3, q4]

## "Hoe" (how / how many) question set
q5 = 'Hoeveel volgers heeft Jan Smit?'
q6 = 'Hoe oud is Mark Rutte?'
q7 = 'Hoeveel onderscheidingen heeft Froukje?'
# NOTE(review): q14 is defined here but is not part of question_list2.
q14 = 'Hoeveel kinderen heeft Dick Schoof?'
question_list2 = [q5, q6, q7]

## List question set
q8 = 'Welke talen spreek Arjen Lubach'
# q9 and q10 are placeholders that still have to be filled in.
q9 = '?'
q10 = '?'
question_list3 = [q8, q9, q10]

## Yes/No question set
# q11-q13 are placeholders that still have to be filled in.
q11 = '?'
q12 = '?'
q13 = '?'
question_list4 = [q11, q12, q13]

# Interactive alternative to the fixed test sets (currently disabled):
# question = input("Stel een vraag over een BN'er\n")
# Note: answering a question takes a few seconds.

# NOTE(review): keep the loop variable named `question` unless
# find_candidates is confirmed to use its own argument rather than a global
# of that name.
for question in question_list2:
  QA(question)

+++  {'Jan', 'volger', 'hebben', 'volgers', 'heeft'}
Hoeveel volgers heeft Jan Smit? 	 Niks gevonden
+++  {'oud', 'Mark'}
Hoe oud is Mark Rutte? 	 Niks gevonden
+++  {'hebben', 'onderscheiding', 'onderscheidingen', 'Froukje', 'heeft'}
Hoeveel onderscheidingen heeft Froukje? 	 Niks gevonden
