# Language technology final project
##### Ilse Kerkhove, Marieke Schelhaas & Nikki van Gurp





In [4]:
!python3 -m spacy download nl_core_news_lg

Collecting nl-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_lg-3.8.0/nl_core_news_lg-3.8.0-py3-none-any.whl (568.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.1/568.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nl-core-news-lg
Successfully installed nl-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('nl_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy

# Load the Dutch spaCy pipeline once at notebook level; QA() calls nlp(question)
# to parse every incoming question.
nlp = spacy.load("nl_core_news_lg")

In [6]:
import requests
import re
import time
from datetime import datetime

In [8]:
# Hand-written mapping from words that can show up among the candidates built by
# find_candidates() to an extra search term; find_candidates() adds the mapped
# value to the candidate set whenever the key is present.
extra_words = {
    'wonen' : 'woonplaats',
    'werken' : 'beroep',
    'doen' : 'beroep', # for "Wat doet X" ("What does X do") questions
    'lid' : 'omvat deel',
    'leden' : 'omvat deel',
    'bestaan' : 'onderdelen',
    'lang' : 'hoogte',
    'groot' : 'hoogte',
    'hoog' : 'hoogte',
    'zwaar' : 'gewicht',
    'studeren' : 'opleiding',
    'volgers' : 'aantal volgers op sociale media',
    'heten' : 'naam in moedertaal',
    'oud' : 'geboortedatum',
    'leeftijd' : 'geboortedatum',
    'premier' : 'premier van Nederland',
    'minister-president' : 'premier van Nederland',
    'Feyenoord' : 'Feyenoord Rotterdam',
    'Liverpool' : 'Liverpool FC',
    'meedoen' : 'deelgenomen aan',
    'presenteerde' : 'relevant werk'
}

In [22]:
# Find the candidate entities in a question and their Wikidata ids.
def get_entity(sentence):
    """Look up a Wikidata id for every named entity found in ``sentence``.

    Args:
        sentence: Parsed question. Only its ``ents`` attribute is read, and
            each entity's ``text`` is used as the search string.

    Returns:
        A dict mapping each entity (as yielded by ``sentence.ents``) to the
        ``id`` of the first ``wbsearchentities`` hit, in the order the
        entities occur, or ``None`` when no entity produced a hit.
    """
    url = 'https://www.wikidata.org/w/api.php'
    headers = {'User-Agent': 'nikkivgurp'}
    params = {'action': 'wbsearchentities',
              'language': 'nl',
              'uselang': 'nl',
              'format': 'json'}
    entities = {}
    for ent in sentence.ents:
        params['search'] = ent.text
        time.sleep(1)  # throttle so Wikidata does not treat us as a bot
        try:
            # The User-Agent header was built but never sent before; pass it
            # along, and bound the wait so a stalled connection cannot hang.
            response = requests.get(url, params=params, headers=headers, timeout=30)
            hits = response.json()['search']
            if hits:
                entities[ent] = hits[0]['id']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Best effort: skip entities whose lookup failed or returned an
            # unexpected payload, but let KeyboardInterrupt etc. propagate.
            continue
    return entities or None


# Find candidate property search terms with spaCy.
def find_candidates(sentence):
    """Collect the words of a parsed question that may name a Wikidata property.

    Candidates come from four sources: lemmas of tokens with selected
    dependency labels or of verbs other than "zijn", several surface/lemma
    forms around every noun chunk, the ``extra_words`` mapping (plus two
    hard-coded "geboren" rules), and capitalised nouns / proper nouns.

    Args:
        sentence: Parsed question; it is iterated for tokens and its
            ``noun_chunks`` and ``text`` attributes are read.

    Returns:
        A set of candidate strings.
    """
    # "pobj" was misspelled as "popj", so that entry could never match.
    # NOTE(review): these look like English-style labels; the Dutch pipeline
    # may use a different label scheme (obj, obl, ...) -- confirm which of
    # these labels actually occur.
    relevant_deps = ("attr", "dobj", "pobj", "nsubj", "ROOT")

    candidates = set()
    for token in sentence:
        if token.dep_ in relevant_deps:
            candidates.add(token.lemma_)
        elif token.pos_ == "VERB" and token.lemma_ != "zijn":
            candidates.add(token.lemma_)

    for chunk in sentence.noun_chunks:
        candidates.update((
            chunk.root.text,
            chunk.root.lemma_,
            chunk.root.head.text,
            chunk.lemma_,
            chunk.text,
        ))

    extra_candidates = {extra_words[word] for word in candidates if word in extra_words}
    lowered = sentence.text.lower()  # computed once instead of per condition
    if 'geboren' in lowered:
        if 'waar' in lowered:
            extra_candidates.add('geboorteplaats')
        elif 'wanneer' in lowered:
            extra_candidates.add('geboortedatum')
    candidates.update(extra_candidates)

    # Original author's note: remove the loop below if simple questions stop
    # working. It adds capitalised nouns / proper nouns as candidates.
    for token in sentence:
        if token.text[:1].isupper() and token.pos_ in {"PROPN", "NOUN"}:
            candidates.add(token.text)

    return candidates

def get_property(candidates, entity_type):
    """Search Wikidata for the id that best matches each candidate string.

    Args:
        candidates: Iterable of search strings (may be empty or ``None``).
        entity_type: Value sent as the ``type`` parameter of
            ``wbsearchentities`` (the callers use 'property' and 'item').

    Returns:
        A list with the ``id`` of the first search hit of every candidate
        that had one, in iteration order of ``candidates``; ``None`` when
        ``candidates`` is empty or ``None``.
    """
    if not candidates:
        return None

    url = 'https://www.wikidata.org/w/api.php'
    # Send the same User-Agent as get_wikidata() so all requests identify us.
    headers = {'User-Agent': 'nikkivgurp'}
    params = {'action': 'wbsearchentities',
              'language': 'nl',
              'uselang': 'nl',
              'format': 'json',
              'type': entity_type}
    properties = []
    for candidate in candidates:
        params['search'] = candidate
        time.sleep(1)  # throttle so Wikidata does not treat us as a bot
        try:
            result = requests.get(url, params=params, headers=headers, timeout=30).json()
            properties.append(result['search'][0]['id'])
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # Best effort: a failed request or a candidate without hits is
            # skipped; unlike a bare except, KeyboardInterrupt still propagates.
            continue
    return properties


# Query Wikidata with SPARQL.
def get_wikidata(query, timeout=60):
    """Run a SPARQL query against the Wikidata query service.

    Args:
        query: SPARQL query text.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: When the service answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: When the request exceeds ``timeout``.
    """
    url = 'https://query.wikidata.org/sparql'
    headers = {'Accept': 'application/json', 'User-Agent': 'nikkivgurp'}
    time.sleep(1)  # throttle so Wikidata does not treat us as a bot
    results = requests.get(
        url,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=timeout,
    )
    results.raise_for_status()
    return results.json()


# Collect every direct property id that is set on an entity.
def get_all_p(entity):
    """Return the set of ``P<digits>`` ids used as direct properties of ``entity``.

    The SPARQL query selects every predicate of ``wd:<entity>`` whose IRI
    starts with the direct-property prefix; the id is then extracted from
    each returned IRI with a regular expression.
    """
    sparql = ''.join([
        'SELECT ?prop WHERE { wd:',
        entity,
        ' ?prop ?val . FILTER STRSTARTS(STR(?prop), "http://www.wikidata.org/prop/direct/P")}',
    ])
    rows = get_wikidata(sparql)['results']['bindings']
    id_matches = (re.search(r'P\d+', row['prop']['value']) for row in rows)
    return {found.group() for found in id_matches if found}


def answer_yes_no_question(extra_properties, entity, answers, question=None):
    """Answer a yes/no question by probing ``entity`` for any matching statement.

    For every id in ``extra_properties`` and every direct property of
    ``entity`` an ``ASK`` query checks whether
    ``wd:<entity> wdt:<property> wd:<id>`` holds. The first hit appends
    'Ja'; if nothing matched (or inputs are missing) 'Nee' is appended.

    Args:
        extra_properties: Wikidata ids to test as statement values.
        entity: Wikidata id of the subject, or a falsy value when unknown.
        answers: List that receives the answer; it is mutated and returned.
            Nothing is added when it already contains an answer.
        question: Text of the question. Property P735 is skipped unless the
            text contains 'naam'. When omitted, the module-level name
            ``question`` is used if it exists (this function used to read
            that global unconditionally and raised NameError without it).

    Returns:
        ``answers``.
    """
    if question is None:
        question = globals().get('question', '')
    if not isinstance(question, str):
        question = ''

    if extra_properties and entity and not answers:
        question_prop = get_all_p(entity)
        if 'naam' not in question:
            question_prop.discard('P735')
        for prop in extra_properties:
            for prop_q in question_prop:
                yes_no_query = 'ASK WHERE { wd:' + entity + ' wdt:' + prop_q + ' wd:' + prop + ' . }'
                try:
                    response = get_wikidata(yes_no_query)
                except (requests.RequestException, ValueError):
                    # Best effort: a failing probe just counts as "no match".
                    continue
                if response.get('boolean') is True:
                    answers.append('Ja')
                    break
            if answers:
                break  # stop probing once a match has been found

    if not answers:
        answers.append('Nee')
    return answers


def answer_age_question(entity):
    """Compute the current age of ``entity`` from its date of birth (P569).

    Args:
        entity: Wikidata id of the person.

    Returns:
        The age in whole years as of today, or ``None`` when Wikidata has no
        birth date for the entity or the value cannot be parsed as
        ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    age_query = 'SELECT ?birthDate WHERE { wd:' + entity + ' wdt:P569 ?birthDate . }'
    age_wikidata = get_wikidata(age_query)
    bindings = age_wikidata.get('results', {}).get('bindings', [])
    if not bindings:
        # Previously fell through to `return age` with `age` unbound.
        return None

    birth_date = bindings[0]['birthDate']['value']
    try:
        date = datetime.strptime(birth_date, '%Y-%m-%dT%H:%M:%SZ')
    except ValueError:
        return None

    today = datetime.today()
    age = today.year - date.year  # was hard-coded to 2025
    if (today.month, today.day) < (date.month, date.day):
        age -= 1  # birthday has not happened yet this year
    return age


def answer_start_end_question(entity, sentence, question_type, answers):
    """Find when ``entity`` started or stopped a relation with another entity.

    For each of a fixed list of properties (P108, P54, P69, P39) that
    ``entity`` has, and for every other entity recognised in ``sentence``,
    the statement linking the two is queried for its P580 qualifier (start
    questions) or P582 qualifier (end questions).

    Args:
        entity: Wikidata id of the subject.
        sentence: Parsed question, passed on to ``get_entity``.
        question_type: Collection of question-type tags; "start_question"
            takes precedence over "end_question".
        answers: List that receives the values found; mutated and returned.

    Returns:
        ``answers``. It is returned unchanged when neither tag is present
        or no other entity was recognised.
    """
    if "start_question" in question_type:
        qualifier = 'P580'
    elif "end_question" in question_type:
        qualifier = 'P582'
    else:
        # Previously the query variable stayed unbound on this path.
        return answers

    all_properties = get_all_p(entity)
    tijd_properties = ['P108', 'P54', 'P69', 'P39']
    # get_entity() returns None when nothing was recognised; the first
    # recognised entity is skipped (assumed to be the subject -- the caller
    # passes exactly that one as `entity`).
    found_entities = get_entity(sentence) or {}
    other_entities = list(found_entities.values())[1:]

    for tp in tijd_properties:
        if tp not in all_properties:
            continue
        for other_entity in other_entities:
            query = (
                'SELECT ?start WHERE { '
                'wd:' + entity + ' p:' + tp + ' ?statement . '
                '?statement ps:' + tp + ' wd:' + other_entity + ' ; '
                'pq:' + qualifier + ' ?start . }'
            )
            wikidata = get_wikidata(query)
            bindings = wikidata.get('results', {}).get('bindings', [])
            for result in bindings:
                if 'start' not in result:
                    continue
                value = result['start']['value']
                if ".well-known/genid/" in value:
                    # NOTE(review): this message is also emitted for start
                    # questions; confirm whether that wording is intended.
                    answers.append("Er is nog geen einddatum.")
                else:
                    answers.append(value)
    return answers


def answer_others(entity, prop, answers):
    """Append the Dutch labels of all values of ``prop`` on ``entity``.

    Runs one SPARQL query through ``get_wikidata`` and extends ``answers``
    in place with the ``answerLabel`` value of every returned row that has
    one. Returns the same ``answers`` list.
    """
    sparql = '''SELECT ?answer ?answerLabel WHERE {
          wd:''' + entity + ''' wdt:''' + prop + ''' ?answer .
          SERVICE wikibase:label {bd:serviceParam wikibase:language "nl" .
          }
         }'''
    rows = get_wikidata(sparql).get('results', {}).get('bindings', [])
    answers.extend(row['answerLabel']['value'] for row in rows if 'answerLabel' in row)
    return answers


def print_answers(answers, question, candidates, question_type):
    """Print ``question``, a tab and the answer in a format fitting the question.

    When the last answer is a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp it is
    rendered according to keywords in the question: as an age, a year, a
    ``DD - MM - YYYY`` date, a time of day, or unchanged. Otherwise the
    number of answers ("hoeveel_question"), all answers joined by commas
    ("list_question") or just the last answer is printed.

    Args:
        answers: List of answer strings.
        question: The original question text.
        candidates: Candidate terms of the question; the year format is
            suppressed when 'meedoen' is among them.
        question_type: Collection of question-type tags.
    """
    if not answers:
        # Previously raised IndexError on answers[-1].
        print(question, '\t', 'Niks gevonden')
        return

    date_pattern = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$"
    last = answers[-1]
    lowered = question.lower()

    if re.match(date_pattern, last):
        year, month, day = last[:4], last[5:7], last[8:10]
        # Whole words of the question. The old test looked for ' jaar'
        # (with a leading space) in a list produced by split(' '), which can
        # never contain a space, so the year branch was unreachable.
        words = re.findall(r"\w+", lowered)
        if 'leeftijd' in lowered or 'hoe oud' in lowered or 'jaar oud' in lowered:
            # Checked first so "... jaar oud ..." is not taken for a year
            # question now that the year branch works.
            now = datetime.now()
            age = now.year - int(year)
            # Compare (month, day) as a pair; testing month and day
            # separately miscounts e.g. a March 20 birthday on June 5.
            if (now.month, now.day) < (int(month), int(day)):
                age -= 1
            print(question, '\t', age)
        elif 'jaar' in words and 'meedoen' not in candidates:
            print(question, '\t', year)
        elif 'datum' in lowered or 'dag' in lowered or 'wanneer' in lowered:
            print(question, '\t', day, '-', month, '-', year)
        elif ' tijd' in lowered or 'hoe laat' in lowered:
            print(question, '\t', last[-9:-1])
        else:
            print(question, '\t', last)
    elif "hoeveel_question" in question_type:
        print(question, '\t', len(answers))
    elif "list_question" in question_type:
        print(question, '\t', ', '.join(answers))
    else:
        print(question, '\t', last)



# Question-answering entry point.
def QA(question):
    """Answer one Dutch question with Wikidata and print the result.

    The question is parsed with ``nlp``, candidate properties and entities
    are looked up, the question is tagged with zero or more types (yes/no,
    list, start, end, "hoeveel") and the matching strategy is run. Exactly
    one line ``<question> \\t <answer>`` is printed; the answer is
    'Niks gevonden' when nothing could be found.

    Args:
        question: The question text.

    Returns:
        None.
    """
    sentence = nlp(question)
    if len(sentence) == 0:
        # No tokens: sentence[0] below would raise IndexError.
        print(question, '\t', 'Niks gevonden')
        return

    # Build the candidates once (this used to run twice); the lookups
    # return None when they have nothing to report.
    candidates = find_candidates(sentence)
    properties = get_property(candidates, 'property') or []
    found_entities = get_entity(sentence)
    entity = next(iter(found_entities.values())) if found_entities else None

    lowered = question.lower()
    first_word = sentence[0].text.lower()
    question_type = set()

    # Yes/no question: the question starts with a verb.
    if sentence[0].pos_ in ('VERB', 'AUX'):
        question_type.add("yes_no_question")

    # List question: starts with "welke" / "wat zijn" or contains a noun
    # that looks plural. ("wat zijn" used to be compared with the first
    # token only, which can never match two words.)
    has_plural = any(
        token.pos_ in {'NOUN', 'PROPN'} and token.text.lower().endswith(('en', 's'))
        for token in sentence
    )
    if first_word == 'welke' or lowered.startswith('wat zijn') or has_plural:
        question_type.add("list_question")

    # Start question.
    if any(word in lowered for word in ['start', 'starten', 'startte', 'startten', 'gestart', 'begin', 'beginnen', 'begon', 'begonnen']):
        question_type.add("start_question")
    # End question.
    if any(word in lowered for word in ['eindigen', 'eindigt', 'eindigde', 'geeindigd', 'stopt', 'stoppen', 'stopte', 'gestopt']):
        question_type.add("end_question")
    # "Hoeveel" (how many) question, unless it asks for a stored number.
    if first_word == 'hoeveel' and (
            len(sentence) < 2 or sentence[1].text.lower() not in ['kinderen', 'volgers']):
        question_type.add("hoeveel_question")

    answers = []

    if "yes_no_question" in question_type:
        extra_properties = (get_property(candidates, 'item') or []) + properties
        answers = answer_yes_no_question(extra_properties, entity, answers)
        # The yes/no answer used to be computed but never printed: the print
        # tested the misspelled tag "yes_or_no_question" inside the branch
        # for other question kinds.
        print(question, '\t', answers[-1])
        return

    # All other questions need both a property and an entity.
    if properties and entity:
        if "age_question" in question_type:
            # Nothing in this function adds "age_question"; kept as a hook.
            age = answer_age_question(entity)
            if age is not None:
                answers.append(str(age))
        elif "start_question" in question_type or "end_question" in question_type:
            answers = answer_start_end_question(entity, sentence, question_type, answers)
        else:
            for prop in properties:
                try:
                    answers = answer_others(entity, prop, answers)
                except requests.RequestException:
                    # One failing property lookup should not abort the question.
                    continue

    if answers:
        print_answers(answers, question, candidates, question_type)
    else:
        print(question, '\t', 'Niks gevonden')



# Debugging test sets
# ## General question set
# The first question used to be assigned to q1 as well and was therefore
# silently overwritten by the next line; it now has its own name.
q0 = 'Wat zijn de adellijke titels van Wilhelmina der Nederlanden?'
q1 = 'Wat zijn de beroepen van Monica Geuze?'
q2 = 'In welk jaar deed S10 mee aan het Eurovisie Songfestival?'
q3 = 'Is Aletta Jacobs geboren in Sappemeer?'
q4 = 'Waar is Joost Klein geboren?'
q5 = 'Wat was het hoofdvak van de master van Rik van der Westerlaken?'
q6 = 'Hoeveel seizoenen heeft Klaas van Kruistum gepresenteerd van televisieserie Checkpoint?'
q7 = 'Wanneer is Michiel de Ruyter geboren?'
# Surname spelling corrected ("Crujiff" -> "Cruijff") so the entity search can match.
q8 = 'Voor welke club heeft Johan Cruijff het laatst gespeeld?'
q9 = 'Hoeveel leden heeft K3 gehad?'

question_list1 = [q9, q4, q1]


# question = input("Stel een vraag over een BN'er\n")
# Answering takes a few seconds per question (every Wikidata call sleeps 1 s).

# Keep the loop variable named `question`: it becomes a module-level name
# that answer_yes_no_question() can read.
for question in question_list1:
  QA(question)

Hoeveel leden heeft K3 gehad? 	 Niks gevonden
Waar is Joost Klein geboren? 	 1997-11-10T00:00:00Z
Wat zijn de beroepen van Monica Geuze? 	 diskjockey, filmregisseur, vlogger, model, presentator
