# Question Answering Challenge

### Using a Wikipedia dump for offline retrieval

As specified in https://en.wikipedia.org/wiki/Wikipedia:Database_download, we download the Polish Wikipedia dump from https://dumps.wikimedia.org/.

In [None]:
import requests
from tqdm import tqdm

url = 'https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles.xml.bz2'

response = requests.get(url, stream=True)
if response.status_code == 200:
    file_name = 'plwiki-latest-pages-articles.xml.bz2'
    total_size = int(response.headers.get('content-length', 0))
    chunk_size = 1024  # 1 KB
    with open(file_name, 'wb') as file, tqdm(
        desc=file_name,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                file.write(chunk)
                bar.update(len(chunk))
    print(f'Wikipedia dump downloaded successfully: {file_name}')
else:
    print(f"Failed to download the Wikipedia dump. HTTP Status Code: {response.status_code}")


plwiki-latest-pages-articles.xml.bz2:   1%|          | 19.7M/2.35G [00:22<4:24:50, 158kB/s]

### Extracting data from the Wikipedia dump using WikiExtractor

Run the following command to extract articles from the dump into an `extracted/` folder:
```bash
wikiextractor --json plwiki-latest-pages-articles.xml.bz2 -o extracted
```

### Processing the extracted data and indexing it into Elasticsearch

In [4]:
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import os
import json

load_dotenv("elastic-start-local/.env")

es = Elasticsearch("http://localhost:9200", api_key=os.getenv("ES_LOCAL_API_KEY"))

INDEX_NAME = 'wiki_index'
INDEX_BODY = {
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "content": {"type": "text"}
        }
    }
}
    
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=INDEX_BODY)
    
def index_articles(dump_dir, es, index_name):
    for root, _, files in os.walk(dump_dir):
        for file in files:
            print(f'Indexing file: {file}')
            if file.startswith('wiki_'):
                file_path = os.path.join(root, file)
                
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        article = json.loads(line)
                        title = article.get('title', '')
                        content = article.get('text', '')
                        
                        es.index(index=index_name, body={
                            "title": title,
                            "content": content
                        })

index_articles('wiki-dump', es, INDEX_NAME)

### Searching and answer extraction

In [12]:
from transformers import pipeline

model = pipeline("question-answering", model="sdadas/polish-gpt2-large")

def retrieve_context(question, es, index_name, num_results=3):
    query = {
        "query": {
            "multi_match": {
                "query": question,
                "fields": ["title", "content"],
                "operator": "and"
            }
        },
        "size": num_results
    }
    response = es.search(index=index_name, body=query)
    
    context = " ".join(hit['_source']['content'] for hit in response['hits']['hits'])
    return context

def generate_answer(question, context):
    result = model(question=question, context=context)
    return result["answer"]

def quiz_answer_system(question, es, index_name):
    context = retrieve_context(question, es, index_name)
    return generate_answer(question, context) if context else "No answer found"

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at sdadas/polish-gpt2-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [14]:
question = "Jak nazywa się bohaterka gier komputerowych z serii Tomb Raider?"
answer = quiz_answer_system(question, es, INDEX_NAME)
print(f"Q: {question}\nA: {answer}")

Q: Jak nazywa się bohaterka gier komputerowych z serii Tomb Raider?
A:  w poprzednich częściach, a zamiast tego skupić się na realistycznych proporcjach


### Evaluation

In [None]:
from difflib import SequenceMatcher
import re

def levenshtein_distance(s1, s2):
    return SequenceMatcher(None, s1, s2).ratio()

def is_textual_match(pred, gold, threshold=0.5):
    return levenshtein_distance(pred.lower(), gold.lower()) >= threshold

def is_numerical_match(pred, gold):
    pred_num = re.search(r"\d+", pred)
    gold_num = re.search(r"\d+", gold)
    if pred_num and gold_num:
        return pred_num.group() == gold_num.group()
    return False

def evaluate(in_file, expected_file):
    with open(in_file, 'r', encoding='utf-8') as file_in, open(expected_file, 'r', encoding='utf-8') as file_expected:
        questions = [line.strip() for line in file_in]
        gold_answers = [line.strip() for line in file_expected]
    
    total = len(questions)
    correct = 0

    for index, (question, gold) in enumerate(zip(questions, gold_answers)):
        if index >= 10:
            # My machine cannot handle more
            break
        pred = quiz_answer_system(question, es, INDEX_NAME)
        print(f"Q: {question}\nA: {pred}\nExpected: {gold}\n\n")
        if is_numerical_match(pred, gold) or is_textual_match(pred, gold):
            correct += 1

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.2%}")

DEV_0_IN = "data/dev-0/in.tsv"
DEV_0_EXPECTED = "data/dev-0/expected.tsv"
evaluate(DEV_0_IN, DEV_0_EXPECTED)

Q: Jak nazywa się pierwsza litera alfabetu greckiego?
A:  wielu alfabetom
Expected: alfa


Q: Jak nazywa się dowolny odcinek łączący dwa punkty okręgu?
A: No answer found
Expected: cięciwa


Q: W którym państwie rozpoczyna się akcja powieści „W pustyni i w puszczy”?
A: No answer found
Expected: w Egipcie


Q: Czy w państwach starożytnych powoływani byli posłowie i poselstwa?
A: No answer found
Expected: tak


Q: W jakim zespole występowała Hanka w filmie „Żona dla Australijczyka”?
A: No answer found
Expected: Mazowsze


Q: W którym państwie leży Bombaj?
A: No answer found
Expected: w Indiach


Q: Który numer boczny nosi czołg Rudy z „Czterech pancernych”?
A: No answer found
Expected: 102


Q: Co budował w Egipcie inżynier Tarkowski, ojciec Stasia?
A: No answer found
Expected: Kanał Sueski


Q: Czy owoce niektórych kaktusów są jadalne?
A: No answer found
Expected: tak


Q: Kwartet – to ilu wykonawców?
A: No answer found
Expected: czterech	czworo	4


Accuracy: 0.00%


0.0