# Question Answering Challenge

### Using a Wikipedia dump for offline retrieval

As specified in https://en.wikipedia.org/wiki/Wikipedia:Database_download, we download the Polish Wikipedia dump from https://dumps.wikimedia.org/.

In [None]:
import requests
from tqdm import tqdm

url = 'https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles.xml.bz2'

response = requests.get(url, stream=True)
if response.status_code == 200:
    file_name = 'plwiki-latest-pages-articles.xml.bz2'
    total_size = int(response.headers.get('content-length', 0))
    chunk_size = 1024  # 1 KB
    with open(file_name, 'wb') as file, tqdm(
        desc=file_name,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                file.write(chunk)
                bar.update(len(chunk))
    print(f'Wikipedia dump downloaded successfully: {file_name}')
else:
    print(f"Failed to download the Wikipedia dump. HTTP Status Code: {response.status_code}")


plwiki-latest-pages-articles.xml.bz2:   1%|          | 19.7M/2.35G [00:22<4:24:50, 158kB/s]

### Extracting data from the Wikipedia dump using WikiExtractor

Run the following command to extract articles from the dump into an `extracted/` folder:
```bash
wikiextractor --json plwiki-latest-pages-articles.xml.bz2 -o extracted
```

### Processing the extracted data and indexing it into Elasticsearch

In [1]:
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import os
import json

load_dotenv("elastic-start-local/.env")

es = Elasticsearch("http://localhost:9200", api_key=os.getenv("ES_LOCAL_API_KEY"))

INDEX_NAME = 'wiki_index'
INDEX_BODY = {
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "content": {"type": "text"}
        }
    }
}
    
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=INDEX_BODY)
    
def index_articles(dump_dir, es, index_name):
    for root, _, files in os.walk(dump_dir):
        for file in files:
            print(f'Indexing file: {file}')
            if file.startswith('wiki_'):
                file_path = os.path.join(root, file)
                
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        article = json.loads(line)
                        title = article.get('title', '')
                        content = article.get('text', '')
                        
                        es.index(index=index_name, body={
                            "title": title,
                            "content": content
                        })

index_articles('wiki-dump', es, INDEX_NAME)

### Searching and answer extraction

In [2]:
from transformers import pipeline

model = pipeline("question-answering", model="sdadas/polish-gpt2-large")

def retrieve_context(question, es, index_name, num_results=3):
    query = {
        "query": {
            "multi_match": {
                "query": question,
                "fields": ["title", "content"],
                "operator": "and"
            }
        },
        "size": num_results
    }
    response = es.search(index=index_name, body=query)
    
    context = " ".join(hit['_source']['content'] for hit in response['hits']['hits'])
    return context

def generate_answer(question, context):
    result = model(question=question, context=context)
    return result["answer"]

def quiz_answer_system(question, es, index_name):
    context = retrieve_context(question, es, index_name)
    return generate_answer(question, context)

question = "Jak nazywa się bohaterka gier komputerowych z serii Tomb Raider?"
answer = quiz_answer_system(question, es, INDEX_NAME)
print(f"Q: {question}\nA: {answer}")

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at sdadas/polish-gpt2-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Q: Jak nazywa się bohaterka gier komputerowych z serii Tomb Raider?
A: Playboy
