# Neural search for question answering


### Windows installation is tough

In WSL:

```
# Refresh the package lists, install software-properties-common and upgrade the system
sudo apt update
sudo apt install -y software-properties-common
sudo apt upgrade -y
# Add the deadsnakes PPA, then install Python 3.10 with its venv and distutils packages
sudo add-apt-repository ppa:deadsnakes/ppa
sudo apt update
sudo apt install -y python3.10 python3.10-venv python3.10-distutils
python3.10 --version
# Register Python 3.10 as an alternative for /usr/bin/python3 and pick it interactively
sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
sudo update-alternatives --config python3
# Create and activate the virtual environment, then install Jupyter and Haystack into it
python3 -m venv haystack_env
source haystack_env/bin/activate
pip install --upgrade pip
pip install jupyter
pip install farm-haystack[all]

# Extra system packages (presumably needed by some Haystack extras - confirm)
sudo apt install -y libpq-dev libsndfile1 ffmpeg

# Start Jupyter without opening a browser, listening on all interfaces
jupyter notebook --no-browser --ip=0.0.0.0

```

Connect to `http://127.0.0.1:8888/?token=your_token_here`

The token is displayed in the WSL terminal window.

---

Installing and setting up Haystack on Windows took most of the time spent on this lab.

---

### Configure document store

In [1]:
from haystack.document_stores import FAISSDocumentStore

# Create a new FAISS-backed document store that will hold the passage embeddings.
document_store = FAISSDocumentStore(
    similarity="cosine",  # the e5 models were trained with a cosine similarity function
    embedding_dim=768,  # presumably the output size of the embedding model used later - TODO confirm
)

# Alternative: reload an index that was saved earlier instead of creating a new one.
# document_store = FAISSDocumentStore(
#     faiss_index_path="faiss_index.faiss" ,
# )

# Persist the index to disk.
# NOTE(review): nothing has been written to the store at this point, so this
# saves an empty index; it likely needs to be saved again once the documents
# and embeddings are in place - confirm.
document_store.save("faiss_index.faiss")

---

### Load fiqa and store the documents

In [2]:
from haystack import Document
from datasets import load_dataset

# Load the FiQA-PL passages ("corpus") and questions ("queries") via `datasets`.
fiqa_corpus = load_dataset("clarin-knext/fiqa-pl", "corpus")["corpus"]
fiqa_queries = load_dataset("clarin-knext/fiqa-pl", "queries")["queries"]

# Wrap every passage in a Document, reusing the corpus `_id` as the document id
# so retrieved hits can be matched back to corpus entries.
documents = [
    Document(content=entry["text"], id=entry["_id"]) for entry in fiqa_corpus
]

document_store.write_documents(documents)


Writing Documents: 60000it [00:36, 1662.71it/s]

In [3]:
# Relevance judgements (qrels) for FiQA-PL; only the "test" split is kept.
fiqa_qa = load_dataset("clarin-knext/fiqa-pl-qrels")["test"]

### Update embeddings

In [4]:
from haystack.nodes import EmbeddingRetriever

# Dense retriever that embeds texts with the multilingual E5 base model and
# searches `document_store`.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " /
# "passage: "; no such prefix is added in the visible part of this notebook -
# confirm whether that is intentional.
e5 = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="intfloat/multilingual-e5-base",
    model_format="transformers",  # Make sure we specify the transformers model format
    pooling_strategy="reduce_mean",  # This is the pooling method used to train the e5 models
    top_k=20,  # default number of hits; ndcg_at_k passes its own top_k on every call
    max_seq_len=512,
)

In [5]:
%%capture

from time import time

start = time()
document_store.update_embeddings(e5)
end = time()

In [6]:
# Report how long the embedding pass took, in seconds rounded to two decimals.
print(f"embedding update time: {round(end-start,2)}s")

embedding update time: 464.18s


---

### Prepare data for NDCG@k computation

In [7]:
# Look-up table: integer query id -> query text.
query_map = dict(zip(map(int, fiqa_queries["_id"]), fiqa_queries["text"]))

# Look-up table: query id -> list of corpus ids judged relevant (qrels order).
query_corpus_map = {}
for query_id, corpus_id in zip(fiqa_qa["query-id"], fiqa_qa["corpus-id"]):
    # setdefault creates the list on first sight of a query id, then appends.
    query_corpus_map.setdefault(query_id, []).append(corpus_id)

---

### NDCG@k

In [39]:
import numpy as np


def ndcg_at_k(k, *, retriever=None, queries=None, relevant=None):
    """Return the mean NDCG@k of a retriever over all judged queries.

    Relevance is binary: a retrieved document counts as 1 when its id is
    listed as relevant for the query and as 0 otherwise.

    Args:
        k: Rank cut-off; must be at least 1.
        retriever: Object exposing ``retrieve(query=..., top_k=...)`` that
            returns ranked hits whose ``id`` is convertible to ``int``.
            Defaults to the module-level ``e5``.
        queries: Mapping of query id to query text. Defaults to the
            module-level ``query_map``.
        relevant: Mapping of query id to the corpus ids judged relevant.
            Defaults to the module-level ``query_corpus_map``.

    Returns:
        The mean of the per-query NDCG@k values, or NaN when there is no
        query with at least one relevant document.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Globals are only looked up when the caller did not supply a value.
    retriever = e5 if retriever is None else retriever
    queries = query_map if queries is None else queries
    relevant = query_corpus_map if relevant is None else relevant

    # discounts[i] is the weight of rank i + 1; ideal_dcg[n - 1] is the DCG of
    # a ranking that places n relevant documents at the top.
    discounts = 1.0 / np.log2(np.arange(2, 2 + k))
    ideal_dcg = np.cumsum(discounts)

    scores = []
    for query_id, corpus_ids in relevant.items():
        relevant_ids = set(corpus_ids)  # O(1) membership, duplicates ignored
        if not relevant_ids:
            # No ideal ranking exists, so NDCG is undefined for this query.
            continue

        # The retriever may return fewer than k hits; never use more than k.
        hits = retriever.retrieve(query=queries[query_id], top_k=k)[:k]
        gains = np.array(
            [int(hit.id) in relevant_ids for hit in hits], dtype=float
        )

        dcg = float(gains @ discounts[: len(gains)])
        idcg = ideal_dcg[min(k, len(relevant_ids)) - 1]
        scores.append(dcg / idcg)

    if not scores:
        return float("nan")
    return np.mean(scores)

In [40]:
%%capture

# Score the retriever at both cut-offs; %%capture suppresses the output
# produced while these calls run.
ndcg_at_5 = ndcg_at_k(5)
ndcg_at_10 = ndcg_at_k(10)

In [42]:
ndcg_at_5

0.2139199747330796

In [43]:
ndcg_at_10

0.23300445613152695

With full-text search (FTS) we got an NDCG@5 of around 0.18 and an NDCG@10 of around 0.15.

Using reranking we got NDCG@10 equal to around 0.05

With neural search we substantially improved the NDCG@k score compared to FTS, reaching just over 0.21 for NDCG@5 and 0.23 for NDCG@10.

---