# Full Text Search with Elasticsearch

Under windows, download and unzip the elasticsearch-8.15.2 in this directory.

In `elasticsearch-8.15.2/config/elasticsearch.yml` change the contents of security settings :

```
# Enable security features
xpack.security.enabled: false

xpack.security.enrollment.enabled: false
```

Run the following command:

```
elasticsearch-8.15.2\bin\elasticsearch.bat
```

---

### import needed libraries

In [1]:
from datasets import load_dataset
import json
import requests

### Create urls and headers for requests

In [2]:
headers = {"Content-Type": "application/json"}
elastic_url = "http://localhost:9200/pol"
bulk_url = "http://localhost:9200/pol/_bulk"
search_url = "http://localhost:9200/pol/_search?pretty"

### Clean up the elasticsearch index

In [3]:
!curl -X DELETE "localhost:9200/pol"

{"acknowledged":true}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100    21  100    21    0     0    346      0 --:--:-- --:--:-- --:--:--   350


### Create analyzer

In [None]:
synonym_list = [
    "kwiecień, kwi, IV",
    "styczeń, sty, I",
    "luty, lut, II",
    "marzec, mar, III",
    "maj, V",
    "czerwiec, cze, VI",
    "lipiec, lip, VII",
    "sierpień, sie, VIII",
    "wrzesień, wrz, IX",
    "październik, paź, X",
    "listopad, lis, XI",
    "grudzień, gru, XII",
]

In [4]:
analyzer_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "polish_synonym": {
                    "type": "synonym",
                    "synonyms": synonym_list,
                }
            },
            "analyzer": {
                "polish_with_synonyms": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "polish_synonym",
                        "morfologik_stem",
                        "lowercase",
                    ],
                },
                "polish": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "morfologik_stem", "lowercase"],
                },
            },
        }
    },
    "mappings": {
        "properties": {
            "text_synonyms": {
                "type": "text",
                "analyzer": "polish_with_synonyms",
                "fields": {"keyword": {"type": "keyword"}},
            },
            "text": {
                "type": "text",
                "analyzer": "polish",
                "fields": {"keyword": {"type": "keyword"}},
            },
        }
    },
}

analyzer_settings = json.dumps(analyzer_settings)

# send the analyzer settings and mappings to elasticsearch
response = requests.put(
    elastic_url,
    headers=headers,
    data=analyzer_settings,
)

response.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pol'}

### Load the corpus into elasticsearch

We use fiqa-pl dataset

In [5]:
fiqa = load_dataset("clarin-knext/fiqa-pl", "corpus")

fiqa_corpus = fiqa["corpus"]
fiqa_queries = fiqa["queries"]

# create data index
data = []
for _id, text in zip(fiqa_corpus["_id"], fiqa_corpus["text"]):
    id_head = json.dumps(
        {"index": {"_index": "pol", "_id": str(_id)}}, ensure_ascii=False
    )
    # to create double index for synonyms and no synonyms we need to input the text data twice
    # this is not optimal
    content = json.dumps({"text_synonyms": text, "text": text}, ensure_ascii=False)
    data.append(id_head)
    data.append(content)

# join the bul data
bulk_data = "\n".join([item for item in data]) + "\n"

response = requests.post(
    bulk_url, headers=headers, data=bulk_data, verify=False, auth=("xxx", "xxx")
)

print(response.status_code)
print(response.json()["errors"])

200
False


### Create function to retrieve the number of files and occurrences of given word under certain index field

In [6]:
def retrieve_counts(field, word):
    # get number of documents that contain the searched word
    query_dict = {"query": {"match": {field: word}}}
    query = json.dumps(query_dict)
    response = requests.get(search_url, headers=headers, data=query)
    n_documents = response.json()["hits"]["total"]["value"]

    # retrieve the document ids
    query_dict = {
        "size": n_documents,
        "query": {"match": {field: word}},
        "_source": False,
    }
    query = json.dumps(query_dict)
    response = requests.get(search_url, headers=headers, data=query)
    doc_ids = [idx["_id"] for idx in response.json()["hits"]["hits"]]

    # count occurrences in all documents
    word_count = 0
    for idx in doc_ids:
        termvectors_url = f"http://localhost:9200/pol/_termvectors/{idx}?pretty"

        query_dict = {
            "fields": [field],
            "term_statistics": False,
            "field_statistics": False,
            "positions": False,
            "offsets": False,
        }

        query = json.dumps(query_dict)
        response = requests.get(termvectors_url, headers=headers, data=query)
        data = response.json()["term_vectors"][field]["terms"][word]
        word_count += data["term_freq"]

    return len(doc_ids), word_count

### Get number of documents and term occurrences for kwiecień without synonyms

In [7]:
count_docs, count_words = retrieve_counts("text", "kwiecień")

print(f"number of documents without synonyms   : {count_docs}")
print(f"number of occurrences without synonyms : {count_words}")

number of documents without synonyms   : 257
number of occurrences without synonyms : 353


### Get number of documents and term occurrences for kwiecień with synonyms

In [8]:
count_docs, count_words = retrieve_counts("text_synonyms", "kwiecień")

print(f"number of documents with synonyms   : {count_docs}")
print(f"number of occurrences with synonyms : {count_words}")

number of documents with synonyms   : 306
number of occurrences with synonyms : 439


### Dowload fiqa-pl-grels

In [27]:
fiqa_qa = load_dataset("clarin-knext/fiqa-pl-qrels")

In [28]:
query_ids = fiqa_qa["test"]["query-id"]
corpus_ids = fiqa_qa["test"]["corpus-id"]