# Full Text Search
Mateusz Wojtulewicz

In [1]:
import json
import tqdm
import requests

from pathlib import Path

In [34]:
# helper function
def pprint(response: requests.Response | dict | list):
    """Pretty-print a JSON payload with 4-space indentation.

    A `requests.Response` is decoded with `.json()` first; any other value is
    serialized as given. Non-ASCII characters are printed verbatim instead of
    as escape sequences (`ensure_ascii=False`).
    """
    payload = response.json() if isinstance(response, requests.Response) else response
    rendered = json.dumps(payload, indent=4, ensure_ascii=False)
    print(rendered)

Connecting to Elasticsearch.

In [3]:
es_url = "http://localhost:9200"

In [4]:
pprint(requests.get(es_url))

{
    "name": "b40d2515d547",
    "cluster_name": "docker-cluster",
    "cluster_uuid": "U5o6JseNQMWv_ouTF8lKYA",
    "version": {
        "number": "8.4.3",
        "build_flavor": "default",
        "build_type": "docker",
        "build_hash": "42f05b9372a9a4a470db3b52817899b99a76ee73",
        "build_date": "2022-10-04T07:17:24.662462378Z",
        "build_snapshot": false,
        "lucene_version": "9.3.0",
        "minimum_wire_compatibility_version": "7.17.0",
        "minimum_index_compatibility_version": "7.0.0"
    },
    "tagline": "You Know, for Search"
}


## 3. Define an ES analyzer for Polish texts containing [...]

## 4. Define an ES index for storing the contents of the legislative acts.

I'm defining an `acts` index with property `content` of type `text` with custom analyzer `polish-law-analyzer`. Note that the mapping is set after the analyzer.

`polish-law-analyzer` uses the `standard` tokenizer followed by three token filters, applied in this order: a custom `synonym` filter, the `morfologik_stem` filter and the `lowercase` filter. The first one maps abbreviations to their full names, the second one is a Morfologik-based Polish lemmatizer, and the third one lowercases the resulting tokens.

In [5]:
# Abbreviation => full code name rules used by the custom synonym token filter.
law_synonyms = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny"
]

# Token filters are listed in the order given to Elasticsearch.
# NOTE(review): `lowercase` is listed last, so the synonym rules presumably only
# see tokens in their original case (e.g. "KPC" may not be expanded) — confirm
# that this ordering is intended.
law_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["synonym-filter", "morfologik_stem", "lowercase"]
}

index_settings = {
    "settings": {
        "analysis": {
            "analyzer": {"polish-law-analyzer": law_analyzer},
            "filter": {
                "synonym-filter": {"type": "synonym", "synonyms": law_synonyms}
            }
        }
    }
}

# Create the `acts` index together with its analysis settings.
response = requests.put(url=f"{es_url}/acts", json=index_settings)

pprint(response)

{
    "acknowledged": true,
    "shards_acknowledged": true,
    "index": "acts"
}


In [6]:
# Map `content` as a `text` field analyzed with the custom analyzer.
content_mapping = {
    "properties": {
        "content": {"type": "text", "analyzer": "polish-law-analyzer"}
    }
}

response = requests.put(url=f"{es_url}/acts/_mapping", json=content_mapping)

pprint(response)

{
    "acknowledged": true
}


Example of `polish-law-analyzer` in action:

In [7]:
# Run a sample phrase through the analyzer to inspect the tokens it produces.
analyze_request = {"text": "jestem kpc", "analyzer": "polish-law-analyzer"}
response = requests.get(url=f"{es_url}/acts/_analyze", json=analyze_request)

pprint(response)

{
    "tokens": [
        {
            "token": "być",
            "start_offset": 0,
            "end_offset": 6,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "kodeks",
            "start_offset": 7,
            "end_offset": 10,
            "type": "SYNONYM",
            "position": 1
        },
        {
            "token": "postępowanie",
            "start_offset": 7,
            "end_offset": 10,
            "type": "SYNONYM",
            "position": 2
        },
        {
            "token": "postępować",
            "start_offset": 7,
            "end_offset": 10,
            "type": "SYNONYM",
            "position": 2
        },
        {
            "token": "cywilny",
            "start_offset": 7,
            "end_offset": 10,
            "type": "SYNONYM",
            "position": 3
        }
    ]
}


## 5. Load the data to the ES index.

In [8]:
acts_dir = Path("../data/ustawy/")
# List the directory once so the progress total and the loop see the same
# entries; sorting makes the indexing order reproducible between runs.
act_paths = sorted(acts_dir.iterdir())
n_acts = len(act_paths)

index_name = "acts"

# A single session reuses the HTTP connection across all indexing requests.
with requests.Session() as session:
    for act in tqdm.tqdm(act_paths, desc="Indexing acts", total=n_acts):
        # The file name without its extension is used as the document ID.
        act_id = act.stem
        index_response = session.post(
            url=f"{es_url}/{index_name}/_doc/{act_id}",
            json={
                "content": act.read_text(encoding="utf8")
            }
        )
        # Fail loudly instead of silently skipping an act that was rejected.
        index_response.raise_for_status()

Indexing acts: 100%|██████████| 1179/1179 [02:11<00:00,  8.95it/s]


Ensuring that each act was loaded.

In [9]:
# Look every act up by its ID and print the IDs that the index does not contain.
for act in tqdm.tqdm(acts_dir.iterdir(), desc="Checking acts", total=n_acts):
    doc_url = f"{es_url}/acts/_doc/{act.stem}"
    found = requests.get(url=doc_url).json()["found"]
    if found:
        continue
    print(act.stem)

Checking acts: 100%|██████████| 1179/1179 [00:38<00:00, 30.54it/s]


Example document:

In [10]:
# Fetch one indexed act by its ID to show the shape of a stored document.
example_doc_url = f"{es_url}/acts/_doc/1993_645"
response = requests.get(url=example_doc_url)

pprint(response)

{
    "_index": "acts",
    "_id": "1993_645",
    "_version": 1,
    "_seq_no": 2,
    "_primary_term": 1,
    "found": true,
    "_source": {
        "content": "\n\n\n\nDz.U. z 1993 r. Nr 134, poz. 645\n                                Ustawa \n                          z dnia 3 grudnia 1993 r.\n         o zmianie ustawy o kombatantach oraz niektórych osobach\n        będących ofiarami represji wojennych i okresu powojennego.\n                                Art. 1.\nW ustawie z dnia 24 stycznia 1991 r. o kombatantach oraz niektórych osobach\nbędących ofiarami represji wojennych i okresu powojennego (Dz.U. Nr 17, poz. 75 i\nNr 104, poz. 450, z 1992 r. Nr 21, poz. 85 oraz z 1993 r. Nr 29, poz. 133 i Nr 129,\npoz. 602) w art. 27 skreśla się wyrazy \"jednak nie dłużej niż do dnia 31 grudnia\n1993 r.\"\n                                Art. 2.\nUstawa wchodzi w życie z dniem ogłoszenia. \n"
    }
}


## 6. Determine the number of legislative acts containing the word **ustawa** (in any form).

I've used a basic `match` search with highlights and response filtering.

In [11]:
# `match` query for "ustawa" on `content`; `filter_path` trims the response to
# the total hit count and the highlighted fragments of the 2 returned hits.
ustawa_search = {
    "query": {"match": {"content": {"query": "ustawa"}}},
    "highlight": {"fields": {"content": {"fragment_size": 1}}},
    "size": 2
}
response = requests.get(
    url=f"{es_url}/acts/_search?filter_path=hits.total.value,hits.hits.highlight.content",
    json=ustawa_search
)

pprint(response)

{
    "hits": {
        "total": {
            "value": 1178
        },
        "hits": [
            {
                "highlight": {
                    "content": [
                        "<em>ustawy</em>",
                        " \n<em>ustawa</em>",
                        "<em>ustawy</em>",
                        "<em>ustaw</em>",
                        "<em>ustawie</em>"
                    ]
                }
            },
            {
                "highlight": {
                    "content": [
                        "<em>ustawy</em>",
                        " \n<em>USTAWA</em>",
                        "<em>ustawy</em>",
                        "<em>ustaw</em>",
                        "<em>ustawie</em>"
                    ]
                }
            }
        ]
    }
}


Answer:

In [12]:
# Total hit count taken from the search response currently held in `response`.
response.json()["hits"]["total"]["value"]

1178

## 7. Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms.

Using `termvectors` I've found the `total term frequency` (`ttf`) of the term **ustawa** across all indexed documents. All its inflectional forms are counted because **ustawa** is the base form.

In [29]:

# Request the term vector of act 1993_645 for `content` with `term_statistics`
# enabled, then show the entry stored under the term "ustawa".
termvectors_request = {"fields": ["content"], "term_statistics": True}
response = requests.get(
    url=f"{es_url}/acts/_termvectors/1993_645",
    json=termvectors_request
)

ustawa_stats = response.json()["term_vectors"]["content"]["terms"]["ustawa"]
pprint(ustawa_stats)

{
    "doc_freq": 1178,
    "ttf": 24934,
    "term_freq": 4,
    "tokens": [
        {
            "position": 8,
            "start_offset": 69,
            "end_offset": 75
        },
        {
            "position": 17,
            "start_offset": 147,
            "end_offset": 153
        },
        {
            "position": 33,
            "start_offset": 302,
            "end_offset": 309
        },
        {
            "position": 100,
            "start_offset": 672,
            "end_offset": 678
        }
    ]
}


Answer:

In [30]:
# `ttf` value of the term "ustawa" from the term-vectors response in `response`.
response.json()["term_vectors"]["content"]["terms"]["ustawa"]["ttf"]

24934

## 8. Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms.

First, I've checked which base forms (lemmas) the analyzer produces for the word **ustaw**. Then I've looked up the `total term frequency` of each of them and added them up.

In [15]:
# Analyze the single word "ustaw" to list the tokens the analyzer emits for it.
ustaw_analysis_request = {"text": "ustaw", "analyzer": "polish-law-analyzer"}
response = requests.get(url=f"{es_url}/acts/_analyze", json=ustaw_analysis_request)

pprint(response)

{
    "tokens": [
        {
            "token": "ustawa",
            "start_offset": 0,
            "end_offset": 5,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "ustawić",
            "start_offset": 0,
            "end_offset": 5,
            "type": "<ALPHANUM>",
            "position": 0
        }
    ]
}


In [16]:
# Term vector of act 1993_599 with term statistics; the cell displays the
# `ttf` stored under the term "ustawić".
response = requests.get(
    url=f"{es_url}/acts/_termvectors/1993_599",
    json={"fields": ["content"], "term_statistics": True}
)

terms_1993_599 = response.json()["term_vectors"]["content"]["terms"]
terms_1993_599["ustawić"]["ttf"]

913

In [17]:
# `ttf` value of the term "ustawa" read from the same term-vectors response.
response.json()["term_vectors"]["content"]["terms"]["ustawa"]["ttf"]

24934

Answer: 

In [18]:
# Decode the term-vectors response once and add up the `ttf` of both lemmas.
term_stats = response.json()["term_vectors"]["content"]["terms"]
term_stats["ustawić"]["ttf"] + term_stats["ustawa"]["ttf"]

25847

## 9. Determine the number of legislative acts containing the words **kodeks postępowania cywilnego** in the specified order, but in any inflection form.

I've used `match_phrase` query to match the words in the specified order.

In [19]:
# `match_phrase` query for the three-word phrase; the response is trimmed to the
# total hit count and up to 2 highlighted fragments for each of the 2 hits.
kpc_phrase_search = {
    "query": {
        "match_phrase": {"content": {"query": "kodeks postępowania cywilnego"}}
    },
    "highlight": {"fields": {"content": {"number_of_fragments": 2}}},
    "size": 2
}
response = requests.get(
    url=f"{es_url}/acts/_search?filter_path=hits.total.value,hits.hits.highlight.content",
    json=kpc_phrase_search
)

pprint(response)

{
    "hits": {
        "total": {
            "value": 99
        },
        "hits": [
            {
                "highlight": {
                    "content": [
                        "– <em>Kodeks</em> <em>postępowania</em>\n<em>cywilnego</em> oraz niektórych innych ustaw[1])\nArt. 1. ",
                        "W\nustawie z dnia 17 listopada 1964 r. – <em>Kodeks</em> <em>postępowania</em> <em>cywilnego</em>\n(Dz. "
                    ]
                }
            },
            {
                "highlight": {
                    "content": [
                        "Przepisu art. 694{6} § 2 <em>Kodeksu</em> <em>postępowania</em> <em>cywilnego</em> nie\n           stosuje się.\n         4.",
                        "Przepisu art. 694{6} § 2 <em>Kodeksu</em> <em>postępowania</em> <em>cywilnego</em>\n             nie stosuje się.\n         2."
                    ]
                }
            }
        ]
    }
}


Answer:

In [20]:
# Total hit count taken from the phrase-search response held in `response`.
response.json()["hits"]["total"]["value"]

99

## 10. Determine the number of legislative acts containing the words **wchodzi w życie** (in any form) allowing for up to 2 additional words in the searched phrase.

I've used `match_phrase` with `"slop": 2` to allow for up to 2 additional words in the phrase.

In [37]:
# `match_phrase` query with `slop` set to 2; the response is trimmed to the
# total hit count and up to 5 highlighted fragments for each of the 2 hits.
in_force_search = {
    "query": {
        "match_phrase": {"content": {"query": "wchodzi w życie", "slop": 2}}
    },
    "highlight": {"fields": {"content": {"number_of_fragments": 5}}},
    "size": 2
}
response = requests.get(
    url=f"{es_url}/acts/_search?filter_path=hits.total.value,hits.hits.highlight.content",
    json=in_force_search
)

pprint(response)

{
    "hits": {
        "total": {
            "value": 1174
        },
        "hits": [
            {
                "highlight": {
                    "content": [
                        "Nr 91, poz.\n  578), zwana dalej \"ustawą o samorządzie powiatowym\", <em>wchodzi</em> <em>w</em> <em>życie</em> z dniem\n  1 stycznia",
                        "Akty powołania, o których mowa w art. 35 ust. 3 pkt 1 ustawy o samorządzie\n  powiatowym, <em>wchodzą</em> <em>w</em> <em>życie</em>",
                        "Nr 91, poz.\n  576), zwana dalej \"ustawą o samorządzie województwa\", <em>wchodzi</em> <em>w</em> <em>życie</em> z dniem\n  1 stycznia",
                        "Nr 91, poz. 577) <em>wchodzi</em> <em>w</em> <em>życie</em> z dniem 1 stycznia 1999 r.",
                        "Ilekroć w przepisach ustawy o samorządzie powiatowym i o samorządzie\nwojewództwa, <em>wchodzących</em> <em>w</em> <em>życie</em>"
                    ]
                }
            },
            {
        

Answer:

In [22]:
# Total hit count taken from the search response currently held in `response`.
response.json()["hits"]["total"]["value"]

1174

## 11. Determine the 10 documents that are the most relevant for the phrase **konstytucja**.

I've found 10 documents with highest matching score.

In [32]:
# `match` query for "konstytucja" returning 10 hits; `filter_path` keeps only
# each hit's `_id` and `_score`.
konstytucja_search = {
    "query": {"match": {"content": {"query": "konstytucja"}}},
    "size": 10
}
response = requests.get(
    url=f"{es_url}/acts/_search?filter_path=hits.hits._id,hits.hits._score",
    json=konstytucja_search
)

Answer:

In [33]:
# Print the list of hits (`_id` and `_score` only, per the request's `filter_path`).
pprint(response.json()["hits"]["hits"])

[
    {
        "_id": "1997_629",
        "_score": 6.869184
    },
    {
        "_id": "2000_443",
        "_score": 6.663479
    },
    {
        "_id": "1997_604",
        "_score": 6.632288
    },
    {
        "_id": "1996_350",
        "_score": 6.6273947
    },
    {
        "_id": "1997_642",
        "_score": 6.2522817
    },
    {
        "_id": "2001_23",
        "_score": 6.056855
    },
    {
        "_id": "1996_199",
        "_score": 5.9267144
    },
    {
        "_id": "1999_688",
        "_score": 5.848894
    },
    {
        "_id": "1997_681",
        "_score": 5.4653444
    },
    {
        "_id": "2001_1082",
        "_score": 5.4653444
    }
]


## 12. Print the excerpts containing the word **konstytucja** (up to three excerpts per document) from the previous task.

In [25]:
# Same `match` query as in the previous task, now with highlighting enabled.
# The task asks for up to three excerpts per document, so the highlighter is
# limited to 3 fragments per hit (the cell previously requested only 2).
response = requests.get(
    url=f"{es_url}/acts/_search?filter_path=hits.hits._id,hits.hits.highlight",
    json={
        "query": {
            "match": {
                "content": {
                    "query": "konstytucja"
                }
            }
        },
        "highlight": {
            "fields": {
                "content": {
                    "number_of_fragments": 3
                }
            }
        },
        "size": 10
    }
)

pprint(response)

{
    "hits": {
        "hits": [
            {
                "_id": "1997_629",
                "highlight": {
                    "content": [
                        "o zmianie ustawy konstytucyjnej o trybie przygotowania\n           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej",
                        "W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i \nuchwalenia <em>Konstytucji</em>"
                    ]
                }
            },
            {
                "_id": "2000_443",
                "highlight": {
                    "content": [
                        "umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89\n     ust. 1 lub art. 90 <em>Konstytucji</em>",
                        "co do zasadności wyboru\n  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2\n  <em>Konstytucji</em>"
                    ]
                }
            },
            {
                "_id": "1997_604",
     