In [1]:
import tqdm
import requests

from pathlib import Path
from elasticsearch import Elasticsearch

In [2]:
# Open a client against the local Elasticsearch node and confirm it is
# reachable: ping() reports reachability, info() returns cluster metadata.
es = Elasticsearch("http://localhost:9200")

for health_probe in (es.ping, es.info):
    print(health_probe())

True
{'name': 'b40d2515d547', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'U5o6JseNQMWv_ouTF8lKYA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [3]:
# Base URL of the local Elasticsearch node; the raw REST calls made with
# `requests` in the following cells are all built on top of it.
es_url = "http://localhost:9200"

In [4]:
# Query the root endpoint over plain HTTP; it returns the node name,
# cluster name and version metadata as JSON.
requests.get(url=es_url).json()

{'name': 'b40d2515d547',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'U5o6JseNQMWv_ouTF8lKYA',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [5]:
# Create the "acts" index with a custom analyzer for Polish legal text.

# Abbreviations of the main legal codes. The "=>" form replaces the short
# form with the full code name instead of keeping both.
law_abbreviations = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny",
]

synonym_filter = {
    "type": "synonym",
    "expand": False,
    "synonyms": law_abbreviations,
}

# Filters run in the listed order: synonyms, Morfologik stemming, lowercase.
# NOTE(review): the synonym rules are written in lowercase but lowercasing
# happens last, so upper-case abbreviations (e.g. "KPC") are presumably not
# expanded — confirm this is intended.
polish_law_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["synonym-filter", "morfologik_stem", "lowercase"],
}

index_body = {
    "settings": {
        "analysis": {
            "analyzer": {"polish-law-analyzer": polish_law_analyzer},
            "filter": {"synonym-filter": synonym_filter},
        }
    }
}

response = requests.put(url=f"{es_url}/acts", json=index_body)

response.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'acts'}

In [6]:
# Map the "content" field as full text processed by the custom analyzer
# configured in the index settings.
content_mapping = {"type": "text", "analyzer": "polish-law-analyzer"}

response = requests.put(
    url=f"{es_url}/acts/_mapping",
    json={"properties": {"content": content_mapping}},
)

response.json()

{'acknowledged': True}

In [7]:
# Sanity-check the analyzer on a phrase containing the "kpc" abbreviation,
# which exercises both the synonym filter and the stemmer.
synonym_sample = {"text": "jestem kpc", "analyzer": "polish-law-analyzer"}

response = requests.get(url=f"{es_url}/acts/_analyze", json=synonym_sample)

response.json()

{'tokens': [{'token': 'być',
   'start_offset': 0,
   'end_offset': 6,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'kodeks',
   'start_offset': 7,
   'end_offset': 10,
   'type': 'SYNONYM',
   'position': 1},
  {'token': 'postępowanie',
   'start_offset': 7,
   'end_offset': 10,
   'type': 'SYNONYM',
   'position': 2},
  {'token': 'postępować',
   'start_offset': 7,
   'end_offset': 10,
   'type': 'SYNONYM',
   'position': 2},
  {'token': 'cywilny',
   'start_offset': 7,
   'end_offset': 10,
   'type': 'SYNONYM',
   'position': 3}]}

In [25]:
# Inspect how the analyzer handles the inflected form "ustaw"; the response
# lists every token the analyzer emits for it.
inflected_sample = {"text": "ustaw", "analyzer": "polish-law-analyzer"}

response = requests.get(url=f"{es_url}/acts/_analyze", json=inflected_sample)

response.json()

{'tokens': [{'token': 'ustawa',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'ustawić',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0}]}

In [9]:
# Index every act file from the corpus directory into Elasticsearch,
# using the file stem (e.g. "1993_645") as the document id.
acts_dir = Path("../data/ustawy/")
index_name = "acts"

# List the directory once so the progress-bar total and the loop cover
# exactly the same entries; sorting makes the indexing order deterministic.
act_paths = sorted(acts_dir.iterdir())
n_acts = len(act_paths)

# (act_id, HTTP status) for every request Elasticsearch did not accept.
failed_acts = []

# A single session reuses the HTTP connection across all index requests.
with requests.Session() as session:
    for act in tqdm.tqdm(act_paths, desc="Indexing acts", total=n_acts):
        act_id = act.stem
        index_response = session.post(
            url=f"{es_url}/{index_name}/_doc/{act_id}",
            json={
                "content": act.read_text(encoding="utf8")
            }
        )
        # Record rejected documents instead of silently discarding the response.
        if not index_response.ok:
            failed_acts.append((act_id, index_response.status_code))

if failed_acts:
    print(f"Failed to index {len(failed_acts)} of {n_acts} acts: {failed_acts}")

Indexing acts: 100%|██████████| 1179/1179 [00:30<00:00, 38.11it/s]


In [13]:
# Fetch a single indexed act by its id to inspect the stored document.
sample_act_url = f"{es_url}/acts/_doc/1993_645"
requests.get(url=sample_act_url).json()

{'_index': 'acts',
 '_id': '1993_645',
 '_version': 1,
 '_seq_no': 2,
 '_primary_term': 1,
 'found': True,
 '_source': {'content': '\n\n\n\nDz.U. z 1993 r. Nr 134, poz. 645\n                                Ustawa \n                          z dnia 3 grudnia 1993 r.\n         o zmianie ustawy o kombatantach oraz niektórych osobach\n        będących ofiarami represji wojennych i okresu powojennego.\n                                Art. 1.\nW ustawie z dnia 24 stycznia 1991 r. o kombatantach oraz niektórych osobach\nbędących ofiarami represji wojennych i okresu powojennego (Dz.U. Nr 17, poz. 75 i\nNr 104, poz. 450, z 1992 r. Nr 21, poz. 85 oraz z 1993 r. Nr 29, poz. 133 i Nr 129,\npoz. 602) w art. 27 skreśla się wyrazy "jednak nie dłużej niż do dnia 31 grudnia\n1993 r."\n                                Art. 2.\nUstawa wchodzi w życie z dniem ogłoszenia. \n'}}

In [12]:
# Verify the indexing run: look up every act file by id and print the ids
# that Elasticsearch reports as missing (no output means all were found).
for act in acts_dir.iterdir():
    lookup = requests.get(url=f"{es_url}/acts/_doc/{act.stem}")
    found = lookup.json()["found"]
    if found:
        continue
    print(act.stem)

In [17]:
# Full-text match on the "content" field for "ustawa"; only the reported
# total number of hits is displayed.
ustawa_match = {"match": {"content": {"query": "ustawa"}}}

response = requests.get(
    url=f"{es_url}/acts/_search",
    json={"query": ustawa_match},
)

response.json()["hits"]["total"]["value"]

1178

In [26]:
# Run the same "ustawa" match query through the _count endpoint, which
# returns the number of matching documents plus shard statistics.
ustawa_match_query = {"match": {"content": {"query": "ustawa"}}}

response = requests.get(
    url=f"{es_url}/acts/_count",
    json={"query": ustawa_match_query},
)

response.json()

{'count': 1178,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}