In [1]:
import os
from elasticsearch import Elasticsearch

def create_index(elastic_client):
    """Create the ``my_custom_index`` index with a custom analyzer on the ``data`` field.

    The analyzer (``my_custom_analyzer``) applies the ``standard`` tokenizer and then,
    in order, the ``lowercase`` filter, the ``my_synonyms`` synonym filter defined
    below, and the ``morfologik_stem`` filter.

    Args:
        elastic_client: client object exposing ``indices.create(index=..., body=...)``.
    """
    # Abbreviations of legal codes and their expanded names, treated as synonyms.
    synonym_rules = [
        "kpk, kodeks postępowania karnego",
        "kpc, kodeks postępowania cywilnego",
        "kk, kodeks karny",
        "kc, kodeks cywilny",
    ]
    synonym_filter = {"type": "synonym", "synonyms": synonym_rules}

    # NOTE(review): "morfologik_stem" is not defined in this request, so it has to be
    # available on the cluster already (presumably via an analysis plugin) - confirm.
    analyzer_definition = {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "my_synonyms", "morfologik_stem"],
    }

    analysis_settings = {
        "analyzer": {"my_custom_analyzer": analyzer_definition},
        "filter": {"my_synonyms": synonym_filter},
    }
    data_field = {"type": "text", "analyzer": "my_custom_analyzer"}

    index_definition = {
        "settings": {"analysis": analysis_settings},
        "mappings": {"properties": {"data": data_field}},
    }
    elastic_client.indices.create(index="my_custom_index", body=index_definition)


def add_files(elastic_client, directory="ustawy"):
    """Index every regular file found in ``directory`` into ``my_custom_index``.

    Each file becomes one document: the file name is used as the document id and
    the file content is stored under the ``data`` field.

    Args:
        elastic_client: client object exposing ``create(index=..., id=..., body=...)``.
        directory: folder holding the documents to index; defaults to ``"ustawy"``,
            the location that was previously hard-coded.

    Raises:
        OSError: if ``directory`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    for file_name in os.listdir(directory):
        path = os.path.join(directory, file_name)
        # os.listdir also returns sub-directories; open() would fail on them.
        if not os.path.isfile(path):
            continue
        # Decode explicitly instead of relying on the platform default encoding,
        # which would garble Polish diacritics on non-UTF-8 locales.
        # Assumes the corpus files are UTF-8 encoded - TODO confirm.
        with open(path, "r", encoding="utf-8") as document:
            act = document.read()
        # Keyword arguments keep the call independent of the client's positional order.
        elastic_client.create(index="my_custom_index", id=file_name, body={"data": act})


def elastic_search_query(elastic_client, query_body):
    """Run ``query_body`` against ``my_custom_index`` and print the response's total-hits entry.

    Args:
        elastic_client: client object exposing ``search(index=..., body=...)``.
        query_body: search request body forwarded unchanged to the client.
    """
    response = elastic_client.search(body=query_body, index="my_custom_index")
    hits_section = response["hits"]
    print(hits_section["total"])


def task_one(elastic_client):
    """Task 1: count the legislative acts containing the word "ustawa" (in any form).

    Issues a ``match`` query on the ``data`` field and lets
    ``elastic_search_query`` print the total number of hits.
    """
    match_clause = {"data": {"query": "ustawa"}}
    elastic_search_query(elastic_client, {"query": {"match": match_clause}})


def task_two(elastic_client):
    """Task 2: count the acts containing "kodeks postępowania cywilnego" as a phrase.

    The words must appear in the given order but may be in any inflected form.
    Issues a ``match_phrase`` query on the ``data`` field and lets
    ``elastic_search_query`` print the total number of hits.
    """
    phrase_clause = {"data": {"query": "kodeks postępowania cywilnego"}}
    elastic_search_query(elastic_client, {"query": {"match_phrase": phrase_clause}})


def task_three(elastic_client):
    """Task 3: count the acts containing "wchodzi w życie" (in any form).

    Up to 2 additional words are allowed inside the searched phrase, expressed
    through ``slop: 2`` on a ``match_phrase`` query over the ``data`` field.
    ``elastic_search_query`` prints the total number of hits.
    """
    phrase_clause = {
        "data": {
            "query": "wchodzi w życie",
            "slop": 2,
        }
    }
    elastic_search_query(elastic_client, {"query": {"match_phrase": phrase_clause}})


def task_four(elastic_client):
    """Task 4: determine the 10 documents most relevant for the phrase "konstytucja".

    Prints the total-hits entry of the response (as the other tasks do) and then
    one line per returned document: its rank, document id and relevance score.
    Previously only the hit count was printed, so the documents themselves were
    never reported.

    Args:
        elastic_client: client object exposing ``search(index=..., body=...)``.
    """
    query_body = {
        # Request exactly the ten best matches instead of relying on the server default.
        "size": 10,
        "query": {"match": {"data": {"query": "konstytucja"}}},
    }
    result = elastic_client.search(index="my_custom_index", body=query_body)
    hits = result["hits"]
    print(hits["total"])
    # No "sort" is given, so Elasticsearch returns hits by descending relevance score.
    for rank, hit in enumerate(hits["hits"], start=1):
        print(rank, hit["_id"], hit["_score"])


def task_five(elastic_client):
    """Task 5: print excerpts containing "konstytucja" for the documents of task four.

    Repeats the task-four query (``match`` on ``data`` for "konstytucja", 10 hits)
    with highlighting enabled and prints, for every returned document, its id
    followed by at most three highlighted excerpts.

    Args:
        elastic_client: client object exposing ``search(index=..., body=...)``.
    """
    query_body = {
        "size": 10,
        "query": {"match": {"data": {"query": "konstytucja"}}},
        # Cap the highlighter at three fragments per document, as the task requires.
        "highlight": {"fields": {"data": {"number_of_fragments": 3}}},
    }
    result = elastic_client.search(index="my_custom_index", body=query_body)
    for hit in result["hits"]["hits"]:
        print(hit["_id"])
        # A hit may come back without a "highlight" section; print nothing for it then.
        for excerpt in hit.get("highlight", {}).get("data", []):
            print(excerpt)

# Client shared by all task cells below. No connection arguments are passed,
# so the library's default connection settings are used.
elastic_client = Elasticsearch()
# One-time setup, left disabled - presumably the index was already created and
# populated in an earlier run; uncomment both calls when starting from an empty cluster.
# create_index(elastic_client)
# add_files(elastic_client)


In [2]:
 # Task 1: prints the total-hits entry for the "ustawa" match query.
 task_one(elastic_client)

{'value': 1179, 'relation': 'eq'}


In [3]:
# Task 2: prints the total-hits entry for the "kodeks postępowania cywilnego" phrase query.
task_two(elastic_client)

{'value': 100, 'relation': 'eq'}


In [4]:
# Task 3: prints the total-hits entry for the "wchodzi w życie" phrase query with slop 2.
task_three(elastic_client)

{'value': 1175, 'relation': 'eq'}
