## Install ElasticSearch (ES).
### Install Elasticsearch on Ubuntu
https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
sudo apt-get install elasticsearch=7.9.2

## start/stop
sudo -i service elasticsearch start
sudo -i service elasticsearch stop

## check if running
curl -X GET "localhost:9200/?pretty"

In [1]:
import json
import requests
# Query the node's root endpoint and show the decoded JSON body it returns.
response = requests.get("http://localhost:9200/?pretty")
print(response.json())

{'name': 'micha-ThinkPad-E570', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'VOsFF90kQr2uZSI0euCoVg', 'version': {'number': '7.9.2', 'build_flavor': 'default', 'build_type': 'deb', 'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e', 'build_date': '2020-09-23T00:45:33.626720Z', 'build_snapshot': False, 'lucene_version': '8.6.2', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


## Install an ES plugin for Polish https://github.com/allegro/elasticsearch-analysis-morfologik
## Install Morfologik from the Elasticsearch directory - use the same version as Elasticsearch
cd /usr/share/elasticsearch
sudo bin/elasticsearch-plugin install pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:7.9.2

## restart
sudo -i service elasticsearch stop
sudo -i service elasticsearch start

## check if working
curl -XGET "http://localhost:9200/_analyze?pretty" -H 'Content-Type: application/json' -d '{ "analyzer": "morfologik", "text": "jestem" }'

In [1]:
# Ask the _analyze endpoint to run the "morfologik" analyzer on a sample word.
# `headers` and `params` keep these names because the index-creation request
# further down passes them again.
headers = {
    "Content-Type": "application/json",
}
params = (
    ("pretty", ""),
)
data = '{ "analyzer": "morfologik", "text": "jestem" }'

response = requests.post(
    "http://localhost:9200/_analyze",
    headers=headers,
    params=params,
    data=data,
)
print(response.json())

NameError: name 'requests' is not defined

## Define an ES analyzer for Polish texts containing:
        standard tokenizer
                      "tokenizer": "standard"
        synonym filter with the following definitions:
            kpk - kodeks postępowania karnego
            kpc - kodeks postępowania cywilnego
            kk - kodeks karny
            kc - kodeks cywilny
                       "filter":{
                        "synonym_filter":{
                            "type": "synonym",
                            "synonyms":[
                                "kpk => kodeks postępowania karnego",
                                "kpc => kodeks postępowania cywilnego",
                                "kk => kodeks karny",
                                "kc => kodeks cywilny"]}}
        Morfologik-based lemmatizer
                       "filter": "morfologik_stem"
        lowercase filter
                       "filter": "lowercase"

## Define an ES index for storing the contents of the legislative acts.

In [3]:
# Index settings: one analyzer, registered under the name "default", built from
# the standard tokenizer and three token filters applied in this order:
# synonym expansion -> Morfologik lemmatisation -> lowercasing.
di = {
    "settings": {
        "analysis": {
            "analyzer": {
                "default": {
                    "tokenizer": "standard",
                    "filter": ["synonym_filter", "morfologik_stem", "lowercase"],
                },
            },
            "filter": {
                # Expands the abbreviations of the four legal codes.
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": [
                        "kpk => kodeks postępowania karnego",
                        "kpc => kodeks postępowania cywilnego",
                        "kk => kodeks karny",
                        "kc => kodeks cywilny",
                    ],
                },
            },
        },
    },
}
data = json.dumps(di)

# Create the index with the settings above (`headers` / `params` come from the
# analyzer smoke-test cell) and show the server's JSON reply.
response = requests.put(
    "http://localhost:9200/test_index",
    headers=headers,
    params=params,
    data=data,
)
print(response.json())

# Send an explicit _open request for the index and show the raw response object.
response = requests.post("http://localhost:9200/test_index/_open")
print(response)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'test_index'}
<Response [200]>


## Load the data to the ES index.

In [4]:
import os
from elasticsearch import Elasticsearch, helpers

# Client object used by the cells below; targets host "localhost", port 9200.
es = Elasticsearch(host="localhost", port=9200)

In [5]:
def load_data(directory='../ustawy/', index_name='test_index'):
    """Index every regular file found in ``directory`` as one document.

    Each document has two fields: ``filename`` (the bare file name) and
    ``payload`` (the whole content of the file as text).

    Args:
        directory: Folder holding the text files. A relative path is resolved
            against the current working directory.
        index_name: Name of the Elasticsearch index that receives the documents.
    """
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be read as text; skip them.
            continue
        # Decode explicitly instead of depending on the platform's default
        # encoding, which would garble Polish diacritics on non-UTF-8 systems.
        # NOTE(review): the corpus is assumed to be UTF-8 encoded - confirm.
        with open(path, 'r', encoding='utf-8') as file:
            payload = file.read()
        es.index(index=index_name, body={"filename": filename, "payload": payload})


load_data()


## Determine the number of legislative acts containing the word ustawa (in any form).

In [6]:
# Match query for the single word "ustawa" against the "payload" field.
query = {
    "query": {
        "match": {"payload": "ustawa"},
    },
}
res = es.search(index="test_index", body=query)

# Report the total hit count taken from the response, not the returned page.
total_hits = res["hits"]["total"]["value"]
print("The total number of hits is: {}".format(total_hits))

The total number of hits is: 1179


## Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflectional form.

In [7]:
# Phrase query: the three words must appear in this order in "payload".
query = {
    "query": {
        "match_phrase": {"payload": "kodeks postępowania cywilnego"},
    },
}
res = es.search(index="test_index", body=query)

# Report the total hit count taken from the response, not the returned page.
total_hits = res["hits"]["total"]["value"]
print("The total number of hits is: {}".format(total_hits))

The total number of hits is: 100


## Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.

In [8]:
# Intervals query on "payload": the words of "wchodzi w życie" with a
# max_gaps setting of 2.
intervals_rule = {
    "match": {
        "query": "wchodzi w życie",
        "max_gaps": 2,
    },
}
query = {
    "query": {
        "intervals": {"payload": intervals_rule},
    },
}
res = es.search(index="test_index", body=query)

# Report the total hit count taken from the response, not the returned page.
total_hits = res["hits"]["total"]["value"]
print("The total number of hits is: {}".format(total_hits))

The total number of hits is: 1175


## Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [9]:
# Fetch the 10 best-scoring documents for the word "konstytucja".
query = {
    # The task asks for exactly 10 documents, so request that page size
    # explicitly instead of depending on the server-side default.
    "size": 10,
    "sort": ["_score"],
    "query": {
        "match": {"payload": "konstytucja"},
    },
}
res = es.search(index='test_index', body=query)
# print(res)

In [10]:
# Print the source file name of every returned hit, in response order.
for doc in res["hits"]["hits"]:
    source = doc["_source"]
    print(source["filename"])


1997_629.txt
2000_443.txt
1997_604.txt
1996_350.txt
1997_642.txt
2001_23.txt
1996_199.txt
1999_688.txt
1997_681.txt
2001_1082.txt


## Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.

In [11]:
# Same "konstytucja" search as before, now also requesting highlighted
# excerpts of the "payload" field: up to 3 fragments, fragment size 150.
query = {
    # The excerpts are wanted for the 10 most relevant documents, so request
    # that page size explicitly instead of depending on the server-side default.
    "size": 10,
    "sort": ["_score"],
    "highlight": {
        "fields": {
            "payload": {"fragment_size": 150, "number_of_fragments": 3},
        },
    },
    "query": {
        "match": {"payload": "konstytucja"},
    },
}
res = es.search(index='test_index', body=query)
# print(res)

In [12]:
# For each hit, print its file name followed by its numbered highlight
# fragments (numbering starts at 1 for every file).
for doc in res["hits"]["hits"]:
    print("FILE: {}".format(doc["_source"]["filename"]))
    fragments = doc["highlight"]["payload"]
    for number, fragment in enumerate(fragments, start=1):
        print('EXCERPT {}: \n'.format(number))
        print(fragment)


FILE: 1997_629.txt
EXCERPT 1: 

o zmianie ustawy konstytucyjnej o trybie przygotowania
           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej Polskiej
EXCERPT 2: 

W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i 
uchwalenia <em>Konstytucji</em> Rzeczypospolitej Polskiej (Dz.U.
EXCERPT 3: 

Zasady, na których opierać się ma <em>Konstytucja</em> mogą
                być poddane pod referendum.
              2.
FILE: 2000_443.txt
EXCERPT 1: 

Ratyfikacji podlegają umowy międzynarodowe, o których mowa w art. 89 ust.
  1 i art. 90 <em>Konstytucji</em> Rzeczypospolitej Polskiej, oraz inne umowy
  międzynarodowe
EXCERPT 2: 

ma charakter wykonawczy w stosunku do obowiązującej
     umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89
     ust. 1 lub art. 90 <em>Konstytucji</em>
EXCERPT 3: 

Rzeczypospolitej Polskiej lub po zawiadomieniu
  Sejmu Rzeczypospolitej Polskiej zgodnie z art. 89 ust. 2 <em>Konstytucji</em>
  Rzeczypospolitej Polsk

In [13]:
# print indicies
# Print the name of every index matched by the '*' pattern.
all_indices = es.indices.get('*')
for index_name in all_indices:
    print(index_name)

test_index


In [14]:
# delate index
# Delete the index; `ignore` lists the HTTP status codes (400, 404) that the
# client should not raise on.
res = es.indices.delete(
    index="test_index",
    ignore=[400, 404],
)
print(res)

{'acknowledged': True}


In [15]:
# Show whatever indices still match the '*' pattern.
remaining_indices = es.indices.get('*')
print(remaining_indices)

{}
