# NLP - lab 2

**1. Install ElasticSearch (ES).**

First, we install and set up ElasticSearch.

In [3]:
!pip install elasticsearch==7.10.0
!pip install elasticsearch_dsl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting elasticsearch==7.10.0
  Downloading elasticsearch-7.10.0-py2.py3-none-any.whl (321 kB)
[K     |████████████████████████████████| 321 kB 7.2 MB/s 
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting elasticsearch_dsl
  Downloading elasticsearch_dsl-7.4.0-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 2.1 MB/s 
Installing collected packages: elasticsearch-dsl
Successfully installed elasticsearch-dsl-7.4.0


In [4]:
%%bash

# Abort the cell on the first failing command (e.g. a failed download or a
# checksum mismatch) instead of carrying on with a broken installation.
set -euo pipefail

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.0-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.0-linux-x86_64.tar.gz.sha512
# Verify the archive BEFORE unpacking it, so a corrupted or tampered download
# is never extracted.
shasum -a 512 -c elasticsearch-7.10.0-linux-x86_64.tar.gz.sha512
tar -xzf elasticsearch-7.10.0-linux-x86_64.tar.gz
# The server is later started as the `daemon` user, so that user must own the
# installation directory.
sudo chown -R daemon:daemon elasticsearch-7.10.0/

elasticsearch-7.10.0-linux-x86_64.tar.gz: OK


**2. Install an ES plugin for Polish.**


In [5]:
%%bash

# Install the Morfologik analysis plugin; its version (7.10.0) matches the
# Elasticsearch release unpacked above.
sudo elasticsearch-7.10.0/bin/elasticsearch-plugin install pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:7.10.0

-> Installing pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:7.10.0
-> Downloading pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:7.10.0 from maven central
-> Installed analysis-morfologik


Let's start ElasticSearch.

In [6]:
%%bash --bg

# Launch the server as the `daemon` user; the `--bg` flag on the cell magic
# keeps the process running in the background while later cells execute.
sudo -H -u daemon elasticsearch-7.10.0/bin/elasticsearch

In [8]:
%%bash

# The server was started in the background and may still be booting, so retry
# refused connections (up to 30 attempts, 2 s apart) instead of failing on the
# first request when the notebook is executed top to bottom.
curl -sX GET --retry 30 --retry-delay 2 --retry-connrefused "localhost:9200/"

{
  "name" : "004596a5dbda",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "VNi-azCURF-wI5wG9wQrpw",
  "version" : {
    "number" : "7.10.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "51e9d6f22758d0374a0f3f5c6e8f3a7997850f96",
    "build_date" : "2020-11-09T21:30:33.964949Z",
    "build_snapshot" : false,
    "lucene_version" : "8.7.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [9]:
import elasticsearch
import elasticsearch_dsl
import os
import regex as reg

# Client created with no arguments; presumably this targets the local node
# queried with curl above (localhost:9200) -- TODO confirm.
es = elasticsearch.Elasticsearch()
# Register the same client with elasticsearch_dsl under the alias 'python_client'.
elasticsearch_dsl.connections.add_connection('python_client', es)
# Last expression of the cell: shows the server info as the cell output.
es.info()

{'name': '004596a5dbda',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'VNi-azCURF-wI5wG9wQrpw',
 'version': {'number': '7.10.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '51e9d6f22758d0374a0f3f5c6e8f3a7997850f96',
  'build_date': '2020-11-09T21:30:33.964949Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

**3. Define an ES analyzer for Polish texts.**

In [10]:
# Custom analyzer applied to the `content` field of the Act document:
# standard tokenizer, then lowercase -> synonym expansion of legal-code
# abbreviations -> Morfologik stemming, in that order.
# NOTE(review): the synonym filter runs before the lemmatizer, so presumably
# only the (lowercased) forms listed below are recognised as synonyms --
# confirm this ordering is intended.
analyzer = elasticsearch_dsl.analyzer(
    'bills_analyzer',
    type='custom',
    tokenizer='standard',                             # standard tokenizer
    filter=[
        'lowercase',                                  # lowercase filter
        elasticsearch_dsl.analysis.token_filter(      # synonym filter
            'synonym_filter', 
            type='synonym', 
            synonyms=[
                'kpk,kodeks postępowania karnego',
                'kpc,kodeks postępowania cywilnego',
                'kk,kodeks karny',
                'kc,kodeks cywilny',
            ],
        ),
        'morfologik_stem',                          # Morfologik-based lemmatizer
    ]
)

In [110]:
# List the installed plugins (with a header row and the selected columns) to
# confirm that the Morfologik plugin is available on the node.
print(es.cat.plugins(params={'v': '', 'h': 'name,component,version,description'}))

name         component           version description
004596a5dbda analysis-morfologik 7.10.0  Morfologik Polish Lemmatizer plugin for Elasticsearch



**4. Define an ES index for storing the contents of the legislative acts.**

In [11]:
# Wrapper class
class Act(elasticsearch_dsl.Document):
    """Document mapping for one legislative act stored in the 'acts' index.

    Only the text of the act is mapped; it is analyzed with the module-level
    ``analyzer``.
    """
    # Candidate metadata fields, currently disabled:
    # file_name = elasticsearch_dsl.Text()
    # year = elasticsearch_dsl.Integer()
    content = elasticsearch_dsl.Text(
        analyzer=analyzer
    )
    
    class Index:
        # Name of the Elasticsearch index backing this document class.
        name = 'acts'

In [197]:
# Initialization: drop any existing 'acts' index first so that re-running this
# cell recreates the index from scratch.
if Act._index.exists(using=es):
    Act._index.delete(using=es)

# Create the index for the Act document class.
Act.init(using=es)

**5. Load the data to the ES index.**

In [200]:
import tqdm
from pathlib import Path
import requests

# Directory whose entries are indexed below, one document per entry.
path = Path('./Acts')
# Number of directory entries; used as the progress-bar total when indexing.
n_acts = len(list(path.iterdir()))
# Must match the index name declared in Act.Index.
index_name = "acts"
# Base URL used by the raw `requests` calls in this notebook.
es_url = "http://localhost:9200"

def load_data_to_ES_index(path, n_acts, index_name) -> None:
  """Index every entry of ``path`` as one document of ``index_name``.

  Each file is POSTed to ``{es_url}/{index_name}/_doc/{file stem}`` with its
  UTF-8 text stored under the ``content`` field; ``es_url`` is the
  module-level base URL.

  Args:
    path: ``pathlib.Path`` of the directory holding the act files.
    n_acts: Expected number of files; used only as the progress-bar total.
    index_name: Name of the target Elasticsearch index.

  Raises:
    requests.HTTPError: If Elasticsearch answers any indexing request with an
      error status.
  """
  # A single session reuses the HTTP connection across all requests.
  with requests.Session() as session:
    for act in tqdm.tqdm(path.iterdir(), desc="Indexing acts", total=n_acts):
      act_id = act.stem
      response = session.post(
          url="{}/{}/_doc/{}".format(es_url, index_name, act_id),
          json={
              "content": act.read_text(encoding="utf8")
          }
      )
      # Fail loudly: an ignored error response would leave the index silently
      # incomplete.
      response.raise_for_status()

load_data_to_ES_index(path, n_acts, index_name)

Indexing acts: 100%|██████████| 1179/1179 [00:44<00:00, 26.46it/s]


In [201]:
def check_correcntess(index_name, es=es, path=path) -> bool:
  """Check that the index holds exactly as many documents as the corpus has files.

  Args:
    index_name: Name of the Elasticsearch index to inspect.
    es: Elasticsearch client used for the request.
    path: Directory containing the corpus files.

  Returns:
    True when the document count of the index equals the number of entries
    in ``path``, False otherwise.
  """
  # Freshly indexed documents only become visible after a refresh; force one
  # so the check does not depend on timing.
  es.indices.refresh(index=index_name)
  # The Count API returns the exact number of documents, whereas the total
  # reported by a search is capped at 10,000 by default.
  result_es = es.count(index=index_name)['count']
  result_true = len(os.listdir(path))

  return result_es == result_true

print(check_correcntess(index_name))

True


**6. Determine the number of legislative acts containing the word ustawa (in any form).**

In [264]:
def calculate_number_of_files_with_word(word, index, es = es) -> int:
  """Count the documents whose ``content`` field matches ``word`` as a phrase.

  The text is matched with a ``match_phrase`` query, so a multi-word ``word``
  must occur in the given order.

  Args:
    word: Word or phrase to look for.
    index: Name of the Elasticsearch index to query.
    es: Elasticsearch client used for the request.

  Returns:
    The number of matching documents.
  """
  # Use the Count API: the total reported by a search is capped at 10,000 by
  # default, while the count is always exact.
  response = es.count(
      index=index,
      body={
        "query": {
            "match_phrase": {
                "content": {
                    "query": word
                }
            }
        }
      }
  )
  return response['count']

print('The number of files containing the word "ustawa" is: {}'.format(calculate_number_of_files_with_word('ustawa', 'acts')))

The number of files containing the word "ustawa" is: 1178


**7. Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.**

In [257]:
# Task 7: request the term vectors of one document with term statistics
# enabled, then read 'ttf' (total term frequency, per the Elasticsearch term
# vectors API) for the lemma 'ustawa'.
# NOTE(review): the lookup below raises KeyError if 'ustawa' is not among the
# terms returned for document 2004_894.
result_7 = requests.get(
    url="{}/acts/_termvectors/2004_894".format(es_url),
    json={
        "fields": ["content"],
        "term_statistics": True
    }
)

result_7.json()['term_vectors']['content']['terms']['ustawa']['ttf']

24934

**8. Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.**

In [256]:
# Task 8: same term-vectors request as in task 7, but reading 'ttf' for the
# term 'ustawić'.
# NOTE(review): presumably 'ustawić' is the lemma the analyzer assigns to the
# form 'ustaw' -- confirm. The lookup raises KeyError if the term is not
# returned for document 2004_894.
result_8 = requests.get(
    url=f"{es_url}/acts/_termvectors/2004_894",
    json={
        "fields": ["content"],
        "term_statistics": True
    }
)

result_8.json()['term_vectors']['content']['terms']['ustawić']['ttf']

913

**9. Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form.**

In [265]:
# Task 9: reuse the phrase-counting helper for the three-word phrase.
# NOTE(review): the analyzer's synonym filter lists 'kpc' as a synonym of this
# phrase, so the count presumably also covers acts using the abbreviation --
# confirm.
result_9 = calculate_number_of_files_with_word('kodeks postępowania cywilnego', 'acts')
print('The number of files containing the word "kodeks postępowania cywilnego" is: {}'.format(result_9))

The number of files containing the word "kodeks postępowania cywilnego" is: 99


**10. Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.**

In [266]:
# Task 10: phrase query with a slop of 2, matching the task's allowance of up
# to two additional words inside the phrase.
phrase_query = {
    "match_phrase": {
        "content": {
            "query": "wchodzi w życie",
            "slop": 2
        }
    }
}
result_10 = es.search(index=index_name, body={"query": phrase_query})
matching_acts = result_10['hits']['total']['value']

print('The number of files containing the words "wchodzi w życie" is: {}'.format(matching_acts))

The number of files containing the words "wchodzi w życie" is: 1174


**11. Determine the 10 documents that are the most relevant for the phrase konstytucja.**

In [278]:
# Task 11: full-text `match` query on `content`, limited to 10 hits.
# `filter_path` trims the response to the id and relevance score of each hit.
result_11 = requests.get(
    url="{}/acts/_search?filter_path=hits.hits._id,hits.hits._score".format(es_url),
    json={
        "query": {
            "match": {
                "content": {
                    "query": "konstytucja"
                }
            }
        },
        "size": 10
    }
)

# Last expression of the cell: displays the list of hits.
result_11.json()['hits']['hits']

[{'_id': '1997_629', '_score': 6.869376},
 {'_id': '2000_443', '_score': 6.6642833},
 {'_id': '1997_604', '_score': 6.633483},
 {'_id': '1996_350', '_score': 6.628302},
 {'_id': '1997_642', '_score': 6.2530584},
 {'_id': '2001_23', '_score': 6.0589767},
 {'_id': '1996_199', '_score': 5.9289904},
 {'_id': '1999_688', '_score': 5.8507533},
 {'_id': '2001_1082', '_score': 5.467437},
 {'_id': '1997_681', '_score': 5.467437}]

**12. Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.**

In [280]:
# Task 12: same query as in task 11, now asking for highlighted excerpts of
# `content`; `number_of_fragments` is set to 3 per the task's limit of three
# excerpts per document. `filter_path` keeps only ids and highlights.
result_12 = requests.get(
    url="{}/acts/_search?filter_path=hits.hits._id,hits.hits.highlight".format(es_url),
    json={
        "query": {
            "match": {
                "content": {
                    "query": "konstytucja"
                }
            }
        },
        "highlight": {
            "fields": {
                "content": {
                    "number_of_fragments": 3
                }
            }
        },
        "size": 10
    }
)

# Last expression of the cell: displays the hits with their highlights.
result_12.json()['hits']['hits']

[{'_id': '1997_629',
  'highlight': {'content': ['o zmianie ustawy konstytucyjnej o trybie przygotowania\n           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej',
    'W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i \nuchwalenia <em>Konstytucji</em>',
    'Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz \n                obywateli popierających zgłoszenie']}},
 {'_id': '2000_443',
  'highlight': {'content': ['umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89\n     ust. 1 lub art. 90 <em>Konstytucji</em>',
    'międzynarodowej lub załącznika nie\n     wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>',
    'co do zasadności wyboru\n  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2\n  <em>Konstytucji</em>']}},
 {'_id': '1997_604',
  'highlight': {'content': ['Jeżeli Trybunał Konstytucyjny wyda orzeczenie o sprzeczności celów partii \n   politycznej z <em>Konstyt