Marcin Wardyński  
czwartek, 8:00

## Lab 2

Używam zdockeryzowanego Elasticsearch z repozytorium i zakładam, że przed wykonaniem jakichkolwiek zapytań kontener ten zostanie uruchomiony.

#### 1. Utwórz analizator polskich tekstów

In [42]:
# Base address of the local Elasticsearch node.
es_url = "http://localhost:9200"
# Name of the index used by every request in this notebook.
index_name = "mw_nlp_lab2"

# URL of the index itself: base address and index name joined with a slash.
index_url = "/".join((es_url, index_name))

In [49]:
import requests

# Remove the lab index from Elasticsearch.
delete_response = requests.delete(index_url)

# Only HTTP 200 counts as success; any other status is reported together
# with the response body returned by the server.
if delete_response.status_code != 200:
    print(f"Failed to delete index '{index_name}': {delete_response.text}")
else:
    print(f"Index '{index_name}' deleted successfully.")

Index 'mw_nlp_lab2' deleted successfully.


In [53]:
import requests

# Ask the _cat API for the list of indices, requesting JSON output.
index_list_response = requests.get(
    f"{es_url}/_cat/indices",
    params={"format": "json"},
)
# Bare expression so that the notebook displays the raw response body.
index_list_response.content

b'[{"health":"yellow","status":"open","index":"mw_nlp_lab2","uuid":"yqa92N2ZSneOi0LMvvYkGQ","pri":"1","rep":"1","docs.count":"57638","docs.deleted":"0","store.size":"86.7mb","pri.store.size":"86.7mb","dataset.size":"86.7mb"}]'

In [51]:
import requests
import json

def _custom_analyzer(token_filters):
    """Return a custom analyzer on the standard tokenizer with the given filter chain."""
    return {
        "type": "custom",
        "tokenizer": "standard",
        "filter": list(token_filters),
    }


def _analyzed_subfield(analyzer_name):
    """Return a text sub-field definition that is analysed with *analyzer_name*."""
    return {"type": "text", "analyzer": analyzer_name}


# Index definition: two token filters, two analyzers that differ only in
# whether the synonym filter is part of the chain, and a `text` field that
# exposes one sub-field per analyzer.
fiqa_index_settings = {
    "settings": {
        "analysis": {
            "filter": {
                # Synonym filter listing alternative spellings of April.
                "polish_months_synonym": {
                    "type": "synonym",
                    "synonyms": ["kwiecień, kwi, IV"],
                },
                # Token filter of type `morfologik_stem`.
                "polish_morfologik": {"type": "morfologik_stem"},
            },
            "analyzer": {
                # Chain with synonyms.
                "polish_analyzer_1": _custom_analyzer(
                    ["lowercase", "polish_months_synonym", "polish_morfologik", "lowercase"]
                ),
                # Same chain without the synonym filter.
                "polish_analyzer_2": _custom_analyzer(
                    ["lowercase", "polish_morfologik", "lowercase"]
                ),
            },
        },
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "fields": {
                    "analyzed_1": _analyzed_subfield("polish_analyzer_1"),
                    "analyzed_2": _analyzed_subfield("polish_analyzer_2"),
                },
            },
        },
    },
}

# Create the index by sending the settings and mappings, serialised as JSON,
# with a PUT request to the index URL.
response = requests.put(
    index_url,
    data=json.dumps(fiqa_index_settings),
    headers={"Content-Type": "application/json"},
)

# Only HTTP 200 is treated as success; otherwise the response body is shown.
print(
    "Index created."
    if response.status_code == 200
    else f"Index creation failed: {response.text}"
)


Index created.


In [52]:
from datasets import load_dataset

# Load the "corpus" configuration of the Polish FiQA dataset.
fiqa_dataset = load_dataset("clarin-knext/fiqa-pl", name="corpus")

# Build the NDJSON body for the _bulk API: for every corpus entry one action
# line followed by one document line. The lines are collected in a list and
# joined once, because repeated `+=` on an ever-growing string can degrade to
# quadratic time on a large corpus.
bulk_lines = []
for entry in fiqa_dataset["corpus"]:
    bulk_lines.append(json.dumps({"index": {"_index": index_name, "_id": entry["_id"]}}))
    bulk_lines.append(json.dumps({"text": entry["text"]}))

# Every line, including the last one, is terminated by a newline.
bulk_data = "".join(f"{line}\n" for line in bulk_lines)

bulk_response = requests.post(
    f"{es_url}/_bulk",
    headers={"Content-Type": "application/x-ndjson"},
    data=bulk_data,
)

if bulk_response.status_code == 200:
    response_data = bulk_response.json()
    # A 200 status does not mean that every document was accepted, so the
    # per-item results are scanned (once) for error entries.
    indexing_errors = [
        item["index"]["error"]
        for item in response_data["items"]
        if item.get("index", {}).get("error")
    ]
    if indexing_errors:
        print("Some documents failed to index:")
        for indexing_error in indexing_errors:
            print(indexing_error)
    else:
        print("All documents indexed successfully.")
else:
    print(f"Failed to index data: {bulk_response.text}")

All documents indexed successfully.


In [59]:
# Word to search for and the analysed sub-field to search in.
# `text.analyzed_2` is mapped to `polish_analyzer_2`, the analyzer whose
# filter chain does not contain the synonym filter.
search_word = "kwiecień"
search_field = "text.analyzed_2"

# Upper bounds: documents returned and highlight fragments per document.
max_docs_no = 1000
max_highlights_no = 100

# Match query on the chosen field with plain highlighting of the same field.
search_query = {
    "size": max_docs_no,
    "query": {"match": {search_field: search_word}},
    "highlight": {
        "fields": {
            search_field: {
                "type": "plain",
                "fragment_size": 0,
                "number_of_fragments": max_highlights_no,
            },
        },
    },
}

response = requests.get(
    f"{index_url}/_search",
    headers={"Content-Type": "application/json"},
    data=json.dumps(search_query),
)

# IDs of the documents returned by the search; stays empty if the search fails.
docs_found = set()

if response.status_code != 200:
    print(f"Search failed: {response.text}")
else:
    search_results = response.json()

    # Every highlight fragment of a hit is counted as one match.
    matches_counter = 0
    for hit in search_results["hits"]["hits"]:
        docs_found.add(hit["_id"])
        matches_counter += len(hit["highlight"][search_field])

    print(f"Found {search_results['hits']['total']['value']} documents and {matches_counter} matches of '{search_word}' in {search_field}.")


Found 257 documents and 353 matches of 'kwiecień' in text.analyzed_2.


Powyższy kod zwraca przy wyszukiwaniu z synonimami (pole `text.analyzed_1`) 306 dokumentów,
natomiast bez synonimów (pole `text.analyzed_2`) 257 dokumentów.

W poprzednim laboratorium mieliśmy za zadanie utworzyć wyrażenie regularne, które znajduje "kwiecień" w pełnej odmianie przez przypadki obydwu liczb. Poniżej użyję tego kodu jeszcze raz, żeby sprawdzić, jak się mają jego wyniki do wyników analizatora bez synonimów. (Porównanie z synonimami nie ma większego sensu, gdyż wyrażenie regularne nie miało ich uwzględniać).

In [61]:
import regex

# Documents that the regular expression is run against.
corpus = fiqa_dataset["corpus"]

# Matches the stems "kwiecień" and "kwietni", regardless of letter case.
april_p = r"kwie(cień|tni)"
april_pattern = regex.compile(
    april_p,
    flags=regex.IGNORECASE | regex.MULTILINE,
)

def count_april_occurrences(what, pattern):
    """Count the matches of *pattern* in every document of the global ``corpus``.

    Prints a one-line summary labelled with *what* and returns a dict that
    maps the ``_id`` of each document with at least one match to the number
    of matches found in its ``text``.
    """
    per_document = {}

    for document in corpus:
        hits = len(regex.findall(pattern, document["text"]))
        # Documents without any match are left out of the result.
        if hits:
            per_document[document["_id"]] = hits

    total = sum(per_document.values())
    print(f"{what} found in {len(per_document)} documents in total {total} times.")
    return per_document


# Per-document match counts for the "kwiecień" pattern.
occ_april = count_april_occurrences(
    what="'Kwiecien' (directly)",
    pattern=april_pattern,
)
# IDs of the documents in which the regular expression matched.
regex_docs_found = occ_april.keys()

'Kwiecien' (directly) found in 265 documents in total 362 times.


In [62]:
# Documents returned by Elasticsearch that the regular expression did not match.
docs_found.difference(regex_docs_found)

set()

Nie ma dokumentów ze słowem bazującym na "kwiecień", które by zostało znalezione przez Elasticsearch, ale nie przez wyrażenie regularne.

In [63]:
# Documents matched by the regular expression that Elasticsearch did not return.
set(regex_docs_found).difference(docs_found)

{'109292', '159500', '166563', '208216', '265866', '441143', '469888', '82284'}

Za to istnieje osiem dokumentów odnalezionych przez wyrażenie regularne, ale nie przez FTS. Bierze się to z faktu, iż wyrażenie regularne zostało sformułowane dość luźno, przez co znajdowało przymiotnik od słowa "kwiecień", czyli "kwietniowy" i jego pełną fleksję.

In [None]:
from datasets import load_dataset

def prepare_fiqa_qrels():
    """Load the FiQA-PL relevance judgements and group them per query.

    Returns a dict that maps every ``query-id`` of the ``test`` split to a
    dict of ``corpus-id`` -> ``score``; the entries of each inner dict are
    ordered by ascending score.

    NOTE(review): ascending order puts the lowest scores first - confirm this
    is the order the later evaluation code expects.
    """
    qrels_dataset = load_dataset("clarin-knext/fiqa-pl-qrels")

    grouped = {}
    for row in qrels_dataset["test"]:
        grouped.setdefault(row["query-id"], {})[row["corpus-id"]] = row["score"]

    # Rebuild every inner dict in score order; the outer key order is kept.
    return {
        query_id: dict(sorted(judgements.items(), key=lambda pair: pair[1]))
        for query_id, judgements in grouped.items()
    }

def prepare_fiqa_queries(query_to_corpus_dict):
    """Return the texts of the queries that have relevance judgements.

    Reads the global ``fiqa_dataset['query']`` and keeps the entries whose
    ``_id`` is a key of *query_to_corpus_dict*. The result maps ``_id`` to the
    query ``text``.

    NOTE(review): ``fiqa_dataset`` is loaded elsewhere in this file with
    ``name="corpus"`` - confirm that it really exposes a ``query`` split, and
    that its ``_id`` values have the same type as the ``query-id`` keys.
    """
    return {
        entry["_id"]: entry["text"]
        for entry in fiqa_dataset["query"]
        if entry["_id"] in query_to_corpus_dict
    }


# Relevance judgements grouped per query, then the texts of exactly those queries.
query_to_corpus_dict = prepare_fiqa_qrels()
queries_dict = prepare_fiqa_queries(query_to_corpus_dict=query_to_corpus_dict)

In [None]:
def elastic_analyze(analyzer, query):
    """Run *query* through *analyzer* using the index's ``_analyze`` endpoint.

    The outcome is only printed: the decoded JSON body on HTTP 200, otherwise
    the status code followed by the raw response text. Nothing is returned.
    """
    response = requests.post(
        f"{index_url}/_analyze",
        headers={"Content-Type": "application/json"},
        data=json.dumps({"analyzer": analyzer, "text": query}),
    )

    # Report a failure and stop early; the success path follows below.
    if response.status_code != 200:
        print(f"Failed to analyze text: {response.status_code}")
        print(response.text)
        return

    print("Analysis result:", response.json())
    



def calculate_ndcg_5(analyzer, query):
    """Placeholder: print the analysis of *query* and return ``None``.

    No metric is computed yet; the function only delegates to
    ``elastic_analyze``.
    """
    elastic_analyze(analyzer=analyzer, query=query)

