In [1]:
import json
import re

In [2]:
def parse_elastic_result(file_path, field='column1'):
    """Extract one ``_source`` field from every hit of a saved search response.

    The file must contain a search response that is JSON except that some
    string values may be wrapped in three double quotes instead of one
    (presumably the console's raw-string notation -- not verifiable from
    this file). Every such literal is rewritten as a properly escaped JSON
    string before the document is parsed.

    Args:
        file_path: Path of the text file holding the response.
        field: Key to read from each hit's ``_source`` object. Defaults to
            ``'column1'``, the key this notebook compares on.

    Returns:
        list: The value of ``field`` for each entry of ``hits.hits``, in the
        order the hits appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the text is still not valid JSON after the
            triple-quoted literals have been rewritten.
        KeyError: If ``hits``/``hits.hits``, ``_source`` or ``field`` is
            missing.
    """
    with open(file_path) as handle:
        raw_text = handle.read()

    # DOTALL lets a triple-quoted literal span several lines; the non-greedy
    # group stops at the nearest closing delimiter so that neighbouring
    # literals are not merged. json.dumps supplies the quoting and escaping.
    valid_json = re.sub(
        r'"""(.*?)"""',
        lambda match: json.dumps(match.group(1)),
        raw_text,
        flags=re.DOTALL,
    )
    response = json.loads(valid_json)

    return [hit['_source'][field] for hit in response['hits']['hits']]

In [3]:
def parse_our_result(file_path):
    """Read a result file and return its lines without surrounding whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line of the file, in file order, each with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    stripped_lines = []
    with open(file_path) as handle:
        # Iterating the handle yields the same lines readlines() would.
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

In [None]:
queries = ['cat', 'cat salmon', 'cat salmon fresh', 'my cat eats a salmon a day']
limit = 100  # only the first `limit` entries of each result list are compared

# Expected layout of the results folder (resolved as ../results, relative to
# the working directory). File names use the query with spaces replaced by
# dashes:
# results
# ├── ElasticSearch
# │   ├── elastic_cat.txt
# │   ├── ...
# └── Trigram
#     ├── stemming_cat.txt
#     ├── ...
#     ├── unstemmed_cat.txt
#     ├── ...
for query in queries:
    # All three file names share the same dash-separated form of the query.
    slug = query.replace(' ', '-')

    elastic_top = set(parse_elastic_result(f"../results/ElasticSearch/elastic_{slug}.txt")[:limit])

    # Overlap with the ElasticSearch top results for each Trigram variant;
    # the stemming file is read first, then the unstemmed one.
    stemmed_overlap, unstemmed_overlap = (
        len(elastic_top & set(parse_our_result(f"../results/Trigram/{variant}_{slug}.txt")[:limit]))
        for variant in ('stemming', 'unstemmed')
    )

    print(f'"{query}": {stemmed_overlap} with stemmer, {unstemmed_overlap} without stemmer')