In [47]:
import math
import requests


def query(query, metrics=None, summary=True):
    """
    Run an edismax search against the local Solr ``articles`` core.

    Args:
        query: Query string sent as the ``q`` parameter.
        metrics: Optional boost configuration shaped like the defaults below:
            ``qf`` and ``pf`` dicts of per-field boosts plus the
            ``consolidado`` and ``tie`` scalars. Any entry that is missing or
            ``None`` falls back to its default. An explicit ``0`` is sent to
            Solr as ``0`` (it is not replaced by the default).
        summary: When true, every document is reduced to a summary string by
            ``_parse_articles``; otherwise the raw Solr documents are returned.

    Returns:
        The list found under ``response.docs`` of the Solr answer (the request
        asks for at most 20 rows), optionally summarised.

    Raises:
        requests.HTTPError: If Solr answers with an HTTP error status.
    """
    default = {
        "qf": {
            "book": 3,
            "key": 10,
            "path": 1,
            "text": 1,
            "text_raw": 1.5,
            "title": 1,
            "title_raw": 1.5,
        },
        "pf": {
            "book": 3,
            "key": 10,
            "path": 1,
            "text": 1,
            "text_raw": 1.5,
            "title": 1,
            "title_raw": 1.5,
        },
        "consolidado": 4,
        "tie": 1,
    }
    if metrics is None:
        metrics = default

    def _field_boosts(section):
        # Builds "field^boost field^boost ..." in the default field order.
        # Only missing/None values fall back: a plain `or` would also discard
        # a legitimate boost of 0, which the tuning ranges can produce.
        supplied = metrics.get(section) or {}
        parts = []
        for field, fallback in default[section].items():
            boost = supplied.get(field)
            if boost is None:
                boost = fallback
            parts.append(f"{field}^{boost}")
        return " ".join(parts)

    def _scalar(name):
        # Same fallback rule as above for the top-level scalar settings.
        value = metrics.get(name)
        return default[name] if value is None else value

    response = requests.get(
        url="http://localhost:5002/solr/articles/select",
        params={
            "defType": "edismax",
            "qf": _field_boosts("qf"),
            "pf": _field_boosts("pf"),
            "bq": f"state:Consolidado^{_scalar('consolidado')}",
            "tie": f"{_scalar('tie')}",
            "q": query,
            "start": 0,
            "rows": 20
        }
    )
    # Surface Solr errors (bad query syntax, core down) as an HTTP error
    # instead of an opaque KeyError/JSON failure further down.
    response.raise_for_status()

    res = response.json()['response']['docs']
    if summary:
        res = _parse_articles(res)

    return res


def _parse_articles(articles):
    """
    Turn Solr article documents into their summary strings.

    Each document must provide ``book``, ``key``, ``path`` and ``date``; only
    the last entry of ``path`` is kept in the summary.
    """
    summaries = []
    for doc in articles:
        # path[-1] is the last element of the document's path list
        summaries.append(
            build_article(doc['book'], doc['key'], doc['path'][-1], doc['date'])
        )
    return summaries


def build_article(book, key, last_path, date):
    """
    Build the slash-separated identifier ``book/key/last_path/date`` that is
    used to compare search results with the expected articles.
    """
    return f"{book}/{key}/{last_path}/{date}"


def ndcg(results, expected, order=False):
    """
    Compute the normalized discounted cumulative gain (NDCG) of the results
    against the expected results.

    Args:
        results: Ranked list of returned items, best first.
        expected: List of the relevant items.
        order: When true, the relevance grade is the inverse position in
            ``expected`` (e.g. grades [3, 2, 1] for three expected items).
            When false, every expected item has grade 1.

    Returns:
        ``dcg / idcg`` as a float, where the ideal DCG is computed over the
        whole ``expected`` list. Returns ``0.0`` when ``expected`` is empty,
        because there is nothing to gain and the ratio would be 0 / 0.
    """
    len_expected = len(expected)
    if len_expected == 0:
        # No relevant items: the ideal DCG is 0, so avoid dividing by it.
        return 0.0

    def _discounted_gain(relevance_grade, position):
        # position is 0-based, hence the +2 inside the log2 discount.
        return (2 ** relevance_grade - 1) / math.log(position + 2, 2)

    dcg = 0.0
    for i, result in enumerate(results):
        if result in expected:
            if order:
                relevance_grade = len_expected - expected.index(result)
            else:
                relevance_grade = 1
            dcg += _discounted_gain(relevance_grade, i)

    # Ideal ranking: the expected items returned in exactly their given order.
    idcg = 0.0
    for i in range(len_expected):
        relevance_grade = len_expected - i if order else 1
        idcg += _discounted_gain(relevance_grade, i)

    return dcg / idcg


def ap(results, expected):
    """
    Score the results against the expected results with a rank-weighted hit
    sum (named "average precision" in this notebook).

    Every result found in ``expected`` contributes ``1 / rank`` (rank is
    1-based). The sum is divided by the best achievable value, i.e. the sum of
    ``1 / rank`` over as many top positions as there are expected items.

    Args:
        results: Ranked list of returned items, best first.
        expected: Collection of the relevant items.

    Returns:
        The normalised score as a float, or ``0.0`` when ``expected`` is empty
        (the normaliser would be zero).
    """
    hit_score = 0.0
    for i, result in enumerate(results):
        if result in expected:
            hit_score += 1 / (i + 1)

    best = 0.0
    for i, _ in enumerate(expected):
        best += 1 / (i + 1)

    if best == 0.0:
        # Nothing was expected, so there is no score to normalise.
        return 0.0
    return hit_score / best


In [48]:
def eval(query_str, expected, metrics):
    """
    Run ``query_str`` with the given boost ``metrics`` and return the NDCG of
    the summarised results against ``expected``.

    Note: this definition shadows Python's builtin ``eval`` in the notebook.
    """
    return ndcg(query(query_str, metrics), expected)


In [111]:
import itertools
import random


def score(metrics):
    """
    Score a boost configuration against the hand-labelled benchmark queries.

    Each query is evaluated with ``eval`` (NDCG) and rescaled with
    ``(ndcg - 0.5) * 2``. Only non-negative rescaled values are added up, and
    the sum is divided by the total number of benchmark queries.
    """
    labour = "Código do Trabalho"
    constitution = "Constituição da República Portuguesa"
    civil_registry = "Código do Registo Civil"
    penal = "Código Penal"

    overtime = "Subsecção VII Trabalho suplementar"
    parenthood = "Subsecção IV Parentalidade"
    principles = "Princípios fundamentais"
    rights = "Capítulo I Direitos, liberdades e garantias pessoais"
    punishment = "Capítulo I Pressupostos da punição"

    # query string -> expected articles as (book, key, last path entry, date)
    benchmark = {
        "horas suplementares": [
            (labour, 226, overtime, "2012-08-01T00:00:00Z"),
            (labour, 227, overtime, "2009-02-12T00:00:00Z"),
            (labour, 228, overtime, "2009-02-12T00:00:00Z"),
            (labour, 229, overtime, "2012-08-01T00:00:00Z"),
            (labour, 230, overtime, "2012-08-01T00:00:00Z"),
            (labour, 231, overtime, "2009-02-12T00:00:00Z"),
        ],
        "+socialismo +date:[1976-01-01T00:00:00Z TO 1976-12-31T23:59:59Z] +book:constituicao": [
            (constitution, 185, "Capítulo I Função e estrutura", "1976-04-10T00:00:00Z"),
            (constitution, 2, principles, "1976-04-10T00:00:00Z"),
            (constitution, 273, "Título X Defesa Nacional", "1976-04-10T00:00:00Z"),
            (constitution, 89, "Título I Princípios gerais", "1976-04-10T00:00:00Z"),
        ],
        "horario flexivel": [
            (labour, 56, parenthood, "2015-09-06T00:00:00Z"),
            (labour, 56, parenthood, "2009-02-12T00:00:00Z"),
            (labour, 57, parenthood, "2009-02-12T00:00:00Z"),
        ],
        "artigo 1 da constituicao": [
            (constitution, 1, principles, "1989-08-07T00:00:00Z"),
            (constitution, 1, principles, "1976-04-10T00:00:00Z"),
        ],
        "art 2 registo civil": [
            (civil_registry, 2, "Capítulo I Objecto e valor do registo civil", "1995-06-06T00:00:00Z"),
            (civil_registry, 2, "Diploma", "1995-06-06T00:00:00Z"),
        ],
        "ideologia fascista": [
            (constitution, 163, "Capítulo II Competência", "1976-04-10T00:00:00Z"),
            (constitution, 160, "Capítulo I Estatuto e eleição", "1997-10-05T00:00:00Z"),
            (constitution, 46, rights, "1982-10-30T00:00:00Z"),
            (constitution, 46, rights, "1976-04-10T00:00:00Z"),
            (constitution, 46, rights, "1997-10-05T00:00:00Z"),
        ],
        "art 10 Código Penal": [
            (penal, 10, "Diploma", "1995-03-15T00:00:00Z"),
            (penal, 10, punishment, "1995-03-15T00:00:00Z"),
            (penal, 10, punishment, "1998-09-07T00:00:00Z"),
        ],
    }
    queries = {
        query_str: [build_article(*fields) for fields in rows]
        for query_str, rows in benchmark.items()
    }

    # Map NDCG from [0, 1] onto [-1, 1]; negative values contribute nothing.
    rescaled = (
        (eval(query_str, expected, metrics) - 0.5) * 2
        for query_str, expected in queries.items()
    )
    total = sum(value for value in rescaled if value >= 0)

    return total / len(queries)


def _flat(metrics):
    res = {}
    # for key value in dict
    for key, value in metrics.items():
        # if the type of the value is a dict
        if isinstance(value, dict):
            # for each key value in this dict
            for key2, value2 in value.items():
                # sets a new entry for the top dict with a changed name
                res[f'.{key}.{key2}'] = value2
        else:
            res[key] = value

    return res


def _unflat(flat):
    res = {}

    for key, value in flat.items():
        if '.' in key:
            # removes the first char
            key = key[1:]
            # gets the top dict name
            top_key = key.split('.')[0]
            # gets the bottom dict name
            bottom_key = key.split('.')[1]
            # verifies if the top key entry already exists
            if top_key in res:
                # adds the bottom key entry
                res[top_key][bottom_key] = value
            else:
                # creates the dict
                res[top_key] = {bottom_key: value}
        else:
            res[key] = value

    return res


def _create_metrics(metrics):
    """
    Expand a metrics dict whose leaves are iterables of candidate values into
    the list of every possible combination (cartesian product), each one
    returned in the nested metrics shape.
    """
    flat = _flat(metrics)
    names, candidates = zip(*flat.items())
    expanded = []
    for picked in itertools.product(*candidates):
        # pair each flattened key with the value picked for it
        expanded.append(_unflat(dict(zip(names, picked))))
    return expanded


def _random_metric(metrics):
    """
    Draw one random metrics configuration.

    Every leaf of ``metrics`` is read as a ``(low, high)`` pair and replaced
    by a float drawn with ``random.uniform(low, high)``.
    """
    sampled = {
        name: random.uniform(bounds[0], bounds[1])
        for name, bounds in _flat(metrics).items()
    }
    return _unflat(sampled)


def _create_random_metrics(metrics, max_metrics):
    """
    Draw up to ``max_metrics`` distinct random configurations.

    Each attempt picks one candidate per leaf with ``random.choice``. A draw
    that was already collected is discarded; after more than 50 such
    duplicates the sampling stops early. The result is a list of nested
    metrics dicts.
    """
    flat = _flat(metrics)

    unique_draws = []
    duplicate_hits = 0
    for _ in range(max_metrics):
        draw = {name: random.choice(options) for name, options in flat.items()}
        if draw not in unique_draws:
            unique_draws.append(draw)
            continue
        # already seen: count it and give up once duplicates pile up
        duplicate_hits += 1
        if duplicate_hits > 50:
            break

    return [_unflat(draw) for draw in unique_draws]


def improvement_hungry_tuning(all_metrics, improveless_iterations=200):
    """
    Random search over the ranges in ``all_metrics``.

    Keeps sampling random configurations and scoring them until
    ``improveless_iterations`` consecutive samples fail to beat the best score
    so far. Every improvement is printed.

    Returns:
        ``(best_metrics, best_score)``; ``(None, 0)`` if no sample ever scored
        above 0.
    """
    champion, champion_score = None, 0

    stale_rounds = 0
    while stale_rounds < improveless_iterations:
        candidate = _random_metric(all_metrics)
        candidate_score = score(candidate)

        if not candidate_score > champion_score:
            stale_rounds += 1
            continue

        # strictly better: remember it and restart the patience counter
        champion, champion_score = candidate, candidate_score
        stale_rounds = 0
        print(f'New best score: {champion_score}')

    return champion, champion_score


def iterative_tuning(metrics, improveless_iterations=200):
    """
    Repeat the random search on progressively re-centred ranges.

    After each round that improves on the best score, every range is replaced
    by a window centred on the best value found, whose width is the larger of
    the two distances from that value to the old bounds. The lower bound is
    clamped at 0. The new ranges are printed and searched again; the loop ends
    with the first round that does not improve.

    Returns:
        ``(best_metrics, best_score)`` of the best round.
    """
    best_metrics, best_score = None, 0
    candidate, candidate_score = improvement_hungry_tuning(
        metrics, improveless_iterations)

    while candidate_score > best_score:
        best_metrics, best_score = candidate, candidate_score

        # re-centre every range on the value that produced the best score
        ranges = _flat(metrics)
        for name, centre in _flat(best_metrics).items():
            low = ranges[name][0]
            high = ranges[name][1]
            half_width = max(centre - low, high - centre) / 2
            lower = centre - half_width
            upper = centre + half_width
            ranges[name] = (0 if lower < 0 else lower, upper)

        # search again inside the narrowed ranges
        metrics = _unflat(ranges)
        print(metrics)

        candidate, candidate_score = improvement_hungry_tuning(
            metrics, improveless_iterations)

    return best_metrics, best_score


In [112]:
# Search space: every qf/pf field boost and the "consolidado" boost may range
# over (0, 30); "tie" ranges over (0, 1).
metrics = {
    "qf": dict.fromkeys(
        ("book", "key", "path", "text", "text_raw", "title", "title_raw"),
        (0, 30),
    ),
    "pf": dict.fromkeys(
        ("book", "key", "path", "text", "text_raw", "title", "title_raw"),
        (0, 30),
    ),
    "consolidado": (0, 30),
    "tie": (0, 1),
}

# a is the (best_metrics, best_score) pair returned by the tuner
a = iterative_tuning(metrics)
print(a)

New best score: 0.4346888312750479
New best score: 0.5146571470729683
New best score: 0.6573462951969065
{'qf': {'book': (8.450113220468413, 25.35033966140524), 'key': (10.37272207911934, 31.118166237358018), 'path': (0, 18.9115611062361), 'text': (0, 19.167166503722548), 'text_raw': (0, 17.752368743126304), 'title': (0, 17.028848549366337), 'title_raw': (0, 16.739676271223495)}, 'pf': {'book': (0, 17.520108405091037), 'key': (0, 15.430122293281132), 'path': (12.828069148228366, 38.4842074446851), 'text': (7.810556501334479, 23.431669504003438), 'text_raw': (0, 18.503701057821964), 'title': (9.02814620348262, 27.08443861044786), 'title_raw': (12.25828536006352, 36.77485608019056)}, 'consolidado': (0, 17.972970418371496), 'tie': (0.06700464609966267, 0.6890015486998875)}
New best score: 0.45894180878459073
New best score: 0.5075857460542154
New best score: 0.5855437081783768
New best score: 0.8027217693014662
New best score: 0.8161096636057016
{'qf': {'book': (16.286447074578074, 31.959

In [139]:
# Inspect the raw Solr documents (summary=False) for a sample query, using the
# tuned boosts a[0] returned by iterative_tuning in the previous cell.
query('interrupção voluntária da gravidez', a[0], False)


[{'book': 'Código do Registo Civil',
  'book_url': ['/dre/legislacao-consolidada/decreto-lei/1995-34525275'],
  'date': '2002-04-25T00:00:00Z',
  'details': ['Aditado pelo/a Artigo único do/a Decreto-Lei n.º 113/2002  - Diário da República n.º 93/2002, Série I-A de 2002-04-20, em vigor a partir de 2002-04-25'],
  'key': '209 A',
  'path': ['Diploma',
   'Anexo',
   'Título II Actos de registo',
   'Capítulo II Actos de registo em especial',
   'Secção VI Óbito',
   'Subsecção V Morte fetal'],
  'president_name': ['Jorge Sampaio'],
  'president_party': ['Socialista'],
  'state': ['Consolidado'],
  'text': 'É dispensado o certificado médico de morte fetal quando ocorra a interrupção voluntária da gravidez, prevista na alínea c) do n.º 1 do artigo 142.º do Código Penal, bem como, até às 24 semanas da gestação, quando a interrupção da gravidez seja espontânea.',
  'title': 'Dispensa de certificado médico de morte fetal',
  'id': 'ebaba4ee-b2fb-46ff-bd91-8dd95e6e3363',
  '_version_': 172237