# Scoring based on the Search API

The aim of this notebook is to give an example of how to populate the PaNOSC Search Scoring service with the items to be scored, taken from the reference implementation of the PaNOSC Search API.

Requirements
- A running instance of the reference implementation of the Search API service with scoring enabled.
  (please use this branch: <https://github.com/panosc-eu/search-api/tree/SWAP-2417>)
- A running instance of the PaNOSC Search Scoring service.


**Important**: This example deletes all items from the given scoring API service and re-populates it based on the given search API service.


### Load required packages

In [94]:
# load packages
import requests
import os
import json
import time

### Load configuration

In [95]:
# load configuration
def load_configuration():
    """Assemble every setting the notebook needs into one dictionary.

    Returns:
        dict with these keys:
          * 'search_api_url', 'score_api_url' -- service base addresses, read
            from the SEARCH_API_URL / SCORE_API_URL environment variables with
            localhost fallbacks.
          * 'dataset_filter', 'document_filter' -- request parameters whose
            'filter' entry is a JSON-encoded include clause.
          * 'mappers' -- for 'datasets' and 'documents', a dict that maps each
            output field name to a callable deriving that field from one
            record.

    The mapper callables resolve ``copy_dict`` only when they are invoked, so
    that helper has to exist by the time the mappers are used, not when this
    function runs.
    """
    environment = os.environ
    # Address of the search API
    search_url = environment.get('SEARCH_API_URL', 'http://localhost:3000/api')
    # Address of the scoring API
    score_url = environment.get('SCORE_API_URL', 'http://localhost:8000')

    def related(record, relation, dropped):
        # Copy each entry of a list-valued relation, leaving out the dropped keys.
        return [
            copy_dict(entry, skip_fields=list(dropped))
            for entry in record.get(relation, [])
        ]

    def member_fields(member):
        # Reduce one membership to role / person / affiliation, without the ids.
        return {
            'role': member.get('role', ''),
            'person': copy_dict(member.get('person', {}), skip_fields=['id']),
            'affiliation': copy_dict(member.get('affiliation', {}), skip_fields=['id']),
        }

    # Relations the Search API is asked to embed in each record.
    dataset_include = ['instrument', 'techniques', 'samples', 'parameters']
    members_relation = {
        'relation': 'members',
        'scope': {'include': ['person', 'affiliation']},
    }
    parameters_relation = {'relation': 'parameters'}
    document_include = [members_relation, parameters_relation]

    settings = {
        'search_api_url': search_url,
        'score_api_url': score_url,
        'dataset_filter': {'filter': json.dumps({'include': dataset_include})},
        'document_filter': {'filter': json.dumps({'include': document_include})},
    }

    # TODO: It would be nice to have a loopback filter to skip mapping part, we need a loopback expert.
    settings['mappers'] = {
        'datasets': {
            'title': lambda rec: rec['title'],
            'instrument': lambda rec: copy_dict(rec.get('instrument', {}), skip_fields=['pid']),
            'techniques': lambda rec: related(rec, 'techniques', ('pid',)),
            'samples': lambda rec: related(rec, 'samples', ('pid',)),
            'parameters': lambda rec: related(rec, 'parameters', ('id', 'datasetId')),
        },
        'documents': {
            'title': lambda rec: rec.get('title', ''),
            'summary': lambda rec: rec.get('summary', ''),
            'type': lambda rec: rec.get('type', ''),
            'parameters': lambda rec: related(rec, 'parameters', ('id', 'documentId')),
            'members': lambda rec: [member_fields(m) for m in rec.get('members', [])],
        },
    }
    return settings

# Build the configuration once; the cells below read it through this module-level name.
configuration = load_configuration()

### Collect datasets via Search API (instrument, techniques, samples and parameters are included)

In [96]:
# get datasets
def load_datasets(config):
    """Fetch the datasets, with their included relations, from the Search API.

    Args:
        config: mapping providing 'search_api_url' (base address of the Search
            API) and 'dataset_filter' (query parameters sent with the request).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the Search API answers with a 4xx/5xx status.
    """
    response = requests.get(
        url=f'{config["search_api_url"]}/Datasets',
        headers={'Accept': 'application/json'},
        params=config['dataset_filter']
    )
    # Stop on an HTTP error rather than returning the error payload as if it
    # were the list of datasets.
    response.raise_for_status()
    return response.json()

# Fetch the datasets from the Search API; they are turned into scoring items in a later cell.
datasets = load_datasets(config=configuration)

### Collect documents via Search API (members and parameters are included)

In [97]:
# get documents
def load_documents(config):
    """Fetch the documents, with their included relations, from the Search API.

    Args:
        config: mapping providing 'search_api_url' (base address of the Search
            API) and 'document_filter' (query parameters sent with the request).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the Search API answers with a 4xx/5xx status.
    """
    response = requests.get(
        url=f'{config["search_api_url"]}/Documents',
        headers={'Accept': 'application/json'},
        params=config['document_filter']
    )
    # Stop on an HTTP error rather than returning the error payload as if it
    # were the list of documents.
    response.raise_for_status()
    return response.json()

# Fetch the documents from the Search API; they are turned into scoring items in a later cell.
documents = load_documents(config=configuration)

### Create data for scoring based on datasets and documents

In [98]:
# Create score data for datasets and documents
def copy_dict(d, skip_fields=None):
    """Return a shallow copy of ``d`` without the keys listed in ``skip_fields``.

    Args:
        d: the mapping to copy; it is not modified.
        skip_fields: container of keys to leave out. ``None`` (the default)
            leaves nothing out.

    Returns:
        A new dict holding the remaining key/value pairs in their original
        order.
    """
    excluded = [] if skip_fields is None else skip_fields
    kept = {}
    for key, value in d.items():
        if key in excluded:
            continue
        kept[key] = value
    return kept

def extract(data, d_map, group):
    """Build one scoring item from a single record.

    Args:
        data: the source record; must contain a 'pid' entry.
        d_map: dict mapping an output field name to a callable that is given
            ``data`` and returns the value for that field.
        group: value stored under the item's 'group' key.

    Returns:
        dict with the keys 'id' (the record's 'pid'), 'group' and 'fields'.
    """
    derived = {}
    for name, derive in d_map.items():
        derived[name] = derive(data)
    return dict(id=data['pid'], group=group, fields=derived)

# Turn every fetched record into a scoring item ({'id', 'group', 'fields'}),
# using the group-specific mappers from the configuration.
prepared_datasets = [extract(dataset, configuration['mappers']['datasets'], 'Datasets') for dataset in datasets]
prepared_documents = [extract(document, configuration['mappers']['documents'], 'Documents') for document in documents]

### Clear score database

In [99]:
# Clear score database
def clear_scoring_service(config):
    """Delete every item currently stored in the scoring service.

    The item count is read first and used as the 'limit' of the listing
    request, then one DELETE request is sent per listed item.

    Args:
        config: mapping providing 'score_api_url' (base address of the scoring
            API).

    Returns:
        list of HTTP status codes, one per DELETE request, in listing order.
        Empty when the service reports no items.
    """
    items_url = f'{config["score_api_url"]}/items'
    count = requests.get(f'{items_url}/count').json()['count']
    deleted_items = []
    if count > 0:
        listing = requests.get(url=items_url, params={'limit': count})
        for item in listing.json():
            # NOTE(review): the id is joined into the path unescaped; ids that
            # contain '/' rely on the service accepting that -- confirm.
            response = requests.delete(url='/'.join([items_url, item['id']]))
            # Record the outcome of every deletion, not only the last one.
            deleted_items.append(response.status_code)
    return deleted_items

# Destructive step: sends a DELETE for every item the scoring service currently lists.
clear_scoring_service(config=configuration)

### Upload data to scoring service

In [100]:
# upload
def upload_data(config, data):
    """Post a batch of prepared items to the scoring service.

    Args:
        config: mapping providing 'score_api_url' (base address of the scoring
            API).
        data: JSON-serialisable payload sent as the request body.

    Raises:
        requests.HTTPError: if the scoring service answers with a 4xx/5xx
            status.
    """
    response = requests.post(
        url=f'{config["score_api_url"]}/items',
        json=data
    )
    # A rejected upload would otherwise go unnoticed.
    response.raise_for_status()

# Post the prepared datasets first, then the prepared documents, one request each.
upload_data(config=configuration, data=prepared_datasets)
upload_data(config=configuration, data=prepared_documents)


### Compute weight information

In [101]:
# compute weight
def compute_weight(config, poll_interval=1):
    """Start the weight computation and block until the service reports it done.

    Args:
        config: mapping providing 'score_api_url' (base address of the scoring
            API).
        poll_interval: seconds to sleep between two status checks. Defaults to
            1, the interval that was previously hard-coded.

    Raises:
        requests.HTTPError: if the request that starts the computation is
            answered with a 4xx/5xx status.
    """
    compute_url = f'{config["score_api_url"]}/compute'
    # Do not start polling when the computation was never accepted.
    requests.post(url=compute_url).raise_for_status()
    # Wait till compute finishes
    while requests.get(url=compute_url).json()['inProgress']:
        time.sleep(poll_interval)

# Trigger the computation and block until the service no longer reports it as in progress.
compute_weight(config=configuration)

## Test scoring

In [125]:
def check_only_scoring(config):
    """Ask the scoring service to score two sample documents and print the result.

    On a 200 response every entry of the reply's 'scores' list is printed on
    its own line; for any other status the raw response text is printed.

    Args:
        config: mapping providing 'score_api_url' (base address of the scoring
            API).
    """
    payload = {
        'query': 'proposal',
        'itemIds': [
            '10.5072/panosc-document1',
            '10.5072/panosc-document2',
        ],
        # NOTE(review): a 'group': 'Documents' entry is intentionally not sent;
        # presumably it would restrict scoring to one group -- confirm against
        # the scoring API before enabling it.
    }
    # Send the body through `json=` so that requests serialises it and sets the
    # JSON Content-Type header, the same way upload_data posts its payload.
    response = requests.post(
        url=f'{config["score_api_url"]}/score',
        json=payload
    )

    if response.status_code == 200:
        for item in response.json()['scores']:
            print(item)
    else:
        print(response.text)

# Score the two sample documents for the query 'proposal' and print what the service returns.
check_only_scoring(config=configuration)

{'itemId': '10.5072/panosc-document2', 'score': 0.45000000000000007, 'group': ''}
{'itemId': '10.5072/panosc-document2', 'score': 0.45000000000000007, 'group': ''}


In [103]:
def test_query(config=configuration):
    """Send a sample 'query' filter to the Search API Datasets endpoint and print the reply.

    Args:
        config: mapping providing 'search_api_url'. The default is whatever the
            module-level ``configuration`` held when this function was defined.
    """
    search_filter = {"query": "james pub", "limit": 50}
    endpoint = f'{config["search_api_url"]}/Datasets'
    reply = requests.get(
        url=endpoint,
        params={'filter': json.dumps(search_filter)},
        headers={'Accept': 'application/json'},
    )
    print(reply.json())
# Run the sample dataset query and print the decoded response.
test_query(config=configuration)

[{'pid': '20.500.12269/panosc-dataset1', 'title': 'PaNOSC Test Dataset 1', 'isPublic': True, 'creationDate': '2020-05-05T15:01:02.341Z', 'instrumentId': '20.500.12269/0f98fcf2-7bd7-430e-ad20-d47031ca8f71'}, {'pid': '20.500.12269/panosc-dataset2', 'title': 'PaNOSC Test Dataset 2', 'isPublic': True, 'creationDate': '2020-05-05T15:01:02.341Z', 'instrumentId': '20.500.12269/125e8172-d0f4-4547-98be-a9db903a6269'}, {'pid': '20.500.12269/panosc-dataset3', 'title': 'PaNOSC Test Dataset 3', 'isPublic': True, 'creationDate': '2020-05-05T15:01:02.341Z', 'instrumentId': '20.500.12269/f0637030-9f89-4398-8f01-09211145efa1'}, {'pid': '20.500.12269/panosc-dataset4', 'title': 'PaNOSC Test Dataset 4', 'isPublic': True, 'creationDate': '2020-05-05T15:01:02.341Z', 'instrumentId': '20.500.12269/d3dd2880-637a-40b5-9815-990453817f0e'}]
