# Basic Logic Mill API usage

In [2]:
import requests
import json
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file so the API key never has to be
# hardcoded in the notebook.
load_dotenv()

API_KEY = os.getenv("API_KEY")

# The authenticated requests further down build their header as 'Bearer ' + API_KEY,
# which fails with an opaque TypeError when the key is missing. Warn here, where the
# cause is obvious, without blocking the requests that are sent without the header.
if not API_KEY:
    print("Warning: API_KEY is not set. Add API_KEY=<your key> to a .env file "
          "or to the environment before running the authenticated requests.")

# API documentation: https://api.logic-mill.net/api/v1/graph

# Get Information about Logic Mill API

## Version

In [3]:
# Ask the server which API version it exposes. This request is sent without an
# Authorization header.
url = 'https://api.logic-mill.net/api/v1/graphql/'
query = """{
  Version
}"""

payload = {'query': query}
r = requests.post(url, json=payload)

# Should be 200; anything else means an error has occurred (e.g. server not available).
print(r.status_code)
# The response body in text format.
print(r.text)

200
{
    "data": {
        "Version": "0.1"
    }
}


In [4]:
# Stop here unless the endpoint reports the specific version this notebook expects.
reported_version = r.json()["data"]["Version"]
assert reported_version == '0.1'

## Get the names of document sets (indices)
https://api.logic-mill.net/api/v1/graph/?query=%7B%0A%20%20IndicesNames%20%7B%0A%20%20%20%20amountOfDocuments%0A%20%20%20%20name%0A%20%20%7D%0A%7D

In [5]:
# List every document set (index) together with the number of documents it holds.
url = 'https://api.logic-mill.net/api/v1/graphql/'
query = """{
  IndicesNames {
    amountOfDocuments
    name
  }
}"""

payload = {'query': query}
r = requests.post(url, json=payload)

print(r.text)


{
    "data": {
        "IndicesNames": [
            {
                "amountOfDocuments": 64501832,
                "name": "patents_v2"
            },
            {
                "amountOfDocuments": 585449,
                "name": "upc_cases"
            },
            {
                "amountOfDocuments": 126918838,
                "name": "publications"
            }
        ]
    }
}


In [6]:
# Collect the index names from the previous response into a plain list
# (we'll use them later).
index_table = pd.DataFrame(r.json()['data']['IndicesNames'])
indices = index_table["name"].to_list()
indices

['patents_v2', 'upc_cases', 'publications']

## Keyword search

In [6]:
# TODO: add a keyword search example

## Getting basic information from a document by using the document ID

By changing the GraphQL query you can limit or extend which fields the server will retrieve for you.
The full list can be found online: https://api.logic-mill.net/api/v1/graph


> This endpoint will be deprecated soon. Use `Documents` instead.

In [19]:
# Simple version: the index name and document id are written directly into the query.
url = 'https://api.logic-mill.net/api/v1/graphql/'

# Headers for authenticated requests; later cells reuse this dictionary.
bearer_token = 'Bearer ' + API_KEY
headers = {
    'content-type': 'application/json',
    'Authorization': bearer_token,
}

query = """{
	Document(index:"publications", id:"W4313483528") {
    id
    title
    url
	}
}"""

r = requests.post(url, headers=headers, json={'query': query})

# The response body is printed either way; the extra line only appears on failure.
print(r.text)
if r.status_code != 200:
    print(f"Error executing\n{query}\non {url}")



{
    "data": {
        "Document": {
            "id": "W4313483528",
            "title": "Logic Mill -- A Knowledge Navigation System",
            "url": "https://openalex.org/works/W4313483528"
        }
    }
}


In [16]:
# Parameterized version
#
# 1. Start with the `query` keyword and declare the variables in parentheses,
#    copying their types from the documentation.
# 2. Reference the variables inside the query body (e.g. `$index`).
query = """query ($id:String!, $index:String!) {
	Document(index:$index, id:$id) {
	  id
    url
    title

	}
}"""

# Values bound to the variables declared above.
variables = dict(id="W4313483528", index="publications")

# `url` and `headers` are reused from the simple version.
payload = {'query': query, 'variables': variables}
r = requests.post(url, headers=headers, json=payload)
r.json()

{'data': {'Document': {'id': 'W4313483528',
   'title': 'Logic Mill -- A Knowledge Navigation System',
   'url': 'https://openalex.org/works/W4313483528'}}}

## Info from the document

To select specific items you have to use the keys in the dictionary


In [21]:
# Walk down the nested response one key at a time: data -> Document -> title.
data = r.json()['data']
document = data['Document']
title = document['title']
print(f"Title:\n{title}\n")


Title:
Logic Mill -- A Knowledge Navigation System



## Get the numerical representation/embedding

The query is the same as above, only different fields are requested

In [34]:
# Same parameter setup as the parameterized document query; the only difference is
# that the embedding field is requested as well.
query = """query ($id:String!, $index:String!) {
	Document(index:$index, id:$id) {
	  id
    url
    title
	PatspecterEmbedding
	}
}"""

variables = dict(id="W4313483528", index="publications")

payload = {'query': query, 'variables': variables}
r = requests.post(url, headers=headers, json=payload)

if r.status_code != 200:
    print(f"Error executing\n{query}\non {url}")
else:
    embedding = r.json()['data']['Document']['PatspecterEmbedding']
    # Only a short preview is printed instead of the whole vector.
    print("First 10 numbers of the numerical representation")
    print(embedding[:10])

First 10 numbers of the numerical representation
[-0.23040916, 0.09366139, -0.5264982, 0.35253185, -0.32284304, -1.2042444, 1.4183519, 0.85640496, -0.4407626, 0.8229229]


## Search _n_ most similar documents compared to a document in the database

In [54]:
# Number of similar documents wanted; one extra hit is requested below.
n = 5

url = 'https://api.logic-mill.net/api/v1/graphql/'

query = """query SimilaritySearch($index: String!, $id: String!, $amount: Int, $indices: [String], $model: String!) {
  SimilaritySearch(
    index: $index
    id: $id
    amount: $amount
    indices: $indices
    model: $model
  ) {
    id
    score
    index
    document {
      title
      url
    }
  }
}
"""

# `index`/`id` identify the reference document; `indices` lists where to search.
# NOTE(review): the IndicesNames listing earlier in this notebook shows "patents_v2";
# confirm that "patents" is still a valid index name.
variables = {
    "model": "patspecter",
    "amount": n + 1,
    "id": "91081326",
    "index": "patents",
    "indices": ["publications"],
}

payload = {'query': query, 'variables': variables}
r = requests.post(url, headers=headers, json=payload)

if r.status_code != 200:
    print(f"Error executing\n{query}\non {url}")
else:
    # Keep only the list of hits; later cells turn it into a DataFrame.
    results = r.json()['data']['SimilaritySearch']
    for hit in results:
        print(hit)

{'document': {'title': 'SPECIAL CHARACTERISTICS OF GLEANER COMBINE HARVESTERS', 'url': 'https://openalex.org/works/W3169719195'}, 'id': 'W3169719195', 'index': 'publications', 'score': 0.97302663}
{'document': {'title': 'A Wheel Elevator for Sugar Beet Harvesters', 'url': 'https://openalex.org/works/W2016749744'}, 'id': 'W2016749744', 'index': 'publications', 'score': 0.97295034}
{'document': {'title': 'Stepwise Construction of the Dedekind-MacNeille Completion (Research Note)', 'url': 'https://openalex.org/works/W1501118946'}, 'id': 'W1501118946', 'index': 'publications', 'score': 0.9728551}
{'document': {'title': 'Stochastic Models of Control and Economic Dynamics', 'url': 'https://openalex.org/works/W1588789815'}, 'id': 'W1588789815', 'index': 'publications', 'score': 0.97222877}
{'document': {'title': 'Diseño y simulación de un dispositivo vibrador multidireccional para cosechar capulí.', 'url': 'https://openalex.org/works/W3162661967'}, 'id': 'W3162661967', 'index': 'publications'

In [55]:
# Show the hits as a table, leaving out the first row.
# NOTE(review): presumably the first row is dropped because it is expected to be the
# reference document itself; here the search runs over a different index, so confirm.
pd.DataFrame(results).iloc[1:]

Unnamed: 0,document,id,index,score
1,{'title': 'A Wheel Elevator for Sugar Beet Har...,W2016749744,publications,0.97295
2,{'title': 'Stepwise Construction of the Dedeki...,W1501118946,publications,0.972855
3,{'title': 'Stochastic Models of Control and Ec...,W1588789815,publications,0.972229
4,{'title': 'Diseño y simulación de un dispositi...,W3162661967,publications,0.971483
5,{'title': 'Methods and devices for processing ...,W4289387989,publications,0.971297


**Observation 1:**

`id`, `score` and `index` are at the top of the hierarchy and can be used directly. The other relevant information (such as `title` and `url`) is nested within `document`.

**Observation 2:**

Because of the nested structure, a loop or an apply is needed. 

In [64]:
# Example with an element-wise function: lift every field of the nested `document`
# dictionary into its own column.
similar_docs = pd.DataFrame(results)

# The first row's document decides which fields are extracted.
document_fields = similar_docs["document"].iloc[0].keys()

for field in document_fields:
    similar_docs[field] = similar_docs["document"].map(lambda doc: doc[field])

# The nested column is no longer needed once its fields are columns of their own.
similar_docs = similar_docs.drop(columns="document")

similar_docs

Unnamed: 0,id,index,score,title,url
0,W3169719195,publications,0.973027,SPECIAL CHARACTERISTICS OF GLEANER COMBINE HAR...,https://openalex.org/works/W3169719195
1,W2016749744,publications,0.97295,A Wheel Elevator for Sugar Beet Harvesters,https://openalex.org/works/W2016749744
2,W1501118946,publications,0.972855,Stepwise Construction of the Dedekind-MacNeill...,https://openalex.org/works/W1501118946
3,W1588789815,publications,0.972229,Stochastic Models of Control and Economic Dyna...,https://openalex.org/works/W1588789815
4,W3162661967,publications,0.971483,Diseño y simulación de un dispositivo vibrador...,https://openalex.org/works/W3162661967
5,W4289387989,publications,0.971297,Methods and devices for processing the stemsto...,https://openalex.org/works/W4289387989


## Create a new embedding for a user document

We use the `encodeDocuments` endpoint. 


The data variable has the following form (`EncodeObject`):

```
  "data": [
    {
      "id": "trade_resolutions_pat",
      "parts": [
        {
          "key": "title",
          "value": "d"
        },
        {
          "key": "abstract",
          "value": "An electronic trading system utilizes...."
        }
      ]
    },
```
The `id` is for identification purposes only and can be any value you choose (the examples in this notebook use strings).

Often the data is already available in some data structure (CSV, Excel file, dictionary). In the following example we will use a dictionary.

In [69]:
# Our data: three bibliographic records, each a dictionary with an "id", a "title"
# and an "abstract". The following cells reshape these records into the key/value
# "parts" layout described above.

biblios = [{
    "id": "ML001",
    "title" : "Towards A Rigorous Science of Interpretable Machine Learning",
    "abstract" : "As machine learning systems become ubiquitous, there has been a surge of interest in interpretable machine learning: systems that provide explanation for their outputs. These explanations are often used to qualitatively assess other criteria such as safety or non-discrimination. However, despite the interest in interpretability, there is very little consensus on what interpretable machine learning is and how it should be measured. In this position paper, we first define interpretability and describe when interpretability is needed (and when it is not). Next, we suggest a taxonomy for rigorous evaluation and expose open questions towards a more rigorous science of interpretable machine learning."
} , {
    "id": "ML002",
    "title": "Machine Learning Interpretability: A Science rather than a tool",
    "abstract": """The term "interpretability" is oftenly used by machine learning researchers each with their own intuitive understanding of it. There is no universal well agreed upon definition of interpretability in machine learning. As any type of science discipline is mainly driven by the set of formulated questions rather than by different tools in that discipline, e.g. astrophysics is the discipline that learns the composition of stars, not as the discipline that use the spectroscopes. Similarly, we propose that machine learning interpretability should be a discipline that answers specific questions related to interpretability. These questions can be of statistical, causal and counterfactual nature. Therefore, there is a need to look into the interpretability problem of machine learning in the context of questions that need to be addressed rather than different tools. We discuss about a hypothetical interpretability framework driven by a question based scientific approach rather than some specific machine learning model. Using a question based notion of interpretability, we can step towards understanding the science of machine learning rather than its engineering. This notion will also help us understanding any specific problem more in depth rather than relying solely on machine learning methods"""
}, {
    "id": "ML003",
    "title": "Opening the black box of neural networks: methods for interpreting neural network models in clinical applications",
    "abstract": """Artificial neural networks (ANNs) are powerful tools for data analysis and are particularly suitable for modeling relationships between variables for best prediction of an outcome. While these models can be used to answer many important research questions, their utility has been critically limited because the interpretation of the "black box" model is difficult. Clinical investigators usually employ ANN models to predict the clinical outcomes or to make a diagnosis; the model however is difficult to interpret for clinicians. To address this important shortcoming of neural network modeling methods, we describe several methods to help subject-matter audiences (e.g., clinicians, medical policy makers) understand neural network models. Garson's algorithm describes the relative magnitude of the importance of a descriptor (predictor) in its connection with outcome variables by dissecting the model weights. The Lek's profile method explores the relationship of the outcome variable and a predictor of interest, while holding other predictors at constant values (e.g., minimum, 20th quartile, maximum). While Lek's profile was developed specifically for neural networks, partial dependence plot is a more generic version that visualize the relationship between an outcome and one or two predictors. Finally, the local interpretable model-agnostic explanations (LIME) method can show the predictions of any classification or regression, by approximating it locally with an interpretable model. R code for the implementations of these methods is shown by using example data fitted with a standard, feed-forward neural network model. We offer codes and step-by-step description on how to use these tools to facilitate better understanding of ANN"""
}]

# Tabular preview of the records (one row per record).
pd.DataFrame(biblios)

Unnamed: 0,id,title,abstract
0,ML001,Towards A Rigorous Science of Interpretable Ma...,"As machine learning systems become ubiquitous,..."
1,ML002,Machine Learning Interpretability: A Science r...,"The term ""interpretability"" is oftenly used by..."
2,ML003,Opening the black box of neural networks: meth...,Artificial neural networks (ANNs) are powerful...


We wish to encode the first item

In [70]:
# Take the first record (id "ML001") as the document to encode.
biblio = biblios[0]

In [71]:
# Prepare the data in the EncodeObject layout: the id stays at the top level and
# every other field of the record becomes a {"key", "value"} entry under "parts".
parts = [{"key": field, "value": text} for field, text in biblio.items() if field != 'id']
data = {"id": biblio['id'], "parts": parts}
data


{'id': 'ML001',
 'parts': [{'key': 'title',
   'value': 'Towards A Rigorous Science of Interpretable Machine Learning'},
  {'key': 'abstract',
   'value': 'As machine learning systems become ubiquitous, there has been a surge of interest in interpretable machine learning: systems that provide explanation for their outputs. These explanations are often used to qualitatively assess other criteria such as safety or non-discrimination. However, despite the interest in interpretability, there is very little consensus on what interpretable machine learning is and how it should be measured. In this position paper, we first define interpretability and describe when interpretability is needed (and when it is not). Next, we suggest a taxonomy for rigorous evaluation and expose open questions towards a more rigorous science of interpretable machine learning.'}]}

In [86]:
# Start the variables dictionary; the record is wrapped in a list under "data".
variables = dict(data=[data])


In [109]:
# The model name has to be supplied alongside the data.
variables.update(model="patspecter")
variables

{'model': 'patspecter',
 'data': [{'id': 'trade_resolutions_pat',
   'parts': [{'key': 'title',
     'value': 'market driven implied trade resolutions_pat'},
    {'key': 'abstract',
     'value': 'An electronic trading system utilizes a Match Engine that receives orders, stores them internally, calculates tradable combinations and advertises the availability of real and implied orders in the form of market data. New tradable items defined as combinations of other tradable items may be included in the calculation of tradable combinations. The disclosed embodiments relate to detection of market conditions where identification of implied opportunities may, for example, subvert real orders resulting in undesirable effects. Under circumstances where such undesirable effects are likely to occur, identification of implied opportunities may be delayed thereby allowing market forces to attempt to resolve the aberrant market conditions and avoid the undesirable effects.'}]},
  {'id': 'trading_sy

In [110]:
# prepare query with
query="""
query encodeDocuments($data: [EncodeObject], $model: String!) {
  encodeDocuments(data: $data, model: $model)
}
"""

In [111]:
from urllib3.util import Retry
from requests import Session
from requests.adapters import HTTPAdapter

URL = 'https://api.logic-mill.net/api/v1/graphql/'
# Seconds to wait for the server before giving up instead of blocking the kernel.
REQUEST_TIMEOUT = 60

# Establish a session that retries transient server errors.
# urllib3 only retries idempotent HTTP methods by default and POST is not one of
# them, so without `allowed_methods` the status_forcelist below would never trigger
# for these requests. (`allowed_methods` requires urllib3 >= 1.26.)
# NOTE(review): re-sending is assumed to be safe because the request is a GraphQL
# `query` operation rather than a mutation — confirm for other operations.
retries = Retry(
    total=5,
    backoff_factor=0.1,
    status_forcelist=[500, 501, 502, 503, 504, 524],
    allowed_methods=frozenset({"POST"}),
    # Return the last response once retries are exhausted so it can be reported
    # below instead of surfacing as an exception.
    raise_on_status=False,
)

s = Session()
s.mount('https://', HTTPAdapter(max_retries=retries))

r = s.post(URL, headers=headers, json={'query': query, 'variables': variables},
           timeout=REQUEST_TIMEOUT)

if r.status_code == 200:
    response = r.json()
    print(response)
else:
    # Report failures explicitly rather than ending the cell without any output.
    print(r.text)
    print(f"Error executing\n{query}\non {URL}")

{'data': {'encodeDocuments': [[-0.5094876, -0.42910758, -0.60446244, 0.08993333, 0.07467303, -1.3158134, 1.0888672, 0.32978496, -0.1328897, 0.45940572, 0.22710985, -0.19483793, -0.5310378, -0.5598794, -0.19108793, 0.18091515, -0.32183328, 0.8051622, -0.3257313, -0.5954319, 0.45176947, 0.45920807, -0.3988887, 0.36588034, 0.3895847, 1.0595201, -0.687179, 0.7835675, -0.15440154, 0.6325534, -0.07596238, -1.130564, 0.92189246, -0.0053143157, -0.14579551, 0.089132085, 0.322743, -1.0702231, -0.6496761, 0.042815957, -0.6911558, 0.39844596, 0.5646807, -1.519328, -0.5809928, 0.96163785, -0.30681768, 0.9669546, -0.23047183, -0.3112278, 0.83491683, -1.6832802, 0.75912595, 0.7178667, 0.33589488, 0.25779188, -1.0830967, -0.8940718, 0.8459311, 0.053522922, -1.3081673, 0.19630818, 1.0427357, -0.92023265, 1.5911087, 0.3184592, -0.02456668, -0.43697524, 0.054902494, 0.529553, 0.92171264, -0.23722008, 0.33234715, 0.11387506, 0.97097033, 0.07819168, -0.06347807, 0.8892983, -1.262248, -0.4431137, -0.212102

## Create embedding for multiple documents

We use a similar setup as with one document but in a loop with multiple API calls


In [20]:
# Encode every record in `biblios`, one API call per document, using the same
# `encodeDocuments` query, model and authenticated headers as the single-document
# example.
query = """
query encodeDocuments($data: [EncodeObject], $model: String!) {
  encodeDocuments(data: $data, model: $model)
}
"""
url = 'https://api.logic-mill.net/api/v1/graphql/'

embeddings = {}  # document id -> embedding vector
for biblio in biblios:
    data = {
        "id": biblio['id'],
        "parts": [{"key": k, "value": v} for k, v in biblio.items() if k != 'id'],
    }
    variables = {"data": [data], "model": "patspecter"}

    r = requests.post(url, headers=headers, json={'query': query, 'variables': variables})
    if r.status_code == 200:
        # The single-document example shows the response as a list of vectors;
        # only one document was sent, so its vector is the first entry.
        embeddings[biblio['id']] = r.json()["data"]["encodeDocuments"][0]
    else:
        # Report the failure instead of silently skipping the document.
        print(r.text)
        print(f"Error encoding {biblio['id']} on {url}")

# Build the frame once (one row per document id) rather than concatenating inside
# the loop.
df = pd.DataFrame.from_dict(embeddings, orient="index")
df


## Find similarity scores between user supplied documents

In [112]:
# Prepare the data field: one entry per record, each in the expected structure
# (id at the top level, all remaining fields as key/value pairs under "parts").
data = []

for biblio in biblios:
    parts = [{"key": field, "value": text} for field, text in biblio.items() if field != 'id']
    data.append({"id": biblio['id'], "parts": parts})



In [113]:
# Display the assembled list of records to check its structure before sending it.
data

[{'id': 'ML001',
  'parts': [{'key': 'title',
    'value': 'Towards A Rigorous Science of Interpretable Machine Learning'},
   {'key': 'abstract',
    'value': 'As machine learning systems become ubiquitous, there has been a surge of interest in interpretable machine learning: systems that provide explanation for their outputs. These explanations are often used to qualitatively assess other criteria such as safety or non-discrimination. However, despite the interest in interpretability, there is very little consensus on what interpretable machine learning is and how it should be measured. In this position paper, we first define interpretability and describe when interpretability is needed (and when it is not). Next, we suggest a taxonomy for rigorous evaluation and expose open questions towards a more rigorous science of interpretable machine learning.'}]},
 {'id': 'ML002',
  'parts': [{'key': 'title',
    'value': 'Machine Learning Interpretability: A Science rather than a tool'},
   

In [114]:
# Encode the user-supplied documents and request their pairwise similarity scores
# in a single call.
query = """
query encodeDocumentAndSimilarityCalculation($data: [EncodeObject], $similarityMetric: similarityMetric, $model: String!) {
  encodeDocumentAndSimilarityCalculation(
    data: $data
    similarityMetric: $similarityMetric
    model: $model
  ) {
    similarities
    xs {
      id
    }
    ys {
      id
    }
  }
}
"""

# The keys must match the variable names declared in the query ($data,
# $similarityMetric, $model). A key with any other name is not bound to a variable,
# so the requested metric would never reach the server.
# NOTE(review): 'cosine' is assumed to be a valid value of the `similarityMetric`
# enum — confirm against the API schema.
variables = {"data": data, "similarityMetric": 'cosine',  "model": "patspecter"}

url = 'https://api.logic-mill.net/api/v1/graphql/'
r = requests.post(url,headers=headers,  json={'query': query, 'variables': variables})
print(r.text)

{
    "data": {
        "encodeDocumentAndSimilarityCalculation": {
            "similarities": [
                [
                    1,
                    0.9697906,
                    0.9458891
                ],
                [
                    0.9697906,
                    1,
                    0.93912196
                ],
                [
                    0.9458891,
                    0.93912196,
                    1
                ]
            ],
            "xs": [
                {
                    "id": "ML001"
                },
                {
                    "id": "ML002"
                },
                {
                    "id": "ML003"
                }
            ],
            "ys": [
                {
                    "id": "ML001"
                },
                {
                    "id": "ML002"
                },
                {
                    "id": "ML003"
                }
            ]
        }
    }
}


In [115]:
# Prettify the similarity data in a dataframe.

# Parse the response once and reuse it for the scores and both sets of labels.
calculation = r.json()['data']['encodeDocumentAndSimilarityCalculation']

# create a df of just the scores
df = pd.DataFrame(calculation['similarities'])

# get column and row names
cols = pd.DataFrame(calculation['xs'])
rows = pd.DataFrame(calculation['ys'])

# Set the column names from `xs` and the row index from `ys`, so that each axis
# carries its own labels.
# NOTE(review): which of xs/ys corresponds to the outer list of `similarities` is
# assumed here — confirm against the API documentation.
df.columns = cols["id"].to_list()
df.index = rows["id"].to_list()

df

Unnamed: 0,ML001,ML002,ML003
ML001,1.0,0.969791,0.945889
ML002,0.969791,1.0,0.939122
ML003,0.945889,0.939122,1.0


## Find similar documents in the database compared to user uploaded document


In [118]:
# Simple version: the document text is written directly into the variables.
url = 'https://api.logic-mill.net/api/v1/graphql/'

query = """query embedDocumentAndSimilaritySearch($data: [EncodeDocumentPart], $indices: [String], $amount: Int, $model: String!) {
  encodeDocumentAndSimilaritySearch(
    data: $data
    indices: $indices
    amount: $amount
    model: $model
  ) {
    id
    score
    index
    document {
      title
      url
      PatspecterEmbedding
    }
  }
}
"""

# The user document, expressed as key/value parts.
document_parts = [
    {
        "key": "title",
        "value": "Airbags",
    },
    {
        "key": "abstract",
        "value": "Airbags are one of the most important safety gears in motor vehicles such as cars and SUVs. These are cushions built into a vehicle that are intended to inflate in case of a car accident in order to protect occupants from injuries by preventing them from striking the interior of vehicle during a crash.",
    },
]

# NOTE(review): the IndicesNames listing earlier in this notebook shows "patents_v2";
# confirm that "patents" is still a valid index name.
variables = {
    "model": "patspecter",
    "data": document_parts,
    "amount": 25,
    "indices": ["patents", "publications"],
}

payload = {'query': query, 'variables': variables}
r = requests.post(url, headers=headers, json=payload)
# The raw response includes the embedding requested for every hit, so it is long.
print(r.text)


{
    "data": {
        "encodeDocumentAndSimilaritySearch": [
            {
                "document": {
                    "PatspecterEmbedding": [
                        -0.24447682,
                        0.6343788,
                        0.17487822,
                        0.9189939,
                        0.09276425,
                        -0.61977524,
                        0.93790174,
                        0.25174838,
                        0.01875864,
                        0.62750125,
                        0.6246354,
                        -0.945797,
                        0.18904786,
                        -0.9901383,
                        -1.1706774,
                        -0.22823699,
                        -0.16266072,
                        -0.3273233,
                        0.3186071,
                        -1.2807487,
                        0.1614953,
                        0.40833607,
                        -0.44869405,
                     

In [121]:
# parameterized version

query = """
query embedDocumentAndSimilaritySearch($data: [EncodeDocumentPart], $indices: [String], $amount: Int, $model: String!) {
  encodeDocumentAndSimilaritySearch(
    data: $data
    indices: $indices
    amount: $amount
    model: $model
  ) {
    id
    score
    index
    document {
      title
      url
    }
  }
}
"""




In [122]:
# Use the third record as the user document; everything except its id is sent as
# key/value parts.
biblio = biblios[2]
data = [{"key": field, "value": text} for field, text in biblio.items() if field != 'id']

variables = {
    "model": "patspecter",
    "amount": 3,
    "data": data,
    "indices": ["patents", "publications"],
}

url = 'https://api.logic-mill.net/api/v1/graphql/'
payload = {'query': query, 'variables': variables}
r = requests.post(url, headers=headers, json=payload)
print(r.status_code)


200


In [124]:
# Show the parsed JSON body of the most recent response.
r.json()

{'data': {'encodeDocumentAndSimilaritySearch': [{'document': {'title': 'Opening the black box of neural networks: methods for interpreting neural network models in clinical applications',
     'url': 'https://openalex.org/works/W2807036076'},
    'id': 'W2807036076',
    'index': 'publications',
    'score': 0.9983717},
   {'document': {'title': 'Advantages and disadvantages of using artificial neural networks versus logistic regression for predicting medical outcomes',
     'url': 'https://openalex.org/works/W2121394390'},
    'id': 'W2121394390',
    'index': 'publications',
    'score': 0.99275565},
   {'document': {'title': '<b>NeuralNetTools</b>: Visualization and Analysis Tools for Neural Networks',
     'url': 'https://openalex.org/works/W2885307904'},
    'id': 'W2885307904',
    'index': 'publications',
    'score': 0.99086666},
   {'document': {'title': 'Time-dependent outcome prediction using neural networks',
     'url': 'https://worldwide.espacenet.com/patent/search?q=US20

In [137]:
# prettify output
results = r.json()["data"]["encodeDocumentAndSimilaritySearch"]

#  see above
similar_docs = pd.DataFrame(results)

keys = similar_docs["document"][0].keys()

for k in keys:
    similar_docs[k] = similar_docs["document"].apply(lambda x: x[k])




# # remove document
del similar_docs["document"]

similar_docs.sort_values("score", ascending=False)[1:]

Unnamed: 0,id,index,score,title,url
1,W2121394390,publications,0.992756,Advantages and disadvantages of using artifici...,https://openalex.org/works/W2121394390
2,W2885307904,publications,0.990867,<b>NeuralNetTools</b>: Visualization and Analy...,https://openalex.org/works/W2885307904
3,30118004,patents,0.98501,Time-dependent outcome prediction using neural...,https://worldwide.espacenet.com/patent/search?...
4,45807655,patents,0.984483,TIME TO EVENT DATA ANALYSIS METHOD AND SYSTEM,https://worldwide.espacenet.com/patent/search?...
5,80855462,patents,0.984008,Attribution methodologies for neural networks ...,https://worldwide.espacenet.com/patent/search?...
