In [33]:
import requests
import openai
import faiss
import json
import numpy as np

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before
# the API client is constructed.
load_dotenv()

# Only the `openai` module is imported above (there is no
# `from openai import OpenAI`), so the bare name `OpenAI` would raise a
# NameError on a fresh kernel. Reference the class through the module instead.
client = openai.OpenAI()

In [42]:
# Get documentation from OBP
obp_base_url = "https://test.openbankproject.com"
obp_version = "v5.1.0"
# Seconds to wait for the server; without a timeout `requests` can block forever.
request_timeout = 30

# swagger docs
# The version appears twice in the path (API prefix and resource-docs version);
# both are taken from `obp_version` so that changing it updates the whole URL.
swagger_url = "{0}/obp/{1}/resource-docs/{1}/swagger".format(obp_base_url, obp_version)
swagger_response = requests.get(swagger_url, timeout=request_timeout)
# Fail here with a clear HTTP error instead of a confusing JSON decode error below.
swagger_response.raise_for_status()
swagger_json = swagger_response.json()

# glossary
glossary_url = "{}/obp/{}/api/glossary".format(obp_base_url, obp_version)
glossary_response = requests.get(glossary_url, timeout=request_timeout)
glossary_response.raise_for_status()
glossary_json = glossary_response.json()

In [51]:
def parse_swagger(swagger_json):
    """
    Flatten a Swagger document into one record per (path, HTTP method) operation.

    swagger_json: parsed Swagger document; must contain a 'paths' mapping
                  (a KeyError is raised otherwise)

    Returns a list of dicts with the keys 'path', 'method', 'description',
    'parameters' and 'responses'. Operation fields that are missing default
    to '', [] and {} respectively.
    """
    # A path item may hold keys that are not operations (e.g. a path-level
    # 'parameters' list, a '$ref' string or 'x-...' extensions). Calling .get()
    # on those would either crash or record a bogus endpoint, so only the
    # HTTP verbs are treated as operations.
    http_methods = {'get', 'put', 'post', 'delete', 'options', 'head', 'patch'}

    endpoints = []
    for path, methods in swagger_json['paths'].items():
        for method, details in methods.items():
            if method.lower() not in http_methods or not isinstance(details, dict):
                continue
            endpoints.append({
                'path': path,
                'method': method,
                'description': details.get('description', ''),
                'parameters': details.get('parameters', []),
                'responses': details.get('responses', {})
            })
    return endpoints

# Flatten the fetched Swagger document into per-operation records, then
# display the first two as a sanity check.
endpoints = parse_swagger(swagger_json)
endpoints[:2]

[{'path': '/obp/v5.1.0/account/check/scheme/iban',
  'method': 'post',
  'description': '<p>Validate and check IBAN for errors</p><p>Authentication is Optional</p>',
  'parameters': [{'in': 'body',
    'name': 'body',
    'description': 'IbanAddress object that needs to be added.',
    'required': True,
    'schema': {'$ref': '#/definitions/IbanAddress'}}],
  'responses': {'201': {'description': 'Success',
    'schema': {'$ref': '#/definitions/IbanCheckerJsonV400'}},
   '400': {'description': 'Error',
    'schema': {'$ref': '#/definitions/ErrorUnknownError'}}}},
 {'path': '/obp/v5.1.0/accounts/public',
  'method': 'get',
  'description': '<p>Get public accounts at all banks (Anonymous access).<br />Returns accounts that contain at least one public view (a view where is_public is true)<br />For each account the API returns the ID and the available views.</p><p>Authentication is Optional</p>',
  'parameters': [{'in': 'body',
    'name': 'body',
    'description': 'EmptyClassJson object t

In [52]:
def parse_glossary(glossary_json):
    """
    Extract title/markdown-description pairs from a glossary response.

    glossary_json: parsed glossary document; must contain a 'glossary_items'
                   list (a KeyError is raised otherwise)

    Returns a list of {'title': ..., 'description': ...} dicts. A missing title
    becomes 'No title' and a missing markdown description becomes
    'No description'. Items whose markdown is null, empty or whitespace-only
    are skipped. Kept descriptions are returned unmodified (not stripped).
    """
    parsed_items = []

    for item in glossary_json['glossary_items']:
        title = item.get('title', 'No title')
        # 'description' may be absent or explicitly null; treat both the same
        # so that a null value does not crash on .get() below.
        description_info = item.get('description') or {}

        # Get markdown description or else return no description
        description = description_info.get('markdown', 'No description')
        # do not add descriptions if they are empty (null and blank-only
        # text count as empty too)
        if not isinstance(description, str) or not description.strip():
            continue

        parsed_items.append({
            'title': title,
            'description': description
        })

    return parsed_items

# Reduce the fetched glossary to title/description pairs, then display the
# first two as a sanity check.
glossary_items = parse_glossary(glossary_json)
glossary_items[:2]

[{'title': 'API',
  'description': 'The terms `API` (Application Programming Interface) and `Endpoint` are used somewhat interchangeably.\n\nHowever, an API normally refers to a group of Endpoints.\n\nAn endpoint has a unique URL path and HTTP verb (GET, POST, PUT, DELETE etc).\n\nWhen we POST a Swagger file to the Create Endpoint endpoint, we are in fact creating a set of Endpoints that have a common Tag. Tags are used to group Endpoints in the API Explorer and filter the Endpoints in the Resource Doc endpoints.\n\nEndpoints can also be grouped together in Collections.\n\nSee also [Endpoint](/glossary#Endpoint)\n\n\t\t\t\t '},
 {'title': 'API Collection',
  'description': 'An API Collection is a collection of endpoints grouped together for a certain purpose.\n\nHaving read access to a Collection does not constitute execute access on the endpoints in the Collection.\n\n(Execute access is governed by Entitlements to Roles - and in some cases, Views.)\n\nCollections can be created and sh

In [28]:
# Create vector embeddings
def get_embeddings(texts, model="text-embedding-3-small", batch_size=2048):
    """
    Return one embedding vector per input text, in input order.

    texts: a single string or an iterable of strings to embed
    model: name of the embedding model to request
    batch_size: maximum number of texts sent per API request

    Returns a list of embeddings (each a list of floats). An empty input
    yields an empty list without calling the API.

    Raises ValueError if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Normalise to a list so the input can be sliced into batches; a bare
    # string is treated as a single text rather than a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    embeddings = []
    # The embeddings endpoint caps the number of inputs per request (2048 per
    # the API reference at the time of writing), so larger lists are sent in
    # several requests. NOTE(review): a per-request token limit may also
    # apply; lower batch_size if requests are rejected for size.
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size]
        )
        embeddings.extend(e.embedding for e in response.data)

    return embeddings

In [55]:
def create_and_save_embedding_faiss(formatted_texts: list, json_metadata: list, filename: str):
    """
    Creates and saves text embeddings and metadata for a given list of texts

    formatted_texts: formatted list of texts for creating embeddings
    json_metadata: list of JSON-serialisable records, one per text and in the same order
                   (ie. [{'title': 'API', 'description': '<API DESCRIPTION>'}, ...] for glossary)
    filename: prefix to attach to saved index and metadata i.e. 'glossary' for saving 'glossary_index.faiss' and 'glossary_metadata.json'

    Raises ValueError if formatted_texts is empty or its length differs from json_metadata.
    """
    # Validate before spending an API call. An empty input would otherwise fail
    # later with an IndexError on `shape[1]`.
    if len(formatted_texts) == 0:
        raise ValueError("formatted_texts must contain at least one text")
    # Row i of the index is looked up as json_metadata[i] at search time, so
    # the two lists must line up one-to-one.
    if len(formatted_texts) != len(json_metadata):
        raise ValueError(
            "formatted_texts and json_metadata must have the same length "
            f"({len(formatted_texts)} != {len(json_metadata)})"
        )

    embeddings = get_embeddings(formatted_texts)

    # Convert embeddings to a float32 numpy array before adding them to the index
    embeddings_np = np.array(embeddings).astype('float32')

    # Create a FAISS index
    index = faiss.IndexFlatL2(embeddings_np.shape[1])  # L2 distance index
    index.add(embeddings_np)

    # Save the index to disk for later use
    faiss.write_index(index, f"{filename}_index.faiss")

    # Save metadata for retrieval
    with open(f"{filename}_metadata.json", 'w', encoding='utf-8') as f:
        json.dump(json_metadata, f)

In [56]:
#endpoint_texts = [f"{e['method'].upper()} {e['path']} - {e['description']}" for e in endpoints]
#endpoint_embeddings = get_embeddings(endpoint_texts)


In [57]:
# Build one "<title> - <description>" string per glossary item, embed them, and
# save the result as 'glossary_index.faiss' plus 'glossary_metadata.json'.
glossary_texts = [f"{g['title']} - {g['description']}" for g in glossary_items]
create_and_save_embedding_faiss(glossary_texts, glossary_items, "glossary")



In [None]:
def search_endpoints(query, k=5, filename="endpoints"):
    """
    Return the metadata records whose embeddings are closest to the query.

    query: free-text search string
    k: maximum number of matches to return
    filename: prefix of the saved files to search, i.e. 'endpoints' for
              'endpoints_index.faiss' and 'endpoints_metadata.json'

    Returns a list of at most k metadata records, nearest first.
    """
    query_embedding = get_embeddings([query])[0]
    query_embedding_np = np.array([query_embedding]).astype('float32')

    # Load the index from disk rather than relying on a notebook-level `index`
    # variable, which no cell defines (it would be a NameError on a fresh kernel).
    index = faiss.read_index(f"{filename}_index.faiss")

    # Perform the search
    _, indices = index.search(query_embedding_np, k)

    # Load metadata
    with open(f"{filename}_metadata.json", 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Retrieve matching records. FAISS pads the result with -1 when the index
    # holds fewer than k vectors; skip those, because metadata[-1] would
    # silently return the last record instead.
    matches = [metadata[int(i)] for i in indices[0] if i >= 0]
    return matches

