In [3]:
import requests
import openai
import faiss
import json
import numpy as np

from markdownify import markdownify
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv) before any client is built.
load_dotenv()

# Shared OpenAI client used by get_embeddings further down.
# NOTE(review): presumably reads its API key from the environment populated above — confirm.
client = openai.OpenAI()

In [12]:
# Get documentation from OBP
obp_base_url = "https://test.openbankproject.com"
obp_version = "v5.1.0"

# Finite timeout so a stalled connection cannot hang the kernel indefinitely.
obp_request_timeout = 120  # seconds

# swagger docs
# Both version slots are filled from obp_version so the version is changed in one place only.
swagger_url = "{0}/obp/{1}/resource-docs/{1}/swagger".format(obp_base_url, obp_version)
swagger_response = requests.get(swagger_url, timeout=obp_request_timeout)
# Fail loudly on 4xx/5xx instead of parsing an error payload as documentation.
swagger_response.raise_for_status()
swagger_json = swagger_response.json()

# glossary
glossary_url = "{}/obp/{}/api/glossary".format(obp_base_url, obp_version)
glossary_response = requests.get(glossary_url, timeout=obp_request_timeout)
glossary_response.raise_for_status()
glossary_json = glossary_response.json()

In [13]:
def resolve_reference(ref, definitions, resolved=None):
    """
    Resolves a $ref to its definition, avoiding circular references.

    Args:
        ref: reference string such as "#/definitions/Name"; only the last
            '/'-separated segment is used as the lookup key.
        definitions: mapping of definition name -> schema dict.
        resolved: cache of definitions already visited during this
            resolution. Leave unset to start a fresh resolution; it is passed
            down explicitly while recursing.

    Returns:
        A new dict: a shallow copy of the definition whose 'properties' entry
        has nested references expanded. An unknown reference resolves to
        {'properties': {}}. A reference back to a definition that is still
        being expanded is returned as the raw, unexpanded definition, which
        is what terminates cycles.
    """
    # A `{}` default would be created once and shared by every call, so a
    # second top-level resolution of the same ref would hit the stale cache
    # and get back the raw definition. Start each resolution with its own cache.
    if resolved is None:
        resolved = {}

    ref_name = ref.split('/')[-1]
    if ref_name in resolved:
        return resolved[ref_name]

    definition = definitions.get(ref_name, {})
    # Register the raw definition before descending so that a cycle back to
    # this name stops here instead of recursing forever.
    resolved[ref_name] = definition
    resolved_properties = resolve_properties(definition.get('properties', {}), definitions, resolved)
    expanded = {**definition, 'properties': resolved_properties}
    # Swap the placeholder for the expanded form so later references to the
    # same name within this resolution are expanded as well.
    resolved[ref_name] = expanded
    return expanded

def resolve_properties(properties, definitions, resolved):
    """
    Resolves nested references in properties, avoiding circular references.

    Args:
        properties: mapping of property name -> property schema dict.
        definitions: mapping of definition name -> schema dict.
        resolved: cache shared with resolve_reference for this resolution.

    Returns:
        A new dict with the same keys. A property holding a '$ref' is replaced
        by its resolved definition; an array whose 'items' holds a '$ref'
        becomes {"type": "array", "items": <resolved definition>}; anything
        else is passed through unchanged.
    """
    resolved_properties = {}
    for prop_name, prop_details in properties.items():
        if '$ref' in prop_details:
            resolved_properties[prop_name] = resolve_reference(prop_details['$ref'], definitions, resolved)
        elif prop_details.get('type') == 'array' and 'items' in prop_details and '$ref' in prop_details['items']:
            resolved_properties[prop_name] = {
                "type": "array",
                "items": resolve_reference(prop_details['items']['$ref'], definitions, resolved)
            }
        else:
            resolved_properties[prop_name] = prop_details
    return resolved_properties

def _resolve_schema_ref(ref, definitions):
    """Return (required names, resolved properties) for the definition `ref` points to."""
    # An explicit fresh cache keeps every lookup independent of the ones made
    # for earlier endpoints, so the result does not depend on processing order.
    definition = resolve_reference(ref, definitions, {})
    # Second pass kept from the original flow: expands any property that still
    # carries a top-level '$ref' after the first resolution.
    properties = resolve_properties(definition.get('properties', {}), definitions, {})
    return definition.get('required', []), properties


def _collect_parameters(parameters, definitions):
    """Fold an operation's body, path and query parameters into one object-style schema."""
    schema = {
        "type": "object",
        "properties": {},
        "required": []
    }
    for param in parameters:
        location = param.get('in')
        if location == 'body' and '$ref' in (param.get('schema') or {}):
            required, properties = _resolve_schema_ref(param['schema']['$ref'], definitions)
            schema['required'].extend(required)
            schema['properties'].update(properties)
        elif location == 'path':
            # Path parameters are always treated as required.
            schema['required'].append(param['name'])
            schema['properties'][param['name']] = {
                "type": param['type'],
                "description": param.get('description', '')
            }
        elif location == 'query':
            schema['properties'][param['name']] = {
                "type": param['type'],
                "description": param.get('description', '')
            }
            if param.get('required', False):
                schema['required'].append(param['name'])
    return schema


def _collect_responses(responses, definitions):
    """Resolve the body schema of every response that references a definition."""
    collected = []
    for code, response in responses.items():
        if '$ref' in (response.get('schema') or {}):
            _, properties = _resolve_schema_ref(response['schema']['$ref'], definitions)
            collected.append({
                "code": code,
                "body": properties
            })
        else:
            print("no reference or unresolvable response body")
    return collected


def parse_swagger(swagger_json):
    """
    Flatten a swagger document into a list of endpoint dicts.

    Args:
        swagger_json: parsed swagger document; must contain 'paths', and may
            contain 'definitions' used to expand '$ref' schemas.

    Returns:
        One dict per (path, method) with keys 'path', 'method', 'summary',
        'description' (converted with markdownify) and 'responses' (list of
        {'code', 'body'}). 'parameters' is added only when the operation
        declares parameters, as an object schema with 'properties' and
        'required'.

    Side effects:
        Prints a line for every response that has no '$ref' schema.
    """
    paths = swagger_json['paths']
    definitions = swagger_json.get('definitions', {})

    endpoints = []
    for path, methods in paths.items():
        for method, details in methods.items():
            if not isinstance(details, dict):
                # Entries that are not operation objects (for example a list
                # value) cannot be read as an operation; skip them.
                continue

            endpoint_info = {
                'path': path,
                'method': method,
                'summary': details.get('summary'),
                # `or ''` also covers an explicit null description.
                'description': markdownify(details.get('description') or ''),
                'responses': []
            }

            if 'parameters' in details:
                endpoint_info['parameters'] = _collect_parameters(details['parameters'], definitions)

            if 'responses' in details:
                endpoint_info['responses'] = _collect_responses(details['responses'], definitions)

            endpoints.append(endpoint_info)
    return endpoints

# Parse the swagger document fetched earlier into a list of endpoint dicts.
endpoints = parse_swagger(swagger_json)
# Pretty-print the first two parsed endpoints as a quick sanity check.
for endpoint in endpoints[:2]:
    print(json.dumps(endpoint, indent=2))

no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no reference or unresolvable response body
no referenc

In [52]:
def parse_glossary(glossary_json):
    """
    Extract title/description pairs from the glossary response.

    Args:
        glossary_json: parsed glossary document with a 'glossary_items' list.
            Each item may carry 'title' and a 'description' dict holding the
            text under 'markdown'.

    Returns:
        List of {'title': ..., 'description': ...} dicts. A missing title
        becomes 'No title'; a missing (or null) description object becomes
        'No description'. Items whose markdown is empty (None, "" or
        whitespace only) are skipped.
    """
    parsed_items = []

    for item in glossary_json['glossary_items']:
        title = item.get('title', 'No title')
        # `or {}` covers both an absent key and an explicit null value.
        description_info = item.get('description') or {}

        # Get markdown description or else fall back to the default text
        description = description_info.get('markdown', 'No description')
        # Do not add items with nothing to embed: a whitespace-only or null
        # description is just as empty as "".
        if not isinstance(description, str) or not description.strip():
            continue

        parsed_items.append({
            'title': title,
            'description': description
        })

    return parsed_items

# Parse the glossary response into a list of {'title', 'description'} dicts.
glossary_items = parse_glossary(glossary_json)
# Show the first two entries as the cell output.
glossary_items[:2]

[{'title': 'API',
  'description': 'The terms `API` (Application Programming Interface) and `Endpoint` are used somewhat interchangeably.\n\nHowever, an API normally refers to a group of Endpoints.\n\nAn endpoint has a unique URL path and HTTP verb (GET, POST, PUT, DELETE etc).\n\nWhen we POST a Swagger file to the Create Endpoint endpoint, we are in fact creating a set of Endpoints that have a common Tag. Tags are used to group Endpoints in the API Explorer and filter the Endpoints in the Resource Doc endpoints.\n\nEndpoints can also be grouped together in Collections.\n\nSee also [Endpoint](/glossary#Endpoint)\n\n\t\t\t\t '},
 {'title': 'API Collection',
  'description': 'An API Collection is a collection of endpoints grouped together for a certain purpose.\n\nHaving read access to a Collection does not constitute execute access on the endpoints in the Collection.\n\n(Execute access is governed by Entitlements to Roles - and in some cases, Views.)\n\nCollections can be created and sh

In [28]:
# Create vector embeddings
def get_embeddings(texts, model="text-embedding-3-small", batch_size=512):
    """
    Embed a list of texts with the module-level OpenAI `client`.

    Args:
        texts: iterable of strings to embed; a single string is treated as a
            one-element list.
        model: embedding model name passed to the API.
        batch_size: maximum number of texts sent per request.
            NOTE(review): the embeddings endpoint limits how much can be sent
            in one request; the default is meant to stay well under that —
            confirm against the current API reference.

    Returns:
        List of embedding vectors, one per input text, in input order. An
        empty input returns [] without calling the API.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slicing a bare string would cut it into character chunks, so wrap it first.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size]
        )
        embeddings.extend(e.embedding for e in response.data)

    return embeddings

In [55]:
def create_and_save_embedding_faiss(formatted_texts: list, json_metadata: list, filename: str):
    """
    Creates and saves text embeddings and metadata for a given list of texts

    formatted_texts: formatted list of texts for creating embeddings
    json_metadata: list of JSON-serialisable records, one per text and in the same order
        (ie. [{'title': 'API', 'description': '<API DESCRIPTION>'}, ...] for glossary)
    filename: prefix to attach to saved index and metadata i.e. 'glossary' for saving 'glossary_index.faiss' and 'glossary_metadata.json'

    Raises ValueError when the inputs are empty, when texts and metadata differ
    in length, or when the embeddings do not form one row per text.
    """
    # Row i of the index is looked up as json_metadata[i] at search time, so a
    # length mismatch would silently pair results with the wrong records.
    if len(formatted_texts) != len(json_metadata):
        raise ValueError(
            f"formatted_texts ({len(formatted_texts)}) and json_metadata "
            f"({len(json_metadata)}) must have the same length"
        )
    if not formatted_texts:
        raise ValueError("formatted_texts is empty; nothing to embed")

    embeddings = get_embeddings(formatted_texts)

    # Convert embeddings to a numpy array
    embeddings_np = np.asarray(embeddings, dtype='float32')
    if embeddings_np.ndim != 2 or embeddings_np.shape[0] != len(formatted_texts):
        raise ValueError(
            f"expected one embedding row per text, got array of shape {embeddings_np.shape}"
        )

    # Create a FAISS index
    index = faiss.IndexFlatL2(embeddings_np.shape[1])  # L2 distance index
    index.add(embeddings_np)

    # Save the index to disk for later use
    faiss.write_index(index, f"{filename}_index.faiss")

    # Save metadata for retrieval
    with open(f"{filename}_metadata.json", 'w', encoding='utf-8') as f:
        json.dump(json_metadata, f)

In [56]:
# Text embedded for each endpoint: "<METHOD> <path> - <description>".
endpoint_texts = [
    "{} {} - {}".format(ep['method'].upper(), ep['path'], ep['description'])
    for ep in endpoints
]
create_and_save_embedding_faiss(
    formatted_texts=endpoint_texts,
    json_metadata=endpoints,
    filename="endpoint",
)


In [57]:
# Text embedded for each glossary entry: "<title> - <description>".
glossary_texts = [
    "{} - {}".format(entry['title'], entry['description'])
    for entry in glossary_items
]
create_and_save_embedding_faiss(
    formatted_texts=glossary_texts,
    json_metadata=glossary_items,
    filename="glossary",
)



In [None]:
def search_endpoints(query, k=5, filename="endpoint"):
    """
    Return the stored endpoint records closest to `query`.

    Args:
        query: free-text search string; it is embedded with get_embeddings.
        k: maximum number of matches to return.
        filename: prefix of the saved artefacts, i.e. 'endpoint' reads
            'endpoint_index.faiss' and 'endpoint_metadata.json' as written by
            create_and_save_embedding_faiss.

    Returns:
        List of metadata records, at most k long, in the order returned by
        the index search.
    """
    query_embedding = get_embeddings([query])[0]
    query_embedding_np = np.array([query_embedding]).astype('float32')

    # The index built in create_and_save_embedding_faiss is local to that
    # function, so load the copy it wrote to disk instead of relying on a
    # global `index` that is never defined.
    index = faiss.read_index(f"{filename}_index.faiss")

    # Perform the search
    _distances, indices = index.search(query_embedding_np, k)

    # Load metadata (same prefix the index was saved under)
    with open(f"{filename}_metadata.json", 'r', encoding='utf-8') as f:
        endpoints = json.load(f)

    # Retrieve matching endpoints. FAISS fills missing results with -1 when
    # the index holds fewer than k vectors; -1 would otherwise select the
    # last record.
    matches = [endpoints[i] for i in indices[0] if i >= 0]
    return matches

