In [59]:
import requests
import json
import os
import re

from markdownify import markdownify as md
from sklearn.model_selection import train_test_split

In [4]:
# Get documentation from OBP
obp_base_url = "https://test.openbankproject.com"
obp_version = "v5.1.0"

swagger_url = "{}/obp/v5.1.0/resource-docs/{}/swagger".format(obp_base_url, obp_version)
swagger_response = requests.get(swagger_url)
swagger_json = swagger_response.json()

resource_docs_url = "{}/obp/v5.1.0/resource-docs/{}/obp".format(obp_base_url, obp_version)
resource_docs_response = requests.get(resource_docs_url)
resource_docs_json = resource_docs_response.json()

glossary_url = "{}/obp/v5.1.0/api/glossary".format(obp_base_url)
glossary_response = requests.get(glossary_url)
glossary_json = glossary_response.json()

In [5]:
def gen_json_body_from_reference(reference):
    '''
    generates example json bodies from the swagger file
    '''
    ref = os.path.split(reference)[1]
    if "required" not in swagger_json["definitions"][ref].keys():
        return reference
    properties = swagger_json["definitions"][ref]["required"]
    json_body_example = {}
    
    for prop in properties:
        if '$ref' in swagger_json["definitions"][ref]["properties"][prop].keys():
            json_prop_example = gen_json_body_from_reference(swagger_json["definitions"][ref]["properties"][prop]['$ref'])
            
        elif swagger_json["definitions"][ref]["properties"][prop]["type"] == "array":
            items = swagger_json["definitions"][ref]["properties"][prop]["items"]
            json_prop_example = []
            
            for i, item in enumerate(items):
                if item == '$ref':
                    json_prop_example.append(gen_json_body_from_reference(swagger_json["definitions"][ref]["properties"][prop]["items"][item]))
                else: 
                    json_prop_example.append(swagger_json["definitions"][ref]["properties"][prop]["items"][item])
        elif 'enum' in swagger_json["definitions"][ref]["properties"][prop].keys():
            json_prop_example = swagger_json["definitions"][ref]["properties"][prop]["enum"]
        elif 'properties' in swagger_json["definitions"][ref]["properties"][prop].keys():
            json_prop_example = swagger_json["definitions"][ref]["properties"][prop]["properties"]
        else:
            json_prop_example = swagger_json["definitions"][ref]["properties"][prop]["example"]
        json_body_example[prop] = json_prop_example
        
    return json_body_example

In [53]:

def generate_prompt_answer_pairs(paths):
    data = []

    # Generate prompt answer pairs from swagger file
    for path in paths:
        for method in swagger_json["paths"][path].keys():
            
            # Which endpoint to do this...?
            question = "Which endpoint would I use to {}".format(swagger_json["paths"][path][method]["summary"])
            answer = "You would use the {} {} endpoint".format(method.upper(), path)
            
            data.append({"question": question, "answer": answer})
    
            # What does the endpoint do
            question = "What does the {} method for the {} endpoint do?".format(method.upper(), path)
            answer = "{}".format(md(swagger_json["paths"][path][method]["description"]))
            
            data.append({"question": question, "answer": answer})
    
            # Request Body
            question = "What parameters does the {} method for the {} endpoint take?".format(method.upper(), path)
            if len(swagger_json["paths"][path][method]["parameters"]) == 0:
                answer = "This endpoint does not take any parameters."
            else:
                body_params = []
                path_params = []
                for parameter in swagger_json["paths"][path][method]["parameters"]:
                    if parameter["in"] == "body":
                        body_params.append(parameter)
                    elif parameter["in"] == "path":
                        path_params.append(parameter)
                    else:
                        print('unknown parameter location "{}"'.format(parameter["in"]))

                #
                formatted_body_params = ""
                for param in body_params:
                    if '$ref' not in param['schema']:
                        formatted_body_params = formatted_body_params.join("{}".format(i) for i in param['schema']['properties'])
                    else:
                        formatted_body_params = formatted_body_params.join("{}".format(gen_json_body_from_reference(param['schema']['$ref'])))
                        
                body_answer = "This endpoint takes the following example request body:\n{}".format(formatted_body_params)
                
                formatted_path_params = "\n".join("Name: {}\nDescription: {}\nType: {}\n".format(i['name'], i['description'], i['type']) for i in path_params)
                path_answer = "This endpoint takes the following parameters in the request path:\n{}\n".format(formatted_path_params)
    
                if len(body_params) != 0 & len(path_params) != 0:
                    answer = body_answer + path_answer
                elif len(body_params) != 0:
                    answer = body_answer
                elif len(path_params) != 0:
                    answer = path_answer
            
            data.append({"question": question, "answer": answer})

    # Generate prompt-answer pairs from glossary
    for glossary_item in glossary_json["glossary_items"]:
        description = glossary_item["description"]["markdown"]
        if "Description: no-description-provided" in description:
            continue
        title = glossary_item["title"]
        levels = title.split(".")
        question = "Could you explain {}".format(" ".join(levels))
        answer = "{}".format(glossary_item["description"]["markdown"])
        data.append({"question": question, "answer": answer})

        question_variant = "What is {}".format(" ".join(levels))
        answer = "{}".format(glossary_item["description"]["markdown"])
        data.append({"question": question, "answer": answer})
    
    return data

In [66]:
paths_short = list(swagger_json["paths"])[25:50]
data = generate_prompt_answer_pairs(list(swagger_json["paths"]))

data_train, data_test = train_test_split(data, test_size=0.3)

In [67]:
def make_supervised_jsonl(data, filename):
    system_prompt = "You are a helpful assistant, giving information on the Open Bank Project APIs, you should stick strictly to topics around the api and open banking, and generating code in various languages."
    with open(filename, "w") as data_file:
        for line in list(data):
            if line["answer"] == "":
                continue
            jsonl_line = {"messages": [{"role": "system", "content": system_prompt},{"role": "user", "content": line['question']},{"role": "model", "content": line["answer"]}]}
            print(json.dumps(jsonl_line) + "\n")
            data_file.write(json.dumps(jsonl_line) + '\n')

In [68]:
make_supervised_jsonl(data_train, "data_train.jsonl")
make_supervised_jsonl(data_test, "data_test.jsonl")

{"messages": [{"role": "system", "content": "You are a helpful assistant, giving information on the Open Bank Project APIs, you should stick strictly to topics around the api and open banking, and generating code in various languages."}, {"role": "user", "content": "What parameters does the GET method for the /sustrans endpoint take?"}, {"role": "model", "content": "This endpoint does not take any parameters."}]}

{"messages": [{"role": "system", "content": "You are a helpful assistant, giving information on the Open Bank Project APIs, you should stick strictly to topics around the api and open banking, and generating code in various languages."}, {"role": "user", "content": "Which endpoint would I use to Delete a Transaction Narrative"}, {"role": "model", "content": "You would use the DELETE /obp/v5.1.0/banks/{BANK_ID}/accounts/{ACCOUNT_ID}/{VIEW_ID}/transactions/{TRANSACTION_ID}/metadata/narrative endpoint"}]}

{"messages": [{"role": "system", "content": "You are a helpful assistant,

In [58]:

def validate_jsonl(file_path):
    try:
        with open(file_path, "r") as file:
            for line_num, line in enumerate(file, start=1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error in line {line_num}: {e}")
                    return False
    except FileNotFoundError:
        print("File not found.")
        return False
    return True

# Example usage
file_path = "data.jsonl"
if validate_jsonl(file_path):
    print("JSON Lines file is valid.")
else:
    print("JSON Lines file is invalid.")


validate_jsonl("data.jsonl")

JSON Lines file is valid.


True