# Setup

In [None]:
import json
import urllib.parse
import requests
import pandas as pd
import numpy as np
from ollama import chat
from ollama import ChatResponse

file_path = "./generated_test_cases/combined_test_cases.json"
with open(file_path, "r", encoding="utf-8") as f:
    all_tests = json.load(f)

results = np.zeros([2,4,50])

# Test Case Generation

In [None]:
models = ['deepseek-r1:8b', 'deepseek-r1:14b']
api_list = ["FDIC", "LargeLanguageTool", "GenomeNexus", "Spotify"]
test_cases_per_api = 50
output_file_path = './generated_test_cases/combined_test_cases.json'

combined_test_cases = {model: {f'/{api}': [] for api in api_list} for model in models}

for model in models:
    for api in api_list:
        test_cases = []

        while len(test_cases) < test_cases_per_api:
            prompt_file_path = f"./prompt/{api}.txt"
    
            with open(prompt_file_path, "r", encoding="utf-8") as f:
                prompt_content = f.read()

            response = chat(model=model, messages=[
                {'role': 'user', 'content': prompt_content,},
            ])

            deepseek_response = response.message.content  # response from LLM
    
            # Extract JSON block
            start_index = deepseek_response.find("```json")
            end_index = deepseek_response.rfind("```")
    
            if start_index != -1 and end_index != -1:
                json_content = deepseek_response[start_index + 7 : end_index].strip()  # extract JSON content
                try:
                    batch_cases = json.loads(json_content)  # parse JSON
                    test_cases.extend(batch_cases)
                except json.JSONDecodeError:
                    print("Error: JSON parsing failed, please check the content manually")
            else:
                print("Error: Can't find the JSON block, please check it manually")
        
        test_cases = test_cases[:test_cases_per_api]
        combined_test_cases[model][f'/{api}'] = test_cases

# Save the combined test cases to a JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(combined_test_cases, f, indent=4)

print(f"Test cases successfully saved to {output_file_path}")   

# FDIC

In [99]:
BASE_URL = "https://banks.data.fdic.gov/api"  # API URL
HEADERS = {"Accept": "application/json"}

for i, model in enumerate(all_tests):
        for j, test_case in enumerate(all_tests[model]['/FDIC']):
            endpoint = test_case["endpoint"]
            method = test_case["method"].upper()
            params = test_case["parameters"]
            expected_response = test_case["expected_response"]

            try:
                if method == "GET":
                    response = requests.get(f"{BASE_URL}{endpoint}", params=params, headers=HEADERS)
                elif method == "POST":
                    response = requests.post(f"{BASE_URL}{endpoint}", json=params, headers=HEADERS)
                else:
                    print(f"❌ Unsupported HTTP method: {method}")
                    continue

                # HTTP status
                actual_response_code = response.status_code

            except requests.RequestException as e:
                actual_response_code = "ERROR"

            if actual_response_code == expected_response:
                test_result = "✅ Passed"
                results[i][0][j] = 1 
            else:
                test_result =  f"❌ Failed (Expected: {expected_response}, Got: {actual_response_code})"
                results[i][0][j] = 0

            print(f"Executing: {method} {BASE_URL}{endpoint}")
            print(f"Test Result: {test_result}\n")


Executing: GET https://banks.data.fdic.gov/api/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://banks.data.fdic.gov/api/institution
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://banks.data.fdic.gov/api/location
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://banks.data.fdic.gov/api/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://banks.data.fdic.gov/api/failure
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://banks.data.fdic.gov/api/history
Test Result: ✅ Passed

Executing: GET https://banks.data.fdic.gov/api/institutions
Test Result: ✅ Passed

Executing: GET https://banks.data.fdic.gov/api/institutions
Test Result: ❌ Failed (Expected: 400, Got: 200)

Executing: GET https://banks.data.fdic.gov/api/locations
Test Result: ✅ Passed

Executing: GET https://banks.data.fdic.gov/api/locations
Test Result: ❌ Failed (Expected: 400, Got: 200)

Executing: POST https:

# LargeLanguageTool

In [100]:
BASE_URL = "https://api.languagetoolplus.com/v2"  # API URL
HEADERS_GET = {"Accept": "application/json"}
HEADERS_POST = {
    "Accept": "application/json",
    "Content-Type": "application/x-www-form-urlencoded"
}

for i, model in enumerate(all_tests):
        for j, test_case in enumerate(all_tests[model]['/LargeLanguageTool']):
            endpoint = test_case["endpoint"]
            method = test_case["method"].upper()
            params = test_case["parameters"]
            expected_response = test_case["expected_response"]

            try:
                if method == "GET":
                    response = requests.get(f"{BASE_URL}{endpoint}", params=params, headers=HEADERS_GET)
                elif method == "POST":
                    form_encoded_data = urllib.parse.urlencode(params)
                    response = requests.post(f"{BASE_URL}{endpoint}", data=form_encoded_data, headers=HEADERS_POST)
                else:
                    print(f"❌ Unsupported HTTP method: {method}")
                    continue

                # HTTP status code
                actual_response_code = response.status_code

            except requests.RequestException:
                actual_response_code = "ERROR"

            if actual_response_code == expected_response:
                test_result = "✅ Passed"
                results[i][1][j] = 1 
            else:
                test_result =  f"❌ Failed (Expected: {expected_response}, Got: {actual_response_code})"
                results[i][1][j] = 0

            print(f"Executing: {method} {BASE_URL}{endpoint}")
            print(f"Test Result: {test_result}\n")


Executing: POST https://api.languagetoolplus.com/v2/check
Test Result: ❌ Failed (Expected: 200, Got: 400)

Executing: POST https://api.languagetoolplus.com/v2/check
Test Result: ✅ Passed

Executing: POST https://api.languagetoolplus.com/v2/check/another
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/check/another
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/check/another
Test Result: ❌ Failed (Expected: 400, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/check/get
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/check/get
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/check/get
Test Result: ❌ Failed (Expected: 400, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/check/post
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://a

# GenomeNexus

In [101]:
BASE_URL = "https://api.languagetoolplus.com/v2"  # API URL
HEADERS_GET = {"Accept": "application/json"}
HEADERS_POST = {
    "Accept": "application/json",
    "Content-Type": "application/x-www-form-urlencoded"
}

for i, model in enumerate(all_tests):
        for j, test_case in enumerate(all_tests[model]['/GenomeNexus']):
            endpoint = test_case["endpoint"]
            method = test_case["method"].upper()
            params = test_case["parameters"]
            expected_response = test_case["expected_response"]

            try:
                if method == "GET":
                    response = requests.get(f"{BASE_URL}{endpoint}", params=params, headers=HEADERS_GET)
                elif method == "POST":
                    form_encoded_data = urllib.parse.urlencode(params)
                    response = requests.post(f"{BASE_URL}{endpoint}", data=form_encoded_data, headers=HEADERS_POST)
                else:
                    print(f"❌ Unsupported HTTP method: {method}")
                    continue

                # HTTP status code
                actual_response_code = response.status_code

            except requests.RequestException:
                actual_response_code = "ERROR"

            if actual_response_code == expected_response:
                test_result = "✅ Passed"
                results[i][2][j] = 1 
            else:
                test_result =  f"❌ Failed (Expected: {expected_response}, Got: {actual_response_code})"
                results[i][2][j] = 0

            print(f"Executing: {method} {BASE_URL}{endpoint}")
            print(f"Test Result: {test_result}\n")

Executing: POST https://api.languagetoolplus.com/v2/check
Test Result: ❌ Failed (Expected: 200, Got: 400)

Executing: POST https://api.languagetoolplus.com/v2/check
Test Result: ✅ Passed

Executing: GET https://api.languagetoolplus.com/v2/vep
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/vcf
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/vcf
Test Result: ❌ Failed (Expected: 201, Got: 404)

Executing: POST https://api.languagetoolplus.com/v2/vue
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/check
Test Result: ❌ Failed (Expected: 200, Got: 400)

Executing: POST https://api.languagetoolplus.com/v2/users
Test Result: ❌ Failed (Expected: 201, Got: 404)

Executing: GET https://api.languagetoolplus.com/v2/products
Test Result: ❌ Fail

# Spotify

In [102]:
BASE_URL = "https://api.spotify.com/v1"  # Spotify API URL
# Access token (expired in 1 hour)
AUTH_TOKEN = "BQCClIh5zcv1bTSU05d5NVC17o_ZN92mCMzV6a_yqck0406qomaflKXSRV80L9fdN7WxRFPsTLzkWlL3D9ARpibT0KGh10v2hQ0r8HqKtd5Q1xgMjoEq14zPVUDoH5KDfuAeAkLXXyo"
HEADERS = {
    "Authorization": f"Bearer {AUTH_TOKEN}",
    "Accept": "application/json",
    "Content-Type": "application/json"
}

for i, model in enumerate(all_tests):
        for j, test_case in enumerate(all_tests[model]['/Spotify']):
            endpoint = test_case["endpoint"]
            method = test_case["method"].upper()
            params = test_case["parameters"]
            expected_response = test_case["expected_response"]

            try:
                if method == "GET":
                    response = requests.get(f"{BASE_URL}{endpoint}", params=params, headers=HEADERS)
                elif method == "POST":
                    response = requests.post(f"{BASE_URL}{endpoint}", json=params, headers=HEADERS)
                else:
                    print(f"❌ Unsupported HTTP method: {method}")
                    continue

                # HTTP status code
                actual_response_code = response.status_code

            except requests.RequestException:
                actual_response_code = "ERROR"

            if actual_response_code == expected_response:
                test_result = "✅ Passed"
                results[i][3][j] = 1 
            else:
                test_result =  f"❌ Failed (Expected: {expected_response}, Got: {actual_response_code})"
                results[i][3][j] = 0

            print(f"Executing: {method} {BASE_URL}{endpoint}")
            print(f"Test Result: {test_result}\n")

Executing: GET https://api.spotify.com/v1/users/{user_id}
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.spotify.com/v1/users/{user_id}/activity
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: POST https://api.spotify.com/v1/check
Test Result: ❌ Failed (Expected: 200, Got: 404)

Executing: GET https://api.spotify.com/v1/tracks
Test Result: ✅ Passed

Executing: GET http

# Analysis

In [None]:
aves = np.mean(results, axis=2)
sums = np.sum(results, axis=2)
for i, model in enumerate(all_tests):
    for j, api in enumerate(all_tests[model]):
        print(f'Accuracy of {model} on {api}: {aves[i][j]}')
    print()

Accuracy of deepseek-r1:8b on /FDIC: 0.1
Accuracy of deepseek-r1:8b on /LargeLanguageTool: 0.14
Accuracy of deepseek-r1:8b on /GenomeNexus: 0.18
Accuracy of deepseek-r1:8b on /Spotify: 0.26

Accuracy of deepseek-r1:14b on /FDIC: 0.38
Accuracy of deepseek-r1:14b on /LargeLanguageTool: 0.28
Accuracy of deepseek-r1:14b on /GenomeNexus: 0.18
Accuracy of deepseek-r1:14b on /Spotify: 0.38

