In [1]:
from sdkgenerator.generate import generate_sdk
from sdkgenerator.constants import DATA_DIR, GENERATED_SDK_DIR
from sdkgenerator.utils import get_code_from_model_response
from sdkgenerator.db import db
import pandas as pd

Database connected successfully. (MongoDB)


In [2]:
# Directory where this notebook writes its evaluation output (see the "Save evaluation" cell).
TEST_DATA_DIR = DATA_DIR / 'test'
# parents=True: do not fail on a fresh checkout where DATA_DIR itself has not been created yet.
TEST_DATA_DIR.mkdir(parents=True, exist_ok=True)
# Directory that is scanned for an OpenAPI specification when an SDK has to be generated.
SPECIFICATIONS_DIR = DATA_DIR / 'openapi-specifications'

## Query testing data

In [3]:
# For each SDK, fetch the text of its latest successful "final_code" response.
sdk_code_test_data_pipeline = [
    # Keep only successful final-code generations.
    {"$match": {"step": "final_code", "response.openai.status": "success"}},
    # Ascending _id is used as insertion order (some SDKs accidentally got several
    # final_code documents), so "$last" in the next stage picks the newest one.
    {"$sort": {"_id": 1}},
    {
        "$group": {
            "_id": "$sdk_name",
            "last_response_text": {"$last": "$response.openai.generated_text"},
        }
    },
    # Expose the group key as a regular "sdk_name" field and hide "_id".
    {
        "$project": {
            "sdk_name": "$_id",
            "last_response_text": 1,
            "_id": 0,
        }
    },
]

data = list(db["eval"].aggregate(sdk_code_test_data_pipeline))
df_test_data = pd.DataFrame(data)
df_test_data.head()

Unnamed: 0,last_response_text,sdk_name
0,```python\nimport requests\nfrom types import ...,finley
1,```python\nimport json\nimport requests\n\ncla...,affinity
2,```python\nimport requests\nfrom typing import...,flickr
3,```python\nimport requests\nimport json\nfrom ...,adatree_banking
4,```python\nimport requests\n\nclass AdatreeCli...,adatree_data


We need to extract the code from the generated text.

In [4]:
# Replace the raw model response with the code extracted from it
# (element 0 of whatever get_code_from_model_response returns).
df_test_data = (
    df_test_data
    .assign(
        sdk_code=lambda frame: frame['last_response_text'].apply(
            lambda response_text: get_code_from_model_response(response_text)[0]
        )
    )
    .drop(columns=['last_response_text'])
)
df_test_data.head()

Unnamed: 0,sdk_name,sdk_code
0,finley,import requests\nfrom types import *\nfrom typ...
1,affinity,import json\nimport requests\n\nclass Affinity...
2,flickr,"import requests\nfrom typing import Optional, ..."
3,adatree_banking,import requests\nimport json\nfrom typing impo...
4,adatree_data,import requests\n\nclass AdatreeClient:\n d...


In [5]:
# Discard every row that has a missing value in any column
# (presumably SDKs whose code could not be extracted -- confirm).
df_test_data = df_test_data.dropna(axis=0, how='any')
df_test_data.head()

Unnamed: 0,sdk_name,sdk_code
0,finley,import requests\nfrom types import *\nfrom typ...
1,affinity,import json\nimport requests\n\nclass Affinity...
2,flickr,"import requests\nfrom typing import Optional, ..."
3,adatree_banking,import requests\nimport json\nfrom typing impo...
4,adatree_data,import requests\n\nclass AdatreeClient:\n d...


In [6]:
# For each SDK, fetch the text of its latest successful "types" response.
types_code_test_data_pipeline = [
    # Keep only successful type-generation steps.
    {"$match": {"step": "types", "response.openai.status": "success"}},
    # Ascending _id is used as insertion order, so "$last" in the next stage picks the
    # newest document per SDK. NOTE(review): this stage was copied from the final_code
    # query; presumably duplicate "types" documents exist as well -- confirm.
    {"$sort": {"_id": 1}},
    {
        "$group": {
            "_id": "$sdk_name",
            "last_response_text": {"$last": "$response.openai.generated_text"},
        }
    },
    # Expose the group key as a regular "sdk_name" field and hide "_id".
    {
        "$project": {
            "sdk_name": "$_id",
            "last_response_text": 1,
            "_id": 0,
        }
    },
]

data = list(db["eval"].aggregate(types_code_test_data_pipeline))
# Swap the raw response for the code extracted from it (element 0 of the helper's result).
df_types = (
    pd.DataFrame(data)
    .assign(
        types_code=lambda frame: frame['last_response_text'].apply(
            lambda response_text: get_code_from_model_response(response_text)[0]
        )
    )
    .drop(columns=['last_response_text'])
)
df_types.head()

Unnamed: 0,sdk_name,types_code
0,finshark,"from typing import Literal, TypedDict, Optiona..."
1,dev,"from typing import TypedDict, Optional, Litera..."
2,foodkit,"from typing import TypedDict, Literal, List\n\..."
3,bluesnap,"from typing import TypedDict, List, Literal, U..."
4,baseten,"from typing import TypedDict, Optional\n\nclas..."


In [7]:
# Left-join the types onto the SDK rows. SDKs without a types entry would get NaN,
# which is replaced by an empty string so that "no types" is simply a falsy value.
df_test_data = df_test_data.merge(df_types, how='left', on='sdk_name').fillna('')


df_test_data.head()

Unnamed: 0,sdk_name,sdk_code,types_code
0,finley,import requests\nfrom types import *\nfrom typ...,"from typing import TypedDict, Optional\n\nclas..."
1,affinity,import json\nimport requests\n\nclass Affinity...,
2,flickr,"import requests\nfrom typing import Optional, ...",
3,adatree_banking,import requests\nimport json\nfrom typing impo...,
4,adatree_data,import requests\n\nclass AdatreeClient:\n d...,"from typing import TypedDict, List, Literal, O..."


In [8]:
def merge_code(sdk_code: str, types_code: str) -> str:
    """Combine the types module and the SDK module into a single text.

    When ``types_code`` is truthy the result is the types code, a line made of
    ten dashes, and then the SDK code, each separated by a newline. When it is
    empty (or otherwise falsy) the SDK code is returned unchanged.
    """
    if not types_code:
        return sdk_code
    separator = '-' * 10
    return '\n'.join((types_code, separator, sdk_code))

In [9]:
# The reference answer is the types code (if any), a dashed separator line, then the SDK code;
# rows without types keep the SDK code only (see merge_code).
df_test_data['answer'] = df_test_data.apply(
    lambda row: merge_code(row['sdk_code'], row['types_code']),
    axis=1,
)

df_test_data.head()

Unnamed: 0,sdk_name,sdk_code,types_code,answer
0,finley,import requests\nfrom types import *\nfrom typ...,"from typing import TypedDict, Optional\n\nclas...","from typing import TypedDict, Optional\n\nclas..."
1,affinity,import json\nimport requests\n\nclass Affinity...,,import json\nimport requests\n\nclass Affinity...
2,flickr,"import requests\nfrom typing import Optional, ...",,"import requests\nfrom typing import Optional, ..."
3,adatree_banking,import requests\nimport json\nfrom typing impo...,,import requests\nimport json\nfrom typing impo...
4,adatree_data,import requests\n\nclass AdatreeClient:\n d...,"from typing import TypedDict, List, Literal, O...","from typing import TypedDict, List, Literal, O..."


Now we drop the intermediate columns and then get the generated code for each SDK.

In [10]:
# The separate code columns are already folded into 'answer'; keep only sdk_name and answer.
df_test_data = df_test_data.drop(columns=['sdk_code', 'types_code'])
df_test_data.head()

Unnamed: 0,sdk_name,answer
0,finley,"from typing import TypedDict, Optional\n\nclas..."
1,affinity,import json\nimport requests\n\nclass Affinity...
2,flickr,"import requests\nfrom typing import Optional, ..."
3,adatree_banking,import requests\nimport json\nfrom typing impo...
4,adatree_data,"from typing import TypedDict, List, Literal, O..."


In [11]:
# fine_tuned_responses_pipeline = [
#     {
#         "$match": {
#             "payload.model": "ft:gpt-3.5-turbo-0125:eden-ai::9UxoYDO3",
#             "step": "final_code",
#             "response.choices": {
#                 "$size": 1
#             },
#             "response.choices.0.finish_reason": "stop"
#         }
#     },
#     {
#         "$sort": {
#             "_id": 1 
#         }
#     },
#     {
#         "$group": {
#             "_id": "$sdk_name",
#             "last_document": { "$last": "$$ROOT" } 
#         }
#     },
#     {
#         "$replaceRoot": { "newRoot": "$last_document" }
#     },
#     {
#         "$project": {
#             "_id": 0, 
#             "sdk_name": 1,
#             "last_response_text": { "$arrayElemAt": ["$response.choices.message.content", 0] }
#         }
#     }
# ]
# fine_tuned_responses = list(db['responses'].aggregate(fine_tuned_responses_pipeline))
# 
# fine_tuned_responses = pd.DataFrame(fine_tuned_responses)
# fine_tuned_responses

In [12]:
# Rules handed to generate_sdk(user_rules=...): one numbered rule per line.
user_rules = "\n".join([
    "1. Use the requests library: All HTTP requests within the SDK must be made using the 'requests' library.",
    "2. Class structure: The SDK must be a class, with each method representing an endpoint in the API. Choose method names that reflect the action or resource they interact with.",
    "3. Authenticated requests: Implement a method '_make_authenticated_request' to handle authenticated requests.",
    "4. JSON request body: Use JSON format for the body of all requests.",
    "5. Return type: All methods must return the 'Response' object from the 'requests' library.",
])

def get_generated_code(sdk_name):
    """Return the generated code for ``sdk_name``, generating the SDK first if needed.

    The SDK module ``GENERATED_SDK_DIR/<sdk_name>/<sdk_name>.py`` and, when it
    exists, the sibling ``types.py`` are read and combined with ``merge_code``.
    If the SDK module is missing, a specification is looked up in
    ``SPECIFICATIONS_DIR`` and ``generate_sdk`` is called with ``user_rules``.

    Args:
        sdk_name: Name of the SDK; also the name of its output directory/module.

    Returns:
        The merged code as a string, or ``None`` when generation raises or
        yields no SDK output file.

    Raises:
        FileNotFoundError: no specification file matches ``sdk_name``.
    """
    sdk_dir = GENERATED_SDK_DIR / sdk_name
    try:
        sdk_name_path = sdk_dir / f"{sdk_name}.py"
        types_code_path = sdk_dir / "types.py"
        sdk_code = sdk_name_path.read_text()
        types_code = types_code_path.read_text() if types_code_path.exists() else ""
        return merge_code(sdk_code, types_code)
    except FileNotFoundError:
        print(f"Generating SDK for {sdk_name}")
        # Sorted so the choice does not depend on directory listing order.
        candidates = sorted(SPECIFICATIONS_DIR.iterdir())
        # Prefer a file whose stem is exactly the SDK name: a plain substring test lets a
        # short name (e.g. 'dev') pick up an unrelated specification that merely contains it.
        spec_path = next((file for file in candidates if file.stem == sdk_name), None)
        if spec_path is None:
            # Fall back to the original substring match for names with extra suffixes.
            spec_path = next((file for file in candidates if sdk_name in file.name), None)
        if spec_path is None:
            raise FileNotFoundError(f"Specification file for {sdk_name} not found")
        try:
            _, sdk_output_file, types_output_file = generate_sdk(spec_path, user_rules=user_rules)
        except Exception as e:
            # Best effort: report the failure and let the caller see None for this SDK.
            print(f"Error generating SDK for {sdk_name}: {e}")
            return None
        if sdk_output_file is None:
            return None
        sdk_code = sdk_output_file.read_text()
        types_code = types_output_file.read_text() if types_output_file else ""

        return merge_code(sdk_code, types_code)

# One generated answer per SDK. get_generated_code returns None when generation fails,
# so this column may contain None values.
df_test_data['llm_answer'] = df_test_data['sdk_name'].apply(get_generated_code)

In [13]:
df_test_data.head()

Unnamed: 0,sdk_name,answer,llm_answer
0,finley,"from typing import TypedDict, Optional\n\nclas...","from typing import TypedDict, Optional\n\n\ncl..."
1,affinity,import json\nimport requests\n\nclass Affinity...,import requests\n\n\nclass AffinityClient:\n ...
2,flickr,"import requests\nfrom typing import Optional, ...","import requests\nfrom typing import Optional, ..."
3,adatree_banking,import requests\nimport json\nfrom typing impo...,import requests\nimport json\nfrom typing impo...
4,adatree_data,"from typing import TypedDict, List, Literal, O...","from typing import TypedDict, List, Literal\n\..."


# Evaluation

In [14]:
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from codebleu import calc_codebleu
from sdkgenerator.types import Language
from sdkgenerator.config import AGENT

## BLEU Score

In [15]:
def calculate_bleu_score(machine_results, reference_texts):
    """Corpus-level BLEU of the generated texts against one reference each.

    Both sides are tokenised by splitting on whitespace. ``machine_results``
    and ``reference_texts`` are iterables of strings in matching order.
    """
    # corpus_bleu expects a list of reference *lists* per hypothesis; each has one entry here.
    references = []
    for reference in reference_texts:
        references.append([reference.split()])
    hypotheses = [generated.split() for generated in machine_results]
    return corpus_bleu(references, hypotheses)

In [16]:
# BLEU over the whole test set: generated code first, reference answers second.
bleu_score = calculate_bleu_score(df_test_data['llm_answer'], df_test_data['answer'])
bleu_score

0.6635992832130831

## ROUGE Score

In [17]:
def calculate_rouge_score(machine_results, reference_texts):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all text pairs.

    Each reference is scored against the generated text at the same position
    (stemming enabled). Returns a dict with the keys 'rouge1', 'rouge2' and
    'rougeL' holding the mean F-measure of each metric.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    pair_scores = []
    for reference, generated in zip(reference_texts, machine_results):
        pair_scores.append(scorer.score(reference, generated))

    pair_count = len(pair_scores)
    return {
        metric: sum(pair[metric].fmeasure for pair in pair_scores) / pair_count
        for metric in metric_names
    }

In [18]:
# Mean ROUGE F-measures over the test set: generated code first, reference answers second.
rouge_score = calculate_rouge_score(df_test_data['llm_answer'], df_test_data['answer'])
rouge_score

{'rouge1': 0.9136068363494741,
 'rouge2': 0.8757805708282291,
 'rougeL': 0.8912410022746833}

## BERT Score

In [19]:
def calculate_bert_score(machine_results, reference_texts):
    """Placeholder for a BERTScore metric.

    Not implemented: both arguments are ignored and ``None`` is returned.
    """
    # TODO: implement this metric or remove the stub.
    return None

In [20]:
def calculate_code_bleu(machine_results, reference_texts, lang: Language):
    """Compute CodeBLEU for the generated code against the references.

    Thin wrapper around ``calc_codebleu`` that takes the generated texts first
    (like the other metric helpers here) and forwards them in the
    references-then-predictions order of that call. Returns its result unchanged.
    """
    code_bleu_result = calc_codebleu(reference_texts, machine_results, lang=lang)
    return code_bleu_result

In [21]:
# CodeBLEU with lang='python': generated code first, reference answers second.
code_bleu_score = calculate_code_bleu(df_test_data['llm_answer'], df_test_data['answer'], 'python')
code_bleu_score

{'codebleu': 0.8064848664586155,
 'ngram_match_score': 0.6635992832130831,
 'weighted_ngram_match_score': 0.7451228049346935,
 'syntax_match_score': 0.9556060032791021,
 'dataflow_match_score': 0.861611374407583}

In [22]:
# Collect every metric into one flat mapping: BLEU first, then the ROUGE values, then CodeBLEU.
all_scores = {'bleu': bleu_score}
all_scores.update(rouge_score)
all_scores.update(code_bleu_score)

# Single-row table whose index label is the model configured for the final_code step.
evaluated_model = AGENT['final_code']['model']
evaluation = pd.DataFrame(all_scores, index=[evaluated_model])
evaluation

Unnamed: 0,bleu,rouge1,rouge2,rougeL,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score
ft:gpt-3.5-turbo-0125:eden-ai::9UxoYDO3,0.663599,0.913607,0.875781,0.891241,0.806485,0.663599,0.745123,0.955606,0.861611


# Save evaluation

In [23]:
# The index of `evaluation` is the only place the evaluated model name is stored, so it
# must be written out; with index=False the CSV would not say which model was scored.
evaluation.to_csv(TEST_DATA_DIR / 'evaluation.csv', index=True, index_label='model')

# Human Evaluation

Let's compare the generated code with the answer and see if it is correct

In [29]:
from pprint import pprint

In [33]:
df_test_data['sdk_name'].values

array(['finley', 'affinity', 'flickr', 'adatree_banking', 'adatree_data',
       'foodkit', '1-password_connect', 'apaleo', 'bluesnap', 'milefy',
       'baseten', '2-c-2-p', 'adatree_consent', 'bity'], dtype=object)

In [35]:
# Pick one SDK for manual inspection: fetch its generated code and its reference answer.
current_sdk_name = 'affinity'
generated_code = get_generated_code(current_sdk_name)
is_current_sdk = df_test_data['sdk_name'] == current_sdk_name
answer = df_test_data.loc[is_current_sdk, 'answer'].values[0]

In [37]:
pprint(generated_code)

('import requests\n'
 '\n'
 '\n'
 'class AffinityClient:\n'
 '    def __init__(self, api_key):\n'
 '        self.api_key = api_key\n'
 '        self.base_url = "https://api.affinity.co"\n'
 '\n'
 '    def _make_authenticated_request(self, method, url, **kwargs):\n'
 '        headers = {\n'
 '            "Authorization": f"Bearer {self.api_key}",\n'
 '            "Content-Type": "application/json",\n'
 '        }\n'
 '        response = requests.request(method, url, headers=headers, **kwargs)\n'
 '        if response.status_code != 200:\n'
 '            raise Exception(f"Request failed with status code '
 '{response.status_code}")\n'
 '        return response\n'
 '\n'
 '    def get_current_user(self):\n'
 '        """Retrieve the current user\'s information."""\n'
 '        url = f"{self.base_url}/v2/auth/whoami"\n'
 '        return self._make_authenticated_request("GET", url)\n'
 '\n'
 '    def get_all_companies(self, query_params=None):\n'
 '        """Retrieve all companies with opti

In [38]:
pprint(answer)

('import json\n'
 'import requests\n'
 '\n'
 'class AffinityClient:\n'
 '    def __init__(self, api_key):\n'
 '        self.api_key = api_key\n'
 '        self.base_url = "https://api.affinity.co"\n'
 '\n'
 '    def _make_authenticated_request(self, method, url, **kwargs):\n'
 '        headers = {\n'
 '            "Authorization": f"Bearer {self.api_key}",\n'
 '            "Content-Type": "application/json"\n'
 '        }\n'
 '        response = requests.request(method, url, headers=headers, **kwargs)\n'
 '        if response.status_code != 200:\n'
 '            raise Exception(f"Request failed with status code '
 '{response.status_code}")\n'
 '        return response\n'
 '\n'
 '    def get_current_user(self):\n'
 '        """Retrieve the current user\'s information."""\n'
 '        url = f"{self.base_url}/v2/auth/whoami"\n'
 '        return self._make_authenticated_request("GET", url)\n'
 '\n'
 '    def get_all_companies(self, query_params=None):\n'
 '        """Retrieve all companies

Compare the generated code with the answer.

In [48]:
from difflib import unified_diff

In [49]:
# Line-based unified diff: '-' lines come from the reference answer,
# '+' lines from the generated code.
answer_lines = answer.splitlines()
generated_lines = generated_code.splitlines()
diff = list(unified_diff(answer_lines, generated_lines, lineterm=''))
diff

['--- ',
 '+++ ',
 '@@ -1,5 +1,5 @@',
 '-import json',
 ' import requests',
 '+',
 ' ',
 ' class AffinityClient:',
 '     def __init__(self, api_key):',
 '@@ -9,7 +9,7 @@',
 '     def _make_authenticated_request(self, method, url, **kwargs):',
 '         headers = {',
 '             "Authorization": f"Bearer {self.api_key}",',
 '-            "Content-Type": "application/json"',
 '+            "Content-Type": "application/json",',
 '         }',
 '         response = requests.request(method, url, headers=headers, **kwargs)',
 '         if response.status_code != 200:']