In [1]:
from sdkgenerator.generate import generate_sdk
from sdkgenerator.constants import DATA_DIR, GENERATED_SDK_DIR
from sdkgenerator.utils import get_code_from_model_response
from sdkgenerator.db import db
import pandas as pd

Database connected successfully. (MongoDB)


In [2]:
# Directory that holds the evaluation inputs and outputs of this notebook.
TEST_DATA_DIR = DATA_DIR / 'test'
# parents=True: also create DATA_DIR when it is missing (e.g. on a fresh checkout),
# instead of failing with FileNotFoundError.
TEST_DATA_DIR.mkdir(parents=True, exist_ok=True)
# Directory searched for the OpenAPI specification of an SDK that must be generated.
SPECIFICATIONS_DIR = DATA_DIR / 'openapi-specifications'

# Load test data

In [3]:
# JSON-lines file with one test conversation per line.
test_pipeline_data = TEST_DATA_DIR.joinpath('test_data.jsonl')
test_data = pd.read_json(path_or_buf=test_pipeline_data, lines=True)

In [4]:
# Preview the first rows of the loaded test conversations.
test_data.head()

Unnamed: 0,messages
0,"[{'role': 'system', 'content': 'You are a pyth..."
1,"[{'role': 'system', 'content': 'You are a pyth..."
2,"[{'role': 'system', 'content': 'You are a pyth..."
3,"[{'role': 'system', 'content': 'You are a pyth..."
4,"[{'role': 'system', 'content': 'You are a pyth..."


We can see that our test data is currently stored as lines; each line is an array of message objects that contain the following fields:
- `role`: the role of the message sender, e.g. `system`, `user`, `assistant`
- `content`: the content of the message

In [5]:
# Copy the last message of every conversation (the assistant's answer) into its own column.
# The column is read by name rather than by position ("last value of the row") so the cell
# stays idempotent: once 'sdk' exists it is the last column, and a positional lookup would
# index the message dict instead of the messages list on re-run.
test_data['sdk'] = test_data['messages'].apply(lambda messages: messages[-1])

In [6]:
# Display the frame to check the new 'sdk' column next to 'messages'.
test_data

Unnamed: 0,messages,sdk
0,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
1,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
2,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
3,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
4,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
5,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
6,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
7,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."
8,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python fr..."
9,"[{'role': 'system', 'content': 'You are a pyth...","{'role': 'assistant', 'content': '```python im..."


## Query testing data

In [7]:
# Aggregation that keeps, for every SDK, the last successful "final_code" response.
sdk_code_test_data_pipeline = [
    # Only successful responses of the final code step.
    {"$match": {"step": "final_code", "response.openai.status": "success"}},
    # Sorting by _id is used as a proxy for insertion date, so that $last below picks the
    # most recent entry (some SDKs accidentally received several final_code documents).
    {"$sort": {"_id": 1}},
    {
        "$group": {
            "_id": "$sdk_name",
            "last_response_text": {"$last": "$response.openai.generated_text"},
        }
    },
    # Expose the group key as sdk_name and hide the _id field.
    {
        "$project": {
            "sdk_name": "$_id",
            "last_response_text": 1,
            "_id": 0,
        }
    },
]

data = list(
    db["train_data"].aggregate(sdk_code_test_data_pipeline)
)
df_test_data = pd.DataFrame(data)
df_test_data.head()

Unnamed: 0,last_response_text,sdk_name
0,```python\nimport requests\nfrom typing import...,1-password_connect
1,```python\nimport requests\nfrom typing import...,apaleo
2,```python\nimport requests\n\nclass AdatreeCli...,adatree_data
3,```python\nimport requests\nfrom urllib.parse ...,bluesnap
4,```python\nimport requests\nfrom urllib.parse ...,foodkit


In [8]:
# Replace the raw model response with the code extracted from it; only the first element
# returned by get_code_from_model_response is kept.
df_test_data = (
    df_test_data
    .assign(
        sdk_code=df_test_data['last_response_text'].apply(
            lambda response_text: get_code_from_model_response(response_text)[0]
        )
    )
    .drop(columns=['last_response_text'])
)
df_test_data.head()

Unnamed: 0,sdk_name,sdk_code
0,1-password_connect,"import requests\nfrom typing import Dict, Any,..."
1,apaleo,"import requests\nfrom typing import Optional, ..."
2,adatree_data,import requests\n\nclass AdatreeClient:\n d...
3,bluesnap,import requests\nfrom urllib.parse import urlj...
4,foodkit,import requests\nfrom urllib.parse import urlj...


In [9]:
# Discard every row that has a missing value in any column.
df_test_data = df_test_data.dropna(axis=0, how='any')
df_test_data.head()

Unnamed: 0,sdk_name,sdk_code
0,1-password_connect,"import requests\nfrom typing import Dict, Any,..."
1,apaleo,"import requests\nfrom typing import Optional, ..."
2,adatree_data,import requests\n\nclass AdatreeClient:\n d...
3,bluesnap,import requests\nfrom urllib.parse import urlj...
4,foodkit,import requests\nfrom urllib.parse import urlj...


In [10]:
# Aggregation that keeps, for every SDK, the last successful "types" response.
types_code_test_data_pipeline = [
    # Only successful responses of the types step.
    {"$match": {"step": "types", "response.openai.status": "success"}},
    # Sorting by _id is used as a proxy for insertion date, so that $last below picks the
    # most recent entry when an SDK has several documents for this step.
    {"$sort": {"_id": 1}},
    {
        "$group": {
            "_id": "$sdk_name",
            "last_response_text": {"$last": "$response.openai.generated_text"},
        }
    },
    # Expose the group key as sdk_name and hide the _id field.
    {
        "$project": {
            "sdk_name": "$_id",
            "last_response_text": 1,
            "_id": 0,
        }
    },
]

data = list(
    db["train_data"].aggregate(types_code_test_data_pipeline)
)
df_types = pd.DataFrame(data)
# Keep only the extracted code (first element returned by get_code_from_model_response).
df_types['types_code'] = df_types['last_response_text'].apply(
    lambda response_text: get_code_from_model_response(response_text)[0]
)
df_types = df_types.drop(columns=['last_response_text'])
df_types.head()

Unnamed: 0,sdk_name,types_code
0,baseten,"from typing import TypedDict, Optional\n\nclas..."
1,adatree_data,"from typing import TypedDict, List, Literal, O..."
2,1-password_connect,"from typing import TypedDict, List, Union, Lit..."
3,1-password_partnership,"from typing import TypedDict, Optional\nfrom d..."
4,goody,"from typing import TypedDict, Optional, List, ..."


In [11]:
# Left-join the types onto the SDK code; an SDK without types gets an empty string
# as its types_code.
df_test_data = df_test_data.merge(df_types, how='left', on='sdk_name').fillna('')


df_test_data.head()

Unnamed: 0,sdk_name,sdk_code,types_code
0,1-password_connect,"import requests\nfrom typing import Dict, Any,...","from typing import TypedDict, List, Union, Lit..."
1,apaleo,"import requests\nfrom typing import Optional, ...",
2,adatree_data,import requests\n\nclass AdatreeClient:\n d...,"from typing import TypedDict, List, Literal, O..."
3,bluesnap,import requests\nfrom urllib.parse import urlj...,"from typing import TypedDict, List, Literal, U..."
4,foodkit,import requests\nfrom urllib.parse import urlj...,"from typing import TypedDict, Literal, List\n\..."


In [12]:
def merge_code(sdk_code: str, types_code: str) -> str:
    """Combine the types code and the SDK code into one text.

    When ``types_code`` is empty (or otherwise falsy) the SDK code is returned
    unchanged. Otherwise the result is the types code, a line of ten dashes and
    the SDK code, each separated by a newline.
    """
    if not types_code:
        return sdk_code
    separator = '-' * 10
    return '\n'.join([types_code, separator, sdk_code])

In [13]:
# The reference answer is the types code (when present) followed by the SDK code.
df_test_data['answer'] = df_test_data.apply(
    lambda row: merge_code(sdk_code=row['sdk_code'], types_code=row['types_code']),
    axis=1,
)

df_test_data.head()

Unnamed: 0,sdk_name,sdk_code,types_code,answer
0,1-password_connect,"import requests\nfrom typing import Dict, Any,...","from typing import TypedDict, List, Union, Lit...","from typing import TypedDict, List, Union, Lit..."
1,apaleo,"import requests\nfrom typing import Optional, ...",,"import requests\nfrom typing import Optional, ..."
2,adatree_data,import requests\n\nclass AdatreeClient:\n d...,"from typing import TypedDict, List, Literal, O...","from typing import TypedDict, List, Literal, O..."
3,bluesnap,import requests\nfrom urllib.parse import urlj...,"from typing import TypedDict, List, Literal, U...","from typing import TypedDict, List, Literal, U..."
4,foodkit,import requests\nfrom urllib.parse import urlj...,"from typing import TypedDict, Literal, List\n\...","from typing import TypedDict, Literal, List\n\..."


Now we get the generated code for each SDK.

In [14]:
# Only the merged reference answer is needed from here on.
df_test_data = df_test_data.drop(columns=['sdk_code', 'types_code'])
df_test_data.head()

Unnamed: 0,sdk_name,answer
0,1-password_connect,"from typing import TypedDict, List, Union, Lit..."
1,apaleo,"import requests\nfrom typing import Optional, ..."
2,adatree_data,"from typing import TypedDict, List, Literal, O..."
3,bluesnap,"from typing import TypedDict, List, Literal, U..."
4,foodkit,"from typing import TypedDict, Literal, List\n\..."


In [15]:
# Rules passed to generate_sdk (as user_rules) whenever an SDK has to be generated.
user_rules = "1. Use the requests library: All HTTP requests within the SDK must be made using the 'requests' library.\n2. Class structure: The SDK must be a class, with each method representing an endpoint in the API. Choose method names that reflect the action or resource they interact with.\n3. Authenticated requests: Implement a method '_make_authenticated_request' to handle authenticated requests.\n4. JSON request body: Use JSON format for the body of all requests.\n5. Return type: All methods must return the 'Response' object from the 'requests' library."

def get_generated_code(sdk_name):
    """Return the generated code for ``sdk_name``, generating the SDK when it is not on disk.

    The SDK file ``<sdk_name>/<sdk_name>.py`` (and ``<sdk_name>/types.py`` when it
    exists) is read from ``GENERATED_SDK_DIR``. If the SDK file is missing, the
    matching OpenAPI specification is looked up in ``SPECIFICATIONS_DIR`` and
    ``generate_sdk`` is called with ``user_rules``.

    Args:
        sdk_name: Name of the SDK; used as the directory/file name and to find the
            specification file.

    Returns:
        The types code and SDK code combined by ``merge_code``, or ``None`` when
        generation raises or produces no SDK file.

    Raises:
        FileNotFoundError: If no specification file matches ``sdk_name``.
    """
    sdk_dir = GENERATED_SDK_DIR / sdk_name
    sdk_name_path = sdk_dir / f"{sdk_name}.py"
    types_code_path = sdk_dir / "types.py"
    try:
        sdk_code = sdk_name_path.read_text()
        types_code = types_code_path.read_text() if types_code_path.exists() else ""
        return merge_code(sdk_code, types_code)
    except FileNotFoundError:
        print(f"Generating SDK for {sdk_name}")

    # Sorted so the choice does not depend on the arbitrary order of iterdir().
    spec_files = sorted(SPECIFICATIONS_DIR.iterdir())
    # Prefer a file whose stem is exactly the SDK name: a plain substring test could pick the
    # specification of another SDK whose name merely contains this one.
    spec_path = next((file for file in spec_files if file.stem == sdk_name), None)
    if spec_path is None:
        # Fall back to the previous substring matching.
        spec_path = next((file for file in spec_files if sdk_name in file.name), None)
    if spec_path is None:
        raise FileNotFoundError(f"Specification file for {sdk_name} not found")

    try:
        _, sdk_output_file, types_output_file = generate_sdk(spec_path, user_rules=user_rules)
    except Exception as e:
        # Best effort: one failing SDK must not abort the whole evaluation run.
        print(f"Error generating SDK for {sdk_name}: {e}")
        return None
    if sdk_output_file is None:
        return None

    sdk_code = sdk_output_file.read_text()
    types_code = types_output_file.read_text() if types_output_file else ""
    return merge_code(sdk_code, types_code)

df_test_data['llm_answer'] = df_test_data['sdk_name'].apply(get_generated_code)

# get_generated_code returns None when generation fails. The metric functions call string
# methods on every answer, so report the failures here and exclude them explicitly instead
# of crashing later with an opaque AttributeError.
failed_sdk_names = df_test_data.loc[df_test_data['llm_answer'].isna(), 'sdk_name'].tolist()
if failed_sdk_names:
    print(f"Excluding {len(failed_sdk_names)} SDK(s) without generated code: {failed_sdk_names}")
    df_test_data = df_test_data.dropna(subset=['llm_answer']).reset_index(drop=True)

In [16]:
# Reference answers ('answer') next to the generated code ('llm_answer').
df_test_data.head()

Unnamed: 0,sdk_name,answer,llm_answer
0,1-password_connect,"from typing import TypedDict, List, Union, Lit...","from typing import TypedDict, List, Union, Opt..."
1,apaleo,"import requests\nfrom typing import Optional, ...","from typing import List, TypedDict, Literal, D..."
2,adatree_data,"from typing import TypedDict, List, Literal, O...","from typing import TypedDict, List, Literal\n\..."
3,bluesnap,"from typing import TypedDict, List, Literal, U...","from typing import TypedDict, List, Literal, U..."
4,foodkit,"from typing import TypedDict, Literal, List\n\...","from typing import TypedDict, Literal, List\n\..."


# Evaluation

In [57]:
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from codebleu import calc_codebleu
from sdkgenerator.types import Language
from sdkgenerator.config import AGENT

## BLEU Score

In [58]:
def calculate_bleu_score(machine_results, reference_texts):
    """Compute the corpus-level BLEU score of the generated texts.

    Both inputs are iterables of strings; every text is tokenised by splitting on
    whitespace, and each generated text is compared against a single reference.
    """
    references = []
    for reference in reference_texts:
        # corpus_bleu expects a list of references per hypothesis.
        references.append([reference.split()])
    hypotheses = []
    for generated in machine_results:
        hypotheses.append(generated.split())
    return corpus_bleu(references, hypotheses)

In [59]:
# BLEU of the generated SDKs against the reference answers.
bleu_score = calculate_bleu_score(
    machine_results=df_test_data['llm_answer'],
    reference_texts=df_test_data['answer'],
)
bleu_score

0.6635992832130831

## ROUGE Score

In [60]:
def calculate_rouge_score(machine_results, reference_texts):
    """Average the ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all text pairs.

    Each reference is scored against the generated text at the same position
    (stemming enabled). Returns a dict with the keys 'rouge1', 'rouge2' and 'rougeL'.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    pair_scores = [
        scorer.score(reference, generated)
        for reference, generated in zip(reference_texts, machine_results)
    ]
    pair_count = len(pair_scores)
    return {
        name: sum(pair[name].fmeasure for pair in pair_scores) / pair_count
        for name in metric_names
    }

In [61]:
# Mean ROUGE F-measures of the generated SDKs against the reference answers.
rouge_score = calculate_rouge_score(
    machine_results=df_test_data['llm_answer'],
    reference_texts=df_test_data['answer'],
)
rouge_score

{'rouge1': 0.9136068363494741,
 'rouge2': 0.8757805708282291,
 'rougeL': 0.8912410022746833}

## BERT Score

In [62]:
def calculate_bert_score(machine_results, reference_texts):
    """Placeholder for a BERTScore metric; not implemented yet.

    Raises:
        NotImplementedError: Always. Failing loudly prevents a silent ``None``
            from being mistaken for a score.
    """
    raise NotImplementedError("calculate_bert_score is not implemented yet")

In [63]:
def calculate_code_bleu(machine_results, reference_texts, lang:Language):
    """Compute CodeBLEU for the generated code against the references.

    ``calc_codebleu`` takes the references first and the generated code second;
    its result is returned unchanged.
    """
    codebleu_result = calc_codebleu(reference_texts, machine_results, lang=lang)
    return codebleu_result

In [64]:
# CodeBLEU of the generated SDKs against the reference answers.
code_bleu_score = calculate_code_bleu(
    machine_results=df_test_data['llm_answer'],
    reference_texts=df_test_data['answer'],
    lang='python',
)
code_bleu_score

{'codebleu': 0.8064848664586155,
 'ngram_match_score': 0.6635992832130831,
 'weighted_ngram_match_score': 0.7451228049346935,
 'syntax_match_score': 0.9556060032791021,
 'dataflow_match_score': 0.861611374407583}

In [65]:
# Collect every metric in one single-row table, indexed by the evaluated model.
all_scores = {'bleu': bleu_score}
all_scores.update(rouge_score)
all_scores.update(code_bleu_score)
evaluation = pd.DataFrame(all_scores, index=[AGENT['final_code']['model']])
evaluation

Unnamed: 0,bleu,rouge1,rouge2,rougeL,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score
ft:gpt-3.5-turbo-0125:eden-ai::9UxoYDO3,0.663599,0.913607,0.875781,0.891241,0.806485,0.663599,0.745123,0.955606,0.861611


# Save evaluation

In [66]:
# The model name is stored only in the index, so the index is written too (under a 'model'
# header); with index=False the saved scores could not be attributed to a model.
evaluation.to_csv(TEST_DATA_DIR / 'evaluation.csv', index=True, index_label='model')