In [35]:
import pandas as pd
import numpy as np
from pathlib import Path

DEF_PREFIXES = ["def ", "async def "]
NEWLINE = "\n"


def get_function_name(code: str):
    """
    Extract function name from a line beginning with 'def' or 'async def'.
    """
    for prefix in DEF_PREFIXES:
        if code.startswith(prefix):
            return code[len(prefix) : code.index("(")]


def get_until_no_space(all_lines, i):
    """
    Get all lines until a line outside the function definition is found.
    """
    ret = [all_lines[i]]
    for j in range(i + 1, len(all_lines)):
        if len(all_lines[j]) == 0 or all_lines[j][0] in [" ", "\t", ")"]:
            ret.append(all_lines[j])
        else:
            break
    return NEWLINE.join(ret)


def get_functions(filepath):
    """
    Get all functions in a Python file.
    """
    with open(filepath, "r") as file:
        all_lines = file.read().replace("\r", NEWLINE).split(NEWLINE)
        for i, l in enumerate(all_lines):
            for prefix in DEF_PREFIXES:
                if l.startswith(prefix):
                    code = get_until_no_space(all_lines, i)
                    function_name = get_function_name(code)
                    yield {
                        "code": code,
                        "function_name": function_name,
                        "filepath": filepath,
                    }
                    break


def extract_functions_from_repo(code_root):
    """
    Extract all .py functions from the repository.
    """
    code_files = list(code_root.glob("**/*.py"))

    num_files = len(code_files)
    print(f"Total number of .py files: {num_files}")

    if num_files == 0:
        print("Verify openai-python repo exists and code_root is set correctly.")
        return None

    all_funcs = [
        func for code_file in code_files for func in get_functions(str(code_file))
    ]

    num_funcs = len(all_funcs)
    print(f"Total number of functions extracted: {num_funcs}")

    return all_funcs

def cosine_similarity(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [17]:
# Data loading

root_dir = Path.home()

REPO_NAME = 'openai-python'

code_root = root_dir / REPO_NAME

all_funcs = extract_functions_from_repo(code_root)

all_funcs

Total number of .py files: 321
Total number of functions extracted: 350


[{'code': 'def test_pydantic_v1(session: nox.Session) -> None:\n    session.install("-r", "requirements-dev.lock")\n    session.install("pydantic<2")\n\n    session.run("pytest", "--showlocals", "--ignore=tests/functional", *session.posargs)\n',
  'function_name': 'test_pydantic_v1',
  'filepath': '/home/moonpatel/openai-python/noxfile.py'},
 {'code': 'async def transform(\n    data: _T,\n    expected_type: object,\n    use_async: bool,\n) -> _T:\n    if use_async:\n        return await _async_transform(data, expected_type=expected_type)\n\n    return _transform(data, expected_type=expected_type)\n\n',
  'function_name': 'transform',
  'filepath': '/home/moonpatel/openai-python/tests/test_transform.py'},
 {'code': 'async def test_top_level_alias(use_async: bool) -> None:\n    assert await transform({"foo_bar": "hello"}, expected_type=Foo1, use_async=use_async) == {"fooBar": "hello"}\n\n',
  'function_name': 'test_top_level_alias',
  'filepath': '/home/moonpatel/openai-python/tests/test

In [18]:
df = pd.DataFrame(all_funcs[:3])
df

Unnamed: 0,code,function_name,filepath
0,def test_pydantic_v1(session: nox.Session) -> ...,test_pydantic_v1,/home/moonpatel/openai-python/noxfile.py
1,"async def transform(\n data: _T,\n expec...",transform,/home/moonpatel/openai-python/tests/test_trans...
2,async def test_top_level_alias(use_async: bool...,test_top_level_alias,/home/moonpatel/openai-python/tests/test_trans...


In [19]:
from openai import OpenAI

client = OpenAI()

# response = client.embeddings.create(
#     input="",
#     model="text-embedding-3-small"
# )

df['code_embedding'] = df['code'].apply(lambda x: client.embeddings.create(input=x, model="text-embedding-3-small").data[0].embedding)


In [36]:
df

Unnamed: 0,code,function_name,filepath,code_embedding
0,def test_pydantic_v1(session: nox.Session) -> ...,test_pydantic_v1,/home/moonpatel/openai-python/noxfile.py,"[0.015476400032639503, 0.008651133626699448, 0..."
1,"async def transform(\n data: _T,\n expec...",transform,/home/moonpatel/openai-python/tests/test_trans...,"[-0.004722085315734148, 0.030964845791459084, ..."
2,async def test_top_level_alias(use_async: bool...,test_top_level_alias,/home/moonpatel/openai-python/tests/test_trans...,"[0.004833014216274023, -0.019827749580144882, ..."


In [41]:
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    embedding = client.embeddings.create(input=code_query, model='text-embedding-3-small').data[0].embedding
    df['similarities'] = df['code_embedding'].apply(lambda x: cosine_similarity(x, embedding))

    res = df.sort_values('similarities', ascending=False).head(n)
    print(df['similarities'])

    if pprint:
        for r in res.iterrows():
            print(f"{r[1].filepath}:{r[1].function_name}  score={round(r[1].similarities, 3)}")
            print("\n".join(r[1].code.split("\n")[:n_lines]))
            print('-' * 70)

    return res

In [42]:
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
res

0    0.114624
1    0.172824
2    0.070478
Name: similarities, dtype: float64
/home/moonpatel/openai-python/tests/test_transform.py:transform  score=0.173
async def transform(
    data: _T,
    expected_type: object,
    use_async: bool,
) -> _T:
    if use_async:
        return await _async_transform(data, expected_type=expected_type)
----------------------------------------------------------------------
/home/moonpatel/openai-python/noxfile.py:test_pydantic_v1  score=0.115
def test_pydantic_v1(session: nox.Session) -> None:
    session.install("-r", "requirements-dev.lock")
    session.install("pydantic<2")

    session.run("pytest", "--showlocals", "--ignore=tests/functional", *session.posargs)

----------------------------------------------------------------------
/home/moonpatel/openai-python/tests/test_transform.py:test_top_level_alias  score=0.07
async def test_top_level_alias(use_async: bool) -> None:
    assert await transform({"foo_bar": "hello"}, expected_type=Foo1, use_async

Unnamed: 0,code,function_name,filepath,code_embedding,similarities
1,"async def transform(\n data: _T,\n expec...",transform,/home/moonpatel/openai-python/tests/test_trans...,"[-0.004722085315734148, 0.030964845791459084, ...",0.172824
0,def test_pydantic_v1(session: nox.Session) -> ...,test_pydantic_v1,/home/moonpatel/openai-python/noxfile.py,"[0.015476400032639503, 0.008651133626699448, 0...",0.114624
2,async def test_top_level_alias(use_async: bool...,test_top_level_alias,/home/moonpatel/openai-python/tests/test_trans...,"[0.004833014216274023, -0.019827749580144882, ...",0.070478
