# Prompt Collection - Static Analysis

### 🔨 **Setup**

In [1]:
# Use the Language.build_library method to compile these into a library that's usable from Python. 
# This function will return immediately if the library has already been compiled since the last 
# time its source code was modified:

from tree_sitter import Language, Parser
import os

# Ensuring that the library is compiled each time this cell is run.
if os.path.exists("build/my-languages.so"):
    os.remove("build/my-languages.so")

Language.build_library(
    # Store the library in the `build` directory
    "build/my-languages.so",
    # Include one or more languages
    ["vendor/tree-sitter-python"],
)

True

### 🔍 **Parser**

In [40]:
def parse(filename):
    PY_LANGUAGE = Language('build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    result = ""

    with open(filename, "rb") as f:
        tree = parser.parse(f.read())

    query = PY_LANGUAGE.query("""(module
        (import_from_statement
            module_name: (dotted_name) @mod
            name: (dotted_name) @llm
            (#match? @mod "^langchain(.llms)?")
        )
        (expression_statement
            (assignment
                left: (identifier) @llmvar
                right: (call function: (identifier) @llmname)
                (#eq? @llmname @llm)
            )
        )
    )""")

    for llm in filter(lambda x: x[1] == "llmvar", query.captures(tree.root_node)):
        llm_text = llm[0].text.decode("utf-8")
        # print(f"LLM: {llm_text}")
        result += f"LLM: {llm_text}\n"

        query_2 = PY_LANGUAGE.query("""(call
            function: (identifier) @fn.name
            arguments: (argument_list) @fn.args
            (#eq? @fn.name "{llm}")
        )""".format(llm=llm_text))

        # print("Used:")
        result += "Used:\n"
        for usage in query_2.captures(tree.root_node):
            if usage[1] == "fn.name":
                # print("Call: ", usage[0].text.decode("utf-8"))
                result += f'Call: {usage[0].text.decode("utf-8")}\n'
            elif usage[1] == "fn.args":
                # print("with: ", usage[0].text.decode("utf-8"))
                result += f'with: {usage[0].text.decode("utf-8")}\n'

    return result

# Test the parser
print(parse("othertmp.py"))

LLM: zz
Used:
Call: zz
with: (PromptTemplate("arge"))



### 📦 Storing Repo Files for Reliable Parsing

Feel free to run this cell multiple times if there were exceptions when downloading some files.

It will only download files that are not already present in the `repos` folder, and fill up the gaps.

In [23]:
import json, requests

# Import Raw URLs
with open("../data/repo_to_rawFileURL_>=4stars.json", "r") as file:
    repos_prompts = json.load(file)


# NOTE: Refer to this stackoverflow post for issues with requests: 
# https://stackoverflow.com/questions/62599036/python-requests-is-slow-and-takes-very-long-to-complete-http-or-https-request

root_dir = "repos"
if not os.path.exists(root_dir):
    os.mkdir(root_dir)

# Note: Using '~' instead of '/' as a delimiter for file/dir names 
# (because I'm not creative enough to come up with a better solution)
for repo in repos_prompts:
    repo_path = os.path.join(root_dir, repo.replace("/", "~"))
    if not os.path.exists(repo_path):
        os.mkdir(repo_path)

    for url in repos_prompts[repo]:
        filename = url.split("/")[6:]
        filename = "~".join(filename)
        file_path = os.path.join(repo_path, filename)
        
        if not os.path.exists(file_path):
            try:
                r = requests.get(url, timeout=1)
                # Exception thrown before file is created. 
                # So, if file exists, it's safe to assume that it's been downloaded successfully.
                if r.status_code == 200:
                    with open(file_path, "w") as f:
                        f.write(r.text)
                else:
                    print("Error: ", r.status_code, repo_path, filename)
            except Exception as e:
                print(e)
                print("Error: ", repo_path, filename)

print("Done")

Done


### 🧠 Sanity Check

In [34]:
# Count the number of repos in the repos directory
assert len(os.listdir(root_dir)) == len(repos_prompts)
print("Number of repos: ", len(os.listdir(root_dir)), "; Expected 372 for repos >=4stars")

# Count the number of files in each repo
count = 0
for repo in repos_prompts:
    repo_path = os.path.join(root_dir, repo.replace("/", "~"))
    assert len(os.listdir(repo_path)) == len(repos_prompts[repo])
    for file in os.listdir(repo_path):
        with open(os.path.join(repo_path, file), "r") as f:
            assert f.read() != ""  # Complain if file is empty
    count += len(os.listdir(repo_path))
print("Total number of files: ", count, "; Expected 1444 for repos >=4stars")

Number of repos:  372 ; Expected 372 for repos >=4stars
Total number of files:  1444 ; Expected 1444 for repos >=4stars


### 📊 **Prompt Collection** - Parsing

In [45]:
root_dir = "repos"

count = 0
for repo in os.listdir(root_dir):
    repo_path = os.path.join(root_dir, repo)
    for file in os.listdir(repo_path):
        file_path = os.path.join(repo_path, file)
        try:
            prompt = parse(file_path)
            if prompt != "":
                count += 1
                print("Repo: ", repo, "; File: ", file)
                print(prompt)
        except Exception as e:
            print(e)
            print("Error: ", repo_path, file_path)

print(f"Parser Returns result for {count} files out of 1444 files")

Repo:  yym68686~ChatGPT-Telegram-Bot ; File:  test~test_keyword.py
LLM: chainllm
Used:
LLM: keyword_prompt
Used:
LLM: key_chain
Used:

Repo:  yym68686~ChatGPT-Telegram-Bot ; File:  test~test_gpt4free_langchain_agent.py
LLM: tools
Used:
LLM: agent
Used:

Repo:  wordweb~langchain-ChatGLM-and-TigerBot ; File:  chains~dialogue_answering~prompts.py
LLM: SUMMARY_PROMPT
Used:

Repo:  webgrip~PuttyGPT ; File:  Eve~main.py
LLM: tracer
Used:
LLM: callback_manager
Used:
LLM: openai
Used:
LLM: memory
Used:
LLM: readonlymemory
Used:
LLM: embeddings_model
Used:
LLM: vectorstore
Used:
LLM: retriever
Used:
LLM: llm
Used:
LLM: llm
Used:
LLM: todo_chain
Used:
LLM: vectorstore_info
Used:
LLM: toolkit
Used:
LLM: llm
Used:
LLM: agent_executor
Used:
LLM: llm_chain
Used:
LLM: agent
Used:

Repo:  webgrip~PuttyGPT ; File:  Eve~old~main.py
LLM: vectorstore
Used:
LLM: embedding_model
Used:
LLM: longTermMemoryRetriever
Used:
LLM: midTermMemoryRetriever
Used:
LLM: shortTermMemoryRetriever
Used:
LLM: sparseAndDense