# Objectives

Reference: [Patrick's Mind Map](https://excalidraw.com/#json=R3xPQDIzn6TOissiXXcTN,_v-YLa9E2v8DRoVU0-Xjdg)

1. **Search for candidate repos** (containing prompts in .py/.txt files) using:
    - LangChain
    - Guidance (by Microsoft)
    - LlamaIndex
2. **Find Prompts**:
    - 2.1. Filter down via dirs and files
        - Look at dirs for 'template' or 'prompt' folders (or files)
    - 2.2. Filter down via code search:    
        - imports some library like openai, huggingface, etc.
        - (are they in files? Strings?
        - How many use variables?
        - Do they concat, use f-strings, use format? Etc.)

<br />

**Possible Next Steps**:
- Skim through a random sample set of files manually
- Run professor's [sllim check tool](https://github.com/kpister/sllim) on them (semantic analysis to detect errors in prompt files)

## 📚 **Candidate Repos**

In [37]:
import requests, json
from pprint import pprint

def fetch_data(query="langchain+OR+GUIDANCE+OR+LlamaIndex", sort="stars", order="asc", per_page=100, language="python", star_count=4, get_fork=False, filename="repos.json"):
    """
    Searches GitHub for candidate repositories and stores the combined result as JSON.

    GitHub API URL for searching repositories
    DOCS: https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-repositories

    Params:
    - query: search terms, already joined with "+" (inserted into the URL as-is)
    - sort / order: sort field and direction passed to the API
    - per_page: results per page (max 100)
    - language: value of the language: qualifier
    - star_count: only repos with >= star_count stars are fetched
    - get_fork: value of the fork: qualifier (True/False, or a string such as "only")
    - filename: path of the JSON file the result is written to

    Returns a results dict with the following structure:
    {
        total_count: int,
        items: [{repo1_info}, {repo2_info}, ...]
    }

    Raises an Exception carrying the API's message when a request fails.
    """
    # NOTE: Only the first 1000 search results are available through this API
    max_pages = 10
    result = {"total_count": 0, "items": []}

    # GitHub documents the qualifier values in lowercase (fork:true / fork:only),
    # whereas str(True) would render as "True"
    fork_value = str(get_fork).lower()

    print(f"Fetching up to {max_pages} pages")
    pages_fetched = 0
    for page in range(1, max_pages + 1):
        url = f"https://api.github.com/search/repositories?q={query}+fork:{fork_value}+language:{language}+stars:>={star_count}&sort={sort}&order={order}&per_page={per_page}&page={page}"
        # Make the API request and get the JSON response
        response = requests.get(url)
        data = response.json()

        # A failed request (or an error payload) carries a "message" field
        if response.status_code != 200 or "message" in data:
            raise Exception(data.get("message", "Unknown error"))

        page_items = data["items"]
        result["items"].extend(page_items)
        result["total_count"] = data["total_count"]
        pages_fetched = page
        print(f"Page {page} done")

        # Stop once the results are exhausted so no rate-limited requests are wasted
        if not page_items or len(result["items"]) >= data["total_count"]:
            break

    print(f"Done fetching {pages_fetched} page(s)")

    # Store the result
    with open(filename, "w") as file:
        json.dump(result, file, indent=4)

    return result

# WARNING: SLOW DUE TO API rate limit (10 times per minute). Can be called only once per minute.
# More Details in the DOCS.
##################################################
# repos = fetch_data()  # UNCOMMENT WHEN NEEDED

## 🔎 **Find Prompts**

In [38]:
from utils import get_api_key_github  # custom function for privacy. Get your own API key
import time

def search_repo(repo_name, username="DJPAUL2001", api_key=None):
    """
    Searches a repo for prompt or template files. HELPER FUNCTION for search_all_repos()
    Github API URL: 
    - https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code
    - https://docs.github.com/en/enterprise-server@2.22/search-github/searching-on-github/searching-code

    Note: The "Search code" endpoint requires you to authenticate and limits you to 10 requests per minute

    Params:
    - repo_name: full repo name in "owner/repo" form (used in the repo: qualifier)
    - username: GitHub username used for basic auth
    - api_key: GitHub API key; when None it is obtained from get_api_key_github() at call time

    Returns a list of files. e.g. [{file1_info}, {file2_info}, ...]

    Raises an Exception carrying the API's message when the request fails.
    """
    # Resolve the key lazily: a default of get_api_key_github() in the signature would be
    # evaluated once at definition time, even for callers that pass their own key
    if api_key is None:
        api_key = get_api_key_github()

    query = "prompt+OR+template"
    inside = "file,path"  # Search in file content and file path
    extension = "extension:prompt+OR+extension:template+OR+extension:txt+OR+extension:py"
    url = f"https://api.github.com/search/code?q={query}+in:{inside}+{extension}+repo:{repo_name}"

    response = requests.get(url, auth=(username, api_key))
    data = response.json()

    # A failed request (or an error payload) carries a "message" field
    if response.status_code != 200 or "message" in data:
        raise Exception(data.get("message", "Unknown error"))

    return data["items"]

def search_all_repos(repos, filename="repos_prompts.json", username="DJPAUL2001", api_key=None, *, max_retries=1, retry_wait=60):
    """
    Running search_repo() on all candidate repos in repos["items"]. Approx runtime: > 1 hr 40 mins (took 115 mins)

    Params:
    - repos: dict with an "items" list of repo infos, each having a "full_name"
    - filename: path of the JSON file the result is written to
    - username: GitHub username used for basic auth
    - api_key: GitHub API key; when None it is obtained from get_api_key_github() at call time
    - max_retries: how many times a failed repo search is retried before the error propagates
    - retry_wait: seconds to wait before each retry

    Returns a dict with the following structure:
    {
        repo_name1: [{file1_info}, {file2_info}, ...],
        repo_name2: [{file1_info}, {file2_info}, ...],
        ...
    }

    Raises whatever search_repo() raised once the retries for a repo are used up.
    """
    # Resolve the key lazily instead of at function-definition time
    if api_key is None:
        api_key = get_api_key_github()

    # Searching for prompt or template files in each repo
    repos_prompts = {}
    for repo in repos["items"]:
        repo_name = repo["full_name"]
        for attempt in range(max_retries + 1):
            try:
                repos_prompts[repo_name] = search_repo(repo_name, username, api_key)
                break
            except Exception as e:
                if attempt == max_retries:
                    raise
                print(f"Error: {e}")
                # Assuming the exception is due to rate limit, wait and try again
                time.sleep(retry_wait)

    # STORING RESULT 
    ##################################################
    with open(filename, "w") as file:
        json.dump(repos_prompts, file, indent=4)

    return repos_prompts

# WARNING: SLOW DUE TO API rate limit (10 times per minute). Can be called only once per 2 hrs.
# More Details in the DOCS.
# with open("repos_>=4stars.json", "r") as file:
#     repos = json.load(file)
# repos_prompts_2 = search_all_repos(repos)  # UNCOMMENT WHEN NEEDED

🗨️ Number of Repos with Prompts

In [39]:
def get_reposWithPrompt_count(repos_prompts_filename):
    """
    Prints how many repos in a prompts search-results file have at least one
    prompt/template hit, together with the total number of matched files.
    """
    # Read the per-repo search results
    with open(repos_prompts_filename, "r") as fh:
        search_results = json.load(fh)

    # One entry (the hit count) per repo that has any hits
    hits_per_repo = [len(hits) for hits in search_results.values() if len(hits) > 0]
    repos_with_hits = len(hits_per_repo)
    matched_files = sum(hits_per_repo)

    print(f"ReposWithPrompts Count in {repos_prompts_filename}: {repos_with_hits} out of {len(search_results)}, {matched_files} files")

# Report the counts for both star thresholds
for results_file in ("repos_prompts_>=0stars.json", "repos_prompts_>=4stars.json"):
    get_reposWithPrompt_count(results_file)

ReposWithPrompts Count in repos_prompts_>=0stars.json: 108 out of 1000, 635 files
ReposWithPrompts Count in repos_prompts_>=4stars.json: 372 out of 981, 1444 files


📂 Grabbing and Storing all raw file URLs in one place

In [40]:
def _to_raw_url(html_url):
    """
    Converts the html_url of a file on GitHub into its raw-content URL.
    """
    scheme, sep, remainder = html_url.partition("://github.com/")
    # Expected shape after the host: owner/repo/blob/<ref>/<path>
    parts = remainder.split("/", 3)
    if sep and len(parts) == 4 and parts[2] == "blob":
        owner, repo_name, _, ref_and_path = parts
        return f"{scheme}://raw.githubusercontent.com/{owner}/{repo_name}/{ref_and_path}"
    # Unexpected shape: rewrite only the first occurrence of each marker so that
    # later "blob/" or "github.com" fragments in the file path are left alone
    return html_url.replace("blob/", "", 1).replace("github.com", "raw.githubusercontent.com", 1)


def get_rawFileURL(read_filename="repos_prompts.json", write_filename="repo_to_rawFileURL.json"):
    """
    To get raw file content of repos_prompts files, do the following: 
    1. grab the html_url of a code_search item
    2. remove the blob/ segment that follows the repo name
    3. replace the github.com host with raw.githubusercontent.com

    Only the host and the blob/ segment right after owner/repo are rewritten, so file
    paths that themselves contain "blob/" or "github.com" stay intact.

    e.g.
    res = requests.get('https://raw.githubusercontent.com/langchain-ai/langchain/98aff29fbda6bcb99ea6af0cfd1532954b504bdc/libs/langchain/langchain/schema/prompt_template.py')
    print(res.text)

    Params:
    - read_filename: JSON file mapping repo name -> list of code_search items
    - write_filename: JSON file the repo name -> raw URL mapping is written to

    Returns a dict with the following structure (Note: Only contains repos with prompt/template files):
    {
        repo_name1: [file_raw_url1, file_raw_url2, ...],
        repo_name2: [file_raw_url1, file_raw_url2, ...],
        ...
    }
    """
    # Loading prompts search results from all repos
    with open(read_filename, "r") as file:
        repos_prompts = json.load(file)

    # Getting raw file URLs; repos without any search hits are left out
    repo_to_fileURL = {
        repo: [_to_raw_url(item["html_url"]) for item in files]
        for repo, files in repos_prompts.items()
        if files
    }

    # STORING RESULT
    with open(write_filename, "w") as file:
        json.dump(repo_to_fileURL, file, indent=4)

    return repo_to_fileURL

# Build the raw-URL mapping for both star thresholds. NOTE(review): in a notebook cell the
# second call is the last expression, so its returned dict appears to be what is displayed below.
get_rawFileURL("repos_prompts_>=0stars.json", "repo_to_rawFileURL_>=0stars.json")
get_rawFileURL("repos_prompts_>=4stars.json", "repo_to_rawFileURL_>=4stars.json")

{'pprados/langchain-googledrive': ['https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/langchain_googledrive/document_loaders/google_drive.py',
  'https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/langchain_googledrive/utilities/google_drive.py',
  'https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/tests/integration_tests/document_loaders/test_google_drive.py',
  'https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/tests/integration_tests/providers/test_google_drive.py',
  'https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/tests/unit_tests/utilities/test_google_drive.py',
  'https://raw.githubusercontent.com/pprados/langchain-googledrive/562a3d29bb3ca1ca3a432662087d83b734bc25c2/tests/unit_tests/document_loaders/test_go