# 🕸️ Scraping Results

In [1]:
import json
libraries = ["openai", "langchain", "cohere", "llamaindex", "guidance", "anthropic"]

print(
"""
Result Counts from Github (Collected Manually)
    openai: 71.7k
    langchain: 50.2k
    cohere: 5.1k
    guidance: 1.6k
    anthropic: 1.5k
    llamaindex: 91
"""
)

all_hrefs = []
for lib in libraries:
    with open(f'results_{lib}.json') as f:
        data = json.load(f)

    total_num_results = 0
    total_hrefs = 0

    for charCombo, results in data.items():
        num_result = results['num_results']
        hrefs = results['hrefs']

        # Convert num_result to int
        num_result = num_result.split()[0].replace(',', '')
        num_result = int(float(num_result[:-1])) if "k" in num_result else int(num_result)
        total_num_results += num_result

        # Count hrefs
        total_hrefs += len(hrefs)
        all_hrefs += hrefs

    print(f'Library: {lib}')
    print('\tTotal number of results:', total_num_results)
    print('\tTotal number of hrefs:', total_hrefs)


Result Counts from Github (Collected Manually)
    openai: 71.7k
    langchain: 50.2k
    cohere: 5.1k
    guidance: 1.6k
    anthropic: 1.5k
    llamaindex: 91

Library: openai
	Total number of results: 33952
	Total number of hrefs: 18206
Library: langchain
	Total number of results: 28803
	Total number of hrefs: 16775
Library: cohere
	Total number of results: 5380
	Total number of hrefs: 4121
Library: llamaindex
	Total number of results: 91
	Total number of hrefs: 91
Library: guidance
	Total number of results: 1303
	Total number of hrefs: 1195
Library: anthropic
	Total number of results: 1401
	Total number of hrefs: 1328


# 📚 Downloading All Files

In [2]:
import os, requests

print('\nTotal number of hrefs:', len(all_hrefs))

all_rawFileURLs = [href.replace("blob/", "").replace("https://github.com", "https://raw.githubusercontent.com") for href in all_hrefs]

root_dir = "repos"
if not os.path.exists(root_dir):
    os.mkdir(root_dir)

for url in all_rawFileURLs:
    url_split = url.split("/")
    repo_name = "~".join(url_split[3:5])
    filename = "#".join(url_split[-1].split("#")[:-1])

    # repo path
    repo_path = os.path.join(root_dir, repo_name)
    if not os.path.exists(repo_path):
        os.mkdir(os.path.join(root_dir, repo_name))

    # file path
    file_path = os.path.join(repo_path, filename)
    if not os.path.exists(file_path):
        try:
            r = requests.get(url, timeout=1)
            # Exception thrown before file is created. 
            # So, if file exists, it's safe to assume that it's been downloaded successfully.
            if r.status_code == 200:
                with open(file_path, "w") as f:
                    f.write(r.text)
            else:
                print("Error: ", r.status_code, repo_path, filename)
        except Exception as e:
            print(e)
            print("Error: ", repo_path, filename)

print("Done")


Total number of hrefs: 41716
Error:  404 repos/christina8711~llm_cubestacking CubeStackingAssistant.py
Error:  404 repos/JerryWestrick~KnowledgeEngineer kbserver.py
Error:  404 repos/danieljbk~calhacks-spot key.py
Error:  404 repos/ken-at-kore~LG-AiBot-Prototype lg_aibot.py
Error:  404 repos/jayantr7~saatvaChatbot myEmbeddings.py
Error:  404 repos/jayantr7~saatvaChatbot myEmbeddings_copy.py
Error:  404 repos/jayantr7~saatvaChatbot prompter.py
Error:  404 repos/danieljbk~calhacks-spot speech_to_cmd.py
Error:  404 repos/danieljbk~calhacks-spot spot.py
Error:  404 repos/simular-ai~OpenAGI vision.py
Error:  404 repos/BastinFlorian~RAG-GCP firestore.py
Error:  404 repos/ken-at-kore~LG-AiBot-Prototype lg_aibot.py
Error:  404 repos/jfelipenc~NexbotChat nexbot.py
Error:  404 repos/davidlainesv~olivia-finetuning sabana_update.py
Error:  404 repos/danielgross~python-llm main.py
Done
