# 🕸️ Scraping Results

In [66]:
import json
libraries = ["openai", "langchain", "cohere", "llamaindex", "guidance", "anthropic"]

print(
"""
Result Counts from Github (Collected Manually)
    openai: 71.7k
    langchain: 50.2k
    cohere: 5.1k
    guidance: 1.6k
    anthropic: 1.5k
    llamaindex: 91
"""
)

all_hrefs = []
for lib in libraries:
    with open(f'results_{lib}.json') as f:
        data = json.load(f)

    total_num_results = 0
    total_hrefs = 0

    for charCombo, results in data.items():
        num_result = results['num_results']
        hrefs = results['hrefs']

        # Convert num_result to int
        num_result = num_result.split()[0].replace(',', '')
        num_result = int(float(num_result[:-1])) if "k" in num_result else int(num_result)
        total_num_results += num_result

        # Count hrefs
        total_hrefs += len(hrefs)
        all_hrefs += hrefs

    print(f'Library: {lib}')
    print('\tTotal number of results:', total_num_results)
    print('\tTotal number of hrefs:', total_hrefs)


Result Counts from Github (Collected Manually)
    openai: 71.7k
    langchain: 50.2k
    cohere: 5.1k
    guidance: 1.6k
    anthropic: 1.5k
    llamaindex: 91

Library: openai
	Total number of results: 33952
	Total number of hrefs: 18206
Library: langchain
	Total number of results: 28803
	Total number of hrefs: 16775
Library: cohere
	Total number of results: 5380
	Total number of hrefs: 4121
Library: llamaindex
	Total number of results: 91
	Total number of hrefs: 91
Library: guidance
	Total number of results: 1303
	Total number of hrefs: 1195
Library: anthropic
	Total number of results: 1401
	Total number of hrefs: 1328


# 📚 Downloading All Files

In [68]:
import os, requests

print('\nTotal number of hrefs:', len(all_hrefs))

all_rawFileURLs = [href.replace("blob/", "").replace("https://github.com", "https://raw.githubusercontent.com") for href in all_hrefs]
for i in range(len(all_rawFileURLs)):
    if "#" in all_rawFileURLs[i]:
        all_rawFileURLs[i] = "#".join(all_rawFileURLs[i].split("#")[:-1])

root_dir = "repos"
if not os.path.exists(root_dir):
    os.mkdir(root_dir)

count = 0
for url in all_rawFileURLs:
    url_split = url.split("/")

    # Getting repo name
    repo_name = "~".join(url_split[3:5])

    # Remove "#" from filename if it exists
    filename_addr = url_split[6:]
    filename_addr = "~".join(filename_addr)

    # repo path
    repo_path = os.path.join(root_dir, repo_name)
    if not os.path.exists(repo_path):
        os.mkdir(os.path.join(root_dir, repo_name))

    # file path
    file_path = os.path.join(repo_path, filename_addr)

    if not os.path.exists(file_path):
        try:
            r = requests.get(url, timeout=1)
            # Exception thrown before file is created. 
            # So, if file exists, it's safe to assume that it's been downloaded successfully.
            if r.status_code == 200:
                with open(file_path, "w") as f:
                    f.write(r.text)
            else:
                print("Error: ", r.status_code, repo_path, filename_addr)
        except Exception as e:
            print(e)
            print("Error: ", repo_path, filename_addr)

    if count % 100 == 0:
        print(count, end=" ")
    count += 1

print("Done")


Total number of hrefs: 41716
0 100 200 300 400 500 600 700 800 900 1000 Error:  404 repos/supertimmyh~vidsum pages~4_Summarizing_Video.py
1100 1200 1300 1400 1500 1600 1700 1800 1900 HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Read timed out. (read timeout=1)
Error:  repos/mikrl~django-llm llm~admin.py
2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 Error:  404 repos/TheseApps~Prompts gptCalls01~callGpt.py
3900 4000 4100 Error:  404 repos/rixmape~drr-chatbot cli-sample-usage.py
4200 4300 4400 4500 4600 Error:  404 repos/christina8711~llm_cubestacking CubeStackingAssistant.py
4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Read timed out. (read timeout=1)
Error:  repos/uqarni~reposite-demo functions.py
6900 7000 7100 7200 7300 7400 Error:  404 repos/Sea-Snell~LLM_RL llm_rl_scripts~chess~gpt4~gp