# Search notebooks on github
We search GitHub using the queries in `collected_queries` and `query_combinations` to get the number of matching repos for each query, and then select the queries that have 1-100 repos. 

## Web scraping
We use web scraping to extract the number of repos from GitHub, 
because the PyGithub API returns outdated search results. 

However, this approach is fragile: after roughly every 10 requests GitHub rate-limits the scraper, so the program either has to wait out the rate limit or be re-run. 

In [7]:
import os
import requests
from bs4 import BeautifulSoup
import json
import time
import re
import csv
import sys

In [17]:
# Input: text file with one search query per line.
COLLECTED_QUERIES_FILE = '../../data/query/questions/collected_queries.txt'
# Output: CSV with a `query,repo_count` header holding the scraped counts for the queries above.
COLLECTED_QUERIES_RESULT_FILE = 'statistics/github_repos_collected_queries.csv'

# Same input/output pair, but for the combined queries.
COLLECTED_QUERIES_COMBINATIONS_FILE = '../../data/query/questions/collected_queries_combinations.txt'
COLLECTED_QUERIES_COMBINATIONS_RESULT_FILE = 'statistics/github_repos_combinations.csv'

# Directory holding one JSON file per query with the repo URLs found for it.
GITHUB_REPOS_PATH = "github_repos"
# Output: text file with one deduplicated repo URL per line.
REPO_LINKS_FILE = "repo_links/collected_queries_repo_urls.txt"
# Output: text file with the queries whose repo count is within 1-100.
FILTERED_QUERY_FILE = 'statistics/queries_repo_1_100.txt'

### Get the number of repos

In [40]:
def deduplicate_txt(file):
    """Remove duplicate lines from a text file in place.

    The first occurrence of every line is kept and the original order is
    preserved, so repeated runs produce the same file. Lines are compared
    without their trailing newline, which means a final line that lacks a
    newline is still recognised as a duplicate and can never be glued to
    another line when the file is rewritten.

    Args:
        file: Path of the text file to rewrite.
    """
    with open(file, "r") as f:
        lines = [line.rstrip("\n") for line in f]

    # dict.fromkeys keeps insertion order, unlike set().
    unique_lines = list(dict.fromkeys(lines))

    with open(file, "w") as f:
        f.writelines(line + "\n" for line in unique_lines)

In [18]:
def check_row_exists(file_path, query):
    """Tell whether *query* already has a recorded repo count in the CSV.

    Args:
        file_path: Path of the CSV file (first column query, second column
            repo count).
        query: Query string to look up in the first column.

    Returns:
        True if the first row whose first column equals *query* has a
        non-empty second column. False if that row has no count, if no row
        matches, or if the file does not exist.
    """
    try:
        with open(file_path, 'r', newline='') as f:
            for row in csv.reader(f):
                # Blank lines are parsed as [] and must not be indexed.
                if row and row[0] == query:
                    # A row without a count column counts as "not done yet".
                    return len(row) > 1 and row[1] != ''
    except FileNotFoundError:
        # No result file yet means nothing has been recorded.
        return False

    # The query does not appear in the file.
    return False


def update_csv(file_path, query, repo_count):
    """Insert or update the repo count stored for *query* in a CSV file.

    If the file does not exist it is created with a ``query,repo_count``
    header. If a row for *query* exists, its count is overwritten (only the
    first matching row is touched) and the whole file is rewritten.
    Otherwise a new row is appended.

    Args:
        file_path: Path of the CSV file.
        query: Query string, stored in the first column.
        repo_count: Count to store, or None to leave the file's rows as is.

    Returns:
        The matching or newly written row as a list, or None when
        *repo_count* is None and no row for *query* exists.
    """
    try:
        with open(file_path, 'r', newline='') as csvfile:
            rows = list(csv.reader(csvfile))
    except FileNotFoundError:
        # Create a new CSV file with just the header.
        with open(file_path, 'w', newline='') as csvfile:
            csv.writer(csvfile).writerow(['query', 'repo_count'])
        rows = []

    for row in rows:
        # Blank lines are parsed as [] and cannot match any query.
        if not row or row[0] != query:
            continue
        if repo_count is not None:
            # A row may lack the count column entirely; add it in that case.
            if len(row) < 2:
                row.append(repo_count)
            else:
                row[1] = repo_count
            with open(file_path, 'w', newline='') as csvfile:
                csv.writer(csvfile).writerows(rows)
        return row

    # The query is not in the file yet: append it if there is a count.
    if repo_count is not None:
        row = [query, repo_count]
        with open(file_path, 'a', newline='') as csvfile:
            csv.writer(csvfile).writerow(row)
        return row

    return None



In [19]:
# Helper function to extract the number of repositories from the GitHub search results page
def extract_repo_count(html):
    """Parse a GitHub repository-search results page and return the hit count.

    Three page shapes are recognised:
      * "We couldn't find any repositories matching ..."  -> 0
      * "<N> repository results" inside the header's <h3>
      * "Showing <N> available repository results" inside a
        ``span.v-align-middle`` of the header

    Args:
        html: Raw HTML (str or bytes) of the search results page.

    Returns:
        The number of repositories as an int, or -1 when the page layout is
        not recognised.
    """
    soup = BeautifulSoup(html, "html.parser")

    # No result: "We couldn't find any repositories matching..."
    for h3_tag in soup.find_all('h3'):
        if "find any repositories matching" in h3_tag.get_text():
            return 0

    results_count = soup.find('div', {'class': 'd-flex flex-column flex-md-row flex-justify-between border-bottom pb-3 position-relative'})
    if results_count is None:
        # Header container missing: unknown layout (e.g. an error page).
        return -1

    # The number is captured as a group so it can be read back directly;
    # taking the first word of the match would yield "Showing" in result 2.
    # `\d[\d,]*` accepts both "1,234" and "1234".
    number = r'(\d[\d,]*)'

    heading = results_count.find('h3')
    if heading is not None:
        # Result 1: "XXX repository results"
        string = heading.get_text().strip()
        # `results?` also accepts the singular form, which is presumably
        # what GitHub renders for exactly one hit (NOTE(review): confirm).
        pattern = number + r'\s+repository results?'
    else:
        # Result 2: "Showing XXX available repository results"
        span = results_count.find("span", {"class": "v-align-middle"})
        if span is None:
            return -1
        string = re.sub(r'\s+', ' ', span.get_text().strip())
        pattern = r'Showing ' + number + r'\s+available repository results?'

    match = re.search(pattern, string)
    if match is None:
        return -1
    return int(match.group(1).replace(',', ''))

# Helper function to get the number of repositories for a given query
def get_repo_count(query):
    """Scrape GitHub search for the number of Jupyter Notebook repos matching *query*.

    The query is sent through ``params`` so that requests URL-encodes it;
    characters such as ``&``, ``#`` or ``+`` inside a query would otherwise
    corrupt the search URL.

    Args:
        query: Free-text search query.

    Returns:
        The repository count (int) on success.
        -1 if the page could not be parsed or the server answered with an
        error other than 429.
        None if the request was rate limited (HTTP 429) and either no usable
        ``Retry-After`` header was sent or the bounded number of retries was
        used up. The calling cell treats None as "try again".
    """
    params = {
        "q": f'{query} language:"Jupyter Notebook"',
        "type": "Repositories",
        "s": "stars",
        "o": "desc",
    }
    # Bounded retries replace the former unbounded recursion on HTTP 429.
    max_rate_limit_retries = 5

    for _ in range(max_rate_limit_retries):
        time.sleep(2)
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get("https://github.com/search", params=params, timeout=30)

        if response.ok:
            try:
                return extract_repo_count(response.content)
            except Exception as e:
                print(e)
                return -1

        if response.status_code != 429:
            return -1

        # Rate limited: wait for the number of seconds the server recommends.
        retry_after = response.headers.get('Retry-After')
        try:
            retry_after_secs = int(retry_after)
        except (TypeError, ValueError):
            # Header missing or not a plain number of seconds.
            return None
        print(f'Rate limit exceeded. Waiting for {retry_after_secs} seconds before retrying...')
        time.sleep(retry_after_secs)

    return None


In [20]:
# Sanity check: fetch the repo count for a single sample query.
query = 'Post processing'
get_repo_count(query)

603

In [38]:
# ------ Get the number of repos ------
# Each pair is (text file with one query per line, CSV file receiving the counts).
files = [(COLLECTED_QUERIES_FILE, COLLECTED_QUERIES_RESULT_FILE),
         (COLLECTED_QUERIES_COMBINATIONS_FILE, COLLECTED_QUERIES_COMBINATIONS_RESULT_FILE)]
for input_file, output_file in files:
    print(f"--- {input_file} ---")
    if not os.path.isfile(output_file):
        # If the result file doesn't exist, create it (and its directory)
        # with the header row.
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
        with open(output_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['query', 'repo_count'])

    # Read the queries, skipping blank lines so that no empty query is searched.
    with open(input_file) as f:
        queries = [line.strip() for line in f if line.strip()]

    # Get the repository counts for each query and store the results in a CSV file
    for query in queries:
        # Skip queries that already have a recorded count.
        # NOTE: a recorded -1 (failed lookup) is non-empty, so it is skipped too.
        if check_row_exists(output_file, query):
            continue

        count = None
        # get_repo_count returns None when it was rate limited; retry up to 10 times.
        for _ in range(10):
            count = get_repo_count(query)
            if count is not None:
                break
        else:
            print("Still returned None after 10 loops. Exiting program.")
            sys.exit()

        print([query, count])
        update_csv(output_file, query, count)

--- ../../data/query/questions/collected_queries.txt ---
--- ../../data/query/questions/collected_queries_combinations.txt ---
['Attention-based multiple instance learning pytorch', -1]
['Attention-based multiple instance learning whole slide image diagnosis', 0]
['edge computing', 102]
['meta learning', 463]


### Save the queries with # repos [1-100]

In [41]:
def extract_queries(file_path, output_file):
    """Append the queries with 1 to 100 repos from a result CSV to a text file.

    Reads *file_path* as a CSV with ``query`` and ``repo_count`` columns and
    appends, one per line, every query whose count is a digit string between
    1 and 100 inclusive. Afterwards the output file is passed to
    ``deduplicate_txt``.

    Args:
        file_path: Path of the CSV file with the scraped counts.
        output_file: Path of the text file the selected queries are appended to.
    """
    with open(output_file, 'a') as sink, open(file_path, 'r', newline='') as source:
        for record in csv.DictReader(source):
            count_text = record['repo_count']
            # Non-numeric entries (empty, "-1", ...) are ignored.
            if not count_text.isdigit():
                continue
            if 1 <= int(count_text) <= 100:
                sink.write(record['query'] + '\n')

    deduplicate_txt(output_file)

# Collect the qualifying queries from both result files into one filtered list.
extract_queries(COLLECTED_QUERIES_RESULT_FILE, FILTERED_QUERY_FILE)
extract_queries(COLLECTED_QUERIES_COMBINATIONS_RESULT_FILE, FILTERED_QUERY_FILE)

### Get the repo links 

In [3]:
def check_record_exists(file_path):
    """Tell whether a JSON record file exists and lists at least one repo URL.

    Args:
        file_path: Path of a JSON file containing a ``repo_urls`` list.

    Returns:
        False if the file is missing or its ``repo_urls`` list is empty,
        True otherwise.
    """
    if os.path.isfile(file_path):
        with open(file_path, "r") as handle:
            record = json.load(handle)
            return len(record["repo_urls"]) != 0
    # The file doesn't exist
    return False


In [5]:
def extract_repo_links(html):
    """Return the repository URLs found on a GitHub search results page.

    Every ``<a>`` element carrying the ``v-align-middle`` CSS class is taken
    to be a repository link; its ``href`` is prefixed with
    ``https://github.com``.

    Args:
        html: Raw HTML (str or bytes) of the search results page.

    Returns:
        A list of repository URLs, in page order.
    """
    parsed = BeautifulSoup(html, "html.parser")
    anchors = parsed.find_all("a", class_="v-align-middle")
    return ["https://github.com" + anchor["href"] for anchor in anchors]
        
def get_repo_links(query):
    """Scrape the first three GitHub search pages for repos matching *query*.

    Only Jupyter Notebook repositories are searched, sorted by stars in
    descending order. The query is sent through ``params`` so that requests
    URL-encodes it. Paging stops early when a page yields fewer than 10 links.

    A rate-limited page (HTTP 429) is retried in place, a bounded number of
    times, instead of being skipped or restarting the whole search.

    Args:
        query: Free-text search query.

    Returns:
        A list of repository URLs (possibly empty) on success, or -1 if a
        page could not be fetched or parsed.
    """
    search_query = f'{query} language:"Jupyter Notebook"'
    max_rate_limit_retries = 5
    # Wait used when the server sends no usable Retry-After header.
    default_backoff_secs = 60

    # Initialize an empty list to store the repository URLs
    repo_urls = []
    # Loop over the top 3 pages of search results
    for page in range(1, 4):
        params = {
            "q": search_query,
            "type": "Repositories",
            "s": "stars",
            "o": "desc",
            "p": page,
        }

        for _ in range(max_rate_limit_retries + 1):
            time.sleep(2)
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get("https://github.com/search", params=params, timeout=30)
            if response.status_code != 429:
                break
            # Rate limited: wait for the recommended number of seconds, then
            # retry this same page.
            try:
                retry_after_secs = int(response.headers.get('Retry-After'))
            except (TypeError, ValueError):
                retry_after_secs = default_backoff_secs
            print(f'Rate limit exceeded. Waiting for {retry_after_secs} seconds before retrying...')
            time.sleep(retry_after_secs)
        else:
            # Still rate limited after all retries.
            return -1

        if not response.ok:
            return -1

        try:
            repo_links = extract_repo_links(response.content)
        except Exception as e:
            print(e)
            return -1

        repo_urls.extend(repo_links)
        # A short page means there are no further results.
        if len(repo_links) < 10:
            break
    return repo_urls
    

In [8]:
# Sanity check: fetch the repo links for a single sample query.
query = "construct the graph"
get_repo_links(query)

['https://github.com/millingab/multiview-stereo',
 'https://github.com/jacobastern/graph-construction-semi-supervised',
 'https://github.com/y-richie-y/badgraphs',
 'https://github.com/YEZIQM/Node2Vec_Subject_similarity',
 'https://github.com/zzp1012/neo4j-enterprise-KG',
 'https://github.com/hajarmerbouh/Cybersecurity-Knowledge-graph',
 'https://github.com/yuhaozhang94/changi-airport-taxiway-planning',
 'https://github.com/jiayiwus1x/physicist-net',
 'https://github.com/cadovid/hn-data-pipeline',
 'https://github.com/maxwellmckee/Convex_Clusters',
 'https://github.com/FadedIllusions/AAE_Notebook_013_ProbabilisticRoadmap',
 'https://github.com/arnavanand7/Market-Value-of-Footballers',
 'https://github.com/think-high/Text_to_Knowledge_Graph',
 'https://github.com/raminetinati/Knowledge-Graph',
 'https://github.com/nkasmanoff/nasa-eo-knowledge-graph',
 'https://github.com/DavidBraslow/Our-Class-Graph',
 'https://github.com/baller609/Directed-Acyclic-Graph',
 'https://github.com/mehdirost

In [12]:
# ------ Download the results in the first 3 pages ------
# Read the filtered queries, skipping blank lines so that no empty query is
# searched or turned into an empty file name later on.
with open(FILTERED_QUERY_FILE) as f:
    queries = [line.strip() for line in f if line.strip()]
len(queries)

65

In [9]:
# Get the repository links for each query and store the results in a json file
os.makedirs(GITHUB_REPOS_PATH, exist_ok=True)
for query in queries:
    filename = os.path.join(GITHUB_REPOS_PATH, query.replace(" ", "_").replace("/", "_") + ".json")
    # Queries that were already downloaded are skipped.
    if os.path.isfile(filename):
        continue

    for _ in range(10):
        repo_urls = get_repo_links(query)
        # get_repo_links returns -1 on failure, so check the type before
        # relying on the result being a non-empty list.
        if isinstance(repo_urls, list) and repo_urls:
            break
    else:
        print("Still returned None after 10 loops. Exiting program.")
        sys.exit()

    # Define the output dictionary with the query and repo_urls fields
    output_dict = {
        "query": query,
        "repo_urls": repo_urls
    }
    print(f"{query}: {len(repo_urls)}")
    with open(filename, "w") as f:
        json.dump(output_dict, f)

In [11]:
# Extract repo links
repo_urls = []

# List all the files in the directory
files = os.listdir(GITHUB_REPOS_PATH)

# Iterate over the files and read all the JSON files
for file in files:
    if file.endswith(".json"):
        file_path = os.path.join(GITHUB_REPOS_PATH, file)
        with open(file_path, "r") as f:
            data = json.load(f)
            repo_urls.extend(data["repo_urls"])

print(f"Number of repos: {len(repo_urls)}")
# Deduplicate the repo URLs; sorting makes the output file identical
# from run to run (plain set order is not stable).
repo_urls = sorted(set(repo_urls))

print(f"Number of unique repos: {len(repo_urls)}")

# Write all the repo URLs to a text file, creating its directory if needed
links_dir = os.path.dirname(REPO_LINKS_FILE)
if links_dir:
    os.makedirs(links_dir, exist_ok=True)
with open(REPO_LINKS_FILE, "w") as f:
    f.writelines(url + "\n" for url in repo_urls)


Number of repos: 975
Number of unique repos: 915
