## Download notebooks from Github
Given a Github repo link, we download the Jupyter notebooks from the repo

## Use PyGithub
+ We use PyGithub API to traverse a repository and download .ipynb files from the repo. 
+ `id = hashlib.sha256(html_url)` is used to generate ID for notebook
+ We use a `notebook_metadata` to record the id and some other metadata for the notebooks
+ We use `repo_download_log` to keep reack of the repos being downloaded, making it possible to consume downloading and skip the notebook already exists. 


In [2]:
import json
import os
import requests
from urllib.parse import urlparse
from github import Github
from github.GithubException import RateLimitExceededException
import hashlib
import csv
import datetime
import pytz


In [3]:
REPO_URL_FILE_JUPYTER_WIKI = 'repo_links/jupyter_wiki/github_links.txt'
REPO_URL_FILE_COLLECTED_QUERIES = 'repo_links/collected_queries_repo_urls.txt'
# METADATA_LOG_JUPYTER_WIKI = 'download_logs/notebook_metadata_jupyter_wiki.csv'
# METADATA_LOG_COLLECTED_QUERIES = 'download_logs/notebook_metadata_collected_queries.csv'
REPO_DOWNLOAD_LOG = 'download_logs/repo_download_log.csv'
NOTEBOOK_CONTENT_PATH = 'notebook_contents'

In [4]:
def add_row_csv(file_path, row):
    with open(file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(row)
    return row

In [11]:
def check_record_in_csv(file_path, search_dict, search_field):
    ''' Check the record in csv file'''
    # open the CSV file and read it into a list of dictionaries
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        data = [row for row in reader]

    # search for a record with a specific value in a field
    found = False
    for record in data:
        if record[search_field] == search_dict[search_field]:
            found = True
            break

    if found:
        return True
    else:
        return False

In [5]:
# Traverse the Github repo and download notebooks
def traverse_contents(contents, dst_dir):
    for content_file in contents:
        if content_file.type == 'dir':
            traverse_contents(repo.get_contents(content_file.path), dst_dir)
        elif content_file.name.endswith('.ipynb'):
            # Compute the SHA-256 hash of the html_url as the ID for the notebook
            notebook_id = 'NB_'+hashlib.sha256(content_file.html_url.encode('utf-8')).hexdigest()
            # print(notebook_id)
            notebook_path = os.path.join(dst_dir, notebook_id+'.ipynb')
            metadata_path = os.path.join(dst_dir, notebook_id+'.json')
            # print(path)
            
            if not os.path.exists(metadata_path):
                # Store metadata
                ipynb_info = get_ipynb_info(content_file, notebook_id)
                with open(metadata_path, 'w') as f:
                    json.dump(ipynb_info, f)
                    print(f"Metadata: {ipynb_info['docid']}")
            
            # Download notebook file only if there is no record in the metadata log
            if not os.path.exists(notebook_path): 
                download_file(content_file, notebook_path)
                print(f"Notebook: {notebook_path}")


def download_file(content_file, path):
    response = requests.get(content_file.download_url)
    if response.status_code == 200:
        # dirname = os.path.dirname(path)
        # if dirname != '':
        #     os.makedirs(dirname, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(response.content)
    elif response.status_code == 429:
            # If we get a 429 error, wait for the recommended number of seconds and retry the request
            retry_after = response.headers.get('Retry-After')
            if retry_after:
                retry_after_secs = int(retry_after)
                print(f'Rate limit exceeded. Waiting for {retry_after_secs} seconds before retrying...')
                time.sleep(retry_after_secs)
                return download_file(content_file, path)
    else: 
        return -1
            
def get_ipynb_info(content_file, notebook_id):  
    # Extract the relevant metadata from the ContentFile object
    ipynb_info = {
        'docid': notebook_id,
        'path': content_file.path,
        'name': os.path.basename(content_file.path),
        'html_url': content_file.html_url,
        'url': content_file.url,
        'size': content_file.size,
        'sha': content_file.sha,
        'git_url': content_file.git_url,
        'download_url': content_file.download_url,
        'type': content_file.type,
        'encoding': content_file.encoding,
        'last_modified': content_file.last_modified
    }
    return ipynb_info

In [6]:
def get_rate_limit_info(g): 
    # Get the rate limit information from the last response
    rate_limit = g.get_rate_limit()
    rate_remaining = rate_limit.core.remaining
    rate_reset_utc = rate_limit.core.reset

    # Convert UTC time to local time zone
    local_tz = pytz.timezone('Europe/Amsterdam')  # Replace with your local time zone
    rate_reset = rate_reset_utc.replace(tzinfo=pytz.utc).astimezone(local_tz)

    # Print the rate limit information
    print(f"Rate limit: {rate_limit}")
    print(f"Rate remaining: {rate_remaining}")
    print(f"Rate reset time: {rate_reset}")


In [7]:
# Get github access token
with open('github_token.json', 'r') as f:
    # Load the JSON data into a dictionary
    data = json.load(f)

# Access the values of the 'user' and 'token' keys
user = data['user']
token = data['token']

# Provide your access token or username and password
g = Github(token)

In [8]:
# Get repo URLs
from urllib.parse import urlparse
import requests

with open(REPO_URL_FILE_COLLECTED_QUERIES, 'r') as f:
    urls = f.readlines()
    repo_urls = [url.strip() for url in urls]
print(repo_urls[:5])
print(len(repo_urls))

['https://github.com/ericnjogu/secure-ai-deep-learning-v2-pytorch', 'https://github.com/eastmountyxz/ImageProcessing-Python', 'https://github.com/y-richie-y/badgraphs', 'https://github.com/stes/saliency', 'https://github.com/brechtvandervliet/ResistancePoisoningFederatedMalwareClassifier']
915


In [14]:
# Retrieve the repository
for i, repo_url in enumerate(repo_urls): 
    parsed_url = urlparse(repo_url)
    path_parts = parsed_url.path.split('/')
    user = path_parts[1]
    repo = path_parts[2]

    print(f'[{i}] {user}/{repo}')
    record = {
        'repo_url': repo_url, 
        'downloaded': True
    }
    if check_record_in_csv(REPO_DOWNLOAD_LOG, record, 'repo_url'): 
        # print(f'{repo_url} already downloaded!')
        continue
    else: 
        try: 
            repo = g.get_repo(f"{user}/{repo}")
            contents = repo.get_contents('')
            traverse_contents(contents, NOTEBOOK_CONTENT_PATH)
            add_row_csv(REPO_DOWNLOAD_LOG, [repo_url, True])
        except RateLimitExceededException as e: 
            print(e)
            get_rate_limit_info(g) 
            raise Exception

[0] ericnjogu/secure-ai-deep-learning-v2-pytorch
[1] eastmountyxz/ImageProcessing-Python
[2] y-richie-y/badgraphs
[3] stes/saliency
[4] brechtvandervliet/ResistancePoisoningFederatedMalwareClassifier
[5] ZoyaGul/Histogram-Equalization-with-RGB
[6] cansuyalcinn/lung-ct-registration-challenge
[7] adi112100/-Understanding-and-Predicting-Property-Maintenance-Fines
[8] Western-OC2-Lab/SB-PdM-a-tool-for-predictive-maintenance-of-rolling-bearings-based-on-limited-labeled-data
[9] ElsaScola/DeepNeuralNetworks-BrainComputerInterfaces-analysis
[10] WeeYL/capstone
[11] fherreralab/organic-microcavity-spectra
[12] ybliu9/Cancer_object_detection_dnn
[13] jasonhernacki/poincare-embeddings
[14] GrindSC/C-code-generation-rnn
[15] AmeenAshad19/Saliency-Detection-Using-DMD-and-its-Variants
[16] natandrade/Tutorial-Medical-Image-Registration
[17] simonegiancola09/university_test_rankings
[18] leighirving/sales-conversion-optimization
[19] jugal-krishna/Debiasing
[20] Mithrillion/EarthEngineDataProcessing

Exception: 

In [16]:
i

414

In [15]:
# Count the number of notebooks
def count_ipynb_files(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.ipynb'):
                count += 1
    return count

# Example usage
directory = 'notebook_contents'
count = count_ipynb_files(directory)
print(f'Number of .ipynb files in {directory}: {count}')


Number of .ipynb files in notebook_contents: 5618
