# Getting data

Since we plan to analyze a few repositories in this workshop, let's download them.

We'll first get metadata about a user or organization using the GitHub API, and then download the repositories that interest us the most.

## Retrieving metadata about a user/organization

We read the number of result pages from the `Link` header of the API response and fetch each page in turn. We filter out forks to focus on original repositories.

In [None]:
from logging import getLogger
from os import makedirs
from os.path import join as path_join

from coloredlogs import install as coloredlogs_install


# Colored console logging for the whole notebook.
coloredlogs_install()
logger = getLogger("downloader")


# Directory receiving the cloned repositories, and the JSON file
# holding their metadata.
git_data_dir = path_join("/devfest", "repos", "git-data")
repos_json = path_join(git_data_dir, "repos.json")
makedirs(git_data_dir, exist_ok=True)

To use the GitHub API, we need a token. Please create one in your [GitHub account settings](https://github.com/settings/tokens) (the basic permissions are fine), and fill it in here:

In [None]:
# Paste your GitHub personal access token between the quotes.
# Do not commit or share the notebook with the token filled in.
TOKEN = ""

In [None]:
from json import dump as json_dump
from operator import itemgetter
from re import compile as re_compile
from typing import Any, Dict, List, Optional

import requests
from tqdm import tqdm_notebook as tqdm


# Patterns extracting pagination targets from a ``Link`` response header:
# ``next_pattern`` captures the URL tagged rel="next", ``last_pattern``
# captures the page number of the URL tagged rel="last".
# Raw strings keep ``\?`` and ``\d`` as regex escapes instead of (invalid)
# string escapes.
next_pattern = re_compile(r'<(https://api.github.com/user/[^/]+/repos\?[^>]*page=\d+[^>]*)>; rel="next"')
last_pattern = re_compile(r'<https://api.github.com/user/[^/]+/repos\?[^>]*page=(\d+)[^>]*>; rel="last"')


def parse_next(link_header: str) -> Optional[str]:
    """Extract the URL of the next result page from a ``Link`` header.

    Args:
        link_header: Value of the ``Link`` HTTP response header.

    Returns:
        The URL captured by ``next_pattern``, or ``None`` when the header
        contains nothing matching that pattern.
    """
    found = next_pattern.search(link_header)
    if found is None:
        return None
    return found.group(1)


def parse_last(link_header: str) -> Optional[int]:
    """Extract the number of the last result page from a ``Link`` header.

    Args:
        link_header: Value of the ``Link`` HTTP response header.

    Returns:
        The page number captured by ``last_pattern`` as an integer, or
        ``None`` when the header contains nothing matching that pattern.
    """
    found = last_pattern.search(link_header)
    if found is None:
        return None
    return int(found.group(1))


def list_repositories(user: str,
                      token: str,
                      max_size_mb: Optional[int] = None,
                      repos_number: Optional[int] = None
                     ) -> List[Dict[str, Any]]:
    """List the non-fork repositories of a GitHub user or organization.

    Args:
        user: GitHub login whose repositories are listed.
        token: Personal access token sent in the ``Authorization`` header.
        max_size_mb: If not ``None``, keep only repositories whose reported
            size is at most this many megabytes.
        repos_number: If not ``None``, keep only this many repositories,
            chosen by descending star count.

    Returns:
        One dict per kept repository with the keys ``name``, ``branch``,
        ``clone_url``, ``size`` and ``stars``, plus ``sha`` (the commit the
        default branch points to). Repositories for which the commit lookup
        answers 409 (empty repository) are returned without a ``sha`` key.

    Raises:
        requests.HTTPError: If any API call returns an error status
            (other than the 409 described above).
        ValueError: If a ``Link`` header is present but the last page
            number cannot be extracted from it.
    """
    repos_list_headers = dict(Authorization="token %s" % token)
    repos_url = "https://api.github.com/users/%s/repos" % user

    def get_page_url(page: int) -> str:
        return "%s?page=%d" % (repos_url, page)

    logger.info("Retrieving repos list for user %s", user)
    first_response = requests.get(repos_url, headers=repos_list_headers)
    # Fail early with a clear HTTP error (bad token, unknown user, ...)
    # instead of a confusing failure while reading the headers.
    first_response.raise_for_status()

    link_header = first_response.headers.get("Link")
    if link_header is None:
        # No pagination information: everything fits on a single page.
        total_pages = 1
    else:
        total_pages = parse_last(link_header)
        if total_pages is None:
            raise ValueError(
                "Could not find the last page in Link header: %r"
                % link_header)

    repos = []
    for page in tqdm(range(1, total_pages + 1)):
        if page == 1:
            # The request above already returned the first page.
            response = first_response
        else:
            response = requests.get(get_page_url(page),
                                    headers=repos_list_headers)
            response.raise_for_status()
        # Forks are skipped to focus on original repositories.
        repos.extend(
            dict(
                name=repo["name"],
                branch=repo["default_branch"],
                clone_url=repo["clone_url"],
                size=repo["size"],
                stars=repo["stargazers_count"],
            )
            for repo in response.json()
            if not repo["fork"]
        )

    if max_size_mb is not None:
        logger.info(
            "Filtering to keep only repositories under %.2f MB",
            max_size_mb
        )
        # The threshold is converted with * 1024, i.e. "size" is treated
        # as a number of kilobytes.
        repos = [repo for repo in repos
                 if repo["size"] <= max_size_mb * 1024]

    if repos_number is not None:
        logger.info(
            "Filtering to keep only the %d most popular repositories",
            repos_number
        )
        repos = sorted(repos,
                       key=itemgetter("stars"),
                       reverse=True)[:repos_number]

    def get_repo_sha_url(user: str, repo: str, branch: str) -> str:
        return "https://api.github.com/repos/%s/%s/commits/%s" % (
            user,
            repo,
            branch
        )

    logger.info("Getting SHA1 for each repository")
    repo_sha_headers = dict(
        Authorization="token %s" % token,
        # Media type asking for the bare commit SHA as the response body.
        Accept="application/vnd.github.VERSION.sha"
    )
    for repo in tqdm(repos):
        request_sha = requests.get(
            get_repo_sha_url(user, repo["name"], repo["branch"]),
            headers=repo_sha_headers)
        if request_sha.status_code == 409:
            # Repo is empty: there is no commit to record.
            logger.warning("Repository %s is empty, no SHA1 recorded",
                           repo["name"])
            continue
        request_sha.raise_for_status()
        repo["sha"] = request_sha.text
    return repos


# Fetch everything before opening the output file, so that a failed API
# call does not leave an empty or truncated repos.json behind.
repositories = list_repositories(
    user="apache",
    # Generate a personal access token here
    # https://github.com/settings/tokens
    token=TOKEN,
    max_size_mb=50,
    repos_number=50)

with open(repos_json, "w", encoding="utf8") as fh:
    json_dump(repositories, fh)

In [None]:
from json import load as json_load
from multiprocessing.pool import ThreadPool


# Size of the thread pool used to clone repositories concurrently.
PARALLEL_DOWNLOADS = 10


def clone_repo(name: str, clone_url: str, sha: str) -> None:
    """Clone one repository into ``git_data_dir`` and check out a commit.

    The work is best effort: a failing git command is logged and the
    function returns normally, so one bad repository does not abort the
    other downloads.

    Args:
        name: Directory name of the clone, created inside ``git_data_dir``.
        clone_url: URL passed to ``git clone``.
        sha: Commit passed to ``git checkout`` inside the fresh clone.
    """
    from subprocess import PIPE, run

    # Argument lists (no shell) keep names and URLs from being interpreted
    # as shell syntax; "--" stops git from reading them as options.
    clone = run(["git", "clone", "-q", "--", clone_url, name],
                cwd=git_data_dir,
                stderr=PIPE,
                universal_newlines=True)
    if clone.returncode != 0:
        # No checkout is attempted when the clone itself failed.
        logger.warning("Could not clone %s: %s", name, clone.stderr.strip())
        return

    checkout = run(["git", "checkout", "-q", sha],
                   cwd=path_join(git_data_dir, name),
                   stderr=PIPE,
                   universal_newlines=True)
    if checkout.returncode != 0:
        logger.warning("Could not check out %s in %s: %s",
                       sha, name, checkout.stderr.strip())


# Read the metadata first, so the file is closed before downloads start.
with open(repos_json, encoding="utf8") as fh:
    repos = json_load(fh)

# Empty repositories are stored without a "sha" entry: there is nothing
# to check out, so they are left out instead of raising a KeyError.
clone_args = [(repo["name"], repo["clone_url"], repo["sha"])
              for repo in repos
              if "sha" in repo]

with ThreadPool(PARALLEL_DOWNLOADS) as pool:
    pool.starmap(clone_repo, clone_args)