In [1]:
import os
from typing import Optional

# GitHub personal access token used for the "Authorization: Bearer ..." header.
# It is read from the GITHUB_ACCESS_TOKEN environment variable so the secret
# never has to be saved inside the notebook; when the variable is not set the
# previous placeholder value is kept.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "invalid")

In [2]:
from dataclasses import dataclass
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport


@dataclass
class QueryParams:
    """Optional knobs that shape the GitHub user-search query.

    Each field defaults to ``None``; a field left as ``None`` contributes
    nothing to the text produced by the methods below.
    """

    # Page size; rendered as the ``first:`` argument.
    limit: Optional[int] = None
    # Repository-count bound; rendered as ``repos:<N``, i.e. an exclusive
    # UPPER bound on the repository count, despite the "min" in the name.
    min_repos: Optional[int] = None
    # Cursor to continue after; rendered as the ``after:`` argument.
    prev_cursor: Optional[str] = None

    def repositories_query(self):
        """Return the ``repos:<N`` qualifier, or ``""`` when no bound is set."""
        return "" if self.min_repos is None else f"repos:<{self.min_repos}"

    def pagination_params(self):
        """Yield the pagination arguments that are set: ``first``, then ``after``."""
        if self.limit is not None:
            yield f"first:{self.limit}"
        if self.prev_cursor is None:
            return
        # The cursor is wrapped in double quotes so it forms a string argument.
        yield f'after:"{self.prev_cursor}"'


def get_query(params: QueryParams):
    """Build the GraphQL document for one page of the user search.

    The search string asks for users located in Korea, sorted by repository
    count in descending order. ``params.repositories_query()`` is spliced into
    that search string, and the comma-joined ``params.pagination_params()``
    become extra arguments of the ``search`` field.

    Args:
        params: Supplies the repository qualifier and pagination arguments.

    Returns:
        Whatever ``gql`` returns for the rendered query text.
    """
    pagination_str = ", ".join(params.pagination_params())
    repos_qualifier = params.repositories_query()

    # Two ``%s`` slots: the repository qualifier inside the search string,
    # then the pagination arguments after it.
    query_template = """
        query {
            search(type: USER, query:"type:user sort:repositories-desc %s location:Korea", %s) {
                userCount
                edges {
                    node {
                        __typename
                        ... on User {
                            login
                            avatarUrl
                            name
                            company
                            location
                            repositories {
                                totalCount
                            }
                        }
                    }
                    cursor
                }
            }
        }
        """
    return gql(query_template % (repos_qualifier, pagination_str))


In [3]:
import asyncio
import json
import time
from datetime import datetime


async def fetch(sleep_secs: Optional[int] = None):
    """Crawl the user search and dump the results to JSON files.

    A first one-item query reads ``userCount``. The search is then walked in
    segments: each segment pages through the results 100 at a time until the
    API returns no more edges, then the repository bound (``min_repos``) is
    set to the repository count of the last user seen and paging restarts from
    the beginning with the tighter ``repos:<N`` filter.

    Every non-empty segment is written to ``{logical_date}_{k}.json`` in the
    current directory, where ``k`` counts segments starting at 1.

    Args:
        sleep_secs: Optional pause, in seconds, after each page request.
            ``None`` (the default) disables the pause.

    Returns:
        None. Results are written to files and progress is printed.
    """
    logical_date = datetime.now().isoformat()

    transport = AIOHTTPTransport(
        "https://api.github.com/graphql",
        headers=dict(Authorization=f"Bearer {GITHUB_ACCESS_TOKEN}"),
    )
    async with Client(transport=transport, fetch_schema_from_transport=True) as client:
        query = get_query(QueryParams(limit=1))
        response = await client.execute(query)
        total_users = response["search"]["userCount"]

    print(f"Fetching {total_users} users...")
    params = QueryParams(limit=100)
    users = []
    segment_index = 0
    while len(users) < total_users:
        segment_start = len(users)
        async with Client(transport=transport, fetch_schema_from_transport=True) as client:
            while True:
                query = get_query(params)
                response = await client.execute(query)

                edges = response["search"]["edges"]
                if not edges:
                    break

                params.prev_cursor = edges[-1]["cursor"]
                users += edges
                print(params)

                if sleep_secs is not None:
                    # Non-blocking pause: time.sleep() here would stall the
                    # whole event loop, not just this coroutine.
                    await asyncio.sleep(sleep_secs)

        segment = users[segment_start:]
        if not segment:
            # Nothing new came back, so the bound cannot be tightened any
            # further. Stop instead of re-issuing the same query forever
            # (and avoid indexing into an empty list below).
            print(f"No more results; stopping at {len(users)} of {total_users} users")
            break

        # Persist every segment, including short ones, so that no fetched user
        # is left unsaved. For full 1000-user segments the file index equals
        # len(users) // 1000.
        segment_index += 1
        with open(f"{logical_date}_{segment_index}.json", "w") as outfile:
            json.dump(segment, outfile)

        # Restart paging with a tighter repository bound.
        # NOTE(review): the filter is strict (`repos:<N`), so users with exactly
        # N repositories that were not reached in this segment are skipped.
        params.prev_cursor = None
        params.min_repos = segment[-1]["node"]["repositories"]["totalCount"]

    print(f"Fetched {len(users)} users")


# Run the crawl with no pause between page requests (sleep_secs defaults to
# None). The bare top-level `await` is not valid in a plain Python script; it
# works here because the notebook kernel executes the cell inside an event loop.
await fetch()


Fetching 49457 users...
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjEwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjIwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjMwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjQwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjUwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjYwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjcwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjgwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjkwMA==')
QueryParams(limit=100, min_repos=None, prev_cursor='Y3Vyc29yOjEwMDA=')
QueryParams(limit=100, min_repos=78, prev_cursor='Y3Vyc29yOjEwMA==')
QueryParams(limit=100, min_repos=78, prev_cursor='Y3Vyc29yOjIwMA==')
QueryParams(limit=100, min_repos=78, prev_cursor='Y3Vyc29yOjMwMA==')
QueryParams(limit=100, min_repos=78, prev_cursor='Y3Vyc29yO