# GitHub: initial fetch

In [1]:
import os
import json
import shutil
import re
from pathlib import Path
import datetime
from typing import Optional
from urllib.parse import urlparse

import requests
import requests_cache
import tqdm
from pydantic import BaseModel,SecretStr

from nya.config import settings
from nya.github.client import GithubSession,GithubRateLimit

In [2]:
# Input: raw markdown of the awesome list. Output: the links extracted from it.
AWESOME_README_PATH = settings.root_dir / "data" / "raw" / "scratch" / "awesome_readme.md"
AWESOME_README_HREFS_PATH = settings.root_dir / "data" / "int" / "awesome_readme_hrefs.json"

In [3]:
# Session object used for every GitHub request in this notebook; it is built
# from the configured token and cache path.
# NOTE(review): `cache_path` presumably enables on-disk response caching —
# confirm in nya.github.client.GithubSession.
sess = GithubSession(token=settings.github_token.get_secret_value(), cache_path=settings.github_cache_path)

# Fetch names of awesome repos

In [4]:
# Each entry of the awesome list is expected to look like
#   - [title](https://github.com/owner/repo) - optional description
# The title is matched lazily and the URL stops at the first ")" or whitespace,
# so a description that contains parentheses or further links can no longer be
# swallowed into the href. The dot in "github.com" is escaped and the host must
# be followed by "/", which rules out look-alike hosts. An optional markdown
# link title ("...") after the URL is tolerated and ignored.
patt = re.compile(
    r"- \[(.+?)\]\((https://github\.com/[^)\s]+)(?:\s+\"[^\"]*\")?\)(?: - (.+))?"
)
md = AWESOME_README_PATH.read_text(encoding="utf-8")

hrefs = [
    {
        "title": el.group(1),
        "href": el.group(2),
        "description": el.group(3) or None,
        # The URL path is "/owner/repo[/...]": drop the leading empty segment
        # and pair the next two segments with their names (zip stops after
        # "repo"; a URL with only an owner yields no "repo" key).
        **dict(zip(["owner", "repo"], urlparse(el.group(2)).path.split("/")[1:])),
    }
    for el in patt.finditer(md)
]

print(f"""Found {len(hrefs)} github repos
Saving to: {AWESOME_README_HREFS_PATH}
""")
# Make sure the target directory exists so a fresh checkout can run this cell.
AWESOME_README_HREFS_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(AWESOME_README_HREFS_PATH, "w", encoding="utf-8") as fp:
    json.dump(hrefs, fp)

Found 627 github repos
Saving to: /home/michael/Documents/nya/py/data/int/awesome_readme_hrefs.json



# Get readme

In [6]:
# Fetch the readme of one sample repository to inspect the response.
res = sess.get_readme("jupyterlab", "jupyter-renderers")

In [7]:
# Display the response object (its repr shows the HTTP status).
res


<Response [200]>

In [8]:
# Parsed JSON body of the readme response ("pl" = payload).
pl = res.json()

In [23]:
import base64
from typing import Any
from pydantic import BaseModel, Field, ByteSize


# class ContentFileLinks(BaseModel):
#     url: str = Field(default=..., alias="self")
#     git: str
#     html: str


class ContentFile(BaseModel):
    """One file entry, parsed from the JSON payload of a readme request.

    Field names mirror the payload keys, except that the payload's ``type``
    key is exposed as ``typ`` (via a pydantic alias) so it does not shadow the
    ``type`` builtin.
    """

    name: str
    path: str
    sha: str
    size: ByteSize
    url: str
    html_url: str
    git_url: str
    download_url: str
    typ: str = Field(default=..., alias="type")
    content: str  # base64 str
    encoding: str

    # _links: ContentFileLinks = Field(default=..., alias="links")

    def decode_content(self) -> str:
        """Decode the base64 ``content`` payload into text.

        :returns: the decoded file contents as a UTF-8 ``str``.
        :raises ValueError: if ``encoding`` is anything other than ``"base64"``.
        """
        # Raise explicitly instead of asserting: assertions are removed when
        # Python runs with -O, which would let an unsupported encoding through.
        if self.encoding != "base64":
            raise ValueError(f"unsupported encoding: {self.encoding}")
        # The payload wraps the base64 text with newlines; b64decode discards
        # characters outside the base64 alphabet by default.
        return base64.b64decode(self.content.encode("utf-8")).decode("utf-8")

    @property
    def is_readme(self) -> bool:
        """Whether the file name starts with "readme" (case-insensitive)."""
        return self.name.lower().startswith("readme")

    def as_readme(self) -> "ReadmeFile":
        """Convert this file into a ``ReadmeFile`` carrying the decoded text.

        :raises ValueError: if the file is not a readme, or if its content
            encoding is unsupported.
        """
        if not self.is_readme:
            raise ValueError(f"File '{self.name}' is not a readme.")
        # ReadmeFile declares a subset of these fields plus ``text``.
        return ReadmeFile.parse_obj({**self.dict(), "text": self.decode_content()})


class ReadmeFile(BaseModel):
    """A readme file together with its decoded text.

    Carries the same name/path/sha/size/URL fields as ``ContentFile`` plus the
    decoded ``text``; instances are built by ``ContentFile.as_readme``.
    """

    name: str
    path: str
    sha: str
    size: ByteSize
    url: str
    html_url: str
    git_url: str
    download_url: str

    # Decoded file contents (the result of ``ContentFile.decode_content``).
    text: str


# @classmethod
# def parse_obj(cls, obj: Any) -> "ReadmeFile":

#     super().parse_obj


In [27]:
# NOTE(review): `cf` is assigned by `cf = ContentFile.parse_obj(pl)` in the cell
# listed after this one (In [25]); on a fresh kernel that cell must run first.
cf.as_readme()

ReadmeFile(name='README.md', path='README.md', sha='b2a2d59df778d96e4cf72ab7a2b6dafaa543c7b7', size=6512, url='https://api.github.com/repos/jupyterlab/jupyter-renderers/contents/README.md?ref=master', html_url='https://github.com/jupyterlab/jupyter-renderers/blob/master/README.md', git_url='https://api.github.com/repos/jupyterlab/jupyter-renderers/git/blobs/b2a2d59df778d96e4cf72ab7a2b6dafaa543c7b7', download_url='https://raw.githubusercontent.com/jupyterlab/jupyter-renderers/master/README.md', text="# Jupyter Renderers\n\n![Github Actions Status](https://github.com/jupyterlab/jupyter-renderers/workflows/CI/badge.svg)\n[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jupyterlab/jupyter-renderers/master?urlpath=lab/tree/notebooks)\n\nThis is a\n[monorepo](https://github.com/lerna/lerna#what-does-a-lerna-repo-look-like) that\nconsists of [JupyterLab](https://github.com/jupyterlab/jupyterlab) _mimerender extensions_ for common file and MIME types.\n\n## Packages\

In [25]:
# Validate the raw payload against the ContentFile model.
cf = ContentFile.parse_obj(pl)

# Display the decoded file text.
cf.decode_content()

"# Jupyter Renderers\n\n![Github Actions Status](https://github.com/jupyterlab/jupyter-renderers/workflows/CI/badge.svg)\n[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jupyterlab/jupyter-renderers/master?urlpath=lab/tree/notebooks)\n\nThis is a\n[monorepo](https://github.com/lerna/lerna#what-does-a-lerna-repo-look-like) that\nconsists of [JupyterLab](https://github.com/jupyterlab/jupyterlab) _mimerender extensions_ for common file and MIME types.\n\n## Packages\n\n| Name                                              | Mime types                                                         | File extensions                                            | Info                                                                                                                            |\n| ------------------------------------------------- | ------------------------------------------------------------------ | ---------------------------------------------------------- | --

In [14]:
# NOTE(review): both helpers are defined in a cell that appears further down
# (In [13]); on a fresh kernel that cell must run before this one.
replace_empty_str_with_none(remove_github_url_keys(pl))

{'name': 'README.md',
 'path': 'README.md',
 'sha': 'b2a2d59df778d96e4cf72ab7a2b6dafaa543c7b7',
 'size': 6512,
 'url': 'https://api.github.com/repos/jupyterlab/jupyter-renderers/contents/README.md?ref=master',
 'type': 'file',
 'content': 'IyBKdXB5dGVyIFJlbmRlcmVycwoKIVtHaXRodWIgQWN0aW9ucyBTdGF0dXNd\nKGh0dHBzOi8vZ2l0aHViLmNvbS9qdXB5dGVybGFiL2p1cHl0ZXItcmVuZGVy\nZXJzL3dvcmtmbG93cy9DSS9iYWRnZS5zdmcpClshW0JpbmRlcl0oaHR0cHM6\nLy9teWJpbmRlci5vcmcvYmFkZ2VfbG9nby5zdmcpXShodHRwczovL215Ymlu\nZGVyLm9yZy92Mi9naC9qdXB5dGVybGFiL2p1cHl0ZXItcmVuZGVyZXJzL21h\nc3Rlcj91cmxwYXRoPWxhYi90cmVlL25vdGVib29rcykKClRoaXMgaXMgYQpb\nbW9ub3JlcG9dKGh0dHBzOi8vZ2l0aHViLmNvbS9sZXJuYS9sZXJuYSN3aGF0\nLWRvZXMtYS1sZXJuYS1yZXBvLWxvb2stbGlrZSkgdGhhdApjb25zaXN0cyBv\nZiBbSnVweXRlckxhYl0oaHR0cHM6Ly9naXRodWIuY29tL2p1cHl0ZXJsYWIv\nanVweXRlcmxhYikgX21pbWVyZW5kZXIgZXh0ZW5zaW9uc18gZm9yIGNvbW1v\nbiBmaWxlIGFuZCBNSU1FIHR5cGVzLgoKIyMgUGFja2FnZXMKCnwgTmFtZSAg\nICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8\nIE1pbWUgdHlwZXMg

In [16]:
# Display the raw, unmodified payload.
pl

{'name': 'README.md',
 'path': 'README.md',
 'sha': 'b2a2d59df778d96e4cf72ab7a2b6dafaa543c7b7',
 'size': 6512,
 'url': 'https://api.github.com/repos/jupyterlab/jupyter-renderers/contents/README.md?ref=master',
 'html_url': 'https://github.com/jupyterlab/jupyter-renderers/blob/master/README.md',
 'git_url': 'https://api.github.com/repos/jupyterlab/jupyter-renderers/git/blobs/b2a2d59df778d96e4cf72ab7a2b6dafaa543c7b7',
 'download_url': 'https://raw.githubusercontent.com/jupyterlab/jupyter-renderers/master/README.md',
 'type': 'file',
 'content': 'IyBKdXB5dGVyIFJlbmRlcmVycwoKIVtHaXRodWIgQWN0aW9ucyBTdGF0dXNd\nKGh0dHBzOi8vZ2l0aHViLmNvbS9qdXB5dGVybGFiL2p1cHl0ZXItcmVuZGVy\nZXJzL3dvcmtmbG93cy9DSS9iYWRnZS5zdmcpClshW0JpbmRlcl0oaHR0cHM6\nLy9teWJpbmRlci5vcmcvYmFkZ2VfbG9nby5zdmcpXShodHRwczovL215Ymlu\nZGVyLm9yZy92Mi9naC9qdXB5dGVybGFiL2p1cHl0ZXItcmVuZGVyZXJzL21h\nc3Rlcj91cmxwYXRoPWxhYi90cmVlL25vdGVib29rcykKClRoaXMgaXMgYQpb\nbW9ub3JlcG9dKGh0dHBzOi8vZ2l0aHViLmNvbS9sZXJuYS9sZXJuYSN3aGF0\nLWRvZXMtYS1sZXJu

These functions remove cruft from the repo response:

In [13]:
def remove_github_url_keys(d: dict) -> dict:
    """Return a copy of ``d`` without any key that ends in ``"_url"``.

    Nested dicts are cleaned recursively, including dicts that sit inside
    lists. A plain ``"url"`` key does not end in ``"_url"`` and is kept.
    The input is not modified.

    :param d: mapping with string keys (e.g. a decoded JSON object).
    :returns: a new dict with the ``*_url`` keys removed at every level.
    """

    def _clean(value):
        # Recurse through containers so nested objects are stripped as well.
        if isinstance(value, dict):
            return remove_github_url_keys(value)
        if isinstance(value, list):
            return [_clean(item) for item in value]
        return value

    return {k: _clean(v) for k, v in d.items() if not k.endswith("_url")}
    
def replace_empty_str_with_none(d: dict) -> dict:
    """Return a copy of ``d`` in which every empty-string value is ``None``.

    Nested dicts are processed recursively, including dicts that sit inside
    lists. Scalar items of a list are left untouched; only dict values are
    normalised. The input is not modified.

    :param d: mapping to normalise (e.g. a decoded JSON object).
    :returns: a new dict with ``''`` values replaced by ``None``.
    """

    def _descend(item):
        # Walk lists only to reach nested dicts; plain items stay as they are.
        if isinstance(item, dict):
            return replace_empty_str_with_none(item)
        if isinstance(item, list):
            return [_descend(sub) for sub in item]
        return item

    def _normalize(value):
        if isinstance(value, (dict, list)):
            return _descend(value)
        return value if value != "" else None

    return {k: _normalize(v) for k, v in d.items()}
    
# replace_empty_str_with_none(remove_github_url_keys(out))

In [2]:
# NOTE(review): stray import outside the notebook's top import cell; move it
# there if tenacity ends up being used.
import tenacity

# Display the module object (its repr shows where it is installed).
tenacity

<module 'tenacity' from '/home/michael/Documents/nya/py/.venv/lib/python3.11/site-packages/tenacity/__init__.py'>

In [None]:
# NOTE(review): cell has no execution count and looks like an abandoned
# attribute lookup — candidate for deletion.
tenacity.re