# Github: initial fetch

In [4]:
import os
import json
import shutil
import re
from pathlib import Path
import datetime
from typing import Optional

import requests
import requests_cache
import tqdm
from pydantic import BaseModel,SecretStr

from nya.config import settings
from nya.github.client import GithubSession,GithubRateLimit

In [5]:
AWESOME_README_PATH = settings.root_dir.joinpath("data","raw","scratch","awesome_readme.md")

In [None]:
sess = GithubSession(token=settings.github_token.get_secret_value(), cache_path=settings.github_cache_path)

In [None]:
sess._rl

In [97]:
class GithubRateLimit(BaseModel):
    limit:int
    remaining:int
    reset:datetime.datetime
    used:int
    resource:str
    
    @classmethod
    def parse_res(cls, r:requests.Response) -> "GithubRateLimit":
        return cls.parse_obj({k.split('-')[-1].lower(): v  for k,v in  r.headers.items() if k.lower().startswith('x-ratelimit-')})
        
class GithubRateLimitCollection(BaseModel):
    core:GithubRateLimit
    search:GithubRateLimit
    graphql:GithubRateLimit
    integration_manifest:GithubRateLimit
    source_import:GithubRateLimit
    code_scanning_upload:GithubRateLimit
    actions_runner_registration:GithubRateLimit
    scim:GithubRateLimit
    dependency_snapshots:GithubRateLimit

class GithubSession:
    
    rl:GithubRateLimitCollection
    
    def __init__(self, token:str, cache_path:Path):
        self.token = token
        self.cache_path = Path(cache_path)
        self.session = self._init_session(token=self.token, cache_path=self.cache_path)
        self.rl = self._init_rate_limit(session=self.session)
        
    @staticmethod
    def _init_session(token:str,cache_path:Path):
        session_backend = requests_cache.SQLiteCache(cache_path)
        session = requests_cache.CachedSession(
            backend=session_backend,
            expire_after=0.0001,
            should_strip_auth=True,
            ignored_parameters=["Authorization"],
            urls_expire_after={
                "*.github.com": 0.0001,  # Placeholder expiration; should be overridden by Cache-Control
                "*": requests_cache.DO_NOT_CACHE,  # Don't cache anything other than GitHub requests
            },
        )
        session.headers["Authorization"] = f"Bearer {token}"
        return session
    
    def _init_rate_limit(session:requests.Session) -> GithubRateLimitCollection:
        url = "https://api.github.com/rate_limit"
        res = self.session.get(url)
        res.raise_for_status()
        rl = res.json()
        return GithubRateLimitCollection.parse_obj({k:{**v, "resource":k} for k,v in rl['resources'].items()})
    
    def _update_rate_limit(self, res:requests.Response):
        nrl  = GithubRateLimit.parse_res(res)
        setattr(self.rl, nrl.resource, nrl)
        return None
    
    def get_repo(self,owner:str,repo:str)->requests.Response:
        url = f"https://api.github.com/repos/{owner}/{repo}"
        res = session.get(url)
        self._update_rate_limit(res)
        res.raise_for_status()
        return res

In [3]:
session_backend = requests_cache.SQLiteCache(settings.github_cache_path)
session = requests_cache.CachedSession(
    backend=session_backend,
    expire_after=0.0001,
    should_strip_auth=True,
    ignored_parameters=["Authorization"],
    urls_expire_after={
        "*.github.com": 0.0001,  # Placeholder expiration; should be overridden by Cache-Control
        "*": requests_cache.DO_NOT_CACHE,  # Don't cache anything other than GitHub requests
    },
)
if settings.github_token:
    session.headers["Authorization"] = f"Bearer {settings.github_token.get_secret_value()}"


In [23]:
url = "https://api.github.com/repos/sindresorhus/conf"
res = session.get(url)
res

<Response [200]>

In [76]:
def get_rate_limit_from_response(r:requests.Response) ->dict[str,Union[str,int]]

{k.split('-')[-1].lower(): int(v) if v.isdigit() else v for k,v in  res.headers.items() if k.lower().startswith('x-ratelimit-')}

{'limit': 5000,
 'remaining': 4838,
 'reset': 1673345241,
 'used': 162,
 'resource': 'core'}

In [86]:
requests.get(url, headers=session.headers).headers

{'Server': 'GitHub.com', 'Date': 'Tue, 10 Jan 2023 10:22:46 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Cache-Control': 'no-cache', 'X-OAuth-Scopes': 'gist, read:org, repo, workflow', 'X-Accepted-OAuth-Scopes': '', 'x-oauth-client-id': '178c6fc778ccc68e1d6a', 'X-GitHub-Media-Type': 'github.v3; format=json', 'x-github-api-version-selected': '2022-11-28', 'X-RateLimit-Limit': '5000', 'X-RateLimit-Remaining': '4975', 'X-RateLimit-Reset': '1673349164', 'X-RateLimit-Used': '25', 'X-RateLimit-Resource': 'core', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset', 'Access-Control-Allow-Origin': '*', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Frame-O

In [5]:
url = "https://api.github.com/rate_limit" #https://api.github.com/repos/jupyterlab/jupyter-renderers"
r2 = sess.session.get(url)
r2

<Response [200]>

In [8]:
sess.session?

[0;31mType:[0m        CachedSession
[0;31mString form:[0m <CachedSession(cache=<SQLiteCache(name=http_cache)>, expire_after=0.0001, urls_expire_after={'*.g <...> able_codes=(200,), allowable_methods=('GET', 'HEAD'), stale_if_error=False, cache_control=False)>
[0;31mFile:[0m        ~/Documents/nya/py/.venv/lib/python3.11/site-packages/requests_cache/session.py
[0;31mDocstring:[0m  
Session class that extends :py:class:`requests.Session` with caching features.

See individual :py:mod:`backend classes <requests_cache.backends>` for additional backend-specific arguments.
Also see :ref:`user-guide` for more details and examples on how the following arguments
affect cache behavior.

Args:
    cache_name: Cache prefix or namespace, depending on backend
    backend: Cache backend name or instance; name may be one of
        ``['sqlite', 'filesystem', 'mongodb', 'gridfs', 'redis', 'dynamodb', 'memory']``
    serializer: Serializer name or instance; name may be one of
        ``['pickle'

In [87]:
rl = r2.json()

In [91]:
{k:GithubRateLimit.parse_obj({**v, "resource":k}) for k,v in rl['resources'].items()}

{'core': GithubRateLimit(limit=5000, remaining=4975, reset=datetime.datetime(2023, 1, 10, 11, 12, 44, tzinfo=datetime.timezone.utc), used=25, resource='core'),
 'search': GithubRateLimit(limit=30, remaining=30, reset=datetime.datetime(2023, 1, 10, 10, 22, 59, tzinfo=datetime.timezone.utc), used=0, resource='search'),
 'graphql': GithubRateLimit(limit=5000, remaining=4950, reset=datetime.datetime(2023, 1, 10, 10, 55, 43, tzinfo=datetime.timezone.utc), used=50, resource='graphql'),
 'integration_manifest': GithubRateLimit(limit=5000, remaining=5000, reset=datetime.datetime(2023, 1, 10, 11, 21, 59, tzinfo=datetime.timezone.utc), used=0, resource='integration_manifest'),
 'source_import': GithubRateLimit(limit=100, remaining=100, reset=datetime.datetime(2023, 1, 10, 10, 22, 59, tzinfo=datetime.timezone.utc), used=0, resource='source_import'),
 'code_scanning_upload': GithubRateLimit(limit=1000, remaining=1000, reset=datetime.datetime(2023, 1, 10, 11, 21, 59, tzinfo=datetime.timezone.utc), 

In [95]:
GithubRateLimitCollection.parse_obj({k:{**v, "resource":k} for k,v in rl['resources'].items()})

GithubRateLimitCollection(core=GithubRateLimit(limit=5000, remaining=4975, reset=datetime.datetime(2023, 1, 10, 11, 12, 44, tzinfo=datetime.timezone.utc), used=25, resource='core'), search=GithubRateLimit(limit=30, remaining=30, reset=datetime.datetime(2023, 1, 10, 10, 22, 59, tzinfo=datetime.timezone.utc), used=0, resource='search'), graphql=GithubRateLimit(limit=5000, remaining=4950, reset=datetime.datetime(2023, 1, 10, 10, 55, 43, tzinfo=datetime.timezone.utc), used=50, resource='graphql'), integration_manifest=GithubRateLimit(limit=5000, remaining=5000, reset=datetime.datetime(2023, 1, 10, 11, 21, 59, tzinfo=datetime.timezone.utc), used=0, resource='integration_manifest'), source_import=GithubRateLimit(limit=100, remaining=100, reset=datetime.datetime(2023, 1, 10, 10, 22, 59, tzinfo=datetime.timezone.utc), used=0, resource='source_import'), code_scanning_upload=GithubRateLimit(limit=1000, remaining=1000, reset=datetime.datetime(2023, 1, 10, 11, 21, 59, tzinfo=datetime.timezone.utc)

In [93]:
print("class GithubRateLimitCollection(BaseModel):")
for k in rl['resources'].keys():
    print(f"    {k}:GithubRateLimit")

class GithubRateLimitCollection(BaseModel):
    core:GithubRateLimit
    search:GithubRateLimit
    graphql:GithubRateLimit
    integration_manifest:GithubRateLimit
    source_import:GithubRateLimit
    code_scanning_upload:GithubRateLimit
    actions_runner_registration:GithubRateLimit
    scim:GithubRateLimit
    dependency_snapshots:GithubRateLimit


In [16]:
def remove_github_url_keys(d:dict) -> dict:
    return {k: v if not isinstance(v, dict) else remove_github_url_keys(v) for k,v in d.items() if not k.endswith("_url")}
    
def replace_empty_str_with_none(d:dict) -> dict:
    return {k: (v if v != '' else None) if not isinstance(v, dict) else replace_empty_str_with_none(v) for k,v in d.items()}
    
replace_empty_str_with_none(remove_github_url_keys(out))

{'id': 93772766,
 'node_id': 'MDEwOlJlcG9zaXRvcnk5Mzc3Mjc2Ng==',
 'name': 'jupyter-renderers',
 'full_name': 'jupyterlab/jupyter-renderers',
 'private': False,
 'owner': {'login': 'jupyterlab',
  'id': 22800682,
  'node_id': 'MDEyOk9yZ2FuaXphdGlvbjIyODAwNjgy',
  'gravatar_id': None,
  'url': 'https://api.github.com/users/jupyterlab',
  'type': 'Organization',
  'site_admin': False},
 'description': 'Renderers and renderer extensions for JupyterLab',
 'fork': False,
 'url': 'https://api.github.com/repos/jupyterlab/jupyter-renderers',
 'created_at': '2017-06-08T16:59:07Z',
 'updated_at': '2023-01-06T21:03:14Z',
 'pushed_at': '2023-01-08T10:03:57Z',
 'homepage': None,
 'size': 3629,
 'stargazers_count': 452,
 'watchers_count': 452,
 'language': 'Jupyter Notebook',
 'has_issues': True,
 'has_projects': True,
 'has_downloads': True,
 'has_wiki': True,
 'has_pages': False,
 'has_discussions': False,
 'forks_count': 73,
 'archived': False,
 'disabled': False,
 'open_issues_count': 38,
 'licen

In [26]:
import datetime
print(datetime.datetime.fromtimestamp(1673345241))

2023-01-10 05:07:21


In [44]:


# patt = re.compile( r"- \[(.+?)\]\((https://github.com.+?)\)((\s+-\s+)?(.+))?")
patt = re.compile(r"- \[(.+)\]\((https://github.com.+)\)(?: - (.+))?")
md = AWESOME_README_PATH.read_text()

In [59]:
el = next(patt.finditer(md))
el

<re.Match object; span=(6423, 6582), match="- [Node.js](https://github.com/sindresorhus/aweso>

In [63]:
from urllib.parse import urlsplit,urlparse

urlsplit?

[0;31mSignature:[0m       [0murlsplit[0m[0;34m([0m[0murl[0m[0;34m,[0m [0mscheme[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m [0mallow_fragments[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mCall signature:[0m  [0murlsplit[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            _lru_cache_wrapper
[0;31mString form:[0m     <functools._lru_cache_wrapper object at 0x7f2652d9fe20>
[0;31mFile:[0m            ~/.pyenv/versions/3.11.0b3/lib/python3.11/urllib/parse.py
[0;31mDocstring:[0m      
Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>

The result is a named 5-tuple with fields corresponding to the
above. It is either a SplitResult or SplitResultBytes object,
depending on the type of the url parameter.

The username, password, hostname, and port sub-components of netloc
can also be accessed as attributes of the returned object.


In [69]:
{k:v for k,v in  zip(["owner","repo"], urlparse('https://github.com/sindresorhus/awesome-nodejs#readme').path.split("/")[1:])}

{'owner': 'sindresorhus', 'repo': 'awesome-nodejs'}

In [70]:
[
    {
        "title": el.group(1),
        "href": el.group(2),
        "description": el.group(3) or None,
        **{
            k: v
            for k, v in zip(
                ["owner", "repo"], urlparse(el.group(2)).path.split("/")[1:]
            )
        },
    }
    for el in patt.finditer(md)
]

[{'title': 'Node.js',
  'href': 'https://github.com/sindresorhus/awesome-nodejs#readme',
  'description': "Async non-blocking event-driven JavaScript runtime built on Chrome's V8 JavaScript engine.",
  'owner': 'sindresorhus',
  'repo': 'awesome-nodejs'},
 {'title': 'Cross-Platform',
  'href': 'https://github.com/bcoe/awesome-cross-platform-nodejs#readme',
  'description': 'Writing cross-platform code on Node.js.',
  'owner': 'bcoe',
  'repo': 'awesome-cross-platform-nodejs'},
 {'title': 'Frontend Development',
  'href': 'https://github.com/dypsilon/frontend-dev-bookmarks#readme',
  'description': None,
  'owner': 'dypsilon',
  'repo': 'frontend-dev-bookmarks'},
 {'title': 'iOS',
  'href': 'https://github.com/vsouza/awesome-ios#readme',
  'description': 'Mobile operating system for Apple phones and tablets.',
  'owner': 'vsouza',
  'repo': 'awesome-ios'},
 {'title': 'Android',
  'href': 'https://github.com/JStumpp/awesome-android#readme',
  'description': 'Mobile operating system devel