# Github: initial fetch

In [1]:
import os
import json
import shutil
import re
from pathlib import Path
import datetime
from typing import Optional
from urllib.parse import urlparse

import requests
import requests_cache
from tqdm import trange, tqdm
from pydantic import BaseModel,SecretStr

from nya.config import settings
from nya.github.client import GithubSession,GithubRateLimit
from nya.github.models import ContentFile,ReadmeFile

In [2]:
AWESOME_README_DIR = settings.root_dir.joinpath("data","raw","awesome_samples")
AWESOME_README_HREFS_PATH = settings.root_dir.joinpath("data","int","awesome_readme_hrefs.json")

SAMPLE_REPO_RECORDS = settings.root_dir.joinpath("data","int","sample_repo_records.jsonld")

In [3]:
sess = GithubSession(token=settings.github_token.get_secret_value(), cache_path=settings.github_cache_path)

# Fetch names of awesome repos

In [4]:
patt = re.compile(r"[-\*] \[(.+)\]\((https:\/\/github\.com\/(?!repos|topics)[a-zA-Z0-9_\-]+\/[a-zA-Z0-9_\-]+\/?(tree\/[a-zA-Z0-9_\-]+\/?)?(#.*)?)\)(?: - (.+))?")

hrefs = []
for pth in AWESOME_README_DIR.iterdir():
    md = pth.read_text()
    src = pth.name.split("_readme")[0]

    hrefs.extend([
        {
            "src": src,
            "title": el.group(1),
            "href": el.group(2),
            "description": el.group(3) or None,
            **{
                k: v
                for k, v in zip(
                    ["owner", "repo"], urlparse(el.group(2)).path.split("/")[1:3]
                )
            },
        }
        for el in patt.finditer(md)
    ])


print(f"""Found {len(hrefs)} github repos.
Saving to: {AWESOME_README_HREFS_PATH}""")
with open(AWESOME_README_HREFS_PATH, "w") as fp:
    json.dump(hrefs, fp)
    
hrefs = json.loads(AWESOME_README_HREFS_PATH.read_text())

Found 1133 github repos.
Saving to: /home/michael/Documents/nya/py/data/int/awesome_readme_hrefs.json


# Get fetch sample repos + readmes

These functions remove cruft from repo response:

In [5]:
def remove_github_url_keys(d:dict) -> dict:
    return {k: v if not isinstance(v, dict) else remove_github_url_keys(v) for k,v in d.items() if not k.endswith("_url")}
    
def replace_empty_str_with_none(d:dict) -> dict:
    return {k: (v if v != '' else None) if not isinstance(v, dict) else replace_empty_str_with_none(v) for k,v in d.items()}
    
def repo_res_proc(res:requests.Response) -> dict:
    return replace_empty_str_with_none(remove_github_url_keys(res.json()))
# replace_empty_str_with_none(remove_github_url_keys(out))

def check_rate_limit(s:GithubSession)->None:
    if s.rate_limit.remaining == 0:
        raise Exception(f"Rate limit exhausted! @ {s.rate_limit.reset} ({s.rate_limit.reset-datetime.datetime.utcnow()})")

In [6]:
errs = []

i = 0
repos = []
n_repos = 0
n_repo_cache = 0
n_readmes = 0
n_readmes_cache = 0

for el in tqdm( hrefs):
    i += 1
    owner,repo = el['owner'],el['repo']
        
    # Repos : Fetch and process
    check_rate_limit(sess)
    res = sess.get_repo(owner,repo)
    if not res.ok:
        tqdm.tqdm(f"REPO: {owner}/{repo}: ({res.status_code}) {res.reason}")
        errs.append({"el":el, "err":res})
        continue
        
    n_repos += 1
    n_repo_cache += res.from_cache
    repos.append(repo_res_proc(res))
    
    # Just fetch readmes for now.
    check_rate_limit(sess)
    res_md = sess.get_readme(owner,repo)
    if not res_md.ok:
        tqdm.tqdm(f"REPO: {owner}/{repo}: ({res_md.status_code}) {res_md.reason}")
        errs.append({"el":el, "err":res})
        continue
        
    n_readmes += 1
    n_readmes_cache += res_md.from_cache
    

    if i % 100 == 0:
        tqdm.write(f"{i}  repos: {n_repo_cache}/{n_repos}   readmes: {n_readmes_cache}/{n_readmes}  errors: {len(errs)}")

  9%|█████████████                                                                                                                                       | 100/1133 [00:27<04:26,  3.87it/s]

100  repos: 100/100   readmes: 100/100  errors: 0


 18%|██████████████████████████▏                                                                                                                         | 200/1133 [00:55<04:42,  3.30it/s]

200  repos: 200/200   readmes: 200/200  errors: 0


 26%|███████████████████████████████████████▏                                                                                                            | 300/1133 [01:23<03:45,  3.69it/s]

300  repos: 300/300   readmes: 300/300  errors: 0


 35%|████████████████████████████████████████████████████▎                                                                                               | 400/1133 [01:51<03:25,  3.56it/s]

400  repos: 400/400   readmes: 400/400  errors: 0


 44%|█████████████████████████████████████████████████████████████████▎                                                                                  | 500/1133 [02:18<02:46,  3.81it/s]

500  repos: 500/500   readmes: 500/500  errors: 0


 53%|██████████████████████████████████████████████████████████████████████████████▍                                                                     | 600/1133 [02:46<02:23,  3.71it/s]

600  repos: 600/600   readmes: 600/600  errors: 0


 62%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 700/1133 [03:15<02:08,  3.38it/s]

700  repos: 700/700   readmes: 700/700  errors: 0


 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 800/1133 [03:48<02:12,  2.52it/s]

800  repos: 800/800   readmes: 800/800  errors: 0


 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 900/1133 [04:18<01:04,  3.59it/s]

900  repos: 900/900   readmes: 900/900  errors: 0


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 902/1133 [04:18<01:06,  3.48it/s]


AttributeError: type object 'tqdm' has no attribute 'tqdm'