In [5]:
import io
import zipfile
import requests
import frontmatter
import logging

def read_repo_data(repo_owner, repo_name, branch="main", timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as a single zip archive from codeload.github.com
    and every ``.md`` / ``.mdx`` entry (case-insensitive) is parsed with
    ``frontmatter``. If ``branch`` is "main" and the download returns HTTP 404,
    the function retries once with "master".

    Args:
        repo_owner (str): GitHub username or organization
        repo_name (str): Repository name
        branch (str): Branch name (default: main)
        timeout (float | tuple): Timeout in seconds passed to ``requests.get``
            (default: 30). Prevents the call from hanging forever on a
            stalled connection.

    Returns:
        List[dict]: One dict per markdown file, built from ``post.to_dict()``
        and then updated with the keys "filename", "repo", "owner" and
        "branch". These four keys overwrite same-named front matter keys.
        "branch" is the branch that was actually downloaded, i.e. "master"
        when the fallback was used.

    Raises:
        Exception: If the download finishes with a status other than 200
            (after the optional main -> master fallback).
        requests.RequestException: On connection errors or timeouts.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    url = f"https://codeload.github.com/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"
    # requests applies no timeout by default; always pass one explicitly.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code == 404 and branch == "main":
        # Try fallback to master (older repositories still use it as default)
        return read_repo_data(repo_owner, repo_name, branch="master", timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: HTTP {resp.status_code}")

    repository_data = []
    # The archive is processed entirely in memory; nothing is written to disk.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename
            if not filename.lower().endswith((".md", ".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    # Invalid UTF-8 bytes are replaced rather than aborting the file.
                    content = f_in.read().decode("utf-8", errors="replace")
                post = frontmatter.loads(content)
                data = post.to_dict()
                data.update({
                    "filename": filename,
                    "repo": repo_name,
                    "owner": repo_owner,
                    "branch": branch
                })
                repository_data.append(data)
            except Exception as e:
                # Best effort: one unparsable file must not discard the rest.
                logging.warning("Error processing %s: %s", filename, e)
                continue

    return repository_data

In [8]:
# Fetch and parse every markdown file of the DataTalksClub/faq repository.
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
print(f"FAQ documents: {len(dtc_faq)}")


FAQ documents: 1217


In [3]:
def sliding_window(seq, size, step):
    """
    Split a sequence into (possibly overlapping) windows.

    Windows start at offsets 0, step, 2*step, ... and each holds up to
    ``size`` items (``seq[offset:offset + size]``). Iteration stops after the
    first window that reaches the end of the sequence, so the last window may
    be shorter than ``size``.

    Args:
        seq: Any sized, sliceable sequence (list, str, tuple, ...).
        size (int): Maximum number of items per window; must be positive.
        step (int): Distance between consecutive window starts; must be positive.

    Returns:
        list[dict]: One ``{'start': offset, 'chunk': slice}`` dict per window.
        Empty input yields an empty list.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)

    def _iter_windows():
        for offset in range(0, total, step):
            stop = offset + size
            yield {'start': offset, 'chunk': seq[offset:stop]}
            if stop >= total:
                # This window already covers the tail; later ones would only
                # repeat a suffix of it.
                return

    return list(_iter_windows())

In [13]:
# Copy the parsed documents into a fresh list, then group them into
# windows of 4 documents whose starts are 2 documents apart.
data = [*dtc_faq]
windows = sliding_window(data, size=4, step=2)


In [14]:
import json

# Persist the windows as pretty-printed JSON.
# Front matter parsed from YAML may contain values the json module cannot
# encode natively (e.g. dates); default=str writes those as strings instead
# of raising TypeError and leaving a truncated file behind.
with open("windows_chuncks.json", "w", encoding="utf-8") as f:
    json.dump(windows, f, indent=2, default=str)