In [4]:
from deepfunding.data import get_graph_dataframe

import polars as pl

# Load the dependency edge list (the output below shows the columns
# relation, source, target).
df = get_graph_dataframe()

# Overall size of the edge list.
rows = df.height
cols = df.width
print(f"Rows: {rows}")
print(f"Columns: {cols}")

# Distinct repositories appearing on each side of an edge.
sources = df["source"].unique()
targets = df["target"].unique()

print("Unique sources:", sources.len())
print("Unique targets:", targets.len())

# Bare last expression: render a random handful of edges as a sanity check.
df.sample(10)

Rows: 9896
Columns: 3
Unique sources: 18
Unique targets: 4289


relation,source,target
str,str,str
"""NPM""","""https://github.com/eth-infinit…","""https://github.com/silentcicer…"
"""NPM""","""https://github.com/chainsafe/l…","""https://github.com/terser/ters…"
"""NPM""","""https://github.com/ethereum/re…","""https://github.com/evanw/node-…"
"""NPM""","""https://github.com/web3/web3.j…","""https://github.com/d3/d3-time"""
"""NPM""","""https://github.com/eth-infinit…","""https://github.com/jrburke/amd…"
"""RUST""","""https://github.com/paradigmxyz…","""https://github.com/mvdnes/spin…"
"""NPM""","""https://github.com/eth-infinit…","""https://github.com/feross/simp…"
"""NPM""","""https://github.com/web3/web3.j…","""https://github.com/ethereumjs/…"
"""NPM""","""https://github.com/ethereum/re…","""https://github.com/keik/merge-…"
"""NPM""","""https://github.com/chainsafe/l…","""https://github.com/octokit/typ…"


In [None]:
import asyncio
import httpx
import os

# GitHub API token read from the environment; None when GITHUB_TOKEN is unset.
github_token = os.environ.get("GITHUB_TOKEN")


async def fetch_repo_info(client, repo_url):
    """Fetch GitHub REST API metadata for a single repository.

    Args:
        client: An ``httpx.AsyncClient`` (or any object exposing a compatible
            awaitable ``get(url, headers=...)``) used to send the request.
        repo_url: Repository URL shaped like ``https://<host>/<owner>/<repo>``;
            trailing slashes are tolerated.

    Returns:
        The decoded JSON body of the response, or ``None`` when the URL does
        not have the expected shape, the request fails, or the body is not
        valid JSON. Failures are reported with ``print`` instead of being
        raised, so a single bad repository cannot abort a batch of calls
        awaited together.
    """
    # Extract owner/repo from the full URL. The shape is validated explicitly:
    # blind tuple-unpacking raised ValueError for any URL without exactly five
    # "/"-separated parts, and that exception escaped the handler below.
    parts = repo_url.rstrip("/").split("/") if isinstance(repo_url, str) else []
    if len(parts) != 5 or not parts[3] or not parts[4]:
        print(
            f"Error fetching {repo_url}: "
            "expected a URL like https://github.com/<owner>/<repo>"
        )
        return None
    owner, repo = parts[3], parts[4]
    api_url = f"https://api.github.com/repos/{owner}/{repo}"

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    # Only authenticate when a token is configured; otherwise the header would
    # be the literal "Bearer None", which is not a valid credential.
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    try:
        response = await client.get(api_url, headers=headers)
        response.raise_for_status()
        return response.json()
    except (httpx.HTTPError, ValueError) as e:
        # ValueError covers a response body that cannot be decoded as JSON.
        print(f"Error fetching {repo_url}: {e}")
        return None


async def fetch_all_repos(max_concurrency=10):
    """Fetch GitHub metadata for every unique ``target`` URL in ``df``.

    Args:
        max_concurrency: Maximum number of requests in flight at once. Also
            used as the connection-pool size. Must be at least 1.

    Returns:
        A ``pl.DataFrame`` built from the JSON payloads of the repositories
        that were fetched successfully; entries for which ``fetch_repo_info``
        returned ``None`` are dropped.

    Raises:
        ValueError: If ``max_concurrency`` is less than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Get unique target repos
    target_repos = df.get_column("target").unique().to_list()

    # The pool limits are set on the transport itself: when an explicit
    # transport is handed to AsyncClient, client-level `limits` are not
    # applied to it. TLS verification is left at its default (enabled) because
    # requests may carry a bearer token.
    transport = httpx.AsyncHTTPTransport(
        retries=2,
        limits=httpx.Limits(
            max_keepalive_connections=min(5, max_concurrency),
            max_connections=max_concurrency,
        ),
    )

    # Cap in-flight requests so queued ones do not sit waiting on the pool
    # until they hit the pool-acquire timeout and get reported as failures.
    semaphore = asyncio.Semaphore(max_concurrency)

    async with httpx.AsyncClient(
        transport=transport,
        follow_redirects=True,
    ) as client:

        async def fetch_limited(repo_url):
            async with semaphore:
                return await fetch_repo_info(client, repo_url)

        results = await asyncio.gather(
            *(fetch_limited(repo_url) for repo_url in target_repos)
        )

    # Create DataFrame from results
    repo_data = [r for r in results if r is not None]
    return pl.DataFrame(repo_data)


# Run the async function. Top-level `await` is used here, which relies on the
# notebook kernel's already-running event loop (a plain Python script would
# need an explicit event-loop runner instead).
repo_info_df = await fetch_all_repos()
# Bare last expression so the notebook renders the resulting frame.
repo_info_df
