# Exploring the Graph of Dependencies

A small exploration of the graph of dependencies to understand the data and the structure of the graph.

## Data

In [4]:
import polars as pl

# Table display settings: show up to 20 rows and up to 100 characters per
# string cell. Each setter returns the Config class, so the calls can be
# chained; binding the result to `_` keeps the cell from echoing it.
_ = pl.Config.set_tbl_rows(20).set_fmt_str_lengths(100)

In [5]:
import json
import urllib.request

# Location of the unweighted dependency graph published as JSON.
url = "https://raw.githubusercontent.com/deepfunding/dependency-graph/refs/heads/main/graph/unweighted_graph.json"

# A timeout keeps the cell from hanging indefinitely if the host stops responding.
with urllib.request.urlopen(url, timeout=30) as response:
    graph_data = json.load(response)

# Index "links" directly: if the key is ever missing this raises a KeyError
# right here instead of silently building an empty DataFrame that only fails
# later with an unrelated-looking error.
df = pl.DataFrame(graph_data["links"])

# Drop the weight column (weights are recomputed later in this notebook)
df = df.drop("weight")

rows, cols = df.shape
print(f"Rows: {rows}")
print(f"Columns: {cols}")

# Get unique sources and targets
sources = df.get_column("source").unique()
targets = df.get_column("target").unique()

print("Unique sources:", len(sources))
print("Unique targets:", len(targets))

# Random preview of the edge list; rows differ between runs because no seed is set.
df.sample(10)


Rows: 9896
Columns: 3
Unique sources: 18
Unique targets: 4289


relation,source,target
str,str,str
"""NPM""","""https://github.com/ethereum/remix-project""","""https://github.com/typestrong/ts-loader"""
"""RUST""","""https://github.com/grandinetech/grandine""","""https://github.com/jplatte/serde_html_form"""
"""GO""","""https://github.com/prysmaticlabs/prysm""","""https://github.com/prometheus/client_model"""
"""NPM""","""https://github.com/web3/web3.js""","""https://github.com/silverwind/default-gateway"""
"""GO""","""https://github.com/prysmaticlabs/prysm""","""https://github.com/hashicorp/go-bexpr"""
"""NPM""","""https://github.com/chainsafe/lodestar""","""https://github.com/felixge/node-dateformat"""
"""NPM""","""https://github.com/web3/web3.js""","""https://github.com/syntax-tree/hast-util-parse-selector"""
"""NPM""","""https://github.com/web3/web3.js""","""https://github.com/dirtyhairy/async-mutex"""
"""RUST""","""https://github.com/grandinetech/grandine""","""https://github.com/hermit-os/hermit-rs"""
"""NPM""","""https://github.com/safe-global/safe-smart-account""","""https://github.com/sindresorhus/path-type"""


In [6]:
# Show every edge whose source is the Solidity repository.
solidity_url = "https://github.com/ethereum/solidity"
df.filter(pl.col("source") == solidity_url)

relation,source,target
str,str,str
"""PIP""","""https://github.com/ethereum/solidity""","""https://github.com/taminomara/sphinx-a4doc"""
"""PIP""","""https://github.com/ethereum/solidity""","""https://github.com/sphinx-doc/sphinx"""
"""PIP""","""https://github.com/ethereum/solidity""","""https://github.com/readthedocs/sphinx_rtd_theme"""


In [7]:
# Number of edges per relation type, ordered by relation name (Z to A).
(
    df.group_by("relation")
    .agg(pl.len())
    .sort("relation", descending=True)
)

relation,len
str,u32
"""RUST""",1349
"""PIP""",135
"""NPM""",7814
"""GO""",598


In [8]:
# Split source and target URLs into organization and repository components.
# The pattern is defined once, anchored at both ends, and the dots in the host
# name are escaped so they only match a literal "." (an unescaped "." matches
# any character). Group 1 is the organization, group 2 the repository.
GITHUB_REPO_PATTERN = r"^https://github\.com/([^/]+)/([^/]+)$"

# Produces source_organization, source_repository, target_organization and
# target_repository, in that order.
df = df.with_columns(
    [
        pl.col(column).str.extract(GITHUB_REPO_PATTERN, group).alias(f"{column}_{part}")
        for column in ("source", "target")
        for group, part in ((1, "organization"), (2, "repository"))
    ]
)

df.sample(10)

relation,source,target,source_organization,source_repository,target_organization,target_repository
str,str,str,str,str,str,str
"""NPM""","""https://github.com/ethereum/remix-project""","""https://github.com/unjs/unstorage""","""ethereum""","""remix-project""","""unjs""","""unstorage"""
"""NPM""","""https://github.com/ethereum/remix-project""","""https://github.com/hughsk/ast-parents""","""ethereum""","""remix-project""","""hughsk""","""ast-parents"""
"""NPM""","""https://github.com/safe-global/safe-smart-account""","""https://github.com/ashtuchkin/iconv-lite""","""safe-global""","""safe-smart-account""","""ashtuchkin""","""iconv-lite"""
"""GO""","""https://github.com/prysmaticlabs/prysm""","""https://github.com/quic-go/quic-go""","""prysmaticlabs""","""prysm""","""quic-go""","""quic-go"""
"""GO""","""https://github.com/erigontech/erigon""","""https://github.com/grpc-ecosystem/go-grpc-middleware""","""erigontech""","""erigon""","""grpc-ecosystem""","""go-grpc-middleware"""
"""NPM""","""https://github.com/chainsafe/lodestar""","""https://github.com/chainsafe/blst-ts""","""chainsafe""","""lodestar""","""chainsafe""","""blst-ts"""
"""RUST""","""https://github.com/grandinetech/grandine""","""https://github.com/rust-cli/anstyle""","""grandinetech""","""grandine""","""rust-cli""","""anstyle"""
"""NPM""","""https://github.com/chainsafe/lodestar""","""https://github.com/webpack/enhanced-resolve""","""chainsafe""","""lodestar""","""webpack""","""enhanced-resolve"""
"""NPM""","""https://github.com/ethereum/remix-project""","""https://github.com/thlorenz/deep-is""","""ethereum""","""remix-project""","""thlorenz""","""deep-is"""
"""NPM""","""https://github.com/ethereum/remix-project""","""https://github.com/develar/fs-extra-p""","""ethereum""","""remix-project""","""develar""","""fs-extra-p"""


In [9]:
# Distinct targets per source, largest first. The aggregated count keeps the
# column name "target", which is why the sort key is "target".
distinct_targets = pl.col("target").n_unique()
df.group_by("source").agg(distinct_targets).sort("target", descending=True)

source,target
str,u32
"""https://github.com/ethereum/remix-project""",2277
"""https://github.com/web3/web3.js""",1709
"""https://github.com/chainsafe/lodestar""",1514
"""https://github.com/eth-infinitism/account-abstraction""",854
"""https://github.com/ethereumjs/ethereumjs-monorepo""",796
"""https://github.com/safe-global/safe-smart-account""",519
"""https://github.com/paradigmxyz/reth""",463
"""https://github.com/sigp/lighthouse""",451
"""https://github.com/grandinetech/grandine""",435
"""https://github.com/erigontech/erigon""",253


In [10]:
# Ten targets that are depended on by the most distinct sources. The count
# column keeps the name "source".
(
    df.group_by("target")
    .agg(pl.col("source").n_unique())
    .sort("source", descending=True)
    .head(10)
)

target,source
str,u32
"""https://github.com/sindresorhus/resolve-from""",7
"""https://github.com/paulmillr/noble-curves""",7
"""https://github.com/kevva/shebang-command""",7
"""https://github.com/chalk/ansi-regex""",7
"""https://github.com/ricmoo/aes-js""",7
"""https://github.com/nodeca/js-yaml""",7
"""https://github.com/substack/text-table""",7
"""https://github.com/acornjs/acorn""",7
"""https://github.com/garycourt/uri-js""",7
"""https://github.com/lydell/js-tokens""",7


In [11]:
# Ten target organizations with the most distinct repositories in the graph.
# The count column keeps the name "target_repository".
distinct_repositories = pl.col("target_repository").n_unique()
(
    df.group_by("target_organization")
    .agg(distinct_repositories)
    .sort("target_repository", descending=True)
    .head(10)
)

target_organization,target_repository
str,u32
"""sindresorhus""",253
"""jonschlinkert""",104
"""npm""",82
"""inspect-js""",49
"""syntax-tree""",46
"""isaacs""",44
"""ljharb""",43
"""gulpjs""",42
"""wooorm""",34
"""d3""",32


## Baseline Model

Let's start with a baseline model that gives every dependency of a given source repository the same weight, so that the weights of each source (`repo`) sum to 1.

In [12]:
# Uniform baseline: every edge of a source gets 1 / (number of distinct targets
# of that source). The per-source sum is exactly 1 as long as each
# (source, target) pair appears on a single row.
targets_per_source = pl.col("target").n_unique().over("source")
df = df.with_columns(weight=1 / targets_per_source)

In [13]:
# Random preview of the assigned weights.
weight_columns = ["source", "target", "weight"]
df.select(weight_columns).sample(10)

source,target,weight
str,str,f64
"""https://github.com/chainsafe/lodestar""","""https://github.com/iarna/console-control-strings""",0.000661
"""https://github.com/eth-infinitism/account-abstraction""","""https://github.com/juliangruber/isarray""",0.001171
"""https://github.com/ethereumjs/ethereumjs-monorepo""","""https://github.com/feross/run-parallel-limit""",0.001256
"""https://github.com/ethereum/remix-project""","""https://github.com/cryptocoinjs/keccak""",0.000439
"""https://github.com/ethereum/remix-project""","""https://github.com/unjs/consola""",0.000439
"""https://github.com/chainsafe/lodestar""","""https://github.com/qix-/color-convert""",0.000661
"""https://github.com/web3/web3.js""","""https://github.com/justmoon/node-extend""",0.000585
"""https://github.com/chainsafe/lodestar""","""https://github.com/mswjs/headers-polyfill""",0.000661
"""https://github.com/ethereum/remix-project""","""https://github.com/winstonjs/winston""",0.000439
"""https://github.com/ethereum/remix-project""","""https://github.com/andarist/react-textarea-autosize""",0.000439


In [14]:
# Sanity check: total weight per source.
(
    df.group_by("source")
    .agg(pl.sum("weight"))
)

source,weight
str,f64
"""https://github.com/grandinetech/grandine""",1.0
"""https://github.com/erigontech/erigon""",1.0
"""https://github.com/ethereum/web3.py""",1.0
"""https://github.com/chainsafe/lodestar""",1.0
"""https://github.com/ethereum/py-evm""",1.0
"""https://github.com/ethereum/go-ethereum""",1.0
"""https://github.com/consensys/teku""",1.0
"""https://github.com/ethereum/remix-project""",1.0
"""https://github.com/web3/web3.js""",1.0
"""https://github.com/vyperlang/vyper""",1.0


## Heuristic Model

Now, let's assign weights based on external data. In this case, we will get some data from GitHub for each package.

In [25]:
from github import Github, Auth
import os

# Read the token up front and stop with a clear message when it is missing.
# Wrapping os.getenv() in str() would turn an unset variable into the literal
# string "None", which would then be sent as if it were a real credential.
github_token = os.getenv("GITHUB_TOKEN")
if not github_token:
    raise RuntimeError("GITHUB_TOKEN environment variable is not set")

auth = Auth.Token(github_token)
g = Github(auth=auth)

# Get data for all target repos (one row per distinct target URL)
target_repositories = df.select(pl.col("target")).unique()

# One dict of repository metadata per successfully fetched target
repositories_stats = []

# Process each repo URL
for repository_url in target_repositories["target"]:
    try:
        # Extract owner/repo from URL; expects exactly
        # "https://github.com/<owner>/<repo>" (five "/"-separated parts),
        # anything else raises and is reported by the handler below.
        _, _, _, owner, repo = repository_url.split('/')

        # Get repo data from GitHub API
        repo_obj = g.get_repo(f"{owner}/{repo}")

        repositories_stats.append({
            "target": repository_url,
            "stars": repo_obj.stargazers_count,
            "forks": repo_obj.forks_count,
            "created_at": repo_obj.created_at,
            "updated_at": repo_obj.updated_at,
            "is_fork": repo_obj.fork,
            "open_issues": repo_obj.open_issues_count,
            "size": repo_obj.size,
            "language": repo_obj.language,
            "has_wiki": repo_obj.has_wiki,
            "has_discussions": repo_obj.has_discussions,
            "topics": repo_obj.topics,
            "description": repo_obj.description
        })

    except Exception as e:
        # Best effort: report the failing repository and move on to the next one.
        print(f"Error getting data for {repository_url}: {e}")

# Convert to polars DataFrame
github_df = pl.DataFrame(repositories_stats)
print(f"Got data for {len(repositories_stats)} repos")
github_df.head()


target,stars,forks,created_at,updated_at,is_fork,open_issues,size,language,has_wiki,has_discussions,topics,description
str,i64,i64,"datetime[μs, UTC]","datetime[μs, UTC]",bool,i64,i64,str,bool,bool,list[str],str
"""https://github.com/tootallnate/node-bindings""",413,96,2012-01-28 20:26:06 UTC,2024-11-18 06:17:23 UTC,False,35,45,"""JavaScript""",True,False,[],"""Helper module for loading your native module's `.node` file"""
"""https://github.com/jupyter/terminado""",369,95,2014-09-29 23:44:25 UTC,2024-12-18 08:27:35 UTC,False,29,566,"""Python""",True,False,[],"""Terminals served by tornado websockets"""
"""https://github.com/rpetrich/babel-plugin-transform-async-to-promises""",247,18,2018-01-06 23:27:25 UTC,2024-12-10 17:10:35 UTC,False,15,2465,"""TypeScript""",False,False,[],"""Transform async/await to somewhat idiomatic JavaScript promise chains"""
"""https://github.com/libp2p/go-nat""",19,14,2018-09-12 19:04:30 UTC,2024-09-11 06:57:17 UTC,True,2,65,"""Go""",True,False,[],"""NAT port mapping library for Go"""
"""https://github.com/facebook/regenerator""",3836,1159,2013-10-05 14:37:03 UTC,2024-12-17 20:23:40 UTC,False,82,8959,"""JavaScript""",True,False,[],"""Source transformer enabling ECMAScript 6 generator functions in JavaScript-of-today."""


We can also get the data from OSO (Open Source Observer).

In [13]:
import polars as pl
from google.cloud import bigquery

# BigQuery client used for the OSO queries below. No project or credentials
# are passed explicitly, so the library's defaults apply.
client = bigquery.Client()

In [14]:
# Join the distinct target URLs with "', '" so the result can be dropped between
# the quotes of a SQL IN (...) list. The frame holds one row per edge, so the
# same target appears under several sources; deduplicating first keeps the
# generated list as short as possible. The result is still a one-element
# Series, which is why the query cell reads it with [0].
package_list_string = (
    df.select(pl.col("target").unique())
    .to_series()
    .str.join("', '")
)

In [15]:
# Distinct, non-null target URLs to look up in OSO. They are sent as an array
# query parameter instead of being pasted into the SQL text, so a URL that
# contains a quote character cannot break or alter the query.
target_urls = df.get_column("target").drop_nulls().unique().to_list()

QUERY = """
select
  artifact_url as target,
  is_fork,
  star_count,
  fork_count,
  created_at,
  updated_at
from `opensource-observer.oso.repositories_v0`
where artifact_url in unnest(@target_urls)
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("target_urls", "STRING", target_urls),
    ]
)

query_job = client.query(QUERY, job_config=job_config)
rows = query_job.result()  # blocks until the query has finished

# Convert the result to polars via Arrow
oso_df = pl.from_arrow(rows.to_arrow())

print(f"Rows: {oso_df.shape[0]}")

oso_df.head()

Rows: 1439


target,is_fork,star_count,fork_count,created_at,updated_at
str,bool,i64,i64,"datetime[μs, UTC]","datetime[μs, UTC]"
"""https://github.com/npm/agent""",False,7,5,2022-06-30 17:47:07 UTC,2024-09-06 23:49:51 UTC
"""https://github.com/npm/security-holder""",False,157,58,2016-03-23 00:22:33 UTC,2024-12-11 17:50:51 UTC
"""https://github.com/jsdom/domexception""",False,21,8,2017-08-14 03:04:01 UTC,2024-05-11 21:05:28 UTC
"""https://github.com/jsdom/abab""",False,93,19,2015-08-29 03:21:16 UTC,2024-05-17 16:43:47 UTC
"""https://github.com/herumi/mcl-wasm""",False,59,18,2017-11-20 06:46:44 UTC,2024-09-18 03:05:14 UTC


In [16]:
# Targets present in the dependency graph but absent from the OSO result.
is_in_oso = pl.col("target").is_in(oso_df["target"])
missing_targets = df.filter(~is_in_oso)["target"].unique()

# Coverage summary: each count is computed once and reused in the report.
n_missing = len(missing_targets)
n_df_targets = df["target"].n_unique()
n_oso_targets = oso_df["target"].n_unique()

print(f"Number of missing targets: {n_missing}")
print(f"Number of targets in df: {n_df_targets}")
print(f"Number of targets in oso_df: {n_oso_targets}")
print(f"Percentage of missing targets: {(n_missing / n_df_targets) * 100:.2f}%")

missing_targets.sample(10)


Number of missing targets: 2850
Number of targets in df: 4289
Number of targets in oso_df: 1439
Percentage of missing targets: 66.45%


target
str
"""https://github.com/wooorm/lowlight"""
"""https://github.com/isaacs/json-stringify-safe"""
"""https://github.com/yaacovcr/value-or-promise"""
"""https://github.com/syndtr/goleveldb"""
"""https://github.com/mariusvanderwijden/tx-fuzz"""
"""https://github.com/svg/sax"""
"""https://github.com/facebook/emitter"""
"""https://github.com/formidablelabs/prism-react-renderer"""
"""https://github.com/japaric/cast.rs"""
"""https://github.com/dai-shi/proxy-compare"""
