# Exploring the Graph of Dependencies

A small exploration of the graph of dependencies to understand the data and the structure of the graph.

## Data

In [60]:
import json
import urllib.request

url = "https://raw.githubusercontent.com/deepfunding/dependency-graph/refs/heads/main/graph/unweighted_graph.json"
with urllib.request.urlopen(url) as response:
    graph_data = json.loads(response.read())


In [61]:
import polars as pl

pl.Config.set_tbl_rows(20)

df = pl.read_csv('https://raw.githubusercontent.com/deepfunding/dependency-graph/refs/heads/main/graph/unweighted_graph.csv')
df.sample(10)

seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source
str,str,str,str,str,str
"""chainsafe""","""lodestar""","""hast-util-from-parse5""","""syntax-tree""","""hast-util-from-parse5""","""NPM"""
"""eth-infinitism""","""account-abstraction""","""recursive-readdir""","""jergason""","""recursive-readdir""","""NPM"""
"""web3""","""web3.js""","""deep-is""","""thlorenz""","""deep-is""","""NPM"""
"""prysmaticlabs""","""prysm""","""github.com/klauspost/compress""","""klauspost""","""compress""","""GO"""
"""safe-global""","""safe-smart-account""","""@nomicfoundation/edr-darwin-x6…","""nomicfoundation""","""edr""","""NPM"""
"""sigp""","""lighthouse""","""anstyle""","""rust-cli""","""anstyle""","""RUST"""
"""eth-infinitism""","""account-abstraction""","""typedarray-to-buffer""","""feross""","""typedarray-to-buffer""","""NPM"""
"""web3""","""web3.js""","""@yarnpkg/parsers""","""yarnpkg""","""berry""","""NPM"""
"""safe-global""","""safe-smart-account""","""asn1""","""joyent""","""node-asn1""","""NPM"""
"""safe-global""","""safe-smart-account""","""is-arrayish""","""qix-""","""node-is-arrayish""","""NPM"""


In [62]:
df.shape

(13536, 6)

The CSV data is the definition of a graph. The source is `seed_repo_owner/seed_repo_name` and the target is `package_repo_owner/package_repo_name/package_name`. There is also a source for the package, which is `package_source`.

In [63]:
df.filter(
    (pl.col("seed_repo_owner") == "ethereum") &
    (pl.col("seed_repo_name") == "solidity")
)

seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source
str,str,str,str,str,str
"""ethereum""","""solidity""","""sphinx-a4doc""","""taminomara""","""sphinx-a4doc""","""PIP"""
"""ethereum""","""solidity""","""sphinx""","""sphinx-doc""","""sphinx""","""PIP"""
"""ethereum""","""solidity""","""sphinx-rtd-theme""","""readthedocs""","""sphinx_rtd_theme""","""PIP"""


In [64]:
df.group_by("package_source").agg(pl.len())

package_source,len
str,u32
"""NPM""",10746
"""RUST""",2024
"""PIP""",135
"""GO""",631


In [65]:
df = df.with_columns(
    pl.concat_str(
        [
            pl.col("seed_repo_owner"),
            pl.col("seed_repo_name"),
        ],
        separator="/"
    ).alias("repo"),
    pl.concat_str(
        [
            pl.col("package_repo_owner"),
            pl.col("package_repo_name"),
            pl.col("package_name"),
        ],
        separator="/"
    ).alias("package_repo"),
)

In [66]:
df.group_by("repo").agg(pl.col("package_repo").n_unique()).sort("package_repo", descending=True)

repo,package_repo
str,u32
"""ethereum/remix-project""",3230
"""web3/web3.js""",2389
"""chainsafe/lodestar""",2213
"""ethereumjs/ethereumjs-monorepo""",1081
"""eth-infinitism/account-abstrac…",1048
"""paradigmxyz/reth""",728
"""sigp/lighthouse""",653
"""grandinetech/grandine""",643
"""safe-global/safe-smart-account""",636
"""erigontech/erigon""",265


In [67]:
df.group_by("package_repo").agg(pl.col("repo").n_unique()).sort("repo", descending=True)

package_repo,repo
str,u32
"""sindresorhus/globby/globby""",7
"""vercel/ms/ms""",7
"""sindresorhus/escape-string-reg…",7
"""sindresorhus/p-try/p-try""",7
"""epoberezkin/json-schema-traver…",7
"""acornjs/acorn-jsx/acorn-jsx""",7
"""sindresorhus/array-union/array…",7
"""npm/node-which/which""",7
"""lydell/js-tokens/js-tokens""",7
"""sindresorhus/parent-module/par…",7


## Baseline Model

Let's work on a model that assigns the same weight to all dependencies. Source (`repo`) weights should sum to 1.

In [68]:
df = df.with_columns(
    (1 / pl.col("package_repo").n_unique().over("repo")).alias("weight")
)

In [69]:
df.select("repo", "package_repo", "weight").sample(10)

repo,package_repo,weight
str,str,f64
"""sigp/lighthouse""","""rust-lang/futures-rs/futures-i…",0.001531
"""ethereum/remix-project""","""bendrucker/builtin-status-code…",0.00031
"""sigp/lighthouse""","""dtolnay/quote/quote""",0.001531
"""ethereumjs/ethereumjs-monorepo""","""npm/ini/ini""",0.000925
"""ethereum/remix-project""","""es-shims/function.prototype.na…",0.00031
"""grandinetech/grandine""","""frozenlib/parse-display/parse-…",0.001555
"""ethereumjs/ethereumjs-monorepo""","""moment/moment/moment""",0.000925
"""ethereum/remix-project""","""alanshaw/it-concat/it-concat""",0.00031
"""chainsafe/lodestar""","""syntax-tree/mdast-util-frontma…",0.000452
"""chainsafe/lodestar""","""facebook/docusaurus/@docusauru…",0.000452


In [70]:
df.group_by("repo").agg(pl.col("weight").sum())


repo,weight
str,f64
"""safe-global/safe-smart-account""",1.0
"""ethereumjs/ethereumjs-monorepo""",1.0
"""vyperlang/vyper""",1.0
"""paradigmxyz/reth""",1.0
"""sigp/lighthouse""",1.0
"""consensys/teku""",1.0
"""ethereum/web3.py""",1.0
"""web3/web3.js""",1.0
"""chainsafe/lodestar""",1.0
"""grandinetech/grandine""",1.0
