# Exploring the Graph of Dependencies

A small exploration of the graph of dependencies to understand the data and the structure of the graph.

## Data

In [36]:
import json
import urllib.request

# JSON flavour of the unweighted dependency graph, fetched straight from GitHub.
url = "https://raw.githubusercontent.com/deepfunding/dependency-graph/refs/heads/main/graph/unweighted_graph.json"
with urllib.request.urlopen(url) as response:
    # Parse the HTTP response body directly as JSON.
    graph_data = json.load(response)


In [37]:
import polars as pl

# Render up to 20 rows when a table is displayed.
pl.Config.set_tbl_rows(20)

# Load the dependency edge list (one row per seed repo -> package edge).
df = pl.read_csv('https://raw.githubusercontent.com/deepfunding/dependency-graph/refs/heads/main/graph/unweighted_graph.csv')
# Fixed seed so the preview is identical on every Restart & Run All.
df.sample(10, seed=42)

seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source
str,str,str,str,str,str
"""ethereum""","""remix-project""","""@remixproject/plugin-ws""","""ethereum""","""remix-plugin""","""NPM"""
"""web3""","""web3.js""","""github-build""","""siddharthkp""","""github-build""","""NPM"""
"""web3""","""web3.js""","""inflight""","""npm""","""inflight""","""NPM"""
"""safe-global""","""safe-smart-account""","""@types/concat-stream""","""definitelytyped""","""definitelytyped""","""NPM"""
"""web3""","""web3.js""","""webpack-sources""","""webpack""","""webpack-sources""","""NPM"""
"""grandinetech""","""grandine""","""hex_fmt""","""poanetwork""","""hex_fmt""","""RUST"""
"""chainsafe""","""lodestar""","""babel-loader""","""babel""","""babel-loader""","""NPM"""
"""ethereumjs""","""ethereumjs-monorepo""","""p-timeout""","""sindresorhus""","""p-timeout""","""NPM"""
"""ethereum""","""remix-project""","""p-retry""","""sindresorhus""","""p-retry""","""NPM"""
"""eth-infinitism""","""account-abstraction""","""jsesc""","""mathiasbynens""","""jsesc""","""NPM"""


In [38]:
# (rows, columns) of the edge list.
(df.height, df.width)

(13536, 6)

Each row of the CSV defines one edge of the graph. The source node is `seed_repo_owner/seed_repo_name` and the target node is `package_repo_owner/package_repo_name/package_name`. The `package_source` column records the package ecosystem the dependency comes from (e.g. `NPM`, `RUST`, `GO`, `PIP`).

In [39]:
# Dependencies of ethereum/solidity; multiple predicates are combined with AND.
df.filter(
    pl.col("seed_repo_owner") == "ethereum",
    pl.col("seed_repo_name") == "solidity",
)

seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source
str,str,str,str,str,str
"""ethereum""","""solidity""","""sphinx-a4doc""","""taminomara""","""sphinx-a4doc""","""PIP"""
"""ethereum""","""solidity""","""sphinx""","""sphinx-doc""","""sphinx""","""PIP"""
"""ethereum""","""solidity""","""sphinx-rtd-theme""","""readthedocs""","""sphinx_rtd_theme""","""PIP"""


In [40]:
# Number of edges per package ecosystem.
df.group_by("package_source").len()

package_source,len
str,u32
"""RUST""",2024
"""NPM""",10746
"""GO""",631
"""PIP""",135


In [41]:
# Build the node identifiers of the graph:
#   repo         -> "<seed owner>/<seed name>"
#   package_repo -> "<package owner>/<package repo name>/<package name>"
df = df.with_columns(
    repo=pl.concat_str(
        "seed_repo_owner",
        "seed_repo_name",
        separator="/",
    ),
    package_repo=pl.concat_str(
        "package_repo_owner",
        "package_repo_name",
        "package_name",
        separator="/",
    ),
)

In [42]:
# Distinct dependencies per seed repo, largest first.
(
    df.group_by("repo")
    .agg(pl.n_unique("package_repo"))
    .sort(by="package_repo", descending=True)
)

repo,package_repo
str,u32
"""ethereum/remix-project""",3230
"""web3/web3.js""",2389
"""chainsafe/lodestar""",2213
"""ethereumjs/ethereumjs-monorepo""",1081
"""eth-infinitism/account-abstrac…",1048
"""paradigmxyz/reth""",728
"""sigp/lighthouse""",653
"""grandinetech/grandine""",643
"""safe-global/safe-smart-account""",636
"""erigontech/erigon""",265


In [43]:
# Distinct seed repos depending on each package, most shared first.
(
    df.group_by("package_repo")
    .agg(pl.n_unique("repo"))
    .sort(by="repo", descending=True)
)

package_repo,repo
str,u32
"""chalk/chalk/chalk""",7
"""sindresorhus/locate-path/locat…",7
"""alexei/sprintf.js/sprintf-js""",7
"""isaacs/node-graceful-fs/gracef…",7
"""mathiasbynens/punycode.js/puny…",7
"""sindresorhus/p-locate/p-locate""",7
"""isaacs/inherits/inherits""",7
"""sindresorhus/callsites/callsit…",7
"""epoberezkin/fast-deep-equal/fa…",7
"""tj/commander.js/commander""",7


In [46]:
# Number of distinct target nodes (packages).
df.n_unique(subset="package_repo")

6211

In [47]:
# Number of distinct source nodes (seed repos).
df.n_unique(subset="repo")

18

In [49]:
# Number of distinct (owner, name) pairs among the package repositories.
df.n_unique(subset=["package_repo_owner", "package_repo_name"])

4289

## Baseline Model

Let's work on a model that assigns the same weight to all dependencies of a repository. For each source (`repo`), the weights of its dependencies should sum to 1.

In [17]:
# Uniform baseline: each dependency of a repo gets 1 / (number of distinct
# dependencies of that repo).
df = df.with_columns(
    weight=pl.lit(1) / pl.col("package_repo").n_unique().over("repo"),
)

In [18]:
# Preview a few weighted edges; fixed seed keeps the preview reproducible.
df.select("repo", "package_repo", "weight").sample(10, seed=42)

repo,package_repo,weight
str,str,f64
"""ethereumjs/ethereumjs-monorepo""","""paulmillr/scure-bip39/@scure/b…",0.000925
"""ethereum/remix-project""","""isaacs/json-stringify-safe/jso…",0.00031
"""ethereum/remix-project""","""mattgperry/motion/motion""",0.00031
"""ethereum/remix-project""","""babel/babel/@babel/helper-func…",0.00031
"""grandinetech/grandine""","""nix-rust/nix/nix""",0.001555
"""chainsafe/lodestar""","""tootallnate/proxy-agents/degen…",0.000452
"""eth-infinitism/account-abstrac…","""cryptocoinjs/secp256k1-node/se…",0.000954
"""web3/web3.js""","""kitten/use-editable/use-editab…",0.000419
"""ethereum/remix-project""","""troygoode/node-require-directo…",0.00031
"""paradigmxyz/reth""","""sagebind/castaway/castaway""",0.001374


In [19]:
# Sanity check: the weights of every repo should add up to 1.
df.group_by("repo").agg(pl.sum("weight"))

repo,weight
str,f64
"""sigp/lighthouse""",1.0
"""status-im/nimbus-eth2""",1.0
"""chainsafe/lodestar""",1.0
"""web3/web3.js""",1.0
"""ethereum/py-evm""",1.0
"""eth-infinitism/account-abstrac…",1.0
"""consensys/teku""",1.0
"""ethereumjs/ethereumjs-monorepo""",1.0
"""paradigmxyz/reth""",1.0
"""grandinetech/grandine""",1.0


## Heuristic Model

Now, let's assign weights based on external data. In this case, we will use the number of GitHub stars of each package's repository. The data can be obtained from the GitHub API, [GitHub Archive](https://www.gharchive.org/), or, my preferred way, the [Open Source Observer public datasets on BigQuery](https://www.opensource.observer/).

In [20]:
import polars as pl
from google.cloud import bigquery

# BigQuery client built with no explicit arguments, so project and credentials
# are expected to come from the environment (NOTE(review): confirm the setup).
client = bigquery.Client()

In [22]:
# Quoted, comma-separated list of GitHub URLs for the package repositories,
# ready to be placed inside a SQL `in (...)` clause.
# Many rows share the same owner/name pair, so the URLs are deduplicated first
# to keep the generated SQL as small as possible.
# NOTE: despite its name this is a one-element polars Series; use
# `package_list_string[0]` to get the actual string.
package_list_string = "'" + df.select(
    pl.concat_str(pl.lit("https://github.com/"), pl.col("package_repo_owner"), pl.lit("/"), pl.col("package_repo_name"))
).unique(maintain_order=True).to_series().str.join("', '") + "'"

In [52]:
# Distinct GitHub URLs of the package repositories to look up in OSO.
package_urls = (
    df.select(
        pl.concat_str(
            pl.lit("https://github.com/"),
            pl.col("package_repo_owner"),
            pl.lit("/"),
            pl.col("package_repo_name"),
        )
    )
    .to_series()
    .drop_nulls()
    .unique()
    .to_list()
)

# The URLs are passed as an array query parameter instead of being pasted into
# the SQL text: values coming from the CSV cannot break the query (quotes) and
# the query text stays small.
QUERY = """
select
  artifact_url as url,
  is_fork,
  star_count,
  fork_count,
  created_at,
  updated_at
from `opensource-observer.oso.repositories_v0`
where artifact_url in unnest(@package_urls)
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("package_urls", "STRING", package_urls),
    ]
)

query_job = client.query(QUERY, job_config=job_config)
rows = query_job.result()

# Convert the result set to polars through Arrow.
oso_df = pl.from_arrow(rows.to_arrow())
oso_df.head()

url,is_fork,star_count,fork_count,created_at,updated_at
str,bool,i64,i64,"datetime[μs, UTC]","datetime[μs, UTC]"
"""https://github.com/npm/agent""",False,7,5,2022-06-30 17:47:07 UTC,2024-09-06 23:49:51 UTC
"""https://github.com/npm/securit…",False,157,58,2016-03-23 00:22:33 UTC,2024-12-11 17:50:51 UTC
"""https://github.com/jsdom/domex…",False,21,8,2017-08-14 03:04:01 UTC,2024-05-11 21:05:28 UTC
"""https://github.com/jsdom/abab""",False,93,19,2015-08-29 03:21:16 UTC,2024-05-17 16:43:47 UTC
"""https://github.com/herumi/mcl-…",False,59,18,2017-11-20 06:46:44 UTC,2024-09-18 03:05:14 UTC


In [53]:
# (rows, columns) of the repository metadata returned by OSO.
(oso_df.height, oso_df.width)

(1439, 6)