In [1]:
from google.cloud import bigquery
import os
import pandas as pd
import re

In [2]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the credentials JSON file before
# the BigQuery client is constructed, then bind the client to the project.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../../oso_gcp_credentials.json'})
GCP_PROJECT = 'opensource-observer'
client = bigquery.Client(project=GCP_PROJECT)

In [3]:
# Helpers

def stringify_array(arr):
    """Render an iterable of strings as a comma-separated list of SQL string literals.

    The result is meant to be spliced inside ``IN (...)`` in a query template,
    e.g. ``['a', 'b']`` becomes ``'a','b'``.

    Backslashes and single quotes inside a value are backslash-escaped so that
    a name containing ``'`` cannot terminate the literal early and corrupt
    (or inject into) the surrounding SQL.

    Args:
        arr: Iterable of ``str`` values (list, tuple, numpy array, ...).

    Returns:
        str: The quoted, comma-joined literals. An empty iterable yields ``''``
        (a single empty literal) so that ``IN (...)`` stays syntactically valid.

    Raises:
        TypeError: If any element is not a string.
    """
    quoted = []
    for value in arr:
        if not isinstance(value, str):
            raise TypeError(
                f"stringify_array expects str items, got {type(value).__name__}"
            )
        # Escape the backslash first so the escapes added for quotes are not doubled.
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        quoted.append(f"'{escaped}'")
    if not quoted:
        return "''"
    return ",".join(quoted)

def parse_github(pkg_url):
    """Extract a lower-cased ``owner/repo`` slug from anything containing a GitHub URL.

    Works on full URLs (``https://github.com/Owner/Repo``), VCS-style URLs
    (``git+https://github.com/owner/repo.git``) and Go import paths
    (``github.com/owner/repo/v2``). Path segments after the repo, as well as
    ``#fragment`` / ``?query`` suffixes, are ignored.

    Repository names may legitimately contain dots (``web3.js``, ``socket.io``),
    so the repo segment is captured in full and only a trailing ``.git`` suffix
    (and any dangling trailing dots) is removed.

    Args:
        pkg_url: The URL or import path; non-string values (None, NaN) are tolerated.

    Returns:
        str | None: ``"owner/repo"`` in lower case, or None when the input is
        not a string or contains no ``github.com/<owner>/<repo>`` pattern.
    """
    if not isinstance(pkg_url, str):
        return None

    match = re.search(r'github\.com/([^/]+)/([^/#?\s]+)', pkg_url)
    if not match:
        return None

    owner = match.group(1).lower()
    repo = match.group(2).lower()
    # Clone URLs end in ".git"; that suffix is not part of the repository name.
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    # Drop sentence punctuation that may trail a URL embedded in free text.
    repo = repo.rstrip('.')
    if not repo:
        return None
    return '/'.join([owner, repo])

def extract_namespace(npm_package_name):
    """Return the scope of a scoped NPM package name, without the leading ``@``.

    ``"@babel/core"`` yields ``"babel"``; unscoped names such as ``"lodash"``
    yield None. Non-string inputs (None, NaN from a pandas column) also yield
    None instead of raising, mirroring how ``parse_github`` treats them.

    Args:
        npm_package_name: The package name as it appears in the registry.

    Returns:
        str | None: The scope name, or None when there is no scope.
    """
    if not isinstance(npm_package_name, str):
        return None
    if npm_package_name.startswith('@'):
        # Everything before the first "/" is the scope; [1:] drops the "@".
        return npm_package_name.split('/')[0][1:]
    return None

# Get SBOMs for repos we care about

In [4]:
# Seed repositories as lower-case `owner/repo` slugs, split into the two
# client groups used throughout the notebook.
CONSENSUS = [
    'prysmaticlabs/prysm', 'sigp/lighthouse', 'consensys/teku',
    'status-im/nimbus-eth2', 'chainsafe/lodestar', 'grandinetech/grandine',
]
EXECUTION = [
    'ethereum/go-ethereum', 'nethermindeth/nethermind', 'hyperledger/besu',
    'erigontech/erigon', 'paradigmxyz/reth',
]
# Consensus entries first, then execution entries.
SEED_REPOS = [*CONSENSUS, *EXECUTION]

In [5]:
# Fetch the distinct (repo, package, source) rows from the SBOM table for the
# seed repos plus every repo in the `ethereum` namespace, restricted to the
# NPM / RUST / GO / PIP package sources.
sbom_sql = f"""
    WITH sboms AS (
      SELECT DISTINCT
        artifact_namespace AS repo_owner,
        CONCAT(artifact_namespace, '/', artifact_name) AS repo_name,
        package_artifact_name,
        package_artifact_source
      FROM `oso.int_sbom_artifacts`
      WHERE package_artifact_source IN ('NPM', 'RUST', 'GO', 'PIP')
    )
    SELECT * FROM sboms
    WHERE
      repo_name IN ({stringify_array(SEED_REPOS)})
      OR repo_owner = 'ethereum' -- also include anything in ethereum namespace
"""
sbom_results = client.query(sbom_sql)
df_sbom = sbom_results.to_dataframe()

# Keep a CSV snapshot of the raw query result, then preview the last rows.
df_sbom.to_csv('data/sbom.csv')
df_sbom.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source
40202,ethereum,ethereum/remix-plugin,lodash.set,NPM
40203,ethereum,ethereum/remix-plugin,extend,NPM
40204,ethereum,ethereum/remix-plugin,is-generator-fn,NPM
40205,ethereum,ethereum/remix-plugin,normalize-package-data,NPM
40206,ethereum,ethereum/ethash,pyethereum,PIP


## Lookup Go packages

In [6]:
# Go import paths often embed the GitHub location, so the repo is parsed
# straight out of the package name; non-GitHub paths end up as None.
is_go = df_sbom['package_artifact_source'] == 'GO'
df_go = df_sbom.loc[is_go].copy()
df_go['package_repo_name'] = df_go['package_artifact_name'].apply(parse_github)
df_go['likely_package_repo_owner'] = df_go['package_repo_name'].apply(
    lambda repo: repo.partition('/')[0] if isinstance(repo, str) else None
)
df_go.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
40043,ethereum,ethereum/portal-hive,github.com/prometheus/tsdb,GO,prometheus/tsdb,prometheus
40045,ethereum,ethereum/portal-hive,github.com/gorilla/mux,GO,gorilla/mux,gorilla
40046,ethereum,ethereum/portal-hive,gopkg.in/inconshreveable/log15.v2,GO,,
40047,ethereum,ethereum/portal-hive,github.com/microsoft/go-winio,GO,microsoft/go-winio,microsoft
40048,ethereum,ethereum/portal-hive,github.com/victoriametrics/fastcache,GO,victoriametrics/fastcache,victoriametrics


## Lookup Python packages

In [7]:
# Slice out the PIP rows and collect the distinct, non-null package names
# that need to be resolved to repositories.
is_pip = df_sbom['package_artifact_source'] == 'PIP'
df_py = df_sbom.loc[is_pip].copy()
py_pkgs_all = df_py['package_artifact_name'].dropna().unique()

In [8]:
# Resolve PyPI package names to GitHub owner/repo pairs using the public PyPI
# metadata table. Two sources are unioned: GitHub links found in
# `project_urls` and GitHub links found in `home_page`.
#
# The template is a *raw* f-string: the SQL regexes contain `\.`, which is an
# invalid escape sequence in a normal Python string literal (it triggers a
# DeprecationWarning/SyntaxWarning). The SQL text sent to BigQuery is unchanged.
py_pkg_lookup = client.query(rf"""
    WITH github_data AS (
      SELECT DISTINCT
        LOWER(name) AS package_name,
        LOWER(REGEXP_EXTRACT(url, r'github\.com/([^/]+)/')) AS github_owner,
        LOWER(REGEXP_EXTRACT(url, r'github\.com/[^/]+/([^/.]+)')) AS github_repo
      FROM `bigquery-public-data.pypi.distribution_metadata`,
      UNNEST(project_urls) AS url
      WHERE url LIKE '%github.com/%'

      UNION ALL

      SELECT DISTINCT
        LOWER(name) AS package_name,
        LOWER(REGEXP_EXTRACT(home_page, r'github\.com/([^/]+)/')) AS github_owner,
        LOWER(REGEXP_EXTRACT(home_page, r'github\.com/[^/]+/([^/.]+)')) AS github_repo
      FROM `bigquery-public-data.pypi.distribution_metadata`
      WHERE home_page LIKE '%github.com/%'
    )
    
    SELECT DISTINCT
      package_name,
      github_owner,
      github_repo
    FROM github_data
    WHERE package_name IN ({stringify_array(py_pkgs_all)})
""")
df_py_pkg = py_pkg_lookup.to_dataframe()

In [9]:
def _py_owner_repo(x):
    # Join owner and repo into an `owner/repo` slug; None when either part is missing.
    if x['github_owner'] and x['github_repo']:
        return f"{x['github_owner']}/{x['github_repo']}".strip("'")
    return None


df_py_pkg['package_repo_name'] = df_py_pkg.apply(_py_owner_repo, axis=1)

# package name -> repo slug, built from the rows where a repo was resolved.
py_name_repo_pairs = (
    df_py_pkg[['package_name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
)
py_mapper = py_name_repo_pairs.set_index('package_name')['package_repo_name'].to_dict()

df_py['package_repo_name'] = df_py['package_artifact_name'].map(py_mapper)
df_py['likely_package_repo_owner'] = df_py['package_repo_name'].apply(
    lambda repo: repo.partition('/')[0] if isinstance(repo, str) else None
)

df_py.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
40010,ethereum,ethereum/abm1559,argon2-cffi,PIP,hynek/argon2_cffi,hynek
40011,ethereum,ethereum/abm1559,prompt-toolkit,PIP,prompt-toolkit/python-prompt-toolkit,prompt-toolkit
40012,ethereum,ethereum/abm1559,nest-asyncio,PIP,erdewit/nest_asyncio,erdewit
40013,ethereum,ethereum/abm1559,pyparsing,PIP,pyparsing/pyparsing,pyparsing
40206,ethereum,ethereum/ethash,pyethereum,PIP,,


## Lookup Rust packages

In [10]:
# Slice out the RUST rows and collect the distinct, non-null crate names.
is_rust = df_sbom['package_artifact_source'] == 'RUST'
df_rust = df_sbom.loc[is_rust].copy()
rust_pkgs_all = df_rust['package_artifact_name'].dropna().unique()

In [11]:
# Look up the declared repository URL for each crate; crates without a
# repository value are excluded by the query itself.
rust_sql = f"""
    select distinct
      name,
      lower(repository) as repository
    from `crates.crates`
    where
        name in ({stringify_array(rust_pkgs_all)})
        and repository is not null
"""
rust_pkg_lookup = client.query(rust_sql)
rust_py_pkg = rust_pkg_lookup.to_dataframe()

# Reduce each repository URL to an `owner/repo` slug (None if not on GitHub).
rust_py_pkg['package_repo_name'] = rust_py_pkg['repository'].apply(parse_github)
rust_py_pkg.tail(5)

Unnamed: 0,name,repository,package_repo_name
1306,crossbeam-channel,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1307,crossbeam-skiplist,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1308,opentelemetry-otlp,https://github.com/open-telemetry/opentelemetr...,open-telemetry/opentelemetry-rust
1309,opentelemetry-proto,https://github.com/open-telemetry/opentelemetr...,open-telemetry/opentelemetry-rust
1310,opentelemetry-semantic-conventions,https://github.com/open-telemetry/opentelemetr...,open-telemetry/opentelemetry-rust


In [12]:
# Re-slice the RUST rows so this cell starts from a clean frame on every run.
is_rust = df_sbom['package_artifact_source'] == 'RUST'
df_rust = df_sbom.loc[is_rust].copy()

# crate name -> repo slug, built from the rows where a GitHub repo was parsed.
crate_repo_pairs = (
    rust_py_pkg[['name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
)
crates_mapper = crate_repo_pairs.set_index('name')['package_repo_name'].to_dict()

df_rust['package_repo_name'] = df_rust['package_artifact_name'].map(crates_mapper)
df_rust['likely_package_repo_owner'] = df_rust['package_repo_name'].apply(
    lambda repo: repo.partition('/')[0] if isinstance(repo, str) else None
)

df_rust.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
40040,ethereum,ethereum/trin,iri-string,RUST,lo48576/iri-string,lo48576
40041,ethereum,ethereum/trin,zstd-sys,RUST,gyscos/zstd-rs,gyscos
40108,ethereum,ethereum/eipv,percent-encoding,RUST,servo/rust-url,servo
40109,ethereum,ethereum/eipv,unicode-normalization,RUST,unicode-rs/unicode-normalization,unicode-rs
40110,ethereum,ethereum/eipv,autocfg,RUST,cuviper/autocfg,cuviper


## Lookup NPM packages

In [13]:
# Slice out the NPM rows and collect the distinct package names to resolve.
df_npm = df_sbom[df_sbom['package_artifact_source'] == 'NPM'].copy()
# Drop missing names first (as the PIP and RUST cells do): a NaN among the
# names makes sorted() raise TypeError, and a None would later fail when the
# names are joined into the SQL IN (...) list.
npm_pkgs_all = sorted(df_npm['package_artifact_name'].dropna().unique())
len(npm_pkgs_all)

6839

In [14]:
# Create an NPM registry (package name -> first GitHub link) from deps.dev.
# This cell only defines the SQL text; nothing in this cell executes it.
#
# The literal is a raw string because the SQL regex contains `\.`, which is an
# invalid escape sequence in a normal Python string literal (it triggers a
# DeprecationWarning/SyntaxWarning). The resulting string value is unchanged.
q = r"""
SELECT
  Name as package_name,
  (SELECT lower(URL) 
   FROM UNNEST(Links) 
   WHERE REGEXP_CONTAINS(URL, r'github\.com')
   LIMIT 1) AS package_github_url
FROM `bigquery-public-data.deps_dev_v1.PackageVersionsLatest`
WHERE
  System = 'NPM'
  AND SnapshotAt >= '2024-11-01'
  AND ARRAY_LENGTH(Links) > 0
"""

# Save the result of this query to static_data_sources.npm_registry
# (the table the NPM lookup query reads from).

In [15]:
# Pull the registry rows (package name + GitHub URL) for the NPM packages
# that appear in the SBOM data.
npm_sql = f"""
    SELECT DISTINCT
      package_name,
      package_github_url
    FROM `static_data_sources.npm_registry`
    WHERE package_name in ({stringify_array(npm_pkgs_all)})
"""
npm_pkg_lookup = client.query(npm_sql)
df_npm_pkg = npm_pkg_lookup.to_dataframe()

In [16]:
# Reduce each registry URL to an `owner/repo` slug, then build the
# package name -> repo slug lookup from the rows that resolved.
df_npm_pkg['package_repo_name'] = df_npm_pkg['package_github_url'].apply(parse_github)
npm_name_repo_pairs = (
    df_npm_pkg[['package_name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
)
npm_mapper = npm_name_repo_pairs.set_index('package_name')['package_repo_name'].to_dict()
df_npm['package_repo_name'] = df_npm['package_artifact_name'].map(npm_mapper)

# For scoped packages (@scope/name), remember which GitHub owner a scope maps
# to; when several packages share a scope, the last one seen wins.
namespace_to_owner = {}
for package_name, repo_name in npm_mapper.items():
    namespace = extract_namespace(package_name)
    if not (namespace and repo_name):
        continue
    namespace_to_owner[namespace] = repo_name.split('/')[0]

# First guess the owner from the package's scope, then overwrite that guess
# with the owner of the resolved repo wherever one exists.
package_scopes = df_npm['package_artifact_name'].apply(extract_namespace)
df_npm['likely_package_repo_owner'] = package_scopes.map(namespace_to_owner)
has_repo = df_npm['package_repo_name'].notna()
resolved_owner = df_npm['package_repo_name'].str.split('/').str[0]
df_npm.loc[has_repo, 'likely_package_repo_owner'] = resolved_owner
df_npm.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
40201,ethereum,ethereum/remix-plugin,y18n,NPM,yargs/y18n,yargs
40202,ethereum,ethereum/remix-plugin,lodash.set,NPM,lodash/lodash,lodash
40203,ethereum,ethereum/remix-plugin,extend,NPM,justmoon/node-extend,justmoon
40204,ethereum,ethereum/remix-plugin,is-generator-fn,NPM,sindresorhus/is-generator-fn,sindresorhus
40205,ethereum,ethereum/remix-plugin,normalize-package-data,NPM,meryn/read-package-data,meryn


# Consolidate back into a single graph

In [17]:
# Stack the four per-ecosystem frames, blank out missing values and switch to
# the final column names in a single chain.
df = (
    pd.concat([df_npm, df_go, df_rust, df_py], axis=0, ignore_index=True)
    .fillna('')
    .rename(columns={
        'package_artifact_name': 'package_name',
        'package_artifact_source': 'package_source',
        'likely_package_repo_owner': 'package_repo_owner',
    })
)

# Seed repos keep their own slug; every other repo is bucketed as 'ethereum/*'.
df['seed_project'] = df['repo_name'].apply(
    lambda repo: 'ethereum/*' if repo not in SEED_REPOS else repo
)

output_columns = [
    'seed_project', 'repo_owner', 'repo_name',
    'package_name', 'package_repo_owner', 'package_repo_name', 'package_source',
]
df = df[output_columns]
df

Unnamed: 0,seed_project,repo_owner,repo_name,package_name,package_repo_owner,package_repo_name,package_source
0,ethereum/*,ethereum,ethereum/ethdev-site,jshint-stylish,sindresorhus,sindresorhus/jshint-stylish,NPM
1,ethereum/*,ethereum,ethereum/ethdev-site,gulp-watch,floatdrop,floatdrop/gulp-watch,NPM
2,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,underscore,jashkenas,jashkenas/underscore,NPM
3,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,pseudomap,isaacs,isaacs/pseudomap,NPM
4,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,chalk,sindresorhus,sindresorhus/chalk,NPM
...,...,...,...,...,...,...,...
40202,ethereum/*,ethereum,ethereum/abm1559,argon2-cffi,hynek,hynek/argon2_cffi,PIP
40203,ethereum/*,ethereum,ethereum/abm1559,prompt-toolkit,prompt-toolkit,prompt-toolkit/python-prompt-toolkit,PIP
40204,ethereum/*,ethereum,ethereum/abm1559,nest-asyncio,erdewit,erdewit/nest_asyncio,PIP
40205,ethereum/*,ethereum,ethereum/abm1559,pyparsing,pyparsing,pyparsing/pyparsing,PIP


In [18]:
# Distinct package repo owners (the blank placeholder for unresolved owners
# is included, since missing values were filled with '').
owners = [*df['package_repo_owner'].unique()]
len(owners)

2500

In [19]:
# Ask OSO which of the package repo owners appear as an artifact namespace.
owner_lookup = client.query(f"""
    SELECT
      artifact_namespace as owner,
      COUNT(distinct artifact_id) AS oso_artifacts_owned
    FROM `oso.artifacts_v1`
    WHERE artifact_namespace in ({stringify_array(owners)})
    GROUP BY 1
""")

owners_on_oso = [row['owner'] for row in owner_lookup]
# Vectorized membership test: the previous per-row `x in owners_on_oso` scanned
# the whole list for every row of the frame.
df['package_repo_owner_is_on_oso'] = df['package_repo_owner'].isin(owners_on_oso)
# Share of dependency rows whose package owner is known to OSO.
df['package_repo_owner_is_on_oso'].mean()

0.7197005496555327

In [20]:
# Write the consolidated dependency table to CSV (the index is written as the first column).
df.to_csv('data/unweighted_graph.csv')

# Create a network graph

In [21]:
import json
import networkx as nx

In [22]:
# Keep only the rows where a package repo owner was resolved ('' marks a miss).
has_owner = df['package_repo_owner'] != ''
dff = df.loc[has_owner].copy()

In [23]:
gh = 'https://github.com/'


def _target_url(x):
    # Prefer the resolved repo URL; fall back to the owner's profile URL
    # when only the owner is known.
    if x['package_repo_name'] != '':
        return f"{gh}{x['package_repo_name']}"
    return f"{gh}{x['package_repo_owner']}"


# Edge endpoints as full GitHub URLs: the depending repo and its dependency.
dff['source'] = [f'{gh}{x}' for x in dff['repo_name']]
dff['target'] = dff.apply(_target_url, axis=1)
dff

Unnamed: 0,seed_project,repo_owner,repo_name,package_name,package_repo_owner,package_repo_name,package_source,package_repo_owner_is_on_oso,source,target
0,ethereum/*,ethereum,ethereum/ethdev-site,jshint-stylish,sindresorhus,sindresorhus/jshint-stylish,NPM,True,https://github.com/ethereum/ethdev-site,https://github.com/sindresorhus/jshint-stylish
1,ethereum/*,ethereum,ethereum/ethdev-site,gulp-watch,floatdrop,floatdrop/gulp-watch,NPM,True,https://github.com/ethereum/ethdev-site,https://github.com/floatdrop/gulp-watch
2,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,underscore,jashkenas,jashkenas/underscore,NPM,True,https://github.com/ethereum/meteor-dapp-wallet,https://github.com/jashkenas/underscore
3,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,pseudomap,isaacs,isaacs/pseudomap,NPM,True,https://github.com/ethereum/meteor-dapp-wallet,https://github.com/isaacs/pseudomap
4,ethereum/*,ethereum,ethereum/meteor-dapp-wallet,chalk,sindresorhus,sindresorhus/chalk,NPM,True,https://github.com/ethereum/meteor-dapp-wallet,https://github.com/sindresorhus/chalk
...,...,...,...,...,...,...,...,...,...,...
40201,ethereum/*,ethereum,ethereum/abm1559,jsonschema,julian,julian/jsonschema,PIP,True,https://github.com/ethereum/abm1559,https://github.com/julian/jsonschema
40202,ethereum/*,ethereum,ethereum/abm1559,argon2-cffi,hynek,hynek/argon2_cffi,PIP,True,https://github.com/ethereum/abm1559,https://github.com/hynek/argon2_cffi
40203,ethereum/*,ethereum,ethereum/abm1559,prompt-toolkit,prompt-toolkit,prompt-toolkit/python-prompt-toolkit,PIP,False,https://github.com/ethereum/abm1559,https://github.com/prompt-toolkit/python-promp...
40204,ethereum/*,ethereum,ethereum/abm1559,nest-asyncio,erdewit,erdewit/nest_asyncio,PIP,True,https://github.com/ethereum/abm1559,https://github.com/erdewit/nest_asyncio


In [25]:
G = nx.DiGraph()

# Level-1 nodes: every repo bucketed under the 'ethereum/*' seed project.
ethereum_bucket = dff[dff['seed_project'] == 'ethereum/*']
for repo in ethereum_bucket['repo_name'].unique():
    G.add_node(f"{gh}{repo}", level=1)

# Level-2 nodes: the seed client repos, each linked from the `ethereum` root URL.
root_url = f"{gh}ethereum"
for repo in CONSENSUS + EXECUTION:
    repo_url = f"{gh}{repo}"
    G.add_node(repo_url, level=2)
    G.add_edge(root_url, repo_url, relation="client_dependency")

# One edge per dependency row, in row order. Packages of 'ethereum/*' repos are
# always (re)tagged level 2; packages of seed clients become level 3 only if
# the node does not exist yet.
dependency_rows = zip(dff['seed_project'], dff['source'], dff['target'])
for seed_project, source_url, package_repo_url in dependency_rows:
    if seed_project == 'ethereum/*':
        G.add_node(package_repo_url, level=2)
        relation = "package_dependency"
    else:
        if package_repo_url not in G.nodes:
            G.add_node(package_repo_url, level=3)
        relation = "client_package_dependency"
    G.add_edge(source_url, package_repo_url, relation=relation)

# Give every edge the same weight so that the weights sum to 1.
total_edges = G.number_of_edges()
global_weight = 1.0 / total_edges
for _, _, edge_attrs in G.edges(data=True):
    edge_attrs['weight'] = global_weight

total_weight = sum(data['weight'] for _, _, data in G.edges(data=True))
print(f"Total Weight of All Edges: {total_weight:.5f}")

# Serialize the graph in node-link form and write it out as JSON.
graph_json = nx.node_link_data(G)
output_path = "data/unweighted_graph.json"
with open(output_path, "w") as f:
    json.dump(graph_json, f, indent=2)

Total Weight of All Edges: 1.00000
