In [1]:
from google.cloud import bigquery
import os
import pandas as pd
import re

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../oso_gcp_credentials.json'
GCP_PROJECT = 'opensource-observer'
client = bigquery.Client(GCP_PROJECT)

In [3]:
# Helpers

def stringify_array(arr):
    return "'" + "','".join(arr) + "'"

def parse_github(pkg_url):
    if isinstance(pkg_url, str):
        match = re.search(r'github\.com/([^/]+)/([^/.#]+)', pkg_url)
        if match:
            owner = match.group(1).lower()
            repo = match.group(2).lower()
            return '/'.join([owner, repo])    
    return None

def extract_namespace(npm_package_name):
    if npm_package_name.startswith('@'):
        return npm_package_name.split('/')[0][1:]
    return None

# Get SBOMs for repos we care about

In [4]:
CONSENSUS = [
    'prysmaticlabs/prysm',
    'sigp/lighthouse',
    'consensys/teku',
    'status-im/nimbus-eth2',
    'chainsafe/lodestar',
    'grandinetech/grandine'
]
EXECUTION = [
    'ethereum/go-ethereum',
    'nethermindeth/nethermind',
    'hyperledger/besu',
    'erigontech/erigon',
    'paradigmxyz/reth'
]
OTHER = [
    'ethereum/solidity',
    'ethereum/remix-project',
    'vyperlang/vyper',
    'ethereum/web3.py',
    'ethereum/py-evm',
    'eth-infinitism/account-abstraction',
    'safe-global/safe-smart-account',
    'a16z/helios',
    'web3/web3.js', # prev. 'ethereum/web3.js',
    'ethereumjs/ethereumjs-monorepo'    
]
SEED_REPOS = CONSENSUS + EXECUTION + OTHER
PACKAGE_SERVERS = ['NPM', 'RUST', 'GO', 'PIP']

In [5]:
print(SEED_REPOS)

['prysmaticlabs/prysm', 'sigp/lighthouse', 'consensys/teku', 'status-im/nimbus-eth2', 'chainsafe/lodestar', 'grandinetech/grandine', 'ethereum/go-ethereum', 'nethermindeth/nethermind', 'hyperledger/besu', 'erigontech/erigon', 'paradigmxyz/reth', 'ethereum/solidity', 'ethereum/remix-project', 'vyperlang/vyper', 'ethereum/web3.py', 'ethereum/py-evm', 'eth-infinitism/account-abstraction', 'safe-global/safe-smart-account', 'a16z/helios', 'web3/web3.js', 'ethereumjs/ethereumjs-monorepo']


In [6]:
sbom_results = client.query(f"""
    WITH sboms AS (
      SELECT DISTINCT
        artifact_namespace AS repo_owner,
        CONCAT(artifact_namespace, '/', artifact_name) AS repo_name,
        package_artifact_name,
        package_artifact_source
      FROM `oso.int_sbom_artifacts`
      WHERE package_artifact_source IN ({stringify_array(PACKAGE_SERVERS)})
    )
    SELECT * FROM sboms
    WHERE repo_name IN ({stringify_array(SEED_REPOS)})
""")
df_sbom = sbom_results.to_dataframe()
df_sbom.to_csv('data/sbom.csv')
df_sbom.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source
13801,ethereum,ethereum/go-ethereum,github.com/cespare/cp,GO
13802,ethereum,ethereum/go-ethereum,github.com/protolambda/ztyp,GO
13803,ethereum,ethereum/go-ethereum,github.com/azure/azure-sdk-for-go/sdk/internal,GO
13804,ethereum,ethereum/go-ethereum,go.uber.org/automaxprocs,GO
13805,ethereum,ethereum/go-ethereum,github.com/aws/aws-sdk-go-v2/internal/ini,GO


In [7]:
df_sbom.groupby('package_artifact_source')['package_artifact_name'].nunique()

package_artifact_source
GO       416
NPM     4750
PIP      137
RUST    1076
Name: package_artifact_name, dtype: int64

In [8]:
df_sbom.groupby('package_artifact_source')['package_artifact_name'].nunique().sum()

6379

## Lookup Go packages

In [9]:
df_go = df_sbom[df_sbom['package_artifact_source'] == 'GO'].copy()
df_go['package_repo_name'] = df_go['package_artifact_name'].apply(parse_github)
df_go['likely_package_repo_owner'] = df_go['package_repo_name'].apply(
    lambda x: x.split('/')[0] if isinstance(x, str) else None
)
df_go.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
13801,ethereum,ethereum/go-ethereum,github.com/cespare/cp,GO,cespare/cp,cespare
13802,ethereum,ethereum/go-ethereum,github.com/protolambda/ztyp,GO,protolambda/ztyp,protolambda
13803,ethereum,ethereum/go-ethereum,github.com/azure/azure-sdk-for-go/sdk/internal,GO,azure/azure-sdk-for-go,azure
13804,ethereum,ethereum/go-ethereum,go.uber.org/automaxprocs,GO,,
13805,ethereum,ethereum/go-ethereum,github.com/aws/aws-sdk-go-v2/internal/ini,GO,aws/aws-sdk-go-v2,aws


## Lookup Python packages

In [10]:
df_py = df_sbom[df_sbom['package_artifact_source'] == 'PIP'].copy()
py_pkgs_all = df_py['package_artifact_name'].dropna().unique()

In [11]:
py_pkg_lookup = client.query(f"""
    WITH github_data AS (
      SELECT DISTINCT
        LOWER(name) AS package_name,
        LOWER(REGEXP_EXTRACT(url, r'github\.com/([^/]+)/')) AS github_owner,
        LOWER(REGEXP_EXTRACT(url, r'github\.com/[^/]+/([^/.]+)')) AS github_repo
      FROM `bigquery-public-data.pypi.distribution_metadata`,
      UNNEST(project_urls) AS url
      WHERE url LIKE '%github.com/%'

      UNION ALL

      SELECT DISTINCT
        LOWER(name) AS package_name,
        LOWER(REGEXP_EXTRACT(home_page, r'github\.com/([^/]+)/')) AS github_owner,
        LOWER(REGEXP_EXTRACT(home_page, r'github\.com/[^/]+/([^/.]+)')) AS github_repo
      FROM `bigquery-public-data.pypi.distribution_metadata`
      WHERE home_page LIKE '%github.com/%'
    )
    
    SELECT DISTINCT
      package_name,
      github_owner,
      github_repo
    FROM github_data
    WHERE package_name IN ({stringify_array(py_pkgs_all)})
""")
df_py_pkg = py_pkg_lookup.to_dataframe()

In [12]:
df_py_pkg['package_repo_name'] = df_py_pkg.apply(
    lambda x: f"{x['github_owner']}/{x['github_repo']}".strip("'")
              if x['github_owner'] and x['github_repo']
              else None,
    axis=1
)
py_mapper = (
    df_py_pkg[['package_name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
    .set_index('package_name')['package_repo_name']
    .to_dict()
)
df_py['package_repo_name'] = df_py['package_artifact_name'].map(py_mapper)
df_py['likely_package_repo_owner'] = df_py['package_repo_name'].apply(
    lambda x: x.split('/')[0] if isinstance(x, str) else None
)

df_py.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
13280,ethereum,ethereum/web3.py,types-requests,PIP,python/typeshed,python
13421,status-im,status-im/nimbus-eth2,pep517,PIP,pypa/pep517,pypa
13422,status-im,status-im/nimbus-eth2,pip-tools,PIP,nvie/pip-tools,nvie
13423,status-im,status-im/nimbus-eth2,debugpy,PIP,microsoft/debugpy,microsoft
13432,safe-global,safe-global/safe-smart-account,certora-cli,PIP,certora/certoracli,certora


## Lookup Rust

In [13]:
df_rust = df_sbom[df_sbom['package_artifact_source'] == 'RUST'].copy()
rust_pkgs_all = df_rust['package_artifact_name'].dropna().unique()

In [14]:
rust_pkg_lookup = client.query(f"""
    select distinct
      name,
      lower(repository) as repository
    from `crates.crates`
    where
        name in ({stringify_array(rust_pkgs_all)})
        and repository is not null
""")
rust_py_pkg = rust_pkg_lookup.to_dataframe()
rust_py_pkg['package_repo_name'] = rust_py_pkg['repository'].apply(parse_github)
rust_py_pkg.tail(5)

Unnamed: 0,name,repository,package_repo_name
1045,crossbeam-deque,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1046,crossbeam-epoch,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1047,crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1048,crossbeam-channel,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam
1049,crossbeam-skiplist,https://github.com/crossbeam-rs/crossbeam,crossbeam-rs/crossbeam


In [15]:
df_rust = df_sbom[df_sbom['package_artifact_source'] == 'RUST'].copy()

crates_mapper = (
    rust_py_pkg[['name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
    .set_index('name')['package_repo_name']
    .to_dict()
)
df_rust['package_repo_name'] = df_rust['package_artifact_name'].map(crates_mapper)
df_rust['likely_package_repo_owner'] = df_rust['package_repo_name'].apply(
    lambda x: x.split('/')[0] if isinstance(x, str) else None
)

df_rust.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
13504,paradigmxyz,paradigmxyz/reth,httparse,RUST,seanmonstar/httparse,seanmonstar
13505,paradigmxyz,paradigmxyz/reth,simple_asn1,RUST,acw/simple_asn1,acw
13506,paradigmxyz,paradigmxyz/reth,ws_stream_wasm,RUST,najamelan/ws_stream_wasm,najamelan
13507,paradigmxyz,paradigmxyz/reth,rlp,RUST,paritytech/parity-common,paritytech
13508,paradigmxyz,paradigmxyz/reth,c-kzg,RUST,,


## Lookup NPM

In [16]:
df_npm = df_sbom[df_sbom['package_artifact_source'] == 'NPM'].copy()
npm_pkgs_all = sorted(df_npm['package_artifact_name'].unique())
len(npm_pkgs_all)

4750

In [17]:
# create a registry from deps.dev

q = """
SELECT
  Name as package_name,
  (SELECT lower(URL) 
   FROM UNNEST(Links) 
   WHERE REGEXP_CONTAINS(URL, r'github\.com')
   LIMIT 1) AS package_github_url
FROM `bigquery-public-data.deps_dev_v1.PackageVersionsLatest`
WHERE
  System = 'NPM'
  AND SnapshotAt >= '2024-11-01'
  AND ARRAY_LENGTH(Links) > 0
"""

# save it to static_data_sources.npm_registry

In [18]:
npm_pkg_lookup = client.query(f"""
    SELECT DISTINCT
      package_name,
      package_github_url
    FROM `static_data_sources.npm_registry`
    WHERE package_name in ({stringify_array(npm_pkgs_all)})
""")
df_npm_pkg = npm_pkg_lookup.to_dataframe()

In [19]:
df_npm_pkg['package_repo_name'] = df_npm_pkg['package_github_url'].apply(parse_github)
npm_mapper = (
    df_npm_pkg[['package_name', 'package_repo_name']]
    .dropna()
    .drop_duplicates()
    .set_index('package_name')['package_repo_name']
    .to_dict()
)
df_npm['package_repo_name'] = df_npm['package_artifact_name'].map(npm_mapper)

df_npm['namespace'] = df_npm['package_artifact_name'].apply(extract_namespace)
namespace_to_owner = {}
for package_name, repo_name in npm_mapper.items():
    namespace = extract_namespace(package_name)
    if namespace and repo_name:
        github_owner = repo_name.split('/')[0]
        namespace_to_owner.update({namespace:github_owner})

df_npm['likely_package_repo_owner'] = df_npm['namespace'].map(namespace_to_owner)
df_npm.loc[df_npm['package_repo_name'].notna(), 'likely_package_repo_owner'] = df_npm['package_repo_name'].str.split('/').str[0]
df_npm.drop(columns='namespace', inplace=True)
df_npm.tail()

Unnamed: 0,repo_owner,repo_name,package_artifact_name,package_artifact_source,package_repo_name,likely_package_repo_owner
13792,web3,web3/web3.js,use-sidecar,NPM,thekashey/use-sidecar,thekashey
13793,web3,web3/web3.js,@nodelib/fs.walk,NPM,nodelib/nodelib,nodelib
13794,web3,web3/web3.js,available-typed-arrays,NPM,inspect-js/available-typed-arrays,inspect-js
13795,web3,web3/web3.js,which-pm-runs,NPM,zkochan/which-pm-runs,zkochan
13796,web3,web3/web3.js,not,NPM,raynos/not,raynos


# Consolidate back into a single graph

In [20]:
df = pd.concat([df_npm,df_go,df_rust,df_py], axis=0, ignore_index=True)
df.fillna('', inplace=True)
df.rename(columns={
    'repo_name': 'seed_repo_name',
    'repo_owner': 'seed_repo_owner',
    'package_artifact_name': 'package_name',
    'package_artifact_source': 'package_source',
    'likely_package_repo_owner': 'package_repo_owner'
}, inplace=True)
df = df[[
    'seed_repo_name', 'seed_repo_owner', 'package_name', 
    'package_repo_owner', 'package_repo_name', 'package_source'
]]
df

Unnamed: 0,seed_repo_name,seed_repo_owner,package_name,package_repo_owner,package_repo_name,package_source
0,eth-infinitism/account-abstraction,eth-infinitism,@eslint/eslintrc,eslint,eslint/eslintrc,NPM
1,eth-infinitism/account-abstraction,eth-infinitism,@types/http-cache-semantics,definitelytyped,definitelytyped/definitelytyped,NPM
2,eth-infinitism/account-abstraction,eth-infinitism,esrecurse,estools,estools/esrecurse,NPM
3,eth-infinitism/account-abstraction,eth-infinitism,typed-array-byte-length,inspect-js,inspect-js/typed-array-byte-length,NPM
4,eth-infinitism/account-abstraction,eth-infinitism,fetch-ponyfill,qubyte,qubyte/fetch-ponyfill,NPM
...,...,...,...,...,...,...
13801,ethereum/web3.py,ethereum,types-requests,python,python/typeshed,PIP
13802,status-im/nimbus-eth2,status-im,pep517,pypa,pypa/pep517,PIP
13803,status-im/nimbus-eth2,status-im,pip-tools,nvie,nvie/pip-tools,PIP
13804,status-im/nimbus-eth2,status-im,debugpy,microsoft,microsoft/debugpy,PIP


In [21]:
for pkg in PACKAGE_SERVERS:
    print(f"\n### Most Popular {pkg} Packages ###")
    pkg_lst = df[(df['package_source']==pkg) & (df['package_repo_owner'] != '')]['package_repo_name'].value_counts()
    nth = int(len(pkg_lst) * (0.025 if pkg == 'NPM' else 0.1))
    pkg_lst_top = list(pkg_lst.head(nth).index)
    print(pkg_lst_top)


### Most Popular NPM Packages ###
['babel/babel', 'definitelytyped/definitelytyped', 'ethers-io/ethers', 'lerna/lerna', 'lodash/lodash', 'evanw/esbuild', 'ethereumjs/ethereumjs-monorepo', 'micromark/micromark', 'facebook/jest', 'streetsidesoftware/cspell-dicts', 'chainsafe/web3', 'facebook/docusaurus', 'radix-ui/primitives', 'typescript-eslint/typescript-eslint', 'ben-eb/cssnano', 'jestjs/jest', '', 'xtuc/webassemblyjs', 'cssnano/cssnano', 'ethereum/web3', 'nrwl/nx', 'nomicfoundation/solidity-analyzer', 'rollup/rollup', 'napi-rs/node-rs', 'smooth-code/svgr', 'ardatan/graphql-tools', 'swc-project/swc', 'algolia/algoliasearch-client-javascript', 'tsconfig/bases', 'getsentry/sentry-javascript', 'blakeembrey/change-case', 'nodelib/nodelib', 'stablelib/stablelib', 'vitest-dev/vitest', 'achingbrain/it', 'tootallnate/proxy-agents', 'streetsidesoftware/cspell', 'webdriverio/webdriverio', 'walletconnect/walletconnect-utils', 'gregberge/svgr', 'ethereumjs/ethereumjs-vm', 'chainsafe/lodestar', '

In [22]:
owners = list(df['package_repo_owner'].unique())
len(owners)

1807

In [23]:
owner_lookup = client.query(f"""
    SELECT
      artifact_namespace as owner,
      COUNT(distinct artifact_id) AS oso_artifacts_owned
    FROM `oso.artifacts_v1`
    WHERE artifact_namespace in ({stringify_array(owners)})
    GROUP BY 1
""")

owners_on_oso = [row['owner'] for row in owner_lookup]
df['package_repo_owner_is_on_oso'] = df['package_repo_owner'].apply(lambda x: x in owners_on_oso)
df['package_repo_owner_is_on_oso'].mean()

0.6926698536868029

In [24]:
df.to_csv('data/unweighted_graph.csv')

# Create a network graph

In [25]:
import json
import networkx as nx

In [26]:
dff = df[df['package_repo_owner'] != ''].copy()
len(dff) / len(df)

0.9811676082862524

In [27]:
gh = 'https://github.com/'
dff['source'] = dff['seed_repo_name'].apply(lambda x: f'{gh}{x}')
dff['target'] = dff.apply(
    lambda x: f"{gh}{x['package_repo_name']}"
              if x['package_repo_name'] != ''
              else f"{gh}{x['package_repo_owner']}"
    , axis=1)
dff

Unnamed: 0,seed_repo_name,seed_repo_owner,package_name,package_repo_owner,package_repo_name,package_source,package_repo_owner_is_on_oso,source,target
0,eth-infinitism/account-abstraction,eth-infinitism,@eslint/eslintrc,eslint,eslint/eslintrc,NPM,True,https://github.com/eth-infinitism/account-abst...,https://github.com/eslint/eslintrc
1,eth-infinitism/account-abstraction,eth-infinitism,@types/http-cache-semantics,definitelytyped,definitelytyped/definitelytyped,NPM,True,https://github.com/eth-infinitism/account-abst...,https://github.com/definitelytyped/definitelyt...
2,eth-infinitism/account-abstraction,eth-infinitism,esrecurse,estools,estools/esrecurse,NPM,False,https://github.com/eth-infinitism/account-abst...,https://github.com/estools/esrecurse
3,eth-infinitism/account-abstraction,eth-infinitism,typed-array-byte-length,inspect-js,inspect-js/typed-array-byte-length,NPM,False,https://github.com/eth-infinitism/account-abst...,https://github.com/inspect-js/typed-array-byte...
4,eth-infinitism/account-abstraction,eth-infinitism,fetch-ponyfill,qubyte,qubyte/fetch-ponyfill,NPM,True,https://github.com/eth-infinitism/account-abst...,https://github.com/qubyte/fetch-ponyfill
...,...,...,...,...,...,...,...,...,...
13801,ethereum/web3.py,ethereum,types-requests,python,python/typeshed,PIP,True,https://github.com/ethereum/web3.py,https://github.com/python/typeshed
13802,status-im/nimbus-eth2,status-im,pep517,pypa,pypa/pep517,PIP,False,https://github.com/status-im/nimbus-eth2,https://github.com/pypa/pep517
13803,status-im/nimbus-eth2,status-im,pip-tools,nvie,nvie/pip-tools,PIP,True,https://github.com/status-im/nimbus-eth2,https://github.com/nvie/pip-tools
13804,status-im/nimbus-eth2,status-im,debugpy,microsoft,microsoft/debugpy,PIP,True,https://github.com/status-im/nimbus-eth2,https://github.com/microsoft/debugpy


In [29]:
G = nx.DiGraph()

for repo in dff['seed_repo_name'].unique():
    repo_url = f"{gh}{repo}"
    G.add_node(repo_url, level=1)
    
for repo in dff['package_repo_name'].unique():
    repo_url = f"{gh}{repo}"
    if repo_url not in G.nodes:
        G.add_node(repo_url, level=2)
        
for _, row in dff.iterrows():
    G.add_edge(row['source'], row['target'], relation=row['package_source'])
    
total_edges = G.number_of_edges()
global_weight = 0 #1.0 / total_edges
for u, v in G.edges:
    G[u][v]['weight'] = global_weight
    
total_weight = sum(data['weight'] for _, _, data in G.edges(data=True))
print(f"Total Weight of All Edges: {total_weight:.5f}")    
    
graph_json = nx.node_link_data(G)
output_path = "data/unweighted_graph.json"
with open(output_path, "w") as f:
    json.dump(graph_json, f, indent=2)

Total Weight of All Edges: 0.00000
