In [1]:
import json
import networkx as nx
import os
import pandas as pd

# Settings

In [2]:
REFRESH_DATA = True
LOCAL_CSV_PATH = 'graphs/scroll_unweighted_graph.csv'
LOCAL_JSON_PATH = 'graphs/scroll_unweighted_graph.json'

In [3]:
SEED_REPOS = ['scroll-tech']
SEED_REPOS = [x.replace('https://github.com/','').lower() for x in SEED_REPOS]
PACKAGE_SERVERS = ['NPM', 'RUST', 'GO', 'PIP']

# Get SBOMs for repos we care about

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Enter the following query into your [console](https://console.cloud.google.com/bigquery) to get a fresh copy of the graph
- Save it as a CSV file to `graph/unweighted_graph.csv`

In [4]:
def stringify_array(arr):
    return "'" + "','".join(arr) + "'"

query = f"""
  -- COPY THIS INTO YOUR BIGQUERY CONSOLE
  
  select distinct
    sboms.from_artifact_namespace as seed_repo_owner,
    sboms.from_artifact_name as seed_repo_name,
    sboms.to_package_artifact_name as package_name,
    package_owners.package_owner_artifact_namespace as package_repo_owner,
    package_owners.package_owner_artifact_name as package_repo_name,
    sboms.to_package_artifact_source as package_source
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on
      sboms.to_package_artifact_name = package_owners.package_artifact_name
      and sboms.to_package_artifact_source = package_owners.package_artifact_source
  where
    sboms.to_package_artifact_source in ({stringify_array(PACKAGE_SERVERS)})
    and package_owners.package_owner_artifact_namespace is not null
    and sboms.from_artifact_namespace in ({stringify_array(SEED_REPOS)})
"""

if REFRESH_DATA:

    from google.cloud import bigquery

    # replace with your path to credentials
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../oso_gcp_credentials.json'

    # replace with your project name
    client = bigquery.Client(project='opensource-observer')
    
    # execute the query and save it
    results = client.query(query)
    df = results.to_dataframe()
    df.to_csv(LOCAL_CSV_PATH)
    print("Query saved to local CSV file.")

else:    
    print(query)

Query saved to local CSV file.


# Load the graph as a CSV

In [5]:
df = pd.read_csv(LOCAL_CSV_PATH)

gh = 'https://github.com/'
df['seed_repo_url'] = df.apply(lambda x: f"{gh}{x['seed_repo_owner']}/{x['seed_repo_name']}", axis=1)
df['package_repo_url'] = df.apply(lambda x: f"{gh}{x['package_repo_owner']}/{x['package_repo_name']}", axis=1)

df.tail()

Unnamed: 0.1,Unnamed: 0,seed_repo_owner,seed_repo_name,package_name,package_repo_owner,package_repo_name,package_source,seed_repo_url,package_repo_url
16525,16525,scroll-tech,scroll-contract-deploy-demo,death,jprichardson,node-death,NPM,https://github.com/scroll-tech/scroll-contract...,https://github.com/jprichardson/node-death
16526,16526,scroll-tech,scroll-contract-deploy-demo,create-hmac,crypto-browserify,createhmac,NPM,https://github.com/scroll-tech/scroll-contract...,https://github.com/crypto-browserify/createhmac
16527,16527,scroll-tech,scroll-contract-deploy-demo,web3-eth-accounts,chainsafe,web3.js,NPM,https://github.com/scroll-tech/scroll-contract...,https://github.com/chainsafe/web3.js
16528,16528,scroll-tech,scroll-contract-deploy-demo,@ethersproject/basex,ethers-io,ethers.js,NPM,https://github.com/scroll-tech/scroll-contract...,https://github.com/ethers-io/ethers.js
16529,16529,scroll-tech,scroll-contract-deploy-demo,neo-async,suguru03,neo-async,NPM,https://github.com/scroll-tech/scroll-contract...,https://github.com/suguru03/neo-async


In [6]:
df.groupby('package_source')['package_name'].nunique()

package_source
GO       292
NPM     3452
RUST     976
Name: package_name, dtype: int64

In [7]:
for pkg in PACKAGE_SERVERS:
    print(f"\n### Most Popular {pkg} Packages ###")
    pkg_lst = df[(df['package_source']==pkg) & (df['package_repo_owner'] != '')]['package_repo_name'].value_counts()
    nth = int(len(pkg_lst) * (0.025 if pkg == 'NPM' else 0.1))
    pkg_lst_top = list(pkg_lst.head(nth).index)
    print(pkg_lst_top)


### Most Popular NPM Packages ###
['babel', 'definitelytyped', 'ethers.js', 'web3.js', 'ethereumjs-monorepo', 'jest', 'sentry-javascript', 'cssnano', 'solidity-analyzer', 'lodash', 'smithy-typescript', 'esbuild', 'micromark', 'postcss-plugins', 'storybook', 'hardhat', 'webassemblyjs', 'typescript-eslint', 'aws-sdk-js-v3', 'bases', 'change-case', 'svgr', 'edr', 'nodelib', 'waffle', 'proxy-agents', 'typechain', 'node-source-map-support', 'rollup', 'acorn', 'react', 'stablelib', 'workbox', 'cliui', 'resolver-engine', 'core-js', 'form-data', 'walletconnect-utils', 'inquirer.js', 'ieee754', 'cli', 'undici', 'emotion', 'watcher', 'source-map', 'color-convert', 'regenerator', 'truffle', 'brace-expansion', 'color-name', 'wrap-ansi', 'minimatch', 'algoliasearch-client-javascript', 'node-lru-cache', 'ansi-regex', 'balanced-match']

### Most Popular RUST Packages ###
['windows-rs', 'wasm-bindgen', 'futures-rs', 'utils', 'formats', 'traits', 'tracing', 'crossbeam', 'parity-common', 'rand', 'hashe

In [8]:
owners = list(df['package_repo_owner'].unique())
len(owners)

1292

# Create a network graph

In [9]:
G = nx.DiGraph()

for seed_repo in SEED_REPOS:
    G.add_node(f"{gh}{seed_repo}", level=1)
    
for repo_url in df['package_repo_url'].unique():
    if repo_url not in G.nodes:
        G.add_node(repo_url, level=2)
        
for _, row in df.iterrows():
    G.add_edge(
        row['seed_repo_url'],
        row['package_repo_url'],
        relation=row['package_source']
    )
    
total_edges = G.number_of_edges()
print(total_edges)

global_weight = 0
for u, v in G.edges:
    G[u][v]['weight'] = global_weight
    
graph_json = nx.node_link_data(G)
with open(LOCAL_JSON_PATH, "w") as f:
    json.dump(graph_json, f, indent=2)

12454
