In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd

GCP_PROJECT = 'opensource-observer'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../oso_gcp_credentials.json'
client = bigquery.Client(GCP_PROJECT)

In [2]:
projects = [
    'velodrome',
    'sushi',
    'smoldapp',
    'synthetix',
    'rabbithole',
    'sound-xyz',
    'across',
    'basepaint',
    'voteagora',
    'superfluid',
    'ethereum-attestation-service',
    'giveth'
]
print(len(projects))

12


In [3]:
def stringify_array(lst):
    return "'" + "','".join(lst) + "'"
stringify_array(projects)

"'velodrome','sushi','smoldapp','synthetix','rabbithole','sound-xyz','across','basepaint','voteagora','superfluid','ethereum-attestation-service','giveth'"

In [4]:
repo_query = f"""
    select
        r.artifact_id,
        concat(r.artifact_namespace, '/', r.artifact_name) as repo,
        p.project_name,
        r.language
    from `oso_production.repositories_v0` r
    join `oso_production.projects_v1` p
        on r.project_id = p.project_id
    where
        p.project_name in ({stringify_array(projects)})
        and r.language in ('Solidity', 'TypeScript', 'Rust')
        and r.updated_at > '2024-07-01'
        and r.star_count > 5
"""
result = client.query(repo_query)
onchain_repos_df = result.to_dataframe()
onchain_repos_df.tail()

Unnamed: 0,artifact_id,repo,project_name,language
86,UIS92aDARMkp6lXJ9Xf-q2jMfa093pnTQUriwy4NLXk=,across-protocol/sdk,across,TypeScript
87,Wgkm5yqcGnasSH5HBfapUsewzjR5JB9Ev54anEUey2I=,across-protocol/relayer,across,TypeScript
88,Xe04hqbuTpF1mpB1Fy1bB99g1agp8DkBiN96c8a1uWA=,across-protocol/xdelegate,across,Solidity
89,19krm-31HYtuVsXzfub_pgwvA9IGEYVknrsWeg9Jd3c=,across-protocol/frontend,across,TypeScript
90,VYAAmLw9Qh5umJfpY9VIvfJ53T7DHdxxb1q4mBTD63U=,sushiswap/meiji,sushi,Solidity


In [5]:
onchain_repos_df.groupby('project_name')['repo'].unique()

project_name
across                          [across-protocol/scraper-api, across-protocol/...
basepaint                       [basepaint/basepaint-mini, basepaint/basepaint...
ethereum-attestation-service    [ethereum-attestation-service/eas-ponder-graph...
giveth                          [giveth/ui-design-system, giveth/impact-graph,...
rabbithole                      [rabbitholegg/questdk-plugins, rabbitholegg/qu...
smoldapp                        [smoldapp/smolv2, smoldapp/dumpservices, smold...
sound-xyz                       [soundxyz/redis-pubsub, soundxyz/sdk, soundxyz...
superfluid                      [superfluid-finance/custom-supertokens, superf...
sushi                           [sushiswap/contracts, sushiswap/v2-core, sushi...
synthetix                       [synthetixio/synthetix-exchange, synthetixio/j...
velodrome                       [velodrome-finance/v1, velodrome-finance/slips...
voteagora                       [voteagora/liquid-delegator, voteagora/agora-n...
Nam

In [6]:
deps_query = f"""
with package_list as (
  -- Create a list of all packages imported by the Onchain Builder and maintained by the same Devtool URL
  select
    projects.display_name as `Onchain Builder`,
    devtools.display_name as `Devtool Project`,
    concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) as `Devtool URL`,
    array_agg(distinct sboms.to_package_artifact_name) as `List of Packages Used by Project`
  from `oso_production.sboms_v0` sboms
  join `oso_production.projects_v1` projects
    on sboms.from_project_id = projects.project_id
  join `oso_production.package_owners_v0` package_owners
    on sboms.to_package_artifact_name = package_owners.package_artifact_name
    and sboms.to_package_artifact_source = package_owners.package_artifact_source
  join `oso_production.repositories_v0` repo
    on package_owners.package_owner_artifact_id = repo.artifact_id
  join `oso_production.projects_v1` devtools
    on package_owners.package_owner_project_id = devtools.project_id
  where sboms.from_artifact_id in ({stringify_array(onchain_repos_df['artifact_id'].unique())})
  group by
    projects.display_name, devtools.display_name, repo.artifact_namespace, repo.artifact_name
),

package_dependents as (
  -- Calculate dependents for all packages maintained by the same Devtool URL
  select
    concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) as `Devtool URL`,
    count(distinct sboms.from_project_id) as `Total Package Dependents in OSO`,
    count(distinct case when pbc.collection_name = 'op-retrofunding-4' then sboms.from_project_id end) as `Total Package Dependents in RF4`
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on sboms.to_package_artifact_name = package_owners.package_artifact_name
    and sboms.to_package_artifact_source = package_owners.package_artifact_source
  join `oso_production.repositories_v0` repo
    on package_owners.package_owner_artifact_id = repo.artifact_id
  left join `oso_production.projects_by_collection_v1` pbc
    on sboms.from_project_id = pbc.project_id
  group by repo.artifact_namespace, repo.artifact_name
),

last_commit as (
  -- Ensure active development based on recent commits
  select
    to_artifact_id,
    count(distinct date_trunc(`time`, MONTH)) as commit_months    
  from `oso_production.int_events__github`
  where
    event_type = 'COMMIT_CODE'
    and `time` > '2024-01-01'
  group by to_artifact_id
)

select distinct
  pl.`Onchain Builder`,
  pl.`Devtool Project`,
  pl.`Devtool URL`,
  pl.`List of Packages Used by Project`,
  pd.`Total Package Dependents in OSO`,
  pd.`Total Package Dependents in RF4`,
  repo.star_count as `Devtool Star Count`,
  repo.fork_count as `Devtool Fork Count`,
  format_date('%Y-%m-%d', repo.created_at) as `Devtool Created Date`,
  repo.language as `Devtool Language`,
  repo.license_name as `License`,
  package_owners.package_owner_project_id in (
    select project_id
    from `oso_production.projects_by_collection_v1`
    where collection_name = 'optimism'
  ) as `Past RF Recipient`
from package_list pl
join package_dependents pd
  on pl.`Devtool URL` = pd.`Devtool URL`
join `oso_production.repositories_v0` repo
  on concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) = pl.`Devtool URL`
join `oso_production.package_owners_v0` package_owners
  on repo.artifact_id = package_owners.package_owner_artifact_id
join last_commit
  on package_owners.package_owner_artifact_id = last_commit.to_artifact_id
where last_commit.commit_months >= 3
"""
result = client.query(deps_query)
dependencies_df = result.to_dataframe()
dependencies_df = dependencies_df[dependencies_df['Onchain Builder'] != dependencies_df['Devtool Project']]
dependencies_df.tail()

Unnamed: 0,Onchain Builder,Devtool Project,Devtool URL,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License,Past RF Recipient
681,Agora,UnJS,https://github.com/unjs/ufo,[ufo],999,88,1096,47,2020-12-04,TypeScript,MIT License,False
682,Boost Studios,UnJS,https://github.com/unjs/ufo,[ufo],999,88,1096,47,2020-12-04,TypeScript,MIT License,False
683,Across,Metamask,https://github.com/metamask/rpc-errors,[@metamask/rpc-errors],613,70,165,33,2019-08-06,TypeScript,MIT License,True
684,Sushi,Vue,https://github.com/vuejs/router,[vue-router],439,24,4049,1202,2019-02-07,TypeScript,MIT License,False
685,Agora,Ethereum Attestation Service,https://github.com/ethereum-attestation-servic...,[@ethereum-attestation-service/eas-contracts],60,7,262,87,2020-10-03,TypeScript,MIT License,True


In [7]:
dependencies_df[dependencies_df['Past RF Recipient'] == True].groupby('Onchain Builder')['Devtool Project'].nunique()

Onchain Builder
Across                          18
Agora                           11
Basepaint                        1
Boost Studios                   15
Ethereum Attestation Service    13
Giveth                          14
Smol Dapp                        7
Sound.xyz                        4
Superfluid                      19
Sushi                           16
Synthetix                       19
Velodrome                       12
Name: Devtool Project, dtype: int64

In [8]:
dependencies_df['Repo Relative Star Rank'] = dependencies_df.groupby(['Onchain Builder', 'Devtool Project'])['Devtool Star Count'].transform(
    lambda x: x.rank(ascending=False)
)
dependencies_df['OSO Package Rank'] = dependencies_df.groupby('Devtool Language')['Total Package Dependents in OSO'].transform(
    lambda x: x.rank(pct=True)
)
dependencies_df['RF4 Package Rank'] = dependencies_df.groupby('Devtool Language')['Total Package Dependents in RF4'].transform(
    lambda x: x.rank(pct=True)
)
dependencies_df['RF4 Delta'] = dependencies_df['RF4 Package Rank'] - dependencies_df['OSO Package Rank']
dependencies_df.to_csv('data/pairwise/unfiltered_dependency_data_for_pairwise.csv')
dependencies_df.tail()

Unnamed: 0,Onchain Builder,Devtool Project,Devtool URL,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License,Past RF Recipient,Repo Relative Star Rank,OSO Package Rank,RF4 Package Rank,RF4 Delta
681,Agora,UnJS,https://github.com/unjs/ufo,[ufo],999,88,1096,47,2020-12-04,TypeScript,MIT License,False,1.0,0.515777,0.455097,-0.06068
682,Boost Studios,UnJS,https://github.com/unjs/ufo,[ufo],999,88,1096,47,2020-12-04,TypeScript,MIT License,False,1.0,0.515777,0.455097,-0.06068
683,Across,Metamask,https://github.com/metamask/rpc-errors,[@metamask/rpc-errors],613,70,165,33,2019-08-06,TypeScript,MIT License,True,6.0,0.252427,0.315534,0.063107
684,Sushi,Vue,https://github.com/vuejs/router,[vue-router],439,24,4049,1202,2019-02-07,TypeScript,MIT License,False,1.0,0.152913,0.104369,-0.048544
685,Agora,Ethereum Attestation Service,https://github.com/ethereum-attestation-servic...,[@ethereum-attestation-service/eas-contracts],60,7,262,87,2020-10-03,TypeScript,MIT License,True,1.0,0.052184,0.052184,0.0


In [9]:
NUM_REPOS = 30
unique_devtool_records = []

for builder, builder_df in dependencies_df.groupby('Onchain Builder'):    
    # Step 1: Identify Past RF4 Recipients & keep top 2 repos per Devtool Project
    past_rf_recipients = (
        builder_df[builder_df['Past RF Recipient']]
        .sort_values(by='RF4 Delta', ascending=False)
        .groupby(['Devtool Project', 'Devtool URL'])  # Group at Devtool Project + Repo level
        .first()  # Keep one record per Devtool Project & URL
        .reset_index()
        .groupby('Devtool Project')  # Now, group at Devtool Project level
        .head(2)  # Take top 2 repos per Devtool Project
    )
    
    # Step 2: Identify remaining projects (non-RF4 recipients) & take only distinct Devtool Projects
    remaining_projects = (
        builder_df[~builder_df['Past RF Recipient']]
        .sort_values(by='RF4 Delta', ascending=False)
        .drop_duplicates(subset=['Devtool Project'])  # Keep only one per Devtool Project
    )
    
    # Step 3: Combine both groups & ensure we only keep NUM_REPOS in total
    top_dependencies = pd.concat([past_rf_recipients, remaining_projects]).head(NUM_REPOS)
    
    unique_devtool_records.append(top_dependencies)

df = (
    pd.concat(unique_devtool_records)
    .reset_index(drop=True)
    .drop(columns=['RF4 Delta', 'OSO Package Rank', 'RF4 Package Rank', 'Repo Relative Star Rank', 'Past RF Recipient'])
    .sort_values(by=['Onchain Builder', 'Devtool Project', 'Devtool URL'])
)

df['List of Packages Used by Project'] = df['List of Packages Used by Project'].apply(lambda x: ", ".join(sorted(x)))

df

Unnamed: 0,Devtool Project,Devtool URL,Onchain Builder,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License
0,Blocknative,https://github.com/blocknative/web3-onboard,Across,"@web3-onboard/coinbase, @web3-onboard/common, ...",98,16,876,527,2019-08-16,TypeScript,MIT License
26,Coinbase,https://github.com/coinbase/coinbase-wallet-sdk,Across,@coinbase/wallet-sdk,964,93,1540,613,2019-05-01,TypeScript,MIT License
1,Ethereum Miscellenia,https://github.com/ethereum/solc-js,Across,solc,1799,149,1470,481,2016-04-04,TypeScript,MIT License
2,EthereumJS,https://github.com/ethereumjs/ethereumjs-monorepo,Across,"@ethereumjs/block, @ethereumjs/blockchain, @et...",2298,170,2631,783,2015-07-10,TypeScript,
3,Hardhat,https://github.com/nomicfoundation/hardhat,Across,@nomicfoundation/hardhat-verify,1119,121,7470,1474,2018-04-14,TypeScript,Other
...,...,...,...,...,...,...,...,...,...,...,...
300,Solhint,https://github.com/protofire/solhint,Velodrome,solhint,1014,111,1055,172,2017-10-16,JavaScript,MIT License
301,Tenderly,https://github.com/tenderly/hardhat-tenderly,Velodrome,"@tenderly/hardhat-tenderly, tenderly",202,28,166,45,2020-09-09,TypeScript,Other
302,ethers.js,https://github.com/ethers-io/ethers.js,Velodrome,"@ethersproject/abi, @ethersproject/abstract-pr...",2552,180,8086,1925,2016-07-16,TypeScript,MIT License
303,gas-reporter,https://github.com/cgewecke/hardhat-gas-reporter,Velodrome,hardhat-gas-reporter,1347,139,416,64,2019-06-20,TypeScript,MIT License


In [10]:
df.groupby('Onchain Builder')['Devtool Project'].unique()

Onchain Builder
Across                          [Blocknative, Coinbase, Ethereum Miscellenia, ...
Agora                           [Alchemy, Babel, Coinbase, DefinitelyTyped, ES...
Basepaint                       [Arda TANRIKULU, Babel, DefinitelyTyped, ESLin...
Boost Studios                   [DefinitelyTyped, ESLint, Ethereum Miscellenia...
Ethereum Attestation Service    [Babel, Coinbase, Daniel Bugl, DefinitelyTyped...
Giveth                          [Coinbase, ESLint, Ethereum Attestation Servic...
Smol Dapp                       [Babel, Coinbase, DefinitelyTyped, ESLint, Eth...
Sound.xyz                       [Arda TANRIKULU, Babel, DefinitelyTyped, ESLin...
Superfluid                      [Coinbase, Ethereum Miscellenia, EthereumJS, F...
Sushi                           [Coinbase, DefinitelyTyped, ESLint, Ethereum M...
Synthetix                       [Blocknative, Cannon, Coinbase, Ethereum Misce...
Velodrome                       [Babel, Daniel Bugl, DefinitelyTyped, ESLint, ...


In [11]:
df.to_csv('data/pairwise/sample_pairwise_dump.csv')

In [12]:
firstLevelCategoryList = list(df['Onchain Builder'].unique())
with open("data/pairwise/get1stLevelCategoryList.json", "w") as f:
    json.dump(firstLevelCategoryList, f, indent=2)

projectsForCategory = df.groupby('Onchain Builder')['Devtool URL'].apply(list).to_dict()
with open("data/pairwise/getProjectsForCategory.json", "w") as f:
    json.dump(projectsForCategory, f, indent=2)

projectMetadata = (
    df
    .set_index('Devtool URL')    
    .rename(columns={
        'Devtool Project': 'Dependency Maintainer',
        'Devtool Star Count': 'Stars',
        'Devtool Fork Count': 'Forks',
        'Devtool Created Date': 'Created Date',
        'Devtool Language': 'Language',
    })
    [['Dependency Maintainer', 'Stars', 'Forks', 'Created Date', 'Language']]
    .drop_duplicates()
)
projectMetadata.to_json(
    "data/pairwise/getProjectMetadata.json", 
    orient='index',
    indent=2
)