In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd

# GCP project the BigQuery client is bound to.
GCP_PROJECT = 'opensource-observer'

# Point Google's auth machinery at the credentials JSON (path is relative to
# this notebook's working directory) before the client is constructed.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../oso_gcp_credentials.json'
client = bigquery.Client(project=GCP_PROJECT)

In [2]:
# Project names (matched against `project_name` in the repo query) that make
# up the onchain-builder sample.
projects = [
    'velodrome', 'sushi', 'smoldapp', 'synthetix', 'rabbithole',
    'sound-xyz', 'across', 'multicall-mds1', 'basepaint', 'voteagora',
    'superfluid', 'ethereum-attestation-service', 'giveth',
]
print(len(projects))

13


In [3]:
def stringify_array(lst):
    """Render values as a comma-separated list of single-quoted SQL string literals.

    The result is meant to be interpolated inside a SQL ``in (...)`` clause.

    Args:
        lst: Iterable of values (list, tuple, numpy array, ...). Each item is
            converted with ``str()`` before quoting.

    Returns:
        A string such as ``"'a','b'"``. Backslashes and single quotes inside a
        value are backslash-escaped so the value cannot terminate its literal
        early. An empty input yields ``"''"`` (a single empty literal), which
        keeps ``in (...)`` syntactically valid.
    """
    def _quote(value):
        # Escape backslashes first so the backslash added for quotes is not doubled.
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return "'" + escaped + "'"

    quoted = [_quote(item) for item in lst]
    return ",".join(quoted) if quoted else "''"
# Preview the quoted, comma-separated literal list for `projects`.
stringify_array(projects)

"'velodrome','sushi','smoldapp','synthetix','rabbithole','sound-xyz','across','multicall-mds1','basepaint','voteagora','superfluid','ethereum-attestation-service','giveth'"

In [4]:
# Quoted project names for the `in (...)` filter of the query below.
project_name_literals = stringify_array(projects)

# Repositories owned by the selected projects, restricted to Solidity /
# TypeScript / Rust repos updated after 2024-07-01 with more than 5 stars.
repo_query = f"""
    select
        r.artifact_id,
        concat(r.artifact_namespace, '/', r.artifact_name) as repo,
        p.project_name,
        r.language
    from `oso_production.repositories_v0` r
    join `oso_production.projects_v1` p
        on r.project_id = p.project_id
    where
        p.project_name in ({project_name_literals})
        and r.language in ('Solidity', 'TypeScript', 'Rust')
        and r.updated_at > '2024-07-01'
        and r.star_count > 5
"""

# Run the query and materialize the result as a DataFrame.
result = client.query(repo_query)
onchain_repos_df = result.to_dataframe()
onchain_repos_df.tail()

Unnamed: 0,artifact_id,repo,project_name,language
86,Xe04hqbuTpF1mpB1Fy1bB99g1agp8DkBiN96c8a1uWA=,across-protocol/xdelegate,across,Solidity
87,19krm-31HYtuVsXzfub_pgwvA9IGEYVknrsWeg9Jd3c=,across-protocol/frontend,across,TypeScript
88,Wgkm5yqcGnasSH5HBfapUsewzjR5JB9Ev54anEUey2I=,across-protocol/relayer,across,TypeScript
89,UIS92aDARMkp6lXJ9Xf-q2jMfa093pnTQUriwy4NLXk=,across-protocol/sdk,across,TypeScript
90,VYAAmLw9Qh5umJfpY9VIvfJ53T7DHdxxb1q4mBTD63U=,sushiswap/meiji,sushi,Solidity


In [5]:
# Distinct repos returned for each project.
(
    onchain_repos_df
    .groupby('project_name')['repo']
    .unique()
)

project_name
across                          [across-protocol/scraper-api, across-protocol/...
basepaint                       [basepaint/basepaint-mini, basepaint/basepaint...
ethereum-attestation-service    [ethereum-attestation-service/eas-indexing-ser...
giveth                          [giveth/impact-graph, giveth/ui-design-system,...
multicall-mds1                                                   [mds1/multicall]
rabbithole                      [rabbitholegg/questdk-plugins, rabbitholegg/qu...
smoldapp                        [smoldapp/corecontracts, smoldapp/smoldapp, sm...
sound-xyz                       [soundxyz/sound-protocol, soundxyz/sdk, soundx...
superfluid                      [superfluid-finance/custom-supertokens, superf...
sushi                           [sushiswap/contracts, sushiswap/strategies, su...
synthetix                       [synthetixio/number-guessing-game, synthetixio...
velodrome                       [velodrome-finance/v1, velodrome-finance/contr...
vot

In [6]:
# Quoted artifact ids of the onchain builders' repos, for the `in (...)`
# filter inside the `package_list` CTE.
onchain_repo_id_literals = stringify_array(onchain_repos_df['artifact_id'].unique())

deps_query = f"""
with package_list as (
  -- Create a list of all packages imported by the Onchain Builder and maintained by the same Devtool URL
  select
    projects.display_name as `Onchain Builder`,
    devtools.display_name as `Devtool Project`,
    concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) as `Devtool URL`,
    array_agg(distinct sboms.to_package_artifact_name) as `List of Packages Used by Project`
  from `oso_production.sboms_v0` sboms
  join `oso_production.projects_v1` projects
    on sboms.from_project_id = projects.project_id
  join `oso_production.package_owners_v0` package_owners
    on sboms.to_package_artifact_name = package_owners.package_artifact_name
    and sboms.to_package_artifact_source = package_owners.package_artifact_source
  join `oso_production.repositories_v0` repo
    on package_owners.package_owner_artifact_id = repo.artifact_id
  join `oso_production.projects_v1` devtools
    on package_owners.package_owner_project_id = devtools.project_id
  where sboms.from_artifact_id in ({onchain_repo_id_literals})
  group by
    projects.display_name, devtools.display_name, repo.artifact_namespace, repo.artifact_name
),

package_dependents as (
  -- Calculate dependents for all packages maintained by the same Devtool URL
  select
    concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) as `Devtool URL`,
    count(distinct sboms.from_project_id) as `Total Package Dependents in OSO`,
    count(distinct case when pbc.collection_name = 'op-retrofunding-4' then sboms.from_project_id end) as `Total Package Dependents in RF4`
  from `oso_production.sboms_v0` sboms
  join `oso_production.package_owners_v0` package_owners
    on sboms.to_package_artifact_name = package_owners.package_artifact_name
    and sboms.to_package_artifact_source = package_owners.package_artifact_source
  join `oso_production.repositories_v0` repo
    on package_owners.package_owner_artifact_id = repo.artifact_id
  left join `oso_production.projects_by_collection_v1` pbc
    on sboms.from_project_id = pbc.project_id
  group by repo.artifact_namespace, repo.artifact_name
),

last_commit as (
  -- Ensure active development based on recent commits
  select
    to_artifact_id,
    count(distinct date_trunc(`time`, MONTH)) as commit_months    
  from `oso_production.int_events__github`
  where
    event_type = 'COMMIT_CODE'
    and `time` > '2024-01-01'
  group by to_artifact_id
)

select distinct
  pl.`Onchain Builder`,
  pl.`Devtool Project`,
  pl.`Devtool URL`,
  pl.`List of Packages Used by Project`,
  pd.`Total Package Dependents in OSO`,
  pd.`Total Package Dependents in RF4`,
  repo.star_count as `Devtool Star Count`,
  repo.fork_count as `Devtool Fork Count`,
  format_date('%Y-%m-%d', repo.created_at) as `Devtool Created Date`,
  repo.language as `Devtool Language`,
  repo.license_name as `License`,
  package_owners.package_owner_project_id in (
    select project_id
    from `oso_production.projects_by_collection_v1`
    where collection_name = 'optimism'
  ) as `Past RF Recipient`
from package_list pl
join package_dependents pd
  on pl.`Devtool URL` = pd.`Devtool URL`
join `oso_production.repositories_v0` repo
  on concat('https://github.com/', repo.artifact_namespace, '/', repo.artifact_name) = pl.`Devtool URL`
join `oso_production.package_owners_v0` package_owners
  on repo.artifact_id = package_owners.package_owner_artifact_id
join last_commit
  on package_owners.package_owner_artifact_id = last_commit.to_artifact_id
where last_commit.commit_months >= 3
"""
result = client.query(deps_query)
dependencies_df = result.to_dataframe()

# Drop rows where a builder depends on a devtool maintained by itself.
is_external_dependency = dependencies_df['Onchain Builder'].ne(dependencies_df['Devtool Project'])
dependencies_df = dependencies_df[is_external_dependency]
dependencies_df.tail()

Unnamed: 0,Onchain Builder,Devtool Project,Devtool URL,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License,Past RF Recipient
528,Across,UMA,https://github.com/umaprotocol/protocol,"[@uma/core, @uma/contracts-node, @uma/contract...",12,3,388,184,2018-07-24,JavaScript,GNU Affero General Public License v3.0,False
529,Across,DePay,https://github.com/depayfi/web3-blockchains,[@depay/web3-blockchains],16,5,6,5,2021-06-21,JavaScript,MIT License,False
530,Superfluid,DePay,https://github.com/depayfi/web3-blockchains,[@depay/web3-blockchains],16,5,6,5,2021-06-21,JavaScript,MIT License,False
531,Synthetix,Socket,https://github.com/socketdottech/plugin,[@socket.tech/plugin],10,5,40,16,2022-06-24,TypeScript,MIT License,True
533,Boost Studios,Hop Protocol,https://github.com/hop-protocol/hop,[@hop-protocol/sdk],11,2,2847,195,2020-10-19,TypeScript,MIT License,True


In [7]:
# Number of distinct devtool projects per builder that are flagged as past RF recipients.
(
    dependencies_df
    .loc[dependencies_df['Past RF Recipient'].eq(True)]
    .groupby('Onchain Builder')['Devtool Project']
    .nunique()
)

Onchain Builder
Across                          18
Agora                           11
Basepaint                        1
Boost Studios                   15
Ethereum Attestation Service    13
Giveth                          14
Smol Dapp                        7
Sound.xyz                        4
Superfluid                      19
Sushi                           16
Synthetix                       19
Velodrome                       12
Name: Devtool Project, dtype: int64

In [8]:
def _rank_descending(values):
    """Rank a group's values so that the largest value gets the best (lowest) rank."""
    return values.rank(ascending=False)


def _rank_percentile(values):
    """Rank a group's values as a fraction of the group (pct=True)."""
    return values.rank(pct=True)


# Star rank of each repo among the rows sharing its (builder, devtool project) pair.
dependencies_df['Repo Relative Star Rank'] = (
    dependencies_df
    .groupby(['Onchain Builder', 'Devtool Project'])['Devtool Star Count']
    .transform(_rank_descending)
)

# Percentile rank of the dependent counts, computed within each devtool language.
for rank_column, dependents_column in [
    ('OSO Package Rank', 'Total Package Dependents in OSO'),
    ('RF4 Package Rank', 'Total Package Dependents in RF4'),
]:
    dependencies_df[rank_column] = (
        dependencies_df
        .groupby('Devtool Language')[dependents_column]
        .transform(_rank_percentile)
    )

# Positive delta: the repo ranks higher among RF4 dependents than among all OSO dependents.
dependencies_df['RF4 Delta'] = dependencies_df['RF4 Package Rank'].sub(dependencies_df['OSO Package Rank'])
dependencies_df.tail()

Unnamed: 0,Onchain Builder,Devtool Project,Devtool URL,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License,Past RF Recipient,Repo Relative Star Rank,OSO Package Rank,RF4 Package Rank,RF4 Delta
528,Across,UMA,https://github.com/umaprotocol/protocol,"[@uma/core, @uma/contracts-node, @uma/contract...",12,3,388,184,2018-07-24,JavaScript,GNU Affero General Public License v3.0,False,1.0,0.0125,0.021875,0.009375
529,Across,DePay,https://github.com/depayfi/web3-blockchains,[@depay/web3-blockchains],16,5,6,5,2021-06-21,JavaScript,MIT License,False,2.0,0.028125,0.046875,0.01875
530,Superfluid,DePay,https://github.com/depayfi/web3-blockchains,[@depay/web3-blockchains],16,5,6,5,2021-06-21,JavaScript,MIT License,False,2.0,0.028125,0.046875,0.01875
531,Synthetix,Socket,https://github.com/socketdottech/plugin,[@socket.tech/plugin],10,5,40,16,2022-06-24,TypeScript,MIT License,True,1.0,0.004747,0.033228,0.028481
533,Boost Studios,Hop Protocol,https://github.com/hop-protocol/hop,[@hop-protocol/sdk],11,2,2847,195,2020-10-19,TypeScript,MIT License,True,1.0,0.009494,0.011076,0.001582


In [9]:
# Maximum number of devtool rows kept per onchain builder.
NUM_REPOS = 30
unique_devtool_records = []

for builder, builder_df in dependencies_df.groupby('Onchain Builder'):
    is_past_recipient = builder_df['Past RF Recipient']

    # Past RF recipients, highest RF4 Delta first (all of their repos are kept).
    past_rf_recipients = builder_df[is_past_recipient].sort_values(by='RF4 Delta', ascending=False)

    # Everything else, highest RF4 Delta first, one row per devtool project.
    remaining_projects = (
        builder_df[~is_past_recipient]
        .sort_values(by='RF4 Delta', ascending=False)
        .drop_duplicates(subset=['Devtool Project'])
    )

    if len(past_rf_recipients) <= NUM_REPOS:
        # Not enough past recipients to fill the quota: top up with the rest.
        candidates = pd.concat([past_rf_recipients, remaining_projects])
    else:
        # More past recipients than slots: keep the ones with the most RF4 dependents.
        candidates = past_rf_recipients.sort_values(by='Total Package Dependents in RF4', ascending=False)

    top_dependencies = candidates.head(NUM_REPOS)
    unique_devtool_records.append(top_dependencies)

# Stack the per-builder selections, then strip the helper columns that were
# only needed to make the selection.
df = pd.concat(unique_devtool_records).reset_index(drop=True)
df = df.drop(columns=[
    'RF4 Delta',
    'OSO Package Rank',
    'RF4 Package Rank',
    'Repo Relative Star Rank',
    'Past RF Recipient',
])
df = df.sort_values(by=['Onchain Builder', 'Devtool Project', 'Devtool URL'])

# Flatten each package array into a sorted, comma-separated string.
df['List of Packages Used by Project'] = df['List of Packages Used by Project'].apply(
    lambda packages: ", ".join(sorted(packages))
)
df

Unnamed: 0,Onchain Builder,Devtool Project,Devtool URL,List of Packages Used by Project,Total Package Dependents in OSO,Total Package Dependents in RF4,Devtool Star Count,Devtool Fork Count,Devtool Created Date,Devtool Language,License
3,Across,Ethereum Miscellenia,https://github.com/ethereum/solc-js,solc,1791,149,1469,479,2016-04-04,TypeScript,MIT License
1,Across,EthereumJS,https://github.com/ethereumjs/ethereumjs-monorepo,"@ethereumjs/block, @ethereumjs/blockchain, @et...",2290,170,2622,776,2015-07-10,TypeScript,
8,Across,Hardhat,https://github.com/nomicfoundation/hardhat,@nomicfoundation/hardhat-verify,1111,121,7445,1470,2018-04-14,TypeScript,Other
24,Across,Jolly Roger and Hardhat-deploy,https://github.com/wighawag/hardhat-deploy,hardhat-deploy,733,80,1206,298,2020-03-19,TypeScript,MIT License
26,Across,Metamask,https://github.com/metamask/core,"@metamask/json-rpc-engine, @metamask/json-rpc-...",540,68,295,201,2018-05-29,TypeScript,MIT License
...,...,...,...,...,...,...,...,...,...,...,...
284,Velodrome,Solhint,https://github.com/protofire/solhint,solhint,1011,111,1054,169,2017-10-16,JavaScript,MIT License
294,Velodrome,Tenderly,https://github.com/tenderly/hardhat-tenderly,"@tenderly/hardhat-tenderly, tenderly",201,28,166,41,2020-09-09,TypeScript,Other
289,Velodrome,ethers.js,https://github.com/ethers-io/ethers.js,"@ethersproject/abi, @ethersproject/abstract-pr...",2543,180,8071,1916,2016-07-16,TypeScript,MIT License
286,Velodrome,gas-reporter,https://github.com/cgewecke/hardhat-gas-reporter,hardhat-gas-reporter,1341,139,413,60,2019-06-20,TypeScript,MIT License


In [10]:
# Distinct devtool projects selected for each onchain builder.
(
    df
    .groupby('Onchain Builder')['Devtool Project']
    .unique()
)

Onchain Builder
Across                          [Ethereum Miscellenia, EthereumJS, Hardhat, Jo...
Agora                           [Alchemy, ESLint, Ethereum Attestation Service...
Basepaint                       [Babel, ESLint, Facebook Open Source, Ponder, ...
Boost Studios                   [Babel, ESLint, Ethereum Miscellenia, Ethereum...
Ethereum Attestation Service    [Babel, Coinbase, ESLint, Ethereum Miscellenia...
Giveth                          [Ethereum Miscellenia, EthereumJS, IPFS, Metam...
Smol Dapp                       [Babel, Coinbase, ESLint, EthereumJS, Facebook...
Sound.xyz                       [Babel, ESLint, Facebook Open Source, Prettier...
Superfluid                      [Ethereum Miscellenia, EthereumJS, Hardhat, Jo...
Sushi                           [Coinbase, ESLint, Ethereum Miscellenia, Ether...
Synthetix                       [Ethereum Miscellenia, EthereumJS, IPFS, Metam...
Velodrome                       [Babel, ESLint, Ethereum Miscellenia, Ethereum...


In [11]:
# Create the export directory if it is missing so this cell also works on a
# fresh checkout (no-op when the directory already exists).
os.makedirs('data/pairwise', exist_ok=True)

# Full sample table; the DataFrame index is written as the first column.
df.to_csv('data/pairwise/sample_pairwise_dump.csv')

In [12]:
# Distinct builder names, in the order they first appear in df.
firstLevelCategoryList = df['Onchain Builder'].unique().tolist()
with open("data/pairwise/get1stLevelCategoryList.json", "w") as f:
    f.write(json.dumps(firstLevelCategoryList, indent=2))

# Map each builder to the list of devtool URLs selected for it.
projectsForCategory = {
    builder: urls.tolist()
    for builder, urls in df.groupby('Onchain Builder')['Devtool URL']
}
with open("data/pairwise/getProjectsForCategory.json", "w") as f:
    f.write(json.dumps(projectsForCategory, indent=2))

# One metadata record per devtool repo, keyed by its URL.
#
# De-duplicate on 'Devtool URL' *before* it becomes the index: the same repo
# appears once per builder that uses it, and `to_json(orient='index')`
# requires a unique index. De-duplicating on the metadata columns after
# `set_index` would ignore the URL, so it could merge two different repos with
# identical metadata and still leave a URL duplicated if its rows differed.
projectMetadata = (
    df
    .drop_duplicates(subset='Devtool URL')
    .set_index('Devtool URL')
    .rename(columns={
        'Devtool Project': 'Dependency Maintainer',
        'Devtool Star Count': 'Stars',
        'Devtool Fork Count': 'Forks',
        'Devtool Created Date': 'Created Date',
        'Devtool Language': 'Language',
    })
    [['Dependency Maintainer', 'Stars', 'Forks', 'Created Date', 'Language']]
)
projectMetadata.to_json(
    "data/pairwise/getProjectMetadata.json",
    orient='index',
    indent=2
)