In [14]:
from google.cloud import bigquery
import os
import pandas as pd

In [15]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../oso_gcp_credentials.json'
GCP_PROJECT = 'opensource-observer'
client = bigquery.Client(GCP_PROJECT)

In [19]:
# get all Ethereum Core repos

results = client.query("""
    select distinct
        artifact_id,
        artifact_namespace as git_org,
        artifact_name as git_repo
    from `oso.artifacts_by_collection_v1`
    where
        collection_name in ('ethereum-github', 'protocol-guild')
        and artifact_source = 'GITHUB'
""")

df_core_repos = results.to_dataframe()
df_core_repos.set_index('artifact_id').to_csv('data/core_repos.csv')
df_core_repos.tail()

Unnamed: 0,artifact_id,git_org,git_repo
862,5kyoSLkWzAZcLG7VgR6ArokScs1kQ7cRJ_Va9ypwl8Y=,nethermindeth,mev-aa-geth
863,sENcqlpQbrAmGWnfe9UllPB0f2uMJeI_KMg9dpoWulY=,nethermindeth,voyager-translations
864,XR8pSCFvzbWGf2OApLvCeLCdZ-jDhSqNyd-WlbSIfOI=,prysmaticlabs,bls-signatures
865,J-Jg6YqpYQJ5Jx0azUJgaWK8Ta82XTWXIqxOaEteTYY=,ethereum,pytest-asyncio-network-simulator
866,E0-UMz2CjlcOENeTK4SJy-BnTpq6YG52nVpFmZ4g3Gc=,ledgerwatch,go-ethereum


In [20]:
# get all repos that are dependencies of Ethereum Core repos
# note: this will only get repos that have been indexed by OSO

results = client.query("""
    with core_repos as (
      select artifact_id
      from `oso.artifacts_by_collection_v1`
      where collection_name in ('ethereum-github', 'protocol-guild')
    ),
    dependencies as (
      select distinct abp.project_id
      from `oso.int_sbom_artifacts` as a
      join core_repos as cr
        on a.artifact_id = cr.artifact_id
      join `oso.artifacts_by_project_v1` as abp
        on a.package_artifact_id = abp.artifact_id
      where a.package_artifact_id is not null
    )
    
    select
      abp.artifact_id,
      abp.artifact_name as git_org,
      abp.artifact_namespace as git_repo,
      abp.project_name
    from `oso.artifacts_by_project_v1` as abp
    join dependencies as d
      on abp.project_id = d.project_id
    where abp.artifact_source = 'GITHUB'
""")

df_dep_repos = results.to_dataframe()
df_dep_repos.set_index('artifact_id').to_csv('data/dep_repos.csv')
df_dep_repos.tail()

Unnamed: 0,artifact_id,git_org,git_repo,project_name
1138,VT7BXkgbkqS8XMPH59lwtdg0CsvZGre9Ea2sPxjqyzU=,tiphub,thirdweb-dev,third-web
1139,YMSHOLAU6JRwyl_RAtuzct6dr4V-cg6IoussgUpWKp4=,v1-docs,uniswap,uniswap
1140,mKHjUofAdBAU9CQou-7t6HnBUhZN9LtAmVHtMSRYtas=,alerting-service,thirdweb-dev,third-web
1141,d1NCCH770A-tiuNyvFowG91X4nupTWEYDgTHEOM_YUc=,babel-archive,babel,babel
1142,EUrNxDjysNACU5anIjOy2cqqFLuNWtLVKaAk2Q-GLcs=,minify,babel,babel


In [21]:
repo_list = df_dep_repos['artifact_id'].to_list() + df_core_repos['artifact_id'].to_list()
len(repo_list)

2010

In [22]:
# get some high level metrics about all of our repos

repo_str = "'" + "','".join(repo_list) + "'"
results = client.query(f"""
    select distinct * except(project_id, artifact_source)
    from `oso.int_repo_metrics_by_project`
    where artifact_id in ({repo_str})
""")

df_repo_metrics = results.to_dataframe().set_index('artifact_id')
df_repo_metrics.to_csv('data/repo_metrics.csv')
df_repo_metrics.tail()

Unnamed: 0_level_0,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,created_at,updated_at,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count
artifact_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
qwfhpXi8eVTFcAGnyF86tVua8eu6g93PA0sWDyyKzLs=,prysmaticlabs,prysm,False,1023,3477,3477,Go,GPL-3.0,2018-01-11 21:31:33+00:00,2024-11-28 01:58:12+00:00,2018-01-15 17:42:31+00:00,2024-11-27 22:36:39+00:00,1081,70,3635.0
o5yI3tVxvW9JYJx5-Kfa-eGNtAB3b_c08XOxfxbPmn8=,metamask,core,False,188,293,293,TypeScript,MIT,2018-05-29 12:55:25+00:00,2024-11-27 22:19:39+00:00,2021-05-31 15:46:13+00:00,2024-11-27 22:19:36+00:00,571,71,1581.0
POjHIClwgZojMGj2z1u42rdmGwUthixu4HAdMHEUzuU=,facebook,lexical,False,1705,19996,19996,TypeScript,MIT,2020-12-03 22:53:26+00:00,2024-11-27 21:19:54+00:00,2022-04-13 16:51:12+00:00,2024-11-28 07:43:33+00:00,641,94,2008.0
kpFV74ZJQS_4V9FBNv4t9HTYWM6ysPMyr1oZ86EDnuo=,metamask,metamask-mobile,False,1120,2177,2177,TypeScript,NOASSERTION,2018-07-18 11:47:08+00:00,2024-11-28 00:40:18+00:00,2021-04-14 20:54:53+00:00,2024-11-28 00:40:16+00:00,674,99,2821.0
bxGDVW_162hhgvymEoLdJGYjBSWauJkJRnf43te0i_A=,metamask,metamask-extension,False,4930,12079,12079,TypeScript,NOASSERTION,2015-09-06 16:34:48+00:00,2024-11-28 00:14:50+00:00,2018-04-25 17:42:05+00:00,2024-11-28 07:17:16+00:00,1646,128,12858.0


In [23]:
# map the first level dependency graph of Ethereum Core repos
# note: any package already on OSO will have a non-null `package_artifact_id`

results = client.query("""
    with core_repos as (
      select distinct artifact_id
      from `oso.artifacts_by_collection_v1`
      where collection_name in ('ethereum-github', 'protocol-guild')
    )
    
    select distinct
      a.artifact_id as dependent_artifact_id,
      a.artifact_namespace as dependent_git_org,
      a.artifact_name as dependent_git_repo,
      a.package_artifact_source as package_manager,
      a.package_artifact_name as package_name,
      a.package_artifact_id,
      abp.project_name as package_project_name
    from `oso.int_sbom_artifacts` a
    left join `oso.artifacts_by_project_v1` as abp
      on a.package_artifact_id = abp.artifact_id
    where a.artifact_id in (select artifact_id from core_repos)
""")

df_dep_graph = results.to_dataframe()
df_dep_graph.to_parquet('data/dep_graph.parquet')
df_dep_graph.tail()

Unnamed: 0,dependent_artifact_id,dependent_git_org,dependent_git_repo,package_manager,package_name,package_artifact_id,package_project_name
115306,zHPR9aQ0GoieJsYupUS5lQI_aeljk79VThgCaHiHnhI=,nethermindeth,sign-in-with-starknet,NPM,@noble/hashes,,
115307,zHPR9aQ0GoieJsYupUS5lQI_aeljk79VThgCaHiHnhI=,nethermindeth,sign-in-with-starknet,NPM,is-fullwidth-code-point,,
115308,zHPR9aQ0GoieJsYupUS5lQI_aeljk79VThgCaHiHnhI=,nethermindeth,sign-in-with-starknet,NPM,jest-config,,
115309,zHPR9aQ0GoieJsYupUS5lQI_aeljk79VThgCaHiHnhI=,nethermindeth,sign-in-with-starknet,NPM,use-sidecar,,
115310,zHPR9aQ0GoieJsYupUS5lQI_aeljk79VThgCaHiHnhI=,nethermindeth,sign-in-with-starknet,NPM,exit,,


In [8]:
# get all GitHub activity to the repos we care about
# note: this is an expensive scan! 

results = client.query("""
    with core_repos as (
      select distinct artifact_id
      from `oso.artifacts_by_collection_v1`
      where collection_name in ('ethereum-github', 'protocol-guild')
    ),
    
    known_dependencies as (
      select distinct package_artifact_id as artifact_id
      from `oso.int_sbom_artifacts`
      where
        package_artifact_id is not null
        and artifact_id in (select artifact_id from core_repos)
    )
    
    select
      e.time,
      from_.artifact_name as git_user,
      to_.artifact_namespace as git_org,
      to_.artifact_name as git_repo,
      e.event_type
    from`oso.timeseries_events_by_artifact_v0` as e
    join `oso.artifacts_v1` as from_
      on e.from_artifact_id = from_.artifact_id
    join `oso.artifacts_v1` as to_
      on e.to_artifact_id = to_.artifact_id
    where
      e.time >= '2017-01-01'
      and e.event_source = 'GITHUB'
      and from_.artifact_name not like '%[bot]%'
      and e.to_artifact_id in (
        select artifact_id from core_repos
        union all
        select artifact_id from known_dependencies
      )
""")

df_events = results.to_dataframe()
df_events.to_parquet('data/events.parquet')
df_events.tail()

Unnamed: 0,time,git_user,git_org,git_repo,event_type
1945265,2022-11-22 11:48:03+00:00,chfast,ipsilon,evm-benchmarks,PULL_REQUEST_REVIEW_COMMENT
1945266,2022-11-08 17:29:37+00:00,chfast,ipsilon,evm-benchmarks,PULL_REQUEST_REVIEW_COMMENT
1945267,2022-11-22 08:36:29+00:00,chfast,ipsilon,evm-benchmarks,PULL_REQUEST_REVIEW_COMMENT
1945268,2022-11-22 10:36:44+00:00,rodiazet,ipsilon,evm-benchmarks,PULL_REQUEST_REVIEW_COMMENT
1945269,2022-11-16 19:47:39+00:00,rodiazet,ipsilon,evm-benchmarks,PULL_REQUEST_REVIEW_COMMENT
