In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd

In [2]:
# Getting-started guide for the OSO datasets:
# https://docs.opensource.observer/docs/get-started/
# Add your GCP project and credentials here.

# Interpolated into the fully qualified table names of the BigQuery queries
# built in this notebook (`{PROJECT}.oso.<table>`).
PROJECT = 'opensource-observer'
# Path to the credentials JSON file. It is relative, so it resolves against
# the directory the notebook kernel was started from.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../gcp_credentials.json'
# Created without arguments; presumably picks the credentials up from the
# environment variable set on the previous line.
client = bigquery.Client()

# Load and process the applications

In [3]:
# Load the raw application records. The context manager closes the file
# handle deterministically (the bare `open()` call left it to the garbage
# collector), and the explicit encoding keeps non-ASCII project names intact
# on platforms whose default encoding is not UTF-8.
with open("data/applications.json", encoding="utf-8") as applications_file:
    applications = json.load(applications_file)
print(len(applications))

148


In [4]:
# Labeled list of approved applications; its `application_id` column is what
# the application records are filtered against further down.
approved_apps_path = "data/approved_applications_labeled.csv"
approved_apps = pd.read_csv(approved_apps_path)
approved_apps.shape[0]

78

In [5]:
# Hand-written project records that seed `projects_data` before the
# applications are processed. Both rows belong to the same project (same
# project_id and attestation_id), one row per GitHub repository. The processing
# loop reports this project as "Missing applicationId" and skips it, which is
# why it is added manually here with the placeholder application_id 'unknown'.
# Each dict uses the same keys as the records built by the loop, plus
# `repo_owner_and_name` and `clean_url`; those two columns are recomputed from
# `repo_url` for every row once the DataFrame is built.
missing = [
    {
        'project_name': "Formal Verification of Optimism's L1 pausability mechanism",
        'attestation_id': '0x64961108e65c1c897e033c5609361839998fbf117c312b211ba7523100c28569',
        'project_id': '0x80393c05d524b7a6f7a78b0c141eadf0759642ae8d7e718134318cd2d73d5464',
        'application_id': 'unknown',
        'category_id': 2,
        'repo_url': 'https://github.com/runtimeverification/optimism-ci',
        'repo_name': 'optimism-ci',
        'repo_owner_and_name': 'runtimeverification/optimism-ci',
        'clean_url': 'https://github.com/runtimeverification/optimism-ci'
    },
    {
        'project_name': "Formal Verification of Optimism's L1 pausability mechanism",
        'attestation_id': '0x64961108e65c1c897e033c5609361839998fbf117c312b211ba7523100c28569',        
        'project_id': '0x80393c05d524b7a6f7a78b0c141eadf0759642ae8d7e718134318cd2d73d5464',
        'application_id': 'unknown',
        'category_id': 2,
        # Mixed-case URL as submitted; the derived fields below are lower-cased.
        'repo_url': 'https://github.com/runtimeverification/_audits_Ethereum-optimism_pausability',
        'repo_name': '_audits_ethereum-optimism_pausability',
        'repo_owner_and_name': 'runtimeverification/_audits_ethereum-optimism_pausability',
        'clean_url': 'https://github.com/runtimeverification/_audits_ethereum-optimism_pausability'
    }
]

In [6]:
# Build one record per (approved application, repo). Applications without any
# repo still get a single record with empty repo fields. The manually curated
# `missing` records come first; `.copy()` keeps that list itself unmodified.
projects_data = missing.copy()

# Build the lookup once instead of recomputing `.unique()` for every application.
approved_application_ids = set(approved_apps['application_id'])

for i, app in enumerate(applications):
    # `or` also covers keys that are present in the JSON but set to null.
    project = app.get('project') or {}
    repos = project.get('repos') or []
    project_name = project.get('name')
    project_id = project.get('id')

    try:
        impact_answer = app['impactStatementAnswer'][0]
        application_id = impact_answer['applicationId']
        category_id = impact_answer['impactStatement']['categoryId']
    except (KeyError, IndexError, TypeError):
        # Only "the answer structure is absent or malformed" is treated as a
        # skippable record; anything else (e.g. KeyboardInterrupt) propagates.
        print(f"Missing applicationId for project at index {i}: {project_name}.")
        continue

    if application_id not in approved_application_ids:
        continue

    # Fields shared by every record of this application. Key order is kept
    # because it determines the DataFrame's column order.
    base_record = {
        'project_name': project_name,
        'attestation_id': app.get('attestationId'),
        'project_id': project_id,
        'application_id': application_id,
        'category_id': category_id,
    }

    if repos:
        for repo in repos:
            repo_url = repo.get('url')

            # fix one edge case (guarded so a repo entry without a URL no
            # longer raises AttributeError on `.lower()`)
            if isinstance(repo_url, str) and repo_url.lower() == 'https://github.com/protocolguild/membership':
                repo_url = 'https://github.com/protocolguild/documentation'

            projects_data.append({
                **base_record,
                'repo_url': repo_url,
                'repo_name': repo.get('name'),
            })
    else:
        projects_data.append({
            **base_record,
            'repo_url': None,
            'repo_name': None,
        })

df_projects = pd.DataFrame(projects_data)

def extract_owner_and_repo(url):
    """Return the lower-cased ``owner/repo`` slug of a GitHub URL, or None.

    Args:
        url: Candidate repository URL. Anything that is not a non-empty
            string yields None.

    Returns:
        ``"owner/repo"`` in lower case when the URL mentions ``github.com``
        and has both an owner and a repo segment after a
        ``scheme://host/`` prefix; otherwise None. Deeper paths
        (``/tree/main`` ...) are ignored, and a trailing ``.git``, query
        string or fragment is stripped from the repo segment.
    """
    if not url or not isinstance(url, str):
        return None

    url = url.strip().lower()
    if "github.com" not in url:
        return None

    # 'https://github.com/owner/repo' -> ['https:', '', 'github.com', 'owner', 'repo']
    parts = url.split('/')
    if len(parts) < 5:
        return None

    owner = parts[3]
    # Drop anything that is not part of the repository name itself.
    repo = parts[4].split('?')[0].split('#')[0]
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]

    # 'https://github.com/owner/' used to produce the bogus slug 'owner/'.
    if not owner or not repo:
        return None
    return f"{owner}/{repo}"

def clean_repo_url(owner_and_name):
    """Build the canonical GitHub URL for an ``owner/repo`` slug.

    Returns None when the slug is falsy (None or an empty string).
    """
    if not owner_and_name:
        return None
    return "https://github.com/{}".format(owner_and_name)

# Normalise each raw repo URL to an `owner/repo` slug, then rebuild a
# canonical URL from that slug.
df_projects['repo_owner_and_name'] = df_projects['repo_url'].apply(extract_owner_and_repo)
df_projects['clean_url'] = df_projects['repo_owner_and_name'].apply(clean_repo_url)

# Lookups keyed by application id; rows sharing an id collapse to one entry.
projects_by_application = df_projects.set_index('application_id')
project_name_mappings = projects_by_application['project_name'].to_dict()
project_category_mappings = projects_by_application['category_id'].to_dict()

# Distinct, non-null canonical URLs, in order of first appearance.
distinct_clean_urls = df_projects['clean_url'].dropna().unique()
record_count = len(df_projects)
distinct_project_count = df_projects['project_id'].nunique()

print(f"Loaded {record_count} records\
        \n... including {len(distinct_clean_urls)} repos\
        \n... from {distinct_project_count} unique applications.\n\n")

repo_urls = list(distinct_clean_urls)
df_projects.tail(1)

Missing applicationId for project at index 6: Formal Verification of Optimism's L1 pausability mechanism.
Loaded 109 records        
... including 90 repos        
... from 79 unique applications.




Unnamed: 0,project_name,attestation_id,project_id,application_id,category_id,repo_url,repo_name,repo_owner_and_name,clean_url
108,Solady,0xd42d5fa61ac3f9488e7b5c5fd24709d9d2b130750a9e...,0x9151666888d0ca532a529be98a50d2eb992988117e20...,d53f8df1-c9dd-49a9-a446-945072af1f6f,2,https://github.com/Vectorized/solady,Solady,vectorized/solady,https://github.com/vectorized/solady


# Fetch a snapshot of current repo metrics from OSO

In [12]:
# Get snapshot of repo metrics (taken 2024-09-23)

# No longer spliced into the SQL text; retained in case another cell still
# references the quoted, comma-separated form.
repo_urls_str = "'" + "','".join(repo_urls) + "'"

# The repo URLs come from applicant-supplied data, so they are handed to
# BigQuery as an array query parameter instead of being pasted into the SQL:
# a URL containing a quote character can no longer break or alter the query.
# `PROJECT` is a notebook constant and is still interpolated, because table
# names cannot be parameterised.
repos_query = f"""
    select
      p.project_name,
      abp.artifact_id,
      abp.artifact_namespace,
      abp.artifact_name,
      abp.artifact_url,
      abp.artifact_type,
      rm.is_fork,
      rm.fork_count,
      rm.star_count,
      rm.language,
      rm.license_spdx_id,
      abp.project_id as oso_project_id,
    from `{PROJECT}.oso.int_artifacts_in_ossd_by_project` as abp
    join `{PROJECT}.oso.projects_v1` as p
      on abp.project_id = p.project_id
    join `{PROJECT}.oso.int_repo_metrics_by_project` as rm
      on abp.artifact_id = rm.artifact_id
    where abp.artifact_url in unnest(@repo_urls)
"""
repos_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("repo_urls", "STRING", list(repo_urls)),
    ]
)
repos_query_result = client.query(repos_query, job_config=repos_job_config)
df_repos = repos_query_result.to_dataframe()
# 'NOASSERTION' is shown as 'Custom' in the license column.
df_repos['license_spdx_id'] = df_repos['license_spdx_id'].replace({'NOASSERTION': 'Custom'})
df_repos.tail(1)

Unnamed: 0,project_name,artifact_id,artifact_namespace,artifact_name,artifact_url,artifact_type,is_fork,fork_count,star_count,language,license_spdx_id,oso_project_id
87,kroma-network,aXbDRz3QGK6qa0CcrBNLAMi1qlw_eVaB-vq06PnT0O4=,kroma-network,kroma,https://github.com/kroma-network/kroma,REPOSITORY,False,82,159,Go,Custom,F8npEwagURJOf7hOCr27eOcUjo9m51wa4KlOH0ZsO9c=


In [13]:
# Split the application repos into those the metrics query returned a row for
# and those it did not; the latter are only reported.
indexed_artifact_urls = set(df_repos['artifact_url'].unique())

print("Ignored repos:")
valid_repo_urls = [url for url in repo_urls if url in indexed_artifact_urls]
for url in repo_urls:
    if url not in indexed_artifact_urls:
        print(url)

print()
print("Indexed repos:", len(valid_repo_urls))

Ignored repos:
https://github.com/jsvisa/retro5
https://github.com/richardgreg/op-docs-improvements

Indexed repos: 88


In [18]:
def _column_lookup(frame, key_column, value_column):
    """Return a {key_column value: value_column value} dict built from `frame`."""
    return frame.set_index(key_column)[value_column].to_dict()


# clean_url -> application_id, restricted to repos that have metrics.
indexed_project_rows = df_projects.loc[
    df_projects['clean_url'].isin(valid_repo_urls),
    ['clean_url', 'application_id', 'project_id', 'attestation_id'],
].drop_duplicates()
repo_app_mapping = _column_lookup(indexed_project_rows, 'clean_url', 'application_id')

# application_id -> attestation_id, over every project record.
project_attestation_mapping = _column_lookup(df_projects, 'application_id', 'attestation_id')

# Tag each repo row with the application and attestation it belongs to.
df_repos['application_id'] = df_repos['artifact_url'].map(repo_app_mapping)
df_repos['attestation_id'] = df_repos['application_id'].map(project_attestation_mapping)

# Lookups derived from the enriched repo table.
artifact_app_mapping = _column_lookup(df_repos, 'artifact_id', 'application_id')
artifact_url_mapping = _column_lookup(df_repos, 'artifact_url', 'artifact_id')
# Despite its name, this one maps attestation_id -> project_name.
project_app_mapping = _column_lookup(df_repos, 'attestation_id', 'project_name')

df_repos.tail(1)

Unnamed: 0,project_name,artifact_id,artifact_namespace,artifact_name,artifact_url,artifact_type,is_fork,fork_count,star_count,language,license_spdx_id,oso_project_id,application_id,attestation_id
87,kroma-network,aXbDRz3QGK6qa0CcrBNLAMi1qlw_eVaB-vq06PnT0O4=,kroma-network,kroma,https://github.com/kroma-network/kroma,REPOSITORY,False,82,159,Go,Custom,F8npEwagURJOf7hOCr27eOcUjo9m51wa4KlOH0ZsO9c=,f29a9466-c088-4314-9f78-0be7034caba2,0x29689510e5add50d929566fcbb78a8f85fac28545928...


In [26]:
# Persist the attestation_id -> project_name lookup as indented JSON.
with open("data/project_attestation_mapping.json", "w") as mapping_file:
    mapping_file.write(json.dumps(project_app_mapping, indent=2))

# Fetch OSO event data from relevant repos

In [10]:
# Get all event data (cutoff date of 2024-08-01)

# These ids are the keys of `artifact_app_mapping`, i.e. values that came back
# from the repo metrics query, so they are interpolated into the SQL as-is.
artifact_ids = list(artifact_app_mapping.keys())
artifact_ids_str = "'" + "','".join(artifact_ids) + "'"

CUTOFF = '2024-08-01'

# Set to True to re-query OSO and overwrite the local backup. The default
# (False) only reads the existing parquet file and never touches BigQuery.
USE_LIVE_EVENTS = False
EVENTS_BACKUP_PATH = "data/rf5_events.parquet"

events_query = f"""
    select
        time,
        event_type,
        from_artifact_name as user,
        from_artifact_id,
        to_artifact_id 
    from `{PROJECT}.oso.int_events`
    where
        to_artifact_id in ({artifact_ids_str})
        and time < '{CUTOFF}'
"""

if USE_LIVE_EVENTS:
    events_query_results = client.query(events_query)
    df_events = events_query_results.to_dataframe()

    # add application ids
    df_events['application_id'] = df_events['to_artifact_id'].map(artifact_app_mapping)

    # filter bot activity
    bot_list = ['codecov-commenter', 'claassistant', 'googlebot', 'omahs']
    # dropna(): a null user would make the '[bot]' substring test raise TypeError.
    github_users = list(df_events['user'].dropna().unique())
    bots = [x for x in github_users if '[bot]' in x or x in bot_list]
    df_events = df_events[~df_events['user'].isin(bots)]

    df_events.to_parquet(EVENTS_BACKUP_PATH)

# Always read back from the backup so both paths yield the same frame layout.
df_events = pd.read_parquet(EVENTS_BACKUP_PATH)

# Day-resolution timestamp of each event, plus a unit weight column.
df_events['bucket_day'] = pd.to_datetime(df_events['time'].dt.date)
df_events['amount'] = 1
df_events.tail(1)

Unnamed: 0,time,event_type,user,from_artifact_id,to_artifact_id,application_id,bucket_day,amount
955757,2023-08-07 08:19:24+00:00,PULL_REQUEST_REVIEW_COMMENT,thomaseizinger,xOfgF7_wYw1J5fCCwpUuFs53BTw1iXb1wenhuspVXXM=,dxsMNRXWzfg8lMvq0M4bY-NZ5961glN0Q-X64anZ8BI=,4eee1576-c6aa-42c5-ae48-73d72bbbd82c,2023-08-07,1


# Derive consolidated metrics

- How many unique contributors are we rewarding?
- How many years in development?
- How many forks/stars?

In [11]:
# Fork and star totals summed over every row of the repo metrics table.
for label, column in (("Forks:", 'fork_count'), ("Stars:", 'star_count')):
    print(label, df_repos[column].sum())

Forks: 38021
Stars: 132856


In [12]:
print("Total years:")


def _years_before_reference(first_event_time):
    """Years from a timestamp to the 2024-09 reference point, at month resolution."""
    return (2024. + 9/12.) - (first_event_time.year + first_event_time.month/12.)


# Earliest recorded event per repo, converted to an age in years and summed.
first_event_per_repo = df_events.groupby('to_artifact_id')['time'].min()
first_event_per_repo.apply(_years_before_reference).sum()

Total years:


194.08333333333348

In [13]:
# Event types whose authors are counted as "developers" in the cells below.
# Only commits are active; the other candidates are deliberately commented out.
dev_event_types = [
    'COMMIT_CODE',
    #'PULL_REQUEST_OPENED',
    #'PULL_REQUEST_REVIEW_COMMENT',
    #'ISSUE_OPENED'
]

# Event types whose authors are counted as "contributors" in the cells below
# (a superset of the developer event types).
contributor_event_types = [
    'COMMIT_CODE',
    'PULL_REQUEST_OPENED',
    'PULL_REQUEST_REVIEW_COMMENT',
    'ISSUE_OPENED'
]

# Lower bound of the "RF Period" counts: only events with time >= cutoff are
# included. Distinct from the upper-case CUTOFF, which is the upper bound
# applied to the events query.
cutoff = '2023-10-01'

In [14]:
print("Developers (alltime):")
# Distinct authors of developer-type events, with no date restriction.
is_dev_event = df_events['event_type'].isin(dev_event_types)
df_events.loc[is_dev_event, 'from_artifact_id'].nunique()

Developers (alltime):


566

In [15]:
print("Developers (RF Period):")
# Distinct authors of developer-type events dated on or after `cutoff`.
is_dev_event = df_events['event_type'].isin(dev_event_types)
in_rf_period = df_events['time'] >= cutoff
df_events.loc[is_dev_event & in_rf_period, 'from_artifact_id'].nunique()

Developers (RF Period):


374

In [16]:
print("Contributors (alltime):")
# Distinct authors of contributor-type events, with no date restriction.
is_contributor_event = df_events['event_type'].isin(contributor_event_types)
df_events.loc[is_contributor_event, 'from_artifact_id'].nunique()

Contributors (alltime):


11903

In [17]:
print("Contributors (RF Period):")
# Distinct authors of contributor-type events dated on or after `cutoff`.
is_contributor_event = df_events['event_type'].isin(contributor_event_types)
in_rf_period = df_events['time'] >= cutoff
df_events.loc[is_contributor_event & in_rf_period, 'from_artifact_id'].nunique()

Contributors (RF Period):


2596

# Generate a collection

In [18]:
# Print every distinct project name in the repo table as a sorted bullet list.
project_names = sorted(df_repos['project_name'].unique())
for name in project_names:
    print(f"- {name}")

- alt-research
- blob-archiver-rs
- builderism-rabbitprincess
- dappnode
- defi-wonderland
- eoa-blockchain-labs
- erigontech
- eth-infinitism-account-abstraction
- ethereum-attestation-service
- ethereum-miscellania
- ethereum-pos-testnet-rzmahmood
- ethereumjs
- ethpandaops
- ethstorage
- fe-ethereum
- gelato
- get-smooth
- go-ethereum
- grandinetech
- hermes-probe-lab
- hildr-optimism-java
- kroma-network
- libp2p
- lighthouse-sigp
- lodestar-chainsafe
- nethermindeth
- nimbus-status-im
- nodeguardians
- op
- op-besu-optimism-java
- op-stack-deployer-aymen-tirchi
- protocol-guild
- quic-go
- redprint-ratimon
- reth-paradigmxyz
- revm-bluealloy
- roll-op-0xfableorg
- runtimeverification
- sherlock-protocol
- shutter-network
- simple-optimism-node-smartcontracts
- solady-vectorized
- solidity-ethereum
- succinctlabs
- testinprod-io
- the-book-of-optimism-fault-proof-joohhnnn
- understanding-optimism-codebase-joohhnnn
- upnodedev
- vacp2p
- vyperlang
- zenbiteth
