In [1]:
from google.cloud import bigquery
from collections import Counter
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import re

In [2]:
# Project that all BigQuery queries in this notebook are issued against.
GCP_PROJECT = 'opensource-observer'

# Google client libraries read the credentials file location from this
# environment variable, so it must be set before the client is created.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../oso_gcp_credentials.json'
client = bigquery.Client(project=GCP_PROJECT)

# Load the unweighted graph

In [3]:
def check_weight(g):
    """Return the sum of the ``'weight'`` attribute over all edges of ``g``.

    ``g`` must provide ``edges(data=True)`` yielding ``(u, v, attrs)``
    triples. Every ``attrs`` mapping needs a ``'weight'`` key; a missing key
    raises ``KeyError``.
    """
    edge_weights = (edge[2]['weight'] for edge in g.edges(data=True))
    return sum(edge_weights)

def stringify_array(arr):
    """Render ``arr`` as comma-separated, single-quoted SQL string literals.

    The result is meant to be interpolated inside an ``in (...)`` clause,
    e.g. ``['a', 'b']`` becomes ``'a','b'``.

    Each item is converted with ``str()`` so non-string values no longer
    raise ``TypeError``. Backslashes and single quotes are backslash-escaped
    so a value cannot terminate its literal early and change the query.
    An empty iterable yields ``''`` (one empty literal), which keeps the
    surrounding ``in (...)`` syntactically valid.

    Args:
        arr: Any iterable of values (list, set, NumPy array, ...).

    Returns:
        str: The quoted, comma-joined literals.
    """
    def _quote(value):
        # Escape the backslash first so the escapes added for quotes are
        # not themselves doubled.
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{text}'"

    quoted = [_quote(value) for value in arr]
    return ",".join(quoted) if quoted else "''"

In [4]:
# Read the node-link JSON from disk and rebuild the networkx graph from it.
with open('data/unweighted_graph.json', 'r') as graph_file:
    graph_data = json.loads(graph_file.read())

G = nx.node_link_graph(graph_data)

# Cell output: total edge weight of the graph as loaded.
check_weight(G)

0.9999999999997822

In [5]:
# Number of edges in the loaded graph.
G.number_of_edges()

10078

In [6]:
# Gather the 'relation' attribute of every edge that carries one, then count
# how many edges fall under each relation value.
relations = []
for _source, _target, attrs in G.edges(data=True):
    if 'relation' in attrs:
        relations.append(attrs['relation'])

pkg_relation_counts = Counter(relations)
pkg_relation_counts

Counter({'NPM': 8036, 'RUST': 1345, 'GO': 561, 'PIP': 136})

# Grab some basic GitHub stats

In [7]:
# Owners that are left out of the metrics query below.
ignore_list = ['facebook', 'huggingface', 'webpack', 'babel', 'pandas-dev', 'pnpm', 'eslint', 'numpy']

# The first path segment after the GitHub host is the repo owner. Node names
# that do not start with the GitHub URL prefix produce no match and are skipped.
owner_pattern = re.compile(r"https://github\.com/([^/]+)")
owner_matches = (owner_pattern.match(node) for node in G.nodes)
owners = {
    found.group(1)
    for found in owner_matches
    if found and found.group(1) not in ignore_list
}
len(owners)

1805

In [8]:
# Fetch the repo-level metrics rows whose namespace is one of the owners
# collected from the graph.
results = client.query(f"""
    select distinct * except(project_id, artifact_source)
    from `oso.int_repo_metrics_by_project`
    where artifact_namespace in ({stringify_array(owners)})
""")
df_repo_metrics = results.to_dataframe()

# Build each repo's GitHub URL column-wise instead of with a row-wise
# `apply(axis=1)`: the vectorized form is faster and still works when the
# query returns no rows (row-wise apply on an empty frame returns a DataFrame,
# which cannot be assigned to a single column). `astype(str)` renders missing
# values the same way the f-string formatting did.
df_repo_metrics['github_url'] = (
    "https://github.com/"
    + df_repo_metrics['artifact_namespace'].astype(str)
    + "/"
    + df_repo_metrics['artifact_name'].astype(str)
)

# Keep a CSV snapshot of the metrics next to the other data files.
df_repo_metrics.to_csv('data/repo_metrics.csv')
df_repo_metrics.tail()

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,created_at,updated_at,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count,github_url
3985,6fxr8I3LEWIJbAn4FMC4H9lC9FOy2_iIRVEexGio82I=,prysmaticlabs,prysm-testnet-site,False,21,4,4,TypeScript,,2019-02-15 17:13:18+00:00,2024-01-31 06:25:45+00:00,2019-03-18 15:04:07+00:00,2020-11-04 20:15:10+00:00,42,8,248.0,https://github.com/prysmaticlabs/prysm-testnet...
3986,27atwT4IUa3Q5bHROXMU93w38KWR3gCuRTcrLFY5cBA=,walletconnect,notify-server,False,6,20,20,Rust,MIT,2023-07-26 14:01:09+00:00,2024-11-19 07:08:43+00:00,2023-07-26 17:05:19+00:00,2024-05-17 17:53:21+00:00,129,8,742.0,https://github.com/walletconnect/notify-server
3987,tmL0p2YwVc0I7JkkSuJkibr-iUQSkmHp2I2a1Rw6NhA=,walletconnect,actions,False,1,10,10,,,2022-06-22 22:07:15+00:00,2024-10-13 05:17:13+00:00,2022-06-22 22:12:48+00:00,2024-10-09 14:15:43+00:00,37,8,59.0,https://github.com/walletconnect/actions
3988,IJoXe4rw355BlweIwdS4FCc_4wmBuZG3b8lEIfl8ngI=,walletconnect,verify-server,False,3,7,7,HCL,MIT,2023-01-23 20:27:03+00:00,2024-10-01 03:08:32+00:00,2023-01-23 20:42:48+00:00,2024-06-27 17:03:37+00:00,49,8,193.0,https://github.com/walletconnect/verify-server
3989,1Jus8lqY4Kushe1BiBrNqu_Kv6TmOE5wimRqTTQANss=,walletconnect,keys-server,False,8,15,15,HCL,MIT,2022-06-16 10:13:09+00:00,2024-10-13 05:38:46+00:00,2022-07-07 07:28:38+00:00,2024-10-09 10:46:01+00:00,69,8,216.0,https://github.com/walletconnect/keys-server


# Apply a basic (naive) weighting algorithm

In [9]:
# Lookup table from repo URL to star count; if a URL appears more than once
# the last row wins.
repo_star_map = dict(
    zip(df_repo_metrics['github_url'], df_repo_metrics['star_count'])
)

def calculate_edge_weight(source_stars, target_stars, relation_count):
    """Compute the raw (un-normalised) weight of one dependency edge.

    The weight is the harmonic mean of ``log1p(source_stars)`` and
    ``log1p(target_stars)``, divided by ``relation_count``.

    Args:
        source_stars: Star count of the source repo (expected non-negative).
        target_stars: Star count of the target repo (expected non-negative).
        relation_count: Number of edges sharing this edge's relation type.

    Returns:
        The edge weight. It is ``0.0`` when both star counts are zero: the
        harmonic mean would otherwise be ``0 / 0`` (NaN), and a single NaN
        weight would turn the normalisation total, and so every edge weight,
        into NaN.
    """
    EPSILON = 1e-6  # keeps the final division defined if relation_count is 0
    log_source = np.log1p(source_stars)
    log_target = np.log1p(target_stars)
    log_sum = log_source + log_target
    if log_sum == 0:
        # Both endpoints have zero stars: define the harmonic mean as 0.
        return 0.0
    harmonic_mean = (2 * log_source * log_target) / log_sum
    return harmonic_mean / (relation_count + EPSILON)

# Give every edge a raw weight derived from its endpoints' star counts.
# Repos missing from repo_star_map are treated as having 0 stars.
for source_repo, target_repo, data in G.edges(data=True):
    source_stars = repo_star_map.get(source_repo, 0)
    target_stars = repo_star_map.get(target_repo, 0)
    # Edges without a 'relation' attribute (tolerated when the counts were
    # built) fall back to a count of 1 instead of raising KeyError.
    relation_count = pkg_relation_counts.get(data.get('relation'), 1)
    data['weight'] = calculate_edge_weight(source_stars, target_stars, relation_count)

# Rescale so the weights sum to 1. A zero total means every weight is already
# 0, so the division is skipped rather than producing NaN weights.
total_weight = check_weight(G)
if total_weight:
    for u, v, data in G.edges(data=True):
        data['weight'] /= total_weight

In [10]:
# For each node, add up the weights of the edges that G.edges(node) reports.
weighted_degree = {
    node: sum(edge[2]['weight'] for edge in G.edges(node, data=True))
    for node in G.nodes
}
    
# One row per node: its name, summed edge weight, and 'level' attribute
# (nodes without a 'level' attribute default to 2).
node_metrics = pd.DataFrame(
    [
        {
            'Node': node,
            'Weighted Degree': degree,
            'Level': G.nodes[node].get('level',2),
        }
        for node, degree in weighted_degree.items()
    ],
    columns=['Node', 'Weighted Degree', 'Level'],
)

def _top_by_weighted_degree(level_rows, limit=20):
    """Return the ``limit`` rows of ``level_rows`` with the largest 'Weighted Degree'."""
    ranked = level_rows.sort_values(by='Weighted Degree', ascending=False)
    return ranked.head(limit)


# Keep the 20 highest-weighted nodes within each level.
top_nodes_by_level = node_metrics.groupby('Level', group_keys=False).apply(
    _top_by_weighted_degree
)

top_nodes_by_level

Unnamed: 0,Node,Weighted Degree,Level
7,https://github.com/erigontech/erigon,0.266165,1
8,https://github.com/prysmaticlabs/prysm,0.259328,1
16,https://github.com/ethereum/web3.py,0.164607,1
12,https://github.com/paradigmxyz/reth,0.06835,1
10,https://github.com/sigp/lighthouse,0.059195,1
9,https://github.com/ethereum/go-ethereum,0.05763,1
14,https://github.com/ethereum/py-evm,0.032712,1
11,https://github.com/grandinetech/grandine,0.022341,1
5,https://github.com/ethereum/remix-project,0.021338,1
0,https://github.com/eth-infinitism/account-abst...,0.013226,1


In [11]:
# Serialize the weighted graph back to node-link JSON on disk.
output_path = "data/example_weighted_graph.json"
graph_json = nx.node_link_data(G)
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(graph_json, indent=2))

# Demo of how to grab OSO raw event data

In [12]:
# Distinct artifact ids of the repos returned by the metrics query.
artifact_id_column = df_repo_metrics['artifact_id']
artifact_ids = artifact_id_column.unique()
len(artifact_ids)

3990

In [13]:
# Monthly GitHub event totals (from 2020 onward) for the repos of interest,
# grouped by acting user, target org/repo, and event type.
q = f"""
    select
      date_trunc(time, MONTH) as event_month,
      from_artifact_name as git_user,
      to_artifact_namespace as git_org,
      to_artifact_name as git_repo,
      event_type,
      sum(amount) as amount
    from `oso.int_events__github`
    where
        to_artifact_id in ({stringify_array(artifact_ids)})
        and time >= '2020-01-01'
    group by 1,2,3,4,5
"""

events_path = 'data/events.parquet'

# Run the query and persist the result as parquet.
results = client.query(q)
df_events = results.to_dataframe()
df_events.to_parquet(events_path)

# Reload from the saved file so the frame used below is exactly what is on
# disk (presumably so later runs can start from the cached file -- confirm).
df_events = pd.read_parquet(events_path)
df_events.tail()

Unnamed: 0,event_month,git_user,git_org,git_repo,event_type,amount
1221482,2024-03-01 00:00:00+00:00,as-sajdah,nomicfoundation,hardhat,FORKED,1.0
1221483,2023-10-01 00:00:00+00:00,0x0oz,nomicfoundation,hardhat,ISSUE_CLOSED,1.0
1221484,2022-06-01 00:00:00+00:00,atheter,nomicfoundation,hardhat,ISSUE_COMMENT,1.0
1221485,2024-10-01 00:00:00+00:00,globalynk1,nomicfoundation,hardhat,FORKED,1.0
1221486,2023-09-01 00:00:00+00:00,michaeldavid1999,nomicfoundation,hardhat,STARRED,1.0


In [14]:
# Headline counts, followed by the total amount per event type.
repo_total = len(artifact_ids)
git_user_total = df_events['git_user'].nunique()
print("Repos:", repo_total)
print("Git Users:", git_user_total)

amount_by_event_type = df_events.groupby('event_type')['amount'].sum()
amount_by_event_type

Repos: 3990
Git Users: 286740


event_type
COMMIT_CODE                    533503.0
FORKED                         199195.0
ISSUE_CLOSED                   126357.0
ISSUE_COMMENT                  861824.0
ISSUE_OPENED                   218345.0
ISSUE_REOPENED                   4474.0
PULL_REQUEST_CLOSED            381165.0
PULL_REQUEST_MERGED            313453.0
PULL_REQUEST_OPENED            386929.0
PULL_REQUEST_REOPENED            3319.0
PULL_REQUEST_REVIEW_COMMENT    531096.0
RELEASE_PUBLISHED               28597.0
STARRED                        445527.0
Name: amount, dtype: float64