In [1]:
from google.cloud import bigquery
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import re

In [2]:
# Configuration: location of the service-account key file and the project
# name handed to the BigQuery client.
GCP_CREDENTIALS_PATH = '../../oso_gcp_credentials.json'
GCP_PROJECT = 'opensource-observer'

# Export the key path through the environment before the client is created.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH
client = bigquery.Client(project=GCP_PROJECT)

# Load our graph

In [3]:
def check_weight(g):
    """Return the sum of the 'weight' attribute over all edges of `g`.

    `g` must expose `edges(data=True)` yielding `(u, v, attrs)` triples in
    which every `attrs` mapping has a 'weight' entry.
    """
    total = 0
    for *_endpoints, attrs in g.edges(data=True):
        total += attrs['weight']
    return total

def stringify_array(arr):
    """Render an iterable as a comma-separated list of single-quoted SQL string literals.

    The result is meant to be interpolated inside `in (...)` in the queries
    built in this notebook, e.g. ``['a', 'b']`` -> ``'a','b'``.

    Args:
        arr: Any iterable of values. Each element is converted with `str()`,
            so non-string values (ints, numpy scalars, ...) are accepted.

    Returns:
        str: The quoted, comma-joined literals. An empty iterable yields
        ``''`` (a single empty-string literal), which keeps `in (...)`
        syntactically valid.
    """
    def _quote(value):
        # Escape backslashes first, then single quotes, so a value cannot
        # terminate its own literal and alter the surrounding query.
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return "'" + text + "'"

    literals = [_quote(value) for value in arr]
    if not literals:
        return "''"
    return ",".join(literals)

In [4]:
# Read the node-link JSON from disk and rebuild the networkx graph from it.
with open('data/unweighted_graph.json', 'r') as graph_file:
    graph_data = json.loads(graph_file.read())

G = nx.node_link_graph(graph_data)

# Display the total edge weight of the freshly loaded graph.
check_weight(G)

1.0000000000005178

# Grab some basic GitHub stats

In [5]:
# Owners that are left out of the metrics query even if they appear in the graph.
ignore_list = ['facebook', 'huggingface', 'webpack', 'babel', 'pandas-dev', 'pnpm', 'eslint', 'numpy']

# The first path segment after the GitHub host is the repository owner.
owner_pattern = re.compile(r"https://github\.com/([^/]+)")

# Collect the distinct owners of every graph node that is a GitHub URL.
owners = {
    found.group(1)
    for found in map(owner_pattern.match, G.nodes)
    if found and found.group(1) not in ignore_list
}
len(owners)

2494

In [6]:
# `owners` is a set, so its iteration order is not fixed between kernels.
# Sorting it first keeps the generated SQL text identical from run to run.
owner_list_sql = stringify_array(sorted(owners))
results = client.query(f"""
    select distinct * except(project_id, artifact_source)
    from `oso.int_repo_metrics_by_project`
    where artifact_namespace in ({owner_list_sql})
""")
df_repo_metrics = results.to_dataframe()

# Rebuild each repo's GitHub URL from its namespace and name. A plain list is
# assigned (instead of a row-wise apply) so this also works on an empty result.
df_repo_metrics['github_url'] = [
    f"https://github.com/{namespace}/{repo_name}"
    for namespace, repo_name in zip(
        df_repo_metrics['artifact_namespace'],
        df_repo_metrics['artifact_name'],
    )
]

# Keep a local copy of the metrics, then preview the last rows.
df_repo_metrics.to_csv('data/repo_metrics.csv')
df_repo_metrics.tail()

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,created_at,updated_at,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count,github_url
6584,qwfhpXi8eVTFcAGnyF86tVua8eu6g93PA0sWDyyKzLs=,prysmaticlabs,prysm,False,1023,3477,3477,Go,GPL-3.0,2018-01-11 21:31:33+00:00,2024-11-28 01:58:12+00:00,2018-01-15 17:42:31+00:00,2024-11-27 22:36:39+00:00,1081,70,3635.0,https://github.com/prysmaticlabs/prysm
6585,o5yI3tVxvW9JYJx5-Kfa-eGNtAB3b_c08XOxfxbPmn8=,metamask,core,False,188,293,293,TypeScript,MIT,2018-05-29 12:55:25+00:00,2024-11-27 22:19:39+00:00,2021-05-31 15:46:13+00:00,2024-11-27 22:19:36+00:00,571,71,1581.0,https://github.com/metamask/core
6586,7WT0FAMb3nzt8FfZh5IzdOhmtCUeoOqpLU0TYOFQ7zM=,status-im,status-go,False,247,728,728,Go,MPL-2.0,2016-06-13 15:32:03+00:00,2024-11-27 09:21:53+00:00,2016-09-21 11:51:15+00:00,2024-11-28 07:52:07+00:00,1392,85,4359.0,https://github.com/status-im/status-go
6587,kpFV74ZJQS_4V9FBNv4t9HTYWM6ysPMyr1oZ86EDnuo=,metamask,metamask-mobile,False,1120,2177,2177,TypeScript,NOASSERTION,2018-07-18 11:47:08+00:00,2024-11-28 00:40:18+00:00,2021-04-14 20:54:53+00:00,2024-11-28 00:40:16+00:00,674,99,2821.0,https://github.com/metamask/metamask-mobile
6588,bxGDVW_162hhgvymEoLdJGYjBSWauJkJRnf43te0i_A=,metamask,metamask-extension,False,4930,12079,12079,TypeScript,NOASSERTION,2015-09-06 16:34:48+00:00,2024-11-28 00:14:50+00:00,2018-04-25 17:42:05+00:00,2024-11-28 07:17:16+00:00,1646,128,12858.0,https://github.com/metamask/metamask-extension


# Apply a basic (naive) weighting algorithm

In [7]:
# Lookup table: repo URL -> star count.
repo_star_map = dict(
    zip(df_repo_metrics['github_url'], df_repo_metrics['star_count'])
)

# Raw edge weight: Euclidean norm of the two endpoints' star counts.
# Repos missing from the lookup table count as 0 stars.
for src_url, dst_url, edge_attrs in G.edges(data=True):
    src_stars = repo_star_map.get(src_url, 0)
    dst_stars = repo_star_map.get(dst_url, 0)
    edge_attrs['weight'] = np.sqrt(src_stars**2 + dst_stars**2)

# Rescale every edge by the grand total so the weights sum to 1.
total_weight = check_weight(G)
for *_endpoints, edge_attrs in G.edges(data=True):
    edge_attrs['weight'] = edge_attrs['weight'] / total_weight

In [8]:
# Nodes whose 'level' attribute equals 1 are excluded from the ranking,
# both as sources and as neighbours.
level_1_nodes = set()
for candidate, node_attrs in G.nodes(data=True):
    if node_attrs.get('level') == 1:
        level_1_nodes.add(candidate)

# For each remaining node, add up the weights of its edges whose other
# endpoint is not a level-1 node.
weighted_degree = {
    node: sum(
        edge_attrs['weight']
        for _, neighbor, edge_attrs in G.edges(node, data=True)
        if neighbor not in level_1_nodes
    )
    for node in G.nodes
    if node not in level_1_nodes
}

# Tabulate the result, heaviest nodes first.
importance_table = pd.DataFrame({
    'Node': list(weighted_degree.keys()),
    'Weighted Degree': list(weighted_degree.values())
})
importance_metrics = importance_table.sort_values(
    by='Weighted Degree', ascending=False
)

importance_metrics.head(40)

Unnamed: 0,Node,Weighted Degree
4,https://github.com/ethereum/remix-project,0.153187
46,https://github.com/ethereum/go-ethereum,0.137314
44,https://github.com/chainsafe/lodestar,0.049087
50,https://github.com/paradigmxyz/reth,0.048514
41,https://github.com/sigp/lighthouse,0.034741
2,https://github.com/ethereum/remix,0.031509
39,https://github.com/prysmaticlabs/prysm,0.02173
49,https://github.com/erigontech/erigon,0.019985
6,https://github.com/ethereum/sourcify,0.019485
9,https://github.com/ethereum/hive,0.006615


In [9]:
# Serialize the weighted graph back to node-link JSON on disk.
output_path = "data/example_weighted_graph.json"
graph_json = nx.node_link_data(G)
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(graph_json, indent=2))

# Demo of how to grab OSO raw event data

In [10]:
# Distinct artifact ids of the repos returned by the metrics query.
artifact_id_column = df_repo_metrics['artifact_id']
artifact_ids = artifact_id_column.unique()
len(artifact_ids)

6589

In [11]:
# Get all GitHub activity to the repos we care about.
# NOTE: the query below is an expensive scan, so by default the previously
# saved parquet extract is loaded instead. Set REFRESH_EVENTS = True to
# re-run the query and overwrite that extract.
REFRESH_EVENTS = False
EVENTS_PATH = 'data/events.parquet'

q = (f"""
    select
      date_trunc(e.time, MONTH) as event_month,
      from_.artifact_name as git_user,
      to_.artifact_namespace as git_org,
      to_.artifact_name as git_repo,
      e.event_type,
      sum(e.amount) as amount
    from`oso.timeseries_events_by_artifact_v0` as e
    join `oso.artifacts_v1` as from_
      on e.from_artifact_id = from_.artifact_id
    join `oso.artifacts_v1` as to_
      on e.to_artifact_id = to_.artifact_id
    where
      e.time >= '2020-01-01'
      and e.event_source = 'GITHUB'
      and from_.artifact_name not like '%[bot]%'
      and e.to_artifact_id in ({stringify_array(artifact_ids)})
    group by 1,2,3,4,5
""")

if REFRESH_EVENTS:
    # Run the scan and persist the result so later runs can skip it.
    results = client.query(q)
    results.to_dataframe().to_parquet(EVENTS_PATH)

# Always read back from the parquet file so the frame has the same dtypes
# whether or not the query was just re-run.
df_events = pd.read_parquet(EVENTS_PATH)
df_events.tail()

Unnamed: 0,event_month,git_user,git_org,git_repo,event_type,amount
1579550,2021-03-01 00:00:00+00:00,abortrao,filecoin-project,go-amt-ipld,FORKED,1.0
1579551,2023-01-01 00:00:00+00:00,glacierwalrus,filecoin-project,helm-charts,PULL_REQUEST_OPENED,4.0
1579552,2021-03-01 00:00:00+00:00,keepkalm,filecoin-project,filecoin-docs,STARRED,1.0
1579553,2022-10-01 00:00:00+00:00,nonsense,filecoin-project,boost,PULL_REQUEST_CLOSED,8.0
1579554,2022-04-01 00:00:00+00:00,laudiacay,filecoin-project,notary-governance,ISSUE_COMMENT,2.0


In [12]:
# Headline counts: repos queried and distinct GitHub users seen in the events.
n_repos = len(artifact_ids)
n_git_users = df_events['git_user'].nunique()
print("Repos:", n_repos)
print("Git Users:", n_git_users)

# Total event amount per event type.
amount_by_event_type = df_events.groupby('event_type')['amount']
amount_by_event_type.sum()

Repos: 6589
Git Users: 328414


event_type
COMMIT_CODE                     740390.0
FORKED                          224868.0
ISSUE_CLOSED                    185505.0
ISSUE_COMMENT                  1078645.0
ISSUE_OPENED                    296934.0
ISSUE_REOPENED                    7700.0
PULL_REQUEST_CLOSED             445122.0
PULL_REQUEST_MERGED             364273.0
PULL_REQUEST_OPENED             423758.0
PULL_REQUEST_REOPENED             4476.0
PULL_REQUEST_REVIEW_COMMENT     681945.0
RELEASE_PUBLISHED                19600.0
STARRED                         542384.0
Name: amount, dtype: float64

In [15]:
# Number of edges in the weighted graph.
G.number_of_edges()

30373