In [1]:
from google.cloud import bigquery
import numpy as np
import os
import pandas as pd

In [2]:
# Project that the BigQuery client is created for.
GCP_PROJECT = 'opensource-observer'

# Set GOOGLE_APPLICATION_CREDENTIALS to the key file before the client is
# created. The path is relative, so it depends on the working directory the
# notebook is run from.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../oso_gcp_credentials.json'

# Shared client used by the query cells in this notebook.
client = bigquery.Client(project=GCP_PROJECT)

In [3]:
# Pull the repo metrics rows for every artifact in the selected collection(s).
# Only 'solana-crypto-ecosystems' is active; the two Arbitrum collection names
# are commented out inside the SQL itself.
repos_sql = """
    select metrics.*
    from `oso.artifacts_by_collection_v1` as abc
    join `oso.int_repo_metrics_by_project` as metrics
        on abc.artifact_id = metrics.artifact_id
    where
        abc.collection_name in (
          --'arb-onchain',
          --'arbitrum-crypto-ecosystems',
          'solana-crypto-ecosystems'
        )
"""
results = client.query(repos_sql)
df_repos = results.to_dataframe()

# Save a copy indexed by artifact_id; df_repos itself keeps its default index.
repos_by_artifact = df_repos.set_index('artifact_id')
repos_by_artifact.to_csv('data/repos.csv')

# Preview the last rows as the cell output.
df_repos.tail()

Unnamed: 0,project_id,artifact_id,artifact_namespace,artifact_name,artifact_source,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,created_at,updated_at,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count
8697,yrf3GBBIkt_ha517ZTl7I-XOHtMXM_b3o9qwHhLXwE8=,oS0wrapkel65M_DH7Sj5uN6RNqK5oCJw6EQllu2o9Ss=,quiknode-labs,marketplace-starter-ts,GITHUB,False,0,0,0,TypeScript,,2023-12-08 20:42:42+00:00,2023-12-21 20:17:18+00:00,NaT,NaT,,,
8698,yrf3GBBIkt_ha517ZTl7I-XOHtMXM_b3o9qwHhLXwE8=,M6Y1LhThI1iYqBwQfRQNdUkva8ugbH3s9_IleMhv-jY=,quiknode-labs,moddapptut,GITHUB,True,1,4,4,,,2021-12-21 20:25:25+00:00,2023-06-19 10:08:10+00:00,NaT,NaT,,,
8699,yrf3GBBIkt_ha517ZTl7I-XOHtMXM_b3o9qwHhLXwE8=,g7M08WOOGSTsHrIaxD16RV7g_vRAcxWuRPpHJ-Aj4Qg=,quiknode-labs,urql,GITHUB,True,0,0,0,TypeScript,MIT,2023-10-15 22:27:35+00:00,2023-10-15 22:30:17+00:00,NaT,NaT,,,
8700,yrf3GBBIkt_ha517ZTl7I-XOHtMXM_b3o9qwHhLXwE8=,c9Wslw-BF6fqXZKz_TwJ_1NOE1qloNkHizFE8qD9_yQ=,quiknode-labs,eth-wss-block-time-logger,GITHUB,False,0,0,0,JavaScript,,2024-09-16 14:55:39+00:00,2024-10-02 19:46:23+00:00,NaT,NaT,,,
8701,yrf3GBBIkt_ha517ZTl7I-XOHtMXM_b3o9qwHhLXwE8=,azx2wbWAm7CcUQ4PDoSp2rpZ8o3po36oRNy-fGCdzqc=,quiknode-labs,quickstreams-sample-feed,GITHUB,False,0,0,0,JavaScript,,2024-01-29 22:19:44+00:00,2024-01-29 23:03:13+00:00,NaT,NaT,,,


In [4]:
# Artifact ids of the fetched repos, as a plain Python list.
repo_list = df_repos['artifact_id'].tolist()

# The same ids rendered as one string of single-quoted, comma-separated values
# ('id1','id2',...). An empty list still yields a pair of quotes.
repo_str = "'{}'".format("','".join(repo_list))

# Cell output: how many ids were collected.
len(repo_list)

8702

In [5]:
# Monthly commit counts per (git user, repo) since 2023-01-01 for repos in the
# listed collections, excluding users whose name contains '[bot]'.
# Heads-up: this is an expensive scan!
# NOTE(review): this query filters on three collections, while the repo
# metrics query earlier in the notebook has the two Arbitrum collections
# commented out -- confirm the mismatch is intended.
events_sql = """    

    with repos as (
      select distinct artifact_id
      from `oso.artifacts_by_collection_v1`
      where
        collection_name in (
          'arb-onchain',
          'arbitrum-crypto-ecosystems',
          'solana-crypto-ecosystems'
        )
    )
    select
      date_trunc(e.time, MONTH) as bucket_month,
      from_.artifact_name as git_user,
      e.to_artifact_id,
      count(*) as num_commits
    from`oso.timeseries_events_by_artifact_v0` as e
    join `oso.artifacts_v1` as from_
      on e.from_artifact_id = from_.artifact_id
    where
      e.time >= '2023-01-01'
      and e.event_type = 'COMMIT_CODE'
      and from_.artifact_name not like '%[bot]%'
      and e.to_artifact_id in (select artifact_id from repos)
    group by 1,2,3
"""
results = client.query(events_sql)
df_events = results.to_dataframe()

# Save the raw aggregation (default index included), then preview the tail.
df_events.to_csv('data/events.csv')
df_events.tail()

Unnamed: 0,bucket_month,git_user,to_artifact_id,num_commits
56115,2024-10-01 00:00:00+00:00,0xm00k,TQHcGD6dKEEbLTyJq1VmYUEQIZBedBcl96gGxW8Ufrc=,65
56116,2024-10-01 00:00:00+00:00,artyukh,KTF3n7nv_02wb9zfSJ8cKAiVP5yfdc9udBgs2GRD9Zc=,1
56117,2024-10-01 00:00:00+00:00,ryley-o,T4UrokghmyydtfTRyALxNZiOAVCyBhoEbbo8642Ca08=,21
56118,2024-10-01 00:00:00+00:00,sterlu,1tS3i3t_mEOhQc6r0cDqciPskSbQPBvdqgX67R2bzMU=,9
56119,2024-10-01 00:00:00+00:00,chrisamora,vG6utouJzDk_31g6Cd6YxgfbCnY9jBd9wnrhlk_dAQA=,4


In [6]:
# Count repos per value of the `language` column and show the 20 most frequent.
language_counts = df_repos['language'].value_counts()
language_counts.head(20)

language
TypeScript    2416
              2054
Rust          1170
JavaScript     931
Go             351
Python         277
Solidity       203
HTML           149
Shell          125
Swift          105
C#              91
Kotlin          82
C               72
D               71
Vue             62
Erlang          55
CSS             48
Java            44
C++             36
Ruby            36
Name: count, dtype: int64

In [18]:
# simple repo weighting function

def weight_repos(metric, language, preferred_language='Rust',
                 preferred_weight=1.0, default_weight=0.1):
    """Return a dampened, language-adjusted weight for a single repo.

    The weight is the square root of ``metric`` multiplied by a language
    factor: ``preferred_weight`` when ``language`` equals
    ``preferred_language``, ``default_weight`` for anything else (including
    a missing or empty language).

    Args:
        metric: Numeric size/popularity value for the repo; the notebook
            passes the repo's ``star_count``. Expected to be non-negative.
        language: Primary language label of the repo.
        preferred_language: Language that receives ``preferred_weight``.
            Defaults to ``'Rust'``, matching the previous hard-coded value.
        preferred_weight: Multiplier for repos in the preferred language.
        default_weight: Multiplier for every other repo.

    Returns:
        ``metric ** 0.5`` scaled by the applicable language multiplier.
    """
    # Exact equality on purpose: only an exact match of the label counts.
    if language == preferred_language:
        language_factor = preferred_weight
    else:
        language_factor = default_weight

    # Square root dampens the metric so very large repos do not dominate.
    return (metric ** .5) * language_factor

# Score each repo row from its star count and language.
df_repos['v'] = df_repos.apply(
    lambda repo: weight_repos(repo['star_count'], repo['language']),
    axis=1,
)

# Lookup table: artifact_id -> repo score.
repo_scores = df_repos.set_index('artifact_id')['v']
repo_mapping = repo_scores.to_dict()

# Attach the repo score to every event row via its target repo id. Ids that
# are not keys of repo_mapping come back as missing values from .map().
df_events['v_repo'] = df_events['to_artifact_id'].map(repo_mapping)

# Event score = sqrt(monthly commit count) * repo score.
commit_factor = np.sqrt(df_events['num_commits'])
df_events['v'] = commit_factor * df_events['v_repo']

In [22]:
# Sum the event scores per git user, highest total first.
user_totals = (
    df_events
    .groupby(['git_user'])['v']
    .sum()
    .sort_values(ascending=False)
)

# Keep only users whose total score is above 1.
user_totals = user_totals[user_totals > 1].reset_index()

# Users sharing exactly the same total are collapsed into a single row, with
# their names joined by commas.
names_by_score = (
    user_totals
    .groupby(['v'])['git_user']
    .agg(lambda names: ','.join(names.unique()))
)

# Back to a flat frame, ordered from highest to lowest score.
top_devs = (
    names_by_score
    .reset_index()
    .sort_values(by='v', ascending=False)
    .reset_index(drop=True)
)

top_devs.to_csv('data/top_devs.csv')
top_devs.head(40)

Unnamed: 0,v,git_user
0,767.74398,"asd-and-rizzo,dzmitry-lahoda"
1,550.922555,guibescos
2,441.165077,evan-gray
3,436.35992,jafaraz
4,356.914013,mina86
5,285.146207,ali-bahjati
6,284.658828,bruce-riley
7,283.894241,kkast
8,266.930937,joncinque
9,256.706008,mohammadpch
