# Simulation of RF5 GitHub Metrics

Ideas for metrics to simulate for each project:
1. stars_total
2. stars_from_top_devs
3. forks_total
4. forks_from_top_devs
5. age_of_project
6. avg_fulltime_developers
7. unique_contributors_last_6_months

In [1]:
from google.cloud import bigquery
import os
import pandas as pd

# Point the Google client libraries at the local credentials file, then open
# the BigQuery client that the query cells below use.
GCP_CREDENTIALS_PATH = '../../../gcp_credentials.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH
client = bigquery.Client()

## Queries to OSO

In [2]:
# Select the projects that are in the RF3 collection ('op-rpgf3') but absent
# from the RF4 collection ('op-retrofunding-4').

projects_query = """
    select
      project_id,
      project_name
    from `opensource-observer.oso.projects_by_collection_v1`
    where
        collection_name = 'op-rpgf3'
        and project_id not in (
            select project_id
            from `opensource-observer.oso.projects_by_collection_v1`
            where collection_name = 'op-retrofunding-4'
        )
"""
projects_query_results = client.query(projects_query)

# Build a {project_id: project_name} lookup; the cell displays how many
# projects it contains.
df_projects = projects_query_results.to_dataframe()
project_names_by_id = df_projects.set_index('project_id')['project_name']
project_ids_names = project_names_by_id.to_dict()
len(project_ids_names)

264

In [3]:
# warning: this query is expensive! Its result is cached as a parquet file and
# BigQuery is only hit when that file is missing. Delete the file to force a
# refresh.

EVENTS_CACHE_PATH = "data/rf5_events.parquet"

# GitHub events for the selected projects, with the names of the originating
# user and the target repo joined in from artifacts_v1.
events_query = f"""
    select
      events.project_id,
      events.bucket_day,      
      events.from_artifact_id as user_id,
      users.artifact_name as user,
      events.to_artifact_id as repo_id,
      repos.artifact_namespace as repo_owner,
      repos.artifact_name as repo_name,
      events.event_type,      
      events.amount
    from `opensource-observer.oso.int_events_daily_to_project` as events
    join `opensource-observer.oso.artifacts_v1` as users
      on events.from_artifact_id = users.artifact_id
    join `opensource-observer.oso.artifacts_v1` as repos
      on events.to_artifact_id = repos.artifact_id
    where
        events.event_source = 'GITHUB'
        and events.project_id in (
            select project_id
            from ({projects_query})
        )
"""

if os.path.exists(EVENTS_CACHE_PATH):
    df_events = pd.read_parquet(EVENTS_CACHE_PATH)
else:
    # Cache miss: run the query once and persist the raw result (before the
    # project_name column is added) so later runs can skip BigQuery.
    events_query_results = client.query(events_query)
    df_events = events_query_results.to_dataframe()
    os.makedirs(os.path.dirname(EVENTS_CACHE_PATH), exist_ok=True)
    df_events.to_parquet(EVENTS_CACHE_PATH)

# Attach the human-readable project name; ids missing from the lookup become NaN.
df_events['project_name'] = df_events['project_id'].map(project_ids_names)
df_events.tail(1)

Unnamed: 0,project_id,bucket_day,user_id,user,repo_id,repo_owner,repo_name,event_type,amount,project_name
2186651,y9t7a2RCN_Cxpi-g-8Qmb5txYxaC0nARzZwmwSWinuI=,2022-03-09 00:00:00+00:00,ZQ53uNOIn9siBHNyFN836ij1XJf3R5I045bQGbTpZ_w=,emazurek,aQosZmfg_aQpa6Hpb5kbbERkVVpkIsfmCqvzz-rsgvE=,foundry-rs,book,PULL_REQUEST_OPENED,1.0,foundry


In [4]:
# Fetch the code metrics table rows for the same set of projects selected by
# projects_query.

metrics_query = f"""
    select *
    from `opensource-observer.oso.code_metrics_by_project_v1`
    where project_id in (
        select project_id
        from ({projects_query})
    )
"""

# Run the query, load every column into a DataFrame, and show the last row
# as a quick sanity check.
metrics_query_results = client.query(metrics_query)
df_metrics = metrics_query_results.to_dataframe()
df_metrics.iloc[-1:]

Unnamed: 0,project_id,project_source,project_namespace,project_name,display_name,event_source,repository_count,first_commit_date,last_commit_date,star_count,...,contributor_count,contributor_count_6_months,new_contributor_count_6_months,fulltime_developer_average_6_months,active_developer_count_6_months,commit_count_6_months,opened_pull_request_count_6_months,merged_pull_request_count_6_months,opened_issue_count_6_months,closed_issue_count_6_months
249,UNKoHLpnT_OziRiPCuX21uhgLNIJygsxuZz3A0TL-cg=,OSS_DIRECTORY,oso,protocol-guild,Protocol Guild,GITHUB,304,2015-01-01 18:35:06+00:00,2024-08-04 04:16:12+00:00,175570,...,17429.0,1538.0,901.0,14.379121,106.0,11757.0,5574.0,4315.0,1688.0,1711.0


## Derive some metrics

In [5]:
# Take the first N entries of the openrank user list as the "top" users.
# NOTE(review): this relies on users.csv already being ordered best-first — confirm.

users = pd.read_csv('data/openrank/users.csv')
N = 420
top_users = users['peer'].head(N).tolist()

In [6]:
# For each project, find the best (numerically lowest) openrank rank among
# its repos.

# Rank repos on the 'a=0.5' column so that the highest value gets rank 1,
# and expose the result as a {peer: rank} lookup.
repos = pd.read_csv('data/openrank/repos.csv')
repos['rank'] = repos['a=0.5'].rank(ascending=False)
repo_rank = repos.set_index('peer')['rank'].to_dict()

# One row per distinct (project, repo owner, repo name). The derived columns
# are added with `.assign`, which works on a fresh frame and so avoids
# chained-assignment warnings; both are computed column-wise rather than with
# a Python-level call per row.
# NOTE(review): the lookup assumes 'peer' values have the form "owner/name" — confirm.
df_repos = (
    df_events[['project_name', 'repo_owner', 'repo_name']]
    .drop_duplicates()
    .assign(
        artifact=lambda d: d['repo_owner'] + '/' + d['repo_name'],
        # Repos missing from the openrank file get NaN and are skipped by min().
        repo_rank=lambda d: d['artifact'].map(repo_rank),
    )
)

top_repo_rank = df_repos.groupby('project_name')['repo_rank'].min().sort_values()
top_repo_rank.head()

project_name
testinprod-io        2.0
reth-paradigmxyz    11.0
foundry             18.0
go-ethereum         23.0
protocol-guild      23.0
Name: repo_rank, dtype: float64

In [7]:
# derive the other metrics from the OSO event data

def metric_factory(metric, user_filter=None, events=None):
    """Sum the `amount` of one event type per project.

    Parameters
    ----------
    metric : str
        Value of the `event_type` column to keep (e.g. 'STARRED'). Its
        lower-cased form becomes the name of the returned Series.
    user_filter : list-like of str, optional
        When given, only events whose `user` is in this collection are
        counted and '_by_top_devs' is appended to the Series name. An empty
        collection is a real filter (it matches nobody); only `None` means
        "do not filter".
    events : pandas.DataFrame, optional
        Events table with `event_type`, `user`, `project_name` and `amount`
        columns. Defaults to the notebook-level `df_events`.

    Returns
    -------
    pandas.Series
        Total `amount` indexed by `project_name`, sorted in descending order.
        Projects with no matching events are absent from the index.
    """
    if events is None:
        events = df_events

    metric_name = metric.lower()
    dff = events[events['event_type'] == metric]

    # Compare against None rather than relying on truthiness: an empty filter
    # must yield an empty result, and a Series or array has no unambiguous
    # truth value.
    if user_filter is not None:
        metric_name += '_by_top_devs'
        dff = dff[dff['user'].isin(user_filter)]

    series = (
        dff
        .groupby('project_name')['amount']
        .sum()
        .sort_values(ascending=False)
    )
    series.name = metric_name
    return series

# Preview: the five projects with the largest STARRED totals from top_users.
metric_factory(metric='STARRED', user_filter=top_users).head(5)

project_name
protocol-guild    1332.0
foundry            288.0
openzeppelin       251.0
wevm               160.0
ipfs               156.0
Name: starred_by_top_devs, dtype: float64

In [8]:
# Project age in fractional years at month resolution: the gap between the
# reference month and the month of the project's earliest event.

TODAY = 2024. + 8/12  # reference month (August 2024) as a fractional year


def _fractional_year(ts):
    """Express a timestamp as year + month/12; the day of month is ignored."""
    return ts.year + ts.month/12


first_event_day = df_events.groupby('project_name')['bucket_day'].min()
age_of_project = (
    first_event_day
    .apply(lambda ts: TODAY - _fractional_year(ts))
    .rename('age_of_project_years')
)

## Consolidate and export the data

In [9]:
# Metrics derived in this notebook, each a Series indexed by project_name.
# Order: repo rank, STARRED (all users, top users), FORKED (all users,
# top users), project age.
metrics = [
    top_repo_rank,
    *(
        metric_factory(event_type, user_filter=devs)
        for event_type in ('STARRED', 'FORKED')
        for devs in (None, top_users)
    ),
    age_of_project,
]

# Columns taken as-is from the OSO code metrics table.
# NOTE(review): the ideas list mentions unique contributors over the last 6
# months, while the column used here is new_contributor_count_6_months —
# confirm this is the intended one.
oso_columns = [
    'display_name',
    'repository_count',
    'star_count',
    'fork_count',
    'fulltime_developer_average_6_months',
    'new_contributor_count_6_months',
]

# Left-join the derived metrics onto the OSO metrics by project_name, so
# projects with no derived value keep their row and show NaN.
derived_metrics = pd.concat(metrics, axis=1)
df = df_metrics.set_index('project_name')[oso_columns].join(derived_metrics)
df

Unnamed: 0_level_0,display_name,repository_count,star_count,fork_count,fulltime_developer_average_6_months,new_contributor_count_6_months,repo_rank,starred,starred_by_top_devs,forked,forked_by_top_devs,age_of_project_years
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
hypeshothq,HypeshotHQ,1,1,0,,,,,,,,
ethereumbarcelona,ETHBarcelona,2,0,0,,,,,,,,
project-squallet-refractor-labs,Project Squallet from Lore,1,2,0,,,20253.0,3.0,,,,0.750000
nacion-bankless,Nación Bankless,2,1,0,,,,1.0,,,,1.000000
cannon-rs-badboilabs,Cannon-rs,1,44,6,0.000000,0.0,136.0,51.0,15.0,8.0,1.0,1.833333
...,...,...,...,...,...,...,...,...,...,...,...,...
justcause-smeee23,JustCause,2,2,1,0.000000,0.0,,2.0,,,,2.916667
filosofiacodigo,FilosofiaCodigo,36,61,51,0.000000,0.0,6786.0,63.0,,63.0,,3.250000
revoke-cash,Revoke,8,757,267,0.862637,29.0,963.0,883.0,13.0,315.0,6.0,4.833333
nethermindeth,Nethermind,258,3844,1379,3.642857,200.0,79.0,4677.0,116.0,1624.0,39.0,6.583333


In [10]:
# Write the consolidated table to disk; the project_name index is written as
# the first CSV column.
output_path = "data/rf5_metrics_simulation.csv"
df.to_csv(output_path)