In [1]:
from google.cloud import bigquery
import os
import pandas as pd

In [2]:
# Project passed to the BigQuery client created below.
GCP_PROJECT = 'opensource-observer'

# Tell the Google client libraries which service-account key file to use
# (the path is relative to the notebook's working directory).
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='../../../oso_gcp_credentials.json')

# Shared client for any query issued further down.
client = bigquery.Client(project=GCP_PROJECT)

In [3]:
# Repo metrics joined with the name of the project each row belongs to.
query = """
    select
        p.project_name,
        m.*
    from `oso.int_repo_metrics_by_project` as m
    join `oso.projects_v1` as p
        on m.project_id = p.project_id
"""

# Local snapshot of the query result; delete the file to force a refresh.
repo_metrics_path = 'data/repo_metrics.parquet'

if os.path.exists(repo_metrics_path):
    df_repos = pd.read_parquet(repo_metrics_path)
else:
    # No snapshot yet: run the query once and cache it so later runs
    # (and "Restart & Run All") do not hit BigQuery again.
    df_repos = client.query(query).to_dataframe()
    df_repos.to_parquet(repo_metrics_path)

# "owner/repo" key used to match against the project links loaded later.
# Built column-wise instead of with a row-wise apply(axis=1).
df_repos['artifact_name_owner'] = df_repos['artifact_namespace'] + '/' + df_repos['artifact_name']
df_repos.tail(1)

Unnamed: 0,project_name,project_id,artifact_id,artifact_namespace,artifact_name,artifact_source,is_fork,fork_count,star_count,watcher_count,language,license_spdx_id,first_commit_time,last_commit_time,days_with_commits_count,contributors_to_repo_count,commit_count,artifact_name_owner
53390,privacy-scaling-explorations,die2X_l4rJ6uZTU4L-inhU4RJ1P8vs0bTvw85Bzal_8=,TNCOf8mKsvIlJApGxNaLN1MTFi8871edWlhWp-IRurk=,privacy-scaling-explorations,halo2,GITHUB,True,129,206,206,Rust,NOASSERTION,2021-11-29 09:31:38+00:00,2024-10-24 06:32:33+00:00,89,9,312.0,privacy-scaling-explorations/halo2


In [4]:
df_projects = pd.read_csv('data/projects_oso_with_git.csv')
# Lower-case the links so the keys derived from them are case-insensitive.
df_projects['githubProjectLink'] = df_projects['githubProjectLink'].str.lower()


def _owner_and_repo(link):
    """Split a GitHub URL into ``(owner, repo)``.

    ``repo`` is None when the link points at an org/user rather than a repo.
    Path segments after the repo name (e.g. ``/tree/main``) are ignored.
    """
    path = link.replace('https://github.com/', '').strip('/')
    owner, _, rest = path.partition('/')
    repo = rest.partition('/')[0] if rest else None
    return owner, repo


_owner_repo_pairs = [_owner_and_repo(link) for link in df_projects['githubProjectLink']]

# Rebuild the key from the parsed parts so it is always "owner" or "owner/repo".
# Keeping trailing URL segments here would make the key disagree with
# artifact_name and never match the "owner/repo" keys built for df_repos.
df_projects['artifact_name_owner'] = [
    owner if repo is None else f'{owner}/{repo}' for owner, repo in _owner_repo_pairs
]
df_projects['artifact_namespace'] = [owner for owner, _ in _owner_repo_pairs]
df_projects['artifact_name'] = [repo for _, repo in _owner_repo_pairs]
df_projects.head()

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,artifact_name_owner,artifact_namespace,artifact_name
0,Filecoin Spark,https://github.com/filecoin-station/spark,,False,filecoin-station/spark,filecoin-station,spark
1,The CalibrationNet Stability Project,https://github.com/filecoin-project/curio,,True,filecoin-project/curio,filecoin-project,curio
2,Rust-Libp2p,https://github.com/libp2p/rust-libp2p,libp2p,True,libp2p/rust-libp2p,libp2p,rust-libp2p
3,Open Source Observer,https://github.com/opensource-observer/oso,,False,opensource-observer/oso,opensource-observer,oso
4,DeFinomics Labs,https://github.com/definomics-labs,,False,definomics-labs,definomics-labs,


In [5]:
# Submitted links that point at a specific repo (owner/repo).
repo_level_projects = df_projects[df_projects['artifact_name'].notna()]

# A repo can appear in df_repos once per project it is attached to (hence the
# set of project names collected below). Keep a single row per repo before
# summing so its forks/stars/commits are not counted once per project.
unique_repos = df_repos.drop_duplicates(subset='artifact_id')
repo_totals = (
    unique_repos
    .groupby('artifact_name_owner')[['fork_count', 'star_count', 'commit_count']]
    .sum()
)

# All OSO projects that list the repo (uses every row, not the de-duplicated ones).
repo_project_names = df_repos.groupby('artifact_name_owner')['project_name'].agg(set)

df_projects_repos = (
    repo_level_projects
    .set_index('artifact_name_owner')
    .join(repo_totals)
    .join(repo_project_names)
)
df_projects_repos.to_csv('data/oso_repos.csv')
df_projects_repos.head()

Unnamed: 0_level_0,name,githubProjectLink,OSOName,categoryHasOsoName,artifact_namespace,artifact_name,fork_count,star_count,commit_count,project_name
artifact_name_owner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
starboard-ventures/filecoin-network-health-dashboard,Network Health Dashboard,https://github.com/starboard-ventures/fileco...,,False,starboard-ventures,filecoin-network-health-dashboard,,,,
ancients-research/retropgf,Ancients Research,https://github.com/ancients-research/retropgf,,False,ancients-research,retropgf,,,,
aschmahmann/filexp,filexp,https://github.com/aschmahmann/filexp,,True,aschmahmann,filexp,1.0,11.0,33.0,{filexp-aschmahmann}
asia-spwg/retropgf,Asia Storage Provider Working Group,https://github.com/asia-spwg/retropgf,,False,asia-spwg,retropgf,,,,
auralgenius/agcore,AuralGenius,https://github.com/auralgenius/agcore,,False,auralgenius,agcore,,,,


In [6]:
# Submitted links that point at an org/user rather than a specific repo.
org_level_projects = df_projects[df_projects['artifact_name'].isna()]

# A repo can appear in df_repos once per project it is attached to (hence the
# set of project names collected below). Keep a single row per repo so the
# namespace totals do not count the same repo several times.
unique_repos = df_repos.drop_duplicates(subset='artifact_id')
org_totals = (
    unique_repos
    .groupby('artifact_namespace')[['fork_count', 'star_count', 'commit_count']]
    .sum()
)

# All OSO projects that own at least one repo in the namespace.
org_project_names = df_repos.groupby('artifact_namespace')['project_name'].agg(set)

df_projects_orgs = (
    org_level_projects
    .set_index('artifact_namespace')
    .join(org_totals)
    .join(org_project_names)
)

df_projects_orgs.to_csv('data/oso_orgs.csv')
df_projects_orgs.head()

Unnamed: 0_level_0,name,githubProjectLink,OSOName,categoryHasOsoName,artifact_name_owner,artifact_name,fork_count,star_count,commit_count,project_name
artifact_namespace,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
definomics-labs,DeFinomics Labs,https://github.com/definomics-labs,,False,definomics-labs,,0.0,0.0,25.0,{definomics-labs}
chainupcloud,Chainup Cloud (Filecoin Public RPC) - Updated ...,https://github.com/chainupcloud,chainupcloud,True,chainupcloud,,,,,
filecoin-station,Filecoin Station,https://github.com/filecoin-station,,False,filecoin-station,,32.0,146.0,4628.0,{filecoin-station}
fill-lab,FILLiquid,https://github.com/fill-lab,FILL-Lab,True,fill-lab,,3.0,9.0,124.0,{fill-lab}
storswiftlabs,Data123 (Full Archival History Node Service),https://github.com/storswiftlabs,,True,storswiftlabs,,15.0,37.0,559.0,{storswiftlabs}


In [7]:
# Stack the org-level rows on top of the repo-level rows; each frame's index
# (namespace / owner-repo key) becomes a regular column first.
stacked_frames = [df_projects_orgs.reset_index(), df_projects_repos.reset_index()]
df = pd.concat(stacked_frames)


def _unwrap_singleton(value):
    """Return the only member of a one-element set; pass anything else through."""
    if isinstance(value, set) and len(value) == 1:
        return next(iter(value))
    return value


# Sets with several project names (and missing values) are left as they are.
df['project_name'] = df['project_name'].map(_unwrap_singleton)
df

Unnamed: 0,artifact_namespace,name,githubProjectLink,OSOName,categoryHasOsoName,artifact_name_owner,artifact_name,fork_count,star_count,commit_count,project_name
0,definomics-labs,DeFinomics Labs,https://github.com/definomics-labs,,False,definomics-labs,,0,0,25.0,definomics-labs
1,chainupcloud,Chainup Cloud (Filecoin Public RPC) - Updated ...,https://github.com/chainupcloud,chainupcloud,True,chainupcloud,,,,,
2,filecoin-station,Filecoin Station,https://github.com/filecoin-station,,False,filecoin-station,,32,146,4628.0,filecoin-station
3,fill-lab,FILLiquid,https://github.com/fill-lab,FILL-Lab,True,fill-lab,,3,9,124.0,fill-lab
4,storswiftlabs,Data123 (Full Archival History Node Service),https://github.com/storswiftlabs,,True,storswiftlabs,,15,37,559.0,storswiftlabs
...,...,...,...,...,...,...,...,...,...,...,...
126,yhio,Rbot,https://github.com/yhio/rbot,,True,yhio/rbot,rbot,,,,
127,zenground0,Cross Chain Onramp POC,https://github.com/zenground0/onramp-contracts,,False,zenground0/onramp-contracts,onramp-contracts,,,,
128,zondax,Beryx Filecoin API,https://github.com/zondax/rfpg-beryx-api,,True,zondax/rfpg-beryx-api,rfpg-beryx-api,,,,
129,zondax,Beryx Explorer,https://github.com/zondax/web-beryx-explorer,,True,zondax/web-beryx-explorer,web-beryx-explorer,1,4,5.0,zondax


In [8]:
def match_repo(a, repos=None):
    """Resolve a GitHub ``owner`` or ``owner/repo`` string to an OSO project name.

    Parameters
    ----------
    a : str
        Either ``"owner"`` (org-level link) or ``"owner/repo"``. Only the first
        two ``/``-separated segments are used.
    repos : pandas.DataFrame, optional
        Table with ``artifact_namespace``, ``artifact_name`` and
        ``project_name`` columns. Defaults to the notebook-level ``df_repos``.

    Returns
    -------
    str or None
        * ``None`` if no row has ``artifact_namespace == owner``.
        * The project name if every row for the owner has the same project
          (whether or not the specific repo is present).
        * ``"multiple"`` if the owner spans several projects and no repo was given.
        * Otherwise the project of the first row matching the repo name, or
          ``None`` if the repo is not present.
    """
    if repos is None:
        # Resolved at call time so the notebook-level frame is used by default.
        repos = df_repos

    owner, _, remainder = a.partition('/')
    repo = remainder.split('/')[0] if remainder else None

    owner_rows = repos[repos['artifact_namespace'] == owner]
    if owner_rows.empty:
        return None

    projects = owner_rows['project_name'].unique()
    if len(projects) == 1:
        return projects[0]

    # The owner's repos are spread over several projects from here on.
    if not repo:
        return "multiple"

    repo_rows = owner_rows[owner_rows['artifact_name'] == repo]
    if repo_rows.empty:
        return None
    # If the repo itself is listed under several projects, the first one wins.
    return repo_rows['project_name'].unique()[0]

# OSO project name for every submitted link: None when nothing matches,
# "multiple" when an org-level link spans several OSO projects.
df_projects['oso_name'] = df_projects['artifact_name_owner'].map(match_repo)