In [1]:
from google.cloud import bigquery
import os
import pandas as pd

In [2]:
# BigQuery project that hosts the OSO datasets queried below.
GCP_PROJECT = 'opensource-observer'

# Service-account key file, resolved relative to the notebook's working directory.
GCP_CREDENTIALS_PATH = '../../../oso_gcp_credentials.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GCP_CREDENTIALS_PATH

# The client is only needed by cells that (re)run a query against BigQuery.
client = bigquery.Client(project=GCP_PROJECT)

In [3]:
# Every artifact listed by `oso.int_artifacts_in_ossd_by_project`, joined to
# `oso.int_projects` so each row carries the name of its project.
query = """
    select 
      p.project_name,
      a.artifact_source,
      a.artifact_type,
      a.artifact_namespace,
      a.artifact_name,
      a.artifact_url
    from `oso.int_artifacts_in_ossd_by_project` as a
    join `oso.int_projects` as p on a.project_id = p.project_id
    
"""

# The query result is cached as parquet. BigQuery is only hit when the cache
# file is missing, so the notebook re-runs from a fresh kernel either way.
# Delete the file to force a refresh.
artifacts_cache_path = 'data/project_artifacts.parquet'
if not os.path.exists(artifacts_cache_path):
    os.makedirs(os.path.dirname(artifacts_cache_path), exist_ok=True)
    client.query(query).to_dataframe().to_parquet(artifacts_cache_path)

# Always read back from the cache so downstream cells see the same frame
# whether or not the query was just executed.
df_oso_project_artifacts = pd.read_parquet(artifacts_cache_path)
df_oso_project_artifacts.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url
89797,rubicon,ARBITRUM_ONE,CONTRACT,arbitrum_one,0xc715a30fde987637a082cf5f19c74648b67f2db8,0xc715a30fde987637a082cf5f19c74648b67f2db8


In [4]:
# Keep only the repository artifacts. `.copy()` makes the result independent
# of `df_oso_project_artifacts`, so adding a column below does not warn.
df_oso_project_repos = df_oso_project_artifacts[
    df_oso_project_artifacts['artifact_type'] == 'REPOSITORY'
].copy()

# "<namespace>/<name>" key used to join against the repository metadata.
# Vectorised string concatenation replaces the row-wise apply; a missing
# namespace or name yields a missing key instead of raising TypeError.
df_oso_project_repos['name_with_owner'] = (
    df_oso_project_repos['artifact_namespace']
    + "/"
    + df_oso_project_repos['artifact_name']
)
df_oso_project_repos.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url,name_with_owner
74430,builders-garden,GITHUB,REPOSITORY,builders-garden,evm-actions-trade,https://github.com/builders-garden/evm-actions...,builders-garden/evm-actions-trade


In [5]:
# Repository-level metadata from `ossd.repositories`, keyed by the
# lower-cased "owner/repo" string.
query = """
    select distinct
      lower(name_with_owner) as name_with_owner,
      is_fork,
      fork_count,
      star_count,
      created_at,
      updated_at
     from `ossd.repositories`
"""

# The query result is cached as parquet. BigQuery is only hit when the cache
# file is missing; delete the file to force a refresh.
repos_cache_path = 'data/repo_raw.parquet'
if not os.path.exists(repos_cache_path):
    os.makedirs(os.path.dirname(repos_cache_path), exist_ok=True)
    client.query(query).to_dataframe().to_parquet(repos_cache_path)

# Drop rows with any missing value, then keep a single row per repository:
# after sorting by `updated_at`, `keep='last'` retains the most recently
# updated one.
df_oso_repos_raw = (
    pd.read_parquet(repos_cache_path)
    .dropna()
    .sort_values(by='updated_at')
    .drop_duplicates(subset=['name_with_owner'], keep='last')
)

# Inner join: only project repositories that have metadata are kept.
df_oso_repos = df_oso_project_repos.merge(df_oso_repos_raw, on='name_with_owner')
df_oso_repos.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url,name_with_owner,is_fork,fork_count,star_count,created_at,updated_at
54759,builders-garden,GITHUB,REPOSITORY,builders-garden,evm-actions-trade,https://github.com/builders-garden/evm-actions...,builders-garden/evm-actions-trade,False,0,0,2024-07-13 15:46:34+00:00,2024-07-13 23:59:02+00:00


In [6]:
df_projects = pd.read_csv('data/projects_oso_with_git.csv')


def _clean_oso_name(raw_name):
    """Return the lower-cased last path segment of an OSO name without '.yaml'.

    Non-string values (e.g. missing cells) map to None.
    """
    if not isinstance(raw_name, str):
        return None
    without_extension = raw_name.replace('.yaml', '')
    last_segment = without_extension.lower().split('/')[-1]
    return last_segment.strip()


def _has_oso_name(row):
    """True when the row is flagged in `categoryHasOsoName` or has a string `OSOName`."""
    if row['categoryHasOsoName'] == True:
        return True
    return isinstance(row['OSOName'], str)


df_projects['oso_name_clean'] = df_projects['OSOName'].apply(_clean_oso_name)
df_projects['has_oso_name'] = df_projects.apply(_has_oso_name, axis=1)

df_projects.tail(1)

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,oso_name_clean,has_oso_name
152,Filecoin BlockChain explorer,https://github.com/ipfs-force-community/filsca...,ipfs-force-community-filescan,True,ipfs-force-community-filescan,True


In [7]:
def clean_github_url(url):
    """Normalise a GitHub URL to a lower-cased "owner/repo" (or "owner") string.

    Surrounding whitespace and slashes are removed, then a leading scheme
    (``https://`` or ``http://``), an optional ``www.`` and the ``github.com/``
    host are dropped. A leading ``orgs/`` segment and a trailing ``.git``
    suffix are removed as well.

    Only the *leading* ``orgs/`` and the *trailing* ``.git`` are stripped, so
    names that merely contain those substrings (``user/user.github.io``,
    ``my-orgs/repo``) are left intact.

    Args:
        url: The URL as a string. Non-string values raise ``AttributeError``.

    Returns:
        The normalised identifier as a string.
    """
    # Strip whitespace first so a trailing "/ " does not shield the slash.
    cleaned = url.strip().lower().strip('/')

    for scheme in ('https://', 'http://'):
        if cleaned.startswith(scheme):
            cleaned = cleaned[len(scheme):]
            break
    if cleaned.startswith('www.'):
        cleaned = cleaned[len('www.'):]
    if cleaned.startswith('github.com/'):
        cleaned = cleaned[len('github.com/'):]

    # Organisation pages look like github.com/orgs/<owner>.
    if cleaned.startswith('orgs/'):
        cleaned = cleaned[len('orgs/'):]
    # Clone URLs end in ".git"; only that suffix is removed.
    if cleaned.endswith('.git'):
        cleaned = cleaned[:-len('.git')]

    return cleaned.strip('/').strip()
    
def get_owner(url):
    """Return the first path segment (the owner) of the cleaned GitHub URL."""
    owner, _, _ = clean_github_url(url).partition('/')
    return owner

def get_repo_name(url):
    """Return the second path segment (the repository) of the cleaned GitHub URL.

    Returns None when the cleaned URL has no '/', i.e. it names only an owner.
    """
    segments = clean_github_url(url).split('/')
    if len(segments) < 2:
        return None
    return segments[1]

    
# Derive one column per URL helper from the project's GitHub link.
url_derived_columns = {
    'repo_name_with_owner': clean_github_url,
    'repo_owner': get_owner,
    'repo_name': get_repo_name,
}
for column_name, extract in url_derived_columns.items():
    df_projects[column_name] = df_projects['githubProjectLink'].apply(extract)

# A link with no repository segment is treated as an organisation link.
df_projects['is_org'] = df_projects['repo_name'].isna()

df_projects.tail(1)

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,oso_name_clean,has_oso_name,repo_name_with_owner,repo_owner,repo_name,is_org
152,Filecoin BlockChain explorer,https://github.com/ipfs-force-community/filsca...,ipfs-force-community-filescan,True,ipfs-force-community-filescan,True,ipfs-force-community/filscan-backend,ipfs-force-community,filscan-backend,False


In [9]:
# Columns of df_oso_repos carried into the repository-level match.
cols = [
    'name_with_owner',
    'project_name',
    'fork_count',
    'star_count',
    'created_at',
    'updated_at',
]

# Function to merge df_projects with df_oso_repos and add a status column
def merge_with_status(df1, df2, left_on, right_on, status, agg_func=None):
    """Inner-join ``df1`` with ``df2`` and label every joined row with ``status``.

    Args:
        df1: Left frame.
        df2: Right frame.
        left_on: Join key column in ``df1``.
        right_on: Join key column in ``df2``; also the group key when
            ``agg_func`` is given.
        status: Value written to the ``status`` column of the result.
        agg_func: Optional aggregation spec passed to ``groupby(...).agg``.
            When truthy, ``df2`` is first collapsed to one row per ``right_on``.

    Returns:
        The merged frame with an added ``status`` column.
    """
    right_side = df2
    if agg_func:
        grouped = right_side.groupby(right_on)
        right_side = grouped.agg(agg_func).reset_index()

    labelled = df1.merge(right_side, left_on=left_on, right_on=right_on)
    labelled['status'] = status
    return labelled

# 1. Match by GitHub repository: the project's cleaned "owner/repo" equals an
#    OSO repository's name_with_owner.
df_repo_match = merge_with_status(
    df_projects,
    df_oso_repos[cols],
    left_on='repo_name_with_owner',
    right_on='name_with_owner',
    status='1_oso_match_by_github_repo'
).drop(columns=['name_with_owner'])

# 2. Match by GitHub organization: for links without a repository segment,
#    aggregate every OSO repository under the same namespace.
#    Aggregations are given by name; passing the builtins sum/min/max to
#    groupby.agg is deprecated in recent pandas.
org_agg_funcs = {
    'project_name': lambda x: ",".join(x.unique()),
    'fork_count': 'sum',
    'star_count': 'sum',
    'created_at': 'min',
    'updated_at': 'max',
}
df_org_match = merge_with_status(
    df_projects[df_projects['is_org']],
    df_oso_repos,
    left_on='repo_owner',
    right_on='artifact_namespace',
    status='2_oso_match_by_github_org',
    agg_func=org_agg_funcs
)

# 3. Projects with no matches so far (identified by their GitHub link).
matched_links = pd.concat(
    [df_org_match['githubProjectLink'], df_repo_match['githubProjectLink']]
)
df_no_match = df_projects[~df_projects['githubProjectLink'].isin(matched_links)].copy()

# 4. Match by project name: aggregate all OSO repositories of the project
#    whose name equals the cleaned OSO name.
project_name_agg_funcs = {
    'fork_count': 'sum',
    'star_count': 'sum',
    'created_at': 'min',
    'updated_at': 'max',
}
unmatched_projects = df_no_match['oso_name_clean'].dropna().unique()
# Merge from df_no_match, not df_projects: a project already matched in step 1
# or 2 that shares an OSO name with an unmatched one must not be emitted a
# second time with a different status.
df_oso_match = merge_with_status(
    df_no_match[df_no_match['oso_name_clean'].isin(unmatched_projects)],
    df_oso_repos,
    left_on='oso_name_clean',
    right_on='project_name',
    status='3_oso_match_by_project_name',
    agg_func=project_name_agg_funcs
)

# 5. Label whatever is still unmatched. `.copy()` avoids assigning a column on
#    a filtered view of the previous frame.
matched_oso_names = df_oso_match['oso_name_clean'].unique()
df_no_match = df_no_match[~df_no_match['oso_name_clean'].isin(matched_oso_names)].copy()
df_no_match['status'] = df_no_match['has_oso_name'].apply(
    lambda x: '4_could_not_match_to_oso' if x else '5_no_oso_name'
)

# Combine all matches and no matches
df = pd.concat([df_org_match, df_repo_match, df_oso_match, df_no_match], axis=0, ignore_index=True)
df = df.sort_values(by=['status', 'repo_name_with_owner']).reset_index(drop=True)

final_cols = [
    'name', 'githubProjectLink', 'OSOName', 'categoryHasOsoName',
    'repo_name_with_owner', 'repo_owner', 'repo_name', 'is_org', 'project_name',
    'fork_count', 'star_count', 'created_at', 'updated_at', 'status'
]
# `project_name` here is the OSO-side name found by the match.
df = df[final_cols].rename(columns={'project_name': 'verified_oso_name'})
df.to_csv('data/oso_matches.csv', index=False)

df.head()

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,repo_name_with_owner,repo_owner,repo_name,is_org,verified_oso_name,fork_count,star_count,created_at,updated_at,status
0,filexp,https://github.com/aschmahmann/filexp,,True,aschmahmann/filexp,aschmahmann,filexp,False,filexp-aschmahmann,1,11,2023-01-17 06:28:03+00:00,2024-11-14 15:06:07+00:00,1_oso_match_by_github_repo
1,FILPlus Data Calculation,https://github.com/beck-8/filplus,,True,beck-8/filplus,beck-8,filplus,False,filplus-beck-8,1,0,2022-09-09 09:23:58+00:00,2022-09-13 01:51:53+00:00,1_oso_match_by_github_repo
2,Forest,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,158,638,2019-11-11 21:15:40+00:00,2024-11-15 07:16:05+00:00,1_oso_match_by_github_repo
3,Forest Archive,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,158,638,2019-11-11 21:15:40+00:00,2024-11-15 07:16:05+00:00,1_oso_match_by_github_repo
4,CIDgravity Nextcloud App,https://github.com/CIDgravity/nextcloud-cidgra...,,False,cidgravity/nextcloud-cidgravity-gateway-app,cidgravity,nextcloud-cidgravity-gateway-app,False,cidgravity,0,1,2024-06-13 10:51:22+00:00,2024-11-05 08:21:45+00:00,1_oso_match_by_github_repo
