In [1]:
from google.cloud import bigquery
import os
import openpyxl
import pandas as pd

In [2]:
GCP_PROJECT = 'opensource-observer'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../oso_gcp_credentials.json'
client = bigquery.Client(GCP_PROJECT)

In [3]:
query = """
    select 
      p.project_name,
      a.artifact_source,
      a.artifact_type,
      a.artifact_namespace,
      a.artifact_name,
      a.artifact_url
    from `oso.int_artifacts_in_ossd_by_project` as a
    join `oso.int_projects` as p on a.project_id = p.project_id
    
"""

# result = client.query(query)
# df_oso_project_artifacts = result.to_dataframe()
# df_oso_project_artifacts.to_parquet('data/project_artifacts.parquet')

df_oso_project_artifacts = pd.read_parquet('data/project_artifacts.parquet')
df_oso_project_artifacts.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url
117481,rubicon,ARBITRUM_ONE,CONTRACT,arbitrum_one,0xc715a30fde987637a082cf5f19c74648b67f2db8,0xc715a30fde987637a082cf5f19c74648b67f2db8


In [4]:
df_oso_project_repos = df_oso_project_artifacts[df_oso_project_artifacts['artifact_type']=='REPOSITORY'].copy()
df_oso_project_repos['name_with_owner'] = df_oso_project_repos.apply(
    lambda x: "/".join([x['artifact_namespace'], x['artifact_name']]), axis=1
)
df_oso_project_repos.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url,name_with_owner
102114,0xparc,GITHUB,REPOSITORY,0xparc,frog-zone,https://github.com/0xparc/frog-zone,0xparc/frog-zone


In [5]:
query = """
    select distinct
      lower(name_with_owner) as name_with_owner,
      is_fork,
      fork_count,
      star_count,
      created_at,
      updated_at
     from `ossd.repositories`
"""

# result = client.query(query)
# df_oso_repos_raw = result.to_dataframe()
# df_oso_repos_raw.to_parquet('data/repo_raw.parquet')

df_oso_repos_raw = (
    pd.read_parquet('data/repo_raw.parquet')
    .dropna()
    .sort_values(by='updated_at')
    .drop_duplicates(subset=['name_with_owner'], keep='last')
)

df_oso_repos = df_oso_project_repos.merge(df_oso_repos_raw, on='name_with_owner')
df_oso_repos.tail(1)

Unnamed: 0,project_name,artifact_source,artifact_type,artifact_namespace,artifact_name,artifact_url,name_with_owner,is_fork,fork_count,star_count,created_at,updated_at
82445,0xparc,GITHUB,REPOSITORY,0xparc,frog-zone,https://github.com/0xparc/frog-zone,0xparc/frog-zone,False,2,5,2024-10-24 13:30:54+00:00,2024-11-19 15:42:19+00:00


In [6]:
df_projects = pd.read_csv('data/projects_oso_with_git.csv')

df_projects['oso_name_clean'] = df_projects['OSOName'].apply(
    lambda x: x.replace('.yaml','').lower().split('/')[-1].strip() if isinstance(x, str) else None
)

df_projects['has_oso_name'] = df_projects.apply(
    lambda x: x['categoryHasOsoName'] == True or isinstance(x['OSOName'], str),
    axis=1
)

df_projects.tail(1)

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,oso_name_clean,has_oso_name
152,Filecoin BlockChain explorer,https://github.com/ipfs-force-community/filsca...,ipfs-force-community-filescan,True,ipfs-force-community-filescan,True


In [7]:
def clean_github_url(url):
    url = url.lower().strip('/')
    url = url.replace('https://github.com/', '').replace('.git', '').replace('orgs/','')
    url = url.strip()
    return url
    
def get_owner(url):
    name_with_owner = clean_github_url(url)
    return name_with_owner.split('/')[0]

def get_repo_name(url):
    name_with_owner = clean_github_url(url)
    if '/' in name_with_owner:
        return name_with_owner.split('/')[1]

    
df_projects['repo_name_with_owner'] = df_projects['githubProjectLink'].apply(clean_github_url)
df_projects['repo_owner'] = df_projects['githubProjectLink'].apply(get_owner)
df_projects['repo_name'] = df_projects['githubProjectLink'].apply(get_repo_name)
df_projects['is_org'] = df_projects['repo_name'].isna() == True

df_projects.tail(1)

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,oso_name_clean,has_oso_name,repo_name_with_owner,repo_owner,repo_name,is_org
152,Filecoin BlockChain explorer,https://github.com/ipfs-force-community/filsca...,ipfs-force-community-filescan,True,ipfs-force-community-filescan,True,ipfs-force-community/filscan-backend,ipfs-force-community,filscan-backend,False


In [8]:
# Define columns to select from df_oso_repos
cols = ['name_with_owner', 'project_name', 'fork_count', 'star_count', 'created_at', 'updated_at']

# Function to merge df_projects with df_oso_repos and add a status column
def merge_with_status(df1, df2, left_on, right_on, status, agg_func=None):
    if agg_func:
        df2 = df2.groupby(right_on).agg(agg_func).reset_index()
    merged_df = df1.merge(df2, left_on=left_on, right_on=right_on)
    merged_df['status'] = status
    return merged_df

# 1. Match by GitHub repository
df_repo_match = merge_with_status(
    df_projects, 
    df_oso_repos[cols], 
    left_on='repo_name_with_owner', 
    right_on='name_with_owner', 
    status='1_oso_match_by_github_repo'
).drop(columns=['name_with_owner'])

# 2. Match by GitHub organization
org_agg_funcs = {
    'project_name': lambda x: ",".join(x.unique()),
    'fork_count': 'sum',
    'star_count': 'sum',
    'created_at': 'min',
    'updated_at': 'max',
}
df_org_match = merge_with_status(
    df_projects[df_projects['is_org']], 
    df_oso_repos, 
    left_on='repo_owner', 
    right_on='artifact_namespace', 
    status='2_oso_match_by_github_org', 
    agg_func=org_agg_funcs
)

# 3. Projects with no matches so far
df_no_match = df_projects[~df_projects['githubProjectLink'].isin(
    pd.concat([df_org_match['githubProjectLink'], df_repo_match['githubProjectLink']])
)].copy()

# 4. Match by project name
project_name_agg_funcs = {
    'fork_count': 'sum',
    'star_count': 'sum',
    'created_at': 'min',
    'updated_at': 'max',
}
unmatched_projects = df_no_match['oso_name_clean'].dropna().unique()
df_oso_match = merge_with_status(
    df_projects[df_projects['oso_name_clean'].isin(unmatched_projects)], 
    df_oso_repos, 
    left_on='oso_name_clean', 
    right_on='project_name', 
    status='3_oso_match_by_project_name', 
    agg_func=project_name_agg_funcs
)

# 5. Update remaining unmatched projects
df_no_match = df_no_match[~df_no_match['oso_name_clean'].isin(df_oso_match['oso_name_clean'].unique())]
df_no_match['status'] = df_no_match['has_oso_name'].apply(
    lambda x: '4_could_not_match_to_oso' if x else '5_no_oso_name'
)

# Combine all matches and no matches
df = pd.concat([df_org_match, df_repo_match, df_oso_match, df_no_match], axis=0, ignore_index=True)
df = df.sort_values(by=['status', 'repo_name_with_owner']).reset_index(drop=True)

final_cols = [
    'name', 'githubProjectLink', 'OSOName', 'categoryHasOsoName',
    'repo_name_with_owner', 'repo_owner', 'repo_name', 'is_org', 'project_name',
    'fork_count', 'star_count', 'created_at', 'updated_at', 'status'
]
df = df[final_cols]
df.rename(columns={'project_name': 'verified_oso_name'}, inplace=True)
df.to_csv('data/oso_matches.csv', index=False)

df.head()

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,repo_name_with_owner,repo_owner,repo_name,is_org,verified_oso_name,fork_count,star_count,created_at,updated_at,status
0,filexp,https://github.com/aschmahmann/filexp,,True,aschmahmann/filexp,aschmahmann,filexp,False,filexp-aschmahmann,1,11,2023-01-17 06:28:03+00:00,2024-11-14 15:06:07+00:00,1_oso_match_by_github_repo
1,FILPlus Data Calculation,https://github.com/beck-8/filplus,,True,beck-8/filplus,beck-8,filplus,False,filplus-beck-8,1,0,2022-09-09 09:23:58+00:00,2022-09-13 01:51:53+00:00,1_oso_match_by_github_repo
2,Forest,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,160,638,2019-11-11 21:15:40+00:00,2024-11-19 12:51:31+00:00,1_oso_match_by_github_repo
3,Forest Archive,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,160,638,2019-11-11 21:15:40+00:00,2024-11-19 12:51:31+00:00,1_oso_match_by_github_repo
4,CIDgravity Nextcloud App,https://github.com/CIDgravity/nextcloud-cidgra...,,False,cidgravity/nextcloud-cidgravity-gateway-app,cidgravity,nextcloud-cidgravity-gateway-app,False,cidgravity,0,1,2024-06-13 10:51:22+00:00,2024-11-05 08:21:45+00:00,1_oso_match_by_github_repo


In [9]:
owners = "'" + "','".join(df['repo_owner'].unique()) + "'"
query = f"""
    select
      format_timestamp('%Y-%m-%d', timestamp_trunc(e.time, DAY)) AS date,
      e.from_artifact_name,
      e.to_artifact_namespace,
      e.to_artifact_name,
      p.project_name,
      e.event_type,
      sum(e.amount) as amount
    from `oso.int_events__github` as e
    join `oso.artifacts_by_project_v1` as p
      on e.to_artifact_id = p.artifact_id
    where
      e.to_artifact_namespace in ({owners}) 
      and e.from_artifact_name not like '%[bot]%'
    group by 1,2,3,4,5,6
"""
# result = client.query(query)
# df_events = result.to_dataframe()
# df_events['repo_name_with_owner'] = df_events.apply(lambda x: x['to_artifact_namespace'] + '/' + x['to_artifact_name'], axis=1)
#df_events.to_parquet('data/events.parquet')

df_events = pd.read_parquet('data/events.parquet')
df_events.tail(1)

Unnamed: 0,date,from_artifact_name,to_artifact_namespace,to_artifact_name,project_name,event_type,amount,repo_name_with_owner
677539,2022-10-27,fatman13,ipfs-force-community,dev-guidances,ipfs-force-community,PULL_REQUEST_MERGED,1.0,ipfs-force-community/dev-guidances


In [10]:
def compute_metrics(df_events, index_col):
    # Ensure 'date' column is datetime type
    df_events['date'] = pd.to_datetime(df_events['date'])
    
    # Get maximum date in the dataset
    max_date = df_events['date'].max()
    
    # Compute date 6 months before the maximum date
    six_months_ago = max_date - pd.DateOffset(months=6)
    
    # Filter events from the last 6 months
    df_last_6_months = df_events[df_events['date'] >= six_months_ago]
    
    # Define contributor event types
    contributor_event_types = [
        'COMMIT_CODE',
        'ISSUE_COMMENT',
        'ISSUE_OPENED',
        'PULL_REQUEST_MERGED',
        'PULL_REQUEST_OPENED',
        'PULL_REQUEST_REVIEW_COMMENT',
        'PULL_REQUEST_CLOSED',
        'ISSUE_CLOSED',
        'ISSUE_REOPENED',
        'PULL_REQUEST_REOPENED'
    ]
    
    # 1. Forks in the Last 6 Months
    forks_6m = df_last_6_months[df_last_6_months['event_type'] == 'FORKED'].groupby(index_col).size()
    
    # 2. Stars in the Last 6 Months
    stars_6m = df_last_6_months[df_last_6_months['event_type'] == 'STARRED'].groupby(index_col).size()
    
    # 3. Contributor Count (6 Months)
    contributors_6m = df_last_6_months[
        df_last_6_months['event_type'].isin(contributor_event_types)
    ].groupby(index_col)['from_artifact_name'].nunique()
    
    # 4. Contributor Count (All Time)
    contributors_all_time = df_events[
        df_events['event_type'].isin(contributor_event_types)
    ].groupby(index_col)['from_artifact_name'].nunique()
    
    # 5. New Contributor Count (6 Months)
    # Contributors in the last 6 months
    contributors_6m_set = df_last_6_months[
        df_last_6_months['event_type'].isin(contributor_event_types)
    ].groupby(index_col)['from_artifact_name'].apply(set)
    
    # Contributors before the last 6 months
    df_before_6m = df_events[df_events['date'] < six_months_ago]
    contributors_before_6m_set = df_before_6m[
        df_before_6m['event_type'].isin(contributor_event_types)
    ].groupby(index_col)['from_artifact_name'].apply(set)
    
    # List of all unique index values (e.g., repos or projects)
    index_values = df_events[index_col].unique()
    
    # Compute new contributors per index value
    new_contributors = {}
    for idx in index_values:
        contributors_last_6m = contributors_6m_set.get(idx, set())
        contributors_before = contributors_before_6m_set.get(idx, set())
        new_contributors[idx] = contributors_last_6m - contributors_before
    
    # Convert to Series
    new_contributors_count_series = pd.Series({idx: len(users) for idx, users in new_contributors.items()})
    
    # 6. Commit Count (6 Months)
    commit_count_6m = df_last_6_months[
        df_last_6_months['event_type'] == 'COMMIT_CODE'
    ].groupby(index_col)['amount'].sum()
    
    # 7. Active Developer Count (6 Months)
    active_developers_6m = df_last_6_months[
        df_last_6_months['event_type'] == 'COMMIT_CODE'
    ].groupby(index_col)['from_artifact_name'].nunique()
    
    # Combine all metrics into a single DataFrame
    metrics_df = pd.DataFrame(index=index_values)
    metrics_df.index.name = index_col
    metrics_df['Forks in the Last 6 Months'] = forks_6m.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['Stars in the Last 6 Months'] = stars_6m.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['Contributor Count (6 Months)'] = contributors_6m.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['Contributor Count'] = contributors_all_time.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['New Contributor Count (6 Months)'] = new_contributors_count_series.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['Commit Count (6 Months)'] = commit_count_6m.reindex(metrics_df.index).fillna(0).astype(int)
    metrics_df['Active Developer Count (6 Months)'] = active_developers_6m.reindex(metrics_df.index).fillna(0).astype(int)
    
    return metrics_df

In [11]:
repo_level_metrics = compute_metrics(df_events, 'repo_name_with_owner')
project_level_metrics = compute_metrics(df_events, 'project_name')

In [12]:
# Step 1: Compute Metrics

# Compute metrics indexed by 'repo_name_with_owner'
metrics_by_repo = compute_metrics(df_events, 'repo_name_with_owner')

# Compute metrics indexed by 'project_name'
metrics_by_project = compute_metrics(df_events, 'project_name')

# Step 2: Filter Metadata DataFrame 'df'

# For Repositories (is_org == False and verified_oso_name is not null)
df_repos = df[(df['is_org'] == False) & (df['verified_oso_name'].notna())]

# For Projects (is_org == True and verified_oso_name is not null)
df_projects = df[(df['is_org'] == True) & (df['verified_oso_name'].notna())]

# For everything else
df_remainder = df[df['verified_oso_name'].isna()]

# Step 3: Merge DataFrames

# Merge df_repos with metrics_by_repo on 'repo_name_with_owner'
merged_repos = pd.merge(
    df_repos,
    metrics_by_repo.reset_index(),  # Reset index to bring 'repo_name_with_owner' into columns
    on='repo_name_with_owner',
    how='left'
)

# Merge df_projects with metrics_by_project
merged_projects = pd.merge(
    df_projects,
    metrics_by_project.reset_index(),  # Reset index to bring 'project_name' into columns
    left_on='verified_oso_name',
    right_on='project_name',
    how='left'
)

# Optional: Drop redundant columns if needed
merged_projects.drop(columns=['project_name'], inplace=True)

# Optional: Fill NaN values with zeros in metric columns
metric_columns = [
    'Forks in the Last 6 Months',
    'Stars in the Last 6 Months',
    'Contributor Count (6 Months)',
    'Contributor Count',
    'New Contributor Count (6 Months)',
    'Commit Count (6 Months)',
    'Active Developer Count (6 Months)'
]

merged_repos[metric_columns] = merged_repos[metric_columns].fillna(0).astype(int)
merged_projects[metric_columns] = merged_projects[metric_columns].fillna(0).astype(int)

# Step 4: Combine Merged DataFrames (Optional)

# Combine the two merged DataFrames
merged_df = pd.concat([merged_repos, merged_projects, df_remainder], ignore_index=True)

# Display the final merged DataFrame
merged_df

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,repo_name_with_owner,repo_owner,repo_name,is_org,verified_oso_name,fork_count,...,created_at,updated_at,status,Forks in the Last 6 Months,Stars in the Last 6 Months,Contributor Count (6 Months),Contributor Count,New Contributor Count (6 Months),Commit Count (6 Months),Active Developer Count (6 Months)
0,filexp,https://github.com/aschmahmann/filexp,,True,aschmahmann/filexp,aschmahmann,filexp,False,filexp-aschmahmann,1,...,2023-01-17 06:28:03+00:00,2024-11-14 15:06:07+00:00,1_oso_match_by_github_repo,0.0,2.0,2.0,3.0,1.0,27.0,1.0
1,FILPlus Data Calculation,https://github.com/beck-8/filplus,,True,beck-8/filplus,beck-8,filplus,False,filplus-beck-8,1,...,2022-09-09 09:23:58+00:00,2022-09-13 01:51:53+00:00,1_oso_match_by_github_repo,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,Forest,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,160,...,2019-11-11 21:15:40+00:00,2024-11-19 12:51:31+00:00,1_oso_match_by_github_repo,13.0,48.0,24.0,103.0,11.0,282.0,12.0
3,Forest Archive,https://github.com/chainsafe/forest,forest-chainsafe,True,chainsafe/forest,chainsafe,forest,False,forest-chainsafe,160,...,2019-11-11 21:15:40+00:00,2024-11-19 12:51:31+00:00,1_oso_match_by_github_repo,13.0,48.0,24.0,103.0,11.0,282.0,12.0
4,CIDgravity Nextcloud App,https://github.com/CIDgravity/nextcloud-cidgra...,,False,cidgravity/nextcloud-cidgravity-gateway-app,cidgravity,nextcloud-cidgravity-gateway-app,False,cidgravity,0,...,2024-06-13 10:51:22+00:00,2024-11-05 08:21:45+00:00,1_oso_match_by_github_repo,0.0,1.0,2.0,2.0,2.0,8.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,Retriev - Contracts for retrieval guarantees,https://github.com/retriev-protocol/core,,False,retriev-protocol/core,retriev-protocol,core,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
149,Web3ForGood,https://github.com/samflan875/Web3ForGood,,False,samflan875/web3forgood,samflan875,web3forgood,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
150,USDfil Stablecoin,https://github.com/usdfil,,False,usdfil,usdfil,,True,,,...,NaT,NaT,5_no_oso_name,,,,,,,
151,Filecoin Kenya Community,https://github.com/Wengi-web,,False,wengi-web,wengi-web,,True,,,...,NaT,NaT,5_no_oso_name,,,,,,,


In [13]:
# https://docs.google.com/spreadsheets/d/1tCjLOJixgDjkl62CP_05Ta64aKtNK9BMpQv8bpCY0Vo/edit?gid=0#gid=0
XLS_PATH = 'data/RPGF2 applications.xlsx'
SHEET = 'All'
WB = openpyxl.load_workbook(XLS_PATH)
sheet = WB[SHEET]

app_links = []
names = []
githubs = []
for row_num in range(2,154):
    cell = sheet.cell(row_num, 1)
    github = sheet.cell(row_num, 5)
    try:
        app_link = cell.hyperlink.target
    except:
        app_link = None
    app_links.append(app_link)
    names.append(cell.value)
    githubs.append(github.value)
            
xls = pd.DataFrame([names, app_links, githubs], index=['name', 'link', 'github_repo']).T

In [14]:
merged_df.sort_values(by=['githubProjectLink', 'name'])

Unnamed: 0,name,githubProjectLink,OSOName,categoryHasOsoName,repo_name_with_owner,repo_owner,repo_name,is_org,verified_oso_name,fork_count,...,created_at,updated_at,status,Forks in the Last 6 Months,Stars in the Last 6 Months,Contributor Count (6 Months),Contributor Count,New Contributor Count (6 Months),Commit Count (6 Months),Active Developer Count (6 Months)
73,Network Health Dashboard,https://github.com/starboard-ventures/fileco...,,False,starboard-ventures/filecoin-network-health-das...,starboard-ventures,filecoin-network-health-dashboard,False,starboard-ventures,0,...,2022-11-19 13:13:58+00:00,2023-12-29 07:13:39+00:00,1_oso_match_by_github_repo,0.0,0.0,4.0,9.0,2.0,0.0,0.0
116,Ancients Research,https://github.com/Ancients-Research/retroPGF,,False,ancients-research/retropgf,ancients-research,retropgf,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
117,Asia Storage Provider Working Group,https://github.com/Asia-SPWG/retroPGF,,False,asia-spwg/retropgf,asia-spwg,retropgf,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
119,Consensus Pledge design flaw fix,https://github.com/CELtd/builtin-actors/tree/f...,,False,celtd/builtin-actors/tree/fip0081,celtd,builtin-actors,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
120,Filecoin Supply Simulation,https://github.com/CELtd/mechafil-jax-web-levers,,False,celtd/mechafil-jax-web-levers,celtd,mechafil-jax-web-levers,False,,,...,NaT,NaT,5_no_oso_name,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,Commp-Worker,https://github.com/xutianyi1999/commp_worker,,True,xutianyi1999/commp_worker,xutianyi1999,commp_worker,False,,,...,NaT,NaT,4_could_not_match_to_oso,,,,,,,
80,TreeD-GPU,https://github.com/xutianyi1999/treed_gpu,,True,xutianyi1999/treed_gpu,xutianyi1999,treed_gpu,False,treed-gpu-xutianyi1999,0,...,2023-12-11 06:59:03+00:00,2024-10-22 10:31:36+00:00,1_oso_match_by_github_repo,0.0,1.0,1.0,1.0,1.0,1.0,1.0
114,Lotus LevelDB-To-YugabyteDB (Boost Migration T...,https://github.com/yhio/LtoY,,True,yhio/ltoy,yhio,ltoy,False,,,...,NaT,NaT,4_could_not_match_to_oso,,,,,,,
113,Lotus Pilot (Worker Switching Tool),https://github.com/yhio/lotus-pilot,,True,yhio/lotus-pilot,yhio,lotus-pilot,False,,,...,NaT,NaT,4_could_not_match_to_oso,,,,,,,


In [15]:
# merged_df.sort_values(by=['githubProjectLink', 'name']).to_csv('mdf.csv')
# xls.sort_values(by=['github_repo', 'name']).to_csv('xls.csv')