In [23]:
from datetime import datetime, timedelta
from google.cloud import bigquery
import json
import os
import pandas as pd
import sys

sys.path.append(os.path.abspath("../../scripts/"))
from github import validate_github_artifact, get_owner_type

In [2]:
# Point the Google client libraries at the repo-local credentials file, but
# only when GOOGLE_APPLICATION_CREDENTIALS is not already set: setdefault
# leaves a value exported by the user (or by CI) untouched instead of
# silently overwriting it.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '../../gcp_credentials.json')

# BigQuery client used by the query cells below.
# NOTE(review): relies on the client resolving credentials from the
# environment variable above (Application Default Credentials) - confirm.
client = bigquery.Client()

# Get all repos on OSO

In [11]:
# Fetch every row of the OSO `repos_by_project` table into a DataFrame.
query = """
SELECT *
FROM `opensource-observer.oso.repos_by_project`
"""
result = client.query(query)
repos = result.to_dataframe()

# The owner is everything before the first '/' of "owner/repo".
# The vectorised .str accessor replaces the per-row lambda and passes a
# missing repo name through as NaN instead of raising AttributeError.
repos['repo_owner'] = repos['repo_name_with_owner'].str.split('/', n=1).str[0]

# Preview the last three rows.
repos.tail(3)

Unnamed: 0,project_id,project_slug,project_name,repository_source,artifact_id,repo_is_fork,repo_fork_count,repo_star_count,first_commit_date,last_commit_date,repo_name_with_owner,repo_owner
37068,wXDIn1ZhRNdQ682uvEAtHNxdN_fzvxGWSbcWWIA96s0=,freedom-of-the-press-foundation,Freedom of the Press Foundation,GITHUB,IrJU7rLHqTirc8WnGq7sezm2uhax6bNdKZ84-zR77p8=,True,0,0,NaT,NaT,freedomofpress/python-gnupg,freedomofpress
37069,wXDIn1ZhRNdQ682uvEAtHNxdN_fzvxGWSbcWWIA96s0=,freedom-of-the-press-foundation,Freedom of the Press Foundation,GITHUB,ca0yOVhuQghHd-nPYbCgtAIzzpcylrZIt4SHv39Kijo=,True,0,0,NaT,NaT,freedomofpress/terraform-google-gke,freedomofpress
37070,wXDIn1ZhRNdQ682uvEAtHNxdN_fzvxGWSbcWWIA96s0=,freedom-of-the-press-foundation,Freedom of the Press Foundation,GITHUB,CQfiljcSIgRrUpiHcp2u0Y3YmDzyUIqZ9UNBYAxa4Xc=,True,1,0,2020-03-26 00:00:00+00:00,2020-03-26 00:00:00+00:00,freedomofpress/redmine_cc_addresses,freedomofpress


In [12]:
# Distinct "owner/repo" names and distinct owner names present in `repos`,
# each kept as a sorted list.
oso_repos = list(repos['repo_name_with_owner'].unique())
oso_repos.sort()
oso_owners = list(repos['repo_owner'].unique())
oso_owners.sort()

# Print the two counts on separate lines: repos first, then owners.
print(len(oso_repos), len(oso_owners), sep='\n')

36986
1648


# Get GG projects from Metabase

In [50]:
# Load the GG20 projects CSV export and preview its last five rows.
projects = pd.read_csv(
    filepath_or_buffer='data/csv/gg20_projects_2024-04-28T02_16_23.202076Z.csv'
)
projects.tail(n=5)

Unnamed: 0,chain_id,round_id,round_name,project_name,project_github,payout_address,status
518,42161,23,Hackathon Alumni,Dspyt- into CodeVerse,dspytdao,0x4C11BA2ed1D936d769d0cce34CbC7Ea1E85182d0,APPROVED
519,42161,23,Hackathon Alumni,Coordination-Play,coordination-play,0x955Af1c1637Facf4dD5d9D2428e073573dAD5699,APPROVED
520,42161,23,Hackathon Alumni,AI Swarm,aiswarm,0x52ea367e1C074409e841a559dFfA321BDB12b3bE,APPROVED
521,42161,23,Hackathon Alumni,FundIt,ecoland-world,0xc8f0bae52D42f42d1Aed7b4af00CA3EF6C516c23,APPROVED
522,42161,23,Hackathon Alumni,AdLand,adcommune,0x26bBec292e5080ecFD36F38FF1619FF35826b113,APPROVED


In [51]:
# Lower-cased, de-duplicated GitHub handles from the projects table
# (rows without a handle are dropped), kept as a sorted list.
github_column = projects['project_github']
project_githubs = list(github_column.dropna().str.lower().unique())
project_githubs.sort()

# Number of distinct handles.
len(project_githubs)

333

# Identify projects that should be added to OSO

In [32]:
# Split the GG project handles into those OSO already tracks (`found`) and
# those it does not (`not_found`); each untracked handle is checked with
# validate_github_artifact and the result stored under 'outcome'.

# Lower-cased lookup sets. `oso_repos` / `oso_owners` are lists with tens of
# thousands of entries, so `in` on them is a linear scan per handle; sets make
# each lookup constant-time. Lower-casing matches the already lower-cased
# handles, since GitHub owner/repo names are case-insensitive.
oso_repo_set = {name.lower() for name in oso_repos}
oso_owner_set = {name.lower() for name in oso_owners}

found = []
not_found = []
seen = set()  # normalised handles already processed

for project in project_githubs:
    # Normalise: drop surrounding whitespace and leading/trailing slashes.
    project = project.strip().strip('/')

    # Handles were de-duplicated before this normalisation, so "foo" and
    # "foo/" can both reach this point; process each normalised handle once
    # to avoid duplicate entries and repeated validation calls. A handle
    # that is empty after stripping cannot name an artifact, so skip it.
    if not project or project in seen:
        continue
    seen.add(project)

    # "owner/repo" is a single repository; a bare name is an owner.
    type_ = 'repo' if '/' in project else 'owner'

    if project in oso_repo_set or project in oso_owner_set:
        found.append({project: type_})
    else:
        not_found.append({
            'artifact': project,
            'type': type_,
            'outcome': validate_github_artifact(project)
        })

{'commits': 38, 'authors': 3, 'unique_days': 8, 'newest_commit_date': '2024-04-27', 'oldest_commit_date': '2024-04-13'}
Error parsing data: 'NoneType' object is not subscriptable {'data': {'repository': {'defaultBranchRef': None}}}
{'commits': 76, 'authors': 5, 'unique_days': 4, 'newest_commit_date': '2024-04-03', 'oldest_commit_date': '2024-03-28'}
{'commits': 28, 'authors': 1, 'unique_days': 9, 'newest_commit_date': '2024-04-26', 'oldest_commit_date': '2024-04-18'}
Error parsing data: 'NoneType' object is not subscriptable {'data': {'repository': {'defaultBranchRef': None}}}
{'commits': 47, 'authors': 1, 'unique_days': 16, 'newest_commit_date': '2024-04-25', 'oldest_commit_date': '2024-03-30'}
{'commits': 33, 'authors': 3, 'unique_days': 15, 'newest_commit_date': '2024-04-17', 'oldest_commit_date': '2024-03-09'}
{'commits': 27, 'authors': 3, 'unique_days': 5, 'newest_commit_date': '2024-04-17', 'oldest_commit_date': '2024-03-15'}
{'commits': 15, 'authors': 2, 'unique_days': 4, 'newes

In [28]:
# Save the validation results for the handles not found on OSO as
# pretty-printed (2-space indented) JSON.
with open("data/gg20-github-checks.json", mode="w") as checks_file:
    checks_file.write(json.dumps(not_found, indent=2))

174

In [36]:
# Keep only the entries whose validation outcome has a truthy 'Approved' value.
projects_to_add = [entry for entry in not_found if entry['outcome']['Approved']]

# Just the handle of each approved entry.
artifacts_to_add = [entry['artifact'] for entry in projects_to_add]

In [48]:
# Build a two-column table (Project, GitHub URL) of the approved new projects.

# Normalise the handles the same way the artifacts were normalised before
# validation (lower-case, strip whitespace, strip slashes). Lower-casing alone
# would silently drop rows such as "Foo/" or " foo " whose normalised handle
# is in `artifacts_to_add`. Missing handles stay NaN and never match.
github_normalized = (
    projects['project_github']
    .str.lower()
    .str.strip()
    .str.strip('/')
)
is_new = github_normalized.isin(artifacts_to_add)

# Build the URL from the normalised handle and de-duplicate afterwards, so
# rows that differ only in case, whitespace or slashes collapse into one.
# The original row index of `projects` is kept.
csv_version = (
    pd.DataFrame({
        'Project': projects.loc[is_new, 'project_name'],
        'GitHub': "https://github.com/" + github_normalized[is_new],
    })
    .drop_duplicates()
)
csv_version

Unnamed: 0,Project,GitHub
10,Metrics Garden Labs,https://github.com/metrics-garden-labs
12,Armitage,https://github.com/armitage-labs
20,Index Wallets,https://github.com/optionhq
36,IPC Explorer,https://github.com/cronian-tech
37,Proof of passport,https://github.com/zk-passport
...,...,...
507,Agents With Benefits,https://github.com/agentswbenefits
513,Perpetual Organization Architect,https://github.com/perpetualorganizationarchitect
519,Coordination-Play,https://github.com/coordination-play
520,AI Swarm,https://github.com/aiswarm


In [52]:
# Write the new-projects table to CSV. No `index` argument is passed, so the
# DataFrame index is written as the leading column (pandas default).
csv_version.to_csv(path_or_buf="data/csv/gg20_new_projects.csv")