In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd
from urllib.parse import urlparse

In [2]:
# Register the credentials file in the environment before the client is
# constructed, then create the BigQuery client.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../../gcp_credentials.json'})
client = bigquery.Client()

In [3]:
# Load the application records from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's default locale; the application text
# contains non-ASCII characters.
with open("data/FIL_RetroPGF1_applications.json", "r", encoding="utf-8") as f:
    projects = json.load(f)

In [5]:
# Fetch every column of the repos-by-project table and materialise the
# result as a pandas DataFrame.
query = (
    "\n"
    "SELECT *\n"
    "FROM `opensource-observer.oso.repos_by_project`\n"
)
result = client.query(query)
REPOS = result.to_dataframe()

In [6]:
# Lowercased "owner/repo" -> project slug.
REPOS_TO_SLUGS = {
    repo_name: slug
    for repo_name, slug in zip(REPOS['repo_name_with_owner'].str.lower(), REPOS['project_slug'])
}

# The owner is everything before the first "/" of "owner/repo".
REPOS['owner'] = REPOS['repo_name_with_owner'].apply(lambda full_name: full_name.partition("/")[0])

# Distinct (owner, project_slug) pairs.
OWNERS = REPOS.loc[:, ['owner', 'project_slug']].drop_duplicates()

# Lowercased owner -> project slug. When an owner appears with more than one
# slug, the pair that comes last in OWNERS wins (later dict keys overwrite
# earlier ones).
OWNERS_TO_SLUGS = {
    owner_name: slug
    for owner_name, slug in zip(OWNERS['owner'].str.lower(), OWNERS['project_slug'])
}

In [7]:
def process_github(url):
    """
    Reduce a GitHub URL to a lowercase ``owner/repo`` or ``owner`` slug.

    Args:
        url: A GitHub URL or bare slug, e.g.
            ``https://github.com/Owner/Repo/tree/main``, ``github.com/Owner``
            or ``owner/repo``. Surrounding whitespace and slashes are ignored.

    Returns:
        ``"owner/repo"`` when the path has at least two segments (anything
        after the second segment is discarded), ``"owner"`` when it has
        exactly one, and ``None`` when there is no usable path segment or
        ``url`` is not a string.
    """
    if not isinstance(url, str):
        return None

    cleaned = url.strip().strip('/').strip()
    path = urlparse(cleaned).path

    # Drop empty segments: "".split("/") yields [""], and doubled slashes
    # yield "" in the middle; neither is a real owner or repo name.
    parts = [segment.lower() for segment in path.split('/') if segment]

    # Without a scheme, urlparse puts the host into the path
    # ("github.com/owner/repo"), so discard it when it leads the segments.
    if parts and parts[0] in ('github.com', 'www.github.com'):
        parts = parts[1:]

    if not parts:
        return None
    if len(parts) == 1:
        return parts[0]

    owner, repo = parts[0], parts[1]
    # Clone URLs carry a ".git" suffix that is not part of the repo name.
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    return f"{owner}/{repo}" if repo else owner

def map_to_oso(github):
    """
    Look up the project slug for a GitHub slug.

    A value containing "/" is looked up in REPOS_TO_SLUGS; any other string
    is looked up in OWNERS_TO_SLUGS. Returns None for non-string input or
    when the key is not present in the chosen mapping.
    """
    if isinstance(github, str):
        lookup = REPOS_TO_SLUGS if '/' in github else OWNERS_TO_SLUGS
        return lookup.get(github)
    return None

In [8]:
github_links = []
data = []

# process_github() returns lowercase slugs, so compare against lowercased
# repo and owner names; otherwise rows with mixed-case names are silently
# missed. Computed once here because it does not change per project.
repo_names_lower = REPOS['repo_name_with_owner'].str.lower()
repo_owners_lower = REPOS['owner'].str.lower()

for project in projects:
    app = project['app']
    links = app['contributionLinks']

    # Distinct slugs from the GITHUB_REPO links. Empty/None results are
    # dropped, and the list is sorted so its order (and the exported file)
    # is the same on every run; plain set iteration order for strings can
    # change between kernel sessions.
    githubs = sorted({
        slug
        for slug in (process_github(x['url']) for x in links if x['type'] == 'GITHUB_REPO')
        if slug
    })
    github_links.extend(githubs)

    # A slug may name a single repo ("owner/repo") or a whole owner ("owner").
    # The two masks are OR-ed, so a repo matched both ways is counted once.
    matched_repos = REPOS[repo_names_lower.isin(githubs) | repo_owners_lower.isin(githubs)]

    data.append({
        'id': project['id'],
        'name': project['name'],
        'bio': app['bio'],
        'contribution': app['contributionDescription'],
        'impact': app['impactDescription'],
        'category': app['impactCategory'],
        'github_links': githubs,
        'fork_count': matched_repos['repo_fork_count'].sum(),
        'star_count': matched_repos['repo_star_count'].sum(),
        'first_commit': matched_repos['first_commit_date'].min(),
        'last_commit': matched_repos['last_commit_date'].max()
    })

len(data)

106

In [9]:
# Build one row per collected record; the columns come from the dict keys.
df = pd.DataFrame(data=data)
df

Unnamed: 0,id,name,bio,contribution,impact,category,github_links,fork_count,star_count,first_commit,last_commit
0,0xf9f22dc2367a0a06b984f9dd6289f3b9e5b1e91c1da1...,Asia SPWG,This particular branch of SPWG (see media anno...,This particular branch of SPWG (see media anno...,- Contribution in FIP discussion that impacted...,[GOVERNANCE],[filecoin-project/fips],161,309,2020-09-10 00:00:00+00:00,2024-04-27 00:00:00+00:00
1,0x01190adcf707f7b31580d2f600aac61569e4fef1b256...,AuralGenius,AuralGenius is an AI voice note app built on W...,Our team's vision: 'To make people's lives bet...,"Since its launch in January 2024, AuralGenius ...","[INFRASTRUCTURE, TOOLING, COMMUNITY_EDUCATION,...",[],0,0,NaT,NaT
2,0xbad5b1ef0e513116a9f4da037fa3b5c9b8e35e744d9d...,Banyan Storage,Banyan is a decentralized file storage platfor...,We built a platform that not only makes onboar...,We have 30+ active conversations/LOI to onboar...,[RESEARCH_AND_DEVELOPMENT],[banyancomputer/banyanfs],0,7,2024-03-19 00:00:00+00:00,2024-05-01 00:00:00+00:00
3,0xf9b123ffd4b691b96b8962a81bcbb687deccd2442b3e...,Beryx,"Beryx, created by Zondax, is a dynamic web pla...",Beryx has been at the forefront of supporting ...,Zondax has been a pioneer in dealing with big ...,[INFRASTRUCTURE],"[zondax/web-beryx-explorer, zondax]",931,778,2018-02-11 00:00:00+00:00,2024-05-01 00:00:00+00:00
4,0xe005bfa9d9494de9433a92c1946a7e49fda5bd15bf9b...,BlockScience,"BlockScience is a complex systems engineering,...",In the midst of the FIP0056 governance debates...,These contributions to Filecoin governance wer...,[GOVERNANCE],"[blockscience/filecoin-sdm-cdm-notebooks, bloc...",1,3,2023-02-15 00:00:00+00:00,2024-04-12 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...
101,0xed9b91d8a490a97d1e12fac7e9600bec02fb8941aa83...,Web3bridge,Web3bridge has in the last 4years introduced o...,Web3bridge being one of the top platforms/prog...,- Over one hundred and fifty Web3 developers i...,[COMMUNITY_EDUCATION],[],0,0,NaT,NaT
102,0xf22180101e2de2a49f49e638889e8ab2a2e56275ca41...,"Women Biz: Bootcamp ""Women in Web3""",Women Biz is a community aimed at empowering w...,"At Women Biz, we have significantly focused on...",Women Biz's initiative has created a significa...,[COMMUNITY_EDUCATION],[],0,0,NaT,NaT
103,0x443dcf92e2a8f4b09006ef98ffe28375e67012d53a83...,Zengo: Decentralized Budget,Zengo: Decentralized Budget is a collaborative...,Zengo is a public good for collaborative gover...,While the majority of prototyping and developm...,"[COMMUNITY_EDUCATION, END_USER_EXPERIENCE, RES...","[zenbiteth/spacetimedao, zenbiteth/zengo]",2,2,2023-07-31 00:00:00+00:00,2023-11-16 00:00:00+00:00
104,0xc989fffdc49afe91eac808050a05b085892d9a998982...,區塊勢 (Blocktrend),Blocktrend is Taiwan's independent media that ...,Blocktrend meticulously crafts content on IPFS...,"The impact of Blocktrend's work is profound, p...",[COMMUNITY_EDUCATION],[],0,0,NaT,NaT


In [10]:
# Write the table to CSV. The DataFrame index is written too (to_csv default),
# so it appears as an unnamed leading column in the file.
df.to_csv(path_or_buf="data/FIL_RetroPGF1_projects.csv")