In [72]:
from collections import defaultdict
from itertools import combinations
import json
import numpy as np
import os
import pandas as pd
from google.cloud import bigquery
from datetime import datetime

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = = '../../gcp_credentials.json'
client = bigquery.Client()

# Get today's date in YYYY-MM-DD format
today = datetime.now().strftime('%Y-%m-%d')

## Grab historic Gitcoin funding event data for all projects
- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Run the query below
- Add filter for OSS rounds starting GG18

In [73]:
query = """
    select
      oso_project_name,
      oso_display_name,
      round_number,
      oso_generated_round_label,
      gitcoin_project_id,
      count(distinct donor_address) as num_donors,
      sum(case when donor_address is not null then amount_in_usd else 0 end) as crowd_funding_usd,
      sum(case when donor_address is null then amount_in_usd else 0 end) as match_funding_usd
    from `oso_production.gitcoin_funding_events_by_project_v0`
    where ((round_number = 22 and gitcoin_round_id = '608') or
      (round_number = 22 and gitcoin_round_id = '609') or
      (round_number = 22 and gitcoin_round_id = '610') or
      (round_number = 22 and gitcoin_round_id = '611') or
      (round_number = 20 and gitcoin_round_id = '23') or
      (round_number = 20 and gitcoin_round_id = '25') or
      (round_number = 20 and gitcoin_round_id = '26') or
      (round_number = 20 and gitcoin_round_id = '27') or
      (round_number = 19 and lower(gitcoin_round_id) = '0xd4cc0dd193c7dc1d665ae244ce12d7fab337a008') or
      (round_number = 19 and lower(gitcoin_round_id) = '0xa1d52f9b5339792651861329a046dd912761e9a9') or
      (round_number = 19 and lower(gitcoin_round_id) = '0x98720dd1925d34a2453ebc1f91c9d48e7e89ec29') or
      (round_number = 18 and lower(gitcoin_round_id) = '0x8de918f0163b2021839a8d84954dd7e8e151326d') or
      (round_number = 18 and lower(gitcoin_round_id) = '0x222ea76664ed77d18d4416d2b2e77937b76f0a35') or
      (round_number = 18 and lower(gitcoin_round_id) = '0x2871742b184633f8dc8546c6301cbc209945033e'))
    group by 1, 2, 3, 4, 5
    having match_funding_usd > 0
    order by 8 desc
"""

results = client.query(query)
gitcoin_df = results.to_dataframe()
gitcoin_df.tail(5)



Unnamed: 0,oso_project_name,oso_display_name,round_number,oso_generated_round_label,gitcoin_project_id,num_donors,crowd_funding_usd,match_funding_usd
1461,,,18,GG-18 - 0x2871742b184633f8dc8546c6301cbc209945...,0xf44b50ada1837f5e950d9e002be80c24113e589ae897...,25,32.702959,6.250474
1462,scryprotocol,Scry Protocol,18,GG-18 - 0x8de918f0163b2021839a8d84954dd7e8e151...,0x4660be9bc678bba66a839a73c56212b592c15e97c8b6...,130,110.910948,6.000455
1463,,,20,GG-20 - 25,0xc313a8cea83983edd81609810cba8f3828b2903438af...,8,60.266736,5.093193
1464,icdevsorg,ICDevs.org,22,GG-22 - 610,0xab3b35780a86f5ec610e911fe359e8cbbef9278b300e...,45,61.867394,1.199074
1465,ethelo,Ethelo,18,GG-18 - 0x2871742b184633f8dc8546c6301cbc209945...,0x757a365c73eb60c71d682b2f7098a9fb11f4bb2dbd90...,30,177.376357,0.750057


## Apply some round and project quality filters

- Filter out less competitive rounds
- Ensure projects have been in multiple rounds

In [74]:
round_stats = gitcoin_df.groupby('oso_generated_round_label').agg({
    'oso_project_name': 'nunique',
    'gitcoin_project_id': 'nunique',
    'match_funding_usd': 'sum'
}).sort_values(by='match_funding_usd', ascending=False)

oss_heavy_rounds = round_stats[(round_stats['match_funding_usd'] >= 20_000) & (round_stats['gitcoin_project_id'] >= 10)]
oss_heavy_rounds


Unnamed: 0_level_0,oso_project_name,gitcoin_project_id,match_funding_usd
oso_generated_round_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GG-20 - 27,38,46,386673.696867
GG-22 - 609,37,53,333145.428515
GG-20 - 26,44,75,314686.761894
GG-22 - 610,35,47,312112.764672
GG-18 - 0x8de918f0163b2021839a8d84954dd7e8e151326d,115,123,302566.342875
GG-22 - 608,82,139,301761.84701
GG-20 - 25,83,153,299575.594535
GG-18 - 0x2871742b184633f8dc8546c6301cbc209945033e,58,238,249989.700907
GG-18 - 0x222ea76664ed77d18d4416d2b2e77937b76f0a35,28,28,206764.931207
GG-19 - 0xa1d52f9b5339792651861329a046dd912761e9a9,26,28,203959.090561


In [75]:
# Check for projects with missing display names. This may occur for Gitcoin grants that are not indexed in OSO
missing_names_df = (gitcoin_df[gitcoin_df['oso_display_name'].isnull()]
    .groupby('gitcoin_project_id')
    .agg({
        'match_funding_usd': 'sum'
    })
    .sort_values('match_funding_usd', ascending=False)
)

print(f"Total projects with missing display names: {len(missing_names_df)}")
missing_names_df.to_csv('data/2025-02-03_missing_names.csv')
missing_names_df.head(10)


Total projects with missing display names: 605


Unnamed: 0_level_0,match_funding_usd
gitcoin_project_id,Unnamed: 1_level_1
0xa7b8948d5c8b8d2cd502cd9c8638dda011ecfbfc10abe0df58a504e4802f5d6d,29976.859375
0xb3f29ba7f49ecfcc6a5ada3494f3fdc50eedcb08ab1ff647e3d31fa6ad57dc9f,28885.505249
0x457ec30327c665492be165353e1d2e1bfcca298a3ca9ee6e2b69acee9ca4940c,23803.422677
0xa43d39ea6e0c57d3082983686ceffca5cd0e96dcef1ddf84fb124623b0bbe05a,16771.39264
0x1249835eaab22b2d2fd7f34556a20cae9cfd902f56c54789730081e6996ed012,15507.587969
0xb5a30ed4936627d0ff4b3f92e98c1249ee7984ee1fd2ed3e8983a369e037edb5,15001.137187
0x8922fedbc096d4323fdf3e6de55477a6e487d4bde5ab92d1954ff0f0a9104de9,14849.591509
0xbc0c38f75ec02f37af3c6cc5acef9951e0f69c0b49a26e9bf680b1fbcf4f76cb,14647.892565
0xf56dd3fa328f36673fd27422bf169559797919b4e8dd639b341d14127219c70a,14404.092418
0x1bb64e87b901e6e0f4d7877a99bc5a035ca4be1ec2517fd3f2e00337e636daf7,14360.838623


In [76]:
# Patch to get missing OSO names for projects indexed in OSO but missing wallet information
# First, identify missing names
missing_ids = gitcoin_df[gitcoin_df['oso_display_name'].isnull()]['gitcoin_project_id'].unique()
missing_ids_str = "', '".join(missing_ids)

# Query to get missing names
query = """
SELECT dir.gitcoin_project_id, dir.latest_project_github, 
       proj.project_name as oso_project_name, 
       proj.display_name as oso_display_name
FROM `oso_production.gitcoin_project_directory_v0` dir 
JOIN `oso_production.projects_v1` proj
ON proj.project_name = dir.latest_project_github
WHERE dir.gitcoin_project_id IN ('""" + missing_ids_str + """')
"""

missing_names_results = client.query(query)
missing_names_df = missing_names_results.to_dataframe()

print(f"Found name mappings for {len(missing_names_df)} out of {len(missing_ids)} missing projects")

# Update gitcoin_df with the found names
for _, row in missing_names_df.iterrows():
    mask = gitcoin_df['gitcoin_project_id'] == row['gitcoin_project_id']
    gitcoin_df.loc[mask, 'oso_project_name'] = row['oso_project_name']
    gitcoin_df.loc[mask, 'oso_display_name'] = row['oso_display_name']

gitcoin_df.to_csv('data/2025-02-03_gitcoin_df_with_names.csv')

# Verify updates
print("\nRemaining missing names after update:")
print(gitcoin_df[gitcoin_df['oso_display_name'].isnull()]['gitcoin_project_id'].nunique())



Found name mappings for 162 out of 605 missing projects

Remaining missing names after update:
443


Filter for:
- OSS rounds based on size and competitiveness
- Projects indexed in OSO

Selects all projects with at least one appearance in OSS round since GG18

In [77]:
oss_heavy_rounds_list = list(oss_heavy_rounds.index)
round_filter = gitcoin_df['oso_generated_round_label'].isin(oss_heavy_rounds_list)

project_round_count = gitcoin_df.groupby('oso_project_name')['round_number'].nunique()
projects_above_round_count_threshold = list(project_round_count[project_round_count >= 1].index)
project_filter = gitcoin_df['oso_project_name'].isin(projects_above_round_count_threshold)

oss_project_filter = gitcoin_df['oso_project_name'].isna() == False

filtered_df = gitcoin_df[round_filter & project_filter & oss_project_filter].reset_index(drop=True)
filtered_df.head()

Unnamed: 0,oso_project_name,oso_display_name,round_number,oso_generated_round_label,gitcoin_project_id,num_donors,crowd_funding_usd,match_funding_usd
0,passportxyz,Passport XYZ,22,GG-22 - 609,0xa7b8948d5c8b8d2cd502cd9c8638dda011ecfbfc10ab...,6493,7281.864109,29976.859375
1,defi-llama,DefiLlama,22,GG-22 - 609,0xbb1d90979d5a76457be64366c845853b41d0f849f51c...,8064,12537.023827,29976.859375
2,ethereum-attestation-service,Ethereum Attestation Service,22,GG-22 - 610,0x0d157806d0b64bcc680b8bbeeb8aea011d85ae54c7ef...,745,1634.645314,29976.859375
3,dappnode,DAppNode,22,GG-22 - 610,0x22724a362754b65d9b3c7fc7dda94c7dc233a39c06ee...,1393,3719.642127,29976.859375
4,l2beat,L2BEAT,22,GG-22 - 610,0x805aeaf9c52db21357df138ef7884790093f63283bce...,2426,5250.7208,29976.859375


## Model head-to-head appearances by projects in the same round

- Weight match funding more heavily than crowd funding
- Match funding accounts for collusion / Sybil, etc

In [78]:
simulation_data = []
for gitcoin_round in oss_heavy_rounds_list:
    dff = filtered_df[filtered_df['oso_generated_round_label'] == gitcoin_round]
    round_num = dff['round_number'].mean()
    if round_num != int(round_num):
        print(gitcoin_round)
    projects = list(dff['oso_project_name'].unique())
    comparisons = combinations(projects, 2)
    for (project_a, project_b) in comparisons:
        match_a = dff[dff['oso_project_name'] == project_a]['match_funding_usd'].sum()
        match_b = dff[dff['oso_project_name'] == project_b]['match_funding_usd'].sum()
        crowd_a = dff[dff['oso_project_name'] == project_a]['crowd_funding_usd'].sum()
        crowd_b = dff[dff['oso_project_name'] == project_b]['crowd_funding_usd'].sum()
        amount_total = match_a + match_b + crowd_a + crowd_b
        simulation_data.append({
            'round_number': int(round_num),
            'project_a': project_a,
            'project_b': project_b,
            'weight_a': (match_a + crowd_a * 0.5) / amount_total,
            'weight_b': (match_b + crowd_b * 0.5) / amount_total
        })
        
simulation_df = pd.DataFrame(simulation_data)

# Sort by round number descending (latest rounds first)
simulation_df = simulation_df.sort_values('round_number', ascending=True)

simulation_df.tail()

Unnamed: 0,round_number,project_a,project_b,weight_a,weight_b
14202,22,charmverse,jobstash,0.490766,0.460646
14201,22,charmverse,0xnextlabs,0.572103,0.365403
14200,22,charmverse,luncosim,0.552208,0.381549
14198,22,charmverse,breadchaincoop,0.59001,0.373737
40877,22,nebulaid,stogramhq,0.492898,0.36584


## Run the standard ELO algorithm

In [79]:
def expected_score(rating_a, rating_b):
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(rating_a, rating_b, score_a, k_factor):
    expected_a = expected_score(rating_a, rating_b)
    rating_a_new = rating_a + k_factor * (score_a - expected_a)
    rating_b_new = rating_b + k_factor * ((1 - score_a) - (1 - expected_a))
    return rating_a_new, rating_b_new
    
def elo_simulation(dataframe):
    
    elo_ratings = defaultdict(lambda: 1500) # Default ELO rating for all projects
    appearances = defaultdict(int)
    
    for _,row in dataframe.iterrows():
        project_a = row['project_a']
        project_b = row['project_b']
        share_a = row['weight_a']
        score_a = 1 if share_a > 0.5 else 0 if share_a < 0.5 else 0.5

        k_a = 40 / (1 + appearances[project_a] / 5)
        k_b = 40 / (1 + appearances[project_b] / 5)
        k_factor = (k_a + k_b) / 2
        
        elo_ratings[project_a], elo_ratings[project_b] = update_elo(
            elo_ratings[project_a], elo_ratings[project_b], score_a, k_factor
        )
        appearances[project_a] += 1
        appearances[project_b] += 1
    
    margin_of_error = {project: 400 / np.sqrt(appearances[project]) for project in appearances}

    return (
        pd.DataFrame([
            {
                'project': project,
                'head-to-head_comparisons': appearances[project],        
                'elo_rating': rating,        
                'margin_of_error': margin_of_error[project]
            }
            for project, rating in elo_ratings.items()
        ])
        .sort_values(by='elo_rating', ascending=False)
        .set_index('project', drop=True)
    )

def run_multiple_simulations(dataframe, n_simulations=50):
    all_results = []
    
    for _ in range(n_simulations):
        # Shuffle the dataframe
        shuffled_df = dataframe.sample(frac=1).reset_index(drop=True)
        # Run simulation
        ratings = elo_simulation(shuffled_df)
        all_results.append(ratings)
    
    # Combine all results
    combined_results = pd.concat(all_results)
    
    # Group by project and calculate means
    final_results = combined_results.groupby(level=0).agg({
        'head-to-head_comparisons': 'first',  # These should be same across runs
        'elo_rating': 'mean',
        'margin_of_error': 'mean'
    }).sort_values('elo_rating', ascending=False)
    
    # Add standard deviation of ratings across simulations
    rating_std = combined_results.groupby(level=0)['elo_rating'].std()
    final_results['rating_std_across_simulations'] = rating_std
    
    return final_results

# Run the simulations
elo_df = run_multiple_simulations(simulation_df)

elo_df.head()

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error,rating_std_across_simulations
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
revoke-cash,464,1810.015815,18.569534,8.102371
heyxyz,464,1809.057787,18.569534,7.79061
defi-llama,343,1792.052248,21.59797,7.888363
idriss-crypto,358,1773.871732,21.140657,9.93024
defieye,512,1765.40155,17.67767,9.262636


In [80]:
df = (
    elo_df
    .join(gitcoin_df
          .groupby('oso_project_name')
          .agg({
            'round_number': 'nunique',
            'num_donors': 'max',
            'crowd_funding_usd': 'sum',
            'match_funding_usd': 'sum'
            })
        )
    .rename(columns={
        'round_number': 'num_main_round_appearances',
        'num_donors': 'highest_unique_donor_count_in_one_round',
    })
)
df.head()

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error,rating_std_across_simulations,num_main_round_appearances,highest_unique_donor_count_in_one_round,crowd_funding_usd,match_funding_usd
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
revoke-cash,464,1810.015815,18.569534,8.102371,4,15545,92239.365225,59814.988384
heyxyz,464,1809.057787,18.569534,7.79061,4,21309,129979.281943,59814.988384
defi-llama,343,1792.052248,21.59797,7.888363,4,10751,78859.452136,87210.381367
idriss-crypto,358,1773.871732,21.140657,9.93024,3,10623,39758.306933,42153.577948
defieye,512,1765.40155,17.67767,9.262636,4,4178,25272.073548,45730.583967


In [81]:
df.to_csv(f'data/{today}_gitcoin_oss_elo_ratings.csv')

# Fetch Code Metrics for the projects in the ELO ratings

In [82]:
# Get list of projects from df
project_list = df.index.tolist()

# Create SQL-friendly string of projects
project_string = "', '".join(project_list)

query = """
    SELECT *
    FROM `oso_production.code_metrics_by_project_v1`
    WHERE project_name IN ('""" + project_string + """')
"""

cm_results = client.query(query)
df_metrics = cm_results.to_dataframe()
print(f"Number of matching projects with code metrics: {len(df_metrics)}")
df_metrics.head()



Number of matching projects with code metrics: 468


Unnamed: 0,project_id,project_source,project_namespace,project_name,display_name,event_source,repository_count,first_created_at_date,last_updated_at_date,first_commit_date,...,active_developer_count_6_months,commit_count_6_months,opened_pull_request_count_6_months,merged_pull_request_count_6_months,opened_issue_count_6_months,closed_issue_count_6_months,comment_count_6_months,release_count_6_months,time_to_first_response_days_average_6_months,time_to_merge_days_average_6_months
0,Eg2IQ9X58yQRQAx7atRetDPqUdH6zOTTBScer1jSVrM=,OSS_DIRECTORY,oso,commons-stack,Commons Stack,GITHUB,36,2019-03-15 12:51:40+00:00,2024-12-13 17:49:17+00:00,2019-03-15 12:54:57+00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,xwztMm1EuqWpJ6RhT4chJ1qNVa_6jqh53dXDM5cb9z4=,OSS_DIRECTORY,oso,banklessdao,Bankless DAO,GITHUB,47,2021-04-07 01:08:36+00:00,2025-01-17 19:58:38+00:00,2021-05-05 18:22:40+00:00,...,1.0,12.0,10.0,12.0,0.0,0.0,18.0,0.0,0.0,0.0
2,7sOH8lAoHO-zAWLyIenxT3lrOGgonqAVBcb8n--fTic=,OSS_DIRECTORY,oso,ssbc,wildfiressb,GITHUB,248,2014-05-10 22:42:14+00:00,2025-01-16 04:45:07+00:00,2015-01-07 05:19:57+00:00,...,3.0,81.0,2.0,2.0,1.0,0.0,2.0,4.0,0.0,0.0
3,qHEjczna6kDTLN1fGMLq2d6rwKqfRU0mufHsRXr5cqA=,OSS_DIRECTORY,oso,pizzadao,PizzaDAO,GITHUB,21,2021-02-24 13:43:13+00:00,2025-01-17 09:56:48+00:00,2021-02-27 22:52:08+00:00,...,5.0,41.0,4.0,1.0,5.0,0.0,8.0,0.0,1.1875,0.0
4,iilyWmaoJCXv0vPfyPQnIGYXoWyTZDlP0oQLMNIVj54=,OSS_DIRECTORY,oso,metagame-metafam,MetaGame,GITHUB,58,2010-03-22 00:41:45+00:00,2025-01-14 07:01:58+00:00,2019-12-02 22:15:05+00:00,...,3.0,165.0,12.0,12.0,68.0,39.0,84.0,0.0,100.199132,0.0


In [83]:
combined_df = df.reset_index().merge(
    df_metrics,
    left_on='project',
    right_on='project_name',
    how='left'
)

# Optionally set project back as index
combined_df = combined_df.set_index('project')

# Drop the redundant project_name column if desired
combined_df = combined_df.drop('project_name', axis=1)

combined_df.head()

Unnamed: 0_level_0,head-to-head_comparisons,elo_rating,margin_of_error,rating_std_across_simulations,num_main_round_appearances,highest_unique_donor_count_in_one_round,crowd_funding_usd,match_funding_usd,project_id,project_source,...,active_developer_count_6_months,commit_count_6_months,opened_pull_request_count_6_months,merged_pull_request_count_6_months,opened_issue_count_6_months,closed_issue_count_6_months,comment_count_6_months,release_count_6_months,time_to_first_response_days_average_6_months,time_to_merge_days_average_6_months
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
revoke-cash,464,1810.015815,18.569534,8.102371,4,15545,92239.365225,59814.988384,zMr9ziUIls0NIAm5eLswVdNAjL-X9j4o2RcEQmrLf4c=,OSS_DIRECTORY,...,1.0,169.0,37.0,34.0,9.0,8.0,70.0,17.0,160.482639,0.0
heyxyz,464,1809.057787,18.569534,7.79061,4,21309,129979.281943,59814.988384,m-ayiyYXbU1nR5aCLuliEKsYRca6b_8aNzdBbKn56pM=,OSS_DIRECTORY,...,2.0,1786.0,199.0,159.0,160.0,152.0,307.0,1.0,55.857755,0.0
defi-llama,343,1792.052248,21.59797,7.888363,4,10751,78859.452136,87210.381367,6lg5zQJ4GYDjvO5NvVAfPBWuxNgOeEcLAfcL01ws5Us=,OSS_DIRECTORY,...,13.0,10469.0,4832.0,3921.0,148.0,143.0,10115.0,0.0,8.989402,0.0
idriss-crypto,358,1773.871732,21.140657,9.93024,3,10623,39758.306933,42153.577948,TpsYn6RRZN_9nai7dLsf294TRyQDmZYpeQEnaIqugw4=,OSS_DIRECTORY,...,5.0,352.0,281.0,250.0,0.0,4.0,430.0,0.0,1.85744,0.0
defieye,512,1765.40155,17.67767,9.262636,4,4178,25272.073548,45730.583967,dTOJVgcd44Xv1mkV949Tyyyx_WO6HOsCNSFEP-MP3HU=,OSS_DIRECTORY,...,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
combined_df.to_csv(f'data/{today}_gitcoin_oss_elo_ratings_with_metrics.csv')