In [7]:
from google.cloud import bigquery
import os
import pandas as pd

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../gcp_credentials.json'
client = bigquery.Client()

In [2]:
query = """
WITH actions AS (
  SELECT
    project_id,
    from_name,
    MIN(time) AS first_contribution,
    MAX(time) AS last_contribution,
    COUNT(DISTINCT time) AS num_days,
    SUM(amount) AS total_amount
  FROM `opensource-observer.oso.int_events_to_project`
  WHERE
    to_namespace = 'GITHUB'
    AND event_type IN (
      'COMMIT_CODE',
      'PULL_REQUEST_OPENED',
      'ISSUE_OPENED'
    )
  GROUP BY 1,2
)
SELECT
  p.project_id,
  p.project_name,
  a.from_name,
  a.first_contribution,
  a.total_amount
FROM actions AS a
JOIN `opensource-observer.oso.projects` AS p ON a.project_id = p.project_id
WHERE
  DATE(a.last_contribution) >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR)
  AND a.num_days >= 10
"""
result = client.query(query)

In [3]:
df = result.to_dataframe()

In [4]:
# remove bots
df = df[df['from_name'].str.contains('[bot]', regex=False)==False]
df = df[df['from_name'].str.contains('-bot',  regex=False)==False]

# remove single project contributors
contrib_counts = df.groupby('from_name')['project_id'].nunique()
repeat_contribs = list(contrib_counts[contrib_counts>1].index)
df = df[df['from_name'].isin(repeat_contribs)]

len(df)

2644

In [5]:
df['first_ecosystem_contribution'] = df.groupby('from_name')['first_contribution'].transform('min')
df['feeder_project'] = (df['first_contribution'] == df['first_ecosystem_contribution'])

dff = df[df['feeder_project']==True]
dff

Unnamed: 0,project_id,project_name,from_name,first_contribution,total_amount,first_ecosystem_contribution,feeder_project
18,Y-O2nnqzudGeu_SO3Z-iGC0SQ0r-hhqXxbv-af1MBds=,Fe,Y-Nak,2021-03-22 08:26:18+00:00,161.0,2021-03-22 08:26:18+00:00,True
20,Y-O2nnqzudGeu_SO3Z-iGC0SQ0r-hhqXxbv-af1MBds=,Fe,sbillig,2021-01-27 07:10:54+00:00,285.0,2021-01-27 07:10:54+00:00,True
22,Y-O2nnqzudGeu_SO3Z-iGC0SQ0r-hhqXxbv-af1MBds=,Fe,saifalkatout,2023-06-04 10:42:18+00:00,10.0,2023-06-04 10:42:18+00:00,True
24,7tn6nZfvnltUNZjqR8QpXkjGDo-pYJanf8CoCwWAHpc=,OP,pegahcarter,2023-10-14 22:21:50+00:00,11.0,2023-10-14 22:21:50+00:00,True
30,7tn6nZfvnltUNZjqR8QpXkjGDo-pYJanf8CoCwWAHpc=,OP,lucadonnoh,2022-01-16 17:18:53+00:00,12.0,2022-01-16 17:18:53+00:00,True
...,...,...,...,...,...,...,...
9460,2OpLN_X-kM90E4QdvJw6FLBnkFrM5JhAFbFGG8ct6tU=,hive - Ethereum end-to-end test harness,lightclient,2020-04-06 14:44:25+00:00,24.0,2020-04-06 14:44:25+00:00,True
9468,9CNpd9pl9JfxwHOBJVfuuHL9j6BbOg5lRbpiYYlok34=,Ethereum Execution Client Specifications,gurukamath,2021-12-18 17:44:39+00:00,220.0,2021-12-18 17:44:39+00:00,True
9504,oaYhCXaEQHBOrZXV5m76_gCofnNnTrMt2EIer2sGp_s=,Ethereum Proof-of-Stake Consensus Specifications,adiasg,2020-04-22 01:56:33+00:00,17.0,2020-04-22 01:56:33+00:00,True
9507,oaYhCXaEQHBOrZXV5m76_gCofnNnTrMt2EIer2sGp_s=,Ethereum Proof-of-Stake Consensus Specifications,dankrad,2019-03-06 13:52:28+00:00,117.0,2019-03-06 13:52:28+00:00,True


In [6]:
feeder_projects = (
    dff.groupby('project_name')
    .agg({
      'feeder_project': 'sum',
      'from_name': list
    })
    .rename(columns={'feeder_project': 'contributors_onboarded'})
    .sort_values(by='contributors_onboarded', ascending=False)
    .reset_index()
)
feeder_projects

Unnamed: 0,project_name,contributors_onboarded,from_name
0,Protocol Guild,235,"[nhsz, buddh0, pawanjay176, rootulp, vbuterin,..."
1,Chainlink,134,"[jinhoonbang, jkongie, bolekk, vreff, leeyikji..."
2,IPFS,34,"[cwaring, mishmosh, dennis-tra, dokterbob, ach..."
3,Synthetixio,33,"[duckception, 0xjocke, fritzschoff, JChiaramon..."
4,geth,32,"[aaronbuchwald, MariusVanDerWijden, roberto-ba..."
...,...,...,...
356,Lume Web,1,[pcfreak30]
357,LumeWeb,1,[pcfreak30]
358,LunCo,1,[Difint]
359,MAP Protocol,1,[neoiss]
