In [1]:
# standard dependencies
import numpy as np
import os
import pandas as pd
import sys
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# local package dependencies
sys.path.append(os.path.abspath("../../scripts/"))
sys.path.append(os.path.abspath("../../visualizations/"))

# for making direct query requests to the data warehouse
from oso_db import execute_query

In [3]:
# create a mapping of projects to collections
result = execute_query("""
    SELECT p."slug", c."slug" 
    FROM project p 
    JOIN collection_projects_project cpp ON p."id" = cpp."projectId"
    JOIN collection c ON cpp."collectionId" = c."id"
    WHERE c."typeId" = 1
""", col_names=False)
projects_to_collections = {
    k: (
        sorted([
            v for v, key
            in result
            if key == k
        ]) 
    )
    for k in set(key for _, key in result)
}

In [4]:
# create a list of collections (ecosystems)
collection_slugs = list(projects_to_collections.keys())

In [5]:
# create a list of projects
project_slugs = set([
    p for c,ps 
    in projects_to_collections.items() 
    if c in collection_slugs 
    for p in ps
])
len(project_slugs)

1214

In [6]:
# filter on the project slugs we're interested in
my_slugs = projects_to_collections['octant-02']

In [7]:
# create a string version of the projects list for use in querystrings
slugs_str = "','".join(my_slugs).replace("'onion-dao',","")
slugs_str

"brightid','clrfund','commonsbuild','dao-drops-dorgtech','drips-network','ethereum-cat-herders','ethstaker','gitcoin','givepraise','giveth','glo-foundation','hausdao','hypercerts','kernel-community','l2beat','metagame-metafam','opensource-observer','pairwise-general-magic','protocol-guild','revoke-cash','rotki','shutter-network','tor-project"

In [8]:
# get the names of those projects (up to 24 characters)
result = execute_query(f"""
    SELECT slug, name
    FROM project 
    WHERE slug IN ('{slugs_str}') 
""")
slugs_to_names = {x[0]: x[1][:25] for x in result}

In [9]:
# get data about active developers
query = execute_query(f"""
    SELECT
        p."slug",
        a."name" AS "repo",
        e."fromId",
        et."name",
        e."time",
        e."amount"
    FROM event e             
    JOIN project_artifacts_artifact paa ON e."toId" = paa."artifactId"            
    JOIN artifact a ON paa."artifactId" = a."id"
    JOIN project p ON paa."projectId" = p.id
    JOIN event_type et ON e."typeId" = et."id"
    WHERE
        e."typeId" IN (2,3,4,6,18,14,22)
        AND p.slug IN ('{slugs_str}')
""", col_names=True)

df = pd.DataFrame(query[1:], columns=query[0])
df.head()

Unnamed: 0,slug,repo,fromId,name,time,amount
0,tor-project,torproject/webwml,33943.0,COMMIT_CODE,2007-12-31 20:38:23+00:00,1.0
1,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 02:35:02+00:00,1.0
2,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 03:09:22+00:00,1.0
3,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 03:22:04+00:00,1.0
4,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 04:17:09+00:00,1.0


In [10]:
df['month'] = df['time'].apply(lambda x: f"{x.year}-{str(x.month).zfill(2)}")
df['day'] = df['time'].apply(lambda x: f"{x.year}-{str(x.month).zfill(2)}-{str(x.day).zfill(2)}")
df['years_ago'] = (df['time'].max() - df['time']).apply(lambda x: x.days / 365.25)

In [11]:
date_thresh = '2023-07-01'

In [12]:
first_pr = (
    df.query("name == 'PULL_REQUEST_CREATED'")
    .groupby('slug')['years_ago']
    .max()
    .apply(lambda x: round(x,1))
    .rename('First PR Created - Years Ago')
)

In [13]:
active_repos = (
    df.query("name != 'FORK_AGGREGATE_STATS' and name != 'STAR_AGGREGATE_STATS'")
    .query("day >= @date_thresh")
    .groupby('slug')['repo']
    .nunique()
    .rename('Active Repos - Last 6 Months')
)

In [14]:
stars = (
    df.query("name == 'STAR_AGGREGATE_STATS'")
    .groupby('slug')['amount']
    .max()
    .rename('Max Stars - Any Repo')
    .astype(int)
)
forks = (
    df.query("name == 'FORK_AGGREGATE_STATS'")
    .groupby('slug')['amount']
    .max()
    .rename('Max Forks - Any Repo')
    .astype(int)
)

In [15]:
repo_activity = (
    df.query("name != 'FORK_AGGREGATE_STATS' and name != 'STAR_AGGREGATE_STATS'")
    .query("day >= @date_thresh")
    .groupby(['slug', 'name'])['amount']
    .sum()
    .sort_values()
    .reset_index()
    .pivot_table(index=['slug'], columns='name', values='amount', fill_value=0)
    .astype(int)
)
repo_activity.columns = [c.replace("_"," ").title() + " - All Repos, Last 6 Months" for c in repo_activity.columns]

In [16]:
repo_activity

Unnamed: 0_level_0,"Commit Code - All Repos, Last 6 Months","Issue Closed - All Repos, Last 6 Months","Issue Created - All Repos, Last 6 Months","Pull Request Created - All Repos, Last 6 Months","Pull Request Merged - All Repos, Last 6 Months"
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
brightid,193,5,13,23,16
clrfund,148,1,1,29,29
commonsbuild,54,4,12,6,6
drips-network,690,26,23,40,54
ethereum-cat-herders,610,156,142,684,507
ethstaker,535,34,35,401,390
gitcoin,2715,1471,1521,1422,1234
givepraise,345,60,68,40,37
giveth,3611,848,807,635,583
glo-foundation,612,121,101,135,129


In [17]:
repo_activity.at['drips-network','Pull Request Created - All Repos, Last 6 Month'] = 262
repo_activity.at['drips-network','Pull Request Merged - All Repos, Last 6 Months'] = 227

repo_activity.at['drips-network','Issue Created - All Repos, Last 6 Months'] = 167
repo_activity.at['drips-network','Issue Closed - All Repos, Last 6 Months'] = 83

In [18]:
contribs = (
    df.query("name == 'COMMIT_CODE' or name == 'PULL_REQUEST_CREATED'")
    .groupby(['slug', 'fromId'])
    .agg({'amount': 'sum', 'time': ['min', 'max']})
)
contribs.columns = ['amount', 'first', 'last']
new_contribs = (
    contribs[(contribs['amount'] > 1) & (contribs['first'] >= date_thresh)]
    .reset_index()
    .groupby('slug')['fromId']
    .nunique()
    .astype(int)
    .rename('New Contributors - Last 6 Months')
)
lifetime_contribs = (
    contribs[contribs['amount'] > 1]
    .reset_index()
    .groupby('slug')['fromId']
    .nunique()
    .astype(int)
    .rename('Total Contributors - All Time')
)

In [19]:
devs = (
    df.query("name == 'COMMIT_CODE' or name == 'PULL_REQUEST_CREATED'")
    .query("day >= @date_thresh")
    .groupby(['slug', 'fromId', 'month'])['day']
    .nunique()
    .rename('activeDays')
    .reset_index()
)
devs['devType'] = devs['activeDays'].apply(lambda x: "Full-time" if x >=10 else "Part-time")
dev_types = (
    (
        devs
        .groupby(['slug', 'devType'])['fromId']
        .count() / 6        
    )
    .reset_index()
    .pivot_table(
        index='slug', 
        columns='devType', 
        values='fromId', 
        fill_value=0
    )
    .rename(columns={
        'Full-time': 'Full-time Developers - Avg Last 6 Months',
        'Part-time': 'Part-time Developers - Avg Last 6 Months'
    })
    .join(
        (
            devs
            .groupby(['slug', 'month'])['fromId']
            .nunique()
        )
        .reset_index()
        .drop(columns='month')
        .groupby('slug')['fromId'].mean()
        .rename('Total Developers - Avg Last 6 Months')
    )
)

In [20]:
results = pd.concat([
    first_pr, 
    active_repos, 
    stars, forks, 
    repo_activity, 
    new_contribs, 
    lifetime_contribs, 
    dev_types
], axis=1).fillna(0).applymap(lambda x: round(x,1)).rename(index=slugs_to_names)
results.to_csv("octant-epoch-02-summary.csv")
results

Unnamed: 0_level_0,First PR Created - Years Ago,Active Repos - Last 6 Months,Max Stars - Any Repo,Max Forks - Any Repo,"Commit Code - All Repos, Last 6 Months","Issue Closed - All Repos, Last 6 Months","Issue Created - All Repos, Last 6 Months","Pull Request Created - All Repos, Last 6 Months","Pull Request Merged - All Repos, Last 6 Months","Pull Request Created - All Repos, Last 6 Month",New Contributors - Last 6 Months,Total Contributors - All Time,Full-time Developers - Avg Last 6 Months,Part-time Developers - Avg Last 6 Months,Total Developers - Avg Last 6 Months
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
BrightID,5.7,6.0,217,63.0,193.0,5.0,13.0,23.0,16.0,0.0,2.0,60,0.0,3.8,3.8
clr.fund,3.9,2.0,177,99.0,148.0,1.0,1.0,29.0,29.0,0.0,0.0,31,0.3,1.7,2.0
Token Engineering Commons,3.2,8.0,10,11.0,54.0,4.0,12.0,6.0,6.0,0.0,3.0,53,0.2,2.5,2.3
DAO Drops,2.2,0.0,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0,0.0,0.0
Drips,2.5,8.0,55,15.0,690.0,83.0,167.0,40.0,227.0,262.0,4.0,24,1.3,6.0,6.3
Ethereum Cat Herders,8.2,6.0,12268,5354.0,610.0,156.0,142.0,684.0,507.0,0.0,64.0,778,0.0,72.2,61.9
EthStaker,3.4,5.0,384,160.0,535.0,34.0,35.0,401.0,390.0,0.0,15.0,115,1.3,10.0,9.7
Gitcoin,6.3,44.0,1751,881.0,2715.0,1471.0,1521.0,1422.0,1234.0,0.0,27.0,556,10.3,26.8,31.9
Praise,2.1,6.0,31,17.0,345.0,60.0,68.0,40.0,37.0,0.0,2.0,29,0.3,2.5,2.8
Giveth,7.3,30.0,652,305.0,3611.0,848.0,807.0,635.0,583.0,0.0,3.0,185,5.2,11.8,14.6


In [21]:
results.to_csv("octant-epoch-02-summary.csv", sep="|")

In [22]:
(
    results[['First PR Created - Years Ago',
         'Total Contributors - All Time',
         'Max Forks - Any Repo',
         'Max Stars - Any Repo']]
    .sort_values(by='First PR Created - Years Ago', ascending=False)
)#.to_csv("octant-epoch-02-table1.csv", sep='|')

Unnamed: 0_level_0,First PR Created - Years Ago,Total Contributors - All Time,Max Forks - Any Repo,Max Stars - Any Repo
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Tor Project,10.9,485,962.0,4224
Protocol Guild,10.1,4913,19811.0,44626
Ethereum Cat Herders,8.2,778,5354.0,12268
Giveth,7.3,185,305.0,652
Gitcoin,6.3,556,881.0,1751
rotki,5.9,137,479.0,2481
BrightID,5.7,60,63.0,217
Public HAUS,4.4,69,81.0,136
MetaGame,4.0,120,79.0,124
Revoke,4.0,26,214.0,593


In [25]:
(
    results[[
        'New Contributors - Last 6 Months',
        'Full-time Developers - Avg Last 6 Months',
        'Part-time Developers - Avg Last 6 Months',
        'Total Developers - Avg Last 6 Months'
    ]]
    .sort_values(by='Total Developers - Avg Last 6 Months', ascending=False)
)#.to_csv("octant-epoch-02-table2.csv", sep='|')

In [26]:
(
    results[[
        'Issue Created - All Repos, Last 6 Months',
        'Issue Closed - All Repos, Last 6 Months',
        'Pull Request Created - All Repos, Last 6 Months',
        'Pull Request Merged - All Repos, Last 6 Months',
        'Commit Code - All Repos, Last 6 Months'        
    ]]
    .sort_values(by='Commit Code - All Repos, Last 6 Months', ascending=False)
    .applymap(int)
)#.to_csv("octant-epoch-02-table3.csv", sep='|')

In [27]:
results.sum(axis=0)

First PR Created - Years Ago                          98.2
Active Repos - Last 6 Months                         322.0
Max Stars - Any Repo                               68392.0
Max Forks - Any Repo                               28929.0
Commit Code - All Repos, Last 6 Months             30060.0
Issue Closed - All Repos, Last 6 Months             5608.0
Issue Created - All Repos, Last 6 Months            5962.0
Pull Request Created - All Repos, Last 6 Months    12443.0
Pull Request Merged - All Repos, Last 6 Months     10817.0
Pull Request Created - All Repos, Last 6 Month       262.0
New Contributors - Last 6 Months                     556.0
Total Contributors - All Time                       7868.0
Full-time Developers - Avg Last 6 Months              65.1
Part-time Developers - Avg Last 6 Months             565.7
Total Developers - Avg Last 6 Months                 541.8
dtype: float64