In [1]:
# standard dependencies
import numpy as np
import os
import pandas as pd
import sys
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# local package dependencies
sys.path.append(os.path.abspath("../../scripts/"))
sys.path.append(os.path.abspath("../../visualizations/"))

# for making direct query requests to the data warehouse
from oso_db import execute_query

In [4]:
# create a mapping of projects to collections
result = execute_query("""
    SELECT p."slug", c."slug" 
    FROM project p 
    JOIN collection_projects_project cpp ON p."id" = cpp."projectId"
    JOIN collection c ON cpp."collectionId" = c."id"
    WHERE c."typeId" = 1
""", col_names=False)
projects_to_collections = {
    k: (
        sorted([
            v for v, key
            in result
            if key == k
        ]) 
    )
    for k in set(key for _, key in result)
}

In [5]:
# create a list of collections (ecosystems)
collection_slugs = list(projects_to_collections.keys())

In [6]:
# create a list of projects
project_slugs = set([
    p for c,ps 
    in projects_to_collections.items() 
    if c in collection_slugs 
    for p in ps
])
len(project_slugs)

1215

In [7]:
# filter on the project slugs we're interested in
my_slugs = [
    p for p in
    projects_to_collections['octant-02']
    if p != 'protocol-guild'
]
len(my_slugs)

23

In [8]:
# create a string version of the projects list for use in querystrings
slugs_str = "','".join(my_slugs)

In [9]:
# get the names of those projects (up to 24 characters)
result = execute_query(f"""
    SELECT slug, name
    FROM project 
    WHERE slug IN ('{slugs_str}') 
""")
slugs_to_names = {x[0]: x[1][:24] for x in result}

In [10]:
# get data about active developers
query = execute_query(f"""
    SELECT
        p."slug",
        a."name" AS "repo",
        e."fromId",
        et."name",
        e."time",
        e."amount"
    FROM event e             
    JOIN project_artifacts_artifact paa ON e."toId" = paa."artifactId"            
    JOIN artifact a ON paa."artifactId" = a."id"
    JOIN project p ON paa."projectId" = p.id
    JOIN event_type et ON e."typeId" = et."id"
    WHERE
        e."typeId" IN (2,3,4,6,18,14,22)
        AND p.slug IN ('{slugs_str}')
""", col_names=True)

df = pd.DataFrame(query[1:], columns=query[0])
df.to_csv("octant-epoch-02.csv")
df.head()

Unnamed: 0,slug,repo,fromId,name,time,amount
0,tor-project,torproject/webwml,33943.0,COMMIT_CODE,2007-12-31 20:38:23+00:00,1.0
1,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 02:35:02+00:00,1.0
2,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 03:09:22+00:00,1.0
3,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 03:22:04+00:00,1.0
4,tor-project,torproject/webwml,150563.0,COMMIT_CODE,2008-01-01 04:17:09+00:00,1.0


In [11]:
date_thresh = '2023-07-01'

In [12]:
df['month'] = df['time'].apply(lambda x: f"{x.year}-{str(x.month).zfill(2)}")
df['day'] = df['time'].apply(lambda x: f"{x.year}-{str(x.month).zfill(2)}-{str(x.day).zfill(2)}")
df['years_ago'] = (df['time'].max() - df['time']).apply(lambda x: x.days / 365.25)

In [13]:
first_pr = (
    df.query("name == 'PULL_REQUEST_CREATED'")
    .groupby('slug')['years_ago']
    .max()
    .rename('First PR Created - Years Ago')
)

In [22]:
active_repos = (
    df.query("name != 'FORK_AGGREGATE_STATS' and name != 'STAR_AGGREGATE_STATS'")
    .query("day >= @date_thresh")
    .groupby('slug')['repo']
    .nunique()
    .rename('Active Repos - Last 6 Months')
)

In [23]:
stars = (
    df.query("name == 'STAR_AGGREGATE_STATS'")
    .groupby('slug')['amount']
    .max()
    .rename('Max Stars - Any Repo')
)
forks = (
    df.query("name == 'FORK_AGGREGATE_STATS'")
    .groupby('slug')['amount']
    .max()
    .rename('Max Forks - Any Repo')
)

In [24]:
repo_activity = (
    df.query("name != 'FORK_AGGREGATE_STATS' and name != 'STAR_AGGREGATE_STATS'")
    .query("day >= @date_thresh")
    .groupby(['slug', 'name'])['amount']
    .sum()
    .sort_values()
    .reset_index()
    .pivot_table(index=['slug'], columns='name', values='amount', fill_value=0)
)
repo_activity.columns = [c.replace("_"," ").title() + " - All Repos, Last 6 Months" for c in repo_activity.columns]

In [25]:
contribs = (
    df.query("name == 'COMMIT_CODE' or name == 'PULL_REQUEST_CREATED'")
    .groupby(['slug', 'fromId'])
    .agg({'amount': 'sum', 'time': ['min', 'max']})
)
contribs.columns = ['amount', 'first', 'last']
new_contribs = (
    contribs[(contribs['amount'] > 1) & (contribs['first'] >= date_thresh)]
    .reset_index()
    .groupby('slug')['fromId']
    .nunique()
    .rename('New Contributors - Last 6 Months')
)
lifetime_contribs = (
    contribs[contribs['amount'] > 1]
    .reset_index()
    .groupby('slug')['fromId']
    .nunique()
    .rename('Total Contributors - All Time')
)

In [26]:
devs = (
    df.query("name == 'COMMIT_CODE' or name == 'PULL_REQUEST_CREATED'")
    .query("day >= @date_thresh")
    .groupby(['slug', 'fromId', 'month'])['day']
    .nunique()
    .rename('activeDays')
    .reset_index()
)
devs['devType'] = devs['activeDays'].apply(lambda x: "Full-time" if x >=10 else "Part-time")
dev_types = (
    (
        devs
        .groupby(['slug', 'devType'])['fromId']
        .count() / 6        
    )
    .reset_index()
    .pivot_table(
        index='slug', 
        columns='devType', 
        values='fromId', 
        fill_value=0
    )
    .rename(columns={
        'Full-time': 'Full-time Developers - Avg Last 6 Months',
        'Part-time': 'Part-time Developers - Avg Last 6 Months'
    })
    .join(
        (
            devs
            .groupby(['slug', 'month'])['fromId']
            .nunique()
        )
        .reset_index()
        .drop(columns='month')
        .groupby('slug')['fromId'].mean()
        .rename('Total Developers - Avg Last 6 Months')
    )
)

In [27]:
results = pd.concat([
    first_pr, 
    active_repos, 
    stars, forks, 
    repo_activity, 
    new_contribs, 
    lifetime_contribs, 
    dev_types
], axis=1).fillna(0)

In [28]:
results

Unnamed: 0_level_0,First PR Created - Years Ago,Active Repos - Last 6 Months,Max Stars - Any Repo,Max Forks - Any Repo,"Commit Code - All Repos, Last 6 Months","Issue Closed - All Repos, Last 6 Months","Issue Created - All Repos, Last 6 Months","Pull Request Created - All Repos, Last 6 Months","Pull Request Merged - All Repos, Last 6 Months",New Contributors - Last 6 Months,Total Contributors - All Time,Full-time Developers - Avg Last 6 Months,Part-time Developers - Avg Last 6 Months,Total Developers - Avg Last 6 Months
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
brightid,5.724846,6.0,216.0,63.0,193.0,5.0,13.0,23.0,16.0,2.0,60,0.0,3.833333,3.833333
clrfund,3.937029,1.0,177.0,97.0,124.0,1.0,1.0,29.0,29.0,0.0,31,0.333333,1.666667,2.0
commonsbuild,3.189596,7.0,10.0,11.0,22.0,4.0,11.0,6.0,6.0,2.0,52,0.0,2.166667,1.857143
dao-drops-dorgtech,1.24846,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
drips-network,2.042437,4.0,55.0,14.0,3.0,1.0,0.0,3.0,3.0,0.0,9,0.0,0.833333,2.5
ethereum-cat-herders,8.240931,6.0,12264.0,5337.0,607.0,156.0,142.0,681.0,505.0,64.0,778,0.0,71.833333,61.571429
ethstaker,3.353867,5.0,381.0,160.0,532.0,34.0,35.0,400.0,388.0,15.0,115,1.166667,10.0,9.571429
gitcoin,6.324435,43.0,1751.0,881.0,2601.0,1447.0,1514.0,1409.0,1222.0,27.0,508,10.0,26.666667,31.428571
givepraise,2.08898,6.0,30.0,16.0,345.0,60.0,68.0,40.0,37.0,2.0,29,0.333333,2.5,2.833333
giveth,7.252567,30.0,652.0,305.0,3603.0,841.0,801.0,632.0,578.0,3.0,185,5.0,12.0,14.571429


In [29]:
results.sum(axis=0)

First PR Created - Years Ago                          88.388775
Active Repos - Last 6 Months                         206.000000
Max Stars - Any Repo                               23739.000000
Max Forks - Any Repo                                9089.000000
Commit Code - All Repos, Last 6 Months             16440.000000
Issue Closed - All Repos, Last 6 Months             3729.000000
Issue Created - All Repos, Last 6 Months            3966.000000
Pull Request Created - All Repos, Last 6 Months     7027.000000
Pull Request Merged - All Repos, Last 6 Months      6341.000000
New Contributors - Last 6 Months                     228.000000
Total Contributors - All Time                       2888.000000
Full-time Developers - Avg Last 6 Months              40.166667
Part-time Developers - Avg Last 6 Months             224.333333
Total Developers - Avg Last 6 Months                 229.738095
dtype: float64