In [50]:
import os
import pandas as pd
from google.cloud import bigquery
import plotly.express as px
import plotly.graph_objects as go

# Point the Google client libraries at a service-account key file.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already set in the environment
# is respected, so the notebook also runs on machines where the variable is
# configured outside this file. The path below is only a fallback and is
# specific to one machine -- replace it with the location of your own key.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/rohitmalekar/llm/oso/fleet-bongo-424111-b3-f6d34ced0b53.json',
)

# BigQuery client shared by every query cell below.
client = bigquery.Client()


# Find a project by its GitHub repository

In [51]:
# Look up projects through their GitHub artifacts: every GITHUB artifact whose
# namespace (exposed as github_owner) contains "opensource" is returned with
# its project id/name and its artifact name (exposed as github_repo).
query = """
  select
    project_id,
    project_name,
    artifact_namespace as github_owner,
    artifact_name as github_repo
  from `oso_production.artifacts_by_project_v1`
  where
    artifact_source = 'GITHUB'
    and artifact_namespace like '%opensource%'
"""

# Submit the SQL, then pull the finished result set into a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

# Last expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,project_id,project_name,github_owner,github_repo
0,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,sqlglot
1,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,oso
2,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,sqlmesh
3,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,insights
4,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,test-public-repo
5,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,oss-directory
6,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,ethglobal-sf-2024
7,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,private-ops
8,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,dagster-sqlmesh
9,Erx9J64anc8oSeN-wDKm0sojJf8ONrFVYbQ7GFnqSyc=,opensource-observer,opensource-observer,kariba


# Get a 6-month snapshot of code metrics for the project    

In [52]:
# Snapshot of code metrics for the 'opensource-observer' project: star and
# fork counts plus the *_6_months counters (commits, contributors, closed
# issues, new contributors, merged pull requests).
query = """
  select
    project_name,
    display_name,
    star_count,
    fork_count,
    commit_count_6_months,
    contributor_count_6_months,
    closed_issue_count_6_months,
    new_contributor_count_6_months,
    merged_pull_request_count_6_months
  from `oso_production.code_metrics_by_project_v1`
  where project_name = 'opensource-observer'
"""

# Submit the SQL, then pull the finished result set into a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

# Last expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,project_name,display_name,star_count,fork_count,commit_count_6_months,contributor_count_6_months,closed_issue_count_6_months,new_contributor_count_6_months,merged_pull_request_count_6_months
0,opensource-observer,Open Source Observer,214,186,1163.0,66.0,427.0,50.0,669.0


# Get historical metrics for the project

In [53]:
# Time series of the 'fulltime_developers' and 'parttime_developers' metrics
# for the 'opensource-observer' project. The timeseries table is joined to the
# metric catalogue (for metric names) and to the project table (for the project
# name filter); rows come back newest sample first.
query = """
  select
    tm.sample_date,
    m.metric_name,
    tm.amount
  from `oso_production.timeseries_metrics_by_project_v0` as tm
  join `oso_production.metrics_v0` as m
    on tm.metric_id = m.metric_id
  join `oso_production.projects_v1` as p
    on tm.project_id = p.project_id
  where p.project_name = 'opensource-observer'
  and m.metric_name in ('fulltime_developers','parttime_developers')
  order by sample_date desc
"""

# Submit the SQL, then pull the finished result set into a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

# Last expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,sample_date,metric_name,amount
0,2024-08-11 00:00:00+00:00,parttime_developers,7.0
1,2024-08-11 00:00:00+00:00,fulltime_developers,3.0
2,2024-08-10 00:00:00+00:00,parttime_developers,7.0
3,2024-08-10 00:00:00+00:00,fulltime_developers,3.0
4,2024-08-09 00:00:00+00:00,fulltime_developers,3.0
...,...,...,...
718,2023-08-01 00:00:00+00:00,parttime_developers,2.0
719,2023-07-31 00:00:00+00:00,parttime_developers,2.0
720,2023-07-30 00:00:00+00:00,parttime_developers,2.0
721,2023-07-29 00:00:00+00:00,parttime_developers,2.0


In [54]:
# Hover text shared by all traces: the sample date and the value to one
# decimal place. The empty <extra> tag hides Plotly's secondary hover box.
hover_text = (
    '<b>Date</b>: %{x}<br>'
    '<b>Value</b>: %{y:.1f}<br>'
    '<extra></extra>'
)

# Build one line trace per distinct metric, in order of first appearance.
metric_traces = []
for metric_label in df['metric_name'].unique():
    metric_rows = df.loc[df['metric_name'] == metric_label]
    metric_traces.append(
        go.Scatter(
            name=metric_label,
            mode='lines',
            x=metric_rows['sample_date'],
            y=metric_rows['amount'],
            hovertemplate=hover_text,
        )
    )

fig = go.Figure(data=metric_traces)

# Legend anchored by its top-left corner just outside the right edge of the plot.
legend_position = dict(yanchor="top", y=0.99, xanchor="left", x=1.05)

fig.update_layout(
    template='plotly_white',
    title='Metrics Trends Over Time',
    xaxis_title='Date',
    yaxis_title='Amount',
    hovermode='x unified',
    width=1000,
    height=600,
    showlegend=True,
    legend=legend_position,
)

# Render the chart in the notebook.
fig.show()

# Find all contributors to a project's GitHub repositories

In [55]:
# Every COMMIT_CODE event targeting an artifact of the 'opensource-observer'
# project. The event's target artifact supplies the repository name
# (github_repo) and its source artifact supplies the contributor name
# (code_contributor); rows come back newest event first.
query = """
  select
    te.time,
    a.artifact_name as code_contributor,
    abp.artifact_name as github_repo,
    te.event_type,
    te.amount
  from `oso_production.timeseries_events_by_artifact_v0` as te
  join `oso_production.artifacts_by_project_v1` as abp
    on te.to_artifact_id = abp.artifact_id
  join `oso_production.artifacts_v1` a
    on te.from_artifact_id = a.artifact_id
  where
    abp.project_name = 'opensource-observer'
    and te.event_type = 'COMMIT_CODE'
  order by te.time desc
"""

# Submit the SQL, then pull the finished result set into a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

# Last expression of the cell, so the notebook renders the frame.
df

Unnamed: 0,time,code_contributor,github_repo,event_type,amount
0,2025-02-01 22:34:39+00:00,ryscheng,oso,COMMIT_CODE,1.0
1,2025-02-01 20:21:04+00:00,ryscheng,oso,COMMIT_CODE,1.0
2,2025-02-01 20:14:09+00:00,ryscheng,oso,COMMIT_CODE,1.0
3,2025-02-01 06:07:43+00:00,icarog,oso,COMMIT_CODE,1.0
4,2025-02-01 05:36:24+00:00,icarog,oso,COMMIT_CODE,1.0
...,...,...,...,...,...
2683,2023-08-01 18:32:29+00:00,ryscheng,oso,COMMIT_CODE,1.0
2684,2023-07-30 00:57:25+00:00,ccerv1,oso,COMMIT_CODE,1.0
2685,2023-07-28 19:18:39+00:00,ccerv1,oso,COMMIT_CODE,1.0
2686,2023-07-28 01:59:13+00:00,ryscheng,oso,COMMIT_CODE,1.0


In [56]:
# Bucket every event into its calendar quarter. The event timestamps carry a
# timezone and pandas periods cannot, so the timezone is stripped explicitly
# (keeping the wall-clock values) rather than letting `to_period` drop it
# implicitly and emit a UserWarning.
event_times = pd.to_datetime(df['time'])
df['quarter'] = event_times.dt.tz_localize(None).dt.to_period('Q')

# Total contribution amount per contributor within each quarter.
quarterly_contrib = (
    df.groupby(['quarter', 'code_contributor'])['amount']
    .sum()
    .reset_index()
)

# Each contributor's share of the quarter's total, as a percentage.
# `transform('sum')` broadcasts the per-quarter total onto every row, which
# replaces a row-by-row `apply` lookup and also works when the result is empty.
quarter_total_per_row = quarterly_contrib.groupby('quarter')['amount'].transform('sum')
quarterly_contrib['percentage'] = (
    quarterly_contrib['amount'] / quarter_total_per_row
) * 100

# Period objects are not JSON serializable, so use their string form for plotting.
quarterly_contrib['quarter'] = quarterly_contrib['quarter'].astype(str)

# Quarters ascending; within a quarter the largest share comes first.
quarterly_contrib = quarterly_contrib.sort_values(
    ['quarter', 'percentage'], ascending=[True, False]
)

# Display names for the columns that appear on the axes, legend and hover.
axis_labels = {
    'quarter': 'Quarter',
    'percentage': 'Contribution Percentage',
    'code_contributor': 'Contributor',
}

# One bar per quarter, split into a coloured segment per contributor; each
# segment is also labelled with the contributor's name.
fig = px.bar(
    quarterly_contrib,
    x='quarter',
    y='percentage',
    color='code_contributor',
    text='code_contributor',
    labels=axis_labels,
    title='Quarterly Code Contributions by Developer (%)',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Legend anchored by its top-left corner just outside the right edge of the plot.
legend_position = dict(yanchor="top", y=0.99, xanchor="left", x=1.05)

# Y axis ticks rendered as one-decimal percentages, e.g. "25.0%".
percent_axis = dict(tickformat='.1f', ticksuffix='%')

fig.update_layout(
    template='plotly_white',
    barmode='stack',
    hovermode='x unified',
    width=1200,
    height=700,
    showlegend=True,
    legend=legend_position,
    yaxis=percent_axis,
)

# Render the chart in the notebook.
fig.show()


Converting to PeriodArray/Index representation will drop timezone information.



# Analyze funding history for a project (example: uniswap)

In [57]:
# Funding received by 'uniswap', totalled per funder and grant pool and
# ordered from the largest total down. The SQL already renders each total as
# a '$'-prefixed whole-dollar string.
query = """
  SELECT
    from_project_name as funder,
    grant_pool_name,
    CONCAT('$', FORMAT('%d', CAST(sum(amount) as INT64))) as total_amount
  FROM `oso_production.oss_funding_v0`
  WHERE to_project_name = 'uniswap'
  group by funder, grant_pool_name
  ORDER BY sum(amount) DESC
"""
df = client.query(query).to_dataframe()


def _with_thousands_separators(dollar_text):
    """Re-render a '$'-prefixed integer string with comma thousands separators."""
    whole_dollars = int(dollar_text.replace('$', '').replace(',', ''))
    return "${:,}".format(whole_dollars)


# The query returns plain digits after the '$'; add the separators in pandas.
df['total_amount'] = df['total_amount'].map(_with_thousands_separators)
df

Unnamed: 0,funder,grant_pool_name,total_amount
0,optimism,grants_season_1,"$1,000,000"
1,gitcoin,GG-03,"$4,119"
2,gitcoin,GG-04,"$4,048"
3,gitcoin,GG-02,"$3,715"
4,gitcoin,GG-01,"$2,783"
5,gitcoin,CGrants - Direct,"$2,107"
6,gitcoin,GG-05,$1


# Compare funding with development metrics for projects funded by Gitcoin (last 12 months)

In [58]:
# For every project funded by 'gitcoin' during the last 12 months, total the
# funding and count the distinct event sources (the project_funding CTE), then
# attach each project's code metrics by project name. Only projects with a
# positive funding total are kept, largest total first.
query = """
  WITH project_funding AS (
    SELECT
      to_project_name,
      SUM(amount) as total_funding,
      COUNT(DISTINCT event_source) as funding_sources
    FROM `oso_production.oss_funding_v0`
    where from_project_name = 'gitcoin'
    AND time >= TIMESTAMP(DATE_SUB(CURRENT_DATE(), INTERVAL 12 MONTH))
    GROUP BY to_project_name
  )
  SELECT
    f.to_project_name,
    f.total_funding,
    f.funding_sources,
    m.active_developer_count_6_months,
    m.commit_count_6_months,
    m.opened_issue_count_6_months,
    m.star_count,
    m.fork_count
  FROM project_funding f
  JOIN `oso_production.code_metrics_by_project_v1` m
    ON f.to_project_name = m.project_name
  WHERE f.total_funding > 0
  ORDER BY f.total_funding DESC
"""

# Submit the SQL, then pull the finished result set into a pandas DataFrame.
query_job = client.query(query)
df = query_job.to_dataframe()

# Imported once here (the original cell repeated both lines); ideally these
# live in the notebook's first import cell.
import plotly.graph_objects as go
import numpy as np

# Replace missing metric values before plotting:
#   - active developers -> 1, so the per-developer division below stays defined
#     and the project still gets a marker;
#   - commits -> 0;
#   - funding sources -> 1.
# NOTE(review): a developer count of exactly 0 (rather than NaN) would still
# divide by zero below -- confirm whether the metric can be 0.
df['active_developer_count_6_months'] = df['active_developer_count_6_months'].fillna(1)
df['commit_count_6_months'] = df['commit_count_6_months'].fillna(0)
df['funding_sources'] = df['funding_sources'].fillna(1)

developer_counts = df['active_developer_count_6_months']
commits_per_developer = df['commit_count_6_months'] / developer_counts

# Scale marker areas so the largest team maps to a marker about 40 px across.
# Fall back to a neutral scale when the query returned no rows (or no positive
# counts) instead of failing on an empty sequence / dividing by zero.
largest_team = developer_counts.max() if len(developer_counts) else 0
marker_sizeref = 2. * largest_team / (40. ** 2) if largest_team > 0 else 1

# Bubble chart: x = total funding, y = commits per active developer,
# bubble area = active developers, bubble colour = number of funding sources.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['total_funding'],
    y=commits_per_developer,
    mode='markers',
    marker=dict(
        size=developer_counts,
        sizemode='area',
        sizeref=marker_sizeref,
        sizemin=4,
        color=df['funding_sources'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Funding Sources')
    ),
    text=df['to_project_name'],
    # The y value is commits divided by active developers, so label it as such
    # (it was previously labelled as the raw commit count).
    hovertemplate=
    '<b>Project</b>: %{text}<br>' +
    '<b>Total Funding</b>: $%{x:,.0f}<br>' +
    '<b>Commits per Active Developer (6m)</b>: %{y:,.0f}<br>' +
    '<b>Active Developers</b>: %{marker.size:,.0f}<br>' +
    '<b>Funding Sources</b>: %{marker.color:,.0f}<br>' +
    '<extra></extra>'
))

# Update layout
fig.update_layout(
    title='Funding vs Development Activity by Project',
    xaxis=dict(
        title='Total Funding ($)',
        tickformat='$,.0f',
        # Add type='log' here if funding totals span several orders of magnitude.
        showgrid=True
    ),
    yaxis=dict(
        title='Commit Count per active developer (6 months)',
        tickformat=',.0f',
        # A log axis cannot show non-positive values, so projects with
        # 0 commits are not drawn.
        type='log',
        showgrid=True
    ),
    template='plotly_white',
    width=1000,
    height=600,
    showlegend=False,
    hovermode='closest'
)

# Show the plot
fig.show()