<img src="https://img.plasmic.app/img-optimizer/v1/img/8cfd7fa09b158cef70ba62d6a068a4b5.png?q=75&f=webp"
     alt="Kariba logo"
     style="background-color: black; float: left; margin-right: 10px;" />
# Kariba Labs: Protocol Lab 2023 Ecosystem Analysis

### **Goals and Objectives**
#### Ecosystem-level (Collection)
* [ ] Describe ecosystem-wide dynamics in user contribution
* [ ] Describe ecosystem-wide dynamics in project impact
* [ ] Generate standardized metrics to benchmark users and projects
#### Project (Project/Artifact)
* [ ] Understand churn in projects on a monthly and weekly basis
* [ ] Create a distance metric for projects based on their distributions. Answer the question: which projects perform surprisingly better or worse than expected?
* [ ] Understand project summary stats: users, event-types, frequency
#### User-level
* [ ] Provide a metric of user's contribution to project and ecosystem
* [ ] Norm and baseline users' contributions based on project-level and ecosystem-level benchmarks
* [ ] Describe a user's project-specific and ecosystem-wide criticality: the different projects they contribute to and the importance of their contributions to each of those projects.



In [1]:
from datetime import datetime
import pandas as pd
import json
import math
import numpy as np
from database import KaribaDB
from Composer import *

In [2]:
# Instantiate KaribaDB (imported from the local `database` module). The cells
# below run SQL through this handle via `db.execute(...)` and `QueryComposer(conn=db)`.
db = KaribaDB()

## **Task 1**: The number of new users (for each of devs and dependent repos) + growth rate

## with **SQL** 

In [14]:
# SQL
# Base query: every column of int_events plus calendar parts extracted from `time`.
# `year` is the calendar year so that it pairs correctly with `month` and `day`.
# The ISO week-numbering year (which can differ from the calendar year for dates
# around 1 January) is exposed separately as `isoyear`, to be paired with `week`.
sample = """SELECT *, 
            extract(YEAR from time) as year, 
            extract(ISOYEAR from time) as isoyear, 
            extract(WEEK from time) as week, 
            extract(DAY from time) as day, 
            extract(MONTH from time) as month 
        FROM int_events"""

In [47]:
def compose(a, b, c, d):
    """Wrap a query as a sub-select and project/aggregate over it.

    Restored from the previously commented-out lambda because later cells in
    this notebook call `compose(...)` directly; without a definition they fail
    with NameError on a fresh kernel.
    NOTE(review): if `from Composer import *` also exports a `compose`, this
    definition shadows it — confirm they are equivalent.

    Parameters
    ----------
    a : str
        The inner SQL query to wrap.
    b : str
        Alias given to the wrapped sub-select.
    c : str
        Select list (columns / aggregate expressions) of the outer query.
    d : str
        Trailing clause(s) such as GROUP BY / ORDER BY; may be ''.

    Returns
    -------
    str
        ``SELECT <c> FROM (<a>) AS <b> <d>``
    """
    return """SELECT {} FROM ({}) AS {} {}""".format(c, a, b, d)


# Ordered list of Query stages; later cells append to it and do_queries() drains it.
queries = []


def do_queries(queries, composer):
    """Feed every queued query to `composer.compose`, in list order.

    Parameters
    ----------
    queries : iterable
        Query objects to compose, first stage first.
    composer : object
        Anything exposing a `compose(query)` method.

    Returns
    -------
    None
    """
    for query in queries:
        composer.compose(query)

In [48]:
# Root query over int_events: all columns plus calendar parts extracted from `time`.
# `year` is the calendar year so it pairs correctly with `month`/`day`; the ISO
# week-numbering year is exposed separately as `isoyear` to pair with `week`.
q = Query(
    _aggregator="*, extract(YEAR from time) as year, extract(ISOYEAR from time) as isoyear, extract(WEEK from time) as week, extract(DAY from time) as day, extract(MONTH from time) as month", 
    name="int_events", 
    grouping='',
    args = {})

# `q` is handed to the composer as its starting query, so it is not appended to `queries`.
composer = QueryComposer(query=q,conn=db)

In [49]:
# Stage q2: one row per (to_name, year, month) with that month's distinct
# from_name values joined into a ', '-separated `contributors` string.
q2 = Query(
    _aggregator = """to_name, year, month, string_agg(distinct(from_name), ', ') AS contributors""",
    name = "q2",
    args = {},
    grouping = 'GROUP BY to_name, year, month ORDER BY to_name, year, month')

#composer.compose(q2)
# Queued instead of composed immediately; do_queries() composes the whole list later.
# NOTE(review): re-running this cell appends q2 a second time, because `queries`
# is only reset by the cell that defines it.
queries.append(q2)

In [50]:
# Stage q3: add the previous row's month and contributor list for the same project
# as `lag_month` / `lag_contributors`.
# The windows are ordered by (year, month) explicitly: without ORDER BY inside OVER,
# the row order within a partition is unspecified, so LAG would not be guaranteed to
# return the chronologically previous month.
q3 = Query(
    _aggregator = """*, LAG(month, 1) OVER (PARTITION BY to_name ORDER BY year, month) lag_month, LAG(contributors, 1) OVER (PARTITION BY to_name ORDER BY year, month) lag_contributors""",
    name = "q3",
    args = {},
    grouping = '')

queries.append(q3)

In [51]:
# Stage q4: add `cum_contributors`, a running string_agg of `contributors` within
# each (to_name, year, month) partition.
# NOTE(review): the partition key includes `month`, so this does not accumulate
# across months; the next stage (q5) also does not select `cum_contributors`.
# Confirm whether this stage is still needed.
q4 = Query(
    _aggregator = """*, string_agg(contributors, ', ') OVER (PARTITION BY to_name, year, month ORDER BY month ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_contributors""",
    name = "q4",
    args = {},
    grouping = '')

queries.append(q4)

In [52]:
# Stage q5: split the ', '-joined `contributors` string back into one row per
# contributor, keeping the project, year, month and the (still joined) lag_contributors.
q5 = Query(
    _aggregator = """to_name, 
                    year, 
                    month, 
                    unnest(string_to_array(contributors, ', ')) as contributors, 
                    lag_contributors""",
    name = "q5",
    args = {},
    grouping = '')

queries.append(q5)

In [53]:
# Stage q6: flag each (unnested) contributor as gained (1) when they do not appear
# in the previous month's ', '-joined lag_contributors list, else 0.
# Both sides are wrapped in ', ' delimiters so that only whole names match; a plain
# strpos() would treat e.g. 'bob' as present when the list contains 'bobby'.
# A NULL lag_contributors still yields NULL from strpos and therefore falls to ELSE 1.
# Presumably Query substitutes `args` into the {first}/{second}/{name} placeholders
# — confirm in the Composer module.
q6 = Query(
    _aggregator = """*, (CASE WHEN strpos(', ' || {first} || ', ', ', ' || {second} || ', ') > 0 THEN 0 ELSE 1 END) as {name}""",
    name = "q6",
    args = {"first" : "lag_contributors", "second" : "contributors", "name" : "gained"},
    grouping = '')

queries.append(q6)

In [54]:
# Stage q7: collapse the per-contributor rows back to one row per
# (to_name, year, lag_contributors, month): sum the `gained` flags (0 when NULL),
# re-join the contributor names with ', ', and replace a NULL lag_contributors with ''.
q7 = Query(
    _aggregator = """to_name, year, month, coalesce(lag_contributors, '') AS lag_contributors, coalesce(sum(gained), 0) AS gained, string_agg(contributors, ', ') AS contributors""",
    name = "q7",
    args = {},
    grouping = """GROUP BY to_name, 
                                year, 
                                lag_contributors, 
                                month 
                       ORDER BY to_name, 
                                year, 
                                month""")

queries.append(q7)

In [55]:
# Stage q8: split the ', '-joined `lag_contributors` string into one row per
# previous-month contributor, carrying along the joined `contributors` and `gained`.
# NOTE(review): rows whose lag_contributors was coalesced to '' upstream may produce
# no rows after unnest — confirm whether first-month rows are meant to be dropped.
q8 = Query(
    _aggregator ="""to_name, year, month, unnest(string_to_array(lag_contributors, ', ')) as lag_contributors, contributors, gained""",
    name = "q8",
    args = {},
    grouping = '')

queries.append(q8)

In [56]:
# Stage q9: flag each (unnested) previous-month contributor as lost (1) when they do
# not appear in the current month's ', '-joined `contributors` list, else 0.
# Both sides are wrapped in ', ' delimiters so that only whole names match; a plain
# strpos() would treat e.g. 'bob' as present when the list contains 'bobby'.
# Presumably Query substitutes `args` into the {first}/{second}/{name} placeholders
# — confirm in the Composer module.
q9 = Query(
    _aggregator ="""*,(CASE WHEN strpos(', ' || {first} || ', ', ', ' || {second} || ', ') > 0 THEN 0 ELSE 1 END) as {name}""",
    name = "q9",
    args = {"first" : "contributors", "second" : "lag_contributors", "name" : "lost"},
    grouping = '')

queries.append(q9)

In [57]:
# Stage q10: regroup by (to_name, year, lag_contributors, month), summing `gained`
# and re-joining `contributors` with ', '.
# NOTE(review): the aggregator and grouping text here are identical to q7, and the
# string-built pipeline later in this notebook has no equivalent of a second
# regroup/unnest/lost pass — confirm whether q10–q12 are intentional or a duplicate.
q10 = Query(
    _aggregator = """to_name, year, month, coalesce(lag_contributors, '') AS lag_contributors, coalesce(sum(gained), 0) AS gained, string_agg(contributors, ', ') AS contributors""",
    name = "q10",
    args = {},
    grouping = """GROUP BY to_name, 
                                year, 
                                lag_contributors, 
                                month 
                       ORDER BY to_name, 
                                year, 
                                month""")

queries.append(q10)

In [58]:
# Stage q11: split the ', '-joined `lag_contributors` string into one row per
# previous-month contributor, carrying along `contributors` and `gained`.
# NOTE(review): q11 repeats the aggregator text of q8 and q12 repeats q9 — confirm
# whether this second pass is intentional.
q11 = Query(
    _aggregator = """to_name, year, month, unnest(string_to_array(lag_contributors, ', ')) as lag_contributors, contributors, gained""",
    name = "q11",
    args = {},
    grouping = "")

queries.append(q11)

# Stage q12: flag each previous-month contributor as lost (1) when they are absent
# from the current ', '-joined `contributors` list, else 0.
# Both sides are wrapped in ', ' delimiters so that only whole names match; a plain
# strpos() would treat e.g. 'bob' as present when the list contains 'bobby'.
q12 = Query(
    _aggregator = """*, (CASE WHEN strpos(', ' || {first} || ', ', ', ' || {second} || ', ') > 0 THEN 0 ELSE 1 END) as {name}""",
    name = "q12",
    args = {"first" : "contributors", "second" : "lag_contributors", "name" : "lost"},
    grouping = '')

queries.append(q12)

In [59]:
# Stage q13: per (to_name, year, gained, contributors, month) group, produce
#   cum_contributors       - running string_agg of `contributors` across months
#                            within (to_name, year)
#   contributors           - the joined list, '' when NULL
#   contributors_count     - number of ','-separated entries in `contributors`
#   lost                   - sum of the per-row lost flags (0 when NULL)
#   gained                 - max(gained) within the group
#   lag_contributors       - distinct previous-month names re-joined with ', '
#   lag_contributors_count - count of non-NULL lag_contributors rows
q13 = Query(
    _aggregator = """to_name, year, month, string_agg(contributors, ', ') OVER (PARTITION BY to_name, year ORDER BY month ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_contributors, coalesce(contributors, '') as contributors, array_length(string_to_array(contributors, ','), 1) as contributors_count, coalesce(sum(lost), 0) as lost, max(gained) as gained, string_agg(distinct(lag_contributors), ', ') as lag_contributors, count(lag_contributors) as lag_contributors_count""",
    name = "q13",
    args = {},
    grouping = "GROUP BY to_name, year, gained, contributors, month order by to_name, year, month")

queries.append(q13)


In [60]:
# Aggregation 8: Remove duplicated contributors and regroup (unnest, de-duplicate, re-nest)

# Stage q14: unnest the running `cum_contributors` string into one row per name so
# that duplicates can be removed by the DISTINCT aggregates in q15.
q14 = Query(
    _aggregator = """to_name, year, contributors, contributors_count, lag_contributors, lost, gained, month, unnest(string_to_array(cum_contributors, ', ')) as cum_contributors, lag_contributors_count""",
    name = "q14",
    args = {},
    grouping = "")

queries.append(q14)



# Stage q15: regroup to one row per project/month; `cum_contributors_count` is the
# number of distinct cumulative names and `cum_contributors` is those distinct names
# re-joined with ', '. The remaining columns are carried through with max().
q15 = Query(
    _aggregator = """to_name, year, month, max(contributors) as contributors, max(lost) as lost, max(gained) as gained, count(distinct(cum_contributors)) as cum_contributors_count, max(lag_contributors) as lag_contributors, string_agg(distinct(cum_contributors), ', ') as cum_contributors, max(lag_contributors_count) as lag_contributors_count, max(contributors_count) as contributors_count""",
    name = "q15",
    args = {},
    grouping = "GROUP BY to_name, year, contributors, lag_contributors, contributors_count, lag_contributors_count, month order by to_name, year, month""")

queries.append(q15)


In [None]:


def churn_agg(l):
    """Return the select list for the final churn calculation.

    The list carries the per-project/month columns through and adds two
    percentages:
      churn_prior = lost / cum_contributors_count * 100
      churn_total = (cum_contributors_count - contributors_count)
                    / cum_contributors_count * 100

    `l` is accepted so the call shape matches the other aggregator builders,
    but the text contains no label placeholder, so the result does not depend on it.
    """
    select_list = (
        "to_name, year, month, lost, gained, cum_contributors, cum_contributors_count, \n"
        "    lag_contributors, lag_contributors_count,\n"
        "    contributors, contributors_count,\n"
        "(CAST(lost as float)/cum_contributors_count)*100 as churn_prior, ((CAST(cum_contributors_count as float) - CAST(contributors_count as float))/CAST(cum_contributors_count as float))*100 as churn_total"
    )
    return select_list


In [61]:
# Start a fresh composer from the root query `q`, feed it every queued stage in
# list order, then execute the composed query (the result is the cell output).
composer = QueryComposer(query=q,conn=db)
do_queries(queries, composer)
composer.execute()

Unnamed: 0,to_name,year,month,contributors,lost,gained,cum_contributors_count,lag_contributors,cum_contributors,lag_contributors_count,contributors_count
0,alto-io/ora-unity-renderer,2023.0,5.0,kevin-altitude,0,0.0,1,kevin-altitude,kevin-altitude,1,1
1,celestiaorg/.github,2023.0,8.0,rootulp,0,0.0,1,rootulp,rootulp,1,1
2,celestiaorg/.github,2023.0,10.0,rootulp,0,0.0,1,rootulp,rootulp,1,1
3,celestiaorg/.github,2023.0,11.0,rootulp,0,0.0,1,rootulp,rootulp,1,1
4,celestiaorg/blobstream-contracts,2023.0,9.0,"onurakpolat, rootulp",0,1.0,2,rootulp,"onurakpolat, rootulp",1,2
...,...,...,...,...,...,...,...,...,...,...,...
1852,zondax/ledger-statemine,2023.0,9.0,bee344,0,0.0,1,bee344,bee344,1,1
1853,zondax/ledger-zcash,2023.0,5.0,idatucker,0,0.0,1,idatucker,idatucker,1,1
1854,zondax/ledger-zcash,2023.0,12.0,ainhoa-a,1,1.0,2,idatucker,"ainhoa-a, idatucker",1,1
1855,zondax/ledger-zcash-rs,2023.0,11.0,idatucker,0,0.0,1,idatucker,idatucker,1,1


In [16]:
# Aggregators 1: Grouping and aggregating the contributors
# One row per (to_name, year, month) with that month's distinct from_name values
# joined into a ', '-separated `contributors` string.
# NOTE(review): `compose` must be defined in the kernel before this cell runs.
grouped = """GROUP BY to_name, year, month ORDER BY to_name, year, month"""
aggregators = lambda l: """to_name, year, month, string_agg(distinct({label}.from_name), ', ') AS contributors""".format(label=l)
grouping = compose(sample, "sample", aggregators("sample"), grouped)

# Aggregators 2: Lagging the contributors and the month
# The windows are ordered by (year, month) explicitly: without ORDER BY inside OVER,
# the row order within a partition is unspecified, so LAG would not be guaranteed to
# return the chronologically previous month.
lagged_aggregator = lambda l: """{label}.*, LAG({label}.month, 1) OVER (PARTITION BY {label}.to_name ORDER BY {label}.year, {label}.month) lag_month, LAG({label}.contributors, 1) OVER (PARTITION BY {label}.to_name ORDER BY {label}.year, {label}.month) lag_contributors""".format(label=l)
# From here on `grouped` holds the composed lagged query, not the GROUP BY clause above.
grouped = compose(grouping, "grouping", lagged_aggregator("grouping"),  "")

In [13]:
# Aggregation 3: Aggregating the contributors over the partitions
# NOTE(review): the partition key includes `month`, so this running string_agg does
# not accumulate across months — confirm this is intended.
partitioned_agg = lambda l: """{label}.*, string_agg({label}.contributors, ', ') OVER (PARTITION BY {label}.to_name, {label}.year, {label}.month ORDER BY {label}.month ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_contributors""".format(label=l)
cum_grouped = compose(grouped, "grouped", partitioned_agg("grouped"),  "")

# Calculating the users gained and lost
# Aggregation 4: unnest `contributors` so each contributor gets its own row
# (the gained flag itself is computed from these rows in the next step).
agg = lambda l: """{label}.to_name, 
                    {label}.year, 
                    {label}.month, 
                    unnest(string_to_array({label}.contributors, ', ')) as contributors, 
                    {label}.lag_contributors""".format(label=l)

unnested_group = compose(cum_grouped, "cum_grouped", agg("cum_grouped"),  "")

def case(l, f, s, n):
    """Build a select list that adds a 0/1 membership flag column.

    The flag is 0 when the single name in column `s` occurs in the ', '-joined
    list in column `f`, and 1 otherwise (including when either side is NULL,
    because the comparison is then NULL and the CASE falls to ELSE).

    Both sides are wrapped in ', ' delimiters before strpos() so that only whole
    names match; a bare strpos() would report 'bob' as present in 'alice, bobby'.

    Parameters
    ----------
    l : str
        Alias of the sub-select the columns are read from.
    f : str
        Column holding the ', '-joined list to search in.
    s : str
        Column holding the single name to search for.
    n : str
        Name of the resulting flag column.

    Returns
    -------
    str
        ``<l>.*, (CASE WHEN ... END) as <n>``
    """
    return """{label}.*, 
                                (CASE WHEN strpos(', ' || {label}.{first} || ', ', ', ' || {label}.{second} || ', ') > 0 THEN 0 ELSE 1 END) as {name}""".format(label=l, first=f, second=s, name=n)

# Add the `gained` flag: 1 when the unnested contributor is not found in
# lag_contributors, 0 otherwise.
data_with_gained = compose(unnested_group, "unnested_group", case("unnested_group", "lag_contributors", "contributors", "gained"), "")
# TODO: go through and replace all numbers greater than 1 with 1

# Aggregation 5: collapse the previously unnested "contributors" (lag_contributors
# is unnested in the next step)
# `regroup` and `regroupinf_agg` accept a label for call-shape consistency, but
# neither template contains a {label} placeholder, so the label is unused.
regroup = lambda l: """GROUP BY to_name, 
                                year, 
                                lag_contributors, 
                                month 
                       ORDER BY to_name, 
                                year, 
                                month""".format(label=l)

# Sums the gained flags (0 when NULL), re-joins contributor names with ', ' and
# replaces a NULL lag_contributors with ''.
regroupinf_agg = lambda l: """to_name, year, month, coalesce(lag_contributors, '') AS lag_contributors, coalesce(sum(gained), 0) AS gained, string_agg(contributors, ', ') AS contributors""".format(label=l)
regrouped = compose(data_with_gained , "data_with_gained", regroupinf_agg("data_with_gained"), regroup("data_with_gained"))

# Aggregation 6: unnest lag_contributors and compute the `lost` flag
second_agg = lambda l: """{label}.to_name, {label}.year, {label}.month, unnest(string_to_array({label}.lag_contributors, ', ')) as lag_contributors, {label}.contributors, {label}.gained""".format(label=l)
second_unnested_group = compose(regrouped, "regrouped", second_agg("regrouped"),  "")
# `lost` is 1 when the unnested previous-month contributor is not found in the
# current `contributors` list, 0 otherwise.
data_with_lost = compose(second_unnested_group, "second_unnested_group", case("second_unnested_group", "contributors", "lag_contributors", "lost"), "")

# Aggregation 7: collapse the previously unnested "lag_contributors" and build the
# running (cumulative) contributor list per project and year.
# `second_regroup` ignores its label: the template has no {label} placeholder.
second_regroup = lambda l: """GROUP BY to_name, year, gained, contributors, month order by to_name, year, month""".format(label=l)
# Adds cum_contributors (running string_agg over months within to_name/year),
# contributors_count (number of ','-separated entries), summed `lost`, max `gained`,
# the distinct lag names re-joined with ', ' and their row count.
second_regroupinf_agg = lambda l: """to_name, year, month, string_agg({label}.contributors, ', ') OVER (PARTITION BY {label}.to_name, {label}.year ORDER BY {label}.month ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_contributors, coalesce(contributors, '') as contributors, array_length(string_to_array(contributors, ','), 1) as contributors_count, coalesce(sum(lost), 0) as lost, max(gained) as gained, string_agg(distinct(lag_contributors), ', ') as lag_contributors, count(lag_contributors) as lag_contributors_count""".format(label=l)
second_regrouped = compose(data_with_lost , "data_with_lost", second_regroupinf_agg("data_with_lost"), second_regroup("data_with_lost"))

# Aggregation 8: Remove duplicated contributors and regroup (unnest, de-duplicate, re-nest)
# Step 1: unnest the running cum_contributors string into one row per name.
normalize_agg = lambda l : """{label}.to_name, {label}.year, {label}.contributors, {label}.contributors_count, {label}.lag_contributors, {label}.lost, {label}.gained, {label}.month, unnest(string_to_array({label}.cum_contributors, ', ')) as cum_contributors, {label}.lag_contributors_count""".format(label=l)
normalized = compose(second_regrouped, "second_regrouped", normalize_agg("second_regrouped"), '')

# Step 2: regroup to one row per project/month; DISTINCT removes repeated names, so
# cum_contributors_count is the number of distinct cumulative contributors.
grouped_norm = lambda l: """GROUP BY to_name, year, contributors, lag_contributors, contributors_count, lag_contributors_count, month order by to_name, year, month""".format(label=l)
group_normalized_agg = lambda l : """to_name, year, month, max(contributors) as contributors, max(lost) as lost, max(gained) as gained, count(distinct({label}.cum_contributors)) as cum_contributors_count, max(lag_contributors) as lag_contributors, string_agg(distinct({label}.cum_contributors), ', ') as cum_contributors, max(lag_contributors_count) as lag_contributors_count, max(contributors_count) as contributors_count""".format(label=l)
group_normalized = compose(normalized, "normalized", group_normalized_agg("normalized"), grouped_norm("normalized"))

# Final select list: carries the per-project/month columns through and adds
#   churn_prior = lost / cum_contributors_count * 100
#   churn_total = (cum_contributors_count - contributors_count) / cum_contributors_count * 100
# The label argument is unused (no {label} placeholder in the template).
# NOTE(review): the per-project averages shown further down reach 200.0 for
# churn_prior, so `lost` can exceed cum_contributors_count — confirm the intended
# denominator.
churn_agg = lambda l : """to_name, year, month, lost, gained, cum_contributors, cum_contributors_count, 
    lag_contributors, lag_contributors_count,
    contributors, contributors_count,
(CAST(lost as float)/cum_contributors_count)*100 as churn_prior, ((CAST(cum_contributors_count as float) - CAST(contributors_count as float))/CAST(cum_contributors_count as float))*100 as churn_total""".format(label=l)

# Full nested SQL for the churn table, built from all stages above.
churn_calculation = compose(group_normalized, "group_normalized", churn_agg("group_normalized"), "")

In [14]:
# Run the composed churn SQL; the result is used below as a pandas DataFrame.
churn = db.execute(churn_calculation)

In [35]:
# Display the per-project, per-month churn table.
churn

Unnamed: 0,to_name,year,month,lost,gained,cum_contributors,cum_contributors_count,lag_contributors,lag_contributors_count,contributors,contributors_count,churn_prior,churn_total
0,alto-io/ora-unity-renderer,2023.0,5.0,0,0,kevin-altitude,1,kevin-altitude,1,kevin-altitude,1,0.0,0.0
1,celestiaorg/.github,2023.0,8.0,0,0,rootulp,1,rootulp,1,rootulp,1,0.0,0.0
2,celestiaorg/.github,2023.0,10.0,0,0,rootulp,1,rootulp,1,rootulp,1,0.0,0.0
3,celestiaorg/.github,2023.0,11.0,0,0,rootulp,1,rootulp,1,rootulp,1,0.0,0.0
4,celestiaorg/blobstream-contracts,2023.0,9.0,0,1,"onurakpolat, rootulp",2,rootulp,1,"onurakpolat, rootulp",2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852,zondax/ledger-statemine,2023.0,9.0,0,0,bee344,1,bee344,1,bee344,1,0.0,0.0
1853,zondax/ledger-zcash,2023.0,5.0,0,0,idatucker,1,idatucker,1,idatucker,1,0.0,0.0
1854,zondax/ledger-zcash,2023.0,12.0,1,1,"ainhoa-a, idatucker",2,idatucker,1,ainhoa-a,1,50.0,50.0
1855,zondax/ledger-zcash-rs,2023.0,11.0,0,0,idatucker,1,idatucker,1,idatucker,1,0.0,0.0


### Project Churn (All Month)

In [36]:
# Keep the churn columns of interest, highest churn_total first.
metrics = churn[['to_name', 'month', 'year', 'contributors', 'churn_prior', 'churn_total', 'cum_contributors_count']].sort_values(by=['churn_total'], ascending=False)
# Do in SQL
# Split "org/project" into two columns.
metrics[['org', 'project']] = metrics['to_name'].str.split("/", expand=True)
# Average both churn measures per project and year, highest churn_prior first.
# Aggregations are named by string: passing the NumPy callables (np.mean) to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
metrics.groupby(["to_name", "year"]).agg({"churn_total" : "mean", "churn_prior" : "mean"}).sort_values(by=['churn_prior'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,churn_total,churn_prior
to_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1
powerloom/deploy,2023.0,0.0,200.0
filecoin-project/fvm-starter-kit-deal-making,2023.0,0.0,200.0
zama-ai/fhe-tutorials,2023.0,0.0,200.0
ipfs/someguy,2023.0,0.0,200.0
filecoin-project/test-vectors,2023.0,0.0,200.0
...,...,...,...
ipfs/js-ipfs-utils,2023.0,0.0,0.0
celestiaorg/.github,2023.0,0.0,0.0
ipfs/js-ipfs-repo,2023.0,0.0,0.0
ipfs/js-hamt-sharding,2023.0,0.0,0.0


In [37]:
# Rebuild `metrics` from `churn` without the `contributors` column, highest
# churn_total first, then split "org/project" into separate columns.
metrics = churn[['to_name', 'month', 'year', 'churn_prior', 'churn_total', 'cum_contributors_count']].sort_values(by=['churn_total'], ascending=False)
metrics[['org', 'project']] = metrics['to_name'].str.split("/", expand=True)

In [39]:
# Per project and year: mean of both churn measures and the largest cumulative
# contributor count seen in any month.
# Aggregations are named by string: passing the NumPy callables (np.mean, np.max)
# to groupby.agg is deprecated in recent pandas and emits a FutureWarning.
metrics_by_project = metrics.groupby(["to_name", 'year']).agg({"churn_prior": "mean", "churn_total": "mean", "cum_contributors_count": "max"}).sort_values(by=['churn_prior'], ascending=False)
# Displayed ordered by churn_total; `metrics_by_project` itself stays sorted by churn_prior.
metrics_by_project.sort_values(by=['churn_total'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,churn_prior,churn_total,cum_contributors_count
to_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
filecoin-project/devgrants,2023.0,32.649410,76.370198,38
ipfs/ipfs-desktop,2023.0,19.920905,75.156790,25
filecoin-project/filecoin-plus-large-datasets,2023.0,18.821407,72.246066,56
zama-ai/tfhe-rs,2023.0,17.985936,72.216085,17
ipfs/ipfs-docs,2023.0,30.550234,69.973393,23
...,...,...,...,...
zama-ai/fhe-biometrics,2023.0,0.000000,0.000000,1
zama-ai/evmos,2023.0,0.000000,0.000000,1
zama-ai/concrete-ntt,2023.0,0.000000,0.000000,1
zama-ai/awesome-zama,2023.0,0.000000,0.000000,1


In [33]:
# Display the per-project metrics ordered by churn_total.
# NOTE(review): this cell's execution count (33) is lower than that of the cell
# defining metrics_by_project (39), and its output is indexed by to_name only, so
# the stored output appears to predate the current definition — re-run to refresh.
metrics_by_project.sort_values(by=['churn_total'], ascending=False)

Unnamed: 0_level_0,churn_prior,churn_total,cum_contributors_count
to_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
filecoin-project/devgrants,32.649410,76.370198,38
ipfs/ipfs-desktop,19.920905,75.156790,25
filecoin-project/filecoin-plus-large-datasets,18.821407,72.246066,56
zama-ai/tfhe-rs,17.985936,72.216085,17
ipfs/ipfs-docs,30.550234,69.973393,23
...,...,...,...
zama-ai/fhe-biometrics,0.000000,0.000000,1
zama-ai/evmos,0.000000,0.000000,1
zama-ai/concrete-ntt,0.000000,0.000000,1
zama-ai/awesome-zama,0.000000,0.000000,1


In [34]:
# Roll the per-project metrics up to the owning organisation (the part of
# to_name before the "/").
metrics_by_org = metrics_by_project.reset_index()
metrics_by_org[['org', 'project']] = metrics_by_org['to_name'].str.split("/", expand=True)
# Per org: unweighted mean of the project-level churn means, and the sum of the
# projects' cumulative contributor counts.
# NOTE(review): a contributor active in several projects of one org is counted once
# per project in that sum — confirm this is the intended headcount.
# Aggregations are named by string: passing the NumPy callables (np.mean, np.sum)
# to groupby.agg is deprecated in recent pandas and emits a FutureWarning.
metrics_by_org = metrics_by_org.groupby("org").agg({"churn_prior": "mean", "churn_total": "mean", "cum_contributors_count": "sum"}).sort_values(by=['churn_total'], ascending=False)
# Displayed ordered by contributor count.
metrics_by_org.sort_values(by=['cum_contributors_count'], ascending=False)

Unnamed: 0_level_0,churn_prior,churn_total,cum_contributors_count
org,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ipfs,34.79302,18.396753,342
filecoin-project,28.176337,22.843835,311
libp2p,27.645448,11.811528,169
celestiaorg,24.388709,21.733583,102
multiformats,33.444715,18.010331,83
chainsafe,16.839038,15.765524,66
ipld,14.609452,15.513709,56
zama-ai,22.611389,14.629246,44
fluencelabs,20.930736,17.974387,35
ipfs-shipyard,39.617504,10.938051,29


## **Task 2**: The number of bounced users (devs or dependent repos - i.e. those who contribute for 1 week, then stop)

In [None]:
# For each repo, group contributions by user: for each week get the # of contributions, or an indicator of any contribution
# for each repo, for each user: get the sum of contributions and the cumulative sum
# if the sum is greater than 1 then the user is not bounced
# create a column indicating whether the user bounced
# create a data indicator of the total number of bounced users
# rank repos by # of bounced users
# want to do this by month and cumulatively, i.e. over a given period, has a user bounced?
# plot bounced users over time

In [72]:
# Count events per (contributor, project, week). The label argument is unused:
# the template has no {label} placeholder.
bounced_agg = lambda l : """from_name, to_name, week, COUNT(to_id) AS count""".format(label = l)
# Ordered ascending by count, so the weeks with a single event come first.
bounced_grouped = """GROUP BY from_name, to_name, week ORDER BY count, from_name, to_name, week"""
# NOTE(review): `week` alone is the time key, so equal week numbers from different
# years would be merged — confirm the data covers a single (ISO) year.
bounced = compose(sample, "sample", bounced_agg("sample"), bounced_grouped)

In [73]:
# Run the weekly contribution-count query and display the result.
test = db.execute(bounced)
test

Unnamed: 0,from_name,to_name,week,count
0,0xasten,celestiaorg/docs,48.0,1
1,0xhsy,celestiaorg/celestia.org,12.0,1
2,0xtaosu,filecoin-project/filecoin-plus-large-datasets,8.0,1
3,0xtylerholmes,chainsafe/discv5,9.0,1
4,10d9e,zama-ai/tfhe-rs,48.0,1
...,...,...,...,...
6106,rvagg,filecoin-project/lassie,34.0,79
6107,rootulp,celestiaorg/rsmt2d,27.0,81
6108,autonome,ipfs-shipyard/ipfs-thing-2023,15.0,83
6109,autonome,ipfs-shipyard/ipfs-thing-2023,13.0,119


In [71]:
# pandas cross-check of the SQL aggregation: same grouping keys, count of to_id,
# and the same ascending sort keys, computed on the raw `sample` rows.
db.execute(sample).groupby(['from_name', 'to_name', 'week']).agg({"to_id" : "count"}).sort_values(by=["to_id",'from_name', 'to_name', 'week'], ascending=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,to_id
from_name,to_name,week,Unnamed: 3_level_1
0xasten,celestiaorg/docs,48.0,1
0xhsy,celestiaorg/celestia.org,12.0,1
0xtaosu,filecoin-project/filecoin-plus-large-datasets,8.0,1
0xtylerholmes,chainsafe/discv5,9.0,1
10d9e,zama-ai/tfhe-rs,48.0,1
...,...,...,...
rvagg,filecoin-project/lassie,34.0,79
rootulp,celestiaorg/rsmt2d,27.0,81
autonome,ipfs-shipyard/ipfs-thing-2023,15.0,83
autonome,ipfs-shipyard/ipfs-thing-2023,13.0,119


In [24]:
# Weekly event counts per (contributor, project), largest first.
user_contributions = db.execute(sample).groupby(['from_name', 'to_name', 'week']).agg({"to_id" : "count"}).sort_values(by="to_id", ascending=False)
user_contributions = user_contributions.reset_index()
user_contributions['week'] = user_contributions['week'].astype('int64')
# Single key per (contributor, project) pair, joined with "$" so it can be split
# back later. Built with vectorised string concatenation rather than a row-wise
# apply: same values, faster, and it also works when the frame is empty.
user_contributions['merged_name'] = user_contributions['from_name'] + "$" + user_contributions['to_name']
non_dup = user_contributions["merged_name"].drop_duplicates()

# Full grid: every (contributor, project) pair crossed with weeks 1..52.
# NOTE(review): an ISO week 53 would fall outside this grid — confirm none occurs
# in the data.
idx = pd.MultiIndex.from_product([non_dup.to_numpy(), np.arange(1, 53, 1)], names=["merged_name", "week"])

In [30]:
# Expand the observed counts onto the full (pair, week) grid; weeks without any
# events appear as NaN after the reindex.
full_user_contribution = user_contributions[["merged_name", 'week', 'to_id']].set_index(["merged_name", 'week']).reindex(idx).reset_index()
# Missing weeks mean zero contributions. Assign the filled column back instead of
# calling fillna(inplace=True) on the selected column: the in-place form acts on an
# intermediate object and does not reliably update the frame under copy-on-write.
full_user_contribution['to_id'] = full_user_contribution['to_id'].fillna(0)
# Recover the contributor and project names from the "$"-joined key.
full_user_contribution[['from_name', "to_name"]] = full_user_contribution.merged_name.str.split("$", expand=True)

In [31]:
# Display the week-by-week contribution grid.
full_user_contribution

Unnamed: 0,merged_name,week,to_id,from_name,to_name
0,autonome$ipfs-shipyard/ipfs-thing-2023,1,0.0,autonome,ipfs-shipyard/ipfs-thing-2023
1,autonome$ipfs-shipyard/ipfs-thing-2023,2,0.0,autonome,ipfs-shipyard/ipfs-thing-2023
2,autonome$ipfs-shipyard/ipfs-thing-2023,3,0.0,autonome,ipfs-shipyard/ipfs-thing-2023
3,autonome$ipfs-shipyard/ipfs-thing-2023,4,0.0,autonome,ipfs-shipyard/ipfs-thing-2023
4,autonome$ipfs-shipyard/ipfs-thing-2023,5,0.0,autonome,ipfs-shipyard/ipfs-thing-2023
...,...,...,...,...,...
96819,mrd0ll4r$ipfs/ipfs-docs,48,0.0,mrd0ll4r,ipfs/ipfs-docs
96820,mrd0ll4r$ipfs/ipfs-docs,49,0.0,mrd0ll4r,ipfs/ipfs-docs
96821,mrd0ll4r$ipfs/ipfs-docs,50,0.0,mrd0ll4r,ipfs/ipfs-docs
96822,mrd0ll4r$ipfs/ipfs-docs,51,0.0,mrd0ll4r,ipfs/ipfs-docs


In [None]:
## HELPERS

In [None]:
#1. What are potential things to do 

In [29]:
def replace_na(d):
    """Return the items of `d` as a list, with every None replaced by the string "null"."""
    # Identity check: `== None` can be overridden by an element's __eq__.
    return ["null" if x is None else x for x in d]


def unfold(data, y):
    """Extract one nested JSON object per cell and join its values into a string.

    Each element of `data` is a JSON document given as text. Newlines and double
    spaces are stripped from the text before parsing, then the values of the
    object stored under key `y` are joined with ", " (None becomes "null").

    Parameters
    ----------
    data : pandas.Series
        Series of JSON strings.
    y : str
        Top-level key whose value is a JSON object.

    Returns
    -------
    pandas.Series
        One ", "-joined string per input element.
    """
    def _unfold_cell(x):
        # NOTE: the double-space removal also applies inside string values.
        parsed = json.loads(x.replace("\n", "").replace("  ", ""))
        return ", ".join(replace_na(parsed[y].values()))

    return data.apply(_unfold_cell)


def expand(x):
    """Split a Series of ','-separated strings into one column per piece."""
    return x.str.split(',', expand=True)

In [3]:
from Composer import *

In [10]:
# Root query: all int_events columns plus year/week/day/month extracted from `time`.
# NOTE(review): this call passes `aggregator=` and no `args`, whereas the churn
# cells earlier in the notebook build Query with `_aggregator=` and `args={}` —
# confirm which signature the current Composer module accepts.
# NOTE(review): this rebinds the kernel-wide name `q`.
q = Query(
    aggregator="*, extract(ISOYEAR from time) as year, extract(WEEK from time) as week, extract(DAY from time) as day, extract(MONTH from time) as month", 
    name="int_events", 
    grouping='')

In [11]:
# Same root query as the previous cell, rebuilt and handed to a new composer.
# NOTE(review): uses the `aggregator=` keyword (no `args`), unlike the
# `_aggregator=`/`args={}` form used by the churn cells — confirm the current
# Query signature.
q = Query(
    aggregator="*, extract(ISOYEAR from time) as year, extract(WEEK from time) as week, extract(DAY from time) as day, extract(MONTH from time) as month", 
    name="int_events", 
    grouping='')
composer = QueryComposer(query=q, conn=db)

In [12]:
# Weekly event count per (contributor, project), ascending by count.
# NOTE(review): this rebinds the kernel-wide name `q2`, which the churn cells also
# use for a different query.
q2 = Query(
    aggregator = """from_name, to_name, week, COUNT(to_id) AS count""",
    name = "q2",
    grouping = 'GROUP BY from_name, to_name, week ORDER BY count, from_name, to_name, week')
composer.compose(q2)

In [13]:
# Compose q2 again to display the generated SQL (shown as this cell's output).
# NOTE(review): q2 was already composed in the previous cell — confirm that
# composing the same query twice does not nest it twice.
composer.compose(q2)

'SELECT from_name, to_name, week, COUNT(to_id) AS count FROM (SELECT *, extract(ISOYEAR from time) as year, extract(WEEK from time) as week, extract(DAY from time) as day, extract(MONTH from time) as month FROM int_events ) AS q2 GROUP BY from_name, to_name, week ORDER BY count, from_name, to_name, week'

In [14]:
# Run the composed query through the composer's connection and display the result.
composer.execute()

Unnamed: 0,from_name,to_name,week,count
0,0xasten,celestiaorg/docs,48.0,1
1,0xhsy,celestiaorg/celestia.org,12.0,1
2,0xtaosu,filecoin-project/filecoin-plus-large-datasets,8.0,1
3,0xtylerholmes,chainsafe/discv5,9.0,1
4,10d9e,zama-ai/tfhe-rs,48.0,1
...,...,...,...,...
6106,rvagg,filecoin-project/lassie,34.0,79
6107,rootulp,celestiaorg/rsmt2d,27.0,81
6108,autonome,ipfs-shipyard/ipfs-thing-2023,15.0,83
6109,autonome,ipfs-shipyard/ipfs-thing-2023,13.0,119
