In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import functools
import json
from collections import Counter

In [2]:
def importing_json_files(file_wf):
    f_wf = open(file_wf) #informations for nf
    # returns JSON object as
    # a dictionary
    wf = json.load(f_wf)
    f_wf.close
    return wf

#importing the wf and auth dict (github info)
dict_nf = importing_json_files('../json/source_files/wf_new_crawl_nextflow.json')
auth_nf = importing_json_files('../json/source_files/author_clem_nf.json')

#import processors dictionnaries
with open('../json/source_files/nf_proc_tool_shell.json') as f:
    nf_proc = json.load(f)
    
#importing the wf and auth dict (github info)
dict_snk = importing_json_files('../json/source_files/wf_crawl_snakemake.json')
auth_snk = importing_json_files('../json/source_files/author_clem_snk.json')

#import processors dictionnaries
with open('../json/source_files/snk_proc_tool_shell.json') as f:
    snk_proc = json.load(f)

In this notebook, we will display several experiences from which we gathered the various figures present in the paper.

### How are authors sharing their workflows ?

To get a better idea about workflow contribution practices, we asked ourselves :
- How many owners shared their workflows ?
- How many workflows have the top 10 most proficient contributors shared ?
- How many authors have shared more than one workflow ?

In [3]:
# List all the workflows with at least one tool
# Nextflow

wf_tools_nf_init = []
for el in nf_proc:
    wf_name = el["owner"]+"/"+el["wf_orig"]
    wf_tools_nf_init.append(wf_name)
wf_tools_nf_init = list(set(wf_tools_nf_init))

# correct the list
wf_tools_nf = []
keys_dict_nf = list(dict_nf.keys())
for el in wf_tools_nf_init :
    if(el in keys_dict_nf):
        wf_tools_nf.append(el)
    else:
        for el2 in keys_dict_nf:
            if(el in el2):
                wf_tools_nf.append(el2)
                
print(f" There are {len(wf_tools_nf)} Nextflow workflows with at least one tool." )
               

 There are 1186 Nextflow workflows with at least one tool.


In [4]:
# Snakemake

wf_tools_snk_init = []
for el in snk_proc:
    wf_tools_snk_init.append(el["wf_orig"])
wf_tools_snk_init = list(set(wf_tools_snk_init))
len(wf_tools_snk_init)

# correct the list
wf_tools_snk = []
for name_wf in wf_tools_snk_init:
    if(len(name_wf.split("/"))==3):
        name_wf = name_wf.split("/")[0]+"/"+name_wf.split("/")[1]
    wf_tools_snk.append(name_wf)

print(f" There are {len(wf_tools_snk)} Snakemake workflows with at least one tool." )
 

 There are 1257 Snakemake workflows with at least one tool.


In [5]:
# Nextflow authors
auth_nf_occ = []
for el in wf_tools_nf:
    auth_nf_occ.append(dict_nf[el]["owner"])
unique_auth_nf = list(set(auth_nf_occ))
nb_tot_nf = len(unique_auth_nf)
print(f"There are {nb_tot_nf} Nextflow owners")

# number of workflow per author
cnt_nf = Counter(auth_nf_occ)
json_auth_nf = []
for el in unique_auth_nf:
    json_auth_nf.append({"owner":el,"nb_wf":cnt_nf[el]})
df_auth_nf=pd.DataFrame(json_auth_nf)

There are 650 Nextflow owners


In [6]:
# Snakemake authors
auth_snk_occ = []
for el in wf_tools_snk:
    auth_snk_occ.append(dict_snk[el]["owner"])
unique_auth_snk = list(set(auth_snk_occ))
nb_tot_snk = len(unique_auth_snk)
print(f"There are {nb_tot_snk} Snakemake owners")

# number of workflow per author
cnt_snk = Counter(auth_snk_occ)
json_auth_snk = []
for el in unique_auth_snk:
    json_auth_snk.append({"owner":el,"nb_wf":cnt_snk[el]})
df_auth_snk=pd.DataFrame(json_auth_snk)

There are 535 Snakemake owners


In [7]:
# How many workflows have the 10 most proficient owners shared ?
df_auth_snk = df_auth_snk.sort_values(by="nb_wf", ascending = False)
df_auth_nf = df_auth_nf.sort_values(by="nb_wf", ascending = False)

nb_wf_top_nf = sum(df_auth_nf["nb_wf"][:10])
nb_wf_top_snk = sum(df_auth_snk["nb_wf"][:10])
                    
print(f" The top 10 owners have shared {nb_wf_top_nf} ({nb_wf_top_nf/len(wf_tools_nf)}%) Nextflow workflows")
print(f" The top 10 owners have shared {nb_wf_top_snk} ({nb_wf_top_snk/len(wf_tools_snk)}%) Snakemake workflows")


 The top 10 owners have shared 170 (0.1433389544688027%) Nextflow workflows
 The top 10 owners have shared 191 (0.15194908512330946%) Snakemake workflows


In [8]:
# How many authors have shared more than one workflow ?
nb_auth_nf = len(df_auth_nf[df_auth_nf["nb_wf"]>1])
nb_auth_snk = len(df_auth_snk[df_auth_snk["nb_wf"]>1])

perc_auth_nf = nb_auth_nf/len(unique_auth_nf)
perc_auth_snk = nb_auth_snk/len(unique_auth_snk)

print(f"{100*perc_auth_nf}% of Nextflow owners have shared more than one workflow")
print(f"{100*perc_auth_snk}% of Snakemake owners have shared more than one workflow")


30.923076923076927% of Nextflow owners have shared more than one workflow
42.242990654205606% of Snakemake owners have shared more than one workflow


### What are the reuse practices in github ?

In order to estimate workflow reuse in github, we take a look at the number of forks of these workflows. We compute the portion of workflows having more than x forks, x being 3, 5, 10 and 50.

In [9]:
# Making a table with the number of forks per workflow in Nextflow
json_forks_nf = []
for el in wf_tools_nf:
    json_forks_nf.append({"wf":el,"nb_forks":dict_nf[el]["forks"]})
df_forks_nf = pd.DataFrame(json_forks_nf)

In [10]:
# Compute the percentages of workflows that have more than x forks
fork3_nf = 100*len(df_forks_nf[df_forks_nf["nb_forks"]>=3])/len(wf_tools_nf)
fork5_nf = 100*len(df_forks_nf[df_forks_nf["nb_forks"]>=5])/len(wf_tools_nf)
fork10_nf = 100*len(df_forks_nf[df_forks_nf["nb_forks"]>=10])/len(wf_tools_nf)
fork50_nf = 100*len(df_forks_nf[df_forks_nf["nb_forks"]>=50])/len(wf_tools_nf)


In [11]:
print(f"{fork3_nf}% Nextflow workflows have more than 3 forks")
print(f"{fork5_nf}% Nextflow workflows have more than 5 forks")
print(f"{fork10_nf}% Nextflow workflows have more than 10 forks")
print(f"{fork50_nf}% Nextflow workflows have more than 50 forks")

13.743676222596964% Nextflow workflows have more than 3 forks
7.75716694772344% Nextflow workflows have more than 5 forks
3.794266441821248% Nextflow workflows have more than 10 forks
0.8431703204047217% Nextflow workflows have more than 50 forks


In [12]:
# Making a table with the number of forks per workflow in Snakemake
json_forks_snk = []
for el in wf_tools_snk:
    json_forks_snk.append({"wf":el,"nb_forks":dict_snk[el]["forks"]})
df_forks_snk = pd.DataFrame(json_forks_snk)

In [13]:
# Compute the percentages of workflows that have more than x forks
fork3_snk = 100*len(df_forks_snk[df_forks_snk["nb_forks"]>=3])/len(wf_tools_snk)
fork5_snk = 100*len(df_forks_snk[df_forks_snk["nb_forks"]>=5])/len(wf_tools_snk)
fork10_snk = 100*len(df_forks_snk[df_forks_snk["nb_forks"]>=10])/len(wf_tools_snk)
fork50_snk = 100*len(df_forks_snk[df_forks_snk["nb_forks"]>=50])/len(wf_tools_snk)


In [14]:
print(f"{fork3_snk}% Snakemake workflows have more than 3 forks")
print(f"{fork5_snk}% Snakemake workflows have more than 5 forks")
print(f"{fork10_snk}% Snakemake workflows have more than 10 forks")
print(f"{fork50_snk}% Snakemake workflows have more than 50 forks")

14.558472553699284% Snakemake workflows have more than 3 forks
7.875894988066825% Snakemake workflows have more than 5 forks
4.614160700079554% Snakemake workflows have more than 10 forks
0.15910898965791567% Snakemake workflows have more than 50 forks


### How are tools reused amongst workflows ?

In this section, we will count the number of processors each single tool appear in (occurences of tools in processors) in Nextflow and Snakemake.

In [15]:
# occurences of tools in Nextflow
tools_all_occ_nf=[]
for el in nf_proc:
    tools_all_occ_nf+=el["tools"]

# occurences of tools in Nextflow
tools_all_occ_snk=[]
for el in snk_proc:
    tools_all_occ_snk+=el["tools"]
    
#unique tools for Nextflow and Snakemake
tools_counter_nf=Counter(tools_all_occ_nf)
tools_counter_snk=Counter(tools_all_occ_snk)

In [16]:
# Counting and sorting the number of occurences of the tools in Nextflow
usage_nf = []
for tool in set(tools_all_occ_nf):
    usage_nf.append({"tool":tool,
                              "nb_occurences":tools_counter_nf[tool]})
usage_nf_df = pd.DataFrame(usage_nf)
usage_nf_df = usage_nf_df.sort_values(by="nb_occurences",ascending=False)
print("20 Most used tools in Nextflow and their number of occurences")
usage_nf_df.head(20)

20 Most used tools in Nextflow and their number of occurences


Unnamed: 0,tool,nb_occurences
222,SAMtools,2841
208,GATK,1067
98,BCFtools,929
23,FastQC,770
577,MultiQC,707
43,BWA,412
36,BEDTools,384
77,Picard,269
303,Bowtie,243
510,STAR,234


In [17]:
# Counting and sorting the number of occurences of the tools in Snakemake
usage_snk = []
for tool in set(tools_all_occ_snk):
    usage_snk.append({"tool":tool,
                              "nb_occurences":tools_counter_snk[tool]})
usage_snk_df = pd.DataFrame(usage_snk)
usage_snk_df = usage_snk_df.sort_values(by="nb_occurences",ascending=False)
print("20 Most used tools in Snakemake and their number of occurences")
usage_snk_df.head(20)

20 Most used tools in Snakemake and their number of occurences


Unnamed: 0,tool,nb_occurences
114,SAMtools,2045
21,BEDTools,603
58,BCFtools,360
25,BWA,356
108,GATK,269
88,Augur,260
14,FastQC,236
162,Bowtie,177
279,STAR,152
308,MultiQC,137


In [18]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

In [19]:
# intersection of n most used tools in Snakemake and Nextflow
set1 = []
set2 =[]
for i in range(0,20):
    set1.append(tools_counter_nf.most_common()[i][0])
    set2.append(tools_counter_snk.most_common()[i][0])
print(f" There are {len(intersection(set1,set2))} tools that are in the top 20 most used tools of both Nextflow and Snakemake")
                    
set1 = []
set2 =[]
for i in range(0,10):
    set1.append(tools_counter_nf.most_common()[i][0])
    set2.append(tools_counter_snk.most_common()[i][0])
print(f" There are {len(intersection(set1,set2))} tools that are in the top 10 most used tools of both Nextflow and Snakemake")

 There are 14 tools that are in the top 20 most used tools of both Nextflow and Snakemake
 There are 9 tools that are in the top 10 most used tools of both Nextflow and Snakemake
