In [1]:
import json
import jellyfish
import numpy
import pandas as pd
import math
import os

In this notebook we load the source text of the Snakemake (snk) and Nextflow (nf) workflows.

In [2]:
# Root directories of the Snakemake (snk) and Nextflow (nf) workflow sources.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
path_snk = "/home/marinedjaffardjy/Documents/wf_features/data/inputs/data"
path_nf = "/home/marinedjaffardjy/Documents/AnalyseDonneesNextflow/data_nf"


# Read text File
def read_text_file(file_path, encoding="utf-8"):
    """Return the full content of a text file as one string.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    encoding : str, optional
        Codec used to decode the file. Defaults to UTF-8 so that the text
        (and every similarity score derived from it) does not depend on the
        locale of the machine running the notebook.

    Returns
    -------
    str
        The decoded content of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

# Gather the text of every ".snakefile" found below a directory
def read_snk_files(path):
    """Read all Snakemake files located under ``path``.

    The directory tree is walked recursively, every file whose name ends
    with ".snakefile" is kept, the resulting paths are sorted, and each file
    is read with ``read_text_file``.

    Returns a list with one string (the file content) per file, in sorted
    path order.
    """
    snakefile_paths = sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".snakefile")
    )
    return [read_text_file(snakefile) for snakefile in snakefile_paths]

# Gather the text of every ".nf" file found below a directory
def read_nf_files(path):
    """Read all Nextflow files located under ``path``.

    Walks the directory tree recursively, keeps the files whose name ends
    with ".nf", sorts their paths and reads each one with
    ``read_text_file``.

    Returns a list of file contents (strings), ordered by sorted path.
    """
    nf_paths = []
    for folder, _subdirs, names in os.walk(path):
        nf_paths.extend(
            os.path.join(folder, name) for name in names if name.endswith(".nf")
        )
    return list(map(read_text_file, sorted(nf_paths)))



Let us load the content of only those files that were analysed by the parsers.

In [5]:
# Load the parser results for both systems from JSON.
# NOTE(review): presumably each file holds a list of per-workflow dicts — the
# next cell reads "filename" from the snk entries and "author"/"wf" from the
# nf entries; confirm against the parser output.
with open('/home/marinedjaffardjy/Documents/Code/Similarite_process/json/wf_snk_added_info.json') as f:
    wf_snk = json.load(f)
with open('/home/marinedjaffardjy/Documents/Code/Similarite_process/json/nf_all_wf.json') as f:
    wf_nf = json.load(f)  

In [6]:
# Let us extract the workflow locations in order to read the raw text of the files.
# Snakemake entries already carry the location in their "filename" field.
filenames_wf_snk=[el["filename"] for el in wf_snk]
# Nextflow workflows are located at <path_nf>/<author>__<wf>; the root comes from
# the path_nf constant instead of repeating the same literal a second time.
filenames_wf_nf = [path_nf+"/"+el["author"]+"__"+el["wf"] for el in wf_nf]

In [7]:
def read_files(wf_filenames):
    """Load the source code of each workflow named in ``wf_filenames``.

    Each entry is resolved in this order:

    1. a directory: every file below it (recursively) is read and the
       contents are concatenated, each one preceded by a blank-line
       separator ("\\n\\n");
    2. ``<entry>.nf`` exists as a file: that file is read;
    3. the entry itself is a file: it is read;
    4. otherwise "<entry> not found" is printed and the entry is skipped.

    Parameters
    ----------
    wf_filenames : iterable of str
        Workflow locations (file paths, directory paths, or paths lacking
        their ".nf" extension).

    Returns
    -------
    list of dict
        One ``{"wf_file": <entry>, "wf_code": <text>}`` record per entry
        that could be resolved, in input order.
    """
    wf_pairs = []
    for file in wf_filenames:
        if os.path.isdir(file):
            # os.walk yields entries in filesystem order, which is arbitrary;
            # sorting makes the concatenated text (and every distance computed
            # from it) reproducible from one machine/run to the next.
            subwf_paths = sorted(
                os.path.join(root, name)
                for root, _dirs, names in os.walk(file)
                for name in names
            )
            # A single join avoids rebuilding a growing string for every file.
            wf_code = "".join("\n\n" + read_text_file(subwf) for subwf in subwf_paths)
        elif os.path.isfile(file + ".nf"):
            wf_code = read_text_file(file + ".nf")
        elif os.path.isfile(file):
            wf_code = read_text_file(file)
        else:
            print(file+" not found")
            continue
        wf_pairs.append({"wf_file": file, "wf_code": wf_code})
    return wf_pairs

In [8]:
# Build, for each system, a list of {"wf_file", "wf_code"} dicts holding the
# location and the text of every workflow. Locations that read_files cannot
# resolve are reported as "<path> not found" and left out of the list.
file_wf_snk=read_files(filenames_wf_snk)
file_wf_nf=read_files(filenames_wf_nf)

/home/marinedjaffardjy/Documents/wf_features/data/inputs/data/sreichl/genomic_region_enrichment/1.snakefile not found
/home/marinedjaffardjy/Documents/AnalyseDonneesNextflow/data_nf/Ed-G655__nf-miRNA-SNPs-classify not found
/home/marinedjaffardjy/Documents/AnalyseDonneesNextflow/data_nf/mattpito__nf not found
/home/marinedjaffardjy/Documents/AnalyseDonneesNextflow/data_nf/mattpito_nf__bam2RNAseq not found


In [9]:
# Save the gathered {"wf_file", "wf_code"} records of each system as JSON.
with open("/home/marinedjaffardjy/Documents/Code/Similarite_process/json/nf_wf_gathered.json","w") as f:
    json.dump(file_wf_nf,f)
with open("/home/marinedjaffardjy/Documents/Code/Similarite_process/json/snk_wf_gathered.json","w") as f:
    json.dump(file_wf_snk,f)

In [18]:
#let us compute the levenshtein similarity for all pairs of workflows, using a variation of the function we made for the processes
def levenshtein_wf(wf, resume=0, output_file = "/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein/levenshtein_wf_snk", outputfile_resume = "/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_wf_snk_50.json"):
    """Compute a normalised Levenshtein similarity for every pair of workflows.

    Each unordered pair ``(wf[a], wf[b])`` with ``resume <= a < b`` is scored
    as ``(l - d) / l`` where ``d`` is the Levenshtein distance between the two
    "wf_code" strings and ``l`` the length of the longer one. When both codes
    are empty the score is 0.

    Scores are checkpointed to disk: after every 50 workflows of the outer
    loop, and after the last one, the scores accumulated since the previous
    checkpoint are written to ``<output_file>_<count>.json`` where ``count``
    is the number of outer-loop workflows processed so far plus ``resume``.

    Parameters
    ----------
    wf : list of dict
        Records with a "wf_file" key (identifier stored in the result) and a
        "wf_code" key (text that is compared).
    resume : int
        Index of the first workflow of the outer loop; 0 computes everything.
    output_file : str
        Prefix (directory + base name) of the checkpoint files. The directory
        is created when it does not exist.
    outputfile_resume : str
        JSON file with previously computed scores. Only read when
        ``resume > 0``; its scores are placed at the start of the first
        checkpoint written by this call.

    Returns
    -------
    list of dict
        Every ``{"wf1", "wf2", "levenshtein"}`` record handled by this call
        (scores loaded from ``outputfile_resume`` first, then the computed
        ones). The list used to be emptied after each checkpoint, so callers
        always received an empty list; the complete list is returned now.
    """
    # Slicing already produces new lists, so the input is never modified.
    v1 = wf[resume:-1]
    total = len(v1)

    if resume > 0:
        with open(outputfile_resume) as f:
            chunk = json.load(f)
    else:
        chunk = []
    all_scores = []

    # Make sure the checkpoints can be written before doing any heavy work.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for i, wf1 in enumerate(v1, start=1):
        print(str(i-1)+"/"+str(total))
        code1 = wf1['wf_code']
        # Compare only with the workflows located after wf1: each pair is scored once.
        for wf2 in wf[resume+i:]:
            code2 = wf2['wf_code']
            l = max(len(code1), len(code2))
            if l == 0:
                score = 0
            else:
                score_l = jellyfish.levenshtein_distance(code1, code2)
                score = (l-score_l)/l
            chunk.append({"wf1":wf1["wf_file"],
                          "wf2":wf2["wf_file"],
                          "levenshtein":score})
        if i%50 == 0 or i == total:
            with open(output_file+"_"+str(i+resume)+".json","w") as f:
                print(output_file+"_"+str(i+resume)+".json")
                json.dump(chunk,f)
            # Keep what was just written for the return value, then start a new chunk.
            all_scores.extend(chunk)
            chunk = []

    # Non-empty only when the loop never ran (e.g. resuming past the last workflow).
    all_scores.extend(chunk)
    return all_scores

In [11]:
# Load the workflow groups of each system from JSON.
# NOTE(review): each group appears to be a list of dicts with a "filename" key
# (that is what get_group_info reads) — confirm against the grouping step.
with open("/home/marinedjaffardjy/Documents/Code/Investigating_reuse/json/groups_nf_wf.json") as f:
    groups_nf = json.load(f)
    
with open("/home/marinedjaffardjy/Documents/Code/Investigating_reuse/json/groups_snk_wf.json") as f:
    groups_snk = json.load(f)

In [12]:
def get_group_info(list_group,list_code):
    """Attach the gathered code records to each group of workflows.

    Parameters
    ----------
    list_group : list of groups, each group being a list of dicts with a
        "filename" key.
    list_code : list of dicts with a "wf_file" key.

    Returns
    -------
    list of lists: for every group, in the same order, the records of
    ``list_code`` whose "wf_file" equals one of the group's filenames.
    Records keep the order they have in ``list_code``; a group without any
    match yields an empty list.
    """
    def records_for(el_group):
        wanted = [member['filename'] for member in el_group]
        return [record for record in list_code if record["wf_file"] in wanted]

    return [records_for(el_group) for el_group in list_group]

In [13]:
# For every group keep the gathered {"wf_file", "wf_code"} records whose
# "wf_file" matches one of the group's "filename" values.
# NOTE(review): the nf "wf_file" values were built as <data_nf root>/<author>__<wf>;
# the "filename" values in groups_nf must use the same form to match — confirm.
new_groups_snk=get_group_info(groups_snk,file_wf_snk)
new_groups_nf=get_group_info(groups_nf,file_wf_nf)

In [14]:
def compute_lev_groups(new_groups, sys, start=0, base_dir="/home/marinedjaffardjy/Documents/Code/Similarite_process/json"):
    """Run ``levenshtein_wf`` on every group that holds at least two workflows.

    Parameters
    ----------
    new_groups : iterable of lists
        Groups of workflow records, as expected by ``levenshtein_wf``.
    sys : str
        Short name of the workflow system (e.g. "snk" or "nf"); it is used in
        the output directory and file names. Note that inside this function
        the name hides the standard ``sys`` module.
    start : int, optional
        Position of the first group to process. Earlier groups are skipped
        but still counted, so output names keep referring to a group's
        position in ``new_groups``. Useful to resume an interrupted run.
    base_dir : str, optional
        Directory under which ``levenshtein_<sys>_wf/group<i>_<sys>`` output
        prefixes are built. Defaults to the location used so far.

    Returns
    -------
    list
        The value returned by ``levenshtein_wf`` for each processed group,
        in group order.
    """
    scores = []
    print(sys)
    for i, group in enumerate(new_groups):
        # Groups with fewer than two workflows contain no pair to compare.
        if i < start or len(group) <= 1:
            continue
        print(i)
        outputfile = base_dir+"/levenshtein_"+sys+"_wf/group"+str(i)+"_"+sys
        scores.append(levenshtein_wf(group, resume=0, output_file = outputfile))
    return scores

In [23]:
def compute_lev_groups_snk(new_groups, sys, start=9):
    """Run ``levenshtein_wf`` on the groups from position ``start`` onwards.

    Variant of the per-group computation used to restart a run part-way
    through: the first ``start`` groups are skipped and the remaining ones
    are numbered from ``start``, so output names line up with positions in
    ``new_groups``.

    Parameters
    ----------
    new_groups : list of lists
        Groups of workflow records, as expected by ``levenshtein_wf``.
    sys : str
        Short name of the workflow system (e.g. "snk"); it is used in the
        output directory and file names. Note that inside this function the
        name hides the standard ``sys`` module.
    start : int, optional
        Position of the first group to process. Defaults to 9, the offset
        that was previously hard-coded.

    Returns
    -------
    list
        The value returned by ``levenshtein_wf`` for each processed group
        (groups with fewer than two workflows are skipped).
    """
    scores = []
    print(sys)
    for i, group in enumerate(new_groups[start:], start=start):
        if len(group) > 1:
            print(i)
            outputfile = "/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_"+sys+"_wf/group"+str(i)+"_"+sys
            scores.append(levenshtein_wf(group, resume=0, output_file = outputfile))
    return scores

In [24]:
# Number of Snakemake groups loaded from groups_snk_wf.json
len(groups_snk)

792

In [6]:
# Inspect one checkpoint file written by the per-group computation.
file="/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group11_snk_4.json"

with open(file) as f:
    grp=json.load(f)

# Similarity of every pair stored in the file
scores=[el["levenshtein"] for el in grp]
# Flag the pairs whose similarity is strictly above 0.85 ...
scores_sup=[el>0.85 for el in scores]
# ... and count them (True counts as 1)
sum(scores_sup)

0

In [None]:
%%time
# Run the per-group Levenshtein computation for both systems.
# NOTE(review): compute_lev_groups_snk already skips its first 9 entries and
# numbers the rest from 9; combined with the [1:] slice below, "group<i>" in
# the snk output names refers to new_groups_snk[i + 1] — confirm this offset
# is intended.
scores_snk = compute_lev_groups_snk(new_groups_snk[1:],"snk")
scores_nf = compute_lev_groups(new_groups_nf,"nf")

snk
9
0/183
1/183
2/183
3/183
4/183
5/183
6/183
7/183
8/183
9/183
10/183
11/183
12/183
13/183
14/183
15/183
16/183
17/183
18/183
19/183
20/183
21/183
22/183
23/183
24/183
25/183
26/183
27/183
28/183
29/183
30/183
31/183
32/183
33/183
34/183
35/183
36/183
37/183
38/183
39/183
40/183
41/183
42/183
43/183
44/183
45/183
46/183
47/183
48/183
49/183
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group9_snk_50.json
50/183
51/183
52/183
53/183
54/183
55/183
56/183
57/183
58/183
59/183
60/183
61/183
62/183
63/183
64/183
65/183
66/183
67/183
68/183
69/183
70/183
71/183
72/183
73/183
74/183
75/183
76/183
77/183
78/183
79/183
80/183
81/183
82/183
83/183
84/183
85/183
86/183
87/183
88/183
89/183
90/183
91/183
92/183
93/183
94/183
95/183
96/183
97/183
98/183
99/183
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group9_snk_100.json
100/183
101/183
102/183
103/183
104/183
105/183
106/183
107/183
108/183
109/183
110/183
111/183
112/183

/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group135_snk_1.json
136
0/1
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group136_snk_1.json
137
0/4
1/4
2/4
3/4
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group137_snk_4.json
138
0/5
1/5
2/5
3/5
4/5
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group138_snk_5.json
139
0/4
1/4
2/4
3/4
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group139_snk_4.json
145
0/4
1/4
2/4
3/4
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group145_snk_4.json
150
0/2
1/2
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group150_snk_2.json
153
0/3
1/3
2/3
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf/group153_snk_3.json
156
0/2
1/2
/home/marinedjaffardjy/Documents/Code/Similarite_process/jso

In [None]:
/home/marinedjaffardjy/Documents/Code/Similarite_process/json/levenshtein_snk_wf

In [None]:
# Mean number of workflows per Snakemake group, counting only the workflows
# whose code was gathered (new_groups_snk is the output of get_group_info)
numpy.mean([len(el) for el in new_groups_snk])

In [21]:
# Mean size of the Nextflow groups as loaded from JSON
# (groups_nf, not the new_groups_nf list built by get_group_info)
numpy.mean([len(el) for el in groups_nf])

1.704724409448819

In [None]:
# Score every pair of Nextflow workflows, without grouping.
# NOTE(review): no output_file is passed, so levenshtein_wf falls back to its
# default prefix ".../levenshtein/levenshtein_wf_snk" and the nf scores end up
# in snk-named files — confirm, or pass an nf-specific prefix.
scores_nf=levenshtein_wf(file_wf_nf)

Let us group the workflows by tool in order to make the similarity computation easier.

In [None]:
def grouping_tools_wf