From 5c2912fc3551f08a6ebda9e677df0a971d8579e9 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 6 Nov 2022 13:18:47 +0200
Subject: [PATCH 01/14] Changes for efficiency

---
 grim/imputation/imputegl/cutils.pyx        |  66 +++++++++
 grim/imputation/imputegl/impute.py         | 156 +++++++++------------
 grim/imputation/imputegl/networkx_graph.py | 113 +++++++--------
 grim/validation/runfile.py                 |   2 +-
 setup.py                                   |  12 +-
 5 files changed, 193 insertions(+), 156 deletions(-)
 create mode 100644 grim/imputation/imputegl/cutils.pyx

diff --git a/grim/imputation/imputegl/cutils.pyx b/grim/imputation/imputegl/cutils.pyx
new file mode 100644
index 0000000..d920a6a
--- /dev/null
+++ b/grim/imputation/imputegl/cutils.pyx
@@ -0,0 +1,66 @@
+#cython: language_level=3
+import cython
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef open_ambiguities(list hap, unsigned char loc, tuple split_loc):
+    cdef unsigned int k, i, p, j # k: haplotype index, i: allele index in split_loc, p: next write position in hap_new, j: locus index
+    cdef Py_ssize_t hap_len, haps_len, splits_len
+    cdef list hap_new, hap1
+    # Expand locus `loc` of every haplotype in `hap` into the alleles listed in `split_loc`.
+    p = 0
+    if len(split_loc) > 1:
+        # More than one allele at this locus: emit one copy of each haplotype per allele.
+        hap_len = len(hap[0])  # every haplotype is read using the length of the first one
+        haps_len = len(hap)
+        splits_len = len(split_loc)
+        hap_new = [None] * (haps_len * splits_len)
+        # hap_new is pre-sized to haps_len * splits_len so it can be filled by index below.
+        hap1 = [None] * hap_len
+        # hap1 is a scratch haplotype: refilled from hap[k] for each k, then copied (hap1[:]) into hap_new.
+        for k in range(haps_len):  # split a given locus in all haps.
+
+            for j in range(hap_len):
+                hap1[j] = hap[k][j]
+
+            for i in range(splits_len):
+                hap1[loc] = split_loc[i]
+                hap_new[p] = hap1[:]
+                p += 1
+        return hap_new
+    return hap  # zero or one allele in split_loc: the input list itself is returned, not a copy
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef create_hap_list(list all_haps, dict optionDict, unsigned int N_Loc):
+    cdef unsigned int i, j, count
+    cdef list hap_list = []
+    cdef list all_hap_split
+    # Keep each '~'-joined entry of all_haps whose leading alleles found in optionDict number exactly N_Loc.
+    for i in range(len(all_haps)):
+        all_hap_split = all_haps[i].split('~')
+        count = 0  # alleles matched so far, counted up to the first one missing from optionDict
+        for j in range(len(all_hap_split)):
+            if all_hap_split[j] not in optionDict:
+                break
+            else:
+                count += 1
+
+        if count == N_Loc:
+            hap_list.append(all_hap_split)  # kept entries are stored already split into alleles
+    return hap_list
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef deepcopy_list(list l):
+    cdef list copy_l  # result: same length as l, with nested lists copied recursively
+    cdef unsigned int i, length
+    length = len(l)
+    copy_l = [None] * length
+    for i in range(length):
+        if isinstance(l[i], list):
+            copy_l[i] = deepcopy_list(l[i])  # nested list: recurse so the copy does not share it
+        else:
+            copy_l[i] = l[i]  # any non-list element is shared by reference, not copied
+    return copy_l
diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py
index f7d0169..28bc299 100644
--- a/grim/imputation/imputegl/impute.py
+++ b/grim/imputation/imputegl/impute.py
@@ -7,8 +7,9 @@
 import os.path
 import json
 
-import numpy as np
 
+import numpy as np
+from .cutils import open_ambiguities, create_hap_list, deepcopy_list
 from .cypher_plan_b import CypherQueryPlanB
 from .cypher_query import CypherQuery
 
@@ -92,6 +93,10 @@ def clean_up_gl(gl):
 
 
 class Imputation(object):
+    __slots__ = 'logger', 'verbose', 'populations', 'netGraph', 'priorMatrix', 'full_hapl', 'index_dict', 'full_loci', \
+                'factor', '_factor_missing_data', 'cypher', 'cypher_plan_b', 'matrix_planb', 'count_by_prob', \
+                'number_of_options_threshold', 'plan', 'option_1', 'option_2', \
+                'haplotypes_number_in_phase', 'save_space_mode', 'nodes_for_plan_A', 'unk_priors'
 
     def __init__(self, net=None,config=None,  count_by_prob=None, verbose=False):
         """Constructor
@@ -291,23 +296,6 @@ def open_phases_1(self, haps, N_Loc, gl_string):
                 phases.append([H1, H2])
         return phases
 
-    def open_ambiguities(self, hap, loc):
-        # This opens all allele ambiguities
-        hap_new = [] # produces an empty list of haplotypes
-        for k in range(len(hap)): #slit a given locus in all haps.
-            splitHap =  hap[k][loc].split('/')
-            if splitHap==hap[k][loc].split():
-                split_loc = hap[k][loc].split('|')
-            else:
-                split_loc = splitHap
-            hap1 = hap[k]
-            if len(split_loc) > 1:
-                for i in range(len(split_loc)):
-                    hap1[loc] = split_loc[i]
-                    hap_new.append(hap1[:])
-            else:
-                hap_new.append(hap1[:])
-        return hap_new
 
     def comp_hap_prob(self, Hap, N_Loc, epsilon, n):
         haplo_probs = self.get_haplo_freqs(Hap, epsilon, n)
@@ -350,11 +338,12 @@ def comp_hap_prob(self, Hap, N_Loc, epsilon, n):
     #     return haplo_probs
 
     def get_haplo_freqs(self, haplos, epsilon, n=25000):
-        haplos_joined = ["~".join(item) for sublist in haplos for item in sublist] ###
+        haplos_joined = ["~".join(item) for item in haplos[0]] ###
         #haplos_joined = [item for sublist in haplos for item in sublist]  ###
         #haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist]
         return self.netGraph.adjs_query(haplos_joined)
 
+
     # def get_haplo_freqs_miss(self, haplos, epsilon):
     #     haplo_probs = {}
     #     haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist]
@@ -681,7 +670,7 @@ def comp_phase_prob(self, phases, N_Loc, epsilon, n):
         # pop_res are the names of the populations
         return {'Haps': hap_total, 'Probs': p_total,'Pops': pop_res}
 
-    def reduce_phase_to_valid_allels(self, haps, N_Loc, planc = False):
+    def reduce_phase_to_valid_allels(self, haps, N_Loc, planc=False):
         for j in range(len(haps)):
             for k in range(2):
                 hap_list = []
@@ -689,14 +678,13 @@ def reduce_phase_to_valid_allels(self, haps, N_Loc, planc = False):
 
                 options = 1
                 for i in range(N_Loc): options = options * (len(hap_list[0][i].split("/")))
-                if options>=self.number_of_options_threshold or planc:
+                if options >= self.number_of_options_threshold or planc:
                     for hap_k in hap_list:
-                        for i,g in enumerate(hap_k):
+                        for i, g in enumerate(hap_k):
                             gen = g.split('/')
                             probs = self.check_if_alleles_exist(gen)
                             if not probs == {}:
-                                list(probs.keys())
-                                haps[j][k][i] = ('/').join(list(probs.keys()))
+                                haps[j][k][i] = '/'.join(list(probs.keys()))
 
 
     def reduce_phase_to_commons_alleles(self, haps, N_Loc, commons_number=1, planc = False ):
@@ -733,51 +721,53 @@ def open_phases(self, haps, N_Loc, gl_string):
         for j in range(len(haps)):
             H1 = []
             H2 = []
-          ##  fq = pa.DataFrame()
+            ##  fq = pa.DataFrame()
             fq = []
 
             for k in range(2):
-                hap_list = []
-                hap_list.append(haps[j][k])
+                hap_list = [haps[j][k]]
+                hap_list_splits = [tuple(allele.split("/")) for allele in hap_list[0]]
 
-                #compute the number of options:
+                # compute the number of options:
                 options = 1
-                for i in range(N_Loc): options=options*(len(hap_list[0][i].split("/")))
+                for i in range(N_Loc):
+                    options *= len(hap_list_splits[i])
 
-                #if the number of options is smaller than the total number of nodes:
+                # if the number of options is smaller than the total number of nodes:
                 if options < self.number_of_options_threshold:
-                    #open ambiguities regularly:
+                    # open ambiguities regularly:
                     for i in range(N_Loc):
-                        hap_list = self.open_ambiguities(hap_list, i)
-                    if (k == 0):
+                        hap_list = self.open_ambiguities(hap_list, i, hap_list_splits[i])
+
+                    if k == 0:
                         H1.append(hap_list)
                     else:
                         H2.append(hap_list)
 
-                    self.option_1 +=1
+                    self.option_1 += 1
 
-                #if there are more options than actual haplotypes possible:
+                # if there are more options than actual haplotypes possible:
                 else:
-                    self.option_2 +=1
-                    optionDict = {}
+                    self.option_2 += 1
+                    optionDict = {} #set()
                     if len(fq) == 0:
-                        list=[]
-                        for (gen,name) in self.cypher.loc_map.items():
-                            count=0
+                        _list = []
+                        for (gen, name) in self.cypher.loc_map.items():
+                            count = 0
                             for i in range(len(hap_list[0])):
-                                if hap_list[0][i].split("*")[0]==gen:
-                                    count=count+1
-                            if count>0:
-                                list.append(name)
-                        #we'll get all the options possible
-                        #(query,lc)=self.cypher.buildQuery(["~".join(list)])
-
-                       # fq = pa.DataFrame(self.graph.data(query))
-                        label = "".join(sorted(list))
+                                if hap_list[0][i].split("*", 1)[0] == gen:
+                                    count = count + 1
+                            if count > 0:
+                                _list.append(name)
+                        # we'll get all the options possible
+                        # (query,lc)=self.cypher.buildQuery(["~".join(_list)])
+
+                        # fq = pa.DataFrame(self.graph.data(query))
+                        label = "".join(sorted(_list))
                         fq = self.netGraph.haps_by_label(label)
-                       # fq = pa.DataFrame(self.netGraph.abcdq_allele(), )
-                       # fq = self.netGraph.abcdq_allele()
-                        #we'll find which of all the options are compatable with the donor
+                        # fq = pa.DataFrame(self.netGraph.abcdq_allele(), )
+                        # fq = self.netGraph.abcdq_allele()
+                        # we'll find which of all the options are compatable with the donor
                         """gl_list=gl_string.split("^")
                         for gen in gl_list:
                             gens = gen.split('+')
@@ -785,29 +775,15 @@ def open_phases(self, haps, N_Loc, gl_string):
                                 gen=g.split('/')
                                 for option in gen:
                                     optionDict[option]=True"""
-                    for hap_k in hap_list:
-                        #listType = []
-                        for g in hap_k:
-                            gen = g.split('/')
-                            for option in gen:
-                                optionDict[option] = True
-                        #print(listType)
+
+                    for gen in hap_list_splits:
+                        for option in gen:
+                            optionDict[option] = True
+                            # optionDict.add(option)
                     ##all_haps = fq.values.tolist()
-                    all_haps = fq
-                    hap_list=[]
-                    for i in range(len(all_haps)):
-                        count=0
-                        for gen in all_haps[i].split("~"):
-                            if not gen in optionDict:
-                                break
-                            else:
-                                count=count+1
-                        if count == N_Loc:
-
-                            # all_haps[i][0]=all_haps[i][0].split("~")
-                            #hap_list.append(all_haps[i][0])
-                            hap_list.append(all_haps[i].split("~"))
-                    if (k == 0):
+                    hap_list = create_hap_list(fq, optionDict, N_Loc)
+
+                    if k == 0:
                         H1.append(hap_list)
                     else:
                         H2.append(hap_list)
@@ -943,7 +919,7 @@ def find_option_freq(self, option, haplos, missing):
 
     def comp_hap_prob_plan_b(self, Hap,division,missing):
         if division[0] == list(set(self.index_dict.values())):#[1, 2, 3, 4, 5]:
-            return self.comp_hap_prob(Hap[0], 0, 0, 0)
+            return self.comp_hap_prob([Hap[0]], 0, 0, 0)
         haplo_probs = self.find_option_freq(division, Hap,missing)
 
         # Return {'Haps': haplos, 'Probs': probs} of the dict
@@ -1318,17 +1294,20 @@ def input_type(self, haplotype):
             type_list.append(self.index_dict[locus])
         return type_list
 
+    def open_ambiguities(self, hap, loc, split_loc):
+        return open_ambiguities(hap, loc, split_loc)  # resolves to the module-level helper imported from .cutils, not to this method
+
     def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, planb, em):
         # receives a list of phases and computes haps and
         # probabilties and accumulate cartesian productEpsilon=0.0001
         chr = self.gl2haps(gl_string)
         if chr == []:
-            return
+            return None, None
         # if we in 9-loci, check if the type input in valid format
         if self.nodes_for_plan_A:
             geno_type = self.input_type(chr['Genotype'][0])
             if not geno_type in self.nodes_for_plan_A:
-                return
+                return None, None
 
         n_loci = chr['N_Loc']
 
@@ -1337,7 +1316,7 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla
 
         # return if the result is empty (why would that be?)
         if pmags == []:
-            return
+            return None, None
 
         #res_muugs = {'Haps': 'NaN', 'Probs': 0}
         res_muugs = {'MaxProb': 0, 'Haps': {}, 'Pops': {}}
@@ -1363,7 +1342,7 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla
 
         if phases:
             if MUUG_output:
-                prior_matrix_orig = copy.deepcopy(self.priorMatrix)
+                prior_matrix_orig = np.array(self.priorMatrix, order='K', copy=True) #copy.deepcopy(self.priorMatrix)
                 res_muugs = self.call_comp_phase_prob(epsilon, n, phases, chr, True, planb)
                 if planb and len(res_muugs['Haps']) == 0:
                     self.plan = 'c'
@@ -1420,23 +1399,19 @@ def call_comp_phase_prob(self, epsilon, n, phases, chr, MUUG_output, planb):
         # no plan b
         for level in range(2):
             if level == 1:
-                if self.unk_priors == "MR":
-                    self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
-                else:
-                    self.priorMatrix = np.identity(len(self.populations))
-                #self.priorMatrix = np.ones((len(self.populations), len(self.populations)))  ####
+                self.priorMatrix = np.ones((len(self.populations), len(self.populations)))  ####
             if planb and len(res['Haps']) == 0:
                 self.plan = 'b'
                 epsilon = 1e-14
                 n_res = 0
                 min_res = 10
                 min_epsilon = 1.e-3
-                #self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
+                # self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
                 while (epsilon > 0) & (n_res < min_res):
                     epsilon /= 10
                     if (epsilon < min_epsilon):
                         epsilon = 0.0
-                    phases_planb = copy.deepcopy(phases)
+                    phases_planb = deepcopy_list(phases)
                     # Find the option according to plan b
                     if MUUG_output:
                         res = self.comp_phase_prob_plan_b(phases_planb, chr['N_Loc'], epsilon, True)
@@ -1683,7 +1658,7 @@ def impute_file(self, config,  planb=None, em_mr = False, em = False):##em
 
         with f as lines:
             for (i, name_gl) in enumerate(lines):
-                #try:
+                try:
                     name_gl = name_gl.rstrip()  # remove trailing whitespace
                     if ',' in name_gl:
                         list_gl = name_gl.split(',')
@@ -1746,9 +1721,10 @@ def impute_file(self, config,  planb=None, em_mr = False, em = False):##em
                     print(time_taken)
                     if self.verbose:
                         self.logger.info("Time taken: " + str(time_taken))
-                    """except:
-                        problem.write(str(name_gl) + "\n")
-                        continue"""
+                except Exception:  # a bare `except:` would also swallow KeyboardInterrupt/SystemExit and make the run uninterruptible
+                    print(f"{i} Subject: {subject_id} - Exception")
+                    problem.write(str(name_gl) + "\n")
+                    continue
 
             f.close()
             if MUUG_output:
diff --git a/grim/imputation/imputegl/networkx_graph.py b/grim/imputation/imputegl/networkx_graph.py
index 2eeefaf..c7c834a 100755
--- a/grim/imputation/imputegl/networkx_graph.py
+++ b/grim/imputation/imputegl/networkx_graph.py
@@ -5,18 +5,20 @@
 def missing(labelA, labelB):
     a = list(labelA)
     b = list(labelB)
-    return  [x for x in b if x not in a]
+    return [x for x in b if x not in a]
+
 
 class Graph(object):
+    __slots__ = 'graph', 'labelDict', 'whole_graph', 'full_loci', 'nodes_plan_a', 'nodes_plan_b'
 
     def __init__(self, config):
-        self.graph = nx.Graph()
+        self.graph = nx.DiGraph()
         self.labelDict = {}
-        self.whole_graph = nx.Graph()
+        self.whole_graph = nx.DiGraph()
         self.full_loci = config["full_loci"]
         self.nodes_plan_a, self.nodes_plan_b = [], []
         if config["nodes_for_plan_A"]:
-            path = ('/').join(config["node_file"].split('/')[:-1])
+            path = '/'.join(config["node_file"].split('/')[:-1])
 
             # bug: dies if file doesn't exist
             # bug: list_f doesn't exist
@@ -27,69 +29,75 @@ def __init__(self, config):
             with open(path + '/nodes_for_plan_b.txt') as list_f:
                 for item in list_f:
                     self.nodes_plan_b.append(item.strip())
-            #self.nodes_plan_a = pickle.load(open( path + '/nodes_for_plan_a.pkl', "rb"))
-            #self.nodes_plan_b = pickle.load(open( path + '/nodes_for_plan_b.pkl', "rb"))
-
+            # self.nodes_plan_a = pickle.load(open( path + '/nodes_for_plan_a.pkl', "rb"))
+            # self.nodes_plan_b = pickle.load(open( path + '/nodes_for_plan_b.pkl', "rb"))
 
-    #build graph from files of nodes and edges between nodes with top relation
+    # build graph from files of nodes and edges between nodes with top relation
     def build_graph(self, nodesFile, edgesFile, allEdgesFile):
         nodesDict = dict()
-        #add nodes from file
+        # add nodes from file
         with open(nodesFile) as nodesfile:
             readNodes = csv.reader(nodesfile, delimiter=',')
-            firstLine = next(readNodes)
+            next(readNodes)
             for row in readNodes:
                 if len(row) > 0:
                     if not self.nodes_plan_a or row[2] in self.nodes_plan_a:
-                        self.graph.add_node(row[1],label=row[2], freq=list(map(float, row[3].split(";"))))
+                        self.graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";"))))
                     if not self.nodes_plan_b or row[2] in self.nodes_plan_b:
-                        self.whole_graph.add_node(row[1],label=row[2], freq=list(map(float, row[3].split(";"))))
+                        self.whole_graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";"))))
                     nodesDict[row[0]] = row[1]
 
         nodesfile.close()
 
-
-        #add edges from file
+        # add edges from file
         with open(edgesFile) as edgesfile:
             readEdges = csv.reader(edgesfile, delimiter=',')
-            firstLine = next(readEdges)
+            next(readEdges)
             for row in readEdges:
                 if len(row) > 0:
                     node1 = nodesDict[row[0]]
                     node2 = nodesDict[row[1]]
-                    if node1 in self.graph.nodes() and node2 in self.graph.nodes():
-                        self.graph.add_edge(node1, node2)
+                    if node1 in self.graph and node2 in self.graph:
+                        if self.graph.nodes[node1]["label"] == self.full_loci:
+                            self.graph.add_edge(node2, node1)
+                        else:
+                            self.graph.add_edge(node1, node2)
 
         edgesfile.close()
 
-        #add edges from file
+        # add edges from file
         with open(allEdgesFile) as allEdgesfile:
             readEdges = csv.reader(allEdgesfile, delimiter=',')
-            firstLine = next(readEdges)
+            next(readEdges)
             for row in readEdges:
                 if len(row) > 0:
                     node1 = nodesDict[row[0]]
                     node2 = nodesDict[row[1]]
-                    kind = ("-".join(sorted([self.whole_graph.nodes[node1]['label'], self.whole_graph.nodes[node2]['label']], key=len)))
-                    self.whole_graph.add_edge(node1, node2, color = kind)
+                    if len(self.whole_graph.nodes[node1]['label']) < len(self.whole_graph.nodes[node2]['label']):
+                        connector = self.whole_graph.nodes[node2]['label'] + node1
+                        self.whole_graph.add_edge(node1, connector)
+                        self.whole_graph.add_edge(connector, node2)
+                    else:
+                        connector =  self.whole_graph.nodes[node1]['label'] + node2
+                        self.whole_graph.add_edge(node2, connector)
+                        self.whole_graph.add_edge(connector, node1)
 
         allEdgesfile.close()
-
         nodesDict.clear()
 
-    #return all haplotype by specific label
+    # return all haplotype by specific label
     def haps_by_label(self, label):
-        #cheak if already found
+        # cheak if already found
         if label in self.labelDict:
             return self.labelDict[label]
-        #not found yet. serach and save in labelDict
+        # not found yet. serach and save in labelDict
         hapsList = []
         if not self.nodes_plan_a or label in self.nodes_plan_a:
-            for key,key_data in self.graph.nodes(data=True):
+            for key, key_data in self.graph.nodes(data=True):
                 if key_data["label"] == label:
                     hapsList.append(key)
         elif label in self.nodes_plan_b:
-            for key,key_data in self.whole_graph.nodes(data=True):
+            for key, key_data in self.whole_graph.nodes(data=True):
                 if key_data["label"] == label:
                     hapsList.append(key)
         self.labelDict[label] = hapsList
@@ -108,66 +116,43 @@ def haps_with_probs_by_label(self, label):
 
         return dictAlleles
 
-    #find all adj of alleleList from label 'ABCQR'
+    # find all adj of alleleList from label 'ABCQR'
     def adjs_query(self, alleleList):
         adjDict = dict()
         for allele in alleleList:
-            if allele in self.graph.nodes():
-                if self.graph.nodes[allele]["label"] == self.full_loci:  # 'ABCQR':
-                    adjDict[allele] = self.graph.nodes[allele]['freq']
+            if allele in self.graph:
+                allele_node = self.graph.nodes[allele]
+                if allele_node["label"] == self.full_loci:  # 'ABCQR':
+                    adjDict[allele] = allele_node['freq']
                 else:
                     adjs = self.graph.adj[allele]
                     for adj in adjs:
                         adjDict[adj] = self.graph.nodes[adj]['freq']
         return adjDict
 
-
-    #find all adj of alleleList by label
+    # find all adj of alleleList by label
     def adjs_query_by_color(self, alleleList, labelA, labelB):
-       # copyLabelA = labelA
+        # copyLabelA = labelA
         adjDict = dict()
-        if (labelA == labelB):
-            return  self.node_probs(alleleList, labelA)
+        if labelA == labelB:
+            return self.node_probs(alleleList, labelA)
 
         for allele in alleleList:
-            if allele in self.whole_graph.nodes():
-                copyLabelA = labelA
-                newLabelA = labelA
-                miss = missing(labelA, labelB)
-                alleles = [allele]
-
-                while len(miss) > 0:
-                        tmpAllels = list()
-                        newLabelA = copyLabelA + miss[0]
-                        newLabelA = ''.join(sorted(newLabelA))
-                        del miss[0]
-                        for oneAllel in alleles:
-                        #    alleles.remove(oneAllel)
-                            adjs = self.whole_graph.adj[oneAllel]
-                            label = copyLabelA + '-' +newLabelA
-                            for adj in adjs:
-                                 if adjs[adj]['color'] == label:
-                                    tmpAllels.append(adj)
-                        alleles = tmpAllels
-                        copyLabelA = newLabelA
-
-
+            if allele in self.whole_graph:
+                alleles = self.whole_graph.adj.get(labelB + allele  , [])
                 for adj in alleles:
                     adjDict[adj] = self.whole_graph.nodes[adj]['freq']
         return adjDict
 
-    #return dict of nodes and there proper freq
+    # return dict of nodes and there proper freq
     def node_probs(self, nodes, label):
         nodesDict = dict()
         if not self.nodes_plan_b or label in self.nodes_plan_b:
             for node in nodes:
-                if node in self.whole_graph.nodes():
+                if node in self.whole_graph:
                     nodesDict[node] = self.whole_graph.nodes[node]['freq']
         elif label in self.nodes_plan_a:
             for node in nodes:
-                if node in self.whole_graph.nodes():
+                if node in self.whole_graph:
                     nodesDict[node] = self.graph.nodes[node]['freq']
         return nodesDict
-
-
-
diff --git a/grim/validation/runfile.py b/grim/validation/runfile.py
index 48cc3d1..16e7b82 100644
--- a/grim/validation/runfile.py
+++ b/grim/validation/runfile.py
@@ -77,7 +77,7 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra
     print("Performing imputation based on:")
     print("\tPopulation: {}".format(config["pops"]))
     print("\tPriority: {}".format(config["priority"]))
-    print("\tPriority: {}".format(config["UNK_priors"]))
+    print("\tUNK priority: {}".format(config["UNK_priors"]))
     print("\tEpsilon: {}".format(config["epsilon"]))
     print("\tPlan B: {}".format(config["planb"]))
     print("\tNumber of Results: {}".format(config["number_of_results"]))
diff --git a/setup.py b/setup.py
index 57542a2..04a17e1 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,16 @@
 
 """The setup script."""
 
-from setuptools import setup, find_packages
+from setuptools import setup
+from Cython.Build import cythonize
+# import numpy
+
+
+      # include_dirs=[numpy.get_include()],
+      # requires=['numpy', 'Cython'])
+
+
+from setuptools import setup, find_packages, Extension
 
 with open("README.md") as readme_file:
     readme = readme_file.read()
@@ -67,4 +76,5 @@
     tests_require=test_requirements,
     url="https://github.com/nmdp-bioinformatics/py-grim",
     zip_safe=False,
+    ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])])
 )

From 02bf72a699fbcb3c936f9c0739a6113cf66fa9ae Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 6 Nov 2022 13:33:21 +0200
Subject: [PATCH 02/14] Changes for efficiency

---
 setup.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 04a17e1..71d0365 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="py-graph-imputation",
-    version="0.0.4",
+    version="0.0.6",
     author="Pradeep Bashyal",
     author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",
@@ -71,7 +71,13 @@
     long_description_content_type="text/markdown",
     include_package_data=True,
     keywords="grim",
-    packages=find_packages(include=["grim"]),
+    packages=find_packages(include=[
+            "grim",
+            "grim.imputation",
+            "grim.imputation.imputegl",
+            "grim.imputation.graph_generation",
+            "grim.validation",
+        ]),
     test_suite="tests",
     tests_require=test_requirements,
     url="https://github.com/nmdp-bioinformatics/py-grim",

From bdd22d7f924039ffd370d15ba54f2be2347e3bf2 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 12:35:03 +0200
Subject: [PATCH 03/14] Changes for efficiency

---
 grim/imputation/imputegl/cutils.pyx | 1 -
 setup.py                            | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/grim/imputation/imputegl/cutils.pyx b/grim/imputation/imputegl/cutils.pyx
index d920a6a..4edb9e5 100644
--- a/grim/imputation/imputegl/cutils.pyx
+++ b/grim/imputation/imputegl/cutils.pyx
@@ -1,4 +1,3 @@
-#cython: language_level=3
 import cython
 
 
diff --git a/setup.py b/setup.py
index 71d0365..51c29a9 100644
--- a/setup.py
+++ b/setup.py
@@ -82,5 +82,5 @@
     tests_require=test_requirements,
     url="https://github.com/nmdp-bioinformatics/py-grim",
     zip_safe=False,
-    ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])])
+    ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3")
 )

From bfd15995958fada6cb9e243620bba4cb69f5987b Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 13:12:33 +0200
Subject: [PATCH 04/14] Changes for efficiency

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index ee19398..b0ed4ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 numpy>=1.20.2
 networkx==2.5.1
+cython

From 2a74587794d571c461768304b51c2082f31d4bb8 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 13:17:19 +0200
Subject: [PATCH 05/14] Changes for efficiency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 51c29a9..708a99c 100644
--- a/setup.py
+++ b/setup.py
@@ -82,5 +82,5 @@
     tests_require=test_requirements,
     url="https://github.com/nmdp-bioinformatics/py-grim",
     zip_safe=False,
-    ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3")
+    ext_modules=cythonize([Extension("cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3")
 )

From ba7c7a39074596884d4c5c3d8ff6a35ffa7db523 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:15:53 +0200
Subject: [PATCH 06/14] Changes for efficiency

---
 MANIFEST.in | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..200770d
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,6 @@
+include requirements.txt
+include README.md
+include LICENSE
+include *.txt
+recursive-include src *.py
+recursive-include src *.txt

From 766e66daad5efd5f9fb1a92fb74c90c82acae3b7 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:20:01 +0200
Subject: [PATCH 07/14] Changes for efficiency

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index b0ed4ed..27df244 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 numpy>=1.20.2
 networkx==2.5.1
 cython
+Cython

From 0dd03d9c827d179c8ceb5f24cbc3b3839f2e9fb4 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:26:32 +0200
Subject: [PATCH 08/14] Changes for efficiency

---
 grim/__init__.py                             |  2 +-
 grim/conf/__init__.py                        | 25 ++++++++++++++++++++
 grim/imputation/__init__.py                  | 25 ++++++++++++++++++++
 grim/imputation/graph_generation/__init__.py | 25 ++++++++++++++++++++
 grim/imputation/imputegl/__init__.py         |  2 +-
 grim/validation/__init__.py                  | 25 ++++++++++++++++++++
 setup.py                                     |  1 +
 7 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100755 grim/conf/__init__.py
 create mode 100755 grim/imputation/__init__.py
 create mode 100755 grim/imputation/graph_generation/__init__.py
 create mode 100755 grim/validation/__init__.py

diff --git a/grim/__init__.py b/grim/__init__.py
index 375c00f..f45a470 100644
--- a/grim/__init__.py
+++ b/grim/__init__.py
@@ -26,4 +26,4 @@
 """Top-level package for py-grim."""
 
 __organization__ = "NMDP/CIBMTR Bioinformatics"
-__version__ = "0.0.4"
+__version__ = "0.0.6"
diff --git a/grim/conf/__init__.py b/grim/conf/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/conf/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+#    This library is free software; you can redistribute it and/or modify it
+#    under the terms of the GNU Lesser General Public License as published
+#    by the Free Software Foundation; either version 3 of the License, or (at
+#    your option) any later version.
+#
+#    This library is distributed in the hope that it will be useful, but WITHOUT
+#    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#    License for more details.
+#
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this library;  if not, write to the Free Software Foundation,
+#    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+#
+#    > http://www.fsf.org/licensing/licenses/lgpl.html
+#    > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/__init__.py b/grim/imputation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/imputation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+#    This library is free software; you can redistribute it and/or modify it
+#    under the terms of the GNU Lesser General Public License as published
+#    by the Free Software Foundation; either version 3 of the License, or (at
+#    your option) any later version.
+#
+#    This library is distributed in the hope that it will be useful, but WITHOUT
+#    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#    License for more details.
+#
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this library;  if not, write to the Free Software Foundation,
+#    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+#
+#    > http://www.fsf.org/licensing/licenses/lgpl.html
+#    > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/graph_generation/__init__.py b/grim/imputation/graph_generation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/imputation/graph_generation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+#    This library is free software; you can redistribute it and/or modify it
+#    under the terms of the GNU Lesser General Public License as published
+#    by the Free Software Foundation; either version 3 of the License, or (at
+#    your option) any later version.
+#
+#    This library is distributed in the hope that it will be useful, but WITHOUT
+#    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#    License for more details.
+#
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this library;  if not, write to the Free Software Foundation,
+#    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+#
+#    > http://www.fsf.org/licensing/licenses/lgpl.html
+#    > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/imputegl/__init__.py b/grim/imputation/imputegl/__init__.py
index 1548b44..352ebc3 100755
--- a/grim/imputation/imputegl/__init__.py
+++ b/grim/imputation/imputegl/__init__.py
@@ -24,4 +24,4 @@
 
 __author__ = """Martin Maiers"""
 __email__ = 'mmaiers@nmdp.org'
-__version__ = '0.0.4'
+__version__ = '0.0.6'
diff --git a/grim/validation/__init__.py b/grim/validation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/validation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+#    This library is free software; you can redistribute it and/or modify it
+#    under the terms of the GNU Lesser General Public License as published
+#    by the Free Software Foundation; either version 3 of the License, or (at
+#    your option) any later version.
+#
+#    This library is distributed in the hope that it will be useful, but WITHOUT
+#    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+#    License for more details.
+#
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this library;  if not, write to the Free Software Foundation,
+#    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
+#
+#    > http://www.fsf.org/licensing/licenses/lgpl.html
+#    > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/setup.py b/setup.py
index 708a99c..d23c4d8 100644
--- a/setup.py
+++ b/setup.py
@@ -77,6 +77,7 @@
             "grim.imputation.imputegl",
             "grim.imputation.graph_generation",
             "grim.validation",
+            "grim.conf",
         ]),
     test_suite="tests",
     tests_require=test_requirements,

From b0c8b94970b0e5d4b996a13bfb1f776d1685eeff Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:32:44 +0200
Subject: [PATCH 09/14] Changes for efficiency

---
 MANIFEST.in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 200770d..cc4d41f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,5 +2,6 @@ include requirements.txt
 include README.md
 include LICENSE
 include *.txt
-recursive-include src *.py
-recursive-include src *.txt
+recursive-include grim *.py
+recursive-include grim *.txt
+recursive-include grim *.json

From 4941866b09d728e6e066a2ffec3be9c9ef66fb42 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:33:58 +0200
Subject: [PATCH 10/14] Changes for efficiency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d23c4d8..b40e34e 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="py-graph-imputation",
-    version="0.0.6",
+    version="0.0.7",
     author="Pradeep Bashyal",
     author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",

From df035f42bbc28c7a0284ff08ec807b376d0624e9 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:35:25 +0200
Subject: [PATCH 11/14] Changes for efficiency

---
 MANIFEST.in | 1 +
 setup.py    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index cc4d41f..392139b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,3 +5,4 @@ include *.txt
 recursive-include grim *.py
 recursive-include grim *.txt
 recursive-include grim *.json
+recursive-include grim *.pyx
diff --git a/setup.py b/setup.py
index b40e34e..7345f40 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="py-graph-imputation",
-    version="0.0.7",
+    version="0.0.8",
     author="Pradeep Bashyal",
     author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",

From 749372d87d5f28cc1291680367c4005042017977 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:37:54 +0200
Subject: [PATCH 12/14] Changes for efficiency

---
 MANIFEST.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MANIFEST.in b/MANIFEST.in
index 392139b..bc6d0e1 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,3 +6,4 @@ recursive-include grim *.py
 recursive-include grim *.txt
 recursive-include grim *.json
 recursive-include grim *.pyx
+recursive-include grim *.pyd

From 5913ba96594ba4ffc83b1c8e40c7acecd38151df Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 15:38:23 +0200
Subject: [PATCH 13/14] Changes for efficiency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7345f40..b40e34e 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="py-graph-imputation",
-    version="0.0.8",
+    version="0.0.7",
     author="Pradeep Bashyal",
     author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",

From e4da1ab8715c07610a71f407cd8162b7f824b9b0 Mon Sep 17 00:00:00 2001
From: sapiris <sapir9003@gmail.com>
Date: Sun, 13 Nov 2022 16:35:51 +0200
Subject: [PATCH 14/14] Changes for efficiency

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index b40e34e..e25cd17 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="py-graph-imputation",
-    version="0.0.7",
+    version="0.0.6",
     author="Pradeep Bashyal",
     author_email="pbashyal@nmdp.org",
     python_requires=">=3.8",
@@ -83,5 +83,5 @@
     tests_require=test_requirements,
     url="https://github.com/nmdp-bioinformatics/py-grim",
     zip_safe=False,
-    ext_modules=cythonize([Extension("cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3")
+    ext_modules=cythonize([Extension("grim.imputation.imputegl.cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3")
 )