From 5c2912fc3551f08a6ebda9e677df0a971d8579e9 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 6 Nov 2022 13:18:47 +0200 Subject: [PATCH 01/19] Changes for efficiency --- grim/imputation/imputegl/cutils.pyx | 66 +++++++++ grim/imputation/imputegl/impute.py | 156 +++++++++------------ grim/imputation/imputegl/networkx_graph.py | 113 +++++++-------- grim/validation/runfile.py | 2 +- setup.py | 12 +- 5 files changed, 193 insertions(+), 156 deletions(-) create mode 100644 grim/imputation/imputegl/cutils.pyx diff --git a/grim/imputation/imputegl/cutils.pyx b/grim/imputation/imputegl/cutils.pyx new file mode 100644 index 0000000..d920a6a --- /dev/null +++ b/grim/imputation/imputegl/cutils.pyx @@ -0,0 +1,66 @@ +#cython: language_level=3 +import cython + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef open_ambiguities(list hap, unsigned char loc, tuple split_loc): + cdef unsigned int k, i, p, j #hap_len, haps_len, splits_len + cdef Py_ssize_t hap_len, haps_len, splits_len + cdef list hap_new, hap1 + # cdef np.ndarray[STR, ndim=1] hap_new, hap1 + p = 0 + if len(split_loc) > 1: + # This opens all allele ambiguities + hap_len = len(hap[0]) + haps_len = len(hap) + splits_len = len(split_loc) + hap_new = [None] * (haps_len * splits_len) + # hap_new = np.empty(haps_len * splits_len, dtype=np.object) # produces an empty list of haplotypes + hap1 = [None] * hap_len + # hap1 = np.empty(haps_len, dtype=np.object) + for k in range(haps_len): # split a given locus in all haps. + + for j in range(hap_len): + hap1[j] = hap[k][j] + + for i in range(splits_len): + hap1[loc] = split_loc[i] + hap_new[p] = hap1[:] + p += 1 + return hap_new + return hap + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef create_hap_list(list all_haps, dict optionDict, unsigned int N_Loc): + cdef unsigned int i, j, count + cdef list hap_list = [] + cdef list all_hap_split + + for i in range(len(all_haps)): + all_hap_split = all_haps[i].split('~') + count = 0 + for j in range(len(all_hap_split)): + if all_hap_split[j] not in optionDict: + break + else: + count += 1 + + if count == N_Loc: + hap_list.append(all_hap_split) + return hap_list + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef deepcopy_list(list l): + cdef list copy_l + cdef unsigned int i, length + length = len(l) + copy_l = [None] * length + for i in range(length): + if isinstance(l[i], list): + copy_l[i] = deepcopy_list(l[i]) + else: + copy_l[i] = l[i] + return copy_l diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py index f7d0169..28bc299 100644 --- a/grim/imputation/imputegl/impute.py +++ b/grim/imputation/imputegl/impute.py @@ -7,8 +7,9 @@ import os.path import json -import numpy as np +import numpy as np +from .cutils import open_ambiguities, create_hap_list, deepcopy_list from .cypher_plan_b import CypherQueryPlanB from .cypher_query import CypherQuery @@ -92,6 +93,10 @@ def clean_up_gl(gl): class Imputation(object): + __slots__ = 'logger', 'verbose', 'populations', 'netGraph', 'priorMatrix', 'full_hapl', 'index_dict', 'full_loci', \ + 'factor', '_factor_missing_data', 'cypher', 'cypher_plan_b', 'matrix_planb', 'count_by_prob', \ + 'number_of_options_threshold', 'plan', 'option_1', 'option_2', \ + 'haplotypes_number_in_phase', 'save_space_mode', 'nodes_for_plan_A', 'unk_priors' def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): """Constructor @@ -291,23 +296,6 @@ def open_phases_1(self, haps, N_Loc, gl_string): phases.append([H1, H2]) return phases - def 
open_ambiguities(self, hap, loc): - # This opens all allele ambiguities - hap_new = [] # produces an empty list of haplotypes - for k in range(len(hap)): #slit a given locus in all haps. - splitHap = hap[k][loc].split('/') - if splitHap==hap[k][loc].split(): - split_loc = hap[k][loc].split('|') - else: - split_loc = splitHap - hap1 = hap[k] - if len(split_loc) > 1: - for i in range(len(split_loc)): - hap1[loc] = split_loc[i] - hap_new.append(hap1[:]) - else: - hap_new.append(hap1[:]) - return hap_new def comp_hap_prob(self, Hap, N_Loc, epsilon, n): haplo_probs = self.get_haplo_freqs(Hap, epsilon, n) @@ -350,11 +338,12 @@ def comp_hap_prob(self, Hap, N_Loc, epsilon, n): # return haplo_probs def get_haplo_freqs(self, haplos, epsilon, n=25000): - haplos_joined = ["~".join(item) for sublist in haplos for item in sublist] ### + haplos_joined = ["~".join(item) for item in haplos[0]] ### #haplos_joined = [item for sublist in haplos for item in sublist] ### #haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] return self.netGraph.adjs_query(haplos_joined) + # def get_haplo_freqs_miss(self, haplos, epsilon): # haplo_probs = {} # haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] @@ -681,7 +670,7 @@ def comp_phase_prob(self, phases, N_Loc, epsilon, n): # pop_res are the names of the populations return {'Haps': hap_total, 'Probs': p_total,'Pops': pop_res} - def reduce_phase_to_valid_allels(self, haps, N_Loc, planc = False): + def reduce_phase_to_valid_allels(self, haps, N_Loc, planc=False): for j in range(len(haps)): for k in range(2): hap_list = [] @@ -689,14 +678,13 @@ def reduce_phase_to_valid_allels(self, haps, N_Loc, planc = False): options = 1 for i in range(N_Loc): options = options * (len(hap_list[0][i].split("/"))) - if options>=self.number_of_options_threshold or planc: + if options >= self.number_of_options_threshold or planc: for hap_k in hap_list: - for i,g in enumerate(hap_k): + for i, g in enumerate(hap_k): gen = g.split('/') probs = self.check_if_alleles_exist(gen) if not probs == {}: - list(probs.keys()) - haps[j][k][i] = ('/').join(list(probs.keys())) + haps[j][k][i] = '/'.join(list(probs.keys())) def reduce_phase_to_commons_alleles(self, haps, N_Loc, commons_number=1, planc = False ): @@ -733,51 +721,53 @@ def open_phases(self, haps, N_Loc, gl_string): for j in range(len(haps)): H1 = [] H2 = [] - ## fq = pa.DataFrame() + ## fq = pa.DataFrame() fq = [] for k in range(2): - hap_list = [] - hap_list.append(haps[j][k]) + hap_list = [haps[j][k]] + hap_list_splits = [tuple(allele.split("/")) for allele in hap_list[0]] - #compute the number of options: + # compute the number of options: options = 1 - for i in range(N_Loc): options=options*(len(hap_list[0][i].split("/"))) + for i in range(N_Loc): + options *= len(hap_list_splits[i]) - #if the number of options is smaller than the total number of nodes: + # if the number of options is smaller than the total number of nodes: if options < self.number_of_options_threshold: - #open ambiguities regularly: + # open ambiguities regularly: for i in range(N_Loc): - hap_list = self.open_ambiguities(hap_list, i) - if (k == 0): + hap_list = self.open_ambiguities(hap_list, i, hap_list_splits[i]) + + if k == 0: H1.append(hap_list) else: H2.append(hap_list) - self.option_1 +=1 + self.option_1 += 1 - #if there are more options than actual haplotypes possible: + # if there are more options than actual haplotypes possible: else: - self.option_2 +=1 - optionDict = {} + self.option_2 += 1 + 
optionDict = {} #set() if len(fq) == 0: - list=[] - for (gen,name) in self.cypher.loc_map.items(): - count=0 + _list = [] + for (gen, name) in self.cypher.loc_map.items(): + count = 0 for i in range(len(hap_list[0])): - if hap_list[0][i].split("*")[0]==gen: - count=count+1 - if count>0: - list.append(name) - #we'll get all the options possible - #(query,lc)=self.cypher.buildQuery(["~".join(list)]) - - # fq = pa.DataFrame(self.graph.data(query)) - label = "".join(sorted(list)) + if hap_list[0][i].split("*", 1)[0] == gen: + count = count + 1 + if count > 0: + _list.append(name) + # we'll get all the options possible + # (query,lc)=self.cypher.buildQuery(["~".join(_list)]) + + # fq = pa.DataFrame(self.graph.data(query)) + label = "".join(sorted(_list)) fq = self.netGraph.haps_by_label(label) - # fq = pa.DataFrame(self.netGraph.abcdq_allele(), ) - # fq = self.netGraph.abcdq_allele() - #we'll find which of all the options are compatable with the donor + # fq = pa.DataFrame(self.netGraph.abcdq_allele(), ) + # fq = self.netGraph.abcdq_allele() + # we'll find which of all the options are compatable with the donor """gl_list=gl_string.split("^") for gen in gl_list: gens = gen.split('+') @@ -785,29 +775,15 @@ def open_phases(self, haps, N_Loc, gl_string): gen=g.split('/') for option in gen: optionDict[option]=True""" - for hap_k in hap_list: - #listType = [] - for g in hap_k: - gen = g.split('/') - for option in gen: - optionDict[option] = True - #print(listType) + + for gen in hap_list_splits: + for option in gen: + optionDict[option] = True + # optionDict.add(option) ##all_haps = fq.values.tolist() - all_haps = fq - hap_list=[] - for i in range(len(all_haps)): - count=0 - for gen in all_haps[i].split("~"): - if not gen in optionDict: - break - else: - count=count+1 - if count == N_Loc: - - # all_haps[i][0]=all_haps[i][0].split("~") - #hap_list.append(all_haps[i][0]) - hap_list.append(all_haps[i].split("~")) - if (k == 0): + hap_list = create_hap_list(fq, optionDict, N_Loc) + + if k == 0: H1.append(hap_list) else: H2.append(hap_list) @@ -943,7 +919,7 @@ def find_option_freq(self, option, haplos, missing): def comp_hap_prob_plan_b(self, Hap,division,missing): if division[0] == list(set(self.index_dict.values())):#[1, 2, 3, 4, 5]: - return self.comp_hap_prob(Hap[0], 0, 0, 0) + return self.comp_hap_prob([Hap[0]], 0, 0, 0) haplo_probs = self.find_option_freq(division, Hap,missing) # Return {'Haps': haplos, 'Probs': probs} of the dict @@ -1318,17 +1294,20 @@ def input_type(self, haplotype): type_list.append(self.index_dict[locus]) return type_list + def open_ambiguities(self, hap, loc, split_loc): + return open_ambiguities(hap, loc, split_loc) + def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, planb, em): # receives a list of phases and computes haps and # probabilties and accumulate cartesian productEpsilon=0.0001 chr = self.gl2haps(gl_string) if chr == []: - return + return None, None # if we in 9-loci, check if the type input in valid format if self.nodes_for_plan_A: geno_type = self.input_type(chr['Genotype'][0]) if not geno_type in self.nodes_for_plan_A: - return + return None, None n_loci = chr['N_Loc'] @@ -1337,7 +1316,7 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla # return if the result is empty (why would that be?) 
if pmags == []: - return + return None, None #res_muugs = {'Haps': 'NaN', 'Probs': 0} res_muugs = {'MaxProb': 0, 'Haps': {}, 'Pops': {}} @@ -1363,7 +1342,7 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla if phases: if MUUG_output: - prior_matrix_orig = copy.deepcopy(self.priorMatrix) + prior_matrix_orig = np.array(self.priorMatrix, order='K', copy=True) #copy.deepcopy(self.priorMatrix) res_muugs = self.call_comp_phase_prob(epsilon, n, phases, chr, True, planb) if planb and len(res_muugs['Haps']) == 0: self.plan = 'c' @@ -1420,23 +1399,19 @@ def call_comp_phase_prob(self, epsilon, n, phases, chr, MUUG_output, planb): # no plan b for level in range(2): if level == 1: - if self.unk_priors == "MR": - self.priorMatrix = np.ones((len(self.populations), len(self.populations))) - else: - self.priorMatrix = np.identity(len(self.populations)) - #self.priorMatrix = np.ones((len(self.populations), len(self.populations))) #### + self.priorMatrix = np.ones((len(self.populations), len(self.populations))) #### if planb and len(res['Haps']) == 0: self.plan = 'b' epsilon = 1e-14 n_res = 0 min_res = 10 min_epsilon = 1.e-3 - #self.priorMatrix = np.ones((len(self.populations), len(self.populations))) + # self.priorMatrix = np.ones((len(self.populations), len(self.populations))) while (epsilon > 0) & (n_res < min_res): epsilon /= 10 if (epsilon < min_epsilon): epsilon = 0.0 - phases_planb = copy.deepcopy(phases) + phases_planb = deepcopy_list(phases) # Find the option according to plan b if MUUG_output: res = self.comp_phase_prob_plan_b(phases_planb, chr['N_Loc'], epsilon, True) @@ -1683,7 +1658,7 @@ def impute_file(self, config, planb=None, em_mr = False, em = False):##em with f as lines: for (i, name_gl) in enumerate(lines): - #try: + try: name_gl = name_gl.rstrip() # remove trailing whitespace if ',' in name_gl: list_gl = name_gl.split(',') @@ -1746,9 +1721,10 @@ def impute_file(self, config, planb=None, em_mr = False, em = False):##em print(time_taken) if self.verbose: self.logger.info("Time taken: " + str(time_taken)) - """except: - problem.write(str(name_gl) + "\n") - continue""" + except: + print(f"{i} Subject: {subject_id} - Exception") + problem.write(str(name_gl) + "\n") + continue f.close() if MUUG_output: diff --git a/grim/imputation/imputegl/networkx_graph.py b/grim/imputation/imputegl/networkx_graph.py index 2eeefaf..c7c834a 100755 --- a/grim/imputation/imputegl/networkx_graph.py +++ b/grim/imputation/imputegl/networkx_graph.py @@ -5,18 +5,20 @@ def missing(labelA, labelB): a = list(labelA) b = list(labelB) - return [x for x in b if x not in a] + return [x for x in b if x not in a] + class Graph(object): + __slots__ = 'graph', 'labelDict', 'whole_graph', 'full_loci', 'nodes_plan_a', 'nodes_plan_b' def __init__(self, config): - self.graph = nx.Graph() + self.graph = nx.DiGraph() self.labelDict = {} - self.whole_graph = nx.Graph() + self.whole_graph = nx.DiGraph() self.full_loci = config["full_loci"] self.nodes_plan_a, self.nodes_plan_b = [], [] if config["nodes_for_plan_A"]: - path = ('/').join(config["node_file"].split('/')[:-1]) + path = '/'.join(config["node_file"].split('/')[:-1]) # bug: dies if file doesn't exist # bug: list_f doesn't exist @@ -27,69 +29,75 @@ def __init__(self, config): with open(path + '/nodes_for_plan_b.txt') as list_f: for item in list_f: self.nodes_plan_b.append(item.strip()) - #self.nodes_plan_a = pickle.load(open( path + '/nodes_for_plan_a.pkl', "rb")) - #self.nodes_plan_b = pickle.load(open( path + '/nodes_for_plan_b.pkl', "rb")) - + 
# self.nodes_plan_a = pickle.load(open( path + '/nodes_for_plan_a.pkl', "rb")) + # self.nodes_plan_b = pickle.load(open( path + '/nodes_for_plan_b.pkl', "rb")) - #build graph from files of nodes and edges between nodes with top relation + # build graph from files of nodes and edges between nodes with top relation def build_graph(self, nodesFile, edgesFile, allEdgesFile): nodesDict = dict() - #add nodes from file + # add nodes from file with open(nodesFile) as nodesfile: readNodes = csv.reader(nodesfile, delimiter=',') - firstLine = next(readNodes) + next(readNodes) for row in readNodes: if len(row) > 0: if not self.nodes_plan_a or row[2] in self.nodes_plan_a: - self.graph.add_node(row[1],label=row[2], freq=list(map(float, row[3].split(";")))) + self.graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";")))) if not self.nodes_plan_b or row[2] in self.nodes_plan_b: - self.whole_graph.add_node(row[1],label=row[2], freq=list(map(float, row[3].split(";")))) + self.whole_graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";")))) nodesDict[row[0]] = row[1] nodesfile.close() - - #add edges from file + # add edges from file with open(edgesFile) as edgesfile: readEdges = csv.reader(edgesfile, delimiter=',') - firstLine = next(readEdges) + next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] node2 = nodesDict[row[1]] - if node1 in self.graph.nodes() and node2 in self.graph.nodes(): - self.graph.add_edge(node1, node2) + if node1 in self.graph and node2 in self.graph: + if self.graph.nodes[node1]["label"] == self.full_loci: + self.graph.add_edge(node2, node1) + else: + self.graph.add_edge(node1, node2) edgesfile.close() - #add edges from file + # add edges from file with open(allEdgesFile) as allEdgesfile: readEdges = csv.reader(allEdgesfile, delimiter=',') - firstLine = next(readEdges) + next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] node2 = nodesDict[row[1]] - kind = ("-".join(sorted([self.whole_graph.nodes[node1]['label'], self.whole_graph.nodes[node2]['label']], key=len))) - self.whole_graph.add_edge(node1, node2, color = kind) + if len(self.whole_graph.nodes[node1]['label']) < len(self.whole_graph.nodes[node2]['label']): + connector = self.whole_graph.nodes[node2]['label'] + node1 + self.whole_graph.add_edge(node1, connector) + self.whole_graph.add_edge(connector, node2) + else: + connector = self.whole_graph.nodes[node1]['label'] + node2 + self.whole_graph.add_edge(node2, connector) + self.whole_graph.add_edge(connector, node1) allEdgesfile.close() - nodesDict.clear() - #return all haplotype by specific label + # return all haplotype by specific label def haps_by_label(self, label): - #cheak if already found + # cheak if already found if label in self.labelDict: return self.labelDict[label] - #not found yet. serach and save in labelDict + # not found yet. 
serach and save in labelDict hapsList = [] if not self.nodes_plan_a or label in self.nodes_plan_a: - for key,key_data in self.graph.nodes(data=True): + for key, key_data in self.graph.nodes(data=True): if key_data["label"] == label: hapsList.append(key) elif label in self.nodes_plan_b: - for key,key_data in self.whole_graph.nodes(data=True): + for key, key_data in self.whole_graph.nodes(data=True): if key_data["label"] == label: hapsList.append(key) self.labelDict[label] = hapsList @@ -108,66 +116,43 @@ def haps_with_probs_by_label(self, label): return dictAlleles - #find all adj of alleleList from label 'ABCQR' + # find all adj of alleleList from label 'ABCQR' def adjs_query(self, alleleList): adjDict = dict() for allele in alleleList: - if allele in self.graph.nodes(): - if self.graph.nodes[allele]["label"] == self.full_loci: # 'ABCQR': - adjDict[allele] = self.graph.nodes[allele]['freq'] + if allele in self.graph: + allele_node = self.graph.nodes[allele] + if allele_node["label"] == self.full_loci: # 'ABCQR': + adjDict[allele] = allele_node['freq'] else: adjs = self.graph.adj[allele] for adj in adjs: adjDict[adj] = self.graph.nodes[adj]['freq'] return adjDict - - #find all adj of alleleList by label + # find all adj of alleleList by label def adjs_query_by_color(self, alleleList, labelA, labelB): - # copyLabelA = labelA + # copyLabelA = labelA adjDict = dict() - if (labelA == labelB): - return self.node_probs(alleleList, labelA) + if labelA == labelB: + return self.node_probs(alleleList, labelA) for allele in alleleList: - if allele in self.whole_graph.nodes(): - copyLabelA = labelA - newLabelA = labelA - miss = missing(labelA, labelB) - alleles = [allele] - - while len(miss) > 0: - tmpAllels = list() - newLabelA = copyLabelA + miss[0] - newLabelA = ''.join(sorted(newLabelA)) - del miss[0] - for oneAllel in alleles: - # alleles.remove(oneAllel) - adjs = self.whole_graph.adj[oneAllel] - label = copyLabelA + '-' +newLabelA - for adj in adjs: - if adjs[adj]['color'] == label: - tmpAllels.append(adj) - alleles = tmpAllels - copyLabelA = newLabelA - - + if allele in self.whole_graph: + alleles = self.whole_graph.adj.get(labelB + allele , []) for adj in alleles: adjDict[adj] = self.whole_graph.nodes[adj]['freq'] return adjDict - #return dict of nodes and there proper freq + # return dict of nodes and there proper freq def node_probs(self, nodes, label): nodesDict = dict() if not self.nodes_plan_b or label in self.nodes_plan_b: for node in nodes: - if node in self.whole_graph.nodes(): + if node in self.whole_graph: nodesDict[node] = self.whole_graph.nodes[node]['freq'] elif label in self.nodes_plan_a: for node in nodes: - if node in self.whole_graph.nodes(): + if node in self.whole_graph: nodesDict[node] = self.graph.nodes[node]['freq'] return nodesDict - - - diff --git a/grim/validation/runfile.py b/grim/validation/runfile.py index 48cc3d1..16e7b82 100644 --- a/grim/validation/runfile.py +++ b/grim/validation/runfile.py @@ -77,7 +77,7 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra print("Performing imputation based on:") print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) - print("\tPriority: {}".format(config["UNK_priors"])) + print("\tUNK priority: {}".format(config["UNK_priors"])) print("\tEpsilon: {}".format(config["epsilon"])) print("\tPlan B: {}".format(config["planb"])) print("\tNumber of Results: {}".format(config["number_of_results"])) diff --git a/setup.py b/setup.py index 57542a2..04a17e1 100644 
--- a/setup.py +++ b/setup.py @@ -26,7 +26,16 @@ """The setup script.""" -from setuptools import setup, find_packages +from setuptools import setup +from Cython.Build import cythonize +# import numpy + + + # include_dirs=[numpy.get_include()], + # requires=['numpy', 'Cython']) + + +from setuptools import setup, find_packages, Extension with open("README.md") as readme_file: readme = readme_file.read() @@ -67,4 +76,5 @@ tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", zip_safe=False, + ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])]) ) From 02bf72a699fbcb3c936f9c0739a6113cf66fa9ae Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 6 Nov 2022 13:33:21 +0200 Subject: [PATCH 02/19] Changes for efficiency --- setup.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 04a17e1..71d0365 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.4", + version="0.0.6", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", @@ -71,7 +71,13 @@ long_description_content_type="text/markdown", include_package_data=True, keywords="grim", - packages=find_packages(include=["grim"]), + packages=find_packages(include=[ + "grim", + "grim.imputation", + "grim.imputation.imputegl", + "grim.imputation.graph_generation", + "grim.validation", + ]), test_suite="tests", tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", From bdd22d7f924039ffd370d15ba54f2be2347e3bf2 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 12:35:03 +0200 Subject: [PATCH 03/19] Changes for efficiency --- grim/imputation/imputegl/cutils.pyx | 1 - setup.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/grim/imputation/imputegl/cutils.pyx b/grim/imputation/imputegl/cutils.pyx index d920a6a..4edb9e5 100644 --- a/grim/imputation/imputegl/cutils.pyx +++ b/grim/imputation/imputegl/cutils.pyx @@ -1,4 +1,3 @@ -#cython: language_level=3 import cython diff --git a/setup.py b/setup.py index 71d0365..51c29a9 100644 --- a/setup.py +++ b/setup.py @@ -82,5 +82,5 @@ tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", zip_safe=False, - ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])]) + ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3") ) From bfd15995958fada6cb9e243620bba4cb69f5987b Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 13:12:33 +0200 Subject: [PATCH 04/19] Changes for efficiency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index ee19398..b0ed4ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ numpy>=1.20.2 networkx==2.5.1 +cython From 2a74587794d571c461768304b51c2082f31d4bb8 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 13:17:19 +0200 Subject: [PATCH 05/19] Changes for efficiency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 51c29a9..708a99c 100644 --- a/setup.py +++ b/setup.py @@ -82,5 +82,5 @@ tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", zip_safe=False, - ext_modules=cythonize([Extension("grim/imputation/imputegl/cutils", ["grim/imputation/imputegl/cutils.pyx"])], 
language_level="3") + ext_modules=cythonize([Extension("cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3") ) From ba7c7a39074596884d4c5c3d8ff6a35ffa7db523 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:15:53 +0200 Subject: [PATCH 06/19] Changes for efficiency --- MANIFEST.in | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..200770d --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements.txt +include README.md +include LICENSE +include *.txt +recursive-include src *.py +recursive-include src *.txt From 766e66daad5efd5f9fb1a92fb74c90c82acae3b7 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:20:01 +0200 Subject: [PATCH 07/19] Changes for efficiency --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b0ed4ed..27df244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.20.2 networkx==2.5.1 cython +Cython From 0dd03d9c827d179c8ceb5f24cbc3b3839f2e9fb4 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:26:32 +0200 Subject: [PATCH 08/19] Changes for efficiency --- grim/__init__.py | 2 +- grim/conf/__init__.py | 25 ++++++++++++++++++++ grim/imputation/__init__.py | 25 ++++++++++++++++++++ grim/imputation/graph_generation/__init__.py | 25 ++++++++++++++++++++ grim/imputation/imputegl/__init__.py | 2 +- grim/validation/__init__.py | 25 ++++++++++++++++++++ setup.py | 1 + 7 files changed, 103 insertions(+), 2 deletions(-) create mode 100755 grim/conf/__init__.py create mode 100755 grim/imputation/__init__.py create mode 100755 grim/imputation/graph_generation/__init__.py create mode 100755 grim/validation/__init__.py diff --git a/grim/__init__.py b/grim/__init__.py index 375c00f..f45a470 100644 --- a/grim/__init__.py +++ b/grim/__init__.py @@ -26,4 +26,4 @@ """Top-level package for py-grim.""" __organization__ = "NMDP/CIBMTR Bioinformatics" -__version__ = "0.0.4" +__version__ = "0.0.6" diff --git a/grim/conf/__init__.py b/grim/conf/__init__.py new file mode 100755 index 0000000..cedd678 --- /dev/null +++ b/grim/conf/__init__.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+#
+# > http://www.fsf.org/licensing/licenses/lgpl.html
+# > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/__init__.py b/grim/imputation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/imputation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
+#
+# This library is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+#
+# > http://www.fsf.org/licensing/licenses/lgpl.html
+# > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/graph_generation/__init__.py b/grim/imputation/graph_generation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/imputation/graph_generation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
+#
+# This library is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+#
+# > http://www.fsf.org/licensing/licenses/lgpl.html
+# > http://www.opensource.org/licenses/lgpl-license.php
+#
+
+
+__author__ = """Martin Maiers"""
+__email__ = 'mmaiers@nmdp.org'
+__version__ = '0.0.6'
diff --git a/grim/imputation/imputegl/__init__.py b/grim/imputation/imputegl/__init__.py
index 1548b44..352ebc3 100755
--- a/grim/imputation/imputegl/__init__.py
+++ b/grim/imputation/imputegl/__init__.py
@@ -24,4 +24,4 @@
 
 __author__ = """Martin Maiers"""
 __email__ = 'mmaiers@nmdp.org'
-__version__ = '0.0.4'
+__version__ = '0.0.6'
diff --git a/grim/validation/__init__.py b/grim/validation/__init__.py
new file mode 100755
index 0000000..cedd678
--- /dev/null
+++ b/grim/validation/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+#
+# This library is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 3 of the License, or (at
+# your option) any later version.
+# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# + + +__author__ = """Martin Maiers""" +__email__ = 'mmaiers@nmdp.org' +__version__ = '0.0.6' diff --git a/setup.py b/setup.py index 708a99c..d23c4d8 100644 --- a/setup.py +++ b/setup.py @@ -77,6 +77,7 @@ "grim.imputation.imputegl", "grim.imputation.graph_generation", "grim.validation", + "grim.conf", ]), test_suite="tests", tests_require=test_requirements, From b0c8b94970b0e5d4b996a13bfb1f776d1685eeff Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:32:44 +0200 Subject: [PATCH 09/19] Changes for efficiency --- MANIFEST.in | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 200770d..cc4d41f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,5 +2,6 @@ include requirements.txt include README.md include LICENSE include *.txt -recursive-include src *.py -recursive-include src *.txt +recursive-include grim *.py +recursive-include grim *.txt +recursive-include grim *.json From 4941866b09d728e6e066a2ffec3be9c9ef66fb42 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:33:58 +0200 Subject: [PATCH 10/19] Changes for efficiency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d23c4d8..b40e34e 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.6", + version="0.0.7", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", From df035f42bbc28c7a0284ff08ec807b376d0624e9 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:35:25 +0200 Subject: [PATCH 11/19] Changes for efficiency --- MANIFEST.in | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index cc4d41f..392139b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,3 +5,4 @@ include *.txt recursive-include grim *.py recursive-include grim *.txt recursive-include grim *.json +recursive-include grim *.pyx diff --git a/setup.py b/setup.py index b40e34e..7345f40 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.7", + version="0.0.8", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", From 749372d87d5f28cc1291680367c4005042017977 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:37:54 +0200 Subject: [PATCH 12/19] Changes for efficiency --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 392139b..bc6d0e1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,3 +6,4 @@ recursive-include grim *.py recursive-include grim *.txt recursive-include grim *.json recursive-include grim *.pyx +recursive-include grim *.pyd From 5913ba96594ba4ffc83b1c8e40c7acecd38151df Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 15:38:23 +0200 Subject: [PATCH 13/19] Changes for efficiency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/setup.py b/setup.py index 7345f40..b40e34e 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.8", + version="0.0.7", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", From e4da1ab8715c07610a71f407cd8162b7f824b9b0 Mon Sep 17 00:00:00 2001 From: sapiris Date: Sun, 13 Nov 2022 16:35:51 +0200 Subject: [PATCH 14/19] Changes for efficiency --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b40e34e..e25cd17 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.7", + version="0.0.6", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", @@ -83,5 +83,5 @@ tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", zip_safe=False, - ext_modules=cythonize([Extension("cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3") + ext_modules=cythonize([Extension("grim.imputation.imputegl.cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3") ) From f19b41abdd0a6a61fbcdc17a69d2a00af5504942 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 16 Nov 2022 10:07:32 -0600 Subject: [PATCH 15/19] Changes to PR from sapir. - fix Cython cyclic dependency with pyproject.toml - fix merge conflicts --- .github/workflows/python-tests.yml | 3 - .gitignore | 3 + grim/algorithm/match.py | 16 - grim/conf/__init__.py | 5 +- grim/grim.py | 37 +- .../generate_neo4j_multi_hpf.py | 272 +-- .../generate_neo4j_single_hpf.py | 194 +- .../graph_generation/nemo_to_hpf_csv.py | 50 +- .../graph_generation/wmda_download.py | 16 +- .../graph_generation/wmda_to_hpf_csv.py | 19 +- grim/imputation/imputegl/cypher_plan_b.py | 25 +- grim/imputation/imputegl/cypher_query.py | 15 +- grim/imputation/imputegl/impute.py | 1567 +++++++++++------ grim/imputation/imputegl/networkx_graph.py | 92 +- grim/imputation/setup.py | 45 +- grim/model/allele.py | 74 - grim/model/slug.py | 72 - grim/validation/parallel-imputation.py | 190 +- grim/validation/reduce_loci.py | 42 +- grim/validation/runfile.py | 161 +- grim/validation/runfile_mp.py | 87 +- grim/validation/runfile_mt.py | 87 +- grim/validation/simulation/__init__.py | 28 + grim/validation/simulation/data/__init__.py | 28 + pyproject.toml | 3 + requirements-deploy.txt | 2 - requirements-dev.txt | 7 +- requirements-tests.txt | 1 - requirements.txt | 5 +- setup.cfg | 2 +- tests/features/algorithm/SLUG Match.feature | 16 - .../definition/Class I HLA Alleles.feature | 21 - tests/steps/HLA_alleles.py | 30 - tests/steps/SLUG_match.py | 26 - ...st_my_project_template.py => test_grim.py} | 0 35 files changed, 1875 insertions(+), 1366 deletions(-) delete mode 100644 grim/algorithm/match.py delete mode 100644 grim/model/allele.py delete mode 100644 grim/model/slug.py create mode 100644 grim/validation/simulation/__init__.py create mode 100644 grim/validation/simulation/data/__init__.py create mode 100644 pyproject.toml delete mode 100644 tests/features/algorithm/SLUG Match.feature delete mode 100644 tests/features/definition/Class I HLA Alleles.feature delete mode 100644 tests/steps/HLA_alleles.py delete mode 100644 tests/steps/SLUG_match.py rename tests/unit/{test_my_project_template.py => test_grim.py} (100%) diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index 6bfdc01..cb48f53 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -28,6 +28,3 @@ 
jobs: - name: Test with pytest run: | pytest - - name: Run BDD Tests - run: | - behave diff --git a/.gitignore b/.gitignore index b33e2e9..68d97ce 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,6 @@ dmypy.json # behave pretty.output allure_report/ + +# cython temp files +grim/**/*.c diff --git a/grim/algorithm/match.py b/grim/algorithm/match.py deleted file mode 100644 index a8bb16a..0000000 --- a/grim/algorithm/match.py +++ /dev/null @@ -1,16 +0,0 @@ -from grim.model.slug import SLUG - - -# SLUG match -# match the SLUG of patient with donor - - -def slug_match(patient_slug: SLUG, donor_slug: SLUG) -> bool: - """ - SLUGs are matched if they are equal to each other - - @param patient_slug: Patient SLUG - @param donor_slug: Donor SLUG - @return: bool indicating whether they match or not - """ - return patient_slug == donor_slug diff --git a/grim/conf/__init__.py b/grim/conf/__init__.py index cedd678..54199c8 100755 --- a/grim/conf/__init__.py +++ b/grim/conf/__init__.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- +# +# grim Graph Imputation +# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved. # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published @@ -7,7 +10,7 @@ # your option) any later version. # # This library is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public # License for more details. # diff --git a/grim/grim.py b/grim/grim.py index db79f42..d177da0 100644 --- a/grim/grim.py +++ b/grim/grim.py @@ -30,29 +30,42 @@ from .imputation.imputegl.networkx_graph import Graph import os -def graph_freqs(conf_file = "", for_em = False, em_pop=None ): + +def graph_freqs(conf_file="", for_em=False, em_pop=None): if conf_file == "": - conf_file = os.path.dirname(os.path.realpath(__file__)) + '/conf/minimal-configuration.json' + conf_file = ( + os.path.dirname(os.path.realpath(__file__)) + + "/conf/minimal-configuration.json" + ) + generate_neo4j_multi_hpf.generate_graph( + config_file=conf_file, em_pop=em_pop, em=for_em + ) - generate_neo4j_multi_hpf.generate_graph(config_file=conf_file, em_pop=em_pop, - em=for_em) -def impute(conf_file = ""): +def impute(conf_file=""): project_dir_in_file, project_dir_graph = "", "" if conf_file == "": - conf_file = os.path.dirname(os.path.realpath(__file__)) + '/conf/minimal-configuration.json' - project_dir_graph = os.path.dirname(os.path.realpath(__file__)) + '/imputation/graph_generation/' - project_dir_in_file = os.path.dirname(os.path.realpath(__file__)) + '/' + conf_file = ( + os.path.dirname(os.path.realpath(__file__)) + + "/conf/minimal-configuration.json" + ) + project_dir_graph = ( + os.path.dirname(os.path.realpath(__file__)) + + "/imputation/graph_generation/" + ) + project_dir_in_file = os.path.dirname(os.path.realpath(__file__)) + "/" runfile.run_impute(conf_file, project_dir_graph, project_dir_in_file) -def impute_instance(config, graph, count_by_prob= None): + +def impute_instance(config, graph, count_by_prob=None): imputation = Imputation(graph, config, count_by_prob) return imputation + def graph_instance(config): graph = Graph(config) - graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"]) + graph.build_graph( + config["node_file"], 
config["top_links_file"], config["edges_file"] + ) return graph - - diff --git a/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py b/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py index 54fea9e..ecfd53f 100644 --- a/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py +++ b/grim/imputation/graph_generation/generate_neo4j_multi_hpf.py @@ -12,7 +12,7 @@ import sys import os -#sys.path.insert(0, os.path.join("..")) +# sys.path.insert(0, os.path.join("..")) # set haplotype frequency trimming threshold @@ -36,14 +36,14 @@ # csv files on a mid-2015 MacBook Pro (2.5 GHz Intel Core i7) and will result # in 1.8M nodes (262MB), # 26M edges (3.6B)and 107M top links (200MB). -#FULL_LOCI = 'ABCQR' +# FULL_LOCI = 'ABCQR' ############################################################################## # functions ############################################################################## # nCr: n Choose r def nCr(nchars, r): - return [''.join(x) for x in combinations(nchars, r)] + return ["".join(x) for x in combinations(nchars, r)] # dividebyzero: safe division @@ -56,19 +56,16 @@ def dividebyzero(a, b): # make_allele_list: convert haplotype into sorted list of alleles def make_allele_list(haplotype, full_name_index_dict, num_of_alleles): - hl = haplotype.split('~') + hl = haplotype.split("~") # Remove g from the allele names - hl = list(map(lambda h: h[:-1] if (h[-1] == 'g') else h, hl)) - sorted_h1 = ['0']*num_of_alleles + hl = list(map(lambda h: h[:-1] if (h[-1] == "g") else h, hl)) + sorted_h1 = ["0"] * num_of_alleles for allele in hl: - locus = allele.split('*')[0] - sorted_h1[full_name_index_dict[locus] - 1 ] = allele + locus = allele.split("*")[0] + sorted_h1[full_name_index_dict[locus] - 1] = allele return sorted_h1 - - - # Find the index of a loci name in `FULL_LOCI`. ie. `BC` in `ABCRQ` is `[1,2]` def find_indices(FULL_LOCI, loci): return list(map(lambda x: indexOf(FULL_LOCI, x), loci)) @@ -76,11 +73,13 @@ def find_indices(FULL_LOCI, loci): # Given a list of indices find the corresponding loci combo def find_loci_name(FULL_LOCI, loci_indices): - return ''.join([FULL_LOCI[i] for i in sorted(loci_indices)]) + return "".join([FULL_LOCI[i] for i in sorted(loci_indices)]) # Find all the parents of the loci with loci_indices against loci_allele_list. 
-def find_parents(FULL_LOCI, loci_indices, loci_allele_list, p, ParentLoci, top_nodes_plan_b): +def find_parents( + FULL_LOCI, loci_indices, loci_allele_list, p, ParentLoci, top_nodes_plan_b +): fi = find_indices(FULL_LOCI, FULL_LOCI) new_loci = set(fi).difference(loci_indices) parents = [] @@ -91,12 +90,13 @@ def find_parents(FULL_LOCI, loci_indices, loci_allele_list, p, ParentLoci, top_n loci_name = find_loci_name(FULL_LOCI, new_loci) if loci_name in top_nodes_plan_b: # print(loci_name) - haplotype = '~'.join([loci_allele_list[i] for i in sorted(new_loci)]) + haplotype = "~".join([loci_allele_list[i] for i in sorted(new_loci)]) parents.append(ParentLoci(loci=loci_name, haplotype=haplotype, p=p)) return parents + # Find the wanted nodes for plan A and plan B -def labels_for_grap(conf, full_loci,csvdir): +def labels_for_grap(conf, full_loci, csvdir): nodes_graph = conf.get("Plan_A_Matrix", []) @@ -108,51 +108,56 @@ def labels_for_grap(conf, full_loci,csvdir): return list_all_combo, list_all_combo, list_all_combo, list_all_combo nodes_plan_a = [] - nodes_plan_b = []# nodes from which edges to the parents came out in plan b - top_nodes_plan_b = []# nodes to which edges will reach in plan b or single locus without requested parents. - - - - matrix_plan_b = conf.get("Plan_B_Matrix", [ - [[1, 2, 3, 4, 5]], - [[1, 2, 3], [4, 5]], - [[1], [2, 3], [4, 5]], - [[1, 2, 3], [4], [5]], - [[1], [2, 3], [4], [5]], - [[1], [2], [3], [4], [5]] - ])[1] + nodes_plan_b = [] # nodes from which edges to the parents came out in plan b + top_nodes_plan_b = ( + [] + ) # nodes to which edges will reach in plan b or single locus without requested parents. + + matrix_plan_b = conf.get( + "Plan_B_Matrix", + [ + [[1, 2, 3, 4, 5]], + [[1, 2, 3], [4, 5]], + [[1], [2, 3], [4, 5]], + [[1, 2, 3], [4], [5]], + [[1], [2, 3], [4], [5]], + [[1], [2], [3], [4], [5]], + ], + )[1] # convert the first recombination event from loci indexes to loci names, # and add the event subcomponents to top_nodes_plan_b for subcomponent in matrix_plan_b: for idx, locus in enumerate(subcomponent): subcomponent[idx] = str(locus) - top_nodes_plan_b.append(('').join(subcomponent)) + top_nodes_plan_b.append(("").join(subcomponent)) list_complement = [] full_loci_list = [] full_loci_list[:0] = full_loci for node_label in nodes_graph: # add nodes of plan a - label = '' + label = "" list_label = [] for locus_idx in node_label: label += str(locus_idx) list_label.append(str(locus_idx)) nodes_plan_a.append(label) - complement_node = ('').join(sorted([item for item in full_loci_list if item not in list_label])) - if complement_node != '': + complement_node = ("").join( + sorted([item for item in full_loci_list if item not in list_label]) + ) + if complement_node != "": list_complement.append(complement_node) # find the sub-nodes of label for plan b for subcomponent in matrix_plan_b: - node = '' + node = "" for locus in label: if locus in subcomponent: node += locus - if node != '' and not node in top_nodes_plan_b: + if node != "" and not node in top_nodes_plan_b: nodes_plan_b.append(node) - #add the full-locus to plan a nodes + # add the full-locus to plan a nodes if not full_loci in nodes_plan_a: nodes_plan_a.append(full_loci) @@ -164,24 +169,29 @@ def labels_for_grap(conf, full_loci,csvdir): # add the complementary nodes if they are not in one of the graphs for node in list_complement: - if (not node in nodes_plan_a) and (not node in nodes_plan_b) and (not node in top_nodes_plan_b): + if ( + (not node in nodes_plan_a) + and (not node in nodes_plan_b) + 
and (not node in top_nodes_plan_b) + ): top_nodes_plan_b.append(node) nodes_plan_b = list(set(nodes_plan_b)) all_combo_list = list(dict.fromkeys(nodes_plan_a + nodes_plan_b + top_nodes_plan_b)) - with open(csvdir + 'nodes_for_plan_a.txt', 'w') as f: + with open(csvdir + "nodes_for_plan_a.txt", "w") as f: for item in nodes_plan_a: f.write("%s\n" % item) - with open(csvdir + 'nodes_for_plan_b.txt', 'w') as f: - for item in (nodes_plan_b + top_nodes_plan_b): + with open(csvdir + "nodes_for_plan_b.txt", "w") as f: + for item in nodes_plan_b + top_nodes_plan_b: f.write("%s\n" % item) - #pickle.dump(nodes_plan_a, open(csvdir + '/nodes_for_plan_a.pkl', "wb")) - #pickle.dump(nodes_plan_b + top_nodes_plan_b, open(csvdir + '/nodes_for_plan_b.pkl', "wb")) + # pickle.dump(nodes_plan_a, open(csvdir + '/nodes_for_plan_a.pkl', "wb")) + # pickle.dump(nodes_plan_b + top_nodes_plan_b, open(csvdir + '/nodes_for_plan_b.pkl', "wb")) return all_combo_list, nodes_plan_a, nodes_plan_b, top_nodes_plan_b -#create full loci order, -#map beteen the loci neme and his index + +# create full loci order, +# map beteen the loci neme and his index def loci_order(loc_values): full_name_index_dict = {} all_loci_set = set() @@ -189,28 +199,32 @@ def loci_order(loc_values): full_name_index_dict[locus] = val all_loci_set.add(str(val)) - FULL_LOCI = ''.join( sorted(all_loci_set)) + FULL_LOCI = "".join(sorted(all_loci_set)) return FULL_LOCI, full_name_index_dict -def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop = None, em= False): + +def generate_graph( + config_file="../../conf/minimal-configuration.json", em_pop=None, em=False +): ############################################################################## # Configure ############################################################################## # set output directory and create it if it doesn't exist - #csvdir = "output/csv" - - + # csvdir = "output/csv" # Input file - #freq_file = path + freq_file + # freq_file = path + freq_file parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", - required=False, - default= config_file, - help="Configuration JSON file", - type=str) + parser.add_argument( + "-c", + "--config", + required=False, + default=config_file, + help="Configuration JSON file", + type=str, + ) args = parser.parse_args() configuration_file = args.config @@ -221,59 +235,77 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop csvdir = conf.get("graph_files_path") pathlib.Path(csvdir).mkdir(parents=True, exist_ok=True) - if csvdir[-1] != '/': - csvdir += '/' + if csvdir[-1] != "/": + csvdir += "/" pops = conf.get("populations") if em_pop: pops = em_pop freq_trim = conf.get("freq_trim_threshold") - freq_file = conf.get("freq_file", "default" ) + freq_file = conf.get("freq_file", "default") if freq_file == "default": - freq_file = os.path.dirname(os.path.realpath(__file__)) + '/output/hpf.csv' + freq_file = os.path.dirname(os.path.realpath(__file__)) + "/output/hpf.csv" dict_count_of_pop = {} - pop_ratio_dir = conf.get("pops_count_file", os.path.dirname(os.path.realpath(__file__)) + '/imputation/graph_generation/output/pop_ratio.txt') + pop_ratio_dir = conf.get( + "pops_count_file", + os.path.dirname(os.path.realpath(__file__)) + + "/imputation/graph_generation/output/pop_ratio.txt", + ) path = pathlib.Path(pop_ratio_dir) - if em or not path.is_file(): for pop in pops: - dict_count_of_pop[pop] = freq_trim + dict_count_of_pop[pop] = freq_trim else: with open(pop_ratio_dir) as f_count: for 
line in f_count: - pop, count_pop, ratio = line.strip().split(',') + pop, count_pop, ratio = line.strip().split(",") dict_count_of_pop[pop] = freq_trim / float(count_pop) - # Display the configurations we are using - print('****************************************************************************************************') + print( + "****************************************************************************************************" + ) print("Performing graph generation based on following configuration:") print("\tPopulation: {}".format(pops)) print("\tFreq File: {}".format(freq_file)) print("\tFreq Trim Threshold: {}".format(freq_trim)) - print('****************************************************************************************************') + print( + "****************************************************************************************************" + ) ############################################################################## # setup graph elements ############################################################################## - HaplotypeNode = namedtuple('HaplotypeNode', 'node_id, freq_list, allele_list, parents, top_links') - TopLink = namedtuple('TopLink', 'node_id, haplotype') - ParentLoci = namedtuple('ParentLoci', 'loci, haplotype, p') + HaplotypeNode = namedtuple( + "HaplotypeNode", "node_id, freq_list, allele_list, parents, top_links" + ) + TopLink = namedtuple("TopLink", "node_id, haplotype") + ParentLoci = namedtuple("ParentLoci", "loci, haplotype, p") - header = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', 'CP:DOUBLE[]', ':TYPE'] + header = [":START_ID(HAPLOTYPE)", ":END_ID(HAPLOTYPE)", "CP:DOUBLE[]", ":TYPE"] FULL_LOCI, full_name_index_dict = loci_order(conf.get("loci_map")) - - all_combo_list, nodes_plan_a, nodes_plan_b, top_nodes_plan_b = labels_for_grap(conf, FULL_LOCI, csvdir) + all_combo_list, nodes_plan_a, nodes_plan_b, top_nodes_plan_b = labels_for_grap( + conf, FULL_LOCI, csvdir + ) sequence = count(0) - loci_combo_map = {combo_list: defaultdict(lambda: HaplotypeNode(node_id=next(sequence), - freq_list=[], allele_list=[], parents=[], - top_links=set())) for combo_list in all_combo_list} + loci_combo_map = { + combo_list: defaultdict( + lambda: HaplotypeNode( + node_id=next(sequence), + freq_list=[], + allele_list=[], + parents=[], + top_links=set(), + ) + ) + for combo_list in all_combo_list + } # for key in loci_combo_map: # print (key) @@ -286,7 +318,7 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop num_of_alleles = len(FULL_LOCI) with open(freq_file) as f: for hap_line in f: - haplotype, pop, freq = hap_line.split(',') + haplotype, pop, freq = hap_line.split(",") if haplotype == "hap": continue freq = float(freq) @@ -298,8 +330,8 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop continue hap_list = make_allele_list(haplotype, full_name_index_dict, num_of_alleles) - haplotype = '~'.join(hap_list) - pop_haplotype = pop + '-' + haplotype + haplotype = "~".join(hap_list) + pop_haplotype = pop + "-" + haplotype haplist_overall[haplotype] = 1 pop_hap_combos[pop_haplotype] = freq @@ -308,18 +340,19 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop freqs = [] for pop in pops: # check to see if population + haplotype combo exists - pop_haplotype = pop + '-' + haplotype + pop_haplotype = pop + "-" + haplotype if pop_haplotype in pop_hap_combos: freqs.append(pop_hap_combos[pop_haplotype]) else: freqs.append(0) hap_list = make_allele_list(haplotype, 
full_name_index_dict, num_of_alleles) - loci_combo_map[FULL_LOCI][haplotype] = HaplotypeNode(node_id=next(sequence), - freq_list=freqs, - allele_list=hap_list, - parents=None, - top_links=set()) - + loci_combo_map[FULL_LOCI][haplotype] = HaplotypeNode( + node_id=next(sequence), + freq_list=freqs, + allele_list=hap_list, + parents=None, + top_links=set(), + ) # list(loci_combo_map[FULL_LOCI].items())[99] @@ -331,20 +364,31 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop loci_indices = find_indices(FULL_LOCI, loci) for full_hap_name, full_hap_node in loci_combo_map[FULL_LOCI].items(): allele_list = full_hap_node.allele_list - haplotype = '~'.join([allele_list[i] for i in loci_indices]) + haplotype = "~".join([allele_list[i] for i in loci_indices]) haplotype_node = loci_map[haplotype] - if loci in nodes_plan_b:# find parents just for internal nodes of plan B - parents = find_parents(FULL_LOCI, loci_indices, allele_list, full_hap_node.freq_list, ParentLoci, top_nodes_plan_b) + if ( + loci in nodes_plan_b + ): # find parents just for internal nodes of plan B + parents = find_parents( + FULL_LOCI, + loci_indices, + allele_list, + full_hap_node.freq_list, + ParentLoci, + top_nodes_plan_b, + ) parents_list = haplotype_node.parents for parent in parents: parents_list.append(parent) else: parents_list = [] - if loci in nodes_plan_a:# top link just for nodes of plan A + if loci in nodes_plan_a: # top link just for nodes of plan A top_links = haplotype_node.top_links - top_links.add(TopLink(node_id=full_hap_node.node_id, haplotype=full_hap_name)) + top_links.add( + TopLink(node_id=full_hap_node.node_id, haplotype=full_hap_name) + ) else: top_links = None @@ -357,33 +401,36 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop new_freq_list = haplotype_node.freq_list freq_sum = list(map(add, new_freq_list, full_hap_node.freq_list)) - new_node = HaplotypeNode(node_id=haplotype_node.node_id, - freq_list=freq_sum, - allele_list=None, - parents=parents_list, - top_links=top_links) + new_node = HaplotypeNode( + node_id=haplotype_node.node_id, + freq_list=freq_sum, + allele_list=None, + parents=parents_list, + top_links=top_links, + ) loci_map[haplotype] = new_node - # #### Build Nodes file - header = ['haplotypeId:ID(HAPLOTYPE)', 'name', 'loci:LABEL', 'frequency:DOUBLE[]'] + header = ["haplotypeId:ID(HAPLOTYPE)", "name", "loci:LABEL", "frequency:DOUBLE[]"] node_file = csvdir + conf.get("node_csv_file") - with open(node_file, mode='w') as csvfile: + with open(node_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(header) for loci in all_combo_list: loci_map = loci_combo_map[loci] for haplotype, haplotype_node in loci_map.items(): - freq_array = ';'.join(map(str, haplotype_node.freq_list)) - csv_writer.writerow([haplotype_node.node_id, haplotype, loci, freq_array]) + freq_array = ";".join(map(str, haplotype_node.freq_list)) + csv_writer.writerow( + [haplotype_node.node_id, haplotype, loci, freq_array] + ) # #### Build Edges File - edgeheader = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', 'CP:DOUBLE[]', ':TYPE'] + edgeheader = [":START_ID(HAPLOTYPE)", ":END_ID(HAPLOTYPE)", "CP:DOUBLE[]", ":TYPE"] edge_file = csvdir + conf.get("edges_csv_file") - with open(edge_file, mode='w') as csvfile: + with open(edge_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(edgeheader) @@ -400,13 +447,15 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop loci_combo = 
parent.loci hap = parent.haplotype parent_id = loci_combo_map[loci_combo][hap].node_id - csv_writer.writerow([haplotype_node.node_id, parent_id, prob_array, 'CP']) + csv_writer.writerow( + [haplotype_node.node_id, parent_id, prob_array, "CP"] + ) # #### Generate Top Links file - topheader = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', ':TYPE'] + topheader = [":START_ID(HAPLOTYPE)", ":END_ID(HAPLOTYPE)", ":TYPE"] top_links_file = csvdir + conf.get("top_links_csv_file") - with open(top_links_file, mode='w') as csvfile: + with open(top_links_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(topheader) for loci in nodes_plan_a: @@ -416,16 +465,23 @@ def generate_graph(config_file = "../../conf/minimal-configuration.json", em_pop for haplotype, haplotype_node in list(loci_map.items()): top_links = haplotype_node.top_links for top_link in top_links: - csv_writer.writerow([haplotype_node.node_id, top_link.node_id, 'TOP']) + csv_writer.writerow( + [haplotype_node.node_id, top_link.node_id, "TOP"] + ) # #### Generate Info Node file - infonode_header = ['INFO_NODE_ID:ID(INFO_NODE)', 'populations:STRING[]', 'INFO_NODE:LABEL'] + infonode_header = [ + "INFO_NODE_ID:ID(INFO_NODE)", + "populations:STRING[]", + "INFO_NODE:LABEL", + ] top_links_file = csvdir + conf.get("info_node_csv_file") - with open(top_links_file, mode='w') as csvfile: + with open(top_links_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(infonode_header) csv_writer.writerow([1, ";".join(pops), "INFO_NODE"]) -if __name__== "__main__": + +if __name__ == "__main__": generate_graph() diff --git a/grim/imputation/graph_generation/generate_neo4j_single_hpf.py b/grim/imputation/graph_generation/generate_neo4j_single_hpf.py index d806bcc..fa2df80 100755 --- a/grim/imputation/graph_generation/generate_neo4j_single_hpf.py +++ b/grim/imputation/graph_generation/generate_neo4j_single_hpf.py @@ -1,4 +1,3 @@ - # coding: utf-8 # #### TODO @@ -8,36 +7,39 @@ # * ~~Remove ChainMap~~ # * ~~Use namedtuples~~ # * ~~Ignore 0 HF frequency from Input file~~ -# * Compare with Matlab output +# * Compare with Matlab output # * Document the code # * Extract a module # * Accepts the list of FULL_LOCI and initial Freq. 
file -from itertools import combinations, count +from itertools import combinations, count from collections import defaultdict, namedtuple from operator import indexOf, add, truediv import csv import os import pandas as pd -os.makedirs('output/csv', exist_ok=True) +os.makedirs("output/csv", exist_ok=True) -HaplotypeNode = namedtuple('HaplotypeNode', 'node_id, freq_list, allele_list, parents, top_links') -TopLink = namedtuple('TopLink', 'node_id, haplotype') -ParentLoci = namedtuple('ParentLoci', 'loci, haplotype, p') +HaplotypeNode = namedtuple( + "HaplotypeNode", "node_id, freq_list, allele_list, parents, top_links" +) +TopLink = namedtuple("TopLink", "node_id, haplotype") +ParentLoci = namedtuple("ParentLoci", "loci, haplotype, p") # #### nCr: n Choose r + def nCr(nchars, r): - return [''.join(x) for x in combinations(nchars, r)] + return ["".join(x) for x in combinations(nchars, r)] -FULL_LOCI='ABCQR' +FULL_LOCI = "ABCQR" all_combo_list = [FULL_LOCI] -for i in range(len(FULL_LOCI) -1, 0, -1): +for i in range(len(FULL_LOCI) - 1, 0, -1): all_combo_list.extend(nCr(FULL_LOCI, i)) len(all_combo_list) @@ -45,12 +47,18 @@ def nCr(nchars, r): sequence = count(0) -loci_combo_map = {combo_list: defaultdict(lambda : HaplotypeNode(node_id = next(sequence), - freq_list = [], - allele_list=[], - parents=[], - top_links=set())) - for combo_list in all_combo_list } +loci_combo_map = { + combo_list: defaultdict( + lambda: HaplotypeNode( + node_id=next(sequence), + freq_list=[], + allele_list=[], + parents=[], + top_links=set(), + ) + ) + for combo_list in all_combo_list +} # #### Verify @@ -61,9 +69,9 @@ def nCr(nchars, r): def make_allele_list(haplotype): - hl = haplotype.split('~') + hl = haplotype.split("~") # Remove g from the allele names - hl = list(map(lambda h: h[:-1] if(h[-1] == 'g') else h, hl)) + hl = list(map(lambda h: h[:-1] if (h[-1] == "g") else h, hl)) hl.sort() return hl @@ -75,28 +83,28 @@ def make_allele_list(haplotype): # #### Load Frequency file # Load the frequency file into the FULL_LOCI dictionary with the cleaned up haplotype as the key. 
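
For orientation before the hunks that follow: the loading step is small enough to sketch standalone. A minimal sketch, assuming the same "hap,pop,freq" rows and the make_allele_list cleanup above; the path mirrors the freqfile default used below, everything else is illustrative:

    # Sketch: load an hpf.csv into the two dicts the script builds below.
    def clean_haplotype(haplotype):
        # split on '~', drop a trailing 'g' from each allele name, sort
        alleles = [a[:-1] if a.endswith("g") else a for a in haplotype.split("~")]
        return "~".join(sorted(alleles))

    haplist_overall, pop_hap_combos = {}, {}
    with open("output/hpf.csv") as f:
        for line in f:
            hap, pop, freq = line.strip().split(",")
            if hap == "hap":  # header row
                continue
            if float(freq) == 0.0:  # zero-frequency rows are ignored
                continue
            hap = clean_haplotype(hap)
            haplist_overall[hap] = 1
            pop_hap_combos[pop + "-" + hap] = float(freq)
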
-haplist_overall = {} # list of haplotypes across all populations +haplist_overall = {} # list of haplotypes across all populations pop_hap_combos = {} pops = {} # pops = ['AAFA','AFB','AINDI','AISC','ALANAM','AMIND','CARB','CARHIS','CARIBI','FILII','KORI','JAPI','MENAFC', # 'NAMER','NCHI','SCAHIS','SCAMB','SCSEAI','VIET','AFA','API','CAU','HIS','NAM'] # pops = ['VIET', 'KORI', 'JAPI', 'FILII'] # pops = ['VIET', 'FILII'] -freqfile = 'output/hpf.csv' +freqfile = "output/hpf.csv" with open(freqfile) as f: for hap_line in f: - haplotype, pop, freq = hap_line.split(',') + haplotype, pop, freq = hap_line.split(",") if haplotype == "hap": continue freq = float(freq) # Ignore lines with 0 freq if freq == 0.0: continue - #if freq < 0.0001: # REMOVE THESE TWO LINES TO GENERATE COMPLETE EDGE + # if freq < 0.0001: # REMOVE THESE TWO LINES TO GENERATE COMPLETE EDGE # continue hap_list = make_allele_list(haplotype) - haplotype = '~'.join(hap_list) - pop_haplotype = pop + '-' + haplotype + haplotype = "~".join(hap_list) + pop_haplotype = pop + "-" + haplotype pops[pop] = 1 haplist_overall[haplotype] = 1 pop_hap_combos[pop_haplotype] = freq @@ -106,17 +114,19 @@ def make_allele_list(haplotype): freqs = [] for pop in pops: # check to see if population + haplotype combo exists - pop_haplotype = pop + '-' + haplotype + pop_haplotype = pop + "-" + haplotype if pop_haplotype in pop_hap_combos: freqs.append(pop_hap_combos[pop_haplotype]) else: freqs.append(0) - hap_list = make_allele_list(haplotype) - loci_combo_map[FULL_LOCI][haplotype] = HaplotypeNode(node_id = next(sequence), - freq_list = freqs, - allele_list = hap_list, - parents = None, - top_links= set()) + hap_list = make_allele_list(haplotype) + loci_combo_map[FULL_LOCI][haplotype] = HaplotypeNode( + node_id=next(sequence), + freq_list=freqs, + allele_list=hap_list, + parents=None, + top_links=set(), + ) # #### Verify @@ -128,6 +138,7 @@ def make_allele_list(haplotype): # Find the index of a loci name in `FULL_LOCI`. ie. `BC` in `ABCRQ` is `[1,2]` + def find_indices(loci): return list(map(lambda x: indexOf(FULL_LOCI, x), loci)) @@ -139,19 +150,21 @@ def find_indices(loci): print(find_indices(loci)) -find_indices('ABCQ') +find_indices("ABCQ") # Given a list of indices find the corresponding loci combo + def find_loci_name(loci_indices): - return ''.join([FULL_LOCI[i] for i in sorted(loci_indices)]) + return "".join([FULL_LOCI[i] for i in sorted(loci_indices)]) + +find_loci_name([1, 2]) -find_loci_name([1,2]) +# Find all the parents of the loci with loci_indices against loci_allele_list. -# Find all the parents of the loci with loci_indices against loci_allele_list. 
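
Concretely, the parent relation that find_parents computes next: the parents of a k-locus block are the (k+1)-locus blocks that contain it. A small worked sketch under the FULL_LOCI = "ABCQR" convention of this file:

    # Sketch: the parents of the 2-locus block "BC" (indices [1, 2]) come from
    # adding each remaining index in turn, giving "ABC", "BCQ" and "BCR".
    FULL_LOCI = "ABCQR"

    def find_loci_name(loci_indices):
        return "".join(FULL_LOCI[i] for i in sorted(loci_indices))

    parents = [find_loci_name([1, 2] + [i]) for i in (0, 3, 4)]
    print(parents)  # ['ABC', 'BCQ', 'BCR']
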
def find_parents(loci_indices, loci_allele_list, p): fi = find_indices(FULL_LOCI) @@ -160,10 +173,10 @@ def find_parents(loci_indices, loci_allele_list, p): for i in list(new_loci): new_loci = loci_indices.copy() new_loci.append(i) - #print(new_loci) + # print(new_loci) loci_name = find_loci_name(new_loci) - #print(loci_name) - haplotype = '~'.join([loci_allele_list[i] for i in sorted(new_loci)]) + # print(loci_name) + haplotype = "~".join([loci_allele_list[i] for i in sorted(new_loci)]) parents.append(ParentLoci(loci=loci_name, haplotype=haplotype, p=p)) return parents @@ -172,7 +185,7 @@ def find_parents(loci_indices, loci_allele_list, p): # Find parents of a random haplotype from C # haplotype = 'A*01:01~B*15:02~C*01:02~DQB1*03:01~DRB1*12:01' -haplotype = 'A*24:02~B*52:01~C*12:02~DQB1*06:01~DRB1*15:02' +haplotype = "A*24:02~B*52:01~C*12:02~DQB1*06:01~DRB1*15:02" # haplotype = 'A*01:01~B*15:01~C*01:02~DQB1*05:01~DRB1*01:01' @@ -182,21 +195,29 @@ def find_parents(loci_indices, loci_allele_list, p): # For each loci block, get all the allele combinations. Create parents link, and accumulate the freqs to create new Haplotype Nodes. for loci in all_combo_list: - if loci != FULL_LOCI: # FULL_LOCI is already in the dictionary + if loci != FULL_LOCI: # FULL_LOCI is already in the dictionary loci_map = loci_combo_map[loci] loci_indices = find_indices(loci) - for full_haplotype_name, full_haplotype_node in loci_combo_map[FULL_LOCI].items(): + for full_haplotype_name, full_haplotype_node in loci_combo_map[ + FULL_LOCI + ].items(): allele_list = full_haplotype_node.allele_list - haplotype = '~'.join([allele_list[i] for i in loci_indices]) - haplotype_node = loci_map[haplotype] - - parents = find_parents(loci_indices, allele_list, full_haplotype_node.freq_list) + haplotype = "~".join([allele_list[i] for i in loci_indices]) + haplotype_node = loci_map[haplotype] + + parents = find_parents( + loci_indices, allele_list, full_haplotype_node.freq_list + ) parents_list = haplotype_node.parents for parent in parents: parents_list.append(parent) - + top_links = haplotype_node.top_links - top_links.add(TopLink(node_id=full_haplotype_node.node_id, haplotype=full_haplotype_name)) + top_links.add( + TopLink( + node_id=full_haplotype_node.node_id, haplotype=full_haplotype_name + ) + ) # have to make freq_list with all zeros otherwise map add function returns empty list new_freq_list = [] @@ -205,19 +226,21 @@ def find_parents(loci_indices, loci_allele_list, p): new_freq_list.append(0) else: new_freq_list = haplotype_node.freq_list - freq_sum = list(map(add,new_freq_list,full_haplotype_node.freq_list)) + freq_sum = list(map(add, new_freq_list, full_haplotype_node.freq_list)) - new_node = HaplotypeNode(node_id=haplotype_node.node_id, - freq_list=freq_sum, - allele_list=None, - parents=parents_list, - top_links=top_links) + new_node = HaplotypeNode( + node_id=haplotype_node.node_id, + freq_list=freq_sum, + allele_list=None, + parents=parents_list, + top_links=top_links, + ) loci_map[haplotype] = new_node -haplotype="A*24:02~B*52:01~C*12:02~DQB1*06:01" -loci_combo_map['ABCQ'][haplotype] +haplotype = "A*24:02~B*52:01~C*12:02~DQB1*06:01" +loci_combo_map["ABCQ"][haplotype] # haplotype="A*24:02~B*52:01~C*12:02~DQB1*06:01~DRB1*15:02" # loci_combo_map[FULL_LOCI][haplotype] # haplotype="A*24:02" @@ -230,7 +253,7 @@ def find_parents(loci_indices, loci_allele_list, p): for loci in all_combo_list: print(loci) loci_map = loci_combo_map[loci] - for k,v in list(loci_map.items())[1:10]: + for k, v in list(loci_map.items())[1:10]: 
print(k, v.node_id, v.freq_list, sep="\t") @@ -243,32 +266,34 @@ def find_parents(loci_indices, loci_allele_list, p): # #### Build Nodes file -header = ['haplotypeId:ID(HAPLOTYPE)', 'name', 'loci:LABEL', 'frequency:DOUBLE[]'] -node_file = 'output/csv/nodes.csv' -with open(node_file, mode='w') as csvfile: +header = ["haplotypeId:ID(HAPLOTYPE)", "name", "loci:LABEL", "frequency:DOUBLE[]"] +node_file = "output/csv/nodes.csv" +with open(node_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(header) for loci in all_combo_list: loci_map = loci_combo_map[loci] for haplotype, haplotype_node in loci_map.items(): - freq_array = ';'.join(map(str,haplotype_node.freq_list)) + freq_array = ";".join(map(str, haplotype_node.freq_list)) csv_writer.writerow([haplotype_node.node_id, haplotype, loci, freq_array]) # #### Build Edges File + def dividebyzero(a, b): if b == 0: return 0 - else: - return a/b + else: + return a / b + # header = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', 'CP:DOUBLE[]', ':TYPE'] -# +# # with open(edge_file, mode='w') as csvfile: # csv_writer = csv.writer(csvfile) # csv_writer.writerow(header) -edge_file = 'output/csv/edges.csv' +edge_file = "output/csv/edges.csv" fq_l = list() for loci in all_combo_list: if loci != FULL_LOCI: @@ -283,38 +308,47 @@ def dividebyzero(a, b): loci_combo = parent.loci hap = parent.haplotype parent_id = loci_combo_map[loci_combo][hap].node_id - l = list([haplotype_node.node_id, parent_id, prob_array, 'CP']) + l = list([haplotype_node.node_id, parent_id, prob_array, "CP"]) fq_l.extend([l]) freq_df = pd.DataFrame(fq_l, columns=["HapNode", "ParentID", "P", "CP"]) -ndf = freq_df.join(freq_df.groupby(['HapNode', 'ParentID'])['P'].sum(), on=['HapNode', 'ParentID'], rsuffix='_r') -df2 = ndf[['HapNode', 'ParentID', 'P_r', 'CP']] +ndf = freq_df.join( + freq_df.groupby(["HapNode", "ParentID"])["P"].sum(), + on=["HapNode", "ParentID"], + rsuffix="_r", +) +df2 = ndf[["HapNode", "ParentID", "P_r", "CP"]] df2 = df2.drop_duplicates() -df2.columns = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', 'CP:DOUBLE[]', ':TYPE'] -df2.to_csv(edge_file, index=False, header=True, line_terminator='\n') +df2.columns = [":START_ID(HAPLOTYPE)", ":END_ID(HAPLOTYPE)", "CP:DOUBLE[]", ":TYPE"] +df2.to_csv(edge_file, index=False, header=True, line_terminator="\n") # # #### Generate Top Links file -header = [':START_ID(HAPLOTYPE)', ':END_ID(HAPLOTYPE)', ':TYPE'] -top_links_file = 'output/csv/top_links.csv' -with open(top_links_file, mode='w') as csvfile: +header = [":START_ID(HAPLOTYPE)", ":END_ID(HAPLOTYPE)", ":TYPE"] +top_links_file = "output/csv/top_links.csv" +with open(top_links_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(header) for loci in all_combo_list: - if loci != FULL_LOCI: # FULL_LOCI is already in the dictionary - # + if loci != FULL_LOCI: # FULL_LOCI is already in the dictionary + # loci_map = loci_combo_map[loci] for haplotype, haplotype_node in list(loci_map.items()): top_links = haplotype_node.top_links for top_link in top_links: - csv_writer.writerow([haplotype_node.node_id, top_link.node_id, 'TOP']) + csv_writer.writerow( + [haplotype_node.node_id, top_link.node_id, "TOP"] + ) # #### Generate Info Node file -pops = ['CAU'] -infonode_header = ['INFO_NODE_ID:ID(INFO_NODE)', 'populations:STRING[]', 'INFO_NODE:LABEL'] -top_links_file = 'output/csv/info_node.csv' -with open(top_links_file, mode='w') as csvfile: +pops = ["CAU"] +infonode_header = [ + "INFO_NODE_ID:ID(INFO_NODE)", + "populations:STRING[]", + 
"INFO_NODE:LABEL", +] +top_links_file = "output/csv/info_node.csv" +with open(top_links_file, mode="w") as csvfile: csv_writer = csv.writer(csvfile) csv_writer.writerow(infonode_header) csv_writer.writerow([1, ";".join(pops), "INFO_NODE"]) - diff --git a/grim/imputation/graph_generation/nemo_to_hpf_csv.py b/grim/imputation/graph_generation/nemo_to_hpf_csv.py index d276429..4ff233d 100644 --- a/grim/imputation/graph_generation/nemo_to_hpf_csv.py +++ b/grim/imputation/graph_generation/nemo_to_hpf_csv.py @@ -8,11 +8,14 @@ output_dir = "output/" parser = argparse.ArgumentParser() -parser.add_argument("-c", "--config", - required=False, - default="../../conf/minimal-configuration.json", - help="Configuration JSON file", - type=str) +parser.add_argument( + "-c", + "--config", + required=False, + default="../../conf/minimal-configuration.json", + help="Configuration JSON file", + type=str, +) args = parser.parse_args() configuration_file = args.config @@ -23,21 +26,27 @@ pops = conf.get("populations") freq_data_dir = project_dir + conf.get("freq_data_dir") -pop_ratio_dir = project_dir + conf.get("pops_count_file", 'imputation/graph_generation/output/pop_ratio.txt') +pop_ratio_dir = project_dir + conf.get( + "pops_count_file", "imputation/graph_generation/output/pop_ratio.txt" +) # Output in HaplotypePopulationFrequency (hpf) csv file -hpf_file = output_dir + 'hpf.csv' +hpf_file = output_dir + "hpf.csv" # Create output directory if it doesn't exist pathlib.Path(output_dir).mkdir(parents=False, exist_ok=True) # Display the configurations we are using -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) print("Conversion to HPF file based on following configuration:") print("\tPopulation: {}".format(pops)) print("\tFrequency File Directory: {}".format(freq_data_dir)) print("\tOutput File: {}".format(hpf_file)) -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) haplist_overall = {} # list of haplotypes across all populations pop_hap_combos = {} @@ -45,13 +54,13 @@ list_pop_count = [] #### Load initial frequency files for pop in pops: - freq_file = freq_data_dir + pop + '.freqs.gz' + freq_file = freq_data_dir + pop + ".freqs.gz" print("Reading Frequency File:\t {}".format(freq_file)) - with gzip.open(freq_file, 'rb') as zf: + with gzip.open(freq_file, "rb") as zf: count_pop = 0 - lines = [x.decode('utf8').strip() for x in zf.readlines()] + lines = [x.decode("utf8").strip() for x in zf.readlines()] for hap_line in lines: - haplotype, count, freq = hap_line.split(',') + haplotype, count, freq = hap_line.split(",") if haplotype == "Haplo": continue freq = float(freq) @@ -59,7 +68,7 @@ if freq == 0.0: continue - pop_haplotype = pop + '-' + haplotype + pop_haplotype = pop + "-" + haplotype haplist_overall[haplotype] = 1 pop_hap_combos[pop_haplotype] = freq @@ -67,18 +76,17 @@ list_pop_count.append(count_pop) sum_pops = sum(list_pop_count) -pop_ratio_file = open(pop_ratio_dir, 'w') -for pop,ratio in zip(pops, list_pop_count): - pop_ratio_file.write("{},{},{}\n".format(pop, ratio,(ratio/sum_pops))) +pop_ratio_file = open(pop_ratio_dir, "w") +for pop, ratio in zip(pops, list_pop_count): + pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops))) 
-header = ['hap', 'pop', 'freq'] - +header = ["hap", "pop", "freq"] print("Writing hpf File:\t {}".format(hpf_file)) -with open(hpf_file, mode='w') as csv_file: - csv_writer = csv.writer(csv_file, delimiter=',', quoting=csv.QUOTE_NONE) +with open(hpf_file, mode="w") as csv_file: + csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE) csv_writer.writerow(header) for pop_haplotype in pop_hap_combos: (pop, haplotype) = pop_haplotype.split("-") diff --git a/grim/imputation/graph_generation/wmda_download.py b/grim/imputation/graph_generation/wmda_download.py index 4b7f98d..9f6ac34 100755 --- a/grim/imputation/graph_generation/wmda_download.py +++ b/grim/imputation/graph_generation/wmda_download.py @@ -1,4 +1,4 @@ -''' +""" The WMDA data is part of the supplemental materials from the following article. @@ -14,24 +14,24 @@ (CC BY-NC-ND 4.0) license. https://creativecommons.org/licenses/by-nc-nd/4.0/ -''' +""" import os.path import urllib.request from tarfile import TarFile -url='https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5089599/bin/TAN-87-439-s006.tgz' -file_name='TAN-87-439-s006.tgz' +url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5089599/bin/TAN-87-439-s006.tgz" +file_name = "TAN-87-439-s006.tgz" # download file if os.path.isfile(file_name): - print('Skipping download step. File found: ', file_name) + print("Skipping download step. File found: ", file_name) else: - print('Downloading file', file_name, '...') + print("Downloading file", file_name, "...") urllib.request.urlretrieve(url, file_name) # unpack it -print('Unpacking file ...') +print("Unpacking file ...") TarFile.open(file_name).extractall(path="./data/wmda") -print('Done.') +print("Done.") diff --git a/grim/imputation/graph_generation/wmda_to_hpf_csv.py b/grim/imputation/graph_generation/wmda_to_hpf_csv.py index f84be78..49efa31 100755 --- a/grim/imputation/graph_generation/wmda_to_hpf_csv.py +++ b/grim/imputation/graph_generation/wmda_to_hpf_csv.py @@ -1,17 +1,18 @@ import os import pandas as pd -wmda_orig = pd.read_csv('data/wmda/freqs.txt', sep=';', names = ['hap', 'freq']) + +wmda_orig = pd.read_csv("data/wmda/freqs.txt", sep=";", names=["hap", "freq"]) # in case we want to change to read a gz version -#wmda_orig = pd.read_csv('data/wmda/freqs.txt.gz', sep=';', compression='gzip', names = ['hap', 'freq']) +# wmda_orig = pd.read_csv('data/wmda/freqs.txt.gz', sep=';', compression='gzip', names = ['hap', 'freq']) project_dir = "../../" -wmda_orig['pop'] = 'CAU' -wmda = wmda_orig[['hap', 'pop', 'freq']] +wmda_orig["pop"] = "CAU" +wmda = wmda_orig[["hap", "pop", "freq"]] -os.makedirs('output', exist_ok=True) -pop_ratio_dir = project_dir + 'imputation/graph_generation/output/pop_ratio.txt' -pop_ratio_file = open(pop_ratio_dir, 'w') -pop_ratio_file.write("{},{},{}\n".format('CAU', 20/(1e-5),1)) -wmda.to_csv('output/hpf.csv',index=False) +os.makedirs("output", exist_ok=True) +pop_ratio_dir = project_dir + "imputation/graph_generation/output/pop_ratio.txt" +pop_ratio_file = open(pop_ratio_dir, "w") +pop_ratio_file.write("{},{},{}\n".format("CAU", 20 / (1e-5), 1)) +wmda.to_csv("output/hpf.csv", index=False) diff --git a/grim/imputation/imputegl/cypher_plan_b.py b/grim/imputation/imputegl/cypher_plan_b.py index e68738a..b576e46 100755 --- a/grim/imputation/imputegl/cypher_plan_b.py +++ b/grim/imputation/imputegl/cypher_plan_b.py @@ -1,21 +1,17 @@ - - class CypherQueryPlanB(object): - ''' + """ classdocs - ''' + """ - def __init__(self, loci_map):#, indexes): - ''' + def __init__(self, loci_map): # , indexes): + """ 
Constructor - ''' + """ self.loc_map = loci_map - #self.index_dict = indexes + # self.index_dict = indexes - - - def indexToLocus(self,indexes): + def indexToLocus(self, indexes): loci = [] # Sort if the indexes are not sorted @@ -27,7 +23,7 @@ def indexToLocus(self,indexes): return loci - def findTypeFromIndexes(self,indexes): + def findTypeFromIndexes(self, indexes): full_form_loci = self.indexToLocus(indexes) return self.findFullFormType(full_form_loci) @@ -36,12 +32,11 @@ def findFullFormType(self, full_form_loci): return type def getLocus(self, allele): - """ Get the locus back from any given allele - """ + """Get the locus back from any given allele""" loc_allele = allele.split("*") return self.loc_map[loc_allele[0]] def findLinkType(self, typing): loci = sorted([self.getLocus(a) for a in typing[0].split("~")]) loci_lc = "".join([allele for allele in loci]) - return loci_lc \ No newline at end of file + return loci_lc diff --git a/grim/imputation/imputegl/cypher_query.py b/grim/imputation/imputegl/cypher_query.py index 83c112a..c9a14ad 100755 --- a/grim/imputation/imputegl/cypher_query.py +++ b/grim/imputation/imputegl/cypher_query.py @@ -5,24 +5,23 @@ class CypherQuery(object): - ''' + """ classdocs - ''' + """ - def __init__(self, loc_map, verbose: bool=False): - ''' + def __init__(self, loc_map, verbose: bool = False): + """ Constructor - ''' + """ self.verbose = verbose self.logger = logging.getLogger("Logger." + __name__) - #haplo = "A*02:01~C*03:03~B*15:01~DRB1*11:01~DQB1*03:01" + # haplo = "A*02:01~C*03:03~B*15:01~DRB1*11:01~DQB1*03:01" self.loc_map = loc_map for locus, val in self.loc_map.items(): self.loc_map[locus] = str(val) def getLocus(self, allele): - """ Get the locus back from any given allele - """ + """Get the locus back from any given allele""" loc_allele = allele.split("*") return self.loc_map[loc_allele[0]] diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py index 28bc299..ef9b696 100644 --- a/grim/imputation/imputegl/impute.py +++ b/grim/imputation/imputegl/impute.py @@ -18,11 +18,12 @@ def chunks(l, n): for i in range(0, len(l), n): - yield l[i:i+n] + yield l[i : i + n] -def write_best_prob(name_gl, res, probs, numOfResult, fout, sign = ","): + +def write_best_prob(name_gl, res, probs, numOfResult, fout, sign=","): sumProbsDict = defaultdict(list) - #loop over the result and sum the prob by populations/haplotype + # loop over the result and sum the prob by populations/haplotype for k in range(len(res)): key = res[k][0] + sign + res[k][1] if key in sumProbsDict: @@ -42,54 +43,79 @@ def write_best_prob(name_gl, res, probs, numOfResult, fout, sign = ","): multProbs.sort(key=lambda x: x[0], reverse=True) - #write the output to file + # write the output to file minBestResult = min(numOfResult, len(multProbs)) for k in range(minBestResult): - fout.write(name_gl + ',' + str(multProbs[k][1][0]) + ',' + - str(multProbs[k][0]) + ',' + str(k) + '\n') + fout.write( + name_gl + + "," + + str(multProbs[k][1][0]) + + "," + + str(multProbs[k][0]) + + "," + + str(k) + + "\n" + ) def write_best_prob_genotype(name_gl, res, numOfResult, fout): - sorted_by_value = sorted(res.items(), key=lambda kv: kv[1], reverse=True) + sorted_by_value = sorted(res.items(), key=lambda kv: kv[1], reverse=True) - # write the output to file - minBestResult = min(numOfResult, len(sorted_by_value)) - for k in range(minBestResult): - fout.write(name_gl + ',' + str(sorted_by_value[k][0]) + ',' + - str(sorted_by_value[k][1]) + ',' + str(k) + '\n') + # write the output to 
file + minBestResult = min(numOfResult, len(sorted_by_value)) + for k in range(minBestResult): + fout.write( + name_gl + + "," + + str(sorted_by_value[k][0]) + + "," + + str(sorted_by_value[k][1]) + + "," + + str(k) + + "\n" + ) def write_best_hap_race_pairs(name_gl, haps, pops, probs, numOfResult, fout): all_res = [] for i in range(len(probs)): - pair = haps[i][0] + ';' + pops[i][0] + ',' + haps[i][1] + ';' + pops[i][1] + pair = haps[i][0] + ";" + pops[i][0] + "," + haps[i][1] + ";" + pops[i][1] all_res.append([probs[i], pair]) all_res.sort(key=lambda x: x[0], reverse=True) # write the output to file minBestResult = min(numOfResult, len(all_res)) for k in range(minBestResult): - fout.write(name_gl + ',' + str(all_res[k][1]) + ',' + - str(all_res[k][0]) + ',' + str(k) + '\n') + fout.write( + name_gl + + "," + + str(all_res[k][1]) + + "," + + str(all_res[k][0]) + + "," + + str(k) + + "\n" + ) + # # given a gl string, remove g,L and UUUUs from it # def clean_up_gl(gl): - gl = gl.replace('g', '') + gl = gl.replace("g", "") # keep nulls # gl = gl.replace('N', '') - gl = gl.replace('L', '') + gl = gl.replace("L", "") unknowns = [] locus_gl = gl.split("^") for j in range(len(locus_gl)): gen = locus_gl[j] - if not gen.strip('UUUU') == gen: + if not gen.strip("UUUU") == gen: unknowns.append(gen) for miss in unknowns: locus_gl.remove(miss) - return '^'.join(locus_gl) + return "^".join(locus_gl) class Imputation(object): @@ -106,7 +132,7 @@ def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): # graph is a link to the graph and populations is the list of populations self.logger = logging.getLogger("Logger." + __name__) self.verbose = verbose - # self.graph = graph + # self.graph = graph if not config is None: # can not let populations be set on construction @@ -118,30 +144,30 @@ def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): self.unk_priors = config["UNK_priors"] # For plan b - #self.full_loci = config["full_loci"] - #loc_values = list(set(config["loci_map"].values())) - #loc_values.sort() - #indxes_loc_dict = {} - #indxes_loc_dict_planb = {} - #for i, loc in enumerate(loc_values): - #indxes_loc_dict[loc] = i+1 - #indxes_loc_dict_planb[i+1] = loc + # self.full_loci = config["full_loci"] + # loc_values = list(set(config["loci_map"].values())) + # loc_values.sort() + # indxes_loc_dict = {} + # indxes_loc_dict_planb = {} + # for i, loc in enumerate(loc_values): + # indxes_loc_dict[loc] = i+1 + # indxes_loc_dict_planb[i+1] = loc self.full_hapl = [] - #self.index_dict = {} + # self.index_dict = {} - #for loci in config["loci_map"]: - #self.full_hapl.append(loci) - #self.index_dict[loci] = indxes_loc_dict[config["loci_map"][loci]] + # for loci in config["loci_map"]: + # self.full_hapl.append(loci) + # self.index_dict[loci] = indxes_loc_dict[config["loci_map"][loci]] loc_values = config["loci_map"] ##################### indxes_loc_dict_planb = {} self.index_dict = {} - #short_name_index_dict = {} + # short_name_index_dict = {} for locus, val in loc_values.items(): self.full_hapl.append(locus) - self.index_dict[locus] = val#full_name_index_dict - #indxes_loc_dict_planb[val[1]] = val[0] #indxes_loc_short_name_dict + self.index_dict[locus] = val # full_name_index_dict + # indxes_loc_dict_planb[val[1]] = val[0] #indxes_loc_short_name_dict self.full_loci = config["full_loci"] @@ -150,7 +176,9 @@ def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): self.factor = 0.0001 self._factor_missing_data = config["factor_missing_data"] 
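        # (Usage, per the plan-B/plan-C code later in this file: self.factor is
        #  applied once per open_option_ merge of two partial haplotypes, while
        #  _factor_missing_data enters as factor ** len(missing_loci) when loci
        #  are absent from the graph.)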
self.cypher = CypherQuery(config["loci_map"]) - self.cypher_plan_b = CypherQueryPlanB(config["loci_map"])#, indxes_loc_dict_planb) + self.cypher_plan_b = CypherQueryPlanB( + config["loci_map"] + ) # , indxes_loc_dict_planb) self.matrix_planb = config["matrix_planb"] @@ -158,14 +186,14 @@ def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): self.count_by_prob = np.ones(len(self.populations)) if config["use_pops_count_file"]: with open(config["pops_count_file"]) as f_count: - for i,line in enumerate(f_count): - self.count_by_prob[i] = float(line.strip().split(',')[2]) + for i, line in enumerate(f_count): + self.count_by_prob[i] = float(line.strip().split(",")[2]) else: self.count_by_prob = count_by_prob self.number_of_options_threshold = config["number_of_options_threshold"] # The following is not reliable in parallel mode. - self.plan = 'a' + self.plan = "a" self.option_1 = 0 self.option_2 = 0 self.haplotypes_number_in_phase = config["max_haplotypes_number_in_phase"] @@ -173,14 +201,21 @@ def __init__(self, net=None,config=None, count_by_prob=None, verbose=False): self.nodes_for_plan_A = config["nodes_for_plan_A"] def print_options_count(self, subject_id): - message = "Subject: {id} plan: {plan}, open_phases - count of open regular option: {option1}, " \ - "count of alternative opening: {option2} " - print(message.format(id=subject_id, plan=self.plan, - option1=self.option_1, option2=self.option_2)) + message = ( + "Subject: {id} plan: {plan}, open_phases - count of open regular option: {option1}, " + "count of alternative opening: {option2} " + ) + print( + message.format( + id=subject_id, + plan=self.plan, + option1=self.option_1, + option2=self.option_2, + ) + ) def power_find(self, n): - """produces all powers of 2 - """ + """produces all powers of 2""" result = [] binary = bin(n)[:1:-1] for x in range(len(binary)): @@ -192,31 +227,31 @@ def gl2haps(self, GL_String): # Receives a GL string adn produces a genotype in list structure if GL_String == "" or GL_String == " ": return [] - split_hap = GL_String.split('^') + split_hap = GL_String.split("^") N_Loci = len(split_hap) t1 = [] t2 = [] count = 0 for i in range(N_Loci): - #if not split_hap[i]=='+': - if split_hap[i][0] == "+": - split_hap[i] = split_hap[i][1:] - curr_locus = split_hap[i].split('+') - if len(curr_locus) == 1: - if curr_locus == ['']: - count = count+1 - continue - else: - return [] - t1.append(curr_locus[0]) - t2.append(curr_locus[1]) + # if not split_hap[i]=='+': + if split_hap[i][0] == "+": + split_hap[i] = split_hap[i][1:] + curr_locus = split_hap[i].split("+") + if len(curr_locus) == 1: + if curr_locus == [""]: + count = count + 1 + continue + else: + return [] + t1.append(curr_locus[0]) + t2.append(curr_locus[1]) for i in range(count): - split_hap.remove('') - N_Loci = N_Loci-1 + split_hap.remove("") + N_Loci = N_Loci - 1 Gen = [sorted(t1), sorted(t2)] - return {'Genotype': Gen, 'N_Loc': N_Loci} + return {"Genotype": Gen, "N_Loc": N_Loci} - def gen_phases(self, gen, n_loci, b_phases ): + def gen_phases(self, gen, n_loci, b_phases): # haplotypes in gen is expected to be sorted # Generates all phases, but does not handle locus ambiguities if b_phases is not None: @@ -255,13 +290,13 @@ def open_gl_string(self, gl_string): return # generate the 2^(N-1) phases - chr1 = self.gen_phases(chr['Genotype'], chr['N_Loc']) + chr1 = self.gen_phases(chr["Genotype"], chr["N_Loc"]) # return if the result is empty (why would that be?) 
if chr1 == []: return - phases = self.open_phases_1(chr1, chr['N_Loc'], gl_string) + phases = self.open_phases_1(chr1, chr["N_Loc"], gl_string) return phases def open_phases_1(self, haps, N_Loc, gl_string): @@ -278,14 +313,14 @@ def open_phases_1(self, haps, N_Loc, gl_string): # compute the number of options: options = 1 - for i in range(N_Loc): options = options * (len(hap_list[0][i].split("/"))) + for i in range(N_Loc): + options = options * (len(hap_list[0][i].split("/"))) # if the number of options is smaller than the total number of nodes: - if options < 100: \ - # open ambiguities regularly: + if options < 100: # open ambiguities regularly: for i in range(N_Loc): hap_list = self.open_ambiguities(hap_list, i) - if (k == 0): + if k == 0: H1.append(hap_list) else: H2.append(hap_list) @@ -296,15 +331,32 @@ def open_phases_1(self, haps, N_Loc, gl_string): phases.append([H1, H2]) return phases + def open_ambiguities(self, hap, loc): + # This opens all allele ambiguities + hap_new = [] # produces an empty list of haplotypes + for k in range(len(hap)): # slit a given locus in all haps. + splitHap = hap[k][loc].split("/") + if splitHap == hap[k][loc].split(): + split_loc = hap[k][loc].split("|") + else: + split_loc = splitHap + hap1 = hap[k] + if len(split_loc) > 1: + for i in range(len(split_loc)): + hap1[loc] = split_loc[i] + hap_new.append(hap1[:]) + else: + hap_new.append(hap1[:]) + return hap_new def comp_hap_prob(self, Hap, N_Loc, epsilon, n): haplo_probs = self.get_haplo_freqs(Hap, epsilon, n) probs = list(haplo_probs.values()) - #probs=haplo_probs.values() + # probs=haplo_probs.values() haplos = list(haplo_probs.keys()) if not haplo_probs: - return {'Haps': '', 'Probs': ''} - return {'Haps': haplos, 'Probs': probs} + return {"Haps": "", "Probs": ""} + return {"Haps": haplos, "Probs": probs} # get_haplo_freqs # given a set of possible haplotypes, look them up @@ -338,9 +390,9 @@ def comp_hap_prob(self, Hap, N_Loc, epsilon, n): # return haplo_probs def get_haplo_freqs(self, haplos, epsilon, n=25000): - haplos_joined = ["~".join(item) for item in haplos[0]] ### - #haplos_joined = [item for sublist in haplos for item in sublist] ### - #haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] + haplos_joined = ["~".join(item) for sublist in haplos for item in sublist] ### + # haplos_joined = [item for sublist in haplos for item in sublist] ### + # haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] return self.netGraph.adjs_query(haplos_joined) @@ -358,17 +410,17 @@ def cal_prob(self, probs1, probs2, epsilon): # This is the part where we loop over all race combinations. # N*N loop, where N is the number of races/ places = [] - #print ("cal_prob\n\n") - #print ("probs1 = ", probs1) - #print ("probs2 = ", probs2) + # print ("cal_prob\n\n") + # print ("probs1 = ", probs1) + # print ("probs2 = ", probs2) for i in range(len(probs1)): for j in range(len(probs2)): - if probs1[i]*probs2[j] >= epsilon and probs1[i]*probs2[j] != 0: + if probs1[i] * probs2[j] >= epsilon and probs1[i] * probs2[j] != 0: places.append([i, j]) # places are the indices of the positions in the population array. 
return places - #move to one dim sorted list, and remove prob=0 + # move to one dim sorted list, and remove prob=0 def convert_list_to_one_dim(self, prob): ProbWithIndexes = [] @@ -376,53 +428,100 @@ def convert_list_to_one_dim(self, prob): for k in range(len(prob)): for j in range(len(prob[k])): if prob[k][j] > 0: - prob_with_indexes_by_prior.append([(prob[k][j] * self.priorMatrix[j][j]), [prob[k][j], [k, j]]]) + prob_with_indexes_by_prior.append( + [(prob[k][j] * self.priorMatrix[j][j]), [prob[k][j], [k, j]]] + ) prob_with_indexes_by_prior.sort(key=lambda x: x[0], reverse=True) - minBestResult = min(self.haplotypes_number_in_phase, len(prob_with_indexes_by_prior)) + minBestResult = min( + self.haplotypes_number_in_phase, len(prob_with_indexes_by_prior) + ) for k in range(minBestResult): ProbWithIndexes.append(prob_with_indexes_by_prior[k][1]) return ProbWithIndexes - def calc_haps_pairs(self, Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, hap_total, pop_res, maxProb, geno_seen): + def calc_haps_pairs( + self, + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + geno_seen, + ): for h in range(len(Prob1WithIndexes)): x = epsilon / Prob1WithIndexes[h][0] - x_homozygote = x*2 + x_homozygote = x * 2 prob2Len = len(Prob2WithIndexes) # k=0 # while k < prob2Len and Prob2WithIndexe s[k][0] >= x: for k in range(prob2Len): if Prob2WithIndexes[k][0] >= x: - if self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] > 0: + if ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + > 0 + ): hap1 = Haps1[Prob1WithIndexes[h][1][0]] hap2 = Haps2[Prob2WithIndexes[k][1][0]] - if (hap1 != hap2 and (self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] * - Prob2WithIndexes[k][0]) >= x) or (hap1==hap2 and (self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] * - Prob2WithIndexes[k][0]) >= x_homozygote): - - #hap1_list = sorted(hap1.split('~')) - #hap2_list = sorted(hap2.split('~')) - #genotype = ("~".join(sorted(hap1.split("~") + hap2.split("~")))) - #if hap1_list[-1].split("*")[0] == hap2_list[-1].split("*")[0]: - genotype = "^".join(map(lambda x: "+".join(sorted(x)), - zip(sorted(hap1.split('~')), - sorted(hap2.split('~'))))) + if ( + hap1 != hap2 + and ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + * Prob2WithIndexes[k][0] + ) + >= x + ) or ( + hap1 == hap2 + and ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + * Prob2WithIndexes[k][0] + ) + >= x_homozygote + ): + + # hap1_list = sorted(hap1.split('~')) + # hap2_list = sorted(hap2.split('~')) + # genotype = ("~".join(sorted(hap1.split("~") + hap2.split("~")))) + # if hap1_list[-1].split("*")[0] == hap2_list[-1].split("*")[0]: + genotype = "^".join( + map( + lambda x: "+".join(sorted(x)), + zip( + sorted(hap1.split("~")), sorted(hap2.split("~")) + ), + ) + ) race1 = self.populations[Prob1WithIndexes[h][1][1]] race2 = self.populations[Prob2WithIndexes[k][1][1]] - h1_id = hap1 + ',' +race1 - h2_id = hap2 + ',' + race2 - geno_id = ("-".join(sorted([h1_id, h2_id]))) - #geno_id = tuple(sorted([h1_id, h2_id])) + h1_id = hap1 + "," + race1 + h2_id = hap2 + "," + race2 + geno_id = "-".join(sorted([h1_id, h2_id])) + # geno_id = tuple(sorted([h1_id, h2_id])) if geno_id not in geno_seen: geno_seen.add(geno_id) - prob = Prob1WithIndexes[h][0] * Prob2WithIndexes[k][0] *self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] + prob = ( + 
Prob1WithIndexes[h][0] + * Prob2WithIndexes[k][0] + * self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + ) if hap1 != hap2: - prob = prob*2 + prob = prob * 2 if prob > maxProb: maxProb = prob @@ -433,10 +532,9 @@ def calc_haps_pairs(self, Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsi else: hap_total[genotype] = prob - races = sorted([race1, race2]) - races = races[0] + ',' + races[1] + races = races[0] + "," + races[1] if races in pop_res: sumProb = pop_res[races] + prob @@ -449,9 +547,21 @@ def calc_haps_pairs(self, Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsi return maxProb - - - def calc_haps_pairs_haplotype(self, Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, hap_total, pop_res, maxProb, haps_pairs, geno_seen, pop_res_haplo, p_total): + def calc_haps_pairs_haplotype( + self, + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + haps_pairs, + geno_seen, + pop_res_haplo, + p_total, + ): for h in range(len(Prob1WithIndexes)): x = epsilon / Prob1WithIndexes[h][0] @@ -461,33 +571,60 @@ def calc_haps_pairs_haplotype(self, Haps1, Haps2, Prob1WithIndexes, Prob2WithInd # while k < prob2Len and Prob2WithIndexe s[k][0] >= x: for k in range(prob2Len): if Prob2WithIndexes[k][0] >= x: - if self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] > 0: + if ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + > 0 + ): hap1 = Haps1[Prob1WithIndexes[h][1][0]] hap2 = Haps2[Prob2WithIndexes[k][1][0]] - if (hap1 != hap2 and (self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] * - Prob2WithIndexes[k][0]) >= x) or ( hap1 ==hap2 and - (self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] * - Prob2WithIndexes[k][0]) >= x_homozygote): - - #hap1_list = sorted(hap1.split('~')) - #hap2_list = sorted(hap2.split('~')) + if ( + hap1 != hap2 + and ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + * Prob2WithIndexes[k][0] + ) + >= x + ) or ( + hap1 == hap2 + and ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + * Prob2WithIndexes[k][0] + ) + >= x_homozygote + ): + + # hap1_list = sorted(hap1.split('~')) + # hap2_list = sorted(hap2.split('~')) # genotype = ("~".join(sorted(hap1.split("~") + hap2.split("~")))) - #if hap1_list[-1].split("*")[0] == hap2_list[-1].split("*")[0]: - genotype = ("~".join(sorted(hap1.split("~") + hap2.split("~")))) + # if hap1_list[-1].split("*")[0] == hap2_list[-1].split("*")[0]: + genotype = "~".join( + sorted(hap1.split("~") + hap2.split("~")) + ) race1 = self.populations[Prob1WithIndexes[h][1][1]] race2 = self.populations[Prob2WithIndexes[k][1][1]] - h1_id = hap1 + ',' + race1 - h2_id = hap2 + ',' + race2 - geno_id = ("-".join(sorted([h1_id, h2_id]))) + h1_id = hap1 + "," + race1 + h2_id = hap2 + "," + race2 + geno_id = "-".join(sorted([h1_id, h2_id])) # geno_id = tuple(sorted([h1_id, h2_id])) - #if len(geno_seen) > 25000000: - #k = 1 / 0 + # if len(geno_seen) > 25000000: + # k = 1 / 0 if geno_id not in geno_seen: geno_seen.add(geno_id) - prob = Prob1WithIndexes[h][0] * Prob2WithIndexes[k][0] * \ - self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] + prob = ( + Prob1WithIndexes[h][0] + * Prob2WithIndexes[k][0] + * self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + ) if hap1 != hap2: prob = prob * 2 @@ -503,7 +640,7 @@ def calc_haps_pairs_haplotype(self, Haps1, Haps2, Prob1WithIndexes, 
Prob2WithInd races = sorted([race1, race2]) - races = races[0] + ',' + races[1] + races = races[0] + "," + races[1] if races in pop_res: sumProb = pop_res[races] + prob @@ -535,27 +672,44 @@ def comp_phase_prob_haplotype(self, phases, N_Loc, epsilon, n): for i in range(len(phases)): P1 = self.comp_hap_prob(phases[i][0], N_Loc, epsilon, n) # This will open locus ambiguities and comp probabilities for Hap1 - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] # if the first haplotype returned something then look at 2nd - if len(Prob1)>0: + if len(Prob1) > 0: P2 = self.comp_hap_prob(phases[i][1], N_Loc, epsilon, n) # This will do the same for Hap 2; - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) - maxProb = self.calc_haps_pairs_haplotype(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, genotypes_total, - pop_res, maxProb, hap_total, geno_seen, pop_res_haplo, p_total) + maxProb = self.calc_haps_pairs_haplotype( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + genotypes_total, + pop_res, + maxProb, + hap_total, + geno_seen, + pop_res_haplo, + p_total, + ) # p_total returns an array of N*N (N is number of populations), hap_total - pairs of haplotypes. # pop_res are the names of the populations - #return {'MaxProb': maxProb, 'Haps': hap_total, 'Pops': pop_res} + # return {'MaxProb': maxProb, 'Haps': hap_total, 'Pops': pop_res} - return {'MaxProb': maxProb,'Haps': hap_total, 'Probs': p_total, 'Pops': pop_res_haplo} + return { + "MaxProb": maxProb, + "Haps": hap_total, + "Probs": p_total, + "Pops": pop_res_haplo, + } def comp_phase_prob_genotype(self, phases, N_Loc, epsilon, n): # receives a list of phases and computes haps and @@ -570,27 +724,34 @@ def comp_phase_prob_genotype(self, phases, N_Loc, epsilon, n): for i in range(len(phases)): P1 = self.comp_hap_prob(phases[i][0], N_Loc, epsilon, n) # This will open locus ambiguities and comp probabilities for Hap1 - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] # if the first haplotype returned something then look at 2nd - if len(Prob1)>0: + if len(Prob1) > 0: P2 = self.comp_hap_prob(phases[i][1], N_Loc, epsilon, n) # This will do the same for Hap 2; - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) - maxProb = self.calc_haps_pairs(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, hap_total, - pop_res, maxProb, geno_seen) + maxProb = self.calc_haps_pairs( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + geno_seen, + ) # p_total returns an array of N*N (N is number of populations), hap_total - pairs of haplotypes. 
# pop_res are the names of the populations - return {'MaxProb': maxProb, 'Haps': hap_total, 'Pops': pop_res} - - + return {"MaxProb": maxProb, "Haps": hap_total, "Pops": pop_res} def comp_phase_prob(self, phases, N_Loc, epsilon, n): # receives a list of phases and computes haps and @@ -603,38 +764,52 @@ def comp_phase_prob(self, phases, N_Loc, epsilon, n): for i in range(len(phases)): P1 = self.comp_hap_prob(phases[i][0], N_Loc, epsilon, n) # This will open locus ambiguities and comp probabilities for Hap1 - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] # if the first haplotype returned something then look at 2nd - if len(Prob1)>0: + if len(Prob1) > 0: P2 = self.comp_hap_prob(phases[i][1], N_Loc, epsilon, n) # This will do the same for Hap 2; - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) - # for h in range(len(Prob1)): - # for k in range(len(Prob2)): + # for h in range(len(Prob1)): + # for k in range(len(Prob2)): for h in range(len(Prob1WithIndexes)): - x = epsilon/Prob1WithIndexes[h][0] - prob2Len = len(Prob2WithIndexes) - # k=0 - # while k < prob2Len and Prob2WithIndexes[k][0] >= x: - for k in range(prob2Len): - if Prob2WithIndexes[k][0] >= x: - if self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] > 0: - if self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] > 0: - if (self.priorMatrix[Prob1WithIndexes[h][1][1]][Prob2WithIndexes[k][1][1]] * - Prob2WithIndexes[k][0]) >= x: - # NOTE: there is a bug in places - # places = self.cal_prob(Prob1[h], Prob2[k], epsilon) - #print ("places= ", places) - # for i in range(len(places)): - #Prob1WithIndexes[h][0] - # p=(places[i]) + x = epsilon / Prob1WithIndexes[h][0] + prob2Len = len(Prob2WithIndexes) + # k=0 + # while k < prob2Len and Prob2WithIndexes[k][0] >= x: + for k in range(prob2Len): + if Prob2WithIndexes[k][0] >= x: + if ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + > 0 + ): + if ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + > 0 + ): + if ( + self.priorMatrix[Prob1WithIndexes[h][1][1]][ + Prob2WithIndexes[k][1][1] + ] + * Prob2WithIndexes[k][0] + ) >= x: + # NOTE: there is a bug in places + # places = self.cal_prob(Prob1[h], Prob2[k], epsilon) + # print ("places= ", places) + # for i in range(len(places)): + # Prob1WithIndexes[h][0] + # p=(places[i]) # avoid reporting the same haplotype pair more than once # # crashing at the next line @@ -645,10 +820,16 @@ def comp_phase_prob(self, phases, N_Loc, epsilon, n): # the population list to be the same as the graph # # - #print ("p[0] = ", p[0]) - #print ("p[1] = ", p[1]) - h1_id = (Haps1[Prob1WithIndexes[h][1][0]], self.populations[Prob1WithIndexes[h][1][1]]) - h2_id = (Haps2[Prob2WithIndexes[k][1][0]], self.populations[Prob2WithIndexes[k][1][1]]) + # print ("p[0] = ", p[0]) + # print ("p[1] = ", p[1]) + h1_id = ( + Haps1[Prob1WithIndexes[h][1][0]], + self.populations[Prob1WithIndexes[h][1][1]], + ) + h2_id = ( + Haps2[Prob2WithIndexes[k][1][0]], + self.populations[Prob2WithIndexes[k][1][1]], + ) geno_id = tuple(sorted([h1_id, h2_id])) if geno_id not in geno_seen: geno_seen.add(geno_id) @@ -660,15 +841,25 @@ def comp_phase_prob(self, phases, N_Loc, epsilon, n): # should be this: # 
D000001,A*02:01~B*44:03~C*02:02~DQB1*02:01~DRB1*07:01,0.00006000,CAU,A*11:01~B*13:02~C*06:02~DQB1*05:01~DRB1*01:01,0.00009000,CAU if geno_id[0] == h1_id: - p_total.append([Prob1WithIndexes[h][0], Prob2WithIndexes[k][0]]) + p_total.append( + [ + Prob1WithIndexes[h][0], + Prob2WithIndexes[k][0], + ] + ) else: - p_total.append([Prob2WithIndexes[k][0],Prob1WithIndexes[h][0]]) - # k+=1 - else: - break + p_total.append( + [ + Prob2WithIndexes[k][0], + Prob1WithIndexes[h][0], + ] + ) + # k+=1 + else: + break # p_total returns an array of N*N (N is number of populations), hap_total - pairs of haplotypes. # pop_res are the names of the populations - return {'Haps': hap_total, 'Probs': p_total,'Pops': pop_res} + return {"Haps": hap_total, "Probs": p_total, "Pops": pop_res} def reduce_phase_to_valid_allels(self, haps, N_Loc, planc=False): for j in range(len(haps)): @@ -677,44 +868,49 @@ def reduce_phase_to_valid_allels(self, haps, N_Loc, planc=False): hap_list.append(haps[j][k]) options = 1 - for i in range(N_Loc): options = options * (len(hap_list[0][i].split("/"))) + for i in range(N_Loc): + options = options * (len(hap_list[0][i].split("/"))) if options >= self.number_of_options_threshold or planc: for hap_k in hap_list: for i, g in enumerate(hap_k): - gen = g.split('/') + gen = g.split("/") probs = self.check_if_alleles_exist(gen) if not probs == {}: - haps[j][k][i] = '/'.join(list(probs.keys())) - + list(probs.keys()) + haps[j][k][i] = ("/").join(list(probs.keys())) - def reduce_phase_to_commons_alleles(self, haps, N_Loc, commons_number=1, planc = False ): + def reduce_phase_to_commons_alleles( + self, haps, N_Loc, commons_number=1, planc=False + ): for j in range(len(haps)): for k in range(2): hap_list = [] hap_list.append(haps[j][k]) options = 1 - for i in range(N_Loc): options = options * (len(hap_list[0][i].split("/"))) - if options>=self.number_of_options_threshold or planc: + for i in range(N_Loc): + options = options * (len(hap_list[0][i].split("/"))) + if options >= self.number_of_options_threshold or planc: for hap_k in hap_list: - for i,g in enumerate(hap_k): - gen = g.split('/') + for i, g in enumerate(hap_k): + gen = g.split("/") probs = self.check_if_alleles_exist(gen) if not probs == {}: dict_allels = {} for allele in probs: list_allel = probs[allele] sum = 0 - for p,prob in enumerate(list_allel): - sum += prob * self.priorMatrix[p,p] + for p, prob in enumerate(list_allel): + sum += prob * self.priorMatrix[p, p] dict_allels[allele] = sum - commons_alleles = dict(sorted(dict_allels.items(), key=lambda kv: kv[1], reverse=True)[:commons_number]) - haps[j][k][i] = ('/').join(list(commons_alleles.keys())) - - - - - + commons_alleles = dict( + sorted( + dict_allels.items(), + key=lambda kv: kv[1], + reverse=True, + )[:commons_number] + ) + haps[j][k][i] = ("/").join(list(commons_alleles.keys())) def open_phases(self, haps, N_Loc, gl_string): phases = [] @@ -731,14 +927,13 @@ def open_phases(self, haps, N_Loc, gl_string): # compute the number of options: options = 1 for i in range(N_Loc): - options *= len(hap_list_splits[i]) + options = options * (len(hap_list[0][i].split("/"))) # if the number of options is smaller than the total number of nodes: if options < self.number_of_options_threshold: # open ambiguities regularly: for i in range(N_Loc): - hap_list = self.open_ambiguities(hap_list, i, hap_list_splits[i]) - + hap_list = self.open_ambiguities(hap_list, i) if k == 0: H1.append(hap_list) else: @@ -749,21 +944,21 @@ def open_phases(self, haps, N_Loc, gl_string): # if there are more 
options than actual haplotypes possible: else: self.option_2 += 1 - optionDict = {} #set() + optionDict = {} if len(fq) == 0: - _list = [] + list = [] for (gen, name) in self.cypher.loc_map.items(): count = 0 for i in range(len(hap_list[0])): - if hap_list[0][i].split("*", 1)[0] == gen: + if hap_list[0][i].split("*")[0] == gen: count = count + 1 if count > 0: - _list.append(name) + list.append(name) # we'll get all the options possible - # (query,lc)=self.cypher.buildQuery(["~".join(_list)]) + # (query,lc)=self.cypher.buildQuery(["~".join(list)]) # fq = pa.DataFrame(self.graph.data(query)) - label = "".join(sorted(_list)) + label = "".join(sorted(list)) fq = self.netGraph.haps_by_label(label) # fq = pa.DataFrame(self.netGraph.abcdq_allele(), ) # fq = self.netGraph.abcdq_allele() @@ -775,14 +970,28 @@ def open_phases(self, haps, N_Loc, gl_string): gen=g.split('/') for option in gen: optionDict[option]=True""" - - for gen in hap_list_splits: - for option in gen: - optionDict[option] = True - # optionDict.add(option) + for hap_k in hap_list: + # listType = [] + for g in hap_k: + gen = g.split("/") + for option in gen: + optionDict[option] = True + # print(listType) ##all_haps = fq.values.tolist() - hap_list = create_hap_list(fq, optionDict, N_Loc) - + all_haps = fq + hap_list = [] + for i in range(len(all_haps)): + count = 0 + for gen in all_haps[i].split("~"): + if not gen in optionDict: + break + else: + count = count + 1 + if count == N_Loc: + + # all_haps[i][0]=all_haps[i][0].split("~") + # hap_list.append(all_haps[i][0]) + hap_list.append(all_haps[i].split("~")) if k == 0: H1.append(hap_list) else: @@ -791,15 +1000,13 @@ def open_phases(self, haps, N_Loc, gl_string): phases.append([H1, H2]) return phases + def find_type_loci(self, loci): + return (loci.split("*"))[0] + def find_missing_indexes(self, haplo): - def find_type_loci(self,loci): - return (loci.split("*"))[0] - - def find_missing_indexes(self,haplo): - - locus_in_haplo =[] - index=[] + locus_in_haplo = [] + index = [] for locus in haplo: locus_in_haplo.append(self.index_dict[self.find_type_loci(locus)]) for locus in self.full_hapl: @@ -814,34 +1021,34 @@ def open_dict_data(self, haplo_probs): probs = list(haplo_probs.values()) haplos = list(haplo_probs.keys()) if not haplo_probs: - return {'Haps': '', 'Probs': ''} - return {'Haps': haplos, 'Probs': probs} + return {"Haps": "", "Probs": ""} + return {"Haps": haplos, "Probs": probs} def create_haplos_string(self, haplos, division, missing): - string_haplos = [] - wanted_haplos = haplos[0] - # Go over all the hap - for hap in wanted_haplos: - # The first item - hap_string = "" - # Create the wanted string - for i in range(0, len(division)): - place = division[i] - # If this is not a missing option - if division[i] not in missing: - # Normalize the places - for miss in missing: - if division[i] > miss: - place = place - 1 - - # [division[i]]- to get the index for the item in haplox we want - if hap_string == "": - hap_string = str(hap[place - 1]) - else: - hap_string = hap_string + '~' + str(hap[place - 1]) - if not hap_string == "": - string_haplos.append(hap_string) - return string_haplos + string_haplos = [] + wanted_haplos = haplos[0] + # Go over all the hap + for hap in wanted_haplos: + # The first item + hap_string = "" + # Create the wanted string + for i in range(0, len(division)): + place = division[i] + # If this is not a missing option + if division[i] not in missing: + # Normalize the places + for miss in missing: + if division[i] > miss: + place = place - 1 + + # 
[division[i]]- to get the index for the item in haplox we want + if hap_string == "": + hap_string = str(hap[place - 1]) + else: + hap_string = hap_string + "~" + str(hap[place - 1]) + if not hap_string == "": + string_haplos.append(hap_string) + return string_haplos def open_option_(self, dict2, dict1, planc=False, num_of_options=10): # Will contain the two option combine @@ -869,118 +1076,125 @@ def open_option_(self, dict2, dict1, planc=False, num_of_options=10): freq2 = dict2[key2] list_prob = [freq1[i] * freq2[i] * self.factor for i in range(size)] if max(list_prob) > 0: - key = ('~').join(sorted(key1.split('~') + key2.split('~'))) + key = ("~").join(sorted(key1.split("~") + key2.split("~"))) dict_all[key] = list_prob return dict_all # Find the freq of option from the matrix def find_option_freq(self, option, haplos, missing): - #one = True - # Here we calculate the freq of first the first division of the option - # The first division - division = option[0] - - # The string of the haplos - string_option = self.create_haplos_string(haplos, division, missing) - - # Find the dict of the haplos and their freqs - dict_all = self.get_haplo_freqs_pan_b(string_option, division) - - # If we found the freq of the first optin- if no-no need to continue - if dict_all != {}: - # Go over all the parts of the haplo according to option - # Skip the first options - for i in range(1, len(option)): - # Create the string of the option - # Example: option [1,2,3] will get :A*01:01~B*07:02~C*07:02 - # Example: option [5] will get DRB1*13:02 - division = option[i] - string_option = self.create_haplos_string(haplos, division, missing) - if string_option == []: - t = 0 - div_dict = self.get_haplo_freqs_pan_b(string_option, division) - - # If this part of the option is empty-the option is not good - if div_dict == {}: - result = all(elem in missing for elem in division) - if result:# and one:#len(division) == 1 and len(missing) == 1 and division[0] == missing[0]: - type = self.cypher_plan_b.findTypeFromIndexes(division) - div_dict = self.netGraph.haps_with_probs_by_label(type) - #one = False - else: - # There is no freqs - dict_all = {} - - # No need to continue - break - # Add the new data to old data and make it a new "dict_all" - dict_all = self.open_option_(div_dict, dict_all) - return dict_all + # one = True + # Here we calculate the freq of first the first division of the option + # The first division + division = option[0] + + # The string of the haplos + string_option = self.create_haplos_string(haplos, division, missing) + + # Find the dict of the haplos and their freqs + dict_all = self.get_haplo_freqs_pan_b(string_option, division) + + # If we found the freq of the first optin- if no-no need to continue + if dict_all != {}: + # Go over all the parts of the haplo according to option + # Skip the first options + for i in range(1, len(option)): + # Create the string of the option + # Example: option [1,2,3] will get :A*01:01~B*07:02~C*07:02 + # Example: option [5] will get DRB1*13:02 + division = option[i] + string_option = self.create_haplos_string(haplos, division, missing) + if string_option == []: + t = 0 + div_dict = self.get_haplo_freqs_pan_b(string_option, division) + + # If this part of the option is empty-the option is not good + if div_dict == {}: + result = all(elem in missing for elem in division) + if ( + result + ): # and one:#len(division) == 1 and len(missing) == 1 and division[0] == missing[0]: + type = self.cypher_plan_b.findTypeFromIndexes(division) + div_dict = 
self.netGraph.haps_with_probs_by_label(type) + # one = False + else: + # There is no freqs + dict_all = {} - def comp_hap_prob_plan_b(self, Hap,division,missing): - if division[0] == list(set(self.index_dict.values())):#[1, 2, 3, 4, 5]: - return self.comp_hap_prob([Hap[0]], 0, 0, 0) - haplo_probs = self.find_option_freq(division, Hap,missing) + # No need to continue + break + # Add the new data to old data and make it a new "dict_all" + dict_all = self.open_option_(div_dict, dict_all) + return dict_all + + def comp_hap_prob_plan_b(self, Hap, division, missing): + if division[0] == list(set(self.index_dict.values())): # [1, 2, 3, 4, 5]: + return self.comp_hap_prob(Hap[0], 0, 0, 0) + haplo_probs = self.find_option_freq(division, Hap, missing) # Return {'Haps': haplos, 'Probs': probs} of the dict return self.open_dict_data(haplo_probs) - def missing_from_data_to_string(self,hap,not_in_data): + def missing_from_data_to_string(self, hap, not_in_data): str_hap = "" str_not_in = [] - str_list=[] + str_list = [] for luci in hap: type_luci = luci.split("*")[0] if self.index_dict[type_luci] in not_in_data: str_not_in.append(luci) else: - str_hap+="~"+str(luci) + str_hap += "~" + str(luci) str_list.append(str_hap[1:]) - str_not_in=list(set(str_not_in)) - - return[str_list,str_not_in] - + str_not_in = list(set(str_not_in)) + return [str_list, str_not_in] - def find_option_freq_missing_data(self,option, haplos, missing,not_in_data): + def find_option_freq_missing_data(self, option, haplos, missing, not_in_data): # Here we calculate the freq of first the first division of the option # The first division - all_the_data = set(self.index_dict.values())#[1,2,3,4,5] + all_the_data = set(self.index_dict.values()) # [1,2,3,4,5] # All data that we dont want to - #all_missing = list(set(missing + not_in_data)) - all_missing = list(set( not_in_data)) + # all_missing = list(set(missing + not_in_data)) + all_missing = list(set(not_in_data)) all_the_data = [x for x in all_the_data if x not in all_missing] dict_res = dict() for hap in haplos[0]: # The string of hap - string_option = self.missing_from_data_to_string(hap,not_in_data) + string_option = self.missing_from_data_to_string(hap, not_in_data) # Find the dict of hap and his freqs - if len(string_option[0]) > 0 and string_option[0][0] != '': - dict_all = self.get_haplo_freqs_pan_b(string_option[0],all_the_data) + if len(string_option[0]) > 0 and string_option[0][0] != "": + dict_all = self.get_haplo_freqs_pan_b(string_option[0], all_the_data) for key in dict_all.keys(): list_key = key.split("~") - list_key=list_key[:not_in_data[0]-1]+string_option[1]+list_key[not_in_data[0]-1:] - dict_res["~".join(sorted(list_key))]=[x*(self._factor_missing_data**len(all_missing)) for x in dict_all[key]] + list_key = ( + list_key[: not_in_data[0] - 1] + + string_option[1] + + list_key[not_in_data[0] - 1 :] + ) + dict_res["~".join(sorted(list_key))] = [ + x * (self._factor_missing_data ** len(all_missing)) + for x in dict_all[key] + ] return dict_res - def comp_hap_prob_plan_b_missing_data(self, Hap,division,missing,not_in_data): + def comp_hap_prob_plan_b_missing_data(self, Hap, division, missing, not_in_data): - haplo_probs = self.find_option_freq_missing_data(division, Hap,missing,not_in_data) + haplo_probs = self.find_option_freq_missing_data( + division, Hap, missing, not_in_data + ) # Return {'Haps': haplos, 'Probs': probs} of the dict return self.open_dict_data(haplo_probs) - - def read_matrix(self,index): + def read_matrix(self, index): # Init div_option = [] - # If the 
index is in the size of the matrix if len(self.matrix_planb) > index: div_option = self.matrix_planb[index] @@ -988,7 +1202,7 @@ def read_matrix(self,index): # Return the division return div_option - def check_full_haplo(self,hap): + def check_full_haplo(self, hap): # To get the array wanted_hap = hap[0][0] missing = [] @@ -1002,55 +1216,57 @@ def findLinkType(self, typing): loci_lc = "".join([allele.lower() for allele in loci]) return loci_lc - def get_haplo_freqs_pan_b(self, haplos_string,division): + def get_haplo_freqs_pan_b(self, haplos_string, division): haplo_probs = {} if len(haplos_string) > 0: # Check the type of the haplos type = self.cypher_plan_b.findTypeFromIndexes(division) - label_haplo = self.cypher_plan_b.findLinkType(haplos_string) - haplo_probs = self.netGraph.adjs_query_by_color(haplos_string, label_haplo , type) + label_haplo = self.cypher_plan_b.findLinkType(haplos_string) + haplo_probs = self.netGraph.adjs_query_by_color( + haplos_string, label_haplo, type + ) return haplo_probs - def check_if_alleles_exist(self,allels): + def check_if_alleles_exist(self, allels): type = self.find_type_loci(allels[0]) - div= [self.index_dict[type]] - probs = self.get_haplo_freqs_pan_b(allels,div) + div = [self.index_dict[type]] + probs = self.get_haplo_freqs_pan_b(allels, div) return probs - def check_if_alleles_in_data(self,phase,index): - size_haplo=phase[0][0] - loci=[] + def check_if_alleles_in_data(self, phase, index): + size_haplo = phase[0][0] + loci = [] missing = [] for t in range(len(size_haplo[0][0])): for i in range(len(phase)): for hapes in phase[i][index]: - for hap in hapes: + for hap in hapes: loci.append(hap[t]) # Remove duplicates - loci=list(set(loci)) - probs= self.check_if_alleles_exist(loci) + loci = list(set(loci)) + probs = self.check_if_alleles_exist(loci) if probs == {}: - type=self.find_type_loci(loci[0]) + type = self.find_type_loci(loci[0]) missing.append(self.index_dict[type]) - loci=[] + loci = [] return missing - def check_if_alleles_of_one_phase_in_data(self,phase): - size_haplo=phase[0] - loci=[] + def check_if_alleles_of_one_phase_in_data(self, phase): + size_haplo = phase[0] + loci = [] missing = [] for t in range(len(size_haplo[0])): for hapes in phase[0]: loci.append(hapes[t]) # Remove duplicates - loci=list(set(loci)) - probs= self.check_if_alleles_exist(loci) + loci = list(set(loci)) + probs = self.check_if_alleles_exist(loci) if probs == {}: - type=self.find_type_loci(loci[0]) + type = self.find_type_loci(loci[0]) missing.append(self.index_dict[type]) - loci=[] + loci = [] return missing def allel_to_SR(self, allel_dict): @@ -1063,7 +1279,7 @@ def comp_hap_prob_plan_c(self, phases, missing): for phase in phases: dict_all_tmp = {} miss = [] - for i,loci in enumerate(phase): + for i, loci in enumerate(phase): index = self.find_type_loci(loci) index_loci = self.index_dict[index] div_dict = self.get_haplo_freqs_pan_b([loci], [index_loci]) @@ -1075,15 +1291,17 @@ def comp_hap_prob_plan_c(self, phases, missing): if dict_all_tmp == {}: dict_all_tmp = div_dict else: - dict_all_tmp = self.open_option_(div_dict, dict_all_tmp, True) + dict_all_tmp = self.open_option_(div_dict, dict_all_tmp, True) if not dict_all_tmp: break if len(miss) > 0: for key in dict_all_tmp: list_key = key.split("~") list_key = list_key + miss - dict_all["~".join(sorted(list_key))] = [x * (self._factor_missing_data ** len(miss)) for x in - dict_all_tmp[key]] + dict_all["~".join(sorted(list_key))] = [ + x * (self._factor_missing_data ** len(miss)) + for x in dict_all_tmp[key] + ] 
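
The branch above down-weights every genotype probability once per locus that had no usable frequency data: each missing locus multiplies the probability by self._factor_missing_data, which the validation config exposes as "factor_missing_data" (default 0.01). A minimal standalone sketch of that penalty; the function name and arguments are illustrative, not part of the patch:

    # Sketch of the missing-locus penalty applied above; names are illustrative.
    # `factor` mirrors the "factor_missing_data" config default of 0.01.
    def apply_missing_penalty(probs, n_missing, factor=0.01):
        # One multiplicative hit per missing locus: two unknown loci
        # scale every probability by factor ** 2.
        return [p * (factor ** n_missing) for p in probs]

    # apply_missing_penalty([0.2, 0.05], n_missing=1) -> [0.002, 0.0005]
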
else: for key in dict_all_tmp: dict_all[key] = dict_all_tmp[key] @@ -1104,8 +1322,7 @@ def comp_hap_prob_plan_c(self, phases, missing): return dict_all - - def comp_phase_prob_plan_c(self,phases, N_Loc, epsilon, MUUG_output): + def comp_phase_prob_plan_c(self, phases, N_Loc, epsilon, MUUG_output): epsilon = 0 hap_pairs_total = [] pop_res_haplo = [] @@ -1120,41 +1337,68 @@ def comp_phase_prob_plan_c(self,phases, N_Loc, epsilon, MUUG_output): for i in range(len(phases)): # If we dont know about missing alleles - P1 =self.open_dict_data(self.comp_hap_prob_plan_c(phases[i][0][0], missing)) + P1 = self.open_dict_data( + self.comp_hap_prob_plan_c(phases[i][0][0], missing) + ) # This will open locus ambiguities and comp probabilities for Hap1 - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] if len(Prob1) > 0: - P2 = self.open_dict_data(self.comp_hap_prob_plan_c(phases[i][1][0], missing)) - - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + P2 = self.open_dict_data( + self.comp_hap_prob_plan_c(phases[i][1][0], missing) + ) + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) if MUUG_output: - maxProb = self.calc_haps_pairs(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, hap_total, - pop_res, maxProb, geno_seen) + maxProb = self.calc_haps_pairs( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + geno_seen, + ) else: - maxProb = self.calc_haps_pairs_haplotype(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, - hap_total, - pop_res, maxProb, hap_pairs_total, geno_seen, pop_res_haplo, - p_total) + maxProb = self.calc_haps_pairs_haplotype( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + hap_pairs_total, + geno_seen, + pop_res_haplo, + p_total, + ) if MUUG_output: sum_prob = sum(pop_res.values()) - pop_res_final = {'all_pops,all_pops': sum_prob} - return {'MaxProb': maxProb, 'Haps': hap_total, 'Pops': pop_res_final} + pop_res_final = {"all_pops,all_pops": sum_prob} + return {"MaxProb": maxProb, "Haps": hap_total, "Pops": pop_res_final} for i in range(len(pop_res_haplo)): - pop_res_haplo[i][0] = 'all_pops' - pop_res_haplo[i][1] = 'all_pops' + pop_res_haplo[i][0] = "all_pops" + pop_res_haplo[i][1] = "all_pops" - return {'MaxProb': maxProb, 'Haps': hap_pairs_total, 'Probs': p_total, 'Pops': pop_res_haplo} + return { + "MaxProb": maxProb, + "Haps": hap_pairs_total, + "Probs": p_total, + "Pops": pop_res_haplo, + } # Calculate full plan b def comp_phase_prob_plan_b(self, phases, N_Loc, epsilon, MUUG_output): @@ -1189,31 +1433,35 @@ def comp_phase_prob_plan_b(self, phases, N_Loc, epsilon, MUUG_output): for i in range(len(phases)): # If we dont know about missing alleles if missing_data_1 == []: - index = min(matrix_index,phases[i][0][1]) + index = min(matrix_index, phases[i][0][1]) option = self.read_matrix(index) P1 = self.comp_hap_prob_plan_b(phases[i][0], option, missing) - if len(P1['Haps']): + if len(P1["Haps"]): phases[i][0][1] = index else: - P1=self.comp_hap_prob_plan_b_missing_data(phases[i][0], option, missing,missing_data_1) + P1 = self.comp_hap_prob_plan_b_missing_data( + phases[i][0], option, missing, missing_data_1 + ) # This will open locus ambiguities and comp probabilities for Hap1 - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] - #if len(Prob1) > 0: - # If we dont know about missing alleles + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] + # 
if len(Prob1) > 0: + # If we dont know about missing alleles if missing_data_2 == []: index = min(matrix_index, phases[i][1][1]) option = self.read_matrix(index) P2 = self.comp_hap_prob_plan_b(phases[i][1], option, missing) - if len(P2['Haps']): + if len(P2["Haps"]): phases[i][1][1] = index - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] else: if len(Prob1) > 0: - P2 =self.comp_hap_prob_plan_b_missing_data(phases[i][1], option, missing,missing_data_2) - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + P2 = self.comp_hap_prob_plan_b_missing_data( + phases[i][1], option, missing, missing_data_2 + ) + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] # This will do the same for Hap 2; Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) @@ -1221,83 +1469,130 @@ def comp_phase_prob_plan_b(self, phases, N_Loc, epsilon, MUUG_output): Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) if MUUG_output: - maxProb = self.calc_haps_pairs(Haps1, Haps2,Prob1WithIndexes, Prob2WithIndexes, epsilon, hap_total, pop_res, maxProb, geno_seen) + maxProb = self.calc_haps_pairs( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + geno_seen, + ) else: - maxProb = self.calc_haps_pairs_haplotype(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, - hap_total, - pop_res, maxProb, hap_pairs_total, geno_seen, pop_res_haplo, - p_total) + maxProb = self.calc_haps_pairs_haplotype( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + hap_pairs_total, + geno_seen, + pop_res_haplo, + p_total, + ) # Go to the next row in thr matrix matrix_index += 1 matrix_index = 10 matrix_index_curr = 0 - while hap_total == {} and matrix_index_curr<6 : + while hap_total == {} and matrix_index_curr < 6: for i in range(len(phases)): index_1 = min(matrix_index, phases[i][0][1]) index_2 = min(matrix_index, phases[i][1][1]) - if not (index_1==10 and index_2==10): - if (index_1 == 10 and len(phases[i][0][0]) > 0): + if not (index_1 == 10 and index_2 == 10): + if index_1 == 10 and len(phases[i][0][0]) > 0: option = self.read_matrix(matrix_index_curr) - missing_data_1 = self.check_if_alleles_of_one_phase_in_data(phases[i][0]) - P1 = self.comp_hap_prob_plan_b_missing_data(phases[i][0], option, missing, missing_data_1) + missing_data_1 = self.check_if_alleles_of_one_phase_in_data( + phases[i][0] + ) + P1 = self.comp_hap_prob_plan_b_missing_data( + phases[i][0], option, missing, missing_data_1 + ) option = self.read_matrix(index_2) P2 = self.comp_hap_prob_plan_b(phases[i][1], option, missing) - if (index_2 == 10 and len(phases[i][1][0]) > 0): + if index_2 == 10 and len(phases[i][1][0]) > 0: option = self.read_matrix(index_1) P1 = self.comp_hap_prob_plan_b(phases[i][0], option, missing) option = self.read_matrix(matrix_index_curr) - missing_data_2 = self.check_if_alleles_of_one_phase_in_data(phases[i][1]) - P2 = self.comp_hap_prob_plan_b_missing_data(phases[i][1], option, missing, missing_data_2) - - - Haps1 = P1['Haps'] - Prob1 = P1['Probs'] - Haps2 = P2['Haps'] - Prob2 = P2['Probs'] + missing_data_2 = self.check_if_alleles_of_one_phase_in_data( + phases[i][1] + ) + P2 = self.comp_hap_prob_plan_b_missing_data( + phases[i][1], option, missing, missing_data_2 + ) + + Haps1 = P1["Haps"] + Prob1 = P1["Probs"] + Haps2 = P2["Haps"] + Prob2 = P2["Probs"] Prob1WithIndexes = self.convert_list_to_one_dim(Prob1) Prob2WithIndexes = self.convert_list_to_one_dim(Prob2) if MUUG_output: - maxProb = self.calc_haps_pairs(Haps1, 
Haps2, Prob1WithIndexes, Prob2WithIndexes, epsilon, - hap_total, pop_res, maxProb, geno_seen) + maxProb = self.calc_haps_pairs( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + geno_seen, + ) else: - maxProb = self.calc_haps_pairs_haplotype(Haps1, Haps2, Prob1WithIndexes, Prob2WithIndexes, - epsilon, - hap_total, - pop_res, maxProb, hap_pairs_total, geno_seen, - pop_res_haplo, - p_total) + maxProb = self.calc_haps_pairs_haplotype( + Haps1, + Haps2, + Prob1WithIndexes, + Prob2WithIndexes, + epsilon, + hap_total, + pop_res, + maxProb, + hap_pairs_total, + geno_seen, + pop_res_haplo, + p_total, + ) matrix_index_curr += 1 - - # p_total returns an array of N*N (N is number of populations), hap_total - pairs of haplotypes. - # pop_res are the names of the populations + # p_total returns an array of N*N (N is number of populations), hap_total - pairs of haplotypes. + # pop_res are the names of the populations if MUUG_output: - return {'MaxProb': maxProb, 'Haps': hap_total, 'Pops': pop_res} + return {"MaxProb": maxProb, "Haps": hap_total, "Pops": pop_res} - return {'MaxProb': maxProb, 'Haps': hap_pairs_total, 'Probs': p_total, 'Pops': pop_res_haplo} + return { + "MaxProb": maxProb, + "Haps": hap_pairs_total, + "Probs": p_total, + "Pops": pop_res_haplo, + } # haplotype - list of all loci options # return - type of tne haplotype def input_type(self, haplotype): type_list = [] for locus_options in haplotype: - locus = locus_options.split('*')[0] + locus = locus_options.split("*")[0] type_list.append(self.index_dict[locus]) return type_list - def open_ambiguities(self, hap, loc, split_loc): - return open_ambiguities(hap, loc, split_loc) - - def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, planb, em): + def comp_cand( + self, gl_string, binary, epsilon, n, MUUG_output, haps_output, planb, em + ): # receives a list of phases and computes haps and # probabilties and accumulate cartesian productEpsilon=0.0001 chr = self.gl2haps(gl_string) @@ -1305,22 +1600,22 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla return None, None # if we in 9-loci, check if the type input in valid format if self.nodes_for_plan_A: - geno_type = self.input_type(chr['Genotype'][0]) + geno_type = self.input_type(chr["Genotype"][0]) if not geno_type in self.nodes_for_plan_A: return None, None - n_loci = chr['N_Loc'] + n_loci = chr["N_Loc"] # generate the 2^(N-1) phases - pmags = self.gen_phases(chr['Genotype'], n_loci, binary) + pmags = self.gen_phases(chr["Genotype"], n_loci, binary) # return if the result is empty (why would that be?) if pmags == []: return None, None - #res_muugs = {'Haps': 'NaN', 'Probs': 0} - res_muugs = {'MaxProb': 0, 'Haps': {}, 'Pops': {}} - res_haps = {'Haps': 'Nan', 'Probs': 0, 'Pops':{}} + # res_muugs = {'Haps': 'NaN', 'Probs': 0} + res_muugs = {"MaxProb": 0, "Haps": {}, "Pops": {}} + res_haps = {"Haps": "Nan", "Probs": 0, "Pops": {}} # this step "opens" each phase by generating the cartesian product # NOTE: why do it here and not pass the ambiguity to the graph query? 
# [['A*01:01/A*01:02/A*01:03/A*01:16N', 'C*06:02'] @@ -1332,30 +1627,38 @@ def comp_cand(self, gl_string, binary, epsilon, n, MUUG_output, haps_output, pla # phases = self.open_phases(pmags, n_loci, gl_string) if not phases: - print('in reduce_phase_to_valid_alleles') + print("in reduce_phase_to_valid_alleles") self.reduce_phase_to_valid_allels(pmags, n_loci) phases = self.open_phases(pmags, n_loci, gl_string) if not phases: - print('in reduce_phase_to_commons_alleles') + print("in reduce_phase_to_commons_alleles") self.reduce_phase_to_commons_alleles(pmags, n_loci, commons_number=10) phases = self.open_phases(pmags, n_loci, gl_string) if phases: if MUUG_output: - prior_matrix_orig = np.array(self.priorMatrix, order='K', copy=True) #copy.deepcopy(self.priorMatrix) - res_muugs = self.call_comp_phase_prob(epsilon, n, phases, chr, True, planb) - if planb and len(res_muugs['Haps']) == 0: - self.plan = 'c' + prior_matrix_orig = copy.deepcopy(self.priorMatrix) + res_muugs = self.call_comp_phase_prob( + epsilon, n, phases, chr, True, planb + ) + if planb and len(res_muugs["Haps"]) == 0: + self.plan = "c" self.reduce_phase_to_commons_alleles(pmags, n_loci, 1, True) phases = self.open_phases(pmags, n_loci, gl_string) - res_muugs = self.comp_phase_prob_plan_c(phases, n_loci, epsilon, True) + res_muugs = self.comp_phase_prob_plan_c( + phases, n_loci, epsilon, True + ) self.priorMatrix = prior_matrix_orig - if (haps_output): - res_haps = self.call_comp_phase_prob(epsilon, n, phases, chr, False, planb) - if planb and len(res_haps['Haps']) == 0 and not em:#em + if haps_output: + res_haps = self.call_comp_phase_prob( + epsilon, n, phases, chr, False, planb + ) + if planb and len(res_haps["Haps"]) == 0 and not em: # em self.reduce_phase_to_commons_alleles(pmags, n_loci, 1, True) phases = self.open_phases(pmags, n_loci, gl_string) - res_haps = self.comp_phase_prob_plan_c(phases, n_loci, epsilon, False) + res_haps = self.comp_phase_prob_plan_c( + phases, n_loci, epsilon, False + ) return res_muugs, res_haps @@ -1363,80 +1666,89 @@ def call_comp_phase_prob(self, epsilon, n, phases, chr, MUUG_output, planb): n_res = 0 # number of results min_res = 100 # minimum number of results (NOTE: why 10?) - min_epsilon = 1.e-9 # minimum threshold, then why the other? - res = {'Haps': 'NaN', 'Probs': 0} + min_epsilon = 1.0e-9 # minimum threshold, then why the other? 
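
The loop that follows implements an epsilon back-off: each pass relaxes the pruning threshold by an order of magnitude until enough haplotype pairs survive, and epsilon is forced to zero once it drops below min_epsilon. A minimal sketch of that control flow, where compute_results stands in for the comp_phase_prob_* calls:

    # Sketch of the epsilon back-off used below; `compute_results` is a stand-in.
    def backoff_search(compute_results, epsilon=1e-3, min_epsilon=1e-9):
        results = {}
        while epsilon > 0:
            epsilon /= 10                # relax the cutoff an order of magnitude
            if epsilon < min_epsilon:    # below the floor, disable pruning
                epsilon = 0.0
            results = compute_results(epsilon)
            if results and epsilon > 0:  # hits found before reaching the floor
                break
        return results

The real method then makes one final pass with epsilon rescaled to MaxProb / 100000 (the lastRound flag below) before falling back to plan B.
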
+        res = {"Haps": "NaN", "Probs": 0}
         lastRound = False
-        while (epsilon > 0):
+        while epsilon > 0:
             # each time through reduce epsilon by an order of magnitude
             # until you have seen "min_res" results
             epsilon /= 10
             #
             # if you have reduced epsilon below min_epsilon then just make it 0
             #
-            if (epsilon < min_epsilon):
+            if epsilon < min_epsilon:
                 epsilon = 0.0
             # get results
             if MUUG_output:
-                res = self.comp_phase_prob_genotype(phases, chr['N_Loc'], epsilon, n)
+                res = self.comp_phase_prob_genotype(phases, chr["N_Loc"], epsilon, n)
             else:
-                res = self.comp_phase_prob_haplotype(phases, chr['N_Loc'], epsilon, n)
-            haps = res['Haps']
+                res = self.comp_phase_prob_haplotype(phases, chr["N_Loc"], epsilon, n)
+            haps = res["Haps"]
             n_res = len(haps)
-            if (n_res > 0 and epsilon > 0):
+            if n_res > 0 and epsilon > 0:
                 # sorted_by_value = sorted(haps.items(), key=lambda kv: kv[1], reverse=True)
-                epsilon = res['MaxProb'] / 100000
+                epsilon = res["MaxProb"] / 100000
                 lastRound = True
                 break
 
         if lastRound:
             if MUUG_output:
-                res = self.comp_phase_prob_genotype(phases, chr['N_Loc'], epsilon, n)
+                res = self.comp_phase_prob_genotype(phases, chr["N_Loc"], epsilon, n)
             else:
-                res = self.comp_phase_prob_haplotype(phases, chr['N_Loc'], epsilon, n)
+                res = self.comp_phase_prob_haplotype(phases, chr["N_Loc"], epsilon, n)
 
         # no plan b
         for level in range(2):
             if level == 1:
-                self.priorMatrix = np.ones((len(self.populations), len(self.populations))) ####
-            if planb and len(res['Haps']) == 0:
-                self.plan = 'b'
+                if self.unk_priors == "MR":
+                    self.priorMatrix = np.ones(
+                        (len(self.populations), len(self.populations))
+                    )
+                else:
+                    self.priorMatrix = np.identity(len(self.populations))
+                # self.priorMatrix = np.ones((len(self.populations), len(self.populations))) ####
+            if planb and len(res["Haps"]) == 0:
+                self.plan = "b"
                 epsilon = 1e-14
                 n_res = 0
                 min_res = 10
-                min_epsilon = 1.e-3
+                min_epsilon = 1.0e-3
                 # self.priorMatrix = np.ones((len(self.populations), len(self.populations)))
                 while (epsilon > 0) & (n_res < min_res):
                     epsilon /= 10
-                    if (epsilon < min_epsilon):
+                    if epsilon < min_epsilon:
                         epsilon = 0.0
                     phases_planb = deepcopy_list(phases)
                     # Find the option according to plan b
                     if MUUG_output:
-                        res = self.comp_phase_prob_plan_b(phases_planb, chr['N_Loc'], epsilon, True)
+                        res = self.comp_phase_prob_plan_b(
+                            phases_planb, chr["N_Loc"], epsilon, True
+                        )
                     else:
-                        res = self.comp_phase_prob_plan_b(phases_planb, chr['N_Loc'], epsilon, False)
-                    n_res = len(res['Haps'])
+                        res = self.comp_phase_prob_plan_b(
+                            phases_planb, chr["N_Loc"], epsilon, False
+                        )
+                    n_res = len(res["Haps"])
 
         return res
 
-
-    #sum all probs for each first race and return the one with the highest prob
+    # sum all probs for each first race and return the one with the highest prob
     def find_first_race(self, pops, probs):
-        prob_by_race =dict()
+        prob_by_race = dict()
         for i, pop in enumerate(pops):
             key = pop[0]
             if key in prob_by_race:
-                prob_by_race[key] += probs[i][0]*probs[i][1]
+                prob_by_race[key] += probs[i][0] * probs[i][1]
             else:
-                prob_by_race[key] = probs[i][0]*probs[i][1]
+                prob_by_race[key] = probs[i][0] * probs[i][1]
 
         sortedDict = sorted(prob_by_race.items(), key=lambda x: x[1], reverse=True)
         return sortedDict[0][0]
 
-    #aum all probs of pairs- race1,second race
-    #and return second race of the highest pair
+    # sum all probs of pairs (race1, second race)
+    # and return the second race of the highest pair
     def find_second_race(self, race1, pops, probs):
         prob_by_race = dict()
         for i, pop in enumerate(pops):
@@ -1450,31 +1762,51 @@ def find_second_race(self, race1, pops, 
probs): sortedDict = sorted(prob_by_race.items(), key=lambda x: x[1], reverse=True) return sortedDict[0][0] - #find the haps of race1-race2, and sum the probs - def write_haplotype(self, race1, race2, haps, pops, probs, name_gl, foutHap, foutPop): + # find the haps of race1-race2, and sum the probs + def write_haplotype( + self, race1, race2, haps, pops, probs, name_gl, foutHap, foutPop + ): haps_dict = dict() for i, line in enumerate(pops): if race1 == line[0] and race2 == line[1]: - key = haps[i][0] + ',' + haps[i][1] + key = haps[i][0] + "," + haps[i][1] if key in haps_dict: haps_dict[key] += probs[i][0] * probs[i][1] else: haps_dict[key] = probs[i][0] * probs[i][1] - #write the 1000 highest to file + # write the 1000 highest to file sortedDict = sorted(haps_dict.items(), key=lambda x: x[1], reverse=True) minBestResult = min(1000, len(sortedDict)) for k in range(minBestResult): - foutHap.write(name_gl + ',' + str(sortedDict[k][0]) + ',' + - str(sortedDict[k][1]) + ',' + str(k) + ',' + '\n') - foutPop.write(name_gl + ',' + str(race1) + ',' + str(race2) +'\n') - - #find pair of races with the highest prob + foutHap.write( + name_gl + + "," + + str(sortedDict[k][0]) + + "," + + str(sortedDict[k][1]) + + "," + + str(k) + + "," + + "\n" + ) + foutPop.write(name_gl + "," + str(race1) + "," + str(race2) + "\n") + + # find pair of races with the highest prob def find_races(self, res, foutHap, foutPop, name_gl): - if len(res['Pops']) > 0: - race1 = self.find_first_race(res['Pops'], res['Probs']) - race2 = self.find_second_race(race1, res['Pops'], res['Probs']) - self.write_haplotype(race1, race2, res['Haps'], res['Pops'], res['Probs'], name_gl, foutHap, foutPop) + if len(res["Pops"]) > 0: + race1 = self.find_first_race(res["Pops"], res["Probs"]) + race2 = self.find_second_race(race1, res["Pops"], res["Probs"]) + self.write_haplotype( + race1, + race2, + res["Haps"], + res["Pops"], + res["Probs"], + name_gl, + foutHap, + foutPop, + ) """def calc_priority_matrix(self, list_race1, list_race2, priority ): list_race1 = list_race1.split(';') @@ -1520,7 +1852,7 @@ def find_races(self, res, foutHap, foutPop, name_gl): self.priorMatrix = self.priorMatrix / prior_sum""" - def calc_priority_matrix(self, list_race1, list_race2, priority ): + def calc_priority_matrix(self, list_race1, list_race2, priority): popLen = len(self.populations) self.priorMatrix = np.zeros((popLen, popLen)) # Identity matrix @@ -1528,10 +1860,14 @@ def calc_priority_matrix(self, list_race1, list_race2, priority ): for race_1 in list_race1: for race_2 in list_race2: - if race_1 == '' and race_2 == '': + if race_1 == "" and race_2 == "": continue - if race_1 == '' or race_2 == '': - race = self.populations.index(race_2) if race_1 == '' else self.populations.index(race_1) + if race_1 == "" or race_2 == "": + race = ( + self.populations.index(race_2) + if race_1 == "" + else self.populations.index(race_1) + ) """if race_1 == '': race = self.populations.index(race_2) else: @@ -1541,35 +1877,47 @@ def calc_priority_matrix(self, list_race1, list_race2, priority ): # calc priority for i in range(popLen): - tmp_matrix[race, i] = tmp_matrix[race, i] + priority['gamma']*2 + tmp_matrix[race, i] = ( + tmp_matrix[race, i] + priority["gamma"] * 2 + ) tmp_matrix = tmp_matrix + tmp_matrix.transpose() - tmp_matrix[race, race] -= (priority['gamma']*2) + tmp_matrix[race, race] -= priority["gamma"] * 2 - tmp_matrix = priority['eta'] * np.ones((popLen, popLen)) + tmp_matrix + priority['beta'] * Imatrix + tmp_matrix = ( + priority["eta"] * 
np.ones((popLen, popLen)) + + tmp_matrix + + priority["beta"] * Imatrix + ) self.priorMatrix += tmp_matrix else: tmp_matrix = np.zeros((popLen, popLen)) race1 = self.populations.index(race_1) race2 = self.populations.index(race_2) - # calc priority + # calc priority for i in range(popLen): - tmp_matrix[race1, i] = tmp_matrix[race1, i] + priority['gamma'] - tmp_matrix[i, race2] = tmp_matrix[i, race2] + priority['gamma'] + tmp_matrix[race1, i] = tmp_matrix[race1, i] + priority["gamma"] + tmp_matrix[i, race2] = tmp_matrix[i, race2] + priority["gamma"] - tmp_matrix[race1, race2] -= priority['gamma'] + tmp_matrix[race1, race2] -= priority["gamma"] - tmp_matrix[race1, race2] = tmp_matrix[race1, race2] + priority['alpha'] + tmp_matrix[race1, race2] = ( + tmp_matrix[race1, race2] + priority["alpha"] + ) if race1 != race2: tmp_matrix = tmp_matrix + tmp_matrix.transpose() - tmp_matrix[race1, race1] -= priority['gamma'] - tmp_matrix[race2, race2] -= priority['gamma'] + tmp_matrix[race1, race1] -= priority["gamma"] + tmp_matrix[race2, race2] -= priority["gamma"] - tmp_matrix[race1, race1] += priority['delta'] + tmp_matrix[race1, race1] += priority["delta"] if race1 != race2: - tmp_matrix[race2, race2] += priority['delta'] - tmp_matrix = priority['eta'] * np.ones((popLen, popLen)) + tmp_matrix + priority['beta'] * Imatrix + tmp_matrix[race2, race2] += priority["delta"] + tmp_matrix = ( + priority["eta"] * np.ones((popLen, popLen)) + + tmp_matrix + + priority["beta"] * Imatrix + ) self.priorMatrix += tmp_matrix @@ -1577,52 +1925,75 @@ def calc_priority_matrix(self, list_race1, list_race2, priority ): prior_sum = 0 for i in range(popLen): for j in range(popLen): - self.priorMatrix[i][j] = self.priorMatrix[i][j] * self.count_by_prob[i] * self.count_by_prob[j] + self.priorMatrix[i][j] = ( + self.priorMatrix[i][j] + * self.count_by_prob[i] + * self.count_by_prob[j] + ) prior_sum += self.priorMatrix[i][j] self.priorMatrix = self.priorMatrix / prior_sum - def update_prob_by_priority(self, res, race1, race2, priority): # prob by priority - pops = res['Pops'] + pops = res["Pops"] for i, pop in enumerate(pops): race1 = self.populations.index(pop[0]) race2 = self.populations.index(pop[1]) - res['Probs'][i][0] = float(res['Probs'][i][0])*math.sqrt(self.priorMatrix[race1][race2]) - res['Probs'][i][1] = float(res['Probs'][i][1])*math.sqrt(self.priorMatrix[race1][race2]) + res["Probs"][i][0] = float(res["Probs"][i][0]) * math.sqrt( + self.priorMatrix[race1][race2] + ) + res["Probs"][i][1] = float(res["Probs"][i][1]) * math.sqrt( + self.priorMatrix[race1][race2] + ) return res - def impute_one(self, subject_id, gl, binary, race1, race2, priority, epsilon, n, MUUG_output, haps_output, planb, em):#em + def impute_one( + self, + subject_id, + gl, + binary, + race1, + race2, + priority, + epsilon, + n, + MUUG_output, + haps_output, + planb, + em, + ): # em clean_gl = clean_up_gl(gl) if self.unk_priors == "MR": - self.priorMatrix = np.ones((len(self.populations), len(self.populations))) + self.priorMatrix = np.ones((len(self.populations), len(self.populations))) else: self.priorMatrix = np.identity(len(self.populations)) to_calc_prior_matrix = False if race1 or race2: - race1 = race1.split(';') - for i,race in enumerate(race1): + race1 = race1.split(";") + for i, race in enumerate(race1): if not race in self.populations: - race1[i] = '' + race1[i] = "" else: to_calc_prior_matrix = True - race2 = race2.split(';') + race2 = race2.split(";") for i, race in enumerate(race2): if not race in self.populations: - race2[i] = '' + 
race2[i] = "" else: to_calc_prior_matrix = True if to_calc_prior_matrix: - self.calc_priority_matrix(race1, race2, priority) + self.calc_priority_matrix(race1, race2, priority) # lookup based on genotype res_muugs = res_haps = None if gl: - res_muugs, res_haps = self.comp_cand(clean_gl, binary, epsilon, n, MUUG_output, haps_output, planb, em) + res_muugs, res_haps = self.comp_cand( + clean_gl, binary, epsilon, n, MUUG_output, haps_output, planb, em + ) return subject_id, res_muugs, res_haps - def impute_file(self, config, planb=None, em_mr = False, em = False):##em + def impute_file(self, config, planb=None, em_mr=False, em=False): ##em priority = config["priority"] MUUG_output = config["output_MUUG"] haps_output = config["output_haplotypes"] @@ -1630,9 +2001,9 @@ def impute_file(self, config, planb=None, em_mr = False, em = False):##em epsilon = config["epsilon"] number_of_results = config["number_of_results"] number_of_pop_results = config["number_of_pop_results"] - #planb = config["planb"]#em - if planb is None:#em - planb = config["planb"]#em + # planb = config["planb"]#em + if planb is None: # em + planb = config["planb"] # em # TODO: do the right thing if its a gzip if self.verbose: @@ -1644,88 +2015,144 @@ def impute_file(self, config, planb=None, em_mr = False, em = False):##em f_bin = json.load(json_file) f_bin_exist = True - f = open(config["imputation_input_file"], 'r') + f = open(config["imputation_input_file"], "r") if MUUG_output: - fout_hap_muug = open(config["imputation_out_umug_freq_file"], 'w') - fout_pop_muug = open(config["imputation_out_umug_pops_file"], 'w') + fout_hap_muug = open(config["imputation_out_umug_freq_file"], "w") + fout_pop_muug = open(config["imputation_out_umug_pops_file"], "w") if haps_output: - fout_hap_haplo = open(config["imputation_out_hap_freq_file"], 'w') - fout_pop_haplo = open(config["imputation_out_hap_pops_file"], 'w') + fout_hap_haplo = open(config["imputation_out_hap_freq_file"], "w") + fout_pop_haplo = open(config["imputation_out_hap_pops_file"], "w") - miss = open(config["imputation_out_miss_file"], 'w') - problem = open(config["imputation_out_problem_file"], 'w') + miss = open(config["imputation_out_miss_file"], "w") + problem = open(config["imputation_out_problem_file"], "w") with f as lines: for (i, name_gl) in enumerate(lines): - try: - name_gl = name_gl.rstrip() # remove trailing whitespace - if ',' in name_gl: - list_gl = name_gl.split(',') - else: - list_gl = name_gl.split('%') - - subject_id = list_gl[0] - subject_gl = list_gl[1] - subject_bin = [1] *(len(self.full_loci) - 1) - if f_bin_exist: - subject_bin = f_bin[subject_id] - race1 = race2 = None - if len(list_gl) > 2: - race1 = list_gl[2] - race2 = list_gl[3] - - start = timeit.default_timer() - - # - # Impute One - # This method will impute one subject given the parameters - self.plan = 'a' - self.option_1 = 0 - self.option_2 = 0 - subject_id, res_muugs, res_haps = self.impute_one(subject_id, subject_gl, subject_bin, race1, race2, priority, epsilon, n, MUUG_output, haps_output, planb, em)#em - - if res_muugs is None: - problem.write(str(i)+","+str(subject_id)+"\n") - continue - - if (len(res_haps['Haps']) == 0 or res_haps['Haps'] == 'NaN') and len(res_muugs['Haps']) == 0: - miss.write(str(i)+","+str(subject_id)+"\n") - - - if haps_output: - haps = res_haps['Haps'] - probs = res_haps['Probs'] - pops = res_haps['Pops'] - print("{index} Subject: {id} {hap_length} haplotypes".format(index=i, id=subject_id, hap_length=len(haps))) - if em_mr: - write_best_hap_race_pairs(subject_id, 
haps, pops, probs, number_of_results, fout_hap_haplo) - write_best_prob(subject_id, pops, probs, 1, fout_pop_haplo) - else: - write_best_prob(subject_id, haps, probs, number_of_results, fout_hap_haplo, "+") - write_best_prob(subject_id, pops, probs, number_of_pop_results, fout_pop_haplo) - if MUUG_output: - haps = res_muugs['Haps'] - pops = res_muugs['Pops'] - print("{index} Subject: {id} {hap_length} haplotypes".format(index=i, id=subject_id, hap_length=len(haps))) - write_best_prob_genotype(subject_id, haps, number_of_results, fout_hap_muug) - write_best_prob_genotype(subject_id, pops, number_of_pop_results, fout_pop_muug) - - if self.verbose: - self.logger.info("{index} Subject: {id} {hap_length} haplotypes".format(index=i, id=subject_id, hap_length=len(haps))) - self.logger.info("{index} Subject: {id} plan: {plan} oppen_phases - count of open regular option: {option1}, count of alternative opening: {option2}".format(index=i, id=subject_id, - plan=self.plan, option1= self.option_1, option2 = self.option_2)) - - stop = timeit.default_timer() - time_taken = stop-start - print(time_taken) - if self.verbose: - self.logger.info("Time taken: " + str(time_taken)) - except: - print(f"{i} Subject: {subject_id} - Exception") - problem.write(str(name_gl) + "\n") + # try: + name_gl = name_gl.rstrip() # remove trailing whitespace + if "," in name_gl: + list_gl = name_gl.split(",") + else: + list_gl = name_gl.split("%") + + subject_id = list_gl[0] + subject_gl = list_gl[1] + subject_bin = [1] * (len(self.full_loci) - 1) + if f_bin_exist: + subject_bin = f_bin[subject_id] + race1 = race2 = None + if len(list_gl) > 2: + race1 = list_gl[2] + race2 = list_gl[3] + + start = timeit.default_timer() + + # + # Impute One + # This method will impute one subject given the parameters + self.plan = "a" + self.option_1 = 0 + self.option_2 = 0 + subject_id, res_muugs, res_haps = self.impute_one( + subject_id, + subject_gl, + subject_bin, + race1, + race2, + priority, + epsilon, + n, + MUUG_output, + haps_output, + planb, + em, + ) # em + + if res_muugs is None: + problem.write(str(i) + "," + str(subject_id) + "\n") continue + if (len(res_haps["Haps"]) == 0 or res_haps["Haps"] == "NaN") and len( + res_muugs["Haps"] + ) == 0: + miss.write(str(i) + "," + str(subject_id) + "\n") + + if haps_output: + haps = res_haps["Haps"] + probs = res_haps["Probs"] + pops = res_haps["Pops"] + print( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) + ) + if em_mr: + write_best_hap_race_pairs( + subject_id, + haps, + pops, + probs, + number_of_results, + fout_hap_haplo, + ) + write_best_prob(subject_id, pops, probs, 1, fout_pop_haplo) + else: + write_best_prob( + subject_id, + haps, + probs, + number_of_results, + fout_hap_haplo, + "+", + ) + write_best_prob( + subject_id, + pops, + probs, + number_of_pop_results, + fout_pop_haplo, + ) + if MUUG_output: + haps = res_muugs["Haps"] + pops = res_muugs["Pops"] + print( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) + ) + write_best_prob_genotype( + subject_id, haps, number_of_results, fout_hap_muug + ) + write_best_prob_genotype( + subject_id, pops, number_of_pop_results, fout_pop_muug + ) + + if self.verbose: + self.logger.info( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) + ) + self.logger.info( + "{index} Subject: {id} plan: {plan} oppen_phases - count of open regular option: {option1}, count of 
alternative opening: {option2}".format( + index=i, + id=subject_id, + plan=self.plan, + option1=self.option_1, + option2=self.option_2, + ) + ) + + stop = timeit.default_timer() + time_taken = stop - start + print(time_taken) + if self.verbose: + self.logger.info("Time taken: " + str(time_taken)) + """except: + problem.write(str(name_gl) + "\n") + continue""" + f.close() if MUUG_output: fout_hap_muug.close() diff --git a/grim/imputation/imputegl/networkx_graph.py b/grim/imputation/imputegl/networkx_graph.py index c7c834a..01da167 100755 --- a/grim/imputation/imputegl/networkx_graph.py +++ b/grim/imputation/imputegl/networkx_graph.py @@ -22,11 +22,11 @@ def __init__(self, config): # bug: dies if file doesn't exist # bug: list_f doesn't exist - with open(path + '/nodes_for_plan_a.txt') as list_f: + with open(path + "/nodes_for_plan_a.txt") as list_f: for item in list_f: self.nodes_plan_a.append(item.strip()) # bug: dies if file doesn't exist - with open(path + '/nodes_for_plan_b.txt') as list_f: + with open(path + "/nodes_for_plan_b.txt") as list_f: for item in list_f: self.nodes_plan_b.append(item.strip()) # self.nodes_plan_a = pickle.load(open( path + '/nodes_for_plan_a.pkl', "rb")) @@ -37,22 +37,30 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile): nodesDict = dict() # add nodes from file with open(nodesFile) as nodesfile: - readNodes = csv.reader(nodesfile, delimiter=',') - next(readNodes) + readNodes = csv.reader(nodesfile, delimiter=",") + firstLine = next(readNodes) for row in readNodes: if len(row) > 0: if not self.nodes_plan_a or row[2] in self.nodes_plan_a: - self.graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";")))) + self.graph.add_node( + row[1], + label=row[2], + freq=list(map(float, row[3].split(";"))), + ) if not self.nodes_plan_b or row[2] in self.nodes_plan_b: - self.whole_graph.add_node(row[1], label=row[2], freq=list(map(float, row[3].split(";")))) + self.whole_graph.add_node( + row[1], + label=row[2], + freq=list(map(float, row[3].split(";"))), + ) nodesDict[row[0]] = row[1] nodesfile.close() # add edges from file with open(edgesFile) as edgesfile: - readEdges = csv.reader(edgesfile, delimiter=',') - next(readEdges) + readEdges = csv.reader(edgesfile, delimiter=",") + firstLine = next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] @@ -67,20 +75,22 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile): # add edges from file with open(allEdgesFile) as allEdgesfile: - readEdges = csv.reader(allEdgesfile, delimiter=',') - next(readEdges) + readEdges = csv.reader(allEdgesfile, delimiter=",") + firstLine = next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] node2 = nodesDict[row[1]] - if len(self.whole_graph.nodes[node1]['label']) < len(self.whole_graph.nodes[node2]['label']): - connector = self.whole_graph.nodes[node2]['label'] + node1 - self.whole_graph.add_edge(node1, connector) - self.whole_graph.add_edge(connector, node2) - else: - connector = self.whole_graph.nodes[node1]['label'] + node2 - self.whole_graph.add_edge(node2, connector) - self.whole_graph.add_edge(connector, node1) + kind = "-".join( + sorted( + [ + self.whole_graph.nodes[node1]["label"], + self.whole_graph.nodes[node2]["label"], + ], + key=len, + ) + ) + self.whole_graph.add_edge(node1, node2, color=kind) allEdgesfile.close() nodesDict.clear() @@ -109,10 +119,10 @@ def haps_with_probs_by_label(self, label): listLabel = self.haps_by_label(label) if not self.nodes_plan_a or label in self.nodes_plan_a: for 
allele in listLabel: - dictAlleles[allele] = self.graph.nodes[allele]['freq'] + dictAlleles[allele] = self.graph.nodes[allele]["freq"] elif label in self.nodes_plan_b: for allele in listLabel: - dictAlleles[allele] = self.whole_graph.nodes[allele]['freq'] + dictAlleles[allele] = self.whole_graph.nodes[allele]["freq"] return dictAlleles @@ -120,14 +130,13 @@ def haps_with_probs_by_label(self, label): def adjs_query(self, alleleList): adjDict = dict() for allele in alleleList: - if allele in self.graph: - allele_node = self.graph.nodes[allele] - if allele_node["label"] == self.full_loci: # 'ABCQR': - adjDict[allele] = allele_node['freq'] + if allele in self.graph.nodes(): + if self.graph.nodes[allele]["label"] == self.full_loci: # 'ABCQR': + adjDict[allele] = self.graph.nodes[allele]["freq"] else: adjs = self.graph.adj[allele] for adj in adjs: - adjDict[adj] = self.graph.nodes[adj]['freq'] + adjDict[adj] = self.graph.nodes[adj]["freq"] return adjDict # find all adj of alleleList by label @@ -138,10 +147,29 @@ def adjs_query_by_color(self, alleleList, labelA, labelB): return self.node_probs(alleleList, labelA) for allele in alleleList: - if allele in self.whole_graph: - alleles = self.whole_graph.adj.get(labelB + allele , []) + if allele in self.whole_graph.nodes(): + copyLabelA = labelA + newLabelA = labelA + miss = missing(labelA, labelB) + alleles = [allele] + + while len(miss) > 0: + tmpAllels = list() + newLabelA = copyLabelA + miss[0] + newLabelA = "".join(sorted(newLabelA)) + del miss[0] + for oneAllel in alleles: + # alleles.remove(oneAllel) + adjs = self.whole_graph.adj[oneAllel] + label = copyLabelA + "-" + newLabelA + for adj in adjs: + if adjs[adj]["color"] == label: + tmpAllels.append(adj) + alleles = tmpAllels + copyLabelA = newLabelA + for adj in alleles: - adjDict[adj] = self.whole_graph.nodes[adj]['freq'] + adjDict[adj] = self.whole_graph.nodes[adj]["freq"] return adjDict # return dict of nodes and there proper freq @@ -149,10 +177,10 @@ def node_probs(self, nodes, label): nodesDict = dict() if not self.nodes_plan_b or label in self.nodes_plan_b: for node in nodes: - if node in self.whole_graph: - nodesDict[node] = self.whole_graph.nodes[node]['freq'] + if node in self.whole_graph.nodes(): + nodesDict[node] = self.whole_graph.nodes[node]["freq"] elif label in self.nodes_plan_a: for node in nodes: - if node in self.whole_graph: - nodesDict[node] = self.graph.nodes[node]['freq'] + if node in self.whole_graph.nodes(): + nodesDict[node] = self.graph.nodes[node]["freq"] return nodesDict diff --git a/grim/imputation/setup.py b/grim/imputation/setup.py index f3dbd8a..88708cb 100755 --- a/grim/imputation/setup.py +++ b/grim/imputation/setup.py @@ -1,8 +1,8 @@ -''' +""" Created on Feb 16, 2018 @author: mmaiers-nmdp -''' +""" from setuptools import setup, find_packages from codecs import open @@ -11,55 +11,40 @@ here = path.abspath(path.dirname(__file__)) # Get the long description from the README file -with open(path.join(here, 'README.rst'), encoding='utf-8') as f: +with open(path.join(here, "README.rst"), encoding="utf-8") as f: long_description = f.read() setup( - name='imputegl', - + name="imputegl", # Versions should comply with PEP440. 
For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='0.0.4', - - description='Imputation based on Neo4j graph', + version="0.0.4", + description="Imputation based on Neo4j graph", long_description=long_description, - # The project's main homepage. - url='https://github.com/nmdp-bioinformatics/graph-imputation-match', - - packages=[ - 'imputegl' - ], - package_dir={'imputegl': - 'imputegl'}, + url="https://github.com/nmdp-bioinformatics/graph-imputation-match", + packages=["imputegl"], + package_dir={"imputegl": "imputegl"}, # Author details - author_email='mmaiers@nmdp.org', - + author_email="mmaiers@nmdp.org", # Choose your license - license='LGPL3', - + license="LGPL3", # What does your project relate to? - keywords='imputation HLA graph', - + keywords="imputation HLA graph", # You can just specify the packages manually here if your project is # simple. Or you can use find_packages(). - # List run-time dependencies here. These will be installed by pip when # your project is installed. For an analysis of "install_requires" vs pip's # requirements files see: # https://packaging.python.org/en/latest/requirements.html - install_requires=['pandas>=1.0.3', 'networkx==2.3'], - + install_requires=["pandas>=1.0.3", "networkx==2.3"], # List additional groups of dependencies here (e.g. development # dependencies). You can install these using the following syntax, # for example: # $ pip install -e .[dev,test] extras_require={ - 'dev': ['check-manifest'], - 'test': ['coverage'], + "dev": ["check-manifest"], + "test": ["coverage"], }, - ) - - diff --git a/grim/model/allele.py b/grim/model/allele.py deleted file mode 100644 index a13b8cc..0000000 --- a/grim/model/allele.py +++ /dev/null @@ -1,74 +0,0 @@ -# -*- coding: utf-8 -*- - -# -# grim Graph Imputation -# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved. -# -# This library is free software; you can redistribute it and/or modify it -# under the terms of the GNU Lesser General Public License as published -# by the Free Software Foundation; either version 3 of the License, or (at -# your option) any later version. -# -# This library is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with this library; if not, write to the Free Software Foundation, -# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. -# -# > http://www.fsf.org/licensing/licenses/lgpl.html -# > http://www.opensource.org/licenses/lgpl-license.php -# - - -class Allele: - """ - A Class I allele is an allele of Locus 'HLA-A', 'HLA-B', 'HLA-C' - """ - - def __init__(self, locus: str, allele: str): - """ - Validate the Class I locus and allele first. 
- - @param locus: Class I Locus - @param allele: Class I Allele - """ - self.validate(locus, allele) - self.allele = allele - self.locus = locus - - def __eq__(self, other: object) -> bool: - if isinstance(other, Allele): - return self.locus == other.locus and self.allele == other.allele - return False - - def __lt__(self, other): - if isinstance(other, Allele): - return self.allele < other.allele - return False - - def __str__(self) -> str: - if self.allele.startswith("HLA-"): - return self.allele - return f"{self.locus}*{self.locus}" - - @staticmethod - def validate(locus: str, allele: str): - if locus not in ["HLA-A", "HLA-B", "HLA-C"]: - raise InvalidAllele(f"{locus} is Not a valid locus") - if "*" not in allele and ":" not in allele: - raise InvalidAllele("f{allele} is Not a valid allele") - - @classmethod - def from_fullname(cls, allele): - if not allele.startswith("HLA-"): - raise InvalidAllele(f"{allele} is not a fully-named allele") - locus = allele.split("*")[0] - return Allele(locus, allele) - - -class InvalidAllele(Exception): - def __init__(self, message: str) -> None: - self.message = message diff --git a/grim/model/slug.py b/grim/model/slug.py deleted file mode 100644 index 90f87be..0000000 --- a/grim/model/slug.py +++ /dev/null @@ -1,72 +0,0 @@ -# -*- coding: utf-8 -*- - -# -# grim Graph Imputation -# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved. -# -# This library is free software; you can redistribute it and/or modify it -# under the terms of the GNU Lesser General Public License as published -# by the Free Software Foundation; either version 3 of the License, or (at -# your option) any later version. -# -# This library is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -# License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with this library; if not, write to the Free Software Foundation, -# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. -# -# > http://www.fsf.org/licensing/licenses/lgpl.html -# > http://www.opensource.org/licenses/lgpl-license.php -# - -from .allele import Allele - - -class SLUG: - """ - SLUGs are Single Locus Un-phased Genotypes made up of a pair of alleles. - The alleles are lexicographically sorted. - """ - - def __init__(self, allele_1: Allele, allele_2: Allele) -> None: - """ - SLUGs are single locus genotypes with alleles in sorted order - - @param allele_1: - @param allele_2: - """ - if allele_1 < allele_2: - self.allele_1 = allele_1 - self.allele_2 = allele_2 - else: - self.allele_1 = allele_2 - self.allele_2 = allele_1 - - def __str__(self) -> str: - """ - String version of SLUG is in GL String format - - @return: gl-string format of SLUG - """ - return f"{self.allele_1}+{self.allele_2}" - - def __eq__(self, other: object) -> bool: - """ - 2 SLUGs are equal if the individual alleles are equal - to each other. 
- - @param other: - @return: True indicating whether 2 SLUGS are equal - """ - if isinstance(other, SLUG): - return self.allele_1 == other.allele_1 and self.allele_2 == other.allele_2 - return False - - @classmethod - def from_glstring(cls, slug): - allele_1, allele_2 = slug.split("+") - slug = SLUG(Allele.from_fullname(allele_1), Allele.from_fullname(allele_2)) - return slug diff --git a/grim/validation/parallel-imputation.py b/grim/validation/parallel-imputation.py index f935394..7efc7cb 100755 --- a/grim/validation/parallel-imputation.py +++ b/grim/validation/parallel-imputation.py @@ -18,17 +18,23 @@ pr.enable() parser = argparse.ArgumentParser() -parser.add_argument("-c", "--config", - required=False, - default="../minimal-configuration.json", - help="Configuration JSON file", - type=str) - -parser.add_argument("-p", "--processes", - required=False, - default=0, - help="Number of processes to use. Defaults to total cores - 1", - type=int) +parser.add_argument( + "-c", + "--config", + required=False, + default="../minimal-configuration.json", + help="Configuration JSON file", + type=str, +) + +parser.add_argument( + "-p", + "--processes", + required=False, + default=0, + help="Number of processes to use. Defaults to total cores - 1", + type=int, +) args = parser.parse_args() configuration_file = args.config @@ -42,51 +48,67 @@ json_conf = json.load(f) config = { - "planb": json_conf.get('planb', True), - "pops": json_conf.get('populations'), - "priority": json_conf.get('priority'), - "epsilon": json_conf.get('epsilon', 1e-3), - "number_of_results": json_conf.get('number_of_results', 1000), - "number_of_pop_results": json_conf.get('number_of_pop_results', 100), + "planb": json_conf.get("planb", True), + "pops": json_conf.get("populations"), + "priority": json_conf.get("priority"), + "epsilon": json_conf.get("epsilon", 1e-3), + "number_of_results": json_conf.get("number_of_results", 1000), + "number_of_pop_results": json_conf.get("number_of_pop_results", 100), "output_MUUG": json_conf.get("output_MUUG", True), "output_haplotypes": json_conf.get("output_haplotypes", False), "node_file": project_dir + json_conf.get("node_csv_file"), "top_links_file": project_dir + json_conf.get("top_links_csv_file"), "edges_file": project_dir + json_conf.get("edges_csv_file"), "imputation_input_file": project_dir + json_conf.get("imputation_in_file"), - "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), - "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename"), - "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"), - "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename"), - "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename"), - "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename"), + "imputation_out_umug_freq_file": output_dir + + json_conf.get("imputation_out_umug_freq_filename"), + "imputation_out_umug_pops_file": output_dir + + json_conf.get("imputation_out_umug_pops_filename"), + "imputation_out_hap_freq_file": output_dir + + json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + + json_conf.get("imputation_out_hap_pops_filename"), + "imputation_out_miss_file": output_dir + + json_conf.get("imputation_out_miss_filename"), + "imputation_out_problem_file": output_dir + + json_conf.get("imputation_out_problem_filename"), 
"factor_missing_data": json_conf.get("factor_missing_data", 0.01), - "loci_map": json_conf.get("loci_map", {"A": 1, "B":3, "C": 2, "DQB1": 4, "DRB1": 5} ), - "matrix_planb": json_conf.get("Plan_B_Matrix", [ - [[1, 2, 3, 4, 5]], - [[1, 2, 3], [4, 5]], - [[1], [2, 3], [4, 5]], - [[1, 2, 3], [4], [5]], - [[1], [2, 3], [4], [5]], - [[1], [2], [3], [4], [5]] - ]), - "pops_count_file": project_dir + json_conf.get("pops_count_file", ''), + "loci_map": json_conf.get( + "loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5} + ), + "matrix_planb": json_conf.get( + "Plan_B_Matrix", + [ + [[1, 2, 3, 4, 5]], + [[1, 2, 3], [4, 5]], + [[1], [2, 3], [4, 5]], + [[1, 2, 3], [4], [5]], + [[1], [2, 3], [4], [5]], + [[1], [2], [3], [4], [5]], + ], + ), + "pops_count_file": project_dir + json_conf.get("pops_count_file", ""), "use_pops_count_file": json_conf.get("pops_count_file", False), "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000), - "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase", 100), - "bin_imputation_input_file": project_dir + json_conf.get("bin_imputation_in_file", "None"), + "max_haplotypes_number_in_phase": json_conf.get( + "max_haplotypes_number_in_phase", 100 + ), + "bin_imputation_input_file": project_dir + + json_conf.get("bin_imputation_in_file", "None"), "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []), - "save_mode": json_conf.get("save_space_mode", False) + "save_mode": json_conf.get("save_space_mode", False), } all_loci_set = set() for _, val in config["loci_map"].items(): all_loci_set.add(str(val)) -config["full_loci"] = ''.join(sorted(all_loci_set)) +config["full_loci"] = "".join(sorted(all_loci_set)) # Display the configurations we are using -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) print("\tLoci: {}".format(config["full_loci"])) print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) @@ -111,17 +133,25 @@ print("\tPops Count File: {}".format(config["pops_count_file"])) print("\tUse Pops Count File: {}".format(config["use_pops_count_file"])) print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"])) -print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"])) +print( + "\tMax Number of haplotypes in phase: {}".format( + config["max_haplotypes_number_in_phase"] + ) +) print("\tbin_imputation_input_file: {}".format(config["bin_imputation_input_file"])) if config["nodes_for_plan_A"]: print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"])) print("\tSave space mode: {}".format(config["save_mode"])) -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) if num_of_cpus == 0: # Use all the CPUs but 1 num_of_cpus = os.cpu_count() - 1 -print("Using {} processes with parent PID {}".format(num_of_cpus, os.getpid()), flush=True) +print( + "Using {} processes with parent PID {}".format(num_of_cpus, os.getpid()), flush=True +) priority = config["priority"] epsilon = config["epsilon"] @@ -141,14 +171,23 @@ def perform_impute_one(*args): subject_id, gl, race1, race2 = args[0] - subject_bin = [1] *(len(config["full_loci"]) - 1) + subject_bin = [1] * 
(len(config["full_loci"]) - 1) try: imputation = Imputation(graph, copy.deepcopy(config)) - impute_results = imputation.impute_one(subject_id, - gl, subject_bin, race1, race2, - priority, epsilon, 1000, - output_umug, output_haplotypes, - planb, em=False) + impute_results = imputation.impute_one( + subject_id, + gl, + subject_bin, + race1, + race2, + priority, + epsilon, + 1000, + output_umug, + output_haplotypes, + planb, + em=False, + ) imputation.print_options_count(subject_id) except MemoryError as e: print("Memory Error for subject_id: ", subject_id) @@ -159,28 +198,29 @@ def perform_impute_one(*args): def imputation_record_from(imputation_file): - if not imputation_file.endswith('.gz'): - with open(imputation_file, 'r') as imputation_file_unzipped: + if not imputation_file.endswith(".gz"): + with open(imputation_file, "r") as imputation_file_unzipped: for line in imputation_file_unzipped: - subject_id, gl, race1, race2 = line.rstrip().split(',') + subject_id, gl, race1, race2 = line.rstrip().split(",") yield subject_id, gl, race1, race2 else: - with gzip.open(imputation_file, 'rt') as imputation_gzip: + with gzip.open(imputation_file, "rt") as imputation_gzip: for line in imputation_gzip: - subject_id, gl, race1, race2 = line.rstrip().split(',') + subject_id, gl, race1, race2 = line.rstrip().split(",") yield subject_id, gl, race1, race2 + # Create output directory if it doesn't exist pathlib.Path(output_dir).mkdir(parents=False, exist_ok=True) problem_ids = [] miss_ids = [] if MUUG_output: - fout_hap_muug = open(config["imputation_out_umug_freq_file"], 'w') - fout_pop_muug = open(config["imputation_out_umug_pops_file"], 'w') + fout_hap_muug = open(config["imputation_out_umug_freq_file"], "w") + fout_pop_muug = open(config["imputation_out_umug_pops_file"], "w") if haps_output: - fout_hap_haplo = open(config["imputation_out_hap_freq_file"], 'w') - fout_pop_haplo = open(config["imputation_out_hap_pops_file"], 'w') + fout_hap_haplo = open(config["imputation_out_hap_freq_file"], "w") + fout_pop_haplo = open(config["imputation_out_hap_pops_file"], "w") with Pool(processes=num_of_cpus) as pool: subject_records = imputation_record_from(imputation_infile) @@ -190,28 +230,42 @@ def imputation_record_from(imputation_file): problem_ids.append(subject_id) continue - if (len(haps_results['Haps']) == 0 or haps_results['Haps'] == 'NaN') and len(umug_results['Haps']) == 0: + if (len(haps_results["Haps"]) == 0 or haps_results["Haps"] == "NaN") and len( + umug_results["Haps"] + ) == 0: miss_ids.append(subject_id) if haps_output: - haps = haps_results['Haps'] - probs = haps_results['Probs'] - pops = haps_results['Pops'] - print("Subject: {id} {hap_length} haplotypes".format(id=subject_id, hap_length=len(haps))) + haps = haps_results["Haps"] + probs = haps_results["Probs"] + pops = haps_results["Pops"] + print( + "Subject: {id} {hap_length} haplotypes".format( + id=subject_id, hap_length=len(haps) + ) + ) write_best_prob(subject_id, haps, probs, number_of_results, fout_hap_haplo) - write_best_prob(subject_id, pops, probs, number_of_pop_results, fout_pop_haplo) + write_best_prob( + subject_id, pops, probs, number_of_pop_results, fout_pop_haplo + ) if MUUG_output: - haps = umug_results['Haps'] - pops = umug_results['Pops'] - print("Subject: {id} {hap_length} haplotypes".format(id=subject_id, hap_length=len(haps))) + haps = umug_results["Haps"] + pops = umug_results["Pops"] + print( + "Subject: {id} {hap_length} haplotypes".format( + id=subject_id, hap_length=len(haps) + ) + ) 
write_best_prob_genotype(subject_id, haps, number_of_results, fout_hap_muug) - write_best_prob_genotype(subject_id, pops, number_of_pop_results, fout_pop_muug) + write_best_prob_genotype( + subject_id, pops, number_of_pop_results, fout_pop_muug + ) num_of_misses = len(miss_ids) print("Missing: {}".format(num_of_misses)) if num_of_misses > 0: - with open(config["imputation_out_miss_file"], 'w') as miss_csv_file: + with open(config["imputation_out_miss_file"], "w") as miss_csv_file: miss_writer = csv.writer(miss_csv_file) for miss_id in miss_ids: miss_writer.writerow([miss_id]) @@ -219,7 +273,7 @@ def imputation_record_from(imputation_file): num_of_problems = len(problem_ids) print("Problems: {}".format(num_of_problems)) if num_of_problems > 0: - with open(config["imputation_out_problem_file"], 'w') as problem_csv_file: + with open(config["imputation_out_problem_file"], "w") as problem_csv_file: problem_writer = csv.writer(problem_csv_file) for problem_id in problem_ids: problem_writer.writerows([problem_id]) diff --git a/grim/validation/reduce_loci.py b/grim/validation/reduce_loci.py index 503e082..4a0e51d 100755 --- a/grim/validation/reduce_loci.py +++ b/grim/validation/reduce_loci.py @@ -8,43 +8,55 @@ def write_best_prob_genotype(name_gl, res, fout, numOfResult=10): # write the output to file minBestResult = min(numOfResult, len(sorted_by_value)) for k in range(minBestResult): - fout.write(name_gl + ',' + str(sorted_by_value[k][0]) + ',' + - str(sorted_by_value[k][1]) + ',' + str(k) + '\n') + fout.write( + name_gl + + "," + + str(sorted_by_value[k][0]) + + "," + + str(sorted_by_value[k][1]) + + "," + + str(k) + + "\n" + ) + def convert_res_of_6_to_5(file_in, file_out): dict_res = {} with open(file_in) as six_file: for line in six_file: - id, gl, prob, rank = line.strip().split(',') - gl = ('^').join(gl.split('^')[:-1]) + id, gl, prob, rank = line.strip().split(",") + gl = ("^").join(gl.split("^")[:-1]) if not id in dict_res: dict_res[id] = {} if gl in dict_res[id]: - dict_res[id][gl]+= float(prob) + dict_res[id][gl] += float(prob) else: dict_res[id][gl] = float(prob) six_file.close() - f_out = open(file_out, 'w') + f_out = open(file_out, "w") for id in dict_res: write_best_prob_genotype(id, dict_res[id], f_out) f_out.close() - - parser = argparse.ArgumentParser() -parser.add_argument("-c", "--config", - required=False, - default="../conf/minimal-configuration.json", - help="Configuration JSON file", - type=str) +parser.add_argument( + "-c", + "--config", + required=False, + default="../conf/minimal-configuration.json", + help="Configuration JSON file", + type=str, +) args = parser.parse_args() configuration_file = args.config with open(configuration_file) as f: json_conf = json.load(f) -imputation_out_muug_freqs = 'output/' + json_conf.get("imputation_out_umug_freq_filename") +imputation_out_muug_freqs = "output/" + json_conf.get( + "imputation_out_umug_freq_filename" +) -convert_res_of_6_to_5(imputation_out_muug_freqs, imputation_out_muug_freqs) \ No newline at end of file +convert_res_of_6_to_5(imputation_out_muug_freqs, imputation_out_muug_freqs) diff --git a/grim/validation/runfile.py b/grim/validation/runfile.py index 16e7b82..bf44bf5 100644 --- a/grim/validation/runfile.py +++ b/grim/validation/runfile.py @@ -11,69 +11,97 @@ from ..imputation.imputegl.networkx_graph import Graph # Profiler start -#pr = cProfile.Profile() -#pr.enable() +# pr = cProfile.Profile() +# pr.enable() -def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_graph = "", 
project_dir_in_file = ""): - configuration_file = conf_file +def run_impute( + conf_file="../conf/minimal-configuration.json", + project_dir_graph="", + project_dir_in_file="", +): - #project_dir = ""# "../" - #output_dir = "output/" + configuration_file = conf_file + # project_dir = ""# "../" + # output_dir = "output/" # Read configuration file and load properties with open(configuration_file) as f: json_conf = json.load(f) graph_files_path = json_conf.get("graph_files_path") - if graph_files_path[-1] != '/': - graph_files_path += '/' + if graph_files_path[-1] != "/": + graph_files_path += "/" output_dir = json_conf.get("imuptation_out_path", "output") - if output_dir[-1] != '/': - output_dir += '/' + if output_dir[-1] != "/": + output_dir += "/" config = { - "planb": json_conf.get('planb', True), - "pops": json_conf.get('populations'), - "priority": json_conf.get('priority'), - "epsilon": json_conf.get('epsilon', 1e-3), - "number_of_results": json_conf.get('number_of_results', 1000), - "number_of_pop_results": json_conf.get('number_of_pop_results', 100), + "planb": json_conf.get("planb", True), + "pops": json_conf.get("populations"), + "priority": json_conf.get("priority"), + "epsilon": json_conf.get("epsilon", 1e-3), + "number_of_results": json_conf.get("number_of_results", 1000), + "number_of_pop_results": json_conf.get("number_of_pop_results", 100), "output_MUUG": json_conf.get("output_MUUG", True), "output_haplotypes": json_conf.get("output_haplotypes", False), - "node_file": project_dir_graph + graph_files_path + json_conf.get("node_csv_file"), - "top_links_file": project_dir_graph + graph_files_path + json_conf.get("top_links_csv_file"), - "edges_file": project_dir_graph + graph_files_path +json_conf.get("edges_csv_file"), - "imputation_input_file": project_dir_in_file + json_conf.get("imputation_in_file"), - "imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), - "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename"), - "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"), - "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename"), - "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename"), - "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename"), + "node_file": project_dir_graph + + graph_files_path + + json_conf.get("node_csv_file"), + "top_links_file": project_dir_graph + + graph_files_path + + json_conf.get("top_links_csv_file"), + "edges_file": project_dir_graph + + graph_files_path + + json_conf.get("edges_csv_file"), + "imputation_input_file": project_dir_in_file + + json_conf.get("imputation_in_file"), + "imputation_out_umug_freq_file": output_dir + + json_conf.get("imputation_out_umug_freq_filename"), + "imputation_out_umug_pops_file": output_dir + + json_conf.get("imputation_out_umug_pops_filename"), + "imputation_out_hap_freq_file": output_dir + + json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + + json_conf.get("imputation_out_hap_pops_filename"), + "imputation_out_miss_file": output_dir + + json_conf.get("imputation_out_miss_filename"), + "imputation_out_problem_file": output_dir + + json_conf.get("imputation_out_problem_filename"), "factor_missing_data": json_conf.get("factor_missing_data", 0.01), - "loci_map": json_conf.get("loci_map", {"A": 1, "B":3, "C": 2, "DQB1": 4, "DRB1": 5} ), - 
"matrix_planb": json_conf.get("Plan_B_Matrix", [ - [[1, 2, 3, 4, 5]], - [[1, 2, 3], [4, 5]], - [[1], [2, 3], [4, 5]], - [[1, 2, 3], [4], [5]], - [[1], [2, 3], [4], [5]], - [[1], [2], [3], [4], [5]] - ]), - "pops_count_file": project_dir_graph + json_conf.get("pops_count_file",'' ), - "use_pops_count_file": json_conf.get("pops_count_file",False), - "number_of_options_threshold": json_conf.get("number_of_options_threshold", 100000), - "max_haplotypes_number_in_phase": json_conf.get("max_haplotypes_number_in_phase",100 ), - "bin_imputation_input_file": project_dir_in_file + json_conf.get("bin_imputation_in_file", "None"), + "loci_map": json_conf.get( + "loci_map", {"A": 1, "B": 3, "C": 2, "DQB1": 4, "DRB1": 5} + ), + "matrix_planb": json_conf.get( + "Plan_B_Matrix", + [ + [[1, 2, 3, 4, 5]], + [[1, 2, 3], [4, 5]], + [[1], [2, 3], [4, 5]], + [[1, 2, 3], [4], [5]], + [[1], [2, 3], [4], [5]], + [[1], [2], [3], [4], [5]], + ], + ), + "pops_count_file": project_dir_graph + json_conf.get("pops_count_file", ""), + "use_pops_count_file": json_conf.get("pops_count_file", False), + "number_of_options_threshold": json_conf.get( + "number_of_options_threshold", 100000 + ), + "max_haplotypes_number_in_phase": json_conf.get( + "max_haplotypes_number_in_phase", 100 + ), + "bin_imputation_input_file": project_dir_in_file + + json_conf.get("bin_imputation_in_file", "None"), "nodes_for_plan_A": json_conf.get("Plan_A_Matrix", []), "save_mode": json_conf.get("save_space_mode", False), - "UNK_priors" : json_conf.get("UNK_priors", "MR") - + "UNK_priors": json_conf.get("UNK_priors", "MR"), } # Display the configurations we are using - print('****************************************************************************************************') + print( + "****************************************************************************************************" + ) print("Performing imputation based on:") print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) @@ -86,11 +114,23 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra print("\tTop Links File: {}".format(config["edges_file"])) print("\tInput File: {}".format(config["imputation_input_file"])) print("\tOutput UMUG Format: {}".format(config["output_MUUG"])) - print("\tOutput UMUG Freq Filename: {}".format(config["imputation_out_umug_freq_file"])) - print("\tOutput UMUG Pops Filename: {}".format(config["imputation_out_umug_pops_file"])) + print( + "\tOutput UMUG Freq Filename: {}".format( + config["imputation_out_umug_freq_file"] + ) + ) + print( + "\tOutput UMUG Pops Filename: {}".format( + config["imputation_out_umug_pops_file"] + ) + ) print("\tOutput Haplotype Format: {}".format(config["output_haplotypes"])) - print("\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"])) - print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"])) + print( + "\tOutput HAP Freq Filename: {}".format(config["imputation_out_hap_freq_file"]) + ) + print( + "\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"]) + ) print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"])) print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"])) print("\tFactor Missing Data: {}".format(config["factor_missing_data"])) @@ -98,22 +138,33 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra print("\tPlan B Matrix: {}".format(config["matrix_planb"])) print("\tPops Count File: 
{}".format(config["pops_count_file"])) print("\tUse Pops Count File: {}".format(config["use_pops_count_file"])) - print("\tNumber of Options Threshold: {}".format(config["number_of_options_threshold"])) - print("\tMax Number of haplotypes in phase: {}".format(config["max_haplotypes_number_in_phase"])) + print( + "\tNumber of Options Threshold: {}".format( + config["number_of_options_threshold"] + ) + ) + print( + "\tMax Number of haplotypes in phase: {}".format( + config["max_haplotypes_number_in_phase"] + ) + ) if config["nodes_for_plan_A"]: print("\tNodes in plan A: {}".format(config["nodes_for_plan_A"])) print("\tSave space mode: {}".format(config["save_mode"])) - print('****************************************************************************************************') - + print( + "****************************************************************************************************" + ) all_loci_set = set() for _, val in config["loci_map"].items(): all_loci_set.add(str(val)) - config["full_loci"] = ''.join(sorted(all_loci_set)) + config["full_loci"] = "".join(sorted(all_loci_set)) # Perform imputation graph = Graph(config) - graph.build_graph(config["node_file"], config["top_links_file"], config["edges_file"]) + graph.build_graph( + config["node_file"], config["top_links_file"], config["edges_file"] + ) imputation = Imputation(graph, config) # Create output directory if it doesn't exist @@ -123,5 +174,5 @@ def run_impute(conf_file = "../conf/minimal-configuration.json", project_dir_gra imputation.impute_file(config) # Profiler end - #pr.disable() - #pr.print_stats(sort="time") + # pr.disable() + # pr.print_stats(sort="time") diff --git a/grim/validation/runfile_mp.py b/grim/validation/runfile_mp.py index 5d136a9..79de5c0 100755 --- a/grim/validation/runfile_mp.py +++ b/grim/validation/runfile_mp.py @@ -15,17 +15,18 @@ pr.enable() parser = argparse.ArgumentParser() -parser.add_argument("-c", "--config", - required=False, - default="../minimal-configuration.json", - help="Configuration JSON file", - type=str) - -parser.add_argument("-d", "--processes", - required=False, - default=4, - help="number of processes", - type=int) +parser.add_argument( + "-c", + "--config", + required=False, + default="../minimal-configuration.json", + help="Configuration JSON file", + type=str, +) + +parser.add_argument( + "-d", "--processes", required=False, default=4, help="number of processes", type=int +) args = parser.parse_args() configuration_file = args.config @@ -39,28 +40,36 @@ json_conf = json.load(f) config = { - "planb": json_conf.get('planb', True), - "pops": json_conf.get('populations'), - "priority": json_conf.get('priority'), - "epsilon": json_conf.get('epsilon', 1e-3), - "number_of_results": json_conf.get('number_of_results', 1000), - "number_of_pop_results": json_conf.get('number_of_pop_results', 100), + "planb": json_conf.get("planb", True), + "pops": json_conf.get("populations"), + "priority": json_conf.get("priority"), + "epsilon": json_conf.get("epsilon", 1e-3), + "number_of_results": json_conf.get("number_of_results", 1000), + "number_of_pop_results": json_conf.get("number_of_pop_results", 100), "output_MUUG": json_conf.get("output_MUUG", True), "output_haplotypes": json_conf.get("output_haplotypes", False), "node_file": project_dir + json_conf.get("node_csv_file"), "top_links_file": project_dir + json_conf.get("top_links_csv_file"), "edges_file": project_dir + json_conf.get("edges_csv_file"), "imputation_input_file": project_dir + json_conf.get("imputation_in_file"), - 
"imputation_out_umug_freq_file": output_dir + json_conf.get("imputation_out_umug_freq_filename"), - "imputation_out_umug_pops_file": output_dir + json_conf.get("imputation_out_umug_pops_filename"), - "imputation_out_hap_freq_file": output_dir + json_conf.get("imputation_out_hap_freq_filename"), - "imputation_out_hap_pops_file": output_dir + json_conf.get("imputation_out_hap_pops_filename"), - "imputation_out_miss_file": output_dir + json_conf.get("imputation_out_miss_filename"), - "imputation_out_problem_file": output_dir + json_conf.get("imputation_out_problem_filename") + "imputation_out_umug_freq_file": output_dir + + json_conf.get("imputation_out_umug_freq_filename"), + "imputation_out_umug_pops_file": output_dir + + json_conf.get("imputation_out_umug_pops_filename"), + "imputation_out_hap_freq_file": output_dir + + json_conf.get("imputation_out_hap_freq_filename"), + "imputation_out_hap_pops_file": output_dir + + json_conf.get("imputation_out_hap_pops_filename"), + "imputation_out_miss_file": output_dir + + json_conf.get("imputation_out_miss_filename"), + "imputation_out_problem_file": output_dir + + json_conf.get("imputation_out_problem_filename"), } # Display the configurations we are using -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) print("Performing imputation based on:") print("\tPopulation: {}".format(config["pops"])) print("\tPriority: {}".format(config["priority"])) @@ -79,7 +88,9 @@ print("\tOutput HAP Pops Filename: {}".format(config["imputation_out_hap_pops_file"])) print("\tOutput Miss Filename: {}".format(config["imputation_out_miss_file"])) print("\tOutput Problem Filename: {}".format(config["imputation_out_problem_file"])) -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) # Perform imputation graph = Graph() @@ -95,12 +106,20 @@ in_dir = os.path.dirname(input_file) in_file_basename = os.path.basename(input_file) -num_subjects = os.popen('wc -l ' + config["imputation_input_file"]).read() +num_subjects = os.popen("wc -l " + config["imputation_input_file"]).read() num_subjects = int(num_subjects.strip().split(" ")[0]) print(num_subjects) -split_cmd = 'split -l ' + str( - int(math.ceil(num_subjects / num_processes))) + " " + input_file + " " + in_dir + "/" + in_file_basename[0:4] +split_cmd = ( + "split -l " + + str(int(math.ceil(num_subjects / num_processes))) + + " " + + input_file + + " " + + in_dir + + "/" + + in_file_basename[0:4] +) os.system(split_cmd) @@ -115,12 +134,12 @@ config["imputation_input_file"] = in_file out_file = output_dir + os.path.basename(in_file) - config["imputation_out_umug_freq_file"] = out_file + '.umug.freqs' - config["imputation_out_umug_pops_file"] = out_file + '.umug.pops' - config["imputation_out_hap_freq_file"] = out_file + '.hap.freqs' - config["imputation_out_hap_pops_file"] = out_file + '.hap.pops' - config["imputation_out_miss_file"] = out_file + '.miss' - config["imputation_out_problem_file"] = out_file + '.problem' + config["imputation_out_umug_freq_file"] = out_file + ".umug.freqs" + config["imputation_out_umug_pops_file"] = out_file + ".umug.pops" + config["imputation_out_hap_freq_file"] = out_file + ".hap.freqs" + config["imputation_out_hap_pops_file"] = out_file + ".hap.pops" 
+ config["imputation_out_miss_file"] = out_file + ".miss" + config["imputation_out_problem_file"] = out_file + ".problem" t = Process(target=imputation.impute_file, args=(config,)) t.start() processes.append(t) diff --git a/grim/validation/runfile_mt.py b/grim/validation/runfile_mt.py index a71f2c1..f7dffd2 100755 --- a/grim/validation/runfile_mt.py +++ b/grim/validation/runfile_mt.py @@ -4,7 +4,7 @@ import os import pathlib import threading -import string,math +import string, math from imputegl import Imputation from imputegl.networkx_graph import Graph @@ -14,17 +14,18 @@ pr.enable() parser = argparse.ArgumentParser() -parser.add_argument("-c", "--config", - required=False, - default="../minimal-configuration.json", - help="Configuration JSON file", - type=str) +parser.add_argument( + "-c", + "--config", + required=False, + default="../minimal-configuration.json", + help="Configuration JSON file", + type=str, +) -parser.add_argument("-d", "--threads", - required=False, - default=4, - help="number of threads", - type=int) +parser.add_argument( + "-d", "--threads", required=False, default=4, help="number of threads", type=int +) args = parser.parse_args() @@ -38,28 +39,30 @@ with open(configuration_file) as f: conf = json.load(f) - -number_of_pop_results = conf.get('number_of_pop_results', 100) -number_of_results = conf.get('number_of_results', 1000) -planb = conf.get('planb') -epsilon = conf.get('epsilon', 1e-3) + +number_of_pop_results = conf.get("number_of_pop_results", 100) +number_of_results = conf.get("number_of_results", 1000) +planb = conf.get("planb") +epsilon = conf.get("epsilon", 1e-3) pops = conf.get("populations") -priority = conf.get('priority') +priority = conf.get("priority") output_haplotypes = conf.get("output_haplotypes", False) output_MUUG = conf.get("output_MUUG", True) node_file = project_dir + conf.get("node_csv_file") top_linkks_file = project_dir + conf.get("top_links_csv_file") -edges_file =project_dir + conf.get("edges_csv_file") +edges_file = project_dir + conf.get("edges_csv_file") input_file = project_dir + conf.get("imputation_in_file") # Output filename is based on input filename -#output_file = output_dir + os.path.basename(input_file) + '_out' +# output_file = output_dir + os.path.basename(input_file) + '_out' # Display the configurations we are using -print('****************************************************************************************************') +print( + "****************************************************************************************************" +) print("Performing imputation based on:") print("\tPopulation: {}".format(pops)) print("\tPriority: {}".format(priority)) @@ -70,13 +73,14 @@ print("\tNodes File: {}".format(node_file)) print("\tTop Links File: {}".format(edges_file)) print("\tInput File: {}".format(input_file)) -print('****************************************************************************************************') - +print( + "****************************************************************************************************" +) # Perform imputation graph = Graph() -graph.build_graph(node_file,top_linkks_file, edges_file) +graph.build_graph(node_file, top_linkks_file, edges_file) imputation = Imputation(graph, pops) @@ -88,22 +92,43 @@ in_dir = os.path.dirname(input_file) in_file_basename = os.path.basename(input_file) -num_subjects = os.popen('wc -l ' + input_file ).read() +num_subjects = os.popen("wc -l " + input_file).read() num_subjects = int(num_subjects.strip().split(" ")[0]) print(num_subjects) -split_cmd 
='split -l ' + str(int(math.ceil(num_subjects/num_threads)))+ " "+ input_file +" "+in_dir+"/" + in_file_basename[0:4] +split_cmd = ( + "split -l " + + str(int(math.ceil(num_subjects / num_threads))) + + " " + + input_file + + " " + + in_dir + + "/" + + in_file_basename[0:4] +) os.system(split_cmd) -alpha = string.ascii_lowercase +alpha = string.ascii_lowercase for i in range(num_threads): - in_file = in_dir +"/"+in_file_basename[0:4] +"a" + alpha[i] + in_file = in_dir + "/" + in_file_basename[0:4] + "a" + alpha[i] print(in_file) - output_file = output_dir + os.path.basename(in_file) + '_out' - t = threading.Thread(target=imputation.impute_file, args = (in_file, output_file, priority, output_MUUG, - output_haplotypes,1000,epsilon,number_of_results, - number_of_pop_results,planb, )) + output_file = output_dir + os.path.basename(in_file) + "_out" + t = threading.Thread( + target=imputation.impute_file, + args=( + in_file, + output_file, + priority, + output_MUUG, + output_haplotypes, + 1000, + epsilon, + number_of_results, + number_of_pop_results, + planb, + ), + ) t.start() # Profiler end diff --git a/grim/validation/simulation/__init__.py b/grim/validation/simulation/__init__.py new file mode 100644 index 0000000..cfe8889 --- /dev/null +++ b/grim/validation/simulation/__init__.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +# +# grim Graph Imputation +# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# + + +"""Top-level package for py-grim.""" + +__organization__ = "NMDP/CIBMTR Bioinformatics" diff --git a/grim/validation/simulation/data/__init__.py b/grim/validation/simulation/data/__init__.py new file mode 100644 index 0000000..cfe8889 --- /dev/null +++ b/grim/validation/simulation/data/__init__.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- + +# +# grim Graph Imputation +# Copyright (c) 2021 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# + + +"""Top-level package for py-grim.""" + +__organization__ = "NMDP/CIBMTR Bioinformatics" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7307771 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +# Minimum requirements for the build system to execute. +requires = ["setuptools", "wheel", "cython==0.29.32"] # PEP 508 specifications. diff --git a/requirements-deploy.txt b/requirements-deploy.txt index 0e1ec03..e69de29 100644 --- a/requirements-deploy.txt +++ b/requirements-deploy.txt @@ -1,2 +0,0 @@ -connexion[swagger-ui]==2.13.0 -gunicorn==20.1.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index d2f222f..02e2bf4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,5 @@ -allure-behave==2.9.45 -flake8==4.0.1 +flake8==5.0.4 bump2version==1.0.1 -coverage==6.3.2 +coverage==6.4.4 wheel==0.37.1 -pre-commit==2.18.1 +pre-commit==2.20.0 diff --git a/requirements-tests.txt b/requirements-tests.txt index 77d53a7..2af4c79 100644 --- a/requirements-tests.txt +++ b/requirements-tests.txt @@ -1,3 +1,2 @@ pytest==7.1.2 -behave==1.2.6 PyHamcrest==2.0.3 diff --git a/requirements.txt b/requirements.txt index 27df244..f319221 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ +cython==0.29.32 numpy>=1.20.2 -networkx==2.5.1 -cython -Cython +networkx==2.5.1 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index abca7fe..d573e50 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.4 +current_version = 0.0.6 commit = True tag = True diff --git a/tests/features/algorithm/SLUG Match.feature b/tests/features/algorithm/SLUG Match.feature deleted file mode 100644 index 4cb80df..0000000 --- a/tests/features/algorithm/SLUG Match.feature +++ /dev/null @@ -1,16 +0,0 @@ -Feature: Match SLUG - - SLUGs are matched if each of the alleles of one SLUG are in the other SLUG. - - Scenario Outline: Match SLUGS by allele - - Given the SLUG for patient is - And the SLUG for donor is - When we perform SLUG match patient and donor - Then they should be - - Examples: - | Patient-SLUG | Donor-SLUG | Matched | - | HLA-C*07:02+HLA-C*16:01 | HLA-C*07:02+HLA-C*16:01 | Yes | - | HLA-A*13:02+HLA-A*13:01 | HLA-A*13:02+HLA-A*30:01 | No | - | HLA-A*13:02+HLA-B*13:01 | HLA-A*13:02+HLA-A*30:01 | No | diff --git a/tests/features/definition/Class I HLA Alleles.feature b/tests/features/definition/Class I HLA Alleles.feature deleted file mode 100644 index 9c7ad99..0000000 --- a/tests/features/definition/Class I HLA Alleles.feature +++ /dev/null @@ -1,21 +0,0 @@ -Feature: SLUGs of Class I HLA Alleles - - Class I HLA Alleles are alleles on genomic loci HLA-A, HLA-B and HLA-C. - SLUGs are Single Locus Un-phased Genotypes made up of a pair of alleles. - The alleles are lexicographically sorted. - - Scenario Outline: HLA Class I Locus SLUG - - Each gene of HLA-A, HLA-B and HLA-C have a pair of alleles, one from each parent. 
- - Given The Locus of Gene is - And The first allele is - And The second allele is - When I create an un-phased genotype - Then I get the single locus un-phased genotype - - Examples: Valid Class I Alleles - | Locus | Allele1 | Allele2 | SLUG | - | HLA-A | HLA-A*13:02 | HLA-A*30:01 | HLA-A*13:02+HLA-A*30:01 | - | HLA-B | HLA-B*38:01 | HLA-B*07:02 | HLA-B*07:02+HLA-B*38:01 | - | HLA-C | HLA-C*07:02 | HLA-C*16:01 | HLA-C*07:02+HLA-C*16:01 | diff --git a/tests/steps/HLA_alleles.py b/tests/steps/HLA_alleles.py deleted file mode 100644 index dfbd31a..0000000 --- a/tests/steps/HLA_alleles.py +++ /dev/null @@ -1,30 +0,0 @@ -from behave import * -from hamcrest import assert_that, is_ - -from grim.model.allele import Allele -from grim.model.slug import SLUG - - -@given("The Locus of Gene is {locus}") -def step_impl(context, locus): - context.locus = locus - - -@step("The first allele is {allele1}") -def step_impl(context, allele1): - context.allele_1 = Allele(context.locus, allele1) - - -@step("The second allele is {allele2}") -def step_impl(context, allele2): - context.allele_2 = Allele(context.locus, allele2) - - -@when("I create an un-phased genotype") -def step_impl(context): - context.slug = SLUG(context.allele_1, context.allele_2) - - -@then("I get the single locus un-phased genotype {slug}") -def step_impl(context, slug): - assert_that(str(context.slug), is_(slug)) diff --git a/tests/steps/SLUG_match.py b/tests/steps/SLUG_match.py deleted file mode 100644 index 6fda209..0000000 --- a/tests/steps/SLUG_match.py +++ /dev/null @@ -1,26 +0,0 @@ -from behave import * -from hamcrest import assert_that, is_ - -from grim.algorithm.match import slug_match -from grim.model.slug import SLUG - - -@given("the SLUG for patient is {slug}") -def step_impl(context, slug): - context.patient_slug = SLUG.from_glstring(slug) - - -@step("the SLUG for donor is {slug}") -def step_impl(context, slug): - context.donor_slug = SLUG.from_glstring(slug) - - -@when("we perform SLUG match patient and donor") -def step_impl(context): - context.matched = slug_match(context.patient_slug, context.donor_slug) - - -@then("they should be {is_matched}") -def step_impl(context, is_matched): - matched = is_matched == "Yes" - assert_that(context.matched, is_(matched)) diff --git a/tests/unit/test_my_project_template.py b/tests/unit/test_grim.py similarity index 100% rename from tests/unit/test_my_project_template.py rename to tests/unit/test_grim.py From fc1ceae0031e03ba67b3a6ef9c6edbf224fd95ce Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 16 Nov 2022 10:30:22 -0600 Subject: [PATCH 16/19] =?UTF-8?q?Bump=20version:=200.0.6=20=E2=86=92=200.0?= =?UTF-8?q?.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- grim/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/grim/__init__.py b/grim/__init__.py index f45a470..6e79b1e 100644 --- a/grim/__init__.py +++ b/grim/__init__.py @@ -26,4 +26,4 @@ """Top-level package for py-grim.""" __organization__ = "NMDP/CIBMTR Bioinformatics" -__version__ = "0.0.6" +__version__ = "0.0.7" diff --git a/setup.cfg b/setup.cfg index d573e50..dcf485c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.6 +current_version = 0.0.7 commit = True tag = True diff --git a/setup.py b/setup.py index e25cd17..9e43933 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ setup( name="py-graph-imputation", - version="0.0.6", + version="0.0.7", author="Pradeep 
Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8", From b567fa4c85b87590d9de361b24152dd77f56d306 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 16 Nov 2022 10:49:20 -0600 Subject: [PATCH 17/19] Added slots back --- grim/imputation/imputegl/impute.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py index 310daa9..cb4636e 100644 --- a/grim/imputation/imputegl/impute.py +++ b/grim/imputation/imputegl/impute.py @@ -119,6 +119,10 @@ def clean_up_gl(gl): class Imputation(object): + __slots__ = 'logger', 'verbose', 'populations', 'netGraph', 'priorMatrix', 'full_hapl', 'index_dict', 'full_loci', \ + 'factor', '_factor_missing_data', 'cypher', 'cypher_plan_b', 'matrix_planb', 'count_by_prob', \ + 'number_of_options_threshold', 'plan', 'option_1', 'option_2', \ + 'haplotypes_number_in_phase', 'save_space_mode', 'nodes_for_plan_A', 'unk_priors' def __init__(self, net=None, config=None, count_by_prob=None, verbose=False): """Constructor Intialize an instance of `Imputation` with a py2neo graph From 316092d4945d0d8e032d5601ec581163cde6c525 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 16 Nov 2022 11:23:47 -0600 Subject: [PATCH 18/19] black applied --- grim/conf/__init__.py | 4 +-- grim/imputation/__init__.py | 4 +-- grim/imputation/graph_generation/__init__.py | 4 +-- grim/imputation/imputegl/__init__.py | 4 +-- grim/imputation/imputegl/impute.py | 30 ++++++++++++++++---- grim/imputation/imputegl/networkx_graph.py | 9 +++++- grim/validation/__init__.py | 4 +-- 7 files changed, 43 insertions(+), 16 deletions(-) diff --git a/grim/conf/__init__.py b/grim/conf/__init__.py index 5ff1a91..d64977b 100755 --- a/grim/conf/__init__.py +++ b/grim/conf/__init__.py @@ -24,5 +24,5 @@ __author__ = """Martin Maiers""" -__email__ = 'mmaiers@nmdp.org' -__version__ = '0.0.7' +__email__ = "mmaiers@nmdp.org" +__version__ = "0.0.7" diff --git a/grim/imputation/__init__.py b/grim/imputation/__init__.py index 2511f5d..2779c9f 100755 --- a/grim/imputation/__init__.py +++ b/grim/imputation/__init__.py @@ -21,5 +21,5 @@ __author__ = """Martin Maiers""" -__email__ = 'mmaiers@nmdp.org' -__version__ = '0.0.7' +__email__ = "mmaiers@nmdp.org" +__version__ = "0.0.7" diff --git a/grim/imputation/graph_generation/__init__.py b/grim/imputation/graph_generation/__init__.py index 2511f5d..2779c9f 100755 --- a/grim/imputation/graph_generation/__init__.py +++ b/grim/imputation/graph_generation/__init__.py @@ -21,5 +21,5 @@ __author__ = """Martin Maiers""" -__email__ = 'mmaiers@nmdp.org' -__version__ = '0.0.7' +__email__ = "mmaiers@nmdp.org" +__version__ = "0.0.7" diff --git a/grim/imputation/imputegl/__init__.py b/grim/imputation/imputegl/__init__.py index 2d171e6..bec990d 100755 --- a/grim/imputation/imputegl/__init__.py +++ b/grim/imputation/imputegl/__init__.py @@ -23,5 +23,5 @@ __author__ = """Martin Maiers""" -__email__ = 'mmaiers@nmdp.org' -__version__ = '0.0.7' +__email__ = "mmaiers@nmdp.org" +__version__ = "0.0.7" diff --git a/grim/imputation/imputegl/impute.py b/grim/imputation/imputegl/impute.py index cb4636e..1cdc63a 100644 --- a/grim/imputation/imputegl/impute.py +++ b/grim/imputation/imputegl/impute.py @@ -119,10 +119,31 @@ def clean_up_gl(gl): class Imputation(object): - __slots__ = 'logger', 'verbose', 'populations', 'netGraph', 'priorMatrix', 'full_hapl', 'index_dict', 'full_loci', \ - 'factor', '_factor_missing_data', 'cypher', 'cypher_plan_b', 'matrix_planb', 'count_by_prob', \ - 'number_of_options_threshold', 
'plan', 'option_1', 'option_2', \ - 'haplotypes_number_in_phase', 'save_space_mode', 'nodes_for_plan_A', 'unk_priors' + __slots__ = ( + "logger", + "verbose", + "populations", + "netGraph", + "priorMatrix", + "full_hapl", + "index_dict", + "full_loci", + "factor", + "_factor_missing_data", + "cypher", + "cypher_plan_b", + "matrix_planb", + "count_by_prob", + "number_of_options_threshold", + "plan", + "option_1", + "option_2", + "haplotypes_number_in_phase", + "save_space_mode", + "nodes_for_plan_A", + "unk_priors", + ) + def __init__(self, net=None, config=None, count_by_prob=None, verbose=False): """Constructor Intialize an instance of `Imputation` with a py2neo graph @@ -394,7 +415,6 @@ def get_haplo_freqs(self, haplos, epsilon, n=25000): # haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] return self.netGraph.adjs_query(haplos_joined) - # def get_haplo_freqs_miss(self, haplos, epsilon): # haplo_probs = {} # haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] diff --git a/grim/imputation/imputegl/networkx_graph.py b/grim/imputation/imputegl/networkx_graph.py index 49c5b83..1d997ab 100755 --- a/grim/imputation/imputegl/networkx_graph.py +++ b/grim/imputation/imputegl/networkx_graph.py @@ -9,7 +9,14 @@ def missing(labelA, labelB): class Graph(object): - __slots__ = 'graph', 'labelDict', 'whole_graph', 'full_loci', 'nodes_plan_a', 'nodes_plan_b' + __slots__ = ( + "graph", + "labelDict", + "whole_graph", + "full_loci", + "nodes_plan_a", + "nodes_plan_b", + ) def __init__(self, config): self.graph = nx.DiGraph() diff --git a/grim/validation/__init__.py b/grim/validation/__init__.py index 2511f5d..2779c9f 100755 --- a/grim/validation/__init__.py +++ b/grim/validation/__init__.py @@ -21,5 +21,5 @@ __author__ = """Martin Maiers""" -__email__ = 'mmaiers@nmdp.org' -__version__ = '0.0.7' +__email__ = "mmaiers@nmdp.org" +__version__ = "0.0.7" From 02334ee16d9970bf166ebfaa093a39a5fd9fafe6 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 16 Nov 2022 11:28:10 -0600 Subject: [PATCH 19/19] black applied --- setup.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 9e43933..c012b44 100644 --- a/setup.py +++ b/setup.py @@ -28,11 +28,12 @@ from setuptools import setup from Cython.Build import cythonize + # import numpy - # include_dirs=[numpy.get_include()], - # requires=['numpy', 'Cython']) +# include_dirs=[numpy.get_include()], +# requires=['numpy', 'Cython']) from setuptools import setup, find_packages, Extension @@ -71,17 +72,27 @@ long_description_content_type="text/markdown", include_package_data=True, keywords="grim", - packages=find_packages(include=[ + packages=find_packages( + include=[ "grim", "grim.imputation", "grim.imputation.imputegl", "grim.imputation.graph_generation", "grim.validation", "grim.conf", - ]), + ] + ), test_suite="tests", tests_require=test_requirements, url="https://github.com/nmdp-bioinformatics/py-grim", zip_safe=False, - ext_modules=cythonize([Extension("grim.imputation.imputegl.cutils", ["grim/imputation/imputegl/cutils.pyx"])], language_level="3") + ext_modules=cythonize( + [ + Extension( + "grim.imputation.imputegl.cutils", + ["grim/imputation/imputegl/cutils.pyx"], + ) + ], + language_level="3", + ), )
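
For anyone applying this series locally, the build wiring above (the pinned cython==0.29.32 in pyproject.toml/requirements.txt plus the cythonize(...) entry in setup.py) can be smoke-tested with a short sketch. This snippet is illustrative and not part of the patch series: the build commands assume a checkout of py-graph-imputation at the repository root, the sample allele strings are made up, and deepcopy_list is one of the helpers the series compiles from grim/imputation/imputegl/cutils.pyx.

    # Hypothetical smoke test for the Cython extension introduced in this series.
    # Build it first, from the repository root:
    #   pip install "cython==0.29.32"
    #   python setup.py build_ext --inplace
    from grim.imputation.imputegl.cutils import deepcopy_list

    nested = [["A*01:01", "B*08:01"], ["A*02:01", "B*07:02"]]  # made-up alleles
    copied = deepcopy_list(nested)

    # A correct deep copy is equal in value but shares no inner list objects.
    assert copied == nested
    assert copied[0] is not nested[0]

If the compiled module imports and the assertions pass, the extension was built and wired into the package path (grim.imputation.imputegl.cutils) as declared in setup.py's ext_modules.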