From 9903f84a2cb73deb16999b205044a351420186c9 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Tue, 29 Nov 2022 13:13:24 -0600 Subject: [PATCH] added back files from sapiris master --- grim/imputation/impute.py | 341 ++++++++++++++---------------- grim/imputation/networkx_graph.py | 62 ++---- 2 files changed, 178 insertions(+), 225 deletions(-) diff --git a/grim/imputation/impute.py b/grim/imputation/impute.py index 1cdc63a..bf9d585 100644 --- a/grim/imputation/impute.py +++ b/grim/imputation/impute.py @@ -302,7 +302,7 @@ def gen_phases(self, gen, n_loci, b_phases): return Phases - def open_gl_string(self, gl_string): + def open_gl_string(self, gl_string, cutoff): # receives a list of phases and computes haps and # probabilties and accumulate cartesian productEpsilon=0.0001 chr = self.gl2haps(gl_string) @@ -310,36 +310,35 @@ def open_gl_string(self, gl_string): return # generate the 2^(N-1) phases - chr1 = self.gen_phases(chr["Genotype"], chr["N_Loc"]) + chr1 = self.gen_phases(chr["Genotype"], chr["N_Loc"], None) # return if the result is empty (why would that be?) if chr1 == []: return - phases = self.open_phases_1(chr1, chr["N_Loc"], gl_string) + phases = self.open_phases_for_em(chr1, chr["N_Loc"], cutoff) return phases - def open_phases_1(self, haps, N_Loc, gl_string): + def open_phases_for_em(self, haps, N_Loc, cutoff): phases = [] for j in range(len(haps)): H1 = [] H2 = [] - ## fq = pa.DataFrame() - fq = [] for k in range(2): - hap_list = [] - hap_list.append(haps[j][k]) - + hap_list = [haps[j][k]] + hap_list_splits = [tuple(allele.split("/")) for allele in hap_list[0]] # compute the number of options: options = 1 for i in range(N_Loc): - options = options * (len(hap_list[0][i].split("/"))) + options *= len(hap_list_splits[i]) # if the number of options is smaller than the total number of nodes: - if options < 100: # open ambiguities regularly: + if options < cutoff: # open ambiguities regularly: for i in range(N_Loc): - hap_list = self.open_ambiguities(hap_list, i) + hap_list = self.open_ambiguities( + hap_list, i, hap_list_splits[i] + ) if k == 0: H1.append(hap_list) else: @@ -351,24 +350,6 @@ def open_phases_1(self, haps, N_Loc, gl_string): phases.append([H1, H2]) return phases - def open_ambiguities(self, hap, loc): - # This opens all allele ambiguities - hap_new = [] # produces an empty list of haplotypes - for k in range(len(hap)): # slit a given locus in all haps. - splitHap = hap[k][loc].split("/") - if splitHap == hap[k][loc].split(): - split_loc = hap[k][loc].split("|") - else: - split_loc = splitHap - hap1 = hap[k] - if len(split_loc) > 1: - for i in range(len(split_loc)): - hap1[loc] = split_loc[i] - hap_new.append(hap1[:]) - else: - hap_new.append(hap1[:]) - return hap_new - def comp_hap_prob(self, Hap, N_Loc, epsilon, n): haplo_probs = self.get_haplo_freqs(Hap, epsilon, n) probs = list(haplo_probs.values()) @@ -410,7 +391,7 @@ def comp_hap_prob(self, Hap, N_Loc, epsilon, n): # return haplo_probs def get_haplo_freqs(self, haplos, epsilon, n=25000): - haplos_joined = ["~".join(item) for sublist in haplos for item in sublist] ### + haplos_joined = ["~".join(item) for item in haplos[0]] ### # haplos_joined = [item for sublist in haplos for item in sublist] ### # haplos_joined = ["~".join(sorted(item)) for sublist in haplos for item in sublist] return self.netGraph.adjs_query(haplos_joined) @@ -895,8 +876,7 @@ def reduce_phase_to_valid_allels(self, haps, N_Loc, planc=False): gen = g.split("/") probs = self.check_if_alleles_exist(gen) if not probs == {}: - list(probs.keys()) - haps[j][k][i] = ("/").join(list(probs.keys())) + haps[j][k][i] = "/".join(list(probs.keys())) def reduce_phase_to_commons_alleles( self, haps, N_Loc, commons_number=1, planc=False @@ -946,13 +926,16 @@ def open_phases(self, haps, N_Loc, gl_string): # compute the number of options: options = 1 for i in range(N_Loc): - options = options * (len(hap_list[0][i].split("/"))) + options *= len(hap_list_splits[i]) # if the number of options is smaller than the total number of nodes: if options < self.number_of_options_threshold: # open ambiguities regularly: for i in range(N_Loc): - hap_list = self.open_ambiguities(hap_list, i) + hap_list = self.open_ambiguities( + hap_list, i, hap_list_splits[i] + ) + if k == 0: H1.append(hap_list) else: @@ -963,21 +946,21 @@ def open_phases(self, haps, N_Loc, gl_string): # if there are more options than actual haplotypes possible: else: self.option_2 += 1 - optionDict = {} + optionDict = {} # set() if len(fq) == 0: - list = [] + _list = [] for (gen, name) in self.cypher.loc_map.items(): count = 0 for i in range(len(hap_list[0])): - if hap_list[0][i].split("*")[0] == gen: + if hap_list[0][i].split("*", 1)[0] == gen: count = count + 1 if count > 0: - list.append(name) + _list.append(name) # we'll get all the options possible - # (query,lc)=self.cypher.buildQuery(["~".join(list)]) + # (query,lc)=self.cypher.buildQuery(["~".join(_list)]) # fq = pa.DataFrame(self.graph.data(query)) - label = "".join(sorted(list)) + label = "".join(sorted(_list)) fq = self.netGraph.haps_by_label(label) # fq = pa.DataFrame(self.netGraph.abcdq_allele(), ) # fq = self.netGraph.abcdq_allele() @@ -989,28 +972,14 @@ def open_phases(self, haps, N_Loc, gl_string): gen=g.split('/') for option in gen: optionDict[option]=True""" - for hap_k in hap_list: - # listType = [] - for g in hap_k: - gen = g.split("/") - for option in gen: - optionDict[option] = True - # print(listType) + + for gen in hap_list_splits: + for option in gen: + optionDict[option] = True + # optionDict.add(option) ##all_haps = fq.values.tolist() - all_haps = fq - hap_list = [] - for i in range(len(all_haps)): - count = 0 - for gen in all_haps[i].split("~"): - if not gen in optionDict: - break - else: - count = count + 1 - if count == N_Loc: - - # all_haps[i][0]=all_haps[i][0].split("~") - # hap_list.append(all_haps[i][0]) - hap_list.append(all_haps[i].split("~")) + hap_list = create_hap_list(fq, optionDict, N_Loc) + if k == 0: H1.append(hap_list) else: @@ -1147,7 +1116,7 @@ def find_option_freq(self, option, haplos, missing): def comp_hap_prob_plan_b(self, Hap, division, missing): if division[0] == list(set(self.index_dict.values())): # [1, 2, 3, 4, 5]: - return self.comp_hap_prob(Hap[0], 0, 0, 0) + return self.comp_hap_prob([Hap[0]], 0, 0, 0) haplo_probs = self.find_option_freq(division, Hap, missing) # Return {'Haps': haplos, 'Probs': probs} of the dict @@ -1609,6 +1578,9 @@ def input_type(self, haplotype): type_list.append(self.index_dict[locus]) return type_list + def open_ambiguities(self, hap, loc, split_loc): + return open_ambiguities(hap, loc, split_loc) + def comp_cand( self, gl_string, binary, epsilon, n, MUUG_output, haps_output, planb, em ): @@ -1656,7 +1628,9 @@ def comp_cand( if phases: if MUUG_output: - prior_matrix_orig = copy.deepcopy(self.priorMatrix) + prior_matrix_orig = np.array( + self.priorMatrix, order="K", copy=True + ) # copy.deepcopy(self.priorMatrix) res_muugs = self.call_comp_phase_prob( epsilon, n, phases, chr, True, planb ) @@ -1721,13 +1695,9 @@ def call_comp_phase_prob(self, epsilon, n, phases, chr, MUUG_output, planb): # no plan b for level in range(2): if level == 1: - if self.unk_priors == "MR": - self.priorMatrix = np.ones( - (len(self.populations), len(self.populations)) - ) - else: - self.priorMatrix = np.identity(len(self.populations)) - # self.priorMatrix = np.ones((len(self.populations), len(self.populations))) #### + self.priorMatrix = np.ones( + (len(self.populations), len(self.populations)) + ) #### if planb and len(res["Haps"]) == 0: self.plan = "b" epsilon = 1e-14 @@ -2048,129 +2018,130 @@ def impute_file(self, config, planb=None, em_mr=False, em=False): ##em with f as lines: for (i, name_gl) in enumerate(lines): - # try: - name_gl = name_gl.rstrip() # remove trailing whitespace - if "," in name_gl: - list_gl = name_gl.split(",") - else: - list_gl = name_gl.split("%") - - subject_id = list_gl[0] - subject_gl = list_gl[1] - subject_bin = [1] * (len(self.full_loci) - 1) - if f_bin_exist: - subject_bin = f_bin[subject_id] - race1 = race2 = None - if len(list_gl) > 2: - race1 = list_gl[2] - race2 = list_gl[3] - - start = timeit.default_timer() - - # - # Impute One - # This method will impute one subject given the parameters - self.plan = "a" - self.option_1 = 0 - self.option_2 = 0 - subject_id, res_muugs, res_haps = self.impute_one( - subject_id, - subject_gl, - subject_bin, - race1, - race2, - priority, - epsilon, - n, - MUUG_output, - haps_output, - planb, - em, - ) # em - - if res_muugs is None: - problem.write(str(i) + "," + str(subject_id) + "\n") - continue + try: + name_gl = name_gl.rstrip() # remove trailing whitespace + if "," in name_gl: + list_gl = name_gl.split(",") + else: + list_gl = name_gl.split("%") + + subject_id = list_gl[0] + subject_gl = list_gl[1] + subject_bin = [1] * (len(self.full_loci) - 1) + if f_bin_exist: + subject_bin = f_bin[subject_id] + race1 = race2 = None + if len(list_gl) > 2: + race1 = list_gl[2] + race2 = list_gl[3] + + start = timeit.default_timer() + + # + # Impute One + # This method will impute one subject given the parameters + self.plan = "a" + self.option_1 = 0 + self.option_2 = 0 + subject_id, res_muugs, res_haps = self.impute_one( + subject_id, + subject_gl, + subject_bin, + race1, + race2, + priority, + epsilon, + n, + MUUG_output, + haps_output, + planb, + em, + ) # em - if (len(res_haps["Haps"]) == 0 or res_haps["Haps"] == "NaN") and len( - res_muugs["Haps"] - ) == 0: - miss.write(str(i) + "," + str(subject_id) + "\n") - - if haps_output: - haps = res_haps["Haps"] - probs = res_haps["Probs"] - pops = res_haps["Pops"] - print( - "{index} Subject: {id} {hap_length} haplotypes".format( - index=i, id=subject_id, hap_length=len(haps) - ) - ) - if em_mr: - write_best_hap_race_pairs( - subject_id, - haps, - pops, - probs, - number_of_results, - fout_hap_haplo, + if res_muugs is None: + problem.write(str(i) + "," + str(subject_id) + "\n") + continue + + if ( + len(res_haps["Haps"]) == 0 or res_haps["Haps"] == "NaN" + ) and len(res_muugs["Haps"]) == 0: + miss.write(str(i) + "," + str(subject_id) + "\n") + + if haps_output: + haps = res_haps["Haps"] + probs = res_haps["Probs"] + pops = res_haps["Pops"] + print( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) ) - write_best_prob(subject_id, pops, probs, 1, fout_pop_haplo) - else: - write_best_prob( - subject_id, - haps, - probs, - number_of_results, - fout_hap_haplo, - "+", + if em_mr: + write_best_hap_race_pairs( + subject_id, + haps, + pops, + probs, + number_of_results, + fout_hap_haplo, + ) + write_best_prob(subject_id, pops, probs, 1, fout_pop_haplo) + else: + write_best_prob( + subject_id, + haps, + probs, + number_of_results, + fout_hap_haplo, + "+", + ) + write_best_prob( + subject_id, + pops, + probs, + number_of_pop_results, + fout_pop_haplo, + ) + if MUUG_output: + haps = res_muugs["Haps"] + pops = res_muugs["Pops"] + print( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) ) - write_best_prob( - subject_id, - pops, - probs, - number_of_pop_results, - fout_pop_haplo, + write_best_prob_genotype( + subject_id, haps, number_of_results, fout_hap_muug ) - if MUUG_output: - haps = res_muugs["Haps"] - pops = res_muugs["Pops"] - print( - "{index} Subject: {id} {hap_length} haplotypes".format( - index=i, id=subject_id, hap_length=len(haps) + write_best_prob_genotype( + subject_id, pops, number_of_pop_results, fout_pop_muug ) - ) - write_best_prob_genotype( - subject_id, haps, number_of_results, fout_hap_muug - ) - write_best_prob_genotype( - subject_id, pops, number_of_pop_results, fout_pop_muug - ) - if self.verbose: - self.logger.info( - "{index} Subject: {id} {hap_length} haplotypes".format( - index=i, id=subject_id, hap_length=len(haps) + if self.verbose: + self.logger.info( + "{index} Subject: {id} {hap_length} haplotypes".format( + index=i, id=subject_id, hap_length=len(haps) + ) ) - ) - self.logger.info( - "{index} Subject: {id} plan: {plan} oppen_phases - count of open regular option: {option1}, count of alternative opening: {option2}".format( - index=i, - id=subject_id, - plan=self.plan, - option1=self.option_1, - option2=self.option_2, + self.logger.info( + "{index} Subject: {id} plan: {plan} oppen_phases - count of open regular option: {option1}, count of alternative opening: {option2}".format( + index=i, + id=subject_id, + plan=self.plan, + option1=self.option_1, + option2=self.option_2, + ) ) - ) - stop = timeit.default_timer() - time_taken = stop - start - print(time_taken) - if self.verbose: - self.logger.info("Time taken: " + str(time_taken)) - """except: - problem.write(str(name_gl) + "\n") - continue""" + stop = timeit.default_timer() + time_taken = stop - start + print(time_taken) + if self.verbose: + self.logger.info("Time taken: " + str(time_taken)) + except: + print(f"{i} Subject: {subject_id} - Exception") + problem.write(str(name_gl) + "\n") + continue f.close() if MUUG_output: diff --git a/grim/imputation/networkx_graph.py b/grim/imputation/networkx_graph.py index 1d997ab..e647e6c 100755 --- a/grim/imputation/networkx_graph.py +++ b/grim/imputation/networkx_graph.py @@ -25,7 +25,7 @@ def __init__(self, config): self.full_loci = config["full_loci"] self.nodes_plan_a, self.nodes_plan_b = [], [] if config["nodes_for_plan_A"]: - path = ("/").join(config["node_file"].split("/")[:-1]) + path = "/".join(config["node_file"].split("/")[:-1]) # bug: dies if file doesn't exist # bug: list_f doesn't exist @@ -45,7 +45,7 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile): # add nodes from file with open(nodesFile) as nodesfile: readNodes = csv.reader(nodesfile, delimiter=",") - firstLine = next(readNodes) + next(readNodes) for row in readNodes: if len(row) > 0: if not self.nodes_plan_a or row[2] in self.nodes_plan_a: @@ -67,7 +67,7 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile): # add edges from file with open(edgesFile) as edgesfile: readEdges = csv.reader(edgesfile, delimiter=",") - firstLine = next(readEdges) + next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] @@ -83,21 +83,21 @@ def build_graph(self, nodesFile, edgesFile, allEdgesFile): # add edges from file with open(allEdgesFile) as allEdgesfile: readEdges = csv.reader(allEdgesfile, delimiter=",") - firstLine = next(readEdges) + next(readEdges) for row in readEdges: if len(row) > 0: node1 = nodesDict[row[0]] node2 = nodesDict[row[1]] - kind = "-".join( - sorted( - [ - self.whole_graph.nodes[node1]["label"], - self.whole_graph.nodes[node2]["label"], - ], - key=len, - ) - ) - self.whole_graph.add_edge(node1, node2, color=kind) + if len(self.whole_graph.nodes[node1]["label"]) < len( + self.whole_graph.nodes[node2]["label"] + ): + connector = self.whole_graph.nodes[node2]["label"] + node1 + self.whole_graph.add_edge(node1, connector) + self.whole_graph.add_edge(connector, node2) + else: + connector = self.whole_graph.nodes[node1]["label"] + node2 + self.whole_graph.add_edge(node2, connector) + self.whole_graph.add_edge(connector, node1) allEdgesfile.close() nodesDict.clear() @@ -137,9 +137,10 @@ def haps_with_probs_by_label(self, label): def adjs_query(self, alleleList): adjDict = dict() for allele in alleleList: - if allele in self.graph.nodes(): - if self.graph.nodes[allele]["label"] == self.full_loci: # 'ABCQR': - adjDict[allele] = self.graph.nodes[allele]["freq"] + if allele in self.graph: + allele_node = self.graph.nodes[allele] + if allele_node["label"] == self.full_loci: # 'ABCQR': + adjDict[allele] = allele_node["freq"] else: adjs = self.graph.adj[allele] for adj in adjs: @@ -154,27 +155,8 @@ def adjs_query_by_color(self, alleleList, labelA, labelB): return self.node_probs(alleleList, labelA) for allele in alleleList: - if allele in self.whole_graph.nodes(): - copyLabelA = labelA - newLabelA = labelA - miss = missing(labelA, labelB) - alleles = [allele] - - while len(miss) > 0: - tmpAllels = list() - newLabelA = copyLabelA + miss[0] - newLabelA = "".join(sorted(newLabelA)) - del miss[0] - for oneAllel in alleles: - # alleles.remove(oneAllel) - adjs = self.whole_graph.adj[oneAllel] - label = copyLabelA + "-" + newLabelA - for adj in adjs: - if adjs[adj]["color"] == label: - tmpAllels.append(adj) - alleles = tmpAllels - copyLabelA = newLabelA - + if allele in self.whole_graph: + alleles = self.whole_graph.adj.get(labelB + allele, []) for adj in alleles: adjDict[adj] = self.whole_graph.nodes[adj]["freq"] return adjDict @@ -184,10 +166,10 @@ def node_probs(self, nodes, label): nodesDict = dict() if not self.nodes_plan_b or label in self.nodes_plan_b: for node in nodes: - if node in self.whole_graph.nodes(): + if node in self.whole_graph: nodesDict[node] = self.whole_graph.nodes[node]["freq"] elif label in self.nodes_plan_a: for node in nodes: - if node in self.whole_graph.nodes(): + if node in self.whole_graph: nodesDict[node] = self.graph.nodes[node]["freq"] return nodesDict