forked from genenetwork/genenetwork2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
58 changed files
with
3,168 additions
and
1,328 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
#!/usr/bin/python | ||
|
||
""" | ||
Convert .geno files to json | ||
This file goes through all of the genofiles in the genofile directory (.geno) | ||
and converts them to json files that are used when running the marker regression | ||
code | ||
""" | ||
|
||
from __future__ import print_function, division, absolute_import | ||
import sys | ||
sys.path.append("..") | ||
import os | ||
import glob | ||
import traceback | ||
import gzip | ||
|
||
#import numpy as np | ||
#from pyLMM import lmm | ||
|
||
import simplejson as json | ||
|
||
from pprint import pformat as pf | ||
|
||
class EmptyConfigurations(Exception): pass | ||
|
||
|
||
|
||
class Marker(object): | ||
def __init__(self): | ||
self.name = None | ||
self.chr = None | ||
self.cM = None | ||
self.Mb = None | ||
self.genotypes = [] | ||
|
||
class ConvertGenoFile(object): | ||
|
||
def __init__(self, input_file, output_files): | ||
|
||
self.input_file = input_file | ||
self.output_files = output_files | ||
|
||
self.mb_exists = False | ||
self.cm_exists = False | ||
self.markers = [] | ||
|
||
self.latest_row_pos = None | ||
self.latest_col_pos = None | ||
|
||
self.latest_row_value = None | ||
self.latest_col_value = None | ||
|
||
def convert(self): | ||
|
||
self.haplotype_notation = { | ||
'@mat': "1", | ||
'@pat': "0", | ||
'@het': "0.5", | ||
'@unk': "NA" | ||
} | ||
|
||
self.configurations = {} | ||
#self.skipped_cols = 3 | ||
|
||
#if self.input_file.endswith(".geno.gz"): | ||
# print("self.input_file: ", self.input_file) | ||
# self.input_fh = gzip.open(self.input_file) | ||
#else: | ||
self.input_fh = open(self.input_file) | ||
|
||
with open(self.output_files[0], "w") as self.geno_fh: | ||
#if self.file_type == "geno": | ||
self.process_csv() | ||
#elif self.file_type == "snps": | ||
# self.process_snps_file() | ||
|
||
|
||
def process_csv(self): | ||
for row_count, row in enumerate(self.process_rows()): | ||
row_items = row.split("\t") | ||
|
||
this_marker = Marker() | ||
this_marker.name = row_items[1] | ||
this_marker.chr = row_items[0] | ||
if self.cm_exists and self.mb_exists: | ||
this_marker.cM = row_items[2] | ||
this_marker.Mb = row_items[3] | ||
genotypes = row_items[4:] | ||
elif self.cm_exists: | ||
this_marker.cM = row_items[2] | ||
genotypes = row_items[3:] | ||
elif self.mb_exists: | ||
this_marker.Mb = row_items[2] | ||
genotypes = row_items[3:] | ||
else: | ||
genotypes = row_items[2:] | ||
for item_count, genotype in enumerate(genotypes): | ||
if genotype.upper().strip() in self.configurations: | ||
this_marker.genotypes.append(self.configurations[genotype.upper().strip()]) | ||
else: | ||
this_marker.genotypes.append("NA") | ||
|
||
#print("this_marker is:", pf(this_marker.__dict__)) | ||
#if this_marker.chr == "14": | ||
self.markers.append(this_marker.__dict__) | ||
|
||
self.write_to_bimbam() | ||
|
||
# with open(self.output_file, 'w') as fh: | ||
# json.dump(self.markers, fh, indent=" ", sort_keys=True) | ||
|
||
# print('configurations:', str(configurations)) | ||
#self.latest_col_pos = item_count + self.skipped_cols | ||
#self.latest_col_value = item | ||
|
||
#if item_count != 0: | ||
# self.output_fh.write(" ") | ||
#self.output_fh.write(self.configurations[item.upper()]) | ||
|
||
#self.output_fh.write("\n") | ||
|
||
def write_to_bimbam(self): | ||
with open(self.output_files[0], "w") as geno_fh: | ||
# geno_fh.write(str(len(self.sample_list)) + "\n") | ||
# geno_fh.write("2\n") | ||
# geno_fh.write("IND") | ||
# for sample in self.sample_list: | ||
# geno_fh.write(" " + sample) | ||
# geno_fh.write("\n") | ||
for marker in self.markers: | ||
geno_fh.write(marker['name']) | ||
geno_fh.write(", X, Y") | ||
geno_fh.write(", " + ", ".join(marker['genotypes'])) | ||
geno_fh.write("\n") | ||
|
||
#pheno_fh = open(self.output_files[1], 'w') | ||
with open(self.output_files[1], "w") as pheno_fh: | ||
for sample in self.sample_list: | ||
pheno_fh.write("1\n") | ||
|
||
with open(self.output_files[2], "w") as snp_fh: | ||
for marker in self.markers: | ||
if self.mb_exists: | ||
snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n") | ||
else: | ||
snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n") | ||
|
||
|
||
def get_sample_list(self, row_contents): | ||
self.sample_list = [] | ||
if self.mb_exists: | ||
if self.cm_exists: | ||
self.sample_list = row_contents[4:] | ||
else: | ||
self.sample_list = row_contents[3:] | ||
else: | ||
if self.cm_exists: | ||
self.sample_list = row_contents[3:] | ||
else: | ||
self.sample_list = row_contents[2:] | ||
|
||
def process_rows(self): | ||
for self.latest_row_pos, row in enumerate(self.input_fh): | ||
#if self.input_file.endswith(".geno.gz"): | ||
# print("row: ", row) | ||
self.latest_row_value = row | ||
# Take care of headers | ||
if not row.strip(): | ||
continue | ||
if row.startswith('#'): | ||
continue | ||
if row.startswith('Chr'): | ||
if 'Mb' in row.split(): | ||
self.mb_exists = True | ||
if 'cM' in row.split(): | ||
self.cm_exists = True | ||
self.get_sample_list(row.split()) | ||
continue | ||
if row.startswith('@'): | ||
key, _separater, value = row.partition(':') | ||
key = key.strip() | ||
value = value.strip() | ||
if key in self.haplotype_notation: | ||
self.configurations[value] = self.haplotype_notation[key] | ||
continue | ||
if not len(self.configurations): | ||
raise EmptyConfigurations | ||
yield row | ||
|
||
@classmethod | ||
def process_all(cls, old_directory, new_directory): | ||
os.chdir(old_directory) | ||
for input_file in glob.glob("*"): | ||
if not input_file.endswith(('geno', '.geno.gz')): | ||
continue | ||
group_name = ".".join(input_file.split('.')[:-1]) | ||
geno_output_file = os.path.join(new_directory, group_name + "_geno.txt") | ||
pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt") | ||
snp_output_file = os.path.join(new_directory, group_name + "_snps.txt") | ||
output_files = [geno_output_file, pheno_output_file, snp_output_file] | ||
print("%s -> %s" % ( | ||
os.path.join(old_directory, input_file), geno_output_file)) | ||
convertob = ConvertGenoFile(input_file, output_files) | ||
try: | ||
convertob.convert() | ||
except EmptyConfigurations as why: | ||
print(" No config info? Continuing...") | ||
#excepted = True | ||
continue | ||
except Exception as why: | ||
|
||
print(" Exception:", why) | ||
print(traceback.print_exc()) | ||
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, | ||
convertob.latest_col_pos)) | ||
print(" Column is:", convertob.latest_col_value) | ||
print(" Row is:", convertob.latest_row_value) | ||
break | ||
|
||
#def process_snps_file(cls, snps_file, new_directory): | ||
# output_file = os.path.join(new_directory, "mouse_families.json") | ||
# print("%s -> %s" % (snps_file, output_file)) | ||
# convertob = ConvertGenoFile(input_file, output_file) | ||
|
||
|
||
if __name__=="__main__": | ||
Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/""" | ||
New_Geno_Directory = """/home/zas1024/genotype_files/genotype/bimbam/""" | ||
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" | ||
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" | ||
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") | ||
#convertob.convert() | ||
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) | ||
#ConvertGenoFiles(Geno_Directory) | ||
|
||
#process_csv(Input_File, Output_File) |
Oops, something went wrong.