Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
pjotrp committed Dec 20, 2017
2 parents 99b8952 + d692e4f commit 6ba4c48
Show file tree
Hide file tree
Showing 58 changed files with 3,168 additions and 1,328 deletions.
2 changes: 2 additions & 0 deletions bin/genenetwork2
Expand Up @@ -74,6 +74,7 @@ else
export PATH=$GN2_PROFILE/bin:$PATH
export PYTHONPATH=$GN2_PROFILE/lib/python2.7/site-packages
export R_LIBS_SITE=$GN2_PROFILE/site-library
export GEM_PATH=$GN2_PROFILE/lib/ruby/gems/2.4.0
export JS_GUIX_PATH=$GN2_PROFILE/share/genenetwork2/javascript
export GUIX_GTK3_PATH="$GN2_PROFILE/lib/gtk-3.0"
export GI_TYPELIB_PATH="$GN2_PROFILE/lib/girepository-1.0"
Expand All @@ -84,6 +85,7 @@ else
export PLINK_COMMAND="$GN2_PROFILE/bin/plink2"
export PYLMM_COMMAND="$GN2_PROFILE/bin/pylmm_redis"
export GEMMA_COMMAND="$GN2_PROFILE/bin/gemma"
export GEMMA_WRAPPER_COMMAND="$GN2_PROFILE/bin/gemma-wrapper"
fi
if [ -z $PYTHONPATH ] ; then
echo "ERROR PYTHONPATH has not been set - use GN2_PROFILE!"
Expand Down
1 change: 1 addition & 0 deletions etc/default_settings.py
Expand Up @@ -74,3 +74,4 @@
# PYLMM_COMMAND = str.strip(os.popen("which pylmm_redis").read())
# PLINK_COMMAND = str.strip(os.popen("which plink2").read())
# GEMMA_COMMAND = str.strip(os.popen("which gemma").read())
# GEMMA_WRAPPER_COMMAND = str.strip(os.popen("which gemma-wrapper").read())
16 changes: 11 additions & 5 deletions wqflask/base/data_set.py
Expand Up @@ -111,7 +111,7 @@ def __init__(self):
new_type = "ProbeSet"
self.datasets[short_dataset_name] = new_type
# Set LOG_LEVEL_DEBUG=5 to see the following:
logger.debugf(5,"datasets",self.datasets)
logger.debugf(5, "datasets",self.datasets)

def __call__(self, name):
return self.datasets[name]
Expand Down Expand Up @@ -169,7 +169,7 @@ def mescape(*items):
class Markers(object):
"""Todo: Build in cacheing so it saves us reading the same file more than once"""
def __init__(self, name):
json_data_fh = open(locate(name + '.json','genotype/json'))
json_data_fh = open(locate(name + ".json",'genotype/json'))
try:
markers = json.load(json_data_fh)
except:
Expand Down Expand Up @@ -334,7 +334,10 @@ def check_plink_gemma():
else:
marker_class = Markers

self.markers = marker_class(self.name)
if self.genofile:
self.markers = marker_class(self.genofile[:-5])
else:
self.markers = marker_class(self.name)

def get_f1_parent_strains(self):
try:
Expand Down Expand Up @@ -423,13 +426,15 @@ def datasets(group_name, this_group = None):
FROM PublishFreeze,InbredSet
WHERE PublishFreeze.InbredSetId = InbredSet.Id
and InbredSet.Name = '%s'
and PublishFreeze.public > %s)
and PublishFreeze.public > %s
and PublishFreeze.confidentiality < 1)
UNION
(SELECT '#GenoFreeze',GenoFreeze.FullName,GenoFreeze.Name
FROM GenoFreeze, InbredSet
WHERE GenoFreeze.InbredSetId = InbredSet.Id
and InbredSet.Name = '%s'
and GenoFreeze.public > %s)
and GenoFreeze.public > %s
and GenoFreeze.confidentiality < 1)
UNION
(SELECT Tissue.Name, ProbeSetFreeze.FullName,ProbeSetFreeze.Name
FROM ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue
Expand All @@ -438,6 +443,7 @@ def datasets(group_name, this_group = None):
and ProbeFreeze.InbredSetId = InbredSet.Id
and InbredSet.Name like %s
and ProbeSetFreeze.public > %s
and ProbeSetFreeze.confidentiality < 1
ORDER BY Tissue.Name, ProbeSetFreeze.CreateTime desc, ProbeSetFreeze.AvgId)
''' % (group_name, webqtlConfig.PUBLICTHRESH,
group_name, webqtlConfig.PUBLICTHRESH,
Expand Down
3 changes: 2 additions & 1 deletion wqflask/base/trait.py
Expand Up @@ -206,6 +206,8 @@ def description_fmt(self):
formatted = self.description
if self.probe_target_description:
formatted += "; " + self.probe_target_description
else:
formatted = "Not available"
elif self.dataset.type == 'Publish':
if self.confidential:
formatted = self.pre_publication_description
Expand Down Expand Up @@ -290,7 +292,6 @@ def retrieve_sample_data(trait, dataset, samplelist=None):
name, value, variance, num_cases, name2 = item
if not samplelist or (samplelist and name in samplelist):
trait.data[name] = webqtlCaseData(*item) #name, value, variance, num_cases)

return trait

def convert_location_to_value(chromosome, mb):
Expand Down
8 changes: 4 additions & 4 deletions wqflask/base/webqtlCaseData.py
Expand Up @@ -44,9 +44,9 @@ def __init__(self, name, value=None, variance=None, num_cases=None, name2=None):

def __repr__(self):
str = "<webqtlCaseData> "
if self.value:
if self.value != None:
str += "value=%2.3f" % self.value
if self.variance:
if self.variance != None:
str += " variance=%2.3f" % self.variance
if self.num_cases:
str += " ndata=%d" % self.num_cases
Expand All @@ -66,14 +66,14 @@ def class_outlier(self):

@property
def display_value(self):
if self.value:
if self.value != None:
return "%2.3f" % self.value
else:
return "x"

@property
def display_variance(self):
if self.variance:
if self.variance != None:
return "%2.3f" % self.variance
else:
return "x"
5 changes: 3 additions & 2 deletions wqflask/base/webqtlConfig.py
Expand Up @@ -49,6 +49,7 @@
UCSC_POS = "http://genome.ucsc.edu/cgi-bin/hgTracks?clade=mammal&org=%s&db=%s&position=chr%s:%s-%s&pix=800&Submit=submit"
UCSC_BLAT = 'http://genome.ucsc.edu/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
UTHSC_BLAT = 'http://ucscbrowser.genenetwork.org/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
UTHSC_BLAT2 = 'http://ucscbrowserbeta.genenetwork.org/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
UCSC_GENOME = "http://genome.ucsc.edu/cgi-bin/hgTracks?db=%s&position=chr%s:%d-%d&hgt.customText=http://web2qtl.utmem.edu:88/snp/chr%s"
ENSEMBLE_BLAT = 'http://www.ensembl.org/Mus_musculus/featureview?type=AffyProbe&id=%s'
DBSNP = 'http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=%s'
Expand All @@ -70,8 +71,8 @@

CACHEDIR = mk_dir(TMPDIR+'/cache/')
# We can no longer write into the git tree:
GENERATED_IMAGE_DIR = mk_dir(TMPDIR+'/generated/')
GENERATED_TEXT_DIR = mk_dir(TMPDIR+'/generated_text/')
GENERATED_IMAGE_DIR = mk_dir(TMPDIR+'generated/')
GENERATED_TEXT_DIR = mk_dir(TMPDIR+'generated_text/')

# Make sure we have permissions to access these
assert_writable_dir(CACHEDIR)
Expand Down
239 changes: 239 additions & 0 deletions wqflask/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,239 @@
#!/usr/bin/python

"""
Convert .geno files to json
This file goes through all of the genofiles in the genofile directory (.geno)
and converts them to json files that are used when running the marker regression
code
"""

from __future__ import print_function, division, absolute_import
import sys
sys.path.append("..")
import os
import glob
import traceback
import gzip

#import numpy as np
#from pyLMM import lmm

import simplejson as json

from pprint import pformat as pf

class EmptyConfigurations(Exception): pass



class Marker(object):
def __init__(self):
self.name = None
self.chr = None
self.cM = None
self.Mb = None
self.genotypes = []

class ConvertGenoFile(object):

def __init__(self, input_file, output_files):

self.input_file = input_file
self.output_files = output_files

self.mb_exists = False
self.cm_exists = False
self.markers = []

self.latest_row_pos = None
self.latest_col_pos = None

self.latest_row_value = None
self.latest_col_value = None

def convert(self):

self.haplotype_notation = {
'@mat': "1",
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
}

self.configurations = {}
#self.skipped_cols = 3

#if self.input_file.endswith(".geno.gz"):
# print("self.input_file: ", self.input_file)
# self.input_fh = gzip.open(self.input_file)
#else:
self.input_fh = open(self.input_file)

with open(self.output_files[0], "w") as self.geno_fh:
#if self.file_type == "geno":
self.process_csv()
#elif self.file_type == "snps":
# self.process_snps_file()


def process_csv(self):
for row_count, row in enumerate(self.process_rows()):
row_items = row.split("\t")

this_marker = Marker()
this_marker.name = row_items[1]
this_marker.chr = row_items[0]
if self.cm_exists and self.mb_exists:
this_marker.cM = row_items[2]
this_marker.Mb = row_items[3]
genotypes = row_items[4:]
elif self.cm_exists:
this_marker.cM = row_items[2]
genotypes = row_items[3:]
elif self.mb_exists:
this_marker.Mb = row_items[2]
genotypes = row_items[3:]
else:
genotypes = row_items[2:]
for item_count, genotype in enumerate(genotypes):
if genotype.upper().strip() in self.configurations:
this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
else:
this_marker.genotypes.append("NA")

#print("this_marker is:", pf(this_marker.__dict__))
#if this_marker.chr == "14":
self.markers.append(this_marker.__dict__)

self.write_to_bimbam()

# with open(self.output_file, 'w') as fh:
# json.dump(self.markers, fh, indent=" ", sort_keys=True)

# print('configurations:', str(configurations))
#self.latest_col_pos = item_count + self.skipped_cols
#self.latest_col_value = item

#if item_count != 0:
# self.output_fh.write(" ")
#self.output_fh.write(self.configurations[item.upper()])

#self.output_fh.write("\n")

def write_to_bimbam(self):
with open(self.output_files[0], "w") as geno_fh:
# geno_fh.write(str(len(self.sample_list)) + "\n")
# geno_fh.write("2\n")
# geno_fh.write("IND")
# for sample in self.sample_list:
# geno_fh.write(" " + sample)
# geno_fh.write("\n")
for marker in self.markers:
geno_fh.write(marker['name'])
geno_fh.write(", X, Y")
geno_fh.write(", " + ", ".join(marker['genotypes']))
geno_fh.write("\n")

#pheno_fh = open(self.output_files[1], 'w')
with open(self.output_files[1], "w") as pheno_fh:
for sample in self.sample_list:
pheno_fh.write("1\n")

with open(self.output_files[2], "w") as snp_fh:
for marker in self.markers:
if self.mb_exists:
snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
else:
snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")


def get_sample_list(self, row_contents):
self.sample_list = []
if self.mb_exists:
if self.cm_exists:
self.sample_list = row_contents[4:]
else:
self.sample_list = row_contents[3:]
else:
if self.cm_exists:
self.sample_list = row_contents[3:]
else:
self.sample_list = row_contents[2:]

def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
#if self.input_file.endswith(".geno.gz"):
# print("row: ", row)
self.latest_row_value = row
# Take care of headers
if not row.strip():
continue
if row.startswith('#'):
continue
if row.startswith('Chr'):
if 'Mb' in row.split():
self.mb_exists = True
if 'cM' in row.split():
self.cm_exists = True
self.get_sample_list(row.split())
continue
if row.startswith('@'):
key, _separater, value = row.partition(':')
key = key.strip()
value = value.strip()
if key in self.haplotype_notation:
self.configurations[value] = self.haplotype_notation[key]
continue
if not len(self.configurations):
raise EmptyConfigurations
yield row

@classmethod
def process_all(cls, old_directory, new_directory):
os.chdir(old_directory)
for input_file in glob.glob("*"):
if not input_file.endswith(('geno', '.geno.gz')):
continue
group_name = ".".join(input_file.split('.')[:-1])
geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
output_files = [geno_output_file, pheno_output_file, snp_output_file]
print("%s -> %s" % (
os.path.join(old_directory, input_file), geno_output_file))
convertob = ConvertGenoFile(input_file, output_files)
try:
convertob.convert()
except EmptyConfigurations as why:
print(" No config info? Continuing...")
#excepted = True
continue
except Exception as why:

print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break

#def process_snps_file(cls, snps_file, new_directory):
# output_file = os.path.join(new_directory, "mouse_families.json")
# print("%s -> %s" % (snps_file, output_file))
# convertob = ConvertGenoFile(input_file, output_file)


if __name__=="__main__":
Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/"""
New_Geno_Directory = """/home/zas1024/genotype_files/genotype/bimbam/"""
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
#convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
#ConvertGenoFiles(Geno_Directory)

#process_csv(Input_File, Output_File)

0 comments on commit 6ba4c48

Please sign in to comment.