merge

pjotrp · Dec 20, 2017 · 6ba4c48 · 6ba4c48
2 parents 99b8952 + d692e4f
commit 6ba4c48
Show file tree

Hide file tree

Showing 58 changed files with 3,168 additions and 1,328 deletions.
diff --git a/bin/genenetwork2 b/bin/genenetwork2
@@ -74,6 +74,7 @@ else
     export PATH=$GN2_PROFILE/bin:$PATH
     export PYTHONPATH=$GN2_PROFILE/lib/python2.7/site-packages
     export R_LIBS_SITE=$GN2_PROFILE/site-library
+    export GEM_PATH=$GN2_PROFILE/lib/ruby/gems/2.4.0
     export JS_GUIX_PATH=$GN2_PROFILE/share/genenetwork2/javascript
     export GUIX_GTK3_PATH="$GN2_PROFILE/lib/gtk-3.0"
     export GI_TYPELIB_PATH="$GN2_PROFILE/lib/girepository-1.0"
@@ -84,6 +85,7 @@ else
     export PLINK_COMMAND="$GN2_PROFILE/bin/plink2"
     export PYLMM_COMMAND="$GN2_PROFILE/bin/pylmm_redis"
     export GEMMA_COMMAND="$GN2_PROFILE/bin/gemma"
+    export GEMMA_WRAPPER_COMMAND="$GN2_PROFILE/bin/gemma-wrapper"
 fi
 if [ -z $PYTHONPATH ] ; then
     echo "ERROR PYTHONPATH has not been set - use GN2_PROFILE!"

diff --git a/etc/default_settings.py b/etc/default_settings.py
@@ -74,3 +74,4 @@
 # PYLMM_COMMAND = str.strip(os.popen("which pylmm_redis").read())
 # PLINK_COMMAND = str.strip(os.popen("which plink2").read())
 # GEMMA_COMMAND = str.strip(os.popen("which gemma").read())
+# GEMMA_WRAPPER_COMMAND = str.strip(os.popen("which gemma-wrapper").read())
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
@@ -111,7 +111,7 @@ def __init__(self):
                             new_type = "ProbeSet"
                         self.datasets[short_dataset_name] = new_type
         # Set LOG_LEVEL_DEBUG=5 to see the following:
-        logger.debugf(5,"datasets",self.datasets)
+        logger.debugf(5, "datasets",self.datasets)
 
     def __call__(self, name):
         return self.datasets[name]
@@ -169,7 +169,7 @@ def mescape(*items):
 class Markers(object):
     """Todo: Build in cacheing so it saves us reading the same file more than once"""
     def __init__(self, name):
-        json_data_fh = open(locate(name + '.json','genotype/json'))
+        json_data_fh = open(locate(name + ".json",'genotype/json'))
         try:
             markers = json.load(json_data_fh)
         except:
@@ -334,7 +334,10 @@ def check_plink_gemma():
         else:
             marker_class = Markers
 
-        self.markers = marker_class(self.name)
+        if self.genofile:
+            self.markers = marker_class(self.genofile[:-5])
+        else:
+            self.markers = marker_class(self.name)
 
     def get_f1_parent_strains(self):
         try:
@@ -423,13 +426,15 @@ def datasets(group_name, this_group = None):
           FROM PublishFreeze,InbredSet
           WHERE PublishFreeze.InbredSetId = InbredSet.Id
             and InbredSet.Name = '%s'
-            and PublishFreeze.public > %s)
+            and PublishFreeze.public > %s
+            and PublishFreeze.confidentiality < 1)
          UNION
          (SELECT '#GenoFreeze',GenoFreeze.FullName,GenoFreeze.Name
           FROM GenoFreeze, InbredSet
           WHERE GenoFreeze.InbredSetId = InbredSet.Id
             and InbredSet.Name = '%s'
-            and GenoFreeze.public > %s)
+            and GenoFreeze.public > %s
+            and GenoFreeze.confidentiality < 1)
          UNION
          (SELECT Tissue.Name, ProbeSetFreeze.FullName,ProbeSetFreeze.Name
           FROM ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue
@@ -438,6 +443,7 @@ def datasets(group_name, this_group = None):
             and ProbeFreeze.InbredSetId = InbredSet.Id
             and InbredSet.Name like %s
             and ProbeSetFreeze.public > %s
+            and ProbeSetFreeze.confidentiality < 1
           ORDER BY Tissue.Name, ProbeSetFreeze.CreateTime desc, ProbeSetFreeze.AvgId)
         ''' % (group_name, webqtlConfig.PUBLICTHRESH,
               group_name, webqtlConfig.PUBLICTHRESH,

diff --git a/wqflask/base/trait.py b/wqflask/base/trait.py
@@ -206,6 +206,8 @@ def description_fmt(self):
                 formatted = self.description
                 if self.probe_target_description:
                     formatted += "; " + self.probe_target_description
+            else:
+                formatted = "Not available"
         elif self.dataset.type == 'Publish':
             if self.confidential:
                 formatted = self.pre_publication_description
@@ -290,7 +292,6 @@ def retrieve_sample_data(trait, dataset, samplelist=None):
             name, value, variance, num_cases, name2 = item
             if not samplelist or (samplelist and name in samplelist):
                 trait.data[name] = webqtlCaseData(*item)   #name, value, variance, num_cases)
-
     return trait
 
 def convert_location_to_value(chromosome, mb):

diff --git a/wqflask/base/webqtlCaseData.py b/wqflask/base/webqtlCaseData.py
@@ -44,9 +44,9 @@ def __init__(self, name, value=None, variance=None, num_cases=None, name2=None):
 
     def __repr__(self):
         str = "<webqtlCaseData> "
-        if self.value:
+        if self.value != None:
             str += "value=%2.3f" % self.value
-        if self.variance:
+        if self.variance != None:
             str += " variance=%2.3f" % self.variance
         if self.num_cases:
             str += " ndata=%d" % self.num_cases
@@ -66,14 +66,14 @@ def class_outlier(self):
 
     @property
     def display_value(self):
-        if self.value:
+        if self.value != None:
             return "%2.3f" % self.value
         else:
             return "x"
 
     @property
     def display_variance(self):
-        if self.variance:
+        if self.variance != None:
             return "%2.3f" % self.variance
         else:
             return "x"
diff --git a/wqflask/base/webqtlConfig.py b/wqflask/base/webqtlConfig.py
@@ -49,6 +49,7 @@
 UCSC_POS = "http://genome.ucsc.edu/cgi-bin/hgTracks?clade=mammal&org=%s&db=%s&position=chr%s:%s-%s&pix=800&Submit=submit"
 UCSC_BLAT = 'http://genome.ucsc.edu/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
 UTHSC_BLAT = 'http://ucscbrowser.genenetwork.org/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
+UTHSC_BLAT2 = 'http://ucscbrowserbeta.genenetwork.org/cgi-bin/hgBlat?org=%s&db=%s&type=0&sort=0&output=0&userSeq=%s'
 UCSC_GENOME = "http://genome.ucsc.edu/cgi-bin/hgTracks?db=%s&position=chr%s:%d-%d&hgt.customText=http://web2qtl.utmem.edu:88/snp/chr%s"
 ENSEMBLE_BLAT = 'http://www.ensembl.org/Mus_musculus/featureview?type=AffyProbe&id=%s'
 DBSNP = 'http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=%s'
@@ -70,8 +71,8 @@
 
 CACHEDIR             = mk_dir(TMPDIR+'/cache/')
 # We can no longer write into the git tree:
-GENERATED_IMAGE_DIR  = mk_dir(TMPDIR+'/generated/')
-GENERATED_TEXT_DIR   = mk_dir(TMPDIR+'/generated_text/')
+GENERATED_IMAGE_DIR  = mk_dir(TMPDIR+'generated/')
+GENERATED_TEXT_DIR   = mk_dir(TMPDIR+'generated_text/')
 
 # Make sure we have permissions to access these
 assert_writable_dir(CACHEDIR)

diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,239 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+class EmptyConfigurations(Exception): pass
+
+
+
+class Marker(object):
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_files):
+
+        self.input_file = input_file
+        self.output_files = output_files
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+            }
+
+        self.configurations = {}
+        #self.skipped_cols = 3
+
+        #if self.input_file.endswith(".geno.gz"):
+        #    print("self.input_file: ", self.input_file)
+        #    self.input_fh = gzip.open(self.input_file)
+        #else:
+        self.input_fh = open(self.input_file)
+
+        with open(self.output_files[0], "w") as self.geno_fh:
+            #if self.file_type == "geno":
+            self.process_csv()
+            #elif self.file_type == "snps":
+            #    self.process_snps_file()
+
+
+    def process_csv(self):
+        for row_count, row in enumerate(self.process_rows()):
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for item_count, genotype in enumerate(genotypes):
+                if genotype.upper().strip() in self.configurations:
+                    this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            #print("this_marker is:", pf(this_marker.__dict__))   
+            #if this_marker.chr == "14":
+            self.markers.append(this_marker.__dict__)
+
+        self.write_to_bimbam()    
+
+        # with open(self.output_file, 'w') as fh:
+            # json.dump(self.markers, fh, indent="   ", sort_keys=True)
+
+                # print('configurations:', str(configurations))
+                #self.latest_col_pos = item_count + self.skipped_cols
+                #self.latest_col_value = item
+
+                #if item_count != 0:
+                #    self.output_fh.write(" ")
+                #self.output_fh.write(self.configurations[item.upper()])
+
+            #self.output_fh.write("\n")
+
+    def write_to_bimbam(self):
+        with open(self.output_files[0], "w") as geno_fh:
+            # geno_fh.write(str(len(self.sample_list)) + "\n")
+            # geno_fh.write("2\n")
+            # geno_fh.write("IND")
+            # for sample in self.sample_list:
+                # geno_fh.write(" " + sample)
+            # geno_fh.write("\n")
+            for marker in self.markers:
+                geno_fh.write(marker['name'])
+                geno_fh.write(", X, Y")
+                geno_fh.write(", " + ", ".join(marker['genotypes']))
+                geno_fh.write("\n")
+
+        #pheno_fh = open(self.output_files[1], 'w')
+        with open(self.output_files[1], "w") as pheno_fh:
+            for sample in self.sample_list:
+                pheno_fh.write("1\n")
+
+        with open(self.output_files[2], "w") as snp_fh:
+            for marker in self.markers:
+                if self.mb_exists:
+                    snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
+                else:
+                    snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
+
+
+    def get_sample_list(self, row_contents):
+        self.sample_list = []
+        if self.mb_exists:
+            if self.cm_exists:
+                self.sample_list = row_contents[4:]
+            else:
+                self.sample_list = row_contents[3:]
+        else:
+            if self.cm_exists:
+                self.sample_list = row_contents[3:]
+            else:
+                self.sample_list = row_contents[2:]
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            #if self.input_file.endswith(".geno.gz"):
+            #    print("row: ", row)
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                self.get_sample_list(row.split())
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
+            pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
+            snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
+            output_files = [geno_output_file, pheno_output_file, snp_output_file]
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), geno_output_file))
+            convertob = ConvertGenoFile(input_file, output_files)
+            try:
+                convertob.convert()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                #excepted = True
+                continue
+            except Exception as why:
+
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("    Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+    #def process_snps_file(cls, snps_file, new_directory):
+    #    output_file = os.path.join(new_directory, "mouse_families.json")
+    #    print("%s -> %s" % (snps_file, output_file))
+    #    convertob = ConvertGenoFile(input_file, output_file)
+
+
+if __name__=="__main__":
+    Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/"""
+    New_Geno_Directory = """/home/zas1024/genotype_files/genotype/bimbam/"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    #convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    #ConvertGenoFiles(Geno_Directory)
+
+    #process_csv(Input_File, Output_File)