diff --git a/LICENSE.md b/LICENSE.md index 33e2553..7e92416 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ - Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. + Copyright (c) the author(s), All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/Makefile b/Makefile index b8e0687..3462121 100644 --- a/Makefile +++ b/Makefile @@ -52,7 +52,8 @@ JAR_FILE=japsa.jar LIB_DIR=libs #List of external libraries -EXT_LIBS=colloquial.jar commons-math3-3.0.jar jhdf5.jar jhdfobj.jar htsjdk-1.126.jar guava-18.0.jar jcommon-1.0.23.jar jfreechart-1.0.19.jar JRIEngine.jar JRI.jar +##EXT_LIBS=colloquial.jar commons-math3-3.0.jar jhdf5.jar jhdfobj.jar htsjdk-2.10.1.jar guava-18.0.jar jcommon-1.0.23.jar jfreechart-1.0.19.jar JRIEngine.jar JRI.jar +EXT_LIBS=colloquial.jar commons-math3-3.0.jar jhdf5.jar jhdfobj.jar htsjdk-2.10.1.jar guava-18.0.jar jcommon-1.0.23.jar jfreechart-1.0.19.jar JRIEngine.jar JRI.jar gs-core-1.3.jar gs-ui-1.3.jar gs-algo-1.3.jar javax.json-api-1.0.jar javax.json-1.0.4.jar slf4j-api-1.7.25.jar slf4j-simple-1.7.25.jar jfxrt.jar ########################################################################### ##What this scripts does: @@ -97,7 +98,11 @@ classes: $(CLASS_FILES) $(CLASS_DIR): mkdir -p $(CLASS_DIR) -$(JAR_FILE):classes +images: + cp -r $(SRC_DIR)/japsa/bio/misc/dnaPlatform/gui/images $(CLASS_DIR)/japsa/bio/misc/dnaPlatform/gui/ + cp -r $(SRC_DIR)/japsa/seq/nanopore/icons $(CLASS_DIR)/japsa/seq/nanopore/ + +$(JAR_FILE):classes images jar cf $(JAR_FILE) -C $(CLASS_DIR) . 
jar: $(JAR_FILE) @@ -132,19 +137,29 @@ else O_MXMEM= endif + +ifdef SERVER +O_SERVER=--server=${SERVER} +else +O_SERVER= +endif + +docs: jar + java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsa.util.deploy.GenDocs && cd docs && make html + RELEASE=JapsaRelease pre-install: jar - @@echo "java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar japsa.util.deploy.Deploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) --compiler \"`$(JCC) -version 2>&1`\"" > install.sh && \ + @@echo "java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsa.util.deploy.Deploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) $(O_SERVER) --compiler \"`$(JCC) -version 2>&1`\"" > install.sh && \ chmod u+x install.sh && \ - echo "java -cp $(JAR_FILE);$(LIB_DIR)\guava-18.0.jar japsa.util.deploy.Deploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) --compiler \"`$(JCC) -version 2>&1`\"" > install.bat && \ + echo "java -cp $(JAR_FILE);$(LIB_DIR)\guava-18.0.jar;$(LIB_DIR)\slf4j-api-1.7.25.jar;$(LIB_DIR)\slf4j-simple-1.7.25.jar japsa.util.deploy.Deploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) $(O_SERVER) --compiler \"`$(JCC) -version 2>&1`\"" > install.bat && \ echo "Installation scripts created" install: pre-install ./install.sh uninstall: - @@java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar japsa.util.deploy.Deploy --mode uninstall --libs $(subst $(SPACE),:, $(EXT_LIBS)) ${O_INS_DIR} && echo "Japsa uninstalled!" + @@java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsa.util.deploy.Deploy --mode uninstall --libs $(subst $(SPACE),:, $(EXT_LIBS)) ${O_INS_DIR} && echo "Japsa uninstalled!" 
release: pre-install @@mkdir -p $(RELEASE)/libs/ && \ diff --git a/README.md b/README.md index 8313ddc..ad7c72b 100644 --- a/README.md +++ b/README.md @@ -19,18 +19,37 @@ Details of installation (including for Windows) and usage of Japsa can be found in its documentation hosted on [ReadTheDocs](http://japsa.readthedocs.org/en/latest/index.html) ### Authors and Contributors -Japsa was mainly developed and curated by Minh Duc Cao (@mdcao). The following -people have contributed to the development of Japsa, including ideas, +Japsa was maintained by Minh Duc Cao (@mdcao). The following +people (in alphabetical order) have contributed to the development of Japsa, including ideas, algorithms, implementation, documentation and feedback: -* Lachlan Coin -* Son Hoang Nguyen -* Mikael Boden -* Lloyd Allison -* Trevor I Dix +* Bhuvan Sankar +* David Powell +* Devika Ganesamoorthy * Hoang Anh Nguyen * Julia Bernal -* David Powell +* [Lachlan Coin](http://www.imb.uq.edu.au/lachlan-coin) +* [Lloyd Allison](http://www.allisons.org/ll/) +* Michael Hall +* Mikael Boden +* [Son Hoang Nguyen](https://github.com/hsnguyen) +* Trevor I Dix + + +### Other projects based on Japsa + + +* [eXpert Model](https://github.com/mdcao/xm): The expert model compression model +* [XMas](https://github.com/mdcao/XMas): Phylogenetic distance method using information theory +* [capsim](https://github.com/mdcao/capsim): Simulation of capture sequencing +* [npScarf](https://github.com/mdcao/npScarf): Scaffold and Complete assemblies in real-time fashion +* [npAnalysis](https://github.com/mdcao/npAnalysis): Realtime identification of bacterial sample +* [npReader](https://github.com/mdcao/npReader): Real-time extraction and analysis of Oxford Nanopore sequencing data +* [npBarcode](https://github.com/hsnguyen/npBarcode): Demultiplex barcoded Oxford Nanopore sequencing +* [PhageXpress](https://github.com/mdcao/phagexpress) + +and more to come. 
+ ### License Japsa is released under the accompanying BSD-like license. diff --git a/bin/.gitignore b/bin/.gitignore new file mode 100644 index 0000000..efaee1f --- /dev/null +++ b/bin/.gitignore @@ -0,0 +1,2 @@ +/japsa/ +/japsadev/ diff --git a/docs/source/install.rst b/docs/source/install.rst index d07a0ee..b488392 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -4,7 +4,7 @@ Installation There are two methods to install Japsa in your computer. The first method -(using pre-compiled package in JDK 1.6) is straight-forward and can be used for +(using pre-compiled package in JDK 1.8) is straight-forward and can be used for any operating systems, including Windows. The second method (compile from source code) requires some extra tools (make and JDK) but may yield better runtime performance as the package will be compiled with the same version of the Java @@ -43,7 +43,9 @@ agree with its suggestion, just type Enter. The questions are: HDFViewer (https://www.hdfgroup.org/products/java/release/download.html) installed, and enter the path to file *libjhdf5.so* (on Linux/Unix/Mac) or to *jhdf5.dll* (Windows). This is only required if you intend to use npReader( - jsa.np.f5reader). + jsa.np.f5reader). Note that we tested with hdfj version 2.10.1, which you can + download from https://support.hdfgroup.org/ftp/HDF5/releases/HDF-JAVA/hdf-java-2.10.1/bin/ + * *Path to JRI library:* Enter path to JRI library. This is required only for running real-time species typing (jsa.np.speciesTyping). diff --git a/docs/source/tools/index.rst b/docs/source/tools/index.rst index 9158a85..b48509b 100644 --- a/docs/source/tools/index.rst +++ b/docs/source/tools/index.rst @@ -16,13 +16,20 @@ This chapter presents the list of tools provided by Japsa. 
jsa.seq.annovcf.rst jsa.seq.gff2fasta.rst jsa.seq.emalign.rst - jsa.np.f5reader.rst + jsa.hts.countReads.rst + jsa.hts.errorAnalysis.rst + jsa.hts.n50.rst + jsa.np.npreader.rst jsa.np.filter.rst jsa.np.rtSpeciesTyping.rst jsa.np.rtMLST.rst jsa.np.rtStrainTyping.rst jsa.np.rtResistGenes.rst + jsa.np.npscarf.rst + jsa.np.barcode.rst jsa.util.streamServer.rst jsa.util.streamClient.rst jsa.phylo.xmas.rst jsa.phylo.normalise.rst + jsa.sim.capsim.rst + jsa.xm.compress.rst diff --git a/docs/source/tools/jsa.hts.countReads.rst b/docs/source/tools/jsa.hts.countReads.rst new file mode 100644 index 0000000..dc42dea --- /dev/null +++ b/docs/source/tools/jsa.hts.countReads.rst @@ -0,0 +1,49 @@ +------------------------------------------------ +*jsa.hts.countReads*: Count reads from bam files +------------------------------------------------ + + +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.hts.countReads*: Count the number of reads in some regions from a sorted, indexed bam file + +~~~~~ +Usage +~~~~~ +:: + + jsa.hts.countReads [options] + +~~~~~~~ +Options +~~~~~~~ + --bamFile=s Name of the bam file + (REQUIRED) + --bedFile=s Name of the regions file in bed format + (REQUIRED) + --output=s Name of output file, - for from standard out. + (default='-') + --flanking=i Size of the flanking regions, effectively expand the region by flanking + (default='0') + --qual=i Minimum quality + (default='0') + --filterBits=i Filter reads based on flag. 
Common values: + 0 no filter + 256 exclude secondary alignment + 1024 exclude PCR/optical duplicates + 2048 exclude supplementary alignments + (default='0') + --contained Count reads contained in the region + (default='false') + --overlap Count number of read overlap with the region + (default='false') + --span Count reads span the region + (default='false') + --help Display this usage and exit + (default='false') + + + + diff --git a/docs/source/tools/jsa.hts.errorAnalysis.rst b/docs/source/tools/jsa.hts.errorAnalysis.rst new file mode 100644 index 0000000..275d0b7 --- /dev/null +++ b/docs/source/tools/jsa.hts.errorAnalysis.rst @@ -0,0 +1,39 @@ +---------------------------------------------------------- +*jsa.hts.errorAnalysis*: Error analysis of sequencing data +---------------------------------------------------------- + +*jsa.hts.errorAnalysis* assesses the error profile of sequencing data by getting the numbers +of errors (mismatches, indels etc) from a bam file. Obviously, it does not distinguish +sequencing errors from mutations, and hence considers mutations as errors. It is best used +with the bam file from aligning sequencing reads to a reliable assembly of the sample. 
+ +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.hts.errorAnalysis*: Error analysis of sequencing data + +~~~~~ +Usage +~~~~~ +:: + + jsa.hts.errorAnalysis [options] + +~~~~~~~ +Options +~~~~~~~ + --bamFile=s Name of bam file + (REQUIRED) + --reference=s Name of reference genome + (REQUIRED) + --pattern=s Pattern of read name, used for filtering + (default='null') + --qual=i Minimum quality required + (default='0') + --help Display this usage and exit + (default='false') + + + + diff --git a/docs/source/tools/jsa.hts.n50.rst b/docs/source/tools/jsa.hts.n50.rst new file mode 100644 index 0000000..bac2936 --- /dev/null +++ b/docs/source/tools/jsa.hts.n50.rst @@ -0,0 +1,28 @@ +----------------------------------------- +*jsa.hts.n50*: Compute N50 of an assembly +----------------------------------------- + +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.hts.n50*: Compute N50 of an assembly + +~~~~~ +Usage +~~~~~ +:: + + jsa.hts.n50 [options] + +~~~~~~~ +Options +~~~~~~~ + --input=s Name of the file + (REQUIRED) + --help Display this usage and exit + (default='false') + + + + diff --git a/docs/source/tools/jsa.np.barcode.rst b/docs/source/tools/jsa.np.barcode.rst new file mode 100644 index 0000000..52d1a31 --- /dev/null +++ b/docs/source/tools/jsa.np.barcode.rst @@ -0,0 +1,128 @@ +--------------------------------------------------------------------------- +*barcode*: real-time de-multiplexing Nanopore reads from barcode sequencing +--------------------------------------------------------------------------- + +*barcode* (jsa.np.barcode) is a program that demultiplexes the nanopore reads from +Nanopore barcode sequencing. Downstream analysis can be invoked concurrently by an input script. + +*barcode* is included in the `Japsa package `_. 
+ +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.np.barcode*: Clustering nanopore sequences based on barcode + +~~~~~ +Usage +~~~~~ +:: + + jsa.np.barcode [options] + +~~~~~~~ +Options +~~~~~~~ + --bcFile=s Barcode file + (REQUIRED) + --seqFile=s Nanopore sequences file + (REQUIRED) + --scriptRun=s Invoke command script to run npScarf + (default='null') + --threshold=d Minimum identity(%) for barcode alignment + (default='70.0') + --distance=d Minimum identity(%) distance between the best alignment to others + (default='4.0') + --twoends Whether a read must contain barcode sequence from both ends or just one end (default) + (default='false') + --print Print out demultiplexed reads to corresponding FASTA file or not. + (default='false') + --help Display this usage and exit + (default='false') + + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of *barcode* usage can be obtained by invoking the --help option:: + + jsa.np.barcode --help + +===== +Input +===== + *barcode* takes 2 files as required input:: + + jsa.np.barcode -seq -bc + +<*nanopore reads*> is either the long reads in FASTA/FASTQ file (after MinION sequencing is +finished) or standard input ( specified by "-", for real-time analysis). + +<*barcode.fasta*> is the FASTA file of barcode sequences (given by ONT) with name correspond to the assigned sample id. + +Missing any file would break down the whole pipeline. + +In addition, one can provide <*analysis_script*> which is the script call for further action on the de-multiplexed reads. It always take one argument and be +executable by invoking:: + + ./analysis_script + +in which <*id*> is the identifier of a sample as given in the . The script should read the standard input +of long-read streams to do further analysis. + +*barcode* allows user to set the minimum criteria of a hit with barcode reference to be considered valid. The default value +is 70% for minimum identity. 
At the same time, 4% distance between the best hit and the second best is necessary for differentiation. +Decreasing the thresholds will lead to more reads being clustered but with higher risk of false positive while more stringent parameters +will generate fewer but more confident demultiplexed reads. + +User can also have control on the matching condition for barcode detection, either one-end match or both-end match. For the first case (default), only +a legal maximal hit from one end of a read is enough to label it while in the latter case, we take into account a pair from both 5' and 3' termini. +Thus the input for each use case should be different. The one-end option can take the simple FASTA file of Nanopore barcodes while the two-end need pairs of +barcode to be specified (e.g. with _F and _R suffix). One of a typical use case for two-end matching is when we want to detect the super-barcode which includes +also tail- and primer-sequences in pre-defined orientation. + +====== +Output +====== +*barcode* output depends on the <*analysis script*> because the de-multiplexed reads are streamed directly to its dedicated process. +If one is only interested in de-multiplexing alone, then the script should be as simple as to write stream to file. For example: + +.. code-block:: bash + :linenos: + + #!/bin/bash + while read line + do + echo "$line" + done >> ${1}_script.fasta + +This is equivalent to enabling the *-p* option:: + + jsa.np.barcode -seq -bc -script -p + +that would print out de-multiplexed FASTA sequences \_clustered.fasta + +============================================ +Real-time scaffolding for barcode sequencing +============================================ + +One use-case for barcode sequencing is to run *npscarf* on the resulting de-multiplexed reads. This could be done by calling a script +that can take an output folder of long reads from a sample to scaffold its corresponding short-reads (e.g. SPAdes) assembly. +E.g. + +.. 
code-block:: bash + :linenos: + + #!/bin/bash + dirname=`find /coin/barcode/ -maxdepth 1 -type d -name "*${1}*" -print -quit` + + bwa index ${dirname}/contigs.fasta + + bwa mem -t 16 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y -K 10000 ${dirname}/contigs.fasta - 2> /dev/null | \ + jsa.np.npscarf -realtime -read 100 -time 1 -b - -seq ${dirname}/contigs.fasta -spadesDir ${dirname} -prefix ${1} > ${1}.log 2>&1 + +In this scenario, we assume the output SPAdes folders locate in one directory and the folder names contain the ID of the corresponding samples. + diff --git a/docs/source/tools/jsa.np.filter.rst b/docs/source/tools/jsa.np.filter.rst index 7dbf187..8e1fd5a 100644 --- a/docs/source/tools/jsa.np.filter.rst +++ b/docs/source/tools/jsa.np.filter.rst @@ -3,7 +3,7 @@ --------------------------------------- *jsa.np.filter* filters sequencing data based on sequence read type, length and -quality. Examples of its usage can be found on jsa.np.f5reader_. +quality. Examples of its usage can be found on jsa.np.npreader_. ~~~~~~~~ Synopsis @@ -33,6 +33,8 @@ Options (default='0.0') --qualMax=d Maximum average quality (default='1000.0') + --group=s Group need to be extracted, leave blank for selecting all groups + (default='') --excl2D Exclude 2D reads (default='false') --exclTemp Exclude template reads @@ -49,9 +51,9 @@ Options See also ~~~~~~~~ -jsa.np.f5reader_, jsa.util.streamServer_, jsa.util.streamClient_ +jsa.np.npreader_, jsa.util.streamServer_, jsa.util.streamClient_ -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html .. _jsa.util.streamServer: jsa.util.streamServer.html .. 
_jsa.util.streamClient: jsa.util.streamClient.html diff --git a/docs/source/tools/jsa.np.f5reader.rst b/docs/source/tools/jsa.np.npreader.rst similarity index 80% rename from docs/source/tools/jsa.np.f5reader.rst rename to docs/source/tools/jsa.np.npreader.rst index 7c7af5b..911ffc3 100644 --- a/docs/source/tools/jsa.np.f5reader.rst +++ b/docs/source/tools/jsa.np.npreader.rst @@ -2,7 +2,7 @@ *npReader*: real-time conversion and analysis of Nanopore sequencing data ------------------------------------------------------------------------- -*npReader* (jsa.np.f5reader) is a program that extracts Oxford Nanopore +*npReader* (jsa.np.npreader) is a program that extracts Oxford Nanopore sequencing data from FAST5 files, performs an initial analysis of the date and streams them to real-time analysis pipelines. These pipelines can run on the same computer or on computing clouds/high performance clusters. @@ -34,18 +34,22 @@ Linux distribution software repository, such as:: The library is typically installed to */usr/lib/jni*. Enter this path when prompted for "Path to HDF library" during installation of Japsa. +HDF-View (https://www.hdfgroup.org/products/java/release/download.html) also +contains the necessary library. Please install HDF-2.10.1 instead of the +latest version. + ~~~~~~~~ Synopsis ~~~~~~~~ -*jsa.np.f5reader*: Extract and stream Oxford Nanopore sequencing data in real-time +*jsa.np.npreader*: Extract and stream Oxford Nanopore sequencing data in real-time. Demultiplexing included. 
~~~~~ Usage ~~~~~ :: - jsa.np.f5reader [options] + jsa.np.npreader [options] ~~~~~~~ Options @@ -65,13 +69,17 @@ Options --format=s Format of sequence reads (fastq or fasta) (default='fastq') --minLength=i Minimum read length - (default='0') + (default='1') --number Add a unique number to read name (default='false') --stats Generate a report of read statistics (default='false') - --time Extract the sequencing time of each read -- only work with Metrichor > 1.12 + --time Extract the sequencing time of each read -- experimental + (default='false') + --exhaustive Whether to traverse the input directory exhaustively (albacore) or lazily (metrichor) (default='false') + --barcode=s The file containing all barcode sequences for demultiplexing. + (default='null') --help Display this usage and exit (default='false') @@ -97,16 +105,16 @@ Usage examples A summary of npReader usage can be obtained by invoking the --help option:: - jsa.np.f5reader --help + jsa.np.npreader --help The simplest way to run *npReader* in GUI mode is by typing:: - jsa.np.f5reader -GUI -realtime + jsa.np.npreader -GUI -realtime and specify various options in the GUI. All of these options can be specified from the command line:: - jsa.np.f5reader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats + jsa.np.npreader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats npReader can run natively on a Windows laptop that runs the Metrichor agent. 
It can stream sequence data to multiple analysis pipelines on the same computer @@ -139,7 +147,7 @@ analysis pipelines, such as:: Once these pipelines are ready, npReader can start streaming data off the MinION and the Metrichor agent to these pipelines:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ --minLength 200 --streams server1IP:3456,server2IP:3457 One can run *npReader* on a computing cloud if the download folder (containing @@ -147,22 +155,30 @@ base-called data) can be mounted to the cloud. In such case, npReader can direct stream data to the pipelines without the need of *jsa.util.streamServer*:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat +*npReader* now supports barcode sequencing demultiplex. For this analysis, it +requires a FASTA file of barcode tag sequences and will classify output sequences +based on alignment. User can specify the threshold for alignment confidence from +the GUI. Demultiplexing results are illustrated as prefix Barcode::| +added to each output sequence name. + + jsa.np.npreader -GUI -barcode barcode.fasta + Japsa also provides *jsa.np.filter*, a tool to bin sequence data in groups of the user's liking. 
Like any other streamline tools, jsa.np.filter can run behind *jsa.util.streamServer* on a remote machine, or can get data directly from npReader via pipe:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ jsa.np.filter -input - -lenMin 2000 --qualMin 10 -output goodreads.fq One can also use *tee* to group data into different bins *in real-time* with *jsa.np.filter*:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ tee >(jsa.np.filter -input - -lenMax 2000 -output 0k2k.fq) \ >(jsa.np.filter -lenMin 2000 -lenMax 4000 -input - -output 2k4k.fq) \ >(jsa.np.filter -lenMin 4000 -lenMax 6000 -input - -output 4k6k.fq) \ diff --git a/docs/source/tools/jsa.np.npscarf.rst b/docs/source/tools/jsa.np.npscarf.rst new file mode 100644 index 0000000..9f0f4f7 --- /dev/null +++ b/docs/source/tools/jsa.np.npscarf.rst @@ -0,0 +1,187 @@ +---------------------------------------------------------------------------------- +*npScarf*: real-time scaffolder using SPAdes contigs and Nanopore sequencing reads +---------------------------------------------------------------------------------- + +*npScarf* (jsa.np.npscarf) is a program that connect contigs from a draft genomes +to generate sequences that are closer to finish. These pipelines can run on a single laptop +for microbial datasets. In real-time mode, it can be integrated with simple structural +analyses such as gene ordering, plasmid forming. + +*npScarf* is included in the `Japsa package `_. 
+ +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.np.npscarf*: Experimental Scaffold and finish assemblies using Oxford Nanopore sequencing reads + +~~~~~ +Usage +~~~~~ +:: + + jsa.np.npscarf [options] + +~~~~~~~ +Options +~~~~~~~ + --seqFile=s Name of the assembly file (sorted by length) + (REQUIRED) + --input=s Name of the input file, - for stdin + (REQUIRED) + --format=s format of the input fastq/fasta or sam/bam + (default='fastq/fasta') + --bwaExe=s Path to bwa + (default='bwa') + --bwaThread=i Threads used by bwa + (default='4') + --long Whether report all sequences, including short/repeat contigs (default) or only long/unique/completed sequences. + (default='false') + --spadesDir=s Name of the output folder by SPAdes: assembly graph and paths will be used for better gap-filling. + (default='null') + --prefix=s Prefix for the output files + (default='out') + --genes=s Realtime annotation: name of annotated genes in GFF 3.0 format + (default='null') + --resistGene=s Realtime annotation: name of antibiotic resistance gene fasta file + (default='null') + --insertSeq=s Realtime annotation: name of IS fasta file + (default='null') + --oriRep=s Realtime annotation: name of fasta file containing possible origin of replication + (default='null') + --minContig=i Minimum contigs length that are used in scaffolding. + (default='300') + --maxRepeat=i Maximum length of repeat in considering species. + (default='7500') + --cov=d Expected average coverage of Illumina, <=0 to estimate + (default='0.0') + --qual=i Minimum quality + (default='1') + --support=i Minimum supporting long read needed for a link between markers + (default='1') + --realtime Process in real-time mode. 
Default is batch mode (false) + (default='false') + --read=i Minimum number of reads between analyses + (default='50') + --time=i Minimum number of seconds between analyses + (default='10') + --verbose Turn on debugging mode + (default='false') + --help Display this usage and exit + (default='false') + + +~~~~~~~~ +See also +~~~~~~~~ + +jsa.np.npreader_, jsa.util.streamServer_, jsa.util.streamClient_ + +.. _jsa.np.npreader: jsa.np.npreader.html +.. _jsa.util.streamServer: jsa.util.streamServer.html +.. _jsa.util.streamClient: jsa.util.streamClient.html + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of *npScarf* usage can be obtained by invoking the --help option:: + + jsa.np.npscarf --help + +Input +===== + *npScarf* takes two files as required input:: + + jsa.np.npscarf -seq -input + +<*draft*> input is the FASTA file containing the pre-assemblies. Normally this +is the output from running SPAdes on Illumina MiSeq paired end reads. + +<*nanopore*> is either the long reads in FASTA/FASTQ file or SAM/BAM formated alignments +between them to <*draft*> file. We use BWA-MEM as the recommended aligner +with the fixed parameter set as follow:: + + bwa mem -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y > + +The input file format is specified by option --format. The default is FASTA/FASTQ in which +the path to BWA version 0.7.11 or newer is required. Remember to always *INDEXING* the +reference before running BWA:: + + bwa index + +Missing this step would break down the whole pipeline. + +Output +======= +*npScarf* output is specified by *-prefix* option. The default prefix is \'out\'. +Normally the tool generate two files: *prefix*.fin.fasta and *prefix*.fin.japsa which +indicate the result scaffolders in FASTA and JAPSA format. + +In realtime mode, if any annotation analysis is enabled, a file named +*prefix*.anno.japsa is generated instead. This file contains features detected after +scaffolding. 
+ +Real-time scaffolding +===================== +To run *npScarf* in streaming mode:: + + jsa.np.npscarf -realtime [options] + +In this mode, the <*bam*> file will be processed block by block. The size of block +(number of BAM/SAM records) can be manipulated through option *-read* and *-time*. + +The idea of streaming mode is when the input <*nanopore*> file is retrieved in stream. +npReader is the module that provides such data from fast5 files returned from the real-time +base-calling cloud service Metrichor. Ones can run:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npscarf --realtime -bwaExe= -bwaThread=10 -input - -seq > log.out 2>&1 + +For the same purpose, you can also invoke BWA-MEM explicitly as in the old version of *npScarf*, +In this case, option --format=SAM must be presented as follow: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + bwa mem -t 10 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y -K 3000 - 2> /dev/null | \ + jsa.np.npscarf --realtime -input - -format=SAM -seq > log.out 2>&1 + +or if you have the whole set of Nanopore long reads already and want to emulate the +streaming mode:: + + jsa.np.timeEmulate -s 100 -i -output - | \ + jsa.np.npscarf --realtime -bwaExe= -bwaThread=10 -input - -seq > log.out 2>&1 + +Note that jsa.np.timeEmulate based on the field *timestamp* located in the read name line to +decide the order of streaming data. So if your input <*nanopore*> already contains the field, +you have to sort it:: + + jsa.seq.sort -i -o -sortKey=timestamp + +or if your file does not have the *timestamp* data yet, you can manually make ones. For example:: + + cat | \ + awk 'BEGIN{time=0.0}NR%4==1{printf "%s timestamp=%.2f\n", $0, time; time++}NR%4!=1{print}' \ + > <*nanopore-with-time*> + +Real-time annotation +==================== + +The tool includes usecase for streaming annotation. 
Ones can provides database of antibiotic +resistance genes and/or Origin of Replication in FASTA format for the analysis of gene ordering +and/or plasmid identifying respectively:: + + jsa.np.timeEmulate -s 100 -i -output - | \ + jsa.np.npscarf --realtime -bwaExe= -input - -seq -resistGene -oriRep > log.out 2>&1 + +Assembly graph +============== + +*npScarf* can read the assembly graph info from SPAdes to make the results more precise. +The results might be slightly deviate from the old version in term of number of final contigs:: + + jsa.np.npscarf --spadesFolder= + +where SPAdes_output_directory indicates the result folder of SPAdes, containing files such as contigs.fasta, +contigs.paths and assembly_graph.fastg. diff --git a/docs/source/tools/jsa.np.rtMLST.rst b/docs/source/tools/jsa.np.rtMLST.rst index d2e041a..cabc124 100644 --- a/docs/source/tools/jsa.np.rtMLST.rst +++ b/docs/source/tools/jsa.np.rtMLST.rst @@ -42,9 +42,9 @@ Options See also ~~~~~~~~ -jsa.np.f5reader_, jsa.np.rtSpeciesTyping_, jsa.np.rtStrainTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ +jsa.np.npreader_, jsa.np.rtSpeciesTyping_, jsa.np.rtStrainTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html .. _jsa.np.rtSpeciesTyping: jsa.np.rtSpeciesTyping.html .. _jsa.np.rtStrainTyping: jsa.np.rtStrainTyping.html .. 
_jsa.np.rtResistGenes: jsa.np.rtResistGenes.html diff --git a/docs/source/tools/jsa.np.rtResistGenes.rst b/docs/source/tools/jsa.np.rtResistGenes.rst index 425b561..12ea1c3 100644 --- a/docs/source/tools/jsa.np.rtResistGenes.rst +++ b/docs/source/tools/jsa.np.rtResistGenes.rst @@ -51,9 +51,9 @@ Options See also ~~~~~~~~ -jsa.np.f5reader_, jsa.np.rtSpeciesTyping_, jsa.np.rtStrainTyping_, jsa.util.streamServer_, jsa.util.streamClient_ +jsa.np.npreader_, jsa.np.rtSpeciesTyping_, jsa.np.rtStrainTyping_, jsa.util.streamServer_, jsa.util.streamClient_ -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html .. _jsa.np.rtSpeciesTyping: jsa.np.rtSpeciesTyping.html .. _jsa.np.rtStrainTyping: jsa.np.rtStrainTyping.html .. _jsa.util.streamServer: jsa.util.streamServer.html @@ -65,6 +65,7 @@ jsa.np.f5reader_, jsa.np.rtSpeciesTyping_, jsa.np.rtStrainTyping_, jsa.util.stre Setting up ~~~~~~~~~~ -Refer to real-time analyais page at https://github.com/mdcao/npAnalysis/ +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. diff --git a/docs/source/tools/jsa.np.rtSpeciesTyping.rst b/docs/source/tools/jsa.np.rtSpeciesTyping.rst index 27fc192..b8b1725 100644 --- a/docs/source/tools/jsa.np.rtSpeciesTyping.rst +++ b/docs/source/tools/jsa.np.rtSpeciesTyping.rst @@ -6,15 +6,11 @@ using Oxford Nanopore sequencing in real-time. It reads data in SAM/BAM format of the alignments of sequence reads to a collection of species genomes. -We provide a genome collection of nearly 1500 bacterial species http://genomicsresearch.org/public/researcher/npAnalysis/SpeciesTyping.tar.gz -(or https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/SpeciesTyping.tar.gz). -Obtain them by:: - - wget https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/SpeciesTyping.tar.gz.tar.gz - tar zxvf SpeciesTyping.tar.gz - -which will generate three folders for the three species. 
- +We provide a genome collection of nearly 1500 bacterial species +on http://data.genomicsresearch.org/Projects/npAnalysis/. +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. + ~~~~~~~~ Synopsis ~~~~~~~~ @@ -31,7 +27,7 @@ Usage ~~~~~~~ Options ~~~~~~~ - --output=s Output file + --output=s Output file, - for standard output (default='output.dat') --bamFile=s The bam file (REQUIRED) @@ -45,6 +41,10 @@ Options (default='50') --time=i Minimum number of seconds between analyses (default='30') + --web Whether to use Web visualization. + (default='false') + --log Whether to write mapping details to species2reads.map. + (default='false') --help Display this usage and exit (default='false') @@ -53,9 +53,9 @@ Options See also ~~~~~~~~ -jsa.np.f5reader_, jsa.np.rtStrainTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ +jsa.np.npreader_, jsa.np.rtStrainTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html .. _jsa.np.rtStrainTyping: jsa.np.rtStrainTyping.html .. _jsa.np.rtResistGenes: jsa.np.rtResistGenes.html .. _jsa.util.streamServer: jsa.util.streamServer.html @@ -87,6 +87,6 @@ to listen on port 3456 and streams data to this pipeline using npReader: :: - jsa.np.f5reader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3456 + jsa.np.npreader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3456 diff --git a/docs/source/tools/jsa.np.rtStrainTyping.rst b/docs/source/tools/jsa.np.rtStrainTyping.rst index acc678f..b83b0c6 100644 --- a/docs/source/tools/jsa.np.rtStrainTyping.rst +++ b/docs/source/tools/jsa.np.rtStrainTyping.rst @@ -9,14 +9,10 @@ gene presence, it makes an inference of the strain, together with the confidence interval of 95%. We provide the gene databases for three bacterial species K. pneumoniae, -E. coli and S. 
aureus on http://genomicsresearch.org/public/researcher/npAnalysis/StrainTyping.tar.gz -(or https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/StrainTyping.tar.gz). -Obtain them by:: +E. coli and S. aureus on http://data.genomicsresearch.org/Projects/npAnalysis/. +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. - wget https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/StrainTyping.tar.gz - tar zxvf StrainTyping.tar.gz - -which will generate three folders for the three species. ~~~~~~~~ Synopsis @@ -56,9 +52,9 @@ Options See also ~~~~~~~~ -jsa.np.f5reader_, jsa.np.rtSpeciesTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ +jsa.np.npreader_, jsa.np.rtSpeciesTyping_, jsa.np.rtResistGenes_, jsa.util.streamServer_, jsa.util.streamClient_ -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html .. _jsa.np.rtSpeciesTyping: jsa.np.rtSpeciesTyping.html .. _jsa.np.rtResistGenes: jsa.np.rtResistGenes.html .. 
_jsa.util.streamServer: jsa.util.streamServer.html @@ -88,6 +84,6 @@ to listen on port 3457 and streams data to this pipeline using npReader: :: - jsa.np.f5reader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3457 + jsa.np.npreader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3457 diff --git a/docs/source/tools/jsa.seq.emalign.rst b/docs/source/tools/jsa.seq.emalign.rst index 8982a7b..80ad1d3 100644 --- a/docs/source/tools/jsa.seq.emalign.rst +++ b/docs/source/tools/jsa.seq.emalign.rst @@ -19,6 +19,8 @@ Usage ~~~~~~~ Options ~~~~~~~ + --iteration=i Number of iterations + (default='5') --help Display this usage and exit (default='false') diff --git a/docs/source/tools/jsa.seq.gff2fasta.rst b/docs/source/tools/jsa.seq.gff2fasta.rst index 47a5eb7..7fbbefa 100644 --- a/docs/source/tools/jsa.seq.gff2fasta.rst +++ b/docs/source/tools/jsa.seq.gff2fasta.rst @@ -1,8 +1,8 @@ ------------------------------------------- - *jsa.seq.gff2fasta*: Extract gene sequences +*jsa.seq.gff2fasta*: Extract gene sequences ------------------------------------------- - *jsa.seq.gff2fasta* extract the functional sequences (genes, CDS, etc) from +*jsa.seq.gff2fasta* extracts the functional sequences (genes, CDS, etc) from a gff file and a sequence file. ~~~~~~~~ diff --git a/docs/source/tools/jsa.sim.capsim.rst b/docs/source/tools/jsa.sim.capsim.rst new file mode 100644 index 0000000..dace12e --- /dev/null +++ b/docs/source/tools/jsa.sim.capsim.rst @@ -0,0 +1,57 @@ +---------------------------------------------------------------------------- +*capsim*: Simulating the Dynamics of Targeted Capture Sequencing with CapSim +---------------------------------------------------------------------------- + +*capsim* (jsa.sim.capsim) is a tool to simulate target capture sequencing. It
Its +simulates the dynamics of capture process + +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.sim.capsim*: Simulate capture sequencing + +~~~~~ +Usage +~~~~~ +:: + + jsa.sim.capsim [options] + +~~~~~~~ +Options +~~~~~~~ + --reference=s Name of genome to be + (REQUIRED) + --probe=s File containing probes mapped to the reference in bam format + (default='null') + --logFile=s Log file + (default='-') + --ID=s A unique ID for the data set + (default='') + --miseq=s Name of read file if miseq is simulated + (default='null') + --pacbio=s Name of read file if pacbio is simulated + (default='null') + --fmedian=i Median of fragment size at shearing + (default='2000') + --fshape=d Shape parameter of the fragment size distribution + (default='6.0') + --num=i Number of fragments + (default='1000000') + --pblen=i PacBio: Average (polymerase) read length + (default='30000') + --illen=i Illumina: read length + (default='300') + --seed=i Random seed, 0 for a random seed + (default='0') + --help Display this usage and exit + (default='false') + + + + +~~~~~~~~~~~~~ +Usage samples +~~~~~~~~~~~~~ + diff --git a/docs/source/tools/jsa.util.streamClient.rst b/docs/source/tools/jsa.util.streamClient.rst index 835aa16..19adc2c 100644 --- a/docs/source/tools/jsa.util.streamClient.rst +++ b/docs/source/tools/jsa.util.streamClient.rst @@ -33,11 +33,11 @@ Options See also ~~~~~~~~ -jsa.util.streamServer_, jsa.np.filter_, jsa.np.f5reader_ +jsa.util.streamServer_, jsa.np.filter_, jsa.np.npreader_ .. _jsa.util.streamServer: jsa.util.streamServer.html .. _jsa.np.filter: jsa.np.filter.html -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. 
_jsa.np.npreader: jsa.np.npreader.html diff --git a/docs/source/tools/jsa.util.streamServer.rst b/docs/source/tools/jsa.util.streamServer.rst index e0a2e3e..75a9a04 100644 --- a/docs/source/tools/jsa.util.streamServer.rst +++ b/docs/source/tools/jsa.util.streamServer.rst @@ -34,11 +34,11 @@ Options See also ~~~~~~~~ -jsa.util.streamClient_, jsa.np.filter_, jsa.np.f5reader_ +jsa.util.streamClient_, jsa.np.filter_, jsa.np.npreader_ .. _jsa.util.streamClient: jsa.util.streamClient.html .. _jsa.np.filter: jsa.np.filter.html -.. _jsa.np.f5reader: jsa.np.f5reader.html +.. _jsa.np.npreader: jsa.np.npreader.html diff --git a/docs/source/tools/jsa.xm.compress.rst b/docs/source/tools/jsa.xm.compress.rst new file mode 100644 index 0000000..0ba2a67 --- /dev/null +++ b/docs/source/tools/jsa.xm.compress.rst @@ -0,0 +1,75 @@ +--------------------------------------------------------- +*Expert Model*: tool for compression of genomic sequences +--------------------------------------------------------- + +*jsa.xm.compress* is the implementation of the expert model (XM) algorithm for +compression of genomic sequences. The source code is included in the +`Japsa package `_. +Please check the installation_ page for instructions. + +.. _installation: ../install.html + +~~~~~~~~ +Synopsis +~~~~~~~~ + +*jsa.xm.compress*: Compression of DNA/protein sequences + +~~~~~ +Usage +~~~~~ +:: + + jsa.xm.compress [options] file1 file2 ...
+ +~~~~~~~ +Options +~~~~~~~ + --hashSize=i Hash size + (default='11') + --context=i Length of the context + (default='15') + --limit=i Expert Limit + (default='200') + --threshold=d Listen threshold + (default='0.15') + --chance=i Chances + (default='20') + --binaryHash Use binary hash or not + (default='false') + --offsetType=s Way of update offset/palindrome expert: possible value count, subs + (default='counts') + --real=s File name of the real compression + (default='null') + --decode=s File name of the encoded + (default='null') + --output=s The output file of decoded file + (default='decoded') + --info=s File name of the information content + (default='null') + --markov=s File name of the markov information content + (default='null') + --optimise Running in optimise mode, just report the entropy, recommended for long sequences + (default='false') + --checkPoint=i Frequency of check point + (default='1000000') + --hashType=s Type of Hash table: hash=hashtable, sft=SuffixTree,sfa = SuffixArray + (default='hash') + --selfRep Propose experts from the sequence to be compressed? + (default='true') + --help Display this usage and exit + (default='false') + + + + +~~~~~~~~ +Citation +~~~~~~~~ + +If you find XM useful for your research, please cite + +Cao MD, Dix TI, Allison L, and Mears C, +*A simple statistical algorithm for biological sequence compression*, +Data Compression Conference, 2007 (DCC'07), Snowbird, UT, pp43-52.
+ diff --git a/libs/gs-algo-1.3.jar b/libs/gs-algo-1.3.jar new file mode 100644 index 0000000..1e720ee Binary files /dev/null and b/libs/gs-algo-1.3.jar differ diff --git a/libs/gs-core-1.3.jar b/libs/gs-core-1.3.jar new file mode 100644 index 0000000..c8f50fd Binary files /dev/null and b/libs/gs-core-1.3.jar differ diff --git a/libs/gs-ui-1.3.jar b/libs/gs-ui-1.3.jar new file mode 100644 index 0000000..4661b87 Binary files /dev/null and b/libs/gs-ui-1.3.jar differ diff --git a/libs/htsjdk-1.126.jar b/libs/htsjdk-1.126.jar deleted file mode 100644 index cf0c02f..0000000 Binary files a/libs/htsjdk-1.126.jar and /dev/null differ diff --git a/libs/htsjdk-2.10.1.jar b/libs/htsjdk-2.10.1.jar new file mode 100644 index 0000000..399634d Binary files /dev/null and b/libs/htsjdk-2.10.1.jar differ diff --git a/libs/javax.json-1.0.4.jar b/libs/javax.json-1.0.4.jar new file mode 100644 index 0000000..09967d8 Binary files /dev/null and b/libs/javax.json-1.0.4.jar differ diff --git a/libs/javax.json-api-1.0.jar b/libs/javax.json-api-1.0.jar new file mode 100644 index 0000000..d276c79 Binary files /dev/null and b/libs/javax.json-api-1.0.jar differ diff --git a/libs/jfxrt.jar b/libs/jfxrt.jar new file mode 100755 index 0000000..55858aa Binary files /dev/null and b/libs/jfxrt.jar differ diff --git a/libs/junit-4.11.jar b/libs/junit-4.11.jar deleted file mode 100644 index 09db3e7..0000000 Binary files a/libs/junit-4.11.jar and /dev/null differ diff --git a/libs/slf4j-api-1.7.25.jar b/libs/slf4j-api-1.7.25.jar new file mode 100644 index 0000000..7e62f13 Binary files /dev/null and b/libs/slf4j-api-1.7.25.jar differ diff --git a/libs/slf4j-simple-1.7.25.jar b/libs/slf4j-simple-1.7.25.jar new file mode 100644 index 0000000..b29ca12 Binary files /dev/null and b/libs/slf4j-simple-1.7.25.jar differ diff --git a/makefile.dev b/makefile.dev new file mode 100644 index 0000000..efb729c --- /dev/null +++ b/makefile.dev @@ -0,0 +1,180 @@ 
+############################################################################# +# Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions # +# are met: # +# # +# 1. Redistributions of source code must retain the above copyright notice, # +# this list of conditions and the following disclaimer. # +# 2. Redistributions in binary form must reproduce the above copyright # +# notice, this list of conditions and the following disclaimer in the # +# documentation and/or other materials provided with the distribution. # +# 3. Neither the names of the institutions nor the names of the contributors# +# may be used to endorse or promote products derived from this software # +# without specific prior written permission. # +# # +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS # +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, # +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# +############################################################################/ + + +############################################################################ +# A generic makefile for java +# +############################################################################ + + +########################################################################### +# Modify the following parameter to suit your setting + +#Class directory +CLASS_DIR=target/dev-classes + +#Source directory +SRC_MAIN_DIR=src/main/java +SRC_DEV_DIR=src/dev/java +##SRC_WORK_DIR=src/work/java + +#Java compiler +JCC=javac + +#target jar file +JAR_FILE=japsa-dev.jar + +#External library directory +LIB_DIR=libs + +#List of external libraries + +#EXT_LIBS=colloquial.jar commons-math3-3.0.jar jhdf5.jar jhdfobj.jar htsjdk-2.10.1.jar guava-18.0.jar jcommon-1.0.23.jar jfreechart-1.0.19.jar JRIEngine.jar JRI.jar gs-core-1.3.jar gs-ui-1.3.jar gs-algo-1.3.jar javax.json-api-1.0.jar jfxrt.jar +EXT_LIBS=colloquial.jar commons-math3-3.0.jar jhdf5.jar jhdfobj.jar htsjdk-2.10.1.jar guava-18.0.jar jcommon-1.0.23.jar jfreechart-1.0.19.jar JRIEngine.jar JRI.jar gs-core-1.3.jar gs-ui-1.3.jar gs-algo-1.3.jar javax.json-api-1.0.jar javax.json-1.0.4.jar slf4j-api-1.7.25.jar slf4j-simple-1.7.25.jar jfxrt.jar + +########################################################################### +##What this scripts does: +# 1. Find all packages by searching SRC_DIR subdirectories +# 2. Get all the java file, and compile them to the CLASS_DIR directory +# 5. Clean back up files (*.bak, ~, etc) +# To be implemented: +# 3. Copy all the resources to class directory +# 4. Make java file +########################################################################## + +COMMA:= , +EMPTY:= +SPACE:= $(SPACE) $(SPACE) +LIBS:=$(subst $(SPACE),:, $(addprefix $(LIB_DIR)/, $(EXT_LIBS))) + +##Get all the packages in $(SRC_MAIN_DIR). +PACKAGE_MAIN_DIRS := $(shell echo `cd $(SRC_MAIN_DIR);find . 
-type d`) +SRC_MAIN_DIRS := $(addprefix $(SRC_MAIN_DIR)/, $(PACKAGE_MAIN_DIRS)) +CLASS_MAIN_DIRS := $(addprefix $(CLASS_DIR)/, $(PACKAGE_MAIN_DIRS)) +##Get the list of java files +JAVA_MAIN_FILES := $(foreach dir,$(SRC_MAIN_DIRS),$(wildcard $(dir)/*.java)) +CLASS_MAIN_FILES := $(subst $(SRC_MAIN_DIR)/,$(CLASS_DIR)/,$(JAVA_MAIN_FILES:.java=.class)) + +PACKAGE_DEV_DIRS := $(shell echo `cd $(SRC_DEV_DIR);find . -type d`) +SRC_DEV_DIRS := $(addprefix $(SRC_DEV_DIR)/, $(PACKAGE_DEV_DIRS)) +CLASS_DEV_DIRS := $(addprefix $(CLASS_DIR)/, $(PACKAGE_DEV_DIRS)) +##Get the list of java files +JAVA_DEV_FILES := $(foreach dir,$(SRC_DEV_DIRS),$(wildcard $(dir)/*.java)) +CLASS_DEV_FILES := $(subst $(SRC_DEV_DIR)/,$(CLASS_DIR)/,$(JAVA_DEV_FILES:.java=.class)) + +#TODO: merge three src dirs to one by foreach + +#####################Make targets + +VPATH=$(subst ' ',':',$(PACKAGE_MAIN_DIRS)) +$(CLASS_DIR)/%.class: $(SRC_MAIN_DIR)/%.java $(CLASS_DIR) + $(JCC) -sourcepath $(SRC_MAIN_DIR) -cp $(CLASS_DIR):$(LIBS) -nowarn -d $(CLASS_DIR) $(JDEBUGFLAGS) $< + +VPATH=$(subst ' ',':',$(PACKAGE_DEV_DIRS)) +$(CLASS_DIR)/%.class: $(SRC_DEV_DIR)/%.java $(CLASS_DIR) + $(JCC) -sourcepath $(SRC_DEV_DIR):$(SRC_MAIN_DIR) -cp $(CLASS_DIR):$(LIBS) -nowarn -d $(CLASS_DIR) $(JDEBUGFLAGS) $< + +all: jar + +main-classes: $(CLASS_MAIN_FILES) +dev-classes: $(CLASS_DEV_FILES) + +###Create the class directory if neccessary +$(CLASS_DIR): + mkdir -p $(CLASS_DIR) + +images: + cp -r $(SRC_MAIN_DIR)/japsa/bio/misc/dnaPlatform/gui/images $(CLASS_DIR)/japsa/bio/misc/dnaPlatform/gui/ + cp -r $(SRC_MAIN_DIR)/japsa/seq/nanopore/icons $(CLASS_DIR)/japsa/seq/nanopore/ + +$(JAR_FILE): main-classes dev-classes images + jar cf $(JAR_FILE) -C $(CLASS_DIR) . 
+ +jar: $(JAR_FILE) + +clean: + @@for i in $(PACKAGE_MAIN_DIRS); do \ + echo "rm -f $(SRC_MAIN_DIR)/$$i/*.bak $(SRC_MAIN_DIR)/$$i/*.class $(SRC_MAIN_DIR)/$$i/*~ $(CLASS_DIR)/$$i/*.class"; \ + rm -f $(SRC_MAIN_DIR)/$$i/*.bak $(SRC_MAIN_DIR)/$$i/*.class $(SRC_MAIN_DIR)/$$i/*~ $(CLASS_DIR)/$$i/*.class; \ + done + @@for i in $(PACKAGE_DEV_DIRS); do \ + echo "rm -f $(SRC_DEV_DIR)/$$i/*.bak $(SRC_DEV_DIR)/$$i/*.class $(SRC_DEV_DIR)/$$i/*~ $(CLASS_DIR)/$$i/*.class"; \ + rm -f $(SRC_DEV_DIR)/$$i/*.bak $(SRC_DEV_DIR)/$$i/*.class $(SRC_DEV_DIR)/$$i/*~ $(CLASS_DIR)/$$i/*.class; \ + done + rm -f install.sh install.bat + +############################################################################## +# INSTALLATION +############################################################################## + +ifdef INSTALL_DIR +O_INS_DIR=-installDir=\"${INSTALL_DIR}\" +else +O_INS_DIR= +endif + + +ifdef JLP +O_JLP=-jlp=\"${JLP}\" +else +O_JLP= +endif + +ifdef MXMEM +O_MXMEM=-xmx=${MXMEM} +else +O_MXMEM= +endif + + +ifdef SERVER +O_SERVER=--server=${SERVER} +else +O_SERVER= +endif + + +##=/usr/lib/jni:/usr/lib/R/site-library/rJava/jri + +pre-install: jar + @@echo "java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsadev.util.deploy.DevDeploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) $(O_SERVER) --compiler \"`$(JCC) -version 2>&1`\"" > install.sh && \ + chmod u+x install.sh && \ + echo "java -cp $(JAR_FILE):$(LIB_DIR)\guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsadev.util.deploy.DevDeploy --mode install --libs $(subst $(SPACE),:, $(EXT_LIBS)) $(O_INS_DIR) $(O_JLP) $(O_MXMEM) $(O_SERVER)--compiler \"`$(JCC) -version 2>&1`\"" > install.bat &&\ + echo "Installation scripts created" + +install: pre-install + ./install.sh + +uninstall: + @@java -cp 
$(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsadev.util.deploy.DevDeploy --mode uninstall --libs $(subst $(SPACE),:, $(EXT_LIBS)) ${O_INS_DIR} && echo "Japsa uninstalled!" + +galaxy: jar + @@java -cp $(JAR_FILE):$(LIB_DIR)/guava-18.0.jar:$(LIB_DIR)/slf4j-api-1.7.25.jar:$(LIB_DIR)/slf4j-simple-1.7.25.jar japsadev.util.deploy.DevDeploy --mode galaxy && echo "Galaxy wrappers generated!" + diff --git a/src/dev/java/japsadev/bio/go/GoTerm.java b/src/dev/java/japsadev/bio/go/GoTerm.java new file mode 100644 index 0000000..2951407 --- /dev/null +++ b/src/dev/java/japsadev/bio/go/GoTerm.java @@ -0,0 +1,80 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 20 Jul 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.bio.go; + + + +import java.util.ArrayList; + +/** + * A simple GO Term class, will be replaced by a standard library + * @author minhduc + * + */ +public class GoTerm { + String ID; + String name; + String desc; + + ArrayList relationship; + + public GoTerm (String anID){ + ID = anID; + relationship = new ArrayList(); + } + + public void addRelationShip(TypeRelationship type, GoTerm term){ + relationship.add(new GoRelationship(type, term)); + } + + public static class GoRelationship{ + GoTerm relTerm; + TypeRelationship relType; + GoRelationship(TypeRelationship type, GoTerm term){ + relType = type; + relTerm = term; + } + } + + /** + * @param args + */ + public static void main(String[] args) { + + + + } + +} diff --git a/src/dev/java/japsadev/bio/go/ODOParser.java b/src/dev/java/japsadev/bio/go/ODOParser.java new file mode 100644 index 0000000..88e2833 --- /dev/null +++ b/src/dev/java/japsadev/bio/go/ODOParser.java @@ -0,0 +1,590 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 20 Jul 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.bio.go; + +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsadev.bio.go.GoTerm.GoRelationship; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +/** + * @author minhduc + * + */ +public class ODOParser { + private static final Logger LOG = LoggerFactory.getLogger(ODOParser.class); + + HashMap terms = new HashMap(); + + HashMap relTypes = + new HashMap(); + + public ODOParser(String fileName) throws IOException{ + + //First round to read + BufferedReader bf = SequenceReader.openFile(fileName); + String line = ""; + int lineNo = 0; + + while ((line = bf.readLine())!=null) { + lineNo ++; + line = line.trim(); + if (line.equals("[Term]")){ + line = bf.readLine().trim(); + lineNo ++; + if (!line.startsWith("id: ")) + throw new RuntimeException("Wrong format at line " + lineNo); + + String ID = line.substring(4); + line = bf.readLine().trim(); + lineNo ++; + + if (!line.startsWith("name: ")) + throw new RuntimeException("Wrong format at line " + lineNo); + String name = line.substring(5).trim(); + + GoTerm currentTerm = new GoTerm(ID); + currentTerm.name = name; + terms.put(ID, currentTerm); + continue; + } + + if (line.equals("[Typedef]")){ + line = bf.readLine().trim(); + lineNo ++; + if (!line.startsWith("id: ")) + throw new RuntimeException("Wrong format at line " + lineNo); + + String ID = line.substring(4); + line = bf.readLine().trim(); + lineNo ++; + + if (!line.startsWith("name: ")) + throw new RuntimeException("Wrong format at line " + lineNo); + String name = 
line.substring(5); + + TypeRelationship relationship = new TypeRelationship (ID, name); + relTypes.put(ID, relationship); + continue; + } + } + bf.close(); + + + //System.out.println(terms.size()); + //System.out.println(relTypes.size()); + + + //Second round to build relationship + bf = SequenceReader.openFile(fileName); + line = ""; + lineNo = 0; + + while ((line = bf.readLine())!=null) { + lineNo ++; + line = line.trim(); + + + if (line.equals("[Term]")){ + line = bf.readLine().trim(); + lineNo ++; + if (!line.startsWith("id: ")) + throw new RuntimeException("Wrong format at line " + lineNo); + + String ID = line.substring(4); + + line = bf.readLine().trim(); + lineNo ++; + + GoTerm currentTerm = terms.get(ID); + + // System.out.println(currentTerm + " " + lineNo); + while(true){ + line = bf.readLine(); + lineNo ++; + if (line == null) + break; + + line = line.trim(); + if (line.length() == 0) + break; + + + if (line.startsWith("is_a: ")){ + String[] toks = line.split(" ",4); + TypeRelationship relationship = relTypes.get("is_a"); + GoTerm term = terms.get(toks[1]); + if (term != null) + currentTerm.addRelationShip(relationship, term); + else + LOG.warn("Term " + toks[1] + " not found at line " + lineNo); + + } + if (line.startsWith("relationship: ")){ + String[] toks = line.split(" ",4); + TypeRelationship relationship = relTypes.get(toks[1]); + GoTerm term = terms.get(toks[2]); + + if (term != null) + currentTerm.addRelationShip(relationship, term); + else + LOG.warn("Term " + toks[2] + " not found at line " + lineNo); + + }//if + }//while + }//if + }//while + bf.close(); + } + + static HashSet conferDrugList = new HashSet(); + static HashSet conferList = new HashSet(); + static StringBuilder sb = new StringBuilder(); + + + static void confer(GoTerm term){ + for (GoRelationship rel:term.relationship){ + if (rel.relType.ID.equals("confers_resistance_to")) + conferList.add(rel.relTerm.name); + else if (rel.relType.ID.equals("confers_resistance_to_drug")) + 
conferDrugList.add(rel.relTerm.name); + else if (rel.relType.ID.equals("is_a")) + confer(rel.relTerm); + } + } + + static void list(GoTerm term, HashSet relationList){ + for (GoRelationship rel:term.relationship){ + if (relationList.contains(rel.relType.ID) && (!blackList.contains(rel.relTerm.ID))){ + sb.append(term.ID + "->" + rel.relTerm.ID + "[" + rel.relTerm.name + "];"); + list(rel.relTerm, relationList); + } + } + } + + static ODOParser odo; + //static HashSet relationList = new HashSet(); + + static HashSet blackList = new HashSet(); + static{ + blackList.add("AL513383.1.gene203"); + blackList.add("DQ303918.1.gene1"); + blackList.add("AL513383.1.gene214"); + + //CONFUSED GENE + //blackList.add("KM998962.1.gene1"); + // + + //DQ464881.1.gene2 + //AL513383.1.gene203 + //AY566250.1.gene1 + //AB091338.1.gene1 + + + //blackList.add("ARO:1000001");//every thing + //blackList.add("ARO:3000557");// [antibiotic inactivation enzyme] + //blackList.add("ARO:3000000");//determinant of antibiotic resistance + /****************************************************************** + blackList.add("ARO:3000722"); + blackList.add("ARO:3000717"); + blackList.add("ARO:3002639"); + blackList.add("ARO:3000744"); //-same as above + blackList.add("ARO:3000730"); + blackList.add("ARO:3002701"); + blackList.add("ARO:3002703"); + blackList.add("ARO:3000816"); + blackList.add("ARO:3000723"); + blackList.add("ARO:3000498"); + blackList.add("ARO:3000744"); + blackList.add("ARO:3000795"); + blackList.add("ARO:3000796"); + blackList.add("ARO:3002675"); + blackList.add("ARO:3003079"); + blackList.add("ARO:3000730"); + blackList.add("ARO:3003078"); + blackList.add("ARO:3000743"); + blackList.add("ARO:3000822"); + blackList.add("ARO:3002986"); + blackList.add("ARO:3000730"); + blackList.add("ARO:3000717"); + blackList.add("ARO:3000735"); + blackList.add("ARO:3000722"); + blackList.add("ARO:3000734"); + blackList.add("ARO:3000730"); + blackList.add("ARO:3003577"); + blackList.add("ARO:3000299"); 
+ blackList.add("ARO:3000717"); + blackList.add("ARO:3000027"); + blackList.add("ARO:3000730"); + blackList.add("ARO:3000734"); + blackList.add("ARO:3000074"); + blackList.add("ARO:3000838"); + blackList.add("ARO:3000735"); + blackList.add("ARO:3000734"); + blackList.add("ARO:3003301"); + blackList.add("ARO:3000723"); + blackList.add("ARO:3000722"); + blackList.add("ARO:3000735"); + blackList.add("ARO:3000733"); + blackList.add("ARO:3003063"); + blackList.add("ARO:3000781"); + blackList.add("ARO:3000780"); + blackList.add("ARO:3000768"); + blackList.add("ARO:3000782"); + blackList.add("ARO:3000722"); + blackList.add("ARO:3003392"); + blackList.add("ARO:3000730"); + blackList.add("ARO:3001214"); + blackList.add("ARO:3000717"); + blackList.add("ARO:3000737"); + blackList.add("ARO:3003064"); + blackList.add("ARO:3003052"); + blackList.add("ARO:3003053"); + blackList.add("ARO:3003051"); + blackList.add("ARO:3000502"); + blackList.add("ARO:3000502"); + blackList.add("ARO:3000502"); + blackList.add("ARO:3000499"); + blackList.add("ARO:3000499"); + blackList.add("ARO:3000499"); + blackList.add("ARO:3000810"); + blackList.add("ARO:3000811"); + blackList.add("ARO:3003046"); + blackList.add("ARO:3003047"); + blackList.add("ARO:3000804"); + blackList.add("ARO:3000805"); + blackList.add("ARO:3000785"); + blackList.add("ARO:3000254"); + blackList.add("ARO:3000254"); + blackList.add("ARO:3000254"); + blackList.add("ARO:3000254"); + blackList.add("ARO:3000237"); + blackList.add("ARO:3000379"); + blackList.add("ARO:3000379"); + blackList.add("ARO:3000379"); + blackList.add("ARO:3002983"); + blackList.add("ARO:3002983"); + blackList.add("ARO:3003033"); + blackList.add("ARO:3002982"); + blackList.add("ARO:3002982"); + blackList.add("ARO:3003034"); + blackList.add("ARO:3000774"); + blackList.add("ARO:3000774"); + blackList.add("ARO:3000812"); + blackList.add("ARO:3000807"); + blackList.add("ARO:3000806"); + blackList.add("ARO:3000377"); + blackList.add("ARO:3000377"); + 
blackList.add("ARO:3000794"); + blackList.add("ARO:3000794"); + blackList.add("ARO:3000794"); + blackList.add("ARO:3000792"); + blackList.add("ARO:3000792"); + blackList.add("ARO:3000792"); + blackList.add("ARO:3000793"); + blackList.add("ARO:3000793"); + blackList.add("ARO:3000793"); + blackList.add("ARO:3000800"); + blackList.add("ARO:3000800"); + blackList.add("ARO:3000801"); + blackList.add("ARO:3000801"); + blackList.add("ARO:3000790"); + blackList.add("ARO:3000789"); + blackList.add("ARO:3000791"); + blackList.add("ARO:3000779"); + blackList.add("ARO:3000779"); + blackList.add("ARO:3000779"); + blackList.add("ARO:3000207"); + blackList.add("ARO:3000216"); + blackList.add("ARO:3000809"); + blackList.add("ARO:3000808"); + blackList.add("ARO:3000533"); + blackList.add("ARO:3000378"); + blackList.add("ARO:3000803"); + blackList.add("ARO:3000777"); + blackList.add("ARO:3000778"); + blackList.add("ARO:3000535"); + blackList.add("ARO:3000775"); + blackList.add("ARO:3000775"); + blackList.add("ARO:3003057"); + blackList.add("ARO:3003056"); + blackList.add("ARO:3003055"); + blackList.add("ARO:3003039"); + blackList.add("ARO:3000802"); + blackList.add("ARO:3000776"); + blackList.add("ARO:3000783"); + blackList.add("ARO:3000784"); + blackList.add("ARO:3003010"); + blackList.add("ARO:3003009"); + blackList.add("ARO:3000206"); +/******************************************************************/ + /* + * ARO:3000839 + * ARO:3000725 + * ARO:3000815 + * ARO:3000765 + * ARO:3000764 + * ARO:3000508 + * ARO:3003066 + * ARO:3003067 + * ARO:3000725 + * ARO:3000764 + * ARO:3000656 + * ARO:3000502 + * ARO:3000499 + * ARO:3000730 + * + * + * KM998962.1.gene1 + * + * + */ + } + + public static void resistanceClass(GoTerm term, HashSet allClass, HashSet myClass) throws IOException{ + if (allClass.contains(term.ID)) + myClass.add(term.name); + else{ + for (GoRelationship rel:term.relationship){ + if (rel.relType.ID.equals("is_a")) + resistanceClass(rel.relTerm, allClass, myClass); + } 
+ } + + } + + public static HashMap> checkGenes(ArrayList seqs) throws IOException{ + //Set up known classes + HashSet abrGroups = new HashSet(); + abrGroups.add("ARO:3000052");// phenicol resistance gene + abrGroups.add("ARO:3000102");// fluoroquinolone resistance gene + abrGroups.add("ARO:3000104");// aminoglycoside resistance gene + abrGroups.add("ARO:3000129");// beta-lactam resistance gene + abrGroups.add("ARO:3000240");// streptogramin resistance gene + abrGroups.add("ARO:3000241");// lincosamide resistance gene + abrGroups.add("ARO:3000267");// linezolid resistance gene + abrGroups.add("ARO:3000271");// fosfomycin resistance gene + abrGroups.add("ARO:3000315");// macrolide resistance gene + abrGroups.add("ARO:3000362");// mosaic antibiotic resistance gene + abrGroups.add("ARO:3000383");// rifampin resistance gene + abrGroups.add("ARO:3000398");// chloramphenicol resistance gene + abrGroups.add("ARO:3000408");// sulfonamide resistance gene + abrGroups.add("ARO:3000468");// ethambutol resistance gene + abrGroups.add("ARO:3000472");// tetracycline resistance gene + abrGroups.add("ARO:3000477");// aminocoumarin resistance gene + abrGroups.add("ARO:3000494");// glycopeptide resistance gene + abrGroups.add("ARO:3000529");// mupirocin resistance gene + abrGroups.add("ARO:3000751");// peptide antibiotic resistance gene + abrGroups.add("ARO:3000868");// streptothricin resistance gene + abrGroups.add("ARO:3001217");// trimethoprim resistance gene + abrGroups.add("ARO:3001311");// elfamycin resistance gene + abrGroups.add("ARO:3002984");// polymyxin resistance gene + abrGroups.add("ARO:3003024");// fusidic acid resistance gene + abrGroups.add("ARO:3003058");// tunicamycin resistance gene + abrGroups.add("ARO:3003073");// lipopeptide antibiotic resistance gene + abrGroups.add("ARO:3003252");// bacitracin resistance gene + abrGroups.add("ARO:3003432");// isoniazid resistance gene + abrGroups.add("ARO:3003433");// pyrazinamide resistance gene + + + 
abrGroups.add("ARO:3000004");// class B (metallo-) beta-lactamase + abrGroups.add("ARO:3000075");// class D beta-lactamase + abrGroups.add("ARO:3000076");// class C beta-lactamase + abrGroups.add("ARO:3000078");// class A beta-lactamase + + HashMap> ret= new HashMap>(); + + for (Sequence seq : seqs){ + String ID = seq.getName(); + + if (blackList.contains(ID)) + continue; + + String desc = seq.getDesc(); + HashSet myGroups = new HashSet(); + + int indexPos = 0; + while (true){ + indexPos = desc.indexOf("ARO:",indexPos); + if (indexPos < 0) + break; + + if (indexPos + 11 < desc.length()){ + String ARO = desc.substring(indexPos, indexPos + 11); + indexPos += 11; + if (ARO.equals("ARO:1000001")) + continue; + + GoTerm term = odo.terms.get(ARO); + resistanceClass(term,abrGroups, myGroups); + } + }//while + System.out.print(seq.getName() + " : "); + for (String group:myGroups) + System.out.print(group + ";" ); + + System.out.println(); + + ret.put(seq.getName(), myGroups); + } + return ret; + } + + /** + * @param args + * @throws IOException + * + */ + public static void main(String[] args) throws IOException { + odo = new ODOParser("aro.obo"); + //relationList.add("is_a"); +/******************************************************** + ArrayList seqs = SequenceReader.readAll(args[0],Alphabet.DNA()); + + HashMap allele2Sequence = new HashMap(); + + for (Sequence seq:seqs){ + allele2Sequence.put(seq.getName(), seq); + } + + HashMap> allele2Groups = checkGenes(seqs); + + + BufferedReader bf = SequenceReader.openFile("togene"); + String line = ""; + + HashMap> gene2Groups = new HashMap>(); + HashMap gene2dbGeneID = new HashMap(); + + GeneDatabase geneDB = new GeneDatabase(); + + + while ( (line = bf.readLine())!=null){ + String [] toks = line.trim().split("\t"); + String alleleID = toks[0]; + + if (!allele2Groups.containsKey(alleleID)) + continue; + + String geneID = ((toks.length > 2)? 
toks[2]:"None") + "_" + toks[1]; + + HashSet agroup = allele2Groups.get(alleleID); + if (gene2Groups.containsKey(geneID)){ + HashSet ggroup = gene2Groups.get(geneID); + if (agroup == null){ + LOG.error(alleleID + " xx " + geneID ); + } + + if (!ggroup.containsAll(agroup)) + LOG.error( alleleID + " <> " + geneID ); + + if (!agroup.containsAll(ggroup)) + LOG.error(alleleID + " # " + geneID ); + + GeneDatabase.GeneFamily dbFamily = geneDB.getFamily(gene2dbGeneID.get(geneID)); + + if (dbFamily == null) + LOG.error("Problem finding " + geneID ); + else{ + dbFamily.addSequence(allele2Sequence.get(alleleID)); + } + + + }else{ + gene2Groups.put(geneID, allele2Groups.get(toks[0])); + Sequence seq = allele2Sequence.get(alleleID); + String famID = geneDB.addNewFamily(seq); + gene2dbGeneID.put(geneID, famID); + String desc = "geneID=" + geneID + ";dg="; + + for (String group:agroup){ + desc += group + ","; + } + + geneDB.getFamily(famID).setDesc(desc); + } + } + + geneDB.write2File("F.fasta", false); + geneDB.write2File("A.fasta", true); + + LOG.info(gene2Groups.size() + " group "); + + + + /********************************************************/ + + for (String relID:odo.relTypes.keySet()){ + System.out.println(relID + "\t" + odo.relTypes.get(relID)); + } + + for (String termID:odo.terms.keySet()){ + GoTerm term = odo.terms.get(termID); + for (GoRelationship rel:term.relationship){ + if (rel.relType.ID.equals("is_a")) + System.out.println(termID + "(" + term.name +")\t" + rel.relTerm.ID + "(" + rel.relTerm.name+")"); + } + + conferDrugList.clear(); + conferList.clear(); + sb = new StringBuilder(); + + odo.confer(term); + + System.out.println(" " + term.ID + " :" + term.name + " " + term.desc); + + //sodo.list(term, relationList); + System.out.print(" Confer: "); + for (String st:conferList){ + System.out.print(st + ";"); + } + + System.out.print("\n Confer drug : "); + for (String st:conferDrugList){ + System.out.print(st + ";"); + } + 
System.out.print("\n-----------------------------------------------------\n"); + + } + /********************************************************/ + } +} diff --git a/src/dev/java/japsadev/bio/go/RefFinderParser.java b/src/dev/java/japsadev/bio/go/RefFinderParser.java new file mode 100644 index 0000000..9582f5b --- /dev/null +++ b/src/dev/java/japsadev/bio/go/RefFinderParser.java @@ -0,0 +1,144 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 20 Jul 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.bio.go; + +import japsa.bio.gene.GeneDatabase; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * @author minhduc + * + */ +public class RefFinderParser { + private static final Logger LOG = LoggerFactory.getLogger(RefFinderParser.class); + + public RefFinderParser(String fileName) throws IOException{ + + } + + + /** + * @param args + * @throws IOException + * + */ + public static void main(String[] args) throws IOException { + + //Read in all sequences + ArrayList seqs = SequenceReader.readAll(args[0],Alphabet.DNA()); + + HashMap allele2Sequence = new HashMap(); + + for (Sequence seq:seqs){ + allele2Sequence.put(seq.getName(), seq); + } + +// / HashMap allele2Groups = new HashMap(); + + BufferedReader bf = SequenceReader.openFile("togene"); + String line = ""; + + HashMap gene2Groups = new HashMap(); + HashMap gene2dbGeneID = new HashMap(); + + GeneDatabase geneDB = new 
GeneDatabase(); + + + while ( (line = bf.readLine())!=null){ + String [] toks = line.trim().split("\t"); + String alleleID = toks[0]; + String agroup = toks[1]; + String geneID = toks[4] + "_" + toks[2]; + + if (gene2Groups.containsKey(geneID)){ + String ggroup = gene2Groups.get(geneID); + + if (agroup == null){ + LOG.error(alleleID + " xx " + geneID ); + } + + if (!ggroup.equals(agroup)) + LOG.error( alleleID + " <> " + geneID ); + + GeneDatabase.GeneFamily dbFamily = geneDB.getFamily(gene2dbGeneID.get(geneID)); + + if (dbFamily == null) + LOG.error("Problem finding " + geneID ); + else{ + dbFamily.addSequence(allele2Sequence.get(alleleID)); + } + + + }else{ + gene2Groups.put(geneID, agroup); + Sequence seq = allele2Sequence.get(alleleID); + String famID = geneDB.addNewFamily(seq); + gene2dbGeneID.put(geneID, famID); + String desc = "geneID=" + geneID + ";dg="+agroup; + + geneDB.getFamily(famID).setDesc(desc); + } + } + + geneDB.write2File("F.fasta", false); + geneDB.write2File("A.fasta", true); + + LOG.info(gene2Groups.size() + " group "); + + /******************************************************** + for (String relID:odo.relTypes.keySet()){ + System.out.println(relID + "\t" + odo.relTypes.get(relID)); + } + + for (String termID:odo.terms.keySet()){ + GoTerm term = odo.terms.get(termID); + for (GoRelationship rel:term.relationship){ + if (rel.relType.ID.equals("is_a")) + System.out.println(termID + "(" + term.name +")\t" + rel.relTerm.ID + "(" + rel.relTerm.name+")"); + } + } + /********************************************************/ + } +} diff --git a/src/dev/java/japsadev/bio/go/TypeRelationship.java b/src/dev/java/japsadev/bio/go/TypeRelationship.java new file mode 100644 index 0000000..f655f29 --- /dev/null +++ b/src/dev/java/japsadev/bio/go/TypeRelationship.java @@ -0,0 +1,43 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 20 Jul 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.bio.go; + +public class TypeRelationship{ + String ID, name; + public TypeRelationship(String id, String aname){ + ID = id; + name = aname; + } +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java new file mode 100644 index 0000000..f175847 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java @@ -0,0 +1,173 @@ +package japsadev.bio.hts.clustering; + +import java.io.*; +import java.util.*; + + + + +/** + * @author buvan.suji + * + */ + +public class GettingTreadsFromFasta { + + private String[] description; + private String[] sequence; + + + static ArrayList Rname = new ArrayList(); + static ArrayList TReads = new ArrayList(); + static String FnamePath; + static int sequenceLength; + static String fle; + + public GettingTreadsFromFasta(String filename) { + readSequenceFromFile(filename); + } + + @SuppressWarnings("resource") + void readSequenceFromFile(String file) { + + ArrayList desc = new ArrayList(); + ArrayList seq = new ArrayList(); + BufferedReader in = null; + + try { + in = new BufferedReader(new FileReader(file)); + StringBuffer buffer = new StringBuffer(); + String line = in.readLine(); + + if (line == null) + throw new IOException(file + " is an empty file"); + + if (line.charAt(0) != '>') + throw new IOException("First line of " + file + + " should start with '>'"); + else + desc.add(line); + + for (line = in.readLine().trim(); line != null; line = in + .readLine()) { + if (line.length() > 0 && line.charAt(0) == '>') { + seq.add(buffer.toString()); + buffer = new 
StringBuffer(); + desc.add(line); + } else + buffer.append(line.trim()); + } + if (buffer.length() != 0) + seq.add(buffer.toString()); + } catch (IOException e) { + System.out.println("Error when reading " + file); + e.printStackTrace(); + } + + description = new String[desc.size()]; + sequence = new String[seq.size()]; + for (int i = 0; i < seq.size(); i++) { + description[i] = (String) desc.get(i); + sequence[i] = (String) seq.get(i); + } + sequenceLength = seq.size(); + + } + + // return first sequence as a String + public String getSequence() { + return sequence[0]; + } + + // return first xdescription as String + public String getDescription() { + return description[0]; + } + + // return sequence as a String + public String getSequence(int i) { + return sequence[i]; + } + + // return description as String + public String getDescription(int i) { + return description[i]; + } + + public int size() { + return sequence.length; + } + + public static void FileReading(String filename) { + GettingTreadsFromFasta fsf = new GettingTreadsFromFasta( + filename); + StringBuffer buffer1 = new StringBuffer(); + + for (int i = 0; i < fsf.size(); i++) { + String temp = ((buffer1.append(fsf.getDescription(i))) + .deleteCharAt(0)).toString(); + Rname.add(temp); + TReads.add(fsf.getSequence(i)); + buffer1 = new StringBuffer(); + } + } + + // call this method-1 + //public static void DestReads() throws Exception { + public static void DestReads(String fle1) throws Exception { + /*String fname = ""; + System.out.print("Enter the name of the FastaFile:"); + fname = (new BufferedReader(new InputStreamReader(System.in))) + .readLine(); + FnamePath = new File(fname).getName();*/ + + // System.out.println(FnamePath); + //FileReading(fname); + fle = fle1; + FnamePath = fle1; + //FnamePath = "chr10134169759_134174717.fasta"; + FileReading(FnamePath); + } + + public static void main(String[] args) throws Exception { + DestReads(fle); + ViewList(); + } + + // call this method-2 + public 
static ArrayList GetRname() { + return Rname; + } + + // call this method-3 + public static ArrayList GetTReads() { + return TReads; + } + + public static int NumberReads() { + return TReads.size(); + } + + // call this method-4 + public static String GetFileName() { + return FnamePath; + } + + public static int SeqLength() { + return sequenceLength; + } + + public static int NelementsClustering() { + return sequenceLength * (sequenceLength - 1) / 2; + } + + public static void ViewList() { + System.out.println(GetRname()); + // System.out.println(GetTReads()); + System.out.println(SeqLength()); + System.out.println(NelementsClustering()); + } + + + +} diff --git a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java new file mode 100644 index 0000000..ddd55f9 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java @@ -0,0 +1,300 @@ +package japsadev.bio.hts.clustering; + +import java.util.ArrayList; +import java.io.*; + +import japsa.seq.SequenceOutputStream; +import japsadev.bio.hts.clustering.PairDistance; +import japsadev.bio.hts.clustering.GettingTreadsFromFasta; + +/** + * @author buvan.suji + * + */ + +public class KmeanClustering { + + public static void Clustering() throws Exception{ + FileInputStream file1 = new FileInputStream("TRfile.fasta"); + BufferedReader br = new BufferedReader(new InputStreamReader(file1)); + String line = null; + while((line = br.readLine())!=null){ + ArrayList descript = new ArrayList(); + ArrayList reads = new ArrayList(); + //ArrayList MaxReadLn = new ArrayList(); + int MaxReadLn = 0; + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + + + 
GettingTreadsFromFasta.DestReads(line); + //GettingTreadsFromFasta.DestReads(); + descript = GettingTreadsFromFasta.GetRname(); + reads = GettingTreadsFromFasta.GetTReads(); + Nreads = GettingTreadsFromFasta.NumberReads(); + FileName = GettingTreadsFromFasta.GetFileName(); + NumberElements = GettingTreadsFromFasta.SeqLength(); + ClustElements = GettingTreadsFromFasta.NelementsClustering(); + + double[][] table = new double[Nreads][Nreads]; + + for (int i=0; im[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. 
+ static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j> + Clustering(ArrayList reads) throws Exception{ + ArrayList> tempClusterList = + new ArrayList>(); + ArrayList tempList = new ArrayList(); + if(reads.size()<=2){ + tempClusterList.add(tempList); + tempClusterList.add(tempList); + tempClusterList.add(tempList); + return tempClusterList; + } + else{ + int MaxReadLn = 0; + int MinReadLn = 1000000; + + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + + n = reads.size(); + + double[][] table = new double[n][n]; + ArrayList readsLengthRange = new ArrayList(); + + for (int x=0; x> clusterList = + new ArrayList>(); + clusterList.add(readsLengthRange); + + for(int x=0;x tempcluster = new ArrayList(); + for(int y=0;k[x][y]!=-1 && ym[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j1){ + ArrayList s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + }else{ + m[i]=t.get(0); + } + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. 
+ static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j parentMap = new HashMap<>(); + + for (int i = 1; i <= m; ++i) { + d[i][0] = i; + } + + for (int j = 1; j <= n; ++j) { + d[0][j] = j; + } + + + + + for (int j = 1; j <= n; ++j) { + for (int i = 1; i <= m; ++i) { + final int delta = (s.charAt(j - 1) == z.charAt(i - 1)) ? 0 : 1; + + double tentativeDistance = d[i - 1][j] +0.5;//gap penalty + EditOperation editOperation = EditOperation.INSERT; + + if (tentativeDistance > d[i][j - 1] +0.5) { + tentativeDistance = d[i][j - 1] +0.5; + editOperation = EditOperation.DELETE; + } + + if (tentativeDistance > d[i - 1][j - 1] + delta) { + tentativeDistance = d[i - 1][j - 1] + delta; + editOperation = EditOperation.SUBSTITUTE; + } + + d[i][j] = tentativeDistance; + + + switch (editOperation) { + case SUBSTITUTE: + parentMap.put(new Point(i, j), new Point(i - 1, j - 1)); + break; + + case INSERT: + parentMap.put(new Point(i, j), new Point(i - 1, j)); + break; + + case DELETE: + parentMap.put(new Point(i, j), new Point(i, j - 1)); + break; + default: + break; + } + } + } + + final StringBuilder topLineBuilder = new StringBuilder(n + m); + final StringBuilder bottomLineBuilder = new StringBuilder(n + m); + final StringBuilder editSequenceBuilder = new StringBuilder(n + m); + Point current = new Point(m, n); + + while (true) { + Point predecessor = parentMap.get(current); + + if (predecessor == null) { + break; + } + + if (current.x != predecessor.x && current.y != predecessor.y) { + final char schar = s.charAt(predecessor.y); + final char zchar = z.charAt(predecessor.x); + + topLineBuilder.append(schar); + bottomLineBuilder.append(zchar); + editSequenceBuilder.append(schar != zchar ? 
+ EditOperation.SUBSTITUTE : + EditOperation.MATCH); + } else if (current.x != predecessor.x) { + topLineBuilder.append(GAP); + bottomLineBuilder.append(z.charAt(predecessor.x)); + editSequenceBuilder.append(EditOperation.INSERT); + } else { + topLineBuilder.append(s.charAt(predecessor.y)); + bottomLineBuilder.append(GAP); + editSequenceBuilder.append(EditOperation.DELETE); + } + + current = predecessor; + } + + // Remove the last characters that correspond to the very beginning + // of the alignments and edit sequence (since the path reconstruction + // proceeds from the "end" to the "beginning" of the distance matrix. + topLineBuilder .deleteCharAt(topLineBuilder.length() - 1); + bottomLineBuilder .deleteCharAt(bottomLineBuilder.length() - 1); + editSequenceBuilder.deleteCharAt(editSequenceBuilder.length() - 1); + + // Our result data is backwards, reverse them. + topLineBuilder .reverse(); + bottomLineBuilder .reverse(); + editSequenceBuilder.reverse(); + + return new EditDistanceResult(d[m][n], + editSequenceBuilder.toString(), + topLineBuilder.toString(), + bottomLineBuilder.toString()); + } + + + + public static void main(String[] args) throws IOException { + EditDistanceResult result = compute("ACTAGGTTA", "TAAGGCTCAAT"); + System.out.println("Distance: " + result.getDistance()); + System.out.println("Edit sequence: " + result.getEditSequence()); + System.out.println("Alignment:"); + System.out.println(result.getTopAlignmentRow()); + System.out.println(result.getBottomAlignmentRow()); + + + } + + + + +} diff --git a/src/dev/java/japsadev/bio/hts/clustering/WriteClusterResultOnFile.java b/src/dev/java/japsadev/bio/hts/clustering/WriteClusterResultOnFile.java new file mode 100644 index 0000000..fe5c98a --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/clustering/WriteClusterResultOnFile.java @@ -0,0 +1,73 @@ +package japsadev.bio.hts.clustering; + +import java.util.ArrayList; +import java.io.*; + +import japsa.seq.SequenceOutputStream; +import 
japsadev.bio.hts.clustering.PairDistance; +import japsadev.bio.hts.clustering.GettingTreadsFromFasta; +import japsa.seq.*; + +/** + * @author buvan.suji + * + */ + +public class WriteClusterResultOnFile { + + public static void writeOnFile(ArrayList> + list, Sequence cons1, Sequence cons2, String FileName ) throws Exception{ + + File file = new File("ClusterResult_"+FileName+".fasta"); + FileWriter fw = new FileWriter(file.getAbsoluteFile()); + BufferedWriter bw = new BufferedWriter(fw); + + + ArrayList list1 = new ArrayList(); + list1 = list.get(0); + + bw.write("Minimum Read Length: "+ list1.get(0)); + bw.newLine(); + bw.write("Maximum Read Length: "+ list1.get(1)); + bw.newLine(); + bw.write("Estimated Time: "+ list1.get(2)); + bw.newLine(); + bw.newLine(); + bw.write("The consensus sequence of cluster1: "); + bw.newLine(); + bw.write(""+cons1); + bw.newLine(); + bw.newLine(); + bw.write("The C1 members are: "); + bw.newLine(); + for(int x=1;x tempList = new ArrayList(); + tempList = list.get(x); + + for(int y=0;y vertices = (HashSet) graph.getVertices(); +// +// //display the initial setup- all vertices adjacent to each other +// for(Vertex vertex:vertices){ +// System.out.println(vertex); +// +// for(int j = 0; j < vertex.getNeighborCount(); j++){ +// System.out.println(vertex.getNeighbor(j)); +// } +// +// System.out.println(); +// } +// +// Node source=new Node(graph.getVertex("108"),false), +// dest=new Node(graph.getVertex("201"),true); +// int distance=2962; +// +// ArrayList paths=graph.DFS(source, dest, distance); +// if(!paths.isEmpty()){ +// System.out.println("Paths found ("+distance+"):"); +// for(Path p:paths) +// System.out.println(p.toString() + " d=" + p.getDeviation() ); +// } +// else +// System.out.println("Path not found ("+distance+")!"); + + }catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + +// Vertex v1= new Vertex("EDGE_1_length_1000_cov_9"), +// v2= new Vertex("EDGE_2_length_200_cov_22"), +// v3= 
new Vertex("EDGE_3_length_300_cov_12"), +// v4= new Vertex("EDGE_4_length_500_cov_33"), +// v5= new Vertex("EDGE_5_length_600_cov_21"); +// graph.addVertex(v1, false); +// graph.addVertex(v2, false); +// graph.addVertex(v3, false); +// graph.addVertex(v4, false); +// graph.addVertex(v5, false); +// +// graph.addEdge(v1, v2, true, true); +// graph.addEdge(v1, v2, false, false); +// +// graph.addEdge(v2, v1, true, true); +// graph.addEdge(v2, v3, true, true); +// graph.addEdge(v2, v1, false, false); +// graph.addEdge(v2, v4, false, false); +// +// graph.addEdge(v3, v4, true, true); +// graph.addEdge(v3, v2, false, false); +// +// graph.addEdge(v4, v5, true, true); +// graph.addEdge(v4, v2, true, true); +// graph.addEdge(v4, v3, false, false); +// graph.addEdge(v4, v5, false, false); +// +// graph.addEdge(v5, v4, false, false); +// graph.addEdge(v5, v4, true, true); +// +// graph.printStats(); +// +// Path p1=new Path(graph, "1+,2+,3+"), +// p2=new Path(graph, "3+,4+"); +// graph.reduce(p1); +// graph.printStats(); +// graph.reduce(p2); +// graph.printStats(); + } +} + + diff --git a/src/dev/java/japsadev/bio/hts/metagenome/Edge.java b/src/dev/java/japsadev/bio/hts/metagenome/Edge.java new file mode 100644 index 0000000..99cb9a3 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/metagenome/Edge.java @@ -0,0 +1,180 @@ +package japsadev.bio.hts.metagenome; + +/** + * This class models an bidirected Edge in my Graph implementation. + * An Edge contains two vertices and a weight (distance between them). + * A certain edge (v1,v2) can take one among 4 types: ++, --, +- and -+. Each + * type corresponds to the way we read the DNA sequence in each read when traversing + * this edge. + * For example: v1->---<-v2 or (v1,v2)+- spells out (v1 v2') and/or (v2 v1') as in SPAdes output. + * This class also deviates from the expectations of the Comparable interface + * in that a return value of 0 does not indicate that this.equals(other). 
The + * equals() method only compares the vertices, while the compareTo() method + * compares the edge weights. This provides more efficient implementation for + * checking uniqueness of edges, as well as the fact that two edges of equal weight + * should be considered equitably in a path finding or spanning tree algorithm. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Edge implements Comparable { + + private Vertex one, two; + private boolean dOne, dTwo; + private int weight; + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex in the Edge + */ + public Edge(Vertex one, Vertex two, boolean d1, boolean d2){ + this(one, two, d1, d2, -127); + } + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex of the Edge + * @param weight The weight of this Edge + */ + public Edge(Vertex one, Vertex two, boolean dOne, boolean dTwo, int weight){ + //this.one = (one.getLabel().compareTo(two.getLabel()) <= 0) ? one : two; + //this.two = (this.one == one) ? two : one; + this.one=one; + this.two=two; + this.weight = weight; + this.dOne=dOne; + this.dTwo=dTwo; + } + + + /** + * + * @param current + * @return The neighbor of current along this Edge + */ + public Vertex getNeighbor(Vertex current){ + if(!(current.equals(one) || current.equals(two))){ + return null; + } + + return (current.equals(one)) ? two : one; + } + /** + * Return the same Edge but reading the other way around + * just swap the order of its vertices upside down + * @param + * @return the identical Edge + */ + public Edge getReversedRead(){ + return new Edge(this.two, this.one, !this.dTwo, !this.dOne, this.weight); + } + /** + * + * @param current + * @return The direction to spell *current* along this Edge + */ + public boolean getDirection(Vertex current){ + assert (current.equals(one) || current.equals(two)):"Vertex doesn't belong to this Edge!"; + + return (current.equals(one)) ? 
dOne : !dTwo; + } + + /** + * + * @return Vertex this.one + */ + public Vertex getOne(){ + return this.one; + } + + /** + * + * @return Vertex this.two + */ + public Vertex getTwo(){ + return this.two; + } + + /** + * + * @return boolean this.dOne + */ + public boolean getDOne(){ + return this.dOne; + } + + /** + * + * @return boolean this.dTwo + */ + public boolean getDTwo(){ + return this.dTwo; + } + /** + * + * @return int The weight of this Edge + */ + public int getWeight(){ + return this.weight; + } + + + /** + * + * @param weight The new weight of this Edge + */ + public void setWeight(int weight){ + this.weight = weight; + } + + + /** + * Note that the compareTo() method deviates from + * the specifications in the Comparable interface. A + * return value of 0 does not indicate that this.equals(other). + * The equals() method checks the Vertex endpoints, while the + * compareTo() is used to compare Edge weights + * + * @param other The Edge to compare against this + * @return int this.weight - other.weight + */ + public int compareTo(Edge other){ + return this.weight - other.weight; + } + + /** + * + * @return String A String representation of this Edge + */ + public String toString(){ + return "({" + one + (dOne?">":"<") + ", " + two + (dTwo?">":"<") + "}, " + weight + ")"; + } + + /** + * + * @return int The hash code for this Edge + */ + public int hashCode(){ + return (one.getLabel() + (dOne?"":"'") + two.getLabel() + (dTwo?"":"'")).hashCode(); + } + + /** + * + * @param other The Object to compare against this + * @return true iff other is an Edge with the same Vertices as this + */ + public boolean equals(Object other){ + if(!(other instanceof Edge)){ + return false; + } + + Edge e = (Edge)other; + + return (e.one.equals(this.one) && e.two.equals(this.two) && (e.getDOne()==this.dOne) && (e.getDTwo()==this.dTwo)) + || (e.one.equals(this.two) && e.two.equals(this.one) && (e.getDOne()!=this.dOne) && (e.getDTwo()!=this.dTwo)); + } +} + + diff --git 
a/src/dev/java/japsadev/bio/hts/metagenome/Graph.java b/src/dev/java/japsadev/bio/hts/metagenome/Graph.java new file mode 100644 index 0000000..d419852 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/metagenome/Graph.java @@ -0,0 +1,389 @@ +package japsadev.bio.hts.metagenome; + +import java.io.IOException; +import java.util.*; +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + +/** + * This class models a simple, bidirected graph using an + * incidence list representation. Vertices are identified + * uniquely by their labels, and only unique vertices are allowed. + * At most one unique Edge per vertex pair is allowed in this Graph. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Graph { + + private HashMap vertices; + private HashMap edges; + private int kmer; + + static final int TOLERATE=500; + + + public Graph(){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + setKmerSize(127);//default kmer size used by SPAdes to assembly MiSeq data + } + + + public Graph(String graphFile) throws IOException{ + this(); + //1. 
next iterate over again to read the connections + SequenceReader reader = new FastaReader(graphFile); + Sequence seq; + int shortestLen = 10000; + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + if(seq.length() 1){ + String[] nbList = adjList[1].split(","); + for(int i=0; i < nbList.length; i++){ + // create list of bridges here (distance=-kmer overlapped) + String neighbor = nbList[i]; + boolean dir2=neighbor.contains("'")?false:true; + neighbor=neighbor.replaceAll("[^a-zA-Z0-9_.]", "").trim(); + + Vertex nbVertex=new Vertex(neighbor); + if(getVertex(nbVertex.getLabel())!=null) + nbVertex=getVertex(nbVertex.getLabel()); + + addVertex(nbVertex, false); + + addEdge(current, nbVertex, dir1, dir2); + } + } + + } + //rough estimation of kmer used + if((shortestLen-1) != getKmerSize()) + setKmerSize(shortestLen-1); + + reader.close(); + } + /** + * This constructor accepts an ArrayList and populates + * this.vertices. If multiple Vertex objects have the same label, + * then the last Vertex with the given label is used. + * + * @param vertices The initial Vertices to populate this Graph + */ + public Graph(ArrayList vertices){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + + for(Vertex v: vertices){ + this.vertices.put(v.getLabel(), v); + } + setKmerSize(127);//default kmer size used by SPAdes to assembly MiSeq data + + } + + public int getKmerSize(){ + return this.kmer; + } + public void setKmerSize(int kmer){ + this.kmer=kmer; + Path.setK(kmer); + } + /** + * This method adds am edge between Vertices one and two + * and their corresponding direction of weight kmer, + * if no Edge between these Vertices already exists in the Graph. 
+ * + * @param one The first vertex to add + * @param two The second vertex to add + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @return true iff no Edge relating one and two exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2){ + return addEdge(one, two, d1, d2, -kmer); + } + + + /** + * Accepts two vertices, their directions and a weight, and adds the edge + * ({one, two}, {d1, d2}, weight) iff no Edge relating one and two + * exists in the Graph. + * + * @param one The first Vertex of the Edge + * @param two The second Vertex of the Edge + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @param weight The weight of the Edge + * @return true iff no Edge already exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2, int weight){ + + //ensures the Edge is not in the Graph + Edge e = new Edge(one, two, d1, d2, weight); + if(edges.containsKey(e.hashCode()) || edges.containsKey(e.getReversedRead().hashCode())){ + return false; + } + + //and that the Edge isn't already incident to one of the vertices + else if(one.containsNeighbor(e) || two.containsNeighbor(e.getReversedRead())){ + return false; + } + + edges.put(e.hashCode(), e); + one.addNeighbor(e); + two.addNeighbor(e.getReversedRead()); + return true; + } + + /** + * + * @param e The Edge to look up + * @return true iff this Graph contains the Edge e + */ + public boolean containsEdge(Edge e){ + if(e.getOne() == null || e.getTwo() == null){ + return false; + } + + return this.edges.containsKey(e.hashCode()) + || this.edges.containsKey(e.getReversedRead().hashCode()); + } + + + /** + * This method removes the specified Edge from the Graph, + * including as each vertex's incidence neighborhood. 
+ * + * @param e The Edge to remove from the Graph + * @return Edge The Edge removed from the Graph + */ + public Edge removeEdge(Edge e){ + e.getOne().removeNeighbor(e); + e.getTwo().removeNeighbor(e.getReversedRead()); + Edge rmEdge = this.edges.remove(e.hashCode()); + if (rmEdge==null) + rmEdge = this.edges.remove(e.getReversedRead().hashCode()); + return rmEdge; + } + + /** + * + * @param vertex The Vertex to look up + * @return true iff this Graph contains vertex + */ + public boolean containsVertex(Vertex vertex){ + return this.vertices.get(vertex.getLabel()) != null; + } + + /** + * + * @param label The specified Vertex label + * @return Vertex The Vertex with the specified label + */ + public Vertex getVertex(String label){ + return vertices.get(label); + } + + /** + * This method adds a Vertex to the graph. If a Vertex with the same label + * as the parameter exists in the Graph, the existing Vertex is overwritten + * only if overwriteExisting is true. If the existing Vertex is overwritten, + * the Edges incident to it are all removed from the Graph. 
+ * + * @param vertex + * @param overwriteExisting + * @return true iff vertex was added to the Graph + */ + public boolean addVertex(Vertex vertex, boolean overwriteExisting){ + Vertex current = this.vertices.get(vertex.getLabel()); + if(current != null){ + if(!overwriteExisting){ + return false; + } + + while(current.getNeighborCount() > 0){ + this.removeEdge(current.getNeighbor(0)); + } + } + + + vertices.put(vertex.getLabel(), vertex); + return true; + } + + /** + * + * @param label The label of the Vertex to remove + * @return Vertex The removed Vertex object + */ + public Vertex removeVertex(String label){ + Vertex v = vertices.remove(label); + + while(v.getNeighborCount() > 0){ + this.removeEdge(v.getNeighbor((0))); + } + + return v; + } + + /** + * + * @return Set All Graph's Vertex objects + */ + public Set getVertices(){ + return new HashSet(this.vertices.values()); + } + + /** + * + * @return Set The Edges of this graph + */ + public Set getEdges(){ + return new HashSet(this.edges.values()); + } + + /** + * Find a path between two nodes within a given distance + */ + public ArrayList DFS(Node source, Node dest, int distance){ + System.out.println("Looking for path between " + source.toString() + " to " + dest.toString() + " with distance " + distance); + Path tmp = new Path(); + ArrayList retval = new ArrayList(); + tmp.addNode(source); + + //traverse(tmp, dest, retval, distance+source.getSeq().length()+dest.getSeq().length()); + traverse(tmp, dest, retval, distance); + + return retval; + } + + public void traverse(Path path, Node dest, ArrayList curResult, int distance){ + Node source=path.getEnd(); + assert source!=null:"Path null fault!"; + + ArrayList nList = source.getVertex().getNeighbors(); + for(Edge e:nList){ + if(e.getDOne()==source.getDirection()){ + path.addNode(e.getTwo(), e.getDTwo()); + + if(e.getTwo()==dest.getVertex() && e.getDTwo()==dest.getDirection() && Math.abs(distance+getKmerSize()) < TOLERATE){ + + Path 
curPath=curResult.isEmpty()?new Path():curResult.get(0), //the best path saved among all possible paths from the list curResult + tmpPath=new Path(); + tmpPath.setComp(path.getNodes()); + tmpPath.setDeviation(Math.abs(distance+getKmerSize())); + if( Math.abs(distance+getKmerSize()) < curPath.getDeviation() ) + curResult.add(0, tmpPath); + else + curResult.add(tmpPath); + + System.out.println("Hit added: "+path+"(candidate deviation: "+Math.abs(distance+getKmerSize())+")"); + }else{ + int newDistance=distance-e.getTwo().getSequence().length()+getKmerSize(); + if (newDistance+getKmerSize()<-TOLERATE){ + System.out.println("Stop following path with distance "+newDistance+" already! : "+path); + }else + traverse(path, dest, curResult, newDistance); + } + path.removeLast(); + } + } + } + /** + * + * @param p Path to be grouped as a virtually vertex + */ + public void reduce(Path p){ + Vertex comp=new Vertex(p); + //add the new composite Vertex to the graph + addVertex(comp, true); + //remove unique nodes on p + ArrayList tobeRemoved=new ArrayList(); + for(Node n:p.getNodes()){ + if(n.getVertex().isUnique()) + tobeRemoved.add(n.getVertex().getLabel()); + } + + Node start = p.getStart(), + end = p.getEnd(); + //set neighbors of the grouped Vertex + for(Edge e:start.getVertex().getNeighbors()){ + if(e.getDOne()!=start.getDirection()){ + //comp.addNeighbor(e); + addEdge(comp,e.getTwo(),e.getDOne(),e.getDTwo()); + } + } + for(Edge e:end.getVertex().getNeighbors()){ + if(e.getDOne()!=end.getDirection()) + //comp.addNeighbor(e); + addEdge(comp,e.getTwo(),e.getDOne(),e.getDTwo()); + } + for(String lab:tobeRemoved) + removeVertex(lab); + //TODO: remove bubbles... 
+ } + /** + * + * @param v Vertex to be reverted (1-level reverting) + */ + public void revert(Vertex v){ + //TODO: revert to initial status by extracting a complex vertex into its initial components + Path p=v.getSubComps(); + if(!containsVertex(v)||p==null) return; + //add back all vertices first + for(Node n:p.getNodes()) + addVertex(n.getVertex(), false); + //then add back all neighbor edges of this composite vertex + for(Edge e:v.getNeighbors()) + addEdge(v,e.getTwo(),e.getDOne(),e.getDTwo()); + //finally add back all edges from the path + Node prev=p.getStart(); + for(Node cur:p.getNodes()){ + if(cur==p.getStart()) + continue; + else{ + addEdge(prev.getVertex(),cur.getVertex(),prev.getDirection(),cur.getDirection()); + prev=cur; + } + } + + //remove the original composite vertex + removeVertex(v.getLabel()); + } + public void printStats(){ + System.out.println(vertices.size() + " vertices:"); + for(String label:vertices.keySet()) + System.out.print(label+", "); + System.out.println(); + System.out.println(edges.size() + " edges:"); + for(Edge e:edges.values()){ + System.out.print(e.toString()+", "); + } + System.out.println(); + + } +} + + diff --git a/src/dev/java/japsadev/bio/hts/metagenome/Node.java b/src/dev/java/japsadev/bio/hts/metagenome/Node.java new file mode 100644 index 0000000..d6c1036 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/metagenome/Node.java @@ -0,0 +1,34 @@ +package japsadev.bio.hts.metagenome; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + +public class Node{ + Vertex v; + boolean dir; + Node(Vertex v, boolean dir){ + this.v=v; + this.dir=dir; + } + public Vertex getVertex(){ + return v; + } + public void setVertex(Vertex v){ + this.v = v; + } + public boolean getDirection(){ + return dir; + } + public void setDirection(boolean dir){ + this.dir=dir; + } + public Node getRC(){ + return new Node(v,!dir); + } + public Sequence getSeq(){ + return dir?v.getSequence():Alphabet.DNA.complement(v.getSequence()); + } + public 
String toString(){ + return v.getLabel()+ (dir?"+":"-"); + } +} diff --git a/src/dev/java/japsadev/bio/hts/metagenome/Path.java b/src/dev/java/japsadev/bio/hts/metagenome/Path.java new file mode 100644 index 0000000..5474435 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/metagenome/Path.java @@ -0,0 +1,167 @@ +package japsadev.bio.hts.metagenome; + +import java.util.ArrayList; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; + +public class Path implements Comparable{ + public static int kmer=127; + ArrayList nodes; + //Graph graph; + int length, deviation; //how this path differ to long read data (todo: by multiple-alignment??) + public Path(){ + this.nodes=new ArrayList(); + //graph=new Graph(); + length=0; + deviation=Integer.MAX_VALUE; + } + +// public Path(Graph graph){ +// this(); +// associate(graph); +// } + public static void setK(int kmer){ + Path.kmer=kmer; + } + public Path(Path p){ + this(); + //this(p.graph); + for(Node node:p.nodes) + this.nodes.add(node); + this.length=p.length; + } + /* + * @param String: a path as in contigs.paths of SPAdes output + * For example: 1+,2-,3+ + */ + public Path(Graph graph, String paths){ + this(); + //this(graph); + paths=paths.replace(";", ""); //optimized it! 
+ String[] comps = paths.split(","); + for(int i=0; i getNodes(){ + return nodes; + } + public void setComp(ArrayList nodes){ + this.length=0; + for(Node node:nodes) + this.addNode(node); + } + + public Path rc(){ + //Path retval=new Path(graph); + Path retval=new Path(); + for(Node node:nodes){ + retval.nodes.add(0, node.getRC()); + } + return retval; + } + + public String toString(){ + return "P"+getID(); + } + public String getID(){ + String retval=""; + for(Node node:nodes){ + retval+=node.toString(); + } + return retval.trim(); + } + public Node removeLast(){ + Node retval=nodes.remove(nodes.size()-1); + length-=retval.getSeq().length()-kmer; + return retval; + } + + public Sequence spelling(){ + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, this.toString()); + + for(int i=0;i neighborhood; + private String fullName, label; + private double coverage; + private Sequence seq=null; + //a sub-graph (path) is equivalent to a Vertex recursively + private Path components=null; + public Vertex(){ + fullName=""; + label=""; + coverage=0; + this.neighborhood = new ArrayList(); + this.seq=new Sequence(Alphabet.DNA5(), 0); + } + /** + * + * @param name The unique label associated with this Vertex + */ + public Vertex(String name){ + this.fullName=name; + this.label=extractID(name); + this.coverage=extractCoverage(name); + this.neighborhood = new ArrayList(); + this.seq=new Sequence(Alphabet.DNA5(), 0); + } + + public Vertex(String name, Sequence seq){ + this(name); + this.seq = seq; + } + + public Vertex(Path p){ + this(); + fullName=label=p.getID(); + components=p; + coverage=p.averageCov(); + } + /** + * + * @return If this vertex is unique (only based on its degree) + */ + public boolean isUnique(){ + return (getNeighborCount() <= 2); + } + public Path getSubComps(){ + return components; + } + /** + * Extract ID from name: EDGE_xx_length_yy_cov_zz; + * @param name The name of Edge in assembly graph that correspond to this Vertex + */ + private 
String extractID(String name){ + return name.split("_")[1]; + } + /** + * Extract coverage from name: EDGE_xx_length_yy_cov_zz; + * @param name The name of Edge in assembly graph that correspond to this Vertex + * @return coverage value + */ + private double extractCoverage(String name){ + double res=1.0; + try{ + res=Double.parseDouble(name.split("_")[5]); + }catch(Exception e){ + e.printStackTrace(); + } + return res; + } + public double getCoverage(){ + return coverage; + } + /** + * This method adds an Edge to the incidence neighborhood of this graph iff + * the edge is not already present. + * + * @param edge The edge to add + */ + public void addNeighbor(Edge edge){ + if(this.neighborhood.contains(edge)){ + return; + } + this.neighborhood.add(edge); + } + + + /** + * + * @param other The edge for which to search + * @return true iff other is contained in this.neighborhood + */ + public boolean containsNeighbor(Edge other){ + return this.neighborhood.contains(other); + } + + /** + * + * @param index The index of the Edge to retrieve + * @return Edge The Edge at the specified index in this.neighborhood + */ + public Edge getNeighbor(int index){ + return this.neighborhood.get(index); + } + + + /** + * + * @param index The index of the edge to remove from this.neighborhood + * @return Edge The removed Edge + */ + Edge removeNeighbor(int index){ + return this.neighborhood.remove(index); + } + + /** + * + * @param e The Edge to remove from this.neighborhood + */ + public void removeNeighbor(Edge e){ + this.neighborhood.remove(e); + } + + + /** + * + * @return int The number of neighbors of this Vertex + */ + public int getNeighborCount(){ + return this.neighborhood.size(); + } + /** + * + * @return String The label of this Vertex + */ + public String getLabel(){ + return this.label; + } + /** + * + * @return String The full name of this Vertex + */ + public String getName(){ + return this.fullName; + } + /** + * + * @param Sequence A sequence + */ + public void 
setSequence(Sequence seq){ + this.seq = seq; + } + /** + * + * @return Sequence The sequence of this Vertex + */ + public Sequence getSequence(){ + return this.seq; + } + /** + * + * @return String A String representation of this Vertex + */ + public String toString(){ + return "Vertex " + label; + } + + /** + * + * @return The hash code of this Vertex's label + */ + public int hashCode(){ + return this.label.hashCode(); + } + + /** + * + * @param other The object to compare + * @return true iff other instanceof Vertex and the two Vertex objects have the same label + */ + public boolean equals(Object other){ + if(!(other instanceof Vertex)){ + return false; + } + + Vertex v = (Vertex)other; + return this.label.equals(v.label); + } + + /** + * + * @return ArrayList A copy of this.neighborhood. Modifying the returned + * ArrayList will not affect the neighborhood of this Vertex + */ + public ArrayList getNeighbors(){ + return new ArrayList(this.neighborhood); + } + +} + + diff --git a/src/dev/java/japsadev/bio/hts/newscarf/Alignment.java b/src/dev/java/japsadev/bio/hts/newscarf/Alignment.java new file mode 100644 index 0000000..41d04fe --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/Alignment.java @@ -0,0 +1,157 @@ +package japsadev.bio.hts.newscarf; + +import java.util.ArrayList; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import japsa.seq.Sequence; + +public class Alignment implements Comparable { + public final static int OVERHANG_THRES=1000; + public final static int GOOD_QUAL=60; + + public static int MIN_QUAL=30; //TODO: reduce this by doing self-correction + + int alignLength, quality; + + public String readID; + BidirectedNode node; + + public int refStart, refEnd; //1-based position on ref of the start and end of the alignment + + //Position on read of the start and end of the alignment (using the direction of read) + //readStart map to refStart, readEnd map to refEnd. 
+ //readStart < readEnd if strand = true, else readStart > readEnd + public int readStart = 0, readEnd = 0; + + //read length + public int readLength = 0; + + public boolean strand = true;//positive + public boolean prime = true;//primary alignment + public boolean goodMargin = false; + public boolean useful = false; + //SAMRecord mySam; + + ArrayList alignmentCigars = new ArrayList(); + + + //public int readLeft, readRight, readAlign, refLeft, refRight, refAlign; + //left and right are in the direction of the reference sequence + + public Alignment(SAMRecord sam, BidirectedNode node) { +// readID = Integer.parseInt(sam.getReadName().split("_")[0]); + readID = sam.getReadName(); + quality = sam.getMappingQuality(); + prime=!sam.getNotPrimaryAlignmentFlag(); + this.node = node; + + refStart = sam.getAlignmentStart(); + refEnd = sam.getAlignmentEnd(); + + Cigar cigar = sam.getCigar(); + boolean enterAlignment = false; + ////////////////////////////////////////////////////////////////////////////////// + + for (final CigarElement e : cigar.getCigarElements()) { + alignmentCigars.add(e); + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : //pad is a kind of clipped + if (enterAlignment) + readEnd = readLength; + readLength += length; + break; // soft clip read bases + case I : + case M : + case EQ : + case X : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + readLength += length; + break; + case D : + case N : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//case + }//for + if (readEnd == 0) + readEnd = readLength; + //these temporary variable to determine usefulness + int readLeft = readStart -1; + int readRight = readLength - readEnd; + + int refLeft = refStart - 1; + int refRight = ((Sequence) node.getAttribute("seq")).length() - 
refEnd; + + alignLength = refEnd + 1 - refStart; + if (sam.getReadNegativeStrandFlag()){ + strand = false; + //need to convert the alignment position on read the correct direction + readStart = 1 + readLength - readStart; + readEnd = 1 + readLength - readEnd; + } + + if ( + (readLeft < OVERHANG_THRES || refLeft < OVERHANG_THRES) && + (readRight < OVERHANG_THRES || refRight < OVERHANG_THRES) + ) + goodMargin=true; + + if ( goodMargin + //prime && //TODO: should be separated as another attribute for further consideration?? + && alignLength > BidirectedGraph.getKmerSize() //FIXME: + && quality >= MIN_QUAL + ) + useful = true; + + } + + + public int readAlignmentStart(){ + return Math.min(readStart,readEnd); + + } + + public int readAlignmentEnd(){ + return Math.max(readStart,readEnd); + } + + public String toString() { + return node.getAttribute("name") + + ": " + refStart + + " -> " + refEnd + + " / " + ((Sequence) node.getAttribute("seq")).length() + + " map to " + + readID + + ": " + readStart + + " -> " + readEnd + + " / " + readLength + + ", strand: " + (strand?"+":"-") + + ", prime: " + (prime?"yes":"no") + + ", margin: " + (goodMargin?"good":"bad"); + } + public static boolean isOverlap(Alignment alg1, Alignment alg2){ + boolean retval=false; + + + return retval; + } + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(Alignment o) { + return readAlignmentStart() - o.readAlignmentStart(); + } +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/BidirectedEdge.java b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedEdge.java new file mode 100644 index 0000000..0e262e5 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedEdge.java @@ -0,0 +1,176 @@ +package japsadev.bio.hts.newscarf; + +import org.graphstream.graph.Edge; +import org.graphstream.graph.Node; +import org.graphstream.graph.implementations.AbstractEdge; +import org.graphstream.graph.implementations.AbstractNode; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import japsa.seq.Sequence; + +public class BidirectedEdge extends AbstractEdge{ + protected boolean dir0, dir1;//true: outward, false: inward + private int length=-BidirectedGraph.getKmerSize();//length of the edge (distance between tips of seqs represented by 2 nodes) + private BidirectedPath path; //path that represents this edge. Should be a list if metagenome/multiploidy + + + private static final Logger LOG = LoggerFactory.getLogger(BidirectedEdge.class); + + protected BidirectedEdge(String id, AbstractNode src, AbstractNode dst, boolean dir0, boolean dir1) { + // id fuck off!!! we'll make one for ourselves + this(src,dst,dir0,dir1); + } + protected BidirectedEdge(AbstractNode src, AbstractNode dst, boolean dir0, boolean dir1) { + //this(createID(src,dst,dir0,dir1), src, dst); + + super(createID(src,dst,dir0,dir1),src,dst,false); + this.dir0=dir0; + this.dir1=dir1; + + path = new BidirectedPath(); + path.setRoot(src); + path.add(this); + } + + /* param id must have the form %s[o/i]%s[o/i], e.g. [1+2-]o[3]i + * the constructor will translate the id to the direction property + * of the bidirected edge + */ +// protected BidirectedEdge(String id, AbstractNode source, AbstractNode dest){ +// super(id, source, dest, false); +// String pattern = "^\\[([0-9\\+\\-]*)\\]([oi])\\[([0-9\\+\\-]*)\\]([oi])$"; +// // Create a Pattern object +// Pattern r = Pattern.compile(pattern); +// // Now create matcher object. 
+// Matcher m = r.matcher(id); +// String leftID, rightID, +// leftDir, rightDir; +// if(m.find()){ +// leftID=m.group(1); +// leftDir=m.group(2); +// rightID=m.group(3); +// rightDir=m.group(4); +// if(source.getId().equals(leftID)){ +// dir0=(leftDir.equals("o")?true:false); +// dir1=(rightDir.equals("o")?true:false); +// }else if(source.getId().equals(rightID)){ +// dir0=(rightDir.equals("o")?true:false); +// dir1=(leftDir.equals("o")?true:false); +// }else{ +// System.err.println("ID does not match"); +// System.exit(1); +// } +// } else{ +// System.err.println("Illegal ID for a bidirected edge (id must have the form node_id[o/i]node_id[o/i])"); +// System.exit(1); +// } +// +// } + + /* + * To adapt Graph class from GraphStream library (called by newInstance()) + */ + protected BidirectedEdge(String id, AbstractNode source, AbstractNode dest){ + super(id, source, dest, false); + + assert (source.getGraph() == dest.getGraph()):"Nodes come from different graph " + source.getGraph().getId() + " and " + dest.getGraph().getId(); + BidirectedGraph g = (BidirectedGraph) source.getGraph(); + path = new BidirectedPath(g,id); + BidirectedNode n0 = (BidirectedNode) path.getRoot(), + n1 = (BidirectedNode) path.peekNode(); + + BidirectedEdge firstEdge = (BidirectedEdge) path.getEdgePath().get(0), + lastEdge = (BidirectedEdge) path.peekEdge(); + if(n0.getId().equals(source.getId())){ + dir0 = firstEdge.getDir(n0); + dir1 = lastEdge.getDir(n1); + }else if(n0.getId().equals(dest.getId())){ + dir1 = firstEdge.getDir(n0); + dir0 = lastEdge.getDir(n1); + path = path.getReversedComplemented(); + }else{ + LOG.error("Path {} conflicts with src={} dst={}!", id, source.getId(), dest.getId()); + System.exit(1); + } + } + +// public static String createID(AbstractNode source, AbstractNode dst, boolean dir0, boolean dir1){ +// String srcDes = "["+source.getId()+"]"+(dir0 ? "o":"i"), +// dstDes = "["+dst.getId()+"]"+(dir1 ? 
"o":"i"); +// if(srcDes.compareTo(dstDes)<0) +// return String.format("%s%s", srcDes, dstDes); +// else +// return String.format("%s%s", dstDes, srcDes); +// } + + + //should have smt like this + public static BidirectedEdge makeEdge (AbstractNode src, AbstractNode dst, BidirectedPath path) { + return new BidirectedEdge(path.getId(), src, dst); + } + +// public BidirectedEdge getReversedComplemented(){ +// BidirectedEdge retval = makeEdge(getNode1(),getNode0(),path.getReversedComplemented()); +// +// return retval; +// } + + public void setPath(BidirectedPath path){ + this.path = path; + //here set the new length due to that path + Node curNode = path.getRoot(); + for(Edge e:path.getEdgePath()){ + curNode = e.getOpposite(curNode); + if(curNode == path.peekNode()) + break; + else{ + length+=((Sequence)curNode.getAttribute("seq")).length()-BidirectedGraph.getKmerSize(); + } + } + } + public BidirectedPath getPath(){ + return path; + } + + @Override + public String toString() { + return String.format("%s:%s-%s-%s-%s", getId(), source, (dir0?">":"<"), (dir1?"<":">"), target); + } + + public void setDir0(boolean dir){ + this.dir0=dir; + } + public void setDir1(boolean dir){ + this.dir1=dir; + } + public boolean getDir0(){ + return dir0; + } + public boolean getDir1(){ + return dir1; + } + //if kmer!=127 we need to set initial lengths of edges again (easy+tedious way) + public void changeKmerSize(int kmer){ + length=-kmer; + } + public int getLength(){ + return length; + } + + //TODO: include the case of tandem repeats + public boolean getDir(AbstractNode node){ + assert node==getSourceNode()||node==getTargetNode():"Node " + node.getId() + " does not belong to this edge src=" + getSourceNode().getId() + " dst=" + getTargetNode().getId(); + return node==getSourceNode()?getDir0():getDir1(); + } + + + public static String createID(AbstractNode source, AbstractNode dst, boolean dir0, boolean dir1){ + String srcDes = source.getId(), + dstDes = dst.getId(); + 
if(srcDes.compareTo(dstDes)<0) + return String.format("%s%s,%s%s", srcDes, (dir0?"+":"-"), dstDes, (dir1?"-":"+")); + else + return String.format("%s%s,%s%s", dstDes, (dir1?"+":"-"), srcDes, (dir0?"-":"+")); + } +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/BidirectedGraph.java b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedGraph.java new file mode 100644 index 0000000..a3556d3 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedGraph.java @@ -0,0 +1,586 @@ +package japsadev.bio.hts.newscarf; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + +import org.graphstream.graph.*; +import org.graphstream.graph.implementations.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + + +public class BidirectedGraph extends AdjacencyListGraph{ + static int kmer=127; + static final int TOLERATE=500; + static final int D_LIMIT=10000; + static final int S_LIMIT=50; + + private static final Logger LOG = LoggerFactory.getLogger(BidirectedGraph.class); + + // *** Constructors *** + /** + * Creates an empty graph. + * + * @param id + * Unique identifier of the graph. + * @param strictChecking + * If true any non-fatal error throws an exception. + * @param autoCreate + * If true (and strict checking is false), nodes are + * automatically created when referenced when creating a edge, + * even if not yet inserted in the graph. + * @param initialNodeCapacity + * Initial capacity of the node storage data structures. Use this + * if you know the approximate maximum number of nodes of the + * graph. The graph can grow beyond this limit, but storage + * reallocation is expensive operation. + * @param initialEdgeCapacity + * Initial capacity of the edge storage data structures. 
Use this + * if you know the approximate maximum number of edges of the + * graph. The graph can grow beyond this limit, but storage + * reallocation is expensive operation. + */ + public BidirectedGraph(String id, boolean strictChecking, boolean autoCreate, + int initialNodeCapacity, int initialEdgeCapacity) { + super(id, strictChecking, autoCreate); + // All we need to do is to change the node & edge factory + setNodeFactory(new NodeFactory() { + public BidirectedNode newInstance(String id, Graph graph) { + return new BidirectedNode((AbstractGraph) graph, id); + } + }); + + setEdgeFactory(new EdgeFactory() { + public BidirectedEdge newInstance(String id, Node src, Node dst, boolean directed) { //stupid?? + return new BidirectedEdge(id, (AbstractNode)src, (AbstractNode)dst); + } + }); + + } + + /** + * Creates an empty graph with default edge and node capacity. + * + * @param id + * Unique identifier of the graph. + * @param strictChecking + * If true any non-fatal error throws an exception. + * @param autoCreate + * If true (and strict checking is false), nodes are + * automatically created when referenced when creating a edge, + * even if not yet inserted in the graph. + */ + public BidirectedGraph(String id, boolean strictChecking, boolean autoCreate) { + this(id, strictChecking, autoCreate, DEFAULT_NODE_CAPACITY, + DEFAULT_EDGE_CAPACITY); + } + + /** + * Creates an empty graph with strict checking and without auto-creation. + * + * @param id + * Unique identifier of the graph. 
+ */ + public BidirectedGraph(String id) { + this(id, true, false, 10000, 100000); + } + + public BidirectedGraph(){ + this("Assembly graph",true,false, 10000, 100000); + setKmerSize(127);//default kmer size used by SPAdes to assembly MiSeq data + } + + //just to make AbstractGraph.removeEdge(AbstractEdge, boolean, boolean, boolean) visible + protected void removeEdgeDup(AbstractEdge edge, boolean graphCallback, + boolean sourceCallback, boolean targetCallback) { + this.removeEdge(edge, graphCallback, sourceCallback, targetCallback); + + } + protected BidirectedEdge addEdge(AbstractNode src, AbstractNode dst, boolean dir0, boolean dir1){ + BidirectedEdge tmp = addEdge(BidirectedEdge.createID(src, dst, dir0, dir1), src, dst); +// String s1=tmp.toString(); +// //tmp.setDir0(dir0); +// //tmp.setDir1(dir1); +// if(!s1.equals(tmp.toString())) +// System.out.println(s1 + " ---> " + tmp); + return tmp; + } + + public String printEdgesOfNode(BidirectedNode node){ + Iterator ins = getNode(node.getId()).getEnteringEdgeIterator(), + outs = getNode(node.getId()).getLeavingEdgeIterator(); + String retval=node.getId() + ": IN={"; + while(ins.hasNext()) + retval += ins.next().getId() + " "; + retval+="}; OUT={"; + while(outs.hasNext()) + retval += outs.next().getId() + " "; + retval+="}"; + return retval; + } + /********************************************************************************** + * ****************************Algorithms go from here***************************** + */ + //TODO: read from ABySS assembly graph (graph of final contigs, not like SPAdes) + private static double aveCov; //TODO: replaced with more accurate method + + public void loadFromFile(String graphFile) throws IOException{ + setAutoCreate(true); + setStrict(false); + //1. 
next iterate over again to read the connections + SequenceReader reader = new FastaReader(graphFile); + Sequence seq; + int shortestLen = 10000; + int totReadLen=0, totGenomeLen=0; + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + if(seq.length() 1){ + String[] nbList = adjList[1].split(","); + for(int i=0; i < nbList.length; i++){ + String neighbor = nbList[i]; + // note that the direction is read reversely in the dest node + boolean dir1=neighbor.contains("'")?true:false; + neighbor=neighbor.replaceAll("[^a-zA-Z0-9_.]", "").trim(); + + String neighborID = neighbor.split("_")[1]; + AbstractNode nbr = addNode(neighborID); + nbr.setAttribute("cov", Double.parseDouble(neighbor.split("_")[5])); + + addEdge(node, nbr, dir0, dir1); + //e.addAttribute("ui.label", e.getId()); + } + } + + } + + //rough estimation of kmer used + if((shortestLen-1) != getKmerSize()){ + setKmerSize(shortestLen-1); + for(Edge e:getEdgeSet()){ + ((BidirectedEdge)e).changeKmerSize(kmer); + } + } + + aveCov = (double)totReadLen/totGenomeLen; //double division: int/int silently truncated the average coverage + reader.close(); + + } + + + public static int getKmerSize(){ + return BidirectedGraph.kmer; + } + public static void setKmerSize(int kmer){ + BidirectedGraph.kmer=kmer; + } + + + /* + * Read paths from contigs.path and reduce the graph + */ +// public void readPathsFromSpades(String paths) throws IOException{ +// +// BufferedReader pathReader = new BufferedReader(new FileReader(paths)); +// +// String s; +// //Read contigs from contigs.paths and refer themselves to contigs.fasta +// boolean flag=false; +// while((s=pathReader.readLine()) != null){ +// if(s.contains("NODE")){ +// flag=s.contains("'")?false:true; +// continue; +// }else if(flag){ +// BidirectedPath path=new BidirectedPath(this, s); +//// System.out.println("Using path to reduce: " + path.getId()); +//// System.out.println("Before reduce => Node: " + getNodeCount() + " Edge: " + getEdgeCount()); +// +//// AbstractNode comp= +// +// this.reduce(path); +// +// +//// if(comp!=null){ +//// 
System.out.println("Reverting node: " + comp.getId()); +//// revert(comp); +//// System.out.println("After revert => Node: " + getNodeCount() + " Edge: " + getEdgeCount()); +//// +//// } +// } +// +// +// } +// pathReader.close(); +// } + +// /** +// * +// * @param p Path to be grouped as a virtually vertex +// */ +// public AbstractNode reduce(BidirectedPath p){ +// //do nothing if the path has only one node +// if(p==null || p.getEdgeCount()<1) +// return null; +// +// //now only work with path containing more than 2 unique nodes +// int uniqueCount=0; +// for(Node n:p.getEachNode()){ +// if(isUnique(n)) +// uniqueCount++; +// } +// if(uniqueCount < 2) +// { +// System.out.println("ignore path with less than 1 unique contig!"); +// return null; +// } +// //add the new composite Node to the graph +// //compare id from sense & anti-sense to get the unique one +// AbstractNode comp = addNode(p.getId().compareTo(p.getReversedComplemented().getId())>0? +// p.getReversedComplemented().getId():p.getId()); +// +// comp.addAttribute("path", p); +// comp.addAttribute("seq", p.spelling()); +// comp.addAttribute("ui.label", comp.getId()); +// comp.setAttribute("ui.style", "text-offset: -10;"); +// comp.setAttribute("ui.class", "marked"); +// try { Thread.sleep(100); } catch (Exception e) {} +// +// //store unique nodes on p for removing +// ArrayList tobeRemoved=new ArrayList(); +// for(Node n:p.getEachNode()){ +// if(isUnique(n)) +// tobeRemoved.add(n.getId()); +// } +// BidirectedNode start = (BidirectedNode) p.getRoot(), +// end = (BidirectedNode) p.peekNode(); +// boolean startDir = ((BidirectedEdge) p.getEdgePath().get(0)).getDir(start), +// endDir = ((BidirectedEdge) p.peekEdge()).getDir(end); +// //set neighbors of the composite Node +// Iterator startEdges = startDir?start.getEnteringEdgeIterator():start.getLeavingEdgeIterator(), +// endEdges = endDir?end.getEnteringEdgeIterator():end.getLeavingEdgeIterator(); +// while(startEdges.hasNext()){ +// BidirectedEdge e = 
(BidirectedEdge) startEdges.next(); +// BidirectedNode opNode = e.getOpposite(start); +// boolean opDir = e.getDir(opNode); +// //Edge tmp= +// addEdge(BidirectedEdge.createID(comp, opNode, false, opDir), comp, opNode);//always into start node +// //System.out.println("From " + start.getId() + ": " + tmp.getId() + " added!"); +// } +// +// while(endEdges.hasNext()){ +// BidirectedEdge e = (BidirectedEdge) endEdges.next(); +// BidirectedNode opNode = e.getOpposite(end); +// boolean opDir = e.getDir(opNode); +// //Edge tmp= +// addEdge(BidirectedEdge.createID(comp, opNode, true, opDir), comp, opNode);//always out of end node +// +// //System.out.println("From " + end.getId() + ": " + tmp.getId() + " added!"); +// +// } +// +// for(String nLabel:tobeRemoved){ +// //System.out.println("About to remove " + nLabel); +// removeNode(nLabel); +// } +// +// //TODO: remove bubbles... +// return comp; +// } +// +// +// /** +// * +// * @param v Node to be reverted (1-level reverting) +// */ +// public void revert(AbstractNode v){ +// System.out.println("Reverting..."); +// Path p=v.getAttribute("path"); +// if(p==null) return; +// +// BidirectedNode start = (BidirectedNode) p.getRoot(), +// end = (BidirectedNode) p.peekNode(); +// boolean startDir = ((BidirectedEdge) p.getEdgePath().get(0)).getDir(start), +// endDir = ((BidirectedEdge) p.peekEdge()).getDir(end); +// +// //add back all neighbor edges of this composite vertex +// Iterator startEdges = v.getEnteringEdgeIterator(), +// endEdges = v.getLeavingEdgeIterator(); +// //add back all nodes from the path +// for(Node n:p.getNodeSet()){ +// if(getNode(n.getId())!=null) +// continue; +// Node tmp = addNode(n.getId()); +// tmp.addAttribute("seq", (japsa.seq.Sequence)n.getAttribute("seq")); +// tmp.addAttribute("name", (String)n.getAttribute("name")); +// tmp.addAttribute("path", (BidirectedPath)n.getAttribute("path")); +// +// //System.out.println("Adding back edge "+tmp.getId()); +// } +// while(startEdges.hasNext()){ +// 
BidirectedEdge e = (BidirectedEdge) startEdges.next(); +// BidirectedNode opNode = e.getOpposite(v); +// boolean opDir = e.getDir(opNode); +// //Edge tmp = +// addEdge(BidirectedEdge.createID(start, opNode, !startDir, opDir), start, opNode); +// //System.out.println("Adding back edge "+tmp.getId()); +// } +// +// while(endEdges.hasNext()){ +// BidirectedEdge e = (BidirectedEdge) endEdges.next(); +// BidirectedNode opNode = e.getOpposite(v); +// boolean opDir = e.getDir(opNode); +// //Edge tmp = +// addEdge(BidirectedEdge.createID(end, opNode, !endDir, opDir), end, opNode); +// //System.out.println("Adding back edge "+tmp.getId()); +// } +// +// //add back all edges from the path +// for(Edge e:p.getEdgeSet()){ +// //Edge tmp = +// addEdge(e.getId(), e.getSourceNode().getId(), e.getTargetNode().getId()); +// //System.out.println("Adding back edge "+tmp.getId()); +// } +// //finally remove the composite node +// removeNode(v); +// } + + /* + * This function deduces a full path in this graph between 2 nodes aligned with a long read + */ + protected ArrayList getClosestPath(Alignment from, Alignment to, int distance){ + BidirectedNode srcNode = from.node, + dstNode = to.node; + System.out.println("Looking for path between " + srcNode.getId() + " to " + dstNode.getId() + " with distance " + distance); + BidirectedPath tmp = new BidirectedPath(); + ArrayList possiblePaths = new ArrayList(), + retval = new ArrayList(); + tmp.setRoot(srcNode); + + //traverse(tmp, dest, retval, distance+source.getSeq().length()+dest.getSeq().length()); + traverse(tmp, dstNode, possiblePaths, distance, from.strand, to.strand, 0); + //only get the best ones + if(possiblePaths.isEmpty()){ + //if a path couldn't be found between 2 dead-ends but alignments quality are insane high + //FIXME: return a pseudo path having an nanopore edge + if(isUnique(srcNode) && isUnique(dstNode) && srcNode.getDegree() == 1 && dstNode.getDegree()==1 && + Math.min(from.quality, to.quality) >= Alignment.GOOD_QUAL) + 
{ + BidirectedEdge pseudoEdge = new BidirectedEdge(srcNode, dstNode, from.strand, to.strand); + //TODO: save the corresponding content of long reads to this edge + pseudoEdge.setAttribute("pseudo", distance); + tmp.add(pseudoEdge); + retval.add(tmp); + System.out.println("pseudo path from " + srcNode.getId() + " to " + dstNode.getId()); +// HybridAssembler.promptEnterKey(); + return retval; + }else + return null; + } + double bestScore=possiblePaths.get(0).getDeviation(); + for(int i=0;i curResult, + int distance, boolean srcDir, boolean dstDir, int stepCount) + { + //stop if it's going too far! + if(stepCount >= S_LIMIT) + return; + + BidirectedNode currentNode=(BidirectedNode) path.peekNode(); + BidirectedEdge currentEdge; + boolean curDir;//direction to the next node, = ! previous' + + Iterator ite; + if(path.size() <= 1) //only root + curDir=srcDir;//re-check + else{ + currentEdge = (BidirectedEdge) path.peekEdge(); + curDir = !((BidirectedEdge) currentEdge).getDir(currentNode); + } + ite=curDir?currentNode.getLeavingEdgeIterator():currentNode.getEnteringEdgeIterator(); + + while(ite.hasNext()){ + BidirectedEdge e = ite.next(); + path.add(e); + + int toTarget=Math.abs(distance-e.getLength()); + if(e.getOpposite(currentNode)==dst && e.getDir(dst)!=dstDir && toTarget < TOLERATE){ + BidirectedPath curPath=curResult.isEmpty()?new BidirectedPath():curResult.get(0), //the best path saved among all possible paths from the list curResult + tmpPath=new BidirectedPath(path); + tmpPath.setDeviation(toTarget); + if( toTarget < curPath.getDeviation() ) + curResult.add(0, tmpPath); + else + curResult.add(tmpPath); + + System.out.println("Hit added: "+path.getId()+"(candidate deviation: "+toTarget+")"); + }else{ + int newDistance = distance - ((Sequence) e.getOpposite(currentNode).getAttribute("seq")).length() - e.getLength(); +// System.out.println("adding edge: " + e.getId() + " length=" + e.getLength() +" -> distance=" + newDistance); + if (newDistance - e.getLength() < 
-TOLERATE){ + System.out.println("Stop go to edge " + e.getPath() + " from path with distance "+newDistance+" already! : "+path.getId()); + }else + traverse(path, dst, curResult, newDistance, srcDir, dstDir, stepCount+1); //callee is one level deeper; stepCount++ passed the old value so the S_LIMIT guard never saw the real depth + } + path.popNode(); + + } + } + + /* + * Find a path based on list of Alignments + */ + public BidirectedPath pathFinding(ArrayList alignments) { + if(alignments.size()<=1) + return null; + + System.out.println("================================================="); + for(Alignment alg:alignments) + System.out.println("\t"+alg.toString()); + System.out.println("================================================="); + //First bin the alignments into different overlap regions + //only considering useful alignments + HashMap allAlignments = new HashMap(); + ArrayList joinPaths = new ArrayList(); + + for(Alignment alg:alignments){ + if(alg.useful){ + Range range = new Range(alg.readAlignmentStart(),alg.readAlignmentEnd()); + allAlignments.put(range, alg); + } + } + //now get all the bin in order + List baseRanges=new ArrayList(allAlignments.keySet()); + List> rangeGroups = MetaRange.getOverlappingGroups(baseRanges); + + if(rangeGroups.size() < 2) + return null; + + System.out.println("Binning ranges: "); + for(List group : rangeGroups){ + System.out.println(group); + } + + //iterate all alignments in adjacent bins to find correct path + ArrayList curGroup = new ArrayList(), + nextGroup = new ArrayList(); + List curRanges = rangeGroups.get(0); + for(Range r:curRanges) + curGroup.add(allAlignments.get(r)); + + for(int i=1; i nextRanges = rangeGroups.get(i); + nextGroup = new ArrayList(); + + for(Range r:nextRanges) + nextGroup.add(allAlignments.get(r)); + + ArrayList allPaths = new ArrayList(); + + + for(Alignment curAlg:curGroup){ + for(Alignment nextAlg:nextGroup){ + int distance = nextAlg.readAlignmentStart()-curAlg.readAlignmentEnd(); + if(distance bridges = getClosestPath(curAlg, nextAlg, distance); + if(bridges!=null) + allPaths.addAll(bridges); + } 
+ } + } + //join all paths from previous to the new ones + //TODO:optimize it + if(joinPaths.isEmpty()) + joinPaths=allPaths; + else{ + System.out.println("=====Current list of paths: " + joinPaths); + System.out.println("=====Join to list of paths: " + allPaths); + + for(BidirectedPath p:joinPaths) + for(BidirectedPath e:allPaths) + p.join(e); + + } + curGroup=nextGroup; + } + + for(BidirectedPath path:joinPaths) + System.out.println("A member Path: " + path.toString() + " deviation: " + path.getDeviation()); + if(joinPaths.isEmpty()) + return null; + else + return joinPaths.get(0); + } + + /* + * Important function: determine if a node is able to be removed or not + * TODO: re-implement it based on statistics of coverage also + * 1. pick the least coverage ones among a path as the base + * 2. global base + */ + public static boolean isUnique(Node node){ + boolean res = false; + + if(node.getDegree()<=2){ // not always true, e.g. unique node in a repetitive component + Sequence seq = node.getAttribute("seq"); + if(seq.length() > 10000 || node.getNumber("cov")/aveCov < 1.3) + res=true; + } + +// if(res) +// LOG.info(node.getAttribute("name") + " with coverage " + node.getNumber("cov") + " is a marker!"); +// else +// LOG.info(node.getAttribute("name") + " with coverage " + node.getNumber("cov") + " is NOT a marker!"); + + return res; + } + /* need more powerful function: + * A-statistics? + * Mixture of Poisson distributions??? Kalman filter idea... 
+ */ +// public static boolean isUnique(Node node, double cov){ +// boolean res = false; +// if(node.getDegree()<=2 || Math.abs(node.getAttribute("cov" )) < cov){ +//// if(((Sequence)node.getAttribute("seq")).length() > 5000 || node.getDegree()==0) +// res=true; +// } +// +// return res; +// } +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/BidirectedNode.java b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedNode.java new file mode 100644 index 0000000..0da20a4 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedNode.java @@ -0,0 +1,301 @@ +package japsadev.bio.hts.newscarf; + +import java.security.AccessControlException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.NoSuchElementException; + +import org.graphstream.graph.Edge; +import org.graphstream.graph.Node; +import org.graphstream.graph.implementations.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +/** + * Similar to {@link AdjacencyListNode} + * + */ +public class BidirectedNode extends AbstractNode { + protected int COPY=1; + + protected static final int INITIAL_EDGE_CAPACITY; + protected static final double GROWTH_FACTOR = 1.1; + + private static final Logger LOG = LoggerFactory.getLogger(BidirectedNode.class); + + static { + String p = "org.graphstream.graph.node.initialEdgeCapacity"; + int initialEdgeCapacity = 32; + try { + initialEdgeCapacity = Integer.valueOf(System.getProperty(p, "32")); + } catch (AccessControlException e) { + } + INITIAL_EDGE_CAPACITY = initialEdgeCapacity; + } + //edges are bidirected, here are 4 sub-types (name based on direction of the arrow relative to the corresponding node): + //note that neighbor edges here will treat their root node as *left* node, the opposite node as *right* node + protected static final byte OO_EDGE = 0b00; // src-->--<--dst + protected static final byte OI_EDGE = 0b01; // src-->-->--dst + protected static 
final byte IO_EDGE = 0b10; // src--<--<--dst + protected static final byte II_EDGE = 0b11; // src--<-->--dst + + protected BidirectedEdge[] edges;//fast access to edges knowing direction from src + protected int oStart, degree; + + protected HashMap> neighborMap; //fast access to edges knowing dst + // *** Constructor *** + + protected BidirectedNode(AbstractGraph graph, String id) { + super(graph, id); + edges = new BidirectedEdge[INITIAL_EDGE_CAPACITY]; + oStart = degree = 0; + neighborMap = new HashMap>( + 4 * INITIAL_EDGE_CAPACITY / 3 + 1); + } + + // *** Helpers *** + + protected byte edgeType(BidirectedEdge e) { + //return (byte) (e.getDir((AbstractNode) e.getOpposite(this))?0:1 + (e.getDir(this)?0:1)<<1); //cool but less efficient + BidirectedNode opposite = e.getOpposite(this); + if(e.getDir(this)) + if(e.getDir(opposite)) + return OO_EDGE; + else + return OI_EDGE; + else + if(e.getDir(opposite)) + return IO_EDGE; + else + return II_EDGE; + } + + @SuppressWarnings("unchecked") + protected BidirectedEdge locateEdge(Node opposite, byte type) { + List l = neighborMap.get(opposite); + if (l == null) + return null; + + for (BidirectedEdge e : l) { + if(type==edgeType(e)) + return e; + } + return null; + } + + protected void removeEdge(int i) { +// System.out.print("From node " + this.getId() + " remove edge number " + i); +// System.out.println(" from a list of edges: "); +// for(int j=0;j l = neighborMap.get(opposite); + l.remove(edges[i]); + if (l.isEmpty()) + neighborMap.remove(opposite); + //remove from the array + if (i >= oStart) { + edges[i] = edges[--degree]; + edges[degree] = null; + return; + } + + edges[i] = edges[--oStart]; + edges[oStart] = edges[--degree]; + edges[degree] = null; + + } + + // *** Callbacks *** + + @Override + protected boolean addEdgeCallback(AbstractEdge edge) { + //LOG.info("Adding edge callback " + edge.getId() + " from graph " + getGraph().getId()); + AbstractNode opposite = edge.getOpposite(this); + List l = 
neighborMap.get(opposite); + if (l == null) { + l = new LinkedList(); + neighborMap.put(opposite, l); + } + l.add((BidirectedEdge) edge); + + // resize edges if necessary + if (edges.length == degree) { + BidirectedEdge[] tmp = new BidirectedEdge[(int) (GROWTH_FACTOR * edges.length) + 1]; + System.arraycopy(edges, 0, tmp, 0, edges.length); + Arrays.fill(edges, null); + edges = tmp; + } + + byte type = edgeType((BidirectedEdge) edge); + + if (type <= OI_EDGE) { + edges[degree++] = (BidirectedEdge) edge; + return true; + } + + edges[degree++] = edges[oStart]; + edges[oStart++] = (BidirectedEdge) edge; + return true; + } + + @Override + protected void removeEdgeCallback(AbstractEdge edge) { + //LOG.info("Removing edge callback " + edge.getId() + " from graph " + getGraph().getId()); + + // locate the edge first + byte type = edgeType((BidirectedEdge) edge); + int i = 0; + if (type <= OI_EDGE) + i = oStart; + while (i < degree && edges[i] != edge) //strict bound: edges[degree] is past the last used slot and out of range when the array is full + i++; + if(i < degree){ //only remove iff edge is found + removeEdge(i); + } + } + + @Override + protected void clearCallback() { + Arrays.fill(edges, 0, degree, null); + oStart = degree = 0; + } + + // *** Access methods *** + + @Override + public int getDegree() { + return degree; + } + + @Override + public int getInDegree() { + return oStart; + } + + @Override + public int getOutDegree() { + return degree - oStart; + } + + @SuppressWarnings("unchecked") + @Override + public T getEdge(int i) { + if (i < 0 || i >= degree) + throw new IndexOutOfBoundsException("Node \"" + this + "\"" + + " has no edge " + i); + return (T) edges[i]; + } + + @SuppressWarnings("unchecked") + @Override + public T getEnteringEdge(int i) { + if (i < 0 || i >= getInDegree()) + throw new IndexOutOfBoundsException("Node \"" + this + "\"" + + " has no entering edge " + i); + return (T) edges[i]; + } + + @SuppressWarnings("unchecked") + @Override + public T getLeavingEdge(int i) { + if (i < 0 || i >= getOutDegree()) + throw new 
IndexOutOfBoundsException("Node \"" + this + "\"" + + " has no edge " + i); + return (T) edges[oStart + i]; + } + + // FIXME: I must override these stupid functions, let's just return random edge among 4 types!!! + @SuppressWarnings("unchecked") + @Override + public T getEdgeBetween(Node node) { + return (T) locateEdge(node, IO_EDGE); + } + + @SuppressWarnings("unchecked") + @Override + public T getEdgeFrom(Node node) { + return (T) locateEdge(node, OO_EDGE); + } + + @SuppressWarnings("unchecked") + @Override + public T getEdgeToward(Node node) { + return (T) locateEdge(node, II_EDGE); + } + + // *** Iterators *** + + protected class EdgeIterator implements Iterator { + protected int iPrev, iNext, iEnd; + //0:in, 1:out, other(2):all + protected EdgeIterator(int ori) { + iPrev = -1; + iNext = 0; + iEnd = degree; + if (ori==0) + iEnd = oStart; + else if(ori==1) + iNext = oStart; + +// System.out.println("Iterator " + ori + " of " + getId() + " from " + iNext + " to " + iEnd + " of"); +// for(int i=0;i= iEnd) + throw new NoSuchElementException(); + iPrev = iNext++; + return (T) edges[iPrev]; + } + + public void remove() { + if (iPrev == -1) + throw new IllegalStateException(); + AbstractEdge e = edges[iPrev]; + // do not call the callback because we already know the index + //graph.removeEdge(e); + ((BidirectedGraph)graph).removeEdgeDup(e, true, e.getSourceNode() != BidirectedNode.this, + e.getTargetNode() != BidirectedNode.this); + removeEdge(iPrev); + iNext = iPrev; + iPrev = -1; + iEnd--; + + } + } + + @Override + public Iterator getEdgeIterator() { + return new EdgeIterator(2); + } + + @Override + public Iterator getEnteringEdgeIterator() { + return new EdgeIterator(0); + } + + @Override + public Iterator getLeavingEdgeIterator() { + return new EdgeIterator(1); + } + +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/BidirectedPath.java b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedPath.java new file mode 100644 index 0000000..0ba5f55 --- /dev/null +++ 
b/src/dev/java/japsadev/bio/hts/newscarf/BidirectedPath.java @@ -0,0 +1,191 @@ +package japsadev.bio.hts.newscarf; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; + + +import java.util.List; + +import org.graphstream.graph.Edge; +import org.graphstream.graph.Node; +import org.graphstream.graph.Path; +import org.graphstream.graph.implementations.AbstractNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class BidirectedPath extends Path{ + int deviation; //how this path differ to long read data (todo: by multiple-alignment??) + private double coverage=-1; //representative coverage of this path (basically the lowest cov from its nodes) + private static final Logger LOG = LoggerFactory.getLogger(BidirectedPath.class); + + @Override + public void add(Edge edge) { + super.add(edge); + + double newCoverage = peekNode().getNumber("cov"); + if(coverage<=0) + coverage=newCoverage; + else + coverage=Math.min(coverage, newCoverage); + + } + + public BidirectedPath(){ + super(); + } + public BidirectedPath(BidirectedPath p){ + super(); + if(p!=null && !p.empty()){ + setRoot(p.getRoot()); + for(Edge e:p.getEdgePath()) + add(e); + } + if(p!=null) deviation=p.deviation; //null-safe like the copy above; an empty source path still passes its deviation on + } + + //This constructor is only used to load in contigs.path from SPAdes + //So no recursive path here (path contains all primitive edges) + public BidirectedPath(BidirectedGraph graph, String paths){ + super(); + paths=paths.replace(";", ","); //optimized it! 
+ String[] comps = paths.split(","); + if(comps.length<1) + return; + String curID = comps[0], nextID; + boolean curDir = curID.contains("+")?true:false, + nextDir; + BidirectedNode curNode = graph.getNode(curID.substring(0,curID.length()-1)), + nextNode; + setRoot(curNode); + for(int i=1; i edges = this.getEdgePath(); + for(int i = edges.size()-1; i>=0; i--) + rcPath.add(edges.get(i)); + return rcPath; + } + //It is not really ID because Path doesn't need an ID + public String getId(){ + //need to make the Id unique for both sense and antisense spelling??? + BidirectedNode curNode = (BidirectedNode) getRoot(); + if(getEdgeCount()<1) + return curNode.getId(); + + String retval=curNode.getId(), + curDir=((BidirectedEdge) getEdgePath().get(0)).getDir(curNode)?"+":"-"; + retval+=curDir; + for(Edge e:getEdgePath()){ + curNode=e.getOpposite(curNode); + retval+=","+curNode.getId(); + curDir=((BidirectedEdge) e).getDir(curNode)?"-":"+"; //note that curNode is target node + retval+=curDir; + } + + return retval.trim(); + } + + public String toString(){ + return "(" + getId() + ")"; + } + + public Sequence spelling(){ + + BidirectedNode curNode = (BidirectedNode) getRoot(); + Sequence curSeq = curNode.getAttribute("seq"); + + if(getEdgeCount()<1) + return curSeq; + + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, this.toString()); + boolean curDir=((BidirectedEdge) getEdgePath().get(0)).getDir(curNode); + curSeq = curDir?curSeq:Alphabet.DNA.complement(curSeq); + + seq.append(curSeq.subSequence(0, curSeq.length()-BidirectedGraph.getKmerSize())); + for(Edge edge:getEdgePath()){ + for(Edge e:((BidirectedEdge) edge).getPath().getEdgePath()){ + curNode=e.getOpposite(curNode); + curSeq= curNode.getAttribute("seq"); + curDir=!((BidirectedEdge) e).getDir(curNode); + curSeq = curDir?curSeq:Alphabet.DNA.complement(curSeq); + + seq.append(curSeq.subSequence(0, curSeq.length()-(curNode==peekNode()? 
+ 0:BidirectedGraph.getKmerSize()))); + } + } + return seq.toSequence(); + } + /* + * Add a path to the current path. The path to be added must start with the last node + * of the current path. + */ + public void join(BidirectedPath bridge) { + if(bridge==null || bridge.size() <=1) + return; + if(bridge.getRoot() != peekNode()){ + LOG.error("Cannot join path with disagreed first node " + bridge.getRoot().getId()); + return; + } + if(((BidirectedEdge) bridge.getEdgePath().get(0)).getDir((AbstractNode) bridge.getRoot()) + == ((BidirectedEdge) peekEdge()).getDir((AbstractNode) peekNode())){ + LOG.error("Conflict direction from the first node " + bridge.getRoot().getId()); + return; + } + //TODO: need a way to check coverage consistent + + + for(Edge e:bridge.getEdgePath()){ + add(e); + } + + coverage=Math.min(coverage, bridge.coverage); + } + + public int getDeviation(){ + return this.deviation; + } + public void setDeviation(int deviation){ + this.deviation=deviation; + } + + public double getCoverage(){ + return coverage; + } + /** + * + * @return average depth of this path + */ +// public double averageCov(){ +// int len=0; +// double res=0; +// for(Node n:getNodePath()){ +// if(BidirectedGraph.isUnique(n)){ +// Sequence seq = (Sequence) n.getAttribute("seq"); +// double cov = Double.parseDouble(seq.getName().split("_")[5]); +// len+=(n==getRoot())?seq.length():seq.length()-BidirectedGraph.getKmerSize(); +// res+=seq.length()*cov; +// } +// } +// return res/len; +// } + + public int length() { + int retval = 0; + for(Node n:getNodePath()){ + Sequence seq = (Sequence) n.getAttribute("seq"); + retval+=(n==getRoot())?seq.length():seq.length()-BidirectedGraph.getKmerSize(); + } + return retval; + } +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/GraphExplore.java b/src/dev/java/japsadev/bio/hts/newscarf/GraphExplore.java new file mode 100644 index 0000000..7f85338 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/GraphExplore.java @@ -0,0 +1,127 @@ 
+package japsadev.bio.hts.newscarf; + +import java.io.IOException; +import java.util.Iterator; +import org.graphstream.graph.*; +import japsa.seq.Sequence; + + +public class GraphExplore { + + public static String spadesFolder="/home/sonhoanghguyen/Projects/scaffolding/data/spades_3.7/"; //imb desktop +// public static String spadesFolder="/home/hoangnguyen/workspace/data/spades/"; //sony +// public static String spadesFolder="/home/s_hoangnguyen/Projects/scaffolding/test-graph/spades/"; //dell + + + public static void main(String args[]) { + try { + new GraphExplore(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + } + + public GraphExplore() throws IOException{ + //System.setProperty("org.graphstream.ui.renderer", "org.graphstream.ui.j2dviewer.J2DGraphRenderer"); +// String sample="EcK12S-careful"; +// String sample="Kp2146-careful"; +// String sample="meta-careful"; + String sample="cp_S5"; + + HybridAssembler ass = new HybridAssembler(spadesFolder+sample+"/assembly_graph.fastg"); + BidirectedGraph graph= ass.simGraph; +// graph.addAttribute("ui.quality"); +// graph.addAttribute("ui.antialias"); + graph.addAttribute("ui.stylesheet", styleSheet); + graph.addAttribute("ui.default.title", "New real-time hybrid assembler"); + + + graph.display(); + + System.out.println("Node: " + graph.getNodeCount() + " Edge: " + graph.getEdgeCount()); + + + for (Node node : graph) { +// node.addAttribute("ui.label", node.getId()); +// node.setAttribute("ui.style", "text-offset: -10;"); + if(BidirectedGraph.isUnique(node)) + node.setAttribute("ui.class", "marked"); + } + + //explore(graph.getNode("A")); + + /* + * Testing reduce function + */ + try { + HybridAssembler.promptEnterKey(); + ass.reduceFromSPAdesPaths(spadesFolder+sample+"/contigs.paths"); + HybridAssembler.promptEnterKey(); + ass.assembly(spadesFolder+sample+"/assembly_graph.sam"); + + } catch (IOException e) { + // TODO Auto-generated catch block + 
e.printStackTrace(); + } + + //TODO: thorough cleaning... should have flag dead for each node +// boolean dead=true; +// while(dead){ +// dead=false; +// for (Node node : graph) { +// if((node.getDegree() < 2) +//// || (node.getDegree()==0 && ((Sequence)node.getAttribute("seq")).length() < 1000) +// ){ +// graph.removeNode(node); +// dead=true; +// } +// +// } +// } + System.out.println("Node: " + graph.getNodeCount() + " Edge: " + graph.getEdgeCount()); + + /* + * Testing BidirectedEdge id pattern + */ +// String pattern = "^\\[([0-9\\+\\-]*)\\]([oi])\\[([0-9\\+\\-]*)\\]([oi])$"; +// // Create a Pattern object +// Pattern r = Pattern.compile(pattern); +// // Now create matcher object. +// String id="[3]i[4+8+]o"; +// Matcher m = r.matcher(id); +// +// if(m.find()){ +// System.out.println(m.group(1)+"|"+m.group(2)+"|"+m.group(3)+"|"+m.group(4)); +// } else +// System.out.println("Fuck"); + } + + public void explore(Node source) { + Iterator k = source.getBreadthFirstIterator(); + + while (k.hasNext()) { + Node next = k.next(); + next.setAttribute("ui.class", "marked"); + sleep(); + } + } + + protected void sleep() { + try { Thread.sleep(1000); } catch (Exception e) {} + } + + protected String styleSheet = + "node {" + + " fill-color: black;" + + "}" + + "node.marked {" + + " fill-color: red;" + + "}" + + "edge.marked {" + + " fill-color: red;" + + "}"; + +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/newscarf/GraphTest.java b/src/dev/java/japsadev/bio/hts/newscarf/GraphTest.java new file mode 100644 index 0000000..9d231c1 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/GraphTest.java @@ -0,0 +1,54 @@ +package japsadev.bio.hts.newscarf; + +import java.io.File; +import java.io.PrintWriter; +import java.util.Collections; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.graphstream.graph.Graph; +import org.graphstream.graph.implementations.AdjacencyListGraph; + +import htsjdk.samtools.*; + +public 
class GraphTest { + public static void main(String[] args) throws Exception { +// System.setProperty("org.graphstream.ui.renderer", +// "org.graphstream.ui.j2dviewer.J2DGraphRenderer"); +// Graph g = new AdjacencyListGraph("g"); +// g.addNode("A").addAttribute("xyz", new double[] { 0, 0 }); +// g.addNode("B").addAttribute("xyz", new double[] { 10, 10 }); +// +// g.addEdge("AB", "A", "B", false) +// .addAttribute( +// "ui.points", +// (Object) new double[] { 0, 0, 0, 0, 5, 0, 5, 10, 0, 10, +// 10, 0 }); +// +// g.addAttribute("ui.stylesheet", "edge {shape: polyline; }"); // or shape: cubic-curve +// +// g.display(false); + +// String data="/home/hoangnguyen/workspace/data/spades/EcK12S-careful/assembly_graph.fastg"; //sony +// +// final SamFileValidator validator=new SamFileValidator(new PrintWriter(System.out),8000); +// validator.setIgnoreWarnings(true); +// validator.setVerbose(true,1000); +// validator.setErrorsToIgnore(Collections.singletonList(SAMValidationError.Type.MISSING_READ_GROUP)); +// SamReaderFactory factory=SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT); +// SamReader samReader=factory.open(new File(data)); +// boolean ischeck=false; +// SAMRecordIterator iter = samReader.iterator(); +// System.out.println(iter.next()); +// +// samReader.close(); + + Pattern versionPattern = Pattern.compile("^Version:\\s(\\d+\\.\\d+\\.\\d+).*"); + String line="Version: 0.7.5a-r405"; + Matcher matcher =versionPattern.matcher(line); + if (matcher.find()){ + System.out.println(line); + System.out.println(matcher.group(1)); + } + } +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/HybridAssembler.java b/src/dev/java/japsadev/bio/hts/newscarf/HybridAssembler.java new file mode 100644 index 0000000..f25af64 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/HybridAssembler.java @@ -0,0 +1,308 @@ +package japsadev.bio.hts.newscarf; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import 
java.io.IOException; +import java.util.ArrayList; +import java.util.Scanner; + +import org.graphstream.graph.Edge; +import org.graphstream.graph.Node; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +public class HybridAssembler { + private static final Logger LOG = LoggerFactory.getLogger(HybridAssembler.class); + +// final BidirectedGraph origGraph; + public BidirectedGraph simGraph; //original and simplified graph should be separated, no??? + + public HybridAssembler(){ +// origGraph=new BidirectedGraph("batch"); + simGraph=new BidirectedGraph("real"); + } + + + public HybridAssembler(String graphFile) throws IOException{ + this(); +// origGraph.loadFromFile(graphFile); + simGraph.loadFromFile(graphFile); + } + + + public void assembly(String bamFile) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader; + if ("-".equals(bamFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + //ReadFilling readFilling = null; + ArrayList samList = new ArrayList();;// alignment record of the same read; + BidirectedPath p = new BidirectedPath(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + if (rec.getReadUnmappedFlag()) + continue; +// if (rec.getMappingQuality() < qual) +// continue; + + String refID = rec.getReferenceName().split("_")[1]; + Alignment myRec = new Alignment(rec, simGraph.getNode(refID)); //FIXME: optimize + + ////////////////////////////////////////////////////////////////// + // make list of alignments of the same (Nanopore) read. 
+ + //not the first occurrance + if (!readID.equals("") && !readID.equals(myRec.readID)) { + //Collections.sort(samList); + //p=origGraph.pathFinding(samList); + p=simGraph.pathFinding(samList); // the graph MUST be the same as from new Alignment(...) + + if(p!=null) + System.out.println("Final path found: " + p.getId()); + reduce(p); +// reduce2(p); + samList = new ArrayList(); + //readID = myRec.readID; + } + readID = myRec.readID; + samList.add(myRec); // FIXME: (optimize) insert sort here + + }// while + iter.close(); + + //outOS.close(); + reader.close(); + + } + /* + * Read paths from contigs.path and reduce the graph + */ + public void reduceFromSPAdesPaths(String paths) throws IOException{ + + BufferedReader pathReader = new BufferedReader(new FileReader(paths)); + + String s="", curpath=""; + //Read contigs from contigs.paths and refer themselves to contigs.fasta + boolean flag=false; + while((s=pathReader.readLine()) != null){ + if(s.contains("NODE")){ + if(flag){ + BidirectedPath path=new BidirectedPath(simGraph, curpath); + reduce(path); +// reduce2(path); + } + flag=s.contains("'")?false:true; + curpath=new String(); + continue; + }else if(flag){ + curpath+=s; + } + + + } + pathReader.close(); + } + + /** + * Another reduce that doesn't remove the unique nodes + * Instead redundant edges are removed on a path way + * @param p Path to simplify the graph (from origGraph) + * @param target Subjected graph for the simplification + */ + private void reduce(BidirectedPath p){ + //do nothing if the path has only one node + if(p==null || p.getEdgeCount()<1) + return; + + //loop over the edges of path (like spelling()) + BidirectedNode markerNode = null, + curNodeFromSimGraph = (BidirectedNode) p.getRoot(); + + BidirectedPath curPath= null; + boolean markerDir=true, curDir; + + if(BidirectedGraph.isUnique(curNodeFromSimGraph)){ + markerNode=curNodeFromSimGraph; + markerDir=((BidirectedEdge) p.getEdgePath().get(0)).getDir(markerNode); + curPath = new 
BidirectedPath(); + curPath.setRoot(curNodeFromSimGraph); + } + + + //search for an unique node as the marker. + ArrayList tobeRemoved = new ArrayList(), + tobeAdded = new ArrayList(); + for(Edge e:p.getEdgePath()){ + + curNodeFromSimGraph=e.getOpposite(curNodeFromSimGraph); + +// curNodeFromSimGraph = simGraph.getNode(curNodeFromOrigGraph.getId()); //change back to Node belong to simGraph (instead of origGraph) + curDir=((BidirectedEdge) e).getDir(curNodeFromSimGraph); + + if(BidirectedGraph.isUnique(curNodeFromSimGraph)){ + + if(markerNode!=null){ + //this is when we have 1 jumping path (both ends are markers) + curPath.add(e); +// LOG.info("Processing path {} with marker {}:{}:{} and curNode {}:{}:{}", curPath.getId(), markerNode.getId(), markerDir?"out":"in", markerNode.getGraph().getId(), curNodeFromSimGraph.getId(), curDir?"out":"in", curNodeFromSimGraph.getGraph().getId()); + //create an edge connect markerNode to curNode with curPath + //Edge reducedEdge = simGraph.addEdge(markerNode, curNodeFromSimGraph, markerDir, curDir); + BidirectedEdge reducedEdge = new BidirectedEdge(markerNode, curNodeFromSimGraph, markerDir, curDir); + +// if(reducedEdge!=null) +// reducedEdge.addAttribute("path", new BidirectedPath(curPath)); + + tobeAdded.add(reducedEdge); + + //loop over curPath to find out edges needed to be removed + Node n0 = curPath.getRoot(), + n1 = null; + for(Edge ep:curPath.getEdgePath()){ + n1 = ep.getOpposite(n0); + if(!BidirectedGraph.isUnique(n0) == BidirectedGraph.isUnique(n1)){ + tobeRemoved.add((BidirectedEdge)ep); + } + +// if(!BidirectedGraph.isUnique(n1)){ +// n1.setAttribute("cov", n1.getNumber("cov")-markerNode.getNumber("cov")); +// LOG.info("...coverage of " + n1.getAttribute("name") + " now is " + n1.getNumber("cov")); +// } + + n0=n1; + } + + } + + + markerNode=curNodeFromSimGraph; + markerDir=!curDir; //in-out, out-in + curPath= new BidirectedPath(); + curPath.setRoot(curNodeFromSimGraph); + } + else{ + if(markerNode!=null){ + 
curPath.add(e); + } + } + + } + + //remove appropriate edges + for(BidirectedEdge e:tobeRemoved){ + LOG.info("REMOVING EDGE " + e.getId() + " from " + e.getNode0().getGraph().getId() + "-" + e.getNode1().getGraph().getId()); + LOG.info("before: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + simGraph.removeEdge(e.getId()); + LOG.info("after: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + } + + //add appropriate edges + for(BidirectedEdge e:tobeAdded){ + LOG.info("ADDING EDGE " + e.getId()+ " from " + e.getNode0().getGraph().getId() + "-" + e.getNode1().getGraph().getId()); + LOG.info("before: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + + Edge reducedEdge = simGraph.addEdge(e.getSourceNode(),e.getTargetNode(),e.getDir0(),e.getDir1()); + if(reducedEdge!=null){ +// reducedEdge.addAttribute("ui.label", reducedEdge.getId()); +// reducedEdge.setAttribute("ui.style", "text-offset: -10; text-alignment: along;"); + reducedEdge.addAttribute("isReducedEdge", true); + reducedEdge.setAttribute("ui.class", "marked"); +// reducedEdge.addAttribute("layout.weight", 10); + } + LOG.info("after: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + + } + + } + /** + * Another reduce that doesn't need to know unique contig + * @param p Path to simplify the graph (from origGraph) + * @param target Subjected graph for the simplification + */ + private void reduce2(BidirectedPath p){ + //do nothing if the path has only one node + if(p==null || p.getEdgeCount()<1) + return; + double coverage = p.getCoverage(); + //loop over the edges of path (like spelling()) + BidirectedNode firstNode = (BidirectedNode) p.getRoot(), + lastNode = (BidirectedNode) p.peekNode(); + + boolean firstDir=((BidirectedEdge)p.getEdgePath().get(0)).getDir(firstNode), + 
lastDir=((BidirectedEdge)p.peekEdge()).getDir(lastNode); + + //search for an unique node as the marker. + ArrayList tobeRemoved = new ArrayList(); + BidirectedNode curNode = firstNode; + boolean curDir; + for(Edge e:p.getEdgePath()){ + + double curCoverage=curNode.getNumber("cov"), + nextCoverage=e.getOpposite(curNode).getNumber("cov"); + //if current node has the same coverage as path coverage + if(covLeft(curCoverage, coverage)==0 || covLeft(nextCoverage, coverage)==0){ + tobeRemoved.add((BidirectedEdge) e); + } + + curNode=e.getOpposite(curNode); + } + + //remove appropriate edges + for(BidirectedEdge e:tobeRemoved){ + LOG.info("REMOVING EDGE " + e.getId() + " from " + e.getNode0().getGraph().getId() + "-" + e.getNode1().getGraph().getId()); + LOG.info("before: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + simGraph.removeEdge(e.getId()); + LOG.info("after: \n\t" + simGraph.printEdgesOfNode(e.getNode0()) + "\n\t" + simGraph.printEdgesOfNode(e.getNode1())); + } + + //add appropriate edges + Edge reducedEdge = simGraph.addEdge(firstNode,lastNode,firstDir,lastDir); + + + } + private double covLeft(double cov, double pathCov){ + double retval=0; + //TODO: need statistics here... 
+ if((cov-pathCov)/pathCov > .2){ + retval=cov-pathCov; + } + return retval; + } + + + @SuppressWarnings("resource") + public static void promptEnterKey(){ + System.out.println("Press \"ENTER\" to continue..."); + Scanner scanner = new Scanner(System.in); + scanner.nextLine(); + } + + protected void sleep() { + try { Thread.sleep(1000); } catch (Exception e) {} + } + + public static void main(String[] argv) throws IOException{ + HybridAssembler hbAss = new HybridAssembler(GraphExplore.spadesFolder+"EcK12S-careful/assembly_graph.fastg"); + //For SAM file, run bwa first on the edited assembly_graph.fastg by running: + //awk -F '[:;]' -v q=\' 'BEGIN{flag=0;}/^>/{if(index($1,q)!=0) flag=0; else flag=1;}{if(flag==1) print $1;}' ../EcK12S-careful/assembly_graph.fastg > Eck12-careful.fasta + //TODO: need to make this easier + + hbAss.assembly(GraphExplore.spadesFolder+"EcK12S-careful/assembly_graph.sam"); + + } + +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/MetaRange.java b/src/dev/java/japsadev/bio/hts/newscarf/MetaRange.java new file mode 100644 index 0000000..95259e6 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/MetaRange.java @@ -0,0 +1,140 @@ +package japsadev.bio.hts.newscarf; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * A meta range is composed of sub ranges. Its left endpoint is + * the leftmost point of its subranges and its right endpoint is + * the rightmost point of its subranges. + */ +public class MetaRange extends Range{ + + /** + * meta Ranges are composed of lists of subRanges. 
+ */ + private List subranges; + + public MetaRange(Range initialSubRange){ + super(initialSubRange.getLeft(),initialSubRange.getRight()); + this.subranges = new ArrayList(); + this.subranges.add(initialSubRange); + } + + public void join(MetaRange other){ + verifyHomo(other); + + this.setLeft(Math.min(this.getLeft(),other.getLeft())); + this.setRight(Math.max(this.getRight(),other.getRight())); + this.subranges.addAll(other.getSubRanges()); + + other.invalidate(); + } + + public void setSubRanges(List subRanges){ + this.subranges = subRanges; + } + + public List getSubRanges(){ + return this.subranges; + } + + private void invalidate(){ + this.setSubRanges(null); + this.setLeft(0); + this.setRight(0); + } + + public boolean isInvalid(){ + return this.getSubRanges()==null; + } + + /** + * This code is retrieved from StackOverFlow: credits to augray :). + * From here on, the term "overlap" actually means ranges that + * belong to the same homo-group + * + * Given a list of Ranges, returns a list of Range groups where + * the Ranges in the groups all overlap. So if A overlaps with B and + * B with C, then A,B,and C will be returned in a group. Supposing Ranges + * D and E share nothing with A, B, or C, but do share with each other, D and E + * will be returned as a separate group from A,B,C. + * @param baseRanges + * @return + */ + public static List> getOverlappingGroups(List baseRanges){ + List baseMetaRanges = toMetaRanges(baseRanges); + + List mergedMetaRanges = getMergedMetaRanges(baseMetaRanges); + + List> RangeGroups = metaRangesToGroups(mergedMetaRanges); + return RangeGroups; + } + + + + private static List getMergedMetaRanges( + List metaRanges) { + if(metaRanges.isEmpty()){ + return metaRanges; + } + //order the MetaRanges by their starting point. + Collections.sort(metaRanges); + + //go through and merge the overlapping meta Ranges. 
+ //This relies on the logic that if Range i overlaps with + //an Range that started before it, then once all the Ranges + //before i have been merged, Range i will have a starting point + //consecutive to the starting point of the the preceeding Range. + for(int i=0; i< metaRanges.size()-1; i++){ + MetaRange thisRange = metaRanges.get(i); + MetaRange nextRange = metaRanges.get(i+1); + + if(thisRange.isHomo(nextRange)){ + nextRange.join(thisRange); + } + } + + List resultRanges = new ArrayList(); + + //All Ranges from the original list either: + //(a) didn't overlap with any others + //(b) overlapped with others and were chosen to represent the merged group or + //(c) overlapped with others, were represented in the group in another + // MetaRange, and then marked as invalid. + //Go through and only add the valid Ranges to be returned. + + for(MetaRange i : metaRanges){ + if(!i.isInvalid()){ + resultRanges.add(i); + } + } + return resultRanges; + } + + /** + * Convert a list of MetaRanges into groups of Ranges. 
+ * @param mergedMetaRanges + * @return + */ + private static List> metaRangesToGroups( + List mergedMetaRanges) { + List> groups = new ArrayList<>(mergedMetaRanges.size()); + for(MetaRange metaRange : mergedMetaRanges){ + groups.add(metaRange.getSubRanges()); + } + return groups; + } + + private static List toMetaRanges( + List baseRanges) { + ArrayList metaRanges = new ArrayList(baseRanges.size()); + for(Range i : baseRanges){ + metaRanges.add(new MetaRange(i)); + } + + return metaRanges; + } + +} diff --git a/src/dev/java/japsadev/bio/hts/newscarf/Range.java b/src/dev/java/japsadev/bio/hts/newscarf/Range.java new file mode 100644 index 0000000..dae84fd --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/newscarf/Range.java @@ -0,0 +1,77 @@ +package japsadev.bio.hts.newscarf; + +public class Range implements Comparable{ + static final double ER_UPPERBOUND=2; + + int left, right; + Range(){ + left=right=0; + } + Range(int left, int right){ + this.left=left; + this.right=right; + } + + public int getLeft(){ + return left; + } + public int getRight(){ + return right; + } + public void setLeft(int left){ + this.left=left; + } + public void setRight(int right){ + this.right=right; + } + + public boolean isHomo(Range other){ + int order=compareTo(other); + if(order==0) return true; + + boolean retval=false; + Range ref=(order<0?this:other), + qry=(order<0?other:this); + + if(ref.right-qry.left > ER_UPPERBOUND*BidirectedGraph.getKmerSize()) + retval=true; + else + retval=(qry.right<=ref.right); + + return retval; + } + + public void verifyHomo(Range other){ + if(!isHomo(other)){ + throw new IllegalStateException("Other range shouldn't belongs to the same group!"); + } + } + + @Override + public int compareTo(Range o) { + // TODO Auto-generated method stub + return left-o.left; + } + + public String toString(){ + return new String(left+" -> "+right); + } + @Override + public int hashCode() { + int retval=3; + retval=37*retval+left; + retval=37*retval+right; + return 
retval; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Range)) + return false; + if (obj == this) + return true; + + Range rhs = (Range) obj; + return left==rhs.getLeft()&&right==rhs.getRight(); + } +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/AlignmentRecord.java b/src/dev/java/japsadev/bio/hts/scaffold/AlignmentRecord.java new file mode 100644 index 0000000..b91cc21 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/AlignmentRecord.java @@ -0,0 +1,234 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 31/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + +import java.util.ArrayList; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; + +public class AlignmentRecord implements Comparable { + static final double matchCost = 0; + int score; + + public String readID; + Contig contig; + + public int refStart, refEnd; //1-based position on ref of the start and end of the alignment + + //Position on read of the start and end of the alignment (using the direction of read) + //readStart map to refStart, readEnd map to refEnd. 
+ //readStart < readEnd if strand = true, else readStart > readEnd + public int readStart = 0, readEnd = 0; + + //read length + public int readLength = 0; + + public boolean strand = true;//positive + public boolean useful = false; + //SAMRecord mySam; + + ArrayList alignmentCigars = new ArrayList(); + + + //public int readLeft, readRight, readAlign, refLeft, refRight, refAlign; + //left and right are in the direction of the reference sequence + + public AlignmentRecord(String readID, int refStart, int refEnd, int readLength, + int readStart, int readEnd, boolean strand, boolean useful, Contig contig, int score){ + this.readID = readID; + this.contig = contig; + this.refStart = refStart; + this.refEnd = refEnd; + + this.readLength = readLength; + this.readStart = readStart;//1-index + this.readEnd = readEnd;//1-index + this.strand = strand; + this.useful = useful; + this.contig = contig; + this.score = score; + } + public AlignmentRecord(SAMRecord sam, Contig ctg) { +// readID = Integer.parseInt(sam.getReadName().split("_")[0]); + if(!sam.getReferenceName().equals(ctg.getName())){ + System.err.println("Reference in SAM file doesn't agree with contigs name: " + + sam.getReferenceName() + " != " + ctg.getName()); + System.err.println("Hint: SAM file must resulted from alignment between long reads and contigs!"); + System.exit(1); + } + readID = sam.getReadName(); + + contig = ctg; + + //mySam = sam; + refStart = sam.getAlignmentStart(); + refEnd = sam.getAlignmentEnd(); + + Cigar cigar = sam.getCigar(); + boolean enterAlignment = false; + ////////////////////////////////////////////////////////////////////////////////// + + for (final CigarElement e : cigar.getCigarElements()) { + alignmentCigars.add(e); + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : //pad is a kind of clipped + if (enterAlignment) + readEnd = readLength; + readLength += length; + break; // soft clip read bases + case I : + case M : + case EQ : + 
case X : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + readLength += length; + break; + case D : + case N : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//case + }//for + if (readEnd == 0) + readEnd = readLength; + //these temporary variable to determine usefulness + int readLeft = readStart -1; + int readRight = readLength - readEnd; + + int refLeft = refStart - 1; + int refRight = contig.length() - refEnd; + score = refEnd + 1 - refStart; + if (sam.getReadNegativeStrandFlag()){ + strand = false; + //need to convert the alignment position on read the correct direction + readStart = 1 + readLength - readStart; + readEnd = 1 + readLength - readEnd; + } + + if ( + (readLeft < ScaffoldGraph.marginThres || refLeft < ScaffoldGraph.marginThres) && + (readRight < ScaffoldGraph.marginThres || refRight < ScaffoldGraph.marginThres) && + score > ScaffoldGraph.minContigLength + ) + useful = true; + + } + + + public int readAlignmentStart(){ + return Math.min(readStart,readEnd); + + } + + public int readAlignmentEnd(){ + return Math.max(readStart,readEnd); + } + + public String toString() { + return contig.index + + " " + refStart + + " " + refEnd + + " " + contig.length() + + " " + readStart + + " " + readEnd + + " " + readLength + + " " + strand; + } + + public String pos() { + return + refStart + + " " + refEnd + + " " + contig.length() + + " " + readStart + + " " + readEnd + + " " + readLength + + " " + score + + " " + strand + ; + } + // return same alignment but with reversed read + //TODO: change to object self-editing function? 
+ public AlignmentRecord reverseRead(){ + AlignmentRecord revAlign = new AlignmentRecord(readID, refStart, refEnd, readLength, + readLength - readStart + 1, readLength - readEnd + 1, !strand, useful, contig, score); + + revAlign.alignmentCigars = alignmentCigars; + + return revAlign; + } + public AlignmentRecord clones(){ + AlignmentRecord align = new AlignmentRecord(readID, refStart, refEnd, readLength, + readStart, readEnd, strand, useful, contig, score); + + align.alignmentCigars = alignmentCigars; + + return align; + } + public void copy(AlignmentRecord rec){ + readID = rec.readID; + contig = rec.contig; + refStart = rec.refStart; + refEnd = rec.refEnd; + + readLength = rec.readLength; + readStart = rec.readStart;//1-index + readEnd = rec.readEnd;//1-index + strand = rec.strand; + useful = rec.useful; + alignmentCigars = rec.alignmentCigars; + contig = rec.contig; + score = rec.score; + } + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(AlignmentRecord o) { + return readAlignmentStart() - o.readAlignmentStart(); + } +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Contig.java b/src/dev/java/japsadev/bio/hts/scaffold/Contig.java new file mode 100644 index 0000000..9323278 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Contig.java @@ -0,0 +1,250 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 20/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + +import java.util.ArrayList; +import java.util.stream.IntStream; + +import japsa.seq.Sequence; +import japsa.util.Logging; +import japsa.seq.JapsaFeature; + +public class Contig{ + int index; + ScaffoldVector myVector;//relative position to the head contig of my scaffold + Sequence contigSequence;//the sequence of the contig + double coverage = 1.0; + int head = -1; //point to the index of its head contig in the scaffold + double prevScore=0, nextScore=0; + int cirProb = -1; //measure how likely the contig itself is circular + + int[] isMapped; //which bases is mapped by any long reads + //for annotation + ArrayList genes, //genes list + oriRep, //origin of replication: indicator of plasmid for bacteria + insertSeq, //Insertion Sequence + resistanceGenes; //list of antibiotic resistance genes found in this contig + //a contig is composed of edges from assembly graph + + static Graph asGraph=null; + public static void setGraph(Graph g){ + asGraph=g; + } + public static boolean hasGraph(){ + return asGraph!=null; + } + + ArrayList paths; + + public Contig(int index, Sequence seq){ + this.index = index; + contigSequence = seq; + isMapped = new int[seq.length()]; + + myVector = new ScaffoldVector(0,1); + + genes = new ArrayList(); + oriRep = new ArrayList(); + insertSeq = new ArrayList(); + resistanceGenes = new ArrayList(); + + paths = new ArrayList(); + } + + + public Contig clone(){ + Contig ctg = new Contig(this.index, this.contigSequence); + ctg.coverage = coverage; + + ctg.head = this.head; //update later + ctg.cirProb = this.cirProb; + ctg.isMapped = this.isMapped; + + ctg.genes = this.genes; + ctg.oriRep = this.oriRep; + ctg.insertSeq = 
this.insertSeq; + ctg.resistanceGenes = this.resistanceGenes; + + ctg.paths = new ArrayList(); + for(Path p:paths) + ctg.paths.add(p); + + return ctg; + } + // Get features in an interval of contig + public ArrayList getFeatures(ArrayList features, int start, int end){ + + ArrayList remainFeatures = new ArrayList(); + boolean isReverse= (start>end)?true:false; + for(JapsaFeature feature:features){ + JapsaFeature cutFeature=feature.cloneFeature(); + int fstart = feature.getStart(), + fend = feature.getEnd(); + + //find overlap + if(Integer.signum(fstart-start)*Integer.signum(fstart-end) <= 0){ + if(Integer.signum(fend-start)*Integer.signum(fend-end) > 0){ + fend = (Math.abs(fend-start) < Math.abs(fend-end))?start:end; + } + }else{ + fstart = (Math.abs(fstart-start) < Math.abs(fstart-end))?start:end; + if(Integer.signum(start-fend)*Integer.signum(start-fstart) <= 0 && Integer.signum(end-fend)*Integer.signum(end-fstart) <= 0) + fend = (Math.abs(fend-start) < Math.abs(fend-end))?start:end; + else if(Integer.signum(start-fend)*Integer.signum(start-fstart) > 0 && Integer.signum(end-fend)*Integer.signum(end-fstart) > 0) + continue; + } + //if the contig is reversed complement + if(isReverse){ + int ostart = fstart; + fstart= this.length() - fend; + fend = this.length() - ostart; + if(cutFeature.getStrand() == '+') + cutFeature.setStrand('-'); + else + cutFeature.setStrand('+'); + } + + cutFeature.setStart(fstart); + cutFeature.setEnd(fend); + double cutRate=(float) Math.abs(cutFeature.getLength())/Math.abs(feature.getLength()); + if(cutRate > .9){ + cutFeature.setScore(feature.getScore()*cutRate); + remainFeatures.add(cutFeature); + + } + } + + return remainFeatures; + } + + //get the SPAdes name (out of MicroManage name maybe) + public String getName(){ + return contigSequence.getName(); + } + public String getDesc(){ + return contigSequence.getDesc(); + } + + public int getIndex(){ + return index; + } + //actually a backward composite + public void 
composite(ScaffoldVector aVector){ + myVector = ScaffoldVector.composition(myVector, aVector); + } + /** + * Relative position to the head of the scaffold + * @return + */ + public int getRelPos(){ + return myVector.magnitude; + } + + public int getRelDir(){ + return myVector.direction; + } + + /** + * Get the left most position if transpose by vector trans + * @return + */ + public int leftMost(ScaffoldVector trans){ + return trans.magnitude - ((trans.direction > 0)?0:length()); + } + + /** + * Get the right most position if transpose by vector trans + * @return + */ + public int rightMost(ScaffoldVector trans){ + return trans.magnitude + ((trans.direction > 0)?length():0); + } + + /** + * Get the left most position + * @return + */ + public int leftMost(){ + return leftMost(myVector); + } + + /** + * Get the right most position + * @return + */ + public int rightMost(){ + return rightMost(myVector); + } + + public boolean isCircular(){ + return !ScaffoldGraph.eukaryotic && (cirProb > 1); + } + + public ScaffoldVector getVector(){ + return myVector; + } + + public int length(){ + return contigSequence.length(); + } + + public double getCoverage(){ + return coverage; + } + + + public boolean isMapped(){ + int sum = IntStream.of(isMapped).sum(); + return ((double)sum/length()) > .8; + } + /* + * Operators related to Path + */ + public ArrayList getPaths(){ + return paths; + } + public void setPath(Path path){ + this.paths.add(path); + } + + public void setCoverage(double cov){ + coverage = cov; + } + public String toString(){ + return new String(" contig" + getIndex()); + } + + +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/scaffold/ContigBridge.java b/src/dev/java/japsadev/bio/hts/scaffold/ContigBridge.java new file mode 100644 index 0000000..9689602 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/ContigBridge.java @@ -0,0 +1,1358 @@ +/***************************************************************************** + * Copyright 
(c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 19/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + + + +import htsjdk.samtools.CigarElement; +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; + +import japsa.bio.np.ErrorCorrection; + +/** + * Create a bridge that connects two contigs. The bridge can be ranked based + * on the confidence so that more confident bridge is used priorly. + * Note that two contigs can have more than one bridge from circular + * sequence or false positives. + * @author minhduc + * + */ + +public class ContigBridge implements Comparable{ + + Contig firstContig, secondContig; + final String hashKey; + final int orderIndex; + + private double score = 0;//more is better + private ScaffoldVector transVector = null; + private Connection connection = null;// the representative connection of this bridge + private int numOfConnections = 0; + private Path bridgePath=null; + + + final static int SEARCH_THRES=300; + protected static boolean forceFill=false; + + public static void forceFilling(){ + forceFill=true; + } + public static void relaxFilling(){ + forceFill=false; + } + + + public ContigBridge(Contig c1, Contig c2, int ind){ + firstContig = c1; + secondContig = c2; + orderIndex = ind; + hashKey = makeHash(c1.index,c2.index, orderIndex); + + } + /** + * Re-assign the two contigs + * @param first + * @param second + */ + public ContigBridge clone(Contig first, Contig second){ + ContigBridge dolly=new ContigBridge(first,second,orderIndex); + dolly.bridgePath=bridgePath; + dolly.transVector=transVector; + 
dolly.score=score; + dolly.connection=connection; + dolly.numOfConnections=numOfConnections; + return dolly; + + } + public static String makeHash(int aIndex, int bIndex, int order){ + return aIndex+"#"+bIndex + "#" + order; + } + + public boolean consistentWith(ScaffoldVector aVector){ + int tolerance = firstContig.getIndex()==secondContig.getIndex()?100:250; + return (aVector.direction == transVector.direction) + && (((aVector.magnitude * 1.0 / transVector.magnitude > 0.75) + && (aVector.magnitude * 1.0 / transVector.magnitude < 1.25)) + || (Math.abs(aVector.magnitude-transVector.magnitude) < tolerance) + ) + ; + } +/** + * To add one more connection supporting this bridge. For memory efficient, only + * store the best ones. + * @param readSequence + * @param firstAlignment + * @param secondAlignment + * @param trans + * @param sc + * @return + */ + public double addConnection(ReadFilling readSequence, + AlignmentRecord firstAlignment, + AlignmentRecord secondAlignment, + ScaffoldVector trans, + double sc){ + + Connection newConnect = new Connection(readSequence, firstAlignment,secondAlignment,trans); + numOfConnections++; + //the metric for bridge score is important! + //score = score>sc?score:sc; + score += sc; + + if(connection == null || connection.gapsBetween() > newConnect.gapsBetween()) { + transVector = trans; + connection=newConnect; + } + + + return score; + } + + /* + * Try to find a path that connect firstContig to secondContig, + * based on the transVector. + * Only being invoked from Scaffold.viewSequence()?? Yes! + */ + private Connection updatePath(){ + //only go if assembly graph is specified! + if(Contig.asGraph == null) + return null; + + int d=transVector.distance(firstContig, secondContig); + + //if the distance is too long, we should wait for more long reads coming in + if(!forceFill && d>SEARCH_THRES) + return null; + + //not doing it again + //FIXME: remove this when implement progressive taxa-typing based on long reads... 
+ if(bridgePath!=null) + return path2Connection(bridgePath); + + if(ScaffoldGraph.verbose){ + System.out.println("Trying to find path that connect " + firstContig.getName() + "("+ (firstContig.getRelDir()>0?"F":"R") + ")" + + " to " + secondContig.getName() + "("+ (secondContig.getRelDir()>0?"F":"R") + ")"); + } + Node tip1, tip2; + ArrayList firstPathList=firstContig.getPaths(), + secondPathList=secondContig.getPaths(), + candidates=new ArrayList(); + + if(firstContig.getRelDir()>0) + tip1 = firstPathList.get(firstPathList.size()-1).getEnd(); + else + tip1 = firstPathList.get(0).rc().getEnd(); + + if(secondContig.getRelDir()>0) + tip2 = secondPathList.get(0).getStart(); + else + tip2 = secondPathList.get(secondPathList.size()-1).rc().getStart(); + + + candidates.addAll(Contig.asGraph.DFS(tip1, tip2, d)); + + Collections.sort(candidates); + + /** + * Using poa to find best candidate regarding long reads data. + */ + String bestMatch=null; + + if(candidates.isEmpty()) + return null; +// else if(candidates.size()==1) +// bridgePath=candidates.get(0); +// else{ +// ArrayList allSeq=new ArrayList(); +// Collections.sort(connections); +// allSeq.addAll(connections); +// for(Path p:candidates) +// allSeq.add(path2Connection(p)); +// +// ArrayList readList = new ArrayList(connections.size()); +// // locate the offset points on two contigs. 
Note: 1-based due to the fuking htsjdk.samtools +// int cutOnFirstContig=firstContig.getRelDir()>0?(firstContig.length()):1, +// cutOnSecondContig=secondContig.getRelDir()>0?1:(secondContig.length()); +// +// for (Connection connection:allSeq){ +// int firstCutOnRead=mapToRead(cutOnFirstContig, connection.firstAlignment), +// secondCutOnRead=mapToRead(cutOnSecondContig, connection.secondAlignment); +// Sequence tmp = null; +// try{ +// ReadFilling tmpRead = connection.read; +// if (firstCutOnRead > secondCutOnRead){ +// connection.read = connection.read.reverse(); +// connection.firstAlignment=connection.firstAlignment.reverseRead(); +// connection.secondAlignment=connection.secondAlignment.reverseRead(); +// firstCutOnRead = tmpRead.readSequence.length()-firstCutOnRead+1; +// secondCutOnRead = tmpRead.readSequence.length()-secondCutOnRead+1; +// } +// +// tmp = connection.read.readSequence.subSequence(firstCutOnRead-1, secondCutOnRead-1); +// tmp.setName(tmpRead.readSequence.getName()); +// tmp.setDesc(tmpRead.readSequence.getDesc()); +// readList.add(tmp); +// } +// catch(Exception e){ +// e.printStackTrace(); +// System.err.println("Failed attempt to extract (" + firstCutOnRead + ", " + secondCutOnRead +// + ") from sequence with length " + connection.read.readSequence.length()); +// } +// } +// +// try { +// +// String faiFile = hashKey + "_ai.fasta";//name of input fasta file +// String faoFile = hashKey + "_ao_pir.fasta";//name of output +// { +// SequenceOutputStream faiSt = SequenceOutputStream.makeOutputStream(faiFile); +// for (Sequence seq:readList){ +// Logging.info(seq.getName() + " " + seq.length()); +// seq.writeFasta(faiSt); +// } +// faiSt.close(); +// } +// +// //2.0 Run multiple alignment +// { +// String cmd = "/home/s.hoangnguyen/Tools/poaV2/poa -read_fasta " + faiFile + " -pir " + faoFile + " -hb -best /home/s.hoangnguyen/Tools/poaV2/blosum80.mat"; +// //String cmd = "/home/s.hoangnguyen/Tools/poaV2/poa -read_fasta " + faiFile + " -clustal 
clustal_" + faoFile + " -hb -best /home/s.hoangnguyen/Tools/poaV2/blosum80.mat"; +// +// +// Logging.info("Running " + cmd); +// Process process = Runtime.getRuntime().exec(cmd); +// process.waitFor(); +// Logging.info("Done " + cmd); +// } +// +// FastaReader reader = new FastaReader(faoFile); +// bestMatch=reader.nextSequence(Alphabet.DNA()).getName(); +// reader.close(); +// +// +// } catch (Exception e) { +// e.printStackTrace(); +// System.err.println("Can not generate consensus sequence!"); +// } +// for(Path p:candidates){ +// if(p.getID().equals(bestMatch)){ +// bridgePath=p; +// break; +// } +// +// } +// } + + + if(bestMatch==null){ + //System.out.println("Not found a stand-out path! Pick the first one."); + bridgePath=candidates.get(0); + } + + //now make change to the transVector to fit the bridgePath + int newDistance=bridgePath.length-bridgePath.getStart().getSeq().length()-bridgePath.getEnd().getSeq().length(); //distance between two closest tips of two connecting Nodes + transVector.setMagnitute(transVector.getMagnitute()+(newDistance-d)*Integer.signum(firstContig.getRelDir())); + //check if this is not the close bridge of the scaffold + if(consistentWith(ScaffoldVector.composition(secondContig.myVector, ScaffoldVector.reverse(firstContig.myVector)))) + secondContig.myVector=ScaffoldVector.composition(transVector, firstContig.myVector); + return path2Connection(bridgePath); + + } + /** + * Get an artifact connection out of current bridgePath + * @return Connection: corresponding connection + */ + private Connection path2Connection(Path p){ + Node tip1=p.getStart(), + tip2=p.getEnd(); + int d=transVector.distance(firstContig, secondContig); + + + Connection retval=null; + if(!p.isEmpty()){ + // Convert bridgePath to a Connection + String readID=p.getID(); + Sequence seq=p.spelling(); + int refStart, refEnd, readLength=seq.length(), readStart, readEnd, score=Integer.MAX_VALUE; + boolean strand, useful=true; + + 
refStart=firstContig.length()-tip1.getSeq().length()+1; refEnd=firstContig.length(); + readStart=1; readEnd=tip1.getSeq().length(); + strand=true; + if(firstContig.getRelDir()<0){ + refStart=1; refEnd=tip1.getSeq().length(); + int tmp=readStart; + readStart=readEnd; + readEnd=tmp; + strand=false; + } + AlignmentRecord firstAlignment=new AlignmentRecord(readID, refStart, refEnd, readLength, readStart, readEnd, strand, useful, firstContig, score); + + refStart=1; refEnd=tip2.getSeq().length(); + readStart=readLength-tip2.getSeq().length()+1; readEnd=readLength; + strand=true; + if(secondContig.getRelDir()<0){ + refStart=secondContig.length()-tip2.getSeq().length()+1; refEnd=secondContig.length(); + int tmp=readStart; + readStart=readEnd; + readEnd=tmp; + strand=false; + } + AlignmentRecord secondAlignment=new AlignmentRecord(readID, refStart, refEnd, readLength, readStart, readEnd, strand, useful, secondContig, score); + ArrayList list = new ArrayList(); + list.add(firstAlignment); + list.add(secondAlignment); + ReadFilling read = new ReadFilling(seq,list); + + //now make change to the transVector to fit the bridgePath + int newDistance=p.length-tip1.getSeq().length()-tip2.getSeq().length(); //distance between two closest tips of two connecting Nodes + + retval=new Connection( read, firstAlignment, secondAlignment, + new ScaffoldVector(transVector.getDirection(), transVector.getMagnitute()+(newDistance-d)*Integer.signum(firstContig.getRelDir()))); + } + return retval; + + } + + /** + * @return the score + */ + public double getScore() { + return score; + } + + //NOTE: magnitude usually doesn't help for bridges with repeat. + // E.g. 
<--===---------------> prev not next for the both + public void setContigScores(){ + int firstPointer = 0, + secondPointer = 0; + + if(transVector.magnitude < 0){ + firstPointer=-1; + if(transVector.direction < 0) + secondPointer=-1; + else + secondPointer=1; + } + // special case: magnitude < firstContig.length() && transVector.direction < 0; + else if(transVector.magnitude < firstContig.length() && transVector.direction < 0){ + firstPointer = secondPointer = -1; + } + else{ + + firstPointer=1; + if(transVector.direction > 0) + secondPointer=-1; + else + secondPointer=1; + } + //reset based on the pointers + if(firstPointer > 0){ + firstContig.nextScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set nextScore of %s to %.2f\n", firstContig.getName(), score); + }else{ + firstContig.prevScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set prevScore of %s to %.2f\n", firstContig.getName(), score); + } + + if(secondPointer > 0){ + secondContig.nextScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set nextScore of %s to %.2f\n", secondContig.getName(), score); + }else{ + secondContig.prevScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set prevScore of %s to %.2f\n", secondContig.getName(), score); + } + + } + // when contig bridge is removed, reset the scores + public void resetContigScores(){ + int firstPointer = 0, + secondPointer = 0; + if(ScaffoldGraph.verbose) + System.out.print("Trans vector " + transVector + " :" ); + if(transVector.magnitude < 0){ + firstPointer=-1; + if(transVector.direction < 0) + secondPointer=-1; + else + secondPointer=1; + } + // special case: magnitude < firstContig.length() && transVector.direction < 0; + else if(transVector.magnitude < firstContig.length() && transVector.direction < 0){ + firstPointer = secondPointer = -1; + } + else{ + firstPointer=1; + if(transVector.direction > 0) + secondPointer=-1; + else + secondPointer=1; + } + //reset based on the pointers + 
if(firstPointer > 0){ + firstContig.nextScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("...reset nextScore of %s to 0, ", firstContig.getName()); + }else{ + firstContig.prevScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("...reset prevScore of %s to 0, ", firstContig.getName()); + } + + if(secondPointer > 0){ + secondContig.nextScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("reset nextScore of %s to 0\n", secondContig.getName()); + }else{ + secondContig.prevScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("reset prevScore of %s to 0\n", secondContig.getName()); + } + + } + /** + * @return the transVector + */ + public ScaffoldVector getTransVector() { + return transVector; + } + /** + * @return the connections + */ + public int getNumOfConnections() { + return numOfConnections; + } + /* + * @return the equivalent path + */ + public Path getBridgePath(){ + return bridgePath; + } + + public Connection getBestConnection() throws IOException{ + if(ScaffoldGraph.verbose) + System.out.println("Finding best connection for bridge "+ hashKey + ":"); + + Connection gapsBestConnection = updatePath(); + if(gapsBestConnection==null){ + if(ScaffoldGraph.verbose) + System.out.println("Path not found! Use representative connection instead..."); + + gapsBestConnection = connection; + + } else if(ScaffoldGraph.verbose) + System.out.println("Found path("+bridgePath.length+"): "+bridgePath); + + if(ScaffoldGraph.verbose) + gapsBestConnection.display(); + + return gapsBestConnection; + + } + + /** + * Return the position on the reference that corresponds to a given position + * on read. 
+ * + * @param posInRead + * @param record + * @return + */ + static int positionOnRef(int readLookingPositon, AlignmentRecord record){ + if(ScaffoldGraph.verbose) + System.out.println("...locating position on reference of read's position " + readLookingPositon + + "(" + record.readAlignmentStart() + "," + record.readAlignmentEnd() + ")"); + if (readLookingPositon < record.readAlignmentStart() || readLookingPositon > record.readAlignmentEnd()) + return 0; + + if (!record.strand) + readLookingPositon = record.readLength - readLookingPositon + 1; // use direction of ref (forward) + + + int posOnRead = record.strand?record.readStart:(record.readLength + 1 - record.readStart); + int posOnRef = record.refStart; + //assert pos <= posOnRead + + if(record.alignmentCigars.isEmpty()){ //perfect alignment made by overlapped EDGES (when using assembly graph) + return posOnRef + readLookingPositon - posOnRead; + + }else{ + for (final CigarElement e : record.alignmentCigars) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : + break; // ignore pads and clips + case I : + //insert + if (posOnRead + length < readLookingPositon){ + posOnRead += length; + }else{ + return posOnRef; + } + break; + case M ://match or mismatch + case EQ://match + case X ://mismatch + if (posOnRead + length < readLookingPositon){ + posOnRead += length; + posOnRef += length; + }else{ + return posOnRef + readLookingPositon - posOnRead; + } + break; + case D : + posOnRef += length; + break; + case N : + posOnRef += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//casse + }//for + } + return 0; + } + /** + * Return the position on the read that corresponds to a given position + * on reference. 
+ * + * @param posInRef + * @param record + * @return + */ + static int mapToRead(int posOnRef, AlignmentRecord record){ + // read htsjdk.samtools.* API + int location = -1; + + if ((posOnRef - record.refStart)*(posOnRef - record.refEnd) >= 0){ + if (Math.abs(posOnRef-record.refStart) > Math.abs(posOnRef-record.refEnd)) + location = record.strand?record.readEnd+posOnRef-record.refEnd:record.readEnd-posOnRef+record.refEnd; + else + location = record.strand?record.readStart+posOnRef-record.refStart:record.readStart-posOnRef+record.refStart; + } + else{ + // current coordinate on read, followed the reference contig's direction + int posOnRead = record.strand?record.readStart:record.readLength-record.readStart+1; + // current position on ref + int pos = record.refStart; + + for (final CigarElement e : record.alignmentCigars) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : + break; // ignore pads and clips + case I : + posOnRead += length; + break; + case M ://match or mismatch + case EQ://match + case X ://mismatch + if (pos + length < posOnRef){ + pos += length; + posOnRead += length; + }else{ + location = posOnRef + posOnRead - pos; + } + break; + case D : + case N : + //delete + if (pos + length < posOnRef){ + pos += length; + }else{ + location = posOnRead; + } + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//casse + }//for + //convert back to coordinate based on read direction + location = record.strand?location:record.readLength-location+1; + } + + System.out.println( "Contig (ref): " + record.contig.getName() + " Read: " + record.readID + " Strand: " + record.strand); + System.out.println( "\tOn contig: " + record.refStart + " -> " + record.refEnd + + " Len: " + record.contig.length() + " Cut point: " + posOnRef); + System.out.println( "\tOn read: " + record.readStart + " -> " + record.readEnd + + " Len: " + record.readLength + " Alleged 
cut point: " + location); + + location=location>0?location:0; + location=location " + record.readEnd + + " Len: " + record.readLength + " Final cut point: " + location); + return location; + } + + public boolean isContaining(Contig ctg){ + if(firstContig.getIndex() == ctg.getIndex() || secondContig.getIndex() == ctg.getIndex()) + return true; + else + return false; + } + public void display(){ + System.out.printf("##################START########################\n" + + "Contig %3d (%d) -> Contig %3d (%d) Vector (%s) score = %f distance = %d\n", + this.firstContig.index, + this.firstContig.length(), + this.secondContig.index, + this.secondContig.length(), + transVector.toString(), + this.score, + transVector.distance(firstContig, secondContig) + ); + + connection.display(); + System.out.println("##################END########################"); + } + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(ContigBridge o) { + return (int) (o.score - score); + + } + +/******************************************************************************************************* + * Deprecated functions used for calculating the consensus connection out of a list. + * Now only save the best connection for memory efficiency. + * Can get back to this for better quality gap filling... 
+ ******************************************************************************************************* + */ + + /* + * + //Try to connect contigs with consensus sequence from involved reads + public Connection consensusConnection(SequenceOutputStream consensusOut) throws IOException{ + int offset = 100; //1-based + Collections.sort(connections); + ArrayList readList = new ArrayList(connections.size()); + // locate the offset points on two contigs + int cutOnFirstContig, cutOnSecondContig; + int tS = 1, tE = firstContig.length(), + fS, fE; + if (transVector.direction > 0){ + fS = transVector.magnitude; + fE = transVector.magnitude + secondContig.length(); + }else{ + fE = transVector.magnitude; + fS = transVector.magnitude - secondContig.length(); + } + // tS---|->tE fS<-|--->fE + if (fS-tE > tS-fE){ + cutOnFirstContig = firstContig.length()>offset?firstContig.length()-offset:firstContig.length(); + cutOnSecondContig = secondContig.length()>offset?offset:secondContig.length(); + + } + // fS<---|->fE tS-|--->tE + else{ + cutOnFirstContig = firstContig.length()>offset?offset:firstContig.length(); + cutOnSecondContig = secondContig.length()>offset?secondContig.length()-offset:secondContig.length(); + + } + // should we check other case (overlapped, contained..)?? 
+ cutOnSecondContig = transVector.direction>0?cutOnSecondContig:secondContig.length()-cutOnSecondContig; + + Connection gapsBestConnection = null; + int gapsBest = Integer.MAX_VALUE; + int rplStart=0, rplEnd=0; + for (Connection connection:connections){ + int firstCutOnRead=mapToRead(cutOnFirstContig, connection.firstAlignment), + secondCutOnRead=mapToRead(cutOnSecondContig, connection.secondAlignment); + Sequence tmp = null; + try{ +// if(firstCutOnRead < secondCutOnRead) +// tmp = connection.read.readSequence.subSequence(firstCutOnRead, secondCutOnRead); +// else +// tmp = connection.read.readSequence.subSequence(secondCutOnRead, firstCutOnRead); + ReadFilling tmpRead = connection.read; + if (firstCutOnRead > secondCutOnRead){ + connection.read = connection.read.reverse(); + connection.firstAlignment=connection.firstAlignment.reverseRead(); + connection.secondAlignment=connection.secondAlignment.reverseRead(); + firstCutOnRead = tmpRead.readSequence.length()-firstCutOnRead+1; + secondCutOnRead = tmpRead.readSequence.length()-secondCutOnRead+1; + } + + tmp = connection.read.readSequence.subSequence(firstCutOnRead-1, secondCutOnRead-1); + tmp.setName(tmpRead.readSequence.getName()); + tmp.setDesc(tmpRead.readSequence.getDesc()); + readList.add(tmp); + } + catch(Exception e){ + e.printStackTrace(); + System.err.println("Failed attempt to extract (" + firstCutOnRead + ", " + secondCutOnRead + + ") from sequence with length " + connection.read.readSequence.length()); + } + int gapsBt = connection.gapsBetween(); + if (gapsBt < gapsBest){ + gapsBest = gapsBt; + gapsBestConnection = connection; + rplStart = firstCutOnRead; + rplEnd = secondCutOnRead; + } + } + + Sequence consensus = null; + Connection consensusConnection = gapsBestConnection; + try { + consensus = ErrorCorrection.consensusSequence(readList, hashKey, "poa"); + consensus.setName(hashKey); + consensus.setDesc("Consensus sequence"); + consensus.writeFasta(consensusOut); + Sequence gapsBestSequence = 
gapsBestConnection.read.readSequence; + int len = gapsBestSequence.length()-(rplEnd-rplStart+1)+consensus.length(); + Sequence rpl = new Sequence(Alphabet.DNA16(), len); + for (int idx=0;idx < len; idx++){ + if (idx < rplStart) + rpl.setBase(idx, gapsBestSequence.getBase(idx)); + else if (idx >= rplStart+consensus.length()) + rpl.setBase(idx, gapsBestSequence.getBase(idx+rplEnd+1-rplStart-consensus.length())); + else + rpl.setBase(idx, consensus.getBase(idx-rplStart)); + } + System.out.println("---->Changed length (loss): " + (len-gapsBestSequence.length())); + + AlignmentRecord newFirst=gapsBestConnection.firstAlignment, + newSecond=gapsBestConnection.secondAlignment; + newSecond.readStart+=len-gapsBestSequence.length(); + newSecond.readEnd+=len-gapsBestSequence.length(); + ArrayList ends = new ArrayList(); + ends.add(newFirst); + ends.add(newSecond); + ReadFilling simple = new ReadFilling(rpl, ends); + consensusConnection = new Connection(simple,gapsBestConnection.firstAlignment,gapsBestConnection.secondAlignment,gapsBestConnection.trans); + + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Can not generate consensus sequence!"); + } + return consensusConnection; + + } + */ + + /* + // Get the consensus read out of the list + public ReadFilling consensusRead() throws IOException{ + int offset = 300; //1-based + Collections.sort(connections); + ArrayList readList = new ArrayList(connections.size()); + // locate the offset points on two contigs + int cutOnFirstContig, cutOnSecondContig; + int tS = 1, tE = firstContig.length(), + fS, fE; + if (transVector.direction > 0){ + fS = transVector.magnitude; + fE = transVector.magnitude + secondContig.length(); + }else{ + fE = transVector.magnitude; + fS = transVector.magnitude - secondContig.length(); + } + // tS---|->tE fS<-|--->fE + if (fS-tE > tS-fE){ + cutOnFirstContig = firstContig.length()>offset?firstContig.length()-offset:firstContig.length(); + cutOnSecondContig = 
secondContig.length()>offset?offset:secondContig.length(); + + } + // fS<---|->fE tS-|--->tE + else{ + cutOnFirstContig = firstContig.length()>offset?offset:firstContig.length(); + cutOnSecondContig = secondContig.length()>offset?secondContig.length()-offset:secondContig.length(); + + } + // cuz first contig direction was used as base -> adjust coordinate on the second + cutOnSecondContig = transVector.direction>0?cutOnSecondContig:secondContig.length()-cutOnSecondContig; + + for (Connection connection:connections){ + int firstCutOnRead=mapToRead(cutOnFirstContig, connection.firstAlignment), + secondCutOnRead=mapToRead(cutOnSecondContig, connection.secondAlignment); + Sequence tmp = null; + try{ + ReadFilling tmpRead = connection.read; + if (firstCutOnRead > secondCutOnRead){ + tmpRead = connection.read.reverse(); + firstCutOnRead = tmpRead.readSequence.length() - firstCutOnRead; + secondCutOnRead = tmpRead.readSequence.length() - secondCutOnRead; + } + + tmp = tmpRead.readSequence.subSequence(firstCutOnRead-1, secondCutOnRead-1); + tmp.setName(tmpRead.readSequence.getName()); + tmp.setDesc(tmpRead.readSequence.getDesc()); + readList.add(tmp); + } + catch(Exception e){ + e.printStackTrace(); + System.err.println("Failed attempt to extract (" + firstCutOnRead + ", " + secondCutOnRead + + ") from sequence with length " + connection.read.readSequence.length()); + } + + } + + Sequence consensus = readList.get(0); + ReadFilling consensusRead = null; + try { + consensus = ErrorCorrection.consensusSequence(readList, hashKey, "poa"); + consensusRead = new ReadFilling(consensus, new ArrayList()); + } catch (InterruptedException e) { + e.printStackTrace(); + System.err.println("Can not generate consensus sequence!"); + } + + return consensusRead; + + } + */ + + /* + //Fill the scaffold considering all connections (get the consensus) + public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ + int tS = 1, tE = firstContig.length(), + fS, fE, tC, fC; + 
AlignmentRecord tAlign=null, fAlign=null; + if (transVector.direction > 0){ + fS = transVector.magnitude; + fE = transVector.magnitude + secondContig.length(); + }else{ + fE = transVector.magnitude; + fS = transVector.magnitude - secondContig.length(); + } + // tS---|->tE fS<-|--->fE + if (fS-tE > tS-fE){ + int tEnd = firstContig.length()-1, fEnd = transVector.direction>0?0:secondContig.length()-1; //furthest pair + tC=tEnd; + fC=fEnd; + + for (Connection connection:connections){ + if(Math.min(Math.abs(connection.firstAlignment.refStart-tEnd), + Math.abs(connection.firstAlignment.refEnd-tEnd)) + > Math.abs(tC-tEnd)){ + tC= Math.abs(connection.firstAlignment.refStart-tEnd) < + Math.abs(connection.firstAlignment.refEnd-tEnd)? + connection.firstAlignment.refStart + :connection.firstAlignment.refEnd; + tAlign=connection.firstAlignment; + + } + if(Math.min(Math.abs(connection.secondAlignment.refStart-fEnd), + Math.abs(connection.secondAlignment.refEnd-fEnd)) + > Math.abs(fC-fEnd)){ + fC= Math.abs(connection.secondAlignment.refStart-fEnd) < + Math.abs(connection.secondAlignment.refEnd-fEnd)? + connection.secondAlignment.refStart + :connection.secondAlignment.refEnd; + fAlign=connection.secondAlignment; + } + } + + } + // fS<---|->fE tS-|--->tE + else{ + int tEnd = 0, fEnd = transVector.direction>0?secondContig.length()-1:0; //furthest pair + tC=tEnd; + fC=fEnd; + + for (Connection connection:connections){ + if(Math.min(Math.abs(connection.firstAlignment.refStart-tEnd), + Math.abs(connection.firstAlignment.refEnd-tEnd)) + > Math.abs(tC-tEnd)){ + tC= Math.abs(connection.firstAlignment.refStart-tEnd) < + Math.abs(connection.firstAlignment.refEnd-tEnd)? 
+ connection.firstAlignment.refStart + :connection.firstAlignment.refEnd; + tAlign=connection.firstAlignment; + + } + if(Math.min(Math.abs(connection.secondAlignment.refStart-fEnd), + Math.abs(connection.secondAlignment.refEnd-fEnd)) + > Math.abs(fC-fEnd)){ + fC= Math.abs(connection.secondAlignment.refStart-fEnd) < + Math.abs(connection.secondAlignment.refEnd-fEnd)? + connection.secondAlignment.refStart + :connection.secondAlignment.refEnd; + fAlign=connection.secondAlignment; + } + } + } + ttAlign.copy(tAlign); + ffAlign.copy(fAlign); + + //---------------------------------------------------------------------------------- + ArrayList seqList = new ArrayList(); + + Contig fromContig = firstContig, + toContig = secondContig; + + //loop over all connections + for(Connection connection:connections){ + + AlignmentRecord fromAlignment = + (fromContig == firstContig)?connection.firstAlignment:connection.secondAlignment; + + AlignmentRecord toAlignment = + (fromContig == firstContig)?connection.secondAlignment:connection.firstAlignment; + + ReadFilling readFilling = connection.read; + if ( fromContig.getRelDir()>0 != fromAlignment.strand){ + //swap + readFilling = connection.read.reverse(); + readFilling.sortAlignment(); + for (AlignmentRecord record:readFilling.alignments){ + if (record.contig == fromContig) + fromAlignment = record; + + if (record.contig == toContig) + toAlignment = record; + } + } + //now readFilling is good to go + int posReadEnd = fromAlignment.readAlignmentEnd(); + int posReadFinal = toAlignment.readAlignmentStart();// I need as far as posReadFinal + + Sequence seqRead = readFilling.readSequence.subSequence(posReadEnd, posReadFinal); + seqRead.setName(new String("R_" + connection.readID + "_" + score)); + + SequenceBuilder seqContig = new SequenceBuilder(Alphabet.DNA16(),1024*1024,"C_" + connection.readID + "_" + score); + //int curPos=0; //current 1-based position pointer of seqContig + for (AlignmentRecord record:readFilling.alignments){ + Contig 
contig = record.contig; + if (contig == fromContig) + continue; + + if (posReadEnd >= posReadFinal -1) + //continue;//I can break here, but want to get portionUsed of other contigs + break; + + if (record.readAlignmentEnd() < posReadEnd) + continue; + + //assert: posReadEnd < readEnd + if (record.readAlignmentStart() > posReadEnd){ + //Really need to fill in using read information + int newPosReadEnd = Math.min(posReadFinal - 1, record.readAlignmentStart() -1); + if (newPosReadEnd > posReadEnd){ + seqContig.append(readFilling.readSequence.subSequence(posReadEnd, newPosReadEnd)); +// if (connection.readID%3==2) +// seqContig.append(readFilling.readSequence.subSequence(posReadEnd, newPosReadEnd)); +// else{ +// char[] n = new char[newPosReadEnd-posReadEnd+1]; +// java.util.Arrays.fill(n,'-'); +// seqContig.append(new Sequence(Alphabet.DNA16(),n,"Filling")); +// } + posReadEnd = newPosReadEnd; + + } + if (posReadEnd + 1 >= posReadFinal) + //continue;//Done + break; + //Now get information on the contig from start + if (contig == toContig) + //continue;//could break + break; + if (record.strand){ + int refLeft = record.refStart; + int refRight = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refRight = positionOnRef(posReadFinal, record) -1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + + seqContig.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); + }else{//neg strain + int refRight = record.refStart; + int refLeft = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refLeft = positionOnRef(posReadFinal, record) + 1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + + seqContig.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); + } + }//if record.readAlignmentStart() > posReadEnd + else{//Now get information on the contig from start + if (contig == toContig) + //continue;//could break + break; 
+ if (record.strand){ + int refLeft = positionOnRef(posReadEnd, record) + 1; + int refRight = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refRight = positionOnRef(posReadFinal, record) -1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + + seqContig.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); + }else{//neg strain + int refLeft = positionOnRef(posReadEnd, record) + 1; + int refRight = record.refStart; + + if (posReadFinal <= record.readAlignmentEnd()){ + refLeft = positionOnRef(posReadFinal, record) + 1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + seqContig.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); + } + } + } + seqList.add(seqContig.toSequence()); + if(connection.readID.contains("twodimentional")) //only add 2D reads to calculate the consensus + seqList.add(seqRead); + } + Sequence consensus = null; + try { + consensus = ErrorCorrection.consensusSequence(seqList, hashKey, "poa"); + } catch (Exception e) { + e.printStackTrace(); + System.err.println("Can not generate consensus sequence!"); + } + return consensus; +// AlignmentRecord first=fromContig==firstContig?tAlign.clones():fAlign.clones(), +// second=fromContig==firstContig?fAlign.clones():tAlign.clones(); +// +// +// SequenceBuilder builder = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, "consensus"); +// +// Sequence ligateToStart = first.contig.contigSequence.subSequence(first.refStart, first.refEnd), +// ligateToEnd = second.contig.contigSequence.subSequence(second.refStart, second.refEnd); +// first.readID = second.readID = -1; +// first.strand=fromContig.getRelDir()>0; +// second.strand = toContig.getRelDir()>0; +// // |--|------------|----|> +// // -----|--|-- --|----|----- +// first.readLength = second.readLength = ligateToStart.length() + consensus.length() + ligateToEnd.length(); +// +// first.readStart = 
first.strand?0:ligateToStart.length()-1; +// first.readEnd = first.strand?ligateToStart.length()-1:0; +// second.readStart = second.strand?ligateToStart.length()+consensus.length()-1:second.readLength-1; +// second.readEnd = second.strand?second.readLength-1:ligateToStart.length()+consensus.length()-1; +// +// builder.append(ligateToStart); +// builder.append(consensus); +// builder.append(ligateToEnd); +// ArrayList alignList = new ArrayList(); +// alignList.add(first); alignList.add(second); +// ReadFilling rf=new ReadFilling(builder.toSequence(), alignList); +// return new Connection(rf, first, second, ScaffoldVector.composition(second.contig.getVector(),ScaffoldVector.reverse(first.contig.getVector()))); +// + } + */ + + + /* + * Connections are linking structure retrieved by 2 contigs aligned to the same nanopore read + * each common read makes up a Connection + */ + public class Connection implements Comparable{ + ReadFilling read; + String readID; + int score; + ScaffoldVector trans; + AlignmentRecord firstAlignment, secondAlignment; + + Connection(){ + + } + Connection(ReadFilling mRead, AlignmentRecord a, AlignmentRecord b, ScaffoldVector trans){ + this.read = mRead; + this.readID = a.readID; + this.firstAlignment = a; + this.secondAlignment = b; + + int aAlign = Math.abs(a.refStart - a.refEnd); + int bAlign = Math.abs(b.refStart - b.refEnd); + + score = aAlign * bAlign / (aAlign +bAlign); + this.trans = trans; + + } + + void display (){ + System.out.printf("[%6d %6d] -> [%6d %6d] : [%6d %6d] -> [%6d %6d] (%s) score=%d Read %s ==> %d\n", + firstAlignment.refStart, firstAlignment.refEnd, secondAlignment.refStart, secondAlignment.refEnd, + firstAlignment.readStart, firstAlignment.readEnd, secondAlignment.readStart, secondAlignment.readEnd, + trans.toString(), + score, read.readSequence.getName(), + trans.distance(firstContig, secondContig)); + } + + /** + * Count the number of gaps (that is the number of bases that are not + * aligned to a contig) between 2 
main contigs + * @return + */ + public int gapsBetween(){ + int start = 0, end = read.readSequence.length(); + + if (firstAlignment.readAlignmentStart() < secondAlignment.readAlignmentStart()){ + start = firstAlignment.readAlignmentEnd(); + end = secondAlignment.readAlignmentStart(); + }else{ + start = secondAlignment.readAlignmentEnd(); + end = firstAlignment.readAlignmentStart(); + } + + if (start >= end) + return 0; + + BitSet bitSet = new BitSet(end); + bitSet.set(start, end); + for (AlignmentRecord record:read.alignments){ + bitSet.clear(record.readAlignmentStart(), record.readAlignmentEnd()); + } + + return bitSet.cardinality(); + } + + public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ + Contig fromContig = firstContig; + Contig toContig = secondContig; + + AlignmentRecord fromAlignment = firstAlignment, + toAlignment = secondAlignment; + + ReadFilling readFilling = read; + if ( fromContig.getRelDir()>0 != fromAlignment.strand){ + //swap + readFilling = read.reverse(); + readFilling.sortAlignment(); + + fromAlignment = fromAlignment.reverseRead(); + toAlignment = toAlignment.reverseRead(); + } + //now readFilling is good to go + int posReadEnd = fromAlignment.readAlignmentEnd(); + int posReadFinal = toAlignment.readAlignmentStart();// I need as far as posReadFinal + // locate the last position being extended... 
+ int lastExtendedPosition = posReadFinal; + if(posReadEnd >= posReadFinal ){ + lastExtendedPosition = Math.min(posReadEnd,toAlignment.readAlignmentEnd()); + return positionOnRef(lastExtendedPosition, toAlignment); + } + if(seqBuilder == null) + return positionOnRef(lastExtendedPosition, toAlignment); + + for (AlignmentRecord record:readFilling.alignments){ + Contig contig = record.contig; +// if (contig.getIndex() == fromContig.getIndex()) +// continue; + + if (posReadEnd >= posReadFinal -1) + break; + + + if (record.readAlignmentEnd() <= posReadEnd) + continue; + + if (record.readAlignmentStart() > posReadEnd){ + //Really need to fill in using read information + int newPosReadEnd = Math.min(posReadFinal - 1, record.readAlignmentStart() -1); + if (newPosReadEnd > posReadEnd){ + JapsaFeature feature = + new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() + newPosReadEnd - posReadEnd, + "CONTIG",readFilling.readSequence.getName(),'+',""); + + //P=0 get the orignial read name and position + feature.addDesc(readFilling.readSequence.getName() + "+("+(posReadEnd + 1) +"," + newPosReadEnd+")"); + anno.add(feature); + seqBuilder.append(readFilling.readSequence.subSequence(posReadEnd, newPosReadEnd)); + posReadEnd = newPosReadEnd; + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + + } + if (posReadEnd + 1 >= posReadFinal) + continue;//Done + + //Now get information on the contig from start + if (contig.getIndex() == toContig.getIndex()) + continue;//tandem + if (record.strand){ + int refLeft = record.refStart; + int refRight = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refRight = positionOnRef(posReadFinal, record) -1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + if(refLeft > refRight) + continue; + + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refLeft-1) + " to " + refRight + " out of " + contig.getName()); + JapsaFeature feature = + 
new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() + refRight - refLeft +1, + "CONTIG",contig.getName(),'+',""); + feature.addDesc(contig.getName() + "+("+(refLeft ) +"," + refRight+")"); + anno.add(feature); + + + seqBuilder.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + + }else{//neg strain + int refRight = record.refStart; + int refLeft = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refLeft = positionOnRef(posReadFinal, record) + 1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + if(refLeft < refRight) + continue; + + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refRight-1) + " to " + refLeft + " out of " + contig.getName()); + JapsaFeature feature = + new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() - refRight + refLeft +1, + "CONTIG",contig.getName(),'+',""); + feature.addDesc(contig.getName() + "-("+(refRight ) +"," + refLeft+")"); + anno.add(feature); + + seqBuilder.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + } + }//if record.readAlignmentStart() > posReadEnd + else{//Now get information on the contig from start + if (contig.getIndex() == toContig.getIndex()) + continue;//tandem + if (record.strand){ + int refLeft = positionOnRef(posReadEnd, record) + 1; + int refRight = record.refEnd; + + if (posReadFinal <= record.readAlignmentEnd()){ + refRight = positionOnRef(posReadFinal, record) -1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + if(refLeft > refRight) + continue; + + if(ScaffoldGraph.verbose) + System.out.println("+++from " + 
(refLeft-1) + " to " + refRight + " out of " + contig.getName()); + JapsaFeature feature = + new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() + refRight - refLeft +1, + "CONTIG",contig.getName(),'+',""); + feature.addDesc(contig.getName() + "+("+(refLeft ) +"," + refRight+")"); + anno.add(feature); + + seqBuilder.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + }else{//neg strand + int refLeft = positionOnRef(posReadEnd, record) + 1; + int refRight = record.refStart; + + if (posReadFinal <= record.readAlignmentEnd()){ + refRight = positionOnRef(posReadFinal, record) + 1; + posReadEnd = posReadFinal -1; + }else{ + posReadEnd = record.readAlignmentEnd(); + } + if(refLeft < refRight) + continue; + + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refRight-1) + " to " + refLeft + " out of " + contig.getName()); + JapsaFeature feature = + new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() - refRight + refLeft +1, + "CONTIG",contig.getName(),'+',""); + feature.addDesc(contig.getName() + "-("+(refRight ) +"," + refLeft+")"); + anno.add(feature); + + seqBuilder.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + } + } + } + return positionOnRef(lastExtendedPosition, toAlignment); + } + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(Connection o) { + return o.score - score; + } + } + +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Edge.java b/src/dev/java/japsadev/bio/hts/scaffold/Edge.java new file mode 100644 index 0000000..2567c05 --- /dev/null +++ 
b/src/dev/java/japsadev/bio/hts/scaffold/Edge.java @@ -0,0 +1,182 @@ +package japsadev.bio.hts.scaffold; + +import japsa.util.Logging; + +/** + * This class models an bidirected Edge in my Graph implementation. + * An Edge contains two vertices and a weight (distance between them). + * A certain edge (v1,v2) can take one among 4 types: ++, --, +- and -+. Each + * type corresponds to the way we read the DNA sequence in each read when traversing + * this edge. + * For example: v1->---<-v2 or (v1,v2)+- spells out (v1 v2') and/or (v2 v1') as in SPAdes output. + * This class also deviates from the expectations of the Comparable interface + * in that a return value of 0 does not indicate that this.equals(other). The + * equals() method only compares the vertices, while the compareTo() method + * compares the edge weights. This provides more efficient implementation for + * checking uniqueness of edges, as well as the fact that two edges of equal weight + * should be considered equitably in a path finding or spanning tree algorithm. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Edge implements Comparable { + + private Vertex one, two; + private boolean dOne, dTwo; + private int weight; + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex in the Edge + */ + public Edge(Vertex one, Vertex two, boolean d1, boolean d2){ + this(one, two, d1, d2, -Graph.getKmerSize()); + } + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex of the Edge + * @param weight The weight of this Edge + */ + public Edge(Vertex one, Vertex two, boolean dOne, boolean dTwo, int weight){ + //this.one = (one.getLabel().compareTo(two.getLabel()) <= 0) ? one : two; + //this.two = (this.one == one) ? 
two : one; + this.one=one; + this.two=two; + this.weight = weight; + this.dOne=dOne; + this.dTwo=dTwo; + } + + + /** + * + * @param current + * @return The neighbor of current along this Edge + */ + public Vertex getNeighbor(Vertex current){ + if(!(current.equals(one) || current.equals(two))){ + return null; + } + + return (current.equals(one)) ? two : one; + } + /** + * Return the same Edge but reading the other way around + * just swap the order of its vertices upside down + * @param + * @return the identical Edge + */ + public Edge getReversedRead(){ + return new Edge(this.two, this.one, !this.dTwo, !this.dOne, this.weight); + } + /** + * + * @param current + * @return The direction to spell *current* along this Edge + */ + public boolean getDirection(Vertex current){ + assert (current.equals(one) || current.equals(two)):"Vertex doesn't belong to this Edge!"; + + return (current.equals(one)) ? dOne : !dTwo; + } + + /** + * + * @return Vertex this.one + */ + public Vertex getOne(){ + return this.one; + } + + /** + * + * @return Vertex this.two + */ + public Vertex getTwo(){ + return this.two; + } + + /** + * + * @return boolean this.dOne + */ + public boolean getDOne(){ + return this.dOne; + } + + /** + * + * @return boolean this.dTwo + */ + public boolean getDTwo(){ + return this.dTwo; + } + /** + * + * @return int The weight of this Edge + */ + public int getWeight(){ + return this.weight; + } + + + /** + * + * @param weight The new weight of this Edge + */ + public void setWeight(int weight){ + this.weight = weight; + } + + + /** + * Note that the compareTo() method deviates from + * the specifications in the Comparable interface. A + * return value of 0 does not indicate that this.equals(other). 
+ * The equals() method checks the Vertex endpoints, while the + * compareTo() is used to compare Edge weights + * + * @param other The Edge to compare against this + * @return int this.weight - other.weight + */ + public int compareTo(Edge other){ + return this.weight - other.weight; + } + + /** + * + * @return String A String representation of this Edge + */ + public String toString(){ + return "({" + one + (dOne?"":"'") + ", " + two + (dTwo?"":"'") + "}, " + weight + ")"; + } + + /** + * + * @return int The hash code for this Edge + */ + public int hashCode(){ + return (one.getLabel() + (dOne?"+":"-") + two.getLabel() + (dTwo?"+":"-")).hashCode(); + } + + /** + * + * @param other The Object to compare against this + * @return true iff other is an Edge with the same Vertices as this + */ + public boolean equals(Object other){ + if(!(other instanceof Edge)){ + return false; + } + + Edge e = (Edge)other; + + return (e.one.equals(this.one) && e.two.equals(this.two) && (e.getDOne()==this.dOne) && (e.getDTwo()==this.dTwo)) + || (e.one.equals(this.two) && e.two.equals(this.one) && (e.getDOne()!=this.dOne) && (e.getDTwo()!=this.dTwo)); + } +} + + diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Graph.java b/src/dev/java/japsadev/bio/hts/scaffold/Graph.java new file mode 100644 index 0000000..fc0b357 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Graph.java @@ -0,0 +1,370 @@ +package japsadev.bio.hts.scaffold; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.time.Instant; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.util.Logging; + + +/** + * This class models a simple, bidirected graph using an + * incidence list representation. 
Vertices are identified + * uniquely by their labels, and only unique vertices are allowed. + * At most one unique Edge per vertex pair is allowed in this Graph. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Graph { + + private HashMap vertices; + private HashMap edges; + private static int kmer; + + static final int TOLERATE=500; + static final int MAX_DEPTH=50; //limit depth to search for DFS + static final long MAX_TIME=10; //limit time in seconds to search for DFS + + public Graph(){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + setKmerSize(127);//default kmer size used by SPAdes to assembly MiSeq data + } + + + public Graph(String graphFile, byte assembler) throws IOException{ + this(); + int shortestLen = 10000; + //SPAdes assembly graph is stored in a .fastg file + if(assembler==0b00){ + shortestLen = StringHelper.buildGraphFromFastg(graphFile, this); +// SequenceReader reader = new FastaReader(graphFile); +// Sequence seq; +// +// while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ +// if(seq.length() 1){ +// String[] nbList = adjList[1].split(","); +// for(int i=0; i < nbList.length; i++){ +// // create list of bridges here (distance=-kmer overlapped) +// String neighbor = nbList[i]; +// boolean dir2=neighbor.contains("'")?false:true; +// neighbor=neighbor.replaceAll("[^a-zA-Z0-9_.]", "").trim(); +// +// Vertex nbVertex=new Vertex(neighbor); +// if(getVertex(nbVertex.getLabel())!=null) +// nbVertex=getVertex(nbVertex.getLabel()); +// +// addVertex(nbVertex, false); +// +// addEdge(current, nbVertex, dir1, dir2); +// } +// } +// +// } +// reader.close(); + } + // ABySS: assembly graph is presented by a .dot file + else if(assembler==0b01){ + StringHelper.buildGraphFromDot(graphFile, this); + } + + //rough estimation of kmer used + if((shortestLen-1) != getKmerSize()) + setKmerSize(shortestLen-1); + + + } + /** + * This constructor accepts an ArrayList and populates + * this.vertices. 
If multiple Vertex objects have the same label, + * then the last Vertex with the given label is used. + * + * @param vertices The initial Vertices to populate this Graph + */ + public Graph(ArrayList vertices){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + + for(Vertex v: vertices){ + this.vertices.put(v.getLabel(), v); + } + setKmerSize(127);//default kmer size used by SPAdes to assembly MiSeq data + + } + + public static int getKmerSize(){ + return kmer; + } + static void setKmerSize(int k){ + kmer=k; + } + /** + * This method adds an edge between Vertices one and two + * and their corresponding direction of weight kmer, + * if no Edge between these Vertices already exists in the Graph. + * + * @param one The first vertex to add + * @param two The second vertex to add + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @return true iff no Edge relating one and two exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2){ + return addEdge(one, two, d1, d2, -kmer); + } + + + /** + * Accepts two vertices, their directions and a weight, and adds the edge + * ({one, two}, {d1, d2}, weight) iff no Edge relating one and two + * exists in the Graph. 
+ * + * @param one The first Vertex of the Edge + * @param two The second Vertex of the Edge + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @param weight The weight of the Edge (distance between two end vertices) + * @return true iff no Edge already exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2, int weight){ + + //ensures the Edge is not in the Graph + Edge e = new Edge(one, two, d1, d2, weight); + if(edges.containsKey(e.hashCode()) || edges.containsKey(e.getReversedRead().hashCode())){ + return false; + } + + //and that the Edge isn't already incident to one of the vertices + else if(one.containsNeighbor(e) || two.containsNeighbor(e.getReversedRead())){ + return false; + } + + edges.put(e.hashCode(), e); + one.addNeighbor(e); + two.addNeighbor(e.getReversedRead()); + return true; + } + + /** + * + * @param e The Edge to look up + * @return true iff this Graph contains the Edge e + */ + public boolean containsEdge(Edge e){ + if(e.getOne() == null || e.getTwo() == null){ + return false; + } + + return this.edges.containsKey(e.hashCode()) + || this.edges.containsKey(e.getReversedRead().hashCode()); + } + + + /** + * This method removes the specified Edge from the Graph, + * including as each vertex's incidence neighborhood. 
+ * + * @param e The Edge to remove from the Graph + * @return Edge The Edge removed from the Graph + */ + public Edge removeEdge(Edge e){ + e.getOne().removeNeighbor(e); + e.getTwo().removeNeighbor(e.getReversedRead()); + Edge rmEdge = this.edges.remove(e.hashCode()); + if (rmEdge==null) + rmEdge = this.edges.remove(e.getReversedRead().hashCode()); + return rmEdge; + } + + /** + * + * @param vertex The Vertex to look up + * @return true iff this Graph contains vertex + */ + public boolean containsVertex(Vertex vertex){ + return this.vertices.get(vertex.getLabel()) != null; + } + + /** + * + * @param label The specified Vertex label + * @return Vertex The Vertex with the specified label + */ + public Vertex getVertex(String label){ + return vertices.get(label); + } + + /** + * + * @param one The first vertex to add + * @param two The second vertex to add + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @return Edge The Edge that connect the two end vertices in the graph + */ + public Edge getEdge(Vertex one, Vertex two, boolean d1, boolean d2){ + Edge e = new Edge(one, two, d1, d2); + if(edges.containsKey(e.hashCode())){ + return edges.get(e.hashCode()); + } else if(edges.containsKey(e.getReversedRead().hashCode())) + return edges.get(e.getReversedRead().hashCode()); + else + return null; + + } + + /** + * This method adds a Vertex to the graph. If a Vertex with the same label + * as the parameter exists in the Graph, the existing Vertex is overwritten + * only if overwriteExisting is true. If the existing Vertex is overwritten, + * the Edges incident to it are all removed from the Graph. 
+ * + * @param vertex + * @param overwriteExisting + * @return true iff vertex was added to the Graph + */ + public boolean addVertex(Vertex vertex, boolean overwriteExisting){ + Vertex current = this.vertices.get(vertex.getLabel()); + if(current != null){ + if(!overwriteExisting){ + return false; + } + + while(current.getNeighborCount() > 0){ + this.removeEdge(current.getNeighbor(0)); + } + } + + + vertices.put(vertex.getLabel(), vertex); + return true; + } + + /** + * + * @param label The label of the Vertex to remove + * @return Vertex The removed Vertex object + */ + public Vertex removeVertex(String label){ + Vertex v = vertices.remove(label); + + while(v.getNeighborCount() > 0){ + this.removeEdge(v.getNeighbor((0))); + } + + return v; + } + + /** + * + * @return Set All Graph's Vertex objects + */ + public Set getVertices(){ + return new HashSet(this.vertices.values()); + } + + /** + * + * @return Set The Edges of this graph + */ + public Set getEdges(){ + return new HashSet(this.edges.values()); + } + + /** + * Find a path between two nodes within a given distance + */ + public ArrayList DFS(Node source, Node dest, int distance){ + if(ScaffoldGraph.verbose) + System.out.println("Looking for path between " + source.toString() + " to " + dest.toString() + " with distance " + distance); + + Path tmp = new Path(this); + ArrayList retval = new ArrayList(); + tmp.addNode(source); + + //traverse(tmp, dest, retval, distance+source.getSeq().length()+dest.getSeq().length()); + traverse(tmp, dest, retval, distance, 0, Instant.now().getEpochSecond()); + + return retval; + } + + public void traverse(Path path, Node dest, ArrayList curResult, int distance, int depth, long time){ + //stop if go to far + if(depth >= MAX_DEPTH){ + if(ScaffoldGraph.verbose) + Logging.info("Stop following path with depth "+depth+" already! 
: "+path); + return; + } + if(Instant.now().getEpochSecond()-time>=MAX_TIME){ + if(ScaffoldGraph.verbose) + Logging.info("Stop searching due to overtime!"); + return; + } + + Node source=path.getEnd(); + //assert source!=null:"Path null fault!"; + + ArrayList nList = source.getVertex().getNeighbors(); + for(Edge e:nList){ + if(e.getDOne()==source.getDirection()){ + path.addNode(e.getTwo(), e.getDTwo()); + int d = -e.getWeight(); + if(e.getTwo()==dest.getVertex() && e.getDTwo()==dest.getDirection() && Math.abs(distance+d) < TOLERATE){ + + Path curPath=curResult.isEmpty()?new Path(this):curResult.get(0), //the best path saved among all possible paths from the list curResult + tmpPath=new Path(this); + tmpPath.setComp(path.getComp()); + tmpPath.setDeviation(Math.abs(distance+d)); + if( Math.abs(distance+d) < curPath.getDeviation() ) + curResult.add(0, tmpPath); + else + curResult.add(tmpPath); + if(ScaffoldGraph.verbose) + System.out.println("Hit added: "+path+"(candidate deviation: "+Math.abs(distance+d)+")"); + }else{ + int newDistance=distance-e.getTwo().getSequence().length()+d; + if (newDistance+d<-TOLERATE){ + if(ScaffoldGraph.verbose) + Logging.info("Stop following path with distance "+newDistance+" already! 
: "+path); + + }else + traverse(path, dest, curResult, newDistance, depth+1,time); + } + path.removeLast(); + } + } + } +} + + diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Node.java b/src/dev/java/japsadev/bio/hts/scaffold/Node.java new file mode 100644 index 0000000..e298a12 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Node.java @@ -0,0 +1,35 @@ +package japsadev.bio.hts.scaffold; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + + +public class Node{ + Vertex v; + boolean dir; + Node(Vertex v, boolean dir){ + this.v=v; + this.dir=dir; + } + public Vertex getVertex(){ + return v; + } + public void setVertex(Vertex v){ + this.v = v; + } + public boolean getDirection(){ + return dir; + } + public void setDirection(boolean dir){ + this.dir=dir; + } + public Node getRC(){ + return new Node(v,!dir); + } + public Sequence getSeq(){ + return dir?v.getSequence():Alphabet.DNA.complement(v.getSequence()); + } + public String toString(){ + return v.getLabel()+ (dir?"+":"-"); + } +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Path.java b/src/dev/java/japsadev/bio/hts/scaffold/Path.java new file mode 100644 index 0000000..f14c44e --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Path.java @@ -0,0 +1,192 @@ +package japsadev.bio.hts.scaffold; + +import java.util.ArrayList; +import java.util.Arrays; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import japsa.util.Logging; + +public class Path implements Comparable{ + ArrayList nodes; + //ArrayList edges; + Graph graph; + int length, deviation; //how this path differ to long read data (todo: by multiple-alignment??) 
+ int gapLen; + public Path(){ + this.nodes=new ArrayList(); + //this.edges=new ArrayList(); + graph=new Graph(); + gapLen = length = 0; + deviation=Integer.MAX_VALUE; + } + + public Path(Graph graph){ + this(); + associate(graph); + } + public Path(Path p){ + this(p.graph); + for(Node node:p.nodes) + this.nodes.add(node); +// for(Edge edge:p.edges) +// this.edges.add(edge); + + this.length = p.length; + this.gapLen = p.gapLen; + } + /* + * @param String: a path as in contigs.paths of SPAdes output + * For example: 1+,2-,3+ + */ + public Path(Graph graph, String paths){ + this(graph); +// paths=paths.replace(";", ""); //optimized it! +// String[] comps = paths.split(","); +// for(int i=0; i0) + gapLen+=e.getWeight(); + } + + nodes.add(new Node(v,dir)); + } + public void addNode(Node aNode){ + addNode(aNode.getVertex(), aNode.getDirection()); + } + + public ArrayList getComp(){ + return nodes; + } + public void setComp(ArrayList nodes){ + this.length=0; + for(Node node:nodes) + this.addNode(node); + } + + public Path rc(){ + Path retval=new Path(graph); + for(Node node:nodes){ + retval.nodes.add(0, node.getRC()); + } + return retval; + } + + public String toString(){ + return "P"+getID(); + } + public String getID(){ + String retval=""; + for(Node node:nodes){ + retval+=node.toString(); + } + return retval.trim(); + } + public Node removeLast(){ + Node retval=nodes.remove(nodes.size()-1); + length-=retval.getSeq().length()-Graph.getKmerSize(); + return retval; + } + + public Sequence spelling(){ + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, this.toString()); + + for(int i=0;i 0) + seq.append(aNode.getSeq().subSequence(0, lenLeft)); + else if(ScaffoldGraph.verbose) + Logging.info( "Ignore " + aNode.toString() + " and " + nextNode.toString() + " overlap: " + lenLeft + + "/" + aNode.getSeq().length() ); + } + + else{ + char[] chars = new char[e.getWeight()]; + Arrays.fill(chars, 'N'); + Sequence gap = new Sequence(Alphabet.DNA16(), new 
String(chars), "gaps"); + seq.append(aNode.getSeq()); + seq.append(gap); + } + + }else + seq.append(aNode.getSeq()); + + } + + return seq.toSequence(); + } + /** + * + * @return Node: start node + */ + public Node getStart(){ + return nodes.get(0); + } + /* + * @return Node: end node + */ + public Node getEnd(){ + return nodes.get(nodes.size()-1); + } + + public boolean isEmpty(){ + return nodes.isEmpty(); + } + + public int getDeviation(){ + return this.deviation; + } + public void setDeviation(int deviation){ + this.deviation=deviation; + } + + @Override + public int compareTo(Path o) { + // TODO Auto-generated method stub + return this.deviation-o.deviation; + } + +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/ReadFilling.java b/src/dev/java/japsadev/bio/hts/scaffold/ReadFilling.java new file mode 100644 index 0000000..525eea5 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/ReadFilling.java @@ -0,0 +1,85 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 31/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import java.util.ArrayList; +import java.util.Collections; + + +public class ReadFilling{ + /** + * The read sequence + */ + Sequence readSequence; + private boolean sorted = false; + + ArrayList alignments; + + public ReadFilling(Sequence read, ArrayList alignmentList){ + readSequence = read; + alignments = alignmentList; + } + + public Sequence getReadSequence(){ + return readSequence; + } + + public ArrayList getAlignmentRecords(){ + return alignments; + } + + + public ReadFilling reverse(){ + //return an (conceptually the same) read filling with the a reverse read + Sequence revRead = Alphabet.DNA.complement(readSequence); + revRead.setName("REV"+readSequence.getName()); + ArrayList revAlignments = new ArrayList(); + + for (AlignmentRecord alignment:alignments) + 
revAlignments.add(alignment.reverseRead()); + + ReadFilling revFilling = new ReadFilling(revRead, revAlignments); + return revFilling; + } + + public void sortAlignment(){ + if (!sorted){ + Collections.sort(alignments); + sorted = true; + } + } +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/scaffold/RealtimeScaffolding.java b/src/dev/java/japsadev/bio/hts/scaffold/RealtimeScaffolding.java new file mode 100644 index 0000000..0c9771f --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/RealtimeScaffolding.java @@ -0,0 +1,335 @@ +package japsadev.bio.hts.scaffold; +import htsjdk.samtools.SAMRecord; + +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +import java.io.File; +import java.io.IOException; +import java.lang.ProcessBuilder.Redirect; +import java.util.ArrayList; +import java.util.Date; + +import japsa.bio.np.RealtimeAnalysis; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.Logging; + +//Simulate fastq realtime generator: jsa.np.timeEmulate -i -output - +public class RealtimeScaffolding { + RealtimeScaffolder scaffolder; + public ScaffoldGraph graph; + int currentReadCount = 0; + long currentBaseCount = 0; + + public RealtimeScaffolding(String seqFile, String genesFile, String resistFile, String isFile, String oriFile, String output)throws IOException, InterruptedException{ + scaffolder = new RealtimeScaffolder(this, output); + graph = new ScaffoldGraphDFS(seqFile, genesFile, resistFile, isFile, oriFile); + } + + + /** + * MDC tried to include BWA as part + * @param bamFile + * @param readNumber + * @param timeNumber + * @param minCov + * @param qual + * @throws IOException + * @throws InterruptedException + */ + public void scaffolding(String inFile, int readNumber, int timeNumber, double minCov, int 
qual, String format, String bwaExe, int bwaThread, String bwaIndex) + throws IOException, InterruptedException{ + scaffolder.setReadPeriod(readNumber); + scaffolder.setTimePeriod(timeNumber * 1000); + + Logging.info("Scaffolding ready at " + new Date()); + + //... + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = null; + + Process bwaProcess = null; + + if (format.endsWith("am")){//bam or sam + if ("-".equals(inFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(inFile)); + }else{ + Logging.info("Starting BWA at " + new Date()); + ProcessBuilder pb = null; + if ("-".equals(inFile)){ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", + "-K", + "20000", + bwaIndex, + "-" + ). + redirectInput(Redirect.INHERIT); + }else{ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", + "-K", + "20000", + bwaIndex, + inFile + ); + } + + bwaProcess = pb.redirectError(ProcessBuilder.Redirect.to(new File("/dev/null"))).start(); + + Logging.info("BWA started!"); + + //SequenceReader seqReader = SequenceReader.getReader(inFile); + + //SequenceOutputStream + //outStrs = new SequenceOutputStream(bwaProcess.getOutputStream()); + //Logging.info("set up output from bwa"); + + //Start a new thread to feed the inFile into bwa input + //Thread thread = new Thread(){ + // public void run(){ + // Sequence seq; + // Alphabet dna = Alphabet.DNA16(); + // try { + // Logging.info("Thread to feed bwa started"); + // while ( (seq = seqReader.nextSequence(dna)) !=null){ + // seq.writeFasta(outStrs); + // } + // outStrs.close();//as well as signaling + // seqReader.close(); + // } catch (IOException e) { // + + // }finally{ + + // } + // } + 
//}; + + //thread.start(); + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(bwaProcess.getInputStream())); + + } + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + AlignmentRecord myRec = null; + ArrayList samList = null;// alignment record of the same read; + + Thread thread = new Thread(scaffolder); + thread.start(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + if (rec.getReadUnmappedFlag() || rec.getMappingQuality() < qual){ + if (!readID.equals(rec.getReadName())){ + readID = rec.getReadName(); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + continue; + } + myRec = new AlignmentRecord(rec, graph.contigs.get(rec.getReferenceIndex())); +// System.out.println("Processing record of read " + rec.getReadName() + " and ref " + rec.getReferenceName() + (myRec.useful?": useful ":": useless ") + myRec); + + if (readID.equals(myRec.readID)) { + + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + //...update with synchronized + synchronized(this.graph){ + graph.addBridge(readFilling, s, myRec, minCov); + //Collections.sort(graph.bridgeList); + } + } + } + } + } else { + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + + samList.add(myRec); + + }// while + scaffolder.stopWaiting(); + thread.join(); + iter.close(); + reader.close(); + + if (bwaProcess != null){ + bwaProcess.waitFor(); + } + + } + + @Deprecated + public void scaffolding(String bamFile, int readNumber, int timeNumber, double minCov, int qual) + throws IOException, InterruptedException{ + scaffolder.setReadPeriod(readNumber); + scaffolder.setTimePeriod(timeNumber * 1000); + + Logging.info("Scaffolding ready at " + new Date()); + + //... 
+ SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader; + if ("-".equals(bamFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + AlignmentRecord myRec = null; + ArrayList samList = null;// alignment record of the same read; + + Thread thread = new Thread(scaffolder); + thread.start(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + if (rec.getReadUnmappedFlag() || rec.getMappingQuality() < qual){ + if (!readID.equals(rec.getReadName())){ + readID = rec.getReadName(); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + continue; + } + myRec = new AlignmentRecord(rec, graph.contigs.get(rec.getReferenceIndex())); + + if (readID.equals(myRec.readID)) { + + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + //...update with synchronized + synchronized(this.graph){ + graph.addBridge(readFilling, s, myRec, minCov); + //Collections.sort(graph.bridgeList); + } + } + } + } + } else { + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + + samList.add(myRec); + + }// while + scaffolder.stopWaiting(); + thread.join(); + iter.close(); + reader.close(); + + } + public static class RealtimeScaffolder extends RealtimeAnalysis{ + RealtimeScaffolding scaffolding; + public SequenceOutputStream outOS; + RealtimeScaffolder(RealtimeScaffolding scf, String output) throws IOException{ + scaffolding = scf; + outOS = SequenceOutputStream.makeOutputStream(output); + } + + @Override + protected void close() { + //if SPAdes assembly graph is involved 
+ if(Contig.hasGraph()){ + ContigBridge.forceFilling(); + analysis(); + } + + try{ + //print for the last time if needed + if(!ScaffoldGraph.updateGenome) + scaffolding.graph.printSequences(true,false); + + outOS.close(); + }catch (Exception e){ + e.printStackTrace(); + } + } + + @Override + protected void analysis() { + long step = (lastTime - startTime)/1000;//convert to second + ScaffoldGraph sg = scaffolding.graph; + synchronized(sg){ + sg.connectBridges(); + + try { + // This function is for the sake of real-time annotation experiments being more readable + scaffolding.graph.printRT(scaffolding.currentBaseCount); + sg.printSequences(ScaffoldGraph.updateGenome,false); + outOS.print("Time |\tStep |\tRead count |\tBase count|\tNumber of scaffolds|\tCircular scaffolds |\tN50 | \tBreaks (maxlen)\n"); + outOS.print(timeNow + " |\t" + step + " |\t" + lastReadNumber + " |\t" + scaffolding.currentBaseCount + " |\t" + sg.getNumberOfContigs() + + " |\t" + sg.getNumberOfCirculars() + " |\t" + sg.getN50() + " |\t" + sg.getGapsInfo()); + + outOS.println(); + outOS.flush(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + @Override + protected int getCurrentRead() { + // TODO Auto-generated method stub + return scaffolding.currentReadCount; + } + + } +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Scaffold.java b/src/dev/java/japsadev/bio/hts/scaffold/Scaffold.java new file mode 100644 index 0000000..e70641a --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Scaffold.java @@ -0,0 +1,690 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 20/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.SequenceBuilder; +import japsa.seq.SequenceOutputStream; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedList; +import java.util.Iterator; +import java.util.ListIterator; + +/** + * Implement scaffold as an array deque, that is a linear array that can be + * added/removed from either end + * + * @author minhduc + */ +public final class Scaffold extends LinkedList{ + ContigBridge closeBridge = null;//if not null, will bridge the last and the first contig + ScaffoldVector circle = null; + private static final long serialVersionUID = -4310125261868862931L; + LinkedList bridges; + int scaffoldIndex; + int len=-1; + //boolean closed = false; + /** + * invariant: the direction of the decque is the same as the main (the longest one) + * @param myFContig + */ + public Scaffold(int index){ + super(); + closeBridge=null; + circle=null; + scaffoldIndex = index; + bridges = new LinkedList(); + } + public Scaffold(Contig myFContig){ + super(); + closeBridge=null; + circle=null; + scaffoldIndex = myFContig.index; + add(myFContig);//the first one + bridges = new LinkedList(); + } + + + public synchronized void setCloseBridge(ContigBridge bridge){ + assert bridge.firstContig.getIndex() == this.getLast().getIndex():"Closed bridge: " + bridge.hashKey + " <-> " +this.getLast().getIndex(); + closeBridge = bridge; + circle = ScaffoldVector.composition(this.getLast().getVector(), 
ScaffoldVector.reverse(this.getFirst().getVector())); //first->last + ScaffoldVector last2first = bridge.getTransVector(); + if(this.peekFirst().getIndex() == bridge.firstContig.getIndex()) + last2first = ScaffoldVector.reverse(last2first); + circle = ScaffoldVector.composition(last2first, circle); + +// change magnitute of vector to positive for convenience, a.k.a the direction of head contig + circle.magnitude = Math.abs(circle.magnitude); + + //bridge.setContigScores(); + //closed = true; + } + + /** + * Return the vector of a contig after move it forward or backward 1 circular length + * @param ScaffoldVector v of the contig + * @param boolean direction to move: true to move forward, false for backward (w.r.t. head contig) + * @return ScaffoldVector of contig after moving + */ + public ScaffoldVector rotate(ScaffoldVector v, boolean direction){ + return (direction && (v.direction>0))?ScaffoldVector.composition(circle, v):ScaffoldVector.composition(ScaffoldVector.reverse(circle), v); + } + + /** + * Return 1 or -1 if the contig is at the first or last of the list. 
+ * Otherwise, return 0 + * @param ctg + * @return + */ + public int isEnd(Contig ctg){ + if(this.isEmpty()) + return 0; + + if (ctg.getIndex() == this.peekLast().getIndex()) + return -1; + if (ctg.getIndex() == this.peekFirst().getIndex()) + return 1; + + return 0; + } + + public boolean isFirst(Contig ctg){ + return ctg.getIndex() == this.peekFirst().getIndex(); + } + + public boolean isLast(Contig ctg){ + return ctg.getIndex() == this.peekLast().getIndex(); + } + + /** + * Add a contig and its bridge to the beginning of the deque + * @param contig + * @param bridge + */ + public void addFront(Contig contig, ContigBridge bridge){ + assert bridge.firstContig.getIndex() == contig.getIndex(): "Front prob: "+ bridge.hashKey + " not connect " + contig.getIndex() + " and " + this.getFirst().getIndex(); + this.addFirst(contig); + bridges.addFirst(bridge); + if(ScaffoldGraph.verbose) + System.out.printf("...adding contig %d to scaffold %d backward!\n", contig.getIndex(), scaffoldIndex); + + } + + /** + * Add a contig and its bridge to the end of the deque + * @param contig + * @param bridge + */ + public void addRear(Contig contig, ContigBridge bridge){ + assert bridge.secondContig.getIndex() == contig.getIndex():"Rear prob: "+ bridge.hashKey + " not connect " + this.getLast().getIndex() + " and " + contig.getIndex(); + this.addLast(contig); + bridges.addLast(bridge); + if(ScaffoldGraph.verbose) + System.out.printf("...adding contig %d to scaffold %d forward!\n", contig.getIndex(), scaffoldIndex); + + } + + public Contig nearestMarker(Contig ctg, boolean forward){ + + if(ScaffoldGraph.isRepeat(ctg)){ + if(ScaffoldGraph.verbose) + System.out.println("Cannot determine nearest marker of a repeat!"); + return null; + } + int index = this.indexOf(ctg); + if(index < 0) return null; + ListIterator iterator = this.listIterator(index); + + if(ScaffoldGraph.verbose){ + System.out.printf("Tracing scaffold %d from contig %d with index %d\n", scaffoldIndex, ctg.getIndex(), index); + 
//this.view(); + System.out.printf("Finding nearest %s marker of contig %d:", forward?"next":"previous", ctg.getIndex()); + } + Contig marker = null; + while((forward?iterator.hasNext():iterator.hasPrevious())){ + marker = (forward?iterator.next():iterator.previous()); + if(ScaffoldGraph.verbose) + System.out.print("..."+marker.getIndex()); + if(marker != null && !ScaffoldGraph.isRepeat(marker) && marker.getIndex() != ctg.getIndex()) + break; + } + if(closeBridge!=null && (marker == null || ScaffoldGraph.isRepeat(marker))){ + marker = forward?this.getFirst():this.getLast(); + while((forward?iterator.hasNext():iterator.hasPrevious())){ + if(ScaffoldGraph.verbose) + System.out.print("......"+marker.getIndex()); + if(marker != null && !ScaffoldGraph.isRepeat(marker) && marker.getIndex() != ctg.getIndex()) + break; + else + marker = (forward?iterator.next():iterator.previous()); + } + } + if(ScaffoldGraph.verbose) + System.out.println(); + return marker; + + } + /** + * This function try to remove non-unique contigs from both ends of this scaffold, + * usually applied after a break. + */ + //reset prevScore or nextScore to 0 according to removed bridges. 
+ public synchronized void trim(){ + if(ScaffoldGraph.verbose) + System.out.println("Trimming scaffold: " + scaffoldIndex); + + if(closeBridge != null || this.isEmpty()) + return; + //from right + Contig rightmost = this.peekLast(); + while(rightmost!=null && ScaffoldGraph.isRepeat(rightmost)){ + if(ScaffoldGraph.verbose) + System.out.println("...removing contig " + rightmost.getIndex()); + this.removeLast(); + rightmost=this.peekLast(); + + } + if(this.size() <=1){ + bridges= new LinkedList(); + return; + } + + while(!bridges.isEmpty()){ + if(bridges.peekLast().isContaining(rightmost)) + break; + else{ + if(ScaffoldGraph.verbose) + System.out.println("...removing bridge " + bridges.peekLast().hashKey); + //bridges.peekLast(); + bridges.removeLast().resetContigScores(); + } + } + if(bridges.size() > 1){ + if(bridges.get(bridges.size() - 2).isContaining(rightmost)){ + if(ScaffoldGraph.verbose) + System.out.println("...removing bridge " + bridges.peekLast().hashKey); + bridges.removeLast().resetContigScores(); + } + } + + //from left + Contig leftmost = this.peekFirst(); + while(leftmost!=null && ScaffoldGraph.isRepeat(leftmost)){ + if(ScaffoldGraph.verbose) + System.out.println("...removing contig " + leftmost.getIndex()); + this.removeFirst(); + leftmost=this.peekFirst(); + } + if(this.size() <=1){ + bridges= new LinkedList(); + return; + } + + while(!bridges.isEmpty()){ + if(bridges.peekFirst().isContaining(leftmost)) + break; + else{ + if(ScaffoldGraph.verbose) + System.out.println("...removing bridge " + bridges.peekFirst().hashKey); + //bridges.peekFirst(); + bridges.removeFirst().resetContigScores(); + } + } + if(bridges.size() > 1){ + if(bridges.get(1).isContaining(leftmost)){ + if(ScaffoldGraph.verbose) + System.out.println("...removing bridge " + bridges.peekFirst().hashKey); + bridges.removeFirst().resetContigScores(); + } + } + + } + public synchronized void setHead(int head){ + scaffoldIndex = head; + for (Contig ctg:this) + ctg.head = head; + } + public 
LinkedList getBridges(){ + return bridges; + } + + /** + * @param start the start to set + */ + public synchronized void view(){ + System.out.println("========================== START ============================="); + Iterator bridIter = bridges.iterator(); + if(closeBridge!=null){ + System.out.println("Close bridge: " + closeBridge.hashKey + " Circularized vector: " + circle); + } + for (Contig ctg:this){ + System.out.printf(" contig %s ======" + (ctg.getRelDir() > 0?">":"<") + "%6d %6d %s ",ctg.getName(), ctg.leftMost(),ctg.rightMost(), ctg.getName()); + if (bridIter.hasNext()){ + ContigBridge bridge = bridIter.next(); + System.out.printf(" %d: %s\n", bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig), bridge.hashKey); + }else + System.out.println(); + } + System.out.println("============================ END ==========================="); + } + + /** + * Return the length of this scaffold + * Check out quast (https://github.com/ablab/quast) + */ + public int length(){ + if(isEmpty()) + return 0; + if(len > 0) + return len; + int len = getLast().rightMost() - getFirst().leftMost(); + if(circle!=null) + len = Math.abs(circle.getMagnitute()); + return len; + } + + public synchronized void viewSequence(SequenceOutputStream fout, SequenceOutputStream jout) throws IOException{ + + if(ScaffoldGraph.verbose){ + view(); + System.out.println("Size = " + size() + " sequence"); + } + + // Synchronize positions of 2 contigs (myVector) of a bridge based on the real list of (maybe cloned) contigs + // TODO: do the same with viewAnnotation() + assert this.size()==bridges.size()+1:"Number of contigs ("+this.size()+")" + " doesn't agree with number of bridges ("+bridges.size()+"!"; + for(int i=0;i | | leftContig | | | | rightContig + * Contigs: ... ~~~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*~~~ ... ~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * startLeft(1) endLeft(1) startLeft(2) ... + * + * that's what happens below! 
+ * + */ + + rightContig = getFirst(); + + //startLeft: the leftPoint of leftContig, endLeft: rightPoint of left Contig + int startLeft = (rightContig.getRelDir() > 0)?0:rightContig.length(); //starting point after the last fillFrom + int endLeft = (rightContig.getRelDir() < 0)?0:rightContig.length(); + //TODO: re-check the coordinate of two ends (0 or 1, inclusive/exclusive) +// int startLeft = (rightContig.getRelDir() > 0)?1:rightContig.length(); //starting point after the last fillFrom +// int endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + + /* uncomment for illumina-based */ + //int closeDis = 0; + + if (closeBridge != null){ + bestCloseConnection = closeBridge.getBestConnection(); + leftContig = closeBridge.firstContig; + /* uncomment for longread-based */ + startLeft = bestCloseConnection.filling(null, null); + /* uncomment for illumina-based */ +// closeDis =closeBridge.getTransVector().distance(closeBridge.firstContig, closeBridge.secondContig); +// if(closeDis > 0) +// startLeft = bestCloseConnection.filling(null, null); //adjust the starting point + + anno.addDescription("Circular"); + + }else + anno.addDescription("Linear"); + + + Iterator ctgIter = this.iterator(); + leftContig = ctgIter.next();//The first + + for (ContigBridge bridge:bridges){ + rightContig = ctgIter.next(); + ContigBridge.Connection connection = bridge.getBestConnection(); + /* uncomment for longread-based */ + endLeft = (leftContig.getRelDir()>0)?(connection.firstAlignment.refEnd): + (connection.firstAlignment.refStart); + /* uncomment for illumina-based */ +// int distance = bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); +// if(distance < 0){ +// endLeft = (leftContig.getRelDir()>0)?(leftContig.length()-Math.abs(distance)):Math.abs(distance); +// }else{ +// endLeft = (leftContig.getRelDir()>0)?(connection.firstAlignment.refEnd): +// (connection.firstAlignment.refStart); +// } + + if (startLeft 0)?1:rightContig.length(); +// }else{ +// 
//Fill in the connection +// startLeft = connection.filling(seq, anno); +// } + leftContig = rightContig; + + }//for + + //leftContig = lastContig in the queue + if (bestCloseConnection != null){ + /* uncomment for longread-based */ + endLeft = (leftContig.getRelDir()>0)?(bestCloseConnection.firstAlignment.refEnd): + (bestCloseConnection.firstAlignment.refStart); + /* uncomment for illumina-based */ +// if(closeDis > 0) +// endLeft = (leftContig.getRelDir()>0)?(bestCloseConnection.firstAlignment.refEnd): +// (bestCloseConnection.firstAlignment.refStart); +// else{ +// endLeft = (rightContig.getRelDir() < 0)?Math.abs(closeDis):rightContig.length()-Math.abs(closeDis); +// } + } + else + endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + if (startLeft0 ) +// bestCloseConnection.filling(seq, anno); + } + + len = seq.length(); + JapsaAnnotation.write(seq.toSequence(), anno, jout); + seq.writeFasta(fout); + } + /* Output annotation of this scaffold + * TODO: output annotations from the filling sequences + */ + public synchronized void viewAnnotation(SequenceOutputStream out) throws IOException{ + + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, "Scaffold" + scaffoldIndex); + JapsaAnnotation anno = new JapsaAnnotation(); + + ContigBridge.Connection bestCloseConnection = null; + Contig leftContig, rightContig; +/* + * Nanopore reads: + * ==================================== ========================================== + * | | | | | | + * | | | | | | + * | | leftContig | | | | rightContig + * Contigs: ... ~~~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*~~~ ... ~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * startLeft endLeft + * + * that's what happens below! 
+ * + */ + + rightContig = getFirst(); + + //startLeft: the leftPoint of leftContig, endLeft: rightPoint of left Contig + int startLeft = (rightContig.getRelDir() > 0)?1:rightContig.length(); //starting point after the last fillFrom + int endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + if (closeBridge != null){ + bestCloseConnection = closeBridge.getBestConnection(); + leftContig = closeBridge.firstContig; + startLeft = bestCloseConnection.filling(null, null); + + anno.addDescription("Circular"); + + }else + anno.addDescription("Linear"); + + Iterator ctgIter = this.iterator(); + leftContig = ctgIter.next();//The first + + JapsaFeature lastGene=null; + if(leftContig.resistanceGenes.size()>0) + lastGene = startLeft0)?(connection.firstAlignment.refEnd): + (connection.firstAlignment.refStart); + + /**********************************************************************************************/ + ArrayList resistLeft = leftContig.getFeatures(leftContig.resistanceGenes, startLeft, endLeft), + insertLeft = leftContig.getFeatures(leftContig.insertSeq, startLeft, endLeft), + oriLeft = leftContig.getFeatures(leftContig.oriRep, startLeft, endLeft); + + for(JapsaFeature resist:resistLeft){ + resist.setStart(resist.getStart()+seq.length()); + resist.setEnd(resist.getEnd()+seq.length()); + anno.add(resist); + } + Collections.sort(resistLeft); + if(resistLeft.size() > 0){ + JapsaFeature leftGene = startLeftendLeft?resistLeft.get(0):resistLeft.get(resistLeft.size()-1); + //extract first and last gene here + if(lastGene!=null) + System.out.println(lastGene.getID() + "...next to..." 
+ leftGene.getID()); + lastGene = rightGene; + } + for(JapsaFeature insert:insertLeft){ + insert.setStart(insert.getStart()+seq.length()); + insert.setEnd(insert.getEnd()+seq.length()); + anno.add(insert); + } + for(JapsaFeature ori:oriLeft){ + ori.setStart(ori.getStart()+seq.length()); + ori.setEnd(ori.getEnd()+seq.length()); + anno.add(ori); + } + /**********************************************************************************************/ + + if (startLeft0)?(bestCloseConnection.firstAlignment.refEnd): + (bestCloseConnection.firstAlignment.refStart); + + } + else + endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + /**********************************************************************************************/ + ArrayList resistLeft = leftContig.getFeatures(leftContig.resistanceGenes, startLeft, endLeft), + insertLeft = leftContig.getFeatures(leftContig.insertSeq, startLeft, endLeft), + oriLeft = leftContig.getFeatures(leftContig.oriRep, startLeft, endLeft); + + for(JapsaFeature resist:resistLeft){ + resist.setStart(resist.getStart()+seq.length()); + resist.setEnd(resist.getEnd()+seq.length()); + anno.add(resist); + } + if(resistLeft.size() > 0){ + JapsaFeature leftGene = startLeft ctgList = new ArrayList(), + genesList = new ArrayList(); + + for(Contig ctg:this){ + ctgList.add(ctg.getName()); + for (JapsaFeature feature:ctg.genes) + genesList.add(feature.toString()); + } + pw.print(">"); + for(String ctg:ctgList) + pw.printf("%s\t", ctg); + + pw.printf("\n>%d genes\t",genesList.size()); + + for(String genes:genesList) + pw.print(" \n\t"+genes); + pw.println(""); + + pw.close(); + } + +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraph.java b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraph.java new file mode 100644 index 0000000..f8eb3bc --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraph.java @@ -0,0 +1,1219 @@ +/***************************************************************************** + * 
Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 20/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +import japsa.seq.Alphabet; +import japsa.seq.JapsaFeature; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.Logging; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.lang.ProcessBuilder.Redirect; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; + +public abstract class ScaffoldGraph{ + public static volatile int maxRepeatLength=7500; //for ribosomal repeat cluster in bacteria (Koren S et al 2013), it's 9.1kb for yeast. 
+ public static volatile int marginThres = 1000; + public static volatile int minContigLength = 200; + public static volatile int minSupportReads = 1; + public static volatile boolean verbose = false; + public static volatile boolean reportAll = false; + public static volatile boolean updateGenome = true; + public static volatile boolean eukaryotic = false; + public static volatile boolean select = false; + + + public volatile boolean annotation = false; + public static volatile byte assembler =0b00; // 0 for SPAdes, 1 for ABySS + + public static HashMap countOccurence=new HashMap(); + public String prefix = "out"; + public static double estimatedCov = 0; + private static double estimatedLength = 0; + + //below maps contain avatar of contigs and bridges only, + //not the actual ones in used (because of the repeats that need to be cloned) + + ArrayList contigs; + HashMap bridgeMap= new HashMap(); + static HashMap> bridgesFromContig = new HashMap>(); + + Scaffold [] scaffolds; // DNA translator, previous image of sequence is stored for real-time processing + int scfNum, cirNum; // assembly statistics: number of contigs and circular ones. + + // Constructor for the graph with contigs FASTA file (contigs.fasta from SPAdes output) + public ScaffoldGraph(String sequenceFile) throws IOException{ + //1. 
read in contigs + SequenceReader reader = SequenceReader.getReader(sequenceFile); + Sequence seq; + contigs = new ArrayList(); + + int index = 0; + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + Contig ctg = new Contig(index, seq); + + String name = seq.getName(), + desc = seq.getDesc(); + + double mycov = 1.0; + //SPAdes header: >%d_length_%d_cov_%f + //SPAdes header: >MicroManage_ID %d_length_%d_cov_%f + if(assembler==0b00){ + String [] toks = (name+desc).split("_"); + for (int i = 0; i < toks.length - 1;i++){ + if ("cov".equals(toks[i])){ + mycov = Double.parseDouble(toks[i+1]); + break; + } + } + } + //ABySS header: >%d %d %d, ID, length, kmer_sum + else if(assembler==0b01){ + String [] toks = desc.split("\\s"); + mycov = Double.parseDouble(toks[1])/Double.parseDouble(toks[0]); + } + + estimatedCov += mycov * seq.length(); + estimatedLength += seq.length(); + + ctg.setCoverage(mycov); + + contigs.add(ctg); + bridgesFromContig.put(ctg.getIndex(), new ArrayList()); + index ++; + } + reader.close(); + + estimatedCov /= estimatedLength; + + Logging.info("Average coverage:" + estimatedCov + " Length: " + estimatedLength); + //turn off verbose mode if the genome is bigger than 100Mb. + if(estimatedLength > 100000000 || contigs.size() > 10000){ + Logging.warn("Verbose mode and realtime updating are automatically disabled due to too large genome!"); + verbose=false; + updateGenome=false; + } + + //2. 
Initialise scaffold graph + scaffolds = new Scaffold[contigs.size()]; + + for (int i = 0; i < contigs.size();i++){ + scaffolds[i] = new Scaffold(contigs.get(i)); + //point to the head of the scaffold + contigs.get(i).head = i; + }//for + + scfNum=contigs.size(); + cirNum=0; + + }//constructor + + public String getAssemblerName(){ + if(assembler==0b01) + return new String("ABySS"); + else + return new String("SPAdes"); + } + /* Read short-read assembly information from SPAdes output: assembly graph (assembly_graph.fastg) and + ** traversed paths (contigs.pahth) to make up the contigs + */ + public void readMore(String assemblyGraph, String paths) throws IOException{ + //1. Read assembly graph and store in a string graph + Graph g = new Graph(assemblyGraph,assembler); + + // for(Vertex v:g.getVertices()){ + // System.out.println("Neighbors of vertex " + v.getLabel() + " (" + v.getNeighborCount() +"):"); + // for(Edge e:v.getNeighbors()) + // System.out.println(e + "; "); + // System.out.println(); + // } + + Contig.setGraph(g); + + if(assembler==0b00){ + //2. 
read file contigs.paths from SPAdes + BufferedReader pathReader = new BufferedReader(new FileReader(paths)); + + String s; + //Read contigs from contigs.paths and refer themselves to contigs.fasta + Contig curContig = null; + while((s=pathReader.readLine()) != null){ + if(s.contains("NODE")) + curContig=getSPadesContig(s); + else if(curContig!=null) + curContig.setPath(new Path(g,s)); + + } + pathReader.close(); + } else if(assembler==0b01){ //for the case of ABySS: contig and vertex of assembly graph are the same + for(Contig ctg:contigs) + ctg.setPath(new Path(g,ctg.getName()+"+")); + } + + if(ScaffoldGraph.verbose) + Logging.info("Short read assembler " + (assembler==0b00?"SPAdes":"ABySS") + " kmer=" + Graph.getKmerSize()); + } + + public Contig getSPadesContig(String name){ + if(name.contains("'")) + return null; + + + Contig res = null; + + for(Contig ctg:contigs){ + // Extract to find contig named NODE_x_ + //because sometimes there are disagreement between contig name (_length_) in contigs.paths and contigs.fasta in SPAdes!!! 
+ // + if( ctg.getName().contains(name) || (ctg.getName()+ctg.getDesc()).contains("NODE_"+name.split("_")[1]+"_") ){ + res = ctg; + break; + } + } + + if(res==null && verbose){ + System.out.println("Contig not found:" + name); + } + + return res; + } + + public synchronized int getN50(){ + int [] lengths = new int[scaffolds.length]; + int count=0; + double sum = 0; +// for (int i = 0; i < scaffolds.length;i++){ +// if(scaffolds[i].isEmpty()) continue; +// int len = scaffolds[i].length(); +// +// if(contigs.get(i).head == i) +// if ( (!isRepeat(contigs.get(i)) && len > maxRepeatLength) //here are the big ones +// || scaffolds[i].closeBridge != null //circular plasmid contigs +// || (reportAll && needMore(contigs.get(i)) && contigs.get(i).coverage > .5*estimatedCov)) //short,repetitive sequences here if required +// { +// lengths[count] = len; +// sum+=len; +// count++; +// } +// } + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + + if(select && !contigs.get(i).isMapped()) + continue; + + int len = scaffolds[i].length(); + + if(contigs.get(i).head == i){ + if (isRepeat(contigs.get(i)) && !reportAll ) + continue; + } + else if(!isRepeat(contigs.get(i)) || !needMore(contigs.get(i)) ) + continue; + + lengths[count] = len; + sum+=len; + count++; + } + Arrays.sort(lengths); + + int index = lengths.length; + double contains = 0; + while (contains < sum/2){ + index --; + contains += lengths[index]; + } + + return lengths[index]; + } + + public synchronized String getGapsInfo(){ + int gapCount=0, + gapMaxLen=0; + + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + int len = scaffolds[i].length(); + if ((contigs.get(i).head == i + && !isRepeat(contigs.get(i)) + && len > maxRepeatLength + ) + || scaffolds[i].closeBridge != null) + { + for(ContigBridge brg:scaffolds[i].bridges){ + if(brg.getBridgePath()==null){ + gapCount++; + if(brg.getTransVector().distance(brg.firstContig, brg.secondContig) > gapMaxLen) + 
gapMaxLen=brg.getTransVector().distance(brg.firstContig, brg.secondContig); + } + } + if(scaffolds[i].closeBridge!=null){ + ContigBridge brg=scaffolds[i].closeBridge; + if(brg.getBridgePath()==null){ + gapCount++; + if(brg.getTransVector().distance(brg.firstContig, brg.secondContig) > gapMaxLen) + gapMaxLen=brg.getTransVector().distance(brg.firstContig, brg.secondContig); + } + } + + } + } + + + return gapCount+" ("+gapMaxLen+")"; + } + /** + * MDC added second version that include bwa + * @param bamFile + * @param minCov + * @param qual + * @throws IOException + * @throws InterruptedException + */ + public void makeConnections(String inFile, double minCov, int qual, String format, String bwaExe, int bwaThread, String bwaIndex) throws IOException, InterruptedException{ + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader = null; + Process bwaProcess = null; + + if (format.endsWith("am")){//bam or sam + if ("-".equals(inFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(inFile)); + }else{ + Logging.info("Starting bwa at " + new Date()); + + ProcessBuilder pb = null; + if ("-".equals(inFile)){ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", +// "-K", +// "20000", + bwaIndex, + "-" + ). 
+ redirectInput(Redirect.INHERIT); + }else{ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", +// "-K", +// "20000", + bwaIndex, + inFile + ); + } + bwaProcess = pb.redirectError(ProcessBuilder.Redirect.to(new File("/dev/null"))).start(); + + //Logging.info("bwa started x"); + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(bwaProcess.getInputStream())); + } + + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + ArrayList samList = null;// alignment record of the same read; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + if (rec.getReadUnmappedFlag()) + continue; + if (rec.getMappingQuality() < qual) + continue; + + Contig tmp = contigs.get(rec.getReferenceIndex()); + if(tmp==null){ + Logging.error("Contig " + rec.getReferenceIndex() + " doesn't exist!"); + System.exit(1); + } + + AlignmentRecord myRec = new AlignmentRecord(rec, tmp); + Arrays.fill(tmp.isMapped, myRec.refStart, myRec.refEnd, 1); +// System.out.println("Processing record of read " + rec.getReadName() + " and ref " + rec.getReferenceName() + (myRec.useful?": useful ":": useless ") + myRec); + + + ////////////////////////////////////////////////////////////////// + // make bridge of contigs that align to the same (Nanopore) read. + // Note that SAM file MUST be sorted based on readID (samtools sort -n) + + //not the first occurrance + if (readID.equals(myRec.readID)) { + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + this.addBridge(readFilling, s, myRec, minCov); //stt(s) < stt(myRec) -> (s,myRec) appear once only! 
+ } + } + } + } else { + + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + } + samList.add(myRec); + + }// while + iter.close(); + + //outOS.close(); + reader.close(); + if (bwaProcess != null){ + bwaProcess.waitFor(); + } + + } + + + + /** + * Forming bridges based on alignments. Used in batch mode only + * + * @param bamFile + * @param minCov + * @param maxCov + * @param threshold + * @param qual + * @throws IOException + */ + @Deprecated + public void makeConnections(String bamFile, double minCov, int qual) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader; + if ("-".equals(bamFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + AlignmentRecord myRec = null; + ArrayList samList = null;// alignment record of the same read; + + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + if (rec.getReadUnmappedFlag()) + continue; + if (rec.getMappingQuality() < qual) + continue; + + myRec = new AlignmentRecord(rec, contigs.get(rec.getReferenceIndex())); + + + ////////////////////////////////////////////////////////////////// + // make bridge of contigs that align to the same (Nanopore) read. + // Note that SAM file MUST be sorted based on readID (samtools sort -n) + + //not the first occurrance + if (readID.equals(myRec.readID)) { + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + this.addBridge(readFilling, s, myRec, minCov); //stt(s) < stt(myRec) -> (s,myRec) appear once only! 
+ } + } + } + } else { + + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + } + samList.add(myRec); + + }// while + iter.close(); + + //outOS.close(); + reader.close(); + + //Logging.info("Sort list of bridges"); + //Collections.sort(bridgeList); + } + + + + /*********************************************************************************/ + protected void addBridge(ReadFilling readSequence, AlignmentRecord a, AlignmentRecord b, double minCov){ + if (a.contig.index > b.contig.index){ + AlignmentRecord t = a;a=b;b=t; + } + // Rate of aligned lengths: ref/read (illumina contig/nanopore read) + int alignedReadLen = Math.abs(a.readEnd - a.readStart) + Math.abs(b.readEnd - b.readStart), + alignedRefLen = Math.abs(a.refEnd - a.refStart) + Math.abs(b.refEnd - b.refStart); + double rate = 1.0 * alignedRefLen/alignedReadLen; + + //See if this is reliable + double score = Math.min(a.score, b.score); + int alignP = (int) ((b.readStart - a.readStart) * rate); + int alignD = (a.strand == b.strand)?1:-1; + + //(rough) relative position from ref_b (contig of b) to ref_a (contig of a) in the assembled genome + int gP = (alignP + (a.strand ? 
a.refStart:-a.refStart) - (b.strand?b.refStart:-b.refStart)); + if (!a.strand) + gP = -gP; + if ( a.contig.getIndex() == b.contig.getIndex() + && alignD > 0 + && (Math.abs(gP)*1.0 / a.contig.length()) < 1.1 + && (Math.abs(gP)*1.0 / a.contig.length()) > 0.9 + && a.readLength < 1.1* a.contig.length() + ) + { + if( alignedReadLen*1.0/a.readLength > 0.8 ){ //need more than 80% alignment (error rate of nanopore read) + a.contig.cirProb ++; + } + if(verbose) + System.out.printf("Potential CIRCULAR or TANDEM contig %s map to read %s(length=%d): (%d,%d) => circular score: %d\n" + , a.contig.getName(), a.readID, a.readLength, gP, alignD, a.contig.cirProb); + } + else{ + a.contig.cirProb--; + b.contig.cirProb--; + } + + // overlap length on aligned read (<0 if not overlap) + int overlap = Math.min( a.readAlignmentEnd() - b.readAlignmentStart(), b.readAlignmentEnd() - a.readAlignmentStart()); + + if ( overlap > Math.min( .5 * Math.min(a.readAlignmentEnd()-a.readAlignmentStart(), b.readAlignmentEnd()-b.readAlignmentStart()), + minContigLength) + || a.contig.getCoverage() < minCov // filter out contigs with inappropriate cov + || b.contig.getCoverage() < minCov + ) + { +// System.out.println("...ignoring " + a.contig.getIndex() + "#" + b.contig.getIndex()); + return; + } + + ScaffoldVector trans = new ScaffoldVector(gP, alignD); + + int count = 0; + ContigBridge bridge, bridge_rev; + while (true){ + int brgID = count, revID = count; + if(a.contig.getIndex()==b.contig.getIndex()){ + brgID = 2*count; + revID = brgID+1; + } + String hash = ContigBridge.makeHash(a.contig.index, b.contig.index, brgID), + hash_rev = ContigBridge.makeHash(b.contig.index, a.contig.index, revID); + + bridge = bridgeMap.get(hash); + bridge_rev = bridgeMap.get(hash_rev); + if (bridge == null){ + assert bridge_rev==null:hash_rev + " not null!"; + bridge = new ContigBridge(a.contig, b.contig, brgID); + bridge_rev = new ContigBridge(b.contig, a.contig, revID); + + bridge.addConnection(readSequence, a, b, 
trans, score); + bridge_rev.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); + + // a.contig.bridges.add(bridge); + // b.contig.bridges.add(bridge_rev); +// System.out.println("...addding " + bridge.hashKey + " and " + bridge_rev.hashKey); + bridgesFromContig.get(a.contig.getIndex()).add(bridge); + bridgesFromContig.get(b.contig.getIndex()).add(bridge_rev); + + bridgeMap.put(hash, bridge); + bridgeMap.put(hash_rev, bridge_rev); + + break; + } + if ((a.contig.getIndex() != b.contig.getIndex()) && bridge.consistentWith(trans)){ + assert bridge_rev!=null:hash_rev + "is null!"; + bridge.addConnection(readSequence, a, b, trans, score); + bridge_rev.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); + break; + } + if(a.contig.getIndex() == b.contig.getIndex()){ + assert bridge_rev!=null:hash_rev + "is null"; + if(bridge.consistentWith(trans)){ + bridge.addConnection(readSequence, a, b, trans, score); + bridge_rev.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); + break; + } + if(bridge.consistentWith(ScaffoldVector.reverse(trans))){ + bridge_rev.addConnection(readSequence, b, a, trans, score); + bridge.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); + break; + } + } + count ++; + }//while + + } + public static ArrayList getListOfBridgesFromContig(Contig ctg){ + return bridgesFromContig.get(ctg.getIndex()); + } + /**********************************************************************************************/ + public ContigBridge getReversedBridge(ContigBridge bridge){ + String hash = ContigBridge.makeHash(bridge.secondContig.index, bridge.firstContig.index, bridge.orderIndex); + return bridgeMap.get(hash); + } + /**********************************************************************************************/ + /* + * Check if it's possible to extend from *contig* with *bridge* to another extended-already contig (contigF) + * use for markers and unique bridge only. 
+ * This is a pre-step to join 2 scaffolds: scaffoldT going to scaffoldF + * @param Contig: a contig to start with + * ContigBridge: a bridge from given contig to a candidate unique contig for the extension + * @return int: direction on targeted scaffold (scaffoldF) that can be traversed + */ + protected int extendDirection(Contig contig, ContigBridge bridge){ + Contig contigF = bridge.secondContig; + ScaffoldVector trans = bridge.getTransVector(); //contig->contigF + int pointer = Integer.signum(trans.magnitude * trans.direction); //pointer < 0 => tail of contigF on bridge + assert scaffolds[contigF.head].size() > 1 : contigF.head; + + int headF = contigF.head; + int direction = 0; //direction of extension on scaffoldT (we need to return direction on scaffoldF) + ScaffoldVector headT2contigF = ScaffoldVector.composition(trans, contig.getVector()); + int rEnd = contig.rightMost(), rEndF = contigF.rightMost(headT2contigF), + lEnd = contig.leftMost(), lEndF = contigF.leftMost(headT2contigF); + if(rEndF > rEnd){ + direction = 1; + } + else if(lEndF < lEnd){ + direction = -1; + } + else + return 0; + if(verbose) + System.out.println("Examining extending direction from contig " + contig.getIndex() + " to " + bridge.hashKey); + Scaffold scaffoldF = scaffolds[headF]; + // Get order-based (order on scaffold other than orientation-based of contig) previous and next marker(unique contig) + Contig prevMarker = scaffoldF.nearestMarker(contigF, false), // previous marker of contigF on *corresponding scaffold* + nextMarker = scaffoldF.nearestMarker(contigF, true); // next marker of contigF on *corresponding scaffold* + + ScaffoldVector rev = ScaffoldVector.reverse(contigF.getVector()); //rev = contigF->headF + if(prevMarker != null){ + ScaffoldVector toPrev = ScaffoldVector.composition(prevMarker.getVector(),rev); //contigF->prevMarker + if(scaffoldF.indexOf(prevMarker) > scaffoldF.indexOf(contigF) && scaffoldF.closeBridge != null) + //toPrev = 
ScaffoldVector.composition(ScaffoldVector.reverse(scaffoldF.circle), toPrev); + toPrev = scaffoldF.rotate(toPrev, false); + ScaffoldVector headT2Prev = ScaffoldVector.composition(toPrev, headT2contigF); + int rEndPrev = prevMarker.rightMost(headT2Prev), + lEndPrev = prevMarker.leftMost(headT2Prev); + if(verbose){ + System.out.printf("Extending from contigT %d to targeted contig (contigF) %d with previous contig (prevMarker) %d \n", contig.getIndex(), contigF.getIndex(), prevMarker.getIndex()); + System.out.println("...headT->contig, contigF and prevMarker: " + contig.getVector() + headT2contigF + headT2Prev); + } + if ((direction > 0?rEndPrev > rEndF: lEndPrev < lEndF)){ + //check if the candidate ContigBridge is more confident than the current or not + if((pointer<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + if(verbose) + System.out.printf("=> go from %d to %d to %d \n", contig.getIndex(), contigF.getIndex(), prevMarker.getIndex()); + return -1; + } + else{ + if(verbose) + System.out.printf("Bridge score not strong enough: %.2f < %.2f (%.2f)\n", + bridge.getScore(), pointer<0?contigF.nextScore:contigF.prevScore, + pointer<0?contigF.prevScore:contigF.nextScore); + + return 0; + } + }else{ + if(verbose) + System.out.printf("Direction conflict: %d, %d %d or %d %d. Checking otherway... 
\n", direction, rEndPrev, rEndF, lEndPrev, lEndF); + } + } + if(nextMarker != null){ + ScaffoldVector toNext = ScaffoldVector.composition(nextMarker.getVector(),rev); //contigF->nextMarker + if(scaffoldF.indexOf(nextMarker) < scaffoldF.indexOf(contigF) && scaffoldF.closeBridge != null) + //toNext = ScaffoldVector.composition(scaffoldF.circle, toNext); + toNext = scaffoldF.rotate(toNext, true); + ScaffoldVector headT2Next = ScaffoldVector.composition(toNext, headT2contigF); + int rEndNext = nextMarker.rightMost(headT2Next), + lEndNext = nextMarker.leftMost(headT2Next); + if(verbose){ + System.out.printf("Extending from contigT %d to targeted contig (contigF) %d with next contig (nextMarker) %d \n", contig.getIndex(), contigF.getIndex(), nextMarker.getIndex()); + System.out.println("...headT->contig, contigF and nextMarker: " + contig.getVector() + headT2contigF + headT2Next); + } + + if ((direction > 0? rEndNext > rEndF : lEndNext < lEndF)){ + //if((rev.direction<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + if((pointer<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + if(verbose) + System.out.printf("=> go from %d to %d to %d \n", contig.getIndex(), contigF.getIndex(), nextMarker.getIndex()); + return 1; + } + else{ + if(verbose) + System.out.printf("Bridge score not strong enough: %.2f < %.2f (%.2f)\n", + bridge.getScore(), pointer<0?contigF.nextScore:contigF.prevScore, + pointer<0?contigF.prevScore:contigF.nextScore); + return 0; + + } + }else{ + if(verbose) + System.out.printf("Direction conflict: %d, %d %d or %d %d. End searching! 
\n", direction, rEndNext, rEndF, lEndNext, lEndF); + } + } + return 0; + } + /*********************************************************************************/ + public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, boolean firstDir, int secondDir){ + if(verbose) { + System.out.println("PROCEED TO CONNECT " + bridge.hashKey + " with score " + bridge.getScore() + + ", size " + bridge.getNumOfConnections() + + ", vector (" + bridge.getTransVector().toString() + + "), distance " + bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig)); + bridge.display(); + } + + + Contig contigF = bridge.secondContig, contigT = contig; + ScaffoldVector trans = bridge.getTransVector(); + + int headF = contigF.head, + headT = contigT.head; + Scaffold scaffoldF = scaffolds[headF], + scaffoldT = scaffolds[headT]; + int posT = scaffoldT.isEnd(contigT); + if (posT == 0){ + if(verbose) + System.out.println("Impossible to jump from the middle of a scaffold " + headT + ": contig " + contigT.index); + return false; + } + + if(verbose) + System.out.println("Before joining " + contigF.index + " (" + headF +") to " + contigT.index + + " (" + headT +") " + + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost()) + + " " + (scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost()) + + " " + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost() + scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost())); + //=================================================================================================== + int index = scaffoldF.indexOf(contigF), + count = index; + + ScaffoldVector rev = ScaffoldVector.reverse(contigF.getVector()); //rev = contigF->headF + + int addScf=-1; + + if(secondDir == -1){ + if(headF==headT){ + //if(posT!=1) + if(firstDir) + return false; + else{ + Contig nextMarker = scaffoldF.nearestMarker(contigF, true); + if(nextMarker!=null){ + Contig ctg = scaffoldF.remove(index+1); + Scaffold 
newScf = new Scaffold(ctg); + ContigBridge brg = scaffoldF.bridges.remove(index); + while(true){ + if(scaffoldF.size()==index+1) break; + ctg= scaffoldF.remove(index+1); + brg = scaffoldF.bridges.remove(index); + newScf.addRear(ctg,brg); + } + newScf.trim(); + changeHead(newScf, nextMarker); + addScf=nextMarker.getIndex(); + } + scaffoldF.setCloseBridge(getReversedBridge(bridge)); + changeHead(scaffoldF, contigF); + } + }else{ + Contig ctg = scaffoldF.remove(index); + ContigBridge brg = getReversedBridge(bridge); + //extend and connect + while(true){ + ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg + ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg + ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT + + ctg.head = headT; + //if (posT == 1){ + if(!firstDir){ + scaffoldT.addFront(ctg,brg); + }else{ + scaffoldT.addRear(ctg,getReversedBridge(brg)); + } + if(count<1) break; + ctg = scaffoldF.remove(--count); + brg = scaffoldF.bridges.remove(count); + + } + if(scaffoldF.closeBridge!=null && !scaffoldF.isEmpty()){ + count = scaffoldF.size()-1; + ctg = scaffoldF.removeLast(); + brg = scaffoldF.closeBridge; + + while(true){ + //ctg.myVector = ScaffoldVector.composition(ScaffoldVector.reverse(scaffoldF.circle),ctg.myVector); + ctg.myVector = scaffoldF.rotate(ctg.myVector, false); + ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg + ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg + ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT + //ctg.composite(ScaffoldVector.reverse(scaffoldF.circle)); //composite co tinh giao hoan k ma de day??? 
+ ctg.head = headT; + //if (posT == 1){ + if(!firstDir){ + scaffoldT.addFront(ctg,brg); + }else{ + scaffoldT.addRear(ctg,getReversedBridge(brg)); + } + if(count<1) break; + brg = scaffoldF.bridges.remove(count--); + ctg = scaffoldF.remove(count); + + } + } + + //set the remaining. + scaffoldT.trim(); + scaffoldF.trim(); + if(!scaffoldF.isEmpty()){ + addScf=scaffoldF.getFirst().getIndex();//getFirst: NoSuchElementException + changeHead(scaffoldF, scaffoldF.getFirst()); + } + } + //now since scaffoldF is empty due to changeHead(), re-initialize it!(do we need this??) + scaffoldF = new Scaffold(contigs.get(headF)); + } + else if(secondDir == 1){ + if(headF==headT){ + //if(posT!=-1) + if(!firstDir) + return false; + else{ + Contig prevMarker = scaffoldF.nearestMarker(contigF, false); + if(prevMarker!=null){ + Contig ctg = scaffoldF.remove(--count); + Scaffold newScf = new Scaffold(ctg); + ContigBridge brg = scaffoldF.bridges.remove(count); + while(true){ + if(count<1) break; + ctg= scaffoldF.remove(--count); + brg = scaffoldF.bridges.remove(count); + newScf.addFront(ctg,brg); + } + newScf.trim(); + changeHead(newScf, prevMarker); + addScf=prevMarker.getIndex(); + } + scaffoldF.setCloseBridge(bridge); + changeHead(scaffoldF, contigF); + + } + }else{ + Contig ctg = scaffoldF.remove(index); + ContigBridge brg = bridge; + //extend and connect + while(true){ + ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg + ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg + ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT + + ctg.head = headT; + //if (posT == 1){ + if(!firstDir){ + scaffoldT.addFront(ctg,getReversedBridge(brg)); + }else{ + scaffoldT.addRear(ctg,brg); + } + if(scaffoldF.size()==index) break; + ctg = scaffoldF.remove(index); + brg = scaffoldF.bridges.remove(index); + } + if(scaffoldF.closeBridge!=null && !scaffoldF.isEmpty()){ + ctg = scaffoldF.removeFirst(); 
+ brg = scaffoldF.closeBridge; + while(true){ + //ctg.myVector = ScaffoldVector.composition(scaffoldF.circle,ctg.myVector); + ctg.myVector = scaffoldF.rotate(ctg.myVector, true); + ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg + ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg + ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT + //ctg.composite(scaffoldF.circle); + ctg.head = headT; + //if (posT == 1){ + if(!firstDir){ + scaffoldT.addFront(ctg,getReversedBridge(brg)); + }else{ + scaffoldT.addRear(ctg,brg); + } + if(scaffoldF.size()<1) break; + brg = scaffoldF.bridges.removeFirst(); + ctg = scaffoldF.removeFirst(); + } + } + //set the remaining + scaffoldT.trim(); + scaffoldF.trim(); + if(!scaffoldF.isEmpty()){ + addScf=scaffoldF.getLast().getIndex(); //getLast: NoSuchElementException + changeHead(scaffoldF, scaffoldF.getLast()); + } + } + //now since scaffoldF is empty due to changeHead(), re-initialize it!(do we need this??) + scaffoldF = new Scaffold(contigs.get(headF)); + } + else + return false; + + //=================================================================================================== + if(verbose){ + System.out.println("After Joining: " + (addScf<0?1:2) + " scaffolds!"); + scaffolds[contigF.head].view(); + if(addScf >=0) + scaffolds[addScf].view(); + } + return true; + } + //change head of scaffold scf to newHead. + //This should move the content of scf to scaffolds[newHead.idx], leaving scf=null afterward + //TODO: tidy this!!! + public void changeHead(Scaffold scf, Contig newHead){ + if(isRepeat(newHead)){ + if(verbose) + System.out.println("Cannot use repeat as a head! 
" + newHead.getName()); + return; + } + //Scaffold scf = scaffolds[scfIndex]; + int scfIndex = scf.scaffoldIndex; + int headPos = scf.indexOf(newHead); + if(headPos < 0){ + if(verbose) + System.out.printf("Cannot find contig %d in scaffold %d\n" , newHead.getIndex(), scfIndex); + return; + } + Scaffold newScf = new Scaffold(newHead.getIndex()); + ScaffoldVector rev = ScaffoldVector.reverse(newHead.getVector()); //rev = newHead->head + + if(newHead.getRelDir() == 0){ + if(verbose) + System.out.printf("Contig %d of scaffold %d got direction 0!\n" , newHead.getIndex(), scfIndex); + return; + } + else if(newHead.getRelDir() > 0){ + while(!scf.isEmpty()) + newScf.add(scf.removeFirst()); + while(!scf.bridges.isEmpty()) + newScf.bridges.add(scf.bridges.removeFirst()); + if(scf.closeBridge != null){ + newScf.closeBridge = scf.closeBridge; + newScf.circle = scf.circle; + //then reset these factors + scf.closeBridge = null; + scf.circle = null; + } + } + else{ + while(!scf.isEmpty()) + newScf.add(scf.removeLast()); + while(!scf.bridges.isEmpty()) + newScf.bridges.add(getReversedBridge(scf.bridges.removeLast())); + if(scf.closeBridge != null){ + newScf.closeBridge = getReversedBridge(scf.closeBridge); + //newScf.circle = ScaffoldVector.reverse(scf.circle); + newScf.circle = scf.circle; //cuz now circle is always positive + + //then reset these factors + scf.closeBridge = null; + scf.circle = null; + } + } + + for (Contig ctg:newScf){ + ctg.composite(rev); // leftmost->head + head->ctg = leftmost->ctg + } + newScf.setHead(newHead.getIndex()); + scaffolds[newHead.getIndex()] = newScf; + + } + public synchronized void printSequences(boolean allOut, boolean isBatch) throws IOException{ + //countOccurence=new HashMap(); + int currentNumberOfContigs = 0, + currentNumberOfCirculars = 0; + + if(annotation){ + SequenceOutputStream aout = SequenceOutputStream.makeOutputStream(prefix+".anno.japsa"); + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + + 
if(select && !contigs.get(i).isMapped()) + continue; + + int len = scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(); + + if(contigs.get(i).head == i ){ + if(!reportAll && isRepeat(contigs.get(i)) && scaffolds[i].closeBridge == null) + continue; + + if(scaffolds[i].closeBridge != null ){ + currentNumberOfCirculars++; + } + currentNumberOfContigs++; + + }else if(reportAll && isRepeat(contigs.get(i)) && needMore(contigs.get(i))){ + currentNumberOfContigs++; + }else{ + continue; + } + + if(verbose) + System.out.println("Scaffold " + i + " estimated length " + len); + if(isBatch) + scaffolds[i].printBatchGenes(); + if(allOut) + scaffolds[i].viewAnnotation(aout); + } + aout.close(); + } else{ + SequenceOutputStream fout = SequenceOutputStream.makeOutputStream(prefix+".fin.fasta"), + jout = SequenceOutputStream.makeOutputStream(prefix+".fin.japsa"); +// for (int i = 0; i < scaffolds.length;i++){ +// if(scaffolds[i].isEmpty()) continue; +// int len = scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(); +// +// if(contigs.get(i).head == i){ +// if(scaffolds[i].closeBridge != null ){ +// currentNumberOfContigs++; +// currentNumberOfCirculars++; +// } +// else if ((!isRepeat(contigs.get(i)) && len > maxRepeatLength) //here are the big ones +// || (reportAll && needMore(contigs.get(i)) && contigs.get(i).coverage > .5*estimatedCov)) //short,repetitive sequences here if required +// currentNumberOfContigs++; +// else +// continue; +// +// if(verbose) +// System.out.println("Scaffold " + i + " estimated length " + len); +// if(allOut) +// scaffolds[i].viewSequence(fout, jout); +// } +// } + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + + if(select && !contigs.get(i).isMapped()) + continue; + + int len = scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(); + + if(contigs.get(i).head == i ){ + if(!reportAll && isRepeat(contigs.get(i)) && scaffolds[i].closeBridge == null) + continue; 
+ + if(scaffolds[i].closeBridge != null ){ + currentNumberOfCirculars++; + } + currentNumberOfContigs++; + + }else if(reportAll && isRepeat(contigs.get(i)) && needMore(contigs.get(i))){ + currentNumberOfContigs++; + }else{ + continue; + } + + if(verbose) + System.out.println("Scaffold " + i + " estimated length " + len); + if(allOut) + scaffolds[i].viewSequence(fout, jout); + } + + fout.close(); + jout.close(); + } + scfNum=currentNumberOfContigs; + cirNum=currentNumberOfCirculars; + } + public synchronized static void oneMore(Contig ctg){ + if(countOccurence.get(ctg.getIndex())==null) + countOccurence.put(ctg.getIndex(), 1); + else + countOccurence.put(ctg.getIndex(), countOccurence.get(ctg.getIndex())+1); + } + + synchronized boolean needMore(Contig ctg) { + Integer count = countOccurence.get(ctg.getIndex()); + if(count==null) return true; + else return false; //if not occurred (Minh) + +// int estimatedOccurence = (int) Math.floor(ctg.coverage/estimatedCov); +// if(estimatedOccurence <= Math.floor(.75*count)) +// return true; +// else +// return false; + } + + public synchronized void printRT(long tpoint) throws IOException{ + for (Contig contig:contigs){ + if(contig.oriRep.size() > 0){ + String fname = contig.getName() + ".rtout"; + File f = new File(fname); + if(!f.exists()) + f.createNewFile(); + + //BufferedWriter out = new BufferedWriter(new FileWriter(f.getPath(), true)); + FileWriter fw = new FileWriter(f,true); + BufferedWriter bw = new BufferedWriter(fw); + PrintWriter pw = new PrintWriter(bw); + + ArrayList ctgList = new ArrayList(), + origList = new ArrayList(), + resList = new ArrayList(), + genesList = new ArrayList(); + + for(Contig ctg:scaffolds[contig.head]){ + ctgList.add(ctg.getName()); + if(ctg.oriRep.size()>0) + for(JapsaFeature ori:ctg.oriRep) + origList.add(ori.getID()); + for (JapsaFeature feature:ctg.genes) + genesList.add(feature.toString()); + for (JapsaFeature feature:ctg.resistanceGenes) + resList.add(feature.toString()); + } + float 
streamData=tpoint/1000000; + pw.print(">"); + for(String ctg:ctgList) + pw.printf("%s\t", ctg); + + pw.printf("\n>%.2fMpb\t%d genes\t", streamData, genesList.size()); + + for(String ori:origList) + pw.printf("+%s", ori); + + for(String genes:genesList) + pw.print(" \n\t"+genes); + pw.println(""); + + for(String res:resList) + pw.print(" \n\t"+res); + pw.println(""); + + pw.close(); + + } + } + + + } + + + // To check if this contig is likely a repeat or a singleton. If FALSE: able to be used as a marker. + public static boolean isRepeat(Contig ctg){ + //for the case when no coverage information of contigs is found + if(estimatedCov == 1.0 && ctg.getCoverage() == 1.0){ + if(ctg.length() > maxRepeatLength) + return false; + else + return true; + } + + if (ctg.length() < minContigLength || ctg.getCoverage() < .3 * estimatedCov) return true; + else if (ctg.length() > maxRepeatLength || ctg.getCoverage() < 1.3 * estimatedCov) + return false; + else if (ctg.getCoverage() > 1.5 * estimatedCov) + return true; + else{ + for(ContigBridge bridge:getListOfBridgesFromContig(ctg)){ + Contig other = bridge.firstContig.getIndex()==ctg.getIndex()?bridge.secondContig:bridge.firstContig; + if(other.getIndex()==ctg.getIndex()) continue; + int dist=bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); + if( dist<0 && dist>-ctg.length()*.25){ + if(other.length() > maxRepeatLength || other.getCoverage() < 1.3*estimatedCov) + return true; + } + } + + } + if(ctg.length() < 2*minContigLength) // further filter: maybe not repeat but insignificant contig + return true; + else + return false; + } + + abstract public void connectBridges(); + + public int getNumberOfContigs(){ + return scfNum; + } + public int getNumberOfCirculars(){ + return cirNum; + } + +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraphDFS.java b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraphDFS.java new file mode 100644 index 0000000..ac5a35a --- 
/dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldGraphDFS.java @@ -0,0 +1,527 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 3 Jan 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.bio.hts.scaffold; + +import japsa.seq.JapsaFeature; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author minhduc&sonnguyen + * + */ +public class ScaffoldGraphDFS extends ScaffoldGraph { + /** + * @param sequenceFile + * @throws IOException + */ + public ScaffoldGraphDFS(String sequenceFile, String genesFile, String resistFile, String isFile, String oriFile) throws IOException, InterruptedException { + super(sequenceFile); + if(resistFile != null){ + readDb(resistFile, "Resistance genes", .9, 1.0); + annotation = true; + } + if(isFile != null){ + readDb(isFile, "Insertion sites", .8, .9); + annotation = true; + } + if(oriFile != null){ + readDb(oriFile, "Origin of replication", .8, .9); + annotation = true; + } + if(genesFile != null){ + readGFF(genesFile); + annotation = true; + } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.genes) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.oriRep) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.insertSeq) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } + } + + private void readDb(String data, String type, double minCov, double minID) throws IOException, InterruptedException{ + type = 
type.toLowerCase(); + + String blastn = "blastn"; + + ProcessBuilder pb = new ProcessBuilder(blastn, + "-subject", + "-", + "-query", + data, + "-outfmt", + "7 qseqid qlen qstart qend sseqid slen sstart send length frames pident nident gaps mismatch score bitscore sstrand"); + ///// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + + //6 qseqid qlen length pident nident gaps mismatch + // 0 1 2 3 4 5 6 + Process process = pb.start(); + //Pass on the genome to blastn + SequenceOutputStream out = new SequenceOutputStream(process.getOutputStream()); + for (Contig ctg:contigs){ + Sequence seq=ctg.contigSequence; + seq.writeFasta(out); + } + out.close(); + + //Read the output of blastn + BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line; + + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) + continue; + + String [] toks = line.trim().split("\t"); + int length = Integer.parseInt(toks[8]); + int qlen = Integer.parseInt(toks[1]); + double cov = (float)length/qlen; + if (minCov > cov){ + continue; + } + + if (Double.parseDouble(toks[10]) < minID * 100){ + continue; + } + //pass + + Contig ctg = getSPadesContig(toks[4]); + if(ctg != null){ + char strand = toks[16].equals("plus")?'+':'-'; + JapsaFeature feature = new JapsaFeature(Integer.parseInt(toks[6]), Integer.parseInt(toks[7]), type, toks[0], strand, ctg.getName()); + + feature.addDesc(toks[0]+ ":" + (int)(cov*100) + "% cover, " + toks[10] + "% identity"); + + switch (type.toLowerCase()){ + case "resistance genes": + ctg.resistanceGenes.add(feature); + break; + case "insertion sites": + ctg.insertSeq.add(feature); + break; + case "origin of replication": + ctg.oriRep.add(feature); + break; + default: + System.err.println(type + " has not yet included in our analysis!"); + break; + + } + + } + Collections.sort(ctg.resistanceGenes); + Collections.sort(ctg.insertSeq); + Collections.sort(ctg.oriRep); + } + br.close(); + //process.waitFor();//Do i need this??? 
+ } + private void readGFF(String fileName) throws IOException, InterruptedException{ + BufferedReader br = new BufferedReader(new FileReader(fileName)); + String line; + + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) + continue; + if (line.startsWith(">")) + break; + String [] toks = line.trim().split("\t"); + Contig ctg = getSPadesContig(toks[0]); //get the contig from its shorten name + if(ctg != null){ + int start = Integer.parseInt(toks[3]), + end = Integer.parseInt(toks[4]); + String [] des = toks[8].trim().split(";"); + String [] id = des[0].trim().split("="); + String ID = "undefined"; + if(id[0].equals("ID")) + ID = id[1]; + if(!toks[2].equals("gene")){ + JapsaFeature feature = new JapsaFeature(start, end, toks[2], ID, toks[6].charAt(0), ctg.getName()); + feature.addDesc(toks[8]); + ctg.genes.add(feature); + } + } + } + br.close(); + } + public ScaffoldGraphDFS(String sequenceFile, String graphFile) throws IOException { + super(sequenceFile); + //TODO: implement fastg reader for SequenceReader to have pre-assembled bridges + + } + + /* + * Arrange contigs + * + */ + @Override + public synchronized void connectBridges(){ +// System.out.println("List of all bridges: "); +// for(Contig ctg:contigs){ +// ArrayList brgList = getListOfBridgesFromContig(ctg); +// System.out.print(ctg.getName() + " roots for " + brgList.size() + " bridges: "); +// for(ContigBridge brg:brgList) +// System.out.print(brg.hashKey + " ; "); +// System.out.println(); +// } + // Start scaffolding + if(verbose) { + System.out.println("Starting scaffolding......."); + + } + + List list = new ArrayList(); + for(int i = 0; i 1) + System.out.printf("Finally, scaffold %d size %d and is %s\n", + i, + scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(), + closed?"circular":"linear"); + }//for + + } + + + /* To check the agreement of locations between 3 contigs: marker -> prevContig -> curContig + * (also their corresponding scores?). 
+ * Purpose: avoid false positive alignments. + * + */ + private ContigBridge checkHang(Contig marker, ContigBridge toPrev, ContigBridge toCurrent){ + Contig prevContig = toPrev.secondContig, + curContig = toCurrent.secondContig; + ScaffoldVector mark2Prev = toPrev.getTransVector(), + mark2Cur = toCurrent.getTransVector(); + ScaffoldVector prevToCur = ScaffoldVector.composition(mark2Cur,ScaffoldVector.reverse(mark2Prev)); + if(verbose) + System.out.printf("\texamining %s to %s:\n ",prevContig.getName(),curContig.getName()); + for(ContigBridge brg:getListOfBridgesFromContig(prevContig)){ + if(brg.secondContig.getIndex() == curContig.getIndex()){ + if(brg.consistentWith(prevToCur)){ + if(verbose) + System.out.printf("=> consistent between bridge vector %s and estimated vector %s\n",brg.getTransVector(),prevToCur); + return brg; + } + else{ + if(verbose) + System.out.printf("=> inconsistent between bridge vector %s and estimated vector %s\n",brg.getTransVector(),prevToCur); + } + } + + } + + return null; + } + + /* + * Walking through markers while trying to fill the gap by repeat sequences simultaneously + */ + private boolean walk2(int i, boolean direction ){ + Scaffold scaffold = scaffolds[i]; + boolean extended = true; + boolean closed = scaffold.closeBridge!=null; + + /*****************************************************************/ + while (extended && (!closed) && scaffold.size() > 0){ + Contig ctg = direction?scaffold.getLast():scaffold.getFirst(); + ArrayList bridges=getListOfBridgesFromContig(ctg); + + if(verbose) { + System.out.printf(" Last of scaffold %d extention is on contig %d (%s): ",i,ctg.getIndex(),ctg.getName()); + System.out.printf("iterating among %d bridges\n",bridges.size()); + } + int ctgEnd = direction?ctg.rightMost():ctg.leftMost(); + + extended = false; //only continue the while loop if extension is on the move (line 122) + int maxLink = bridges.size(), + extendDir = 0, //direction to go on the second scaffold: ScaffoldT (realtime mode) + 
curStep = Integer.MAX_VALUE; //distance between singleton1 -> singleton2 + double curScore = 0.0; //score between singleton1 -> singleton2 + ContigBridge stepBridge = null; + + ArrayList extendableContig = new ArrayList(maxLink); + ArrayList extendableContigBridge = new ArrayList(maxLink); + ArrayList extendableVector = new ArrayList(maxLink); + ArrayList distances = new ArrayList(maxLink); + Collections.sort(bridges); + for (ContigBridge bridge:bridges){ + if (bridge.firstContig == bridge.secondContig) //2 identical markers ??! + if(!bridge.firstContig.isCircular()) + continue; + Contig nextContig = bridge.secondContig; + ScaffoldVector trans = bridge.getTransVector(); +// if (ctg == bridge.secondContig){ +// nextContig = bridge.firstContig; +// trans = ScaffoldVector.reverse(trans); +// } + if(verbose) + System.out.println("..." + nextContig.getName()); + ScaffoldVector trialTrans = ScaffoldVector.composition(trans, ctg.getVector()); + int newEnd = direction?nextContig.rightMost(trialTrans):nextContig.leftMost(trialTrans); + + //see if the next contig would extend the scaffold to the right + //only take one next singleton (with highest score possible sorted) as the marker for the next extension + int distance = bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); + if (direction?(newEnd > ctgEnd):(newEnd < ctgEnd)){ + if(!isRepeat(nextContig) || (ctg.isCircular() && ctg.getIndex() == nextContig.getIndex())){ + //check quality of the bridge connected 2 markers + int aDir = 0; + if(scaffolds[nextContig.head].size() > 1){ + aDir = extendDirection(ctg, bridge); + if(aDir==0){ + if(verbose) + System.out.println("No jump to " + nextContig.getName()); + continue; + } + } + if(distance > -maxRepeatLength && bridge.getNumOfConnections() >= minSupportReads && bridge.getScore() > curScore ){ +// if(verbose) +// bridge.display(); + curStep = distance; + curScore = bridge.getScore(); + stepBridge = bridge; + extendDir = aDir; + }else{ + if(verbose) + 
System.out.printf("Cannot form unique bridge from %d to %d with %d connections and score %.2f\n", + ctg.getIndex(), nextContig.getIndex(), bridge.getNumOfConnections(), bridge.getScore()); + continue; + } + } + + if(verbose) + System.out.printf(" Might extend %d from %d(%d) to %d(%d) (%s direction) with score %f and distance %d\n" + ,i,ctg.index, ctgEnd, nextContig.index, newEnd, + (bridge.getTransVector().getDirection() > 0?"same":"opposite"), bridge.getScore(), distance); + + int j = 0; + //looking for right position to have the list sorted + for(j=0; j distance) + break; + + distances.add(j,distance); + extendableContig.add(j, nextContig); + extendableContigBridge.add(j, bridge); + extendableVector.add(j, trialTrans); + } + else if(verbose) + System.out.printf(" No extend %d from %d(%d) to %d(%d) with score %f and distance %d\n",i,ctg.index, ctgEnd, nextContig.index, newEnd,bridge.getScore(), distance); + + }//for + int noOfUniqueContig = 0; //reset to count how many singleton will be added now + + if(stepBridge==null){ + if(verbose) + System.out.printf(" Extension of Scaffold %d toward stopped at %d due to the lack of next marker!\n", i,ctg.index); + return false; + } + int curEnd = ctgEnd; + Contig prevContig = ctg; + ContigBridge prevContigBridge = null; + ScaffoldVector prevVector = new ScaffoldVector(); + for(int index = 0; index < extendableContig.size(); index++){ + if(distances.get(index) > curStep) + continue; + Contig curContig = extendableContig.get(index); + ContigBridge curContigBridge = extendableContigBridge.get(index); //will be replaced by the bridge to prev contig later + ScaffoldVector curVector = extendableVector.get(index); + if(verbose) + System.out.println("Checking contig " + curContig.getName() + "..."); + if( isRepeat(curContig) && !curContig.isCircular()) + if(checkHang(ctg, curContigBridge, stepBridge)==null) + continue; + prevVector = prevContig.getVector(); + boolean extendable = false; + ScaffoldVector prevToCur = 
ScaffoldVector.composition(curVector,ScaffoldVector.reverse(prevVector)); + ContigBridge confirmedBridge; + //TODO: if this happen with singleton -> chimeric happen (unique+repeat=contig) need to do smt... + if( isRepeat(curContig) && + (direction?(curContig.rightMost(curVector) < curEnd):(curContig.leftMost(curVector)) > curEnd)){ + if(verbose) + System.out.println(curContig.getName() + " is ignored because current end " + curEnd + + " cover the new end " + (direction?curContig.rightMost(curVector): curContig.leftMost(curVector))); + continue; + } + else{ + if(index >= 1 && prevContigBridge != null){ + confirmedBridge = checkHang(ctg, prevContigBridge, curContigBridge); + if(confirmedBridge != null){ + prevContigBridge = curContigBridge; + prevToCur =confirmedBridge.getTransVector(); + extendable = true; + } + else + continue; + } + else{ + prevContigBridge = curContigBridge; + confirmedBridge = curContigBridge; + extendable = true; + } + } + if(extendable){ + // if extension is circularized + if(curContig.getIndex() == (direction?scaffold.getFirst().getIndex():scaffold.getLast().getIndex()) + && (!isRepeat(curContig) || curContig.isCircular()) + ){ + if(verbose) + System.out.printf(" *****************SCAFFOLD %d CLOSED AFTER CONNECT %d ***********************\n", i,curContig.index); + scaffold.setCloseBridge(direction?confirmedBridge:getReversedBridge(confirmedBridge)); + curContigBridge.setContigScores(); + return true; + } + + if(isRepeat(curContig)){ + curContig.head = i; //must be here! 
+ curContig = curContig.clone(); + }else{ + //check to join 2 scaffolds and stop this round + if (scaffolds[curContig.head].size() > 1){ + if(!joinScaffold(prevContig,confirmedBridge,direction,extendDir)){ + if(verbose) + System.out.printf(" Skip to connect contig %d of %d to contig %d of %d\n", ctg.index,i,curContig.index, curContig.head); + continue; + } + else{ + curContigBridge.setContigScores(); + noOfUniqueContig++; + break; + } + + } + curContig.head = i; //must be here! + noOfUniqueContig++; + curContigBridge.setContigScores(); + } + + if(verbose) { + System.out.printf(" Extend %d from %d(%d) to %d(%d) with score %f: ", + i,ctg.index, ctgEnd, curContig.index, curContig.rightMost(curVector), curContigBridge.getScore()); + System.out.printf(" curContigBridge %d -> %d\n", confirmedBridge.firstContig.getIndex(), curContigBridge.secondContig.getIndex()); + } + curContig.myVector = ScaffoldVector.composition(prevToCur,prevContig.getVector());//from the head contig + //confirmedBridge=confirmedBridge.clone(prevContig,curContig); + + if(direction) + scaffolds[i].addRear(curContig, confirmedBridge); + else + scaffolds[i].addFront(curContig, getReversedBridge(confirmedBridge)); + + + curEnd = direction?curContig.rightMost(curVector):curContig.leftMost(curVector); + extended = true; //scaffold extension is really on the move... 
+ + prevContig = curContig; + + if(verbose) + scaffolds[i].view(); + + } + + if(distances.get(index) == curStep) + break; + }//for + if(noOfUniqueContig < 1){ + if(verbose) + System.out.printf(" Extension of Scaffold %d toward stopped at %d because next marker is not reachable!\n", i,ctg.index); + scaffolds[i].trim(); + return false; + } + // TO THE NEXT UNIQUE CONTIG + }//while + return closed; + } + + + class LengthIndex implements Comparable{ + int length, index; + public LengthIndex(int len, int index){ + this.length = len; + this.index = index; + } + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(LengthIndex o) { + return (int) (o.length - length); + + } + } +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldVector.java b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldVector.java new file mode 100644 index 0000000..b2fb64a --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/ScaffoldVector.java @@ -0,0 +1,135 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 20/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.hts.scaffold; + + +/** + * Implementation of a vector of relative position of a contig in its scaffold + * @author minhduc + * + */ +public class ScaffoldVector{ + int magnitude = 0; //distance of two contigs' starting points *---> <---* (with sign follows the first contig's direction) + int direction = 1; //relative direction of those two (+/- for same/opposite directions) + + public ScaffoldVector(){ + //magnitude = 0 + //direction = 1; + } + public ScaffoldVector(int p, int d){ + magnitude = p; + direction = d; + } + // reverse vector a -> b to b -> a (a and b are contigs) + public static ScaffoldVector reverse(ScaffoldVector v){ + ScaffoldVector rev = new ScaffoldVector(); + if (v.direction > 0){ + rev.direction = 1; + rev.magnitude = - v.magnitude; + }else{ + rev.direction = -1; + rev.magnitude = v.magnitude; + } + return rev; + } + + /** + * Return 
the distance between two closest tips of two contigs + * that relative position of fContig with regard to tContig (tContig->fContig) is represented by *this* + * A negative value indicate the overlap of two contigs. + * @param fContig + * @param tContig + * @return + */ + + public int distance(Contig tContig, Contig fContig){ + + int tS = 0, tE = tContig.length(), + fS, fE; + + if (direction > 0){ + fS = magnitude; + fE = magnitude + fContig.length(); + }else{ + fE = magnitude; + fS = magnitude - fContig.length(); + } + //System.out.printf("tS=%d tE=%d fS=%d fE=%d fS-tE=%d tS-fE = %d ret=%d\n",tS, tE, fE, fE, fS-tE,tS-fE,Math.max(fS - tE, tS - fE)); + + //FIXME: not handle the case that contig A contain contigB and via verse + return Math.max(fS - tE, tS - fE); + } + + /** + * Compose two vectors: a -> b is v2, b -> c is v1. returned a -> c is v1 * v2 + * Warning: the parameters' order doesn't follow normal intuition. USE WITH CARE!!! + * @param v1 + * @param v2 + * @return + */ + public static ScaffoldVector composition(ScaffoldVector v1, ScaffoldVector v2){ + ScaffoldVector ret = new ScaffoldVector(); + + ret.magnitude = v2.magnitude + v2.direction * v1.magnitude; + ret.direction = v1.direction * v2.direction; + + return ret; + } + + public String toString(){ + return "<" + magnitude + ", " + direction + ">"; + } + /** + * @return the magnitude + */ + public int getMagnitute() { + return magnitude; + } + /** + * @return the direction + */ + public int getDirection() { + return direction; + } + + /** + * Set the new magnitude + * @param magnitude + */ + public void setMagnitute(int magnitude){ + this.magnitude=magnitude; + + } +} \ No newline at end of file diff --git a/src/dev/java/japsadev/bio/hts/scaffold/StringHelper.java b/src/dev/java/japsadev/bio/hts/scaffold/StringHelper.java new file mode 100644 index 0000000..4c77080 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/StringHelper.java @@ -0,0 +1,174 @@ +package japsadev.bio.hts.scaffold; + 
+import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + +public class StringHelper { + + /* + * @param String, Graph: name of the FASTG file and the Graph to build + * @return int: shortest contig length in the FASTG file, used for k-mer guess + */ + public static int buildGraphFromFastg(String graphFile, Graph g) throws IOException{ + int shortestLen = Integer.MAX_VALUE; + SequenceReader reader = new FastaReader(graphFile); + Sequence seq; + + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + if(seq.length() 1){ + String[] nbList = adjList[1].split(","); + for(int i=0; i < nbList.length; i++){ + // create list of bridges here (distance=-kmer overlapped) + String neighbor = nbList[i]; + boolean dir2=neighbor.contains("'")?false:true; + neighbor=neighbor.replaceAll("[^a-zA-Z0-9_.]", "").trim(); + + Vertex nbVertex=new Vertex(neighbor); + if(g.getVertex(nbVertex.getLabel())!=null) + nbVertex=g.getVertex(nbVertex.getLabel()); + + g.addVertex(nbVertex, false); + + g.addEdge(current, nbVertex, dir1, dir2); + } + } + + } + reader.close(); + + return shortestLen; + } + /* + * @param Path, String: path to be built from a string representing paths from File contigs.path + * in SPAdes output directory. + * + */ + public static void addPathFromSPAdes(Path p, String paths){ + Graph graph = p.graph; + paths=paths.replace(";", ""); //optimized it! + String[] comps = paths.split(","); + for(int i=0; i")){ + String node = toks[0].replaceAll("\"", ""); + String vertexID = node.substring(0, node.length()-1); + graph.addVertex(new Vertex(vertexID), false); + } + else{ + if(toks.length < 3) + continue; //smt wrong actually! 
+ //get rid of quote characters + String source = toks[0].replaceAll("\"", ""), + dest = toks[2].replaceAll("\"", ""); + boolean sourceDir = source.contains("+")?true:false, + destDir = dest.contains("+")?true:false; + String sourceID = source.substring(0, source.length()-1), + destID = dest.substring(0, dest.length()-1); + Vertex sourceVertex = graph.getVertex(sourceID), + destVertex = graph.getVertex(destID); + +// if(sourceVertex==null){ +// sourceVertex = new Vertex(sourceID); +// graph.addVertex(sourceVertex, false); +// } +// if(destVertex==null){ +// destVertex = new Vertex(destID); +// graph.addVertex(destVertex, false); +// } + + if(toks.length > 3) //distance available + graph.addEdge(sourceVertex, destVertex, sourceDir, destDir, getDistanceFromDotFileBlock(toks[3])); + else + graph.addEdge(sourceVertex, destVertex, sourceDir, destDir); + + } + } + } + + br.close(); + } + /* + * @param String [d=-%d] + * @return overlap distance, must be negative + */ + static int getDistanceFromDotFileBlock(String block){ + int d = 0; + String pattern = "^\\[d=([-]?[0-9]*)\\]$"; + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(block); + if(m.find()){ + //System.out.println("Pattern matched: " + m.group(1)); + d=Integer.parseInt(m.group(1)); + //do smt + } else{ + System.err.println("Not a legal block for distance!"); + } + return d; + } + + public static void main(String[] args) { + // TODO Auto-generated method stub + String blk="[d=-1234]"; + System.out.println(getDistanceFromDotFileBlock(blk)); + } + +} diff --git a/src/dev/java/japsadev/bio/hts/scaffold/Vertex.java b/src/dev/java/japsadev/bio/hts/scaffold/Vertex.java new file mode 100644 index 0000000..5e33b94 --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/scaffold/Vertex.java @@ -0,0 +1,179 @@ +package japsadev.bio.hts.scaffold; + +import java.util.ArrayList; + +import japsadev.bio.hts.scaffold.Edge; +import japsadev.bio.hts.scaffold.Vertex; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + 
+/** + * This class models a vertex for my string graph which actually corresponds to an edge in SPAdes's assembly graph. + * Label for this vertex is extracted from its full name and used as its index. + * For example, vertex named 'EDGE_1_length_1000_cov_50' is labeled as vertex 1. + * This vertex's neighborhood is described by the Edges incident to it. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Vertex { + + private ArrayList neighborhood; + private String fullName, label; + private Sequence seq=null; + + /** + * + * @param label The unique label associated with this Vertex + */ + public Vertex(String name){ + this.fullName=name; + this.label=getID(name); + this.neighborhood = new ArrayList(); + this.seq=new Sequence(Alphabet.DNA5(), 0); + } + + public Vertex(String name, Sequence seq){ + this(name); + this.seq = seq; + + } + /** + * + * @param name The name of Edge in assembly graph that correspond to this Vertex + */ + private String getID(String name){ + String[] toks = name.split("_"); + if(toks.length > 1)//SPAdes + return toks[1]; + else + return name; + } + /** + * This method adds an Edge to the incidence neighborhood of this graph iff + * the edge is not already present. 
+ * + * @param edge The edge to add + */ + public void addNeighbor(Edge edge){ + if(this.neighborhood.contains(edge)){ + return; + } + this.neighborhood.add(edge); + } + + + /** + * + * @param other The edge for which to search + * @return true iff other is contained in this.neighborhood + */ + public boolean containsNeighbor(Edge other){ + return this.neighborhood.contains(other); + } + + /** + * + * @param index The index of the Edge to retrieve + * @return Edge The Edge at the specified index in this.neighborhood + */ + public Edge getNeighbor(int index){ + return this.neighborhood.get(index); + } + + + /** + * + * @param index The index of the edge to remove from this.neighborhood + * @return Edge The removed Edge + */ + Edge removeNeighbor(int index){ + return this.neighborhood.remove(index); + } + + /** + * + * @param e The Edge to remove from this.neighborhood + */ + public void removeNeighbor(Edge e){ + this.neighborhood.remove(e); + } + + + /** + * + * @return int The number of neighbors of this Vertex + */ + public int getNeighborCount(){ + return this.neighborhood.size(); + } + /** + * + * @return String The label of this Vertex + */ + public String getLabel(){ + return this.label; + } + /** + * + * @return String The full name of this Vertex + */ + public String getName(){ + return this.fullName; + } + /** + * + * @param Sequence A sequence + */ + public void setSequence(Sequence seq){ + this.seq = seq; + } + /** + * + * @return Sequence The sequence of this Vertex + */ + public Sequence getSequence(){ + return this.seq; + } + /** + * + * @return String A String representation of this Vertex + */ + public String toString(){ + return "Vertex " + label; + } + + /** + * + * @return The hash code of this Vertex's label + */ + public int hashCode(){ + return this.label.hashCode(); + } + + /** + * + * @param other The object to compare + * @return true iff other instanceof Vertex and the two Vertex objects have the same label + */ + public boolean 
equals(Object other){ + if(!(other instanceof Vertex)){ + return false; + } + + Vertex v = (Vertex)other; + return this.label.equals(v.label); + } + + /** + * + * @return ArrayList A copy of this.neighborhood. Modifying the returned + * ArrayList will not affect the neighborhood of this Vertex + */ + public ArrayList getNeighbors(){ + return new ArrayList(this.neighborhood); + } + +} + + diff --git a/src/dev/java/japsadev/bio/np/phage/CDHitExtract.java b/src/dev/java/japsadev/bio/np/phage/CDHitExtract.java new file mode 100644 index 0000000..4203dd8 --- /dev/null +++ b/src/dev/java/japsadev/bio/np/phage/CDHitExtract.java @@ -0,0 +1,80 @@ +package japsadev.bio.np.phage; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.HashMap; + +import japsa.bio.np.ErrorCorrection; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; + +public class CDHitExtract { + + public static void main(String[] args) throws IOException, InterruptedException { + // TODO Auto-generated method stub + ArrayList representSeq = SequenceReader.readAll("/home/sonhoanghguyen/Projects/Phage/paper/poa-consensus/allNanopore.fasta", Alphabet.DNA()); + HashMap map = new HashMap(); + for(Sequence seq:representSeq){ + map.put(seq.getName(), seq); + } + String aligner = "clustal"; + BufferedReader pathReader = new BufferedReader(new FileReader("/home/sonhoanghguyen/Projects/Phage/paper/poa-consensus/nanopore.clstr")); + SequenceOutputStream out = SequenceOutputStream.makeOutputStream("/home/sonhoanghguyen/Projects/Phage/paper/poa-consensus/nanopore.fasta"); + String s; + //Read contigs from contigs.paths and refer themselves to contigs.fasta + Sequence consensus = new Sequence( Alphabet.DNA(), 10000); + int count = 0; + String seq = ""; + ArrayList aGroup=new ArrayList();; + while((s=pathReader.readLine()) != null){ 
+ if(s.startsWith(">")){ + if(count>0){ + //group=map.get(seq); + if(count > 1){ + System.out.println("Consensusing group with " + aGroup.size() + " members"); + consensus = ErrorCorrection.consensusSequence(aGroup, "grouping", aligner); + } + else + consensus = map.get(seq); + consensus.setName(seq); // name of the CDHit representative sequence, but content is the consensus + consensus.setDesc(aligner+"="+count); + //System.out.println(group.getName() + " : " + group.getDesc()); + consensus.writeFasta(out); + } + aGroup = new ArrayList(); + count=0; + }else{ + count++; + aGroup.add(map.get(s.substring(s.indexOf(">")+1, s.indexOf("...")))); + if(s.contains("*")){ + seq = s.substring(s.indexOf(">")+1, s.indexOf("...")); + + } + } + + } + //last round + if(count>0){ + //group=map.get(seq); + if(count > 1){ + System.out.println("Consensusing group with " + aGroup.size() + " members"); + consensus = ErrorCorrection.consensusSequence(aGroup, "grouping", aligner); + } + else + consensus = map.get(seq); + consensus.setName(seq); // name of the CDHit representative sequence, but content is the consensus + consensus.setDesc(aligner+"="+count); + //System.out.println(group.getName() + " : " + group.getDesc()); + consensus.writeFasta(out); + } + + pathReader.close(); + out.close(); + } + +} diff --git a/src/dev/java/japsadev/bio/np/phage/CoverTree.java b/src/dev/java/japsadev/bio/np/phage/CoverTree.java new file mode 100644 index 0000000..e77b3b0 --- /dev/null +++ b/src/dev/java/japsadev/bio/np/phage/CoverTree.java @@ -0,0 +1,495 @@ +package japsadev.bio.np.phage; + +/** + * This class provides a Java version of the cover tree nearest neighbor algorithm. + * It is based on Thomas Kollar's version of "Cover Trees for Nearest Neighbor" by + * Langford, Kakade, Beygelzimer (2007). The original algorithm is extended towards + * selecting K centers which are maximally different from one another from an online sample. 
+ * + * Date of creation: 2013-02-08 + * Copyright (c) 2013, Nils Loehndorf + * + * The software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it freely. + * + * @author Nils Loehndorf + * + */ + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +public class CoverTree { + + int maxLevel; + int minLevel; + double base; + int maxMinLevel; + Node rootNode; + boolean hasBounds; + double[] min; + double[] max; + int[] numLevels; + int maxNumLevels = 500; + int minNumLevels = -500; + + /** + * Create a cover tree at level zero which automatically expands above and below. + */ + public CoverTree () { + this.maxMinLevel = Integer.MIN_VALUE; + this.numLevels = new int[maxNumLevels-minNumLevels]; + this.base = 1.2; + } + + /** + * Create a cover tree which stops increasing the minimumLevel as soon as the given number of nodes is reached. + */ + public CoverTree (double base, int maxMinLevel) { + this.base = base; + this.maxMinLevel = maxMinLevel; + if (maxMinLevel>0) { + this.maxLevel = maxMinLevel; + this.minLevel = maxMinLevel; + } + this.numLevels = new int[maxNumLevels-minNumLevels]; + } + + /** + * Set the minimum levels of the cover tree by defining the maximum exponent of the base (default = 500). + */ + public void setMaxNumLevels (int max) { + maxNumLevels = max; + } + + /** + * Set the minimum levels of the cover tree by defining the minimum exponent of the base (default = -500). + */ + public void setMinNumLevels (int min) { + minNumLevels = min; + } + + /** + * Points outside of the bounding box, will not be included. This allows for easy truncation. 
+ * @param min + * @param max + */ + public void setBounds(double[] min, double[] max) { + hasBounds = true; + this.min = min; + this.max = max; + } + + /** + * Returns the maximum level of this tree. + * @return + */ + public int maxLevel() { + return maxLevel; + } + + /** + * Returns the minimum level of this tree. + * @return + */ + public int minLevel() { + return minLevel; + } + + void incNodes(int level) { + numLevels[level-minNumLevels]++; + } + + void decNodes(int level) { + numLevels[level-minNumLevels]--; + } + + /** + * Returns the size of the cover tree up to the given level (inclusive) + * @param level + * @return + */ + public int size(int level) { + int sum = 0; + for (int i=maxLevel; i>=level; i--) + sum += numLevels[i-minNumLevels]; + return sum; + } + + /** + * Returns the size of the cover tree + * @return + */ + public int size() { + return size(minLevel); + } + + void insertAtRoot(E element, double[] point) { + //inserts the point above the root by successively increaasing the cover of the root node until it + //contains the new point, the old root is added as child of the new root + Node oldRoot = rootNode; + double dist = oldRoot.distance(point); + while (dist > Math.pow(base,maxLevel)) { + Node newRoot = new Node(null,rootNode.element,rootNode.point); + rootNode.setParent(newRoot); + newRoot.addChild(rootNode); + rootNode = newRoot; + decNodes(maxLevel); + incNodes(++maxLevel); + } + Node newNode = new Node(rootNode,element,point); + rootNode.addChild(newNode); + incNodes(maxLevel-1); + } + + /** + * Insert a point into the tree. If the tree size is greater than k the lowest cover will be removed as long as it does not decrease tree size below k. 
+ * @param point + */ + public boolean insert(E element, double[] point, int k) { + boolean inserted = insert(element,point); + //only do this if there are more than two levels + if (maxLevel-minLevel>2) { + //remove lowest cover if the cover before has a sufficient number of nodes + if (size(minLevel+1)>=k) { + removeLowestCover(); + //do not accept new nodes at the minimum level + maxMinLevel = minLevel+1; + } + //remove redundant nodes from the minimum level + if (size(minLevel)>=2*k) { + removeNodes(k); + } + } + return inserted; + } + + /** + * Insert a point into the tree. + * @param point + */ + public boolean insert(E element, double[] point) { + if (hasBounds) { + //points outside of the bounding box will not be added to the tree + for (int d=0; dmax[d]) + return false; + if (point[d](null,element,point); + incNodes(maxLevel); + return true; + } + //do not add if the new node is identical to the root node + rootNode.distance = rootNode.distance(point); + if (rootNode.distance == 0.) + return false; + //if the node lies outside the cover of the root node and its decendants then insert the node above the root node + if (rootNode.distance > Math.pow(base,maxLevel+1)) { + insertAtRoot(element,point); + return true; + } + //usually insertion begins here + List> coverset = new LinkedList>(); + //the initial coverset contains only the root node + coverset.add(rootNode); + int level = maxLevel; + Node parent = null; //the root node does not have a parent + int parentLevel = maxLevel; + while (true) { + boolean parentFound = true; + List> candidates = new LinkedList>(); + for (Node n1 : coverset) { + for (Node n2 : n1.getChildren()) { + if (n1.point!=n2.point) { + //do not compute distance twice + n2.distance = n2.distance(point) ; + //do not add if node is already contained in the tree + if (n2.distance == 0.) 
+ return false; + } + else + n2.distance = n1.distance; + if (n2.distance < Math.pow(base,level)) { + candidates.add(n2); + parentFound = false; + } + } + } + //if the children of the coverset are further away the 2^level then an element of the + //coverset is the parent of the new node + if (parentFound) + break; + //select one node of the coverset as the parent of the node + for (Node n : coverset) { + if (n.distance < Math.pow(base,level)) { + parent = n; + parentLevel = level; + break; + } + } + //set all nodes as the new coverset + level--; + coverset = candidates; + } + //if the point is a sibling of the root node, then the cover of the root node is increased + if (parent == null) { + insertAtRoot(element,point); + return true; + } + if (parentLevel-1 < minLevel) { + //if the maximum size is reached and this would only increase the depth of the tree then stop + if (parentLevel-1 < maxMinLevel) + return false; + minLevel = parentLevel-1; + } + //otherwise add child to the tree + Node newNode = new Node(parent,element,point); + parent.addChild(newNode); + //record distance to parent node and add to the sorted set of nodes where distance is used for sorting (needed for removal) + incNodes(parentLevel-1); + return true; + } + + /** + * Removes the the cover at the lowest level of the tree. + */ + void removeLowestCover() { + List> coverset = new LinkedList>(); + coverset.add(rootNode); + int k = maxLevel; + while(k-- > minLevel+1){ + List> nextCoverset = new LinkedList>(); + for (Node n : coverset) + nextCoverset.addAll(n.getChildren()); + coverset = nextCoverset; + } + for (Node n : coverset) + n.removeChildren(); + + minLevel++; + } + + + /** + * Removes all but k points. 
+ */ + List> removeNodes(int numCenters) { + List> coverset = new LinkedList>(); + coverset.add(rootNode); + int k = maxLevel; + while(k-- > minLevel+1){ + List> nextCoverset = new LinkedList>(); + for (Node n : coverset) + nextCoverset.addAll(n.getChildren()); + coverset = nextCoverset; + } + int missing = numCenters-coverset.size(); + if (missing < 0) + System.err.println("Error: negative missing="+missing+" in coverset"); + //sucessively pick the node with the largest distance to the coverset and add it to the coverset + LinkedList> candidates = new LinkedList>(); + for (Node n : coverset) + for (Node n2 : n.getChildren()) + if (n.point!=n2.point) + candidates.add(n2); + //only add candidates when the coverset is yet smaller then the number of desired centers + if (coverset.size() n1 : candidates) { + double minDist = Double.POSITIVE_INFINITY; + for (Node n2 : n1.getParent().getParent().getChildren()) { + double dist = n1.distance(n2.point); + if (dist < minDist) + minDist = dist; + } + n1.distance = minDist; + if (minDist==Double.POSITIVE_INFINITY) + System.err.println("Error: Infinite distance in k centers computation."); + } + do { + Collections.sort(candidates); + Node newNode = candidates.removeLast(); + coverset.add(newNode); + //update the distance of all candidates in the neighborhood of the new node + for (Node n : newNode.getParent().getParent().getChildren()) { + if (n!=newNode) { + double dist = newNode.distance(n.point); + if (dist < newNode.distance) + newNode.distance = dist; + } + } + } while (coverset.size() n : candidates) { + n.getParent().removeChild(n); + decNodes(minLevel); + } + return coverset; + } + + /** + * Retrieve the elemnet from the tree that is nearest to the given point with respect to the Euclidian distance. 
+ * @param point + * @return + */ + public E getNearest(double[] point) { + List> candidates = new LinkedList>(); + candidates.add(rootNode); + rootNode.distance = rootNode.distance(point); + double minDist = rootNode.distance; + for (int i=maxLevel; i>minLevel; i--) { + List> newCandidates = new LinkedList>(); + for (Node n : candidates) { + for (Node n2 : n.getChildren()) { + //do not compute distances twice + if (n.point!=n2.point) { + n2.distance = n2.distance(point); + //minimum distance can be recorded here + if (n2.distance n : newCandidates) + if (n.distance < minDist + Math.pow(base,i)) + candidates.add(n); + } + for (Node n : candidates) { + if (n.distance == minDist) + return n.element; + } + return null; + } + + /** + * Get the cover of the given level. All points at this level are guaranteed to be 2^i apart from one another. + * @param level + * @return + */ + public List getCover(int level) { + List> coverset = new LinkedList>(); + coverset.add(rootNode); + int k = maxLevel; + while(k-- > level){ + List> nextCoverset = new LinkedList>(); + for (Node n : coverset) + nextCoverset.addAll(n.getChildren()); + coverset = nextCoverset; + } + List cover = new LinkedList(); + for (Node n: coverset) { + cover.add(n.element); + } + + return cover; + } + + /** + * Gets at least k centers which are maximally apart from each other. All remaining centers are removed from the tree. This function only works as designed + * when the function insert(point,k) has been used before to add points to the tree. Otherwise, it will return the cover one level above the bottom most level of the tree. 
+ * @param number of centers + * @return + */ + public List getKCenters(int numCenters) { + List> coverset = removeNodes(numCenters); + //create cover + List cover = new LinkedList(); + for (Node n: coverset) { + cover.add(n.element); + } + return cover; + + } + + static double distance(double[] d1, double[] d2) { + double sumSq = 0.; + for (int i=0; i implements Comparable> { + + Node parent; + E element; + List> children; + double[] point; + double distance; + + //use for a child + Node (Node parent, E element, double[] point) { + this.parent = parent; + this.children = new LinkedList>(); + this.element = element; + this.point = point; + } + + Node getParent() { + return parent; + } + + void setParent(Node node) { + parent = node; + } + + void addChild(Node node) { + children.add(node); + } + + List> getChildren() { + if (children.isEmpty()) { + Node n = new Node(this,this.element,this.point); + addChild(n); + } + return children; + } + + void removeChild(Node n) { + children.remove(n); + } + + void removeChildren() { + children.clear(); + } + + double distance(double[] point) { + double sumSq = 0.; + for (int i=0; i o) { + if (distance < o.distance) + return -1; + if (distance > o.distance) + return 1; + return 0; + } + + + } + +} + diff --git a/src/dev/java/japsadev/bio/phylo/Tree2Tikz.java b/src/dev/java/japsadev/bio/phylo/Tree2Tikz.java new file mode 100755 index 0000000..d7db152 --- /dev/null +++ b/src/dev/java/japsadev/bio/phylo/Tree2Tikz.java @@ -0,0 +1,59 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.bio.phylo; + +import japsa.bio.phylo.PhylogenyTree; + + +/** + * @author minhduc + * + */ +public class Tree2Tikz { + + /** + * @param args + */ + public static void main(String[] args) throws Exception { + // Read a tree in + if (args.length <= 0) { + System.err.println("Usage: tree2tikz treeFile [texFile]"); + System.exit(1); + } + PhylogenyTree tree = PhylogenyTree.readFromFile(args[0]); + + if (args.length >= 2) + tree.drawTree(args[1], true); + else + tree.drawTree("tree.tex", true); + // System.out.println(tree); + } + +} diff --git a/src/dev/java/japsadev/bio/test/KMean.java b/src/dev/java/japsadev/bio/test/KMean.java new file mode 100644 index 0000000..54b5b2a --- /dev/null +++ b/src/dev/java/japsadev/bio/test/KMean.java @@ -0,0 +1,92 @@ +package japsadev.bio.test; + +import static java.lang.Math.abs; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Scanner; + +public class KMean { + int k; + int noOfItems; + ArrayList dataItems; + ArrayList cz; + ArrayList oldCz; + ArrayList row; + ArrayList> groups; + Scanner input; + + public KMean(int k, int noOfItems) { + this.k = k; + this.noOfItems = noOfItems; + dataItems = new ArrayList<>(); + cz = new ArrayList<>(); + oldCz = new ArrayList<>(); + row = new ArrayList<>(); + groups = new ArrayList<>(); + input = new Scanner(System.in); + + for (int i = 0; i < k; i++) { + groups.add(new ArrayList<>()); + } + + for (int i = 0; i < noOfItems; i++) { + System.out.println("Enter Value for: " + (i + 1) + " item"); + dataItems.add(input.nextInt()); + if (i < k) { + cz.add(dataItems.get(i)); + System.out.println("C" + (i + 1) + " is " + cz.get(i)); + } + } + int iter = 1; + do { + for (int aItem : dataItems) { + for (int c : cz) { + row.add(abs(c - aItem)); + } + groups.get(row.indexOf(Collections.min(row))).add(aItem); + row.removeAll(row); + } + for (int i = 0; i < k; i++) { + if (iter == 1) { 
+ oldCz.add(cz.get(i)); + } else { + oldCz.set(i, cz.get(i)); + } + if (!groups.get(i).isEmpty()) { + cz.set(i, average(groups.get(i))); + } + } + if (!cz.equals(oldCz)) { + for (int i = 0; i < groups.size(); i++) { + groups.get(i).removeAll(groups.get(i)); + } + } + iter++; + } while (!cz.equals(oldCz)); + for (int i = 0; i < cz.size(); i++) { + System.out.println("New C" + (i + 1) + " " + cz.get(i)); + } + for (int i = 0; i < groups.size(); i++) { + System.out.println("Group " + (i + 1)); + System.out.println(groups.get(i).toString()); + } + System.out.println("Number of Itrations: " + iter); + } + + public static void main(String[] args) { + Scanner input = new Scanner(System.in); + System.out.println("Enter Value of K"); + int k = input.nextInt(); + System.out.println("Enter No of Data Items"); + int noOfItems = input.nextInt(); + new KMean(k, noOfItems); + } + + public static int average(ArrayList list) { + int sum = 0; + for (Integer value : list) { + sum = sum + value; + } + return sum / list.size(); + } +} diff --git a/src/dev/java/japsadev/bio/test/TestPFA.java b/src/dev/java/japsadev/bio/test/TestPFA.java new file mode 100644 index 0000000..9119f30 --- /dev/null +++ b/src/dev/java/japsadev/bio/test/TestPFA.java @@ -0,0 +1,667 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 09/12/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.bio.test; + +import java.util.Random; + +import japsa.bio.alignment.ProbFSM; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + +/** + * @author minhduc + * + */ +public class TestPFA { + /** + * @param args + */ + public static void main(String[] args) { + int length = 1000; + Alphabet dna = Alphabet.DNA4(); + Random rand = new Random(1); + Sequence mSeq, gSeq; + ProbFSM fa; + ProbFSM.Emission emission; + + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = 
fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = 
Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + 
System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + 
fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + 
fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + 
fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + + /********************************************************************/ + System.out.println("=================================================="); + mSeq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rand); + fa = new ProbFSM.ProbThreeSM(mSeq); + + gSeq = fa.generate(rand); + + emission = fa.align(gSeq); + + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + 
emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + + emission = fa.align(gSeq); + System.out.println(emission.myCost); + fa.updateCount(emission); + fa.reEstimate(); + fa.resetCount(); + + fa.showProb(); + + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/Bernoulli.java b/src/dev/java/japsadev/lib/jMEF/Bernoulli.java new file mode 100755 index 0000000..822e5ed --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Bernoulli.java @@ -0,0 +1,202 @@ +package japsadev.lib.jMEF; + +import japsadev.lib.jMEF.Parameter.TYPE; + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * The Bernoulli distribution is an exponential family and, as a consequence, the probability density function is given by + * \f[ f(x; \mathbf{\Theta}) = \exp \left( \langle t(x), \mathbf{\Theta} \rangle - F(\mathbf{\Theta}) + k(x) \right) \f] + * where \f$ \mathbf{\Theta} \f$ are the natural parameters. + * This class implements the different functions allowing to express a Bernoulli distribution as a member of an exponential family. + * + * @section Parameters + * + * The parameters of a given distribution are: + * - Source parameters \f$\mathbf{\Lambda} = p \in [0,1]\f$ + * - Natural parameters \f$\mathbf{\Theta} = \theta \in R^+\f$ + * - Expectation parameters \f$ \mathbf{H} = \eta \in [0,1] \f$ + */ +public final class Bernoulli extends ExponentialFamily{ + + + /** + * Constant for serialization. 
+ */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\Theta}) = \log \left( 1 + \exp \theta \right) \f$ + */ + public double F(PVector T){ + return Math.log(1+Math.exp(T.array[0])); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \frac{\exp \theta}{1 + \exp \theta} \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = new PVector(1); + gradient.array[0] = Math.exp(T.array[0]) / (1+Math.exp(T.array[0])); + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = \log \left( \frac{\eta}{1-\eta} \right) \eta - \log \left( \frac{1}{1-\eta} \right) \f$ + */ + public double G(PVector H){ + return H.array[0] * Math.log(H.array[0]/(1-H.array[0])) - Math.log(1.0/(1-H.array[0])); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G( \mathbf{H} ) = \log \left( \frac{\eta}{1-\eta} \right) \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = new PVector(1); + gradient.array[0] = Math.log(H.array[0]/(1-H.array[0])); + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. 
+ * @param x a point + * @return \f$ k(x) = 0 \f$ + */ + public double k(PVector x){ + return 0.0d; + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \log \left( \frac{p}{1-p} \right) \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(1); + T.array[0] = Math.log(L.array[0]/(1-L.array[0])); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \frac{\exp\theta}{1+\exp\theta} \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(1); + L.array[0] = Math.exp(T.array[0]) / ( 1 + Math.exp(T.array[0]) ); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return expectation parameters \f$ \mathbf{H} = p \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \eta \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;p) \f$. 
+ * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;p) = p^x (1-p)^{1-x} \mbox{ for } x \in \{0,1\} \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return Math.pow(param.array[0], x.array[0]) * Math.pow(1-param.array[0], 1-x.array[0]); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return a point + */ + public PVector drawRandomPoint(PVector L) { + PVector x = new PVector(1); + if (Math.random(){ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Parameter n. + */ + private int n; + + + /** + * Class constructor. + */ + public BinomialFixedN(){ + this.n = 100; + } + + + /** + * Class constructor. + * @param n parameter n + */ + public BinomialFixedN(int n){ + this.n = n; + } + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\Theta}) = n \log (1 + \exp \theta) - \log (n!) \f$ + */ + public double F(PVector T){ + return n * Math.log(1 + Math.exp(T.array[0])) - Math.log(fact(n)); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \frac{n \exp \theta}{1 + \exp \theta} \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = new PVector(1); + gradient.array[0] = n * Math.exp(T.array[0]) / (1 + Math.exp(T.array[0])); + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. 
+ * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = \eta \log \left( \frac{\eta}{n-\eta} \right) - n \log\left( \frac{n}{n-\eta} \right) \f$ + */ + public double G(PVector H){ + return H.array[0] * Math.log(H.array[0]/(n-H.array[0])) - n * Math.log(n/(n-H.array[0])); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G( \mathbf{H} ) = \log \left( \frac{\eta}{n-\eta} \right) \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = new PVector(1); + gradient.array[0] = Math.log(H.array[0]/(n-H.array[0])); + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = - \log (x! (n-x)!) \f$ + */ + public double k(PVector x){ + return (double)( - Math.log(fact(x.array[0]) * fact(n-x.array[0])) ); + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \log \left( \frac{p}{1-p} \right) \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(1); + T.array[0] = Math.log(L.array[0]/(1-L.array[0])); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. 
+ * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \frac{\exp \theta}{1 + \exp \theta} \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(1); + L.array[0] = Math.exp(T.array[0]) / (1 + Math.exp(T.array[0])); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return expectation parameters \f$ \mathbf{H} = np \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = n*L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \frac{\eta}{n} \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0]/n; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;p) \f$. + * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;p) = \frac{n!}{x!(n-x)!} p^x (1-p)^{n-x} \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return (fact(n) * Math.pow(param.array[0], x.array[0]) * Math.pow(1-param.array[0], n-x.array[0]) ) / ( fact(x.array[0]) * fact(n-x.array[0]) ); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Computes the factorial of a number. + * @param n number + * @return n! + */ + private double fact(double n){ + double f = 1; + for (int i=1; i<=n; i++) + f *= i; + return f; + } + + + /** + * Draws a point from the considered distribution. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = p \f$ + * @return a point + */ + public PVector drawRandomPoint(PVector L) { + + // Loop + int count=0; + for(int i=0; i0){ + cdf_min = kld_cdf[i]; + break; + } + double value = rand.nextDouble() * (kld_cdf[n-1]-cdf_min) + cdf_min; + + // Select the corresponding Gaussian + index = -1; + for (i=0; i=value) + break; + } + + // Add the new centroid + g.weight[row] = f.weight[index]; + g.param[row] = f.param[index]; + gau[index] = 1; + + + // Compute the KLD matrix + for (i=0; i1){ + for (int j=0; j " + getLossFunction(f, g, type)); + } + + return cond; + } + + + /** + * Computes the repartition of components of f in g + * @param f initial mixture model + * @param g simplified mixture model + * @param type type of the Bregman divergence used (right-sided, left-sided, or symmetric) + * @param repartition array of repartition + */ + private static void computeRepartition(MixtureModel f, MixtureModel g, Clustering.CLUSTERING_TYPE type, int[] repartition){ + int n = f.size; + int m = g.size; + + for (int i=0; i[] clusters, ExponentialFamily EF){ + + // Mixture initialization + MixtureModel mm = new MixtureModel(clusters.length); + mm.EF = EF; + + // Amount of points + int nb = 0; + for (int i=0; ilogLikelihoodThreshold ); + + // Conversion of mixture in source parameters + return mixtureH2L(fH); + + } + + + /** + * Converts a mixture model from source parameters to expectation parameters. 
+ * @param fL mixture model in source parameters + * @return mixture model in expected parameters + */ + private static MixtureModel mixtureL2H(MixtureModel fL){ + int size = fL.size; + MixtureModel fH = new MixtureModel(size); + fH.EF = fL.EF; + for (int i=0; i1.0e-6 ){ + l = (lmin+lmax)/2.0; + thetageodesic = EF.GeodesicPoint(thetaR, thetaL, l); + if (EF.BD(thetageodesic,thetaR)>EF.BD(thetaL,thetageodesic)) + lmax=l; + else + lmin=l; + } + l = (lmin+lmax)/2.0; + centroid = EF.GeodesicPoint(thetaR, thetaL, l); + + return centroid; + } + + + /** + * Computes the centroid of a mixture model. This centroid is sided (right- or left-sided) or is symmetric. + * @param f mixture model + * @param type type of Bregman divergence used (right-sided, left-sided, or symmetric) + * @return sided or symmetric centroid of f + */ + public static Parameter getCentroid(MixtureModel f, Clustering.CLUSTERING_TYPE type){ + Parameter centroid; + if (type==CLUSTERING_TYPE.RIGHT_SIDED) + centroid = Clustering.getCenterOfMass(f); + else if (type==CLUSTERING_TYPE.LEFT_SIDED) + centroid = Clustering.getGeneralizedCentroid(f.EF, f); + else + centroid = Clustering.getSymmetricCentroid(f.EF, f); + return centroid; + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/ExpectationMaximization1D.java b/src/dev/java/japsadev/lib/jMEF/ExpectationMaximization1D.java new file mode 100755 index 0000000..997e707 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/ExpectationMaximization1D.java @@ -0,0 +1,159 @@ +package japsadev.lib.jMEF; + +import java.util.Vector; + +public class ExpectationMaximization1D { + + + /** + * Maximum number of iterations permitted. + */ + private static int MAX_ITERATIONS = 30; + + + /** + * Initializes a mixture model from clusters of points. The parameters estimated corresponds to univariate Gaussian distributions. 
+ * @param clusters clusters of points + * @return mixture model + */ + public static MixtureModel initialize(Vector[] clusters){ + + // Mixture model + MixtureModel mm = new MixtureModel(clusters.length); + mm.EF = new UnivariateGaussian(); + + // Amount of points + int nb = 0; + for (int i=0; ilogLikelihoodThreshold && iterations implements Serializable{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta}\f$ + * @return \f$ F(\mathbf{\Theta}) \f$ + */ + abstract public double F(ParamD T); + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T expectation parameters \f$ \mathbf{\Theta} \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) \f$ + */ + abstract public ParamD gradF(ParamD T); + + + /** + * Computes \f$ div F \f$. + * @param TP natural parameters \f$ \mathbf{\Theta}_P\f$ + * @param TQ natural parameters \f$ \mathbf{\Theta}_Q\f$ + * @return \f$ div F( \mathbf{\Theta}_P \| \mathbf{\Theta}_Q ) = F(\mathbf{\Theta}_P) - F(\mathbf{\Theta}_Q) - \langle \mathbf{\Theta}_P-\mathbf{\Theta}_Q , \nabla F(\mathbf{\Theta}_Q) \rangle\f$ + */ + public double DivergenceF(ParamD TP , ParamD TQ){ + return F(TP) - F(TQ) - (TP.Minus(TQ)).InnerProduct(gradF(TQ)); + } + + + /** + * Computes \f$ G(\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} \f$ + * @return \f$ G(\mathbf{H}) \f$ + */ + abstract public double G(ParamD H); + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} \f$ + * @return \f$ \nabla G(\mathbf{H}) \f$ + */ + abstract public ParamD gradG(ParamD H); + + + /** + * Computes \f$ div G \f$. 
+ * @param HP expectation parameters \f$ \mathbf{H}_P\f$ + * @param HQ expectation parameters \f$ \mathbf{H}_Q\f$ + * @return \f$ div G( \mathbf{H}_P \| \mathbf{H}_Q ) = G(\mathbf{H}_P) - G(\mathbf{H}_Q) - \langle \mathbf{H}_P-\mathbf{H}_Q , \nabla G(\mathbf{H}_Q) \rangle\f$ + */ + public double DivergenceG(ParamD HP , ParamD HQ){ + return G(HP) - G(HQ) - (HP.Minus(HQ)).InnerProduct(gradG(HQ)); + } + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) \f$ + */ + abstract public ParamD t(ParamX x); + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) \f$ + */ + abstract public double k(ParamX x); + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} \f$ + * @return natural parameters \f$ \mathbf{\Theta} \f$ + */ + public abstract ParamD Lambda2Theta(ParamD L); + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta}\f$ + * @return source parameters \f$ \mathbf{\Lambda} \f$ + */ + public abstract ParamD Theta2Lambda(ParamD T); + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} \f$ + * @return expected parameters \f$ \mathbf{H} \f$ + */ + public abstract ParamD Lambda2Eta(ParamD L); + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} \f$ + * @return source parameters \f$ \mathbf{\Lambda} \f$ + */ + public abstract ParamD Eta2Lambda(ParamD H); + + + /** + * Converts natural parameters to expectation parameters. + * @param T natural parameters \f$ \mathbf{\Theta}\f$ + * @return expectation parameters \f$ \mathbf{H} \f$ + */ + public ParamD Theta2Eta(ParamD T){ + return gradF(T); + } + + + /** + * Converts expectation parameters to natural parameters. 
+ * @param H expectation parameters \f$ \mathbf{H} \f$ + * @return natural parameters \f$ \mathbf{\Theta} \f$ + */ + public ParamD Eta2Theta(ParamD H){ + return gradG(H); + } + + + /** + * Computes the density value \f$ f(x;\mathbf{\Theta}) \f$ of an exponential family member. + * @param x a point + * @param T natural parameters \f$ \mathbf{\Theta} \f$ + * @return \f$ f(x) = \exp \left( \langle \mathbf{\Theta} \ , \ t(x) \rangle - F(\mathbf{\Theta}) + k(x) \right) \f$ + */ + public double density(ParamX x, ParamD T){ + return Math.exp(T.InnerProduct(t(x))-F(T)+k(x)); + } + + + /** + * Computes the Bregman divergence between two members of the same exponential family. + * @param T1 natural parameters \f$ \mathbf{\Theta}_1\f$ + * @param T2 natural parameters \f$ \mathbf{\Theta}_2\f$ + * @return \f$ BD( \mathbf{\Theta_1} \| \mathbf{\Theta_2} ) = F(\mathbf{\Theta_1}) - F(\mathbf{\Theta_2}) - \langle \mathbf{\Theta_1} - \mathbf{\Theta_2} , \nabla F(\mathbf{\Theta_2}) \rangle \f$ + */ + public double BD(ParamD T1 , ParamD T2){ + return F(T1) - F(T2) - gradF(T2).InnerProduct(T1.Minus(T2)); + } + + + /** + * Computes the Kullback-Leibler divergence between two members of the same exponential family. + * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P\|f_Q) \f$ + */ + public abstract double KLD(ParamD LP , ParamD LQ); + + + /** + * Computes the geodesic point. 
+ * @param T1 natural parameters \f$ \mathbf{\Theta}_1\f$ + * @param T2 natural parameters \f$ \mathbf{\Theta}_2\f$ + * @param alpha position \f$ \alpha \f$ of the point on the geodesic link + * @return \f$ \nabla G \left( (1-\alpha) \nabla F (\mathbf{\Theta}_1) + \alpha \nabla F (\mathbf{\Theta}_2) \right) \f$ + */ + public ParamD GeodesicPoint(ParamD T1, ParamD T2, double alpha){ + return gradG( (ParamD) (gradF(T1).Times(1.0d-alpha)).Plus(gradF(T2).Times(alpha)) ); + } + + + /** + * Draws a random point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda}\f$ + * @return a point + */ + public abstract ParamX drawRandomPoint(ParamD L); + +} + diff --git a/src/dev/java/japsadev/lib/jMEF/HierarchicalMixtureModel.java b/src/dev/java/japsadev/lib/jMEF/HierarchicalMixtureModel.java new file mode 100755 index 0000000..24a54dd --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/HierarchicalMixtureModel.java @@ -0,0 +1,202 @@ +package japsadev.lib.jMEF; + +import japsadev.lib.jMEF.Clustering.CLUSTERING_TYPE; + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * A hierarchical mixture model is a hierarchical structure (tree) containing the elements of a mixture model. + * A HierarchicalMixtureModel object is created by the BregmanHierarchicalClustering class. + */ +public class HierarchicalMixtureModel { + + /** + * Exponential family member. + */ + public ExponentialFamily EF; + + /** + * Weight of the tree. + */ + public double weight; + + /** + * Node containing a mixture model. + */ + public MixtureModel node; + + + /** + * Parent of the node. + */ + public HierarchicalMixtureModel parent; + + + /** + * Left child of the node. + */ + public HierarchicalMixtureModel leftChild; + + + /** + * Right child of the node. + */ + public HierarchicalMixtureModel rightChild; + + + /** + * Type of the Bregman divergence. 
+ */ + public Clustering.CLUSTERING_TYPE type; + + + /** + * Maximum resolution of the hierarchical mixture model. + */ + public int resolutionMax; + + + /** + * Class constructor. + */ + public HierarchicalMixtureModel(){ + this.EF = null; + this.weight = 0.0d; + this.node = null; + this.parent = null; + this.leftChild = null; + this.rightChild = null; + this.type = CLUSTERING_TYPE.SYMMETRIC; + } + + + /** + * Extracts a mixture model for a given resolution from the hierarchical mixture model. + * The resolution 1 corresponds to a mixture model with only one model. + * @param resolution resolution of the mixture model + * @return mixture model for a given resolution + */ + public MixtureModel getResolution(int resolution){ + + if (resolution==1 || (this.leftChild==null && this.rightChild==null)){ + + // Mixture model + MixtureModel mix = new MixtureModel(1); + mix.EF = this.EF; + mix.weight[0] = this.weight; + mix.param[0] = Clustering.getCentroid(this.node, this.type); + + // Return the mixture model converted in source (lambda) parameters + return mixtureT2L(mix); + } + else{ + + // Variables + int n1 = 0 , n2 = 0; + double w1 = 0 , w2 = 0; + MixtureModel mm1 = null, mm2 = null; + + // Get the mixture model of the right and left child + if (this.leftChild!=null){ + mm1 = this.leftChild.getResolution(resolution-1); + n1 = mm1.size; + w1 = this.leftChild.weight; + } + if (this.rightChild!=null){ + mm2 = this.rightChild.getResolution(resolution-1); + n2 = mm2.size; + w2 = this.rightChild.weight; + } + + // Fusion the two mixture models + MixtureModel mix = new MixtureModel(n1+n2); + mix.EF = this.EF; + int i; + for (i=0; id_max){ + d_max = d_tmp; + idx = j; + } + } + px = ((PVectorMatrix)mm.param[idx]).v; + c = new Color( (int)px.array[0], (int)px.array[1], (int)px.array[2] ); + imgOut.setRGB(row, col, c.getRGB()); + } + + // Return + return imgOut; + } + + + /** + * Reads an image. 
+ * @param imagePath image file to read + * @return image + */ + public static BufferedImage readImage(String imagePath){ + BufferedImage image_in = null; + try{ + image_in = ImageIO.read(new File(imagePath)); + } + catch (IOException e) { + e.printStackTrace(); + System.err.println("*** Error: Image file does not exist ***"); + } + return image_in; + } + + + /** + * Writes an image. + * @param image image to be written + * @param imagePath image path + */ + public static void writeImage(BufferedImage image, String imagePath){ + try{ + ImageIO.write(image, "png", new File(imagePath)); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + + /** + * Computes the PSNR between two images. + * @param i1 first image + * @param i2 second image + * @return PSNR(i1,i2) + */ + public static double PSNR(BufferedImage i1, BufferedImage i2){ + double mse = 0; + for (int r=0; r[] clusters = KMeans.run(px, n); + mm = BregmanSoftClustering.initialize(clusters, new MultivariateGaussian()); + mm = BregmanSoftClustering.run(px, mm); + MixtureModel.save(mm, path); + } + else if (mm.getDimension()!=d) { + throw new RuntimeException("Incorrect dimension."); + } + return mm; + } + + + /** + * Counts the minimum number of points assigned to image pixel. 
+ * @param tab array + * @param height height of the array + * @param width width of the array + * @return minimum number of points + */ + private static int min(int[][] tab, int height, int width){ + int min = Integer.MAX_VALUE; + for (int y=0; y=0 && y>=0 && x=0 && g>=0 && b>=0 && r<255 && g<255 && b<255){ + imgSum[y][x][0] += r; + imgSum[y][x][1] += g; + imgSum[y][x][2] += b; + imgCpt[y][x]++; + } + } + } + } + + // Normalize the colors + for (y=0; y[] run(PVector[] points, int k){ + + PVector[] centroids = initialize(points, k); + int[] repartition = new int[points.length]; + Vector[] clusters = new Vector[k]; + + int it = 0; + int[] tmp = new int[points.length]; + + do{ + tmp = repartition.clone(); + repartitionStep(points, k, centroids, repartition, clusters); + centroidStep(points, k, centroids, clusters); + it++; + } while(!Arrays.equals(repartition, tmp) && it[] clusters){ + + // Initialization of the clusters + for (int i=0; i(); + + // Compute repartition + for (int i=0; i[] clusters){ + for (int i=0; i { + + + /** + * Constant for serialization + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\Theta}) = \log \left( -\frac{2}{\theta} \right) \f$ + */ + public double F(PVector T){ + return Math.log( -2.0d / T.array[0] ); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = -\frac{1}{\theta} \f$ + */ + public PVector gradF(PVector T){ + PVector g = new PVector(T.dim); + g.array[0] = -1.0d / T.array[0]; + g.type = TYPE.EXPECTATION_PARAMETER; + return g; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. 
+ * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = - \log \eta \f$ + */ + public double G(PVector H){ + return -Math.log(H.array[0]); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G(\mathbf{H}) = -\frac{1}{\eta} \f$ + */ + public PVector gradG(PVector H){ + PVector g = new PVector(1); + g.array[0] = -1.0d/H.array[0]; + g.type = TYPE.NATURAL_PARAMETER; + return g; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = |x| \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = Math.abs(x.array[0]); + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = 0 \f$ + */ + public double k(PVector x){ + return 0.0d; + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \sigma \f$ + * @return natural parameters \f$ \mathbf{\Theta} = -\frac{1}{\sigma} \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(L.dim); + T.array[0] = -1.0d/L.array[0]; + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = -\frac{1}{\theta} \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(T.dim); + L.array[0] = -1.0d/T.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = \sigma \f$ + * @return expectation parameters \f$ \mathbf{H} = \sigma \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \eta \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\sigma) \f$. + * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\sigma) = \frac{1}{ 2 \sigma } \exp \left( - \frac{|x|}{\sigma} \right) \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return (1.0d /(2*param.array[0])) * Math.exp( - Math.abs(x.array[0])/param.array[0] ); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered Laplacian distribution. + * @param L source parameters \f$ \mathbf{\Lambda}\f$. + * @return a point. + */ + public PVector drawRandomPoint(PVector L) { + double u = Math.random() - 0.5; + PVector point = new PVector(1); + point.array[0] = -L.array[0] * Math.signum(u) * Math.log(1 - 2 * Math.abs(u)); + return point; + } + + + /** + * Computes the Kullback-Leibler divergence between two Laplacian distributions. 
+ * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P\|f_Q) = \log \left( \frac{\sigma_Q}{\sigma_P} \right) + \frac{\sigma_P - \sigma_Q}{\sigma_Q} \f$ + */ + public double KLD(PVector LP, PVector LQ) { + double sP = LP.array[0]; + double sQ = LQ.array[0]; + return Math.log(sQ/sP) + (sP-sQ)/sQ; + } + + +} diff --git a/src/dev/java/japsadev/lib/jMEF/MixtureModel.java b/src/dev/java/japsadev/lib/jMEF/MixtureModel.java new file mode 100755 index 0000000..f1bbba3 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/MixtureModel.java @@ -0,0 +1,303 @@ +package japsadev.lib.jMEF; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.Random; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * A mixture model is a powerful framework commonly used to estimate the probability density function (PDF) of a random variable. + * Let us consider a mixture model \f$f\f$ of size \f$n\f$. The probability density function \f$f\f$ evaluated at \f$x \in R^d\f$ is given by + * \f[ f(x) = \sum_{i=1}^n \alpha_i f_i(x)\f] + * where \f$\alpha_i \in [0,1]\f$ denotes the weight of the \f$i^{\textrm{th}}\f$ mixture component \f$f_i\f$ such as \f$\sum_{i=1}^n \alpha_i=1\f$. + * The MixtureModel class provides a convenient way to create and manage mixture of exponential families. + */ +public class MixtureModel implements Serializable{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Exponential family of the mixture model. + */ + public ExponentialFamily EF; + + /** + * Number of components in the mixture model. 
+ */ + public int size; + + + /** + * Array containing the weights of the mixture components. + */ + public double[] weight; + + + /** + * Array containing the parameters of the mixture components. + */ + public Parameter[] param; + + + /** + * Class constructor. + * @param n number of components in the mixture models. + */ + public MixtureModel(int n){ + this.EF = null; + this.size = n; + this.weight = new double[n]; + this.param = new Parameter[n]; + } + + + /** + * Computes the density value \f$ f(x) \f$ of a mixture model. + * @param x a point + * @return value of the density \f$ f(x) \f$ + */ + public double density(Parameter x){ + double cumul=0.0d; + for(int i=0; i{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Parameter n. + */ + private int n; + + + /** + * Class constructor. + */ + public MultinomialFixedN(){ + this.n = 100; + } + + + /** + * Class constructor. + * @param n parameter n + */ + public MultinomialFixedN(int n){ + this.n = n; + } + + + /** + * Computes \f$ F( \mathbf{\Theta} ) \f$. + * @param T parameters \f$ \mathbf{\Theta} = (\theta_1, \cdots, \theta_{k-1}) \f$ + * @return \f$ F(\mathbf{\Theta}) = n \log \left( 1 + \sum_{i=1}^{k-1} \exp \theta_i \right) - \log n! \f$ + */ + public double F(PVector T){ + double sum = 0; + for (int i=0; ip[idx]) + idx++; + x.array[idx] += 1; + } + + // Return + return x; + } + + + /** + * Computes the Kullback-Leibler divergence between two Binomial distributions. 
+ * @param LA source parameters \f$ \mathbf{\Lambda}_\alpha \f$ + * @param LB source parameters \f$ \mathbf{\Lambda}_\beta \f$ + * @return \f$ D_{\mathrm{KL}}(f_1\|f_2) = n p_{\alpha,k} \log \frac{p_{\alpha,k}}{p_{\beta,k}} - n \sum_{i=1}^{k-1} p_{\alpha,i} \log \frac{p_{\beta,i}}{p_{\alpha,i}} \f$ + */ + public double KLD(PVector LA, PVector LB) { + int k = LA.getDimension()-1; + double sum = 0; + for (int i=0; i{ + + + /** + * Constant for serialization + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = ( \theta , \Theta ) \f$ + * @return \f$ F(\mathbf{\Theta})=\frac{1}{4} \mathrm{tr}(\Theta^{-1}\theta\theta^T) - \frac{1}{2} \log \det\Theta + \frac{d}{2} log \pi \f$ + */ + public double F(PVectorMatrix T){ + return 0.25d*( (T.M.Inverse()).Multiply(T.v.OuterProduct()) ).Trace() + - 0.5d*Math.log( T.M.Determinant() ) + + (0.5d*T.v.dim)*Math.log(Math.PI); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. 
+ * @param T natural \f$ \mathbf{\Theta} = ( \theta , \Theta ) \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \left( \frac{1}{2} \Theta^{-1} \theta , -\frac{1}{2} \Theta^{-1} -\frac{1}{4} (\Theta^{-1} \theta)(\Theta^{-1} \theta)^T \right) \f$ + */ + public PVectorMatrix gradF(PVectorMatrix T){ + PVectorMatrix gradient = new PVectorMatrix(T.v.dim); + gradient.v = T.M.Inverse().MultiplyVectorRight(T.v).Times(0.5d); + gradient.M = T.M.Inverse().Times(-0.5d).Minus( (T.M.Inverse().MultiplyVectorRight(T.v)).OuterProduct().Times(0.25d) ); + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = ( \eta , H ) \f$ + * @return \f$ G(\mathbf{H}) = - \frac{1}{2} \log \left( 1 + \eta^T H^{-1} \eta \right) - \frac{1}{2} \log \det (-H) - \frac{d}{2} \log (2 \pi e) \f$ + */ + public double G(PVectorMatrix H){ + return -0.5d * Math.log( 1.0d + H.v.InnerProduct(H.M.Inverse().MultiplyVectorRight(H.v)) ) - 0.5d * Math.log( H.M.Times(-1.0d).Determinant() ) - H.v.dim*0.5d*Math.log(2*Math.PI*Math.E); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = ( \eta , H ) \f$ + * @return \f$ \nabla G(\mathbf{H}) = \left( -( H + \eta \eta^T )^{-1} \eta , -\frac{1}{2} ( H + \eta \eta^T )^{-1} \right) \f$ + */ + public PVectorMatrix gradG(PVectorMatrix H){ + PVectorMatrix gradient = new PVectorMatrix(H.v.dim); + PMatrix tmp = H.M.Plus(H.v.OuterProduct()).Inverse(); + gradient.v = tmp.MultiplyVectorRight(H.v).Times(-1.0d); + gradient.M = tmp.Times(-0.5d); + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. 
+ * @param x a point + * @return \f$ t(x) = (x , -x x^\top) \f$ + */ + public PVectorMatrix t(PVector x){ + PVectorMatrix t = new PVectorMatrix(x.dim); + t.v = x; + t.M = x.OuterProduct().Times(-1); + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = 0 \f$ + */ + public double k(PVector x){ + return 0.0d; + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \Sigma ) \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \left( \Sigma^{-1} \mu , \frac{1}{2} \Sigma^{-1} \right)\f$ + */ + public PVectorMatrix Lambda2Theta(PVectorMatrix L){ + PVectorMatrix T = new PVectorMatrix(L.v.dim); + PMatrix tmp = L.M.Inverse(); + T.v = tmp.MultiplyVectorRight(L.v); + T.M = tmp.Times(0.5d); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = ( \theta , \Theta )\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \left( \frac{1}{2} \Theta^{-1} \theta , \frac{1}{2} \Theta^{-1} \right) \f$ + */ + public PVectorMatrix Theta2Lambda(PVectorMatrix T){ + PVectorMatrix L = new PVectorMatrix(T.v.dim); + L.M = T.M.Inverse().Times(0.5d); + L.v = L.M.MultiplyVectorRight(T.v); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \Sigma ) \f$ + * @return expectation parameters \f$ \mathbf{H} = \left( \mu , - (\Sigma + \mu \mu^T) \right) \f$ + */ + public PVectorMatrix Lambda2Eta(PVectorMatrix L){ + PVectorMatrix H = new PVectorMatrix(L.v.dim); + H.v = (PVector)L.v.clone(); + H.M = L.M.Plus(L.v.OuterProduct()).Times(-1); + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. 
+ * @param H expectation parameters \f$ \mathbf{H} = ( \eta , H )\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \left( \eta , - (H + \eta \eta^T) \right) \f$ + */ + public PVectorMatrix Eta2Lambda(PVectorMatrix H){ + PVectorMatrix L = new PVectorMatrix(H.v.dim); + L.v = (PVector)H.v.clone(); + L.M = H.M.Plus(H.v.OuterProduct()).Times(-1); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\mu,\Sigma) \f$. + * @param x point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\mu,\Sigma) = \frac{1}{ (2\pi)^{d/2} |\Sigma|^{1/2} } \exp \left( - \frac{(x-\mu)^T \Sigma^{-1}(x-\mu)}{2} \right) \mbox{ for } x \in \mathds{R}^d \f$ + */ + public double density(PVector x, PVectorMatrix param){ + if (param.type==TYPE.SOURCE_PARAMETER){ + double v1 = (x.Minus(param.v)).InnerProduct(param.M.Inverse().MultiplyVectorRight(x.Minus(param.v))); + double v2 = Math.exp(-0.5d*v1); + double v3 = Math.pow(2.0d*Math.PI, (double)x.dim/2.0d)*Math.sqrt(param.M.Determinant()); + return v2 / v3; + } + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \Sigma ) \f$ + * @return a point. + */ + public PVector drawRandomPoint(PVectorMatrix L) { + + // Compute Z vector containing dim values i.i.d. drawn from N(0,1) + Random rand = new Random(); + PVector z = new PVector(L.getDimension()); + for (int i=0; i{ + + + /** + * Constant for serialization + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. 
+ * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\theta}) = \frac{1}{2} \theta^\top\theta + \frac{d}{2}\log 2\pi \f$ + */ + public double F(PVector T){ + return 0.5d*( T.InnerProduct(T) + T.dim * Math.log(2*Math.PI) ); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \theta \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = (PVector)T.clone(); + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = \frac{1}{2} \eta^\top\eta + \frac{d}{2}\log 2\pi \f$ + */ + public double G(PVector H){ + return 0.5d*( H.InnerProduct(H) + H.dim * Math.log(2*Math.PI) ); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G(\mathbf{H}) = \eta \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = (PVector)H.clone(); + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x \f$ + */ + public PVector t(PVector x){ + PVector t = (PVector)x.clone(); + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = -\frac{1}{2}x^\top x \f$ + */ + public double k(PVector x){ + return -0.5d * x.InnerProduct(x); + } + + + /** + * Converts source parameters to natural parameters. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = \mu \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \mu \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = (PVector)L.clone(); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \theta \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = (PVector)T.clone(); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \mu \f$ + * @return expectation parameters \f$ \mathbf{H} = \mu \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = (PVector)L.clone(); + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \eta \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = (PVector)H.clone(); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\mu) \f$. + * @param x point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\mu) = \frac{1}{ (2\pi)^{d/2} } \exp \left( - \frac{(x-\mu)^T (x-\mu)}{2} \right) \mbox{ for } x \in \mathds{R}^d \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER){ + double v1 = (x.Minus(param)).InnerProduct(x.Minus(param)); + double v2 = Math.exp(-0.5d*v1); + return v2 / Math.pow( 2.0d*Math.PI , (double)x.dim/2.0d ); + } + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = \mu \f$ + * @return a point. + */ + public PVector drawRandomPoint(PVector L) { + Random rand = new Random(); + PVector x = new PVector(L.getDimension()); + for (int i=0; ik) + for (j=0; jk) + for (i=0; ik) + for (i=0; ik) + for (j=0; ji) + SubMatrix.array[j-1][k-1] = array[j][k]; + } + } + result += array[0][i] * Math.pow(-1, (double)i) * SubMatrix.Determinant(); + } + return result; + } + + + /** + * Computes the trace of the current matrix \f$ m \f$. + * @return \f$ tr (m)\f$ + */ + public double Trace(){ + double tr = 0.0d; + for(int i=0; i=i) + L.array[i][j] = Math.random(); + else + L.array[i][j] = 0.0; + } + return L.Multiply(L.Transpose()); + } + + + /** + * Computes the Cholesky decomposition of the current matrix \f$ m \f$. + * @return a lower triangular matrix + */ + public PMatrix Cholesky(){ + PMatrix L = new PMatrix(this.dim); + for (int i=0; i{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\Theta}) = \exp \theta \f$ + */ + public double F(PVector T){ + return Math.exp(T.array[0]); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. 
+ * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \exp \theta \f$ + */ + public PVector gradF(PVector T){ + PVector g = new PVector(1); + g.array[0] = Math.exp(T.array[0]); + g.type = TYPE.EXPECTATION_PARAMETER; + return g; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = \eta \log \eta - \eta \f$ + */ + public double G(PVector H){ + return H.array[0] * Math.log(H.array[0]) - H.array[0]; + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G( \mathbf{H} ) = \log \eta \f$ + */ + public PVector gradG(PVector H){ + PVector g = new PVector(1); + g.array[0] = Math.log(H.array[0]); + g.type = TYPE.NATURAL_PARAMETER; + return g; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = - \log (x!) \f$ + */ + public double k(PVector x){ + return -Math.log( (double)fact((int)x.array[0]) ); + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \lambda \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \log \lambda \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(1); + T.array[0] = Math.log(L.array[0]); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. 
+ * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \exp \theta \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(1); + L.array[0] = Math.exp(T.array[0]); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \lambda \f$ + * @return expectation parameters \f$ \mathbf{H} = \lambda \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \eta \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\lambda) \f$. + * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\lambda) = \frac{\lambda^x \exp(-\lambda)}{x!} \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return (Math.pow(param.array[0], x.array[0])*Math.exp(-param.array[0])) / ((double)fact((int)x.array[0])); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Computes the factorial of a number. + * @param n number + * @return n! + */ + private double fact(double n){ + double f = 1; + for (int i=1; i<=n; i++) + f *= i; + return f; + } + + + /** + * Draws a point from the considered Poisson distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = \lambda \f$ + * @return a point. 
+ */ + public PVector drawRandomPoint(PVector L) { + + // Initialization + double l = Math.exp(-L.array[0]); + double p = 1.0; + int k = 0; + + // Loop + do{ + k++; + p *= Math.random(); + } while(p>l); + + // Point + PVector point = new PVector(1); + point.array[0] = k-1; + return point; + } + + + /** + * Computes the Kullback-Leibler divergence between two Poisson distributions. + * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P\|f_Q) = \lambda_Q - \lambda_P \left( 1 + \log \left( \frac{\lambda_Q}{\lambda_P} \right) \right) \f$ + */ + public double KLD(PVector LP, PVector LQ) { + double lp = LP.array[0]; + double lq = LQ.array[0]; + return lq - lp * ( 1 + Math.log(lq/lp) ); + } + + + +} diff --git a/src/dev/java/japsadev/lib/jMEF/Quicksort.java b/src/dev/java/japsadev/lib/jMEF/Quicksort.java new file mode 100755 index 0000000..f416447 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Quicksort.java @@ -0,0 +1,96 @@ +package japsadev.lib.jMEF; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * The Quicksort class implements the quicksort algorithm. + */ +public class Quicksort { + + /** + * Sorts an array using quicksort algorithm. + * @param data array to be sorted + * @param index initial position (index) of the sorted elements + */ + public static void quicksort(double[] data, int[] index) { + shuffle(data, index); // to guard against worst-case + quicksort(data, index, 0, data.length - 1); + } + + + /** + * Sorts the left and the right parts of an array using recursive quicksort algorithm. 
+ * @param a array to be sorted + * @param idx index + * @param left left index: sort between left to idx + * @param right right index: sort between idx to right + */ + private static void quicksort(double[] a, int[] idx, int left, int right) { + if (right <= left) return; + int i = partition(a, idx, left, right); + quicksort(a, idx, left, i-1); + quicksort(a, idx, i+1, right); + } + + /** + * Creates the partition. + * @param a array to be sorted + * @param idx array of indexes + * @param left left index: sort between left to i + * @param right right index: sort between i to right + */ + private static int partition(double[] a, int[] idx, int left, int right) { + int i = left - 1; + int j = right; + while (true) { + while (a[++i] < a[right]) // find item on left to swap + ; // a[right] acts as sentinel + while (a[right] < a[--j]) // find item on right to swap + if (j == left) break; // don't go out-of-bounds + if (i >= j) break; // check if pointers cross + exch(a, idx, i, j); // swap two elements into place + } + exch(a, idx, i, right); // swap with partition element + return i; + } + + + /** + * Switches the values a[i] and a[j] and the indexes idx[i] and idx[j]. + * @param a array to be sorted + * @param idx array of indexes + * @param i index + * @param j index + */ + private static void exch(double[] a, int[] idx, int i, int j) { + double swap = a[i]; + a[i] = a[j]; + a[j] = swap; + int swap2 = idx[i]; + idx[i] = idx[j]; + idx[j] = swap2; + } + + /** + * Shuffles the array a + * @param a array to be sorted + * @param idx array of indexes + */ + private static void shuffle(double[] a, int[] idx) { + int N = a.length; + for (int i = 0; i < N; i++) { + int r = i + (int) (Math.random() * (N-i)); // between i and N-1 + exch(a, idx, i, r); + } + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/Rayleigh.java b/src/dev/java/japsadev/lib/jMEF/Rayleigh.java new file mode 100755 index 0000000..f8e9d53 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Rayleigh.java @@ -0,0 +1,199 @@ +package japsadev.lib.jMEF; + +import japsadev.lib.jMEF.Parameter.TYPE; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section
License + * + * See file LICENSE.txt + * + * @section Description + * + * The Rayleigh is an exponential family and, as a consequence, the probability density function is given by + * \f[ f(x; \mathbf{\Theta}) = \exp \left( \langle t(x), \mathbf{\Theta} \rangle - F(\mathbf{\Theta}) + k(x) \right) \f] + * where \f$ \mathbf{\Theta} \f$ are the natural parameters. + * This class implements the different functions allowing to express a Rayleigh distribution as a member of an exponential family. + * + * @section Parameters + * + * The parameters of a given distribution are: + * - Source parameters \f$ \mathbf{\Lambda} = \sigma^2 \in \mathds{R}^+ \f$ + * - Natural parameters \f$ \mathbf{\Theta} = \theta \in \mathds{R}^- \f$ + * - Expectation parameters \f$ \mathbf{H} = \eta \in \mathds{R}^+ \f$ + * + */ +public final class Rayleigh extends ExponentialFamily{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\theta}) = - \log (-2 \theta) \f$ + */ + public double F(PVector T){ + return - Math.log( -2 * T.array[0] ); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F(\mathbf{\theta}) = -\frac{1}{\theta} \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = new PVector(1); + gradient.array[0] = -1.0d / T.array[0]; + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. 
+ * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{\eta}) = - \log \eta \f$ + */ + public double G(PVector H){ + return - Math.log(H.array[0]); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G(\mathbf{\eta}) = -\frac{1}{\eta} \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = new PVector(1); + gradient.array[0] = -1.0d / H.array[0]; + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x^2 \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = x.array[0] * x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = \log x \f$ + */ + public double k(PVector x){ + return Math.log(x.array[0]); + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \sigma^2 \f$ + * @return natural parameters \f$ \mathbf{\Theta} = -\frac{1}{2 \sigma^2} \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(1); + T.array[0] = -1.0d / (2.0d * L.array[0]); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = -\frac{1}{2\theta} \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(1); + L.array[0] = -1.0d / (2.0d * T.array[0]); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = \sigma^2 \f$ + * @return expectation parameters \f$ \mathbf{H} = 2 \sigma^2 \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = 2 * L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = \eta\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \frac{\eta}{2} \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0] / 2.0d; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\sigma^2) \f$. + * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\sigma^2) = \frac{x}{\sigma^2} \exp \left( -\frac{x^2}{2\sigma^2} \right) \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return ( Math.exp( - (x.array[0]*x.array[0]) / (2*param.array[0])) * x.array[0] ) / param.array[0]; + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = \sigma^2 \f$ + * @return a point + */ + public PVector drawRandomPoint(PVector L) { + PVector x = new PVector(1); + x.array[0] = Math.sqrt( - 2 * Math.log( Math.random() ) * L.array[0] ); + return x; + } + + + /** + * Computes the Kullback-Leibler divergence between two Rayleigh distributions.
+ * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P \| f_Q) = \log \left( \frac{\sigma_Q^2}{\sigma_P^2} \right) + \frac{ \sigma_P^2 - \sigma_Q^2 }{\sigma_Q^2} \f$ + */ + public double KLD(PVector LP, PVector LQ) { + double vP = LP.array[0]; + double vQ = LQ.array[0]; + return Math.log(vQ/vP) + ( (vP-vQ) / vQ); + } +} diff --git a/src/dev/java/japsadev/lib/jMEF/Test.java b/src/dev/java/japsadev/lib/jMEF/Test.java new file mode 100644 index 0000000..e298d29 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Test.java @@ -0,0 +1,81 @@ +package japsadev.lib.jMEF; + +import java.io.File; +import java.io.FileNotFoundException; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Scanner; +import java.util.Vector; + +public class Test { + /** + * Main function. + * @param args + */ + public static void main(String[] args) { + + // Display + String title = ""; + title += "+----------------------------------------+\n"; + title += "| Testing soft clustering & classical EM |\n"; + title += "+----------------------------------------+\n"; + System.out.print(title); + + // Variables + int n = 12; + + File file = new File("/home/sonhoanghguyen/Projects/scaffolding/repeat/porecamp_metaSpades.hist"); + ArrayList<PVector> vectors = new ArrayList<PVector>(); + try { + + Scanner sc = new Scanner(file); + int count=0; + while (sc.hasNext()) { + int length = sc.nextInt(); + double cov = sc.nextDouble(); + + for(int i=0; i < length; i++){ + PVector v = new PVector(1); + v.array[0]=cov; + vectors.add(v); + + //System.out.println(count++ + ":" + length + " | " + cov); + } + } + sc.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + + PVector[] points = vectors.stream().toArray(PVector[]::new); + System.out.println("Starting to fit by kmean..."); + long start = System.currentTimeMillis(); + + NumberFormat formatter = new
DecimalFormat("#0.00000"); + // Draw points from initial mixture model and compute the n clusters + Vector<PVector>[] clusters = KMeans.run(points, n); + + long end1 = System.currentTimeMillis(); + System.out.print("running time: " + formatter.format((end1 - start) / 1000d) + " seconds"); + + // Bregman soft clustering for Gauss + MixtureModel mmef; + mmef = BregmanSoftClustering.initialize(clusters, new UnivariateGaussian()); + mmef = BregmanSoftClustering.run(points, mmef); + System.out.println("Mixure model of Gaussian estimated using Bregman soft clustering \n" + mmef + "\n"); + long end2 = System.currentTimeMillis(); + System.out.print("running time: " + formatter.format((end2 - end1) / 1000d) + " seconds"); + + // Bregman soft clustering for Poisson + MixtureModel mmp; + mmp = BregmanSoftClustering.initialize(clusters, new Poisson()); + mmp = BregmanSoftClustering.run(points, mmp); + System.out.println("Mixure model of Poisson estimated using Bregman soft clustering \n" + mmp + "\n"); + + long end3 = System.currentTimeMillis(); + System.out.print("running time: " + formatter.format((end3 - end2) / 1000d) + " seconds"); + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/Tutorial1.java b/src/dev/java/japsadev/lib/jMEF/Tutorial1.java new file mode 100755 index 0000000..247c38e --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Tutorial1.java @@ -0,0 +1,338 @@ +package japsadev.lib.jMEF; + +import java.util.Locale; +import java.util.Vector; + +public class Tutorial1 { + + + /** + * Converts valid points (>0) to integer. + * @param points point set in \f$R^d\f$. + * @return a point set in \f$N^d\f$.
+ */ + private static PVector[] checkPoints(PVector[] points){ + // Count how many point are >0 + int n = 0; + for (int i=0; i0.5) + n++; + + // Convert valid points as integer + PVector[] new_points = new PVector[n]; + int idx = 0; + for (int i=0; i0.5){ + PVector p = new PVector(1); + p.array[0] = Math.round( points[i].array[0] ); + new_points[idx] = p; + idx++; + } + return new_points; + } + + + /** + * Compute the NMI. + * @param points set of points + * @param f initial mixture model + * @param g estimated mixture model + * @return NMI + */ + private static double NMI(PVector[] points, MixtureModel f, MixtureModel g){ + + int m = points.length; + int n = f.size; + + double[][] p = new double[n][n]; + + for (int i=0; if_max){ + f_max = f_tmp; + f_label = j; + } + if (g_tmp>g_max){ + g_max = g_tmp; + g_label = j; + } + } + p[f_label][g_label]++; + } + + // Normalization + for (int i=0; i0) + mi += p[i][j] * Math.log( p[i][j] / (fl[i] * gl[j]) ); + double hf = 0; + double hg = 0; + for (int i=0; i[] clusters = KMeans.run(points, 3); + + // Estimation of the mixture of Gaussians + MixtureModel mog; + mog = BregmanSoftClustering.initialize(clusters, new UnivariateGaussianFixedVariance(25)); + mog = BregmanSoftClustering.run(points, mog); + out[0] = NMI(points, f, mog); + + // Estimation of the mixture of Poisson + MixtureModel mop; + mop = BregmanSoftClustering.initialize(clusters, new Poisson()); + mop = BregmanSoftClustering.run(points, mop); + out[1] = NMI(points, f, mop); + + // Estimation of the mixture of Poisson + MixtureModel mob; + mob = BregmanSoftClustering.initialize(clusters, new BinomialFixedN(100)); + mob = BregmanSoftClustering.run(points, mob); + out[2] = NMI(points, f, mob); + + // Return + return out; + } + + + /** + * Compute the mixtures from points drawn from a mixture of Poisson distributions. + * @param m number of points drawn from the mixture. 
+ */ + private static double[] testPoisson(int m){ + + // Output vector + double[] out = new double[3]; + + // Initial model : Poisson + MixtureModel f = new MixtureModel(3); + f.EF = new Poisson(); + f.weight[0] = 1.0/3.0; + f.weight[1] = 1.0/3.0; + f.weight[2] = 1.0/3.0; + PVector p1 = new PVector(1); + PVector p2 = new PVector(1); + PVector p3 = new PVector(1); + p1.array[0] = 10; + p2.array[0] = 20; + p3.array[0] = 40; + f.param[0] = p1; + f.param[1] = p2; + f.param[2] = p3; + + // Draw points from the mixture + PVector[] points = f.drawRandomPoints(m); + points = checkPoints(points); + + // K-means + Vector<PVector>[] clusters = KMeans.run(points, 3); + + // Estimation of the mixture of Gaussians + MixtureModel mog; + mog = BregmanSoftClustering.initialize(clusters, new UnivariateGaussianFixedVariance(25)); + mog = BregmanSoftClustering.run(points, mog); + out[0] = NMI(points, f, mog); + + // Estimation of the mixture of Poisson + MixtureModel mop; + mop = BregmanSoftClustering.initialize(clusters, new Poisson()); + mop = BregmanSoftClustering.run(points, mop); + out[1] = NMI(points, f, mop); + + // Estimation of the mixture of Binomial + MixtureModel mob; + mob = BregmanSoftClustering.initialize(clusters, new BinomialFixedN(100)); + mob = BregmanSoftClustering.run(points, mob); + out[2] = NMI(points, f, mob); + + // Return + return out; + } + + + /** + * Compute the mixtures from points drawn from a mixture of Binomial distributions. + * @param m number of points drawn from the mixture.
+ */ + private static double[] testBinomial(int m){ + + // Output vector + double[] out = new double[3]; + + // Initial model : Binomial + MixtureModel f = new MixtureModel(3); + f.EF = new BinomialFixedN(100); + f.weight[0] = 1.0/3.0; + f.weight[1] = 1.0/3.0; + f.weight[2] = 1.0/3.0; + PVector p1 = new PVector(1); + PVector p2 = new PVector(1); + PVector p3 = new PVector(1); + p1.array[0] = 0.1; + p2.array[0] = 0.2; + p3.array[0] = 0.4; + f.param[0] = p1; + f.param[1] = p2; + f.param[2] = p3; + + // Draw points from the mixture + PVector[] points = f.drawRandomPoints(m); + points = checkPoints(points); + + // K-means + Vector<PVector>[] clusters = KMeans.run(points, 3); + + // Estimation of the mixture of Gaussians + MixtureModel mog; + mog = BregmanSoftClustering.initialize(clusters, new UnivariateGaussianFixedVariance(25)); + mog = BregmanSoftClustering.run(points, mog); + out[0] = NMI(points, f, mog); + + // Estimation of the mixture of Poisson + MixtureModel mop; + mop = BregmanSoftClustering.initialize(clusters, new Poisson()); + mop = BregmanSoftClustering.run(points, mop); + out[1] = NMI(points, f, mop); + + // Estimation of the mixture of Binomial + MixtureModel mob; + mob = BregmanSoftClustering.initialize(clusters, new BinomialFixedN(100)); + mob = BregmanSoftClustering.run(points, mob); + out[2] = NMI(points, f, mob); + + // Return + return out; + } + + + /** + * Main function.
+ * @param args + */ + public static void main(String[] args) { + + // Display + String title = ""; + title += "+-------------------------+\n"; + title += "| Bregman soft clustering |\n"; + title += "+-------------------------+\n"; + System.out.print(title); + + // Variables + int m = 1000; + int loop = 100; + + // NMI arrays + double[][] NMI_Gaussian = new double[loop][3]; + double[][] NMI_Poisson = new double[loop][3]; + double[][] NMI_Binomial = new double[loop][3]; + + // Computation of NMI + for (int l=0; l[] clusters = KMeans.run(points, n); + + // Classical EM + MixtureModel mmc; + mmc = ExpectationMaximization1D.initialize(clusters); + mmc = ExpectationMaximization1D.run(points, mmc); + System.out.println("Mixure model estimated using classical EM \n" + mmc + "\n"); + + // Bregman soft clustering + MixtureModel mmef; + mmef = BregmanSoftClustering.initialize(clusters, new UnivariateGaussian()); + mmef = BregmanSoftClustering.run(points, mmef); + System.out.println("Mixure model estimated using Bregman soft clustering \n" + mmef + "\n"); + + } +} diff --git a/src/dev/java/japsadev/lib/jMEF/Tutorial3.java b/src/dev/java/japsadev/lib/jMEF/Tutorial3.java new file mode 100755 index 0000000..9868c06 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Tutorial3.java @@ -0,0 +1,58 @@ +package japsadev.lib.jMEF; + +import java.awt.image.BufferedImage; + +import japsadev.lib.jMEF.Clustering.CLUSTERING_TYPE; + +public class Tutorial3 { + + + /** + * Main function. 
+ * @param args + */ + public static void main(String[] args) { + + // Display + String title = ""; + title += "+-----------------------------------------------+\n"; + title += "| Mixture simplification and image segmentation |\n"; + title += "+-----------------------------------------------+\n"; + System.out.print(title); + + // Variables + int n = 32; + int m = 8; + + // Image/texture information (to be changed to fit your configuration) + String input_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Input/"; + String output_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Output/"; + String image_name = "Baboon"; + String image_path = input_folder + image_name + ".png"; + String mixture_path = String.format("%s%s_3D_%03d.mix", input_folder, image_name, n); + + // Read the input image + System.out.print("Read input image : "); + BufferedImage image = Image.readImage(image_path); + System.out.println("ok"); + + // Read or generate the mixture model + System.out.print("Read/generate mixture model : "); + MixtureModel mm1 = Image.loadMixtureModel(mixture_path, image, 3, n); + System.out.println("ok"); + + // Compute the image segmentation based on the mixture mm1 + System.out.print("Segment image (mixture model) : "); + BufferedImage seg1 = Image.segmentColorImageFromMOG(image, mm1); + Image.writeImage(seg1, String.format("%sTutorial3_%s_%03d.png", output_folder, image_name, n)); + System.out.println("ok"); + + // Simplify mm1 in a mixture mm2 of m components and compute the image segmentation based on mm2 + System.out.print("Segment image (simplified mixture model) : "); + MixtureModel mm2 = BregmanHardClustering.simplify(mm1, m, CLUSTERING_TYPE.LEFT_SIDED); + BufferedImage seg2 = Image.segmentColorImageFromMOG(image, mm2); + Image.writeImage(seg2, String.format("%sTutorial3_%s_%03d.png", output_folder, image_name, m)); + System.out.println("ok"); + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/Tutorial4.java 
b/src/dev/java/japsadev/lib/jMEF/Tutorial4.java new file mode 100755 index 0000000..14685ed --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Tutorial4.java @@ -0,0 +1,64 @@ +package japsadev.lib.jMEF; + +import java.awt.image.BufferedImage; + +import japsadev.lib.jMEF.BregmanHierarchicalClustering.LINKAGE_CRITERION; +import japsadev.lib.jMEF.Clustering.CLUSTERING_TYPE; + +public class Tutorial4 { + + + /** + * Main function. + * @param args + */ + public static void main(String[] args) { + + // Display + String title = ""; + title += "+----------------------------------------------------+\n"; + title += "| Hierarchical mixture models and image segmentation |\n"; + title += "+----------------------------------------------------+\n"; + System.out.print(title); + + // Variables + int n = 32; + int m = 8; + + // Image/texture information (to be changed to fit your configuration) + String input_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Input/"; + String output_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Output/"; + String image_name = "Baboon"; + String image_path = input_folder + image_name + ".png"; + String mixture_path = String.format("%s%s_3D_%03d.mix", input_folder, image_name, n); + + // Read the input image + System.out.print("Read input image : "); + BufferedImage image = Image.readImage(image_path); + System.out.println("ok"); + + // Read or generate the mixture model + System.out.print("Read/generate mixture model : "); + MixtureModel mm1 = Image.loadMixtureModel(mixture_path, image, 3, n); + System.out.println("ok"); + + // Initial segmentation from MoG + System.out.print("Segment image (mixture model) : "); + BufferedImage seg1 = Image.segmentColorImageFromMOG(image, mm1); + Image.writeImage(seg1, String.format("%sTutorial4_%s_%03d.png", output_folder, image_name, n)); + System.out.println("ok"); + + // Build hierarchical mixture model + System.out.print("Create hierarchical mixture model : "); + HierarchicalMixtureModel hmm = 
BregmanHierarchicalClustering.build(mm1, CLUSTERING_TYPE.SYMMETRIC, LINKAGE_CRITERION.MAXIMUM_DISTANCE); + System.out.println("ok"); + + // Initial segmentation from simplified MoG + System.out.print("Segment image (hierarchical mixture model) : "); + MixtureModel mm2 = hmm.getResolution(m); + //MixtureModel mm2 = hmm.getOptimalMixtureModel(0.5); + BufferedImage seg2 = Image.segmentColorImageFromMOG(image, mm2); + Image.writeImage(seg2, String.format("%sTutorial4_%s_%03d.png", output_folder, image_name, m)); + System.out.println("ok"); + } +} diff --git a/src/dev/java/japsadev/lib/jMEF/Tutorial5.java b/src/dev/java/japsadev/lib/jMEF/Tutorial5.java new file mode 100755 index 0000000..fcb790c --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/Tutorial5.java @@ -0,0 +1,55 @@ +package japsadev.lib.jMEF; + +import java.awt.image.BufferedImage; + +public class Tutorial5{ + + + /** + * Main function. + * @param args + */ + public static void main(String[] args) { + + // Display + String title = ""; + title += "+----------------------------------------+\n"; + title += "| Statistical images from mixture models |\n"; + title += "+----------------------------------------+\n"; + System.out.print(title); + + // Variables + int n = 32; + + // Image/texture information (to be changed to fit your configuration) + String input_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Input/"; + String output_folder = "/home/sonhoanghguyen/workspace/MixtureModels/Output/"; + String image_name = "Baboon"; + String image_path = input_folder + image_name + ".png"; + String mixture_path = String.format("%s%s_5D_%03d.mix", input_folder, image_name, n); + + // Read the input image + System.out.print("Read input image : "); + BufferedImage image = Image.readImage(image_path); + System.out.println("ok"); + + // Read or generate the mixture model + System.out.print("Read/generate mixture model : "); + MixtureModel f = Image.loadMixtureModel(mixture_path, image, 5, n); + 
System.out.println("ok"); + + // Creates and save the statistical image + System.out.print("Create statistical image : "); + BufferedImage stat = Image.createImageFromMixtureModel(image.getWidth(), image.getHeight(), f); + Image.writeImage(stat, String.format("%sTutorial5_%s_statistical_%03d.png", output_folder, image_name, n)); + System.out.println("ok"); + + // Creates and save the ellipse image + System.out.print("Create ellipse image : "); + BufferedImage ell = Image.createEllipseImage(image.getWidth(), image.getHeight(), f, 2); + Image.writeImage(ell, String.format("%sTutorial5_%s_ellipses_%03d.png", output_folder, image_name, n)); + System.out.println("ok"); + + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/UnivariateGaussian.java b/src/dev/java/japsadev/lib/jMEF/UnivariateGaussian.java new file mode 100755 index 0000000..ccdcfcb --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/UnivariateGaussian.java @@ -0,0 +1,241 @@ +package japsadev.lib.jMEF; + +import japsadev.lib.jMEF.Parameter.TYPE; + +import java.util.Random; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * The univariate Gaussian distribution is an exponential family and, as a consequence, the probability density function is given by + * \f[ f(x; \mathbf{\Theta}) = \exp \left( \langle t(x), \mathbf{\Theta} \rangle - F(\mathbf{\Theta}) + k(x) \right) \f] + * where \f$ \mathbf{\Theta} \f$ are the natural parameters. + * This class implements the different functions allowing to express a univariate Gaussian distribution as a member of an exponential family. 
+ * + * @section Parameters + * + * The parameters of a given distribution are: + * - Source parameters \f$\mathbf{\Lambda} = ( \mu , \sigma^2 ) \in R \times R^+\f$ + * - Natural parameters \f$\mathbf{\Theta} = ( \theta_1 , \theta_2 ) \in R \times R^-\f$ + * - Expectation parameters \f$ \mathbf{H} = ( \eta_1 , \eta_2 ) \in R \times R^+\f$ + */ +public final class UnivariateGaussian extends ExponentialFamily{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T parameters \f$ \mathbf{\Theta} = ( \theta_1 , \theta_2 ) \f$ + * @return \f$ F(\mathbf{\Theta}) = -\frac{\theta_1^2}{4\theta_2} + \frac{1}{2} \log \left( -\frac{\pi}{\theta_2} \right) \f$ + */ + public double F(PVector T){ + return -0.25d * T.array[0]*T.array[0]/T.array[1] + 0.5d * Math.log(-Math.PI/T.array[1]); + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = ( \theta_1 , \theta_2 ) \f$ + * @return \f$ \nabla F(\mathbf{\Theta}) = \left( -\frac{\theta_1}{2 \theta_2} , -\frac{1}{2 \theta_2} + \frac{\theta_1^2}{4 \theta_2^2} \right) \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = new PVector(2); + gradient.array[0] = -0.5d * T.array[0]/T.array[1]; + gradient.array[1] = 0.25d * (T.array[0]*T.array[0])/(T.array[1]*T.array[1]) - 0.5d/T.array[1]; + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. + * @param H expectation parameters \f$ \mathbf{H} = ( \eta_1 , \eta_2 ) \f$ + * @return \f$ G(\mathbf{H}) = - \frac{1}{2} \log \left| \eta_1^2 - \eta_2 \right| \f$ + */ + public double G(PVector H){ + return -0.5d * Math.log(Math.abs(H.array[0]*H.array[0] - H.array[1])); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$.
+ * @param H expectation parameters \f$ \mathbf{H} = ( \eta_1 , \eta_2) \f$ + * @return \f$ \nabla G(\mathbf{H}) = \left( -\frac{\eta_1}{\eta_1^2-\eta_2} , \frac{1}{2 (\eta_1^2-\eta_2)} \right) \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = new PVector(2); + double tmp = H.array[0]*H.array[0] - H.array[1]; + gradient.array[0] = -H.array[0]/tmp; + gradient.array[1] = 0.5d/tmp; + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = (x , x^2) \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(2); + t.array[0] = x.array[0]; + t.array[1] = x.array[0]*x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = 0 \f$ + */ + public double k(PVector x){ + return 0.0d; + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \sigma^2 )\f$ + * @return natural parameters \f$ \mathbf{\Theta} = \left( \frac{\mu}{\sigma^2} , -\frac{1}{2\sigma^2} \right) \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(2); + T.array[0] = L.array[0] / L.array[1]; + T.array[1] = -1.0d / (2*L.array[1]); + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = ( \theta_1 , \theta_2 )\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \left( -\frac{\theta_1}{2 \theta_2} , -\frac{1}{2 \theta_2} \right) \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(2); + L.array[0] = - T.array[0] / (2 * T.array[1]); + L.array[1] = - 1 / (2 * T.array[1]); + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \sigma^2 )\f$ + * @return expectation parameters \f$ \mathbf{H} = \left( \mu , \sigma^2 + \mu^2 \right) \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(2); + H.array[0] = L.array[0]; + H.array[1] = L.array[0]*L.array[0] + L.array[1]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H expectation parameters \f$ \mathbf{H} = ( \eta_1 , \eta_2 )\f$ + * @return source parameters \f$ \mathbf{\Lambda} = \left( \eta_1 , \eta_2 - \eta_1^2 \right) \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(2); + L.array[0] = H.array[0]; + L.array[1] = H.array[1] - H.array[0] * H.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Box-Muller transform/generator. + * @param mu mean \f$ \mu \f$ + * @param sigma standard deviation \f$ \sigma \f$ + * @return \f$ \mu + \sigma \sqrt{ -2 \log ( u_1 ) } \cos (2 \pi u_2) \f$ where \f$ u_1, u_2 \in \mathcal{U}(0,1)\f$ are drawn independently + */ + public static double Rand(double mu, double sigma){ + return mu + sigma * Math.sqrt( -2.0d * Math.log(Math.random()) ) * Math.cos( 2.0d * Math.PI * Math.random() ); + } + + + /** + * Box-Muller transform/generator + * @return \f$ \sqrt{ -2 \log ( u_1 ) } \cos (2 \pi u_2) \f$ where \f$ u_1, u_2 \in \mathcal{U}(0,1)\f$ are drawn independently + */ + public static double Rand(){ + return Rand(0,1); + } + + + /** + * Computes the density value \f$ f(x;\mu,\sigma^2) \f$.
+ * @param x point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\mu,\sigma^2) = \frac{1}{ \sqrt{2\pi \sigma^2} } \exp \left( - \frac{(x-\mu)^2}{ 2 \sigma^2} \right) \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return Math.exp( - (x.array[0]-param.array[0])*(x.array[0]-param.array[0]) / (2.0d*param.array[1]) ) / (Math.sqrt(2.0d*Math.PI*param.array[1])); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = ( \mu , \sigma^2 )\f$ + * @return a point + */ + public PVector drawRandomPoint(PVector L) { + + // Mean and variance + PVector mean = new PVector(1); + PVector variance = new PVector(1); + mean.array[0] = L.array[0]; + variance.array[0] = L.array[1]; + + // Draw the point + Random rand = new Random(); + PVector v = new PVector(1); + v.array[0] = rand.nextGaussian() * Math.sqrt(variance.array[0]); + return v.Plus(mean); + } + + + /** + * Computes the Kullback-Leibler divergence between two univariate Gaussian distributions. 
+ * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P\|f_Q) = \frac{1}{2} \left( 2 \log \frac{\sigma_Q}{\sigma_P} + \frac{\sigma_P^2}{\sigma_Q^2} + \frac{(\mu_Q-\mu_P)^2}{\sigma_Q^2} -1\right) \f$ + */ + public double KLD(PVector LP, PVector LQ) { + double mP = LP.array[0]; + double vP = LP.array[1]; + double mQ = LQ.array[0]; + double vQ = LQ.array[1]; + return 0.5d * ( 2 * Math.log(Math.sqrt(vQ/vP)) + vP/vQ + ((mQ-mP)*(mQ-mP))/vQ - 1 ); + } + + +} diff --git a/src/dev/java/japsadev/lib/jMEF/UnivariateGaussianFixedVariance.java b/src/dev/java/japsadev/lib/jMEF/UnivariateGaussianFixedVariance.java new file mode 100755 index 0000000..da1bd02 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/UnivariateGaussianFixedVariance.java @@ -0,0 +1,225 @@ +package japsadev.lib.jMEF; + +import java.util.Random; + +import japsadev.lib.jMEF.Parameter.TYPE; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * The univariate Gaussian distribution, with fixed variance \f$ \sigma^2 \f$, is an exponential family and, as a consequence, the probability density function is given by + * \f[ f(x; \mathbf{\Theta}) = \exp \left( \langle t(x), \mathbf{\Theta} \rangle - F(\mathbf{\Theta}) + k(x) \right) \f] + * where \f$ \mathbf{\Theta} \f$ are the natural parameters. + * This class implements the different functions allowing to express a univariate Gaussian distribution as a member of an exponential family. 
+ * + * @section Parameters + * + * The parameters of a given distribution are: + * - Source parameters \f$\mathbf{\Lambda} = \mu \in R\f$ + * - Natural parameters \f$\mathbf{\Theta} = \theta \in R\f$ + * - Expectation parameters \f$ \mathbf{H} = \eta \in R\f$ + */ +public final class UnivariateGaussianFixedVariance extends ExponentialFamily{ + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Variance \f$ \sigma^2\f$ + */ + private double variance; + + + /** + * Class constructor with \f$\sigma^2=1\f$ + */ + public UnivariateGaussianFixedVariance(){ + this.variance = 1.0; + } + + + /** + * Class constructor. + * @param variance variance \f$ \sigma^2\f$ + */ + public UnivariateGaussianFixedVariance(double variance){ + this.variance = variance; + } + + + /** + * Computes the log normalizer \f$ F( \mathbf{\Theta} ) \f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ F(\mathbf{\Theta}) = \frac{\sigma^2 \theta^2 + \log(2 \pi \sigma^2)}{2} \f$ + */ + public double F(PVector T){ + return (T.array[0] * T.array[0] * variance + Math.log(2 * Math.PI * variance)) / 2.0; + } + + + /** + * Computes \f$ \nabla F ( \mathbf{\Theta} )\f$. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return \f$ \nabla F( \mathbf{\Theta} ) = \sigma^2 \theta \f$ + */ + public PVector gradF(PVector T){ + PVector gradient = new PVector(1); + gradient.array[0] = variance * T.array[0]; + gradient.type = TYPE.EXPECTATION_PARAMETER; + return gradient; + } + + + /** + * Computes \f$ G(\mathbf{H})\f$. 
+ * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ G(\mathbf{H}) = \frac{\eta^2}{2 \sigma^2} \f$ + */ + public double G(PVector H){ + return (H.array[0] * H.array[0]) / (2 * variance); + } + + + /** + * Computes \f$ \nabla G (\mathbf{H})\f$ + * @param H expectation parameters \f$ \mathbf{H} = \eta \f$ + * @return \f$ \nabla G( \mathbf{H} ) = \frac{\eta}{\sigma^2} \f$ + */ + public PVector gradG(PVector H){ + PVector gradient = new PVector(1); + gradient.array[0] = H.array[0] / variance; + gradient.type = TYPE.NATURAL_PARAMETER; + return gradient; + } + + + /** + * Computes the sufficient statistic \f$ t(x)\f$. + * @param x a point + * @return \f$ t(x) = x \f$ + */ + public PVector t(PVector x){ + PVector t = new PVector(1); + t.array[0] = x.array[0]; + t.type = TYPE.EXPECTATION_PARAMETER; + return t; + } + + + /** + * Computes the carrier measure \f$ k(x) \f$. + * @param x a point + * @return \f$ k(x) = -\frac{x^2}{2 \sigma^2} \f$ + */ + public double k(PVector x){ + return -(x.array[0]*x.array[0])/(2*variance); + } + + + /** + * Converts source parameters to natural parameters. + * @param L source parameters \f$ \mathbf{\Lambda} = \mu \f$ + * @return natural parameters \f$ \mathbf{\Theta} = \frac{\mu}{\sigma^2} \f$ + */ + public PVector Lambda2Theta(PVector L){ + PVector T = new PVector(1); + T.array[0] = L.array[0] / variance; + T.type = TYPE.NATURAL_PARAMETER; + return T; + } + + + /** + * Converts natural parameters to source parameters. + * @param T natural parameters \f$ \mathbf{\Theta} = \theta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \theta \sigma^2 \f$ + */ + public PVector Theta2Lambda(PVector T){ + PVector L = new PVector(1); + L.array[0] = T.array[0] * variance; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Converts source parameters to expectation parameters. 
+ * @param L source parameters \f$ \mathbf{\Lambda} = \lambda \f$ + * @return expectation parameters \f$ \mathbf{H} = \lambda \f$ + */ + public PVector Lambda2Eta(PVector L){ + PVector H = new PVector(1); + H.array[0] = L.array[0]; + H.type = TYPE.EXPECTATION_PARAMETER; + return H; + } + + + /** + * Converts expectation parameters to source parameters. + * @param H natural parameters \f$ \mathbf{H} = \eta \f$ + * @return source parameters \f$ \mathbf{\Lambda} = \eta \f$ + */ + public PVector Eta2Lambda(PVector H){ + PVector L = new PVector(1); + L.array[0] = H.array[0]; + L.type = TYPE.SOURCE_PARAMETER; + return L; + } + + + /** + * Computes the density value \f$ f(x;\mu,\sigma^2) \f$. + * @param x a point + * @param param parameters (source, natural, or expectation) + * @return \f$ f(x;\mu,\sigma^2) = \frac{1}{ \sqrt{2\pi \sigma^2} } \exp \left( - \frac{(x-\mu)^2}{ 2 \sigma^2} \right) \f$ + */ + public double density(PVector x, PVector param){ + if (param.type==TYPE.SOURCE_PARAMETER) + return Math.exp(-((x.array[0]-param.array[0])*(x.array[0]-param.array[0]))/(2*variance)) / (Math.sqrt(2 * Math.PI * variance)); + else if(param.type==TYPE.NATURAL_PARAMETER) + return super.density(x, param); + else + return super.density(x, Eta2Theta(param)); + } + + + /** + * Draws a point from the considered distribution. + * @param L source parameters \f$ \mathbf{\Lambda} = \lambda \f$ + * @return a point + */ + public PVector drawRandomPoint(PVector L) { + Random rand = new Random(); + PVector v = new PVector(1); + v.array[0] = rand.nextGaussian() * Math.sqrt(this.variance); + return v.Plus(L); + } + + + /** + * Computes the Kullback-Leibler divergence between two univariate Gaussian distributions. 
+ * @param LP source parameters \f$ \mathbf{\Lambda}_P \f$ + * @param LQ source parameters \f$ \mathbf{\Lambda}_Q \f$ + * @return \f$ D_{\mathrm{KL}}(f_P\|f_Q) = \frac{(\mu_Q-\mu_P)^2}{2\sigma^2} \f$ + */ + public double KLD(PVector LP, PVector LQ) { + double mP = LP.array[0]; + double mQ = LQ.array[0]; + return ((mQ-mP)*(mQ-mP)) / (2*variance); + } + +} diff --git a/src/dev/java/japsadev/lib/jMEF/jMEFException.java b/src/dev/java/japsadev/lib/jMEF/jMEFException.java new file mode 100755 index 0000000..b5cc582 --- /dev/null +++ b/src/dev/java/japsadev/lib/jMEF/jMEFException.java @@ -0,0 +1,58 @@ +package japsadev.lib.jMEF; + + +/** + * @author Vincent Garcia + * @author Frank Nielsen + * @version 1.0 + * + * @section License + * + * See file LICENSE.txt + * + * @section Description + * + * This class provides exception for the jMEF library. + */ +public class jMEFException extends Exception { + + + /** + * Constant for serialization. + */ + private static final long serialVersionUID = 1L; + + + /** + * Exception message. + */ + String msg; + + + /** + * Class constructor. + * @param s exception message + */ + public jMEFException(String s){ + msg = s; + } + + + /** + * Method toString. + * @return string describing the exception + */ + public String toString(){ + return new String(super.toString() + " --> " + msg); + } + + + /** + * Returns the detail message string of this exception. 
+ * @return the detail message string of this exception + */ + public String getMessage(){ + return msg + "|" + super.getMessage(); + } + +} diff --git a/src/dev/java/japsadev/obsolete/np/BarCodeAnalysis.java b/src/dev/java/japsadev/obsolete/np/BarCodeAnalysis.java new file mode 100644 index 0000000..10d9206 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/BarCodeAnalysis.java @@ -0,0 +1,202 @@ +package japsadev.obsolete.np; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.Logging; + +public class BarCodeAnalysis { + int SCAN_WINDOW, + DIST_THRES, + SCORE_THRES; + public static boolean toPrint=false; + ArrayList barCodes; //barcode sequences + ArrayList barCodeComps; //barcode complement sequences + Process[] processes; + int nSamples; + int barcodeLen; + SequenceOutputStream[] streamToScaffolder, streamToFile; + + public BarCodeAnalysis(String barcodeFile, String scriptFile) throws IOException{ + barCodes = SequenceReader.readAll(barcodeFile, Alphabet.DNA()); + nSamples = barCodes.size(); + + processes = new Process[nSamples]; + streamToScaffolder = new SequenceOutputStream[nSamples]; + if(toPrint) + streamToFile = new SequenceOutputStream[nSamples]; + + barCodeComps = new ArrayList (barCodes.size()); + String id; + for(int i=0;i" + id + ":" + barCode); + + ProcessBuilder pb = new ProcessBuilder(scriptFile, id) + .redirectError(new File("log_" + id + ".err")) + .redirectOutput(new File("log_" + id + ".out")); + pb.directory(new File(System.getProperty("user.dir"))); + + processes[i] = pb.start(); + + Logging.info("Job for " + id + " started"); + streamToScaffolder[i] = new SequenceOutputStream(processes[i].getOutputStream()); + if(toPrint) + streamToFile[i] = SequenceOutputStream.makeOutputStream(id+"_clustered.fasta"); + } + + barcodeLen = barCodes.get(0).length(); + SCAN_WINDOW = 
barcodeLen * 3; + SCORE_THRES = barcodeLen; + DIST_THRES = SCORE_THRES / 3; + } + + public void setThreshold(int score){ + SCORE_THRES=score; + DIST_THRES = SCORE_THRES / 3; + } + /* + * Trying to clustering MinION read data into different samples based on the barcode + */ + public void clustering(String dataFile) throws IOException, InterruptedException{ + SequenceReader reader; + if(dataFile.equals("-")) + reader = SequenceReader.getReader(System.in); + else + reader = SequenceReader.getReader(dataFile); + Sequence seq; + + Sequence s5, s3; + final double[] tf = new double[nSamples], + tr = new double[nSamples], + cr = new double[nSamples], + cf = new double[nSamples]; + // Integer[] tRank = new Integer[nSamples], + // cRank = new Integer[nSamples]; + // jaligner.Alignment[] alignmentsTF = new jaligner.Alignment[pop], + // alignmentsTR = new jaligner.Alignment[pop], + // alignmentsCF = new jaligner.Alignment[pop], + // alignmentsCR = new jaligner.Alignment[pop]; + + Sequence barcodeSeq = new Sequence(Alphabet.DNA4(),barcodeLen,"barcode"); + Sequence tipSeq = new Sequence(Alphabet.DNA4(),SCAN_WINDOW,"tip"); + + BarcodeAlignment barcodeAlignment = new BarcodeAlignment(barcodeSeq, tipSeq); + + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + if(seq.length() < 200){ + System.err.println("Ignore short sequence " + seq.getName()); + continue; + } + //alignment algorithm is applied here. 
For the beginning, Smith-Waterman local pairwise alignment is used + + s5 = seq.subSequence(0, SCAN_WINDOW); + s3 = seq.subSequence(seq.length()-SCAN_WINDOW,seq.length()); + + + + double bestScore = 0.0; + double distance = 0.0; //distance between bestscore and the runner-up + + int bestIndex = nSamples; + + for(int i=0;i bestScore){ + //Logging.info("Better score=" + myScore); + distance = myScore-bestScore; + bestScore = myScore; + bestIndex = i; + } else if((bestScore-myScore) < distance){ + distance=bestScore-myScore; + } + + } + + if(bestScore < SCORE_THRES || distance < DIST_THRES){ + //Logging.info("Unknown sequence " + seq.getName()); + continue; + } + //if the best (sum of both ends) alignment in template sequence is greater than in complement + else { + Logging.info("Sequence " + seq.getName() + " might belongs to sample " + barCodes.get(bestIndex).getName() + " with score=" + bestScore); + if(bestIndex=start2){ + // seq1=origSeq1.substring(0, start1) + alnSeq1 + origSeq1.substring(start1+alnSeq1.length()-gap1); + // String seq2Filler = start1==start2?"":String.format("%"+(start1-start2)+"s", ""), + // markFiller = start1==0?"":String.format("%"+start1+"s", ""); + // seq2= seq2Filler + origSeq2.substring(0, start2) + alnSeq2 + origSeq2.substring(start2+alnSeq2.length()-gap2); + // mark= markFiller+String.valueOf(alignment.getMarkupLine()); + // }else{ + // seq2=origSeq2.substring(0, start2) + alnSeq2 + origSeq2.substring(start2+alnSeq2.length()-gap2); + // String markFiller = start2==0?"":String.format("%"+start2+"s", ""); + // seq1=String.format("%"+(start2-start1)+"s", "") + origSeq1.substring(0, start1) + alnSeq1 + origSeq1.substring(start1+alnSeq1.length()-gap1); + // mark=markFiller+String.valueOf(alignment.getMarkupLine()); + // } + // //System.out.println(alignment.getSummary()); + // System.out.println(seq1); + // System.out.println(mark); + // System.out.println(seq2); + // } + +} diff --git 
a/src/dev/java/japsadev/obsolete/np/BarcodeAlignment.java b/src/dev/java/japsadev/obsolete/np/BarcodeAlignment.java new file mode 100644 index 0000000..e9a6015 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/BarcodeAlignment.java @@ -0,0 +1,279 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/****************************************************************************
+ *                           Revision History
+ * 22 Nov 2016 - Minh Duc Cao: Started
+ *
+ ****************************************************************************/
+package japsadev.obsolete.np;
+
+
+import japsa.seq.Sequence;
+
+/**
+ * Scores a barcode sequence against a read sequence with a local,
+ * affine-gap alignment: cell scores are floored at 0 and the best cell is kept.
+ * Only the score and the traceback tables are produced; no traceback is
+ * performed in this class.
+ *
+ * Implement based on jaligner from Ahmed Moustafa
+ * See license below
+ */
+
+public final class BarcodeAlignment {
+
+	/**
+	 * Traceback direction stop
+	 */
+	public static final byte STOP = 0;
+	/**
+	 * Traceback direction left
+	 */
+	public static final byte LEFT = 1;
+	/**
+	 * Traceback direction diagonal
+	 */
+	public static final byte DIAGONAL = 2;
+	/**
+	 * Traceback direction up
+	 */
+	public static final byte UP = 3;
+
+	/**
+	 * Allocates the work tables for a pair of sequences.
+	 * The table dimensions (m, n) are fixed here from the lengths of s1 and s2.
+	 *
+	 * @param s1 barcode sequence (rows of the table)
+	 * @param s2 read sequence (columns of the table)
+	 */
+	public BarcodeAlignment(Sequence s1, Sequence s2) {
+		super();
+		this.barcodeSequence = s1;
+		this.readSequence = s2;
+
+		m = s1.length() + 1;
+		n = s2.length() + 1;
+
+		//Initilise the arrays
+		pointers = new byte[m * n];
+		sizesOfVerticalGaps = new short[m * n];
+		sizesOfHorizontalGaps = new short[m * n];
+	}
+
+	Sequence barcodeSequence;
+	Sequence readSequence;
+	// Table dimensions: m = barcode length + 1 (rows), n = read length + 1 (columns)
+	int m,n;
+	// Row-major m*n tables: traceback direction and gap-run lengths per cell
+	byte[] pointers;
+	short[] sizesOfVerticalGaps;
+	short[] sizesOfHorizontalGaps;
+	//BLOSSOM62
+	//double [][] scores =
+	//	{{4.0,0.0,0.0,0.0},
+	//	{0.0,9.0,-3.0,-1.0},
+	//	{0.0,-3.0,6.0,-2.0},
+	//	{0.0,-1.0,-2.0,5.0}
+	//	};
+
+	//poreFUME's scores
+	double openPenalty = 4.7;
+	double extendPenalty = 1.6;
+
+	// Match/mismatch matrix, indexed [barcode base][read base]
+	double [][] scores = {
+		{  2.7, -4.5, -4.5, -4.5},
+		{ -4.5,  2.7, -4.5, -4.5},
+		{ -4.5, -4.5,  2.7, -4.5},
+		{ -4.5, -4.5, -4.5,  2.7}
+	};
+
+
+	/**
+	 * Row of the best-scoring cell found by the last call to construct()
+	 */
+	private int cellRow;
+	/**
+	 * Column of the cell
+	 */
+	private int cellCol;
+	/**
+	 * Alignment score at this cell
+	 */
+	private double cellScore;
+
+
+	/**
+	 * Replaces the barcode sequence. m, n and the work tables are NOT
+	 * recomputed here, so they keep the sizes set in the constructor.
+	 * NOTE(review): presumably the new sequence must have the same length
+	 * as the one given to the constructor - confirm with callers.
+	 */
+	public void setBarcodeSequence(Sequence seq){
+		barcodeSequence = seq;
+	}
+
+	/**
+	 * Replaces the read sequence. As with setBarcodeSequence, the table
+	 * dimensions are left unchanged.
+	 */
+	public void setReadSequence(Sequence seq){
+		readSequence = seq;
+	}
+
+
+
+	/**
+	 * Resets the traceback tables and runs the alignment of the current
+	 * barcode against the current read.
+	 *
+	 * @return the best cell score, as returned by construct()
+	 */
+	public double align() {
+		// Initializes the boundaries of the traceback matrix to STOP.
+		for (int i = 0, k = 0; i < m; i++, k += n) {
+			pointers[k] = STOP;
+		}
+		for (int j = 1; j < n; j++) {
+			pointers[j] = STOP;
+		}
+
+		// Every gap run starts at length 1; construct() extends them
+		for (int i = 0, k = 0; i < m; i++, k += n) {
+			for (int j = 0; j < n; j++) {
+				sizesOfVerticalGaps[k + j] = sizesOfHorizontalGaps[k + j] = 1;
+			}
+		}
+		return construct();
+	}
+
+	/**
+	 * Fills the traceback direction and gap-size tables and tracks the
+	 * best-scoring cell. It takes no arguments: the sequences, the scoring
+	 * matrix and the open/extend penalties are read from the instance fields.
+	 * The bases returned by getBase() index the 4x4 scores matrix directly.
+	 * NOTE(review): assumes getBase() returns 0..3 - confirm for the alphabet in use.
+	 *
+	 * @return the highest cell score found (also stored in cellScore, with its
+	 *         position in cellRow/cellCol). If either sequence is empty the
+	 *         loops do not run and negative infinity is returned.
+	 */
+	private double construct() {
+		//logger.info("Started...");
+		//long start = System.currentTimeMillis();
+
+		double f; // score of alignment x1...xi to y1...yi if xi aligns to yi
+		double[] g = new double[n]; // score if xi aligns to a gap after yi
+		double h; // score if yi aligns to a gap after xi
+		double[] v = new double[n]; // best score of alignment x1...xi to
+		// y1...yi
+		double vDiagonal;
+
+		g[0] = Float.NEGATIVE_INFINITY;
+		h = Float.NEGATIVE_INFINITY;
+		v[0] = 0;
+
+		for (int j = 1; j < n; j++) {
+			g[j] = Float.NEGATIVE_INFINITY;
+			v[j] = 0;
+		}
+
+		double similarityScore, g1, g2, h1, h2;
+
+		cellScore = Float.NEGATIVE_INFINITY;
+		//Cell cell = new Cell();
+
+		// i walks barcode rows, j walks read columns; k is the row offset and
+		// l = k + j the flat index into the m*n tables. Only one row of v and g
+		// is kept, so vDiagonal saves the previous row's v[j-1] before it is overwritten.
+		for (int i = 1, k = n; i < m; i++, k += n) {
+			h = Float.NEGATIVE_INFINITY;
+			vDiagonal = v[0];
+			for (int j = 1, l = k + 1; j < n; j++, l++) {
+				similarityScore = scores[barcodeSequence.getBase(i-1)][readSequence.getBase(j-1)];
+
+				// Fill the matrices
+				f = vDiagonal + similarityScore;
+
+				// Vertical gap: extend the run from the cell above, or open a new one
+				g1 = g[j] - extendPenalty;
+				g2 = v[j] - openPenalty;
+				if (g1 > g2) {
+					g[j] = g1;
+					sizesOfVerticalGaps[l] = (short) (sizesOfVerticalGaps[l - n] + 1);
+				} else {
+					g[j] = g2;
+				}
+
+				// Horizontal gap: extend the run from the cell to the left, or open a new one
+				h1 = h - extendPenalty;
+				h2 = v[j - 1] - openPenalty;
+				if (h1 > h2) {
+					h = h1;
+					sizesOfHorizontalGaps[l] = (short) (sizesOfHorizontalGaps[l - 1] + 1);
+				} else {
+					h = h2;
+				}
+
+				vDiagonal = v[j];
+				// Flooring at 0 is what makes the alignment local
+				v[j] = maximum(f, g[j], h, 0);
+
+				// Determine the traceback direction
+				if (v[j] == 0) {
+					pointers[l] = STOP;
+				} else if (v[j] == f) {
+					pointers[l] = DIAGONAL;
+				} else if (v[j] == g[j]) {
+					pointers[l] = UP;
+				} else {
+					pointers[l] = LEFT;
+				}
+
+				// Set the traceback start at the current cell i, j and score
+				if (v[j] > cellScore) {
+					cellRow = i;
+					cellCol = j;
+					cellScore = v[j];
+					//cell.set(i, j, v[j]);
+				}
+			}
+		}
+		return cellScore;
+	}
+
+
+	/**
+	 * Returns the maximum of 4 double values.
+	 *
+	 * @param a
+	 *            value #1
+	 * @param b
+	 *            value #2
+	 * @param c
+	 *            value #3
+	 * @param d
+	 *            value #4
+	 * @return The maximum of a, b, c and d.
+	 */
+	private static double maximum(double a, double b, double c, double d) {
+		if (a > b) {
+			if (a > c) {
+				return a > d ? a : d;
+			} else {
+				return c > d ? c : d;
+			}
+		} else if (b > c) {
+			return b > d ? b : d;
+		} else {
+			return c > d ? c : d;
+		}
+	}
+
+}
+
+/**
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */ diff --git a/src/dev/java/japsadev/obsolete/np/BaseMethylationCmd.java b/src/dev/java/japsadev/obsolete/np/BaseMethylationCmd.java new file mode 100644 index 0000000..e2fb34f --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/BaseMethylationCmd.java @@ -0,0 +1,46 @@ +package japsadev.obsolete.np; + +import japsa.util.CommandLine; +import japsa.util.JapsaException; +import japsa.util.deploy.Deployable; + +@Deployable(scriptName = "jsa.np.meth", +scriptDesc = "Detecting methylated bases using Oxford Nanopore sequencing signal") +public class BaseMethylationCmd extends CommandLine{ + public BaseMethylationCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("samFile", null, "Name of the .sam file when nanopore reads align with the reference"); + addString("refFile", null, "Name of the reference fasta file"); + addString("gffFile", null, "Name of the GFF file specifying methylation position of ref"); + addString("nanopore", null, "Name of folder containing nanopore raw reads in hdf5 format"); + addStdHelp(); + } + /** + * @param args + * @throws Exception + * @throws JapsaException + * @throws OutOfMemoryError + */ + public static void main(String[] args) throws OutOfMemoryError, JapsaException, Exception { + // TODO Auto-generated method stub + CommandLine cmdLine = new BaseMethylationCmd(); + args = cmdLine.stdParseLine(args); + + String refFile = cmdLine.getStringVal("refFile"); + String samFile = cmdLine.getStringVal("samFile"); + String gffFile = cmdLine.getStringVal("gffFile"); + String nanopore = cmdLine.getStringVal("nanopore"); + + KmerMap map = new KmerMap(gffFile, "m6A"); +// map.scanUnmethylate(refFile); +// map.scanAlignment(samFile); +// map.scanHDF5(nanopore); +// map.print(); + KmerMap.statsHDF5(nanopore); + } + +} diff --git a/src/dev/java/japsadev/obsolete/np/Fast5DetailReader.java 
b/src/dev/java/japsadev/obsolete/np/Fast5DetailReader.java new file mode 100644 index 0000000..3d83ca8 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/Fast5DetailReader.java @@ -0,0 +1,444 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 21/07/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.obsolete.np; + +import java.util.List; + +import ncsa.hdf.object.Group; +import ncsa.hdf.object.HObject; +import ncsa.hdf.object.h5.H5CompoundDS; +import ncsa.hdf.object.h5.H5ScalarDS; +import japsa.util.JapsaException; +import japsa.util.Logging; + +/** + * Read detail nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. + * Re-implemented from the previous to aim for faster, which static key + * + * @author minhduc + */ +public class Fast5DetailReader extends Fast5NPReader{ + static String RAW_PREFIX = "/Raw/Reads"; + static String CHANNEL_ID = "/UniqueGlobalKey/channel_id"; + static String TRACKING_ID = "/UniqueGlobalKey/tracking_id"; + + private double samplingRate = 0;//Default + private int channelNumber = 0; + private long startTime = 0; + + public Fast5DetailReader (String fileName) throws JapsaException, OutOfMemoryError, Exception{ + super(fileName); + readMetaData(); + } + + + public double getSamplingRate() { + return samplingRate; + } + + + public int getChannelNumber() { + return channelNumber; + } + + public long getStartTime() { + return startTime; + } + + + /** + * Extract metadata about the read, including: + * - sampling rate + * - channel number + * @throws Exception + */ + + private void readMetaData() throws Exception{ + HObject data = f5File.get(CHANNEL_ID); + @SuppressWarnings("unchecked") + List aL = (List) data.getMetadata(); + for (ncsa.hdf.object.Attribute att:aL){ + if (att.getName().equals("sampling_rate")){ + samplingRate = ((double[]) att.getValue())[0]; + }else if (att.getName().equals("channel_number")){ + channelNumber = Integer.parseInt(((String[]) att.getValue())[0]); + } + } + } + + /** + * 
Extract raw event from the read. Start time is also extracted in the process + * @return + * @throws Exception + */ + + public RawSignal getRawEvent() throws Exception{ + HObject data = f5File.get(RAW_PREFIX); + //Logging.info("Read 1 " + (data == null)); + if (data !=null){ + Group group = (Group) data; + group = (Group) group.getMemberList().get(0); + @SuppressWarnings("unchecked") + List aL = (List) group.getMetadata(); + for (ncsa.hdf.object.Attribute att:aL){ + if (att.getName().equals("start_time")){ + startTime = ((long[]) att.getValue())[0]; + } + } + H5ScalarDS myDat =((H5ScalarDS) group.getMemberList().get(0)); + //Logging.info("Read 2 " + (myDat == null)); + if (myDat != null){ + short [] rawEvent = (short[])myDat.getData(); + //Logging.info("Read 3 " + (rawEvent == null)); + RawSignal rawSignal = new RawSignal(rawEvent); + return rawSignal; + } + } + return null; + } + + public void readData() throws OutOfMemoryError, Exception{ + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readData(root); + } + + + + /** + * Recursively print a group and its members. Fastq data are read.If all + * flag is turned on, this method will also reads all events and model data. 
+ * @throws OutOfMemoryError + * + * @throws Exception + */ + private void readData(Group g) throws OutOfMemoryError, Exception{ + + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + //String f = member.getFullName(); + if (member instanceof Group) { + readData((Group) member); + }else if (member instanceof H5CompoundDS){ + String fullName = member.getFullName(); + + //Logging.info(member.getClass() +" "); + @SuppressWarnings("unchecked") + List dat = (List) (((H5CompoundDS) member).getData()); + if (dat != null){ + /********************************************************/ + if (fullName.startsWith("/Analyses/EventDetection_000/Reads/") && fullName.endsWith("Events") ){ + Logging.info("Read " + fullName); + detectedEvents = new DetectionEvents(); + detectedEvents.start = (long[]) dat.get(0); + detectedEvents.length = (long[]) dat.get(1); + detectedEvents.mean = (double[]) dat.get(2); + detectedEvents.stdv = (double[]) dat.get(3); + }else if (fullName.endsWith("BaseCalled_template/Events")){ + Logging.info("Read " + fullName); + bcTempEvents = new BaseCallEvents(); + bcTempEvents.mean = (double[]) dat.get(0); + bcTempEvents.start = (double[]) dat.get(1); + bcTempEvents.stdv = (double[]) dat.get(2); + bcTempEvents.length = (double[]) dat.get(3); + bcTempEvents.modelState = (String[]) dat.get(4); + bcTempEvents.move = (long[]) dat.get(5); + bcTempEvents.weight = (float[]) dat.get(6); + bcTempEvents.pModelState = (float[]) dat.get(7); + bcTempEvents.mpState = (String[]) dat.get(8); + bcTempEvents.pMpState = (float[]) dat.get(9); + + bcTempEvents.pA = (float[]) dat.get(10); + bcTempEvents.pC = (float[]) dat.get(11); + bcTempEvents.pG = (float[]) dat.get(12); + bcTempEvents.pT = (float[]) dat.get(13); + }else if (fullName.endsWith("BaseCalled_complement/Events")){ + Logging.info("Read " + fullName); + bcCompEvents = new BaseCallEvents(); + bcCompEvents.mean = (double[]) dat.get(0); + bcCompEvents.start = (double[]) 
dat.get(1); + bcCompEvents.stdv = (double[]) dat.get(2); + bcCompEvents.length = (double[]) dat.get(3); + bcCompEvents.modelState = (String[]) dat.get(4); + bcCompEvents.move = (long[]) dat.get(5); + bcCompEvents.weight = (float[]) dat.get(6); + bcCompEvents.pModelState = (float[]) dat.get(7); + bcCompEvents.mpState = (String[]) dat.get(8); + bcCompEvents.pMpState = (float[]) dat.get(9); + + bcCompEvents.pA = (float[]) dat.get(10); + bcCompEvents.pC = (float[]) dat.get(11); + bcCompEvents.pG = (float[]) dat.get(12); + bcCompEvents.pT = (float[]) dat.get(13); + }else if (fullName.endsWith("BaseCalled_complement/Model")){ + Logging.info("Read " + fullName); + bcCompModel = new BaseCallModel(); + bcCompModel.kmer = (String[]) dat.get(0); + //bcCompModel.variant = (double[]) dat.get(1); + bcCompModel.levelMean = (double[]) dat.get(2); + bcCompModel.levelStdv = (double[]) dat.get(3); + bcCompModel.sdMean = (double[]) dat.get(4); + bcCompModel.sdStdv = (double[]) dat.get(5); + //bcCompModel.weigth = (double[]) dat.get(6); + }else if (fullName.endsWith("BaseCalled_template/Model")){ + Logging.info("Read " + fullName); + bcTempModel = new BaseCallModel(); + bcTempModel.kmer = (String[]) dat.get(0); + //bcTempModel.variant = (double[]) dat.get(1); + bcTempModel.levelMean = (double[]) dat.get(2); + bcTempModel.levelStdv = (double[]) dat.get(3); + bcTempModel.sdMean = (double[]) dat.get(4); + bcTempModel.sdStdv = (double[]) dat.get(5); + //bcTempModel.weigth = (double[]) dat.get(6); + }else if (fullName.endsWith("HairpinAlign/Alignment")){ + Logging.info("Read " + fullName); + bcAlignmentHairpin = new BaseCallAlignmentHairpin(); + bcAlignmentHairpin.template = (long[]) dat.get(0); + bcAlignmentHairpin.complement = (long[]) dat.get(1); + }else + if (fullName.endsWith("BaseCalled_2D/Alignment")){ + Logging.info("Read " + fullName); + bcAlignment2D = new BaseCallAlignment2D(); + bcAlignment2D.template = (long[]) dat.get(0); + bcAlignment2D.complement = (long[]) dat.get(1); + 
bcAlignment2D.kmer = (String[]) dat.get(2); + } + /********************************************************/ + } + } + } + } + + + /** + * Get base call events for complement strand + * @return the bcCompEvents + */ + public BaseCallEvents getBcCompEvents() { + return bcCompEvents; + } + + + /** + * Get base call events for template strand + * @return the bcTempEvents + */ + public BaseCallEvents getBcTempEvents() { + return bcTempEvents; + } + + /** + * Get the events from the pore + * @return the events + */ + public DetectionEvents getEvents() { + return detectedEvents; + } + + + + /** + * Class represent RawSignal, which is an array of short + * @author minhduc + * + */ + public static class RawSignal{ + short [] signal; + + private RawSignal(short [] data){ + signal = data; + } + public short [] getSignal(){ + return signal; + } + } + + /******************************************************************************************************** +H5CompoundDS : /Analyses/EventDetection_000/Reads/Read_12/Events=class java.util.Vector + + + H5ScalarDS : /Analyses/Calibration_Strand_000/Log=class [Ljava.lang.String; + H5ScalarDS : /Analyses/EventDetection_000/Log=class [Ljava.lang.String; + H5CompoundDS : /Analyses/EventDetection_000/Reads/Read_5543/Events=class java.util.Vector + H5ScalarDS : /Raw/Reads/Read_5543/Signal=class [S + + /*******************************************************/ + BaseCallAlignment2D bcAlignment2D = null; + BaseCallAlignmentHairpin bcAlignmentHairpin = null; + BaseCallModel bcCompModel = null, bcTempModel = null; + + DetectionEvents detectedEvents; + BaseCallEvents bcCompEvents = null, bcTempEvents = null; + RawSignal rawSignal = null; + + double seqTime = 0; + + + + + /** + * Get 2D alignment + * @return the bcAlignment2D + */ + public BaseCallAlignment2D getBcAlignment2D() { + return bcAlignment2D; + } + + /** + * Get hairpin alignment + * @return the bcAlignmentHairpin + */ + public BaseCallAlignmentHairpin getBcAlignmentHairpin() { + 
return bcAlignmentHairpin; + } + + /** + * Get the model for base call of the complement + * @return the bcCompModel + */ + public BaseCallModel getBcCompModel() { + return bcCompModel; + } + + /** + * Get the model for base call of the template + * @return the bcTempModel + */ + public BaseCallModel getBcTempModel() { + return bcTempModel; + } + //String expStart = ""; + + public static class BaseCallModel{ + String [] kmer; + double[] variant; + double[] levelMean, levelStdv, sdMean, sdStdv;//, weigth; + + } + + public static class BaseCallEvents{ + int dim; + double [] mean, start, stdv, length; + float [] pA, pC, pG, pT; + long []move;//, rawIndex; + float [] pModelState; + float [] pMpState; + float [] weight; + //long [] modelLevel; + String [] modelState, mpState; + + public long [] getMove(){ + return move; + } + + public double [] length(){ + return length; + } + + public double [] mean(){ + return mean; + } + + public double [] stdv(){ + return stdv; + } + + public float [] weight(){ + return weight; + } + + public String [] modelState(){ + return modelState; + } + } + + public static class BaseCallAlignment2D{ + int dim; + long [] template, complement; + String [] kmer; + + public String[] getKmer(){ + return kmer; + } + public long[] getComplementKmer(){ + return complement; + } + public long[] getTemplateKmer(){ + return template; + } + } + + public static class BaseCallAlignmentHairpin{ + int dim; + long [] template, complement; + } + + public static class DetectionEvents{ + int dim; + double [] stdv; + double [] mean; + long [] start; + long [] length; + + public double [] getMean(){ + return mean; + } + + /** + * @return the stdv + */ + public double[] getStdv() { + return stdv; + } + + /** + * @return the start + */ + public long[] getStart() { + return start; + } + + /** + * @return the length + */ + public long[] getLength() { + return length; + } + + } + +} diff --git a/src/dev/java/japsadev/obsolete/np/Fast5NPReader.java 
b/src/dev/java/japsadev/obsolete/np/Fast5NPReader.java new file mode 100644 index 0000000..47a4a3f --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/Fast5NPReader.java @@ -0,0 +1,232 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 23/09/20116 - Minh Duc Cao: Resigned of the reader class + * + ****************************************************************************/ + +package japsadev.obsolete.np; + +import ncsa.hdf.object.FileFormat; +import ncsa.hdf.object.Group; +import ncsa.hdf.object.HObject; +import ncsa.hdf.object.h5.H5ScalarDS; +import japsa.seq.Alphabet.DNA; + +import java.util.ArrayList; + +import japsa.seq.Alphabet; +import japsa.seq.FastqSequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.JapsaException; + + +/** + * Read nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. + * + * + * @author minhduc + */ +public class Fast5NPReader{ + protected FileFormat f5File; + // FastqSequence seqTemplate = null, seqComplement = null, seq2D = null; + + ArrayList seqList = null; + + /** + * Open a fast5 file before reading anything from it. + * + * The file should be closed before gabbage collected. + * + * @param fileName + * @throws OutOfMemoryError + * @throws Exception + */ + public Fast5NPReader (String fileName) throws JapsaException, OutOfMemoryError, Exception{ + FileFormat fileFormat = FileFormat.getFileFormat(FileFormat.FILE_TYPE_HDF5); + + if (fileFormat == null){ + throw new JapsaException("Cannot read HDF5 file, possily because JHI5 is not installed or configured. 
Please refer to npReader installation guide or contact the deverlopers."); + } + + //Logging.info("Open " + fileName); + f5File = fileFormat.createInstance(fileName, FileFormat.READ); + if (f5File == null) + throw new RuntimeException("Unable to open file " + fileName); + + f5File.open(); + } + + + public void close() throws Exception{ + f5File.close(); + } + + public void readFastq() throws OutOfMemoryError, Exception{ + if (seqList !=null) return; + seqList = new ArrayList(); + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readFastq(root); + } + + public ArrayList getFastqList(){ + return seqList; + } + + public void readAllFastq(SequenceOutputStream sos) throws OutOfMemoryError, Exception{ + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readAllFastq(root, sos); + } + + + ///** + // * @return the seqTemplate + // */ + //public FastqSequence getSeqTemplate() { + // return seqTemplate; + //} + + + ///** + // * @return the seqComplement + // */ + //public FastqSequence getSeqComplement() { + // return seqComplement; + //} + + ///** + // * @return the seq2D + // */ + //public FastqSequence getSeq2D() { + // return seq2D; + //} + + private void readAllFastq(Group g, SequenceOutputStream out) throws OutOfMemoryError, Exception{ + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + if (member instanceof Group) { + readAllFastq((Group) member, out); + }else if (member instanceof H5ScalarDS){ + String fullName = member.getFullName(); + if (fullName.endsWith("Fastq")){ + Object data = ((H5ScalarDS) member).getData(); + if (data != null){ + //Logging.info(fullName); + //out.print(((String[]) data)[0]); + //out.println(); + //Logging.info("Read " + fullName); + + String [] toks = ((String[]) data)[0].split("\n",2); + if (fullName.contains("BaseCalled_2D")){ + out.print(toks[0] + "_twodimentional path=" + 
fullName); + }else if (fullName.contains("BaseCalled_complement")){ + out.print(toks[0] + "_complement path=" + fullName); + }else if (fullName.contains("BaseCalled_template")){ + out.print(toks[0] + "_template path=" + fullName); + }else + out.print(toks[0] + "_unknown path=" + fullName); + + out.print('\n'); + out.print(toks[1]); + out.print('\n'); + } + } + } + } + } + + /** + * Recursively print a group and its members. Fastq data are read.If all + * flag is turned on, this method will also reads all events and model data. + * @throws OutOfMemoryError + * + * @throws Exception + */ + private void readFastq(Group g) throws OutOfMemoryError, Exception{ + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + if (member instanceof Group) { + readFastq((Group) member); + }else if (member instanceof H5ScalarDS){ + String fullName = member.getFullName(); + if (fullName.endsWith("Fastq")){ + Object data = ((H5ScalarDS) member).getData(); + String group = fullName.split("/")[2]; + if (data != null){ + //Logging.info("Read " + fullName); + String [] toks = ((String[]) data)[0].split("\n"); + if (fullName.contains("BaseCalled_2D")){ + toks[0] = toks[0].substring(1) + "_twodimentional" + " length=" + toks[1].length() + " group=" + group; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.TWODIM)); + }else if (fullName.contains("BaseCalled_complement")){ + toks[0] = toks[0].substring(1) + "_complement" + " length=" + toks[1].length() + " group=" + group ; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.COMPLEMENT)); + }else if (fullName.contains("BaseCalled_template")){ + toks[0] = toks[0].substring(1) + "_template" + " length=" + toks[1].length() + " group=" + group; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.TEMPLATE)); + } + } + } + } + } + } + + public static class BaseCalledFastq extends FastqSequence{ + public static final int UNKNOWN = 4; + public 
static final int TWODIM = 0; + public static final int TEMPLATE = 1; + public static final int COMPLEMENT = 2; + int myType = 4; + + public BaseCalledFastq(Alphabet alphabet, String [] toks, int type) { + super(alphabet, toks); + myType = type; + } + + public int type(){ + return myType; + } + + public boolean isTwoDim(){ + return myType == TWODIM; + } + public boolean isTemplate(){ + return myType == TEMPLATE; + } + public boolean isComplement(){ + return myType == COMPLEMENT; + } + } +} diff --git a/src/dev/java/japsadev/obsolete/np/Kmer.java b/src/dev/java/japsadev/obsolete/np/Kmer.java new file mode 100644 index 0000000..2191af0 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/Kmer.java @@ -0,0 +1,82 @@ +package japsadev.obsolete.np; + +public class Kmer implements Comparable{ + boolean isMeth=false; + int coordinate=-1; + boolean strand = true; // +/- correspond to true/false + String containedSeq = ""; + String desc = "", sequence = ""; + + double mean=0.0, stdv=0.0; //might be a class + public Kmer(){ + } + public Kmer(boolean isMeth, boolean strand, String container, int coordinate){ + this.isMeth = isMeth; + this.strand = strand; + this.containedSeq = container; + this.coordinate = coordinate; + } + + public void setSignal(double mean, double stdv){ + this.mean = mean; + this.stdv = stdv; + } + public double getMeanSignal(){ + return mean; + } + public double getStdvSignal(){ + return stdv; + } + + public void setCoordinate(int coor){ + coordinate = coor; + } + public int getCoordinate(){ + return coordinate; + } + + public void setDesc(String str){ + desc = new String(str); + } + public String getDesc(){ + return desc; + } + + public void setSeq(String str){ + sequence = new String(str); + } + public String getSeq(){ + return sequence; + } + + public String toString(){ + return new String(containedSeq + ":" + coordinate +":" + (strand?'+':'-')); + } + + @Override + public boolean equals(Object obj) + { + if(this == obj) + return true; + if((obj == 
null) || (obj.getClass() != this.getClass())) + return false; + // object must be Test at this point + Kmer test = (Kmer)obj; + return strand == test.strand && coordinate == test.coordinate && + (containedSeq == test.containedSeq || (containedSeq != null && containedSeq.equals(test.containedSeq))); + } + + public int hashCode() + { + int hash = 7; + hash = 31 * hash + (strand?1:0); + hash = 31 * hash + coordinate; + hash = 31 * hash + (null == containedSeq ? 0 : containedSeq.hashCode()); + return hash; + } + @Override + public int compareTo(Kmer kmer) { + // TODO Auto-generated method stub + return coordinate - kmer.coordinate; + } +} diff --git a/src/dev/java/japsadev/obsolete/np/KmerMap.java b/src/dev/java/japsadev/obsolete/np/KmerMap.java new file mode 100644 index 0000000..d834245 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/KmerMap.java @@ -0,0 +1,426 @@ +package japsadev.obsolete.np; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.Alphabet; +import japsa.seq.FastqSequence; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.seq.nanopore.Fast5DetailReader; +import japsa.seq.nanopore.Fast5DetailReader.BaseCallAlignment2D; +import japsa.seq.nanopore.Fast5DetailReader.BaseCallEvents; +import japsa.seq.nanopore.Fast5NPReader.BaseCalledFastq; +import japsa.util.JapsaException; +import japsa.util.Logging; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import 
java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class KmerMap { + HashMap> methKmer = new HashMap>(), + unmethKmer = new HashMap>(), + finalList = new HashMap>(); + + HashMap kmerList = new HashMap(); // kmer string -> Kmer object + + HashMap alignment = new HashMap(); // read name -> SAM record + public KmerMap(String gffFile, String methID) throws IOException{ + BufferedReader gffReader = new BufferedReader(new FileReader(gffFile)); + String s; + Pattern tab = Pattern.compile("\t"), + acgt = Pattern.compile("context=([ACGT]+);"); + for (s = gffReader.readLine(); null != s; s = gffReader.readLine()) { + s = s.trim(); + if (s.length() > 0) { + if (s.charAt(0) != '#'){ + String[] line = tab.split(s); + String type=line[2].trim(); + if(type.equalsIgnoreCase(methID)){ + //String name=line[0].split("\\|")[0]; + String name=line[0]; + int basePos = Integer.parseInt(line[3])-1; + char strand = line[6].charAt(0); + Matcher des = acgt.matcher(line[8]); + String context=""; + if(des.find()) + context=des.group(1); + for(int i=16;i<21;i++){ + String kmer = context.substring(i, i+5); + + if(strand == '+'){ + Kmer aKmer = new Kmer(true, true, name, basePos+i-20); + aKmer.setSeq(kmer); + aKmer.setDesc(methID + ":" + (21-i) + "A"); + if(methKmer.containsKey(kmer)) + methKmer.get(kmer).add(aKmer); + else{ + ArrayList list = new ArrayList(); + list.add(aKmer); + methKmer.put(kmer, list); + } + kmerList.put(aKmer.toString(), aKmer); + } + else{ + Kmer aKmer = new Kmer(true, false, name, basePos+16-i); + aKmer.setSeq(kmer); + aKmer.setDesc(methID + ":" + (21-i) + "A"); + if(methKmer.containsKey(kmer)) + methKmer.get(kmer).add(aKmer); + else{ + ArrayList list = new ArrayList(); + list.add(aKmer); + methKmer.put(kmer, list); + } + kmerList.put(aKmer.toString(), aKmer); + } + } + } + } + } + } + + gffReader.close(); + + } + public void scanUnmethylate(String refFile) throws IOException{ + SequenceReader reader = SequenceReader.getReader(refFile); + Sequence seq; + 
while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + String name = seq.getName(); + for(int i=0; i< seq.length()-5; i++){ + Sequence kmer = seq.subSequence(i, i+5); + if(methKmer.containsKey(kmer.toString())){ + Kmer tmp = new Kmer(false, true, name, i); + tmp.setSeq(kmer.toString()); + if(!methKmer.get(kmer.toString()).contains(tmp)) + if(unmethKmer.containsKey(kmer.toString())) + unmethKmer.get(kmer.toString()).add(tmp); + else{ + ArrayList list = new ArrayList(); + list.add(tmp); + unmethKmer.put(kmer.toString(), list); + } + if(!kmerList.containsKey(tmp.toString())) + kmerList.put(tmp.toString(), tmp); + } + + kmer = Alphabet.DNA.complement(kmer); + if(methKmer.containsKey(kmer.toString())){ + Kmer tmp = new Kmer(false, false, name, i); + tmp.setSeq(kmer.toString()); + if(!methKmer.get(kmer.toString()).contains(tmp)) + if(unmethKmer.containsKey(kmer.toString())) + unmethKmer.get(kmer.toString()).add(tmp); + else{ + ArrayList list = new ArrayList(); + list.add(tmp); + unmethKmer.put(kmer.toString(), list); + } + if(!kmerList.containsKey(tmp.toString())) + kmerList.put(tmp.toString(), tmp); + } + } + } + reader.close(); + } + public void scanAlignment(String samFile) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader; + if ("-".equals(samFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(samFile)); + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + while (iter.hasNext()) { + SAMRecord record = iter.next(); + if (record.getReadUnmappedFlag()) + continue; + if (record.getMappingQuality() < 1) // or else? 
+ continue; + readID = record.getReadName(); + if(!readID.contains("twodimentional")) + continue; + + alignment.put(readID.substring(readID.indexOf("_")+1), record); + + }// while + iter.close(); + reader.close(); + } + + public static int move(String[] kmer, int begin, int step){ + int index = begin; + while(step-- > 0){ + if(index >= kmer.length-1) + return kmer.length-1; + while(kmer[index].equals(kmer[++index])); + } + return index; + } + public void scanHDF5(String folderName) throws OutOfMemoryError{ + File mainFolder = new File(folderName); + File [] fileList = mainFolder.listFiles(); + Logging.info("Reading in folder " + mainFolder.getAbsolutePath()); + if (fileList!=null){ + for (File f:fileList){ + //directory + if (!f.isFile()) + continue;//for + + if (!f.getName().endsWith("fast5")) + continue;//for + String sPath = f.getAbsolutePath(); + + try{ + Fast5DetailReader npReader = new Fast5DetailReader(sPath); + npReader.readData(); + npReader.readFastq(); + ArrayList seqList = npReader.getFastqList(); + FastqSequence fastq = null; + for (BaseCalledFastq bfcq:seqList){ + if (bfcq.isTwoDim()){ + fastq = bfcq; + break; + } + } + if(fastq==null){ + npReader.close(); + continue; + }else{ + String readName = fastq.getName().split(" ")[0]; + SAMRecord record; + if(alignment.containsKey(readName)) + record = alignment.get(readName); + else{ + npReader.close(); + continue; + } + System.out.print("Reading "+ readName); + // now read the whole HDF5 file + npReader.readData(); + + BaseCallAlignment2D align2d = npReader.getBcAlignment2D(); + + BaseCallEvents temp = npReader.getBcTempEvents(); + if(align2d == null || temp == null){ + System.out.println("...ignored!"); + npReader.close(); + continue; + } + System.out.println(); + npReader.close(); + + String [] kmer2d = align2d.getKmer(); + int curPos = 0; + + char strand = record.getReadNegativeStrandFlag()?'-':'+'; + String refID = record.getReferenceName(); + Cigar cigar = record.getCigar(); + + int posOnRef = 
record.getAlignmentStart(); + for (final CigarElement e : cigar.getCigarElements()) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : + case I : + curPos = move(kmer2d, curPos, length); + break; + case M ://match or mismatch + case EQ://match + case X ://mismatch + if(length < 5){ + posOnRef += length; + curPos = move(kmer2d, curPos, length); + break; + }else{ + for(int i = 0; i < length-4; i++){ + posOnRef++; curPos = move(kmer2d, curPos, 1); + String hash = new String(refID+":"+posOnRef+":"+strand); + if(kmerList.containsKey(hash)){ + Kmer sigKmer = kmerList.get(hash); + String seq = sigKmer.getSeq(); + if(kmer2d[curPos].equals(seq)){ + System.out.println("...hit " + hash + "\t" + kmer2d[curPos] + " vs. " + seq + " " + sigKmer.getDesc()); + sigKmer.setSignal(temp.mean()[curPos], temp.stdv()[curPos]); + + if(finalList.containsKey(seq)) + finalList.get(seq).add(sigKmer); + else{ + ArrayList list = new ArrayList(); + list.add(sigKmer); + finalList.put(seq, list); + } + }else{ + System.out.println("...mismatch => ignored: " + hash); + } + }else{ + System.out.println("...no hit " + hash); + } + } + break; + } + case D : + case N : + posOnRef += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//casse + }//for + + } + }catch (JapsaException e){ + e.printStackTrace(); + continue; + }catch (Exception e){ + Logging.error("Problem with reading " + sPath + ":" + e.getMessage()); + e.printStackTrace(); + continue; + } + + + }//for + }//if + else{ + Logging.info("Folder " + mainFolder.getAbsolutePath() + " does not exist, are you sure this is the right folder?"); + } + + } + + public static void statsHDF5(String folderName) throws OutOfMemoryError, IOException{ + BufferedWriter statFile = new BufferedWriter(new PrintWriter("kmerStats.out")); + File mainFolder = new File(folderName); + File [] fileList = mainFolder.listFiles(); + 
Logging.info("Reading in folder " + mainFolder.getAbsolutePath()); + if (fileList!=null){ + for (File f:fileList){ + //directory + if (!f.isFile()) + continue;//for + + if (!f.getName().endsWith("fast5")) + continue;//for + String sPath = f.getAbsolutePath(); + + try{ + Fast5DetailReader npReader = new Fast5DetailReader(sPath); + npReader.readData(); + + ArrayList seqList = npReader.getFastqList(); + FastqSequence fastq = null; + for (BaseCalledFastq bfcq:seqList){ + if (bfcq.isTwoDim()){ + fastq = bfcq; + break; + } + } + if(fastq==null){ + npReader.close(); + continue; + }else{ + String readName = fastq.getName().split(" ")[0]; + System.out.print("Reading "+ readName); + // now read the whole HDF5 file + npReader.readData(); + + BaseCallEvents temp = npReader.getBcTempEvents(), + comp = npReader.getBcCompEvents(); + if(comp == null || temp == null){ + System.out.println("...ignored!"); + npReader.close(); + continue; + } + System.out.println(); + npReader.close(); + long [] move = temp.getMove(); + double[] mean = temp.mean(), + stdv = temp.stdv(); + //long[] modelLv = temp.modelLv(); + String[] modelState = temp.modelState(); + for(int i = 0; i < move.length; i++){ + if(move[i] == 1) + statFile.write(modelState[i] + " " + "temp" + " " + mean[i] + " " + stdv[i] + "\n"); + } + + move = comp.getMove(); + mean = comp.mean(); + stdv = comp.stdv(); + //modelLv = comp.modelLv(); + modelState = comp.modelState(); + for(int i = 0; i < move.length; i++){ + if(move[i] == 1) + statFile.write(modelState[i] + " " + "comp" + " " + mean[i] + " " + stdv[i] + "\n"); + } + } + }catch (JapsaException e){ + e.printStackTrace(); + continue; + }catch (Exception e){ + Logging.error("Problem with reading " + sPath + ":" + e.getMessage()); + e.printStackTrace(); + continue; + } + + + }//for + }//if + else{ + Logging.info("Folder " + mainFolder.getAbsolutePath() + " does not exist, are you sure this is the right folder?"); + } + statFile.close(); + + } + + public void print() throws 
IOException{ + BufferedWriter methFile = new BufferedWriter(new PrintWriter("methylated.out")), + unmethFile = new BufferedWriter(new PrintWriter("unmethylated.out")); + // Get a set of the entries + Set>> set = finalList.entrySet(); + // Get an iterator + Iterator>> ite = set.iterator(); + // Display elements + while(ite.hasNext()) { + Map.Entry> me = (Map.Entry>)ite.next(); + String kmer = me.getKey(), + desc = ""; + ArrayList list = me.getValue(); + + int index=0; + while(index < list.size() && !list.get(index).isMeth) index++; + desc = (index < list.size()) ? list.get(index).getDesc():""; + methFile.write(">"+ kmer + "\t" + desc + "\n"); + unmethFile.write(">"+ kmer + "\n"); + for(Kmer km:list) + if(km.isMeth) + methFile.write(km + "\t" + km.getMeanSignal() + "\t" + km.getStdvSignal() + "\n"); + else + unmethFile.write(km + "\t" + km.getMeanSignal() + "\t" + km.getStdvSignal() + "\n"); + } + methFile.close(); + unmethFile.close(); + + } +} diff --git a/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd.java new file mode 100644 index 0000000..5976cc8 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd.java @@ -0,0 +1,293 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 7 Aug 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.obsolete.np; + +import org.jfree.data.time.TimeTableXYDataset; +import japsa.util.CommandLine; +import japsa.util.JapsaException; +import japsa.util.Logging; +import japsa.util.deploy.Deployable; + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.np.npreader", + scriptDesc = "Extract and stream Oxford Nanopore sequencing data in real-time", + seeAlso = "jsa.np.filter, jsa.util.streamServer, jsa.util.streamClient,jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.np.rtResistGenes" + ) +public class NanoporeReaderCmd extends CommandLine{ + public NanoporeReaderCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addBoolean("GUI", false,"Run with a Graphical User Interface"); + addBoolean("realtime", false,"Run the program in real-time mode, i.e., keep waiting for new data from Metrichor agent"); + addString("folder", null,"The folder containing base-called reads"); + addBoolean("fail", false,"Get sequence reads from fail folder"); + addString("output", "-","Name of the output file, - for stdout"); + addString("streams", null,"Stream output to some servers, format \"IP:port,IP:port\" (no spaces)"); + addString("format", "fastq","Format of sequence reads (fastq or fasta)"); + //addString("group", "","Group of base-called to be extracted ()"); + addInt("minLength", 1,"Minimum read length"); + addBoolean("number", false,"Add a unique number to read name"); + addBoolean("stats", false,"Generate a report of read statistics"); + //addBoolean("time", false,"Extract the sequencing time of each read -- only work 
with Metrichor > 1.12"); + + + addStdHelp(); + } + + public static void main(String[] args) throws OutOfMemoryError, Exception { + CommandLine cmdLine = new NanoporeReaderCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String output = cmdLine.getStringVal("output"); + String folder = cmdLine.getStringVal("folder"); + int minLength = cmdLine.getIntVal("minLength"); + boolean stats = cmdLine.getBooleanVal("stats"); + boolean number = cmdLine.getBooleanVal("number"); + //boolean time = cmdLine.getBooleanVal("time"); + boolean GUI = cmdLine.getBooleanVal("GUI"); + boolean realtime = cmdLine.getBooleanVal("realtime"); + boolean fail = cmdLine.getBooleanVal("fail"); + String format = cmdLine.getStringVal("format"); + String streamServers = cmdLine.getStringVal("streams"); + + //String pFolderName = cmdLine.getStringVal("pFolderName"); + //String f5list = cmdLine.getStringVal("f5list"); + //int interval = cmdLine.getIntVal("interval");//in second + //int age = cmdLine.getIntVal("age") * 1000;//in second + int age = 20 * 1000;//cmdLine.getIntVal("age") * 1000;//in second + int interval = 30; + String pFolderName = null; + + if (!GUI && folder == null){// && f5list == null){ + Logging.exit("Download folder need to be specified", 1); + } + + NanoporeReaderStream reader = new NanoporeReaderStream(); + + //reader.getTime = time; + reader.stats = stats; + reader.number = number; + reader.minLength = minLength; + + reader.interval = interval; + reader.age = age; + + //reader.f5List = f5list; + reader.folder = folder; + reader.doFail = fail; + reader.output = output; + reader.format = format.toLowerCase(); + reader.realtime = realtime; + reader.streamServers = streamServers; + NanoporeReaderWindow mGUI = null; + + if (GUI){ + reader.realtime = true; + System.setProperty("java.awt.headless", "false"); + reader.stats = true;//GUI implies stats + reader.ready = false;//wait for the command from GUI + + 
TimeTableXYDataset dataset = new TimeTableXYDataset(); + mGUI = new NanoporeReaderWindow(reader,dataset); + + while (!reader.ready){ + Logging.info("NOT READY"); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + Logging.info("GO"); + + new Thread(mGUI).start(); + }else{ + String msg = reader.prepareIO(); + if (msg != null){ + Logging.exit(msg, 1); + } + } + //reader need to wait until ready to go + + //reader.sos = SequenceOutputStream.makeOutputStream(reader.output); + try{ + reader.readFastq(pFolderName); + }catch (JapsaException e){ + System.err.println(e.getMessage()); + e.getStackTrace(); + if (mGUI != null) + mGUI.interupt(e); + }catch (Exception e){ + throw e; + }finally{ + reader.close(); + } + + }//main +} + + +/*RST* +------------------------------------------------------------------------- +*npReader*: real-time conversion and analysis of Nanopore sequencing data +------------------------------------------------------------------------- + +*npReader* (jsa.np.npreader) is a program that extracts Oxford Nanopore +sequencing data from FAST5 files, performs an initial analysis of the date and +streams them to real-time analysis pipelines. These pipelines can run on the +same computer or on computing clouds/high performance clusters. + +npReader is included in the `Japsa package `_. +It requires +`JAVA HDF5 INTERFACE (JHI5) library `_ +to be installed prior to setting up Japsa. Details of installation as follows: + +**On Windows/Mac** + +1. Download and install HDF-View from +https://www.hdfgroup.org/products/java/release/download.html. +Note the folder that the JHI library is installed, e.g., +*C:\\Program Files\\HDF_Group\\HDFView\\2.11.0\\lib* + +2. Follow the instructions to install Japsa on +http://japsa.readthedocs.org/en/latest/install.html. +Upon prompting for "Path to HDF library", enter the above path. 
+ +**On Linux** + +You can either install the JHI5 library by downloading the software from +*https://www.hdfgroup.org/products/java/JNI/jhi5/index.html* or from your +Linux distribution software repository, such as:: + + sudo apt-get install libjhdf5-jni + +The library is typically installed to */usr/lib/jni*. Enter this path when +prompted for "Path to HDF library" during installation of Japsa. + +HDF-View (https://www.hdfgroup.org/products/java/release/download.html) also +contains the necessary library. Please install HDF-2.10.1 instead of the +latest version. + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of npReader usage can be obtained by invoking the --help option:: + + jsa.np.npreader --help + +The simplest way to run *npReader* in GUI mode is by typing:: + + jsa.np.npreader -GUI -realtime + +and specify various options in the GUI. All of these options can be specified +from the command line:: + + jsa.np.npreader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats + +npReader can run natively on a Windows laptop that runs the Metrichor agent. It +can stream sequence data to multiple analysis pipelines on the same computer +and/or on high performance clusters and computing clouds. + +Start several analysis pipelines on some remote machines.
Such a pipeline can, +for example, count how many reads align to chromosomes A and B:: + + jsa.util.streamServer --port 3456 | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + awk -F "\t" 'BEGIN{A=0;B=0;N++} NF>4 \ + {if ($3=="chrA") A++; if ($3=="chrB") B++; \ + if (NR %100==0) \ + {print "At " NR " reads, " A " aligned to chr A; " B " aligned to chr B"} \ + }' + +In this pipeline, the *jsa.util.streamServer* program receives stream data +from *npReader* and forwards it to *bwa*, which aligns the data to a reference +and in turn streams the alignment in sam format to the awk program to perform +a simple analysis of counting reads aligned to chrA and chrB. + +The Japsa package contains several real-time analysis tools (jsa.np.speciesTyping, +jsa.np.geneStrainTyping, jsa.np.resistGenes). They can be used to set up +analysis pipelines, such as:: + + jsa.util.streamServer --port 3457 | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat + +Once these pipelines are ready, npReader can start streaming data off the +MinION and the Metrichor agent to these pipelines:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ + --minLength 200 --streams server1IP:3456,server2IP:3457 + +One can run *npReader* on a computing cloud if the download folder (containing +base-called data) can be mounted to the cloud. In such a case, npReader can +stream data directly to the pipelines without the need for +*jsa.util.streamServer*:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat + +Japsa also provides *jsa.np.filter*, a tool to bin sequence data in groups of +the user's liking. 
Like any other streamline tools, jsa.np.filter can run +behind *jsa.util.streamServer* on a remote machine, or can get data directly +from npReader via pipe:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.filter -input - -lenMin 2000 --qualMin 10 -output goodreads.fq + +One can also use *tee* to group data into different bins *in real-time* with +*jsa.np.filter*:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + tee >(jsa.np.filter -input - -lenMax 2000 -output 0k2k.fq) \ + >(jsa.np.filter -lenMin 2000 -lenMax 4000 -input - -output 2k4k.fq) \ + >(jsa.np.filter -lenMin 4000 -lenMax 6000 -input - -output 4k6k.fq) \ + >(jsa.np.filter -lenMin 6000 -input - -output 6k.fq) \ + > all.fq + +These bins can also be piped/streamed to different analysis pipelines as above. + +*RST*/ diff --git a/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd2.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd2.java new file mode 100644 index 0000000..14a32fe --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderCmd2.java @@ -0,0 +1,293 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 7 Aug 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.obsolete.np; + +import org.jfree.data.time.TimeTableXYDataset; +import japsa.util.CommandLine; +import japsa.util.JapsaException; +import japsa.util.Logging; +import japsa.util.deploy.Deployable; + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.np.npreader2", + scriptDesc = "Extract and stream Oxford Nanopore sequencing data in real-time", + seeAlso = "jsa.np.filter, jsa.util.streamServer, jsa.util.streamClient,jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.np.rtResistGenes" + ) +public class NanoporeReaderCmd2 extends CommandLine{ + public NanoporeReaderCmd2(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addBoolean("GUI", false,"Run with a Graphical User Interface"); + addBoolean("realtime", false,"Run the program in real-time mode, i.e., keep waiting for new data from Metrichor agent"); + addString("folder", null,"The folder containing base-called reads"); + addBoolean("fail", false,"Get sequence reads from fail folder"); + addString("output", "-","Name of the output file, - for stdout"); + addString("streams", null,"Stream output to some servers, format \"IP:port,IP:port\" (no spaces)"); + addString("format", "fastq","Format of sequence reads (fastq or fasta)"); + //addString("group", "","Group of base-called to be extracted ()"); + addInt("minLength", 1,"Minimum read length"); + addBoolean("number", false,"Add a unique number to read name"); + addBoolean("stats", false,"Generate a report of read statistics"); + //addBoolean("time", false,"Extract the sequencing time of each read -- only 
work with Metrichor > 1.12"); + + + addStdHelp(); + } + + public static void main(String[] args) throws OutOfMemoryError, Exception { + CommandLine cmdLine = new NanoporeReaderCmd2(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String output = cmdLine.getStringVal("output"); + String folder = cmdLine.getStringVal("folder"); + int minLength = cmdLine.getIntVal("minLength"); + boolean stats = cmdLine.getBooleanVal("stats"); + boolean number = cmdLine.getBooleanVal("number"); + //boolean time = cmdLine.getBooleanVal("time"); + boolean GUI = cmdLine.getBooleanVal("GUI"); + boolean realtime = cmdLine.getBooleanVal("realtime"); + boolean fail = cmdLine.getBooleanVal("fail"); + String format = cmdLine.getStringVal("format"); + String streamServers = cmdLine.getStringVal("streams"); + + //String pFolderName = cmdLine.getStringVal("pFolderName"); + //String f5list = cmdLine.getStringVal("f5list"); + //int interval = cmdLine.getIntVal("interval");//in second + //int age = cmdLine.getIntVal("age") * 1000;//in second + int age = 20 * 1000;//cmdLine.getIntVal("age") * 1000;//in second + int interval = 30; + + if (!GUI && folder == null){// && f5list == null){ + Logging.exit("Download folder need to be specified", 1); + } + + NanoporeReaderStream2 reader = new NanoporeReaderStream2(); + + //reader.getTime = time; + reader.stats = stats; + reader.number = number; + reader.minLength = minLength; + + reader.interval = interval; + reader.age = age; + + //reader.f5List = f5list; + reader.folder = folder; + reader.doFail = fail; + reader.output = output; + reader.format = format.toLowerCase(); + reader.realtime = realtime; + reader.streamServers = streamServers; + NanoporeReaderWindow2 mGUI = null; + + if (GUI){ + reader.realtime = true; + System.setProperty("java.awt.headless", "false"); + reader.stats = true;//GUI implies stats + reader.ready = false;//wait for the command from GUI + + TimeTableXYDataset dataset 
= new TimeTableXYDataset(); + mGUI = new NanoporeReaderWindow2(reader,dataset); + + while (!reader.ready){ + Logging.info("NOT READY"); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + Logging.info("GO"); + + new Thread(mGUI).start(); + }else{ + String msg = reader.prepareIO(); + if (msg != null){ + Logging.exit(msg, 1); + } + } + //reader need to wait until ready to go + + //reader.sos = SequenceOutputStream.makeOutputStream(reader.output); + try{ + Logging.info("Start reading" ); + reader.readFast5(); + }catch (JapsaException e){ + System.err.println(e.getMessage()); + e.getStackTrace(); + if (mGUI != null) + mGUI.interupt(e); + }catch (Exception e){ + throw e; + }finally{ + reader.close(); + } + + }//main +} + + +/*RST* +------------------------------------------------------------------------- +*npReader*: real-time conversion and analysis of Nanopore sequencing data +------------------------------------------------------------------------- + +*npReader* (jsa.np.npreader) is a program that extracts Oxford Nanopore +sequencing data from FAST5 files, performs an initial analysis of the date and +streams them to real-time analysis pipelines. These pipelines can run on the +same computer or on computing clouds/high performance clusters. + +npReader is included in the `Japsa package `_. +It requires +`JAVA HDF5 INTERFACE (JHI5) library `_ +to be installed prior to setting up Japsa. Details of installation as follows: + +**On Windows/Mac** + +1. Download and install HDF-View from +https://www.hdfgroup.org/products/java/release/download.html. +Note the folder that the JHI library is installed, e.g., +*C:\\Program Files\\HDF_Group\\HDFView\\2.11.0\\lib* + +2. Follow the instructions to install Japsa on +http://japsa.readthedocs.org/en/latest/install.html. +Upon prompting for "Path to HDF library", enter the above path. 
+ +**On Linux** + +You can install the JHI5 library either by downloading the software from +*https://www.hdfgroup.org/products/java/JNI/jhi5/index.html* or from your +Linux distribution's software repository, for example:: + + sudo apt-get install libjhdf5-jni + +The library is typically installed to */usr/lib/jni*. Enter this path when +prompted for "Path to HDF library" during installation of Japsa. + +HDF-View (https://www.hdfgroup.org/products/java/release/download.html) also +contains the necessary library. Please install HDF-2.10.1 instead of the +latest version. + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of npReader usage can be obtained by invoking the --help option:: + + jsa.np.npreader --help + +The simplest way to run *npReader* in GUI mode is by typing:: + + jsa.np.npreader -GUI -realtime + +and specifying the various options in the GUI. All of these options can also be specified +from the command line:: + + jsa.np.npreader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats + +npReader can run natively on a Windows laptop that runs the Metrichor agent. It +can stream sequence data to multiple analysis pipelines on the same computer +and/or on high performance clusters and computing clouds. + +Start several analysis pipelines on some remote machines. 
Such a pipeline can, +for example, count how many reads align to chromosomes A and B:: + + jsa.util.streamServer --port 3456 | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + awk -F "\t" 'BEGIN{A=0;B=0;N++} NF>4 \ + {if ($3=="chrA") A++; if ($3=="chrB") B++; \ + if (NR %100==0) \ + {print "At " NR " reads, " A " aligned to chr A; " B " aligned to chr B"} \ + }' + +In this pipeline, the *jsa.util.streamServer* program receives stream data +from *npReader* and forwards it to *bwa*, which aligns the data to a reference +and in turn streams the alignment in sam format to the awk program to perform +a simple analysis of counting reads aligned to chrA and chrB. + +The Japsa package contains several real-time analysis tools (jsa.np.speciesTyping, +jsa.np.geneStrainTyping, jsa.np.resistGenes). They can be used to set up +analysis pipelines, such as:: + + jsa.util.streamServer --port 3457 | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat + +Once these pipelines are ready, npReader can start streaming data off the +MinION and the Metrichor agent to these pipelines:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ + --minLength 200 --streams server1IP:3456,server2IP:3457 + +One can run *npReader* on a computing cloud if the download folder (containing +base-called data) can be mounted to the cloud. In such a case, npReader can +stream data directly to the pipelines without the need for +*jsa.util.streamServer*:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ + jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat + +Japsa also provides *jsa.np.filter*, a tool to bin sequence data in groups of +the user's liking. 
Like any other streamline tools, jsa.np.filter can run +behind *jsa.util.streamServer* on a remote machine, or can get data directly +from npReader via pipe:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.filter -input - -lenMin 2000 --qualMin 10 -output goodreads.fq + +One can also use *tee* to group data into different bins *in real-time* with +*jsa.np.filter*:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + tee >(jsa.np.filter -input - -lenMax 2000 -output 0k2k.fq) \ + >(jsa.np.filter -lenMin 2000 -lenMax 4000 -input - -output 2k4k.fq) \ + >(jsa.np.filter -lenMin 4000 -lenMax 6000 -input - -output 4k6k.fq) \ + >(jsa.np.filter -lenMin 6000 -input - -output 6k.fq) \ + > all.fq + +These bins can also be piped/streamed to different analysis pipelines as above. + +*RST*/ diff --git a/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream.java new file mode 100644 index 0000000..2c86c31 --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream.java @@ -0,0 +1,584 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 21/07/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.obsolete.np; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.net.Socket; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; + +import japsa.seq.FastqSequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.DoubleArray; +import japsa.util.IntArray; +import japsa.util.JapsaException; +import japsa.util.Logging; +import japsa.util.net.StreamClient; +import japsadev.obsolete.np.Fast5NPReader.BaseCalledFastq; + +/** + * Read nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. 
+ * @author minhduc + * + */ +public class NanoporeReaderStream{ + public String prepareIO(){ + String msg = null; + try{ + sos = SequenceOutputStream.makeOutputStream(output); + if (streamServers != null && streamServers.trim().length() > 0){ + @SuppressWarnings("resource") + StreamClient streamClient = new StreamClient(streamServers); + ArrayList sockets = streamClient.getSockets(); + networkOS = new ArrayList(sockets.size()); + for (Socket socket:sockets) + networkOS.add(new SequenceOutputStream(socket.getOutputStream())); + } + }catch (Exception e){ + msg = e.getMessage(); + }finally{ + if (msg != null){ + sos = null; + return msg; + } + } + return msg; + } + + + public void close() throws IOException{ + Logging.info("npReader closing"); + sos.close(); + if (networkOS != null){ + for (SequenceOutputStream out:networkOS) + out.close(); + } + Logging.info("npReader closed"); + done = true; + } + + double tempLength = 0, compLength = 0, twoDLength = 0; + int tempCount = 0, compCount = 0, twoDCount = 0; + IntArray lengths = new IntArray(); + DoubleArray qual2D = new DoubleArray(), qualComp = new DoubleArray(), qualTemp = new DoubleArray(); + IntArray lengths2D = new IntArray(), lengthsComp = new IntArray(), lengthsTemp = new IntArray(); + + int fileNumber = 0; + int passNumber = 0, failNumber = 0; + SequenceOutputStream sos; + ArrayList networkOS = null; + public boolean stats, number; + public String folder = null; + public int minLength = 1; + //public String group = ""; + public boolean wait = true; + public boolean realtime = true; + public int interval = 1, age = 30000; + public boolean doFail = false; + public String output = ""; + public String streamServers = null; + boolean doLow = true; + //public boolean getTime = false; + boolean done = false; + + public String format = "fastq"; + public boolean ready = true; + private static final byte MIN_QUAL = '!';//The minimum quality + + /** + * Compute average quality of a read + * @param fq + * @return + */ + 
public static double averageQuality(FastqSequence fq){ + if (fq.length() > 0){ + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += (fq.getQualByte(p) - MIN_QUAL); + + } + return (sumQual/fq.length()); + } + else return 0; + } + + public void print(FastqSequence fq) throws IOException{ + if (format.equals("fasta")) + fq.writeFasta(sos); + else + fq.print(sos); + + if (networkOS != null){ + for (SequenceOutputStream out:networkOS) + if (format.equals("fasta")) + fq.writeFasta(out); + else + fq.print(out); //fq.print(out); + } + } + + @SuppressWarnings("unused") + private void flush() throws IOException{ + sos.flush(); + if (networkOS != null){ + for (SequenceOutputStream out:networkOS) + out.flush(); + } + } + + /************************************************************************************** + public boolean readFastq3_XXX(String fileName) throws JapsaException, IOException{ + //Logging.info("Open " + fileName); + try{ + Fast5NPReader f5Reader = new Fast5NPReader(fileName); + String log = ""; + //if (getTime){ + // log = "ExpStart=" + npReader.expStart + " timestamp=" + npReader.seqTime + " " + log; + //} + + FastqSequence fq; + + fq = f5Reader.readTwoDim(); + if (fq != null && fq.length() >= minLength){ + fq.setName((number?(fileNumber *3) + "_":"") + fq.getName() + " " + log); + print(fq); + if (stats){ + lengths.add(fq.length()); + lengths2D.add(fq.length()); + twoDCount ++; + if (fq.length() > 0){ + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += (fq.getQualByte(p) - MIN_QUAL); + + } + qual2D.add(sumQual/fq.length()); + } + } + } + + fq = f5Reader.readTemplate(); + if (fq != null && fq.length() >= minLength && this.doLow){ + fq.setName((number?(fileNumber *3 + 1) + "_":"") + fq.getName() + " " + log); + print(fq); + if (stats){ + lengths.add(fq.length()); + lengthsTemp.add(fq.length()); + tempCount ++; + + if (fq.length() > 0){ + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += 
(fq.getQualByte(p) - MIN_QUAL); + + } + qualTemp.add(sumQual/fq.length()); + } + } + } + + fq = f5Reader.readComplement(); + if (fq != null && fq.length() >= minLength && this.doLow){ + fq.setName((number?(fileNumber *3 + 2) + "_":"") + fq.getName() + " " + log); + print(fq); + if (stats){ + lengths.add(fq.length()); + lengthsComp.add(fq.length()); + compCount ++; + + if (fq.length() > 0){ + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += (fq.getQualByte(p) - MIN_QUAL); + + } + qualComp.add(sumQual/fq.length()); + } + + } + } + f5Reader.close(); + fileNumber ++; + + }catch (JapsaException e){ + throw e; + }catch (Exception e){ + Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); + e.printStackTrace(); + return false; + } + return true; + } +/*****************************************************************************/ + public boolean readFastq2(String fileName) throws JapsaException, IOException{ + //Logging.info("Open " + fileName); + try{ + Fast5NPReader npReader = new Fast5NPReader(fileName); + npReader.readFastq(); + npReader.close(); + + ArrayList seqList = npReader.getFastqList(); + if (seqList!= null){ + for (BaseCalledFastq fq:seqList){ + if (fq.length() >= minLength){ + fq.setName((number?(fileNumber *3 + fq.type()) + "_":"") + fq.getName()); + print(fq); + if (stats){ + lengths.add(fq.length()); + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += (fq.getQualByte(p) - MIN_QUAL); + } + if (fq.isTwoDim()){ + lengths2D.add(fq.length()); + twoDCount ++; + qual2D.add(sumQual/fq.length()); + }else if (fq.isComplement()){ + lengthsComp.add(fq.length()); + compCount ++; + qualComp.add(sumQual/fq.length()); + }else if (fq.isTemplate()){ + lengthsTemp.add(fq.length()); + tempCount ++; + qualTemp.add(sumQual/fq.length()); + } + } + } + } + } + + fileNumber ++; + }catch (JapsaException e){ + throw e; + }catch (Exception e){ + Logging.error("Problem with reading " + fileName + ":" + 
e.getMessage()); + e.printStackTrace(); + return false; + } + return true; + } + /*****************************************************************************/ + + public boolean moveFile(File f, String pFolder){ + String fName = f.getName(); + if (f.renameTo(new File(pFolder + fName))){ + Logging.info("Move " + fName + " to " + pFolder); + return true; + } + else + return false; + } + + + /** + * Read read sequence from a list of fast5 files. + * @param fileList + * @param gSos : output stream + * @param stats: print out statistics + * @throws IOException + */ + public void readFastq(String pFolder) throws JapsaException, IOException{ + if (minLength < 1) + minLength = 1; + + if (pFolder != null ){ + pFolder = pFolder + File.separatorChar; + Logging.info("Copy to " + pFolder); + } + /*********************************************** + if (f5List != null){ + Logging.info("Reading in file " + f5List); + BufferedReader bf = SequenceReader.openFile(f5List); + String fileName; + while ((fileName = bf.readLine())!=null){ + readFastq2(fileName); + + //Move to done folder + if (pFolder != null){ + moveFile(new File(fileName), pFolder); + } + }//while + bf.close(); + }else + /***********************************************/ + {//folder + HashSet filesDone = new HashSet(); + + File mainFolder = new File(folder); + File passFolder = new File(folder + File.separatorChar + "pass"); + File failFolder = new File(folder + File.separatorChar + "fail"); + + while (wait){ + //Do main + long now = System.currentTimeMillis(); + File [] fileList = mainFolder.listFiles(); + Logging.info("Reading in folder " + mainFolder.getAbsolutePath()); + if (fileList!=null){ + for (File f:fileList){ + if (!wait) + break; + + //directory + if (!f.isFile()) + continue;//for + + if (!f.getName().endsWith("fast5")) + continue;//for + + //File too new + if (now - f.lastModified() < age) + continue;//for + + //if processed already + String sPath = f.getAbsolutePath(); + if (filesDone.contains(sPath)) + 
continue;//for + + if (readFastq2(sPath)){ + filesDone.add(sPath); + if (pFolder != null){ + moveFile(f, pFolder); + }//if + }//if + }//for + }//if + else{ + Logging.info("Folder " + mainFolder.getAbsolutePath() + " does not exist, are you sure this is the right folder?"); + } + + //Pass folder + now = System.currentTimeMillis(); + Logging.info("Reading in folder " + passFolder.getAbsolutePath()); + fileList = passFolder.listFiles(); + if (fileList!=null){ + for (File f:fileList){ + if (!wait) + break; + + //directory + if (!f.isFile()) + continue;//for + + if (!f.getName().endsWith("fast5")) + continue;//for + + //File too new + if (now - f.lastModified() < age) + continue;//for + + //if processed already + String sPath = f.getAbsolutePath(); + if (filesDone.contains(sPath)) + continue;//for + + if (readFastq2(sPath)){ + passNumber ++; + filesDone.add(sPath); + if (pFolder != null){ + moveFile(f, pFolder); + }//if + }//if + }//for + }//if + + //Fail folder + if (doFail){ + now = System.currentTimeMillis(); + Logging.info("Reading in folder " + failFolder.getAbsolutePath()); + fileList = failFolder.listFiles(); + if (fileList!=null){ + for (File f:fileList){ + if (!wait) + break; + + //directory + if (!f.isFile()) + continue; + + if (!f.getName().endsWith("fast5")) + continue; + + //File too new + if (now - f.lastModified() < age) + continue; + + //if processed already + String sPath = f.getAbsolutePath(); + if (filesDone.contains(sPath)) + continue; + + if (readFastq2(sPath)){ + failNumber ++; + filesDone.add(sPath); + if (pFolder != null){ + moveFile(f, pFolder); + }//if + }//if + }//for + }//if + } + if (!realtime) + break; + + for (int x = 0; x < interval && wait; x++){ + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + }//while + Logging.info("EXISTING"); + } + + if (stats){ + Logging.info("Getting stats ... 
"); + int [] ls = lengths.toArray(); + if (ls.length ==0){ + Logging.info("Open " + fileNumber + " files"); + Logging.info("Fould 0 reads"); + }else{ + Arrays.sort(ls); + + long baseCount = 0; + for (int i = 0; i < ls.length; i++) + baseCount += ls[i]; + + double mean = baseCount / ls.length; + double median = ls[ls.length/2]; + long sum = 0; + int quantile1st = 0, quantile2nd = 0, quantile3rd = 0; + for (int i = 0; i < ls.length; i++){ + sum += ls[i]; + if (quantile1st == 0 && sum >= baseCount / 4) + quantile1st = i; + + if (quantile2nd == 0 && sum >= baseCount / 2) + quantile2nd = i; + + if (quantile3rd == 0 && sum >= baseCount * 3/ 4) + quantile3rd = i; + } + + Logging.info("Open " + fileNumber + " files"); + Logging.info("Read count = " + ls.length + "(" + tempCount + " templates, " + compCount + " complements and " + twoDCount +" 2D)"); + Logging.info("Base count = " + baseCount); + Logging.info("Longest read = " + ls[ls.length - 1] + ", shortest read = " + ls[0]); + Logging.info("Average read length = " + mean); + Logging.info("Median read length = " + median); + Logging.info("Quantile first = " + ls[quantile1st] + " second = " + ls[quantile2nd] + " third = " + ls[quantile3rd]); + + if (qual2D.size() > 0){ + double sumQual = 0; + double sumQualSq = 0; + + for (int i = 0; i < qual2D.size();i++){ + sumQual += qual2D.get(i); + sumQualSq += qual2D.get(i) * qual2D.get(i); + } + + double meanQual = sumQual / qual2D.size(); + double stdQual = Math.sqrt(sumQualSq / qual2D.size() - meanQual * meanQual); + + Logging.info("Ave 2D qual " +meanQual + " " + qual2D.size() + " std = " + stdQual); + } + + if (qualTemp.size() > 0){ + double sumQual = 0; + double sumQualSq = 0; + + for (int i = 0; i < qualTemp.size();i++){ + sumQual += qualTemp.get(i); + sumQualSq += qualTemp.get(i) * qualTemp.get(i); + } + + double meanQual = sumQual / qualTemp.size(); + double stdQual = Math.sqrt(sumQualSq / qualTemp.size() - meanQual * meanQual); + + Logging.info("Ave Temp qual " +meanQual + 
" " + qualTemp.size() + " std = " + stdQual); + } + + if (qualComp.size() > 0){ + double sumQual = 0; + double sumQualSq = 0; + + + for (int i = 0; i < qualComp.size();i++){ + sumQual += qualComp.get(i); + sumQualSq += qualComp.get(i) * qualComp.get(i); + } + + double meanQual = sumQual / qualComp.size(); + double stdQual = Math.sqrt(sumQualSq / qualComp.size() - meanQual * meanQual); + + Logging.info("Ave Comp qual " + meanQual + " " + qualComp.size() + " std = " + stdQual); + } + } + printToFile("stats"); + } + } + + public void printToFile(String prefix) throws IOException{ + if(prefix.length() < 1) + prefix = "out"; + BufferedWriter lenFile = new BufferedWriter(new PrintWriter(prefix + ".len")), + qualTempFile = new BufferedWriter(new PrintWriter(prefix + ".temp.qual")), + qualCompFile = new BufferedWriter(new PrintWriter(prefix + ".comp.qual")), + qual2DFile = new BufferedWriter(new PrintWriter(prefix + ".2d.qual")); + for(int i=0; i < lengths.size(); i++){ + lenFile.write(lengths.get(i) + "\n"); + } + for(int i=0; i < qualTemp.size(); i++){ + qualTempFile.write(new DecimalFormat("#0.000").format(qualTemp.get(i)) + "\n"); + } + for(int i=0; i < qualComp.size(); i++){ + qualCompFile.write(new DecimalFormat("#0.000").format(qualComp.get(i)) + "\n"); + } + for(int i=0; i < qual2D.size(); i++){ + qual2DFile.write(new DecimalFormat("#0.000").format(qual2D.get(i)) + "\n"); + } + lenFile.close(); + qualTempFile.close(); + qualCompFile.close(); + qual2DFile.close(); + } +} diff --git a/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream2.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream2.java new file mode 100644 index 0000000..f28675e --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderStream2.java @@ -0,0 +1,400 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 21/07/2014 - Minh Duc Cao: Created + * 14/03/2017 -- Minh Duc Cao modified + ****************************************************************************/ + +package japsadev.obsolete.np; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.net.Socket; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; + +import japsa.seq.FastqSequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.DoubleArray; +import japsa.util.IntArray; +import japsa.util.JapsaException; +import japsa.util.Logging; +import japsa.util.net.StreamClient; +import japsadev.obsolete.np.Fast5NPReader.BaseCalledFastq; + +/** + * Read nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. 
+ * @author minhduc
+ *
+ */
+public class NanoporeReaderStream2{
+	/**
+	 * Open the primary output stream and, when {@link #streamServers} is not
+	 * blank, one network output stream per socket returned by the stream
+	 * client.
+	 *
+	 * @return null on success; otherwise a non-null description of the
+	 *         failure, in which case {@link #sos} has been reset to null
+	 */
+	public String prepareIO(){
+		try{
+			sos = SequenceOutputStream.makeOutputStream(output);
+			if (streamServers != null && streamServers.trim().length() > 0){
+				@SuppressWarnings("resource")
+				StreamClient streamClient = new StreamClient(streamServers);
+				ArrayList<Socket> sockets = streamClient.getSockets();
+				networkOS = new ArrayList<SequenceOutputStream>(sockets.size());
+				for (Socket socket:sockets)
+					networkOS.add(new SequenceOutputStream(socket.getOutputStream()));
+			}
+		}catch (Exception e){
+			sos = null;
+			// getMessage() may be null (e.g. for a NullPointerException); returning
+			// null here would be read by the caller as success
+			String msg = e.getMessage();
+			return (msg != null) ? msg : e.toString();
+		}
+		return null;
+	}
+
+
+	/**
+	 * Close the primary output stream and every network output stream, then
+	 * mark the reader as done. All streams are attempted even if one of them
+	 * fails to close.
+	 *
+	 * @throws IOException the first close failure; later ones are attached
+	 *         to it as suppressed exceptions
+	 */
+	public void close() throws IOException{
+		Logging.info("npReader closing");
+		IOException failure = null;
+		try{
+			// sos is null when prepareIO() failed or was never called
+			if (sos != null){
+				try{
+					sos.close();
+				}catch (IOException e){
+					failure = e;
+				}
+			}
+			if (networkOS != null){
+				for (SequenceOutputStream out:networkOS){
+					try{
+						out.close();
+					}catch (IOException e){
+						if (failure == null)
+							failure = e;
+						else
+							failure.addSuppressed(e);
+					}
+				}
+			}
+		}finally{
+			// callers poll this flag to learn that shutdown has finished, so it
+			// must be set even when a stream refuses to close
+			done = true;
+		}
+		if (failure != null)
+			throw failure;
+		Logging.info("npReader closed");
+	}
+
+	double tempLength = 0, compLength = 0, twoDLength = 0;
+	// number of template / complement / 2D reads recorded for the statistics
+	int tempCount = 0, compCount = 0, twoDCount = 0;
+	// lengths of all recorded reads, and per-type mean qualities and lengths
+	IntArray lengths = new IntArray();
+	DoubleArray qual2D = new DoubleArray(), qualComp = new DoubleArray(), qualTemp = new DoubleArray();
+	IntArray lengths2D = new IntArray(), lengthsComp = new IntArray(), lengthsTemp = new IntArray();
+
+	// number of fast5 files read successfully so far
+	int fileNumber = 0;
+	int passNumber = 0, failNumber = 0;
+	// primary output stream, opened by prepareIO()
+	SequenceOutputStream sos;
+	// one stream per stream server; null when no server is configured
+	ArrayList<SequenceOutputStream> networkOS = null;
+	public boolean stats, number;
+	public String folder = null;
+	public int minLength = 1;
+	//public String group = "";
+	// cleared from another thread to stop the scanning loop, hence volatile
+	public volatile boolean wait = true;
+	public boolean realtime = true;
+	// interval: seconds between scans; age: minimum file age in milliseconds
+	public int interval = 1, age = 30000;
+	public boolean doFail = false;
+	public String output = "";
+	public String streamServers = null;
+	boolean doLow = true;
+	//public boolean getTime = false;
+	// set by close() and polled from another thread, hence volatile
+	volatile boolean done = false;
+
+	public String format = "fastq";
+	public boolean ready = true;
+	private static final byte MIN_QUAL = '!';//The minimum quality
+
+	/**
+	 * Compute the average quality of a read: the mean of its per-base quality
+	 * bytes after subtracting {@link #MIN_QUAL}.
+	 *
+	 * @param fq the read
+	 * @return the mean quality, or 0 for an empty read
+	 */
+	public static double averageQuality(FastqSequence fq){
+		final int len = fq.length();
+		// an empty read has nothing to average; report 0 rather than NaN
+		if (len <= 0)
+			return 0;
+
+		double sumQual = 0;
+		for (int p = 0; p < len; p++)
+			sumQual += (fq.getQualByte(p) - MIN_QUAL);
+		return sumQual / len;
+	}
+
+	/**
+	 * Write a read to the primary output stream and to every network stream:
+	 * with writeFasta when {@link #format} is "fasta", with the read's own
+	 * print method otherwise.
+	 *
+	 * @param fq the read to write
+	 * @throws IOException if writing to one of the streams fails
+	 */
+	public void print(FastqSequence fq) throws IOException{
+		// constant-first comparison: a null format falls through to print()
+		final boolean asFasta = "fasta".equals(format);
+
+		if (asFasta)
+			fq.writeFasta(sos);
+		else
+			fq.print(sos);
+
+		if (networkOS != null){
+			for (SequenceOutputStream out:networkOS){
+				if (asFasta)
+					fq.writeFasta(out);
+				else
+					fq.print(out);
+			}
+		}
+	}
+
+	/**
+	 * Flush the primary output stream and every network stream.
+	 *
+	 * @throws IOException if flushing one of the streams fails
+	 */
+	@SuppressWarnings("unused")
+	private void flush() throws IOException{
+		sos.flush();
+		if (networkOS != null){
+			for (SequenceOutputStream out:networkOS)
+				out.flush();
+		}
+	}
+
+
+	/**
+	 * Read the base-called sequences of one fast5 file, print every read of at
+	 * least {@link #minLength} bases and, when {@link #stats} is set, record
+	 * its length and mean quality under its read type.
+	 *
+	 * @param fileName path of the fast5 file
+	 * @return true if the file was processed; false if an error other than a
+	 *         JapsaException was logged while reading or printing it
+	 * @throws JapsaException rethrown unchanged
+	 * @throws IOException kept in the signature for callers; I/O failures
+	 *         raised inside are logged and reported as a false return
+	 */
+	public boolean readFastq2(String fileName) throws JapsaException, IOException{
+		//Logging.info("Open " + fileName);
+		try{
+			Fast5NPReader npReader = new Fast5NPReader(fileName);
+			try{
+				npReader.readFastq();
+			}finally{
+				// close the reader even when readFastq() fails
+				npReader.close();
+			}
+
+			ArrayList<BaseCalledFastq> seqList = npReader.getFastqList();
+			if (seqList != null){
+				for (BaseCalledFastq fq:seqList){
+					if (fq.length() < minLength)
+						continue;
+
+					// optional unique prefix derived from the file counter and read type
+					fq.setName((number?(fileNumber *3 + fq.type()) + "_":"") + fq.getName());
+					print(fq);
+					if (stats){
+						lengths.add(fq.length());
+						final double meanQual = averageQuality(fq);
+						if (fq.isTwoDim()){
+							lengths2D.add(fq.length());
+							twoDCount ++;
+							qual2D.add(meanQual);
+						}else if (fq.isComplement()){
+							lengthsComp.add(fq.length());
+							compCount ++;
+							qualComp.add(meanQual);
+						}else if (fq.isTemplate()){
+							lengthsTemp.add(fq.length());
+							tempCount ++;
+							qualTemp.add(meanQual);
+						}
+					}
+				}
+			}
+
+			// counted only when the whole file went through without error
+			fileNumber ++;
+		}catch (JapsaException e){
+			throw e;
+		}catch (Exception e){
+			Logging.error("Problem with reading " + fileName + ":" + e.getMessage());
+			e.printStackTrace();
+			return false;
+		}
+		return true;
+	}
+
+	/*****************************************************************************/
+
+	/**
+	 * Repeatedly walk {@link #folder} for regular files whose name ends with
+	 * "fast5", whose last modification is more than {@link #age} milliseconds
+	 * old and which have not been read successfully before, and pass each one
+	 * to {@link #readFastq2(String)}. The walk is repeated every
+	 * {@link #interval} seconds for as long as {@link #wait} is set; when
+	 * {@link #realtime} is false only a single pass is made. If
+	 * {@link #stats} is set, summary statistics are then logged and written
+	 * with {@link #printToFile(String)}.
+	 *
+	 * @throws JapsaException kept in the signature for callers; failures of
+	 *         individual files are caught and printed
+	 * @throws IOException if the folder cannot be walked or the statistics
+	 *         files cannot be written
+	 */
+	public void readFast5() throws JapsaException, IOException{
+		if (minLength < 1)
+			minLength = 1;
+
+		Logging.info("Start reading " + folder);
+
+		// paths of the files that were read successfully in an earlier pass
+		HashSet<String> filesDone = new HashSet<String>();
+
+		while (wait){
+			//Do main
+			final long now = System.currentTimeMillis();
+
+			Logging.info("Start reading " + now );
+
+			// Files.walk holds open directories until the stream is closed, and
+			// this loop runs once per interval, so close it after every pass
+			try (java.util.stream.Stream<java.nio.file.Path> paths = Files.walk(Paths.get(folder))){
+				paths
+				//is a file
+				.filter(Files::isRegularFile)
+				//fast5 file
+				.filter(p -> p.toString().endsWith("fast5"))
+				//age is old enough
+				.filter(p -> {
+					try{
+						return now - Files.getLastModifiedTime(p).toMillis() > age;
+					}catch (IOException e1) {
+						e1.printStackTrace();
+						return false;
+					}
+				})
+				//if (!doFail){
+				//	stream = stream.filter(p->!p.toString().contains("fail"));
+				//}
+				//not read before
+				.filter(p -> !filesDone.contains(p.toString()))
+				//read
+				.forEach(p -> {
+					try {
+						if (readFastq2(p.toString())){
+							filesDone.add(p.toString());
+						}
+					} catch (JapsaException | IOException e) {
+						e.printStackTrace();
+					}
+				});
+			}
+
+			/*******************************************************/
+			if (!realtime)
+				break;
+
+			// sleep in one-second steps so that clearing wait is noticed promptly
+			for (int x = 0; x < interval && wait; x++){
+				try {
+					Thread.sleep(1000);
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+			}
+		}//while
+		Logging.info("EXISTING");
+
+
+		if (stats){
+			Logging.info("Getting stats ... ");
+			int [] ls = lengths.toArray();
+			if (ls.length ==0){
+				Logging.info("Open " + fileNumber + " files");
+				Logging.info("Fould 0 reads");
+			}else{
+				Arrays.sort(ls);
+
+				long baseCount = 0;
+				for (int i = 0; i < ls.length; i++)
+					baseCount += ls[i];
+
+				// floating-point division: long / int would truncate the mean
+				double mean = (double) baseCount / ls.length;
+				double median = ls[ls.length/2];
+				long sum = 0;
+				// -1 marks "not found yet"; 0 is a legitimate index
+				int quantile1st = -1, quantile2nd = -1, quantile3rd = -1;
+				for (int i = 0; i < ls.length; i++){
+					sum += ls[i];
+					if (quantile1st < 0 && sum >= baseCount / 4)
+						quantile1st = i;
+
+					if (quantile2nd < 0 && sum >= baseCount / 2)
+						quantile2nd = i;
+
+					if (quantile3rd < 0 && sum >= baseCount * 3/ 4)
+						quantile3rd = i;
+				}
+
+				Logging.info("Open " + fileNumber + " files");
+				Logging.info("Read count = " + ls.length + "(" + tempCount + " templates, " + compCount + " complements and " + twoDCount +" 2D)");
+				Logging.info("Base count = " + baseCount);
+				Logging.info("Longest read = " + ls[ls.length - 1] + ", shortest read = " + ls[0]);
+				Logging.info("Average read length = " + mean);
+				Logging.info("Median read length = " + median);
+				Logging.info("Quantile first = " + ls[quantile1st] + " second = " + ls[quantile2nd] + " third = " + ls[quantile3rd]);
+
+				logQualityStats("Ave 2D qual ", qual2D);
+				logQualityStats("Ave Temp qual ", qualTemp);
+				logQualityStats("Ave Comp qual ", qualComp);
+			}
+			printToFile("stats");
+		}
+	}
+
+	/**
+	 * Log the mean, count and standard deviation of a list of qualities;
+	 * nothing is logged for an empty list.
+	 */
+	private static void logQualityStats(String label, DoubleArray quals){
+		final int n = quals.size();
+		if (n <= 0)
+			return;
+
+		double sumQual = 0;
+		double sumQualSq = 0;
+		for (int i = 0; i < n; i++){
+			final double q = quals.get(i);
+			sumQual += q;
+			sumQualSq += q * q;
+		}
+
+		final double meanQual = sumQual / n;
+		// rounding can leave the variance marginally negative, and sqrt of a
+		// negative number is NaN
+		final double stdQual = Math.sqrt(Math.max(0, sumQualSq / n - meanQual * meanQual));
+
+		Logging.info(label + meanQual + " " + n + " std = " + stdQual);
+	}
+
+	/**
+	 * Write the recorded read lengths and the per-type mean qualities, one
+	 * value per line, to prefix.len, prefix.temp.qual, prefix.comp.qual and
+	 * prefix.2d.qual.
+	 *
+	 * @param prefix file name prefix; "out" is used when null or empty
+	 * @throws IOException if a file cannot be created or written
+	 */
+	public void printToFile(String prefix) throws IOException{
+		if (prefix == null || prefix.length() < 1)
+			prefix = "out";
+
+		final DecimalFormat qualFormat = new DecimalFormat("#0.000");
+		// try-with-resources closes every writer even if a write fails
+		try (BufferedWriter lenFile = Files.newBufferedWriter(Paths.get(prefix + ".len"));
+				BufferedWriter qualTempFile = Files.newBufferedWriter(Paths.get(prefix + ".temp.qual"));
+				BufferedWriter qualCompFile = Files.newBufferedWriter(Paths.get(prefix + ".comp.qual"));
+				BufferedWriter qual2DFile = Files.newBufferedWriter(Paths.get(prefix + ".2d.qual"))){
+			for (int i = 0; i < lengths.size(); i++){
+				lenFile.write(lengths.get(i) + "\n");
+			}
+			writeQualities(qualTempFile, qualTemp, qualFormat);
+			writeQualities(qualCompFile, qualComp, qualFormat);
+			writeQualities(qual2DFile, qual2D, qualFormat);
+		}
+	}
+
+	/** Write each quality on its own line using the given number format. */
+	private static void writeQualities(BufferedWriter out, DoubleArray quals, DecimalFormat fmt) throws IOException{
+		for (int i = 0; i < quals.size(); i++){
+			out.write(fmt.format(quals.get(i)) + "\n");
+		}
+	}
+}
diff --git a/src/main/java/japsa/seq/nanopore/NanoporeReaderWindow.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow.java
similarity index 97%
rename from src/main/java/japsa/seq/nanopore/NanoporeReaderWindow.java
rename to src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow.java
index 02e75ab..2d0b142 100644
--- a/src/main/java/japsa/seq/nanopore/NanoporeReaderWindow.java
+++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow.java
@@ -31,7 +31,7 @@
  * 17 Apr 2015 - Minh Duc Cao: 
Created * ****************************************************************************/ -package japsa.seq.nanopore; +package japsadev.obsolete.np; import japsa.util.DynamicHistogram; import japsa.util.JapsaException; @@ -289,10 +289,10 @@ public void actionPerformed(ActionEvent e) { controlPanel.add(formatPanel); final JRadioButton fqRadioButton = new JRadioButton("fastq"); - fqRadioButton.setBounds(46, 22, 62, 23); + fqRadioButton.setBounds(46, 22, 72, 23); final JRadioButton faRadioButton = new JRadioButton("fasta"); - faRadioButton.setBounds(186, 22, 62, 23); + faRadioButton.setBounds(186, 22, 72, 23); formatPanel.setLayout(null); final ButtonGroup formatBtGroup = new ButtonGroup(); @@ -361,16 +361,28 @@ public void itemStateChanged(ItemEvent e){ final JLabel lblMinReadLength = new JLabel("Min read length"); - lblMinReadLength.setBounds(8, 83, 154, 15); + lblMinReadLength.setBounds(7, 83, 154, 15); optionPanel.add(lblMinReadLength); final JTextField txtMinLenth = new JTextField(); txtMinLenth.setText(reader.minLength+""); - txtMinLenth.setBounds(137, 77, 71, 21); + txtMinLenth.setBounds(137, 77, 80, 21); optionPanel.add(txtMinLenth); + + + //final JLabel lblGroup = new JLabel("Group"); + //lblGroup.setBounds(7, 113, 154, 15); + //optionPanel.add(lblGroup); + + //final JTextField txtGroup = new JTextField(); + //txtGroup.setText(reader.group); + //txtGroup.setBounds(137, 107, 150, 21); + //optionPanel.add(txtGroup); + + final JPanel lPanel = new JPanel(); - lPanel.setBounds(0, 521, 320, 55); + lPanel.setBounds(0, 541, 320, 55); controlPanel.add(lPanel); lPanel.setLayout(null); @@ -526,7 +538,7 @@ public void actionPerformed(ActionEvent e) { ///////////////////////////////////////////////////////////// //Histogram histoLengthDataSet=new DynamicHistogram(); - histoLengthDataSet.prepareSeries("Read Length", 200, 0, 50000); + histoLengthDataSet.prepareSeries("Read Length", 500, 0, 100000); //histoDataset.prepareSeries("2D", 50, 0, 50000); 
//histoDataset.prepareSeries("template", 50, 0, 50000); //histoDataset.prepareSeries("complement", 50, 0, 50000); @@ -656,6 +668,7 @@ public void actionPerformed(ActionEvent e) { chckReads.setEnabled(false); chckbxAddAUnicqu.setEnabled(false); txtMinLenth.setEnabled(false); +// txtGroup.setEnabled(false); btnStart.setEnabled(false); btnStop.setEnabled(true); diff --git a/src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow2.java b/src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow2.java new file mode 100644 index 0000000..3ffb19b --- /dev/null +++ b/src/dev/java/japsadev/obsolete/np/NanoporeReaderWindow2.java @@ -0,0 +1,760 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 17 Apr 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.obsolete.np; + +import japsa.util.DynamicHistogram; +import japsa.util.JapsaException; +import japsa.util.Logging; + +import java.awt.EventQueue; + +import javax.swing.BorderFactory; +import javax.swing.ButtonGroup; +import javax.swing.JFileChooser; +import javax.swing.JFrame; +import javax.swing.JOptionPane; +import javax.swing.JPanel; +import javax.swing.JTextField; + +import java.awt.BorderLayout; +import java.awt.Dimension; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.ItemEvent; +import java.awt.event.ItemListener; +import java.io.File; +import java.io.IOException; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; + +import javax.swing.JRadioButton; +import javax.swing.JLabel; +import javax.swing.JButton; +import javax.swing.JCheckBox; + +import org.jfree.chart.ChartFactory; +import org.jfree.chart.ChartPanel; +import org.jfree.chart.JFreeChart; +import org.jfree.chart.axis.DateAxis; +import org.jfree.chart.axis.NumberAxis; +import org.jfree.chart.plot.PlotOrientation; +import org.jfree.chart.plot.SeriesRenderingOrder; +import 
org.jfree.chart.plot.XYPlot; +import org.jfree.chart.renderer.xy.StackedXYAreaRenderer; +import org.jfree.data.statistics.HistogramType; +import org.jfree.data.time.Second; +import org.jfree.data.time.TimeTableXYDataset; + +/** + * @author minhduc + * + */ +public class NanoporeReaderWindow2 implements Runnable{ + + private JFrame frmNanoporeReader; + private int height = 50; + private int topR = 100, topC = 100; + //String downloadFolder; + + TimeTableXYDataset dataSet; + NanoporeReaderStream2 reader; + + /** + * Launch the application. + */ + public static void main(String[] args) { + EventQueue.invokeLater(new Runnable() { + public void run() { + try { + NanoporeReaderWindow2 window = new NanoporeReaderWindow2(new NanoporeReaderStream2(),null); + window.frmNanoporeReader.setVisible(true); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + } + + /** + * Create the application. + * @throws IOException + */ + public NanoporeReaderWindow2(NanoporeReaderStream2 r, TimeTableXYDataset dataset) throws IOException { + reader = r; + this.dataSet = dataset; + + initialize(); + //frmNanoporeReader.pack(); + frmNanoporeReader.setVisible(true); + } + + /** + * Initialize the contents of the frame. 
+ * @throws IOException + */ + private void initialize() throws IOException { + frmNanoporeReader = new JFrame(); + frmNanoporeReader.setTitle("Nanopore Reader"); + frmNanoporeReader.setBounds(topC, topR, 1238, 714); + frmNanoporeReader.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + frmNanoporeReader.getContentPane().setLayout(new BorderLayout(0, 0)); + + final JPanel controlPanel = new JPanel(); + frmNanoporeReader.getContentPane().add(controlPanel, BorderLayout.WEST); + controlPanel.setPreferredSize(new Dimension(330, height)); + controlPanel.setLayout(null); + + final JPanel inputPanel = new JPanel(); + inputPanel.setBounds(0, 8, 320, 120); + inputPanel.setBorder(BorderFactory.createTitledBorder("Input")); + controlPanel.add(inputPanel); + inputPanel.setLayout(null); + + final JTextField txtDir = new JTextField(reader.folder == null?"":reader.folder); + txtDir.setBounds(10, 51, 300, 20); + inputPanel.add(txtDir); + + //final ButtonGroup group = new ButtonGroup(); + + final JButton btnChange = new JButton("Change"); + btnChange.setBounds(28, 83, 117, 25); + inputPanel.add(btnChange); + + final JCheckBox chckbxInc = new JCheckBox("Include fail folder",reader.doFail); + chckbxInc.setBounds(153, 84, 159, 23); + inputPanel.add(chckbxInc); + + JLabel lblNewLabel = new JLabel("Folder containing base-called reads"); + lblNewLabel.setBounds(10, 24, 300, 15); + inputPanel.add(lblNewLabel); + + chckbxInc.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e){ + reader.doFail = (e.getStateChange() == ItemEvent.SELECTED); + } + }); + + btnChange.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent ae) { + JFileChooser fileChooser = new JFileChooser(); + fileChooser.setDialogTitle("Select download directory"); + fileChooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); + fileChooser.setCurrentDirectory(new File(txtDir.getText())); + + int returnValue = fileChooser.showOpenDialog(null); + if (returnValue == 
JFileChooser.APPROVE_OPTION) { + reader.folder = fileChooser.getSelectedFile().getPath(); + txtDir.setText(reader.folder); + } + } + }); + + + final JPanel outputPanel = new JPanel(); + outputPanel.setBounds(0, 140, 320, 188); + outputPanel.setBorder(BorderFactory.createTitledBorder("Output")); + controlPanel.add(outputPanel); + outputPanel.setLayout(null); + + final JRadioButton rdbtnOut2Str = new JRadioButton("Output to output stream"); + rdbtnOut2Str.setBounds(10, 22, 302, 23); + outputPanel.add(rdbtnOut2Str); + + final JRadioButton rdbtnOut2File = new JRadioButton("Output to file"); + rdbtnOut2File.setBounds(10, 48, 302, 23); + outputPanel.add(rdbtnOut2File); + + final ButtonGroup group2 = new ButtonGroup(); + group2.add(rdbtnOut2Str); + group2.add(rdbtnOut2File); + + final JTextField txtOFile = new JTextField(reader.output); + txtOFile.setBounds(10, 79, 300, 20); + outputPanel.add(txtOFile); + + + final JButton btnFileChange = new JButton("Change"); + btnFileChange.setBounds(26, 105, 117, 25); + outputPanel.add(btnFileChange); + + + final JCheckBox chckbxStreamServer = new JCheckBox("Stream output to some servers"); + final JTextField txtStreamServers = new JTextField(); + + chckbxStreamServer.setSelected(reader.streamServers!=null); + chckbxStreamServer.setBounds(10, 132, 283, 23); + outputPanel.add(chckbxStreamServer); + + + txtStreamServers.setText(reader.streamServers != null?reader.streamServers:""); + txtStreamServers.setEnabled(chckbxStreamServer.isSelected()); + txtStreamServers.setBounds(10, 163, 300, 19); + outputPanel.add(txtStreamServers); + txtStreamServers.setColumns(10); + + chckbxStreamServer.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + txtStreamServers.setEnabled(chckbxStreamServer.isSelected()); + } + }); + + + btnFileChange.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent ae) { + JFileChooser fileChooser = new JFileChooser(); + fileChooser.setDialogTitle("Select 
output file"); + //fileChooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); + fileChooser.setCurrentDirectory(new File(txtOFile.getText())); + int returnValue = fileChooser.showOpenDialog(null); + if (returnValue == JFileChooser.APPROVE_OPTION) { + reader.output = fileChooser.getSelectedFile().getPath(); + txtOFile.setText(reader.output); + } + } + }); + + + if ("-".equals(reader.output)){ + rdbtnOut2Str.setSelected(true); + rdbtnOut2File.setSelected(false); + btnFileChange.setEnabled(false); + txtOFile.setEnabled(false); + txtOFile.setText(""); + }else{ + rdbtnOut2Str.setSelected(false); + rdbtnOut2File.setSelected(true); + btnFileChange.setEnabled(true); + txtOFile.setEnabled(true); + } + + rdbtnOut2Str.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (rdbtnOut2Str.isSelected()){ + btnFileChange.setEnabled(false); + txtOFile.setEnabled(false); + }else{ + btnFileChange.setEnabled(true); + txtOFile.setEnabled(true); + } + } + }); + + + rdbtnOut2File.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (rdbtnOut2Str.isSelected()){ + btnFileChange.setEnabled(false); + txtOFile.setEnabled(false); + }else{ + btnFileChange.setEnabled(true); + txtOFile.setEnabled(true); + } + } + }); + + JPanel formatPanel = new JPanel(); + formatPanel.setBorder(BorderFactory.createTitledBorder("Output format")); + formatPanel.setBounds(0, 338, 320, 55); + controlPanel.add(formatPanel); + + final JRadioButton fqRadioButton = new JRadioButton("fastq"); + fqRadioButton.setBounds(46, 22, 72, 23); + + final JRadioButton faRadioButton = new JRadioButton("fasta"); + faRadioButton.setBounds(186, 22, 72, 23); + formatPanel.setLayout(null); + + final ButtonGroup formatBtGroup = new ButtonGroup(); + + formatBtGroup.add(fqRadioButton); + formatBtGroup.add(faRadioButton); + + formatPanel.add(fqRadioButton); + formatPanel.add(faRadioButton); + + if ("fasta".equals(reader.format)){ + 
fqRadioButton.setSelected(false); + faRadioButton.setSelected(true); + }else{ + faRadioButton.setSelected(false); + fqRadioButton.setSelected(true); + } + + fqRadioButton.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (fqRadioButton.isSelected()) + reader.format = "fastq"; + else + reader.format = "fasta"; + } + }); + + faRadioButton.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (fqRadioButton.isSelected()) + reader.format = "fastq"; + else + reader.format = "fasta"; + } + }); + + + + final JPanel optionPanel = new JPanel(); + optionPanel.setBounds(0, 405, 320, 115); + optionPanel.setBorder(BorderFactory.createTitledBorder("Options")); + controlPanel.add(optionPanel); + optionPanel.setLayout(null); + + final JCheckBox chckReads = new JCheckBox("Include template and complement reads",true); + chckReads.setBounds(7, 23, 310, 23); + optionPanel.add(chckReads); + + chckReads.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e){ + reader.doLow = (e.getStateChange() == ItemEvent.SELECTED); + } + }); + + + final JCheckBox chckbxAddAUnicqu = new JCheckBox("Add a unique number to read name",reader.number); + chckbxAddAUnicqu.setBounds(8, 52, 304, 23); + optionPanel.add(chckbxAddAUnicqu); + + chckbxAddAUnicqu.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e){ + reader.number = (e.getStateChange() == ItemEvent.SELECTED); + } + }); + + + + final JLabel lblMinReadLength = new JLabel("Min read length"); + lblMinReadLength.setBounds(7, 83, 154, 15); + optionPanel.add(lblMinReadLength); + + final JTextField txtMinLenth = new JTextField(); + txtMinLenth.setText(reader.minLength+""); + txtMinLenth.setBounds(137, 77, 80, 21); + optionPanel.add(txtMinLenth); + + + + //final JLabel lblGroup = new JLabel("Group"); + //lblGroup.setBounds(7, 113, 154, 15); + //optionPanel.add(lblGroup); + + //final JTextField txtGroup = new JTextField(); + 
//txtGroup.setText(reader.group); + //txtGroup.setBounds(137, 107, 150, 21); + //optionPanel.add(txtGroup); + + + final JPanel lPanel = new JPanel(); + lPanel.setBounds(0, 541, 320, 55); + controlPanel.add(lPanel); + lPanel.setLayout(null); + + final JButton btnStart = new JButton("Start"); + btnStart.setBounds(28, 18, 117, 25); + btnStart.setEnabled(true); + lPanel.add(btnStart); + + + final JButton btnStop = new JButton("Stop"); + btnStop.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + reader.wait = false; + + while (!reader.done){ + try { + Thread.sleep(100); + } catch (InterruptedException ee) { + ee.printStackTrace(); + } + } + + stillRun = false; + JOptionPane.showMessageDialog(null, "Done", "Information", JOptionPane.PLAIN_MESSAGE); + + } + }); + btnStop.setBounds(191, 18, 117, 25); + btnStop.setEnabled(false); + lPanel.add(btnStop); + + + + final JPanel mainPanel = new JPanel(); + frmNanoporeReader.getContentPane().add(mainPanel, BorderLayout.CENTER); + //mainPanel.setBorder(BorderFactory.createTitledBorder("Statistics")); + mainPanel.setLayout(null); + + final JPanel panelCounts = new JPanel(); + panelCounts.setBounds(12, 304, 428, 280); + mainPanel.add(panelCounts); + panelCounts.setLayout(null); + + + ////////////////////////////////////////////////// + final JLabel lblFiles = new JLabel("Total fast5 files"); + lblFiles.setBounds(10, 10, 140, 20); + panelCounts.add(lblFiles); + + txtTFiles = new JTextField("0"); + txtTFiles.setEditable(false); + txtTFiles.setBounds(150, 10, 110, 20); + panelCounts.add(txtTFiles); + txtTFiles.setColumns(10); + + //////////////////////////////////////////////// + final JLabel lblpFiles = new JLabel("Pass files"); + lblpFiles.setBounds(10, 35, 140, 20); + //lblpFiles.setBounds(63, 61, 68, 15); + panelCounts.add(lblpFiles); + + txtPFiles = new JTextField("0"); + txtPFiles.setEditable(false); + txtPFiles.setBounds(150, 35, 110, 20); + panelCounts.add(txtPFiles); + 
txtPFiles.setColumns(10); + + final JLabel lblFFiles = new JLabel("Fail files"); + lblFFiles.setBounds(10, 60, 140, 20); + panelCounts.add(lblFFiles); + + txtFFiles = new JTextField("0"); + txtFFiles.setEditable(false); + txtFFiles.setBounds(150, 60, 110, 20); + panelCounts.add(txtFFiles); + txtFFiles.setColumns(10); + + + final JLabel lbl2DReads = new JLabel("2D reads"); + lbl2DReads.setBounds(10, 90, 110, 20); + panelCounts.add(lbl2DReads); + + final JLabel lblTempReads = new JLabel("Template reads"); + lblTempReads.setBounds(10, 115, 140, 20); + panelCounts.add(lblTempReads); + + + final JLabel lblCompReads = new JLabel("Complement reads"); + lblCompReads.setBounds(10, 140, 140, 20); + panelCounts.add(lblCompReads); + + txtTempReads= new JTextField("0"); + txtTempReads.setEditable(false); + txtTempReads.setBounds(150, 115, 110, 20); + panelCounts.add(txtTempReads); + txtTempReads.setColumns(10); + + txt2DReads= new JTextField("0"); + txt2DReads.setEditable(false); + txt2DReads.setBounds(150, 90, 110, 20); + panelCounts.add(txt2DReads); + txt2DReads.setColumns(10); + + txtCompReads= new JTextField("0"); + txtCompReads.setEditable(false); + txtCompReads.setBounds(150, 140, 110, 20); + panelCounts.add(txtCompReads); + txtCompReads.setColumns(10); + + + final JFreeChart chart = ChartFactory.createStackedXYAreaChart( + "Read count", // chart title + "Time", // domain axis label + "Read number", // range axis label + this.dataSet + ); + + final StackedXYAreaRenderer render = new StackedXYAreaRenderer(); + + DateAxis domainAxis = new DateAxis(); + domainAxis.setAutoRange(true); + domainAxis.setDateFormatOverride(new SimpleDateFormat("HH:mm:ss")); + + XYPlot plot = (XYPlot) chart.getPlot(); + plot.setRenderer(render); + plot.setDomainAxis(domainAxis); + plot.setSeriesRenderingOrder(SeriesRenderingOrder.FORWARD); + plot.setForegroundAlpha(0.5f); + + NumberAxis rangeAxis = (NumberAxis) plot.getRangeAxis(); + rangeAxis.setNumberFormatOverride(new DecimalFormat("#,###.#")); 
+ rangeAxis.setAutoRange(true); + + ChartPanel chartPanel = new ChartPanel(chart, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + + chartPanel.setBounds(0, 12, 450, 280); + mainPanel.add(chartPanel); + + ///////////////////////////////////////////////////////////// + //Histogram + histoLengthDataSet=new DynamicHistogram(); + histoLengthDataSet.prepareSeries("Read Length", 500, 0, 100000); + //histoDataset.prepareSeries("2D", 50, 0, 50000); + //histoDataset.prepareSeries("template", 50, 0, 50000); + //histoDataset.prepareSeries("complement", 50, 0, 50000); + + JFreeChart hisLengths=ChartFactory.createHistogram("Read length histogram","length","count",histoLengthDataSet,PlotOrientation.VERTICAL,true,true,false); + ChartPanel hisPanel = new ChartPanel(hisLengths, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + + XYPlot hisPlot = (XYPlot) hisLengths.getPlot(); + hisPlot.getDomainAxis().setAutoRange(true); + hisPlot.getRangeAxis().setAutoRange(true); + + + hisPanel.setBounds(452, 12, 450, 280); + mainPanel.add(hisPanel); + + + histoQualDataSet=new DynamicHistogram(); + histoQualDataSet.setType(HistogramType.SCALE_AREA_TO_1); + + histoQualDataSet.prepareSeries("2D", 100, 0, 30); + histoQualDataSet.prepareSeries("complement", 100, 0, 30); + histoQualDataSet.prepareSeries("template", 100, 0, 30); + + JFreeChart hisQual=ChartFactory.createXYLineChart("Quality","quality","frequency",histoQualDataSet,PlotOrientation.VERTICAL,true,true,false); + ChartPanel hisQualPanel = new ChartPanel(hisQual, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + + XYPlot hisQualPlot = (XYPlot) hisQual.getPlot(); + hisQualPlot.getDomainAxis().setAutoRange(true); + 
hisQualPlot.getRangeAxis().setAutoRange(true); + + hisQualPlot.setForegroundAlpha(0.8F); + //XYBarRenderer xybarrenderer = (XYBarRenderer)hisQualPlot.getRenderer(); + //xybarrenderer.setDrawBarOutline(false); + + + hisQualPanel.setBounds(452, 300, 450, 280); + mainPanel.add(hisQualPanel); + + + + btnStart.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + //1. Validate before running + + //validate input + + String _path = txtDir.getText().trim(); + if (_path.equals("")){ + JOptionPane.showMessageDialog(null, "Please specify download directory", "Warning", JOptionPane.PLAIN_MESSAGE); + txtDir.grabFocus(); + return; + } + + File _file = new File(_path); + if (!_file.exists()){ + JOptionPane.showMessageDialog(null, "Directory \"" + _path + "\" does not exist!", "Warning", JOptionPane.PLAIN_MESSAGE); + txtDir.grabFocus(); + return; + } + reader.folder = _path; + //validate output + if (rdbtnOut2File.isSelected()){ + String _foutput = txtOFile.getText().trim(); + if (_foutput.equals("")){ + JOptionPane.showMessageDialog(null, "Please specify output file", "Warning", JOptionPane.PLAIN_MESSAGE); + txtOFile.grabFocus(); + return; + } + reader.output = _foutput; + }else + reader.output = "-";//stream + + + //validate stream + if (chckbxStreamServer.isSelected()){ + if (txtStreamServers.getText().trim().equals("")){ + JOptionPane.showMessageDialog(null, "Please specify output address of a server", "Warning", JOptionPane.PLAIN_MESSAGE); + txtStreamServers.grabFocus(); + return; + } + reader.streamServers = txtStreamServers.getText().trim(); + } + + + String msg = reader.prepareIO(); + if (msg !=null){ + JOptionPane.showMessageDialog(null, msg, "Warning", JOptionPane.PLAIN_MESSAGE); + return; + } + + //Start running + txtDir.setEnabled(false); + btnChange.setEnabled(false); + chckbxInc.setEnabled(false); + rdbtnOut2Str.setEnabled(false); + rdbtnOut2File.setEnabled(false); + txtOFile.setEnabled(false); + btnFileChange.setEnabled(false); + 
chckReads.setEnabled(false); + chckbxAddAUnicqu.setEnabled(false); + txtMinLenth.setEnabled(false); +// txtGroup.setEnabled(false); + + btnStart.setEnabled(false); + btnStop.setEnabled(true); + + reader.ready = true; + } + }); } + + JTextField txtCompReads, txtTempReads, txt2DReads; + JTextField txtPFiles, txtFFiles, txtTFiles; + DynamicHistogram histoLengthDataSet, histoQualDataSet; + + boolean stillRun = true; + + public void interupt(JapsaException e){ + this.stillRun = false; + reader.wait = false; + JOptionPane.showMessageDialog(null, e.getMessage(), "Error", JOptionPane.ERROR_MESSAGE); + } + + public void run() { + int lastIndexLengths = 0;//, lastIndexLengths2D = 0, lastIndexLengthsComp = 0, lastIndexLengthsTemp = 0; + int lastIndexQual2D = 0, lastIndexQualComp = 0, lastIndexQualTemp = 0; + + while(stillRun) { + //synchronized(reader) {//avoid concurrent update + Second period = new Second(); + dataSet.add(period, reader.twoDCount,"2D"); + dataSet.add(period, reader.compCount,"complement"); + dataSet.add(period, reader.tempCount,"template"); + + txtTFiles.setText(reader.fileNumber+""); + txtPFiles.setText(reader.passNumber+""); + txtFFiles.setText(reader.failNumber+""); + + txt2DReads.setText(reader.twoDCount+""); + txtCompReads.setText(reader.compCount+""); + txtTempReads.setText(reader.tempCount+""); + + int currentIndex = reader.lengths.size(); + + if (currentIndex > lastIndexLengths){ + int index = histoLengthDataSet.getSeriesIndex("Read Length"); + for (int i = lastIndexLengths; i < currentIndex;i++) + histoLengthDataSet.addSeries(index, reader.lengths.get(i)); + + lastIndexLengths = currentIndex; + + histoLengthDataSet.notifyChanged(); + } + + currentIndex = reader.qual2D.size(); + if (currentIndex > lastIndexQual2D){ + int index = histoQualDataSet.getSeriesIndex("2D"); + for (int i = lastIndexQual2D; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qual2D.get(i)); + + lastIndexQual2D = currentIndex; + histoQualDataSet.notifyChanged(); 
+ } + + currentIndex = reader.qualComp.size(); + if (currentIndex > lastIndexQualComp){ + int index = histoQualDataSet.getSeriesIndex("complement"); + for (int i = lastIndexQualComp; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qualComp.get(i)); + + lastIndexQualComp = currentIndex; + histoQualDataSet.notifyChanged(); + } + + currentIndex = reader.qualTemp.size(); + if (currentIndex > lastIndexQualTemp){ + int index = histoQualDataSet.getSeriesIndex("template"); + for (int i = lastIndexQualTemp; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qualTemp.get(i)); + + lastIndexQualTemp = currentIndex; + histoQualDataSet.notifyChanged(); + } + + try { + Thread.sleep(1000); + } catch (InterruptedException ex) { + Logging.error(ex.getMessage()); + } + } + } +} diff --git a/src/dev/java/japsadev/tools/AnalyseCaptureCmd.java b/src/dev/java/japsadev/tools/AnalyseCaptureCmd.java new file mode 100644 index 0000000..c5c4761 --- /dev/null +++ b/src/dev/java/japsadev/tools/AnalyseCaptureCmd.java @@ -0,0 +1,181 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 25/08/2016 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + + +import java.io.File; +import java.io.IOException; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.capFragmentAnalysis", + scriptDesc = "Analyse the fragment size" + ) +public class AnalyseCaptureCmd extends CommandLine{ + //CommandLine cmdLine; + public AnalyseCaptureCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("probe", null, "File containing probes in bam format"); + addString("logFile", "-", "Log file"); + + 
addString("miseq", null, "Name of read file if miseq is simulated"); + addString("pacbio", null, "Name of read file if pacbio is simulated"); + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + CommandLine cmdLine = new AnalyseCaptureCmd (); + args = cmdLine.stdParseLine(args); + + + /**********************************************************************/ + + String logFile = cmdLine.getStringVal("logFile"); + String probe = cmdLine.getStringVal("probe"); + + String miseq = cmdLine.getStringVal("miseq"); + String pacbio = cmdLine.getStringVal("pacbio"); + + if (miseq == null && pacbio == null){ + System.err.println("At least one of fragment, miseq and pacbio has to be set\n" + cmdLine.usageString()); + System.exit(-1); + } + + + SequenceOutputStream + logOS = logFile.equals("-")? (new SequenceOutputStream(System.err)):(SequenceOutputStream.makeOutputStream(logFile)); + + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader probeReader = SamReaderFactory.makeDefault().open(new File(probe)); + + + if (pacbio != null){ + SamReader pbReader = SamReaderFactory.makeDefault().open(new File(pacbio)); + + SAMRecordIterator iter = pbReader.iterator(); + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + + int fragmentSize = sam.getReadLength(); + if (sam.getReadUnmappedFlag()){ + System.out.println(fragmentSize + "\t" + 0 + "\t" + 0); + continue; + } + + + int fragmentStart = sam.getAlignmentStart(); + int fragmentEnd = sam.getAlignmentEnd(); + String refName = sam.getReferenceName(); + int countProbe = countProbe(probeReader, refName, fragmentStart, fragmentEnd); + System.out.println(fragmentSize + "\t" + countProbe + "\t" + 1); + } + + iter.close(); + pbReader.close(); + } + if (miseq != null){ + SamReader msReader = SamReaderFactory.makeDefault().open(new File(miseq)); + + SAMRecordIterator iter = msReader.iterator(); + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + + if 
(sam.getReadUnmappedFlag()){ + continue; + } + + int fragmentSize = sam.getInferredInsertSize(); + //if mate is not aligned, or mate aligned to the left, ignore + if ( fragmentSize <= 0) + continue; + + int fragmentStart = sam.getAlignmentStart(); + int fragmentEnd = fragmentStart + fragmentSize; + String refName = sam.getReferenceName(); + int countProbe = countProbe(probeReader, refName, fragmentStart, fragmentEnd); + System.out.println(fragmentSize + "\t" + countProbe + "\t" + 1); + } + + iter.close(); + msReader.close(); + + } + + logOS.close(); + } + + static int countProbe(SamReader probeReader, String refName, int fragmentStart, int fragmentEnd){ + SAMRecordIterator iter = probeReader.query(refName, fragmentStart, fragmentEnd, true); + + int countProbe = 0; + int myEnd = 0; + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + + int start = sam.getAlignmentStart(); + if (start < myEnd) + continue; + + int end = sam.getAlignmentEnd(); + //probe can only bind if > 90% + if ((end - start) < 0.9 * sam.getReadLength()) + continue; + + countProbe += (end - start + 1); + myEnd = end; + } + iter.close(); + return countProbe; + + } + +} diff --git a/src/dev/java/japsadev/tools/BetaBinomial.java b/src/dev/java/japsadev/tools/BetaBinomial.java new file mode 100644 index 0000000..4573b0c --- /dev/null +++ b/src/dev/java/japsadev/tools/BetaBinomial.java @@ -0,0 +1,42 @@ +package japsadev.tools; + +import org.apache.commons.math3.special.Gamma; + +public class BetaBinomial { + double alpha, beta, trials; + public void set(double alpha, double beta, double trials){ + this.alpha = alpha; + this.beta = beta; + this.trials = trials; + } + public double logdensity(double k) { + //int k = (int) Math.rint(x); + +// if (k < 0 | k > trials) return 0; + if( alpha < 1e-10){ + return Double.NEGATIVE_INFINITY; + /* double res1 =Gamma.logGamma(k+alpha); + double res2 = Gamma.logGamma(trials-k+beta); + double res3 = Gamma.logGamma(alpha+beta); + double res4 = 
Gamma.logGamma(trials+2); + double res5 = Math.log(trials+1); + double res6 = Gamma.logGamma(alpha+beta+trials); + double res7 = Gamma.logGamma(alpha); + double res8 = Gamma.logGamma(beta); + double res9 = Gamma.logGamma(k+1); + double res10 = Gamma.logGamma(trials-k+1); + throw new RuntimeException("os na");*/ + } + double res = (Gamma.logGamma(k+alpha)+Gamma.logGamma(trials-k+beta)+Gamma.logGamma(alpha+beta)+Gamma.logGamma(trials+1)) - + ( + // Math.log(trials+1)+ + Gamma.logGamma(alpha+beta+trials)+Gamma.logGamma(alpha)+Gamma.logGamma(beta)+Gamma.logGamma(k+1)+Gamma.logGamma(trials-k+1)); + + //double res = (Gamma.logGamma(k+alpha)+Gamma.logGamma(trials-k+beta)+Gamma.logGamma(alpha+beta)+Gamma.logGamma(trials+2)) +/* if(Double.isNaN(res)){ + throw new RuntimeException("!!"); +}*/ + return res ;//+ norm; + +} +} diff --git a/src/dev/java/japsadev/tools/BreakPointAnalysisCmd.java b/src/dev/java/japsadev/tools/BreakPointAnalysisCmd.java new file mode 100644 index 0000000..8484131 --- /dev/null +++ b/src/dev/java/japsadev/tools/BreakPointAnalysisCmd.java @@ -0,0 +1,190 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 02/09/2016 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Analysis of break-point patterns + * + * + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.breakpoint", + scriptDesc = "Analysis of break point pattern" + ) +public class BreakPointAnalysisCmd extends CommandLine{ + private static final Logger LOG = 
LoggerFactory.getLogger(BreakPointAnalysisCmd.class); + + public BreakPointAnalysisCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("bamFile", null, "Bam file",true); + addString("output", null, "output",true); + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + BreakPointAnalysisCmd cmdLine = new BreakPointAnalysisCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String bamFile = cmdLine.getStringVal("bamFile"); + String output = cmdLine.getStringVal("output"); + + + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + + List seqList = + samReader.getFileHeader().getSequenceDictionary().getSequences(); + + + int halfWindow = 5; + int norm = 100; + for (SAMSequenceRecord sequenceRecord:seqList){ + String chrom = sequenceRecord.getSequenceName(); + int [] countThrough = new int[sequenceRecord.getSequenceLength()]; + int [] countBreaks = new int[sequenceRecord.getSequenceLength()]; + //Arrays.fill(countReads, 0); + //Arrays.fill(countBreaks, 0); + + SAMRecordIterator samIter = samReader.query(chrom, 0, 0, false); + while (samIter.hasNext()){ + SAMRecord samRecord = samIter.next(); + Cigar cigar = samRecord.getCigar(); + + int clipLeft = 0; + int clipRight = 0; + + //check the left clip + CigarElement endElement = cigar.getCigarElement(0); + if (endElement.getOperator() == CigarOperator.S + || endElement.getOperator() == CigarOperator.H + || endElement.getOperator() == CigarOperator.P){ + clipLeft = endElement.getLength(); + + } + + //Now look at the right clip + endElement = cigar.getCigarElement(cigar.numCigarElements() - 1); + if (endElement.getOperator() == CigarOperator.S + || endElement.getOperator() == 
CigarOperator.H + || endElement.getOperator() == CigarOperator.P){ + clipRight = endElement.getLength(); + } + + int start = samRecord.getAlignmentStart(); + int end = samRecord.getAlignmentEnd(); + + if (clipLeft > norm){ + int i = start - 1 - halfWindow; + if (i < 0) + i=0; + for (; i < start - 1 + halfWindow && i < countBreaks.length;i++){ + countBreaks[i] ++; + } + } + + if (clipRight > norm){ + int i = end - 1 - halfWindow; + if (i < 0) + i=0; + for (; i < end - 1 + halfWindow && i < countBreaks.length;i++){ + countBreaks[i] ++; + } + } + for (int i = start + halfWindow; i < end - halfWindow - 2;i++){ + countThrough[i]++; + } + }//while + samIter.close(); + + LOG.info("Write"); + SequenceOutputStream tCount = SequenceOutputStream.makeOutputStream(output + "_" + chrom +"_through.bedgraph"); + SequenceOutputStream bCount = SequenceOutputStream.makeOutputStream(output + "_" + chrom +"_breaks.bedgraph"); + tCount.print("track type=bedGraph\n"); + bCount.print("track type=bedGraph\n"); + char sep = '\t'; + for (int i = 0; i < countThrough.length;i++){ + tCount.print(chrom); + tCount.print(sep); + tCount.print(i); + tCount.print(sep); + tCount.print(i+1); + tCount.print(sep); + tCount.print(countThrough[i]); + tCount.print('\n'); + + bCount.print(chrom); + bCount.print(sep); + bCount.print(i); + bCount.print(sep); + bCount.print(i+1); + bCount.print(sep); + bCount.print(countBreaks[i]); + bCount.print('\n'); + + } + tCount.close(); + bCount.close(); + }//while + + } +} diff --git a/src/dev/java/japsadev/tools/CaptureProbeDesignCmd.java b/src/dev/java/japsadev/tools/CaptureProbeDesignCmd.java new file mode 100644 index 0000000..2328642 --- /dev/null +++ b/src/dev/java/japsadev/tools/CaptureProbeDesignCmd.java @@ -0,0 +1,159 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 09/09/2016 - Minh Duc Cao started + * + ****************************************************************************/ + +package japsadev.tools; + +import java.io.File; +import java.io.IOException; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + + +/** + * @author Minh Duc Cao + * + */ +@Deployable( + scriptName = "jsa.dev.capDesign", + scriptDesc = "Capture probe design" + ) +public class CaptureProbeDesignCmd extends CommandLine{ + public CaptureProbeDesignCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + + addString("bamFile",null,"Bam file",true); + addString("output","-","Output file"); + addInt("distance",2000, "Acceptable distance"); + addStdHelp(); + } + + public static void main(String[] args) throws IOException { + + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new CaptureProbeDesignCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String bamFile = cmdLine.getStringVal("bamFile"); + String output = cmdLine.getStringVal("output"); + int distance = cmdLine.getIntVal("distance"); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(output); + + + + SAMRecordIterator iter = reader.iterator(); + String currentName = ""; + String currentSeq = ""; + String currentQual = 
""; + + String myChrom = ""; + int myStart = 0; + + //int distance = 1000; + + int counterGood = 0; + int counterBad = 0; + + while (iter.hasNext()){ + SAMRecord record = iter.next(); + String readName = record.getReadName(); + + if (record.getReadUnmappedFlag()){ + sos.print(readName + "\t0\t0\t" + record.getReadString() + "\t" + record.getBaseQualityString() + "\n"); + //odd + continue; + } + + if (!readName.equals(currentName)){ + if (currentName.length() > 0) + sos.print(currentName + "\t" + counterGood + "\t"+ counterBad + "\t" + currentSeq + "\t" + currentQual + "\n"); + + currentName = readName; + currentSeq = record.getReadString(); + currentQual = record.getBaseQualityString(); + counterGood = 0; + counterBad = 0; + String [] toks = readName.split("_"); + myChrom = toks[0]; + myStart = Integer.parseInt(toks[1]); + } + + String chrom = record.getReferenceName(); + if (!chrom.equals(myChrom)){ + counterBad ++; + continue; + } + + if (Math.abs(myStart - record.getAlignmentStart()) > distance){ + counterBad ++; + }else + counterGood ++; + } + sos.print(currentName + "\t" + counterGood + "\t"+ counterBad + "\t" + currentSeq + "\t" + currentQual + "\n"); + + iter.close(); + sos.close(); + + } +} +/*RST* + + + +asdfds +sfasf + + + + + + + + *RST*/ + diff --git a/src/dev/java/japsadev/tools/CaptureVNTR.java b/src/dev/java/japsadev/tools/CaptureVNTR.java new file mode 100644 index 0000000..05d9ad9 --- /dev/null +++ b/src/dev/java/japsadev/tools/CaptureVNTR.java @@ -0,0 +1,800 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + +import japsa.bio.tr.TandemRepeat; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.seq.XAFReader; +import japsa.util.BetaBinomialModel; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Random; + + +import org.apache.commons.math3.distribution.NormalDistribution; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.dev.captureVNTR", +scriptDesc = "VNTR typing using capture sequencing") +public class CaptureVNTR extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(CaptureVNTR.class); + + + public CaptureVNTR(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("xafFile", "VNTR.xaf", "Name of repeat file"); + addString("reference", "hg19.fas", "Name of reference genome"); + addString("target", "target.fa", "Where to write the target"); + addString("technology", "pacbio", "Technology: pacbio or illumina"); + addString("directory", "./", "Directory with depth files"); + addInt("stat", 2, "0,1,2"); + addInt("readLength", 250, "Read length"); + + addInt("stage", 2, "Stage of processing:\n" + + "0: Generate hmm 
profile, technology parameter is required\n" + + "1: Extract target sequences, prepare for alignment\n" + + "2: Look for reads spanning any of the repeats\n" + + "3: Read depth information extraction\n" + + "4: ---\n" + + "5: Read depth analysis\n" + + "6: Read depth analysis with likelihood\n" + ); + addString("output", "-", "Name of output file, - for standard out"); + addInt("pad", 10, "Gaps"); + addString("resample", null, "reference sample"); + addString("resAllele", null, "reference alleles"); + + //addBoolean("reverse",false,"Reverse sort order"); + + addStdHelp(); + } + + + + public static void main(String [] args) throws IOException, InterruptedException{ + CommandLine cmdLine = new CaptureVNTR(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + + String xafFile = cmdLine.getStringVal("xafFile"); + String referenceFile = cmdLine.getStringVal("reference"); + String targetFile = cmdLine.getStringVal("target"); + String technology = cmdLine.getStringVal("technology"); + String output = cmdLine.getStringVal("output"); + String resample = cmdLine.getStringVal("resample"); + String resAllele = cmdLine.getStringVal("resAllele"); + int stat = cmdLine.getIntVal("stat"); + int readLength = cmdLine.getIntVal("readLength"); + + int stage = cmdLine.getIntVal("stage"); + //stage = 6; + + //int pad = cmdLine.getIntVal("pad"); + + if (stage == 0) + generateProfile(xafFile, technology); + + if (stage == 1) + stage1_extractTargetSequence(referenceFile, xafFile, targetFile); + + //Post stage 1: + //bwa index target.fa + //bwa mem + //samtools view/sort/index/depth + if (stage == 2){ + for (int i= 0; i< args.length; i++) + stage2_spanRead(args[i], xafFile); + } + if (stage == 3){ + System.out.println("This funciton no longer here, please use jsa.tr.trdepth"); + } + if (stage == 4){ + System.out.println("This funciton no longer here, please use stage 5"); + } + + if (stage == 5){ + 
stage5_readDepthAnalysis(xafFile,resample, args, output,stat, readLength); + } + + if (stage == 6){ + String dir = cmdLine.getStringVal("directory"); + for(int i=0; i "+prob[x]); + //} + + sos.print('\t'); + sos.print(result);//+"("+String.format("%5.2g", mass[2]).trim()+")"); + // +// sos.print(String.format("%5.3g", genos[range[0]]).trim()+"-"+String.format("%5.3g", genos[range[1]]).trim()+"("+String.format("%5.2g", mass[2]).trim()+")"); + // sos.print((lengthR)/period); + + // sos.print('\t'); + // sos.print((lengthS + readLength)/period); + + // sos.print('\t'); + // sos.print((lengthSlow + readLength)/period); + // sos.print('\t'); + // sos.print((lengthShigh + readLength)/period); + + + //sos.print('\t'); + //sos.print(dist.getMean() + "\t" + dist.getStandardDeviation()); + }//for i + sos.println(); + }//while iter + sos.close(); + xafReader.close(); + for (int i = 0; i < rReader.length;i++){ + rReader[i].close(); + } + for (int i = 0; i < sFiles.length;i++){ + sReaders[i].close(); + } + } + /** + * Stage 0 + * @param xafFile + * @param technology + * @throws IOException + * @throws InterruptedException + */ + static void generateProfile(String xafFile, String technology) throws IOException, InterruptedException{ + Profile profile = illuminaProfile; + if (technology.startsWith("pac")) + profile = pacbioProfile; + + XAFReader xafReader = new XAFReader(xafFile); + + int nSeqs = 50; + Random rd = new Random(1); + + //int index = 0; + while (xafReader.next() != null){ + TandemRepeat str = TandemRepeat.read(xafReader); + //TODO: + if (str.getPeriod() <= 8) + continue; + + String repUnit = xafReader.getField("repUnit");//chromosome + String target = technology + "_" + str.getID(); + + Sequence repSeq = + new Sequence(dna, repUnit, target); + + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream("f.fasta"); + byte [] nBytes = new byte[repSeq.length() * 2]; + + for (int i =0; i< nSeqs; i++){ + int seqIndex = 0; + int j = 0; + while (j < repSeq.length()){ 
+ double toss = rd.nextDouble(); + if (toss < profile.subs){//subs + int current = repSeq.getBase(j); + if (current > 3) + current = rd.nextInt(4); + current += rd.nextInt(3); + current = (current + 1) % 4; + nBytes[seqIndex] = (byte) current; + seqIndex ++; + j ++; + }else if (toss < profile.subs + profile.del){ + //deletion + j ++; + }else if (toss < profile.subs + profile.del + profile.ins){ + //insertion + nBytes[seqIndex] = (byte) rd.nextInt(4); + seqIndex ++; + }else{//copy + nBytes[seqIndex] = repSeq.getBase(j); + seqIndex ++; + j ++; + } + } + Sequence newSeq = new Sequence(dna, nBytes, seqIndex, "seq"+i); + //TODO: + if(newSeq.length() > 6) + newSeq.writeFasta(sos); + }//for + sos.close(); + + Process process = Runtime.getRuntime().exec("rm -f " + target + "o.fasta"); + process.waitFor(); + + process = Runtime.getRuntime().exec("cp f.fasta " + target + "o.fasta"); + process.waitFor(); + + String cmd = "kalign -q -i f.fasta -o " + target + "o.fasta"; + LOG.info("Running " + cmd); + + + process = Runtime.getRuntime().exec(cmd); + process.waitFor(); + LOG.info("Done " + cmd); + + process = Runtime.getRuntime().exec("hmmbuild --dna " + target +".hmm " + target + "o.fasta"); + process.waitFor(); + } + + xafReader.close(); + } + + static Alphabet dna = Alphabet.DNA(); + + + /** + * Read in the repeat information from xaf file and construct capture sequences + * from the reference genome. 
+ * @param referenceFile + * @param xafFile + * @param targetFile + * @throws IOException + */ + static void stage1_extractTargetSequence(String referenceFile, String xafFile, String targetFile) throws IOException{ + //Read in the genome + LOG.info("Read genome begins"); + HashMap genome = new HashMap (); + SequenceReader reader = SequenceReader.getReader(referenceFile); + Sequence seq; + while ((seq = reader.nextSequence(dna)) != null){ + genome.put(seq.getName(), seq); + } + reader.close(); + LOG.info("Read genome done"); + + XAFReader xafReader = new XAFReader(xafFile); + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(targetFile); + + System.out.println("#H:ID\tchrom\tstart\tend\tlflank\trflank"); + while (xafReader.next() != null){ + //Extract + String chrom = xafReader.getField("chrom");//chromosome + int start = Integer.parseInt(xafReader.getField("start")); + int end = Integer.parseInt(xafReader.getField("end")); + int rflank = Integer.parseInt(xafReader.getField("rflank")); + int lflank = Integer.parseInt(xafReader.getField("lflank")); + + int mrflank = rflank + 300; + int mlflank = lflank + 300; + + if (seq == null || !seq.getName().equals(chrom)){ + seq = genome.get(chrom); + } + if (seq == null){ + xafReader.close(); + sos.close(); + LOG.error("Chrom in line " + xafReader.lineNo() + " not found!!!"); + System.exit(1); + } + Sequence s = seq.subSequence(start - mlflank - 1, end + mrflank); + s.setName(chrom+"_"+start+"_"+end+"_"+mlflank); + s.writeFasta(sos); + + System.out.println(xafReader.getField("ID") + + "\t" + s.getName() + + "\t" + (mlflank + 1) + + "\t" + (mlflank + end -start) + + "\t" + lflank + + "\t" + rflank + ); + + } + + xafReader.close(); + sos.close(); + } + /** + * Stage 2 + + */ + + static void stage2_spanRead(String bamFile, String xafFile) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); 
+ + + //String xafFile = ""; + XAFReader xafReader = new XAFReader(xafFile); + + + while (xafReader.next() != null){ + String ID = xafReader.getField("ID"); + String chrom = xafReader.getField("chrom"); + int startRep = Integer.parseInt(xafReader.getField("start")); + int endRep = Integer.parseInt(xafReader.getField("end")); + int period = Integer.parseInt(xafReader.getField("period")); + + SAMRecordIterator iter = samReader.query(chrom, startRep, endRep, false); + int count = 0, countProper = 0; + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + if (sam.getAlignmentStart() > startRep) + break;//already got inside the repeat + if (sam.getAlignmentEnd() > endRep){ + count ++; + if (sam.getReadPairedFlag()){ + if (!sam.getMateUnmappedFlag() && chrom.equals(sam.getMateReferenceName())) + countProper ++; + } + + Cigar cigar = sam.getCigar(); + int readPos = 0; + int refPos = sam.getAlignmentStart(); + int indel = 0; + + for (final CigarElement e : cigar.getCigarElements()) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + break; // ignore hard clips + case P : //pad is a kind of clipped + break; // ignore pads + case S ://advance on the reference + readPos += length; + break; // soft clip read bases + case N : + refPos += length; + break; // reference skip + + case D : + if (refPos >= startRep && refPos <= endRep){ + indel -= length;//need to fix this + } + refPos += length; + break; + + case I : + if (refPos >= startRep && refPos <= endRep){ + indel += length;//need to fix this + } + readPos += length; + break; + case M : + readPos += length; + refPos += length; + break; + case EQ : + readPos += length; + refPos += length; + break; + + case X : + readPos += length; + refPos += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//casse + }//for + System.out.println(ID + "\t" + (indel * 1.0/period)+ "\t"+sam.getCigarString()+"#"+(1 + sam.getAlignmentEnd() 
- sam.getAlignmentStart())+"#"+indel+"#"+period); + } + }//while + iter.close(); + if (count > 0) + System.out.println(ID + "\t" + count+"\t"+countProper+"====================================="); + + } + + /******************************************************************* + int nSeq = samHeader.getSequenceDictionary().size(); + for (int seqIndex = 0; seqIndex < nSeq; seqIndex++){ + String seqName = samHeader.getSequence(seqIndex).getSequenceName(); + + //TODO: make this more genetic + String [] toks = seqName.split("_"); + int start = Integer.parseInt(toks[3]); + int end = Integer.parseInt(toks[2]) - Integer.parseInt(toks[1]) + start; + + start -= pad; + end += pad; + + SAMRecordIterator iter = samReader.query(seqName, start, end, false); + int count = 0, countProper = 0; + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + if (sam.getAlignmentStart() > start) + break;//already got inside the repeat + if (sam.getAlignmentEnd() > end){ + count ++; + if (!sam.getMateUnmappedFlag() && seqName.equals(sam.getMateReferenceName())) + countProper ++; + System.out.println("#"+sam.getCigarString()+"#"+(1 + sam.getAlignmentEnd() - sam.getAlignmentStart())); + + + } + }//while + iter.close(); + System.out.println(seqName + "\t" + (end-start-pad-pad) + "\t"+ count+"\t"+countProper); + } + /*******************************************************************/ + samReader.close(); + xafReader.close(); + } + + + + + static class Profile{ + double subs = 0.1; + double del = 0.05; + double ins = 0.05; + + Profile (double s, double d, double i){ + subs = s; del = d; ins = i; + } + } + + //Subs = 1%, del = 0.01%, ins = 0.01% + static Profile illuminaProfile = new Profile(0.01, 0.001, 0.001); + //Subs = 1%, del = 10%, ins = 5% + static Profile pacbioProfile = new Profile(0.01, 0.075, 0.075); + + +} diff --git a/src/dev/java/japsadev/tools/CheckInductionCmd.java b/src/dev/java/japsadev/tools/CheckInductionCmd.java new file mode 100644 index 0000000..33ed596 --- /dev/null +++ 
b/src/dev/java/japsadev/tools/CheckInductionCmd.java @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 Minh Duc Cao (minhduc.cao@gmail.com). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the names of the institutions nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* Revision History + * 11/01/2012 - Minh Duc Cao: Revised + * 01/01/2013 - Minh Duc Cao, revised + ****************************************************************************/ + +package japsadev.tools; + +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + + +/** + * @author Minh Duc Cao + * + */ +@Deployable( + scriptName = "jsa.dev.checkInduction", + scriptDesc = "Sample script description" +) +public class CheckInductionCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(CheckInductionCmd.class); + public CheckInductionCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("path",null, "path to the group"); + addString("list",null, "file contain list of samples"); + //addBoolean("reverse",false,"Reverse sort order"); + addStdHelp(); + } + + public static void main(String[] args) throws IOException { + + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new CheckInductionCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String path = cmdLine.getStringVal("path"); + String list = cmdLine.getStringVal("list"); + + BufferedReader reader = SequenceReader.openFile(list); + String line = null; + + TreeSet sampleSet = new TreeSet(); + HashMap> geneMap = new HashMap>(); + HashSet unionSet = new HashSet(); + + while ((line = reader.readLine())!=null){ + sampleSet.add(line.trim()); + } + reader.close(); + 
+ + Files.walk(Paths.get(path)) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith("gff")) + .filter(p -> sampleSet.contains(getSample(p))) + .forEach(p-> { + Set mySet = openGFF(p); + unionSet.addAll(mySet); + geneMap.put(getSample(p), mySet); + LOG.info("Union size " + unionSet.size()); + }); + ArrayList unionList = new ArrayList(unionSet); + Collections.sort(unionList); + + HashSet coreGene = new HashSet(); + + for (String gene:unionSet){ + // System.out.println(gene); + boolean good = true; + for (Set mySet:geneMap.values()){ + if (!mySet.contains(gene)){ + good = false; + break; + } + }//for + if (good){ + coreGene.add(gene); + } + } + + + + System.out.print("gene"); + for (String gene:unionList) { + if (coreGene.contains(gene)) + continue; + System.out.print("\t" +gene); + } + System.out.println(); + + for (String sample:sampleSet){ + System.out.print(sample); + Set mySet = geneMap.get(sample); + for (String gene:unionList){ + if (coreGene.contains(gene)) + continue; + System.out.print("\t" + (mySet.contains(gene)?"Y":"N")); + } + System.out.println(); + } + + //System.out.println("===================================================="); + //for (String sample:sampleSet){ + // System.out.print(sample); + // Set mySet = geneMap.get(sample); + // for (String gene:unionList){ + // System.out.print("\t" + (mySet.contains(gene)?"Y":"N")); + // } + // System.out.println(); + // } + } + + public static String getSample(Path filePath){ + return filePath.getFileName().toString().replace(".gff",""); + } + + public static HashSet openGFF(Path fileName){ + + HashSet geneSet = new HashSet(); + + try { + FileInputStream in = new FileInputStream(fileName.toFile()); + ArrayList annoGFF = JapsaAnnotation.readMGFF(in, 0, 0, "CDS"); + in.close(); + + for (JapsaAnnotation anno : annoGFF) { + for (JapsaFeature f : anno.getFeatureList()) { + String desc = f.getDesc(); + //String [] toks = desc.split(";"); + int index = desc.indexOf(":UniProtKB:"); + if (index >= 
0) { + geneSet.add(desc.substring(index + 1, index + 17)); + continue; + } + index = desc.indexOf(":CARD:"); + if (index >= 0) { + geneSet.add(desc.substring(index + 1, index + 16)); + continue; + } + index = desc.indexOf(":CLUSTERS:"); + if (index >= 0) { + geneSet.add(desc.substring(index + 1, index + 18)); + continue;//for + } + index = desc.indexOf(":Pfam:"); + if (index >= 0) { + geneSet.add(desc.substring(index + 1, index + 15)); + continue; + } + index = desc.indexOf(":HAMAP:"); + if (index >= 0) { + geneSet.add(desc.substring(index + 1, index + 15)); + continue; + } + }//for + }//for + }catch (Exception e){ + e.printStackTrace(); + } + + LOG.info("Read " + geneSet.size() + " from " + fileName); + return geneSet; + } + + +} +/*RST* + + + + + + + + + + + + +*RST*/ + diff --git a/src/dev/java/japsadev/tools/ConvertProbeCmd.java b/src/dev/java/japsadev/tools/ConvertProbeCmd.java new file mode 100644 index 0000000..77cb71b --- /dev/null +++ b/src/dev/java/japsadev/tools/ConvertProbeCmd.java @@ -0,0 +1,209 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 14/09/2016 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.IntArray; +import japsa.util.deploy.Deployable; + + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.convertProbe", + scriptDesc = "Convert probes to sam and bed file" + ) +public class ConvertProbeCmd extends CommandLine{ + //CommandLine cmdLine; + public ConvertProbeCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + //Input/output + addString("probe", "-", "File containing probe name"); + 
addString("reference", null, "Reference"); + addString("output", "output", "Output prefix"); + + addInt("good",100,"good threshold"); + addInt("bad",100,"bad threshold"); + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + CommandLine cmdLine = new ConvertProbeCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String probe = cmdLine.getStringVal("probe"); + String output = cmdLine.getStringVal("output"); + String reference = cmdLine.getStringVal("reference"); + + int goodThreshold = cmdLine.getIntVal("good"); + int badThreshold = cmdLine.getIntVal("bad"); + + IntArray chrLens = new IntArray(24); + ArrayList chrList = new ArrayList (24); + HashMap chromsMap = new HashMap (24); + + FastaReader.Faster fread = new FastaReader.Faster(reference); + Sequence seq; + + // Timer timer = new Timer(); + int index = 0; + while ((seq = fread.nextSequence(Alphabet.DNA16())) != null) { + String id = seq.getName().split("\\s")[0]; + chrList.add(id); + chrLens.add(seq.length()); + + Chromosome chrom = new Chromosome(index, seq.length()); + chromsMap.put(id, chrom); + index ++; + } + fread.close(); + BitSet [] bitSets = new BitSet[chrList.size()]; + for (int i = 0; i < bitSets.length;i++) + bitSets[i] = new BitSet(chrLens.get(i)); + + + SequenceOutputStream samOut = SequenceOutputStream.makeOutputStream(output + ".sam"); + SequenceOutputStream badSamOut = SequenceOutputStream.makeOutputStream(output + "_bad.sam"); + for (int i = 0; i< chrLens.size();i++){ + samOut.print("@SQ\tSN:"+chrList.get(i)+"\tLN:"+chrLens.get(i) + '\n'); + badSamOut.print("@SQ\tSN:"+chrList.get(i)+"\tLN:"+chrLens.get(i) + '\n'); + } + + + + SequenceOutputStream badOut = SequenceOutputStream.makeOutputStream(output + ".bad"); + BufferedReader reader = SequenceReader.openFile(probe); + + String line = ""; + + while ( (line = reader.readLine()) !=null){ + line = line.trim(); + String [] toks = 
line.split("\t"); + + int badCount = Integer.parseInt(toks[2]); + int goodCount = Integer.parseInt(toks[1]); + + String name = toks[0]; + + String readSeq = toks[3]; + String readQual = toks[4]; + + toks = name.split("_"); + int end = Integer.parseInt(toks[2]); + int start = Integer.parseInt(toks[1]); + String chr = toks[0]; + + + if (goodCount > goodThreshold){ + badOut.print(name + "\t" + goodCount + "\t" + badCount + "\t(good>" + goodThreshold +")\n"); + badSamOut.print(name+"\t0\t"+chr+"\t"+start + "\t60\t" + (end-start+1) +"M\t*\t0\t0\t"+readSeq + "\t" + readQual + "\n"); + continue; + } + + if (badCount > badThreshold){ + badOut.print(name + "\t" + goodCount + "\t" + badCount + "\t(bad>" + badThreshold +")\n"); + badSamOut.print(name+"\t0\t"+chr+"\t"+start + "\t60\t" + (end-start+1) +"M\t*\t0\t0\t"+readSeq + "\t" + readQual + "\n"); + continue; + } + + samOut.print(name+"\t0\t"+chr+"\t"+start + "\t60\t" + (end-start+1) +"M\t*\t0\t0\t"+readSeq + "\t" + readQual + "\n"); + + BitSet bitSet = bitSets[chromsMap.get(chr).index]; + for (int i = start - 1; i< end;i++){ + bitSet.set(i); + } + } + reader.close(); + samOut.close(); + badOut.close(); + + SequenceOutputStream bedOut = SequenceOutputStream.makeOutputStream(output + ".bed"); + + + for (int x=0; x < chrList.size();x++){ + String chrName = chrList.get(x); + BitSet myBitSet = bitSets[x]; + int regionStart = -1; + for (int i = 0; i < chrLens.get(x);i++){ + if (myBitSet.get(i) && (regionStart < 0)){ + //start of a new region + regionStart = i; + }else if (!myBitSet.get(i) && (regionStart >= 0)){ + //end of a region + bedOut.print(chrName + "\t" + regionStart + "\t" + i + "\n"); + + regionStart = -1; + } + } + if (regionStart >=0){ + bedOut.print(chrName + "\t" + regionStart + "\t" + chrLens.get(x) + "\n"); + } + }//for + bedOut.close(); + + } + + /** + * Implement regions that may be capturable + * @author minhduc + * + */ + static class Chromosome{ + int index; + int length; + public Chromosome(int i, int l){ + 
index = i; + length = l; + } + } + +} diff --git a/src/dev/java/japsadev/tools/FilterPEConcordance.java b/src/dev/java/japsadev/tools/FilterPEConcordance.java new file mode 100644 index 0000000..fe6e051 --- /dev/null +++ b/src/dev/java/japsadev/tools/FilterPEConcordance.java @@ -0,0 +1,236 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/**************************************************************************** + * Revision History + * 01/04/2014 - Minh Duc Cao: Started + * + ****************************************************************************/ + +package japsadev.tools; + +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +import java.io.File; +import java.io.IOException; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedList; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMTextWriter; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * + */ +@Deployable( + scriptName = "jsa.dev.filterPE", + scriptDesc = "Filter concordance PE reads (keeps only reads having mate mapped within a distance)" +) +public class FilterPEConcordance extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(FilterPEConcordance.class); + + + public FilterPEConcordance(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input", "-", "Name of input 
sam/bam file (- for from standard input)", true); + addString("output", "-", "Name of output file, (- for from standard out)"); + addInt("max", 700, "The maxmum size of a fragment"); + + addStdHelp(); + } + + static int checkPoint = 2000000; + + public static void main(String[] args) throws Exception { + CommandLine cmdLine = new FilterPEConcordance(); + args = cmdLine.stdParseLine(args); + + + String output = cmdLine.getStringVal("output"); + String samFile = cmdLine.getStringVal("input"); + filter(samFile, output, cmdLine.getIntVal("max")); + } + + static void filter(String inFile, String outFile, int max) + throws IOException { + HashMap hashRec = new HashMap(); + LinkedList listRec = new LinkedList(); + + /////////////////////////////////////////////////////////// + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = SamReaderFactory.makeDefault().open(new File(inFile)); + + + SAMFileHeader samHeader = samReader.getFileHeader(); + SAMTextWriter bamWriter = new SAMTextWriter(new File(outFile)); + + //samWriter.setSortOrder(SortOrder.unsorted, false); + bamWriter.writeHeader( samHeader.getTextHeader()); + /////////////////////////////////////////////////////////// + + + int readIn = 0, readMapped = 0, readOut = 0; + + SAMRecordIterator samIter = samReader.iterator(); + int currentRefIndex = -1; + int dup = 0; + + while (samIter.hasNext()) { + SAMRecord sam = samIter.next(); + readIn++; + + if (readIn % 10000000 == 0) { + Date date = new Date(); + + LOG.info("No. 
of reads processed : " + readIn + " at " + + date.toString()); + LOG.info("Statistics so far : " + " Total Read = " + + readIn + " Mapped Read = " + readMapped + + " Out Reads = " + readOut); + LOG.info("Hash = " + hashRec.size() + " List = " + + listRec.size() + " dup = " + dup + " at " + + currentRefIndex); + Runtime.getRuntime().gc(); + } + + // if the read is not mapped: continue + if (sam.getReadUnmappedFlag()) + continue; + + readMapped++; + int order = 0; + if (sam.getFirstOfPairFlag()) + order = 1; + + int referenceIndex = sam.getReferenceIndex(); + int pos = sam.getAlignmentStart(); + + // clean if ref differ to the current one + if (referenceIndex != currentRefIndex) { + while (!listRec.isEmpty()) {// clear queue and hash + SAMRecord head = listRec.remove(); + hashRec.remove(head.getReadName() + "_" + + (head.getFirstOfPairFlag() ? 1 : 0)); + if (head.getProperPairFlag()) { + bamWriter.addAlignment(head); + readOut++; + } + } + currentRefIndex = referenceIndex; + dup = 0; + } else { + while (!listRec.isEmpty()) { + SAMRecord head = listRec.getFirst(); + if (pos - head.getAlignmentStart() > max) { + listRec.remove(); + hashRec.remove(head.getReadName() + "_" + + (head.getFirstOfPairFlag() ? 
1 : 0)); + if (head.getProperPairFlag()) { + bamWriter.addAlignment(head); + readOut++; + } + } else + break;// while + }// while + } + + // String refID = sam.getReferenceName(); + String readName = sam.getReadName(); + String readKey = readName + "_" + order; + + SAMRecord mate = hashRec.get(readName + "_" + (1 - order)); + if (mate != null) { + // found: both reads are properly mapped + // pair them together + mate.setMateAlignmentStart(sam.getAlignmentStart()); + sam.setMateAlignmentStart(mate.getAlignmentStart()); + + mate.setMateReferenceIndex(referenceIndex); + sam.setMateReferenceIndex(referenceIndex); + + mate.setMateNegativeStrandFlag(sam.getReadNegativeStrandFlag()); + sam.setMateNegativeStrandFlag(mate.getReadNegativeStrandFlag()); + + mate.setMateUnmappedFlag(false); + sam.setMateUnmappedFlag(false); + + mate.setProperPairFlag(true); + sam.setProperPairFlag(true); + + mate.setInferredInsertSize(sam.getAlignmentEnd() + - mate.getAlignmentStart()); + sam.setInferredInsertSize(-mate.getInferredInsertSize()); + + } else { + sam.setProperPairFlag(false); + } + + listRec.add(sam); + if (hashRec.put(readKey, sam) != null) + dup++; + }// while + + // finally clear + while (!listRec.isEmpty()) {// clear queue and hash + SAMRecord head = listRec.remove(); + hashRec.remove(head.getReadName() + "_" + + (head.getFirstOfPairFlag() ? 1 : 0)); + if (head.getProperPairFlag()) { + bamWriter.addAlignment(head); + readOut++; + } + } + + samReader.close(); + bamWriter.close(); + + Date date = new Date(); + + LOG.info("Finally No. 
of reads processed : " + readIn + " at " + + date.toString()); + LOG.info("Statistics so far : " + " Total Read = " + readIn + + " Mapped Read = " + readMapped + " Out Reads = " + + readOut); + } +} + diff --git a/src/dev/java/japsadev/tools/GapCloserCmd.java b/src/dev/java/japsadev/tools/GapCloserCmd.java new file mode 100644 index 0000000..cd67dc5 --- /dev/null +++ b/src/dev/java/japsadev/tools/GapCloserCmd.java @@ -0,0 +1,440 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/************************** REVISION HISTORY **************************
+ * 30/06/2014 - Minh Duc Cao: Created
+ *
+ ****************************************************************************/
+
+package japsadev.tools;
+
+import japsadev.bio.hts.scaffold.ContigBridge;
+import japsadev.bio.hts.scaffold.RealtimeScaffolding;
+import japsadev.bio.hts.scaffold.ScaffoldGraph;
+import japsadev.bio.hts.scaffold.ScaffoldGraphDFS;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.Logging;
+import japsa.util.deploy.Deployable;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Command-line front end of the experimental npScarf scaffolder.
+ * <p>
+ * It parses the options, checks the external bwa aligner when raw reads
+ * (fasta/fastq) are given, optionally locates the short-read assembly graph
+ * inside <code>graphDir</code>, and finally runs the scaffolder either in
+ * batch mode or in real-time mode.
+ *
+ * @author sonnguyen, minhduc
+ */
+@Deployable(
+    scriptName = "jsa.dev.npscarf",
+    scriptDesc = "Experimental Scaffold and finish assemblies using Oxford Nanopore sequencing reads",
+    seeAlso = "jsa.np.npreader, jsa.util.streamServer, jsa.util.streamClient"
+    )
+public class GapCloserCmd extends CommandLine{
+
+    /** Oldest bwa release accepted when reads have to be aligned by this tool. */
+    private static final String MIN_BWA_VERSION = "0.7.11";
+
+    /** Matches the "Version: x.y.z" line bwa prints when run without arguments. */
+    private static final Pattern BWA_VERSION_PATTERN = Pattern.compile("^Version:\\s(\\d+\\.\\d+\\.\\d+).*");
+
+    /** Registers every command-line option understood by the tool. */
+    public GapCloserCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("seqFile", null, "Name of the assembly file (sorted by length)",true);
+
+        addString("input", "-", "Name of the input file, - for stdin", true);
+        addString("format", "sam", "Format of the input: fastq/fasta or sam/bam", true);
+        addBoolean("index", true, "Whether to index the contigs sequence by the aligner or not.");
+
+        addString("bwaExe", "bwa", "Path to bwa");
+        addInt("bwaThread", 4, "Theads used by bwa");
+
+        addBoolean("long", false, "Whether report all sequences, including short/repeat contigs (default) or only long/unique/completed sequences.");
+        addBoolean("selective", false, "If set to true, only output contigs that mapped to the long read data. Useful for metagenomic reference.");
+        addBoolean("eukaryotic", false, "Whether eukaryotic or bacterial (default) genomes");
+        addBoolean("update", true, "Whether output genome sequences in real-time or not.");
+
+        addString("assembler", "spades", "Name of the assembler used for Illumina assembly: SPAdes (default) or ABySS.");
+        addString("graphDir", null, "Name of the output folder by SPAdes/ABySS: assembly graph and paths will be used for better gap-filling.");
+        addString("prefix", "out", "Prefix for the output files");
+
+        addString("genes", null , "Realtime annotation: name of annotated genes in GFF 3.0 format");
+        addString("resistGene", null , "Realtime annotation: name of antibiotic resistance gene fasta file");
+        addString("insertSeq", null , "Realtime annotation: name of IS fasta file");
+        addString("oriRep", null, "Realtime annotation: name of fasta file containing possible origin of replication");
+        addInt("minContig", 200, "Minimum contigs length that are used in scaffolding.");
+        addInt("maxRepeat", 7500, "Maximum length of repeat in considering species.");
+
+        addDouble("cov", 0, "Expected average coverage of Illumina, <=0 to estimate");
+        addInt("qual", 1, "Minimum quality");
+        addInt("support", 1, "Minimum supporting long read needed for a link between markers");
+
+        addBoolean("realtime", false, "Process in real-time mode. Default is batch mode (false)");
+        addInt("read", 50, "Minimum number of reads between analyses");
+        addInt("time", 10, "Minimum number of seconds between analyses");
+        addBoolean("verbose", false, "Turn on debugging mode");
+
+        addStdHelp();
+
+    }
+
+    /**
+     * Compares two dotted numeric versions component by component, so that
+     * "0.7.9" sorts before "0.7.11" (a plain string comparison gets this wrong).
+     * Missing components count as 0.
+     *
+     * @param a dotted version made of decimal numbers, e.g. "0.7.17"
+     * @param b dotted version made of decimal numbers
+     * @return a negative number, zero or a positive number when a is older
+     *         than, equal to or newer than b
+     */
+    static int compareVersions(String a, String b){
+        String[] as = a.split("\\.");
+        String[] bs = b.split("\\.");
+        int n = Math.max(as.length, bs.length);
+        for (int i = 0; i < n; i++){
+            int x = i < as.length ? Integer.parseInt(as[i]) : 0;
+            int y = i < bs.length ? Integer.parseInt(bs[i]) : 0;
+            if (x != y)
+                return x < y ? -1 : 1;
+        }
+        return 0;
+    }
+
+    /**
+     * Runs bwa without arguments and extracts the version from its usage text.
+     *
+     * @param bwaExe path to the bwa executable
+     * @return the version string, or the empty string if no version line was found
+     * @throws IOException if the executable cannot be started or read from
+     */
+    private static String detectBwaVersion(String bwaExe) throws IOException{
+        Process process = new ProcessBuilder(bwaExe).redirectErrorStream(true).start();
+        BufferedReader bf = SequenceReader.openFile(process.getInputStream());
+        try{
+            Matcher matcher = BWA_VERSION_PATTERN.matcher("");
+            String line;
+            while ((line = bf.readLine()) != null){
+                matcher.reset(line);
+                if (matcher.find())
+                    return matcher.group(1);
+            }
+            return "";
+        }finally{
+            bf.close();
+        }
+    }
+
+    /**
+     * Entry point: parses the options, validates them and runs the scaffolder.
+     *
+     * @param args command-line arguments
+     * @throws IOException if an input file cannot be read
+     * @throws InterruptedException if waiting for the bwa index process is interrupted
+     */
+    public static void main(String[] args) throws IOException, InterruptedException {
+        CommandLine cmdLine = new GapCloserCmd();
+        args = cmdLine.stdParseLine(args);
+
+        /***********************************************************************/
+        ScaffoldGraph.verbose = cmdLine.getBooleanVal("verbose");
+        ScaffoldGraph.reportAll = !cmdLine.getBooleanVal("long");
+        ScaffoldGraph.eukaryotic = cmdLine.getBooleanVal("eukaryotic");
+        ScaffoldGraph.updateGenome = cmdLine.getBooleanVal("update");
+        ScaffoldGraph.select = cmdLine.getBooleanVal("selective");
+
+        String prefix = cmdLine.getStringVal("prefix");
+
+        String input = cmdLine.getStringVal("input");
+        String bwaExe = cmdLine.getStringVal("bwaExe");
+        int bwaThread = cmdLine.getIntVal("bwaThread");
+        String format = cmdLine.getStringVal("format").toLowerCase().trim();
+
+        String sequenceFile = cmdLine.getStringVal("seqFile"),
+                graphFolder = cmdLine.getStringVal("graphDir"),
+
+                genesFile = cmdLine.getStringVal("genes"),
+                resistFile = cmdLine.getStringVal("resistGene"),
+                isFile = cmdLine.getStringVal("insertSeq"),
+                oriFile = cmdLine.getStringVal("oriRep");
+
+        String assembler = cmdLine.getStringVal("assembler").toLowerCase();
+        //TODO: driver for ABySS
+        if(assembler.equals("abyss")){
+            ScaffoldGraph.assembler=0b01;
+        }
+
+        if (format.startsWith("fastq") ||
+                format.startsWith("fasta") ||
+                format.startsWith("fq") ||
+                format.startsWith("fa")){
+            // Raw reads: this tool has to run bwa itself, so make sure it is usable.
+            try{
+                String version = detectBwaVersion(bwaExe);
+
+                if (version.length() == 0){
+                    System.err.println(bwaExe + " is not the right path to bwa. bwa is required");
+                    System.exit(1);
+                }else{
+                    System.out.println("bwa version: " + version);
+                    // Numeric comparison: lexicographically "0.7.9" > "0.7.11".
+                    if (compareVersions(version, MIN_BWA_VERSION) < 0){
+                        System.err.println(" Require bwa of 0.7.11 or above");
+                        System.exit(1);
+                    }
+                }
+
+                //run indexing
+                if(cmdLine.getBooleanVal("index")){
+                    // Let bwa write to our own console: a pipe nobody reads
+                    // would fill up and block the child process.
+                    Process indexProcess = new ProcessBuilder(bwaExe,"index",sequenceFile).inheritIO().start();
+                    if (indexProcess.waitFor() != 0){
+                        System.err.println("bwa index failed on " + sequenceFile);
+                        System.exit(1);
+                    }
+                }
+            }catch (IOException e){
+                System.err.println(e.getMessage());
+                System.exit(1);
+            }
+
+        }else if (format.startsWith("sam") || format.startsWith("bam")){
+            // no problem
+        }else{
+            System.err.println("I dont understand format " + format);
+            System.exit(1);
+        }
+
+        // Locate the assembly graph; graphFolder is reset to null whenever the
+        // expected files cannot be found, which disables its use below.
+        File graphFile = null,
+                pathFile = null;
+
+        if(graphFolder !=null){
+            if(ScaffoldGraph.assembler==0b00){
+                graphFile = new File(graphFolder, "assembly_graph.fastg");
+                pathFile = new File(graphFolder, "contigs.paths");
+                if(graphFile.exists() && pathFile.exists()){
+                    Logging.info("===> Use assembly graph from short-read assembler: SPAdes!");
+                }else{
+                    Logging.warn("Not found any legal assembly output folder, assembly graph thus not included!");
+                    graphFolder=null;
+                }
+
+            }else if (ScaffoldGraph.assembler==0b01){
+                File f = new File(graphFolder);
+                // listFiles returns null if f is not a readable directory
+                File[] matchingFiles = f.listFiles(new FilenameFilter() {
+                    public boolean accept(File dir, String name) {
+                        return name.endsWith("contigs.dot");
+                    }
+                });
+                if(matchingFiles == null || matchingFiles.length != 1){
+                    Logging.error("Failed to looking for an unique *-contigs.dot file in " + graphFolder + " . Proceeding without assembly graph...");
+                    graphFolder=null;
+                } else{
+                    graphFile=matchingFiles[0];
+                    Logging.info("===> Use assembly graph from short-read assembler ABySS: " + graphFile);
+                }
+
+            }
+
+        }
+        else{
+            Logging.warn("Not found any legal assembly output folder, assembly graph thus not included!");
+        }
+
+        int minContig = cmdLine.getIntVal("minContig"),
+                minSupport = cmdLine.getIntVal("support"),
+                maxRepeat = cmdLine.getIntVal("maxRepeat");
+        if(minContig <= 0)
+            Logging.exit("Minimum contig length has to be positive", 1);
+        if(minSupport <= 0)
+            Logging.exit("Minimum supporting reads has to be positive", 1);
+        if(maxRepeat <= 0)
+            Logging.exit("Maximal possible repeat length has to be positive", 1);
+
+        ScaffoldGraph.minContigLength = minContig;
+        ScaffoldGraph.minSupportReads = minSupport;
+        // The user value is only allowed to raise the built-in floor.
+        ScaffoldGraph.maxRepeatLength = ScaffoldGraph.eukaryotic?Math.max(maxRepeat,10000):Math.max(maxRepeat, 7500);
+
+        double cov = cmdLine.getDoubleVal("cov");
+        int qual = cmdLine.getIntVal("qual");
+        if(qual < 0)
+            Logging.exit("Phred score of quality must not be negative", 1);
+
+        int number = cmdLine.getIntVal("read"),
+                time = cmdLine.getIntVal("time");
+
+        if(number <= 0)
+            Logging.exit("Number of reads has to be positive", 1);
+        if(time < 0)
+            Logging.exit("Sleeping time must not be negative", 1);
+        /**********************************************************************/
+
+        ScaffoldGraph graph;
+        boolean rt = cmdLine.getBooleanVal("realtime");
+        ContigBridge.relaxFilling();
+        if(rt){
+            RealtimeScaffolding rtScaffolding = new RealtimeScaffolding(sequenceFile, genesFile, resistFile, isFile, oriFile, "-");
+
+            graph = rtScaffolding.graph;
+            if(prefix != null)
+                graph.prefix = prefix;
+            if(graphFolder!=null){
+                synchronized(graph){
+                    if(ScaffoldGraph.assembler==0b00)
+                        graph.readMore(graphFile.getAbsolutePath(),pathFile.getAbsolutePath());
+                    else if(ScaffoldGraph.assembler==0b01)
+                        graph.readMore(graphFile.getAbsolutePath(),"");
+
+                }
+            }
+            if (cov <=0)
+                cov = ScaffoldGraph.estimatedCov;
+
+            rtScaffolding.scaffolding(input, number, time, cov/1.6, qual, format, bwaExe, bwaThread, sequenceFile);
+
+        }
+        else{
+            graph = new ScaffoldGraphDFS(sequenceFile, genesFile, resistFile, isFile, oriFile);
+            if(graphFolder!=null){
+                if(ScaffoldGraph.assembler==0b00)
+                    graph.readMore(graphFile.getAbsolutePath(),pathFile.getAbsolutePath());
+                else if(ScaffoldGraph.assembler==0b01)
+                    graph.readMore(graphFile.getAbsolutePath(),"");
+            }
+
+            if (cov <=0)
+                cov = ScaffoldGraph.estimatedCov;
+
+            graph.makeConnections(input, cov / 1.6, qual, format, bwaExe, bwaThread, sequenceFile);
+
+            graph.connectBridges();
+            if(prefix != null)
+                graph.prefix = prefix;
+
+            ContigBridge.forceFilling();
+            graph.printSequences(true,true);
+        }
+
+    }
+}
+
+/*RST*
+---------------------------------------------------------------------------------------
+ *npScaffolder*: real-time scaffolder using SPAdes contigs and Nanopore sequencing reads
+---------------------------------------------------------------------------------------
+
+ *npScaffolder* (jsa.np.npscarf) is a program that connects contigs from a draft genome
+to generate sequences that are closer to finished. These pipelines can run on a single laptop
+for microbial datasets. In real-time mode, it can be integrated with simple structural
+analyses such as gene ordering and plasmid forming.
+
+npScaffolder is included in the Japsa package.
+
+
+
+~~~~~~~~~~~~~~
+Usage examples
+~~~~~~~~~~~~~~
+
+A summary of *npScarf* usage can be obtained by invoking the --help option::
+
+    jsa.np.npscarf --help
+
+Input
+=====
+ *npScarf* takes two files as required input::
+
+    jsa.np.npscarf -seq <*draft*> -input <*nanopore*>
+
+<*draft*> input is the FASTA file containing the pre-assemblies.
We support outputs
+from running SPAdes (for small genomes, e.g. microbial) or ABySS (for large eukaryotic
+genomes) on Illumina MiSeq paired-end reads.
+The assembler used to generate the pre-assemblies can be specified using the option::
+
+    --assembler <*assembler*>
+
+The default <*assembler*> is SPAdes.
+Input from other, unsupported short-read assemblers can be used at your own risk.
+
+<*nanopore*> is either the long reads in a FASTA/FASTQ file or the SAM/BAM formatted alignments
+of those reads to the <*draft*> file. We use BWA-MEM as the recommended aligner
+with the fixed parameter set as follows::
+
+    bwa mem -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y <*draft*> <*nanopore*> > <*bam*>
+
+The input file format is specified by the option --format. The default is FASTA/FASTQ, in which case
+the path to BWA version 0.7.11 or newer is required. If SAM/BAM is provided as input instead,
+no aligner is needed.
+
+    Note: Remember to always *INDEX* the reference before running BWA::
+
+        bwa index <*draft*>
+
+    Skipping this step breaks the whole pipeline.
+
+Output
+=======
+ *npScarf* output is specified by the *-prefix* option. The default prefix is \'out\'.
+Normally the tool generates two files: *prefix*.fin.fasta and *prefix*.fin.japsa, which
+contain the resulting contigs in FASTA and JAPSA format.
+
+In realtime mode, if any annotation analysis is enabled, a file named
+ *prefix*.anno.japsa is generated instead. This file contains the features detected after
+scaffolding.
+
+Real-time scaffolding
+=====================
+To run *npScarf* in streaming mode::
+
+    jsa.np.npscarf -realtime [options]
+
+In this mode, the <*bam*> file will be processed block by block. The block size
+(number of BAM/SAM records) can be controlled through the options *-read* and *-time*.
+
+Streaming mode is intended for the case where the input <*nanopore*> data arrives as a stream.
npReader is the module that provides such data from fast5 files returned from the real-time
+base-calling cloud service Metrichor. One can run::
+
+    jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \
+    jsa.np.npscarf --realtime -bwaExe=<path/to/bwa> -bwaThread=10 -input - -seq <*draft*> > log.out 2>&1
+
+For the same purpose, you can also invoke BWA-MEM explicitly as in the old version of *npScarf*.
+In this case, the option --format=SAM must be given as follows::
+
+    jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \
+    bwa mem -t 10 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y -K 3000 <*draft*> - 2> /dev/null | \
+    jsa.np.npscarf --realtime -input - -format=SAM -seq <*draft*> > log.out 2>&1
+
+or if you have the whole set of Nanopore long reads already and want to emulate the
+streaming mode::
+
+    jsa.np.timeEmulate -s 100 -i <*nanopore*> -output - | \
+    jsa.np.npscarf --realtime -bwaExe=<path/to/bwa> -bwaThread=10 -input - -seq <*draft*> > log.out 2>&1
+
+Note that jsa.np.timeEmulate relies on the field *timestamp* located in the read name line to
+decide the order of streaming data. So if your input <*nanopore*> already contains the field,
+you have to sort it::
+
+    jsa.seq.sort -i <*nanopore*> -o <*nanopore-sorted*> -sortKey=timestamp
+
+or if your file does not have the *timestamp* data yet, you can add it manually. For example::
+
+    cat <*nanopore*> | \
+    awk 'BEGIN{time=0.0}NR%4==1{printf "%s timestamp=%.2f\n", $0, time; time++}NR%4!=1{print}' \
+    > <*nanopore-with-time*>
+
+Real-time annotation
+====================
+The tool includes a use case for streaming annotation.
One can provide a database of antibiotic
+resistance genes and/or origins of replication in FASTA format for the analysis of gene ordering
+and/or plasmid identification, respectively::
+
+    jsa.np.timeEmulate -s 100 -i <*nanopore*> -output - | \
+    jsa.np.npscarf --realtime -bwaExe=<path/to/bwa> -input - -seq <*draft*> -resistGene <*resistDB*> -oriRep <*origDB*> > log.out 2>&1
+
+Assembly graph
+==============
+ *npScarf* can read the assembly graph info from SPAdes/ABySS to make the results more precise.
+The results might deviate slightly from the old version in terms of the number of final contigs::
+
+    jsa.np.npscarf --graphDir=<output_directory>
+
+where output_directory indicates the result folder of SPAdes/ABySS, containing files such as contigs.fasta,
+contigs.paths, assembly_graph.fastg in case of SPAdes and *-contigs.fa, *-contigs.dot if from ABySS.
+ *RST*/
diff --git a/src/dev/java/japsadev/tools/GetFlankBlast.java b/src/dev/java/japsadev/tools/GetFlankBlast.java
new file mode 100644
index 0000000..d07acce
--- /dev/null
+++ b/src/dev/java/japsadev/tools/GetFlankBlast.java
@@ -0,0 +1,220 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission.
*
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/* Revision History
+ * 25/09/2015 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsadev.tools;
+
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceOutputStream;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extracts, for every query of a tabular blastn report, the best-scoring hit
+ * together with 100 bases of flanking sequence on each side.
+ *
+ * @author minhduc
+ *
+ */
+@Deployable(
+    scriptName = "jsa.dev.flankFromBlastn",
+    scriptDesc = "Get sequencing with flanking sequences from blastn results"
+    )
+public class GetFlankBlast extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(GetFlankBlast.class);
+
+    // Parse result from /sw/blast/current/bin/blastn -db /DataOnline/Data/Bacterial_Genome/Eskape/ftp.ncbi.nlm.nih.gov/blast/db/refseq_genomic -query F0.fasta -num_threads 16 -out F0_refseq.blastn -outfmt
+    // '7 qseqid qlen qstart qend sseqid slen sstart send length frames pident nident gaps mismatch score bitscore'
+
+    /** Number of bases added on each side of a hit. */
+    private static final int FLANK = 100;
+
+    /** Index of the last column read from a result line (score). */
+    private static final int SCORE_COL = 14;
+
+    /** Registers the command-line options. */
+    public GetFlankBlast(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("input", null, "Name of the genome file",true);
+        addString("output", "-", "Name of the output file");
+        addString("source", null, "Source sequences",true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Stores the chosen hit of a query, remembers which subject sequence has
+     * to be loaded for it, and reports it on standard output.
+     */
+    private static void record(Location loc, ArrayList<Location> locs, HashSet<String> seqSet){
+        locs.add(loc);
+        seqSet.add(loc.seqName);
+        System.out.println(loc.geneName + " " + loc.seqName + ":" + loc.start + "-" + loc.end + (loc.rev?" -rev":""));
+    }
+
+    /**
+     * Reads the blastn report, loads the subject sequences that were hit and
+     * writes every selected region (with flanks) in FASTA format.
+     *
+     * @param args command-line arguments
+     * @throws IOException if an input cannot be read or the output cannot be written
+     * @throws InterruptedException declared for compatibility with existing callers
+     */
+    public static void main(String [] args) throws IOException, InterruptedException{
+        GetFlankBlast cmdLine = new GetFlankBlast ();
+        args = cmdLine.stdParseLine(args);
+
+        String input = cmdLine.getStringVal("input");
+        String output = cmdLine.getStringVal("output");
+        String source = cmdLine.getStringVal("source");
+
+        // minimum identity and minimum fraction of the query covered by a hit
+        double ratio = .85;
+
+        ArrayList<Location> locs = new ArrayList<Location>();
+        HashSet<String> seqSet = new HashSet<String>();
+
+        LOG.info("Read information");
+        // try-with-resources: the reader is closed even if a line fails to parse
+        try (BufferedReader br = new BufferedReader(new FileReader(input))) {
+            String line;
+            String currentGene = "";
+            double score = 0;
+            Location best = null;// best hit so far of currentGene
+
+            while ((line = br.readLine()) != null) {
+                if (line.startsWith("#"))
+                    continue;
+                String [] toks = line.trim().split("\t");
+                // blank or truncated line: not a hit record
+                if (toks.length <= SCORE_COL)
+                    continue;
+
+                //Start a new gene
+                if (!currentGene.equals(toks[0])){
+                    if (best != null)
+                        record(best, locs, seqSet);
+                    best = null;
+                    score = 0;
+                    currentGene = toks[0];
+                }
+
+                int myScore = Integer.parseInt(toks[SCORE_COL]);
+                if (myScore <= score)
+                    continue;
+
+                double length = Double.parseDouble(toks[8]);
+                double qlen = Double.parseDouble(toks[1]);
+                double slen = Double.parseDouble(toks[5]);
+                double identity = Double.parseDouble(toks[10]);
+                if (identity < ratio * 100)
+                    continue;
+                if (length/qlen < ratio)
+                    continue;
+
+                int sstart = Integer.parseInt(toks[6]);
+                int send = Integer.parseInt(toks[7]);
+
+                // both flanks must fit inside the subject sequence
+                if (sstart <= FLANK || send <= FLANK)
+                    continue;
+
+                if (slen - sstart < FLANK || slen - send < FLANK)
+                    continue;
+
+                score = myScore;
+                best = new Location();
+                best.geneName = currentGene;
+                best.seqName = toks[4];
+                // sstart > send means the hit is on the reverse strand
+                best.rev = sstart >= send;
+                best.start = Math.min(sstart, send) - FLANK;
+                best.end = Math.max(sstart, send) + FLANK;
+            }
+            if (best != null)
+                record(best, locs, seqSet);
+        }
+
+        LOG.info("Read source");
+        HashMap<String, Sequence> seqMap = new HashMap<String, Sequence>();
+        SequenceReader reader = SequenceReader.getReader(source);
+        Sequence seq;
+        while ( (seq = reader.nextSequence(Alphabet.DNA())) != null){
+            if (seqSet.contains(seq.getName())){
+                if (seqMap.put(seq.getName(), seq) != null){
+                    LOG.warn("Sequence " + seq.getName() + " duplicated");
+                }
+            }
+        }
+        reader.close();
+        LOG.info("Extract");
+        SequenceOutputStream out = SequenceOutputStream.makeOutputStream(output);
+        //////////////////////////////////////////////////////
+        for (Location loc:locs){
+            Sequence subject = seqMap.get(loc.seqName);
+            if (subject == null){
+                // the hit refers to a sequence that is not in the source file
+                LOG.warn("Sequence " + loc.seqName + " not found in " + source + ", skip " + loc.geneName);
+                continue;
+            }
+            seq = subject.subSequence(loc.start - 1, loc.end);
+            if (loc.rev){
+                seq = Alphabet.DNA.complement(seq);
+                seq.setDesc(loc.seqName + ":" + loc.start + "-" + loc.end + " -");
+            }else{
+                seq.setDesc(loc.seqName + ":" + loc.start + "-" + loc.end);
+            }
+
+            seq.setName(loc.geneName);
+            seq.writeFasta(out);
+        }
+
+        out.close();
+    }
+
+    /** Region of a subject sequence selected for one query (gene). */
+    static class Location{
+        String geneName;
+        String seqName;
+        int start;// 1-based start, flank included
+        int end;// 1-based end, flank included
+        boolean rev;// true if the hit is on the reverse strand
+    }
+}
+
diff --git a/src/dev/java/japsadev/tools/NewScarfCmd.java b/src/dev/java/japsadev/tools/NewScarfCmd.java
new file mode 100644
index 0000000..b7ece8a
--- /dev/null
+++ b/src/dev/java/japsadev/tools/NewScarfCmd.java
@@ -0,0 +1,105 @@
+package japsadev.tools;
+import java.io.IOException;
+
+import org.graphstream.graph.Edge;
+import org.graphstream.graph.Node;
+import org.graphstream.ui.view.Viewer;
+
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import japsadev.bio.hts.newscarf.Alignment;
+import japsadev.bio.hts.newscarf.BidirectedGraph;
+import japsadev.bio.hts.newscarf.HybridAssembler;
+
+/**
+ * Command-line front end that displays an assembly graph in a GUI window and
+ * runs the hybrid assembler on it using long-read alignments.
+ */
+@Deployable(
+    scriptName = "jsa.dev.newScarf",
+    scriptDesc = "New npscarf"
+    )
+public class NewScarfCmd extends CommandLine{
+    /** Registers the command-line options. */
+    public NewScarfCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("fastg", null, "Assembly graph fastg file",true);
+        addString("sam", null, "Sam file alignment of assembly graph to long reads",true);
+        addInt("qual", 30, "Minimum quality of alignment to considered");
+        addString("path", null, "SPAdes contigs path file");
+        addString("title", "Scaffolding using assembly graph", "Title of GUI window");
+
+        addStdHelp();
+    }
+
+    /**
+     * Shows the graph, labels its nodes, runs the assembly and waits for the
+     * user to press Enter before freezing the layout.
+     *
+     * @param args command-line arguments
+     * @throws IOException if the assembly graph cannot be read
+     */
+    public static void main(String[] args) throws IOException{
+        CommandLine cmdLine = new NewScarfCmd ();
+        args = cmdLine.stdParseLine(args);
+
+        Alignment.MIN_QUAL = cmdLine.getIntVal("qual");
+        String fastgFile = cmdLine.getStringVal("fastg");
+        String samFile = cmdLine.getStringVal("sam");
+        String pathFile = cmdLine.getStringVal("path");
+        String name = cmdLine.getStringVal("title");
+
+        String styleSheet =
+                "node {" +
+                " fill-color: black; z-index: 0;" +
+                "}" +
+                "edge {" +
+                " text-alignment: along;" +
+                "}" +
+                "node.marked {" +
+                " fill-color: red;" +
+                "}" +
+                "edge.marked {" +
+                " fill-color: red;" +
+                "}";
+        // the viewer needs a display, so make sure AWT is not headless
+        System.setProperty("java.awt.headless", "false");
+        HybridAssembler hbAss = new HybridAssembler(fastgFile);
+        //For SAM file, run bwa first on the edited assembly_graph.fastg by running:
+        //awk -F '[:;]' -v q=\' 'BEGIN{flag=0;}/^>/{if(index($1,q)!=0) flag=0; else flag=1;}{if(flag==1) print $1;}' ../EcK12S-careful/assembly_graph.fastg > Eck12-careful.fasta
+        //TODO: need to make this easier
+        BidirectedGraph graph= hbAss.simGraph;
+
+        //graph.addAttribute("ui.quality");
+        //graph.addAttribute("ui.antialias");
+        graph.addAttribute("ui.stylesheet", styleSheet);
+        graph.addAttribute("ui.default.title", name);
+
+        Viewer viewer = graph.display();
+        // Let the layout work ...
+
+        System.out.println("Node: " + graph.getNodeCount() + " Edge: " + graph.getEdgeCount());
+
+
+        for (Node node : graph) {
+            node.addAttribute("ui.label", node.getId());
+            node.setAttribute("ui.style", "text-offset: -10;");
+            node.addAttribute("layout.weight", 10);
+
+            // unique nodes are drawn with the "marked" (red) style
+            if(BidirectedGraph.isUnique(node))
+                node.setAttribute("ui.class", "marked");
+        }
+
+
+        // An I/O failure is only reported; the window stays open so the
+        // graph loaded so far can still be inspected.
+        try {
+            if(pathFile!=null)
+                hbAss.reduceFromSPAdesPaths(pathFile);
+            hbAss.assembly(samFile);
+        } catch (IOException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
+
+
+//        for (Edge edge: graph.getEdgeSet()){
+//            if(edge.hasAttribute("isReducedEdge"))
+//                edge.addAttribute("layout.weight", 10);
+//        }
+
+
+        HybridAssembler.promptEnterKey();
+        viewer.disableAutoLayout();
+        System.out.println("Node: " + graph.getNodeCount() + " Edge: " + graph.getEdgeCount());
+    }
+}
diff --git a/src/dev/java/japsadev/tools/ProfileDPCmd.java b/src/dev/java/japsadev/tools/ProfileDPCmd.java
new file mode 100644
index 0000000..64adbe3
--- /dev/null
+++ b/src/dev/java/japsadev/tools/ProfileDPCmd.java
@@ -0,0 +1,194 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission.
*
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Revision History
+ * 5 Sep 2016 - Minh Duc Cao: Started
+ *
+ ****************************************************************************/
+package japsadev.tools;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Random;
+
+import japsa.bio.alignment.ProfileDP;
+import japsa.bio.alignment.ProfileDP.EmissionState;
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+/**
+ * Small experiment around {@link ProfileDP}: a random sequence is generated,
+ * noisy copies of it are emitted with the given insertion, deletion and
+ * mutation probabilities, and a second profile then re-estimates those
+ * probabilities from the alignments of the copies.
+ *
+ * @author Minh Duc Cao
+ *
+ */
+@Deployable(
+    scriptName = "jsa.dev.profileDP",
+    scriptDesc = "Using a 1-state machine for alignment"
+    )
+
+public class ProfileDPCmd extends CommandLine{
+    /** Registers the command-line options. */
+    public ProfileDPCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addInt("length", 20, "Length");
+        addDouble("iProb", 0.10, "Probability of insertion");
+        addDouble("dProb", 0.10, "Probability of deletion");
+        addDouble("mProb", 0.10, "Probability of mutation");
+
+
+        addStdHelp();
+    }
+
+    /**
+     * Runs the simulation and prints, for each of five re-estimation rounds,
+     * the event counts and the probabilities derived from them.
+     *
+     * @param args command-line arguments
+     * @throws IOException declared for compatibility with existing callers
+     */
+    public static void main(String[] args) throws IOException {
+        /*********************** Setting up script ****************************/
+        CommandLine cmdLine = new ProfileDPCmd();
+        args = cmdLine.stdParseLine(args);
+        /**********************************************************************/
+        int length = cmdLine.getIntVal("length");
+
+        double iProb = cmdLine.getDoubleVal("iProb");
+        double dProb = cmdLine.getDoubleVal("dProb");
+        double mProb = cmdLine.getDoubleVal("mProb");
+
+        Alphabet dna = Alphabet.DNA4();
+        // fixed seed: every run produces the same sequences
+        Random rnd = new Random(1);
+        Sequence seq = Sequence.random(dna, length, new double[]{.25,.25,.25,.25}, rnd);
+
+        // profile used to generate the noisy copies
+        ProfileDP genDp = new ProfileDP(seq, -1, seq.length() *2);
+
+        genDp.setTransitionProbability(1 - iProb - dProb, iProb, dProb);
+        genDp.setMatchProbability(1 - mProb);
+
+        // profile whose parameters are re-estimated; it starts from the defaults
+        ProfileDP dp = new ProfileDP(seq, -1, seq.length() *2);
+
+        System.out.println("Length = " + length + " Ins = " + iProb + " Del = " + dProb + " Mis = " + mProb);
+        System.out.printf("%8.4f %8.4f %8.4f\n",dp.getMatCost(),dp.getInsCost(),dp.getDelCost());
+        System.out.printf("%8.4f %8.4f\n",dp.getMatchCost(),dp.getMisMatchCost());
+
+        int numSeq = 20;
+        ArrayList<Sequence> seqs = new ArrayList<Sequence>(numSeq);
+        for (int i = 0; i < numSeq; i++ ){
+            System.out.printf("%3d ",i);
+            Sequence genSeq = genDp.generate(1, rnd);
+            seqs.add(genSeq);
+        }
+
+        for (int x = 0; x < 5;x++){
+            int countIns = 0, countDel = 0, countMG = 0, countMB = 0;
+            for (int i = 0; i < numSeq; i++ ){
+                System.out.printf("%3d ",i);
+                EmissionState retState = dp.align(seqs.get(i));
+                countIns += retState.getCountIns();
+                countDel += retState.getCountDel();
+                countMG += retState.getCountMG();
+                countMB += retState.getCountMB();
+            }
+
+            // add-one smoothing keeps every estimated probability above zero
+            double sum = 3.0 + countMG + countMB + countIns + countDel;
+            double insP = (countIns + 1.0) /sum;
+            double delP = (countDel + 1.0) /sum;
+            double matP = (countMG + countMB + 1.0) /sum;
+            double matchP = (countMG + 1.0) / (countMG + countMB + 2.0);
+            double misMatchP = 1 - matchP;
+            System.out.printf("Total: %3d %3d %3d %3d %8.4f %8.4f %8.4f\n", countMG, countMB, countIns, countDel, insP, delP, misMatchP);
+            dp.setTransitionProbability(matP, insP, delP);
+            dp.setMatchProbability(matchP);
+        }
+        /***************************************************************/
+
+
+    }
+}
+/*RST*
+
+
+
+
+
+
+
+
+
+
+
+
+*RST*/
\ No newline at end of file
diff --git a/src/dev/java/japsadev/tools/ResGeneGenomesCmd.java b/src/dev/java/japsadev/tools/ResGeneGenomesCmd.java
new file mode 100644
index 0000000..7be61b2
--- /dev/null
+++ b/src/dev/java/japsadev/tools/ResGeneGenomesCmd.java
@@ -0,0 +1,225 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 07/09/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import japsa.bio.amra.ResistanceGeneDB; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.tools.bio.amra.Genomes2ResistanceGeneCmd; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.refseq2res", + scriptDesc = "Extract resistance classes from sequences" + ) +public class ResGeneGenomesCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(ResGeneGenomesCmd.class); + + //CommandLine cmdLine; + public ResGeneGenomesCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + 
addString("db", "-", "name of db file"); + addString("resDB", null, "Name of the resistance gene database",true); + addString("output", "-", "name of output file"); + + addDouble("identity", 0.9, "Minimum identity"); + addDouble("coverage", 0.8, "Minimum coverage of gene"); + + addInt("thread",1, "Number of threads"); + + addStdHelp(); + } + + /** + * @param args + * @throws InterruptedException + * @throws Exception + * @throws OutOfMemoryError + */ + public static void main(String[] args) + throws IOException, InterruptedException{ + + CommandLine cmdLine = new ResGeneGenomesCmd(); + args = cmdLine.stdParseLine(args); + + String db = cmdLine.getStringVal("db"); + String resDBPath = cmdLine.getStringVal("resDB"); + + String output = cmdLine.getStringVal("output"); + + gIdentity = cmdLine.getDoubleVal("identity"); + gCoverage = cmdLine.getDoubleVal("coverage"); + + int thread = cmdLine.getIntVal("thread"); + + + gResDB = new ResistanceGeneDB(resDBPath); + gSos = SequenceOutputStream.makeOutputStream(output); + + gSos.print("#strainID\tstrainName\tstrainType\tclasses\n"); + + if (!processDB(db, thread)){ + LOG.error("Job queue for too long"); + } + gSos.close(); + } + + static double gIdentity, gCoverage; + static SequenceOutputStream gSos; + static ResistanceGeneDB gResDB; + + private static boolean processDB(String dbFile, int threadNumber) throws IOException, InterruptedException{ + BufferedReader bf = SequenceReader.openFile(dbFile); + String line = ""; + + ExecutorService executor = Executors.newFixedThreadPool(threadNumber); + + while ( (line = bf.readLine())!=null){ + if (line.startsWith("#")) + continue; + + String [] toks = line.trim().split("\t"); + String strainID = toks[4]; + + String fnaFile = toks[5]; + + double n50 = Double.parseDouble(toks[7]); + + if (n50 < 100000){ + LOG.info(strainID + " Ignored because of low n50 " + n50); + continue;//while + } + + if (toks.length < 14){ + LOG.info(strainID + " Ignored because of malform " + toks.length); + 
continue;//while + } + + if (!toks[13].equals("0")){ + LOG.info(strainID + " Ignored because of not good ST " + toks[13]); + continue;//while + } + + String ST = toks[12]; + + String organismName = toks[2]; + if (!organismName.equals(toks[1] + " " + toks[3])){ + organismName += " " + toks[3]; + } + //organismName += "_ST" + ST; + + //Remove all the weird chars + String strainName = organismName.replaceAll(" ", "_"); + strainName = strainName.replaceAll("/", "_"); + strainName = strainName.replaceAll("'", "_"); + strainName = strainName.replaceAll("\"", "_"); + strainName = strainName.replaceAll(";", "_"); + strainName = strainName.replaceAll(":", "_"); + strainName = strainName.replaceAll("__*", "_");//Make sure no double hyphen + /*****************************************************/ + executor.execute(new ResGenome (strainID, fnaFile, ST, strainName)); + } + bf.close(); + executor.shutdown(); + boolean finished = executor.awaitTermination(7, TimeUnit.DAYS); + return finished; + } + + static class ResGenome implements Runnable{ + + String strainID; + String strainName; + String strainType; + String fnaFile; + + ResGenome(String strainID, String fnaFile, String strainType, String organismName){ + //mapClasses = shareMap; + this.strainID = strainID; + this.fnaFile = fnaFile; + this.strainName = organismName; + this.strainType = strainType; + + } + + /* (non-Javadoc) + * @see java.lang.Runnable#run() + */ + @Override + public void run() { + try{ + ArrayList seqs = SequenceReader.readAll(fnaFile, Alphabet.DNA()); + HashSet dgClasses = Genomes2ResistanceGeneCmd.blastn(seqs, gResDB, gIdentity, gCoverage); + + synchronized(gSos){ + gSos.print(strainID + "\t" + strainName + "\t" + strainType + "\t"); + for (String c:dgClasses){ + gSos.print(c + ", "); + } + gSos.println(); + } + + }catch(IOException e){ + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + } + +} + diff --git a/src/dev/java/japsadev/tools/SampleCmd.java 
b/src/dev/java/japsadev/tools/SampleCmd.java new file mode 100644 index 0000000..70cea5e --- /dev/null +++ b/src/dev/java/japsadev/tools/SampleCmd.java @@ -0,0 +1,106 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 11/01/2012 - Minh Duc Cao: Revised + * 01/01/2013 - Minh Duc Cao, revised + ****************************************************************************/ + +package japsadev.tools; + +import java.io.BufferedReader; +import java.io.IOException; + +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + + +/** + * @author Minh Duc Cao + * + */ +@Deployable( + scriptName = "jsa.dev.sample", + scriptDesc = "Sample script description" + ) +public class SampleCmd extends CommandLine{ + public SampleCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addStdInputFile(); + //addBoolean("reverse",false,"Reverse sort order"); + addStdHelp(); + } + + public static void main(String[] args) throws IOException { + + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new SampleCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String input = cmdLine.getStringVal("input"); + + BufferedReader reader = SequenceReader.openFile(input); + String line = ""; + while ((line = reader.readLine())!=null){ + + if (line.startsWith("sbatch")){ + String [] toks = line.split(" "); + System.out.println(toks[4]); + }else if (line.startsWith("chr")){ + line = line.trim(); + String [] toks = line.split(":"); + System.out.print(line + "\t" + toks[2]); + toks = toks[1].split("-"); + System.out.println(Double.parseDouble(toks[1]) - Double.parseDouble(toks[0])); + } + } + reader.close(); + } +} +/*RST* + + + + + + + + + + + + +*RST*/ + diff --git a/src/dev/java/japsadev/tools/SelectReadsCmd.java b/src/dev/java/japsadev/tools/SelectReadsCmd.java new file mode 100644 index 0000000..331d143 --- /dev/null +++ 
b/src/dev/java/japsadev/tools/SelectReadsCmd.java @@ -0,0 +1,156 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 05/01/2017 - Minh Duc Cao: Revised + ****************************************************************************/ + +package japsadev.tools; + +import java.io.File; +import java.io.IOException; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.HTSUtilities; +import japsa.util.deploy.Deployable; + + +/** + * @author Minh Duc Cao + * + */ +@Deployable( + scriptName = "jsa.dev.selectReadsMapToPosition", + scriptDesc = "Select reads spanning repeats" + ) +public class SelectReadsCmd extends CommandLine{ + public SelectReadsCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input", null, "Name of the input file, - for standard input", true); + addString("regions", null, "The regions to extract format chr1:s1-e1,chr2:s2-e2 no spaces", true); + + addStdHelp(); + } + + public static void main(String[] args) throws IOException { + + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new SelectReadsCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String input = cmdLine.getStringVal("input"); + String regions = cmdLine.getStringVal("regions"); + + + String [] regionArray = regions.split(","); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(input)); + + + for (String region:regionArray){ + + String[] toks = 
region.split(":"); + + + if (toks.length < 1){ + System.err.println("region need to be in format chrX:start-end"); + System.exit(1); + } + String chrom = toks[0]; + toks = toks[1].split("-"); + if (toks.length < 1){ + System.err.println("region need to be in format chrX:start-end"); + System.exit(1); + } + int start = Integer.parseInt(toks[0]); + int end = Integer.parseInt(toks[1]); + + SequenceOutputStream outFile = SequenceOutputStream.makeOutputStream(chrom + start + "_" + end + ".fasta"); + + System.out.println(region + ":" + (end - start) + ":"); + SAMRecordIterator iter = reader.query(chrom, start, end,false); + while (iter.hasNext()){ + SAMRecord record = iter.next(); + if (record.getReadString().length() < 10){ + System.out.println("== " + record.getReadName()); + continue;//while + } + int [] refPositions = {start, end}; + int [] pos = HTSUtilities.positionsInRead(record, refPositions); + if (pos[0] == 0 || pos[1] == 0) + continue; + + String readSub = record.getReadString().substring(pos[0],pos[1]-1); + Sequence rs = new Sequence(Alphabet.DNA16(), readSub, record.getReadName()); + + rs.writeFasta(outFile); + //System.out.printf("%5d %s %s %s\n", readSub.length(),readSub.substring(0, 22),readSub.substring(readSub.length() - 24), readSub); + + } + iter.close(); + outFile.close(); + } + + reader.close(); + + } + +} + + + +/*RST* + + + + + + + + + + + + +*RST*/ + diff --git a/src/dev/java/japsadev/tools/StructuralVariationCmd.java b/src/dev/java/japsadev/tools/StructuralVariationCmd.java new file mode 100644 index 0000000..2dccba4 --- /dev/null +++ b/src/dev/java/japsadev/tools/StructuralVariationCmd.java @@ -0,0 +1,209 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 8 Jan 2017 - Minh Duc Cao: Created + ****************************************************************************/ +package japsadev.tools; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +/** + * @author minhduc + * Analysis of structural variation. The idea is to find alignments of + * different fragments on the same reads that contradict each other. + * + */ +@Deployable( + scriptName = "jsa.dev.svstream", + scriptDesc = "Structural variation detection" + ) +public class StructuralVariationCmd extends CommandLine { + public StructuralVariationCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input",null,"Sam/bam file (freshed from bwa or sorted by name)",true); + addString("reference",null,"Reference sequence",true); + addInt("quality",0,"Minimum alignment quality score"); + + addStdHelp(); + } + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new StructuralVariationCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String input = cmdLine.getStringVal("input"); + String reference = cmdLine.getStringVal("reference"); + int qual = 0; + + + 
SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = null; + if ("-".equals(input)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(input)); + + //ArrayList refs = SequenceReader.readAll(reference, Alphabet.DNA16()); + + SAMRecordIterator iter = reader.iterator(); + String readID = ""; + ArrayList samList = new ArrayList(); + long currentReadCount = 0, + currentBaseCount = 0; + + + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + if (rec.getReadUnmappedFlag() || rec.getMappingQuality() < qual){ + //not count + if (!readID.equals(rec.getReadName())){ + samList.clear(); + readID = rec.getReadName(); + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + continue; + } + + AlignmentRec myRec = new AlignmentRec(rec, rec.getReferenceName()); + + if (readID.equals(myRec.readID)) { + }else { + if (samList.size() > 1){ + System.out.println(readID); + for (AlignmentRec aRec:samList){ + System.out.printf("%8d %8d %8d %6s %10d %10d %c %d\n",aRec.readEnd - aRec.readStart, aRec.readStart, aRec.readEnd,aRec.sequenceName,aRec.refStart, aRec.refEnd, aRec.strand?'+':'-',aRec.score); + } + } + samList.clear(); + readID = myRec.readID; + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + samList.add(myRec); + + }// while + + reader.close(); + } + + public static class AlignmentRec{ + String readID; + String sequenceName; + int refStart, refEnd, readStart, readEnd; + public int readLength = 0; + public boolean strand = true;//positive + ArrayList alignmentCigars = new ArrayList(); + int score = 0; + + public AlignmentRec(SAMRecord sam, String seqName) { + // readID = Integer.parseInt(sam.getReadName().split("_")[0]); + readID = sam.getReadName(); + + sequenceName = seqName; + + //mySam = sam; + refStart = sam.getAlignmentStart(); + refEnd = sam.getAlignmentEnd(); + + Cigar cigar = sam.getCigar(); + 
boolean enterAlignment = false; + ////////////////////////////////////////////////////////////////////////////////// + + for (final CigarElement e : cigar.getCigarElements()) { + alignmentCigars.add(e); + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : //pad is a kind of clipped + if (enterAlignment) + readEnd = readLength; + readLength += length; + break; // soft clip read bases + case I : + case M : + case EQ : + case X : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + readLength += length; + break; + case D : + case N : + if (!enterAlignment){ + readStart = readLength + 1; + enterAlignment = true; + } + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//case + }//for + if (readEnd == 0) + readEnd = readLength; + + score = refEnd + 1 - refStart; + if (sam.getReadNegativeStrandFlag()){ + strand = false; + //need to convert the alignment position on read the correct direction + readStart = 1 + readLength - readStart; + readEnd = 1 + readLength - readEnd; + } + + + } + } + +} diff --git a/src/dev/java/japsadev/tools/VNTRClusteringCmd.java b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java new file mode 100644 index 0000000..4063864 --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java @@ -0,0 +1,96 @@ +/***************************************************************************** + * Copyright (c) Bhuvaneswari Thirugnanasambandham, buvan.suji@gmail.com + * All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * + ****************************************************************************/ + +package japsadev.tools; + + + + +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import japsadev.bio.hts.clustering.KmeanClustering; + + + +/** + * @author Bhuvaneswari Thirugnanasambandham + * + */ +@Deployable( + scriptName = "jsa.dev.kcluster", + scriptDesc = "Clustering reads" + ) +public class VNTRClusteringCmd extends CommandLine{ + //CommandLine cmdLine; + public VNTRClusteringCmd(){ + + + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input", null, "Input file"); + addString("output", "-", "Output file"); + + addStdHelp(); + + + + + } + public static void main(String [] args) throws Exception{ + CommandLine cmdLine = new VNTRClusteringCmd(); + args = cmdLine.stdParseLine(args); + /*String[] s = args; + System.out.println(s);*/ + + + + + /**********************************************************************/ + + String input = cmdLine.getStringVal("input"); + String output= cmdLine.getStringVal("output"); + ////YOUR CODE GOES HERE + /*System.out.println("Hello world input is " + input); + System.out.println("Testing this statement");*/ + //////////// + + KmeanClustering cluster = new KmeanClustering(); + cluster.Clustering(); + + + } +} diff --git a/src/dev/java/japsadev/tools/VNTRClusteringHmmCmd.java b/src/dev/java/japsadev/tools/VNTRClusteringHmmCmd.java new file mode 100644 index 0000000..9f8d8ac --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRClusteringHmmCmd.java @@ -0,0 +1,855 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/**************************************************************************** + * Revision History + * 15/05/2014 - Minh Duc Cao: Started + * + ****************************************************************************/ +package japsadev.tools; + +import htsjdk.samtools.*; +import japsa.bio.alignment.MultipleAlignment; +import japsa.bio.alignment.ProfileDP; +import japsa.bio.alignment.ProfileDP.EmissionState; +import japsa.bio.np.ErrorCorrection; +import japsa.bio.tr.TandemRepeat; +import japsa.bio.tr.TandemRepeatVariant; +import japsa.seq.*; +import japsa.util.*; +import japsa.util.deploy.Deployable; +import japsa.xm.expert.Expert; +import japsa.xm.expert.MarkovExpert; +import japsadev.bio.hts.clustering.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + + +/** + * VNTR typing using long reads + * + */ + +@Deployable(scriptName = "jsa.dev.clusteringhmm", scriptDesc = "Clustring in the hmm framework") +public class VNTRClusteringHmmCmd extends CommandLine { + private static final Logger LOG = LoggerFactory.getLogger(VNTRClusteringHmmCmd.class); + + public VNTRClusteringHmmCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + Option referenceOpt = + addString("reference", null, "Name of the reference genome ", true); + ///addStdInputFile(); + Option bamFileOpt = + addString("bamFile", null, "Name of the bam file", true); + + Option outputOpt = + addString("output", "-", + "Name of the output file, - for stdout"); + + Option xafFileOpt = + addString("xafFile", null, "Name of the regions file in xaf", + true); + + Option flankingOpt = + addInt("flanking", 30, "Size of the flanking regions"); + + Option minQualOpt 
= + addInt("qual", 0, "Minimum quality"); + + addInt("iteration", 1, "Number of iteration"); + addInt("nploidy",2, + "The ploidy of the genome 1 = happloid, 2 = diploid. Currenly only support up to 2-ploidy"); + addString("prefix", "", + "Prefix of temporary files, if not specified, will be automatically generated"); + + ///////////////Adding galaxy support///////////// + flankingOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + minQualOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + xafFileOpt.setGalaxySetting(new GalaxySetting("data", "tabular",false)); + + GalaxySetting galaxyOutput = new GalaxySetting("data", "text",true); + galaxyOutput.setLabel("countRead.txt"); + outputOpt.setGalaxySetting(galaxyOutput); + bamFileOpt.setGalaxySetting(new GalaxySetting("data", "bam",false)); + referenceOpt.setGalaxySetting(new GalaxySetting("data", "fasta",false)); + setGalaxy(annotation.scriptName()); + + + addStdHelp(); + } + + static Alphabet dna = Alphabet.DNA16(); + static IntArray profilePositions = new IntArray(); + static IntArray seqPositions = new IntArray(); + static DoubleArray costGeneration = new DoubleArray(); + static ByteArray byteArray = new ByteArray(); + + + public static void main(String[] args) throws Exception, + InterruptedException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new VNTRClusteringHmmCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + int flanking = cmdLine.getIntVal("flanking"); + if (flanking < 10) + flanking = 10; + + int qual = cmdLine.getIntVal("qual"); + + int np = cmdLine.getIntVal("nploidy"); + if (np > 2) { + System.err.println("The program currenly only support haploid and diployd. 
Enter nploidy of 1 or 2"); + System.exit(1); + } + + String bamFile = cmdLine.getStringVal("bamFile"); + String prefix = cmdLine.getStringVal("prefix"); + + if (prefix == null || prefix.length() == 0) { + prefix = "p" + System.currentTimeMillis(); + } + /**********************************************************************/ + + SequenceOutputStream outOS = SequenceOutputStream + .makeOutputStream(cmdLine.getStringVal("output")); + + String[] headers = TandemRepeatVariant.SIMPLE_HEADERS; + if (np > 1) { + headers = TandemRepeatVariant.SIMPLE_HEADERS2; + } + + TandemRepeatVariant.printHeader(outOS, headers); + + String strFile = cmdLine.getStringVal("xafFile"); + + LOG.info("Read genome begins"); + HashMap genome = new HashMap (); + SequenceReader seqReader = SequenceReader.getReader(cmdLine.getStringVal("reference")); + Sequence seq; + while ((seq = seqReader.nextSequence(dna)) != null){ + genome.put(seq.getName(), seq); + } + seqReader.close(); + LOG.info("Read genome done"); + + /**********************************************************************/ + XAFReader xafReader = new XAFReader(strFile); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + Expert.setAlphabet(Alphabet.DNA4()); + + ArrayList readSequences = new ArrayList(); + + //Go through the list of repeats + while (xafReader.next() != null){ + TandemRepeat str = TandemRepeat.read(xafReader); + + //start,end = the start and end of the region (including flanks) + int start = Integer.parseInt(xafReader.getField("start")) - flanking; + int end = Integer.parseInt(xafReader.getField("end")) + flanking; + String chrom = xafReader.getField("chrom"); + + if (seq == null || !seq.getName().equals(chrom)){ + seq = genome.get(chrom); + } + if (seq == null){ + xafReader.close(); + LOG.error("Chrom in line " + xafReader.lineNo() + " not found!!!"); + System.exit(1); + } + + if (end > seq.length()) + end = 
seq.length(); + + if (start < 1) + start = 1; + + int hmmFlank = flanking; + int period = str.getPeriod(); + double fraction = str.getUnitNo() - Math.floor(str.getUnitNo()); + int hmmPad = (int)(fraction * period ) ; + + //System.out.println("###" + str.getPeriod() + " " + str.getUnitNo() + " " + hmmPad); + Sequence hmmSeq = new Sequence(dna, hmmFlank * 2 + hmmPad + str.getPeriod()); + int i = 0; + + for (;i < hmmFlank + hmmPad + str.getPeriod(); i++) + hmmSeq.setBase(i, seq.getBase(str.getStart() - hmmFlank + i -1)); + + for (;i < hmmSeq.length();i++){ + byte base = seq.getBase(str.getEnd() + i - (hmmFlank + hmmPad + str.getPeriod()) );//no need to -1 + hmmSeq.setBase(i,base); + } + + ProfileDP dp = new ProfileDP(hmmSeq, hmmFlank + hmmPad, hmmFlank + hmmPad + str.getPeriod() - 1);//-1 for 0-index, inclusive + + //System.out.println("Lengths: " + hmmFlank + ", " + hmmPad + " " + str.getPeriod() + " " + hmmSeq.length() ); + //System.out.println("CHECKING BEGIN"); + + outOS.print("##"+str.getID()+"\n## "); + for (int x = 0; x < hmmSeq.length();x++){ + outOS.print(hmmSeq.charAt(x)); + if (x == hmmFlank + hmmPad -1 || x == hmmFlank + hmmPad + period - 1) + outOS.print("=="); + } + outOS.println(); + + //run on the reference + //if (1==0) + { + Sequence refRepeat = seq.subSequence(start, end); + refRepeat.setName("reference"); + processRead(refRepeat, dp, fraction, hmmFlank, hmmPad, period, outOS ); + + } + + SAMRecordIterator iter = reader.query(str.getParent(), start, end, false); + + String fileName = prefix + "_" + str.getID() + "_i.fasta"; + String tempFile = str.getID(); + + SequenceOutputStream os = SequenceOutputStream.makeOutputStream(fileName); + + // double var = 0; + TandemRepeatVariant trVar = new TandemRepeatVariant(); + trVar.setTandemRepeat(str); + + int readIndex = 0; + + //Clear the list + readSequences.clear(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + // Check qualilty + if (rec.getMappingQuality() < qual) { + continue; + } + + // 
Only reads that fully span the repeat and flankings + int currentRefPos = rec.getAlignmentStart(); + if (currentRefPos > start) + continue; + if (rec.getAlignmentEnd() < end) + continue; + + readIndex ++; + //////////////////////////////////////////////////////////////////// + //assert currentRefBase < start + + Sequence readSeq = getReadPosition(rec,start,end); + if (readSeq == null) + continue; + + + String readName = readSeq.getName(); + String [] toks = readName.split("/",4); + + String polymerageRead = (toks.length > 1) ? toks[1] : toks[0]; + String subRead = (toks.length > 2) ? toks[2] : "_"; + //String alignSubRead = (toks.length > 3) ? toks[3] : "_"; + readSeq.setName(polymerageRead + "_" + subRead); + readSeq.writeFasta(os); + + //processRead(readSeq, dp, fraction, hmmFlank, hmmPad, period, outOS ); + readSequences.add(readSeq); + }// while + iter.close(); + os.close(); + //FIXME: readSequences: an array of reads + //clustering of this array, + + // + HashMap tempReadSequences = new HashMap(); + + + for(int x = 0;x tempReads = new ArrayList(tempReadSequences.keySet()); + + + + ArrayList> clusterResult = new ArrayList>(); + + KmeanClusteringWithReads clusterObj1 = new KmeanClusteringWithReads(); + + + clusterResult = clusterObj1.Clustering(tempReads); + + ArrayList cluster1String = clusterResult.get(1); + ArrayList cluster2String = clusterResult.get(2); + + ArrayList cluster1Sequence = new ArrayList(); + ArrayList cluster2Sequence = new ArrayList(); + Sequence tempSeq1;Sequence tempSeq2; + + //seq = new Sequence(Alphabet.DNA16(), sequenceString, sequenceName) + if(cluster1String.size()>0){ + for(int x=0; x0){ + for(int x=0; x= 1) { + //speed// processBatch(cluster1Sequence, dpBatch, fraction, hmmFlank, hmmPad, period, outOS); + //speed//}else + //speed// outOS.print("##No cluster found for 1"); + + outOS.print("####Allele 2\n"); + if (cluster2Consensus != null) + processRead(cluster2Consensus, dpBatch, fraction, hmmFlank, hmmPad, period, outOS ); + else + 
outOS.print("##No consensus found for 2"); + //speed//if (cluster2Sequence.size() >= 1) { + //speed// processBatch(cluster2Sequence, dpBatch, fraction, hmmFlank, hmmPad, period, outOS); + //speed//}else + //speed// outOS.print("##No cluster found for 2"); + + //outOS.print(trVar.toString(headers)); + //outOS.print('\n'); + }// for + + reader.close(); + outOS.close(); + } + + static private void processBatch(ArrayList readBatch, ProfileDP dpBatch, double fraction, int hmmFlank, int hmmPad, int period, SequenceOutputStream outOS ) throws IOException{ + + for (int round = 0; round < 5;round ++){ + double myCost = 0; + int countIns = 0, countDel = 0, countMG = 0, countMB = 0; + for (Sequence readSeq:readBatch){ + EmissionState bestState = dpBatch.align(readSeq); + //TODO: make a filter here: select only eligible alignment + double alignScore = bestState.getScore(); + countIns += bestState.getCountIns(); + countDel += bestState.getCountDel(); + countMG += bestState.getCountMG(); + countMB += bestState.getCountMB(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + myCost += alignScore; + }//for readSeq + double sum = 3.0 + countMG + countMB + countIns + countDel; + double insP = (countIns + 1.0) /sum; + double delP = (countDel + 1.0) /sum; + double matP = (countMG + countMB + 1.0) /sum; + double matchP = (countMG + 1.0) / (countMG + countMB + 2.0); + double misMatchP = 1 - matchP; + outOS.print(String.format("Total: %3d %3d %3d %3d %8.4f %8.4f %8.4f %8.4f\n", countMG, countMB, countIns, + countDel, insP, delP, misMatchP,myCost)); + dpBatch.setTransitionProbability(matP, insP, delP); + dpBatch.setMatchProbability(matchP); + }//round + + for (Sequence readSeq:readBatch){ + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= 
JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + + EmissionState bestState = dpBatch.align(readSeq); + double alignScore = bestState.getScore(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + + EmissionState lastState = bestState; + bestState = bestState.bwdState; + + while (bestState != null){ + profilePositions.add(bestState.profilePos); + seqPositions.add(bestState.profilePos); + costGeneration.add(lastState.score - bestState.score); + + if (bestState.seqPos == lastState.seqPos) + byteArray.add((byte)Alphabet.DNA.GAP); + else + byteArray.add(readSeq.getBase(lastState.seqPos)); + + lastState = bestState; + bestState = bestState.bwdState; + } + + double costL = 0, costR = 0, costCurrentRep = 0, costRep = 0; + int stateL = 0, stateR = 0, stateCurrentRep = 0, stateRep = 0; + int baseL = 0, baseR = 0, baseCurrentRep = 0, baseRep = 0; + int bSeqL = 0, bSeqR = 0, bSeqCurrentRep = 0, bSeqRep = 0; + + int lastProfilePos = -1, lastSeqPos = -1; + + for (int x = profilePositions.size() - 1; x >=0; x--){ + outOS.print(Alphabet.DNA().int2char(byteArray.get(x))); + + int profilePos = profilePositions.get(x); + int seqPos = seqPositions.get(x); + + if (profilePos < hmmFlank + hmmPad){ + stateL ++; + costL += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseL ++; + + if (lastSeqPos != seqPos) + bSeqL ++; + + }else if(profilePos > hmmFlank + hmmPad + period){ + stateR ++; + costR += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseR ++; + + if (lastSeqPos != seqPos) + bSeqR ++; + }else{ + stateCurrentRep ++; + costCurrentRep += costGeneration.get(x); + + stateRep ++; + 
costRep += costGeneration.get(x); + + if (lastProfilePos != profilePos){ + baseRep ++; + baseCurrentRep ++; + } + + if (lastSeqPos != seqPos){ + bSeqRep ++; + bSeqCurrentRep ++; + } + + } + + //end of a repeat cycle + if (profilePos < lastProfilePos){ + outOS.print("<-----------------REP " + costCurrentRep + + " " + stateCurrentRep + + " " + (stateCurrentRep == 0?"inf": "" + (costCurrentRep/stateCurrentRep)) + + " " + baseCurrentRep + + " " + (baseCurrentRep == 0?"inf": "" + (costCurrentRep/baseCurrentRep)) + + " " + bSeqCurrentRep + + " " + (bSeqCurrentRep == 0?"inf": "" + (costCurrentRep/bSeqCurrentRep)) + ); + + if (costCurrentRep/bSeqCurrentRep > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + costCurrentRep = 0;//restart + stateCurrentRep = 0;//restart + baseCurrentRep = 0; + bSeqCurrentRep = 0; + } + + //left + if (profilePos >= hmmFlank + hmmPad && lastProfilePos < hmmFlank + hmmPad){ + outOS.print("<-----------------LEFT " + costL + + " " + stateL + + " " + (stateL == 0?"inf": "" + (costL/stateL)) + + " " + baseL + + " " + (baseL == 0?"inf": "" + (costL/baseL)) + + " " + bSeqL + + " " + (bSeqL == 0?"inf": "" + (costL/bSeqL)) + ); + if (costL/bSeqL > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + + } + + //right + //if (profilePos < hmmFlank + hmmPad + period && lastProfilePos >= hmmFlank + hmmPad + period){ + // outOS.print("<-----------------RIGHT " + costR + // + " " + stateR + // + " " + (stateR == 0?"inf": "" + (costR/stateR)) + // + " " + baseR + // + " " + (baseR == 0?"inf": "" + (costR/baseR)) + // + " " + bSeqR + // + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + // ); + // outOS.println(); + //} + lastProfilePos = profilePos; + lastSeqPos = seqPos; + + }//for x + + //move to out of the loop + outOS.print("<-----------------RIGHT " + costR + + " " + stateR + + " " + (stateR == 0?"inf": "" + (costR/stateR)) + + " " + baseR + + " " + 
(baseR == 0?"inf": "" + (costR/baseR)) + + " " + bSeqR + + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + ); + + if (costR/bSeqR > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + //outOS.println(); + + outOS.println(); + outOS.print ("L = " + (costL/(hmmFlank + hmmPad)) + " R = " + costR/(dpBatch.getProfileLength() - hmmFlank - hmmPad - period) + "\n"); + + /*****************************************************************/ + outOS.print("##" + readSeq.getName() +"\t"+bestIter+"\t"+readSeq.length() +"\t" +alignScore+"\t" + alignScore/readSeq.length() + '\t' + costM + "\t" + costM / readSeq.length() + "\t" + costL + "\t" + stateL + "\t" + costR + "\t" + stateR + "\t" + (alignScore - costL - costR) + "\t" + stateRep + "\t" + pass + '\n'); + outOS.print("==================================================================\n"); + } + } + /*******************************************************************/ + + static private void processRead(Sequence readSeq, ProfileDP dp, double fraction, int hmmFlank, int hmmPad, int period, SequenceOutputStream outOS ) throws IOException{ + + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + + + outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + EmissionState bestState = dp.align(readSeq); + double alignScore = bestState.getScore(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + + /*******************************************************************/ + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + + //double oldCost = bestState.score; + EmissionState lastState = bestState; + 
bestState = bestState.bwdState; + + while (bestState != null){ + profilePositions.add(bestState.profilePos); + seqPositions.add(bestState.profilePos); + costGeneration.add(lastState.score - bestState.score); + + if (bestState.seqPos == lastState.seqPos) + byteArray.add((byte)Alphabet.DNA.GAP); + else + byteArray.add(readSeq.getBase(lastState.seqPos)); + + lastState = bestState; + bestState = bestState.bwdState; + } + + double costL = 0, costR = 0, costCurrentRep = 0, costRep = 0; + int stateL = 0, stateR = 0, stateCurrentRep = 0, stateRep = 0; + int baseL = 0, baseR = 0, baseCurrentRep = 0, baseRep = 0; + int bSeqL = 0, bSeqR = 0, bSeqCurrentRep = 0, bSeqRep = 0; + + int lastProfilePos = -1, lastSeqPos = -1; + + for (int x = profilePositions.size() - 1; x >=0; x--){ + outOS.print(Alphabet.DNA().int2char(byteArray.get(x))); + + int profilePos = profilePositions.get(x); + int seqPos = seqPositions.get(x); + + if (profilePos < hmmFlank + hmmPad){ + stateL ++; + costL += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseL ++; + + if (lastSeqPos != seqPos) + bSeqL ++; + + }else if(profilePos > hmmFlank + hmmPad + period){ + stateR ++; + costR += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseR ++; + + if (lastSeqPos != seqPos) + bSeqR ++; + }else{ + stateCurrentRep ++; + costCurrentRep += costGeneration.get(x); + + stateRep ++; + costRep += costGeneration.get(x); + + if (lastProfilePos != profilePos){ + baseRep ++; + baseCurrentRep ++; + } + + if (lastSeqPos != seqPos){ + bSeqRep ++; + bSeqCurrentRep ++; + } + + } + + //end of a repeat cycle + if (profilePos < lastProfilePos){ + outOS.print("<-----------------REP " + costCurrentRep + + " " + stateCurrentRep + + " " + (stateCurrentRep == 0?"inf": "" + (costCurrentRep/stateCurrentRep)) + + " " + baseCurrentRep + + " " + (baseCurrentRep == 0?"inf": "" + (costCurrentRep/baseCurrentRep)) + + " " + bSeqCurrentRep + + " " + (bSeqCurrentRep == 0?"inf": "" + (costCurrentRep/bSeqCurrentRep)) + 
); + + if (costCurrentRep/bSeqCurrentRep > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + costCurrentRep = 0;//restart + stateCurrentRep = 0;//restart + baseCurrentRep = 0; + bSeqCurrentRep = 0; + } + + //left + if (profilePos >= hmmFlank + hmmPad && lastProfilePos < hmmFlank + hmmPad){ + outOS.print("<-----------------LEFT " + costL + + " " + stateL + + " " + (stateL == 0?"inf": "" + (costL/stateL)) + + " " + baseL + + " " + (baseL == 0?"inf": "" + (costL/baseL)) + + " " + bSeqL + + " " + (bSeqL == 0?"inf": "" + (costL/bSeqL)) + ); + if (costL/bSeqL > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + + } + + //right + //if (profilePos < hmmFlank + hmmPad + period && lastProfilePos >= hmmFlank + hmmPad + period){ + // outOS.print("<-----------------RIGHT " + costR + // + " " + stateR + // + " " + (stateR == 0?"inf": "" + (costR/stateR)) + // + " " + baseR + // + " " + (baseR == 0?"inf": "" + (costR/baseR)) + // + " " + bSeqR + // + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + // ); + // outOS.println(); + //} + lastProfilePos = profilePos; + lastSeqPos = seqPos; + + }//for x + + //move to out of the loop + outOS.print("<-----------------RIGHT " + costR + + " " + stateR + + " " + (stateR == 0?"inf": "" + (costR/stateR)) + + " " + baseR + + " " + (baseR == 0?"inf": "" + (costR/baseR)) + + " " + bSeqR + + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + ); + + if (costR/bSeqR > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + //outOS.println(); + + outOS.println(); + outOS.print ("L = " + (costL/(hmmFlank + hmmPad)) + " R = " + costR/(dp.getProfileLength() - hmmFlank - hmmPad - period) + "\n"); + + /*****************************************************************/ + outOS.print("##" + readSeq.getName() +"\t"+bestIter+"\t"+readSeq.length() +"\t" +alignScore+"\t" + alignScore/readSeq.length() + '\t' + costM + "\t" + 
costM / readSeq.length() + "\t" + costL + "\t" + stateL + "\t" + costR + "\t" + stateR + "\t" + (alignScore - costL - costR) + "\t" + stateRep + "\t" + pass + '\n'); + outOS.print("==================================================================\n"); + } + + public static Sequence getReadPosition(SAMRecord rec, int startRef, int endRef){ + byte[] seqRead = rec.getReadBases();// + if (seqRead.length <= 1) + return null; + + int startRead = -1, endRead = -1; + + int refPos = rec.getAlignmentStart(); + int readPos = 0; + //currentRefPos <= startRead + + for (final CigarElement e : rec.getCigar().getCigarElements()) { + int length = e.getLength(); + switch (e.getOperator()) { + case H: + break; // ignore hard clips + case P: + break; // ignore pads + case S: + readPos += e.getLength(); + break; // soft clip read bases + case N: // N ~ D + case D: + refPos += length; + + if (startRead < 0 && refPos >= startRef){ + startRead = readPos; + } + + if (endRead < 0 && refPos >= endRef){ + endRead = readPos; + } + + break;// case + case I: + readPos += length; + break; + + case M: + case EQ: + case X: + if ((startRead < 0) && refPos + length >= startRef) { + startRead = readPos + startRef - refPos; + } + + if ((endRead < 0) && (refPos + length >= endRef)){ + endRead = readPos + endRef - refPos; + } + + refPos += length; + readPos += length; + break; + default: + throw new IllegalStateException( + "Case statement didn't deal with cigar op: " + + e.getOperator()); + }// case + if (refPos >= endRef) + break;//for + + }// for + if (startRead < 0 || endRead < 0){ + LOG.warn(" " + refPos + " " + readPos + " " + startRead + " " + endRead); + return null; + } + + Alphabet alphabet = Alphabet.DNA16(); + Sequence retSeq = new Sequence(alphabet, endRead - startRead + 1, rec.getReadName() + "/" + startRead + "_" + endRead); + for (int i = 0; i < retSeq.length();i++){ + retSeq.setBase(i, alphabet.byte2index(seqRead[startRead + i])); + } + return retSeq; + + } + +} diff --git 
a/src/dev/java/japsadev/tools/VNTRDepthAnalyserCmd.java b/src/dev/java/japsadev/tools/VNTRDepthAnalyserCmd.java new file mode 100644 index 0000000..f1b51c5 --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRDepthAnalyserCmd.java @@ -0,0 +1,343 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + +import japsa.seq.SequenceOutputStream; +import japsa.seq.XAFReader; +import japsa.util.BetaBinomialModel; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.math3.distribution.NormalDistribution; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.dev.vntrDepthAnalyser", +scriptDesc = "VNTR typing using coverage depth information of Illumina capture sequencing") +public class VNTRDepthAnalyserCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(VNTRDepthAnalyserCmd.class); + + public VNTRDepthAnalyserCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("xafFile", "VNTR.xaf", "Name of repeat file"); + addInt("readLength", 250, "Average read length"); + addString("output", "-", "Name of output file, - for standard out"); + addString("resample", null, "reference sample"); + addString("resAllele", null, "reference alleles"); + addString("model", "bn", "bn (beta-binomial) or nm (normal)"); + //addBoolean("reverse",false,"Reverse sort order"); + + addStdHelp(); + } + + + + public static void main(String [] args) throws IOException, InterruptedException{ + CommandLine cmdLine = new VNTRDepthAnalyserCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + + String xafFile = cmdLine.getStringVal("xafFile"); + String output = cmdLine.getStringVal("output"); + String resample = cmdLine.getStringVal("resample"); + String 
resAllele = cmdLine.getStringVal("resAllele"); + String model = cmdLine.getStringVal("model"); + int readLength = cmdLine.getIntVal("readLength"); + + depthAnalysis(xafFile,resample, resAllele, args, output, readLength, model); + } + + static void depthAnalysis(String xafFile, String rData, String resAllele, String[] sFiles, String outputFile, int readLength, String model) throws IOException, InterruptedException{ + if (sFiles.length ==0) + return; + + int stat = 2; + String [] sampleID = new String[sFiles.length]; + + XAFReader [] sReaders = new XAFReader[sFiles.length]; + for (int i = 0; i < sFiles.length;i++){ + File file = new File(sFiles[i]); + sampleID[i] = file.getName(); + sampleID[i] = sampleID[i].replaceAll(".xaf", ""); + sReaders[i] = new XAFReader(sFiles[i]); + } + + XAFReader xafReader = new XAFReader(xafFile); + XAFReader rReader = new XAFReader(rData); + XAFReader rAlleleReader = new XAFReader(resAllele); + + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(outputFile); + //sos.print("#H:ID\tchrom\tstart\tend\trepLen\tseqLen"); + sos.print("#H:ID\tchrom\tstart\tend"); + for (int i = 0; i < sFiles.length;i++){ + sos.print("\t" + sampleID[i]); + } + sos.print('\n'); + + int repCol = stat * 2 + 6; + int seqCol = repCol + 1; + if (stat == 3){ + repCol = 14; + seqCol = 12; + } + if (stat == 4){ + repCol = 14; + seqCol = 13; + } + if (stat == 5){ + repCol = 14; + seqCol = 15; + } + if (stat == 6){ + repCol = 14; + seqCol = 16; + } + + + while (xafReader.next() != null){ + //Extract + String ID = xafReader.getField("ID"); + String chrom = xafReader.getField("chrom"); + int startRep = Integer.parseInt(xafReader.getField("start")); + int endRep = Integer.parseInt(xafReader.getField("end")); + + //sum of lengths of the flanks + int flankings = Integer.parseInt(xafReader.getField("lflank")) + Integer.parseInt(xafReader.getField("rflank")); + double period = Double.parseDouble(xafReader.getField("period")); + //double lengthR = period * 
Double.parseDouble(xafReader.getField("unitNo"));// - readLength; + + rReader.next(); + if( !ID.equals(rReader.getField("ID"))){ + LOG.error("Wrong ID at line a " + rReader.lineNo()); + System.exit(1); + } + + rAlleleReader.next(); + if( !ID.equals(rAlleleReader.getField("ID"))){ + LOG.error("Wrong ID at line b " + rAlleleReader.lineNo()); + System.exit(1); + } + + for (int i = 0; i < sFiles.length;i++){ + sReaders[i].next(); + } + + String refAlleleStr = rAlleleReader.getField("refAllele"); + if (refAlleleStr == null){ + continue;//while + } + + double refAllele = 0; + try{ + refAllele = Double.parseDouble(refAlleleStr); + }catch(Exception e){ + continue;//while + } + if (refAllele == 0) + continue;//while + + double lengthR = period * refAllele;// + + + sos.print(ID); + sos.print('\t'); + sos.print(chrom); + sos.print('\t'); + sos.print(startRep); + sos.print('\t'); + sos.print(endRep); + //} + + double countR = Double.parseDouble(rReader.getField(repCol)); + double totR = Double.parseDouble(rReader.getField(seqCol)); + //double ratioR = countR / totR; + countR = countR / readLength; + totR = totR / readLength; + + for (int i = 0; i < sFiles.length;i++){ + if( !ID.equals(sReaders[i].getField("ID"))){ + LOG.error("Wrong ID at line " + rReader.lineNo()); + System.exit(1); + } + + double countS = Double.parseDouble(sReaders[i].getField(repCol)); + double totS = Double.parseDouble(sReaders[i].getField(seqCol)); + + countS = countS / readLength; + totS = totS / readLength; + + sos.print('\t'); + + ///////////////////////////////////////////////////////////////// + //Actual analysis here + if (model.equals("nm")){ + NormalDistribution dist = BetaBinomialModel.ratioDistribution(totR - countR, totR, totS - countS, totS, 1000); + double lengthShigh = (flankings + lengthR) / (dist.getMean() - 2 * dist.getStandardDeviation()) - flankings; + double lengthSlow = 1;//(flankings + lengthR) / (dist.getMean() + 2 * dist.getStandardDeviation()) - flankings; + }else{ + double conf 
= 0.95; + VNTRGenotyper vg = new VNTRGenotyper(); + + vg.setRef(refAllele, countR, totR - countR) ; + vg.setSample(countS, totS - countS); + vg.bdw = 1; + // String result = vg.getConf1(0.2); + String result = vg.getConf(0.95); + sos.print(result); + + + // double[] reports = bn(refAllele, totR, countR, totS, countS, conf); + // sos.print("("+reports[0] + "," + reports[1] + "," + reports[2] + "," + reports[3] + ")"); + + + + + + + } + + + //System.err.println(genos[range[0]] +" to "+genos[range[1]] +" "+mass[2]+"\n max:"+mass[0]+ " maxp:"+mass[1]); + //for(int x=range[0]; x<=range[1]; x++){ + // System.err.println(genos[i]+" => "+prob[x]); + //} + + // sos.print('\t'); + // sos.print((lengthR)/period); + + // sos.print('\t'); + // sos.print((lengthS + readLength)/period); + + // sos.print('\t'); + // sos.print((lengthSlow + readLength)/period); + // sos.print('\t'); + // sos.print((lengthShigh + readLength)/period); + + + //sos.print('\t'); + //sos.print(dist.getMean() + "\t" + dist.getStandardDeviation()); + }//for i + sos.println(); + }//while iter + sos.close(); + xafReader.close(); + + rReader.close(); + for (int i = 0; i < sFiles.length;i++){ + sReaders[i].close(); + } + } + + static double [] bn(double refAllele, double totR, double countR, double totS, double countS, double conf){ + /******************************************************************** + VNTRGenotyper vg = new VNTRGenotyper(); + + vg.setRef(refAllele, countR, totR - countR) ; + vg.setSample(countS, totS - countS); + + vg.bdw = 1; + // String result = vg.getConf1(0.2); + String result = vg.getConf(0.95); + + + + double[] genos = new double[20]; + int half = genos.length/2; + + double startGeno = Math.max(0, (refAllele - half)); + for (int x = 0; x < genos.length; x++){ + genos[x] = startGeno +x; + } + + double[] prob = new double[genos.length]; + vg.probability(prob, genos); + //System.err.println("Simulated "+sample+" depth:"+depth); + int[] range = new int[2]; + + //double conf = 0.5; + 
double[] mass = VNTRGenotyper.getconf(prob, genos, conf, range); + + //geno[maxi], prob[maxi], sum + //low, high, support,max likelihood genotype, prob of the max genotype + return new double[] {genos[range[0]], mass[0], genos[range[1]], mass[2], mass[0], mass[1]}; + /********************************************************************/ + return null; + } + + + static double [] nm(double totR, double countR, double totS, double countS, int samplingSize){ + NormalDistribution dist = BetaBinomialModel.ratioDistribution(totR - countR, totR, totS - countS, totS, samplingSize); + + double lengthShigh = 0;//(flankings + lengthR) / (dist.getMean() - 2 * dist.getStandardDeviation()) - flankings; + double lengthSlow = 1;//(flankings + lengthR) / (dist.getMean() + 2 * dist.getStandardDeviation()) - flankings; + return new double [] {lengthShigh, lengthSlow}; + // double lengthS = (flankings + lengthR) / (dist.getMean()) - flankings; + // /* + // * note: + // * + // * r = (2f+l_s)/(2f+l_r) + // * r_low = mean-2sd + // * r_high=mean+2sd + // * + // * lS = (2f + lR) * r - 2f + // * + // */ + // + // sos.print('\t'); + // sos.print((lengthR + readLength)/period);/ + + // sos.print('\t'); + // sos.print((lengthS + readLength)/period); + + // sos.print('\t'); + // sos.print((lengthSlow + readLength)/period); + // sos.print('\t'); + // sos.print((lengthShigh + readLength)/period); + + } + +} diff --git a/src/dev/java/japsadev/tools/VNTRDepthSumCmd.java b/src/dev/java/japsadev/tools/VNTRDepthSumCmd.java new file mode 100644 index 0000000..22ab14b --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRDepthSumCmd.java @@ -0,0 +1,126 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools; + +import japsa.seq.SequenceOutputStream; +import japsa.seq.XAFReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + + + +/** + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.dev.vntrDepthSum", +scriptDesc = "Sum read depth information") +public class VNTRDepthSumCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(VNTRDepthSumCmd.class); + + public VNTRDepthSumCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options] file1 [file2...]"); + setDesc(annotation.scriptDesc()); + + addString("output", "-", "Name of output file, - for standard out"); + addString("sampleID", "ID", "Sample ID"); + + addStdHelp(); + } + + + + public static void main(String [] args) throws IOException, InterruptedException{ + CommandLine cmdLine = new VNTRDepthSumCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + String output = cmdLine.getStringVal("output"); + String sampleID = cmdLine.getStringVal("sampleID"); + + + if (args.length < 1){ + LOG.error("Need to supply some files",1); + System.exit(1); + } + + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(output); + sos.print("#H:ID\tchrom\tstart\tend\trepLen\tseqLen\t" + + sampleID + "_R1\t" + sampleID + "_S1\t" + + sampleID + "_R2\t" + sampleID + "_S2\t" + + sampleID + "_R3\t" + sampleID + "_S3\t" + + sampleID + "_B1\t" + sampleID + "_B2\t" + + sampleID + "_B3\t" + sampleID + "_B4\t" + + sampleID + "_B5\n"); + + XAFReader [] sReaders = new XAFReader[args.length]; + for (int 
i = 0; i < args.length;i++){ + sReaders[i] = new XAFReader(args[i]); + } + + + while (sReaders[0].next() != null){ + //Extract + String ID = sReaders[0].getField("ID"); + String chrom = sReaders[0].getField("chrom"); + int startRep = Integer.parseInt(sReaders[0].getField("start")); + int endRep = Integer.parseInt(sReaders[0].getField("end")); + int repLen = Integer.parseInt(sReaders[0].getField("repLen")); + int seqLen = Integer.parseInt(sReaders[0].getField("seqLen")); + + sos.print(ID+"\t" + chrom + "\t" + startRep + "\t" + endRep + "\t" + + repLen + "\t" + seqLen); + + for (int i = 1; i < sReaders.length;i++) + sReaders[i].next(); + + for (int j = 6; j < 17; j ++){ + int sum = 0; + for (int i = 0; i < sReaders.length;i++) + sum += Integer.parseInt(sReaders[i].getField(j)); + sos.print("\t" + sum); + } + sos.println(); + } + + sos.close(); + + //depthAnalysis(xafFile,resample, resAllele, args, output, readLength, model); + } +} diff --git a/src/dev/java/japsadev/tools/VNTRGenotyper.java b/src/dev/java/japsadev/tools/VNTRGenotyper.java new file mode 100644 index 0000000..fdfc5bb --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRGenotyper.java @@ -0,0 +1,198 @@ +package japsadev.tools; + +public class VNTRGenotyper { + /* test */ + public static void main(String[] args){ + VNTRGenotyper vg = new VNTRGenotyper(); + double certainty = 0.5; + double[] ref = new double[] {5,10,15}; + double[] sample = new double[] {5,10,15} ; + double[] depth = new double[] {500}; + for(int i=0; i "+prob[i]); + //} + } + /**NOTE: THIS MODEL IS SPECIFICALLY DESIGNED TO COUNT READS. IF YOU ARE COUNTING BASES IT IS CRITICAL TO DIVIDE THE COUNTS BY THE AVERAGE LENGTH OF THE READS + THE REASON FOR THIS IS THAT THE BETABINOMIAL REFLECTS THE UNCERTAINTY IN THE ESTIMATE OF THE PROPORTION OF READS IN THE REFERENCE IN FLANKING VS NORMAL + IF YOU ARE COUNTING BASES IT GIVES AN ARTIFICIALLY HIGH DEGREE OF CERTAINTY IN THIS FRACTION. 
+ + **/ + static BetaBinomial bb = new BetaBinomial(); + + + + private double[] genos, prob; + + /** just gets the array of genos and probs ready */ + private void setGenos(){ + double refAllele = this.genotype_reference; + double ratio = ( this.count_repeat_sample/ this.count_flanking_sample)/(this.count_repeat_ref/ this.count_flanking_ref); + ratio = Math.max(2*ratio, 4); + genos = new double[(int) Math.max(40, Math.ceil(ratio*refAllele))]; + double rem = refAllele - Math.floor(refAllele); + // int half = genos.length/2; + for(int x=0; xprob[maxi]) maxi =i; + } + double mlgeno = genos[maxi]; + int range = (int) Math.floor(mlgeno * perc); // this assumes genotypes in steps of 1 + + double sum=0; + for(int i=maxi-range; i<=maxi+range; i++){ + sum +=prob[i]; + } + // double[] ranges = new double[] {maxi-range,maxi+range}; + return new Double[] {((double) mlgeno-range)/mult, ((double) mlgeno+range)/mult, sum}; + + } + + + Double[] getconf( double mass, int[] range){ + int maxi=0; + for(int i=1; iprob[maxi]) maxi =i; + } + int i=0; + int len = prob.length; + double sum=prob[maxi]; + range[0] = maxi; + range[1] = maxi; + if(sum0 && maxi +i =mass){ + break; + } + } + } + + return new Double[] {genos[range[0]]/mult, genos[range[1]]/mult, sum, genos[maxi]/mult, prob[maxi]}; + + } + /*bdw is a factor that can be used to make the genotype calls more uncertain. It does this by artificially decreasing the counts. So a value of 10, for example + * would decrease the counts (both in repeat and flanking) by a factor of 10. Note that its best to change this on a log scale if you want to see any affect, i.e. 
going from 1 to 2 + * has little effect, need to change from 1 to 10 + * */ + static double bdw = 1; + + double genotype_reference; + double count_repeat_ref; + double count_flanking_ref; + double count_repeat_sample; + double count_flanking_sample; + double mult = 1; + void setRef(double genotype_reference, double count_repeat_ref, double count_flanking_ref){ + mult = 100.0/genotype_reference; + this.genotype_reference = 100; + this.count_repeat_ref = count_repeat_ref; + this.count_flanking_ref = count_flanking_ref; + + } + + void setSample(double count_repeat_sample, double count_flanking_sample){ + this.count_flanking_sample = count_flanking_sample; + this.count_repeat_sample = count_repeat_sample; + } + + + /** This calculates the log likelihood */ + double likelihood(double genotype){ + double relative_cn = genotype/genotype_reference; + double n = count_repeat_sample + count_flanking_sample; + bb.set((relative_cn*count_repeat_ref)/bdw, count_flanking_ref/bdw,n); + return bb.logdensity(count_repeat_sample); + + } + + double[] probability(int max_cn){ + double[] res = new double[max_cn+1]; + double[] genos = new double[max_cn+1]; + for(int i=0; imaxv) maxv = v; + logprobs[i] = v; + sum+=Math.exp(v); + } + //NEED TO CHECK THIS IS RIGHT - IDEA IS TO TRANSFORM LOG PROBS TO PROBS + for(int i=0; i 2) { + System.err + .println("The program currenly only support haploid and diployd. 
Enter nploidy of 1 or 2"); + System.exit(1); + } + /**********************************************************************/ + + String cmd; + if ("kalign".equals(cmdLine.getStringVal("msa"))) { + cmd = "kalign -gpo 60 -gpe 10 -tgpe 0 -bonus 0 -q -i " + prefix + + "i.fasta -o " + prefix + "o.fasta"; + } else { + cmd = "clustalo --force -i " + prefix + "i.fasta -o " + prefix + + "o.fasta"; + } + + SequenceOutputStream outOS = SequenceOutputStream + .makeOutputStream(cmdLine.getStringVal("output")); + + String[] headers = TandemRepeatVariant.SIMPLE_HEADERS; + if (np > 1) { + headers = TandemRepeatVariant.SIMPLE_HEADERS2; + } + + TandemRepeatVariant.printHeader(outOS, headers); + + String strFile = cmdLine.getStringVal("xafFile"); + + // TODO: make it multiple sequence + Sequence seq = SequenceReader.getReader(cmdLine.getStringVal("input")) + .nextSequence(alphabet); + + /**********************************************************************/ + //ArrayList myList = TandemRepeat.readFromFile( + // SequenceReader.openFile(strFile), new ArrayList()); + + + XAFReader xafReader = new XAFReader(strFile); + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(cmdLine.getStringVal("bamFile"))); + + + int _tIndex = 0; + while (xafReader.next() != null){ + _tIndex ++; + //for (TandemRepeat str : myList) { + TandemRepeat str = TandemRepeat.read(xafReader); + System.out.println(_tIndex + " " + str.getChr() + " " + str.getStart() + " " + str.getEnd()); + if (str.getPeriod() <= 4) + continue; + + int start = str.getStart() - flanking; + int end = str.getEnd() + flanking; + + if (end > seq.length()) + end = seq.length(); + + if (start < 1) + start = 1; + + SAMRecordIterator iter = reader.query(str.getParent(), start, end, + false); + + int maxAlign = 300; + MultipleAlignment ma = new MultipleAlignment(maxAlign, seq); + + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + //FIXME: this 
should be handdled by making sure sequence reads present + if (rec.getReadLength() < 10) + continue; + + // Check qualilty + if (rec.getMappingQuality() < qual) { + continue; + } + + // Only reads that fully span the repeat and flankings + if (rec.getAlignmentStart() > start) + continue; + if (rec.getAlignmentEnd() < end) + continue; + + + ma.addRead(rec); + }// while + iter.close(); + // os.close(); + + // seq.subSequence(start, end).writeFasta(prefix+"r.fasta"); + double var = 0; + // double evidence = 0; + + TandemRepeatVariant trVar = new TandemRepeatVariant(); + trVar.setTandemRepeat(str); + + if (ma.printFasta(start, end, prefix + "i.fasta") >= 4) { + Logging.info("Running " + cmd); + Process process = Runtime.getRuntime().exec(cmd); + process.waitFor(); + Logging.info("Done " + cmd); + + SequenceReader msaReader + = FastaReader.getReader(prefix + "o.fasta"); + ArrayList seqList = new ArrayList(); + Sequence nSeq = null; + while ((nSeq = msaReader.nextSequence(Alphabet.DNA16())) != null) { + seqList.add(nSeq); + } + + String target = "pacbio_" + str.getID(); + //str.getChr()+"_"+str.getStart()+"_"+str.getEnd(); + + if (np >= 2) { + /***************************************************************** + //Logging.info("Run hmmsearcg " + "hmmsearch -o "+target+".hmmsearch " + target + ".hmm " + prefix + "i.fasta"); + process = + Runtime.getRuntime().exec("hmmsearch -o "+target+".hmmsearch " + target + ".hmm " + prefix + "i.fasta"); + process.waitFor(); + //Read hmm results + + BufferedReader hmmReader = SequenceReader.openFile(target+".hmmsearch"); + HashMap instances = new HashMap(); + + String hmmLine = ""; + while ((hmmLine = hmmReader.readLine()) != null){ + String[] hmmToks = hmmLine.trim().split(" *"); + //Logging.info(hmmLine + " " +hmmToks.length); + if(hmmToks.length != 4) + continue; + + + if(!hmmToks[0].startsWith("SSS")){ + continue; + } + try{ + int hmmMatchStart = Integer.parseInt(hmmToks[3]), + hmmMatchEnd = Integer.parseInt(hmmToks[1]); + + 
ReadInstance inst = instances.get(hmmToks[0]); + + if (inst == null){ + inst = new ReadInstance(2, hmmToks[0]); + inst.increase(0, 1) ; + inst.increase(1,hmmMatchEnd - hmmMatchStart) ; + + instances.put(hmmToks[0], inst); + }else{ + inst.increase(0, 1) ; + inst.increase(1,hmmMatchEnd - hmmMatchStart) ; + } + }catch (Exception e){ + Logging.warn("Problems: " + e.getMessage() +" " +hmmLine); + } + } + hmmReader.close(); + + + HashSet set1 = new HashSet (); + + + + Dataset data = new DefaultDataset(); + for (ReadInstance inst:instances.values()){ + data.add(inst); + } + if (data.size() > 1){ + Clusterer km = new KMeans(2); + Dataset[] clusters = km.cluster(data); + + for (int i = 0; i < clusters[0].size();i++){ + set1.add(clusters[0].get(i).toString()); + } + } + + + ArrayList seqList1 = new ArrayList(), + seqList2 = new ArrayList(); + + for (Sequence s:seqList){ + if (set1.contains(s.getName())){ + seqList1.add(s); + }else{ + seqList2.add(s); + } + } + + int llength = seqList.get(0).length(); + + + System.err.print("#Call :" + _tIndex + " 1:"); + for (int s = 0; s < seqList1.size(); s++) { + System.err.print(seqList1.get(s).getName() + ","); + } + System.err.println(); + + + System.err.print("#Call :" + _tIndex + " 2:"); + for (int s = 0; s < seqList2.size(); s++) { + System.err.print(seqList2.get(s).getName() + ","); + } + System.err.println(); + + + int gaps = call(seqList1); + + var = (llength - gaps - end + start) * 1.0 + / str.getPeriod(); + + trVar.setVar(var); + + gaps = call(seqList2); + var = (llength - gaps - end + start) * 1.0 + / str.getPeriod(); + + trVar.setVar2(var); + + trVar.addEvidence(seqList1.size()); + trVar.addEvidence2(seqList2.size()); + + // trVar.setVar2(var); + // trVar.addEvidence(seqList.size()); + + /*****************************************************************/ + } else {// nploidy ==1 + int llength = seqList.get(0).length(); + + int gaps = call(seqList); + + var = (llength - gaps - end + start) * 1.0 + / str.getPeriod(); + + 
trVar.setVar(var); + trVar.addEvidence(seqList.size()); + } + + }// if + + Process process = + Runtime.getRuntime().exec("rm -f " + prefix + _tIndex + "i.fasta"); + process.waitFor(); + + process = + Runtime.getRuntime().exec("cp " + prefix + "i.fasta " + prefix + _tIndex + "i.fasta"); + process.waitFor(); + + outOS.print(trVar.toString(headers)); + outOS.print('\n'); + ma.printAlignment(start, end); + ma.reduceAlignment(start, end). printAlignment(start, end); + }// for + + reader.close(); + outOS.close(); + + } + /***************************************************************** + * Temporary commented out for independence of javaml + * + static class ReadInstance extends DenseInstance{ + + private static final long serialVersionUID = 1L; + String readName; + + public ReadInstance(int nAtt, String name) { + super(nAtt); + readName = name; + } + + public void increase(int attNo, double added){ + put(attNo,value(attNo) + added); + } + + public String toString(){ + return readName; + } + } +/*****************************************************************/ + + /** + * + * @param seqList + * @param startState + * : the start index of the list (inclusive) + * @param end + * : the end index of the list (exclusive) + */ + static int call(ArrayList seqList, int indexStart, int indexEnd) { + if (indexEnd <= indexStart) + return 0; + + // Get consensus + int gaps = 0; + Sequence nSeq = new Sequence(Alphabet.DNA6(), seqList.get(0).length(), + "consensus"); + int[] votes = new int[6]; + for (int i = 0; i < nSeq.length(); i++) { + Arrays.fill(votes, 0); + for (int s = indexStart; s < indexEnd; s++) { + votes[seqList.get(s).symbolAt(i)]++; + } + byte best = 0; + for (byte b = 1; b < 6; b++) + if (votes[b] > votes[best]) + best = b; + + nSeq.setBase(i, best); + if (best == 5) + gaps++; + }// for + return gaps; + } + + static int call(ArrayList seqList) { + return call(seqList,0,seqList.size()); + + } + /************************************************ + static int[][] 
distance(ArrayList seqList) { + int[][] dis = new int[seqList.size()][seqList.size()]; + + for (int i = seqList.size() - 1; i > 1; i--) { + for (int j = i - 1; j >= 0; j--) { + dis[i][j] = dis[j][i] = distance(seqList.get(i), seqList.get(j)); + } + } + + return dis; + } + + static int distance(Sequence s1, Sequence s2) { + int dis = 0; + for (int i = 0; i < s1.length(); i++) { + if (s1.getBase(i) != s2.getBase(i)) { + if (s1.getBase(i) == DNA.GAP || s2.getBase(i) == DNA.GAP) + dis += 1; + else + dis += 0; + } + } + return dis; + } +/************************************************/ +} diff --git a/src/dev/java/japsadev/tools/VNTRSelectCmd.java b/src/dev/java/japsadev/tools/VNTRSelectCmd.java new file mode 100644 index 0000000..94f108c --- /dev/null +++ b/src/dev/java/japsadev/tools/VNTRSelectCmd.java @@ -0,0 +1,139 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 11/01/2012 - Minh Duc Cao: Revised + * 01/01/2013 - Minh Duc Cao, revised + ****************************************************************************/ + +package japsadev.tools; + + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import japsa.bio.tr.TandemRepeat; +import japsa.seq.XAFReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + + +/** + * @author Minh Duc Cao + * + */ +@Deployable( + scriptName = "jsa.dev.vntrselect", + scriptDesc = "Select vntr d" + ) +public class VNTRSelectCmd extends CommandLine{ + public VNTRSelectCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addStdHelp(); + } + + public static void main(String[] args) throws IOException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new VNTRSelectCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + if (args.length != 3){ + System.err.println(" args = 3 "); + } + ArrayList myStr = new ArrayList(); + + String strFile = args[0]; + String hgFile = args[1]; + String trdbFile 
= args[2]; + + XAFReader xafReader = new XAFReader(strFile); + + while (xafReader.next() != null){ + //_tIndex ++; + TandemRepeat str = TandemRepeat.read(xafReader); + myStr.add(str); + //System.out.println(str.getID() + " " + str.getChr() + " " + str.getStart() + " " + str.getEnd()); + } + xafReader.close(); + + HashMap map = new HashMap(); + + + BufferedReader bf = new BufferedReader (new FileReader(hgFile)); + String line = ""; + + while ( (line = bf.readLine())!= null){ + line = line.trim(); + String [] toks = line.split("\t"); + map.put("trid" + toks[0], line); + } + bf.close(); + + + + + bf = new BufferedReader (new FileReader(trdbFile)); + line = ""; + + while ( (line = bf.readLine())!= null){ + if (line.startsWith(">")){ + line = line.trim().substring(1); + String [] toks = line.split(":"); + int myStart = Integer.parseInt(toks[2]); + String myChrom = "chr"+toks[1]; + + for (TandemRepeat tr:myStr){ + int start = tr.getStart(); + String chrom = tr.getChr(); + if (Math.abs(start - myStart) < 400 && chrom.equals(myChrom)){ + + System.out.println(tr.getID() + "\t" + + tr.getChr() + "\t" + + tr.getStart() +"\t" + + tr.getEnd() +"\t" + + tr.getPeriod() +"\t" + + tr.getUnitNo() +"==" + + line +"==" + + map.get(toks[0])); + } + } + + } + }//while + bf.close(); + + } +} \ No newline at end of file diff --git a/src/dev/java/japsadev/tools/misc/CollectSTRV.java b/src/dev/java/japsadev/tools/misc/CollectSTRV.java new file mode 100644 index 0000000..4a910bf --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/CollectSTRV.java @@ -0,0 +1,125 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 08/04/2012 - Minh Duc Cao: Revised + * 16/11/2013 - Minh Duc Cao + ****************************************************************************/ + +package japsadev.tools.misc; + +import japsa.bio.tr.TandemRepeatVariant; +import japsa.bio.tr.TandemRepeat; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; + +import java.io.BufferedReader; +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; + +/** + * @author minhduc + * C + */ +public class CollectSTRV { + + /** + * Collect variation from many individuals and compute the variability + * + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + + System.err.println("Process " + (args.length) + " files"); + BufferedReader [] bin = new BufferedReader[args.length]; + double qual = 0.5; + + + SequenceOutputStream outS = SequenceOutputStream.makeOutputStream("output.strvar"); + SequenceOutputStream outbed = SequenceOutputStream.makeOutputStream("output.bed"); + outS.print("#H:chr\tstart\tend\tperiod\tvar\tconfidence\n"); + + DescriptiveStatistics stats = new DescriptiveStatistics(); + + for (int i = 0; i < bin.length;i++){ + System.err.println(args[i]); + bin[i] = SequenceReader.openFile(args[i]); + } + + String line = bin[0].readLine(); + String [] headers = line.trim().substring(3). 
split("\t"); + + for (int i = 1; i < bin.length;i++){ + line = bin[i].readLine(); + } + + + while (true){ + int ssss = -1; + stats.clear(); + TandemRepeatVariant aSTRV = null; + for (int i = 0; i < bin.length;i++){ + line = bin[i].readLine(); + if (line == null) + break; + + aSTRV = TandemRepeatVariant.read(line, headers); + + if (ssss < 0) + ssss = aSTRV.getStart(); + + if (ssss != aSTRV.getStart()){ + System.err.println("ERROR " + aSTRV); + System.exit(1); + } + + if (aSTRV.getConfidence() > qual){ + stats.addValue(aSTRV.getVar() * aSTRV.getPeriod()); + stats.addValue(aSTRV.getVar2() * aSTRV.getPeriod()); + } + }//for + if (aSTRV == null) + break;//while + double var = 0, conf = 0; + if (stats.getN() > 0){ + var = stats.getStandardDeviation() ; + conf = stats.getN(); + } + outS.print(aSTRV.getChr()+"\t"+aSTRV.getStart()+"\t"+aSTRV.getEnd()+"\t" +aSTRV.getPeriod() + + "\t" + var + "\t" + conf + "\n"); + + TandemRepeat str = aSTRV.getTandemRepeat(); + str.setScore(var); + str.writeBED(outbed); + + }//while + outS.close(); + outbed.close(); + } +} diff --git a/src/dev/java/japsadev/tools/misc/CombineSTRV.java b/src/dev/java/japsadev/tools/misc/CombineSTRV.java new file mode 100644 index 0000000..fc13ed8 --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/CombineSTRV.java @@ -0,0 +1,121 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. 
* + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 08/04/2012 - Minh Duc Cao: Revised + * + ****************************************************************************/ + +package japsadev.tools.misc; + +import japsa.bio.tr.TandemRepeatVariant; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; + +import java.io.BufferedReader; + + +/** + * Combine strv from various tools for an individual + * + * @author minhduc + * + */ + +public class CombineSTRV { + + public static void main(String[] args) throws Exception { + System.err.println("Process " + (args.length - 1) + " files"); + BufferedReader [] bin = new BufferedReader[args.length -1 ]; + int lineNo = 1; + double qual = 0.5; + + SequenceOutputStream outS =SequenceOutputStream.makeOutputStream(args[0] + ".strvc"); + outS.print("#H:chr\tstart\tend\tperiod\tvar\tvar2\tconfidence\n"); + + for (int i = 0; i < bin.length;i++) + bin[i] = SequenceReader.openFile(args[i+1]); + + String line = bin[0].readLine(); + String [] headers = line.trim().substring(3). 
split("\t"); + + for (int i = 1; i < bin.length;i++){ + line = bin[i].readLine(); + } + + while (true){ + line = bin[0].readLine(); + if (line == null) + break; + + lineNo ++; + + TandemRepeatVariant strv = TandemRepeatVariant.read(line, headers); + if (strv.getVar() > strv.getVar2()){ + strv.swapVar(); + } + if (strv.getConfidence() > 1) + strv.setConfidence(1); + + int count = 0; + + if (strv.getConfidence() > qual) + count = 1; + + for (int i = 1; i < bin.length;i++){ + line = bin[i].readLine(); + TandemRepeatVariant aSTRV = TandemRepeatVariant.read(line, headers); + if (strv.getStart() != aSTRV.getStart() || strv.getEnd() != aSTRV.getEnd()) + throw new RuntimeException("Error at line " + lineNo); + + if (aSTRV.getConfidence() > 0.5){ + if (aSTRV.getVar() > aSTRV.getVar2()){ + aSTRV.swapVar(); + } + if (aSTRV.getConfidence() > 1) + aSTRV.setConfidence(1); + + strv.setVar(strv.getVar() + aSTRV.getVar()); + strv.setVar2(strv.getVar2() + aSTRV.getVar2()); + strv.setConfidence(strv.getConfidence() + aSTRV.getConfidence()); + + count ++; + } + }//for + if (count > 0){ + strv.setVar(strv.getVar() / count); + strv.setVar2(strv.getVar2() / count); + strv.setConfidence(strv.getConfidence() / count); + } + outS.print(strv.getChr()+"\t"+strv.getStart()+"\t"+strv.getEnd()+"\t" +strv.getTandemRepeat().getPeriod() + "\t" + strv.getVar()+"\t"+strv.getVar2()+"\t" + strv.getConfidence() + "\n"); + + }//while + outS.close(); + } +} diff --git a/src/dev/java/japsadev/tools/misc/PhyloSimulation.java b/src/dev/java/japsadev/tools/misc/PhyloSimulation.java new file mode 100755 index 0000000..d5bbad2 --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/PhyloSimulation.java @@ -0,0 +1,446 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.tools.misc; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.util.Distribution; +import japsa.util.JapsaMath; + +import java.util.Random; + + + + +//import Jama.Matrix; + + +/** + * @author Minh Duc Cao www.caominhduc.org + * Simulation used in XMas + */ +public class PhyloSimulation { + public static int LENGTH = 10000;// Length = 1000 + + static double[] fre = { 0.1, 0.4, 0.4, 0.1 }; + + // static double [] fre = {0.25,0.25,0.25,0.25}; + static double alpha = 0.001; + + static double ga = fre[0], gc = fre[1], gg = fre[2], gt = fre[3]; + static double beta = alpha / 2.0; + + public static double[][] unit = { { 1.0, 0.0, 0.0, 0.0 }, + { 0.0, 1.0, 0.0, 0.0 }, { 0.0, 0.0, 1.0, 0.0 }, + { 0.0, 0.0, 0.0, 1.0 } }; + + public static double[][] jukes_cantor = { + { 1 - 3 * alpha, alpha, alpha, alpha }, + { alpha, 1 - 3 * alpha, alpha, alpha }, + { alpha, alpha, 1 - 3 * alpha, alpha }, + { alpha, alpha, alpha, 1 - 3 * alpha } }; + + public static double[][] kimura = { + { 1 - alpha - 2 * beta, beta, alpha, beta }, + { beta, 1 - alpha - 2 * beta, beta, alpha }, + { alpha, beta, 1 - alpha - 2 * beta, beta }, + { beta, alpha, beta, 1 - alpha - 2 * beta } }; + + public static double[][] equal_input = { + { 1 - alpha * (1 - ga), alpha * gc, alpha * gg, alpha * gt }, + { alpha * ga, 1 - alpha * (1 - gc), alpha * gg, alpha * gt }, + { alpha * ga, alpha * gc, 1 - alpha * (1 - gg), alpha * gt }, + { alpha * ga, alpha * gc, alpha * gg, 1 - alpha * (1 - gt) } }; + + public static double[][] hky = { + { 1 - beta * gc - alpha * gg - beta * gt, beta * gc, alpha * gg, + beta * gt }, + { beta * ga, 1 - beta * ga - beta * gg - alpha * gt, beta * gg, + alpha * gt }, + { alpha * ga, beta * gc, 1 - alpha * ga - beta * gc - beta * gt, + beta * gt }, + { beta * ga, alpha * gc, beta * gg, + 1 - beta * ga - alpha * gc - beta * gg } }; + + static double pa = 0.001, pb = 
0.003, pc = 0.002, pd = 0.0015, pe = 0.0025, + pf = 0.0035; + public static double[][] rev = { + { 1 - (pa * gc + pb * gg + pc * gt), pa * gc, pb * gg, pc * gt }, + { pa * ga, 1 - (pa * ga + pd * gg + pe * gt), pd * gg, pe * gt }, + { pb * ga, pd * gc, 1 - (pb * ga + pd * gc + pf * gt), pf * gt }, + { pc * ga, pe * gc, pf * gg, 1 - (pc * ga + pe * gc + pf * gg) } }; + + /********************************************************************************* + * public static double [][] rateMtx = {{0.9997,0.0001,0.0001,0.0001}, + * {0.0001,0.9997,0.0001,0.0001}, {0.0001,0.0001,0.9997,0.0001}, + * {0.0001,0.0001,0.0001,0.9997} }; + * /**************************************** + * ***************************************** public static double [][] + * rateMtx = {{0.999,0.0002,0.0005,0.0003}, {0.0003,0.999,0.0002,0.0005}, + * {0.0005,0.0003,0.999,0.0002}, {0.0002,0.0005,0.0003,0.999} }; + * /*********** + * ********************************************************************** + * + * / + *********************************************************************************/ + public static double[][] rateMtx = + { { 0.999, 0.0002, 0.0005, 0.0003 }, + { 0.0005, 0.9985, 0.0004, 0.0006 }, + { 0.0004, 0.0007, 0.9983, 0.0006 }, + { 0.0003, 0.0004, 0.0003, 0.999 } }; + + /********************************************************************************* + * public static double [][] rateMtx = {{0.99,0.003,0.005,0.002}, + * {0.003,0.99,0.002,0.005}, {0.005,0.002,0.99,0.003}, + * {0.002,0.006,0.002,0.99} }; / + *********************************************************************************/ + + /** + * matrix a (mxm) times b (mxm) -- a slow implemetation + * + * @param a + * @param b + * @return + */ + public static double[][] times(double[][] a, double[][] b) { + int m = a.length; + double[][] c = new double[m][m]; + + for (int i = 0; i < m; i++) { + for (int j = 0; j < m; j++) { + c[i][j] = 0; + for (int k = 0; k < m; k++) + c[i][j] += a[i][k] * b[k][j]; + } + } + return c; + } + + 
public static double[] times(double[] a, double[][] b) { + int m = a.length; + double[] c = new double[m]; + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < m; j++) { + c[j] = 0; + for (int k = 0; k < m; k++) + c[j] += a[k] * b[k][j]; + } + } + return c; + } + + public static double[][] copy(double[][] a) { + int m = a.length; + double[][] c = new double[m][m]; + for (int i = 0; i < m; i++) { + for (int j = 0; j < m; j++) { + c[i][j] = a[i][j]; + } + } + return c; + } + + public static double entropy(double[] s) { + double ent = 0; + for (int i = 0; i < s.length; i++) + ent -= s[i] * JapsaMath.log2(s[i]); + + return ent; + } + + public static void generate(int y, int z) throws Exception { + + System.out.printf("%4d %4d %f\n", y, z, y * 1.0 / z); + int length = 10000; + + double[] aGeneDist = { .2, .3, .3, .2 }; + japsa.util.Distribution dist = new japsa.util.Distribution(aGeneDist); + + byte[] seqX = new byte[length], seqZ = new byte[length], seqY = new byte[length]; + + for (int i = 0; i < seqX.length; i++) { + seqX[i] = dist.randomGenerate(rnd); + } + + japsa.util.Distribution[] mtY = new japsa.util.Distribution[4], mtZ = new japsa.util.Distribution[4]; + + double[][] c = copy(rateMtx); + for (int i = 0; i < y; i++) + // y times + c = times(c, rateMtx); + + for (int i = 0; i < c.length; i++) { + double s = 0; + for (int j = 0; j < c[i].length; j++) { + s += c[i][j]; + System.out.printf("%f ", c[i][j]); + } + System.out.println(" " + s); + } + + for (int i = 0; i < 4; i++) { + mtY[i] = new Distribution(c[i]); + } + + c = copy(rateMtx); + for (int i = 0; i < z; i++) + // y times + c = times(c, rateMtx); + + for (int i = 0; i < c.length; i++) { + double s = 0; + for (int j = 0; j < c[i].length; j++) { + s += c[i][j]; + System.out.printf("%f ", c[i][j]); + } + System.out.println(" " + s); + } + + for (int i = 0; i < 4; i++) { + mtZ[i] = new Distribution(c[i]); + } + + for (int i = 0; i < seqX.length; i++) { + seqY[i] = mtY[seqX[i]].randomGenerate(rnd); + 
seqZ[i] = mtZ[seqX[i]].randomGenerate(rnd); + } + + Sequence Y = new Sequence(Alphabet.DNA4(), seqY), X = new Sequence( + Alphabet.DNA4(), seqX), Z = new Sequence(Alphabet.DNA4(), seqZ); + + X.writeFasta("seqX"); + Y.writeFasta("seqY"); + Z.writeFasta("seqZ"); + + System.out + .println("======================================================="); + } + + + static Random rnd = new Random(13); + + + + public static byte[] genDis(byte[] src, int dis) { + byte[] target = new byte[src.length]; + // Random rnd = new Random(); + + double[][] ct = copy(rateMtx); + + for (int i = 1; i < dis; i++) + // y times + ct = times(ct, rateMtx); + + japsa.util.Distribution[] pvGene = new japsa.util.Distribution[4]; + for (int i = 0; i < pvGene.length; i++) { + pvGene[i] = new japsa.util.Distribution(ct[i]); + } + + for (int i = 0; i < src.length; i++) { + target[i] = pvGene[src[i]].randomGenerate(rnd); + } + return target; + } + + public static double disVector2Matrix(double[] v, double[][] mt) { + double res = 0.0; + for (int i = 0; i < mt.length; i++) { + for (int j = 0; j < mt[i].length; j++) { + res -= v[i] * mt[i][j] * JapsaMath.log2(mt[i][j]); + } + } + return res; + } + + public static void main4() throws Exception { + double scale = 1; + double[] s = { 0.1, 0.2, 0.3, 0.4 }; + + // the subs matrix + double[][] ct = copy(rateMtx); + for (int i = 0; i < 1800; i++) {// y times + ct = times(ct, rateMtx); + + // target + double[] t = times(s, ct); + + // if (i % 100 ==0) + double I_t = entropy(t); + double I_ts = disVector2Matrix(s, ct); + + double I_s = entropy(s); + double I_st = disVector2Matrix(t, ct); + + double dis1 = -JapsaMath.log2((I_t - I_ts) / (I_t + I_ts)); + + double dis2 = -JapsaMath.log2((2 - I_st) / (2 + I_st)); + + double dis = -JapsaMath.log2((I_s - I_st + I_t - I_ts) + / (I_s + I_st + I_t + I_ts)); + + System.out.println(i / scale + " " + I_st + " " + I_s + " " + + (I_s - I_st) + " " + dis1 + " " + dis2 + " " + dis); + + } + } + + + + public static double 
nlogn(double x) { + return 6 * x * (x - 1) * JapsaMath.log2e / (x + 4 * Math.sqrt(x) + 1); + } + + public static double calx(double p) { + return JapsaMath.log2e + * 2 + * (1 - p) + * 9 + / 4 + * ((1 + 3 * p) / (5 + 3 * p + 8 * Math.sqrt(1 + 3 * p)) + (p + 3) + / (5 - p + 8 * Math.sqrt(1 - p))); + } + + public static void test2() { + for (int t = 1; t < 100; t++) { + + double p = Math.exp(-t * 0.04); + double d = (1.0 + 3.0 * p) / 4.0; + + double od = (1.0 - p) / 4.0; + + double y = -(d * JapsaMath.log2(d) + 3 * od * JapsaMath.log2(od)); + + // double x = MyMath.log2((2 - y)/(2 + y)); + + double z = 2 - 2 * Math.pow(p, 2); + // System.out.println(t + " " + y + " " + z); + + double app = -nlogn(d) - 3 * nlogn(od); + // (1 + 3 * p)/(5 + 3 * p + 8 * Math.sqrt(1 + 3*p)) + // + (3+p)/(5 - p + 8 * Math.sqrt(1 - p)); + + // app = app * 9 * (1 - p)/MyMath.loge2 / 2; + + // System.out.println(y/(1 - p) + " " + app/(1 - p) + " " + z/(1 - + // p) +" " + calx(p)/(1 - p)); + System.out.println(y + " " + app + " " + z + " " + calx(p) + " " + + (1 - p)); + } + } + + public static void testDistance() { + + // rateMtx = jukes_cantor; + rateMtx = kimura; + // rateMtx = hky; + // rateMtx = rev; + // rateMtx = equal_input; + + double[][] mx = copy(unit); + + double[][] my = copy(unit); + // double I_s = entropy(fre); + + for (int t = 1; t < 2000; t++) { + mx = times(mx, rateMtx); + + // time my twice + my = times(my, rateMtx); + // my = times(my, rateMtx); + + double[] fx = times(fre, mx); + double[] fy = times(fre, my); + + double I_x = entropy(fx); + double I_y = entropy(fy); + + // double I_xs = disVector2Matrix(fre, mx); + // double I_sx = disVector2Matrix(fx, mx); + + // System.out.println(-MyMath.log2( (I_x + I_s - I_sx - I_xs)/(I_s + + // I_x) )); + + double I_xy = disVector2Matrix(fy, times(mx, my)); + + double I_yx = disVector2Matrix(fx, times(my, mx)); + + System.out.println(t +" " + (-JapsaMath.log2((I_x + I_y - I_xy - I_yx) + / (I_x + I_y)))); + + } + } + + /** + * Compare 
+ */ + public static void testFunction() { + double x = 0.9999; + double p = 1; + for (int t = 1; t < 50000; t++) { + p *= x; + System.out.println(2 - (-(1 + 3 * p) / 4 + * JapsaMath.log2((1 + 3 * p) / 4) - 3 * (1 - p) / 4 + * JapsaMath.log2((1 - p) / 4))); + } + } + + public static void functionFitting() { + double p; + //alpha = 0.0001; + for (int t = 0; t < 5000; t++) { + double a = -8*t*alpha; + p = Math.exp(a); + + double y = (1.0 - p) / 4.0; + double x = (1.0 + 3 * p) / 4.0; + double r = -3 * y * JapsaMath.log2(y) - x * JapsaMath.log2(x); + + // System.out.println(r); + System.out.println(t + " " + r / 2 + " " + (1 - p * p)+" " + (1 - p*p - r/2)); + } + } + + public static void main(String[] args) throws Exception { + // main2Seq(); + // testPriors(args); + + // threeSome(args); + // testFunction(); + // main4(); + // test2(); + // mainTemp2(); + + testDistance(); + + //functionFitting(); + //testFunction(); + } +} diff --git a/src/dev/java/japsadev/tools/misc/STRVariationSimulation.java b/src/dev/java/japsadev/tools/misc/STRVariationSimulation.java new file mode 100755 index 0000000..e0db033 --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/STRVariationSimulation.java @@ -0,0 +1,314 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 10/05/2012 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.tools.misc; + +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.JapsaFileFormat; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import java.util.Random; + + + +/** + * Simulate genomes with short tandem repeat variations. 
Used in STRViper + * @author minhduc + * + */ +public class STRVariationSimulation{ + /** + * @param args + */ + public static void main(String[] args) throws IOException { + /*********************** Setting up script ****************************/ + String scriptName = "str.simATGenome"; + String desc = "Simulate genomes with short tandem variations.\n";// + Sequence.AUTHOR; + CommandLine cmdLine = new CommandLine("\nUsage: " + scriptName + " [params]"); + /**********************************************************************/ + cmdLine.addString("input", null, "Name of input file",true); + cmdLine.addString("output", null, "Prefix output file",true); + cmdLine.addString("myCost", null, "Name of file containing myCost values"); + + cmdLine.addBoolean("help", false, "Display usage message and exit"); + + /**********************************************************************/ + args = cmdLine.parseLine(args); + if (cmdLine.getBooleanVal("help")){ + System.out.println(desc + cmdLine.usageMessage()); + System.exit(0); + } + if (cmdLine.errors() != null) { + System.err.println(cmdLine.errors() + cmdLine.usageMessage()); + System.exit(-1); + } + /**********************************************************************/ + + + String inFile = cmdLine.getStringVal("input"); + String scoreFile = cmdLine.getStringVal("myCost"); + String outFile = cmdLine.getStringVal("output"); + + //First read in the reference sequences + BufferedReader sIn = new BufferedReader(new FileReader(scoreFile)); + + //BioCompFileFormat fileFormat = + // new BioCompFileFormat(SequenceReader.openFile(inFile)); + + JapsaFileFormat reader = new JapsaFileFormat(inFile); + + + JapsaAnnotation anno = reader.readAnnotation(); + //fileFormat.getAnnotationIterator().next(); + Sequence seq = anno.getSequence(); + //fileFormat.getSequenceIterator().next(); + + + + double SNPs = 1.5 * 4.9*1000000/119146348, indels = 1.5 * 810467.0/119146348; + int numSeqs = 3;//maximum distance + int length = seq.length(); + + 
Random rnd = new Random(1); + + byte[][] seqByte = new byte[numSeqs][length + length / 5]; + byte[][] seqByteNV = new byte[numSeqs][length + length / 5]; + + // make plenty of space + System.out.println(length); + + JapsaAnnotation[] annos = new JapsaAnnotation[numSeqs]; + JapsaAnnotation[] annosNV = new JapsaAnnotation[numSeqs]; + + for (int i = 0; i < numSeqs; i++) { + annos[i] = new JapsaAnnotation(); + annos[i].addDescription("Simulated genome " + (i+1) + " SNP = " + ((i + 1) * SNPs / numSeqs) + " Indels = " + ((i + 1) * indels / numSeqs)); + + annosNV[i] = new JapsaAnnotation(); + annosNV[i].addDescription("Simulated genome " + (i+1) + " SNP = " + ((i + 1) * SNPs / numSeqs) + " Indels = " + ((i + 1) * indels / numSeqs)); + + } + + int [] currentInx = new int[numSeqs]; + int [] currentInxNV = new int[numSeqs]; + + int [] numSNPs = new int[numSeqs]; + int [] numIndels = new int[numSeqs]; + //currentIdx[i] = 0; + + int featureIdx = 0; + JapsaFeature currentFeature = anno.getFeature(featureIdx); + int index = 0; + for (; index < seq.length();){ + if (currentFeature == null || index < currentFeature.getStart()){ + for (int seqIdx = 0; seqIdx < numSeqs; seqIdx++){ + char nucleotide = Character.toUpperCase(seq.charAt(index)); + if (nucleotide != 'A' && nucleotide != 'C' && nucleotide != 'G' && nucleotide != 'T'){ + //Generate a random + seqByte[seqIdx][currentInx[seqIdx]] = (byte) (rnd.nextInt(4)); + seqByteNV[seqIdx][currentInxNV[seqIdx]] = seqByte[seqIdx][currentInx[seqIdx]];//copy the previous symbol + + currentInx[seqIdx] ++; + currentInxNV[seqIdx] ++; + + }else{// + double val = rnd.nextDouble(); + if (val <= SNPs * (seqIdx+1) / numSeqs){//SNPs + //An SNP + //Generate a random number between 0-2, then plus 1 and plus the index of the previous char + //to avoid generating the same nucleotide + seqByte[seqIdx][currentInx[seqIdx]] = (byte) ( (1 + rnd.nextInt(3) + Alphabet.DNA4().char2int(nucleotide)) % 4); + seqByteNV[seqIdx][currentInxNV[seqIdx]] = 
seqByte[seqIdx][currentInx[seqIdx]];//copy the previous symbol + + currentInx[seqIdx] ++; + currentInxNV[seqIdx] ++; + + numSNPs[seqIdx] ++; + System.out.println(seqIdx+": substitution " + (index +1) + " at " + currentInx[seqIdx]); + }else if (val <= SNPs * (seqIdx+1) / numSeqs + indels * (seqIdx + 1) / numSeqs){//indel + numIndels[seqIdx] ++; + val = rnd.nextDouble(); + if (val > 0.5 && currentInx[seqIdx] > 1){ + System.out.println(seqIdx+": deletion " + (index +1) + " at " + currentInx[seqIdx]); + //A deletion + //currentInx[seqIdx] --; + }else{//an insertion + seqByte[seqIdx][currentInx[seqIdx]] = (byte) (rnd.nextInt(4)); + seqByteNV[seqIdx][currentInxNV[seqIdx]] = seqByte[seqIdx][currentInx[seqIdx]];//copy the previous symbol + + currentInx[seqIdx] ++; + currentInxNV[seqIdx] ++; + + + System.out.println(seqIdx+": insertion " + (index +1) + " at " + currentInx[seqIdx]); + seqByte[seqIdx][currentInx[seqIdx]] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + seqByteNV[seqIdx][currentInxNV[seqIdx]] = seqByte[seqIdx][currentInx[seqIdx]];//copy the previous symbol + currentInx[seqIdx] ++; + currentInxNV[seqIdx] ++; + }//insert vs delete + }else{//Direct copy + seqByte[seqIdx][currentInx[seqIdx]] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + seqByteNV[seqIdx][currentInxNV[seqIdx]] = seqByte[seqIdx][currentInx[seqIdx]];//copy the previous symbol + + currentInx[seqIdx] ++; + currentInxNV[seqIdx] ++; + //System.out.println(seqIdx+": copy " + index + " at " + currentInx[seqIdx]); + } + }//if + }//for + index ++; + }else{ + //start of a STR + //Read the myCost + String scoreLine = sIn.readLine(); + String [] toks = scoreLine.split("\\t"); + int period = Integer.parseInt(toks[2]); + double varScore = Double.parseDouble(toks[4]); + + for (int seqIdx = 0; seqIdx < numSeqs; seqIdx++){ + //scale such as + int diff = (int) ((seqIdx +1)* varScore * (rnd.nextDouble() + .5) * 1.5); + + if (diff <= 0){//include varScore <=0 + //no change + diff = 0; + }else if (diff < 
currentFeature.getLength() / period && rnd.nextDouble() <= 0.5){ + //contraction + diff = -diff; + }//else expantion + + System.out.println(seqIdx+":D = " +diff + " P = " + period + " L = " + currentFeature.getLength() + " " + varScore); + //assert: diff = 0: no variation + //diff < 0 : contraction + //diff > 0 : expantion + JapsaFeature aFeature = currentFeature.cloneFeature(); + aFeature.setStart(currentInx[seqIdx]); + aFeature.setEnd(currentFeature.getStart() + currentFeature.getLength() -1 + diff * period); + aFeature.addDesc("@DIF:"+diff); + aFeature.addDesc("@VR:"+period); + + JapsaFeature aFeatureNV = currentFeature.cloneFeature(); + aFeatureNV.setStart(currentInxNV[seqIdx]); + aFeatureNV.setEnd(currentFeature.getEnd()); + aFeatureNV.addDesc("@DIF:0"); + aFeatureNV.addDesc("@VR:"+period); + + annos[seqIdx].add(aFeature); + annosNV[seqIdx].add(aFeatureNV); + + int i = currentFeature.getLength() - 1; + int j = aFeature.getLength() - 1; + + for (;i >=0 && j >=0;){ + char nucleotide = Character.toUpperCase(seq.charAt(index + i)); + seqByte[seqIdx][currentInx[seqIdx] + j] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + if (seqByte[seqIdx][currentInx[seqIdx] + j] < 0) + seqByte[seqIdx][currentInx[seqIdx] + j] = (byte) rnd.nextInt(4); + i--;j--; + } + while(j >= 0){ + //expansion + if (i < 0) + i = period - 1; + char nucleotide = Character.toUpperCase(seq.charAt(index + i)); + seqByte[seqIdx][currentInx[seqIdx] + j] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + if (seqByte[seqIdx][currentInx[seqIdx] + j] < 0) + seqByte[seqIdx][currentInx[seqIdx] + j] = (byte) rnd.nextInt(4); + + i--;j--; + } + currentInx[seqIdx] += aFeature.getLength(); + + for (i = currentFeature.getLength() - 1;i >=0 ;){ + char nucleotide = Character.toUpperCase(seq.charAt(index + i)); + seqByteNV[seqIdx][currentInxNV[seqIdx] + i] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + if (seqByteNV[seqIdx][currentInxNV[seqIdx] + i] < 0) + seqByteNV[seqIdx][currentInxNV[seqIdx] + i] = (byte) 
rnd.nextInt(4); + i--; + } + currentInxNV[seqIdx] += aFeatureNV.getLength(); + + }//for + index += currentFeature.getLength(); + + featureIdx ++; + if (featureIdx < anno.numFeatures()) + currentFeature = anno.getFeature(featureIdx); + else + currentFeature = null; + } + } + sIn.close(); + reader.close(); + + //System.out.println("Insert reps done"); + for (int ni = 0; ni < numSeqs; ni++) { + Sequence nSeq = new Sequence(Alphabet.DNA4(), seqByte[ni], currentInx[ni], seq.getName()); + + SequenceOutputStream out = SequenceOutputStream.makeOutputStream(outFile + (ni+1) + ".bio"); + + annos[ni].addDescription(numSNPs[ni] + " SNPs " + numIndels[ni] + " indels "); + JapsaAnnotation.write(nSeq, annos[ni], out); + out.close(); + + out = SequenceOutputStream.makeOutputStream(outFile + (ni+1) + ".fas"); + nSeq.writeFasta(out); + out.close(); + System.out.println("Sequence " + (ni+1) + " : " + numSNPs[ni] + " SNPs " + numIndels[ni] + " indels "); + //////////////////////////////////////////////////////////////////////////////////////////////////////// + nSeq = new Sequence(Alphabet.DNA4(), seqByteNV[ni], currentInxNV[ni], seq.getName()); + + out = SequenceOutputStream.makeOutputStream(outFile + (ni+1) + ".NV.bio"); + annosNV[ni].addDescription(numSNPs[ni] + " SNPs " + numIndels[ni] + " indels "); + JapsaAnnotation.write(nSeq, annosNV[ni], out); + out.close(); + + out = SequenceOutputStream.makeOutputStream(outFile + (ni+1) + ".NV.fas"); + nSeq.writeFasta(out); + out.close(); + System.out.println("Sequence " + (ni+1) + " : " + numSNPs[ni] + " SNPs " + numIndels[ni] + " indels "); + } + System.out.println("Write done"); + + + } + +} diff --git a/src/dev/java/japsadev/tools/misc/SimulateLer0Genome.java b/src/dev/java/japsadev/tools/misc/SimulateLer0Genome.java new file mode 100755 index 0000000..bb6142e --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/SimulateLer0Genome.java @@ -0,0 +1,373 @@ +/***************************************************************************** + * 
Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 10/11/2012 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.tools.misc; + + + +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.JapsaFileFormat; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; + +import java.util.Arrays; +import java.util.Random; + + + + +/** + * Simulate the genome of Ler-0 with STR variations + * @author minhduc + * + */ +public class SimulateLer0Genome{ + /** + * @param args + */ + public static void main(String[] args) throws IOException { + /*********************** Setting up script ****************************/ + String scriptName = "japsa.seq.simLer0"; + String desc = "Simulate a Ler-0 genome from an existing genome.\n";// + Sequence.AUTHOR; + CommandLine cmdLine = new CommandLine("\nUsage: " + scriptName + " [params]"); + /**********************************************************************/ + cmdLine.addString("input", null, "Name of input file (genome of Col-0)",true); + cmdLine.addString("var", null, "Name of variations input file (genome of Col-0)",true); + cmdLine.addString("output", null, "Prefix output file",true); + + cmdLine.addInt("seed", 0, "Random seed, 0 for a random seed"); + cmdLine.addBoolean("help", false, "Display usage message and exit"); + /**********************************************************************/ + args = cmdLine.parseLine(args); + if (cmdLine.getBooleanVal("help")){ + System.out.println(desc + cmdLine.usageMessage()); + System.exit(0); + } + if (cmdLine.errors() != null) { + System.err.println(cmdLine.errors() + cmdLine.usageMessage()); + System.exit(-1); + } + 
/**********************************************************************/ + + String inFile = cmdLine.getStringVal("input"); //input in str.combio format + String outFile = cmdLine.getStringVal("output"); + String scoreFile = cmdLine.getStringVal("var"); + + SequenceOutputStream out = SequenceOutputStream.makeOutputStream(outFile); + + //First read in the reference sequences + + int seed = cmdLine.getIntVal("seed"); + Random rnd; + if (seed <= 0) + rnd = new Random(); + else + rnd = new Random(seed); + + + BufferedReader sIn = new BufferedReader(new FileReader(scoreFile)); + //BioCompFileFormat fileFormat = + // new BioCompFileFormat(SequenceReader.openFile(inFile)); + // + // Iterator annoItr = fileFormat.getAnnotationIterator(); + // Iterator seqItr = fileFormat.getSequenceIterator(); + + JapsaFileFormat reader = new JapsaFileFormat(inFile); + + + for (int xx = 1; xx <= 5; xx++){ + JapsaAnnotation anno = reader.readAnnotation(); + Sequence seq = anno.getSequence(); + + // seqItr.next(); + //JapsaAnnotation anno = annoItr.next(); + + JapsaAnnotation newAnno = new JapsaAnnotation(); + int fIndex = 0; + JapsaFeature currentFeature = anno.getFeature(fIndex); + + //byte [] mSeq = japsa.seq.getSequence(); + byte [] newSeq = new byte[seq.length() + seq.length() / 2]; + int [] indexes = new int[newSeq.length];//mapping back the new japsa.seq -> old + + Arrays.fill(indexes, -1); + + //note posSrc'es are 0-based index + BufferedReader in = SequenceReader.openFile(xx+".sdi"); + String line; + int mPos = 0, newPos = 0; + + while (mPos < seq.length()){// + //assert nextSTR == null || nextSTR.getStart >= mPos + line = in.readLine(); + if (line == null){ + if (currentFeature != null){ + System.err.println("I am not done at " + mPos + " at " + currentFeature.getStart()); + } + while (mPos < seq.length()){ + newSeq[newPos] = (byte) Alphabet.DNA4().char2int(seq.charAt(mPos)); + indexes[newPos] = mPos; + + if (newSeq[newPos] < 3){ + newSeq[newPos] = (byte) rnd.nextInt(4); + } + 
newPos ++; + mPos ++; + } + break;//while (mPos < mSeq.length) + } + + String [] toks = line.trim().split("\\t"); + //do a check + if (!("Chr"+xx).equals(toks[0])){ + System.err.println("Wrong info 1 : " + line); + System.exit(1); + }//do a check + + //note STR ois 1-based index + int pos = Integer.parseInt(toks[1]) - 1;//convert to 0-based index + int length = Integer.parseInt(toks[2]); + + //copy up to this event + //assert nextSTR == null || nextSTR.getStart >= mPos + while (mPos < pos){ + if (currentFeature != null && mPos >= currentFeature.getStart() - 1){ + //infact, mPos == nextFeature.getStart() + + String scoreLine = sIn.readLine(); + String [] ts = scoreLine.split("\\t"); + int period = Integer.parseInt(ts[2]); + + double varScore = Double.parseDouble(ts[4]); + //scale such as + int diff = (int) (3 * varScore * (rnd.nextDouble() + .5) * 1.5);//one way to scale + + if (diff <= 0){//include varScore <=0 + //no change + diff = 0; + }else if (diff < currentFeature.getLength() / period && rnd.nextDouble() <= 0.5){ + //contraction + diff = -diff; + }//else expantion + //System.out.println(seqIdx+":D = " +diff + " P = " + period + " L = " + nextFeature.getLength() + " " + varScore); + + + JapsaFeature aFeature = currentFeature.cloneFeature(); + aFeature.setStart(newPos + 1); + aFeature.setEnd(newPos + currentFeature.getLength() + diff * period); + aFeature.addDesc("@DIF:"+diff); + aFeature.addDesc("@VR:"+period); + newAnno.add(aFeature); + + + int i = currentFeature.getLength() - 1; + int j = aFeature.getLength() - 1; + + for (;i >=0 && j >=0;){ + char nucleotide = Character.toUpperCase(seq.charAt(mPos + i)); + newSeq[newPos + j] = (byte) (Alphabet.DNA4().char2int(nucleotide)); + if (newSeq[newPos + j] < 0) + newSeq[newPos + j] = (byte) rnd.nextInt(4); + + indexes[newPos + j] = mPos + i; + + i--;j--; + + } + while(j >= 0){ + //expansion + if (i < 0) + i = period - 1; + char nucleotide = Character.toUpperCase(seq.charAt(mPos + i)); + newSeq[newPos + j] = (byte) 
(Alphabet.DNA4().char2int(nucleotide)); + + indexes[newPos + j] = mPos; + + if (newSeq[newPos + j] < 0) + newSeq[newPos + j] = (byte) rnd.nextInt(4); + + i--;j--; + }//while j + mPos += currentFeature.getLength(); + newPos += aFeature.getLength(); + + fIndex ++; + if (fIndex < anno.numFeatures()) + currentFeature = anno.getFeature(fIndex); + else + currentFeature = null; + + + continue; //while mPos < posSrc + }//if + + newSeq[newPos] = (byte)Alphabet.DNA4().char2int(seq.charAt(mPos)); + indexes[newPos] = mPos; + if (newSeq[newPos] < 0){ + newSeq[newPos] = (byte) rnd.nextInt(4); + } + newPos ++; + mPos ++; + + }//while mPos < posSrc + + if (mPos > pos) { + System.out.println("Ignore1 " + (pos + 1) + " " + line); + continue; + } + //assert mPos == posSrc && posSrc < currentFeature.start + + if (currentFeature != null && mPos + (toks[3].equals("-")?0:toks[3].length()) >= currentFeature.getStart()-1){ + System.out.println("Ignore2 " + (pos + 1) + " " + line); + continue;//ignore this because it overlaps with STR + } + + if (length == 0){ + //point mutation + char nucleotide = toks[3].charAt(0); + if (nucleotide != seq.charAt(mPos)){ + System.err.println("Wrong info 2: " + line + " : Expect " + seq.charAt(mPos) + " see " + nucleotide); + System.err.println(currentFeature.getStart()); + System.err.println("mPos is " + mPos); + + System.exit(1); + } + nucleotide = toks[4].charAt(0); + newSeq[newPos] = (byte)Alphabet.DNA4().char2int(nucleotide); + if (newSeq[newPos] < 0) + newSeq[newPos] = (byte) rnd.nextInt(4); + indexes[newPos] = mPos; + + JapsaFeature aFeature = new JapsaFeature(newPos + 1,1); + aFeature.setType("SNP"); + aFeature.setID("S"+(mPos+1)); + aFeature.addDesc(line); + newAnno.add(aFeature); + + mPos ++; + newPos ++; + }else if (length > 0){ + //insertion + + JapsaFeature aFeature = new JapsaFeature(newPos + 1,toks[4].length()); + aFeature.setType("INS"); + aFeature.setID("I"+(mPos+1)); + aFeature.addDesc(line); + newAnno.add(aFeature); + + if 
("-".equals(toks[3])){ + //straight insert + if (length != toks[4].length()){ + System.err.println("Wrong info 3: " + line + " : Expect length " + length + " see " + toks[4].length()); + System.exit(1); + } + if (length != toks[4].length()){ + System.err.println("Wrong info 4: " + line + " : Expect length " + length + " see " + toks[4].length()); + System.exit(1); + } + for (int t = 0; t < toks[4].length(); t++){ + char c = toks[4].charAt(t); + newSeq[newPos+t] = (byte)Alphabet.DNA4().char2int(c); + if (newSeq[newPos+t] < 0) + newSeq[newPos+t] = (byte) rnd.nextInt(4); + + indexes[newPos+t] = mPos; + } + newPos += length; + + }else{ + if (length + toks[3].length() != toks[4].length()){ + System.err.println("Wrong info 5: " + line + " : Expect length " + (length + toks[3].length()) + " see " + toks[4].length()); + System.exit(1); + } + for (int t = 0; t < toks[4].length(); t++){ + indexes[newPos+t] = mPos; + char c = toks[4].charAt(t); + newSeq[newPos+t] = (byte) Alphabet.DNA4().char2int(c); + if (newSeq[newPos+t] < 0) + newSeq[newPos+t] = (byte) rnd.nextInt(4); + } + newPos += toks[4].length(); + mPos += toks[3].length(); + } + }else{ + //do deletion + JapsaFeature aFeature = new JapsaFeature(newPos + 1,0); + aFeature.setType("DEL"); + aFeature.setID("D"+(mPos+1)); + aFeature.addDesc(line); + newAnno.add(aFeature); + + mPos+= toks[3].length(); + if (!"-".equals(toks[4])){ + //TODO: critical check below + //was aFeature.setLength(toks[4].length()); + aFeature.setEnd(newPos + toks[4].length()); + for (int t = 0; t < toks[4].length(); t++){ + char c = toks[4].charAt(t); + newSeq[newPos+t] =(byte) Alphabet.DNA4().char2int(c); + indexes[newPos+t] = mPos; + if (newSeq[newPos+t] < 0) + newSeq[newPos+t] = (byte) rnd.nextInt(4); + } + newPos += toks[4].length(); + } + } + }//while + Sequence newSequence = new Sequence(Alphabet.DNA4(), newSeq, newPos, "S"+xx); + newAnno.sortFeatures(); + JapsaAnnotation.write(newSequence, newAnno, out); + System.out.println("#" + 
newAnno.numFeatures()); + + for (int k = 0; k < newPos; k++){ + System.out.println((k +1) + " " + (indexes[k] +1)); + } + }//for + + out.close(); + sIn.close(); + reader.close(); + + + } +} + + diff --git a/src/dev/java/japsadev/tools/misc/TandemRepeatSim.java b/src/dev/java/japsadev/tools/misc/TandemRepeatSim.java new file mode 100644 index 0000000..153ec6d --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/TandemRepeatSim.java @@ -0,0 +1,228 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 18/10/2013 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.tools.misc; + +import japsa.bio.tr.TandemRepeat; +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; + +import java.io.IOException; +import java.util.Random; + + + +/** + * Generate a sequence with short tandem repeats + * The sequence is cut into numReps equal slots and one mutated repeat is planted at a random offset inside each slot + * @author Minh Duc Cao (http://www.caominhduc.org/) + */ +public class TandemRepeatSim { + + /** Entry point: parses the options, plants the repeats and writes OUTPUT.jsa (annotation) and OUTPUT.fas (FASTA), OUTPUT being the value of the output option + * @param args command line options, see the cmdLine.addXXX calls below + */ + public static void main(String[] args) throws IOException{ + /*********************** Setting up script ****************************/ + String scriptName = "japsa.sim trep"; + String desc = "A program to simulate tandem repeats\n"; + CommandLine cmdLine = new CommandLine("\nUsage: " + scriptName + " [options]"); + /**********************************************************************/ + + cmdLine.addDouble ("at", 0.4, "AT content, assume %A ~ %T, %C ~ %G "); + cmdLine.addInt ("length", 1000000, "Length of the sequence"); + cmdLine.addInt ("unit", 3, "Repeat unit length"); + cmdLine.addDouble ("meanLen",20, "Mean number of repeat units"); + cmdLine.addDouble ("stdLen",2, 
"Standard deviation of repeat units"); + cmdLine.addDouble ("repeatRatio", 0.05, "Propotion of repeat DNA"); + cmdLine.addString ("output", "output", "Name of output file"); + cmdLine.addDouble ("indel", 0.02, "rate of indels"); + cmdLine.addDouble ("subs", 0.05, "rate of substitutions"); + + /**********************************************************************/ + cmdLine.addStdHelp(); + args = cmdLine.parseLine(args); + if (cmdLine.getBooleanVal("help")){ + System.out.println(desc + cmdLine.usageMessage()); + System.exit(0); + } + if (cmdLine.errors() != null) { + System.err.println(cmdLine.errors() + cmdLine.usageMessage()); + System.exit(-1); + } + /**********************************************************************/ + + double at = cmdLine.getDoubleVal("at"); + double ratio = cmdLine.getDoubleVal("repeatRatio"); + + double rIndel = cmdLine.getDoubleVal("indel"); + double rSubs = cmdLine.getDoubleVal("subs"); + if (at < 0 || at > 1){ + System.err.println("AT content has to be between 0 and 1"); + System.exit(-1); + } + if (ratio < 0 || ratio > 1){ + System.err.println("Repeat ration has to be between 0 and 1"); + System.exit(-1); + } + + //length of sequence + int length = cmdLine.getIntVal("length"); + + //unit length + int unit = cmdLine.getIntVal ("unit"); + + //mean and std of repeats in units + double meanLen = cmdLine.getDoubleVal("meanLen"); + double stdLen = cmdLine.getDoubleVal("stdLen"); + if (unit <= 0 || meanLen <= 0){ System.err.println("Repeat unit length and mean number of repeat units have to be positive"); System.exit(-1); }//otherwise numReps and the slot size below are meaningless + //number of repeats + int numReps = (int) (length * ratio / unit / meanLen); + if (numReps > 0 && length / numReps <= 2 * unit){ System.err.println("Sequence too short to fit the requested repeats without overlap"); System.exit(-1); }//every slot must hold a 2-unit repeat plus at least one free base + //Generate the sequence + Alphabet alphabet = Alphabet.DNA4(); + double [] freqs = new double[4]; + freqs[0] = freqs[3] = at / 2.0; + freqs[1] = freqs[2] = (1.0 - at) / 2.0; + + Sequence seq = Sequence.random(alphabet, length, freqs); + + //System.out.println(numReps); + + //Generate repeats + Random rand = new Random(); + byte [] rUnits = new byte[unit]; + + JapsaAnnotation anno = new JapsaAnnotation(seq); + + //the generation of repeat start here + for (int n = 0; n < numReps; 
n++){ + + //generate the length of repeats + int rLen = (int) (rand.nextGaussian() * stdLen + meanLen); + //make sure rLen >= 2 + if (rLen < 2) + rLen = 2; + + //convert to nucleotide + rLen *= unit; + rLen = Math.min(rLen, (length / numReps - 1) / unit * unit);//cap at the largest whole number of units that is strictly shorter than this repeat's slot + //generate the position of repeats, making sure no repeats overlap + int pos = rand.nextInt(length / numReps - rLen);//bound is >= 1 thanks to the cap above; nextInt rejects a non-positive bound + pos += (length / numReps) * n; + + TandemRepeat rep = new TandemRepeat (seq.getName(), pos + 1, pos + rLen);// + String repStr = ""; + //get repeat units + int i; + for (i = 0; i < unit;i++){ + rUnits[i] = seq.getBase(pos + i); + repStr = repStr + alphabet.int2char(rUnits[i]); + } + + rep.addDesc("@U:"+repStr); + + repStr = repStr + "\t" + repStr+' '; + + + rep.setID("P"+(pos+1)); + rep.setPeriod(unit); + rep.setUnitNo(rLen / unit); + + anno.add(rep); + String change = "C:"; + + i = 0;//index to unit + + for (int p = unit; p < rLen; p++ ){ + //toss the coin + double chance = rand.nextDouble(); + + if (chance < rIndel / 2){ + //insertion + change = change + "I"+(p+1)+":"; + repStr += seq.charAt(pos + p); + + }else if (chance < rIndel){ + //deletion + change = change + "D"+p+":"; + p --; + i ++; + if (i >= unit){ + i = 0; + repStr += ' '; + } + + }else if(chance < rIndel + rSubs){ + //substitution + byte b = (byte) ((rUnits[i] + 1 + rand.nextInt(alphabet.size() - 1)) % alphabet.size()); + seq.setBase(pos + p, b); + change = change + "S"+(p+1)+":"; + repStr += seq.charAt(pos + p); + i ++; + if (i >= unit){ + i = 0; + repStr += ' '; + } + }else{ + //normal + seq.setBase(pos + p, rUnits[i]); + repStr += seq.charAt(pos + p); + + //advance i + i ++; + if (i >= unit){ + i = 0; + repStr += ' '; + } + } + }//for + + rep.addDesc(repStr); + rep.addDesc(change+"L"+rLen); + } + + SequenceOutputStream out = SequenceOutputStream.makeOutputStream(cmdLine.getStringVal("output")+".jsa"); + JapsaAnnotation.write(seq, anno, out) ; + out.close(); + + out = SequenceOutputStream.makeOutputStream(cmdLine.getStringVal("output")+".fas"); + seq.writeFasta(out); + 
out.close(); + } + +} diff --git a/src/dev/java/japsadev/tools/misc/VNTRReadDepth.java b/src/dev/java/japsadev/tools/misc/VNTRReadDepth.java new file mode 100644 index 0000000..a69193a --- /dev/null +++ b/src/dev/java/japsadev/tools/misc/VNTRReadDepth.java @@ -0,0 +1,512 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools.misc; + +import java.io.BufferedReader; + +import japsa.seq.SequenceReader; + +import org.apache.commons.math3.distribution.BetaDistribution; +import org.apache.commons.math3.distribution.PoissonDistribution; +import org.apache.commons.math3.special.Gamma; +import org.apache.commons.math3.util.FastMath; +import org.apache.commons.math3.util.MathUtils; + +/** + * @author minhduc + * + */ +public class VNTRReadDepth { + public static double logBin(int k, int n, double p){ + return + logBinomialProbability(k,n, p, 1.0 - p); + } + + public static void main(String[] args) throws Exception{ + String totFile = "/home/minhduc/Projects/data/Genomes/TB/collections/TB_samples.fdepth"; + String depthFile = "/home/minhduc/Projects/data/Genomes/TB/collections/TB_samples.rdepth"; + String lengthFile = "/home/minhduc/Projects/data/Genomes/TB/collections/TB_samples.length"; + + BufferedReader inAns = SequenceReader.openFile(totFile); + int [][] tot = new int[24][32]; + int [][] depth = new int[24][32]; + int [][] length = new int[24][32]; + + + BufferedReader in = SequenceReader.openFile(totFile); + in.readLine();//header + for (int r = 0; r < 24; r++){ + String [] toks = in.readLine().trim().split("\t"); + for (int l = 0; l < 32;l++) + tot[r][l] = Integer.parseInt(toks[l]); + } + in.close(); + + in = SequenceReader.openFile(depthFile); + in.readLine();//header + for (int r = 0; r < 24; r++){ + String [] toks = in.readLine().trim().split("\t"); + for (int l = 0; l < 32;l++) + depth[r][l] = Integer.parseInt(toks[l]); + } + in.close(); + + in = SequenceReader.openFile(lengthFile); + in.readLine();//header + for (int r = 0; r < 24; r++){ + String [] toks = in.readLine().trim().split("\t"); + for (int l = 0; l < 32;l++) + length[r][l] = 
Integer.parseInt(toks[l]); + } + in.close(); + + int index = 5; + + int wrong = 0, right = 0; + for (int r = 0; r < 24; r++){ + for (int c = 6; c < 32; c++){ + double p = (depth[r][c] * 1.0 / tot[r][c]) / (depth[r][index] * 1.0 / tot[r][index]); + double t = length[r][c] * 1.0 / length[r][index]; + System.out.printf("%4.2f vs %4.2f\t",p,t); + if (0.9 < t && t < 1.1){//Nova + if (0.8 < p && p < 1.2) right ++; + else wrong ++; + } + if (t>=1.1){ + if (p >= 1.1) right ++; + else wrong ++; + } + if (t<=0.9){ + if (p <0.9) right ++; + else wrong ++; + } + + }//for c + System.out.println(); + }//for r + System.out.println(right + " " + wrong); + inAns.close(); + } + + + + public static void main1(String[] args) throws Exception{ + int readLength = 100; + + String ansFile = "/home/minhduc/Projects/data/Genomes/TB/collections/Miru24.dat"; + String datFile = "/home/minhduc/Projects/data/Genomes/TB/collections/TB_samples.depths"; + + BufferedReader inAns = SequenceReader.openFile(ansFile); + + String aLine = inAns.readLine().trim(); + + String [] strains = {"H37Rv", "Erdman", "Haarlem", "KZN_1435", "W_148"}; + double [] strainType = new double[strains.length]; + inAns.readLine(); + + BufferedReader inDat = SequenceReader.openFile(datFile); + String line = inDat.readLine().trim().substring(3); + + String [] samples = line.split("\t"); + + double [] tot = new double[samples.length]; + double [] depth = new double[samples.length]; + + int numRep = 24; + + //Read in total depth + line = inDat.readLine().trim(); + String[] toks = line.split("\t"); + for (int i = 0; i < tot.length; i++) + tot[i] = Double.parseDouble(toks[i]) / readLength; + + int rightP = 0, wrongP = 0; + int noChange = 0; + for (int r = 0 ; r < numRep; r++){ + //get rep information + aLine = inAns.readLine().trim(); + toks = aLine.split("\t"); + + double repeatRef = Double.parseDouble(toks[2]) - Double.parseDouble(toks[1]); + double repeatUnit = Double.parseDouble(toks[3]); + + for (int i = 0; i< strains.length; 
i++) + strainType[i] = Double.parseDouble(toks[6+i]); + + //Read read depth + line = inDat.readLine().trim(); + toks = line.split("\t"); + for (int i = 0; i< depth.length; i++){ + depth[i] = Double.parseDouble(toks[i]) / readLength; + } + + //Analysis + int ref = 4; + for (int i = 5; i< depth.length; i++){ + //System.out.print(depthBin2(depth[i], tot[i], depth[ref], tot[ref],1)+"," + depthBin2(depth[i], tot[i], depth[ref], tot[ref],100)+"\t"); + + int index = 0; + if (samples[i].startsWith("E")) + index = 1; + else if (samples[i].startsWith("H")) + index = 2; + else if (samples[i].startsWith("K")) + index = 3; + else if (samples[i].startsWith("W")) + index = 4; + + double actualLength = repeatRef + repeatUnit * strainType[index]; + //expct actualLength / repeatRef ~ (Ds/Ts)/(Dr/Tr) + //System.out.print(actualLength/repeatRef + "," + actualLength + "," +repeatRef + "," + ((depth[i]/tot[i])/(depth[ref]/tot[ref]))+"," + strainType[index] + + // "#" + (depth[i]+1) + "," + (tot[i]+1) + "," + (depth[ref] + 1) + "," + (tot[ref] + 1)+"\t"); + + double ratio = depthBeta(depth[i], tot[i], depth[ref], tot[ref]); + if (actualLength / repeatRef > 1.1) { + if (ratio > 1){ + rightP ++; + }else + wrongP ++; + }else if (repeatRef / actualLength > 1.1) { + if (ratio > 1){ + wrongP ++; + }else + rightP ++; + } else{ + //if (ratio > 1.1 || ratio < 0.9) + // wrongP ++; + //else + // rightP++; + + } + + + System.out.print(actualLength/repeatRef + " vs " + depthBeta(depth[i], tot[i], depth[ref], tot[ref]) + "\t"); + + + //double odd = - depthBin(depth[i], tot[i], depth[ref], tot[ref],1) + depthBin3(depth[i], tot[i], depth[ref], tot[ref],actualLength, repeatRef,1); + //if (odd > 0.001) { + // rightP ++; + //} + //if (odd < -0.001) { + // wrongP ++; + //} + } + System.out.println(); + } + System.out.println(rightP + " vs " + wrongP + " vs " + noChange); + } + + public static double depthPoisson(int depthS, int totDepthS, double depthR, double totDepthR, int numIt){ + double lamda = 
totDepthS * depthR / totDepthR; + //if (numIt <=1){ + PoissonDistribution pd = new PoissonDistribution(lamda); + return pd.probability(depthS); + //} + } + + public static double depthBeta(double depthS, double totDepthS, double depthR, double totDepthR){ + double ps = (depthS + 1) / (totDepthS + 1); + double pr = (depthR + 1) / (totDepthR + 1); + + return ps/ pr; + } + + //public static double depthBin4(int depthS, double totDepthS, double depthR, double totDepthR, double s, double r, int numIt){ + // + // + //} + + /** + * log likelihood of P(dS | totS, dR, totR) ~ Bin(dS|totS, p) where p ~ Beta(dR + 1, totR - dR + 1) + * @param depthS + * @param totDepthS + * @param depthR + * @param totDepthR + * @return + */ + public static double depthBin(double depthS, double totDepthS, double depthR, double totDepthR, int numIt){ + double p = depthR / totDepthR; + if (numIt <=1) + return - logBin((int)depthS, (int)totDepthS, p); + + BetaDistribution beta = new BetaDistribution(depthR + 1, totDepthR - depthR + 1); + int sum = 0; + int ids = (int) depthS; + int its = (int) depthS; + for (int i = 0; i < numIt; i++){ + p = beta.sample(); + sum -= logBin(ids, its, p); + } + //Note: this is the average of logProb, LC's version is average of prob + return sum / numIt; + } + + + /** + * P(dS | totS, dR, totR) ~ Bin (dS|dS + dR, p) where p ~ Beta(totS+1, totR+1) + * @param depthS + * @param totDepthS + * @param depthR + * @param totDepthR + * @param numIt + * @return + */ + public static double depthBin2(double depthS, double totDepthS, double depthR, double totDepthR, int numIt){ + int ids = (int) depthS; + int idr = (int) depthR; + + double p = totDepthS / (totDepthS + totDepthR); + + if (numIt <=1) + return -logBin(ids, (ids + idr), p); + + BetaDistribution beta = new BetaDistribution(totDepthS + 1, totDepthR + 1); + int sum = 0; + for (int i = 0; i < numIt; i++){ + p = beta.sample(); + sum -= logBin(ids, (ids + idr), p); + } + //Note: this is the average of logProb, LC's version 
is average of prob + return sum / numIt; + } + + + public static double depthBin3(double depthS, double totDepthS, double depthR, double totDepthR, double s, double r, int numIt){ + double p = (depthR / totDepthR) * (s/r); + + if (numIt <=1) + return - logBin((int)depthS, (int) totDepthS, p); + + BetaDistribution beta = new BetaDistribution(depthR + 1, totDepthR - depthR + 1); + int sum = 0; + int ids = (int) depthS; + int its = (int) totDepthS; + for (int i = 0; i < numIt; i++){ + p = beta.sample() * s/r; + sum -= logBin(ids, its, p); + } + //Note: this is the average of logProb, LC's version is average of prob + return sum / numIt; + } + + /************************************************************************ + double countnormal = 1, totnormal = 2; + + public double probability(double lrr, double baf) { + double cellularity = vals[0]; + double ratio = vals[1]; + if(ratioAsLevels) ratio = ratios[backCN]*ratio; + else if(cellAsLevels){ + cellularity = Math.min(1.0, ratios[backCN]*cellularity); + //if(cellularity>1) cellularity = 1.0/cellularity; + } + double mult = ((rcn/ratio)*cellularity) + (1-cellularity); + + double k = baf;//counttumour; + double n = lrr;// tottumour; + + if(numIt==1){ + double p = (countnormal/totnormal)* mult; + double v = k *Math.log(p)+(n-k)*Math.log(1-p); + return v; + + }else{ + double maxv = Double.NEGATIVE_INFINITY; + b.setState((countnormal+1)/betaDownWeight, ((totnormal-countnormal)+1)/betaDownWeight); + for(int j = 0; jmaxv) maxv = v; + logprobs[j] = v; + } + double sum=0; + for(int j =0; j + * References: + *
    + *
  1. Eric W. Weisstein. "Stirling's Series." From MathWorld--A Wolfram Web + * Resource. + * http://mathworld.wolfram.com/StirlingsSeries.html
  2. + *
+ *

+ * + * @param z the value. + * @return the Striling's series error. + */ + static double getStirlingError(double z) { + double ret; + if (z < 15.0) { + double z2 = 2.0 * z; + if (FastMath.floor(z2) == z2) { + ret = EXACT_STIRLING_ERRORS[(int) z2]; + } else { + ret = Gamma.logGamma(z + 1.0) - (z + 0.5) * FastMath.log(z) + + z - HALF_LOG_2_PI; + } + } else { + double z2 = z * z; + ret = (0.083333333333333333333 - + (0.00277777777777777777778 - + (0.00079365079365079365079365 - + (0.000595238095238095238095238 - + 0.0008417508417508417508417508 / + z2) / z2) / z2) / z2) / z; + } + return ret; + } + + /** + * A part of the deviance portion of the saddle point approximation. + *

+ * References: + *

    + *
  1. Catherine Loader (2000). "Fast and Accurate Computation of Binomial + * Probabilities.". + * http://www.herine.net/stat/papers/dbinom.pdf
  2. + *
+ *

 + * + * @param x the x value. + * @param mu the average. + * @return a part of the deviance. + */ + static double getDeviancePart(double x, double mu) { + double ret; + if (FastMath.abs(x - mu) < 0.1 * (x + mu)) { + double d = x - mu; + double v = d / (x + mu); + double s1 = v * d; + double s = Double.NaN; + double ej = 2.0 * x * v; + v = v * v; + int j = 1; + while (s1 != s) { + s = s1; + ej *= v; + s1 = s + ej / ((j * 2) + 1); + ++j; + } + ret = s1; + } else { + ret = x * FastMath.log(x / mu) + mu - x; + } + return ret; + } + + /** + * Compute the logarithm of the PMF for a binomial distribution + * using the saddle point expansion. + * No argument validation is done here: x == 0 and x == n are special-cased, every other x takes the Stirling-error/deviance branch. + * @param x the value at which the probability is evaluated. + * @param n the number of trials. + * @param p the probability of success. + * @param q the probability of failure (1 - p). + * @return log(p(x)). + */ + static double logBinomialProbability(int x, int n, double p, double q) { + double ret; + if (x == 0) { + if (p < 0.1) { + ret = -getDeviancePart(n, n * q) - n * p; + } else { + ret = n * FastMath.log(q); + } + } else if (x == n) { + if (q < 0.1) { + ret = -getDeviancePart(n, n * p) - n * q; + } else { + ret = n * FastMath.log(p); + } + } else { + ret = getStirlingError(n) - getStirlingError(x) - + getStirlingError(n - x) - getDeviancePart(x, n * p) - + getDeviancePart(n - x, n * q); + double f = (MathUtils.TWO_PI * x * (n - x)) / n; + ret = -0.5 * FastMath.log(f) + ret; + } + return ret; + } +} diff --git a/src/dev/java/japsadev/tools/work/BuildMLSTTreeCmd.java b/src/dev/java/japsadev/tools/work/BuildMLSTTreeCmd.java new file mode 100644 index 0000000..2794e67 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/BuildMLSTTreeCmd.java @@ -0,0 +1,138 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools.work; + + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.HashMap; + +import japsa.bio.amra.MLSTyping; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Write one FASTA file per species (ST_species.fasta) holding, for every distinct MLST profile listed in the + * index2Names file, the concatenation of the profile's seven allele sequences taken from the scheme under base/species + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.treest", + scriptDesc = "Build tree of MLST profiles" + ) +public class BuildMLSTTreeCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(BuildMLSTTreeCmd.class); + + //CommandLine cmdLine; + public BuildMLSTTreeCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("base", null, "base"); + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + BuildMLSTTreeCmd cmdLine = new BuildMLSTTreeCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String base = cmdLine.getStringVal("base"); + BufferedReader br = SequenceReader.openFile("index2Names"); + + HashMap<String, MLSTyping> mlstMap = new HashMap<String, MLSTyping>();//species (column 1) -> its MLST scheme, loaded once + HashMap<String, SequenceBuilder> profileMap = new HashMap<String, SequenceBuilder>();//"species#profile" -> concatenated allele sequences + + String line; + while ( (line = br.readLine())!=null){ + String [] toks = line.trim().split(" "); + + MLSTyping mlst = mlstMap.get(toks[1]); + if (mlst == null){ + mlst = new MLSTyping(base + "/" + toks[1]); + mlstMap.put(toks[1], mlst); + } + 
//mlst + + String profile = toks[1] + "#" + toks[4]; + if (!profileMap.containsKey(profile)){ + LOG.info("Profile " + profile); + SequenceBuilder sb = new SequenceBuilder(Alphabet.DNA4(), 1000, toks[3]); + String [] genesNames = toks[4].split("\\|"); + for (int x =0; x < 7;x++){ + int sepPos = genesNames[x].lastIndexOf('_'); + int alleleNo = Integer.parseInt(genesNames[x].substring(sepPos+1)); + int alleleIndex = mlst.alleleNo2AlleleIndex(x, alleleNo); + Sequence seq = mlst.alleles(x).get(alleleIndex); + if (!genesNames[x].equals(seq.getName())){ + LOG.error("Error at " + genesNames[x] + " vs " + seq.getName()); + System.exit(1); + } + LOG.info("Found " + seq.getName()); + sb.append(seq); + } + profileMap.put(profile, sb); + } + } + + br.close(); + + HashMap<String, SequenceOutputStream> outMap = new HashMap<String, SequenceOutputStream>();//species -> its open ST_species.fasta stream + for (String key:profileMap.keySet()){ + String [] toks = key.split("#"); + String species = toks[0]; + SequenceOutputStream myOut = outMap.get(species); + if (myOut == null){ + myOut = SequenceOutputStream.makeOutputStream("ST_"+species + ".fasta"); + outMap.put(species, myOut); + } + + profileMap.get(key).writeFasta(myOut); + }//for + + for (SequenceOutputStream myOut:outMap.values()){ + myOut.close(); + } + } +} diff --git a/src/dev/java/japsadev/tools/work/BuildXMTreeCmd.java b/src/dev/java/japsadev/tools/work/BuildXMTreeCmd.java new file mode 100644 index 0000000..807ff77 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/BuildXMTreeCmd.java @@ -0,0 +1,210 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/* Revision History
+ * 28/05/2014 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsadev.tools.work;
+
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Turn per-taxon <code>rdistance.&lt;i&gt;.out</code> files into two pairwise
+ * distance matrices.
+ * <p>
+ * Taxa are listed in the file <code>index2Names</code> of the working
+ * directory, one per line, split on single spaces: field 0 is the ID used in
+ * the distance files; field 3 is an underscore-separated name whose parts 0
+ * and 2 make up the label written to the matrices. The line number (from 0) is
+ * the index of the taxon.
+ * <p>
+ * <code>rdistance.&lt;i&gt;.out</code> belongs to taxon <code>i</code>. Its
+ * first line has three tab-separated fields (field 1 = length of the taxon,
+ * field 2 = the value kept as its "info"). Each further line has four
+ * tab-separated fields: the ID of a taxon with a smaller index, the length of
+ * that taxon and two values from which the distances are derived. Reading
+ * stops at the first file that is missing, malformed or incomplete; the taxa
+ * read up to that point are written as square matrices to
+ * <code>dis1&lt;output&gt;</code> and <code>dis2&lt;output&gt;</code>.
+ *
+ * @author minhduc
+ */
+@Deployable(
+    scriptName = "jsa.dev.treexm",
+    scriptDesc = "Build tree of XM"
+    )
+public class BuildXMTreeCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(BuildXMTreeCmd.class);
+
+    public BuildXMTreeCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("output", null, "output",true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Entry point; see the class comment for the input and output files.
+     *
+     * @param args command line arguments, parsed by {@link #stdParseLine}
+     * @throws IOException if an input cannot be read or an output cannot be created
+     */
+    public static void main(String [] args) throws IOException{
+        BuildXMTreeCmd cmdLine = new BuildXMTreeCmd ();
+        args = cmdLine.stdParseLine(args);
+
+        /**********************************************************************/
+        HashMap<String, Integer> id2Index = new HashMap<String, Integer>();
+        ArrayList<String> nameList = new ArrayList<String>();
+
+        LOG.info("point 1");
+        BufferedReader br = SequenceReader.openFile("index2Names");
+        try{
+            String line;
+            while ((line = br.readLine())!=null){
+                String [] toks = line.trim().split(" ");
+                String [] nameParts = toks[3].split("_");
+
+                // the index of a taxon is its position in nameList
+                id2Index.put(toks[0], nameList.size());
+                nameList.add(nameParts[0] + "_" + nameParts[2]);
+            }
+        }finally{
+            br.close();
+        }
+        int noTaxa = nameList.size();
+
+        double [] myInfo = new double[noTaxa];
+
+        // lower triangles: row i holds the distances of taxon i to taxa 0..i-1
+        double [][] disMeasure1 = new double[noTaxa][];
+        double [][] disMeasure2 = new double[noTaxa][];
+
+        LOG.info("point 2");
+        int index = 0;
+        while (index < noTaxa){
+            File file = new File("rdistance."+index+".out");
+            if (!file.exists()){
+                break;//no more
+            }
+
+            LOG.info("Read " + index);
+            if (!readDistances(file, index, id2Index, myInfo, disMeasure1, disMeasure2)){
+                break;
+            }
+            index++;
+        }
+
+        System.out.println(index);
+
+        String output = cmdLine.getStringVal("output");
+        writeMatrix("dis1" + output, nameList, disMeasure1, index);
+        writeMatrix("dis2" + output, nameList, disMeasure2, index);
+    }
+
+    /**
+     * Read the distance file of taxon <code>index</code> and fill in row
+     * <code>index</code> of both matrices as well as <code>myInfo[index]</code>.
+     * The file is closed on every path.
+     *
+     * @return true if the file was well formed and had at least
+     *         <code>index</code> mate lines; false if the taxon, and with it
+     *         all following taxa, must be left out
+     * @throws IOException if the file cannot be read
+     */
+    private static boolean readDistances(File file, int index, HashMap<String, Integer> id2Index,
+            double [] myInfo, double [][] disMeasure1, double [][] disMeasure2) throws IOException{
+        BufferedReader reader = new BufferedReader(new FileReader(file));
+        try{
+            String line = reader.readLine();
+            if (line == null){
+                return false;//empty file
+            }
+            String [] toks = line.trim().split("\t");
+            if (toks.length != 3){
+                return false;
+            }
+            int myLength = Integer.parseInt(toks[1]);
+            myInfo [index] = Double.parseDouble(toks[2]);
+
+            disMeasure1[index] = new double[index];
+            disMeasure2[index] = new double[index];
+
+            int count = 0;
+            while ((line = reader.readLine())!=null){
+                toks = line.trim().split("\t");
+                if (toks.length != 4){
+                    return false;
+                }
+                // Only taxa already read (smaller index) have a column in this row
+                // and a known info value; any other ID makes the file unusable.
+                Integer mate = id2Index.get(toks[0]);
+                if (mate == null || mate.intValue() >= index){
+                    return false;
+                }
+                int mateIndex = mate.intValue();
+                int mateLength = Integer.parseInt(toks[1]);
+
+                double ij = Double.parseDouble(toks[2]);
+                double ji = Double.parseDouble(toks[3]);
+
+                disMeasure1[index][mateIndex] = (ji + ij) /(myInfo[index] + myInfo[mateIndex]);
+                disMeasure2[index][mateIndex] = (ji * myLength + ij * mateLength) /(myInfo[index] * myLength + myInfo[mateIndex] * mateLength);
+
+                count ++;
+            }
+            return count >= index;
+        }finally{
+            reader.close();
+        }
+    }
+
+    /**
+     * Write the first <code>size</code> taxa as a full square matrix: a line
+     * with the number of taxa, then one line per taxon with its label followed
+     * by <code>size</code> distances. The matrix is mirrored from the lower
+     * triangle <code>dis</code> and has 0 on the diagonal.
+     *
+     * @throws IOException if the file cannot be created
+     */
+    private static void writeMatrix(String fileName, ArrayList<String> names, double [][] dis, int size) throws IOException{
+        PrintStream out =
+            new PrintStream(new BufferedOutputStream(
+                new FileOutputStream(fileName)));
+        try{
+            out.println(" " + size);
+            for (int s = 0; s < size; s++) {
+                out.printf("%-12s ", names.get(s));
+                for (int x = 0; x < size; x++) {
+                    if (x < s)
+                        out.printf(" %10f ", dis[s][x]);
+                    else if (x==s)
+                        out.printf(" %10f ", 0.0);
+                    else
+                        out.printf(" %10f ", dis[x][s]);
+                }
+                out.println();
+            }
+        }finally{
+            out.close();
+        }
+    }
+}
diff --git a/src/dev/java/japsadev/tools/work/FixFastqNameCmd.java b/src/dev/java/japsadev/tools/work/FixFastqNameCmd.java
new file mode 100644
index 0000000..5c1a9de
--- /dev/null
+++
b/src/dev/java/japsadev/tools/work/FixFastqNameCmd.java @@ -0,0 +1,140 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/* Revision History
+ * 11/01/2012 - Minh Duc Cao: Revised
+ * 01/01/2013 - Minh Duc Cao, revised
+ ****************************************************************************/
+
+package japsadev.tools.work;
+
+import japsa.seq.Alphabet;
+import japsa.seq.FastqReader;
+import japsa.seq.FastqSequence;
+import japsa.seq.SequenceOutputStream;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Separate the reads of two fastq files into pairs and singletons, and
+ * normalise the read names on the way.
+ * <p>
+ * A read name is split on spaces and dots; token 1 is parsed as the read
+ * number and the read is renamed to <code>&lt;token0&gt;.&lt;token1&gt;</code>.
+ * Both files are merged by read number, so they must be sorted by increasing
+ * read number. Reads present in both files go to
+ * <code>&lt;output&gt;_1.fq.gz</code> and <code>&lt;output&gt;_2.fq.gz</code>;
+ * reads present in file 1 only go to <code>&lt;output&gt;_S.fq.gz</code>. A
+ * read of file 2 without a mate in file 1 is reported as an error and the
+ * program exits with status 1.
+ *
+ * @author Minh Duc Cao
+ *
+ */
+@Deployable(
+    scriptName = "jsa.dev.fixfastq",
+    scriptDesc = "Fix fastq files for the plasmaDNA samples"
+    )
+public class FixFastqNameCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(FixFastqNameCmd.class);
+
+    /** Regular expression separating the tokens of a read name: a space or a dot. */
+    private static final String NAME_SEPARATOR = " |\\.";
+
+    public FixFastqNameCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " file1 file2");
+        setDesc(annotation.scriptDesc());
+
+        addString("output", "output", "Prefix of the output");
+        addStdHelp();
+    }
+
+    /**
+     * Read the next record and rename it to <code>&lt;token0&gt;.&lt;token1&gt;</code>.
+     *
+     * @return the renamed record, or null at the end of the file
+     * @throws IOException if the record cannot be read or its name has no second token
+     */
+    private static FastqSequence nextRead(FastqReader reader) throws IOException{
+        FastqSequence seq = reader.nextSequence(Alphabet.DNA());
+        if (seq != null){
+            String [] toks = seq.getName().split(NAME_SEPARATOR);
+            if (toks.length < 2){
+                throw new IOException("No read number in read name '" + seq.getName() + "'");
+            }
+            seq.setName(toks[0]+"."+toks[1]);
+        }
+        return seq;
+    }
+
+    /**
+     * Read number of a record returned by {@link #nextRead}: the part of the
+     * normalised name after its only dot.
+     */
+    private static int readNo(FastqSequence seq){
+        String name = seq.getName();
+        return Integer.parseInt(name.substring(name.lastIndexOf('.') + 1));
+    }
+
+    /**
+     * Entry point; expects the two fastq files as positional arguments.
+     *
+     * @throws IOException if a file cannot be read or written
+     */
+    public static void main(String[] args) throws IOException {
+
+        /*********************** Setting up script ****************************/
+        CommandLine cmdLine = new FixFastqNameCmd();
+        args = cmdLine.stdParseLine(args);
+        if (args.length != 2){
+            System.err.println(cmdLine.usage());
+            System.exit(1);
+        }
+        /**********************************************************************/
+        String output = cmdLine.getStringVal("output");
+
+        SequenceOutputStream out1 = SequenceOutputStream.makeOutputStream(output + "_1.fq.gz");
+        SequenceOutputStream out2 = SequenceOutputStream.makeOutputStream(output + "_2.fq.gz");
+        SequenceOutputStream out = SequenceOutputStream.makeOutputStream(output + "_S.fq.gz");
+
+        FastqReader reader1 = new FastqReader(args[0]);
+        FastqReader reader2 = new FastqReader(args[1]);
+
+        // End of file is tracked with null records rather than with read number 0,
+        // so an empty file or a file 1 that ends first no longer dereferences null.
+        boolean failed = false;
+        try{
+            FastqSequence seq2 = nextRead(reader2);
+            FastqSequence seq1 = nextRead(reader1);
+
+            while (seq1 != null){
+                if (seq2 == null || readNo(seq1) < readNo(seq2)){
+                    //read of file 1 without a mate in file 2
+                    seq1.print(out);
+                    seq1 = nextRead(reader1);
+                }else if (readNo(seq1) == readNo(seq2)){
+                    seq1.print(out1);
+                    seq2.print(out2);
+
+                    seq1 = nextRead(reader1);
+                    seq2 = nextRead(reader2);
+                }else{
+                    LOG.error("Dont understand this " + readNo(seq1) + " vs " + readNo(seq2));
+                    failed = true;
+                    break;
+                }
+            }//while
+
+            if (!failed && seq2 != null){
+                //file 1 is exhausted but file 2 is not: same situation as the branch above
+                LOG.error("Dont understand this: " + args[1] + " still has read " + readNo(seq2) + " after the end of " + args[0]);
+                failed = true;
+            }
+        }finally{
+            // Closing before exiting lets the gzip streams write what they have buffered.
+            reader1.close();reader2.close();out.close();out1.close();out2.close();
+        }
+
+        if (failed){
+            System.exit(1);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/dev/java/japsadev/tools/work/FixNamesTreeCmd.java b/src/dev/java/japsadev/tools/work/FixNamesTreeCmd.java
new file mode 100644
index 0000000..eee09b4
--- /dev/null
+++ b/src/dev/java/japsadev/tools/work/FixNamesTreeCmd.java
@@ -0,0 +1,273 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/* Revision History
+ * 28/05/2014 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsadev.tools.work;
+
+
+import java.io.BufferedReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import japsa.bio.phylo.PhylogenyTree;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Rename or prune the leaves of phylogeny trees. The tree files are given as
+ * positional arguments and the results are printed to standard output, each
+ * tree followed by ';'. What is done depends on the option <code>mode</code>:
+ * <ul>
+ * <li><code>name</code> (one tree): rename the leaves using the file
+ * <code>index2Names</code> of the working directory;</li>
+ * <li><code>trip</code> (one tree): of the leaves that share the part of their
+ * name before the first '_', keep only the first one;</li>
+ * <li><code>compare</code> (two trees): remove from each tree the leaves whose
+ * names do not occur in the other tree.</li>
+ * </ul>
+ *
+ * @author minhduc
+ */
+@Deployable(
+    scriptName = "jsa.dev.nametaxa",
+    scriptDesc = "Fix names"
+    )
+public class FixNamesTreeCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(FixNamesTreeCmd.class);
+
+    public FixNamesTreeCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("mode", null, "Mode: name, compare, trip",true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Entry point: dispatches on the option <code>mode</code>. Exits with
+     * status 1 if the mode is unknown or too few tree files are given.
+     */
+    public static void main(String [] args) throws Exception{
+        FixNamesTreeCmd cmdLine = new FixNamesTreeCmd ();
+        args = cmdLine.stdParseLine(args);
+
+        String mode = cmdLine.getStringVal("mode");
+        /**********************************************************************/
+        if ("name".equals(mode)){
+            requireTreeFiles(cmdLine, args, 1);
+            renameLeaves(args[0]);
+        }else if ("trip".equals(mode)){
+            requireTreeFiles(cmdLine, args, 1);
+            tripDuplicates(args[0]);
+        }else if ("compare".equals(mode)){
+            requireTreeFiles(cmdLine, args, 2);
+            compareTrees(args[0], args[1]);
+        }else{
+            // used to fall through all three tests and silently do nothing
+            LOG.error("Unknown mode " + mode);
+            System.err.println(cmdLine.usage());
+            System.exit(1);
+        }
+    }
+
+    /** Print the usage and exit with status 1 unless at least <code>needed</code> tree files were given. */
+    private static void requireTreeFiles(CommandLine cmdLine, String [] args, int needed){
+        if (args.length < needed){
+            LOG.error("This mode needs " + needed + " tree file(s), got " + args.length);
+            System.err.println(cmdLine.usage());
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Mode "name". A line of <code>index2Names</code> is split on single
+     * spaces and its field 3 on '_' into parts p0, p1 and p2. A leaf named
+     * either <code>p0_p2</code> or exactly field 3 is renamed to
+     * <code>&lt;field 1&gt;_p1_p2</code>; the short form is tried first.
+     */
+    private static void renameLeaves(String treeFile) throws Exception{
+        HashMap<String, String> key2Name = new HashMap<String, String>();
+        HashMap<String, String> name2Name = new HashMap<String, String>();
+
+        BufferedReader br = SequenceReader.openFile("index2Names");
+        try{
+            String line;
+            while ((line = br.readLine())!=null){
+                String [] toks = line.trim().split(" ");
+                String name = toks[3];
+                String [] parts = name.split("_");
+                String newName = toks[1]+ "_" + parts[1] + "_" + parts[2];
+                key2Name.put(parts[0] + "_" + parts[2], newName);
+                name2Name.put(name, newName);
+            }
+        }finally{
+            br.close();
+        }
+
+        PhylogenyTree tree = PhylogenyTree.readFromFile(treeFile);
+        ArrayList<PhylogenyTree> leaves = tree.getLeaves();
+        for (PhylogenyTree leaf:leaves){
+            String lName = leaf.getName();
+            String name = key2Name.get(lName);
+            if (name != null){
+                LOG.info(lName + " found in key ");
+                leaf.setName(name);
+            }else{
+                name = name2Name.get(lName);
+                if (name != null){
+                    LOG.info(lName + " found in name ");
+                    leaf.setName(name);
+                }else{
+                    LOG.info(lName + " not found");
+                }
+            }
+        }
+        System.out.println(tree.toString() + ";");
+    }
+
+    /**
+     * Mode "trip": remove every leaf whose name prefix (the text before the
+     * first '_') has been seen on an earlier leaf.
+     */
+    private static void tripDuplicates(String treeFile) throws Exception{
+        PhylogenyTree tree0 = PhylogenyTree.readFromFile(treeFile);
+        HashSet<String> leafSet0 = new HashSet<String>();
+        ArrayList<PhylogenyTree> leaves0 = tree0.getLeaves();
+        for (PhylogenyTree leaf:leaves0){
+            String name = leaf.getName().split("_")[0];
+            if (!leafSet0.add(name)){
+                tree0 = removeLeaf(tree0, leaf, name);
+            }
+        }
+
+        System.out.println(tree0 + ";");
+    }
+
+    /**
+     * Mode "compare": reduce both trees to the leaf names they have in common
+     * and print the first tree, then the second.
+     */
+    private static void compareTrees(String treeFile0, String treeFile1) throws Exception{
+        PhylogenyTree tree0 = PhylogenyTree.readFromFile(treeFile0);
+        ArrayList<PhylogenyTree> leaves0 = tree0.getLeaves();
+        HashSet<String> leafSet0 = leafNames(leaves0);
+
+        PhylogenyTree tree1 = PhylogenyTree.readFromFile(treeFile1);
+        ArrayList<PhylogenyTree> leaves1 = tree1.getLeaves();
+        HashSet<String> leafSet1 = leafNames(leaves1);
+
+        // The name sets are taken before anything is removed, so each tree is
+        // compared with the other tree as it was read.
+        tree1 = retainLeaves(tree1, leaves1, leafSet0);
+        tree0 = retainLeaves(tree0, leaves0, leafSet1);
+
+        System.out.println(tree0 + ";");
+        System.out.println(tree1 + ";");
+    }
+
+    /** Names of the given leaves. */
+    private static HashSet<String> leafNames(ArrayList<PhylogenyTree> leaves){
+        HashSet<String> names = new HashSet<String>();
+        for (PhylogenyTree leaf:leaves){
+            names.add(leaf.getName());
+        }
+        return names;
+    }
+
+    /**
+     * Remove from the tree the leaves whose name is not in <code>keep</code>.
+     *
+     * @return the root of the tree after the removals
+     */
+    private static PhylogenyTree retainLeaves(PhylogenyTree root, ArrayList<PhylogenyTree> leaves, HashSet<String> keep){
+        for (PhylogenyTree leaf:leaves){
+            String name = leaf.getName();
+            if (!keep.contains(name)){
+                root = removeLeaf(root, leaf, name);
+            }
+        }
+        return root;
+    }
+
+    /**
+     * Remove one leaf; this is the code the three modes used to repeat.
+     * First the grandparent of the leaf is asked to remove it with
+     * <code>removeGrandChild</code>. If there is no parent or grandparent, or
+     * the call returns something else than the leaf, the leaf is looked for
+     * among children 0 and 1 of the root; if found, its sibling becomes the
+     * new root.
+     *
+     * @param root current root of the tree
+     * @param leaf leaf to remove
+     * @param name name used in the log messages
+     * @return the root after the removal (unchanged unless the sibling was promoted)
+     */
+    private static PhylogenyTree removeLeaf(PhylogenyTree root, PhylogenyTree leaf, String name){
+        PhylogenyTree parent = leaf.getParent();
+        if (parent != null){
+            int gIndex = leaf.getIndex();
+
+            PhylogenyTree grantParent = parent.getParent();
+            if (grantParent != null){
+                int cIndex = parent.getIndex();
+                PhylogenyTree removed = grantParent.removeGrandChild(cIndex, gIndex);
+                if (removed == leaf){
+                    LOG.info("Yay, removed " + name);
+                    return root;
+                }else{
+                    LOG.info("Some thing not right " + name);
+                }
+            }else{
+                LOG.warn("GRANT not found of " + name);
+            }
+        }else{
+            LOG.warn("PARENT not found " + name);
+        }
+
+        if (root.getChild(0) == leaf){
+            root = root.getChild(1);
+            root.setParent(null);
+            LOG.info("Yay, removed with round" + name);
+        }else if (root.getChild(1) == leaf){
+            root = root.getChild(0);
+            root.setParent(null);
+            LOG.info("Yay, removed with round" + name);
+        }else
+            LOG.warn("CAN not remove " + name);
+
+        return root;
+    }
+}
+
diff --git a/src/dev/java/japsadev/tools/work/GetCDHitCmd.java b/src/dev/java/japsadev/tools/work/GetCDHitCmd.java
new file mode 100644
index 0000000..f8c78b2
--- /dev/null
+++ b/src/dev/java/japsadev/tools/work/GetCDHitCmd.java
@@ -0,0 +1,179 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/* Revision History
+ * 11/01/2012 - Minh Duc Cao: Revised
+ * 01/01/2013 - Minh Duc Cao, revised
+ ****************************************************************************/
+
+package japsadev.tools.work;
+
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceOutputStream;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+
+
+/**
+ * Summarise a cluster file (<code>&lt;input&gt;.clstr</code>).
+ * <p>
+ * A line starting with <code>&gt;Cluste</code> opens a new cluster. Any other
+ * line describes a member and is split on white space: token 2 holds the
+ * sequence name, after removal of its first character and its last three
+ * characters, and token 3 is <code>*</code> for the representative of the
+ * cluster. The members are looked up by name among the sequences read from
+ * the option <code>sequence</code>; if the description of a member starts
+ * with an integer, that number is added to the read count of the cluster.
+ * <p>
+ * The clusters are sorted by decreasing number of members, then by decreasing
+ * read count, and named <code>group1</code>, <code>group2</code>, ... in that
+ * order. The representatives go to <code>&lt;output&gt;.fasta</code> with the
+ * description "&lt;members&gt; &lt;reads&gt;"; <code>&lt;output&gt;.dat</code>
+ * gets one line per cluster with its name, both counts and the list of members.
+ *
+ * @author Minh Duc Cao
+ *
+ */
+@Deployable(
+    scriptName = "jsa.dev.cdhit",
+    scriptDesc = "Sample script description"
+    )
+public class GetCDHitCmd extends CommandLine{
+    public GetCDHitCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("sequence",null,"sequence",true);
+        addString("input",null,"name",true);
+        addString("output",null,"name",true);
+        addStdHelp();
+    }
+
+    /**
+     * Entry point; see the class comment for the input and output files.
+     *
+     * @throws IOException if a file cannot be read or written, or if the
+     *         cluster file is malformed: a member before the first cluster, a
+     *         member line that is too short, a member that is not among the
+     *         sequences, or a cluster without a representative
+     */
+    public static void main(String[] args) throws IOException {
+
+        /*********************** Setting up script ****************************/
+        CommandLine cmdLine = new GetCDHitCmd();
+        args = cmdLine.stdParseLine(args);
+        /**********************************************************************/
+
+        String sequence = cmdLine.getStringVal("sequence");
+        String input = cmdLine.getStringVal("input");
+        String output = cmdLine.getStringVal("output");
+
+        ArrayList<Sequence> seqs = SequenceReader.readAll(sequence, Alphabet.DNA());
+        HashMap<String, Sequence> map = new HashMap<String, Sequence>();
+
+        for (Sequence seq:seqs){
+            map.put(seq.getName(),seq);
+        }
+
+        String clusterFile = input + ".clstr";
+        ArrayList<Group> groups = new ArrayList<Group>();
+
+        BufferedReader reader = SequenceReader.openFile(clusterFile);
+        try{
+            String line;
+            Group group = null;
+            while ((line = reader.readLine())!=null){
+                if (line.startsWith(">Cluste")){
+                    if (group != null){
+                        groups.add(group);
+                    }
+                    group = new Group();
+                    continue;
+                }
+
+                if (group == null){
+                    throw new IOException(clusterFile + ": member before the first cluster: " + line);
+                }
+
+                String [] toks = line.trim().split("\\s");
+                // token 2 must be longer than the 4 characters that are cut off below
+                if (toks.length < 4 || toks[2].length() < 5){
+                    throw new IOException(clusterFile + ": malformed member line: " + line);
+                }
+                String name = toks[2].substring(1);
+                name = name.substring(0, name.length()-3);
+                group.count ++;
+                group.appendList(name);
+
+                Sequence s = map.get(name);
+                if (s == null){
+                    throw new IOException(clusterFile + ": sequence " + name + " not found in " + sequence);
+                }
+                String desc = s.getDesc();
+                if (desc != null) {
+                    String[] descs = desc.split("\\s");
+                    if (descs.length > 0){
+                        try{
+                            group.countRead += Integer.parseInt(descs[0]);
+                        }catch (NumberFormatException e){
+                            // the description does not start with a read count:
+                            // the member adds nothing to the total
+                        }
+                    }
+                }
+
+                if (toks[3].equals("*"))
+                    group.seq = s;
+            }//while
+            if (group != null){
+                groups.add(group);
+            }
+        }finally{
+            reader.close();
+        }
+
+        // check before any output is created: a cluster without '*' has nothing to write
+        for (Group g: groups){
+            if (g.seq == null){
+                throw new IOException(clusterFile + ": cluster without representative, members: " + g.list);
+            }
+        }
+        Collections.sort(groups);
+
+        SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(output + ".fasta");
+        try{
+            SequenceOutputStream sosDat = SequenceOutputStream.makeOutputStream(output + ".dat");
+            try{
+                int index = 0;
+                for (Group g: groups){
+                    index ++;
+                    Sequence s = g.seq;
+                    s.setName("group"+index);
+
+                    s.setDesc(g.count + " " + g.countRead);
+                    s.writeFasta(sos);
+
+                    sosDat.print("group" + index + " " + g.count + " " + g.countRead + " " + g.list);
+                    sosDat.println();
+                }
+            }finally{
+                sosDat.close();
+            }
+        }finally{
+            sos.close();
+        }
+    }
+
+    /**
+     * One cluster. The natural order puts the cluster with more members
+     * first; ties are broken by the larger read count.
+     */
+    static class Group implements Comparable<Group>{
+        /** Representative of the cluster (the member marked with '*'). */
+        Sequence seq = null;
+        /** Number of members. */
+        int count = 0;
+        /** Sum of the read counts found in the descriptions of the members. */
+        int countRead = 0;
+        /** Comma-separated names of the members. */
+        String list = "";
+
+        public void appendList(String s){
+            if (list.length() > 0) list = list + ",";
+            list = list + s;
+
+        }
+
+        @Override
+        public int compareTo(Group o) {
+            // Integer.compare instead of a subtraction, which can overflow
+            int r = Integer.compare(o.count, this.count);
+            if (r == 0)
+                return Integer.compare(o.countRead, countRead);
+            else
+                return r;
+        }
+    }
+}
+
diff --git a/src/dev/java/japsadev/tools/work/MethylationAnalysis2Cmd.java b/src/dev/java/japsadev/tools/work/MethylationAnalysis2Cmd.java
new file mode 100644
index 0000000..3ae5a3a
--- /dev/null
+++ b/src/dev/java/japsadev/tools/work/MethylationAnalysis2Cmd.java
@@ -0,0 +1,177 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/* Revision History
+ * 28/05/2014 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsadev.tools.work;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+
+import japsa.seq.SequenceOutputStream;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+/**
+ * Put the per-position counts of the four libraries of a sample together into
+ * per-window totals for the normal and the tumour.
+ * <p>
+ * For every chromosome in {@link #CHROMS} four files are read from the working
+ * directory, <code>count_&lt;sample&gt;&lt;X&gt;_S1_&lt;chrom&gt;.dat</code>
+ * with X one of <code>N-CNTL</code>, <code>T-CNTL</code>, <code>N-BSC</code>
+ * and <code>T-BSC</code>. The files are read in lock-step, so line k of each
+ * of them must describe the same position. One tab-separated row is written
+ * for every window of <code>window</code> positions.
+ *
+ * @author minhduc
+ *
+ */
+@Deployable(
+    scriptName = "jsa.dev.methyC2",
+    scriptDesc = "Put the counts together"
+    )
+public class MethylationAnalysis2Cmd extends CommandLine{
+    /** Chromosomes processed, in output order. */
+    private static final String [] CHROMS = {"chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10",
+        "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
+        "chr21", "chr22", "chrX", "chrY"};
+
+    public MethylationAnalysis2Cmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("sample", null, "Sample ID",true);
+        addString("output", null, "name of the output file",true);
+        addInt("window", 1000, "window");
+
+        addStdHelp();
+    }
+    public static void main(String [] args) throws IOException{
+        MethylationAnalysis2Cmd cmdLine = new MethylationAnalysis2Cmd ();
+        args = cmdLine.stdParseLine(args);
+
+        /**********************************************************************/
+        String sample = cmdLine.getStringVal("sample");
+        String output = cmdLine.getStringVal("output");
+        int window = cmdLine.getIntVal("window");
+
+
+        analyse(sample, window, output);
+    }
+
+    /**
+     * Write the window table of a sample.
+     *
+     * @param sample sample ID, used to build the names of the count files
+     * @param windowSize number of positions per window, must be positive
+     * @param output name of the file to write
+     * @throws IllegalArgumentException if <code>windowSize</code> is not positive
+     * @throws IOException if a file cannot be read or written, or if the four
+     *         count files of a chromosome differ in length
+     */
+    static void analyse(String sample, int windowSize, String output) throws IOException{
+        if (windowSize <= 0){
+            // the window loops advance by windowSize and would never end
+            throw new IllegalArgumentException("window must be positive: " + windowSize);
+        }
+
+        SequenceOutputStream fCount = SequenceOutputStream.makeOutputStream(output);
+        try{
+            fCount.print("#CHROM\tID\tSTART\tEND\tFORMAT\tNORMAL\tTUMOUR\n");
+
+            for (String chrom:CHROMS){
+                analyseChrom(sample, chrom, windowSize, fCount);
+            }
+        }finally{
+            fCount.close();
+        }
+    }
+
+    /**
+     * Process one chromosome. A position adds to the totals of its window only
+     * if field 1 is non-zero in both control files and field 2 is non-zero in
+     * both BSC files; it then adds field 1 / field 2 of the normal and of the
+     * tumour BSC file. Lines of the normal control file that start with '#'
+     * are skipped, together with the corresponding lines of the other files.
+     */
+    private static void analyseChrom(String sample, String chrom, int windowSize, SequenceOutputStream fCount) throws IOException{
+        BufferedReader ncBR = null, tcBR = null, nbBR = null, tbBR = null;
+        try{
+            ncBR = SequenceReader.openFile("count_" + sample + "N-CNTL_S1_" + chrom + ".dat");
+            tcBR = SequenceReader.openFile("count_" + sample + "T-CNTL_S1_" + chrom + ".dat");
+            nbBR = SequenceReader.openFile("count_" + sample + "N-BSC_S1_" + chrom + ".dat");
+            tbBR = SequenceReader.openFile("count_" + sample + "T-BSC_S1_" + chrom + ".dat");
+
+            int start = 0;//the current window covers start+1 .. start+windowSize
+            double normalCount = 0, tumourCount = 0;
+
+            int pos = 0;
+
+            String ncLine;
+            while ((ncLine = ncBR.readLine())!=null){
+                String nbLine = nbBR.readLine();
+                String tcLine = tcBR.readLine();
+                String tbLine = tbBR.readLine();
+                if (nbLine == null || tcLine == null || tbLine == null){
+                    throw new IOException("The count files of " + sample + " for " + chrom + " do not have the same number of lines");
+                }
+
+                if (ncLine.startsWith("#"))
+                    continue;
+
+                //tumour control
+                String [] toks = tcLine.split("\t");
+                pos = Integer.parseInt(toks[0]);
+
+                //close all windows that end before this position
+                while (pos > start + windowSize){
+                    printWindow(fCount, chrom, start, start + windowSize, windowSize, normalCount, tumourCount);
+                    start += windowSize;
+                    normalCount = tumourCount = 0;
+                }
+
+                //if tumourcontrol base is NOT C
+                if (Integer.parseInt(toks[1]) == 0){
+                    continue;
+                }
+
+                //normal control
+                toks = ncLine.split("\t");
+                if (Integer.parseInt(toks[1])==0){
+                    continue;
+                }
+
+                toks = nbLine.split("\t");
+                if (Integer.parseInt(toks[2])==0){
+                    continue;
+                }
+                double normalRatio = Double.parseDouble(toks[1]) / Double.parseDouble(toks[2]);
+
+                toks = tbLine.split("\t");
+                if (Integer.parseInt(toks[2])==0){
+                    continue;
+                }
+                double tumourRatio = Double.parseDouble(toks[1]) / Double.parseDouble(toks[2]);
+
+                normalCount += normalRatio;
+                tumourCount += tumourRatio;
+            }
+
+            // NOTE(review): in these last rows the ID ends at the last position
+            // seen while the END column is still start+windowSize. Kept as it
+            // was; confirm which of the two the consumers of the table expect.
+            while (pos > start){
+                printWindow(fCount, chrom, start, Math.min(pos, start+windowSize), windowSize, normalCount, tumourCount);
+                start += windowSize;
+                normalCount = tumourCount = 0;
+            }
+        }finally{
+            closeReader(tbBR);
+            closeReader(tcBR);
+            closeReader(nbBR);
+            closeReader(ncBR);
+        }
+    }
+
+    /**
+     * Write one row: chromosome, ID <code>&lt;start+1&gt;_&lt;idEnd&gt;</code>,
+     * START, END, the literal "DP" and both totals truncated to integers.
+     */
+    private static void printWindow(SequenceOutputStream fCount, String chrom, int start, int idEnd, int windowSize,
+            double normalCount, double tumourCount) throws IOException{
+        fCount.print(chrom+"\t"+(start+1)+"_"+idEnd+"\t"+(start+1) + "\t" + (start+windowSize) + "\tDP\t"+((int) (normalCount)) + '\t' + ((int) (tumourCount)) + '\n');
+    }
+
+    /** Close a reader that may not have been opened. */
+    private static void closeReader(BufferedReader reader) throws IOException{
+        if (reader != null){
+            reader.close();
+        }
+    }
+}
diff --git a/src/dev/java/japsadev/tools/work/MethylationAnalysisCmd.java b/src/dev/java/japsadev/tools/work/MethylationAnalysisCmd.java
new file mode 100644
index 0000000..f4a2da6
--- /dev/null
+++ b/src/dev/java/japsadev/tools/work/MethylationAnalysisCmd.java
@@ -0,0 +1,174 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools.work; + + +import java.io.File; +import java.io.IOException; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +/** + * Count, for every reference position of one chromosome, the aligned read bases that are C and all aligned read bases. Input file in bam format assumed + * to be sorted and indexed + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.methyC", + scriptDesc = "Count fraction of methylated" + ) +public class MethylationAnalysisCmd extends CommandLine{ + //CommandLine cmdLine; + public MethylationAnalysisCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("inputBam", null, "Name of the input bam",true); + addString("chrom", null, "Name of the chromosome",true); + addInt("qual", 0, "Minimum mapping quality"); + + addString("output", null, "name of the output file",true); + //addInt("filterBits", 0, "Filter reads based on flag. 
Common values:\n 0 no filter\n 256 exclude secondary alignment \n 1024 exclude PCR/optical duplicates\n 2048 exclude supplementary alignments"); + + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + MethylationAnalysisCmd cmdLine = new MethylationAnalysisCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String inputBam = cmdLine.getStringVal("inputBam"); + String chrom = cmdLine.getStringVal("chrom"); + String output = cmdLine.getStringVal("output"); + int qual = cmdLine.getIntVal("qual"); + + analyse(inputBam, chrom, qual, output); + } + + /* Walk the CIGAR of every record on chrom whose mapping quality is at least qual and tally, per reference position, read bases equal to C (countC) and all M/EQ/X-aligned bases (countTot) */ static void analyse(String inFile, String chrom, int qual, String output) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = SamReaderFactory.makeDefault().open(new File(inFile)); + SAMRecordIterator samIter = samReader.query(chrom, 0, 0, false); + SAMSequenceRecord refSequence = samReader.getFileHeader().getSequence(chrom); + + int seqLength = refSequence.getSequenceLength(); + int [] countC = new int[seqLength], + countTot = new int[seqLength]; + + while (samIter.hasNext()){ + SAMRecord sam = samIter.next(); + + if (sam.getMappingQuality() < qual) + continue; + ////////////////////////////////////////////////////////////////int readPos = 0;//start from 0 + + Sequence readSeq = new Sequence(Alphabet.DNA(), sam.getReadString(), sam.getReadName()); + + int readPos = 0;//start from 0 + int refPos = sam.getAlignmentStart() - 1;//convert to 0-based index + + for (final CigarElement e : sam.getCigar().getCigarElements()) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + //nothing todo + break; // ignore hard clips + case P : + //pad is a 
kind of hard clipped ?? 
+ break; // ignore pads + case S : + //advance on the read + readPos += length; + break; // soft clip read bases + case N : + refPos += length; + break; // reference skip + + case D ://deletion + refPos += length; + break; + + case I : + readPos += length; + break; + case M : + case EQ: + case X : + for (int i = 0; i < length; i++){ + countTot[refPos + i]++; + if (readSeq.getBase(readPos + i) == Alphabet.DNA.C) + countC[refPos + i]++; + } + readPos += length; + refPos += length; + break; + + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//case + }//for + + //////////////////////////////////////////////////////////////// + + } + samIter.close(); + samReader.close(); + + SequenceOutputStream fCount = SequenceOutputStream.makeOutputStream(output); + fCount.print("#pos\tcountC\tcountTot\n"); /* header names the columns actually written below: 1-based position, C count, total count */ + for (int i = 0; i< countTot.length;i++){ + fCount.print(i+1); + fCount.print('\t'); + fCount.print(countC[i]); + fCount.print('\t'); + fCount.print(countTot[i]); + fCount.print('\n'); + } + fCount.close(); + } +} diff --git a/src/dev/java/japsadev/tools/work/PairedEndRepair.java b/src/dev/java/japsadev/tools/work/PairedEndRepair.java new file mode 100644 index 0000000..63d5e95 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/PairedEndRepair.java @@ -0,0 +1,310 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 13/07/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.tools.work; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import japsa.bio.tr.TandemRepeat; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +/** + * Report, for each read name, the distances below 2500 between every first-of-pair and every other alignment of that read + * (printed directly, or clustered per tandem repeat when a TR file is given) + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.dev.pairrepair", +scriptDesc = "Repair concordant pairs from multiple alignment (such as bwa mem -a). This bamfile should be sorted by read name (or from running as a single-threaded program).") +public class PairedEndRepair extends CommandLine{ + public PairedEndRepair(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("bamFile", null, "Name of the s/bam file", true); + addString("tr", null, "Name of TR file"); + + addStdHelp(); + } + + /** + * @param args command line arguments; the options are registered in the constructor (bamFile, tr) + */ + public static void main(String[] args) throws IOException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new PairedEndRepair(); + args = cmdLine.stdParseLine(args); + + /*********************** Setting up script 
****************************/ + + String bamFile = cmdLine.getStringVal("bamFile"); + String trFile = cmdLine.getStringVal("tr"); + + if (trFile != null) { + BufferedReader bf = SequenceReader.openFile(trFile); + trList = 
TandemRepeat.readFromFile(bf, null); + } + + ArrayList<SAMRecord> first = new ArrayList<SAMRecord>(16), second = new ArrayList<SAMRecord>( + 16); + + ArrayList<Integer> disList = new ArrayList<Integer>(); + + /////////////////////////////////////////////////////////// + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + + // SAMFileWriterFactory factory = new SAMFileWriterFactory(); + // SAMFileWriter bamWriter = + // factory.makeSAMOrBAMWriter(reader.getFileHeader(), true, new + // File(output)); + + String currentID = null; + SAMRecordIterator iter = samReader.iterator(); + + while (iter.hasNext()) { + + SAMRecord rec = iter.next(); + String readName = rec.getReadName(); + // start a new + + if (currentID != null && !readName.equals(currentID)) { + disList.clear(); + for (SAMRecord f : first) { + for (SAMRecord s : second) { + int d = distance(f, s); + if (d < 2500) + disList.add(d); + }// for + }// for + + if (trList != null) { + call(currentID, disList, false); + } else { + System.out.print(currentID); + for (int x : disList) { + System.out.print("\t" + x); + }// for + System.out.println(); + } + + first.clear(); + second.clear(); + }// fi + + currentID = readName; + + if (rec.getFirstOfPairFlag()) + first.add(rec); + else + second.add(rec); + }// while + + disList.clear(); + for (SAMRecord f : first) { + for (SAMRecord s : second) { + int d = distance(f, s); + if (d < 2500) + disList.add(d); + }// for + }// for + + if (trList != null) { + call(currentID, disList, true);// the last call + + } else { + System.out.print(currentID); + for (int x : disList) { + System.out.print("\t" + x); + }// for + System.out.println(); + } + + first.clear(); + second.clear(); + + samReader.close(); + } + + /* Larger of |start1 - end2| and |start2 - end1| plus one, or Integer.MAX_VALUE when either record is unmapped or the two are on different references */ public static int 
distance(SAMRecord rec1, SAMRecord rec2) { + + if (rec1.getReadUnmappedFlag() || rec2.getReadUnmappedFlag()) + return Integer.MAX_VALUE; + + if (rec1.getReferenceIndex().intValue() != 
rec2.getReferenceIndex() + .intValue()) + return Integer.MAX_VALUE; + + int dis = Math.abs(rec1.getAlignmentStart() - rec2.getAlignmentEnd()); + + int d = Math.abs(rec2.getAlignmentStart() - rec1.getAlignmentEnd()); + if (d > dis) + dis = d; + + return dis + 1; + } + + static TandemRepeat currentTR = null; + + static ArrayList<TandemRepeat> trList = null; + static int index = -1; + static ArrayList<Stat> stats = new ArrayList<Stat>(); + + /* name is split on '#': toks[0] is matched against the repeat ID, toks[2] against its chromosome and toks[5] is parsed as an int; last=true also prints the remaining repeats */ public static void call(String name, ArrayList<Integer> intList, + boolean last) { + String[] toks = name.split("#"); + if (index < 0) { + index = 0; + currentTR = trList.get(0); + System.out.print(currentTR.getChr() + "\t" + currentTR.getID() + + "\t" + currentTR.getPeriod()); + stats.clear(); + } + + while ((!currentTR.getID().equals(toks[0])) + || !currentTR.getChr().equals(toks[2])) { + index++; + + // Collections.sort(stats); + + for (Stat stat : stats) { + System.out.printf("\t%.2f,%d", stat.sum / stat.count, + stat.count); + // System.out.printf("\t%.2f",stat.sum/stat.count); + } + System.out.println(); + + if (index >= trList.size()) { + // return; + throw new RuntimeException("index " + index + + " run out of bound"); + } + + currentTR = trList.get(index); + System.out.print(currentTR.getChr() + "\t" + currentTR.getID() + + "\t" + currentTR.getPeriod()); + stats.clear(); + } + + int e = Integer.parseInt(toks[5]); + + for (int d : intList) { + double var = ((d - e) * 1.0 / currentTR.getPeriod()); + if (var < 50) { + // System.out.printf("\t%8.2f", var); + boolean notuse = true; + for (Stat stat : stats) { + // System.out.print(var + " " + Math.abs(var - + // stat.sum/stat.count) + " " + stat.sum + " " + + // stat.count); + if (Math.abs(var - stat.sum / stat.count) < 0.5) { + // System.out.println(true); + stat.sum += var; + stat.count++; + notuse = false; 
+ break;// for + }// if + // else + // System.out.println(false); + }// for + if (notuse) { + stats.add(new Stat(1, var)); + }// if + }// if + }// for + + // flush + if (last) { + while (true) { + index++; 
+ + // Collections.sort(stats); + for (Stat stat : stats) { + System.out.printf("\t%.2f,%d", stat.sum / stat.count, + stat.count); + } + System.out.println(); + + if (index >= trList.size()) { + return; + } + + currentTR = trList.get(index); + System.out.print(currentTR.getChr() + "\t" + currentTR.getID() + + "\t" + currentTR.getPeriod()); + stats.clear(); + } + } + } + + public static class Stat implements Comparable<Stat> { + int count = 0; + double sum = 0; + + Stat(int c, double s) { + count = c; + sum = s; + } + + public String toString() { + return (sum / count) + "," + count; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(Stat oo) { + // Stat oo = (Stat) o; + return oo.count - count; + } + + } +} diff --git a/src/dev/java/japsadev/tools/work/PlasmaAnalysisCmd.java b/src/dev/java/japsadev/tools/work/PlasmaAnalysisCmd.java new file mode 100644 index 0000000..2cd4b60 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/PlasmaAnalysisCmd.java @@ -0,0 +1,180 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 28/05/2014 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools.work; + + +import java.io.File; +import java.io.IOException; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +/** + * Count fragment start and end positions on one chromosome and write their windowed sums as start, end and tot bedGraph files. 
Input file in bam format assumed + * to be sorted and indexed + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.plasma", + scriptDesc = "Analysis of plasma sequencing" + ) +public class PlasmaAnalysisCmd extends CommandLine{ + //CommandLine cmdLine; + public PlasmaAnalysisCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("inputBam", null, "Name of the input bam",true); + addString("chrom", null, "Name of the chromosome",true); + addInt("qual", 0, "Minimum mapping quality"); + addInt("min", 0, "Minimum fragment length"); + addInt("max", 10000, "Maximum fragment length"); + addInt("window", 10, "Half window sise"); + + + addString("output", null, "Name of the output file",true); + + //addInt("filterBits", 0, "Filter reads based on flag. Common values:\n 0 no filter\n 256 exclude secondary alignment \n 1024 exclude PCR/optical duplicates\n 2048 exclude supplementary alignments"); + + + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + PlasmaAnalysisCmd cmdLine = new PlasmaAnalysisCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String inputBam = cmdLine.getStringVal("inputBam"); + //String controlBam = cmdLine.getStringVal("controlBam"); + String chrom = cmdLine.getStringVal("chrom"); + String output = cmdLine.getStringVal("output"); + int qual = cmdLine.getIntVal("qual"); + int min = cmdLine.getIntVal("min"); + int max = cmdLine.getIntVal("max"); + int window = cmdLine.getIntVal("window"); + + analyse(inputBam, chrom, qual, min, max, window, output); + + } + + static void analyse(String inFile, String chrom, int qual, int min, int max, int window, String output) throws IOException{ + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader samReader = 
SamReaderFactory.makeDefault().open(new File(inFile)); + + SAMRecordIterator samIter = samReader.query(chrom, 0, 0, false); + + SAMSequenceRecord refSequence = samReader.getFileHeader().getSequence(chrom); + + int length = refSequence.getSequenceLength(); + int [] countStart = new int[length], + countEnd = new int[length]; + + while (samIter.hasNext()){ + SAMRecord sam = samIter.next(); + if (sam.getMappingQuality() < qual) + continue; + + int insertSize = sam.getInferredInsertSize(); + + if (insertSize <= 0)//records with a non-positive insert size are not counted; this step may be redundant + continue; + + if (insertSize < min) + continue; + + if (insertSize > max) + continue; + + + int posStart = sam.getAlignmentStart() - 1;//-1 for 0-index; posEnd is posStart plus the insert size and is only counted when inside the chromosome + int posEnd = posStart + insertSize; + + countStart[posStart] ++; + + if (posEnd < length) + countEnd[posEnd] ++; + } + samIter.close(); + samReader.close(); + + + writeBedGraph(chrom, countStart, window, output + "start.bedgraph"); + writeBedGraph(chrom, countEnd, window, output + "end.bedgraph"); + for (int i = 0; i< countStart.length;i++){ + countStart[i] += countEnd[i]; + } + writeBedGraph(chrom, countStart, window, output + "tot.bedgraph"); + } + + static void writeBedGraph(String chrom, int [] countStart, int halfWindowSize, String fileName) throws IOException{ + SequenceOutputStream fCount = SequenceOutputStream.makeOutputStream(fileName); + fCount.print("track type=bedGraph\n"); + char sep = '\t'; + int sum = 0; + for (int i = 0; i < Math.min(halfWindowSize, countStart.length);i++) + sum += countStart[i]; + + //sliding sum: line i holds the total of countStart over [i - halfWindowSize, i + halfWindowSize], clipped at the array ends + for (int i = 0; i< countStart.length;i++){ + int newPos = i + halfWindowSize; + if (newPos < countStart.length) + sum += countStart[newPos]; + + int oldPos = i - halfWindowSize - 1; + if (oldPos >= 0) + sum -= 
countStart[oldPos]; + + + fCount.print(chrom); + fCount.print(sep); + fCount.print(i); + fCount.print(sep); + fCount.print(i+1); + fCount.print(sep); + fCount.print(sum); + fCount.print('\n'); + } + fCount.close(); + } +} 
diff --git a/src/dev/java/japsadev/tools/work/PlasmaAnalysisCrossCorrelationCmd.java b/src/dev/java/japsadev/tools/work/PlasmaAnalysisCrossCorrelationCmd.java new file mode 100644 index 0000000..b014f2b --- /dev/null +++ b/src/dev/java/japsadev/tools/work/PlasmaAnalysisCrossCorrelationCmd.java @@ -0,0 +1,213 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 28/02/2016 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsadev.tools.work; + + +import java.io.BufferedReader; +import java.io.IOException; + +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.DoubleArray; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Compute cross correlation between two datasets in bedgraph format + * For every window start, the largest correlation found over lag 0 and lags 1..lag-1 + * (each tried with x leading and with y leading) is written as a bedGraph line + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.dev.plasmaCrossCor", + scriptDesc = "Analysis of plasma sequencing using Cross correlation" + ) +public class PlasmaAnalysisCrossCorrelationCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(PlasmaAnalysisCrossCorrelationCmd.class); + + //CommandLine cmdLine; + public PlasmaAnalysisCrossCorrelationCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("xFile", null, "File 1",true); + addString("yFile", null, "File 
2",true); + addInt("window", 500, "Window sise"); + addInt("lag", 10, "lag"); + addString("output", null, "Name of the output file",true); + + 
addStdHelp(); + } + public static void main(String [] args) throws IOException{ + PlasmaAnalysisCrossCorrelationCmd cmdLine = new PlasmaAnalysisCrossCorrelationCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String xFile = cmdLine.getStringVal("xFile"); + //String controlBam = cmdLine.getStringVal("controlBam"); + String yFile = cmdLine.getStringVal("yFile"); + String output = cmdLine.getStringVal("output"); + int window = cmdLine.getIntVal("window"); + int lag = cmdLine.getIntVal("lag"); + + LOG.info("read file 1"); + BufferedReader bf = SequenceReader.openFile(xFile); + DoubleArray array = new DoubleArray(); + + int start = -1; + String chr = ""; + String line = ""; + while ( (line = bf.readLine())!= null){ + // skip the bedGraph file header + if (line.startsWith("track type=")){ + continue; + } + + String [] toks = line.split("\t"); + array.add(Double.parseDouble(toks[3])); + + if (start < 0){ + start = Integer.parseInt(toks[1]); + chr = toks[0]; + } + } + bf.close(); + + double [] x = array.toArray(); + array.clear(); + + LOG.info("read file 2"); + bf = SequenceReader.openFile(yFile); + while ( (line = bf.readLine())!= null){ + // skip the bedGraph file header + if (line.startsWith("track type=")){ + continue; + } + + String [] toks = line.split("\t"); + array.add(Double.parseDouble(toks[3])); + } + bf.close(); + + double [] y = array.toArray(); + + double [] crr = new double[x.length]; + LOG.info("Run 0"); + cross_correlation(x,y,window,0,crr); + for (int i=1; i < lag; i++){ + LOG.info("Run " + i); + cross_correlation(x,y,window,i,crr); + LOG.info("Run -" + i); + cross_correlation(y,x,window,i,crr); + } + + LOG.info("Write"); + SequenceOutputStream fCount = SequenceOutputStream.makeOutputStream(output); + fCount.print("track type=bedGraph\n"); + char sep = '\t'; + for (int i = 0; i < crr.length;i++){ + fCount.print(chr); + fCount.print(sep); + fCount.print(i + start); + fCount.print(sep); + 
fCount.print(i+1 + start); + fCount.print(sep); + fCount.print(crr[i]); + fCount.print('\n'); + } + fCount.close(); + } + + /** + * Compute cross correlation between two arrays of double: for each window start, correlate x[start..start+windows-1] with the same window of y shifted by lag. NOTE(review): the priming loop reads x[0..windows-2] and y[lag..lag+windows-2] without a length check + * @param x first series; its length bounds the window starts + * @param y second series, read at index + lag + * @param windows number of elements in each window + * @param lag offset added to every index into y + * @param results per-start maximum so far; an entry is overwritten only by a larger correlation + * @return the results array that was passed in + */ + + public static double[] cross_correlation(double [] x, double [] y, int windows, int lag, double [] results){ + //maxdelay=20 + int length=x.length; + //double [] xcorr = new double[length]; + double xSum = 0, ySum = 0, xSq = 0, ySq = 0; + + //prime the running sums with the first windows-1 elements; the loop below adds the last element of each window + for (int i = 0; i < windows - 1; i++){ + xSum += x[i]; + xSq += x[i] * x[i]; + + ySum += y[i + lag]; + ySq += y[i + lag] * y[i + lag]; + } + + + for (int start = 0; start < length - windows - lag; start++){ + int end = start + windows - 1; + xSum += x[end]; + xSq += x[end] * x[end]; + + ySum += y[end + lag]; + ySq += y[end + lag] * y[end + lag]; + + //mean + double mx = xSum/windows; + double my = ySum/windows; + double denom = Math.sqrt((xSq - windows * mx * mx) * (ySq - windows * my * my)); + + double sum = 0; + for (int i = 0; i < windows;i++){ + sum += (x[start + i] - mx) * (y [start + i + lag] - my); + } + + double xcorr = sum / denom;//Math.abs(sum / denom); + if (xcorr > results[start]) + results[start] = xcorr; + + xSum -= x[start]; + xSq -= x[start] * x[start]; + + ySum -= y[start + lag]; + ySq -= y[start + lag] * y[start + lag]; + }//for + return results; + } +} diff --git 
a/src/dev/java/japsadev/tools/work/RepeatPrimerCmd.java b/src/dev/java/japsadev/tools/work/RepeatPrimerCmd.java new file mode 100644 index 0000000..ac233dc --- /dev/null +++ b/src/dev/java/japsadev/tools/work/RepeatPrimerCmd.java @@ -0,0 +1,321 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 08/04/2012 - Minh Duc Cao: Revised + * + ****************************************************************************/ + +package japsadev.tools.work; + +import japsa.bio.tr.TandemRepeat; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.dev.primerTR", +scriptDesc = "Design primers for repeats. Each primer is added 30 bases for mapping") +public class RepeatPrimerCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(RepeatPrimerCmd.class); + + public RepeatPrimerCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input", null, "Name of the input file, - for standard input", true); + addString("tr", null, "Name of the tandem repeat file", true); + addInt("flank", 300, "Length of flanking regions"); + addInt("pad", 30, "Pad to the primers for more specific mappings"); + + addString("primer_exe", "primer3_core", "Path to primer3 "); + addString("bwa_exe", "bwa", "Path to bwa for checking"); + + + addStdHelp(); + } + + public static void main(String[] args) throws Exception { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new RepeatPrimerCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String input = cmdLine.getStringVal("input"); + int flanking = cmdLine.getIntVal("flank"); + int pad = 
cmdLine.getIntVal("pad"); + // int pad = 30; + int gap = 5 + pad; + + // Reader sequences to the hash + SequenceReader reader = SequenceReader.getReader(input); + HashMap genomes = new HashMap(); + Sequence seq; + + String primerFile = "pinput"; + SequenceOutputStream pos = SequenceOutputStream + .makeOutputStream(primerFile); + Sequence wSeq = null; + + LOG.info("Read sequence from " + input); + while ((seq = reader.nextSequence(Alphabet.DNA16())) != null) { + genomes.put(seq.getName(), seq); + if (wSeq == null) + wSeq = seq; + } + reader.close(); + LOG.info("Read sequence done"); + + seq = wSeq; + // Reader in tr + BufferedReader bf = SequenceReader.openFile(cmdLine.getStringVal("tr")); + ArrayList trList = TandemRepeat.readFromFile(bf, null); + bf.close(); + + // Get sequence information + for (TandemRepeat tr : trList) { + LOG.info("Process " + tr.getID()); + if (!seq.getName().equals(tr.getChr())) { + seq = genomes.get(tr.getChr()); + } + if (seq == null) { + throw new RuntimeException("Sequence " + tr.getChr() + + " not found"); + } + + int start = tr.getStart() - flanking; + int end = tr.getEnd() + flanking; + + if (start <= 0) + start = 1; + + if (end > seq.length()) + end = seq.length(); + + int primerStart = tr.getStart() - start - gap;// to make sure + int primerLength = tr.getLength() + gap * 2; + + pos.print("SEQUENCE_ID=" + tr.getID() + "#" + tr.getParent() + "#" + + start + "\n"); + pos.print("SEQUENCE_TEMPLATE="); + for (int i = start - 1; i < end; i++) { + pos.print(seq.charAt(i)); + } + pos.print("\nSEQUENCE_TARGET=" + primerStart + "," + primerLength + + "\n=\n"); + } + pos.close(); + // Run primer 3 + + final File tmpFile = File.createTempFile("out", null); + tmpFile.deleteOnExit(); + + String primerExe = "primer3_core"; + ProcessBuilder pb = new ProcessBuilder(primerExe, + "-p3_settings_file=settings", + "-output=primer3.out", + "-error=primer.error", + "pinput" + ).redirectErrorStream(true).redirectInput(tmpFile); + + Process process = 
pb.start(); + int status = process.waitFor(); + if (status ==0 ){ + LOG.info("Successfully run primer3"); + //continue; + }else{ + LOG.error("Run primer 3 FAIL"); + System.exit(1); + } + + // Reat the output + bf = SequenceReader.openFile("primer3.out"); + + SequenceOutputStream fq1 = SequenceOutputStream + .makeOutputStream("1.fq"); + SequenceOutputStream fq2 = SequenceOutputStream + .makeOutputStream("2.fq"); + + String name = ""; + int index = 0; + int seqOffset = 0; + int leftPos = 0, rightPos = 0; + String left = "", right = ""; + + String line = ""; + while ((line = bf.readLine()) != null) { + + String[] toks = line.trim().split("="); + // System.out.println(line + toks.length); + if (toks.length < 2) + System.out.println();// end of record + else if (toks[0].equals("SEQUENCE_ID")) { + System.out.print(toks[1]); + int x = toks[1].lastIndexOf('#'); + name = toks[1].substring(0, x); + + seqOffset = Integer.parseInt(toks[1].substring(x + 1)); + index = 0; + } else if (toks[0].equals("PRIMER_LEFT_NUM_RETURNED") + || toks[0].equals("PRIMER_RIGHT_NUM_RETURNED") + || toks[0].equals("PRIMER_PAIR_NUM_RETURNED")) { + System.out.print("\t" + toks[1]); + } else if (toks[0].equals("PRIMER_LEFT_" + index + "_SEQUENCE")) { + left = toks[1]; + System.out.print("\t" + toks[1]); + } else if (toks[0].equals("PRIMER_RIGHT_" + index + "_SEQUENCE")) { + right = toks[1]; + System.out.print("\t" + toks[1]); + } else if (toks[0].equals("PRIMER_LEFT_" + index)) { + int x = toks[1].indexOf(','); + leftPos = seqOffset + Integer.parseInt(toks[1].substring(0, x)) + - 1; + } else if (toks[0].equals("PRIMER_RIGHT_" + index)) { + int x = toks[1].indexOf(','); + rightPos = seqOffset + Integer.parseInt(toks[1].substring(0, x)) + - right.length(); + } else if (toks[0].equals("PRIMER_PAIR_" + index + "_PRODUCT_SIZE")) { + + int x = name.indexOf('#'); + String seqID = name.substring(x + 1); + if (!seq.getName().equals(seqID)) { + seq = genomes.get(seqID); + } + if (seq == null) { + throw new 
RuntimeException("Sequence " + seqID + " not found"); + } + //rightPos -= pad; + String readName = name.replaceFirst("#", "#" + index + "#") + + "#" + leftPos + "#" + rightPos + "#" + toks[1]; + fq1.print("@" + readName + "\n"); + + for (int i = 0; i < left.length(); i++) {//+pad + fq1.print(seq.charAt(i + leftPos - 1)); + } + fq1.print("\n+\n"); + + for (int i = 0; i < left.length(); i++) {//pad + fq1.print('I'); + } + fq1.print('\n'); + + Alphabet.DNA alphabet = (Alphabet.DNA) seq.alphabet(); + fq2.print("@" + readName + "\n"); + for (int i = right.length() - 1; i >= 0; i--) { //+pad + fq2.print(alphabet.int2char(alphabet.complement(seq + .getBase(i + rightPos - 1)))); + } + fq2.print("\n+\n"); + + for (int i = right.length() - 1; i >= 0; i--) {//+pad + fq2.print('I'); + } + fq2.print('\n'); + + System.out.print("\t" + toks[1]); + index++; + } + }// while + bf.close(); + fq1.close(); + fq2.close(); + // Run bwa to make sure + /*******************************************************************/ + + String bwaExe = "bwa"; + pb = new ProcessBuilder(bwaExe, + "aln", + "-o", + "0", + "-l", + "16", + "-k", + "0", + "-f", + "1.sai", + input, + "1.fq" + ).redirectErrorStream(true).redirectOutput(tmpFile); + + LOG.info("Run: " + pb.command()); + process = pb.start(); + status = process.waitFor(); + + pb = new ProcessBuilder(bwaExe, + "aln", + "-o", + "0", + "-l", + "16", + "-k", + "0", + "-f", + "2.sai", + input, + "2.fq" + ).redirectErrorStream(true).redirectOutput(tmpFile); + + LOG.info("Run: " + pb.command()); + process = pb.start(); + status = process.waitFor(); + + pb = new ProcessBuilder(bwaExe, + "sampe", + "-a", + "2000", + "-f", + "a.sam", + input, + "1.sai", + "2.sai", + "1.fq", + "2.fq" + ).redirectErrorStream(true) + .redirectOutput(tmpFile); + + LOG.info("Run: " + pb.command()); + process = pb.start(); + status = process.waitFor(); + } +} diff --git a/src/dev/java/japsadev/tools/work/Test.java b/src/dev/java/japsadev/tools/work/Test.java new file mode 100644 
index 0000000..3e9d1a5 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/Test.java @@ -0,0 +1,143 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 10 Dec 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.tools.work; + +import japsa.seq.SequenceOutputStream; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.Date; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + +import com.google.common.io.Files; +import com.google.common.io.Resources; + +/** + * @author minhduc + * + */ +public class Test { + + /** + * @param args + * @throws ParserConfigurationException + * @throws IOException + * @throws SAXException + */ + public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { + + String BASE = "/home/minhduc/MLST"; + + //Get Document Builder + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + + //Build Document + String link = "http://pubmlst.org/data/dbases.xml"; + Document document = builder.parse(link); + + //Normalize the XML Structure; It's just too important !! 
+ document.getDocumentElement().normalize(); + + //Here comes the root node + Element root = document.getDocumentElement(); + System.out.println(root.getNodeName()); + + //Get all employees + NodeList nList = document.getElementsByTagName("species"); + System.out.println("============================"); + + for (int temp = 0; temp < nList.getLength(); temp++){ + //get species name + Node node = nList.item(temp).getFirstChild(); + String species = node.getNodeValue(); + species = species.trim().replaceAll(" ", "_"); + File baseDir = new File(BASE+"/" + species); + baseDir.mkdirs(); + + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(baseDir.getAbsolutePath()+"/logs"); + sos.print("Date " + new Date() + "\n"); + //go to mlst + node = node.getNextSibling(); + + //mlst field (blank) + node = node.getFirstChild(); + + //get to database + node = node.getNextSibling(); + + //first child = blank + NodeList childNodes = node.getChildNodes(); + + for (int x = 0; x < childNodes.getLength(); x++){ + node = childNodes.item(x); + if ("url".equals(node.getNodeName())) + sos.print("URL: " + node.getFirstChild().getNodeValue () + "\n"); + else if ("retrieved".equals(node.getNodeName())) + sos.print("retrieved: " + node.getFirstChild().getNodeValue () + "\n"); + else if ("profiles".equals(node.getNodeName())){ + String profileULR = node.getChildNodes().item(3).getFirstChild().getNodeValue(); + sos.print("Prifle: "+profileULR +"\n"); + Resources.asByteSource(new URL(profileULR)).copyTo(Files.asByteSink(new File(baseDir.getAbsolutePath()+"/profile.dat"))); + }else if ("loci".equals(node.getNodeName())){ + NodeList lociList = node.getChildNodes(); + for (int i = 0; i< lociList.getLength();i++){ + Node locusNode = lociList.item(i); + if ("locus".equals(locusNode.getNodeName ())){ + String geneName = locusNode.getChildNodes().item(0).getNodeValue().trim(); + String geneULR = locusNode.getChildNodes().item(1).getChildNodes().item(0).getNodeValue(); + 
Resources.asByteSource(new URL(geneULR)).copyTo(Files.asByteSink(new File(baseDir.getAbsolutePath()+"/" + geneName + ".fas"))); + sos.print(geneName + " "+ geneULR + "\n" ); + } + }//for + }//if + + }//for + sos.close(); + }//for + + } + +} diff --git a/src/dev/java/japsadev/tools/work/TreeHetLinageCmd.java b/src/dev/java/japsadev/tools/work/TreeHetLinageCmd.java new file mode 100755 index 0000000..e9c8853 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/TreeHetLinageCmd.java @@ -0,0 +1,102 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsadev.tools.work; + +import japsa.bio.phylo.PhylogenyTree; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +import java.io.BufferedReader; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Random; + + + +/** + * @author minhduc + * + */ +@Deployable(scriptName = "jsa.work.treehet", +scriptDesc = "Make the branch of the tree with random model") +public class TreeHetLinageCmd extends CommandLine{ + public TreeHetLinageCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addStdInputFile(); + addString("output", "-", "Name of the file for output, - for stdout"); + addString("model", "m", "Model prefix"); + + addStdHelp(); + } + /** + * @param args + */ + public static void main(String[] args) throws IOException { + CommandLine cmdLine = new TreeHetLinageCmd(); + args = cmdLine.stdParseLine(args); + String output = cmdLine.getStringVal("output"); + prefix = " #" + cmdLine.getStringVal("model"); + BufferedReader bf = SequenceReader.openFile(cmdLine.getStringVal("input")); + + String line = null, str = ""; + while ((line = bf.readLine()) != null) { + str = str + line.trim(); + } + bf.close(); 
+ PhylogenyTree tree = PhylogenyTree.parseTree(str); + + PrintStream ps = System.out; + if(!"-".equals(output)) + ps = new PrintStream(new FileOutputStream(output)); + + ps.println(ranTree(tree) + prefix + rand.nextInt(4)+";"); + ps.close(); + } + + static Random rand = new Random(); + static String prefix = ""; + + public static String ranTree(PhylogenyTree tree){ + if (tree.isLeaf()) { + return tree.getName(); + } + int b1 = rand.nextInt(4); + int b2 = rand.nextInt(4); + + String s = "(" + ranTree(tree.getChild(0)) + prefix + b1 + "," + ranTree(tree.getChild(1)) + prefix+b2 + ")"; + return s; + } +} diff --git a/src/dev/java/japsadev/tools/work/VariableVariantVCF.java b/src/dev/java/japsadev/tools/work/VariableVariantVCF.java new file mode 100644 index 0000000..1b3b1c3 --- /dev/null +++ b/src/dev/java/japsadev/tools/work/VariableVariantVCF.java @@ -0,0 +1,90 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 3 Jul 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsadev.tools.work; + +import java.io.File; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFFileReader; + +/** + * @author minhduc + * + */ +public class VariableVariantVCF { + public static void main(String[] args){ + VCFFileReader vcf = new VCFFileReader(new File(args[0])); + CloseableIterator iter = vcf.iterator(); + + System.out.print("#CHROM start end qual"); + + for (String sample:vcf.getFileHeader().getSampleNamesInOrder()){ + System.out.print(" " + sample); + } + System.out.println(); + while (iter.hasNext()){ + VariantContext var = iter.next(); + var.getChr(); + GenotypesContext gtypes = 
var.getGenotypes(); + boolean dongbo = true; + Allele firstAllele = gtypes.get(0).getAllele(0); + for (Genotype genotype:gtypes){ + Allele allele = genotype.getAllele(0); + if (allele.compareTo(firstAllele) != 0){ + dongbo = false; + break;//for + } + }//for + if (dongbo){ + continue; + } + //!dongbo + System.out.print(var.getChr() + " " + var.getStart() + " " + var.getEnd() + " " + var.getPhredScaledQual()); + for (Genotype genotype:gtypes){ + System.out.print(" " + genotype.getAllele(0)); + } + System.out.println(); + + + } + + vcf.close(); + + } +} diff --git a/src/dev/java/japsadev/util/deploy/DevDeploy.java b/src/dev/java/japsadev/util/deploy/DevDeploy.java new file mode 100644 index 0000000..1d46baa --- /dev/null +++ b/src/dev/java/japsadev/util/deploy/DevDeploy.java @@ -0,0 +1,176 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * File: Deploy.java + * 15/11/2013 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.util.deploy; + +import japsa.tools.bio.phylo.XMDistance2Cmd; +import japsa.util.CommandLine; +import japsa.util.deploy.Deploy; +import japsadev.tools.*; +//import japsadev.tools.ConsensusGenerateCmd; +import japsadev.tools.work.*; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +/** + * This class is used to deploy tools: create a makefile to generate scripts + * + * @author Minh Duc Cao (http://www.caominhduc.org/) + */ +public class DevDeploy { + private static ArrayList tools = new ArrayList(); + static { + //tools.add(new SampleCmd()); + + tools.add(new CaptureVNTR()); + tools.add(new ResGeneGenomesCmd()); + tools.add(new GetFlankBlast()); + + tools.add(new String("Working commands")); + tools.add(new TreeHetLinageCmd()); + + tools.add(new FilterPEConcordance()); + + tools.add(new VNTRLongReadsHmmerCmd()); + + tools.add(new RepeatPrimerCmd()); + + tools.add(new ProfileDPCmd()); + + tools.add(new PlasmaAnalysisCmd()); + tools.add(new MethylationAnalysisCmd()); + tools.add(new MethylationAnalysis2Cmd()); + tools.add(new PlasmaAnalysisCrossCorrelationCmd()); + 
//tools.add(new RemoveNsCmd()); + tools.add(new XMDistance2Cmd()); + tools.add(new BuildMLSTTreeCmd()); + + tools.add(new BuildXMTreeCmd()); + tools.add(new FixNamesTreeCmd()); + tools.add(new VNTRDepthAnalyserCmd()); + tools.add(new VNTRDepthSumCmd()); + tools.add(new FixFastqNameCmd()); + tools.add(new SampleCmd()); + tools.add(new AnalyseCaptureCmd()); + tools.add(new BreakPointAnalysisCmd()); + tools.add(new CaptureProbeDesignCmd()); + tools.add(new ConvertProbeCmd()); + tools.add(new VNTRSelectCmd()); + tools.add(new NewScarfCmd()); + tools.add(new GetCDHitCmd()); + + tools.add(new GapCloserCmd()); + + tools.add(new SelectReadsCmd()); + tools.add(new StructuralVariationCmd()); + + tools.add(new VNTRClusteringCmd()); + tools.add(new VNTRClusteringHmmCmd()); + tools.add(new CheckInductionCmd()); + //new + //tools.add(new CheckInductionCmd()); + // + + } + + public static void main(String[] args) throws NoSuchFieldException, + SecurityException, IOException { + CommandLine cmdLine = new CommandLine(); + cmdLine.addString("mode", "install", "install or uinstall"); + cmdLine.addString("libs", "", "list of extenal libraries"); + cmdLine.addString("installDir", null, "the directory to install"); + cmdLine.addString("jlp", null, "Directories to libhdf5 and to jri"); + cmdLine.addString("xmx", null, "Set default maximum memory"); + cmdLine.addString("compiler", null, "Compiler version"); + cmdLine.addBoolean("version", false, "Get version and exit"); + cmdLine.addString("server", "na", "Run on server: yes/true for yes; no/false for no"); + + cmdLine.addStdHelp();// help + /********************** Standard processing ***************************/ + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + if (cmdLine.getBooleanVal("version")){ + System.out.println(Deploy.VERSION); + System.exit(0); + } + + ///Get command lines option + String mode = cmdLine.getStringVal("mode"); + + + Deploy.japsaPath = 
cmdLine.getStringVal("installDir"); + Deploy.compiler = cmdLine.getStringVal("compiler"); + Deploy.jlp = cmdLine.getStringVal("jlp"); + Deploy.libs = cmdLine.getStringVal("libs"); + Deploy.maxMem = cmdLine.getStringVal("xmx"); + String serverOpt = cmdLine.getStringVal("server").toLowerCase(); + + if(serverOpt.equals("yes") || serverOpt.equals("true")) + Deploy.server = 1; + if(serverOpt.equals("no") || serverOpt.equals("false")) + Deploy.server = 0; + + Deploy.japsaJar = "japsa-dev.jar"; + /**********************************************************************/ + + if ("install".equals(mode)) { + Deploy.setUpDirectory(); + + Deploy.setUpScripts(Deploy.tools,"jsa"); + Deploy.setUpScripts(tools, "jsa.dev"); + //System.out.println("Japsa-dev installtion complete\nFor your convenience, please add your PATH: " + Deploy.japsaPath + File.separator+"bin\n"); + System.out.println("Japsa-dev installtion complete\nFor your convenience, please add the following directory your PATH: " + Deploy.japsaPath + File.separator+"bin\n"); + } else if ("uninstall".equals(mode)) { + if (Deploy.uninstallLibraries()){ + Deploy.uninstallScripts(Deploy.tools,"jsa"); + Deploy.uninstallScripts(tools, "jsa.dev"); + } + } + else if ("galaxy".equals(mode)) { + ArrayList fullList = (ArrayList)Deploy.tools.clone(); + fullList.addAll(tools); + Deploy.setUpGalaxyScripts(fullList); + } + else { + System.err.println("Mode " + mode + " not recognised"); + System.err.println(cmdLine.usageString()); + System.exit(-1); + } + } + +} diff --git a/src/dev/java/japsadev/xm/ExpertModelTandem.java b/src/dev/java/japsadev/xm/ExpertModelTandem.java new file mode 100755 index 0000000..ee023ba --- /dev/null +++ b/src/dev/java/japsadev/xm/ExpertModelTandem.java @@ -0,0 +1,318 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 10/01/2012 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.xm; + +import java.util.Iterator; +import java.util.LinkedList; + +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.util.CommandLine; +import japsa.util.JapsaMath; +import japsa.xm.ExpertModel; +import japsa.xm.expert.AdaptiveMarkovExpert; +import japsa.xm.expert.Expert; +import japsa.xm.expert.MarkovExpert; +import japsadev.xm.expert.*; + +public class ExpertModelTandem { + // protected CombinationExpert baseEx; + MarkovExpert markovEx; + AdaptiveMarkovExpert adapMarkovEx;// + // = new MarkovExpert(null,2); + + // Weighted probability distribution of the next character over all models. + double[] finalD;// ,baseD; + + // W'ghted probability distribution of the next character over both markovs. + double[] markovD; + + int currentInd; + + // The cost incurred by the expert for not being certain of the correct + // symbol. 
+ double[] markovCost = null; + // double[][] tandemCosts; // tandems of length 3,4,5 + + int maxPeriod = 9; + + protected double repeatPrior; + protected double repeatPriorProb; + + public static double threshold; + public static int minLen = 10; + public static JapsaAnnotation anno; + + // Two expert seeds + // public TandemExpert tandemEx; + LinkedList panel; + + /* + * Parameters of the algorithm + */ + + public ExpertModelTandem(Alphabet alphabet, int context, + double listenThreshold) { + this(alphabet, context, listenThreshold, 9); + } + + public ExpertModelTandem(Alphabet alphabet, int context, + double listenThreshold, int max) { + super(); + + // this.hashSize = hashSize; + Expert.setParams(alphabet.size(), context); + this.repeatPrior = listenThreshold * context; + repeatPriorProb = JapsaMath.exp2(-repeatPrior); + + finalD = new double[alphabet.size()]; + markovD = new double[alphabet.size()];// Combination of the markov + + maxPeriod = max; + threshold = listenThreshold; + } + + public void printParams() { + System.out.println("Parameters:" + "\nContext : " + + Expert.CONTEXT_LENGTH + "\nListen Threshold : " + repeatPrior + / Expert.CONTEXT_LENGTH + "bps"); + } + + protected void initialiseCommon(Sequence seq) { + + anno = new JapsaAnnotation(seq); + // Background knowledge/based knowledge + markovEx = new MarkovExpert(2, 100); + adapMarkovEx = new AdaptiveMarkovExpert(1, 256, 100); + + markovCost = new double[seq.length()]; + + for (int i = 0; i < maxPeriod; i++) { + panel.add(new TandemExpert(seq, i + 1, Expert.CONTEXT_LENGTH)); + } + } + + /*****************************************************************/ + + protected void preCoding() { + // Get ranking of experts + double baseRateProb = markovEx.posteriorProb(); + double finalSum = markovEx.posteriorProb() + + adapMarkovEx.posteriorProb(); + + for (byte a = 0; a < Expert.alphabet().size(); a++) { + // baseD[a] = + markovD[a] = finalD[a] = markovEx.posteriorProb() + * markovEx.probability(a) + 
adapMarkovEx.posteriorProb() + * adapMarkovEx.probability(a); + + markovD[a] /= finalSum; + } + + // Listen to no more than expert limit experts + Iterator panelIter = panel.iterator(); + + while (panelIter.hasNext()) {// Inv: head.next = ptr + // double rate = ptr.rate();//msg leng over an history + TandemExpert ptr = panelIter.next(); + double score = ptr.posteriorProb(); + // Only listen if the offset/palindrome is sufficiently good + if (score > baseRateProb / repeatPriorProb) { + for (byte a = 0; a < Expert.alphabet().size(); a++) { + finalD[a] += ptr.probability(a) * score; + } + // ptr.weight = myCost; + finalSum += score; + } + // ptr = ptr.getNext(); + } + + // Normalise + for (byte a = 0; a < Expert.alphabet().size(); a++) { + finalD[a] /= finalSum; + } + } + + /** + * Update all experts at this positition + * + * @param seqArray + * @param i + * @param sid + */ + protected void updateExperts(byte c) { + markovEx.update(c); + adapMarkovEx.update(c); + + /********************* Update Exp ********************/ + Iterator panelIter = panel.iterator(); + + while (panelIter.hasNext()) { + TandemExpert ptr = panelIter.next(); + ptr.update(c);// This actually the prob + // ptr = ptr.getNext(); + } + } + + protected void updateTandemExpertsAtPos(int pos) { + + Iterator panelIter = panel.iterator(); + while (panelIter.hasNext()) { + panelIter.next().updatePos(pos); + } + } + + protected void postCoding(Sequence seqArray) { + } + + /** Get the cost array for the tandem expert. 
*/ + public void calculateTandemCosts(Sequence seq) { + initialiseCommon(seq); + + for (currentInd = 0; currentInd < seq.length(); currentInd++) { + updateTandemExpertsAtPos(currentInd); + } + } + + /** + * Done only after calculate Tandem cost + */ + public void filtering() { + } + + // double gain; + public double[] encode(Sequence seqArray) { + + initialiseCommon(seqArray); + + // Get the sequence to be encode + // byte japsa.seq[] = seqArray.toBytes(); + double[] costs = new double[seqArray.length()]; + + // double markovTotal = 0; + // double totalCost = 0.0; + // go thru the sequence and encode each charactor + for (currentInd = 0; currentInd < seqArray.length(); currentInd++) { + // Compute the probability distribution + preCoding(); + + int actual = seqArray.getBase(currentInd); + // double cost = -JapsaMath.log2(finalD[actual]); + + markovCost[currentInd] = -JapsaMath.log2(markovD[actual]); + + // costs[currentInd] = cost; + // totalCost += cost; + + // markovTotal += markovCost[currentInd]; + updateExperts(seqArray.getBase(currentInd)); + } + + // gain = (totalCost - markovTotal) / seqArray.length(); + + // System.out.print(gain + " " + (markovTotal / seqArray.length()) + + // " "); + + return costs; + } + + public static CommandLine prepareCmd() { + CommandLine cmdLine = new CommandLine(); + cmdLine.addInt("context", 15, "Length of the context"); + cmdLine.addDouble("threshold", 0.15, "Listen threshold"); + + return cmdLine; + } + + public static void main11(String[] args) throws Exception { + + // Get params from users + + CommandLine cmdLine = prepareCmd(); + + args = cmdLine.parseLine(args); + System.out.println(ExpertModel.version()); + + if (args == null || args.length <= 0) { + System.err + .println("Usage: java CommandLine [options] file1 file2 ...\n" + + cmdLine.usageMessage() + "\n"); + System.exit(1); + } + + ExpertModelTandem tModel = new ExpertModelTandem(Alphabet.DNA4(), 5, + cmdLine.getDoubleVal("threshold")); + + // Print out all params + 
tModel.printParams(); + + // cmdLine.printOptions(); + + FastaReader fReader = new FastaReader(args[0]); + + Sequence seq; + + while ((seq = fReader.nextSequence(Alphabet.DNA4())) != null) { + System.out.print("\n" + seq.getName() + " : "); + + // long start = System.currentTimeMillis(); + // seqHash[1] = japsa.seq; + // double[] costs = + tModel.encode(seq);// ,args[1]); + + // long time = (System.currentTimeMillis() - start); + + // System.out.printf("Compress in %d ms\n", time); + // IOTools.writeDoubleSequence("cost.info", "Cost", costs); + // IOTools.writeDoubleSequence("markov.info", "Markov Cost", + // tModel.markovCost); + // for (int idx = 0; idx < tModel.tandemCosts.length; idx++) { + // IOTools.writeDoubleSequence("tandem" + (idx + 2) + ".info", + // "Cost of tandem " + (idx + 2), tModel.tandemCosts[idx]); + // } + // System.out.println(tModel.gain); + + System.out + .println("============================================================================="); + + /*************************************************************************/ + + } + fReader.close(); + } + +} diff --git a/src/dev/java/japsadev/xm/TandemFinder.java b/src/dev/java/japsadev/xm/TandemFinder.java new file mode 100644 index 0000000..23b6311 --- /dev/null +++ b/src/dev/java/japsadev/xm/TandemFinder.java @@ -0,0 +1,184 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 15/03/2013 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.xm; + +/** + * @author Minh Duc Cao (http://www.caominhduc.org/) + * + */ + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; + +import java.util.*; + +/** + * A Tool for finding tandem repeats. + * + * @author Jackson Gatenby (jackson.gatenby@gmail.com) + * + * Usage: + * + *
+ * java bio.xm.TandemFinder [reads.fq]
+ * 
+ * + */ +public class TandemFinder { + + static ExpertModelTandem tModel; + + // Thresholds for determining if a tandem repeat exists + public static double REPEAT_THRESHOLD = 1.0; + + /** + * Determine if a sequence has a tandem repeat in it. + * + * @param seqArray + * the sequence + * @return an empty set if no repeat, else a set of the possible lengths of + * the repeat unit. + */ + public static Set classify(Sequence seqArray) { + /* + * - Do the encoding. - If one of the tandem costs has a low region, it + * wins. + */ + tModel.calculateTandemCosts(seqArray); + + // In case a count of each successful candidate is needed, use a map. + // Map candidates = new HashMap(); + Set candidates = new HashSet(); + + /**************************************************************** + * for (int i = 1; i < seqArray.length(); ++i) { for (int t = 0; t < + * tModel.tandemCosts.length; ++t) { if (tModel.tandemCosts[t][i] < + * REPEAT_THRESHOLD) { // We're sure about the sequence repeating. + * candidates.add(t + 2); } } } / + ****************************************************************/ + + /* + * Debug: *-/ for (int n : candidates) { System.out.print(n + ", "); } + * System.out.println(); // + */ + + // If there is a repeat, multiples of the repeat period will occur + return candidates; + } + + @SuppressWarnings("unused") + private static int mostCommon(Map nums) { + int best = 0, bestScore = -1; + for (int n : nums.keySet()) { + if (nums.get(n) > bestScore) { + best = n; + bestScore = nums.get(n); + } + } + return best; + } + + private static int gcd(Collection nums) { + int res = 0; + for (int n : nums) { + res = gcd(res, n); + } + return res; + } + + private static int gcd(int a, int b) { + while (b != 0) { + int c = a; + a = b; + b = c % b; + } + return a; + } + + public static CommandLine prepareCmd() { + CommandLine cmdLine = new CommandLine(); + cmdLine.addInt("context", 15, "Length of the context"); + cmdLine.addDouble("threshold", 0.15, "Listen threshold"); + 
cmdLine.addBoolean("all-periods", false, + "Show all candidate repeat periods"); + return cmdLine; + } + + public static void main(String[] args) throws Exception { + // Get params from users + + CommandLine cmdLine = prepareCmd(); + + args = cmdLine.parseLine(args); + + if (args == null || args.length <= 0) { + System.err + .println("Usage: java bio.xm.TandemFinder [options] file\n" + + cmdLine.usageMessage() + "\n"); + System.exit(1); + } + + tModel = new ExpertModelTandem(Alphabet.DNA5(), 20, + cmdLine.getDoubleVal("threshold")); + + // cmdLine.printOptions(); + + // SequenceFileReader fReader + SequenceReader fReader = SequenceReader.getReader(args[0]); + + Sequence s; + while ((s = fReader.nextSequence(Alphabet.DNA5())) != null) { + Set candidates = classify(s); + int period = gcd(candidates); + System.out.print(s.getName() + "\t" + period); + if (cmdLine.getBooleanVal("all")) { + // Show all potential candidates + System.out.print("\t"); + Iterator it = candidates.iterator(); + for (int i = 0; i < candidates.size(); ++i) { + int c = it.next(); + if (i == 0) { + System.out.print(c); + } else { + System.out.print(" " + c); + } + } + } + System.out.println(); + } + } +} diff --git a/src/dev/java/japsadev/xm/TandemRepeatFinder.java b/src/dev/java/japsadev/xm/TandemRepeatFinder.java new file mode 100644 index 0000000..ff36a24 --- /dev/null +++ b/src/dev/java/japsadev/xm/TandemRepeatFinder.java @@ -0,0 +1,133 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 15/03/2013 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsadev.xm; + +/** + * @author Minh Duc Cao (http://www.caominhduc.org/) + * + */ + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsadev.xm.expert.TandemExpert; + +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Iterator; + +/** + * A Tool for finding tandem repeats. 
+ * + * + */ + +public class TandemRepeatFinder { + /** + * Determine if a sequence has a tandem repeat in it. + * + * @param seqArray + * the sequence + * @return an empty set if no repeat, else a set of the possible lengths of + * the repeat unit. + */ + + public static void main(String[] args) throws Exception { + // Get params from users + CommandLine cmdLine = new CommandLine(); + cmdLine.addInt("max", 8, "Maximum period size"); + cmdLine.addString("output", "-", "Output file"); + cmdLine.addInt("context", 10, "Length of the context"); + cmdLine.addDouble("threshold", 0.9, "Listen threshold"); + + args = cmdLine.parseLine(args); + + if (args == null || args.length <= 0) { + System.err + .println("Usage: java bio.xm.TandemFinder [options] file\n" + + cmdLine.usageMessage() + "\n"); + System.exit(1); + } + + int max = cmdLine.getIntVal("max"); + double threshold = cmdLine.getDoubleVal("threshold"); + + ExpertModelTandem tModelF = new ExpertModelTandem(Alphabet.DNA5(), + cmdLine.getIntVal("context"), threshold, max); + + // SequenceFileReader fReader + SequenceReader fReader = SequenceReader.getReader(args[0]); + + Sequence s; + + SequenceOutputStream outBFF; + + String output = cmdLine.getStringVal("output"); + if (!output.equals("-")) { + outBFF = new SequenceOutputStream(new FileOutputStream(output)); + } else { + outBFF = new SequenceOutputStream(System.out); + } + while ((s = fReader.nextSequence(Alphabet.DNA5())) != null) { + + System.out.println(s.getName()); + tModelF.calculateTandemCosts(s); + ExpertModelTandem.anno.writeAnnotation(outBFF); + + // tModelB.calculateTandemCosts(s.reverseComplement()); + + /***************************************************/ + + Iterator iter = tModelF.panel.iterator(); + for (int m = 0; m < max; m++) { + TandemExpert exp = iter.next(); + PrintStream out = new PrintStream(new FileOutputStream( + s.getName() + (m + 1) + ".info")); + PrintStream out2 = new PrintStream(new FileOutputStream( + s.getName() + (m + 1) + ".inf")); 
+ for (int i = 0; i < s.length(); i++) { + out.println(exp.tandemCosts[i]); + out2.println(exp.positive[i]); + } + out.close(); + out2.close(); + } + /***************************************************/ + } + outBFF.close(); + } +} diff --git a/src/dev/java/japsadev/xm/expert/FixDistributionExpert.java b/src/dev/java/japsadev/xm/expert/FixDistributionExpert.java new file mode 100755 index 0000000..9e4a00f --- /dev/null +++ b/src/dev/java/japsadev/xm/expert/FixDistributionExpert.java @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/** + * Written by Chris Mears and modified by Minh Duc Cao + */ +package japsadev.xm.expert; + +import japsa.xm.expert.Expert; + +public class FixDistributionExpert extends Expert { + double[] dist = { .2, .3, .3, .2 }; + + // double[] dist = {.4,.1,.1,.4}; + public FixDistributionExpert() { + super(); + } + + public double probability(int character) { + return dist[character]; + } + + public double update(int actual) { + return 1.0; + } + + public String toString() { + return "ME"; + } +} diff --git a/src/dev/java/japsadev/xm/expert/RepeatSubsExpertFix.java b/src/dev/java/japsadev/xm/expert/RepeatSubsExpertFix.java new file mode 100755 index 0000000..41d2ea1 --- /dev/null +++ b/src/dev/java/japsadev/xm/expert/RepeatSubsExpertFix.java @@ -0,0 +1,110 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.expert; + +import japsa.seq.AbstractSequence; +import japsa.util.MyBitSet; +import japsa.xm.expert.RepeatExpert; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintStream; + +public class RepeatSubsExpertFix extends RepeatExpert { + + public static double[][] globalMatrix = { { .25, .25, .25, .25 }, + { .25, .25, .25, .25 }, { .25, .25, .25, .25 }, + { .25, .25, .25, .25 } }; + + public RepeatSubsExpertFix(AbstractSequence seq, int start, MyBitSet b, + int type) { + super(seq, start, b, type); + } + + public double probability(int character) { + + int match; + if (expertType != PALIN_TYPE) + match = character; + else + match = 3 - character; + + return globalMatrix[seq.symbolAt(currentPointer)][match]; + } + + public double update(int actual) { + // Move current pointer to the next char in knowledge + if ((currentPointer - start) * expertType >= length) { + return -1.0; + } + + double prob = probability(actual); + + currentPointer += expertType; + updateCost(prob); + return prob; + + } + + public static void readMatrix(BufferedReader in) throws IOException { + int x = 0; + String line = ""; + while ((line = in.readLine()) != null) { + line = line.trim(); + if (line.startsWith("#")) + continue; + String[] tks = line.split(" +"); + for (int i = 0; i < 4; i++) { + globalMatrix[x][i] = Double.parseDouble(tks[i]); + } + + x++; + if (x >= 4) + break; + } + if (x < 4) { + System.err.println("There are only " + x + " lines "); + } + } + + public static void printMatrix(PrintStream out) { + for (int x = 0; x < globalMatrix.length; x++) { + for (int y = 0; y < globalMatrix[x].length; y++) { + out.printf("%6.4f ", globalMatrix[x][y]); + } + out.println(); + } + } + + public RepeatExpert duplicate(AbstractSequence seq_, int start_, MyBitSet b_) { + return new RepeatSubsExpertFix(seq_, start_, b_, expertType); + } + +} diff --git 
a/src/dev/java/japsadev/xm/expert/RepeatSubsExpertLearn.java b/src/dev/java/japsadev/xm/expert/RepeatSubsExpertLearn.java new file mode 100755 index 0000000..3aca325 --- /dev/null +++ b/src/dev/java/japsadev/xm/expert/RepeatSubsExpertLearn.java @@ -0,0 +1,260 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.expert; + +import japsa.seq.AbstractSequence; +import japsa.util.MyBitSet; +import japsa.xm.expert.Expert; +import japsa.xm.expert.RepeatExpert; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintStream; +import java.util.StringTokenizer; + +public class RepeatSubsExpertLearn extends RepeatExpert { + + public static double[][] countMatrix = { { 0.0, 0.0, 0.0, 0.0 }, + { 0.0, 0.0, 0.0, 0.0 }, { 0.0, 0.0, 0.0, 0.0 }, + { 0.0, 0.0, 0.0, 0.0 } }; + + public static int countSeq[] = { 0, 0, 0, 0 }; + public static int countBg[] = { 0, 0, 0, 0 }; + static int countSeqAll = 0, countBgAll = 0; + + // public static double[] dnaProb = + // {.4043986778540554, 0.2972285786931096, 0.19946605644546148, + // 0.09890668700737351}; + // {.25 ,.25 ,.25, .25}; + + // #DEBUG_BEGIN + public static int longestL = 0; + public static double bestInfoGain = 0; + // #DEBUG_END + + public static double[][] globalMatrix = + // {{59.8817, 5.0548, 29.9753, 5.0882},{3.0111, 64.9124, 7.0311, 25.0454}, + // {29.9853, 4.0224, 59.9208, 6.0715}, {4.9642, 30.0038, 4.9152 , 60.116}}; + // {{40,10,10,10},{10,40,10,10},{10,10,40,10},{10,10,10,40}}; + { { .50, .10, .30, .10 }, { .05, .60, .10, .25 }, { .20, .04, .70, .06 }, + { .05, .05, .05, .80 } }; + + // double[][] myMatrix; + + // double[] sum; + + double[][] myCount; + + // static double [] bgFreq = {0.1,0.2,.3,.4}; + // static double [] seqFreq = {0.25,0.25,.25,.25}; + + public RepeatSubsExpertLearn(AbstractSequence seq, int start, MyBitSet b, + int type) { + + super(seq, start, b, type); + // Copy to my matrix + // myMatrix = new + // double[Expert.alphabet().size()][Expert.alphabet().size()]; + myCount = new double[Expert.alphabet().size()][Expert.alphabet().size()]; + + // sum = new double[Expert.alphabet().size()]; + for (int i = 0; i < Expert.alphabet().size(); i++) { + // sum[i] = 0.0; + for (int j = 0; j < 
Expert.alphabet().size(); j++) { + // myMatrix[i][j] = globalMatrix[i][j]; + // sum[i] += myMatrix[i][j]; + myCount[i][j] = 1.0; + } + } + } + + public double probability(int character) { + + int match; + if (expertType != PALIN_TYPE) + match = character; + else + match = 3 - character; + + return globalMatrix[seq.symbolAt(currentPointer)][match]; + // myMatrix[japsa.seq[currentPointer]][match] / + // sum[japsa.seq[currentPointer]]; + } + + public double update(int actual) { + + // if (encodeCount < countSize.length) + // countSize[encodeCount] ++; + + // Move current pointer to the next char in knowledge + if ((currentPointer - start) * expertType >= length) { + return -1.0; + } + + double prob = probability(actual); + + int match; + if (expertType != PALIN_TYPE) + match = actual; + else + match = 3 - actual; + + myCount[seq.symbolAt(currentPointer)][match] += 1.0;// weight / + // Expert.repWeight;// + // Expert.repWeight + + // myMatrix[japsa.seq[currentPointer]][match] += 1.0; + // sum[japsa.seq[currentPointer]] += 1; + + currentPointer += expertType; + updateCost(prob); + return prob; + + } + + public static void readMatrix(BufferedReader in) throws IOException { + int x = 0; + String line = ""; + while ((line = in.readLine()) != null) { + line = line.trim(); + if (line.startsWith("#")) + continue; + StringTokenizer tk = new StringTokenizer(line); + for (int i = 0; i < 4; i++) { + globalMatrix[x][i] = Double.parseDouble(tk.nextToken()); + } + + x++; + if (x >= 4) + break; + } + if (x < 4) { + System.err.println("There are only " + x + " lines "); + } + } + + public static void printMatrix(PrintStream out) { + for (int x = 0; x < RepeatSubsExpertLearn.globalMatrix.length; x++) { + for (int y = 0; y < RepeatSubsExpertLearn.globalMatrix[x].length; y++) { + out.printf("%6.4f ", RepeatSubsExpertLearn.globalMatrix[x][y]); + } + out.println(); + } + } + + public static void summary() { + /** ****************************************************** */ + double countSum[] 
= { 0, 0, 0, 0 }; + double countSumSeq[] = { 0, 0, 0, 0 }; + for (int x = 0; x < RepeatSubsExpertLearn.countMatrix.length; x++) { + for (int y = 0; y < RepeatSubsExpertLearn.countMatrix[x].length; y++) { + System.out.printf("%6.2f ", + RepeatSubsExpertLearn.countMatrix[x][y]); + countSum[x] += RepeatSubsExpertLearn.countMatrix[x][y]; + countSumSeq[y] += RepeatSubsExpertLearn.countMatrix[x][y]; + } + System.out.println(); + } + System.out + .println("----------------------------------------------------------"); + + // #DEBUG_BEGIN + System.out.println(" Longest = " + longestL + "\n Best Infogain = " + + bestInfoGain); + + longestL = 0; + bestInfoGain = 0; + // #DEBUG_END + + System.out + .println("----------------------------------------------------------"); + for (int y = 0; y < RepeatSubsExpertLearn.countMatrix.length; y++) { + for (int x = 0; x < RepeatSubsExpertLearn.countMatrix[y].length; x++) { + System.out.printf("%6.4f ", 100 + * RepeatSubsExpertLearn.countMatrix[x][y] + / countSumSeq[y]); + } + System.out.println(); + } + System.out + .println("----------------------------------------------------------"); + + for (int x = 0; x < RepeatSubsExpertLearn.countMatrix.length; x++) { + for (int y = 0; y < RepeatSubsExpertLearn.countMatrix[x].length; y++) { + // System.out.printf("%6.4f ", 100 + // * RepeatSubsExpertLearn.countMatrix[x][y] / countSum[x]); + + RepeatSubsExpertLearn.globalMatrix[x][y] = (RepeatSubsExpertLearn.countMatrix[x][y] / countSum[x]); + // * (bgFreq[x] / (countBg[x] * 1.0/countBgAll)); + countMatrix[x][y] = 0.0; + }// normalise(RepeatSubsExpertLearn.globalMatrix[x]); + // System.out.println(); + } + printMatrix(System.out); + // for (int x = 0; x < RepeatSubsExpertLearn.globalMatrix.length; x++) { + // for (int y = 0; y < RepeatSubsExpertLearn.globalMatrix[x].length; + // y++) { + // System.out.printf("%6.4f ", + // RepeatSubsExpertLearn.globalMatrix[x][y]); + // } + // System.out.println(); + // } + + countSeqAll = countBgAll = 0; + + 
System.out + .println("===================================================="); + /*********************************************************************** + * for (int x = 0; x < RepeatExpert.countSize.length; x ++){ if + * (RepeatExpert.countSize[x] > 0) System.err.println(x+"\t" + + * JapsaMath.log2(RepeatExpert.countSize[x])); else + * System.err.println(x+"\t0.0"); } + * /********************************************************* + * + * //double countProb = for (int x = 0; x < countProb.length; x ++){ + * System.err.println((x * (COUNT_MAX-COUNT_MIN)/countProb.length)+"\t" + * + (countProb[x])); } / + **********************************************************************/ + } + + public static void normalise(double[] ar) { + double sumA = 0; + for (int i = 0; i < ar.length; i++) { + sumA += ar[i]; + } + for (int i = 0; i < ar.length; i++) { + ar[i] /= sumA; + } + } + + public RepeatExpert duplicate(AbstractSequence seq_, int start_, MyBitSet b_) { + return new RepeatSubsExpertLearn(seq_, start_, b_, expertType); + } +} diff --git a/src/dev/java/japsadev/xm/expert/TandemExpert.java b/src/dev/java/japsadev/xm/expert/TandemExpert.java new file mode 100755 index 0000000..a9f3eeb --- /dev/null +++ b/src/dev/java/japsadev/xm/expert/TandemExpert.java @@ -0,0 +1,236 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/* Revision History
+ * 10/01/2012 - Minh Duc Cao: Created
+ *
+ ****************************************************************************/
+
+package japsadev.xm.expert;
+
+import japsa.bio.tr.TandemRepeat;
+import japsa.seq.Sequence;
+import japsa.util.JapsaMath;
+import japsa.xm.expert.Expert;
+import japsadev.xm.ExpertModelTandem;
+
+import java.util.Arrays;
+
+/**
+ * Expert that predicts the next symbol on the assumption that the current
+ * base belongs to a tandem repeat of a fixed period: it bets that the symbol
+ * equals the one {@code period} positions earlier, with a confidence learnt
+ * from how often that bet was right recently.
+ * <p>
+ * Two update paths exist: {@link #update(int)} only maintains the counts,
+ * while {@link #updatePos(int)} additionally records per-position costs and
+ * reports detected repeats to {@code ExpertModelTandem.anno}.
+ *
+ * @author Minh Duc Cao
+ *
+ */
+public class TandemExpert extends Expert {
+	// Sequence whose earlier symbols are consulted for the prediction.
+	Sequence seq;
+	// Distance (in symbols) between a position and the one it is compared to.
+	private int period = 3;
+	// Number of recent comparisons that contribute to the counts.
+	private int learnWindow = 20;
+	// Number of symbols consumed so far by update()/updatePos().
+	private int currentPos = 0;
+	// Comparison counts, seeded with 4 trials and 1 hit.
+	private int totalCounts = 4, countRight = 1;
+
+	// Running sum of message lengths and number of predictions (seeded 2.5 / 1);
+	// their ratio is the average cost used as the baseline in updatePos().
+	double totalGain = 2.5, countPredict = 1;
+	// Message length (-log2 probability) recorded for each position by updatePos().
+	public double[] tandemCosts;
+	// Per-position value written by updatePos() (see the note in that method).
+	public double[] positive;
+	// Consecutive positions that failed the "better than baseline" test.
+	int trial = 0;
+	// Length and accumulated cost of the candidate repeat currently tracked.
+	int currentRun = 0;
+	double currentSum = 0;
+	// Not referenced anywhere in this class.
+	double trialInfo = 0;
+
+	// Failures tolerated before the candidate run is closed.
+	int maxTrial = 3;
+	// Circular buffer of recent outcomes: -1 unused, 0 miss, 1 hit.
+	int windowIndex = 0;
+	int[] windowCount;
+
+	// Sum of the last four message lengths, seeded as four costs of 2 bits.
+	double sumLastFour = 8;
+
+	/**
+	 * Creates an expert with a learning window of 20 comparisons.
+	 *
+	 * @param seq
+	 *            sequence to model
+	 * @param period
+	 *            repeat period to bet on
+	 */
+	public TandemExpert(Sequence seq, int period) {
+		this(seq, period, 20);
+	}
+
+	/**
+	 * Creates an expert and allocates its per-position arrays (one slot per
+	 * symbol of {@code seq}) and its outcome window.
+	 *
+	 * @param seq
+	 *            sequence to model
+	 * @param period
+	 *            repeat period to bet on
+	 * @param window
+	 *            number of recent comparisons kept in the counts
+	 */
+	public TandemExpert(Sequence seq, int period, int window) {
+		super();
+		this.seq = seq;
+		this.period = period;
+		learnWindow = window;
+
+		tandemCosts = new double[seq.length()];
+		positive = new double[seq.length()];
+
+		windowCount = new int[learnWindow];
+		Arrays.fill(windowCount, -1);
+
+		// Tolerate two thirds of a period of bad predictions, but at least 4.
+		maxTrial = period * 2 / 3;
+		if (maxTrial < 4)
+			maxTrial = 4;
+
+		// totalCounts = learnWindow;
+		// countRight = totalCounts - 3 * totalCounts / 4;
+	}
+
+	/**
+	 * Probability assigned to {@code character} at the current position.
+	 * <p>
+	 * Until {@code period} symbols have been consumed a flat 0.25 is returned.
+	 * Afterwards the symbol found {@code period} positions back receives
+	 * {@code countRight / totalCounts}; the remaining mass is shared equally
+	 * by the other symbols of the alphabet.
+	 * <p>
+	 * If that ratio is not strictly between 0 and 1 the method prints a
+	 * diagnostic with a stack trace and terminates the JVM.
+	 *
+	 * @param character
+	 *            symbol code to score
+	 * @return probability of {@code character}
+	 */
+	@Override
+	public double probability(int character) {
+		if (currentPos < period)
+			return 0.25;
+
+		double prob = (countRight + 0.0) / (totalCounts);
+		if (prob <= 0 || prob >= 1) {
+			System.err.println("Error " + prob + " " + totalCounts + " "
+					+ countRight);
+			(new Exception()).printStackTrace();
+			System.exit(1);
+		}
+
+		if (seq.symbolAt(currentPos - period) == character)
+			return prob;
+		else
+			return (1 - prob) / (Expert.alphabet().size() - 1);
+	}
+
+	/**
+	 * Scores {@code actual}, folds the outcome into the counts and advances
+	 * the current position by one.
+	 *
+	 * @param actual
+	 *            symbol observed at the current position
+	 * @return probability this expert had assigned to {@code actual}
+	 */
+	@Override
+	public double update(int actual) {
+		double prob = probability(actual);
+
+		// update position
+
+		// update counts
+		if (currentPos >= period) {
+			totalCounts++;
+			if (seq.symbolAt(currentPos - period) == actual)
+				countRight++;
+		}
+
+		// Drop the comparison made learnWindow positions ago; its outcome is
+		// recomputed from the sequence rather than remembered.
+		if (currentPos >= learnWindow + period) {
+			totalCounts--;
+			if (seq.symbolAt(currentPos - learnWindow) == seq
+					.symbolAt(currentPos - learnWindow - period))
+				countRight--;
+		}
+		currentPos++;
+
+		updateCost(prob);
+		return prob;
+	}
+
+	/**
+	 * Variant of {@link #update(int)} that reads the symbol at {@code pos}
+	 * from the sequence, records its cost, and tracks candidate tandem
+	 * repeats, adding each sufficiently long one to
+	 * {@code ExpertModelTandem.anno}.
+	 * <p>
+	 * NOTE(review): the prediction and the counts are driven by the internal
+	 * position counter while the cost arrays are indexed by {@code pos};
+	 * presumably callers pass consecutive positions starting at 0 - confirm.
+	 *
+	 * @param pos
+	 *            position in the sequence to process
+	 * @return probability this expert had assigned to the symbol at
+	 *         {@code pos}
+	 */
+	public double updatePos(int pos) {
+		int actual = seq.symbolAt(pos);
+		double prob = probability(actual);
+
+		// if (posSrc == 1000){
+		// int x;
+		// x = 1;
+		// }
+		// update counts
+
+		if (currentPos >= period) {
+			// Evict the outcome stored in the slot about to be reused.
+			if (windowCount[windowIndex] == 0) {
+				totalCounts--;
+			}
+			if (windowCount[windowIndex] == 1) {
+				totalCounts--;
+				countRight--;
+			}
+
+			totalCounts++;
+			if (seq.symbolAt(currentPos - period) == actual
+			// || countRight * 1.6 < totalCounts
+			) {
+				windowCount[windowIndex] = 1;
+				countRight++;
+			} else {
+				windowCount[windowIndex] = 0;
+			}
+
+			windowIndex++;
+			if (windowIndex >= windowCount.length)
+				windowIndex = 0;
+		}
+
+		currentPos++;
+
+		double msgLen = -JapsaMath.log2(prob);
+		tandemCosts[pos] = msgLen;
+
+		// Slide the four-position cost window; the first four positions
+		// replace the seeded 2-bit costs.
+		if (pos >= 4) {
+			this.sumLastFour -= tandemCosts[pos - 4];
+		} else
+			this.sumLastFour -= 2;
+
+		sumLastFour += msgLen;
+
+		// "Good" position: the recent average cost beats the running average
+		// by more than the configured threshold.
+		if (sumLastFour / 4 < totalGain / countPredict
+				- ExpertModelTandem.threshold) {
+			trial = 0;
+			currentRun++;
+			currentSum += msgLen;
+			positive[pos] = currentSum / currentRun;
+		} else {
+			trial++;
+			if (trial < maxTrial) {
+				// Still within tolerance: keep extending the run.
+				currentRun++;
+				currentSum += msgLen;
+				positive[pos] = currentSum / currentRun;
+			} else {
+				// Run is over; report it if the good part is long enough.
+				if (currentRun - trial > ExpertModelTandem.minLen) {
+					int start = pos - currentRun - period + 1 - learnWindow;
+
+					if (start < 1)
+						start = 1;
+					TandemRepeat str = new TandemRepeat(seq.getName(), start,
+							pos - maxTrial);
+					str.setPeriod(period);
+					str.setScore(totalGain / countPredict - currentSum
+							/ currentRun);
+					str.setUnitNo((double) str.getLength() / period);
+
+					str.setID("M" + str.getStart());
+
+					str.addDesc("@R:" + period);
+					// Unit no
+					str.addDesc("@N:" + str.getUnitNo());
+					// myCost
+					str.addDesc("@S:" + str.getScore());
+
+					ExpertModelTandem.anno.add(str);
+				}
+				positive[pos] = -0.5;
+				currentRun = 0;
+				currentSum = 0;
+			}
+		}
+
+		// NOTE(review): this unconditional store overwrites every value
+		// assigned to positive[pos] in the branches above - confirm which of
+		// the two was intended.
+		positive[pos] = sumLastFour / 4;
+
+		totalGain += msgLen;
+		countPredict++;
+
+		updateCost(prob);
+		return prob;
+	}
+
+}
diff --git a/src/dev/java/japsadev/xm/expert/UniformExpert.java b/src/dev/java/japsadev/xm/expert/UniformExpert.java
new file mode 100755
index 0000000..62c618e
--- /dev/null
+++ b/src/dev/java/japsadev/xm/expert/UniformExpert.java
@@ -0,0 +1,56 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/**
+ * Written by Chris Mears, modified and maintained by Minh Duc Cao
+ */
+package japsadev.xm.expert;
+
+import japsa.xm.expert.Expert;
+
+/**
+ * Expert that spreads its probability evenly over the alphabet, whatever has
+ * been seen so far.
+ */
+public class UniformExpert extends Expert {
+
+	public UniformExpert() {
+		super();
+	}
+
+	/** One over the current size of {@code Expert.alphabet()}. */
+	private static double evenShare() {
+		return 1.0 / Expert.alphabet().size();
+	}
+
+	/**
+	 * @param character
+	 *            symbol code (ignored)
+	 * @return the same probability for every symbol
+	 */
+	@Override
+	public double probability(int character) {
+		return evenShare();
+	}
+
+	/**
+	 * Records the uniform probability as this expert's cost for the position.
+	 *
+	 * @param actual
+	 *            symbol observed (ignored)
+	 * @return the uniform probability
+	 */
+	@Override
+	public double update(int actual) {
+		final double share = evenShare();
+		updateCost(evenShare());
+		return share;
+	}
+
+	/** @return the short tag {@code "UE"} */
+	@Override
+	public String toString() {
+		return "UE";
+	}
+}
diff --git a/src/dev/java/japsadev/xm/genome/AdaptiveMarkovExpertLong.java b/src/dev/java/japsadev/xm/genome/AdaptiveMarkovExpertLong.java
new file mode 100755
index 0000000..3fd5531
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/AdaptiveMarkovExpertLong.java
@@ -0,0 +1,134 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2.
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+/**
+ * Expert backed by an {@link AdaptiveMarkovLong} model: a Markov model of a
+ * given order whose counts cover only the most recent symbols.
+ */
+public class AdaptiveMarkovExpertLong extends ExpertLong {
+	private AdaptiveMarkovLong markov;
+
+	/**
+	 * @param seq
+	 *            sequence handed to the {@code ExpertLong} constructor
+	 * @param order
+	 *            order of the underlying Markov model
+	 */
+	public AdaptiveMarkovExpertLong(GenomeSequence seq, int order) {
+		super(seq);
+		markov = new AdaptiveMarkovLong(order);
+		// posProb = 1.0 /3;
+	}
+
+	/** No-op for this expert. */
+	public void resurrect(GenomeSequence work, long i, int past) {
+	}
+
+	/** No-op for this expert. */
+	public void resign() {
+	}
+
+	/** @return the model's probability of {@code character} in the current context */
+	public double probability(int character) {
+		return markov.probability(character);
+	}
+
+	/** Always returns 0. */
+	public int copyFrom(int i) {
+		return 0;
+	}
+
+	/**
+	 * Scores {@code actual} with the model as it stands, records that score,
+	 * and only then lets the model learn the symbol.
+	 *
+	 * @return the probability the model assigned to {@code actual} (named
+	 *         "cost" in the code, but it is a probability)
+	 */
+	public double update(int actual) {
+		double cost = (markov.probability(actual));
+		updateCost(cost);
+		markov.update(actual);
+		return cost;
+	}
+
+	/** @return the short tag {@code "ME"} */
+	public String toString() {
+		return "ME";
+	}
+
+	/** Empty entry point. */
+	public static void main(String[] args) {
+
+	}
+
+	/** Always returns 0. */
+	public int copyFrom() {
+		return 0;
+	}
+
+	/** No-op. */
+	public void learn() {
+	}
+}
+
+/**
+ * Count-based Markov model over symbol codes whose statistics forget symbols
+ * older than {@code HIS_SIZE} updates.
+ */
+class AdaptiveMarkovLong {
+	// charCounts[context * ALPHABET_SIZE + symbol]: occurrences of symbol after context.
+	int[] charCounts;
+	// countTotal[context]: total occurrences recorded for context.
+	int[] countTotal;
+	int order;
+	int currentInd = 0;// index of current context
+	int MASK;// matrix size
+
+	// Circular buffer of the last HIS_SIZE symbols, used to undo old counts.
+	int[] history;
+	int HIS_SIZE = 256;
+	// Context index of the oldest symbol still counted.
+	int backInd = 0;
+	// Next slot of history to write (and, once full, the oldest entry).
+	int ind = 0;
+
+	/**
+	 * Allocates the count tables with one pseudo-count per symbol in every
+	 * context.
+	 * <p>
+	 * NOTE(review): for a negative order the tables are never allocated, so
+	 * update/probability would fail - presumably order is always >= 0.
+	 *
+	 * @param order
+	 *            number of preceding symbols forming a context
+	 */
+	public AdaptiveMarkovLong(int order) {
+		this.order = order;
+		MASK = (int) Math.pow(ExpertLong.ALPHABET_SIZE, order);
+
+		history = new int[HIS_SIZE];
+		if (order >= 0) {
+			charCounts = new int[MASK * ExpertLong.ALPHABET_SIZE];
+			countTotal = new int[MASK];
+			for (int i = 0; i < charCounts.length; i++)
+				charCounts[i] = 1;
+			for (int i = 0; i < countTotal.length; i++)
+				countTotal[i] = ExpertLong.ALPHABET_SIZE;
+		}
+	}
+
+	// Becomes true once history has wrapped around for the first time.
+	boolean yes = false;
+
+	/**
+	 * Adds symbol {@code a} to the counts of the current context and shifts
+	 * the context. Once the history buffer has filled, each call also removes
+	 * the contribution of the oldest remembered symbol.
+	 *
+	 * @param a
+	 *            symbol code just observed
+	 */
+	public void update(int a) {
+		// double res = encodeLen(a);
+		countTotal[currentInd]++;// = DistributionExpert.ADD;
+		currentInd = currentInd * ExpertLong.ALPHABET_SIZE + a;
+		charCounts[currentInd]++;
+		// Keep only the last `order` symbols as the next context.
+		currentInd = currentInd % MASK;
+
+		if (yes) {
+			// Replay the oldest symbol from its own context and undo its counts.
+			countTotal[backInd]--;
+			backInd = backInd * ExpertLong.ALPHABET_SIZE + history[ind];
+			charCounts[backInd]--;
+			backInd = backInd % MASK;
+			history[ind] = a;
+			ind = (ind + 1) % HIS_SIZE;
+		} else {
+			history[ind] = a;
+			ind = (ind + 1) % HIS_SIZE;
+			if (ind == 0) {
+				yes = true;
+			}
+		}
+	}
+
+	/**
+	 * @param a
+	 *            symbol code
+	 * @return count of {@code a} in the current context divided by the
+	 *         context's total count
+	 */
+	public double probability(int a) {
+		return ((double) charCounts[currentInd * ExpertLong.ALPHABET_SIZE + a])
+				/ countTotal[currentInd];
+	}
+}
diff --git a/src/dev/java/japsadev/xm/genome/CombinationExpertLong.java b/src/dev/java/japsadev/xm/genome/CombinationExpertLong.java
new file mode 100755
index 0000000..52194a2
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/CombinationExpertLong.java
@@ -0,0 +1,81 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+import japsa.util.Distribution;
+
+/**
+ * Expert whose prediction is a distribution supplied from outside (via
+ * {@link #setCombDistribution(Distribution)} or by mutating the object
+ * returned from {@link #getCombDistribution()}); it simply reports the
+ * weights stored there.
+ */
+public class CombinationExpertLong extends ExpertLong {
+	/** Distribution this expert answers from. */
+	Distribution combDist;
+	/** Counter advanced by {@link #incrementCount()}; starts at 1. */
+	protected long repCount;
+
+	/** Builds the expert without a sequence and with a fresh distribution. */
+	public CombinationExpertLong() {
+		super(null);
+		this.combDist = new Distribution(ExpertLong.ALPHABET_SIZE);
+		this.repCount = 1;
+	}
+
+	/** @return the weight currently stored for {@code character} */
+	@Override
+	public double probability(int character) {
+		return this.combDist.getWeight(character);
+	}
+
+	/**
+	 * Records the stored weight of {@code actual} as this position's score.
+	 *
+	 * @return that weight
+	 */
+	@Override
+	public double update(int actual) {
+		final double weight = this.combDist.getWeight(actual);
+		updateCost(weight);
+		return weight;
+	}
+
+	/** No-op for this expert. */
+	@Override
+	public void resurrect(GenomeSequence workSeq, long pos, int past) {
+	}
+
+	/** No-op for this expert. */
+	@Override
+	public void resign() {
+	}
+
+	/** @return the short tag {@code "RE"} */
+	@Override
+	public String toString() {
+		return "RE";
+	}
+
+	/**
+	 * @param index
+	 *            zero-based position
+	 * @return {@code repCount} divided by {@code index + 1}
+	 */
+	public double priorProb(int index) {
+		final double positionsSeen = index + 1.0;
+		return repCount / positionsSeen;
+	}
+
+	/** @return the live distribution object (not a copy) */
+	public Distribution getCombDistribution() {
+		return this.combDist;
+	}
+
+	/** Replaces the distribution this expert answers from. */
+	public void setCombDistribution(Distribution a) {
+		this.combDist = a;
+	}
+
+	/** Adds one to {@code repCount}. */
+	public void incrementCount() {
+		this.repCount++;
+	}
+}
diff --git a/src/dev/java/japsadev/xm/genome/ExpertLong.java b/src/dev/java/japsadev/xm/genome/ExpertLong.java
new file mode 100755
index 0000000..0d0db60
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/ExpertLong.java
@@ -0,0 +1,218 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+import japsa.util.Distribution;
+import japsa.util.JapsaMath;
+
+/**
+ * Base class of the prediction experts used on long (genome-sized) sequences.
+ * <p>
+ * Every expert keeps the probabilities it assigned to the last
+ * {@code CONTEXT_LENGTH} symbols in a circular buffer, together with their
+ * running product {@code msgLenProb}; that product is the expert's rating.
+ * Experts can be chained into a singly linked list through {@code next}.
+ */
+public abstract class ExpertLong {
+
+	// global variables, params for the algorithm
+	protected static int CONTEXT_LENGTH = 20;// Just a default value
+	public static int ALPHABET_SIZE = 4;// should be 4 for dna
+	public static double INV_ALPHABET_SIZE = .25;
+	public static int HASH_SIZE = 11;
+	// Default history (all INV_ALPHABET_SIZE) and the product of its entries.
+	private static double[] DEF_HISTORY;
+	private static double DEF_MSGLEN;
+
+	static {
+		// Make the defaults valid even if setParams() is never called;
+		// otherwise DEF_MSGLEN would be 0.0 and every rating stuck at zero.
+		refreshDefaults();
+	}
+
+	//
+	ExpertLong next = null; // to implement a linked list of experts
+	protected long id;// ID to identify it self
+	private int counter = 0;
+
+	GenomeSequence genSeq;
+	// private double prior = 1.0;
+
+	// Evaluation of experts
+	protected double msgLenProb = 1;
+	// private double prior = 1;
+
+	// History of prediction
+	private double[] history;
+	private int ind = 0;
+
+	/**
+	 * For summary of experts
+	 */
+	// #DEBUG_BEGIN
+	// public static double baseCost = 0;
+
+	// debug
+	// protected double encodeCost = 0;
+	// protected int encodeCount = 0;
+	// protected double encodeCostLuck = 0;
+	// #DEBUG_END
+
+	// public double infoGain = 0.0;
+	/**
+	 * Creates a dummy expert without a prediction history. Call
+	 * {@link #reset(GenomeSequence)} before {@link #updateCost(double)}.
+	 */
+	public ExpertLong() {
+	}
+
+	// public double getPrior() {
+	// return prior;
+	// }
+
+	// public void setPrior(double prior) {
+	// this.prior = prior;
+	// }
+
+	/**
+	 * Creates an expert whose history is pre-filled with the uniform
+	 * probability.
+	 *
+	 * @param gen
+	 *            sequence the expert works on (may be null)
+	 */
+	public ExpertLong(GenomeSequence gen) {
+		this.genSeq = gen;
+		initHistory();
+	}
+
+	/**
+	 * Returns the expert to the state a freshly constructed one would have
+	 * for {@code gen}: uniform history, default rating, counter at zero.
+	 *
+	 * @param gen
+	 *            sequence the expert works on from now on
+	 */
+	public void reset(GenomeSequence gen) {
+		this.genSeq = gen;
+		initHistory();
+		counter = 0;
+	}
+
+	/**
+	 * Fills the history with the uniform probability and sets the rating to
+	 * the matching product. The buffer is (re)allocated when it is missing or
+	 * its size no longer equals {@code CONTEXT_LENGTH}, so the rating always
+	 * equals the product of the stored entries.
+	 */
+	private void initHistory() {
+		if (history == null || history.length != CONTEXT_LENGTH) {
+			history = new double[CONTEXT_LENGTH];
+		}
+		for (int i = 0; i < history.length; i++) {
+			history[i] = INV_ALPHABET_SIZE;
+		}
+		msgLenProb = DEF_MSGLEN;
+		ind = 0;
+	}
+
+	/**
+	 * Replaces the oldest probability in the history with {@code prob} and
+	 * updates the running product accordingly.
+	 *
+	 * @param prob
+	 *            probability the expert assigned to the symbol just seen
+	 */
+	public void updateCost(double prob) {
+		msgLenProb = msgLenProb * prob / history[ind];
+		history[ind] = prob;
+
+		ind = (ind + 1) % history.length;
+	}
+
+	// Rating based on msg Length
+	// More is less, less is more
+	/** @return {@code -log2} of the probability product (lower is better) */
+	public double rate() {
+		// return msgLen;
+		return -JapsaMath.log2(msgLenProb);
+
+	}
+
+	// More is actually more
+	/** @return the probability product itself (higher is better) */
+	public double rateProb() {
+		return msgLenProb;
+	}
+
+	public long getID() {
+		return id;
+	}
+
+	public void setID(long id) {
+		this.id = id;
+	}
+
+	// Operate on linked list
+	public ExpertLong getNext() {
+		return next;
+	}
+
+	public void setNext(ExpertLong next) {
+		this.next = next;
+	}
+
+	/**
+	 * The probability of the current character
+	 *
+	 * @param character
+	 *            symbol code
+	 * @return probability this expert assigns to {@code character}
+	 */
+	public abstract double probability(int character);
+
+	/**
+	 * When update a position, the expert need to get the cost of current
+	 * prediction, this cost needed for rating expert return neg if out of bound
+	 */
+
+	public abstract double update(int actual);
+
+	// Should return the current rate after resurrect
+	// Resurrect when working on workSeq, at position posSrc
+	public abstract void resurrect(GenomeSequence workSeq, long pos, int past);
+
+	public abstract void resign();
+
+	/**
+	 * Counter
+	 *
+	 * @return current value of the counter
+	 */
+	public int getCounter() {
+		return counter;
+	}
+
+	public void setCounter(int counter) {
+		this.counter = counter;
+	}
+
+	public void resetCounter() {
+		counter = 0;
+	}
+
+	public void incrementCounter() {
+		counter++;
+	}
+
+	/**
+	 * Recomputes the default history and its product from the current
+	 * {@code CONTEXT_LENGTH} and {@code INV_ALPHABET_SIZE}.
+	 */
+	private static void refreshDefaults() {
+		DEF_HISTORY = new double[CONTEXT_LENGTH];
+		DEF_MSGLEN = 1.0;
+
+		for (int i = 0; i < CONTEXT_LENGTH; i++) {
+			DEF_HISTORY[i] = INV_ALPHABET_SIZE;
+			DEF_MSGLEN *= DEF_HISTORY[i];
+		}
+	}
+
+	// These methods are for setting global parameters
+	/** Sets the alphabet size and keeps the derived defaults in sync. */
+	public static void setAphabetSize(int size) {
+		ALPHABET_SIZE = size;
+		INV_ALPHABET_SIZE = 1.0 / ALPHABET_SIZE;
+		refreshDefaults();
+	}
+
+	/** Sets the history length and keeps the derived defaults in sync. */
+	public static void setContext(int context) {
+		CONTEXT_LENGTH = context;
+		refreshDefaults();
+	}
+
+	/**
+	 * Sets both global parameters.
+	 *
+	 * @param alSize
+	 *            alphabet size
+	 * @param context
+	 *            number of past predictions that make up an expert's rating
+	 */
+	public static void setParams(int alSize, int context) {
+		setAphabetSize(alSize);
+		setContext(context);
+	}
+
+	/**
+	 * @return a new distribution holding {@link #probability(int)} for every
+	 *         symbol code below {@code ALPHABET_SIZE}
+	 */
+	public Distribution getDistribution() {
+		Distribution dist = new Distribution(ExpertLong.ALPHABET_SIZE);
+		for (byte i = 0; i < ExpertLong.ALPHABET_SIZE; i++)
+			dist.setWeight(i, this.probability(i));
+
+		return dist;
+	}
+
+}
diff --git a/src/dev/java/japsadev/xm/genome/ExpertModelLong.java b/src/dev/java/japsadev/xm/genome/ExpertModelLong.java
new file mode 100755
index 0000000..b006af8
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/ExpertModelLong.java
@@ -0,0 +1,1035 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+import japsa.seq.SequenceOutputStream;
+import japsa.util.CommandLine;
+import japsa.util.JapsaMath;
+import japsadev.xm.genome.MyBigHashtableLong.MyBigHashLongIterator;
+
+import java.io.*;
+
+import com.colloquial.arithcode.*;
+
+/**
+ * Expert-model driver for long sequences: it blends two Markov experts with
+ * a pool of repeat experts, which are recruited through a hash table of
+ * previously seen positions, into one distribution per symbol.
+ */
+public class ExpertModelLong {
+	public boolean optimise = false;
+
+	public static String VERSION = "V 2.0";
+
+	// #DEBUG_BEGIN
+	static public int DEBUG_LEVEL = 2;
+	// #DEBUG_END
+
+	// 2^32; added to negative int positions to read them as unsigned values.
+	static final long TWO_MAX_INT = 1l << 32;
+
+	// #CHECKPOINT_BEGIN
+	protected int checkPoint = 1000000;
+	// #CHECKPOINT_END
+
+	// 'global' Variables
+	protected int expertCount;// The number of experts currently employed
+	protected long[] accLengths;// lengths of sequences (including context)
+
+	// protected CombinationExpert baseEx;//Any fixed expert are in this linked
+	// list
+	MarkovExpertLong markovEx;
+	AdaptiveMarkovExpertLong adapMarkovEx;//
+	// = new MarkovExpert(null,2);
+	// finalD: blended prediction; markovD: contribution of the Markov experts.
+	double[] finalD;// ,baseD;
+	double[] markovD;
+	public String hashType = "hash";
+	long currentInd;
+
+	public static double[] markovCost = null;
+
+	// Head of the list of active repeat experts; also holds their blend.
+	protected CombinationExpertLong repEx;
+	// Head of the list of retired experts available for reuse.
+	protected RepeatExpertLong restHead = null;
+
+	// Penalty applied to repeat experts: listenThreshold * context, and 2^-penalty.
+	protected double repeatPrior;
+	protected double repeatPriorProb;
+
+	// protected Distribution finalDist;
+	// protected int posSize = 0;
+	protected long lengthSeqs = 0;// Total lengths of all sequences
+	protected int total = 1024 * 1024;
+	MyBitSetLong bitSet, pBitSet;// To store if an particular offset has been
+	// stored
+
+	// Two expert seeds
+	// public RepeatExpertLong offSetSeed; //= new
+	// OffsetCountExpert(null,1,null);// a dummy one
+	// public RepeatExpertLong palinSeed;// = new
+	// PalindromeCountExpert(null,1,null);// a dummy one
+	/*
+	 * Parameters of the algorithm
+	 */
+	protected int expertsLimit;// Limit of experts
+	protected int chances;// Number of chances given to each expert
+	// PatternStore myHash; //The hashtable
+
+	// HASH MyHashtableLong myHash; //The hashtable
+
+	MyBigHashtableLong myHash; // The hashtable
+
+	int hashSize;
+
+	boolean selfRep = false;
+	boolean binaryHash = false;
+
+	/**
+	 * Stores the algorithm parameters and sets the global expert parameters
+	 * through {@code ExpertLong.setParams}.
+	 *
+	 * @param hashSize
+	 *            key length used by the hash table
+	 * @param alphabetSize
+	 *            number of distinct symbols
+	 * @param context
+	 *            number of past predictions used to rate an expert
+	 * @param expertsLimit
+	 *            maximum number of simultaneously active repeat experts
+	 * @param listenThreshold
+	 *            per-symbol threshold; multiplied by {@code context} to give
+	 *            the repeat penalty
+	 * @param chances
+	 *            number of consecutive poor ratings tolerated before an
+	 *            expert is retired
+	 */
+	public ExpertModelLong(int hashSize, int alphabetSize, int context,
+			int expertsLimit, double listenThreshold, int chances) {
+		super();
+
+		// this.hashSize = hashSize;
+		ExpertLong.setParams(alphabetSize, context);
+		this.expertsLimit = expertsLimit;
+		this.chances = chances;
+		this.repeatPrior = listenThreshold * context;
+		repeatPriorProb = JapsaMath.exp2(-repeatPrior);
+
+		// thExp = JapsaMath.exp2(listenThreshold);
+		// this.listenThreshold = Math.pow(2,listenThreshold);
+
+		// finalDist = new Distribution(Expert.ALPHABET_SIZE);
+
+		finalD = new double[ExpertLong.ALPHABET_SIZE];
+		markovD = new double[ExpertLong.ALPHABET_SIZE];// Combination of the
+		// markov experts
+		this.hashSize = hashSize;
+		// this.binaryHash = binaryHash;
+	}
+
+	public void setBinaryHash(boolean useBinary) {
+		binaryHash = useBinary;
+	}
+
+	public void setSelfRep(boolean rep) {
+		selfRep = rep;
+	}
+
+	/**
+	 * Selects the index type: {@code "hash"}, {@code "sfa"}, {@code "sft"},
+	 * or a gapped pattern written as a string of '0' and '1'.
+	 * <p>
+	 * A gapped pattern is trimmed to the span from its first '1' to its last
+	 * '1', and {@code hashSize} is set to the length of that span. This path
+	 * also prints a diagnostic line to standard output.
+	 *
+	 * @param hashT
+	 *            requested type
+	 * @throws Exception
+	 *             if {@code hashT} contains a character other than '0' or
+	 *             '1', or contains no '1' at all
+	 */
+	public void setHashType(String hashT) throws Exception {
+		// Normal hash
+		if ("hash".equals(hashT))
+			this.hashType = "hash";
+		// Suffix array
+		else if ("sfa".equals(hashT))
+			this.hashType = "" + "sfa";
+		// suffix tree
+		else if ("sft".equals(hashT))
+			this.hashType = "" + "sft";
+		// gapped
+		else {
+			for (int i = 0; i < hashT.length(); i++) {
+				char c = hashT.charAt(i);
+				if (c != '0' && c != '1') {
+					throw new Exception("Unknown hash type : " + hashT);
+				}
+			} // assert: hashT ok
+
+			// Find the last '1'.
+			int i = hashT.length() - 1;
+			while (i >= 0 && hashT.charAt(i) == '0')
+				i--;
+			if (i < 0)
+				throw new Exception("Unknown hash type : " + hashT);
+			this.hashType = hashT.substring(hashT.indexOf('1'), i + 1);
+			System.out.println(hashT.indexOf('1') + " vs " + i);
+
+			this.hashSize = hashType.length();
+
+		}
+	}
+
+	/**
+	 * Prints the current parameters to standard output.
+	 * <p>
+	 * NOTE(review): dereferences {@code restHead}, which is null until some
+	 * code outside this view (or an expert retirement) assigns it - confirm
+	 * it is always set before this is called.
+	 */
+	public void printParams() {
+		String hashName = "Hashtable";
+
+		if ("hash".equals(hashType)) {
+
+		} else if ("sfa".equals(hashType))
+			hashName = "PrefixArray";
+		else if ("sft".equals(hashType)) {
+			hashName = "PrefixTree";
+		} else {
+			hashName = "Gapped(" + hashType + ")";
+		}
+
+		System.out.println("Parameters:" + "\nHash size : " + hashSize
+				+ "\nExpert Limit : " + expertsLimit
+				+ "\nContext : " + ExpertLong.CONTEXT_LENGTH
+				+ "\nListen Threshold : " + repeatPrior
+				/ ExpertLong.CONTEXT_LENGTH + "bps" + "\nChances : "
+				+ chances + "\nBinaryHash : " + binaryHash
+				+ "\nHashType : " + hashName + "\nExpert Type : "
+				+ restHead.getClass());
+	}
+
+	// protected abstract void initilise(BioCompSequence[] seqArray);//should
+	// store
+	/*************************************************************************
+	 * protected void initiliseGappedHash(BioCompSequence[] seqArray){ myHash =
+	 * new GappedHashtable(hashType); store(seqArray); } /
+	 *************************************************************************/
+	/**
+	 * Creates the hash table and fills it through {@link #store}.
+	 *
+	 * @param seqArray
+	 *            all sequences; see {@link #store} for which ones are indexed
+	 */
+	protected void initiliseHash(GenomeSequence[] seqArray) {
+
+		// if (binaryHash){
+		// myHash = new MyBinaryHashtable(hashSize);
+		// }else
+
+		// HASH myHash = new MyHashtableLong(hashSize);
+		myHash = new MyBigHashtableLong(seqArray.length, hashSize);
+		store(seqArray);
+	}
+
+	/*************************************************************************
+	 * protected void initilisePrefixArray(BioCompSequence[] seqArray){
+	 *
+	 * //#TIME_BEGIN long buildingTime=System.currentTimeMillis();
+	 * System.out.println(" #Building Suffix Array"); //#TIME_END
+	 *
+	 * PrefixArrayAbstract sArray;
+	 *
+	 * if (binaryHash) sArray = new PrefixArrayBinary(seqArray[0].toBytes());
+	 * else sArray = new PrefixArray(seqArray[0].toBytes());
+	 *
+	 * myHash = sArray; sArray.setHashSize(hashSize);
+	 *
+	 * sArray.setSeqToMatch(seqArray[1].toBytes()); //#TIME_BEGIN
+	 * System.out.printf
+	 * (" #Suffix Array (%s) built in %d ms\n",sArray.getClass()
+	 * ,(System.currentTimeMillis() - buildingTime)); //#TIME_END }
+	 * /************
+	 * *************************************************************
+	 *
+	 *
+	 * private void initilisePrefixTree(BioCompSequence[] seqArray){ //myHash =
+	 * new SuffixTree(seqArray[0].toBytes()); SuffixTree tree = new
+	 * SuffixTree(seqArray[0].toBytes()); tree.setMinHash(hashSize); myHash =
+	 * tree; tree.setIncrementLeaf(1);
+	 *
+	 * store(seqArray); System.out.println("Tree build done");
+	 * tree.setIncrementLeaf(0); tree.setRefSeq(seqArray[seqArray.length -
+	 * 1].toBytes()); }
+	 *
+	 * /
+	 *************************************************************************/
+
+	/**
+	 * Feeds every sequence except the last one into the hash table, recording
+	 * each position under the key formed by the symbols read so far.
+	 * Positions are narrowed to {@code int} before being stored.
+	 *
+	 * @param seqArray
+	 *            all sequences; the last entry is not indexed here
+	 */
+	public void store(GenomeSequence[] seqArray) {
+		// Store all back ground sequence in the hash
+		for (int sid = 0; sid < seqArray.length - 1; sid++) {
+			for (long i = 0; i < seqArray[sid].getLength(); i++) {
+				myHash.nextKey(seqArray[sid].getBase(i));
+				// HASH myHash.putCurrentValue((int)((sid << posSize) + i));
+				myHash.putCurrentValue(sid, (int) (i));
+			}
+		}
+	}
+
+	/**
+	 * Two-pass initialisation: a first pass over all sequences counts the
+	 * hash cells needed, the table is then re-initialised from those counts
+	 * and filled with {@link #store}. The cumulative lengths, the base
+	 * experts and the bit sets are also set up.
+	 *
+	 * @param seqArray
+	 *            all sequences
+	 */
+	protected void initilise_optimise(GenomeSequence[] seqArray) {
+		// Common stuff
+		lengthSeqs = 0;
+
+		// Compute accumulated length
+		accLengths = new long[seqArray.length + 1];
+		accLengths[0] = 0;
+		for (int i = 0; i < seqArray.length; i++) {
+			lengthSeqs += seqArray[i].getLength();
+			accLengths[i + 1] = lengthSeqs;
+		}
+
+		expertCount = 0;
+
+		// Background knowledge/based knowledge
+		markovEx = new MarkovExpertLong(null, 2);
+		adapMarkovEx = new AdaptiveMarkovExpertLong(null, 1);
+		markovEx.setNext(adapMarkovEx);
+
+		repEx = new CombinationExpertLong();
+
+		// HASH MyHashtableLong hash = new MyHashtableLong(hashSize);
+		MyBigHashtableLong hash = new MyBigHashtableLong(seqArray.length,
+				hashSize);
+		/*************************************************************************/
+
+		System.out.println("Run first pass");
+		// In this pass, count the needed cell on the hash table
+
+		for (int sid = 0; sid < seqArray.length; sid++) {
+			for (long i = 0; i < seqArray[sid].getLength(); i++) {
+				hash.nextKey(seqArray[sid].getBase(i));
+				// HASH hash.putCurrentValue_psuedo((int) ((sid << posSize) +
+				// i));
+				hash.putCurrentValue_psuedo(sid, (int) (i));
+			}
+		}
+		hash.printMemoryNeeded();
+
+		hash.reinitialise_optimise();
+		System.out.println("Finish first pass");
+
+		myHash = hash;
+		store(seqArray);
+
+		checkPoint(0);
+
+		bitSet = new MyBitSetLong(lengthSeqs);
+		pBitSet = new MyBitSetLong(lengthSeqs * 2);
+
+	}
+
+	/*************************************************************************/
+	/**
+	 * Single-pass initialisation: computes the cumulative lengths, creates
+	 * the base experts, builds the hash table with {@link #initiliseHash} and
+	 * allocates the bit sets.
+	 *
+	 * @param seqArray
+	 *            all sequences
+	 */
+	protected void initiliseCommon(GenomeSequence[] seqArray) {
+		// Common stuff
+		lengthSeqs = 0;
+
+		// Compute accumulated length
+		accLengths = new long[seqArray.length + 1];
+		accLengths[0] = 0;
+		for (int i = 0; i < seqArray.length; i++) {
+			lengthSeqs += seqArray[i].getLength();
+			accLengths[i + 1] = lengthSeqs;
+		}
+
+		expertCount = 0;
+
+		// Background knowledge/based knowledge
+		markovEx = new MarkovExpertLong(null, 2);
+		adapMarkovEx = new AdaptiveMarkovExpertLong(null, 1);
+		markovEx.setNext(adapMarkovEx);
+
+		repEx = new CombinationExpertLong();
+
+		// MyHashtable hash = new MyHashtable(hashSize);
+		/*************************************************************************/
+		// Initialise hashtable
+		initiliseHash(seqArray);
+
+		// if ("sft".equals(this.hashType))
+		// initilisePrefixTree(seqArray);
+		// else if ("sfa".equals(this.hashType))
+		// initilisePrefixArray(seqArray);
+		// else if ("hash".equals(this.hashType))
+		// initiliseHash(seqArray);
+		// else{//gappped
+		// initiliseGappedHash(seqArray);
+		// }
+
+		bitSet = new MyBitSetLong(lengthSeqs);
+		pBitSet = new MyBitSetLong(lengthSeqs * 2);
+	}
+
+	/*****************************************************************/
+	/**
+	 * Builds the blended prediction {@code finalD} for the next symbol.
+	 * <p>
+	 * The two Markov experts are mixed with their ratings as weights. Each
+	 * active repeat expert whose rating exceeds the Markov rating divided by
+	 * {@code repeatPriorProb} is added to the combined repeat distribution;
+	 * every other repeat expert has its counter advanced and is moved to the
+	 * rest list once the counter exceeds {@code chances}. If any repeat
+	 * expert was heard, the combined distribution joins the mixture, and
+	 * {@code finalD} is finally normalised to sum to one.
+	 */
+	protected void preCoding() {
+		double baseRateProb = markovEx.rateProb();
+
+		for (byte a = 0; a < ExpertLong.ALPHABET_SIZE; a++) {
+			markovD[a] = finalD[a] = markovEx.rateProb()
+					* markovEx.probability(a) + adapMarkovEx.rateProb()
+					* adapMarkovEx.probability(a);
+		}
+
+		// Initialise expert prediction and total prediction
+		this.repEx.getCombDistribution().setWeights(0.0);
+
+		double repSum = 0.0;
+
+		// Listen to no more than expert limit experts
+		ExpertLong ptr = repEx.getNext();// go thro all rep experts
+		ExpertLong head = repEx;
+		while (ptr != null) {// Inv: head.next = ptr
+			// double rate = ptr.rate();//msg leng over an history
+			double score = ptr.rateProb(); // costToWeight(rate);
+			// Only listen if the offset/palindrome is sufficiently good
+			// if (rate < baseRate - repeatPrior){// * Expert.CONTEXT_LENGTH){
+			if (score > baseRateProb / repeatPriorProb) {// *
+				// Expert.CONTEXT_LENGTH){
+				for (byte a = 0; a < ExpertLong.ALPHABET_SIZE; a++) {
+					repEx.getCombDistribution().addWeight(a,
+							score * ptr.probability(a));
+				}
+				// ptr.weight = myCost;
+				repSum += score;
+				ptr.resetCounter();
+				// goodRep.add(ptr);
+			} else {
+				// ptr.weight = 0.0;
+				ptr.incrementCounter();
+
+				if (ptr.getCounter() > chances) {
+					// remove ptr
+					resignExpert(ptr);// ptr.resign();
+
+					expertCount--;
+
+					// Unlink from the active list ...
+					head.setNext(ptr.getNext());
+					ExpertLong retired = ptr;
+
+					ptr = ptr.getNext();
+					// ... and push onto the rest list for later reuse.
+					retired.next = restHead;
+					restHead = (RepeatExpertLong) retired;
+
+					continue;
+				}
+			}
+			head = ptr;
+			ptr = ptr.getNext();
+		}
+
+		if (repSum != 0.0) {// There is repeat
+			repEx.incrementCount();
+			repEx.getCombDistribution().scale(1.0 / repSum);
+
+			// double repExRate = repEx.rate() +
+			// repeatPrior;//JapsaMath.log2(repEx.getPrior()) ;
+			// double repExScore = costToWeight(repExRate);
+
+			double repExScore = repEx.rateProb() * repeatPriorProb;
+
+			// Combine 2 experts
+			for (byte a = 0; a < ExpertLong.ALPHABET_SIZE; a++) {
+				// finalDist.addWeight(a,
+				// repEx.getCombDistribution().getWeight(a) * repExScore);
+				finalD[a] += repEx.getCombDistribution().getWeight(a)
+						* repExScore;
+			}
+			// finalSum += repExScore;
+		} else {
+			repEx.getCombDistribution().setWeights(
+					1.0 / ExpertLong.ALPHABET_SIZE);
+		}
+
+		double fSum = 0;
+		for (byte a = 0; a < ExpertLong.ALPHABET_SIZE; a++) {
+			// finalDist.addWeight(a, repEx.getCombDistribution().getWeight(a) *
+			// repExScore);
+			fSum += finalD[a];
+		}
+
+		for (byte a = 0; a < ExpertLong.ALPHABET_SIZE; a++) {
+			// finalDist.addWeight(a, repEx.getCombDistribution().getWeight(a) *
+			// repExScore);
+			finalD[a] /= fSum;
+		}
+	}
+
+	/** Delegates to {@code e.resurrect(bs, pos, past)}. */
+	protected void resurrectExpert(RepeatExpertLong e, GenomeSequence bs,
+			long pos, int past) {
+		e.resurrect(bs, pos, past);
+	}
+
+	/**
+	 * Updates all experts with the symbol just seen. A repeat expert whose
+	 * update returns a value that is not positive is resigned, unlinked from
+	 * the active list and pushed onto the rest list.
+	 *
+	 * @param c
+	 *            symbol code observed at the current position
+	 */
+	protected void updateExperts(int c) {
+		// Update base experts
+		// Expert
+
+		markovEx.update(c);
+		adapMarkovEx.update(c);
+
+		// Update rep experts
+		// baseEx.update(c);
+		repEx.update(c);
+		/********************* Update Exp ********************/
+		ExpertLong ptr = repEx.getNext();
+		ExpertLong head = repEx;
+
+		ExpertLong retired;
+
+		while (ptr != null) {
+			double thisCost = ptr.update(c);// This actually the prob
+			if (thisCost > 0) {// A cost should be a positive number
+				head = ptr;
+				ptr = ptr.getNext();
+			} else {// resign
+				resignExpert(ptr);// ptr.resign();
+				expertCount--;
+				head.setNext(ptr.getNext());
+				retired = ptr;
+				ptr = ptr.getNext();
+
+				// put retired into the rest list
+				retired.next = this.restHead;
+				this.restHead = (RepeatExpertLong) retired;
+			}
+		}
+	}
+
+	/** Delegates to {@code p.resign()}. */
+	void resignExpert(ExpertLong p) {
+		p.resign();
+	}
+
+	// static int id_zero = 0;
+	protected void
postCoding(GenomeSequence[] seqArray, int sid) { + /**********************************************************************/ + // Move to next key + // if (currentInd < seqArray[sid].length() - 1) + myHash.nextKey(seqArray[sid].getBase(currentInd)); + + // Add good diagonals. + if (currentInd >= hashSize + && currentInd < seqArray[sid].getLength() - 1) { + // assert n >= 0 + // IntIterator iter = myHash.copyIterator(); + // HASH MyHashLongIterator iter = myHash.getLongIterator(); + MyBigHashLongIterator iter = myHash.iterator(); + + // #DEBUG_BEGIN + if (DEBUG_LEVEL > 3) { + System.out.printf("Find : %10d%10d\n", iter.sizeAvailable(), + expertCount); + } + // #DEBUG_END + + while (expertsLimit > expertCount && iter.hasNext()) { + int position = iter.next(); + RepeatExpertLong e = null; + int id = iter.sid; + + if (!iter.isPalin) {// Offset expert + long pos = position; + if (pos < 0) { + pos = TWO_MAX_INT + pos; + } + + if (pos <= hashSize) { + continue; + } + + if (pos > hashSize && // Have enough for resurrect + !bitSet.get(currentInd + accLengths[sid] + - accLengths[id] - pos)// Not in there + && pos < seqArray[id].getLength() - 3) {// have some + // thing to + // predict + // get the next free + e = this.restHead; + this.restHead = (RepeatExpertLong) this.restHead.next; + e.reuseExpert(seqArray[id], pos, bitSet, + RepeatExpertLong.COPY_TYPE); + // e = offSetSeed.duplicate(seqArray[id], posSrc, bitSet); + e.setID(currentInd + accLengths[sid] - accLengths[id] + - pos); + } + } else {// Palindrome expert + long pos = position; + if (pos < 0) { + pos = TWO_MAX_INT + pos; + } + + if (pos > hashSize + && (!pBitSet.get(currentInd + accLengths[sid] + + accLengths[id] + pos))// ? 
+ && (pos + 3 < seqArray[id].getLength())) {// Have + // something + // to + // predict + e = this.restHead; + this.restHead = (RepeatExpertLong) this.restHead.next; + e.reuseExpert(seqArray[id], pos - hashSize + 1, + pBitSet, RepeatExpertLong.PALIN_TYPE); + // e = palinSeed.duplicate(seqArray[id], posSrc - hashSize + // + 1,pBitSet); + e.setID(currentInd + accLengths[sid] + accLengths[id] + + pos); + } + } + // Add this expert in only if an identical expert not in the + // list + if (e != null) { + resurrectExpert(e, seqArray[sid], currentInd, hashSize); + // e.resurrect(seqArray[sid].toBytes(),currentInd,hashSize); + e.setNext(repEx.getNext()); + repEx.setNext(e); + expertCount++; + } + } + } + // System.out.println(i + " " + expertCount); + /********************** Store current *********************/ + if (selfRep) + // HASH myHash.putCurrentValue( (int) (currentInd )); + myHash.putCurrentValue(sid, (int) (currentInd)); + // myHash.putCurrentValue( (int) ((sid << posSize) + currentInd)); + /******************************************************************/ + } + + protected void checkPoint(long steps) { + System.out.print("Reach milestone " + steps + " : "); + myHash.printSummary(); + + System.out.println(" Memory availabe " + + Runtime.getRuntime().freeMemory() + " " + + Runtime.getRuntime().totalMemory()); + // System.gc(); + System.out.println(" Memory availabe " + + Runtime.getRuntime().freeMemory() + " " + + Runtime.getRuntime().totalMemory()); + } + + // public static double costToWeight(double cost){ + // return JapsaMath.exp2(-cost); + // } + + // Decode a sequence + /*****************************************************************************/ + public void decode(GenomeSequence[] seqArray, File encodedFile) + throws IOException { + FileInputStream fileIn = new FileInputStream(encodedFile); + long length = (new DataInputStream(fileIn)).readLong(); + + // Get the sequence to be encode + int sid = seqArray.length - 1; + GenomeSequence seq = new 
GenomeSequence(length); + seqArray[seqArray.length - 1] = seq; + + initiliseCommon(seqArray); + + ArithDecoder decoder = new ArithDecoder(new BitInput(fileIn)); + + currentInd = 0; + double accu = 0; + while (!decoder.endOfStream()) { + int mid = decoder.getCurrentSymbolCount(total); + if (mid >= total) + break; + preCoding(); + + int actual = 0; + // double + accu = 0; + + // Will later implement using binary search, for now just linear + // search + try { + while (mid >= (int) ((accu + finalD[actual]) * total)) { + accu += finalD[actual]; + actual++; + } + + decoder.removeSymbolFromStream((int) (accu * total), + (int) ((accu + finalD[actual]) * total), total); + seq.putBase(currentInd, actual); + } catch (Exception e) { + e.printStackTrace(); + System.out.println(actual + " " + mid + " " + total + " at " + + currentInd + " acc = " + accu); + double nAcc = 0; + for (int y = 0; y < finalD.length; y++) { + nAcc += finalD[y]; + System.out.println(finalD[y] + " " + nAcc + " " + + (nAcc * total) + " " + ((int) (nAcc * total))); + } + System.exit(-1); + } + // if (seqDec[i] != japsa.seq[i]){ + // System.out.println("Wrong at the possition " + i + " expected " + + // japsa.seq[i] + " but see " + actual); + // System.out.println(mid + " " + accu * total + " " + (accu + + // final_distribution[actual]) * total); + // return false; + // } + + updateExperts(actual); + + postCoding(seqArray, sid); + + currentInd++; + if (currentInd >= seq.getLength()) { + break; + // return seqDec; + } + /*******************************************************************/ + // #CHECKPOINT_BEGIN + if ((currentInd + 1) % checkPoint == 0) {// + checkPoint(currentInd + 1); + System.out.println(" Current decoding at (" + + new java.util.Date() + ")"); + } + // #CHECKPOINT_END + /*******************************************************************/ + }// while + decoder.close(); + } + + /*******************************************************************/ + public double encode1(GenomeSequence[] 
seqArray) { + long s1 = System.currentTimeMillis(); + if (optimise) + this.initilise_optimise(seqArray); + else + initiliseCommon(seqArray); + + // initiliseCommon(seqArray); + + long s2 = System.currentTimeMillis(); + System.out.println((s2 - s1)); + + // Get the sequence to be encode + int sid = seqArray.length - 1; + GenomeSequence seq = seqArray[sid];// seqArray.length -1]; + + double totalCost = 0.0; + + // go thru the sequence and encode each charactor + for (currentInd = 0; currentInd < seq.getLength(); currentInd++) { + // Compute the probability distribution + preCoding(); + + int actual = seq.getBase(currentInd); + double cost = -JapsaMath.log2(finalD[actual]); + + totalCost += cost; + + updateExperts(seqArray[sid].getBase(currentInd)); + postCoding(seqArray, sid); + /*******************************************************************/ + // #CHECKPOINT_BEGIN + if ((currentInd + 1) % checkPoint == 0) {// + checkPoint(currentInd + 1); + System.out.println(" Current comp = " + + (totalCost / currentInd) + " (" + + new java.util.Date() + ")"); + } + // #CHECKPOINT_END + /*******************************************************************/ + } + long s3 = System.currentTimeMillis(); + System.out.println((s3 - s2)); + + return totalCost / seq.getLength(); + } + + public File realEncode(GenomeSequence[] seqArray, String filename) { + + try { + if (optimise) + this.initilise_optimise(seqArray); + else + initiliseCommon(seqArray); + + // Get the sequence to be encode + int sid = seqArray.length - 1; + GenomeSequence seq = seqArray[sid]; + double totalCost = 0.0; + + File file = new File(filename); + FileOutputStream fileOut = new FileOutputStream(file); + + // Write the length of the sequence, this is to make encoding easier + long len = seq.getLength(); + + (new DataOutputStream(fileOut)).writeLong(len); + + ArithEncoder encoder = new ArithEncoder(fileOut); + // initilise(seqHash); + for (currentInd = 0; currentInd < seq.getLength(); currentInd++) { + 
preCoding(); + int actual = seq.getBase(currentInd); + + /**********************************************************************/ + double accu = 0; + for (int j = 0; j < actual; ++j) { + accu += finalD[j]; + } + int low = (int) (accu * total); + int high = (int) ((accu + finalD[actual]) * total); + // System.out.println("Encode " + low + " " + high + " " + + // total); + encoder.encode(low, high, total); + /**********************************************************************/ + + updateExperts(actual); + postCoding(seqArray, sid); + + totalCost -= JapsaMath.log2(finalD[actual]); + /*******************************************************************/ + // #CHECKPOINT_BEGIN + if ((currentInd + 1) % checkPoint == 0) {// + checkPoint(currentInd + 1); + System.out.println(" Current comp = " + + (totalCost / currentInd) + " (" + + new java.util.Date() + ")"); + } + // #CHECKPOINT_END + /*******************************************************************/ + } + + System.out.println(" Theoritical bps " + + (totalCost / seq.getLength())); + encoder.close(); + + return file; + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + public static String version() { + return "The eXpert-Model (XM) for Compression of DNA Sequences " + + VERSION + + "\n Minh Duc Cao, T. I. Dix, L. Allison, C. Mears." 
+ + "\n A simple statistical algorithm for biological sequence compression" + + "\n IEEE Data Compression Conf., Snowbird, Utah, 2007, [doi:10.1109/DCC.2007.7]\n"; + } + + public static void main(String[] args) { + try { + // Get params from users + CommandLine cmdLine = new CommandLine(); + cmdLine.addInt("hashSize", 11, "Hash size"); + cmdLine.addInt("context", 15, "Length of the context"); + cmdLine.addInt("limit", 200, "Expert Limit"); + cmdLine.addDouble("threshold", 0.15, "Listen threshold"); + cmdLine.addInt("chance", 20, "Chances"); + cmdLine.addString( + "offsetType", + "counts", + "Way of update offset/palindrome expert: possible value count, subs, viaprotein,static"); + cmdLine.addBoolean( + "optimise", + false, + "Running in optimise mode, just report the entropy,recommended for long sequence"); + + cmdLine.addString("real", null, "File name of the real compression"); + cmdLine.addString("decode", null, "File name of the encoded"); + cmdLine.addString("output", "decoded", + "The output file of decoded file"); + + // #CHECKPOINT_BEGIN + cmdLine.addInt("checkPoint", 1000000, "Frequency of check point"); + // #CHECKPOINT_END + + args = cmdLine.parseLine(args); + + int alphabetSize = 4; + int hashSize = cmdLine.getIntVal("hashSize"); + int context = cmdLine.getIntVal("context"); + int expertsLimit = cmdLine.getIntVal("limit"); + double listenThreshold = cmdLine.getDoubleVal("threshold"); + int chances = cmdLine.getIntVal("chance"); + + ExpertModelLong eModel = new ExpertModelLong(hashSize, + alphabetSize, context, expertsLimit, listenThreshold, + chances); + + eModel.optimise = cmdLine.getBooleanVal("optimise"); + + eModel.setHashType("hash"); + eModel.setSelfRep(true); + + // eModel.offSetSeed = new RepeatCountExpertLong(new + // GenomeSequence(1),0,null,RepeatExpertLong.COPY_TYPE); + // eModel.palinSeed = new RepeatCountExpertLong(new + // GenomeSequence(1),0,null,RepeatExpertLong.PALIN_TYPE); + + eModel.restHead = new RepeatCountExpertLong(new 
GenomeSequence(1), + 0, null, RepeatExpertLong.COPY_TYPE); + ExpertLong tail = eModel.restHead; + + for (int x = 0; x < expertsLimit + 1; x++) { + RepeatCountExpertLong t = new RepeatCountExpertLong( + new GenomeSequence(1), 0, null, + RepeatExpertLong.COPY_TYPE); + tail.setNext(t); + tail = t; + } + + System.out.println(ExpertModelLong.version()); + // Print out all params + eModel.printParams(); + + if (cmdLine.getStringVal("decode") == null + && (args == null || args.length <= 0)) { + System.err.println("Usage: java CommandLine [options]\n" + + cmdLine.usageMessage() + " file1 file2 ..."); + System.exit(1); + } + // cmdLine.printOptions(); + + // #TIME_BEGIN + long timeStart = System.currentTimeMillis(); + System.out.println(" #Reading file(s)"); + // #TIME_END + + // BioCompDNA[] dnaArray;// + GenomeSequence[] genSeqs; + if (cmdLine.getStringVal("decode") != null) { + if (args == null) + args = new String[0]; + + genSeqs = new GenomeSequence[args.length + 1]; + } else {// Read per normal + genSeqs = new GenomeSequence[args.length]; + } + for (int i = 0; i < args.length; i++) { + System.out.print("Read " + args[i] + "..."); + genSeqs[i] = GenomeSequence.guessFormat(args[i]); + System.out.println(" done (" + genSeqs[i].getLength() + ")"); + } + + // #TIME_BEGIN + long timeEnd = System.currentTimeMillis(); + System.out.println(" #Read file(s) in " + (timeEnd - timeStart) + + "ms "); + // #TIME_END + + // if (cmdLine.getStringVal("hashType").equals("sft")){ + // Augment all the background sequences + // GenomeSequence[] augdnaArray = new GenomeSequence[2]; + // augdnaArray[1] = genSeqs[genSeqs.length - 1]; + // TODO: fix the following + // augdnaArray[0] = new BioCompDNA(new byte[0],"Combine"); + + // for (int x = 0; x < dnaArray.length - 1; x++){ + // augdnaArray[0].concatenate(dnaArray[x]); + // augdnaArray[0].concatenate(dnaArray[x].reverseComplement()); + // } + // dnaArray = augdnaArray; + // }else if (cmdLine.getStringVal("hashType").equals("sfa") && + // 
genSeqs.length > 1){ + // Augment all the background sequences for suffix array + // BioCompDNA[] augdnaArray = new BioCompDNA[2]; + // augdnaArray[1] = dnaArray[dnaArray.length - 1]; + + // int combileLength = 0; + // for (int x = 0; x < dnaArray.length - 1; x++){ + // combileLength += dnaArray[x].length(); + // } + + // Combine all sequences, including the rev comp into 1/ + // in the reverse order. the correct order is retrieved later + // byte [] combineByte = new byte[combileLength * 2];///Backward and + // forward + // int ind = 0; + + // for (int x = 0; x < dnaArray.length - 1; x++){ + // byte[] seqX = dnaArray[x].toBytes(); + // for (int y = 0; y < seqX.length; y++){ + // // + // combineByte[ind ++] = seqX[y];//(byte)(3 - seqX[y]);//compliment + // seqX[y]; + // combineByte[combineByte.length - ind] = (byte)(3 - + // seqX[y]);//seqX[y];//(byte)(3 - seqX[y]);//compliment + // } + // } + + // augdnaArray[0] = new BioCompDNA(combineByte,"Combine"); + // dnaArray = augdnaArray; + // } + + // for (int i = 0; i < dnaArray.length; i++){ + // if (i == dnaArray.length - 1){ + // if (cmdLine.getStringVal("decode") == null){ + // System.out.println("Encode : " + dnaArray[i] + + // "\t"+dnaArray[i].length()+""); + // } + // }else + // System.out.println("Context : " + dnaArray[i] + + // "\t"+dnaArray[i].length()+""); + // } + System.out + .println("----------------------------------------------------------------------"); + + long start; + /************************************************************************* + * if (cmdLine.getBooleanVal("optimise")){ //System.gc(); start = + * System.currentTimeMillis(); + * + * double cost = eModel.encode_optimise(genSeqs); long time = + * (System.currentTimeMillis() - start); // + * System.out.printf("%f bps in %d ms\n",cost,time); + * System.out.println( + * "======================================================================" + * ); + * + * }else / + *************************************************************************/ 
+ if (cmdLine.getStringVal("decode") != null) { + if (cmdLine.getBooleanVal("optimise")) { + System.err + .println("Warn: optimise option cannot be used with decode, disabled"); + } + + String file = cmdLine.getStringVal("decode"); + String output = cmdLine.getStringVal("output"); + + System.out.println("Decoding"); + start = System.currentTimeMillis(); + + eModel.decode(genSeqs, new File(file)); + + System.out.println(" Time decode " + + (System.currentTimeMillis() - start) / 1000.0 + + "seconds\n"); + SequenceOutputStream outPrintStream = SequenceOutputStream + .makeOutputStream(output); + genSeqs[genSeqs.length - 1].write(outPrintStream); + outPrintStream.close(); + + } + /*************************************************************************/ + else if (cmdLine.getStringVal("real") != null) { + // System.gc(); + System.out.println("Real encoding"); + start = System.currentTimeMillis(); + File outputFile = eModel.realEncode(genSeqs, + cmdLine.getStringVal("real")); + + System.out.println(" Time encode " + + (System.currentTimeMillis() - start) / 1000.0 + + "seconds"); + System.out.println(" Encoding cost = " + + (outputFile.length() * 8.0) + / genSeqs[genSeqs.length - 1].getLength() + "bps"); + + } else {// Normal + // System.gc(); + start = System.currentTimeMillis(); + // seqHash[1] = japsa.seq; + double costs = eModel.encode1(genSeqs);// ,args[1]); + + long time = (System.currentTimeMillis() - start); + // System.out.printf(" Comp rate : #%f#\n",total / + // costs.length); + System.out.printf("%f bps in %d ms\n", costs, time); + System.out + .println("============================================================================="); + } + + } catch (Exception e) { + e.printStackTrace(System.err); + } + } + +} diff --git a/src/dev/java/japsadev/xm/genome/GenomeSequence.java b/src/dev/java/japsadev/xm/genome/GenomeSequence.java new file mode 100755 index 0000000..5e0d833 --- /dev/null +++ b/src/dev/java/japsadev/xm/genome/GenomeSequence.java @@ -0,0 +1,592 @@ 
+/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.genome; + +import japsa.seq.JapsaFileFormat; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; + +import java.io.*; +import java.util.*; + +public class GenomeSequence { + public static final int ALPHABET_SIZE = 4; + public static char[] NUCLEOTIDE = { 'a', 'c', 'g', 't', 'n' }; + + int[] seqs;// use int to avoid the time of casting + private long length; + String seqID = ""; + String info = null; + + static int LAST2BIT_MASK = 0x3; + static int LAST4BIT_MASK = 0xf; + static int[] MASKS = new int[16], INV_MASKS = new int[16]; + static { + MASKS[15] = 0x3; + INV_MASKS[15] = ~MASKS[15]; + for (int i = MASKS.length - 2; i >= 0; i--) { + MASKS[i] = MASKS[i + 1] << 2; + INV_MASKS[i] = ~MASKS[i]; + } + } + + public long getLength() { + return length; + } + + public static GenomeSequence concat(GenomeSequence[] gseqs) { + long l = 0l; + for (int i = 0; i < gseqs.length; i++) + l += gseqs[i].length; + + GenomeSequence gSeq = new GenomeSequence(l); + // TODO: this is a very inefficient way + l = 0; + for (int i = 0; i < gseqs.length; i++) { + for (long y = 0; y < gseqs[i].length; y++) { + gSeq.putBase(l, gseqs[i].getBase(y)); + l++; + } + } + + return gSeq; + } + + public GenomeSequence(long l) { + length = l; + if (length > (1l << 34)) { + System.out.println("Warning: length cant be larger than " + + (1l << 34)); + length = (1l << 34) - 1; + } + + int size = (int) ((length - 1) >> 4) + 1; + seqs = new int[size]; + } + + public GenomeSequence(char[] charSeq) { + // TODO: this is a very inefficient way, but will optimise later + // such as: put 16 chars in an int before put in the array + this(charSeq.length); + + for (int i = 0; i < charSeq.length; i++) { + putChar(i, charSeq[i]); + } + } + + public GenomeSequence(byte[] byteSeq) { + // TODO: this is a very inefficient way, but will optimise later + // such as: put 16 chars in an int before put in 
the array + this(byteSeq.length); + + for (int i = 0; i < byteSeq.length; i++) { + putBase(i, byteSeq[i]); + } + } + + public void increaseSize(long l) { + if (l < length) + return; + length = l; + if (length > (1l << 34)) { + System.out.println("Warning: length cant be larger than " + + (1l << 34)); + length = (1l << 34) - 1; + } + int size = (int) ((length - 1) >> 4) + 1; + int[] newSeqs = new int[size]; + // copy over + for (int i = 0; i < seqs.length; i++) + newSeqs[i] = seqs[i]; + + seqs = newSeqs; + // deallocate old seqHash + } + + public static byte charToByte(char c) { + if (c == 'a' || c == 'A') + return 0; + if (c == 'c' || c == 'C') + return 1; + if (c == 'g' || c == 'G') + return 2; + if (c == 't' || c == 'T' || c == 'u' || c == 'U') + return 3; + return (byte) (new Random()).nextInt(4); + } + + public int getBase(long ind) { + int pos = (int) (ind >> 4); + int place = (int) (ind & LAST4BIT_MASK); + + return (seqs[pos] >>> (30 - (place << 1))) & LAST2BIT_MASK; + } + + public char getChar(long ind) { + return NUCLEOTIDE[getBase(ind)]; + } + + public void putChar(long ind, char c) { + if (c == 'a' || c == 'A') + putBase(ind, 0); + else if (c == 'c' || c == 'C') + putBase(ind, 1); + else if (c == 'g' || c == 'G') + putBase(ind, 2); + else if (c == 't' || c == 'T' || c == 'u' || c == 'U') + putBase(ind, 3); + } + + public void putBase(long ind, int base) { + int pos = (int) (ind >> 4); // posSrc = ind / 16 + int place = (int) (ind & LAST4BIT_MASK); // place = ind % 16 + + // base gets 0 1 2 3 + + base <<= (30 - (place << 1)); + seqs[pos] = (seqs[pos] & INV_MASKS[place]) | base; + } + + public static GenomeSequence readRawBio(BufferedReader in) throws Exception { + String line = in.readLine(); + line = in.readLine(); + String tokens[] = line.split(" "); + int length = Integer.parseInt(tokens[0]); + + GenomeSequence genome = new GenomeSequence(length); + long ind = 0; + + System.out.println("Allocating " + length); + while ((line = in.readLine()) != null) { 
+ // Ignore those lines start with #, < and > + if (line.startsWith("#")) + continue; + if (line.startsWith(">")) + continue; + if (line.startsWith("<")) + continue; + + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == 'a' || c == 'A') { + genome.putBase(ind, 0); + ind++; + } else if (c == 'c' || c == 'C') { + genome.putBase(ind, 1); + ind++; + } else if (c == 'g' || c == 'G') { + genome.putBase(ind, 2); + ind++; + } else if (c == 't' || c == 'T' || c == 'u' || c == 'U') { + genome.putBase(ind, 3); + ind++; + } + if (ind == length) + break;// japsa.seq full + } + if (ind == length) + break;// japsa.seq full + } + + if (ind != length) + System.err.println("Mismatched size " + ind + " vs " + length); + // format: + // First line #RAWBIO + // secondline length(space) something else + // thirdline onward: base + in.close(); + + return genome; + + } + + /** + * this method opens a file and guess the format of that file + * + * @param: file name + * @return DNA object + */ + + public static GenomeSequence guessFormat(String filename) { + try { + BufferedReader in = SequenceReader.openFile(filename); + if (in == null) + return null; + + in.mark(10); + + char[] buf = new char[10]; + in.read(buf, 0, 10); + in.reset(); + String format = new String(buf); + + // if (format.startsWith(BioCompFileFormat.RAW_HEADER)) {// + // BioCompress + // // raw + // // format + // return readRawBio(in); + // } + + if (format.startsWith(">")) {// BioCompress raw format + return readFasta(in); + } + + if (format.startsWith(JapsaFileFormat.HEADER)) {// BioCompress raw + // format + return readBioComp(in); + } + + return readRaw(in); + + // read raw + + } catch (Exception e) { + e.printStackTrace(); + System.exit(-1); + } + return null; + } + + public static GenomeSequence readRaw(BufferedReader in) { + try { + StringBuffer seqBuf = new StringBuffer(); + char[] buf = new char[1024]; + int n; + while ((n = in.read(buf)) >= 0) { + for (int i = 0; i < n; i++) { + char 
c = buf[i]; + // if (Character.isLetter(c)) + if (c == 'a' || c == 'A' || c == 'c' || c == 'C' + || c == 'g' || c == 'G' || c == 't' || c == 'T') + seqBuf.append(c); + } + } + char[] charSeq = new char[seqBuf.length()]; + seqBuf.getChars(0, seqBuf.length(), charSeq, 0); + + in.close(); + return new GenomeSequence(charSeq); + } catch (IOException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + public static GenomeSequence readFasta(BufferedReader in) { + try { + StringBuffer seqBuf = new StringBuffer(); + + String line = ""; + while ((line = in.readLine()) != null) { + if (line.startsWith(">")) + continue; + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == 'a' || c == 'A' || c == 'c' || c == 'C' + || c == 'g' || c == 'G' || c == 't' || c == 'T') + seqBuf.append(c); + } + }// while + + char[] charSeq = new char[seqBuf.length()]; + seqBuf.getChars(0, seqBuf.length(), charSeq, 0); + + in.close(); + return new GenomeSequence(charSeq); + } catch (IOException e) { + e.printStackTrace(); + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + public static GenomeSequence readFasta(String fileName, boolean convert) + throws Exception { + File file = new File(fileName); + long fLen = file.length(); + if (fLen <= 0) { + throw new IOException("File " + fileName + " does not exist!!! 
"); + } + + Random r = new Random(); + + GenomeSequence gSeq = new GenomeSequence(fLen);// this is an upper bound + BufferedReader in = new BufferedReader(new FileReader(file)); + + long l = 0; + String line = ""; + long countN = 0, countX = 0, countAny = 0; + long countA = 0, countC = 0, countG = 0, countT = 0; + while ((line = in.readLine()) != null) { + if (line.startsWith(">")) { + System.out.println(" " + line); + continue; + } + + if (line.startsWith(">")) { + System.out.println(" " + line); + continue; + } + if (line.startsWith("#")) { + System.out.println(" " + line); + continue; + } + + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == 'a' || c == 'A') { + gSeq.putBase(l++, 0); + countA++; + } else if (c == 'c' || c == 'C') { + gSeq.putBase(l++, 1); + countC++; + } else if (c == 'g' || c == 'G') { + gSeq.putBase(l++, 2); + countG++; + } else if (c == 't' || c == 'T') { + gSeq.putBase(l++, 3); + countT++; + } else if (c == 'n' || c == 'N') { + countN++; + if (convert) { + gSeq.putBase(l++, r.nextInt(4)); + } + + } else if (c == 'x' || c == 'X') { + countX++; + if (convert) { + gSeq.putBase(l++, r.nextInt(4)); + } + } else if (Character.isLetter(c)) + countAny++; + } + + }// while + + in.close(); + // restrict the length + gSeq.length = l; + System.out.println("File " + fileName + ": " + l + " bases, " + countA + + " As " + countC + " Cs " + countG + " Gs " + countT + " Ts " + + countN + " Ns " + countX + " Xs " + countAny + " others (" + + fLen + ")"); + + return gSeq; + } + + public static GenomeSequence readFasta(String fileName) throws Exception { + File file = new File(fileName); + long fLen = file.length(); + if (fLen <= 0) { + throw new IOException("File " + fileName + " does not exist!!! 
"); + } + + GenomeSequence gSeq = new GenomeSequence(fLen);// this is an upper bound + BufferedReader in = new BufferedReader(new FileReader(file)); + + long l = 0; + String line = ""; + long countN = 0, countX = 0, countAny = 0; + long countA = 0, countC = 0, countG = 0, countT = 0; + while ((line = in.readLine()) != null) { + if (line.startsWith(">")) { + System.out.println(" " + line); + continue; + } + + if (line.startsWith(">")) { + System.out.println(" " + line); + continue; + } + if (line.startsWith("#")) { + System.out.println(" " + line); + continue; + } + + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == 'a' || c == 'A') { + gSeq.putBase(l++, 0); + countA++; + } else if (c == 'c' || c == 'C') { + gSeq.putBase(l++, 1); + countC++; + } else if (c == 'g' || c == 'G') { + gSeq.putBase(l++, 2); + countG++; + } else if (c == 't' || c == 'T') { + gSeq.putBase(l++, 3); + countT++; + } else if (c == 'n' || c == 'N') { + countN++; + } else if (c == 'x' || c == 'X') { + countX++; + } else if (Character.isLetter(c)) + countAny++; + } + + }// while + + in.close(); + // restrict the length + gSeq.length = l; + System.out.println("File " + fileName + ": " + l + " bases, " + countA + + " As " + countC + " Cs " + countG + " Gs " + countT + " Ts " + + countN + " Ns " + countX + " Xs " + countAny + " others (" + + fLen + ")"); + + return gSeq; + } + + public static GenomeSequence readBioComp(BufferedReader in) + throws Exception { + String line = in.readLine(); + String[] toks = line.trim().split(JapsaFileFormat.DELIMITER + ""); + long l = Long.parseLong(toks[3]); + GenomeSequence gSeq = new GenomeSequence(l); + gSeq.seqID = toks[2]; + + l = 0; + while ((line = in.readLine()) != null) { + if (line.startsWith("<")) + continue; + if (line.startsWith(">")) + continue; + if (line.startsWith("#")) + continue; + + for (int i = 0; i < line.length(); i++) { + char c = line.charAt(i); + if (c == 'a' || c == 'A') { + gSeq.putBase(l++, 0); + } else if 
(c == 'c' || c == 'C') {
+                    gSeq.putBase(l++, 1);
+                } else if (c == 'g' || c == 'G') {
+                    gSeq.putBase(l++, 2);
+                } else if (c == 't' || c == 'T') {
+                    gSeq.putBase(l++, 3);
+                }
+            }
+        }
+
+        if (l != gSeq.length)
+            System.err.println("Warning : Real length " + l + " vs "
+                    + gSeq.length);
+
+        return gSeq;
+    }
+
+    // public void writeRaw(String fileName) throws Exception {
+
+    // PrintStream ps = new PrintStream(new File(fileName));
+    // ps.println(BioCompFileFormat.RAW_HEADER);
+
+    // ps.print(this.length);
+
+    // for (long i = 0; i < length; i++) {
+    // if (i % 60 == 0)
+    // ps.println();
+    // ps.print(this.getChar(i));
+    // }
+    // ps.close();
+    // }
+
+    // private Vector annoDescription = null;//Description of the
+    // feature
+
+    /**
+     * Writes this sequence to <code>out</code>: a header line made of the
+     * format header, the sequence ID, the length and the literal "DNA"
+     * (separated by the format delimiter), followed by the bases. A new
+     * output line, prefixed with the 1-based position of its first base, is
+     * started every CHAR_PER_LINE bases, and a space is emitted every
+     * CHAR_PER_BLOCK bases within a line.
+     *
+     * @param out
+     *            stream to write to; it is not closed by this method
+     * @throws IOException
+     *             if the stream reports a write failure
+     */
+    public void write(SequenceOutputStream out) throws IOException {
+
+        out.print(JapsaFileFormat.HEADER);
+        out.print(JapsaFileFormat.DELIMITER);
+        out.print(seqID);
+        out.print(JapsaFileFormat.DELIMITER);
+        out.print(length);
+        out.print(JapsaFileFormat.DELIMITER);
+        out.print("DNA");
+        out.print('\n');
+
+        // if (annoDescription != null )
+        // for (int i = 0; i < annoDescription.size(); i++){
+        // out.println(">" + annoDescription.get(i));
+        // }
+
+        for (long x = 0; x < length; x++) {
+            if (x % JapsaFileFormat.CHAR_PER_LINE == 0) {
+                out.print('\n');
+                out.print(x + 1, 10);
+            } else if (x % JapsaFileFormat.CHAR_PER_BLOCK == 0) {
+                out.print(' ');
+            }
+
+            out.print(this.getChar(x));
+        }
+
+        System.out
+                .println("Write sequence " + seqID + " " + length + " bases");
+    }
+
+    /***********************************************************/
+    /**
+     * Prints every base of the sequence to standard output on one line.
+     */
+    public void printSeq() {
+        // A long index, as in write(): an int counter would overflow before
+        // reaching the end of a sequence longer than Integer.MAX_VALUE.
+        for (long i = 0; i < length; i++)
+            System.out.print(this.getChar(i));
+
+        System.out.println();
+    }
+
+    /**
+     * Prints the packed storage words to standard output, each as a
+     * zero-padded 32-digit binary number, all on one line.
+     */
+    public void printSeqBin() {
+        for (int i = 0; i < seqs.length; i++) {
+            String bin = Integer.toBinaryString(seqs[i]);
+            StringBuilder padded = new StringBuilder(32);
+            for (int width = bin.length(); width < 32; width++)
+                padded.append('0');
+            System.out.print(padded.append(bin));
+        }
+        System.out.println();
+    }
+
+    /**
+     * Concatenates FASTA files: every argument but the last is read as a
+     * FASTA input, and the concatenated sequence is written to the file
+     * named by the last argument. Does nothing with fewer than two arguments.
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length > 1) {
+            GenomeSequence[] gSeq = new GenomeSequence[args.length - 1];
+            for (int i = 0; i < gSeq.length; i++) {
+                gSeq[i] = GenomeSequence.readFasta(args[i], true);
+            }
+            SequenceOutputStream out = SequenceOutputStream
+                    .makeOutputStream(args[args.length - 1]);
+            try {
+                GenomeSequence.concat(gSeq).write(out);
+            } finally {
+                // Release the stream even when writing fails.
+                out.close();
+            }
+        }
+
+        // readGenbank(args);
+        // for (int i = 0; i< args.length;i++){
+    }
+}
diff --git a/src/dev/java/japsadev/xm/genome/MarkovExpertLong.java b/src/dev/java/japsadev/xm/genome/MarkovExpertLong.java
new file mode 100755
index 0000000..f104c14
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/MarkovExpertLong.java
@@ -0,0 +1,114 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+/**
+ * Expert whose predictions come from a fixed-order context model
+ * ({@link MarkovLong}) rather than from an earlier position in the sequence;
+ * the copy/resurrect hooks are therefore no-ops.
+ */
+public class MarkovExpertLong extends ExpertLong {
+    private MarkovLong markov;
+
+    public MarkovExpertLong(GenomeSequence seq, int order) {
+        super(seq);
+        markov = new MarkovLong(order);
+    }
+
+    /** Feeds every symbol of <code>aSeq</code>, in order, into the model. */
+    public void learn(byte[] aSeq) {
+        for (byte symbol : aSeq)
+            markov.update(symbol);
+    }
+
+    /** No-op for this expert. */
+    public void learn() {
+    }
+
+    /** No-op for this expert. */
+    public void resurrect(GenomeSequence work, long i, int past) {
+    }
+
+    /** No-op for this expert. */
+    public void resign() {
+    }
+
+    /** Probability the model assigns to <code>character</code> in the current context. */
+    public double probability(int character) {
+        return markov.probability(character);
+    }
+
+    /**
+     * Scores the model's prediction of <code>actual</code>, passes that
+     * probability to updateCost, then advances the model with the symbol.
+     *
+     * @return the probability the model gave to <code>actual</code> before
+     *         it was told the symbol
+     */
+    public double update(int actual) {
+        final double predicted = markov.probability(actual);
+        updateCost(predicted);
+        markov.update(actual);
+        return predicted;
+    }
+
+    /** Always 0. */
+    public int copyFrom(int i) {
+        return 0;
+    }
+
+    /** Always 0. */
+    public int copyFrom() {
+        return 0;
+    }
+
+    public String toString() {
+        return "ME";
+    }
+}
+
+/**
+ * Order-k context model over the ExpertLong alphabet, stored in flat arrays:
+ * <code>charCounts[context * ALPHABET_SIZE + symbol]</code> and
+ * <code>countTotal[context]</code>. Every symbol count starts at 1 and every
+ * context total at ALPHABET_SIZE.
+ */
+class MarkovLong {
+    // use of one-d array
+    long[] charCounts;
+    long[] countTotal;
+    int order;
+    int currentInd = 0;// index of current context
+    int MASK;// number of contexts: ALPHABET_SIZE ^ order
+
+    public MarkovLong(int order) {
+        this.order = order;
+        MASK = (int) Math.pow(ExpertLong.ALPHABET_SIZE, order);
+        if (order < 0)
+            return;// tables are left unallocated for a negative order
+
+        charCounts = new long[MASK * ExpertLong.ALPHABET_SIZE];
+        countTotal = new long[MASK];
+        java.util.Arrays.fill(charCounts, 1L);
+        java.util.Arrays.fill(countTotal, ExpertLong.ALPHABET_SIZE);
+    }
+
+    /** Counts symbol <code>a</code> in the current context, then slides the context. */
+    public void update(int a) {
+        countTotal[currentInd] += 1;
+        currentInd = currentInd * ExpertLong.ALPHABET_SIZE + a;
+        charCounts[currentInd] += 1;
+        // drop the oldest symbol so the index is a context index again
+        currentInd %= MASK;
+    }
+
+    /** Relative frequency of symbol <code>a</code> in the current context. */
+    public double probability(int a) {
+        final long seen = charCounts[currentInd * ExpertLong.ALPHABET_SIZE + a];
+        return ((double) seen) / countTotal[currentInd];
+    }
+}
diff --git a/src/dev/java/japsadev/xm/genome/MyBigHashtableLong.java b/src/dev/java/japsadev/xm/genome/MyBigHashtableLong.java
new file mode 100755
index 0000000..1af65bc
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/MyBigHashtableLong.java
@@ -0,0 +1,387 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+import japsa.util.IntIterator;
+
+import java.util.Random;
+
+/**
+ * Direct-addressed table from a k-mer key to the positions stored for that
+ * k-mer, kept separately per sequence: <code>values[key][seqID][n]</code>
+ * holds the n-th stored value and <code>valueCount[key][seqID]</code> the
+ * number stored (-1 while the cell has not been allocated).
+ * <p>
+ * Two rolling keys are maintained by {@link #nextKey(int)}: the forward key
+ * of the last <code>hashSize</code> symbols, and a second key built from the
+ * complemented symbols in reverse order.
+ */
+public class MyBigHashtableLong {// implements PatternStore{
+
+    public Random rnd = new Random(1);
+    int hashSize;
+    int numSeqs;
+
+    /**
+     * Maximum capacity as of in HashMap class
+     */
+    static final int MAXIMUM_CAPACITY = 1 << 30;
+    static final int INITIAL_CELL_CAPACITY = 16;
+
+    private int currentKey = 0;
+    private int psuedoPalinKey = 0;
+
+    // values[key][seqID][posSrc]
+    private int[][][] values;
+
+    // valueCount[key][seqID]
+    private int[][] valueCount;
+    private int _usableBitMask = 0;
+
+    protected int _bitPerSymbol = 2;// For DNA, for protein must be 5
+    // == largest base; complement(x) = complement - x.
+    // In case of dna, complement = 3
+    protected int complement;
+
+    // bookkeeping reported by printSummary()
+    private long alloc = 0, used = 0, reallo = 0, init = 0;
+    protected int preCompute;
+    private MyBigHashLongIterator copyIter = null;
+
+    // A dummy one, should not call this
+    protected MyBigHashtableLong() {
+
+    }
+
+    public MyBigHashtableLong(int nSeqs, int hashSize) {
+        this(nSeqs, hashSize, 2);
+    }
+
+    /**
+     * @param nSeqs
+     *            number of sequences indexed side by side
+     * @param hashSize
+     *            number of symbols per key
+     * @param bpp
+     *            bits used to encode one symbol
+     */
+    public MyBigHashtableLong(int nSeqs, int hashSize, int bpp) {
+        this.numSeqs = nSeqs;
+        copyIter = new MyBigHashLongIterator();
+        initilise(hashSize, bpp);
+    }
+
+    /** Re-initialises the table with the current key size and symbol width. */
+    public void initilise() {
+        initilise(hashSize, _bitPerSymbol);
+    }
+
+    /**
+     * Sets the key geometry and allocates empty tables (see {@link #clear()}).
+     */
+    public void initilise(int hSize, int bpp) {
+        this.hashSize = hSize;
+        _bitPerSymbol = bpp;
+
+        complement = (byte) ((1 << bpp) - 1);// (1 << 2) - 1 = 3
+
+        // Rebuild the mask from zero: without the reset a second call would
+        // keep shifting the old mask and let keys grow past the table size.
+        _usableBitMask = 0;
+        for (int i = 0; i < hashSize * _bitPerSymbol; i++) {
+            _usableBitMask = (_usableBitMask << 1) | 1;
+        }// assert _usableBitMask = 11..11: hashSize * _bitPerSymbol bit
+
+        preCompute = (_bitPerSymbol * (hashSize - 1));
+
+        clear();
+    }
+
+    /**
+     * Allocates fresh count and value tables, marks every cell as not yet
+     * initialised (-1), and prints the allocation time in milliseconds.
+     */
+    public void clear() {
+        long s1 = System.currentTimeMillis();
+        valueCount = new int[1 << (_bitPerSymbol * hashSize)][numSeqs];
+        values = new int[valueCount.length][numSeqs][];
+        long s2 = System.currentTimeMillis();
+
+        System.out.println((s2 - s1) + "=============");
+
+        for (int x = 0; x < valueCount.length; x++) {
+            for (int y = 0; y < numSeqs; y++)
+                valueCount[x][y] = -1;
+        }
+    }
+
+    /**
+     * Second pass of a two-pass build: after the occurrences have been
+     * counted with {@link #putCurrentValue_psuedo(int, int)}, allocates each
+     * cell at exactly the counted size, resets all counts to 0 and resets
+     * both keys.
+     */
+    public void reinitialise_optimise() {
+        for (int x = 0; x < valueCount.length; x++) {
+            for (int y = 0; y < numSeqs; y++) {
+                valueCount[x][y]++;// counts start at -1
+                if (valueCount[x][y] > 0) {
+                    values[x][y] = new int[valueCount[x][y]];
+                    alloc += values[x][y].length;
+                    init++;
+                }
+                valueCount[x][y] = 0;
+            }
+        }
+        this.currentKey = this.psuedoPalinKey = 0;
+    }
+
+    /** Allocates the cell of the current key for every sequence. */
+    public void initiliseCurrentValue() {
+        for (int y = 0; y < numSeqs; y++)
+            initiliseValue(currentKey, y);
+    }
+
+    /**
+     * Only call when the key has not been initilised
+     *
+     * @param key
+     * @param sid
+     */
+    public void initiliseValue(int key, int sid) {
+        values[key][sid] = new int[INITIAL_CELL_CAPACITY];
+        valueCount[key][sid] = 0;
+
+        alloc += INITIAL_CELL_CAPACITY;
+        init++;
+    }
+
+    /**
+     * Compute the new key as a new base is read in
+     *
+     * @param baseInd
+     */
+    public void nextKey(int baseInd) {
+        // forward key: shift in the new symbol at the low end
+        currentKey = ((currentKey << _bitPerSymbol) & _usableBitMask)
+                | baseInd;
+
+        // second key: shift in the complemented symbol at the high end,
+        // preCompute == _bitPerSymbol * (hashSize - 1)
+        psuedoPalinKey = (psuedoPalinKey >> _bitPerSymbol)
+                | ((complement - baseInd) << preCompute);
+    }
+
+    /**
+     * Counting pass: only records that one more value belongs to the
+     * current key; <code>val</code> itself is not stored.
+     *
+     * @param val
+     */
+    public void putCurrentValue_psuedo(int sid, int val) {
+        valueCount[currentKey][sid]++;
+    }
+
+    /** Prints the number of keys and the total cell size a build would need. */
+    public void printMemoryNeeded() {
+        long sum = 0;
+        for (int i = 0; i < valueCount.length; i++) {
+            for (int y = 0; y < numSeqs; y++)
+                if (valueCount[i][y] >= 0) {
+                    sum += (valueCount[i][y] + 1);
+                }
+        }
+        System.out.println(valueCount.length + " arrays of total " + sum);
+    }
+
+    /**
+     * Put a number into the hash table at the current key
+     *
+     * @param val
+     */
+    public void putCurrentValue(int sid, int val) {
+        if (valueCount[currentKey][sid] < 0) {
+            initiliseValue(currentKey, sid);
+        }
+
+        putValue(val, currentKey, sid);
+    }
+
+    /**
+     * Appends <code>val</code> to the cell of (<code>key</code>,
+     * <code>sid</code>), growing the cell by about 1.5x when it is full. The
+     * cell must already be allocated.
+     */
+    public void putValue(int val, int key, int sid) {
+        int[] cell = values[key][sid];
+        final int count = valueCount[key][sid];
+
+        if (count == cell.length) {
+            // Always grow by at least one slot: (int) (1 * 1.5) == 1 would
+            // leave a one-element cell full and overflow on the store below.
+            int[] grown = new int[Math.max(cell.length + 1,
+                    (int) (cell.length * 1.5))];
+            System.arraycopy(cell, 0, grown, 0, cell.length);
+
+            // account for this cell's old size, not the number of sequences
+            alloc += grown.length - cell.length;
+            reallo++;
+
+            values[key][sid] = grown;
+            cell = grown;
+        }
+        cell[count] = val;
+        valueCount[key][sid] = count + 1;
+
+        used++;
+    }
+
+    /**
+     * Return the current key
+     *
+     * @return
+     */
+    public int getCurrentKey() {
+        return currentKey;
+    }
+
+    public void setCurrentKey(int key) {
+        this.currentKey = key;
+    }
+
+    public int getPsuedoPalinKey() {
+        return psuedoPalinKey;
+    }
+
+    public void setPsuedoPalinKey(int psuedoPalinKey) {
+        this.psuedoPalinKey = psuedoPalinKey;
+    }
+
+    public void printSummary() {
+        System.out
+                .printf(" Init %d(%d), reallocation %d, Allocatate %d, used %d, (%.2f %% )\n",
+                        init, valueCount.length, reallo, alloc, used,
+                        (used * 100.0 / alloc));
+    }
+
+    /**
+     * Returns the single shared iterator, reset to the cells of the current
+     * keys. A new call invalidates the state of a previous iteration.
+     */
+    public MyBigHashLongIterator iterator() {
+        copyIter.reset();
+        return copyIter;
+    }
+
+    public static void main(String[] args) {
+    }
+
+    public int hashSize() {
+        return hashSize;
+    }
+
+    /**
+     * Draws, in random order and without repetition, the values stored under
+     * the current forward key (slots 0..numSeqs-1) and under the second key
+     * (slots numSeqs..2*numSeqs-1). Drawn values are swapped to the tail of
+     * their cell, so the cells themselves are reordered in place.
+     */
+    class MyBigHashLongIterator implements IntIterator {
+
+        // copies[slot][posSrc]
+        int[][] copies;
+        int[] totals;// values still available per slot
+
+        int total;// values still available over all slots
+
+        boolean isPalin = false;
+        int sid;
+
+        public MyBigHashLongIterator() {
+            copies = new int[2 * numSeqs][];
+            totals = new int[2 * numSeqs];
+        }
+
+        /**
+         * Points the iterator at the cells of the current two keys.
+         */
+        public void reset() {
+            total = 0;
+            for (int y = 0; y < numSeqs; y++) {
+                copies[y] = values[currentKey][y];
+                // a count of -1 marks an unallocated cell: nothing to draw
+                totals[y] = Math.max(0, valueCount[currentKey][y]);
+                total += totals[y];
+
+                copies[y + numSeqs] = values[psuedoPalinKey][y];
+                totals[y + numSeqs] = Math.max(0,
+                        valueCount[psuedoPalinKey][y]);
+                total += totals[y + numSeqs];
+            }
+        }
+
+        /** Whether the last drawn value came from the second key. */
+        public boolean isPalin() {
+            return isPalin;
+        }
+
+        // NOTE(review): always returns 0 although next() tracks the sequence
+        // of the last drawn value in sid -- confirm whether callers expect
+        // sid here before changing it.
+        public int sequenceID() {
+            return 0;
+        }
+
+        public boolean hasNext() {
+            return total > 0;
+        }
+
+        public int sizeAvailable() {
+            return total;
+        }
+
+        public int next() {
+            int random_index = rnd.nextInt(total);
+
+            // locate the slot that holds the random_index-th remaining value
+            sid = 0;
+            while (random_index >= totals[sid]) {
+                random_index -= totals[sid];
+                sid++;
+            }
+
+            int[] myArray = copies[sid];
+
+            totals[sid]--;
+            int myTotal = totals[sid];// last still-available position
+
+            if (random_index < myTotal) {
+                // move the drawn value to the end so it is not drawn again
+                int tmp = myArray[random_index];
+                myArray[random_index] = myArray[myTotal];
+                myArray[myTotal] = tmp;
+            }
+
+            if (sid < numSeqs) {
+                isPalin = false;
+            } else {
+                isPalin = true;
+                sid -= numSeqs;
+            }
+
+            total--;
+            return myArray[myTotal];
+        }
+    }
+
+}
diff --git a/src/dev/java/japsadev/xm/genome/MyBitSetLong.java b/src/dev/java/japsadev/xm/genome/MyBitSetLong.java
new file mode 100755
index 0000000..d4e54b0
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/MyBitSetLong.java
@@ -0,0 +1,110 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2.
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+/**
+ * Fixed-size bit set addressed by <code>long</code> positions and backed by
+ * an <code>int</code> array (32 bits per element).
+ *
+ * @author minhduc
+ *
+ */
+public class MyBitSetLong implements Serializable {
+    private static final long serialVersionUID = 1L;
+
+    public static final int SIZE_INT = 32;
+    public static final int LOG_INT_SIZE = 5;
+    public static final long FIVE_1s = 31;// = binary 11111 (0x1F)
+
+    // POS[i] has only bit i set
+    public static int[] POS = new int[SIZE_INT];
+    static {
+        POS[0] = 1;
+        for (int i = 1; i < SIZE_INT; i++) {
+            POS[i] = POS[i - 1] << 1;
+        }
+    }
+
+    int array[];
+
+    /**
+     * Creates a set able to hold positions <code>0..length-1</code>, all
+     * initially clear.
+     *
+     * @param length
+     *            number of bits required
+     * @throws IllegalArgumentException
+     *             if <code>length</code> is negative or needs more elements
+     *             than an array can hold
+     */
+    public MyBitSetLong(long length) {
+        long words = length / SIZE_INT + 1;
+        // Reject sizes the int cast below would silently truncate.
+        if (length < 0 || words > Integer.MAX_VALUE)
+            throw new IllegalArgumentException("Invalid bit set length "
+                    + length);
+        // a new int[] is already zero-filled
+        array = new int[(int) words];
+    }
+
+    /** Sets the bit at <code>pos</code>. */
+    public void set(long pos) {
+        // pos >> 5 == pos / 32, pos & 31 == pos % 32
+        int ind = (int) (pos >> LOG_INT_SIZE), place = (int) (pos & FIVE_1s);
+        array[ind] |= POS[place];
+    }
+
+    /** Clears the bit at <code>pos</code>. */
+    public void clear(long pos) {
+        int ind = (int) (pos >> LOG_INT_SIZE), place = (int) (pos & FIVE_1s);
+        array[ind] &= ~POS[place];
+    }
+
+    /** Returns whether the bit at <code>pos</code> is set. */
+    public boolean get(long pos) {
+        int ind = (int) (pos >> LOG_INT_SIZE), place = (int) (pos & FIVE_1s);
+        return ((array[ind] & POS[place]) != 0);
+    }
+
+    /**
+     * Returns whether <code>other</code> has the same backing size and the
+     * same bits set.
+     */
+    public boolean equal(MyBitSetLong other) {
+        return Arrays.equals(array, other.array);
+    }
+
+    /**
+     * Copies this set's bits into <code>other</code>, which must not be
+     * shorter than this set.
+     */
+    public void copy(MyBitSetLong other) {
+        System.arraycopy(array, 0, other.array, 0, array.length);
+    }
+
+    /**
+     * @param args
+     */
+    public static void main(String[] args) throws Exception {
+        MyBitSetLong aBitSet = new MyBitSetLong(4000000000l);
+        for (long i = 4000000000l - 100; i < 4000000000l; i++) {
+            aBitSet.set(i);
+        }
+
+    }
+
+}
diff --git a/src/dev/java/japsadev/xm/genome/MyHashtableLong.java b/src/dev/java/japsadev/xm/genome/MyHashtableLong.java
new file mode 100755
index 0000000..577a1dd
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/MyHashtableLong.java
@@ -0,0 +1,434 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.genome;
+
+/**
+ * On 10 Jul MD added gapped seed as key
+ * Note: Maximum 1 << 30 cells as a limit of java array or 15
+ * Maximum =
+ */
+
+import japsa.util.IntIterator;
+import japsa.xm.hash.PatternStore;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Direct-addressed table from a k-mer key to the values stored for it:
+ * <code>values[key][n]</code> is the n-th stored value and
+ * <code>valueCount[key]</code> the number stored (-1 while the cell has not
+ * been allocated). Two rolling keys are maintained by nextKey: the forward
+ * key of the last <code>hashSize</code> symbols and a second key built from
+ * the complemented symbols in reverse order.
+ */
+public class MyHashtableLong implements PatternStore {
+    public Random rnd = new Random(1);
+    int hashSize;
+
+    /**
+     * Maximum capacity as of in HashMap class
+     */
+    static final int MAXIMUM_CAPACITY = 1 << 30;
+    static final int INITIAL_CELL_CAPACITY = 16;
+
+    private int currentKey = 0;
+    private int psuedoPalinKey = 0;
+
+    private int[][] values;
+    private int[] valueCount;
+    private int _usableBitMask = 0;
+
+    protected int _bitPerSymbol = 2;// For DNA, for protein must be 5
+    protected byte complement; // == largest base; complement(x) = complement - x
+
+    // bookkeeping reported by printSummary(); long so that large tables do
+    // not overflow the counters
+    private long alloc = 0, used = 0, reallo = 0, init = 0;
+    protected int preCompute;
+
+    // A dummy one, should not call this
+    protected MyHashtableLong() {
+
+    }
+
+    public MyHashtableLong(int hashSize) {
+        this(hashSize, 2);
+    }
+
+    /**
+     * @param hashSize
+     *            number of symbols per key
+     * @param bpp
+     *            bits used to encode one symbol
+     */
+    public MyHashtableLong(int hashSize, int bpp) {
+        initilise(hashSize, bpp);
+    }
+
+    /** Re-initialises the table with the current key size and symbol width. */
+    public void initilise() {
+        initilise(hashSize, _bitPerSymbol);
+    }
+
+    /**
+     * Sets the key geometry and allocates empty tables (see {@link #clear()}).
+     */
+    public void initilise(int hSize, int bpp) {
+        this.hashSize = hSize;
+        _bitPerSymbol = bpp;
+
+        complement = (byte) ((1 << bpp) - 1);
+
+        // Rebuild the mask from zero: without the reset a second call would
+        // keep shifting the old mask and let keys grow past the table size.
+        _usableBitMask = 0;
+        for (int i = 0; i < hSize * _bitPerSymbol; i++) {
+            _usableBitMask = (_usableBitMask << 1) | 1;
+        }// assert _usableBitMask = 11..11: hashSize * _bitPerSymbol bit
+
+        preCompute = (_bitPerSymbol * (hSize - 1));
+
+        clear();
+    }
+
+    /** Allocates fresh tables and marks every cell as not yet initialised (-1). */
+    public void clear() {
+        valueCount = new int[1 << (_bitPerSymbol * hashSize)];
+        values = new int[valueCount.length][];
+
+        Arrays.fill(valueCount, -1);
+    }
+
+    /**
+     * Second pass of a two-pass build: after the occurrences have been
+     * counted with {@link #putCurrentValue_psuedo(int)}, allocates each cell
+     * at exactly the counted size, resets all counts to 0 and resets both
+     * keys.
+     */
+    public void reinitialise_optimise() {
+        for (int x = 0; x < valueCount.length; x++) {
+            valueCount[x]++;// counts start at -1
+            if (valueCount[x] > 0) {
+                values[x] = new int[valueCount[x]];
+                alloc += values[x].length;
+                init++;
+            }
+
+            valueCount[x] = 0;
+        }
+        this.currentKey = this.psuedoPalinKey = 0;
+    }
+
+    public void initiliseCurrentValue() {
+        initiliseValue(currentKey);
+    }
+
+    /** Allocates an empty cell of the initial capacity for <code>key</code>. */
+    public void initiliseValue(int key) {
+        values[key] = new int[INITIAL_CELL_CAPACITY];
+        valueCount[key] = 0;
+
+        alloc += INITIAL_CELL_CAPACITY;
+        init++;
+    }
+
+    /**
+     * Compute the new key as a new base is read in
+     *
+     * @param baseInd
+     */
+    public void nextKey(byte baseInd) {
+        // the byte is promoted to int exactly as the arithmetic would do
+        nextKey((int) baseInd);
+    }
+
+    /**
+     * Compute the new key as a new base is read in
+     *
+     * @param baseInd
+     */
+    public void nextKey(int baseInd) {
+        // forward key: shift in the new symbol at the low end
+        currentKey = ((currentKey << _bitPerSymbol) & _usableBitMask)
+                | baseInd;
+
+        // second key: shift in the complemented symbol at the high end,
+        // preCompute == _bitPerSymbol * (hashSize - 1)
+        psuedoPalinKey = (psuedoPalinKey >> _bitPerSymbol)
+                | ((complement - baseInd) << preCompute);
+    }
+
+    /**
+     * Counting pass: only records that one more value belongs to the
+     * current key; <code>val</code> itself is not stored.
+     *
+     * @param val
+     */
+    public void putCurrentValue_psuedo(int val) {
+        valueCount[currentKey]++;
+    }
+
+    /** Prints the number of keys and the total cell size a build would need. */
+    public void printMemoryNeeded() {
+        long sum = 0;
+        for (int i = 0; i < valueCount.length; i++) {
+            if (valueCount[i] >= 0) {
+                sum += (valueCount[i] + 1);
+            }
+        }
+
+        System.out.println(valueCount.length + " arrays of total " + sum);
+    }
+
+    /**
+     * Put a number into the hash table at the current key
+     *
+     * @param val
+     */
+    public void putCurrentValue(int val) {
+        if (valueCount[currentKey] < 0) {
+            initiliseCurrentValue();
+        }
+
+        putValue(val, currentKey);
+    }
+
+    /**
+     * Appends <code>val</code> to the cell of <code>key</code>, growing the
+     * cell by about 1.5x when it is full. The cell must already be allocated.
+     */
+    public void putValue(int val, int key) {
+        int[] cell = values[key];
+        final int count = valueCount[key];
+
+        if (count == cell.length) {
+            // Always grow by at least one slot: (int) (1 * 1.5) == 1 would
+            // leave a one-element cell full and overflow on the store below.
+            int[] grown = new int[Math.max(cell.length + 1,
+                    (int) (cell.length * 1.5))];
+            System.arraycopy(cell, 0, grown, 0, cell.length);
+
+            alloc += grown.length - cell.length;
+            reallo++;
+
+            values[key] = grown;
+            cell = grown;
+        }
+        cell[count] = val;
+        valueCount[key] = count + 1;
+
+        used++;
+    }
+
+    /**
+     * Return the size of the current hash value
+     *
+     * @return
+     */
+    public int getCurrentCount() {
+        return valueCount[currentKey];
+    }
+
+    public int getCount(int key) {
+        return valueCount[key];
+    }
+
+    /**
+     * Return the current key
+     *
+     * @return
+     */
+    public int getCurrentKey() {
+        return currentKey;
+    }
+
+    public void setCurrentKey(int key) {
+        this.currentKey = key;
+    }
+
+    public int getPsuedoPalinKey() {
+        return psuedoPalinKey;
+    }
+
+    public void setPsuedoPalinKey(int psuedoPalinKey) {
+        this.psuedoPalinKey = psuedoPalinKey;
+    }
+
+    public void printSummary() {
+        System.out
+                .printf(" Init %d(%d), reallocation %d, Allocatate %d, used %d, (%.2f %% )\n",
+                        init, valueCount.length, reallo, alloc, used,
+                        (used * 100.0 / alloc));
+    }
+
+    public IntIterator iterator() {
+        return new MyHashLongIterator();
+    }
+
+    public MyHashLongIterator getLongIterator() {
+        return new MyHashLongIterator();
+    }
+
+    public IntIterator copyIterator() {
+        return new CopyIterator();
+    }
+
+    public IntIterator palinIterator() {
+        return new PalinIterator();
+    }
+
+    public static void main(String[] args) {
+        MyHashtableLong myhash = new MyHashtableLong(5, 2);
+
+        for (int i = 0; i < 100; i++) {
+            myhash.nextKey((byte) (i % 4));
+            myhash.putCurrentValue(i);
+            System.out.println();
+        }
+    }
+
+    public int hashSize() {
+        return hashSize;
+    }
+
+    /**
+     * Moves <code>cells[index]</code> to the last live position
+     * (<code>live - 1</code>) so it will not be drawn again, and returns it.
+     */
+    private static int drawAt(int[] cells, int index, int live) {
+        final int last = live - 1;
+        if (index < last) {
+            int tmp = cells[index];
+            cells[index] = cells[last];
+            cells[last] = tmp;
+        }
+        return cells[last];
+    }
+
+    /**
+     * Draws the values stored under the forward key that was current when
+     * the iterator was created, in random order and without repetition. The
+     * underlying cell is reordered in place.
+     */
+    class CopyIterator implements IntIterator {
+        private int total;
+        private int[] myArray = null;
+
+        public CopyIterator() {
+            // a count of -1 marks an unallocated cell: nothing to draw
+            total = Math.max(0, valueCount[currentKey]);
+            myArray = values[currentKey];
+        }
+
+        public boolean hasNext() {
+            return total > 0;
+        }
+
+        public int sizeAvailable() {
+            return total;
+        }
+
+        public void remove() {
+        }
+
+        public int next() {
+            return next(rnd.nextInt(total));
+        }
+
+        /** Draws the value at <code>random_index</code> among those remaining. */
+        public int next(int random_index) {
+            int drawn = drawAt(myArray, random_index, total);
+            total--;
+            return drawn;
+        }
+    }
+
+    /**
+     * Same as {@link CopyIterator}, but over the values stored under the
+     * second (complemented, reversed) key.
+     */
+    class PalinIterator implements IntIterator {
+        private int total;
+        private int[] myArray = null;
+
+        public PalinIterator() {
+            total = Math.max(0, valueCount[psuedoPalinKey]);
+            myArray = values[psuedoPalinKey];
+        }
+
+        public boolean hasNext() {
+            return total > 0;
+        }
+
+        public int sizeAvailable() {
+            return total;
+        }
+
+        public int next() {
+            // TODO: can reuse the random number generated before
+            return next(rnd.nextInt(total));
+        }
+
+        /** Draws the value at <code>random_index</code> among those remaining. */
+        public int next(int random_index) {
+            int drawn = drawAt(myArray, random_index, total);
+            total--;
+            return drawn;
+        }
+
+    }
+
+    /**
+     * Draws from the forward-key and second-key cells together, choosing
+     * between them in proportion to how many values each still holds.
+     */
+    class MyHashLongIterator implements IntIterator {
+        PalinIterator pIter;
+        CopyIterator cIter;
+        boolean isPalin = false;
+
+        public MyHashLongIterator() {
+            pIter = new PalinIterator();
+            cIter = new CopyIterator();
+        }
+
+        /** Whether the last drawn value came from the second key. */
+        public boolean isPalin() {
+            return isPalin;
+        }
+
+        public boolean hasNext() {
+            return cIter.hasNext() || pIter.hasNext();
+        }
+
+        public int sizeAvailable() {
+            return cIter.total + pIter.total;
+        }
+
+        public int next() {
+            // The first draw picks the side; the chosen iterator then draws
+            // its own index.
+            int random_index = rnd.nextInt(sizeAvailable());
+            isPalin = random_index >= cIter.sizeAvailable();
+            return isPalin ? pIter.next() : cIter.next();
+        }
+    }
+
+}
diff --git a/src/dev/java/japsadev/xm/genome/RepeatCountExpertLong.java b/src/dev/java/japsadev/xm/genome/RepeatCountExpertLong.java
new file mode 100755
index 0000000..7f78648
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/RepeatCountExpertLong.java
@@ -0,0 +1,136 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/**
+ * Written by Chris Mears, modified and maintained by Minh Duc Cao
+ */
+package japsadev.xm.genome;
+
+/**
+ * Repeat expert whose confidence is the running fraction of its past
+ * predictions that were right: <code>(countRight + .01) / count</code> goes
+ * to the predicted symbol and the remainder is split evenly over the other
+ * symbols.
+ */
+public class RepeatCountExpertLong extends RepeatExpertLong {
+
+    int countRight;// predictions that matched
+    int count;// predictions made, starting at 1
+
+    // NOTE(review): this table is only filled by computeProbs(), which in
+    // this class runs at the end of update(); presumably probability()
+    // yields 0 (or values from a previous use) until then -- confirm against
+    // RepeatExpertLong.
+    double[] probs = new double[ExpertLong.ALPHABET_SIZE];
+
+    public RepeatCountExpertLong(GenomeSequence seq, long start,
+            MyBitSetLong b, int type) {
+        super(seq, start, b, type);
+
+        expertType = type;
+        countRight = 0;
+        count = 1;
+    }
+
+    /**
+     * A method resemble constructor, called to reuse experts
+     */
+    public void reuseExpert(GenomeSequence seq, long startPos, MyBitSetLong b,
+            int type) {
+        reset(seq, startPos, b, type);
+
+        countRight = 0;
+        count = 1;
+    }
+
+    /**
+     * Recomputes the probability table for the symbol currently pointed at.
+     * Prints a diagnostic and terminates the JVM if the confidence falls
+     * outside the open interval (0, 1).
+     */
+    protected void computeProbs() {
+        final double prob = (countRight + .01) / (count);
+        if (prob <= 0 || prob >= 1) {
+            System.err.println("Error " + prob + " " + count + " "
+                    + countRight);
+            (new Exception()).printStackTrace();
+            System.exit(1);
+        }
+
+        final double other = (1.0 - prob) / (ExpertLong.ALPHABET_SIZE - 1);
+        final boolean copying = (expertType == COPY_TYPE);
+
+        for (int sym = 0; sym < ExpertLong.ALPHABET_SIZE; sym++) {
+            // a copy expert predicts the base itself, any other type its
+            // complement (3 - base)
+            boolean predicted = copying ? (sym == this.currentBase)
+                    : (sym == (3 - this.currentBase));
+            probs[sym] = predicted ? prob : other;
+        }
+    }
+
+    /** Looks up the probability of <code>character</code> in the cached table. */
+    public double probability(int character) {
+        return probs[character];
+    }
+
+    /**
+     * Scores the prediction for <code>actual</code>, updates the hit
+     * statistics, advances the pointer by <code>expertType</code> and
+     * refreshes the probability table.
+     *
+     * @return the probability that had been assigned to <code>actual</code>,
+     *         or -1 when the expert has run past its length
+     */
+    public double update(int actual) {
+
+        if ((currentPointer - start) * expertType >= length) {
+            return -1;
+        }
+
+        // for a palindrome expert the symbol to compare is the complement
+        final int match = (expertType == PALIN_TYPE) ? 3 - actual : actual;
+
+        // Recent prediction
+        final double prob = probability(actual);
+
+        if (currentBase == match) {
+            countRight++;
+        }
+        count++;
+
+        currentPointer += expertType;
+        currentBase = genSeq.getBase(currentPointer);
+        computeProbs();
+
+        updateCost(prob);
+
+        return prob;
+    }
+
+    /** Creates a new expert of the same type at <code>startPos</code>. */
+    public RepeatExpertLong duplicate(GenomeSequence seq, long startPos,
+            MyBitSetLong b) {
+        return new RepeatCountExpertLong(seq, startPos, b, expertType);
+    }
+
+}
diff --git a/src/dev/java/japsadev/xm/genome/RepeatExpertLong.java b/src/dev/java/japsadev/xm/genome/RepeatExpertLong.java
new file mode 100755
index 0000000..53cf200
--- /dev/null
+++ b/src/dev/java/japsadev/xm/genome/RepeatExpertLong.java
@@ -0,0 +1,139 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.genome; + +/** + * @author Minh Duc Cao + * + */ +public abstract class RepeatExpertLong extends ExpertLong { + + public static int COPY_TYPE = 1; + public static int PALIN_TYPE = -1; + + protected long start;// Starting point of expert on the sequence + /** + * The current pointer of expert, japsa.seq[currentPointer] is expert + * prediction mobile elements + */ + protected long currentPointer;// the pointer to current position of expert + protected int currentBase;// the base that the pointer points to + + /** + * The length of the expert, ideally this is the lenght of the mobile + * elements + */ + protected long length;// maximum possible length + + MyBitSetLong bitSet; + protected int expertType = 1;// Copy or palindrome + + RepeatExpertLong(GenomeSequence seq, long start, MyBitSetLong b, int type) { + super(seq); + bitSet = b; + this.start = start; + currentPointer = start;// Currently point to the very first char + currentBase = this.genSeq.getBase(currentPointer); + + expertType = type; + + // Get the length of the expert + if (type == PALIN_TYPE) + length = start - 1;// at most as long as this position + else + length = seq.getLength() - start - 1; + } + + void reset(GenomeSequence seq, long startPos, MyBitSetLong b, int type) { + reset(seq); + bitSet = b; + this.start = startPos; + currentPointer = start;// Currently point to the very first char + + // currentBase = this.genSeq.getBase(currentPointer); + + expertType = type; + + // Get the length of the expert + if (type == PALIN_TYPE) + length = start - 1;// at most as long as this position + else + length = seq.getLength() - start - 1; + } + + public void resign() { + bitSet.clear(id); + } + + public void setID(long lid) { + id = lid; + } + + public long getID() { + return id; + } + + public abstract RepeatExpertLong duplicate(GenomeSequence seq1, + long start1, MyBitSetLong b); + + public abstract 
void reuseExpert(GenomeSequence seq1, long start1, + MyBitSetLong b, int eType); + + protected abstract void computeProbs(); + + public void resurrect(GenomeSequence workSeq, long pos, int past) { + // resurrect(byte[] workSeq, int posSrc, int past) { + bitSet.set(id); + // make sure posSrc >=context_length + currentPointer = start - past * expertType; + currentBase = this.genSeq.getBase(currentPointer); + computeProbs(); + + for (long i = pos - past; i <= pos; i++) { + update(workSeq.getBase(i)); + } + } + + public long getStart() { + return start; + } + + public long getCurrentPointer() { + return currentPointer; + } + + public long getLength() { + return length; + } + + public int getExpertType() { + return expertType; + } +} diff --git a/src/dev/java/japsadev/xm/hash/GappedSeedHashtable.java b/src/dev/java/japsadev/xm/hash/GappedSeedHashtable.java new file mode 100755 index 0000000..1206078 --- /dev/null +++ b/src/dev/java/japsadev/xm/hash/GappedSeedHashtable.java @@ -0,0 +1,210 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
package japsadev.xm.hash;

import japsa.util.JapsaMath;
import japsa.xm.expert.Expert;

import java.util.StringTokenizer;

/**
 * Chained hash table whose key is a gapped seed: the symbols under the '1'
 * positions of a 0/1 mask laid over the most recent symbols, packed into a
 * long.
 * <p>
 * On 10 Jul MD added gapped seed as key.
 */
public class GappedSeedHashtable {

    /** Initial size of the value array of a cell. */
    static final int INITIAL_CELL_CAPACITY = 16;

    /** Number of buckets. A big prime number. */
    static final int MAX_STORE = 7000003;

    /** Every symbol read so far, newest in the low bits (older ones fall off). */
    private long currentLongKey = 0;
    /** Same window filled from the high end, used for the palindrome key. */
    private long currentPalinLongKey = 0;

    /** Gapped key extracted from {@link #currentLongKey}. */
    long currentKey;
    /** Gapped key extracted from {@link #currentPalinLongKey}. */
    long palinKey;

    /** Bits used per symbol: 2 for DNA, 5 would be needed for protein. */
    private int _bitPerSymbol = 2;

    /** One bit mask per run of consecutive '1's in the seed mask. */
    long[] islands;
    /** Right shift that closes the gaps below the matching island. */
    int[] islandSize;

    private HashCell[] cells;
    /** Bit offset of the highest symbol slot of the window. */
    private int _bitLength;

    /**
     * Builds the table for a seed mask. Also prints the mask and the derived
     * islands to standard output.
     *
     * @param mask seed pattern of the form [01]+, e.g. "110101"; the last
     *             character corresponds to the most recently read symbol.
     *             mask.length() * bitsPerSymbol must not exceed 64.
     */
    public GappedSeedHashtable(String mask) {
        _bitPerSymbol = (int) Math.ceil(JapsaMath
                .log2(Expert.alphabet().size()));

        // Splitting on '0' yields one token per run of '1's, i.e. per island.
        StringTokenizer st = new StringTokenizer(mask, "0");
        int islandCount = st.countTokens();
        islands = new long[islandCount];
        islandSize = new int[islandCount];

        // All bits of one symbol slot. The original hard-coded two bits here,
        // which only agreed with the key shifts when _bitPerSymbol == 2.
        final long symbolBits = (1L << _bitPerSymbol) - 1;

        int island = -1;
        boolean inIsland = false;
        int zeros = 0; // number of '0' positions seen so far

        // Walk the mask from its last character (newest symbol, lowest bits).
        for (int i = 0; i < mask.length(); i++) {
            if (mask.charAt(mask.length() - i - 1) == '1') {
                if (!inIsland) {
                    island++;
                    // Every gap symbol below this island is squeezed out.
                    islandSize[island] = zeros * _bitPerSymbol;
                    inIsland = true;
                }
                islands[island] |= symbolBits << (i * _bitPerSymbol);
            } else {
                inIsland = false;
                zeros++;
            }
        }

        System.out.println(mask);
        for (int j = 0; j < islands.length; j++) {
            System.out.printf("%s %d \n", Long.toBinaryString(islands[j]),
                    islandSize[j]);
        }

        cells = new HashCell[MAX_STORE];
        _bitLength = (mask.length() - 1) * _bitPerSymbol;
    }

    /**
     * Updates {@link #currentKey} after a new symbol has been read.
     *
     * @param baseInd index of the new symbol; must fit in the bits reserved
     *                for one symbol
     */
    public void nextKey(int baseInd) {
        currentLongKey <<= _bitPerSymbol;
        currentLongKey |= baseInd;

        currentKey = compact(currentLongKey);
    }

    /**
     * Updates {@link #palinKey} after a new symbol has been read; the symbol
     * enters at the high end of the window and older ones move down.
     *
     * @param baseInd index of the new symbol; must fit in the bits reserved
     *                for one symbol
     */
    public void nextPalinKey(long baseInd) {
        baseInd <<= (_bitLength);
        // Unsigned shift: a window that reaches bit 63 must not drag the sign
        // bit into the slot the new symbol is about to occupy.
        currentPalinLongKey >>>= _bitPerSymbol;
        currentPalinLongKey |= baseInd;

        palinKey = compact(currentPalinLongKey);
    }

    /** Keeps only the symbols under the mask and packs them together. */
    private long compact(long window) {
        long key = 0;
        for (int j = 0; j < islands.length; j++) {
            // Unsigned shift for the same reason as in nextPalinKey.
            key |= ((window & islands[j]) >>> islandSize[j]);
        }
        return key;
    }

    /**
     * Returns the cell stored under {@code key}, creating it if necessary.
     */
    public HashCell getCell(long key) {
        int index = (int) (key % MAX_STORE);
        if (index < 0) {
            // Java's % keeps the sign of the dividend.
            index += MAX_STORE;
        }

        HashCell cell = cells[index];
        if (cell == null) {
            // The cell must remember the key, not the bucket index; otherwise
            // a later lookup of the same key would not recognise it.
            cells[index] = new HashCell(key);
            return cells[index];
        }
        while (cell.id != key && cell.next != null) {
            cell = cell.next;
        }
        // Either cell.id == key or the end of the chain was reached.
        if (cell.id == key) {
            return cell;
        }
        cell.next = new HashCell(key);
        return cell.next;
    }

    public HashCell getCurrentCell() {
        return getCell(currentKey);
    }

    public HashCell getPalinCell() {
        return getCell(palinKey);
    }

    /** Appends {@code val} to the cell of the current key. */
    public void putCurrentCell(int val) {
        HashCell cell = getCell(currentKey);
        cell.putValue(val);
    }

    /** One key of the table together with the values recorded for it. */
    public class HashCell {
        long id;
        HashCell next = null;
        /** Number of stored values, or -1 if nothing was ever stored. */
        int count = -1;
        int[] values = null;

        HashCell(long id) {
            values = new int[INITIAL_CELL_CAPACITY];
            this.id = id;
        }

        /**
         * Appends a value, doubling the backing array when it is full.
         *
         * @param val the value to store
         */
        public void putValue(int val) {
            if (count < 0) {
                // First value: the array was already allocated by the
                // constructor.
                count = 0;
            } else if (count == values.length) {
                int[] grown = new int[values.length << 1];
                System.arraycopy(values, 0, grown, 0, values.length);
                values = grown;
            }
            values[count] = val;
            count++;
        }

        /** Number of stored values; -1 for a cell that never received one. */
        public int getCount() {
            return count;
        }

        /** Backing array; only the first getCount() entries are meaningful. */
        public int[] getValues() {
            return values;
        }
    }
}
Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.hash; + +//import java.util.Random; + +/** + * @author Minh Duc Cao + * + */ +public class PrefixArray extends PrefixArrayAbstract { + + public PrefixArray(byte[] aSeq) { + this(aSeq, false); + + } + + public PrefixArray(byte[] aSeq, boolean isReverse) { + mySeq = aSeq; + initilise(isReverse); + } + + public void initilise(boolean isReverse) { + int lastIndex = mySeq.length - 1; + if (!isReverse) { + // Reverse the sequence + for (int i = 0; i < mySeq.length / 2; i++) { + byte tmpB = mySeq[i]; + mySeq[i] = mySeq[lastIndex - i]; + mySeq[lastIndex - i] = tmpB; + + } + } + pa = SuffixArrayConstruction.construct(mySeq);// Suffix Array + + // Convert the suffix array to inverse prefix array + // pa = new int[mySeq.length]; + invPA = new int[mySeq.length]; + + for (int i = 0; i < mySeq.length; i++) { + // Reverse s + pa[i] = lastIndex - pa[i + 1]; + invPA[pa[i]] = i; + } + + // reverse mySequence back + for (int i = 0; i < mySeq.length / 2; i++) { + byte tmpB = mySeq[i]; + mySeq[i] = mySeq[lastIndex - i]; + mySeq[lastIndex - i] = tmpB; + + } + /***/ + startRange = 0; + endRange = mySeq.length; + matchLength = 1;// No match at the moment + } + + // Impplement parent abstract + protected boolean isMatched(byte b1, byte b2) { + // boolean x = (b1 - b2) % 2 == 0; + return b1 == b2; + } + + /** + * Construct the SA object: compute the pa, lcp etc + * + * @param aSeq + */ + + public void print() { + System.out.printf("\n i lcp pa in \n"); + + for (int i = 0; i < mySeq.length; i++) { + System.out.printf("%4d%4d%4d%4d ", i, lcp[i], pa[i], invPA[i]); + for (int j = pa[i]; j >= 0; j--) { + System.out.printf("%d", mySeq[j]); + } + System.out.println(); + } + } + +} diff --git a/src/dev/java/japsadev/xm/hash/PrefixArrayAbstract.java b/src/dev/java/japsadev/xm/hash/PrefixArrayAbstract.java new file mode 100755 index 0000000..0edfbd2 --- /dev/null +++ 
b/src/dev/java/japsadev/xm/hash/PrefixArrayAbstract.java @@ -0,0 +1,305 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.hash; + +import japsa.util.IntIterator; +import japsa.xm.hash.PatternStore; + +/** + * @author Minh Duc Cao + * + */ +public abstract class PrefixArrayAbstract implements PatternStore { + private int hashSize = 10; + // Suffix Array + int[] pa; + // and its invert + int[] invPA;// = mySeq[sa[i] - 1];The charracter at the position previous of + // suffix i (i) + // A pointer to the sequence being indexed + byte[] mySeq; + // LCP + int[] lcp; + + int startRange, endRange; + int nextStartRange, nextEndRange; + int matchLength; + + int[] posMatches = new int[1024];// Store all matches for the next proposing + int posIndex = 0; + + // The sequence to match + byte seqToMatch[]; + int matchIndex;// The index of the current sequence + + public PrefixArrayAbstract() { + // Nothing + } + + public void setHashSize(int hs) { + this.hashSize = hs; + } + + public void setSeqToMatch(byte[] anSeq) { + seqToMatch = anSeq; + matchIndex = -1; + } + + void computeLcp() { + lcp = new int[mySeq.length]; + int i, h = 0; + h = 0; /* visit in string order */ + for (i = mySeq.length - 1; i >= 0; i--) { /* omit last, least suff */ + int x = invPA[i]; + if (x > 0) { + int j = pa[x - 1]; + + int p1 = i - h; + int p0 = j - h; + + while (p1 >= 0 && p0 >= 0 + && isMatched(mySeq[p1--], mySeq[p0--])) + h++; + + // if (lcp[x] != h) + // System.out.println("NO at i = " + i + " x = " + x + " h = " + + // h + " lcp = " + lcp[x]); + + lcp[x] = h; + if (h > 0) + h--; + } + } + lcp[0] = 0; /* least suffix has no predecessor */ + } + + public void printSummary() { + }; + + abstract protected boolean isMatched(byte b1, byte b2); + + // { + // return b1 == b2; + // } + + protected void shrink(int nextByte) { + if (nextStartRange > nextEndRange) { + this.nextStartRange = 0; + this.nextEndRange = mySeq.length - 1; + + while (!isMatched(mySeq[pa[this.nextStartRange]], + seqToMatch[matchIndex])) + 
this.nextStartRange++; + + while (!isMatched(mySeq[pa[this.nextEndRange]], + seqToMatch[matchIndex])) + this.nextEndRange--; + + matchLength = 0; + } + + matchLength++;// Add another char to the match + startRange = nextStartRange;// startNext[nextByte]; + endRange = nextEndRange;// endNext[nextByte]; + + nextStartRange = mySeq.length; + nextEndRange = 0; + } + + byte prevSymbol(int aPos) { + if (pa[aPos] == mySeq.length - 1) + return -1; + else + return mySeq[pa[aPos] + 1]; + } + + int prePosition(int aPos) { + if (pa[aPos] == mySeq.length - 1) + return 0; + else + return invPA[pa[aPos] + 1]; + } + + /** + * The next byte from the current encoding symbol + * + * @param nextByte + */ + public void nextKey(int nextByte) { + // Try to make the next shrink valid. you cant shrink while the next + // range is not valid + while (nextEndRange < nextStartRange) { + if (!expand()) + break; + } + + matchIndex++;// Another one + // shrinking + shrink(nextByte); + + // expand expand + browse(); + } + + // Shrink the match length to get more candidates + private boolean expand() { + matchLength--; + if (matchLength == 0) + return false;// The character has not seen before + + int tmp; + // expand upward + while (lcp[startRange] >= matchLength) { + startRange--; + + // Put the new value into the nextRange + if (posIndex < posMatches.length) + posMatches[posIndex++] = pa[startRange]; + + // To see if the nextRange valide + if (isMatched(seqToMatch[matchIndex + 1], prevSymbol(startRange)) + && (tmp = prePosition(startRange)) < nextStartRange) + nextStartRange = tmp; + } + // expand download + + while (endRange + 1 < mySeq.length && lcp[endRange + 1] >= matchLength) { + endRange++; + + // Put the new value into the nextRange + if (posIndex < posMatches.length) + posMatches[posIndex++] = pa[endRange]; + // To see if the nextRange valide + if (isMatched(seqToMatch[matchIndex + 1], prevSymbol(endRange)) + && (tmp = prePosition(endRange)) > nextEndRange) + nextEndRange = tmp; + } + + 
return true; + + } + + // Browse throw the list of all matches + + public void browse() { + posIndex = 0; + int tmp; + for (int i = startRange; i <= endRange; i++) { + // Put into matches table + if (posIndex < posMatches.length) + posMatches[posIndex++] = pa[i]; + + if (isMatched(seqToMatch[matchIndex + 1], prevSymbol(i))) { + if ((tmp = prePosition(i)) > nextEndRange) + nextEndRange = tmp; + if ((tmp = prePosition(i)) < nextStartRange) + nextStartRange = tmp; + } + // Update nextStart/nextEnd range + } + } + + /** + * Construct the SA object: compute the pa, lcp etc + * + * @param aSeq + */ + + abstract public void print(); + + // Methods required by PatternStore interface + public void clear() { + }; + + public void putCurrentValue(int val) { + }; + + /** + * Return the iterator of all previous candidate copy experts + * + * @return + */ + public IntIterator copyIterator() { + return null; + }; + + /** + * Return the iterator of all previous candidate palin experts + * + * @return + */ + public IntIterator palinIterator() { + return null; + }; + + /** + * Return the iterator of all previous candidate copy and palin experts They + * are in fact the combination of both copy and palin experts + * + * @return + */ + public IntIterator iterator() { + return new CopyIterator(); + } + + class CopyIterator implements IntIterator { + int myIndex; + + public CopyIterator() { + myIndex = 0; + } + + public boolean hasNext() { + + if (hashSize > matchLength) + return false; + + if (myIndex < posIndex) + return true; + + /****************************************************************/ + while (hashSize < matchLength) { + expand(); + if (myIndex < posIndex) + return true; + } + /****************************************************************/ + return false; + } + + public int sizeAvailable() { + return posIndex; + } + + public int next() { + return posMatches[myIndex++]; + } + } +} diff --git a/src/dev/java/japsadev/xm/hash/PrefixArrayBinary.java 
b/src/dev/java/japsadev/xm/hash/PrefixArrayBinary.java new file mode 100755 index 0000000..056da6e --- /dev/null +++ b/src/dev/java/japsadev/xm/hash/PrefixArrayBinary.java @@ -0,0 +1,104 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.hash; + +/** + * @author Minh Duc Cao + * + */ +public class PrefixArrayBinary extends PrefixArrayAbstract { + + public PrefixArrayBinary(byte[] aSeq) { + mySeq = aSeq; + + int lastIndex = mySeq.length - 1; + + byte[] binSeq = new byte[mySeq.length]; + // Reverse the sequence + for (int i = 0; i <= mySeq.length / 2; i++) { + binSeq[i] = (byte) (mySeq[lastIndex - i] % 2); + binSeq[lastIndex - i] = (byte) (mySeq[i] % 2); + } + + pa = SuffixArrayConstruction.construct(binSeq);// Suffix Array + invPA = new int[mySeq.length]; + + for (int i = 0; i < mySeq.length; i++) { + pa[i] = lastIndex - pa[i + 1]; + invPA[pa[i]] = i; + } + + // reverse mySequence back + for (int i = 0; i < mySeq.length / 2; i++) { + byte tmpB = binSeq[i]; + binSeq[i] = binSeq[lastIndex - i]; + binSeq[lastIndex - i] = tmpB; + } + + this.computeLcp(); + + /***/ + startRange = 0; + endRange = mySeq.length; + matchLength = 1;// No match at the moment + } + + // Impplement parent abstract + protected boolean isMatched(byte b1, byte b2) { + // boolean x = (b2 - b1) %2 ==0; + return (b1 % 2 == b2 % 2); + // return (b2 - b1) %2 == 0; + } + + /** + * Construct the SA object: compute the pa, lcp etc + * + * @param aSeq + */ + + /** + * Construct the SA object: compute the pa, lcp etc + * + * @param aSeq + */ + + public void print() { + System.out.printf("\n i lcp pa \n"); + + for (int i = 0; i < mySeq.length; i++) { + System.out.printf("%4d%4d%4d ", i, lcp[i], pa[i]); + for (int j = pa[i]; j >= 0; j--) { + System.out.printf("%d", (mySeq[j] % 2)); + } + System.out.println(); + } + } + +} diff --git a/src/dev/java/japsadev/xm/hash/SuffixArrayConstruction.java b/src/dev/java/japsadev/xm/hash/SuffixArrayConstruction.java new file mode 100755 index 0000000..e27185a --- /dev/null +++ b/src/dev/java/japsadev/xm/hash/SuffixArrayConstruction.java @@ -0,0 +1,363 @@ 
+/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.hash; + +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + +/** + * @author Minh Duc Cao + * + */ +// A Collection of suffix array construction algorithm +public class SuffixArrayConstruction { + + static { + // Load library + System.loadLibrary("sarray"); + } + + /** + * s array M&M algorithm. + */ + protected static native void jsarray(String s, int a[], int n); + + /** + * Hybrid algorithm s: array of chars a:of size length(s) + 1 + */ + protected static native void jbsarray(byte s[], int a[], int n); + + protected static native void jlcp(int a[], byte[] s, int b[], int n); + + /** + * Deep - Shallow + */ + protected static native void jds(byte s[], int a[], int n); + + /** + * Divsufsort s: array of chars a:of size length(s) + 1 + */ + protected static native void jdivsufsort(byte s[], int a[], int n); + + public static int[] construct(byte[] seq) { + int[] a = new int[seq.length + 1]; + jdivsufsort(seq, a, seq.length); + return a; + } + + public static int[] ssarray(byte s[]) { + int[] a = new int[s.length]; + for (int i = 0; i < a.length; i++) + a[i] = s[i]; + + int ORIG = 1 << 31, // ~(~0>>1), /* sign bit */ + BUCK = 1 << 31;// ~(~0>>1); + + int h, i, j, l; + int k = 0; /* initialized for lint */ + int[] p; + + int t; + j = 4;// dna size + + p = new int[a.length]; + + for (i = 0; i < a.length; i++) + /* (0) initialize */ + p[i] = i | ORIG;// Turn the first bit to 1?? why + + for (h = 0;; h = h == 0 ? 1 : 2 * h) { + + for (i = 0; i < a.length; i++) { /* (1) link */ + for (j = p[i]; j >= 0; j = a[j]) + ; + t = (j & ~ORIG) - h; + j = t < 0 ? 
t + a.length : t; + l = a[j]; + a[j] = p[l]; + p[l] = j; + } + + if (h == 0) { /* find k */ + for (k = 0; k < a.length; k++) + if (p[k] < 0) + break; + } + + for (i = a.length; --k >= 0;) { /* (2) order */ + j = p[k]; + do + p[--i] = j; + // while(((j=al[j]) & ORIG) == 0); + while ((j = a[j]) >= 0); + p[i] |= BUCK; + } + + for (i = 0; i < a.length; i++) { /* (3) reconstruct */ + // if((p[i] & BUCK) != 0) + if (p[i] < 0) + k++; + a[p[i] & ~BUCK] = k; + } + + for (i = 0, j = -1; i < a.length; i++, j = l) { /* (4) refine */ + t = (p[i] & ~BUCK) + h; + + l = a[t >= a.length ? t - a.length : t]; + + if (l != j) + p[i] |= BUCK; + + } + + for (i = 0, k = -1; i < a.length; i++) { /* (5) recode */ + if (p[i] < 0) + k++; + + a[p[i] & ~BUCK] = k; + p[i] |= ORIG; /* (0b) reinitialize */ + } + if (++k >= a.length) + break; + } + + for (i = 0; i < a.length; i++) { + a[i] = p[i] & ~ORIG; + + } + return a; + } + + /** + * + */ + public SuffixArrayConstruction() { + // TODO Auto-generated constructor stub + } + + public static int[] suffixArray(byte[] st) { + int[] a = new int[st.length + 1]; + jds(st, a, st.length); + return a; + } + + public static void print(byte[] s, int[] array, int[] lcp) { + for (int i = 0; i < array.length; i++) { + System.out.printf("%5d %5d ", lcp[i], array[i]); + for (int j = array[i]; j < s.length; j++) { + System.out.printf("%2d ", s[j]); + } + System.out.println(); + } + } + + /** + * Compute the lcp of the suffix array a, on sequence s0 + * + * @param a + * @param s0 + * @return + */ + + static int[] lcp(int[] a, byte[] s0) { + + int[] lcp = new int[a.length]; + int i, h; + int[] inv = new int[a.length]; + + for (i = 0; i < a.length; i++) + inv[a[i]] = i; + + h = 0; /* visit in string order */ + for (i = 0; i < a.length - 1; i++) { /* omit last, least suff */ + int x = inv[i]; /* i,j,x,h as in intro */ + int j = a[x - 1]; + int p1 = i + h; + int p0 = j + h; + + while (p1 < s0.length && p0 < s0.length && s0[p1++] == s0[p0++]) + h++; + lcp[x] = h; + if 
(h > 0) + h--; + } + lcp[0] = 0; /* least suffix has no predecessor */ + + return lcp; + } + + static int sufcheck(byte[] T, int[] SA) { + int ALPHABET_SIZE = 4; + int[] C = new int[ALPHABET_SIZE]; + int i = 0, p, t = 0; + int c; + int err = 0; + int n = T.length; + + /* ranges. */ + if (err == 0) { + for (i = 0; i <= n; ++i) { + if ((SA[i] < 0) || (n < SA[i])) { + err = -2; + break; + } + } + } + + /* first characters. */ + if (err == 0) { + for (i = 1; i < n; ++i) { + if (T[SA[i]] > T[SA[i + 1]]) { + err = -3; + break; + } + } + } + + /* suffixes. */ + if (err == 0) { + for (i = 0; i < ALPHABET_SIZE; ++i) { + C[i] = 0; + } + for (i = 0; i < n; ++i) { + ++C[T[i]]; + } + for (i = 0, p = 1; i < ALPHABET_SIZE; ++i) { + t = C[i]; + C[i] = p; + p += t; + } + + for (i = 0; i <= n; ++i) { + p = SA[i]; + if (0 < p) { + c = T[--p]; + t = C[c]; + } else { + p = n; + c = -1; + t = 0; + } + if (p != SA[t]) { + err = -4; + break; + } + if (0 <= c) { + ++C[c]; + if ((n < C[c]) || (T[SA[C[c]]] != c)) { + C[c] = -1; + } + } + } + } + + return err; + } + + /** + * @param args + */ + + public static void main(String[] args) throws Exception { + + Sequence dna = SequenceReader.getReader(args[0]).nextSequence(null);// (filename)IOTools.read(args[0]); + + System.out.print("Initilise ... 
"); + + // byte[] s = (new BioCompDNA(s1,"aa")).toBytes(); + + byte[] s = dna.toBytes(); + // for (int i = 0; i< s.length; i++) + // System.out.print(s[i]); + + System.out.println(); + System.out.println("Done "); + long timeEnd, timeStart = System.currentTimeMillis(); + int[] a;// = new int[s.length + 1]; + + /**************************************************************************** + * { a = new int[s.length + 1]; System.out.println( + * "Testing for jbsarray ************************************** "); + * Runtime.getRuntime().runFinalization (); Runtime.getRuntime().gc (); + * Thread.currentThread ().yield (); + * + * timeStart = System.currentTimeMillis(); jbsarray(s,a,s.length); + * timeEnd = System.currentTimeMillis(); + * + * System.out.println("Running time = " +(timeEnd - timeStart)); + * System.out.println("Checking = " + (sufcheck(s,a) == 0)); + * + * } / + ****************************************************************************/ + { + a = new int[s.length + 1]; + System.out + .println("Testing for jdivsufsort ************************************** "); + Runtime.getRuntime().runFinalization(); + Runtime.getRuntime().gc(); + + // Thread.currentThread ().yield (); + + timeStart = System.currentTimeMillis(); + jdivsufsort(s, a, s.length); + timeEnd = System.currentTimeMillis(); + + System.out.println("Running time = " + (timeEnd - timeStart)); + System.out.println("Checking = " + (sufcheck(s, a) == 0)); + + } + + /****************************************************************************/ + /************************************************************* + * char [] s1 = dna.getCharSequence(); byte[] s2 = new byte[s1.length]; + * for (int i = 0; i < s1.length; i++) s2[i] = (byte)s1[i]; + * /************************************************************* { a = + * new int[s.length + 1]; System.out.println( + * "Testing for jds ************************************** "); + * Runtime.getRuntime().runFinalization (); Runtime.getRuntime().gc (); + * 
Thread.currentThread ().yield (); + * + * timeStart = System.currentTimeMillis(); jds(s,a,s.length); timeEnd = + * System.currentTimeMillis(); + * + * System.out.println("Running time = " +(timeEnd - timeStart)); + * System.out.println("Checking = " + (sufcheck(s,a) == 0)); + * //print(s,a,h); + * + * } / + *************************************************************/ + + int[] h = lcp(a, s); + // jlcp(a,s,h,a.length); + print(s, a, h); + + } + +} diff --git a/src/dev/java/japsadev/xm/hash/SuffixTree.java b/src/dev/java/japsadev/xm/hash/SuffixTree.java new file mode 100755 index 0000000..043e345 --- /dev/null +++ b/src/dev/java/japsadev/xm/hash/SuffixTree.java @@ -0,0 +1,462 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+package japsadev.xm.hash;
+
+import japsa.seq.Sequence;
+import japsa.seq.SequenceReader;
+import japsa.util.IntIterator;
+import japsa.xm.hash.PatternStore;
+
+import java.util.Random;
+
+/**
+ * Incrementally built tree of {@link SuffixNode}s over a byte sequence.
+ *
+ * For every position the tree is descended from the root ({@link #moveNext})
+ * by comparing symbols while walking backwards through the sequence (indices
+ * are decremented during matching), and a new leaf is then attached
+ * ({@link #addNode}). Child arrays have {@link #ALPHABET} slots and are indexed
+ * directly by the byte value of a symbol, so every symbol must lie in
+ * {@code [0, ALPHABET)}.
+ *
+ * All traversal state (currentIndex, longestS, longestNode, currentNode) is
+ * kept in instance fields that the methods below mutate in a fixed order.
+ *
+ * @author Minh Duc Cao
+ *
+ */
+public class SuffixTree implements PatternStore {
+    // Size of every children array; symbols index into it directly.
+    public static int ALPHABET = 4;
+    // longestS: number of symbols matched by the last descent.
+    // longestNode: value of longestS when the returned node was entered.
+    public int longestS = 0, longestNode = 0;
+    // Amount added to node.leaves for every node visited during a descent.
+    int incrementLeaf = 1;
+
+    // CopyIterator yields nothing when fewer than minHash symbols matched.
+    int minHash = 11;
+
+    // Sequence the tree edges refer to (node start/end are indices into it).
+    byte[] seqs;
+    SuffixNode root = new SuffixNode(-1, -1);
+
+    // Sequence being matched against the tree; same array as seqs unless
+    // setRefSeq is called.
+    byte[] refSeq;
+
+    // Only currentNode is assigned in this class; palinNode and matchNode are
+    // declared but never written here.
+    SuffixNode currentNode = null, palinNode = null, matchNode = null;
+
+    // Position in refSeq handled by the next descent.
+    int currentIndex = 0;
+
+    /**
+     * Create an empty tree over aSeqs; the same array is also used as the
+     * sequence to match until setRefSeq is called.
+     *
+     * @param aSeqs sequence the tree is built on
+     */
+    public SuffixTree(byte[] aSeqs) {
+        seqs = aSeqs;
+        refSeq = seqs;// For building
+    }
+
+    /**
+     * @param minH minimum match length below which CopyIterator is empty
+     */
+    public void setMinHash(int minH) {
+        minHash = minH;
+    }
+
+    /**
+     * @param ref sequence to be matched against the tree from now on
+     */
+    public void setRefSeq(byte[] ref) {
+        refSeq = ref;
+    }
+
+    /**
+     * Set the per-visit leaf increment and restart from position 0.
+     * putCurrentValue only adds nodes while the increment equals 1.
+     *
+     * @param incrementLeaf new increment
+     */
+    public void setIncrementLeaf(int incrementLeaf) {
+        this.incrementLeaf = incrementLeaf;
+        currentIndex = 0;
+    }
+
+    /**
+     * Replace the root with a fresh node. The traversal fields (currentIndex,
+     * currentNode, ...) are not reset here.
+     */
+    public void clear() {
+        root = new SuffixNode(-1, -1);
+
+    }
+
+    /**
+     * Node covering the last match: currentNode itself when the match ended
+     * exactly at that node, otherwise the child of currentNode on whose edge
+     * the match stopped.
+     *
+     * @return the matched node
+     */
+    public SuffixNode matchNode() {
+        SuffixNode node = currentNode;
+        if (longestNode != longestS) {
+            node = currentNode.children[refSeq[currentIndex - longestNode]];
+        }
+
+        return node;
+    }
+
+    // Break this into 2: First one to find the Maximum match
+    // Second one to put the new posSrc in
+
+    /**
+     * Match the current position, insert a leaf for it and advance by one.
+     *
+     * @param baseInd not read by this method
+     */
+    public void next(int baseInd) {
+        longestS = 0;
+        currentNode = moveNext(root);
+        addNode(currentIndex);
+        currentIndex++;
+
+        // return currentNode;
+
+    }
+
+    /**
+     * Descend from node as far as refSeq matches, updating longestS and
+     * longestNode and adding incrementLeaf to the leaves counter of every node
+     * entered.
+     *
+     * @param node node to continue the descent from
+     * @return the deepest node fully reached; when longestS differs from
+     *         longestNode afterwards the match stopped part-way along the
+     *         edge to one of its children
+     */
+    private SuffixNode moveNext(SuffixNode node) {
+        // Nothing earlier than index 0 is left to compare.
+        if (currentIndex <= longestS)
+            return node;
+
+        node.leaves += incrementLeaf;
+        // index == currentIndex - longestS;
+        // leaves ++;
+
+        // count ++;
+        longestNode = longestS;
+
+        if (node.children == null) {
+            return node;// Case 1
+        }
+
+        if (node.children[refSeq[currentIndex - longestS]] == null) {// No child
+            // at
+            // this
+            // branch
+            return node;// case 2
+
+        } else {// the two prefixes are the same, travel down more
+            SuffixNode child = node.children[refSeq[currentIndex - longestS]];
+            // assert seqHash[child.start] == seqHash[index];
+            int childIndex = child.start;
+
+            // A leaf edge is compared down to index 0, an internal edge down
+            // to child.end.
+            int endIndex = (child.isLeaf()) ? 0 : child.end;
+
+            while (childIndex >= endIndex
+                    && refSeq[currentIndex - longestS] == seqs[childIndex]) {
+                longestS++;
+                childIndex--;
+                if (currentIndex == longestS)
+                    return node;
+            }
+
+            if (childIndex >= endIndex) {// add a node here
+                // Mismatch inside the edge: stay on the parent.
+                return node;// case 3
+
+            } else {
+                // Whole edge matched: continue below the child.
+                return moveNext(child);
+            }
+        }
+
+    }
+
+    /**
+     * Precond: moveNext has been called and currentNode is the node it
+     * returned.
+     *
+     * If the match ended exactly at currentNode a new leaf is hung below it.
+     * Otherwise the edge on which the match stopped is split: a new internal
+     * node takes the old child's slot, and the old child and the new leaf
+     * become its children.
+     *
+     * @param pos value recorded for the new leaf; stored as end = pos + 1 and
+     *            read back by CopyIterator as end - 1
+     */
+    private void addNode(int pos) {
+        if (longestNode == longestS) {
+            if (currentNode.children == null)
+                currentNode.children = new SuffixNode[ALPHABET];
+            // assert node.children[seqHash[currentIndex - longestS]] == null
+            currentNode.children[seqs[currentIndex - longestS]] = new SuffixNode(
+                    currentIndex - longestS, pos + 1);
+        } else {
+            SuffixNode node = new SuffixNode(currentIndex - longestNode,
+                    currentIndex - longestS + 1);
+
+            node.children = new SuffixNode[ALPHABET];
+            SuffixNode.internalCount++;
+
+            SuffixNode child = currentNode.children[seqs[currentIndex
+                    - longestNode]];
+            currentNode.children[seqs[currentIndex - longestNode]] = node;
+
+            // The old child now starts after the part taken over by node.
+            child.start -= (longestS - longestNode);
+            // Set the previous child
+            node.children[seqs[child.start]] = child;
+
+            // Set the new branch=>leaf node
+            node.children[seqs[currentIndex - longestS]] = new SuffixNode(
+                    currentIndex - longestS, pos + 1);
+            node.leaves = 1 + child.leaves;
+
+        }
+
+    }
+
+    /** Intentionally empty. */
+    public void printSummary() {
+
+    }
+
+    /** Print the whole tree to standard output, starting at the root. */
+    public void printTree() {
+        root.print(1, true);
+    }
+
+    /**
+     * Match the current position without inserting anything; pair with
+     * putCurrentValue.
+     *
+     * @param baseInd not read by this method
+     */
+    public void nextKey(int baseInd) {
+        longestS = 0;
+        currentNode = moveNext(root);
+    }
+
+    /**
+     * Insert a leaf carrying val for the position matched by nextKey (only
+     * while incrementLeaf is 1) and advance to the next position.
+     *
+     * @param val value to record in the new leaf
+     */
+    public void putCurrentValue(int val) {
+        if (incrementLeaf == 1)
+            addNode(val);
+        currentIndex++;
+    }
+
+    /**
+     * Return the iterator of all previous candidate copy experts
+     *
+     * @return a new CopyIterator over the current match
+     */
+    public IntIterator copyIterator() {
+        return new CopyIterator();
+    }
+
+    /**
+     * Return the iterator of all previous candidate palin experts
+     *
+     * @return a new CopyIterator; this class has no separate palindrome
+     *         iterator
+     */
+    public IntIterator palinIterator() {
+        return new CopyIterator();
+    }
+
+    /**
+     * Return the iterator of all previous candidate copy and palin experts They
+     * are in fact the combination of both copy and palin experts
+     *
+     * @return a new CopyIterator, the same kind as copyIterator returns
+     */
+    public IntIterator iterator() {
+        return new CopyIterator();
+    }
+
+    /**
+     * Build the tree for the first sequence of the file named by args[0] and
+     * print progress and node counts.
+     *
+     * @param args args[0] is the sequence file to read
+     */
+    public static void main(String[] args) {
+        int i = 0;
+        try {
+            Sequence dna = SequenceReader.getReader(args[0]).nextSequence(null);// (filename)IOTools.read(args[0]);
+
+            System.out.print("Initilise ... ");
+            // int size = Integer.parseInt(args[0]);
+            // byte[] bytes = new byte[size];
+            // Random rnd = new Random();
+            // for (i = 0; i < bytes.length; i++){
+            // bytes[i] = (byte) rnd.nextInt(ALPHABET);
+            // }
+
+            byte[] bytes = dna.toBytes();
+            System.out.println("done");
+
+            // byte[] x = {0,1,1,1,0,0,1,1,0,1,0,1,1,0,1,0,1,0,1,0,1};
+
+            // bytes = x;
+
+            SuffixTree tree = new SuffixTree(bytes);
+            long now = 0, start = System.currentTimeMillis();
+
+            for (i = 0; i < bytes.length; i++) {
+                // System.out.printf("%3d ",i);
+                // System.out.print(bytes[i]);
+                // SuffixNode node =
+
+                tree.nextKey(bytes[i]);
+
+                tree.putCurrentValue(i);
+
+                // Progress report every million positions.
+                if (i % 1000000 == 0 && i > 1) {
+                    now = System.currentTimeMillis();
+                    System.out
+                            .printf("Milestone %8d , created %8d leaves and %8d internal in %8d ms %f\n",
+                                    i, SuffixNode.count
+                                            - SuffixNode.internalCount,
+                                    SuffixNode.internalCount, (now - start),
+                                    SuffixNode.internalCount * 1.0 / i);
+
+                    System.gc();
+                    Runtime.getRuntime().gc();
+
+                    System.gc();
+                    Runtime.getRuntime().gc();
+
+                }
+                // System.out.println(node + " " + node.count);
+                // System.out.println(tree.root.count);
+                //
+                // tree.printTree();
+            }
+            // NOTE(review): now is refreshed here only when no milestone
+            // fired, so for longer inputs the time reported below is that of
+            // the last milestone rather than the end of the build - confirm
+            // whether that is intended.
+            if (now == 0)
+                now = System.currentTimeMillis();
+            System.out.println("\nBuild tree done in " + (now - start) + " ms");
+
+            // for (int i = 0; i < bytes.length; i++){
+            // System.out.print(bytes[i]);
+            // }
+
+            // System.out.println();
+            // tree.printTree();
+
+            System.out
+                    .printf("Milestone %8d , created %8d leaves and %8d internal in %8d ms %f\n",
+                            bytes.length, SuffixNode.count
+                                    - SuffixNode.internalCount,
+                            SuffixNode.internalCount, (now - start),
+                            SuffixNode.internalCount * 1.0 / bytes.length);
+
+            // tree.printTree();
+
+            /***********************************************
+             * tree.nextKey(); tree.printTree();
+             *
+             *
+             * tree.nextKey(); tree.printTree();
+             *
+             * tree.nextKey(); tree.printTree();
+             *
+             * tree.nextKey(); tree.printTree();
+             *
+             * tree.nextKey(); tree.printTree();
+             *
+             * tree.nextKey(); tree.printTree();
+             *
+             * /
+             ***********************************************/
+
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.out.println("End " + i);
+
+        }
+        System.out.println(i);
+    }
+
+    // Fixed seed, shared by every CopyIterator of this tree.
+    public Random rnd = new Random(1);
+
+    /**
+     * Iterator over the values stored in the leaves below the node matched by
+     * the most recent descent, returned in random order without repetition.
+     */
+    class CopyIterator implements IntIterator {
+        // Number of values not yet returned; they occupy myArray[0..total-1].
+        private int total;
+        private int[] myArray = null;
+
+        /**
+         * Snapshot the leaves below the currently matched node. The iterator
+         * is empty when the match is shorter than minHash or ended at the
+         * root.
+         */
+        public CopyIterator() {
+            if (longestS < minHash || currentNode == root) {
+                total = 0;
+                return;
+            }
+
+            SuffixNode node = currentNode;
+
+            // System.out.println(currentNode + " " + currentIndex + " " +
+            // longestNode
+            // + " " + longestS);
+            // Same selection as matchNode().
+            if (longestNode != longestS) {
+                node = currentNode.children[refSeq[currentIndex - longestNode]];
+            }
+
+            // System.out.println(node );
+            // node.leaves is used as the capacity for the values collected.
+            myArray = new int[node.leaves];
+            total = 0;
+            populate(node);
+            // assert total = currentNode.leaves;
+        }
+
+        public boolean hasNext() {
+            return total > 0;
+        }
+
+        public int sizeAvailable() {
+            return total;
+        }
+
+        /**
+         * Return one of the remaining values chosen at random.
+         * NOTE(review): Random.nextInt rejects a bound of 0, so callers must
+         * check hasNext() first.
+         */
+        public int next() {
+            int random_index = rnd.nextInt(total);
+            // Shuffle the array
+
+            if (random_index + 1 < total) {// Move the expert selected to end of
+                // the list
+                // so wont be selected again
+                int tmp = myArray[random_index];
+                myArray[random_index] = myArray[total - 1];
+                myArray[total - 1] = tmp;
+            }// else random_index +1 == total
+
+            total--;
+
+            return myArray[total];
+        }
+
+        /**
+         * Depth-first collection of end - 1 for every leaf at or below node.
+         */
+        private void populate(SuffixNode node) {
+            if (node.isLeaf()) {
+                myArray[total] = node.end - 1;
+
+                total++;
+            }
+
+            if (node.children == null)
+                return;
+            for (int i = 0; i < ALPHABET; i++) {
+                if (node.children[i] != null) {
+                    populate(node.children[i]);
+                }
+
+            }
+
+        }
+    }
+}
+
+// Leaf is the one that has end > start.
End is actually the position + +class SuffixNode { + // Can compress start/end/leaves into 1 + // 30 bit ~1B + // 100Tr ->27 bits + // 3 numbers=> 54 bits=>two ints + + public static int count = 0; + public static int internalCount = 0; + + int start, end; + int leaves = 0;// Number of leaves can be reached from here + + SuffixNode[] children;// = new SuffixNode[4]; + + public SuffixNode(int s, int e) { + start = s; + end = e; + count++; + leaves = (isLeaf()) ? 1 : 0; + } + + public boolean isLeaf() { + return (end > start); + } + + public void print(int offSet, boolean newLine) { + if (newLine) + for (int i = 0; i < offSet; i++) + System.out.print(' '); + + System.out.print("----" + this); + boolean first = true; + if (children != null) { + for (int i = 0; i < children.length; i++) { + + if (children[i] != null) { + children[i].print(offSet + 12, !first); + first = false; + // System.out.println(); + } + } + } + + System.out.println(); + } + + public String toString() { + // return "(" + start + "," + count +"," + end +")"; + return "(" + start + "," + leaves + "," + end + ")"; + } + +} diff --git a/src/dev/java/japsadev/xm/hash/TrieNode.java b/src/dev/java/japsadev/xm/hash/TrieNode.java new file mode 100755 index 0000000..5cc53dc --- /dev/null +++ b/src/dev/java/japsadev/xm/hash/TrieNode.java @@ -0,0 +1,115 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.hash; + +import japsa.xm.expert.Expert; + +public class TrieNode { + private static final int INITIAL_SIZE = 16; + + private TrieNode[] children;// Should have some children unless it is a leaf + int count; + private int[] values = null; + int level; + + int maxLevel, minLevel; + + /** + * Create a normal node + * + */ + public TrieNode() { + children = new TrieNode[Expert.alphabet().size()]; + for (int i = 0; i < Expert.alphabet().size(); i++) + children[i] = null; + count = 0; + } + + /** + * Create a node to store values, and depends of whether it is a leaf + * + * @param isLeaf + */ + public TrieNode(boolean isLeaf) { + if (!isLeaf) { + children = new TrieNode[Expert.alphabet().size()]; + for (int i = 0; i < Expert.alphabet().size(); i++) + children[i] = null; + } + count = 0; + values = new int[INITIAL_SIZE]; + } + + public void addNewChild(int i) { + children[i] = new TrieNode(); + } + + public void addNewChild(int i, TrieNode trieNode) { + children[i] = trieNode; + } + + public TrieNode getChild(int i) { + return children[i]; + } + + public int getCount() { + return count; + } + + /** + * Only if the values array has been allocated + * + * @param value + */ + public void addValue(int value) { + if (count >= values.length) {// Full + // reallocate + int[] array = new int[values.length * 2]; + System.arraycopy(values, 0, array, 0, values.length); + values = array; + } + // assert: enough room for adding + values[count] = value; + count++; + } + + public int[] getValues() { + return values; + } + + public int getLevel() { + return level; + } + + public void setLevel(int level) { + this.level = level; + } + +} diff --git a/src/dev/java/japsadev/xm/phylo/CompareTreeLength.java b/src/dev/java/japsadev/xm/phylo/CompareTreeLength.java new file mode 100755 index 0000000..d11359f --- /dev/null +++ b/src/dev/java/japsadev/xm/phylo/CompareTreeLength.java @@ -0,0 
+1,156 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsadev.xm.phylo; + +import japsa.bio.phylo.PhylogenyTree; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.Iterator; +import java.util.Vector; + +public class CompareTreeLength { + + public static double compare(PhylogenyTree treeA, PhylogenyTree treeB) { + + Iterator iterA = treeA.getLeafIterator(); + + Vector a = new Vector(); + while (iterA.hasNext()) { + a.add(iterA.next()); + } + + PhylogenyTree[] aA = new PhylogenyTree[a.size()]; + a.toArray(aA); + + // sort array A + for (int i = 1; i < aA.length; i++) { + for (int j = 0; j < i; j++) { + if (aA[i].toString().compareTo(aA[j].toString()) < 0) { + // swap + PhylogenyTree tmp = aA[j]; + aA[j] = aA[i]; + aA[i] = tmp; + } + } + } + + double[][] disA = new double[aA.length][aA.length]; + // double sumA = 0; + for (int i = 1; i < aA.length; i++) { + for (int j = 0; j < i; j++) { + disA[i][j] = disA[j][i] = aA[i].distanceTo(aA[j]); + // sumA += disA[i][j]; + } + } + + Iterator iterB = treeB.getLeafIterator(); + + Vector b = new Vector(); + while (iterB.hasNext()) { + b.add(iterB.next()); + } + + PhylogenyTree[] aB = new PhylogenyTree[b.size()]; + b.toArray(aB); + + // sort array A + for (int i = 1; i < aB.length; i++) { + for (int j = 0; j < i; j++) { + if (aB[i].toString().compareTo(aB[j].toString()) < 0) { + // swap + PhylogenyTree tmp = aB[j]; + aB[j] = aB[i]; + aB[i] = tmp; + } + } + } + + double[][] disB = new double[aB.length][aB.length]; + + // double sumB = 0; + for (int i = 1; i < aB.length; i++) { + for (int j = 0; j < i; j++) { + disB[i][j] = disB[j][i] = aB[i].distanceTo(aB[j]); + } + } + + // for (int i = 0; i < aB.length; i++) { + // System.out.println(aB[i]+ " " + aA[i]); + // } + double sum = 0; + + for (int i = 1; i < aB.length; i++) { + for (int j = 0; j < i; j++) { + double v = (disB[i][j] - disA[i][j]); + sum += v * v; + } + } + + return sum; + } + + public static void 
main(String[] args) throws Exception { + BufferedReader bf = new BufferedReader(new FileReader(args[0])); + + String line = ""; + String dndA = "";// ((n07:93.344,( n02:78.0547, (n04:71.9283, ( + // n08:55.1436,(n09:11.2462, + // n01:11.2462):43.8974):16.7848):6.12634):15.2893 + // ):6.63599, ( n06:84.565, ( n03:73.6297, ( + // n05:22.5472, n00:22.5472 ):51.0826 ):10.9352 + // ):15.415);"; + String dndB = "";// ((((n00:0.18065,n05:0.17464):0.25103,n03:0.34336):0.16730,n06:0.31645):0.18090,(n04:0.28654,(((n01:0.09601,n09:0.10990):0.26280,n08:0.27821):0.17186,n02:0.31050):0.15744):0.16279,n07:0.33363)[0.5000];"; + + while ((line = bf.readLine()) != null) { + dndA = dndA + line.trim(); + } + + bf.close(); + + bf = new BufferedReader(new FileReader(args[1])); + while ((line = bf.readLine()) != null) { + dndB = dndB + line.trim(); + } + + bf.close(); + + PhylogenyTree treeA = PhylogenyTree.parseTree(dndA); + PhylogenyTree treeB = PhylogenyTree.parseTree(dndB); + + String tmp = ""; + if (args.length > 2) { + tmp = " " + args[2]; + } + + System.out.println("## " + compare(treeA, treeB) + tmp); + } + +} diff --git a/src/main/java/japsa/tools/seq/BuildGeneDatabase.java b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java similarity index 70% rename from src/main/java/japsa/tools/seq/BuildGeneDatabase.java rename to src/main/java/japsa/bio/BuildSequenceGroupDatabase.java index 52b0167..d9d2199 100644 --- a/src/main/java/japsa/tools/seq/BuildGeneDatabase.java +++ b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java @@ -1,38 +1,52 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. 
Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ +/* + * Copyright (c) 2017 Minh Duc Cao (minhduc.cao@gmail.com). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. Neither the names of the institutions nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ /************************** REVISION HISTORY ************************** * 07/09/2014 - Minh Duc Cao: Created * ****************************************************************************/ -package japsa.tools.seq; +package japsa.bio; + + + +import japsa.bio.gene.GeneDatabase; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.seq.Alphabet.DNA; +import japsa.util.HTSUtilities; +//import japsa.util.JapsaTimer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; @@ -47,36 +61,24 @@ import java.util.ArrayList; import java.util.Date; import java.util.HashMap; - -import japsa.bio.gene.GeneDatabase; -import japsa.seq.JapsaAnnotation; -import japsa.seq.JapsaFeature; -import japsa.seq.Sequence; -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.seq.Alphabet.DNA; -import japsa.util.HTSUtilities; -import japsa.util.JapsaTimer; -import japsa.util.Logging; +import java.util.Map; /** * Implement a database of genes - * TODO: to finalise the design of the database, and move this class out here * @author minhduc * */ -public class BuildGeneDatabase { - //ArrayList geneFamilies;//Array list of gene family, each is a list of gene alleles +public class BuildSequenceGroupDatabase { + private static final Logger LOG = LoggerFactory.getLogger(BuildSequenceGroupDatabase.class); public GeneDatabase geneDatabase; String prefix; - String fFile; String gFile; String bFile; - public BuildGeneDatabase(String p) throws IOException{ + public BuildSequenceGroupDatabase(String p) throws IOException{ //geneFamilies = new ArrayList(); geneDatabase = new GeneDatabase(); prefix = p; @@ -111,7 +113,7 @@ private void prepreScript2() throws IOException{ } public void cleanUp() throws IOException, InterruptedException{ - Logging.info("rm -f "+ prefix 
+"*"); + LOG.info("rm -f "+ prefix +"*"); Process process = Runtime.getRuntime().exec("rm -f "+ prefix +"*"); process.waitFor(); } @@ -119,23 +121,26 @@ public void cleanUp() throws IOException, InterruptedException{ /** * Add a collection of genes to the database as a batch * - * @param seqs - * @param checkGeneID - * @return + * @param seqs: new list of sequences to be added + * @param checkGeneID: insert based on geneID first + * @return A map of what sequences were inserted to what * @throws IOException * @throws InterruptedException */ - public HashMap addGeneMap(final HashMap seqs, boolean checkGeneID) throws IOException, InterruptedException{ + public HashMap addGeneMap(final Map seqs, boolean checkGeneID) + throws IOException, InterruptedException{ HashMap strMap = new HashMap (); //0: initialise grouping within the new sequences HashMap> setMap = new HashMap>(); for (Sequence seq:seqs.values()){ + //tSet: set of sequences belong to a group. Potentially, every sequence makes up one group ArrayList tSet = new ArrayList(); tSet.add(seq.getName()); setMap.put(seq.getName(), tSet); - } - Logging.info("Total " + seqs.size() + " sequences"); + } + //note: setMap: maps each set to an ID (of the first sequence) + LOG.info("Total " + seqs.size() + " sequences"); //0.5 Merge based on annotation if (checkGeneID){ HashMap> annoMap = new HashMap>(); @@ -196,11 +201,11 @@ public void cleanUp() throws IOException, InterruptedException{ } list.add(seq.getName()); }else{ - //Logging.info("NOT found annoID " + seq.getName()); + //LOG.info("NOT found annoID " + seq.getName()); } }//for - Logging.info(" Step 0.5 " + annoMap.size() + " from " + getIntrisciID); + LOG.info(" Step 0.5 " + annoMap.size() + " from " + getIntrisciID); for (String annoID:annoMap.keySet()){ ArrayList list = annoMap.get(annoID); String firstName = list.get(0); @@ -213,17 +218,19 @@ public void cleanUp() throws IOException, InterruptedException{ } }//for } + boolean merge = true; int iteration = 0; while 
(merge){ //1. Group these sequences first - Logging.info(" --Start of " + iteration + " " + new Date()); + LOG.info(" --Start of " + iteration + " " + new Date()); merge = false; iteration ++; SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(gFile); - Logging.info("Iteration " + iteration + " size = " + setMap.size() + " " + new Date()); + LOG.info("Iteration " + iteration + " size = " + setMap.size() + " " + new Date()); int countI = 0; + //write the reps of temp groups to file anc cosider them as `reads' as well as refs for (String key:setMap.keySet()){ ArrayList tSet = setMap.get(key); if (tSet.get(0) == key){ @@ -233,9 +240,10 @@ public void cleanUp() throws IOException, InterruptedException{ } sos.close(); - Logging.info("Iteration " + iteration + " of " + countI + " sequences " + new Date()); - JapsaTimer.systemInfo(); + LOG.info("Iteration " + iteration + " of " + countI + " sequences " + new Date()); + //JapsaTimer.systemInfo(); + //TODO: 1. think of a way to perform alignment `in memory' instead of write to bwa (may be use jalign) Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA2.sh"); process.waitFor(); @@ -257,13 +265,14 @@ public void cleanUp() throws IOException, InterruptedException{ ArrayList refSet = setMap.get(refName); if (readSet == refSet){ - //these two have been in a group + //these two have been from group continue; } Sequence readSeq = seqs.get(readName); Sequence refSeq = seqs.get(refName); - + + //now we find the respresentations from groups are similar, and hence join two groups together if (isSimilar(refSeq, readSeq, sam)){ merge = true; refSet.addAll(readSet); @@ -275,20 +284,33 @@ public void cleanUp() throws IOException, InterruptedException{ } samIter.close(); samReader.close(); - JapsaTimer.systemInfo(); + //JapsaTimer.systemInfo(); } - Logging.info(" 2 Grouping " + geneDatabase.size()); - JapsaTimer.systemInfo(); + LOG.info(" 2 Grouping " + geneDatabase.size()); + //JapsaTimer.systemInfo(); - 
//2.Try to add groups whose family is already in + //2.Try to add to the existing groups. Align the rep from each group to each of the family rep + //if match, add the whole group to the family int G = 0, GG = 0; if (geneDatabase.size() > 0){ - geneDatabase.write2File(fFile, false); + //geneDatabase.write2File(fFile, false); + + //creat a map of current reps + SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fFile); + HashMap repMap = new HashMap(); + for (GeneDatabase.GeneFamily family:geneDatabase){ + Sequence rep = family.represetationSequence(); + repMap.put(rep.getName(),rep); + rep.writeFasta(sos); + } + sos.close(); + //Run bwa - //Logging.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); - Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); + //LOG.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); + //TODO 2: Read this directly from stdout of bwa + Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); process.waitFor(); //Read the sam file @@ -310,25 +332,26 @@ public void cleanUp() throws IOException, InterruptedException{ Sequence readSeq = seqs.get(readName); if (readSeq == null){ - Logging.error("ERROR 4: sequence " + readName + " not found!"); + LOG.error("ERROR 4: sequence " + readName + " not found!"); } String refName = sam.getReferenceName(); GeneDatabase.GeneFamily family = geneDatabase.getFamily(refName); if (family == null){ - Logging.error("ERROR 5: family " + refName + " not found!"); + LOG.error("ERROR 5: family " + refName + " not found!"); continue; } - Sequence refSeq = family.represetationSequence(); + + Sequence refSeq = repMap.get(refName); if (refSeq == null){ - Logging.error("ERROR 6: rep for family " + refName + " not found!"); + LOG.error("ERROR 6: rep for family " + refName + " not found!"); } - + if (isSimilar(refSeq, readSeq, sam)){ + //asign all reads in this set to the family ArrayList readSet = 
setMap.get(readName); - for (String key:readSet){ //if (strMap.containsKey(key)){ - // Logging.error("ERROR 1 : " + key); + // LOG.error("ERROR 1 : " + key); //} Sequence keySeq = seqs.get(key); geneID = family.addSequence(keySeq); @@ -337,28 +360,29 @@ public void cleanUp() throws IOException, InterruptedException{ } readSet.clear(); } - //Logging.info(" ADD a G " + G + " " + (new Date())); + LOG.trace(" ADD a G " + G + " " + (new Date())); } samIter.close(); samReader.close(); } - Logging.info(" 3 Grouping " + geneDatabase.size() + " " + seqs.size()); - JapsaTimer.systemInfo(); + LOG.info(" 3 Grouping " + geneDatabase.size() + " " + seqs.size()); + //JapsaTimer.systemInfo(); + //IF there are any groups left, add each of them as a family for (Sequence seq:seqs.values()){ String seqName = seq.getName(); ArrayList tSet = setMap.get(seqName); if (tSet.isEmpty()){ if (!strMap.containsKey(seqName)){ - Logging.error("ERROR 2 : " + seqName); + LOG.error("ERROR 2 : " + seqName); } continue;//added } //tSet is not empty GeneDatabase.GeneFamily family = null; - //Logging.info(" ADDing GGs " + GG + " of size " + tSet.size() + " " + (new Date())); + LOG.info(" ADDing GGs " + GG + " of size " + tSet.size() + " " + (new Date())); for (String key:tSet){ Sequence keySeq = seqs.get(key); if (family == null){ @@ -372,9 +396,8 @@ public void cleanUp() throws IOException, InterruptedException{ GG ++; } tSet.clear(); - } - - Logging.info("Manage to add " + G + " and " + GG + " " + new Date()); + } + LOG.info("Manage to add " + G + " and " + GG + " " + new Date()); return strMap; } @@ -399,7 +422,7 @@ public String addGene(Sequence seq) throws IOException, InterruptedException{ sos.close(); //Run bwa - //Logging.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); + //LOG.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); process.waitFor(); @@ -418,7 
+441,7 @@ public String addGene(Sequence seq) throws IOException, InterruptedException{ String refName = sam.getReferenceName(); GeneDatabase.GeneFamily family = geneDatabase.getFamily(refName); if (family == null){ - Logging.error("Check for problem : family " + refName + " not found!"); + LOG.error("Check for problem : family " + refName + " not found!"); continue; } Sequence refSeq = family.represetationSequence(); @@ -436,7 +459,8 @@ public String addGene(Sequence seq) throws IOException, InterruptedException{ } //TODO: no good public - public static double ratio = 0.9; + public static double ratio = 0.9; + public static double coverageTheshold = 0.9; static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ japsa.util.HTSUtilities.IdentityProfile @@ -445,14 +469,13 @@ static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ :HTSUtilities.identity(refSeq, readSeq, sam); double m = profile.match / ratio; - //System.out.println("XXXX " + (1.0 *profile.match/profile.refBase) + " " + (1.0 *profile.match/profile.readBase)); - return (m > profile.refBase && m > profile.readBase); + return (m > profile.refBase + && m > profile.readBase + && profile.refBase > refSeq.length() * coverageTheshold + && profile.readBase > readSeq.length() * coverageTheshold + ); } - - - - /** * Set up with a list of strains * @param file @@ -479,7 +502,7 @@ static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ String n50 = toks[4]; if (Integer.parseInt(n50) < 100000){ - Logging.info("Strain " + strain + ": skipped as N50=" + toks[4]); + LOG.info("Strain " + strain + ": skipped as N50=" + toks[4]); continue; } @@ -489,8 +512,8 @@ static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ annoMap = JapsaAnnotation.readMGFF(gffIn,0,0,"CDS"); gffIn.close(); - Logging.info("Genome " + toks[3] + " " + strain); - Logging.info("There are " + annoMap.size()+ " annotations here"); + LOG.info("Genome " + toks[3] + " " + 
strain); + LOG.info("There are " + annoMap.size()+ " annotations here"); for (JapsaAnnotation anno:annoMap){ for (int i = 0; i < anno.numFeatures(); i++){ //totGenes ++; @@ -507,7 +530,7 @@ static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ } } if (geneID == null){ - //Logging.error("ERROR = " + desc ); + //LOG.error("ERROR = " + desc ); continue; } //totGenes ++; @@ -519,10 +542,10 @@ static boolean isSimilar(Sequence refSeq, Sequence readSeq, SAMRecord sam){ myGenes.put(geneID, geneSeq); //geneID = addGene(geneSeq); - //Logging.info("Added " + geneSeq.getName() + " as "+ geneID); + //LOG.info("Added " + geneSeq.getName() + " as "+ geneID); }//for i /************************************************************* - Logging.info("Trying to add " + anno.getAnnotationID() + " of " + anno.numFeatures() + " genes"); + LOG.info("Trying to add " + anno.getAnnotationID() + " of " + anno.numFeatures() + " genes"); /*************************************************************/ }//for anno diff --git a/src/main/java/japsa/bio/alignment/MultipleAlignment.java b/src/main/java/japsa/bio/alignment/MultipleAlignment.java new file mode 100644 index 0000000..1bda180 --- /dev/null +++ b/src/main/java/japsa/bio/alignment/MultipleAlignment.java @@ -0,0 +1,442 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/**************************************************************************** + * Revision History + * 25/06/2014 - Minh Duc Cao: Started + * + ****************************************************************************/ +package japsa.bio.alignment; + +import java.io.IOException; +import java.util.Arrays; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Implementation of multiple aligment of (long) short reads to the reference + * genome. The alignment is implemeted as a linked list of sites, each of which + * is an array of the corresponding positions from all sequences. 
+ * + * @author minhduc + * + */ +public class MultipleAlignment { + private static final Logger LOG = LoggerFactory.getLogger(MultipleAlignment.class); + + static Alphabet alphabet = Alphabet.DNA5(); + // The head and the tail of the list + NodeAlignment head = null;// , tail = null; + + // Number of sequences + + Sequence[] seqs; + // NB: seqHash[0] = reference: + + int seqIndex = 0;// point to the next avaibale one + + public MultipleAlignment(int nSeq, Sequence ref) { + seqs = new Sequence[nSeq]; + seqs[0] = ref; + seqIndex = 1; + } + + public void addRead(SAMRecord rec) { + if (seqIndex >= seqs.length) { + LOG.warn("More sequences are added " + seqIndex); + return; + } + Sequence seq = new Sequence(alphabet, rec.getReadLength(), + rec.getReadName()); + byte[] bases = rec.getReadBases(); + for (int i = 0; i < seq.length(); i++) + seq.setBase(i, alphabet.byte2index(bases[i])); + + seqs[seqIndex] = seq; + // seqIndex ++; + + // ////////////////////////////////////////////////////////// + int refBase = rec.getAlignmentStart(); + NodeAlignment current; + // check if refBase already in the alignment + if (head == null) { + head = new NodeAlignment(); + head.site[0] = refBase; + current = head; + } else { + current = head; + if (current.site[0] < refBase) { + // go forward to search + while (true) { + if (current.next != null) + current = current.next; + else { + current.next = new NodeAlignment(); + current.next.prev = current; + current = current.next; + current.site[0] = current.prev.site[0] + 1; + } + + if (current.site[0] == refBase) + break;// while + }// while + // assert curent.site[0] == refBase + }// if + else { + while (current.site[0] > refBase) { + head = new NodeAlignment(); + head.next = current; + current.prev = head; + head.site[0] = current.site[0] - 1; + current = head; + }// while + }// else + }// else + // assert: current.site[0] == refBase; + + int readBase = 1; + int length; + + for (final CigarElement e : rec.getCigar().getCigarElements()) { + 
switch (e.getOperator()) { + case H: + break; // ignore hard clips + case P: + break; // ignore pads + case S: + readBase += e.getLength(); + break; // soft clip read bases + case N: // N ~ D + case D: + length = e.getLength(); + while (length > 0) { + if (current.site[0] != 0) { + if (current.site[0] != refBase) { + LOG.error("Fatal error " + refBase + " vs " + current.site[0]); + System.exit(1); + } + length--; + refBase++; + } + + if (current.next == null) { + current.next = new NodeAlignment(); + current.next.prev = current; + current.next.site[0] = refBase; + current = current.next; + + } else + current = current.next; + }// while + + break;// case + case I: + length = e.getLength(); + + while (current.site[0] == 0 && length > 0) { + length--; + current.site[seqIndex] = readBase; + readBase++; + + if (current.next == null) { + current.next = new NodeAlignment(); + current.next.prev = current; + current.next.site[0] = refBase; + current = current.next; + + } else + current = current.next; + }// while + while (length > 0) { + NodeAlignment newNode = new NodeAlignment(); + + newNode.prev = current.prev; + if (newNode.prev != null) + newNode.prev.next = newNode; + else + head = newNode; + + newNode.next = current; + current.prev = newNode; + + length--; + newNode.site[seqIndex] = readBase; + readBase++; + }// while + break; + + case M: + case EQ: + case X: + length = e.getLength(); + while (length > 0) { + if (current.site[0] != 0) { + if (current.site[0] != refBase) { + LOG.error("Fatal error " + refBase + " vs " + current.site[0]); + System.exit(1); + } + length--; + current.site[seqIndex] = readBase; + readBase++; + refBase++; + } + + if (current.next == null) { + current.next = new NodeAlignment(); + current.next.prev = current; + current.next.site[0] = refBase; + current = current.next; + + } else + current = current.next; + } + break; + default: + throw new IllegalStateException( + "Case statement didn't deal with cigar op: " + + e.getOperator()); + }// case 
+ }// for + // ///////////////////////////////////////////////////////// + seqIndex++; + } + + public void printAlignment() { + for (int s = 0; s < seqIndex; s++) { + System.out.printf("%20s : ", seqs[s].getName()); + NodeAlignment current = head; + while (current != null) { + if (current.site[s] == 0) + System.out.print('-'); + else + System.out.print(seqs[s].charAt(current.site[s] - 1)); + current = current.next; + }// while + System.out.println(); + }// for + System.out.println("##########################"); + } + + public MultipleAlignment reduceAlignment(int f, int t) { + + MultipleAlignment reduce = new MultipleAlignment(2, seqs[0]); + Sequence consensus = new Sequence(alphabet, seqs[1].length(), + "consensus"); + reduce.seqs[1] = consensus; + reduce.seqIndex = 2; + + int conIdx = 0; + NodeAlignment reduceCurrent = null; + + NodeAlignment current = head, fN = null, tN = null; + while (current != null) { + if (current.site[0] == f) + fN = current; + + if (current.site[0] > t) { + tN = current; + break; + } + current = current.next; + } + + if (fN == null || tN == null) + return null; + + // head = fN; head.prev = null;//gabbabe collection + // tN.next = null;//gabbabe collection + + // so the aligment reduced to fN -> tN + current = fN; + + int lastInx = alphabet.size(); + int[] votes = new int[lastInx + 1]; + + while (current != null && current != tN.next) { + Arrays.fill(votes, 0); + // get the votes + for (int s = 1; s < seqIndex; s++) { + int loc = current.site[s] - 1; + if (loc < 0) + votes[lastInx]++; + else + votes[seqs[s].getBase(loc)]++; + + } + // check the highest + int best = lastInx; + for (int i = best - 1; i >= 0; i--) + if (votes[i] >= votes[best]) + best = i; + + if (best == lastInx) {// is a gap + if (current.site[0] == 0) { // also a gap + current = current.next; + continue;// while + } else { + if (reduceCurrent == null) { + reduce.head = reduceCurrent = new NodeAlignment(); + } else { + reduceCurrent.next = new NodeAlignment(); + 
reduceCurrent.next.prev = reduceCurrent; + reduceCurrent = reduceCurrent.next; + } + reduceCurrent.site[0] = current.site[0]; + reduceCurrent.site[1] = 0;// a gap + } + } else {// a char + if (reduceCurrent == null) { + reduce.head = reduceCurrent = new NodeAlignment(); + } else { + reduceCurrent.next = new NodeAlignment(); + reduceCurrent.next.prev = reduceCurrent; + reduceCurrent = reduceCurrent.next; + }// else + + consensus.setBase(conIdx, (byte) best); + conIdx++; + reduceCurrent.site[0] = current.site[0]; + reduceCurrent.site[1] = conIdx; + }// else + current = current.next; + } + return reduce; + } + + public void printAlignment(int f, int t) throws IOException { + // Locate the first node (corresponds to f) and last node (t) + NodeAlignment current = head, fN = null, tN = null; + SequenceOutputStream os = SequenceOutputStream.makeOutputStream("-"); + while (current != null) { + if (current.site[0] == f) + fN = current; + + if (current.site[0] > t) { + tN = current; + break; + } + current = current.next; + } + + if (fN == null || tN == null) + return; + + current = fN; + int pos = 0; + while (current != null && current != tN) { + if (current.site[0] != 0) + pos = current.site[0]; + + os.print(pos, 7); + os.print(' '); + + for (int s = 0; s < seqIndex; s++) { + if (current.site[s] == 0) + os.print('-'); + else + os.print(seqs[s].charAt(current.site[s] - 1)); + } + current = current.next; + os.print('\n'); + } + } + + //Number of sequences printed out + public int printFasta(int f, int t, String fileName) throws IOException { + SequenceOutputStream os = SequenceOutputStream + .makeOutputStream(fileName); + //for (int s = 0; s < seqIndex; s++) { + // System.out.println(seqs[s].getName() + " " + seqs[s].length()); + //} + + StringBuilder[] sbs = new StringBuilder[seqIndex]; + for (int i = 0; i < seqIndex; i++) + sbs[i] = new StringBuilder(64); + + // Locate the first node (corresponds to f) and last node (t) + NodeAlignment current = head; + // Search for start 
+ while (current != null) { + if (current.site[0] == f) + break; + current = current.next; + } + + while (current != null && current.site[0] <= t) { + for (int s = 0; s < seqIndex; s++) { + if (current.site[s] != 0) { + sbs[s].append(seqs[s].charAt(current.site[s] - 1)); + } + } + current = current.next; + } + + for (int s = 1; s < seqIndex; s++) { + if (sbs[s].length() > 6){ + os.print(">" + seqs[s].getName() + "_" + s); + for (int i = 0; i < sbs[s].length(); i++) { + if (i % 60 == 0) + os.print("\n"); + os.print("" + sbs[s].charAt(i)); + } + os.print("\n"); + } + } + os.close(); + + return seqIndex; + + } + + /** + * A node in the list of + * + * @author minhduc + * + */ + class NodeAlignment { + NodeAlignment next = null, prev = null; + int[] site = new int[seqs.length];// + } + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + +} diff --git a/src/main/java/japsa/bio/alignment/ProbFSM.java b/src/main/java/japsa/bio/alignment/ProbFSM.java index 660618a..4b7366d 100644 --- a/src/main/java/japsa/bio/alignment/ProbFSM.java +++ b/src/main/java/japsa/bio/alignment/ProbFSM.java @@ -29,7 +29,7 @@ /************************** REVISION HISTORY ************************** * 08/12/2014 - Minh Duc Cao: Created - * + * ****************************************************************************/ package japsa.bio.alignment; @@ -46,25 +46,29 @@ import japsa.seq.SequenceOutputStream; import japsa.util.ByteArray; import japsa.util.JapsaMath; -import japsa.util.Logging; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Implementation of alignment using a probabilistic finite state machine + * * @author minhduc - * - * TODO: Need to implement four possible modes of alignment: - * - local alignment - * - a local, b global - * - a global, b local - * - a,b global + *

+ * TODO: Need to implement four possible modes of alignment: + * - local alignment + * - a local, b global + * - a global, b local + * - a,b global */ -public abstract class ProbFSM { +public abstract class ProbFSM { + private static final Logger LOG = LoggerFactory.getLogger(ProbFSM.class); + + /** - * List of states: + * List of states: */ - MachineState [] states; + MachineState[] states; Sequence mSeq;//sequence belong to the model double insEmissionCost = 2, changeEmissionCost = JapsaMath.log2(3); @@ -74,26 +78,28 @@ /** * Reset all counts before each learning step */ - public void resetCount(){ - for (int i = 0; i < states.length;i++){ - states[i].countCopy - = states[i].countMutate = states[i].countIns = states[i].countDel = 0; + public void resetCount() { + for (int i = 0; i < states.length; i++) { + states[i].countCopy + = states[i].countMutate + = states[i].countIns + = states[i].countDel = 0; } } /** * Re-estimate parameters based on counts */ - public void reEstimate(){ + public void reEstimate() { int countM = 0, countC = 0; - for (int i = 0; i < states.length;i++){ + for (int i = 0; i < states.length; i++) { countM += states[i].countMutate; - countC += states[i].countCopy ; + countC += states[i].countCopy; states[i].setTransitionProb(states[i].countMutate + states[i].countCopy + 1.0, states[i].countIns + 1.0, states[i].countDel + 1.0); } double probC = (countC + 1.0) / (countM + countC + 2.0); - for (int i = 0; i < states.length;i++){ + for (int i = 0; i < states.length; i++) { states[i].setCopyProb(probC); //System.out.printf("State %s: %3d %3d %3d %3d %8.4f %8.4f %8.4f %8.4f %8.4f\n", states[i].name, states[i].countCopy, states[i].countMutate, states[i].countIns, states[i].countDel, states[i].matchProb, states[i].insProb,states[i].delProb, states[i].copyProb, states[i].changeProb); } @@ -103,22 +109,24 @@ public void reEstimate(){ * Show all the parameters of the machine */ - public void showProb(){ - for (int i = 0; i < states.length;i++){ - 
System.out.printf("Prob state %s : [%8.4f %8.4f %8.4f] [%8.4f %8.4f]\n", states[i].name, states[i].matchProb, states[i].insProb,states[i].delProb, states[i].copyProb, states[i].changeProb); + public void showProb() { + for (int i = 0; i < states.length; i++) { + System.out.printf("Prob state %s : [%8.4f %8.4f %8.4f] [%8.4f %8.4f]\n", states[i].name, states[i].matchProb, states[i].insProb, states[i].delProb, states[i].copyProb, states[i].changeProb); //System.out.printf("Cost state %s : %8.4f %8.4f %8.4f %8.4f %8.4f\n", states[i].name, states[i].matchCost, states[i].insCost,states[i].delCost, states[i].copyCost, states[i].changeCost); - } + } } - public void setModelSequence(Sequence seq){ + public void setModelSequence(Sequence seq) { mSeq = seq; } + /** * Generate a sequence by the machine + * * @param rnd * @return */ - public Sequence generate(Random rnd){ + public Sequence generate(Random rnd) { int indexSrc = 0; ByteArray byteArray = new ByteArray(mSeq.length() * 2); @@ -127,41 +135,41 @@ public Sequence generate(Random rnd){ double toss; @SuppressWarnings("unused") double cost = 0; - while (indexSrc < mSeq.length()){ + while (indexSrc < mSeq.length()) { toss = rnd.nextDouble();//chosing the transition //System.out.print(toss + " ==> "); - if (toss < currentState.delProb && currentState.delState != null){ + if (toss < currentState.delProb && currentState.delState != null) { //System.out.println("Gen D " + indexSrc); cost += currentState.delCost; - indexSrc ++; - }else if (toss < currentState.insProb + currentState.delProb && currentState.insState != null){ + indexSrc++; + } else if (toss < currentState.insProb + currentState.delProb && currentState.insState != null) { //System.out.println("Gen I " + indexSrc); //Insertion cost += currentState.insCost; //cost of emitting the inserted base - byteArray.add((byte)rnd.nextInt(4)); - cost += insEmissionCost; - }else{//match + byteArray.add((byte) rnd.nextInt(4)); + cost += insEmissionCost; + } else {//match cost += 
currentState.matchCost; toss = rnd.nextDouble();//copy or change byte base = mSeq.getBase(indexSrc); - if (toss < currentState.copyProb){ + if (toss < currentState.copyProb) { //copy //System.out.println("Gen C " + indexSrc); cost += currentState.copyCost; - byteArray.add(base); - }else{ + byteArray.add(base); + } else { //System.out.println("Gen M " + indexSrc); //change cost += currentState.changeCost; //need to toss again - byteArray.add((byte)((base + 1 + rnd.nextInt(3)) % 4)); - cost += changeEmissionCost; + byteArray.add((byte) ((base + 1 + rnd.nextInt(3)) % 4)); + cost += changeEmissionCost; } - indexSrc ++; + indexSrc++; } } @@ -171,31 +179,36 @@ public Sequence generate(Random rnd){ /** * Update counts based on the path of best alignment (backward pass) + * * @param emiss * @return */ - public int updateCount(Emission emiss){ + public int updateCount(Emission emiss) { int countEmis = 0; - while (true){ + while (true) { Emission bwdEmission = emiss.bwdEmission; if (bwdEmission == null) break; - switch (emiss.type){ + switch (emiss.type) { case INSERTION: - bwdEmission.toState.countIns ++;break; + bwdEmission.toState.countIns++; + break; case DELETION: - bwdEmission.toState.countDel ++;break; + bwdEmission.toState.countDel++; + break; case COPY: - bwdEmission.toState.countCopy ++;break; + bwdEmission.toState.countCopy++; + break; case MUTATE: - bwdEmission.toState.countMutate ++;break; + bwdEmission.toState.countMutate++; + break; default: - break; + break; } - countEmis ++; - emiss = bwdEmission; + countEmis++; + emiss = bwdEmission; } return countEmis; } @@ -203,21 +216,22 @@ public int updateCount(Emission emiss){ /** * Backward pass: only print out the path in reverse. 
Consider updateCount * for learning + * * @param emiss * @return */ - public int backward(Emission emiss){ + public int backward(Emission emiss) { int countEmis = 0; - while (true){ + while (true) { Emission bwdEmission = emiss.bwdEmission; if (bwdEmission == null) break; - switch (emiss.type){ + switch (emiss.type) { case INSERTION: System.out.println(bwdEmission.toState.name + " I"); - break; + break; case DELETION: System.out.println(bwdEmission.toState.name + " D"); break; @@ -234,30 +248,30 @@ public int backward(Emission emiss){ System.out.println(bwdEmission.toState.name + " F"); break; } - countEmis ++; - emiss = bwdEmission; + countEmis++; + emiss = bwdEmission; } return countEmis; } - - public void printAlignment(Emission emiss, Sequence gSeq, SequenceOutputStream out) throws IOException{ + + public void printAlignment(Emission emiss, Sequence gSeq, SequenceOutputStream out) throws IOException { SequenceBuilder sbm = new SequenceBuilder(Alphabet.DNA(), 1000, mSeq.getName()); SequenceBuilder sbg = new SequenceBuilder(Alphabet.DNA(), 1000, gSeq.getName()); StringBuilder sb = new StringBuilder(1000); - - while (true){ + + while (true) { Emission bwdEmission = emiss.bwdEmission; if (bwdEmission == null) break; - switch (emiss.type){ + switch (emiss.type) { case INSERTION: sbg.append(gSeq.getBase(emiss.gPos)); - sbm.append((byte)DNA.GAP); + sbm.append((byte) DNA.GAP); sb.append("I"); - break; + break; case DELETION: - sbg.append((byte)DNA.GAP); + sbg.append((byte) DNA.GAP); sbm.append(mSeq.getBase(emiss.mPos)); sb.append("D"); break; @@ -273,105 +287,99 @@ public void printAlignment(Emission emiss, Sequence gSeq, SequenceOutputStream o break; case INSERTION_FIRST: sbg.append(gSeq.getBase(emiss.gPos)); - sbm.append((byte)DNA.GAP); + sbm.append((byte) DNA.GAP); sb.append("L"); break; case DELETION_FIRST: - sbg.append((byte)DNA.GAP); + sbg.append((byte) DNA.GAP); sbm.append(mSeq.getBase(emiss.mPos)); sb.append("F"); break; - } - emiss = bwdEmission; + } + emiss = 
bwdEmission; } - - - + + int nameLength = 32; - int done = sbm.length(); - while (done > 0){ + int done = sbm.length(); + while (done > 0) { int n = Math.min(60, done); String mName = sbm.getName(); String gName = sbg.getName(); - - if (mName.length() > nameLength -1) - mName = mName.substring(0, nameLength -1); - + + if (mName.length() > nameLength - 1) + mName = mName.substring(0, nameLength - 1); + if (gName.length() > nameLength) - gName = gName.substring(0, nameLength -1); - - + gName = gName.substring(0, nameLength - 1); + + out.print(mName); for (int i = mName.length(); i < nameLength; i++) out.print(' '); - + for (int i = 1; i <= n; i++) out.print(sbm.charAt(done - i)); out.println(); - + out.print(gName); for (int i = gName.length(); i < nameLength; i++) - out.print(' '); - + out.print(' '); + for (int i = 1; i <= n; i++) out.print(sbg.charAt(done - i)); out.println(); - + for (int i = 0; i < nameLength; i++) out.print(' '); - + for (int i = 1; i <= n; i++) out.print(sb.charAt(done - i)); out.println(); - + out.println(); done -= n; - } + } } - - /************************************************************************/ /** * Forward pass: find the best path to align a sequence + * * @param genSeq * @return */ - public Emission align(Sequence genSeq){ - //return state - //JapsaTimer timer = new JapsaTimer(); - //timer.systemInfo(); - - Emission retEmission = new Emission(states[0], mSeq.length()-1, genSeq.length() -1); + public Emission align(Sequence genSeq) { + Emission retEmission = new Emission(states[0], mSeq.length() - 1, genSeq.length() - 1); retEmission.myCost = genSeq.length() * (insEmissionCost + 4); - - Emission currentEmission, finalEmission;//current pointer and last pointer on the linked-list - currentEmission = finalEmission = new Emission(states[0],-1,-1); - currentEmission.myCost = 0; + + Emission currentEmission, finalEmission;//current pointer and last pointer on the linked-list + currentEmission = finalEmission = new Emission(states[0], 
-1, -1); + currentEmission.myCost = 0; HashMap hash = new HashMap(); - while (currentEmission != null){//linked list not exhausted - if (currentEmission.gPos >= genSeq.length() - 1){ + while (currentEmission != null) {//linked list not exhausted + if (currentEmission.gPos >= genSeq.length() - 1) { //done generating genSeq - if (currentEmission.myCost < retEmission.myCost){ - retEmission = currentEmission; + if (currentEmission.myCost < retEmission.myCost) { + retEmission = currentEmission; } - }else if (currentEmission.myCost < retEmission.myCost){ + } else if (currentEmission.myCost < retEmission.myCost) { String hashKey; Emission nextEmission; double cost; - //1. consider deletion if profile has something to offer - if (currentEmission.mPos + 1 < mSeq.length() && currentEmission.toState.delState != null){ + //1. consider deletion if profile has something to offer + if (currentEmission.mPos + 1 < mSeq.length() && currentEmission.toState.delState != null) { cost = currentEmission.myCost + currentEmission.toState.delCost; - hashKey= Emission.hashKey(currentEmission.toState.delState.name, currentEmission.mPos + 1, currentEmission.gPos); + hashKey = Emission.hashKey(currentEmission.toState.delState.name, currentEmission.mPos + 1, currentEmission.gPos); nextEmission = hash.get(hashKey); - if (nextEmission == null){ + if (nextEmission == null) { nextEmission = new Emission(currentEmission.toState.delState, currentEmission.mPos + 1, currentEmission.gPos); nextEmission.type = EmissionType.DELETION; @@ -381,8 +389,8 @@ public Emission align(Sequence genSeq){ finalEmission.next = nextEmission; finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; nextEmission.type = EmissionType.DELETION; @@ -390,13 +398,13 @@ public Emission align(Sequence genSeq){ }//else - if nextstate != null }//if mPos - 
//2. insertion - if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.toState.insState != null){ + //2. insertion + if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.toState.insState != null) { cost = currentEmission.myCost + currentEmission.toState.insCost + this.insEmissionCost; - hashKey= Emission.hashKey(currentEmission.toState.insState.name, currentEmission.mPos, currentEmission.gPos + 1); + hashKey = Emission.hashKey(currentEmission.toState.insState.name, currentEmission.mPos, currentEmission.gPos + 1); nextEmission = hash.get(hashKey); - if (nextEmission == null){ + if (nextEmission == null) { nextEmission = new Emission(currentEmission.toState.insState, currentEmission.mPos, currentEmission.gPos + 1); nextEmission.myCost = cost; nextEmission.type = EmissionType.INSERTION; @@ -406,8 +414,8 @@ public Emission align(Sequence genSeq){ finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; nextEmission.type = EmissionType.INSERTION; @@ -416,21 +424,21 @@ public Emission align(Sequence genSeq){ } //3.Match - if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.mPos + 1 < mSeq.length()){ + if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.mPos + 1 < mSeq.length()) { EmissionType type = EmissionType.COPY; - if (mSeq.getBase(currentEmission.mPos + 1) == genSeq.getBase(currentEmission.gPos + 1)){ - cost = currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.copyCost; - }else{ + if (mSeq.getBase(currentEmission.mPos + 1) == genSeq.getBase(currentEmission.gPos + 1)) { + cost = currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.copyCost; + } else { cost = currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.changeCost + 
this.changeEmissionCost; type = EmissionType.MUTATE; } - if (cost < retEmission.myCost){ - hashKey= Emission.hashKey(currentEmission.toState.matchState.name, currentEmission.mPos + 1, currentEmission.gPos +1); + if (cost < retEmission.myCost) { + hashKey = Emission.hashKey(currentEmission.toState.matchState.name, currentEmission.mPos + 1, currentEmission.gPos + 1); nextEmission = hash.get(hashKey); - if (nextEmission == null){ - nextEmission = new Emission(currentEmission.toState.matchState, currentEmission.mPos + 1, currentEmission.gPos +1); + if (nextEmission == null) { + nextEmission = new Emission(currentEmission.toState.matchState, currentEmission.mPos + 1, currentEmission.gPos + 1); nextEmission.type = type; nextEmission.myCost = cost; @@ -438,8 +446,8 @@ public Emission align(Sequence genSeq){ finalEmission.next = nextEmission; finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; nextEmission.type = type; @@ -451,59 +459,60 @@ public Emission align(Sequence genSeq){ //helping GC to gabbabe collect current state Emission tmp = currentEmission.next; - - hash.remove(Emission.hashKey(currentEmission.toState.name, currentEmission.mPos, currentEmission.gPos)); + + hash.remove(Emission.hashKey(currentEmission.toState.name, currentEmission.mPos, currentEmission.gPos)); currentEmission.next = null; - + currentEmission = tmp; } - - Logging.info("Hash = " + hash.size()); - + + LOG.info("Hash = " + hash.size()); + //timer.systemInfo(); //Runtime.getRuntime().gc(); //timer.systemInfo(); - - //timer.mark("toc"); + + //timer.mark("toc"); return retEmission; } + /************************************************************** * Align with option to generate 2 bits before and after mSeq * @param genSeq * @return */ - public Emission alignGenerative(Sequence genSeq){ + public Emission 
alignGenerative(Sequence genSeq) { //return state - Emission retEmission = new Emission(states[0], mSeq.length()-1, genSeq.length() -1); + Emission retEmission = new Emission(states[0], mSeq.length() - 1, genSeq.length() - 1); retEmission.myCost = genSeq.length() * (insEmissionCost + 4); - - Emission currentEmission, finalEmission;//current pointer and last pointer on the linked-list - currentEmission = finalEmission = new Emission(states[0],-1,-1); - currentEmission.myCost = 0; + + Emission currentEmission, finalEmission;//current pointer and last pointer on the linked-list + currentEmission = finalEmission = new Emission(states[0], -1, -1); + currentEmission.myCost = 0; HashMap hash = new HashMap(); - while (currentEmission != null){//linked list not exhausted - if (currentEmission.gPos >= genSeq.length() - 1){ + while (currentEmission != null) {//linked list not exhausted + if (currentEmission.gPos >= genSeq.length() - 1) { //done generating genSeq - if (currentEmission.myCost < retEmission.myCost){ - retEmission = currentEmission; + if (currentEmission.myCost < retEmission.myCost) { + retEmission = currentEmission; } - }else if (currentEmission.myCost < retEmission.myCost){ + } else if (currentEmission.myCost < retEmission.myCost) { String hashKey; Emission nextEmission; double cost; - //1. consider deletion if profile has something to offer - if (currentEmission.mPos + 1 < mSeq.length() && currentEmission.toState.delState != null){ + //1. 
consider deletion if profile has something to offer + if (currentEmission.mPos + 1 < mSeq.length() && currentEmission.toState.delState != null) { if (currentEmission.gPos < 0) cost = currentEmission.myCost; else cost = currentEmission.myCost + currentEmission.toState.delCost; - hashKey= Emission.hashKey(currentEmission.toState.delState.name, currentEmission.mPos + 1, currentEmission.gPos); + hashKey = Emission.hashKey(currentEmission.toState.delState.name, currentEmission.mPos + 1, currentEmission.gPos); nextEmission = hash.get(hashKey); - if (nextEmission == null){ + if (nextEmission == null) { nextEmission = new Emission(currentEmission.toState.delState, currentEmission.mPos + 1, currentEmission.gPos); if (currentEmission.gPos < 0) nextEmission.type = EmissionType.DELETION_FIRST; @@ -516,8 +525,8 @@ public Emission alignGenerative(Sequence genSeq){ finalEmission.next = nextEmission; finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; if (currentEmission.gPos < 0) @@ -528,19 +537,19 @@ public Emission alignGenerative(Sequence genSeq){ }//else - if nextstate != null }//if mPos - //2. insertion - if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.toState.insState != null){ - if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length()-1) + //2. 
insertion + if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.toState.insState != null) { + if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length() - 1) cost = currentEmission.myCost + 2; else cost = currentEmission.myCost + currentEmission.toState.insCost + this.insEmissionCost; - hashKey= Emission.hashKey(currentEmission.toState.insState.name, currentEmission.mPos, currentEmission.gPos + 1); + hashKey = Emission.hashKey(currentEmission.toState.insState.name, currentEmission.mPos, currentEmission.gPos + 1); nextEmission = hash.get(hashKey); - if (nextEmission == null){ + if (nextEmission == null) { nextEmission = new Emission(currentEmission.toState.insState, currentEmission.mPos, currentEmission.gPos + 1); nextEmission.myCost = cost; - if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length()-1) + if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length() - 1) nextEmission.type = EmissionType.INSERTION_FIRST; else nextEmission.type = EmissionType.INSERTION; @@ -550,34 +559,34 @@ public Emission alignGenerative(Sequence genSeq){ finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; - if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length()-1) + if (currentEmission.mPos < 0 || currentEmission.mPos >= mSeq.length() - 1) nextEmission.type = EmissionType.INSERTION_FIRST; else - nextEmission.type = EmissionType.INSERTION; + nextEmission.type = EmissionType.INSERTION; } } } //3.Match - if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.mPos + 1 < mSeq.length()){ + if (currentEmission.gPos + 1 < genSeq.length() && currentEmission.mPos + 1 < mSeq.length()) { EmissionType type = EmissionType.COPY; - if (mSeq.getBase(currentEmission.mPos + 1) == genSeq.getBase(currentEmission.gPos + 1)){ - cost = 
currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.copyCost; - }else{ + if (mSeq.getBase(currentEmission.mPos + 1) == genSeq.getBase(currentEmission.gPos + 1)) { + cost = currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.copyCost; + } else { cost = currentEmission.myCost + currentEmission.toState.matchCost + currentEmission.toState.changeCost + this.changeEmissionCost; type = EmissionType.MUTATE; } - if (cost < retEmission.myCost){ - hashKey= Emission.hashKey(currentEmission.toState.matchState.name, currentEmission.mPos + 1, currentEmission.gPos +1); + if (cost < retEmission.myCost) { + hashKey = Emission.hashKey(currentEmission.toState.matchState.name, currentEmission.mPos + 1, currentEmission.gPos + 1); nextEmission = hash.get(hashKey); - if (nextEmission == null){ - nextEmission = new Emission(currentEmission.toState.matchState, currentEmission.mPos + 1, currentEmission.gPos +1); + if (nextEmission == null) { + nextEmission = new Emission(currentEmission.toState.matchState, currentEmission.mPos + 1, currentEmission.gPos + 1); nextEmission.type = type; nextEmission.myCost = cost; @@ -585,8 +594,8 @@ public Emission alignGenerative(Sequence genSeq){ finalEmission.next = nextEmission; finalEmission = nextEmission; nextEmission.bwdEmission = currentEmission; - }else{ - if (nextEmission.myCost > cost){ + } else { + if (nextEmission.myCost > cost) { nextEmission.myCost = cost; nextEmission.bwdEmission = currentEmission; nextEmission.type = type; @@ -603,10 +612,16 @@ public Emission alignGenerative(Sequence genSeq){ } return retEmission; } + /********************************************************************/ - public static enum EmissionType {COPY, MUTATE, INSERTION, DELETION, INSERTION_FIRST, DELETION_FIRST}; - public static class Emission{ + public static enum EmissionType { + COPY, MUTATE, INSERTION, DELETION, INSERTION_FIRST, DELETION_FIRST + } + + ; + + public static class Emission { //This 
pointer is for implementing dynamic programming using a linked-list Emission next = null; EmissionType type; @@ -621,27 +636,33 @@ public Emission alignGenerative(Sequence genSeq){ //String hashKey; - public Emission(MachineState state, int mP, int gP){ + public Emission(MachineState state, int mP, int gP) { toState = state; mPos = mP; gPos = gP; myCost = Double.MAX_VALUE; } - static public String hashKey(String name, int mPos, int gPos){ + static public String hashKey(String name, int mPos, int gPos) { return name + "_" + mPos + "_" + gPos; } - public double getScore(){ + + public double getScore() { return myCost; - } + } } - public static class MachineState{ + /** + * Implement a machine state + * + * @author minhduc + */ + public static class MachineState { double copyProb = 0.9, changeProb = 0.1; double matchProb = 0.8, insProb = 0.14, delProb = 0.06; - double copyCost, changeCost, matchCost, insCost, delCost; + double copyCost, changeCost, matchCost, insCost, delCost; //For training int countIns = 0, countDel = 0, countCopy = 0, countMutate = 0; @@ -649,33 +670,34 @@ public double getScore(){ MachineState insState, delState, matchState; String name; - public MachineState(String name){ + public MachineState(String name) { this.name = name; } - public void setTransitionProb(double mP, double iP, double dP){ + public void setTransitionProb(double mP, double iP, double dP) { double sum = mP + iP + dP; - matchProb = mP/ sum; - insProb = iP/sum; - delProb = dP /sum; + matchProb = mP / sum; + insProb = iP / sum; + delProb = dP / sum; - matchCost = (matchProb > 0)? -JapsaMath.log2(matchProb):Double.MAX_VALUE; - insCost = (insProb > 0)? -JapsaMath.log2(insProb):Double.MAX_VALUE; - delCost = (delProb > 0)? -JapsaMath.log2(delProb):Double.MAX_VALUE; + matchCost = (matchProb > 0) ? -JapsaMath.log2(matchProb) : Double.MAX_VALUE; + insCost = (insProb > 0) ? -JapsaMath.log2(insProb) : Double.MAX_VALUE; + delCost = (delProb > 0) ? 
-JapsaMath.log2(delProb) : Double.MAX_VALUE; } - public void setCopyProb(double cP){ + + public void setCopyProb(double cP) { copyProb = cP; changeProb = 1 - copyProb; - copyCost = (copyProb > 0)? -JapsaMath.log2(copyProb):Double.MAX_VALUE; - changeCost = (changeProb > 0)? -JapsaMath.log2(changeProb):Double.MAX_VALUE; + copyCost = (copyProb > 0) ? -JapsaMath.log2(copyProb) : Double.MAX_VALUE; + changeCost = (changeProb > 0) ? -JapsaMath.log2(changeProb) : Double.MAX_VALUE; } } - public static class ProbThreeSM extends ProbFSM{ - public ProbThreeSM(Sequence seq){ + public static class ProbThreeSM extends ProbFSM { + public ProbThreeSM(Sequence seq) { states = new MachineState[3]; states[0] = new MachineState("S"); @@ -683,7 +705,7 @@ public ProbThreeSM(Sequence seq){ states[2] = new MachineState("D"); states[0].matchState = states[0]; - states[0].insState = states[1]; + states[0].insState = states[1]; states[0].delState = states[2]; states[0].setCopyProb(0.9); states[0].setTransitionProb(.8, 0.1, 0.1); @@ -706,24 +728,24 @@ public ProbThreeSM(Sequence seq){ this.mSeq = seq; } - public MachineState getMatState(){ + public MachineState getMatState() { return states[0]; } - public MachineState getInsState(){ + public MachineState getInsState() { return states[1]; } - public MachineState getDelState(){ + public MachineState getDelState() { return states[2]; } } - public static class ProbOneSM extends ProbFSM{ - public ProbOneSM(Sequence seq){ + public static class ProbOneSM extends ProbFSM { + public ProbOneSM(Sequence seq) { states = new MachineState[1]; states[0] = new MachineState("S"); - states[0].insState = states[0].delState = states[0].matchState = states[0]; + states[0].insState = states[0].delState = states[0].matchState = states[0]; states[0].setCopyProb(0.9); states[0].setTransitionProb(.85, 0.07, 0.08); @@ -731,11 +753,10 @@ public ProbOneSM(Sequence seq){ this.mSeq = seq; } - public MachineState getState(){ + public MachineState getState() { return 
states[0]; } } - } diff --git a/src/main/java/japsa/bio/alignment/ProfileDP.java b/src/main/java/japsa/bio/alignment/ProfileDP.java new file mode 100644 index 0000000..6017c91 --- /dev/null +++ b/src/main/java/japsa/bio/alignment/ProfileDP.java @@ -0,0 +1,478 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 10/08/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsa.bio.alignment; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Random; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.util.ByteArray; +import japsa.util.JapsaMath; + + +/** + * This is a one-state finite automata + * @author minhduc + * + */ +public class ProfileDP { + // cost from a match state + double matProb, + insProb, + delProb; + + double matchProb,misMatchProb; + + private double matCost, delCost, insCost; + + public double getMatCost() { + return matCost; + } + + public double getDelCost() { + return delCost; + } + + public double getInsCost() { + return insCost; + } + + public double getMatchCost() { + return matchCost; + } + + public double getMisMatchCost() { + return misMatchCost; + } + + private double matchCost, misMatchCost; + + Alphabet alphabet = Alphabet.DNA6(); + Sequence profileSeq; + int repStart, repEnd; + + + + public ProfileDP(Sequence seq, int repStart, int repEnd){ + profileSeq = seq; + this.repEnd = repEnd; + this.repStart = repStart; + setTransitionProbability(0.95,0.025,0.025); + setMatchProbability(0.85/0.95); + } + + /**************************************************/ + public void setTransitionProbability(double matP, double insP, double delP){ + + double sum = matP + insP + delP; + matProb = matP / sum; + insProb = insP / sum; + delProb = delP / sum; + + matCost = -JapsaMath.log2(matProb); + insCost = -JapsaMath.log2(insProb) + 2; + delCost = -JapsaMath.log2(delProb); + } + + public void setMatchProbability(double matP){ + matchProb = matP; + misMatchProb = (1.0 - matchProb); + + matchCost = - JapsaMath.log2(matchProb); + misMatchCost = -JapsaMath.log2(misMatchProb)- 
JapsaMath.log2(1.0/3.0); + } + + public int getProfileLength(){ + return profileSeq.length(); + } + /** Align seq to the profile by dynamic programming over a linked list of states; returns the lowest-score state that has consumed all of seq, or the sentinel if none beats it. */ + + public EmissionState align(Sequence seq){ + //sentinel return state, built in constructor order (seqPos, profilePos, iter); candidates are only kept while cheaper than its score + EmissionState retState = new EmissionState(seq.length() - 1, profileSeq.length() - 1, 0); + retState.score = seq.length() * (insCost + 2); + + EmissionState currentState, lastState; + currentState = lastState = new EmissionState(-1,-1,0); + currentState.score = 0; + + + HashMap<String, EmissionState> hash = new HashMap<String, EmissionState>(); + while (currentState != null){ + if (currentState.seqPos >= seq.length() - 1){ + if (currentState.score < retState.score){ + retState = currentState; + currentState = currentState.next; + continue; + } + } + + String hashKey; + EmissionState nextState; + double cost; + + int iterAdvance = 0; + //if it is about to enter the repeat + if (currentState.profilePos + 1 == repStart){ + iterAdvance = 1; + } + + //1. consider deletion if profile has something to offer + if (currentState.profilePos + 1 < profileSeq.length()){ + cost = currentState.score + delCost; + + if (cost < retState.score){ + hashKey= EmissionState.hashKey(currentState.seqPos, currentState.profilePos + 1, currentState.iter + iterAdvance); + nextState = hash.get(hashKey); + if (nextState == null){ + nextState = new EmissionState(currentState.seqPos, currentState.profilePos + 1, currentState.iter + iterAdvance); + nextState.score = cost; + hash.put(hashKey, nextState); + lastState.next = nextState; + lastState = nextState; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel + 1; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; +
nextState.countMB = currentState.countMB; + } + }//else - if nextstate != null + }//if cost + }//if profile + + //2. insertion + if (currentState.seqPos + 1 < seq.length()){ + cost = currentState.score + insCost; + if (cost < retState.score){ + //note: this does not advance on the profile, thus no need to add iterAdvance + hashKey= EmissionState.hashKey(currentState.seqPos + 1, currentState.profilePos, currentState.iter); + nextState = hash.get(hashKey); + if (nextState == null){ + nextState = new EmissionState(currentState.seqPos + 1, currentState.profilePos, currentState.iter); + nextState.score = cost; + hash.put(hashKey, nextState); + lastState.next = nextState; + lastState = nextState; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns + 1; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + }else{ + if (nextState.score > cost){ + nextState.score = cost; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns + 1; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + } + } + } + } + + //3.Match + if (currentState.seqPos + 1 < seq.length() && currentState.profilePos + 1 < profileSeq.length()){ + cost = currentState.score + matCost + (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(currentState.profilePos + 1)? 
matchCost:misMatchCost); + if (cost < retState.score){ + hashKey= EmissionState.hashKey(currentState.seqPos + 1, currentState.profilePos +1, currentState.iter + iterAdvance); + nextState = hash.get(hashKey); + if (nextState == null){ + nextState = new EmissionState(currentState.seqPos + 1, currentState.profilePos+1, currentState.iter + iterAdvance); + nextState.score = cost; + hash.put(hashKey, nextState); + lastState.next = nextState; + lastState = nextState; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + + if (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(currentState.profilePos + 1)){ + nextState.countMG ++; + }else + nextState.countMB ++; + + + }else{ + if (nextState.score > cost){ + nextState.score = cost; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + + if (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(currentState.profilePos + 1)){ + nextState.countMG ++; + }else + nextState.countMB ++; + } + } + } + }//match + + + //Consider jumping to the beginning of the rep + //Note I dont need iterAdvance here (it is 0 anyway) + if (currentState.profilePos == repEnd){ + cost = currentState.score + delCost; + if (cost < retState.score){ + hashKey= EmissionState.hashKey(currentState.seqPos, repStart, currentState.iter + 1); + nextState = hash.get(hashKey); + if (nextState == null){ + nextState = new EmissionState(currentState.seqPos, repStart, currentState.iter + 1); + nextState.score = cost; + hash.put(hashKey, nextState); + lastState.next = nextState; + lastState = nextState; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel + 1; + nextState.countIns = 
currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + }else{ + if (nextState.score > cost){ + nextState.score = cost; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel + 1; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + } + } + }//if (del) + + //Match the next base of seq against the profile base at repStart, i.e. jump back to the start of the repeat (iter + 1) + if (currentState.seqPos + 1 < seq.length() && currentState.profilePos + 1 < profileSeq.length()){ + cost = currentState.score + matCost + (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(repStart)? matchCost:misMatchCost); + if (cost < retState.score){ + hashKey= EmissionState.hashKey(currentState.seqPos + 1, repStart, currentState.iter + 1); + nextState = hash.get(hashKey); + if (nextState == null){ + nextState = new EmissionState(currentState.seqPos + 1, repStart, currentState.iter + 1); + nextState.score = cost; + hash.put(hashKey, nextState); + lastState.next = nextState; + lastState = nextState; + nextState.bwdState = currentState; + + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + + if (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(repStart)){ + nextState.countMG ++; + }else + nextState.countMB ++; + + }else{ + if (nextState.score > cost){ + nextState.score = cost; + nextState.bwdState = currentState; + + nextState.countDel = currentState.countDel; + nextState.countIns = currentState.countIns; + nextState.countMG = currentState.countMG; + nextState.countMB = currentState.countMB; + + if (seq.getBase(currentState.seqPos + 1) == profileSeq.getBase(repStart)){ + nextState.countMG ++; + }else + nextState.countMB ++; + } + } + } + } + } + EmissionState tmp = currentState.next; + + //helping GC to garbage collect current state +
hash.remove(EmissionState.hashKey(currentState.seqPos, currentState.profilePos, currentState.iter)); + currentState.next = null; + currentState = tmp; + } + //System.out.printf("Consider %d states\n",complexity); + + System.out.printf("Estimate: %2d %3d %3d %3d %3d %8.4f %8.4f\n",retState.iter, retState.countMG, retState.countMB, + retState.countIns, retState.countDel, retState.score, + retState.countMG * (matCost + matchCost) + retState.countMB *(matCost + misMatchCost) + + retState.countIns * insCost + retState.countDel * delCost); return retState;//bestScore; + } + + public static class EmissionState{ + int countDel = 0, countIns = 0, countMG = 0, countMB = 0; + + EmissionState next = null; + public int seqPos, profilePos, iter; + + public EmissionState bwdState = null; + public double score; + + String hashKey; + // int count = 0;//how many in + + public EmissionState(int sPos, int pPos, int i){ + seqPos = sPos; + profilePos = pPos; + iter = i; + score = Double.MAX_VALUE; + //hashKey = hashKey(gPos, mPos, iter); + } + + static public String hashKey(int sPos, int pPos, int iter){ + return String.format("%5d %4d %2d",sPos, pPos, iter); + } + public double getScore(){ + return score; + } + public int getIter(){ + return iter; + } + + /** + * @return the countDel + */ + public int getCountDel() { + return countDel; + } + + /** + * @return the countIns + */ + public int getCountIns() { + return countIns; + } + + /** + * @return the countMG + */ + public int getCountMG() { + return countMG; + } + + /** + * @return the countMB + */ + public int getCountMB() { + return countMB; + } + } + + public Sequence generate(int iter, Random rnd) throws IOException{ + ByteArray bArray = new ByteArray(); + int profilePos = 0; + + double cost = 0; + byte nuc; + double prob; + int myIter = 0; + int countMG = 0, countMB = 0, countDel = 0, countIns = 0; + while (profilePos < this.profileSeq.length()){ + prob = rnd.nextDouble(); + if (prob < delProb){ + cost += delCost; + nuc = 
this.profileSeq.getBase(profilePos); + //datOutGen.print("D " + mPos + " " + bArray.size() + " " + alphabet.int2char(nuc)+"\n"); + profilePos ++; + countDel ++; + }else if (prob < insProb + delProb){ + cost += insCost; + nuc = (byte)rnd.nextInt(4); + //datOutGen.print("I " + mPos + " " + bArray.size() + " " + alphabet.int2char(nuc)+"\n"); + bArray.add(nuc); + countIns ++; + }else{ + //match + prob = rnd.nextDouble(); + byte nextByte = this.profileSeq.getBase(profilePos); + //System.out.println(mPos + " " + bArray.size()); + + if (prob < matchProb){ + cost += matCost + matchCost; + // System.out.println(mPos + " " + bArray.size()); + nuc = this.profileSeq.getBase(profilePos); + //datOutGen.print("= " + mPos + " " + bArray.size() + " " + alphabet.int2char(nuc)+'\n'); + bArray.add(nextByte); + countMG ++; + }else{ + cost += matCost + misMatchCost; + nuc = (byte) ((nextByte + rnd.nextInt(3) + 1) % 4); + //datOutGen.print("X " + mPos + " " + bArray.size() + " " + alphabet.int2char(nextByte) + " " + alphabet.int2char(nuc)+'\n'); + //System.out.println("B " + bArray.size() + " " + mPos + " " + this.profileSeq.getBase(mPos) + " " + nextByte); + bArray.add(nuc); + countMB ++; + } + profilePos ++; + } + + if (profilePos == this.repEnd && myIter < iter){ + profilePos = this.repStart; + myIter ++; + } + //System.out.println(); + } + //datOutGen.print("GEN: " + countMG + " " +countMB + " " + countIns + " " + countDel + " " + cost + "\n"); + + + System.out.printf("Generate: %2d %3d %3d %3d %3d %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f %8.4f \n", myIter, countMG, countMB, countIns, countDel, cost, + countMG * (matCost + matchCost) + countMB *(matCost + misMatchCost) + + countIns * insCost + countDel * delCost, + matProb, insProb, delProb, matchProb, misMatchProb); + return new Sequence(Alphabet.DNA4(), bArray, "gen"); + } +} diff --git a/src/main/java/japsa/bio/alignment/ppfsm/Emission.java b/src/main/java/japsa/bio/alignment/ppfsm/Emission.java new file mode 100644 index 
0000000..0adc12d --- /dev/null +++ b/src/main/java/japsa/bio/alignment/ppfsm/Emission.java @@ -0,0 +1,35 @@ +package japsa.bio.alignment.ppfsm; + +import japsa.bio.alignment.ppfsm.state.MachineState; + +public class Emission { + //This pointer is for implementing dynamic programming using a linked-list + Emission next = null; + + //These are for backward/forward + public Emission fwdEmission = null; + public Emission bwdEmission = null; + + public int gPos, iteration; + MachineState toState; + + public double myCost; + + //String hashKey; + + public Emission(MachineState state, int gP, int iter) { + toState = state; + gPos = gP; + iteration = iter; + myCost = Double.MAX_VALUE; + } + + static public String hashKey(MachineState state, int gPos, int iter) { + return state.getName() + "_" + gPos + "_" + iter; + + } + + public double getScore() { + return myCost; + } +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/alignment/ppfsm/ProfilePFSM.java b/src/main/java/japsa/bio/alignment/ppfsm/ProfilePFSM.java new file mode 100644 index 0000000..e25fe24 --- /dev/null +++ b/src/main/java/japsa/bio/alignment/ppfsm/ProfilePFSM.java @@ -0,0 +1,213 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 31/01/2017 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsa.bio.alignment.ppfsm; + +import java.util.ArrayList; +import java.util.HashMap; + +import japsa.bio.alignment.ppfsm.state.MachineState; +import japsa.bio.alignment.ppfsm.transition.Transition; +import japsa.seq.Sequence; +import japsa.util.JapsaMath; + +/** + * Implementing a flexible profile ProbFSM. 
This machine would have each 3
+ * state corresponding to a base of the underlying sequence
+ * @author minhduc
+ *
+ */
+public class ProfilePFSM{
+    MachineState startState;
+    MachineState endState;
+
+    // Pruning rule used by align(): once an emission is past PRUNE_MIN_POS bases,
+    // it is dropped if its cost exceeds PRUNE_MAX_COST_PER_BASE per base consumed.
+    private static final int PRUNE_MIN_POS = 30;
+    private static final double PRUNE_MAX_COST_PER_BASE = 2.5;
+    // A progress line is printed every PROGRESS_INTERVAL processed emissions.
+    private static final int PROGRESS_INTERVAL = 10000;
+
+    static double costCopyCopy = 1, costCopyMismatch = 2, costCopyDelete = 4, costCopyInsert = 4,
+            costDeleteCopy = 1, costDeleteMismatch = 2, costDeleteDelete = 4,
+            costInsertCopy = 1, costInsertMismatch = 2, costInsertInsert = 4;
+
+    // Replace the placeholder costs above with -log2 of the transition probabilities.
+    static{
+        double matProb = 0.95,
+                insProb = 0.025,
+                delProb = 0.025;
+
+        costInsertInsert = costCopyInsert = -JapsaMath.log2(insProb);
+        costDeleteDelete = costCopyDelete = -JapsaMath.log2(delProb);
+
+        double matCost = -JapsaMath.log2(matProb);
+
+        costDeleteCopy = costInsertCopy = costCopyCopy = matCost - JapsaMath.log2(0.9);
+        costDeleteMismatch = costInsertMismatch = costCopyMismatch = matCost - JapsaMath.log2(0.1) - JapsaMath.log2(1.0/3.0);
+    }
+
+    /**
+     * @param start state every alignment starts from
+     * @param end   state every complete alignment must reach
+     */
+    public ProfilePFSM(MachineState start, MachineState end){
+        this.startState = start;
+        this.endState = end;
+    }
+
+    /**
+     * Align a sequence against this machine and return the cheapest emission
+     * that reaches the end state with the whole sequence consumed. The path
+     * can be traced back through {@code bwdEmission}.
+     *
+     * If no such emission is cheaper than the initial upper bound
+     * {@code genSeq.length() * (costCopyInsert + 2)}, the placeholder emission
+     * carrying that bound (with no backward link) is returned.
+     *
+     * @param genSeq the sequence to align
+     * @return the best end-state emission found, or the placeholder
+     */
+    public Emission align(Sequence genSeq) {
+        final int lastPos = genSeq.length() - 1;
+
+        // Best complete alignment so far; starts as a placeholder holding an upper bound
+        Emission retEmission = new Emission(endState, lastPos, 0);
+        retEmission.myCost = genSeq.length() * (costCopyInsert + 2);//plus 2 because of insert
+
+        // Work list: singly linked through Emission.next, processed head to tail
+        Emission currentEmission = new Emission(startState, -1, 0);
+        Emission lastEmission = currentEmission;
+        currentEmission.myCost = 0;
+
+        // Emissions still waiting in the work list, keyed by Emission.hashKey
+        HashMap<String, Emission> hash = new HashMap<String, Emission>();
+        int count = 0;
+        while (currentEmission != null){
+            if (currentEmission.gPos > PRUNE_MIN_POS
+                    && currentEmission.myCost > PRUNE_MAX_COST_PER_BASE * currentEmission.gPos){
+                currentEmission = retire(hash, currentEmission);
+                continue;
+            }
+
+            count ++;
+            if (count % PROGRESS_INTERVAL == 0){
+                System.out.println(count + " " + hash.size() + " "
+                        + currentEmission.toState.getName() + " "
+                        + currentEmission.iteration + " "
+                        + currentEmission.gPos + " "
+                        + currentEmission.myCost + "###"
+                        + retEmission.myCost + " " + retEmission.iteration);
+            }
+
+            if (currentEmission.toState == endState){
+                // At the end state: keep it if it beats the current best
+                if (currentEmission.myCost < retEmission.myCost){
+                    //TODO: May need to remove the emission links of the existing ret to make things easy for GC
+                    retEmission = currentEmission;
+                }
+                currentEmission = retire(hash, currentEmission);
+                continue;
+            }
+
+            //assert: the current state is NOT the end
+            ArrayList<Transition> transitions = currentEmission.toState.getTransitions();
+            for (Transition tran: transitions){
+                int nextPos = -2;   // -2 marks "transition not applicable from here"
+                byte nextBase = -1; // only meaningful for transitions that consume a base
+                int nextIter = currentEmission.iteration + tran.getIterIncrease();
+                if (tran.getState() == endState){
+                    // the end state may only be entered once the whole sequence is consumed
+                    if (currentEmission.gPos == lastPos){
+                        nextPos = currentEmission.gPos;
+                    }
+                }else if (tran instanceof Transition.InsertTransition){
+                    if (currentEmission.gPos < lastPos){
+                        nextPos = currentEmission.gPos + 1;
+                        nextBase = genSeq.getBase(nextPos);
+                    }
+                }else if (tran instanceof Transition.DeleteTransition){
+                    nextPos = currentEmission.gPos;
+                }else if (tran instanceof Transition.CopyTransition){
+                    if (currentEmission.gPos < lastPos){
+                        nextPos = currentEmission.gPos + 1;
+                        nextBase = genSeq.getBase(nextPos);
+                    }
+                }else{
+                    // has to be a free transition: consumes nothing
+                    nextPos = currentEmission.gPos;
+                }
+
+                if (nextPos >= -1){
+                    double cost = currentEmission.myCost + tran.emissionCost(nextBase);
+
+                    // only worth following if it can still beat the best complete alignment
+                    if (cost < retEmission.myCost){
+                        String hashKey = Emission.hashKey(tran.getState(), nextPos, nextIter);
+                        Emission nextEmission = hash.get(hashKey);
+                        if (nextEmission == null){
+                            // first time this cell is reached: append it to the work list
+                            nextEmission = new Emission(tran.getState(), nextPos, nextIter);
+                            nextEmission.myCost = cost;
+                            hash.put(hashKey, nextEmission);
+                            lastEmission.next = nextEmission;
+                            lastEmission = nextEmission;
+                            nextEmission.bwdEmission = currentEmission;
+                        }else if (nextEmission.myCost > cost){
+                            // cheaper route to a cell that is already queued
+                            nextEmission.myCost = cost;
+                            nextEmission.bwdEmission = currentEmission;
+                        }
+                    }
+                }
+            }
+
+            currentEmission = retire(hash, currentEmission);
+        }
+        return retEmission;
+    }
+
+    /**
+     * Take a processed emission out of the lookup table and unlink it from the
+     * work list (helping the garbage collector), returning its successor.
+     */
+    private static Emission retire(HashMap<String, Emission> hash, Emission emission) {
+        Emission following = emission.next;
+        hash.remove(Emission.hashKey(emission.toState, emission.gPos, emission.iteration));
+        emission.next = null;
+        return following;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/japsa/bio/alignment/ppfsm/VNTRpOneSM.java
b/src/main/java/japsa/bio/alignment/ppfsm/VNTRpOneSM.java new file mode 100644 index 0000000..643e0e9 --- /dev/null +++ b/src/main/java/japsa/bio/alignment/ppfsm/VNTRpOneSM.java @@ -0,0 +1,218 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/*                           Revision History
+ * 31/01/2017 - Minh Duc Cao: Created
+ ****************************************************************************/
+package japsa.bio.alignment.ppfsm;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import japsa.bio.alignment.ppfsm.state.MachineState;
+import japsa.bio.alignment.ppfsm.transition.Transition;
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceReader;
+import japsa.util.JapsaMath;
+
+/**
+ * @author minhduc
+ */
+public class VNTRpOneSM extends ProfilePFSM{
+    /**
+     * Create a profile FSM from three sequences: left flank, repeat unit and
+     * right flank. This is the "one state" machine: copy/insert/delete are
+     * merged into a single state for each profile position.
+     *
+     * @param seqLeft  left flank
+     * @param seqRep   repeat unit
+     * @param seqRight right flank
+     * @throws IllegalArgumentException if any of the three sequences is empty
+     */
+    public VNTRpOneSM(Sequence seqLeft, Sequence seqRep, Sequence seqRight) {
+        super(new MachineState("S"), new MachineState("E"));
+
+        requireNonEmpty(seqLeft, "seqLeft");
+        requireNonEmpty(seqRep, "seqRep");
+        requireNonEmpty(seqRight, "seqRight");
+
+        System.out.println("costCopyCopy = " + costCopyCopy);
+        System.out.println("costCopyMismatch = " + costCopyMismatch);
+        System.out.println("costCopyDelete = " + costCopyDelete);
+        System.out.println("costCopyInsert = " + costCopyInsert);
+
+        System.out.println("costDeleteCopy = " + costDeleteCopy);
+        System.out.println("costDeleteMismatch = " + costDeleteMismatch);
+        System.out.println("costDeleteDelete = " + costDeleteDelete);
+
+        System.out.println("costInsertCopy = " + costInsertCopy);
+        System.out.println("costInsertMismatch = " + costInsertMismatch);
+        System.out.println("costInsertInsert = " + costInsertInsert);
+
+        MachineState startRepeat = new MachineState("SR");//which is also the end of left
+        MachineState endRepeat = new MachineState("ER");
+
+        // Left flank: S -> Lf_0 -> LfC_1 ... -> SR (entering the repeat counts one iteration).
+        // The first left state is named "Lf_0" (not "LfC_0"); names are kept as they were.
+        MachineState lastLeft = appendChain(startState, seqLeft, "Lf_0", "LfC_");
+        Transition tran = new Transition.FreeTransition(startRepeat, 0);
+        tran.setIterIncrease(1);
+        lastLeft.addTransition(tran);
+
+        // Repeat unit: SR -> ReC_0 ... -> ER
+        MachineState lastRep = appendChain(startRepeat, seqRep, "ReC_0", "ReC_");
+        lastRep.addTransition(new Transition.FreeTransition(endRepeat, 0));
+
+        // Right flank: ER -> RfC_0 ... -> E
+        MachineState lastRight = appendChain(endRepeat, seqRight, "RfC_0", "RfC_");
+        lastRight.addTransition(new Transition.FreeTransition(endState, 0));
+
+        // Loop back for another copy of the repeat unit, counting one more iteration
+        tran = new Transition.FreeTransition(startRepeat, 0);
+        tran.setIterIncrease(1);
+        endRepeat.addTransition(tran);
+    }
+
+    /**
+     * Reject an empty sequence up front instead of failing on index 0 later.
+     */
+    private static void requireNonEmpty(Sequence seq, String what) {
+        if (seq.length() == 0) {
+            throw new IllegalArgumentException(what + " must not be empty");
+        }
+    }
+
+    /**
+     * Create one state per base of {@code seq} and chain them after {@code entry}.
+     * Each state in the chain (including {@code entry}) gets, in this order, a copy
+     * and a delete transition to its successor and an insert transition to itself;
+     * the last state only gets the insert self-loop.
+     *
+     * @param entry      state the chain hangs off
+     * @param seq        bases of the chain
+     * @param firstName  name of the state for position 0
+     * @param namePrefix prefix for the names of the later states (position appended)
+     * @return the last state of the chain
+     */
+    private static MachineState appendChain(MachineState entry, Sequence seq, String firstName, String namePrefix) {
+        MachineState prev = entry;
+        for (int i = 0; i < seq.length(); i++) {
+            String name = (i == 0) ? firstName : namePrefix + i;
+            MachineState node = new MachineState(name, seq.getBase(i));
+
+            prev.addTransition(new Transition.CopyTransition(node, costCopyCopy, costCopyMismatch));
+            prev.addTransition(new Transition.DeleteTransition(node, costCopyDelete));
+            prev.addTransition(new Transition.InsertTransition(prev, costCopyInsert));
+
+            prev = node;
+        }
+        prev.addTransition(new Transition.InsertTransition(prev, costCopyInsert));
+        return prev;
+    }
+
+    /**
+     * Read left flank, repeat unit and right flank (in that order) from the file
+     * named by the first argument, then align every remaining sequence in the file.
+     */
+    public static void main(String [] args) throws IOException{
+        if (args.length < 1) {
+            System.err.println("Usage: VNTRpOneSM <sequence file>");
+            return;
+        }
+
+        SequenceReader reader = SequenceReader.getReader(args[0]);
+        Alphabet dnaAlphabet = Alphabet.DNA16();
+        Sequence leftSeq = reader.nextSequence(dnaAlphabet);
+        Sequence repSeq = reader.nextSequence(dnaAlphabet);
+        Sequence rightSeq = reader.nextSequence(dnaAlphabet);
+        if (leftSeq == null || repSeq == null || rightSeq == null) {
+            System.err.println("Expected at least three sequences (left flank, repeat, right flank) in " + args[0]);
+            return;
+        }
+
+        VNTRpOneSM pFMS = new VNTRpOneSM(leftSeq, repSeq, rightSeq);
+        Sequence seq;
+        while ((seq = reader.nextSequence(dnaAlphabet)) != null){
+            pFMS.align(seq);
+        }
+    }
+}
diff --git a/src/main/java/japsa/bio/alignment/ppfsm/VNTRpThreeSM.java b/src/main/java/japsa/bio/alignment/ppfsm/VNTRpThreeSM.java
new file mode 100644
index 0000000..f80defa
--- /dev/null
+++ b/src/main/java/japsa/bio/alignment/ppfsm/VNTRpThreeSM.java
@@ -0,0 +1,330 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.         *
+ *                                                                           *
+ * Redistribution and use in source and binary forms, with or without        *
+ * modification, are permitted provided that the following conditions        *
+ * are met:                                                                  *
+ *                                                                           *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ *    this list of conditions and the following disclaimer.                  *
+ * 2. Redistributions in binary form must reproduce the above copyright      *
+ *    notice, this list of conditions and the following disclaimer in the    *
+ *    documentation and/or other materials provided with the distribution.   *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ *    may be used to endorse or promote products derived from this software  *
+ *    without specific prior written permission.                             *
+ *                                                                           *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS   *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR    *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR       *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+ ****************************************************************************/
+
+/*                           Revision History
+ * 31/01/2017 - Minh Duc Cao: Created
+ ****************************************************************************/
+package japsa.bio.alignment.ppfsm;
+
+import java.io.IOException;
+
+import japsa.bio.alignment.ppfsm.state.MachineState;
+import japsa.bio.alignment.ppfsm.transition.Transition;
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceReader;
+
+/**
+ * @author minhduc
+ */
+public class VNTRpThreeSM extends ProfilePFSM{
+
+    /**
+     * Create a profile FSM from three sequences: left flank, repeat unit and
+     * right flank. Every profile position gets three states (copy, insert and
+     * delete), and every segment has one extra insert state in front of its
+     * first position.
+     *
+     * @param seqLeft  left flank
+     * @param seqRep   repeat unit
+     * @param seqRight right flank
+     * @throws IllegalArgumentException if any of the three sequences is empty
+     */
+    public VNTRpThreeSM(Sequence seqLeft, Sequence seqRep, Sequence seqRight) {
+        super(new MachineState("S"), new MachineState("E"));
+
+        requireNonEmpty(seqLeft, "seqLeft");
+        requireNonEmpty(seqRep, "seqRep");
+        requireNonEmpty(seqRight, "seqRight");
+
+        System.out.println("costCopyCopy = " + costCopyCopy);
+        System.out.println("costCopyMismatch = " + costCopyMismatch);
+        System.out.println("costCopyDelete = " + costCopyDelete);
+        System.out.println("costCopyInsert = " + costCopyInsert);
+
+        System.out.println("costDeleteCopy = " + costDeleteCopy);
+        System.out.println("costDeleteMismatch = " + costDeleteMismatch);
+        System.out.println("costDeleteDelete = " + costDeleteDelete);
+
+        System.out.println("costInsertCopy = " + costInsertCopy);
+        System.out.println("costInsertMismatch = " + costInsertMismatch);
+        System.out.println("costInsertInsert = " + costInsertInsert);
+
+        MachineState startRepeat = new MachineState("SR");//which is also the end of left
+        MachineState endRepeat = new MachineState("ER");
+
+        // Left flank: S -> Lf* -> SR (entering the repeat counts one iteration)
+        appendSegment(startState, startRepeat, true, seqLeft, "Lf");
+        // Repeat unit: SR -> Re* -> ER
+        appendSegment(startRepeat, endRepeat, false, seqRep, "Re");
+        // Right flank: ER -> Rf* -> E
+        appendSegment(endRepeat, endState, false, seqRight, "Rf");
+
+        // Loop back for another copy of the repeat unit, counting one more iteration
+        endRepeat.addTransition(freeTransition(startRepeat, true));
+    }
+
+    /**
+     * Reject an empty sequence up front instead of failing on index 0 later.
+     */
+    private static void requireNonEmpty(Sequence seq, String what) {
+        if (seq.length() == 0) {
+            throw new IllegalArgumentException(what + " must not be empty");
+        }
+    }
+
+    /**
+     * Build a free transition to {@code target}, optionally increasing the iteration count by one.
+     */
+    private static Transition freeTransition(MachineState target, boolean countsIteration) {
+        Transition tran = new Transition.FreeTransition(target, 0);
+        if (countsIteration) {
+            tran.setIterIncrease(1);
+        }
+        return tran;
+    }
+
+    /**
+     * Build the copy/insert/delete states for every base of {@code seq}, wire them
+     * between {@code entry} and {@code exit}, and keep the per-state order of
+     * transitions exactly as listed below.
+     *
+     * State names are prefix + "C_"/"D_"/"I_" + position, plus prefix + "I" for the
+     * insert state that sits in front of position 0.
+     *
+     * @param entry           state the segment hangs off
+     * @param exit            state reached (by free transitions) after the last position
+     * @param countsIteration whether the transitions into {@code exit} increase the iteration count
+     * @param seq             bases of the segment (must not be empty)
+     * @param prefix          name prefix of the states of this segment
+     */
+    private static void appendSegment(MachineState entry, MachineState exit, boolean countsIteration,
+            Sequence seq, String prefix) {
+        MachineState insertFirst = new MachineState(prefix + "I");
+
+        MachineState copy = new MachineState(prefix + "C_" + 0, seq.getBase(0));
+        MachineState delete = new MachineState(prefix + "D_" + 0);
+        MachineState insert = new MachineState(prefix + "I_" + 0);
+
+        // entry: copy / delete into position 0, or insert before it
+        entry.addTransition(new Transition.CopyTransition(copy, costCopyCopy, costCopyMismatch));
+        entry.addTransition(new Transition.DeleteTransition(delete, costCopyDelete));
+        entry.addTransition(new Transition.InsertTransition(insertFirst, costCopyInsert));
+
+        // leading insert state: copy into position 0, or keep inserting
+        insertFirst.addTransition(new Transition.CopyTransition(copy, costInsertCopy, costInsertMismatch));
+        insertFirst.addTransition(new Transition.InsertTransition(insertFirst, costInsertInsert));
+
+        for (int i = 1; i < seq.length(); i++) {
+            MachineState nextCopy = new MachineState(prefix + "C_" + i, seq.getBase(i));
+            MachineState nextDelete = new MachineState(prefix + "D_" + i);
+            MachineState nextInsert = new MachineState(prefix + "I_" + i);
+
+            //link the previous copy state
+            copy.addTransition(new Transition.CopyTransition(nextCopy, costCopyCopy, costCopyMismatch));
+            copy.addTransition(new Transition.DeleteTransition(nextDelete, costCopyDelete));
+            copy.addTransition(new Transition.InsertTransition(insert, costCopyInsert));
+
+            //link the previous insert state
+            insert.addTransition(new Transition.CopyTransition(nextCopy, costInsertCopy, costInsertMismatch));
+            insert.addTransition(new Transition.InsertTransition(insert, costInsertInsert));
+
+            //link the previous delete state
+            delete.addTransition(new Transition.CopyTransition(nextCopy, costDeleteCopy, costDeleteMismatch));
+            delete.addTransition(new Transition.DeleteTransition(nextDelete, costDeleteDelete));
+
+            copy = nextCopy;
+            delete = nextDelete;
+            insert = nextInsert;
+        }
+
+        // last position: insertion is still possible, then all three states leave to exit
+        copy.addTransition(new Transition.InsertTransition(insert, costCopyInsert));
+        insert.addTransition(new Transition.InsertTransition(insert, costInsertInsert));
+
+        copy.addTransition(freeTransition(exit, countsIteration));
+        insert.addTransition(freeTransition(exit, countsIteration));
+        delete.addTransition(freeTransition(exit, countsIteration));
+    }
+
+    /**
+     * Read left flank, repeat unit and right flank (in that order) from the file
+     * named by the first argument, then align every remaining sequence in the file.
+     */
+    public static void main(String [] args) throws IOException{
+        if (args.length < 1) {
+            System.err.println("Usage: VNTRpThreeSM <sequence file>");
+            return;
+        }
+
+        SequenceReader reader = SequenceReader.getReader(args[0]);
+        Alphabet dnaAlphabet = Alphabet.DNA16();
+        Sequence leftSeq = reader.nextSequence(dnaAlphabet);
+        Sequence repSeq = reader.nextSequence(dnaAlphabet);
+        Sequence rightSeq = reader.nextSequence(dnaAlphabet);
+        if (leftSeq == null || repSeq == null || rightSeq == null) {
+            System.err.println("Expected at least three sequences (left flank, repeat, right flank) in " + args[0]);
+            return;
+        }
+
+        VNTRpThreeSM pFMS = new VNTRpThreeSM(leftSeq, repSeq, rightSeq);
+        Sequence seq;
+        while ((seq = reader.nextSequence(dnaAlphabet)) != null){
+            pFMS.align(seq);
+        }
+    }
+}
diff --git a/src/main/java/japsa/bio/alignment/ppfsm/state/MachineState.java b/src/main/java/japsa/bio/alignment/ppfsm/state/MachineState.java
new file mode 100644
index 0000000..a9dbfbe
--- /dev/null
+++ b/src/main/java/japsa/bio/alignment/ppfsm/state/MachineState.java
@@ -0,0 +1,232 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/**************************     REVISION HISTORY    **************************
+ * 31/01/2017 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsa.bio.alignment.ppfsm.state;
+
+import java.util.ArrayList;
+import japsa.bio.alignment.ppfsm.transition.Transition;
+
+
+/**
+ * Implementation of a machine state for the profile FSM
+ * @author minhduc
+ *
+ */
+
+public class MachineState {
+    // Outgoing transitions, in the order they were added
+    ArrayList<Transition> transitions = new ArrayList<Transition>();
+    private String name;
+    private byte base;
+
+    /**
+     * Create a state without an associated base; the base field keeps its default value 0.
+     * @param name the name of the state
+     */
+    public MachineState(String name) {
+        this.setName(name);
+    }
+
+    /**
+     * Create a state associated with a base of the profile.
+     * @param name the name of the state
+     * @param base the base associated with the state
+     */
+    public MachineState(String name, byte base) {
+        this(name);
+        setBase(base);
+    }
+
+    /**
+     * Append an outgoing transition to this state.
+     * @param tran the transition to add
+     */
+    public void addTransition(Transition tran){
+        transitions.add(tran);
+    }
+
+    /**
+     * Get the outgoing transitions of this state, in insertion order.
+     * The returned list is the internal one, not a copy.
+     * @return the transitions
+     */
+    public ArrayList<Transition> getTransitions(){
+        return transitions;
+    }
+
+    /**
+     * @return the name
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * @param name the name to set
+     */
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * @return the base
+     */
+    public byte getBase() {
+        return base;
+    }
+
+    /**
+     * @param base the base to set
+     */
+    public void setBase(byte base) {
+        this.base = base;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/japsa/bio/alignment/ppfsm/transition/Transition.java b/src/main/java/japsa/bio/alignment/ppfsm/transition/Transition.java
new file mode 100644
index 0000000..a55e13c
--- /dev/null
+++ b/src/main/java/japsa/bio/alignment/ppfsm/transition/Transition.java
@@ -0,0 +1,154 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.         *
+ *                                                                           *
+ * Redistribution and use in source and binary forms, with or without        *
+ * modification, are permitted provided that the following conditions        *
+ * are met:                                                                  *
+ *                                                                           *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ *    this list of conditions and the following disclaimer.                  *
+ * 2. Redistributions in binary form must reproduce the above copyright      *
+ *    notice, this list of conditions and the following disclaimer in the    *
+ *    documentation and/or other materials provided with the distribution.   *
+ * 3.
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 31/01/2017 - Minh Duc Cao: Created + ****************************************************************************/ +package japsa.bio.alignment.ppfsm.transition; + +import japsa.bio.alignment.ppfsm.state.MachineState; + +/** + * Transition between states + * @author minhduc + * + */ +public abstract class Transition{ + + /** + * The probability of this transition + */ + double cost = 0; + + /** + * The count of this transition + */ + //int count = 1; + int iterIncrease = 0; + protected final int numGenerate; + MachineState toState; + + public Transition(MachineState state, int numGenerate){ + toState = state; + this.numGenerate = numGenerate; + } + + /** + * @return the cost + */ + public double getCost() { + return cost; + } + + public abstract double emissionCost(byte base); + + /** + * @return the iteration + */ + public int getIterIncrease() { + return iterIncrease; + } + + /** + * @param iteration the iteration to set + */ + public void setIterIncrease(int it) { + this.iterIncrease = it; + } + + /** + * @param cost the cost to set + */ + public void setCost(double cost) { + this.cost = cost; + } + + public MachineState getState() { + return toState; + } + + + public static class CopyTransition extends Transition{ + double copyCost; + double mismatchCost; + //MachineState.CopyState toState; + + public CopyTransition(MachineState state, double copyCost, double mismatchCost) { + super(state,1); + this.copyCost = copyCost; + this.mismatchCost = mismatchCost; + } + + public double emissionCost(byte base) { + if (base == toState.getBase()) + return copyCost; + else + return mismatchCost; + } + } + + public static class DeleteTransition extends Transition{ + double deleteCost; + + public DeleteTransition(MachineState state, double cost) { + super(state,0); + this.deleteCost = cost; + } + + public double 
emissionCost(byte base) { + return deleteCost; + } + } + + public static class InsertTransition extends Transition{ + double insertCost; + + public InsertTransition(MachineState state, double cost) { + super(state,1); + this.insertCost = cost; + } + + public double emissionCost(byte base) { + return insertCost + 2; + } + + } + + public static class FreeTransition extends Transition{ + + public FreeTransition(MachineState state, double cost) { + super(state,0); + this.cost = cost; + } + + public double emissionCost(byte base) { + return cost; + } + } +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/bac/GenePresenceDB.java b/src/main/java/japsa/bio/amra/GenePresenceDB.java similarity index 99% rename from src/main/java/japsa/bio/bac/GenePresenceDB.java rename to src/main/java/japsa/bio/amra/GenePresenceDB.java index 05fe9fc..dafaa62 100644 --- a/src/main/java/japsa/bio/bac/GenePresenceDB.java +++ b/src/main/java/japsa/bio/amra/GenePresenceDB.java @@ -32,7 +32,7 @@ * 7 Sep 2015 - Minh Duc Cao: Created * ****************************************************************************/ -package japsa.bio.bac; +package japsa.bio.amra; /** * Database for presence/absence of gene typing diff --git a/src/main/java/japsa/bio/bac/MLSTyping.java b/src/main/java/japsa/bio/amra/MLSTyping.java similarity index 98% rename from src/main/java/japsa/bio/bac/MLSTyping.java rename to src/main/java/japsa/bio/amra/MLSTyping.java index c65e272..67b5df1 100644 --- a/src/main/java/japsa/bio/bac/MLSTyping.java +++ b/src/main/java/japsa/bio/amra/MLSTyping.java @@ -31,7 +31,7 @@ * 28/05/2014 - Minh Duc Cao: Created ****************************************************************************/ -package japsa.bio.bac; +package japsa.bio.amra; @@ -121,10 +121,6 @@ public MLSTyping(String mlstBase) throws IOException{ index_ = alleleName.lastIndexOf('_'); alleleNo = Integer.parseInt(alleleName.substring(1 + index_)); alleleNo2AlleleIndex[i][alleleNo] = x; - - //if 
(!bitSets[i].get(alleleNo)){ - // Logging.warn("Allele " + alleleName + " not used"); - //}//if }//for } } @@ -187,7 +183,7 @@ public static String bestMlst(ArrayList seqs, String mlstDir) throws I String stKey = ""; for (int i = 0; i < mlstScheme.numGenes; i++){ - stKey += mlstScheme.geneNames[i] + "_" + profile.geneAlleles[i] + "_"; + stKey += mlstScheme.geneNames[i] + "_" + profile.geneAlleles[i] + "|"; } stMap.put(stKey, ST); } @@ -224,7 +220,7 @@ public static String bestMlst(ArrayList seqs, String mlstDir) throws I br.close(); process.waitFor(); - key += allele + "_"; + key += allele + "|"; typeScore += score; } diff --git a/src/main/java/japsa/bio/bac/ResistanceGeneDB.java b/src/main/java/japsa/bio/amra/ResistanceGeneDB.java similarity index 93% rename from src/main/java/japsa/bio/bac/ResistanceGeneDB.java rename to src/main/java/japsa/bio/amra/ResistanceGeneDB.java index bfe6d2f..f8a2493 100644 --- a/src/main/java/japsa/bio/bac/ResistanceGeneDB.java +++ b/src/main/java/japsa/bio/amra/ResistanceGeneDB.java @@ -32,10 +32,11 @@ * 7 Sep 2015 - Minh Duc Cao: Created * ****************************************************************************/ -package japsa.bio.bac; +package japsa.bio.amra; import japsa.seq.SequenceReader; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; @@ -47,8 +48,10 @@ * */ public class ResistanceGeneDB { + private static final Logger LOG = LoggerFactory.getLogger(ResistanceGeneDB.class); - public static final String SEPARATOR = "\t"; + + public static final String SEPARATOR = "\t"; public static final String COMMENT = "#"; String dbPath;//Act line the ID of the database @@ -78,7 +81,7 @@ public ResistanceGeneDB(String path) throws IOException { } } - Logging.info("Read in " + gene2Res.size() + " genes"); + LOG.info("Read in " + gene2Res.size() + " genes"); br.close(); } diff --git a/src/main/java/japsa/bio/bac/SpeciesTypingDB.java 
b/src/main/java/japsa/bio/amra/SpeciesTypingDB.java similarity index 99% rename from src/main/java/japsa/bio/bac/SpeciesTypingDB.java rename to src/main/java/japsa/bio/amra/SpeciesTypingDB.java index d10dae7..51397d8 100644 --- a/src/main/java/japsa/bio/bac/SpeciesTypingDB.java +++ b/src/main/java/japsa/bio/amra/SpeciesTypingDB.java @@ -32,7 +32,7 @@ * 7 Sep 2015 - Minh Duc Cao: Created * ****************************************************************************/ -package japsa.bio.bac; +package japsa.bio.amra; /** * Database for bacterial species typing diff --git a/src/main/java/japsa/bio/gene/GeneDatabase.java b/src/main/java/japsa/bio/gene/GeneDatabase.java index 7d220ab..c33372e 100644 --- a/src/main/java/japsa/bio/gene/GeneDatabase.java +++ b/src/main/java/japsa/bio/gene/GeneDatabase.java @@ -38,25 +38,27 @@ import japsa.seq.SequenceReader; import japsa.seq.Alphabet.DNA; import japsa.seq.SequenceOutputStream; -import japsa.util.Logging; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -public class GeneDatabase implements Iterable{ - ArrayList geneFamilies; +public class GeneDatabase extends ArrayList{ + private static final Logger LOG = LoggerFactory.getLogger(GeneDatabase.class); String dbID = "JSA"; public GeneDatabase(){ - geneFamilies = new ArrayList(); + super(); } public String addNewFamily(Sequence seq){ GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(size()); String geneID = newFam.addSequence(seq); - geneFamilies.add(newFam); + add(newFam); return geneID; } @@ -68,14 +70,14 @@ public String addNewFamily(Sequence seq){ */ public void write2File(String fileName, boolean includeAlleles) throws IOException{ SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fileName); - for (GeneDatabase.GeneFamily family:geneFamilies){ + for (GeneDatabase.GeneFamily family:this){ Sequence rep = family.represetationSequence(); rep.writeFasta(sos); 
if (includeAlleles){ for (Sequence seq:family){ seq.writeFasta(sos); } - } + } } sos.close(); } @@ -98,13 +100,13 @@ public static GeneDatabase readDB(String fileName) throws IOException{ if (toks.length == 2){ //rep GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(db.size()); - db.geneFamilies.add(newFam); + db.add(newFam); }else if (toks.length == 3){ //a gene GeneDatabase.GeneFamily fam = db.getFamily(Integer.parseInt(toks[1])); - fam.geneAlleles.add(seq); + fam.add(seq); }else{ - Logging.error("Unknown sequence " + seq.getName()); + LOG.error("Unknown sequence " + seq.getName()); } } reader.close(); @@ -131,7 +133,7 @@ public static GeneDatabase readDB(String fileName) throws IOException{ //doing nothing } - return geneFamilies.get(fID); + return get(fID); } /** @@ -141,47 +143,27 @@ public static GeneDatabase readDB(String fileName) throws IOException{ */ public GeneDatabase.GeneFamily getFamily(int famID){ - if (famID >= geneFamilies.size() || famID < 0){ + //TODO: Check if really need to validate famID + if (famID >= size() || famID < 0){ return null; } - return geneFamilies.get(famID); + return get(famID); } - - - /** - * Return the number of families in the database - * @return - */ - public int size(){ - return geneFamilies.size(); - } - - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneFamilies.iterator(); - } ///////////////////////////////////////////////////////////////////////// - public static class GeneFamily implements Iterable{ + public static class GeneFamily extends ArrayList{ private final int fID; - private ArrayList geneAlleles;//known instance of this family - //Sequence rep = null;//The representation of this gene family - + int repIndex = -1; String desc = ""; public GeneFamily(int id){ + super(); fID = id; - geneAlleles = new ArrayList(); } - /** * @return the desc */ @@ -203,28 +185,30 @@ public String familyID(){ } public Sequence 
represetationSequence(){ - Sequence rep = geneAlleles.get(repIndex).clone(); - rep.setDesc(desc + ";index=" +repIndex); + Sequence rep = get(repIndex).clone(); + rep.setDesc(desc + ";index=" +repIndex + ";size=" + size()); rep.setName(familyID()); return rep; } private void updateRep(int newIndex){ - if (repIndex < 0 || geneAlleles.get(newIndex).length() > geneAlleles.get(repIndex).length()){ + if (repIndex < 0 || get(newIndex).length() > get(repIndex).length()){ repIndex = newIndex; } } /** * Add a new sequence to the new family. This will create a new allele - * if it is not already in the dababase + * if it is not already in the dababase. This is the proper method for adding + * a new sequence to the famlily, instead of the add() * @param seq * @return */ public String addSequence(Sequence seq){ //This allele already in the database - for (int i = 0; i < geneAlleles.size(); i++){ - Sequence eSeq = geneAlleles.get(i); + //TODO: Need to see if this feature is really needed + for (int i = 0; i < size(); i++){ + Sequence eSeq = get(i); if (eSeq.match(seq) == 0) return eSeq.getName(); if (eSeq.match(DNA.complement(seq)) == 0) @@ -233,19 +217,10 @@ public String addSequence(Sequence seq){ Sequence nSeq = seq.clone(); nSeq.setDesc(nSeq.getName() + " " + nSeq.getDesc()); - nSeq.setName(familyID() + "_" + (geneAlleles.size())); - geneAlleles.add(nSeq); - updateRep(geneAlleles.size() - 1); + nSeq.setName(familyID() + "_" + (size())); + super.add(nSeq); + updateRep(size() - 1); return nSeq.getName(); } - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneAlleles.iterator(); - } } - } \ No newline at end of file diff --git a/src/main/java/japsa/bio/hts/SpeciesCoverageIdenfication.java b/src/main/java/japsa/bio/hts/SpeciesCoverageIdenfication.java index 7c871ec..72360ba 100644 --- a/src/main/java/japsa/bio/hts/SpeciesCoverageIdenfication.java +++ b/src/main/java/japsa/bio/hts/SpeciesCoverageIdenfication.java 
@@ -36,7 +36,7 @@ import japsa.seq.SequenceReader; import japsa.util.DoubleArray; -import japsa.util.Logging; + import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; @@ -54,13 +54,17 @@ import org.rosuda.JRI.REXP; import org.rosuda.JRI.Rengine; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @author minhduc * */ -public class SpeciesCoverageIdenfication { +public class SpeciesCoverageIdenfication { + private static final Logger LOG = LoggerFactory.getLogger(SpeciesCoverageIdenfication.class); + private double qual = 0; private Rengine rengine; @@ -78,12 +82,13 @@ public SpeciesCoverageIdenfication(String outputFile, double minQual) throws IOException{ rengine = new Rengine (new String [] {"--no-save"}, false, null); if (!rengine.waitForR()){ - Logging.exit("Cannot load R",1); + LOG.error("Cannot load R"); + System.exit(1); } rengine.eval("library(MultinomialCI)"); rengine.eval("alpha<-0.05"); - Logging.info("REngine ready"); + LOG.info("REngine ready"); //countsOS = SequenceOutputStream.makeOutputStream(outputFile); if (outputFile.equals("-")) outOS = System.out; @@ -97,12 +102,8 @@ public void close() throws IOException{ outOS.close(); rengine.end(); } - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ + + static class SpeciesCount implements Comparable{ String species; int count = 0; @@ -146,7 +147,7 @@ public void preTyping(String indexFile)throws IOException{ } }//while bf.close(); - Logging.info(seq2Species.size() + " " + species2Count.size()); + LOG.info(seq2Species.size() + " " + species2Count.size()); speciesList.addAll(species2Count.keySet()); //Write header @@ -197,7 +198,7 @@ private void simpleAnalysisCurrent() throws IOException{ } outOS.flush(); - //Logging.info(step+" " + countArray.size()); + //LOG.info(step+" " + countArray.size()); } diff --git a/src/main/java/japsa/bio/hts/SpeciesMixtureIdenfication.java 
b/src/main/java/japsa/bio/hts/SpeciesMixtureIdenfication.java index 7a9e7cb..84bd071 100644 --- a/src/main/java/japsa/bio/hts/SpeciesMixtureIdenfication.java +++ b/src/main/java/japsa/bio/hts/SpeciesMixtureIdenfication.java @@ -36,7 +36,6 @@ import japsa.seq.SequenceReader; import japsa.util.DoubleArray; -import japsa.util.Logging; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; @@ -54,6 +53,8 @@ import org.rosuda.JRI.REXP; import org.rosuda.JRI.Rengine; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -63,7 +64,9 @@ * @author minhduc * */ -public class SpeciesMixtureIdenfication { +public class SpeciesMixtureIdenfication { + private static final Logger LOG = LoggerFactory.getLogger(SpeciesMixtureIdenfication.class); + private double qual = 0; private Rengine rengine; private int currentReadCount = 0; @@ -79,12 +82,13 @@ public SpeciesMixtureIdenfication(String outputFile, double minQual, double threshold) throws IOException{ rengine = new Rengine (new String [] {"--no-save"}, false, null); if (!rengine.waitForR()){ - Logging.exit("Cannot load R",1); + LOG.error("Cannot load R"); + System.exit(1); } rengine.eval("library(MultinomialCI)"); rengine.eval("alpha<-0.05"); - Logging.info("REngine ready"); + LOG.info("REngine ready"); if (outputFile.equals("-")) outOS = System.out; @@ -99,12 +103,7 @@ public void close() throws IOException{ outOS.close(); rengine.end(); } - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ + public void preTyping(String indexFile)throws IOException{ @@ -126,7 +125,7 @@ public void preTyping(String indexFile)throws IOException{ } }//while bf.close(); - Logging.info(seq2Species.size() + " " + species2Count.size()); + LOG.info(seq2Species.size() + " " + species2Count.size()); speciesList.addAll(species2Count.keySet()); //Write header @@ -176,7 +175,7 @@ private void simpleAnalysisCurrent() throws 
IOException{ } outOS.flush(); - //Logging.info(step+" " + countArray.size()); + //LOG.info(step+" " + countArray.size()); } diff --git a/src/main/java/japsa/bio/hts/scaffold/AlignmentRecord.java b/src/main/java/japsa/bio/hts/scaffold/AlignmentRecord.java index 8a19ac4..9e096f2 100644 --- a/src/main/java/japsa/bio/hts/scaffold/AlignmentRecord.java +++ b/src/main/java/japsa/bio/hts/scaffold/AlignmentRecord.java @@ -47,14 +47,13 @@ public String readID; Contig contig; - public int refStart, refEnd; //position on ref of the start and end of the alignment + public int refStart, refEnd; //1-based position on ref of the start and end of the alignment //Position on read of the start and end of the alignment (using the direction of read) - public int readStart = 0, readEnd = 0; - //readStart map to refStart, readEnd map to refEnd. //readStart < readEnd if strand = true, else readStart > readEnd - + public int readStart = 0, readEnd = 0; + //read length public int readLength = 0; @@ -68,10 +67,29 @@ //public int readLeft, readRight, readAlign, refLeft, refRight, refAlign; //left and right are in the direction of the reference sequence - public AlignmentRecord(){ + public AlignmentRecord(String readID, int refStart, int refEnd, int readLength, + int readStart, int readEnd, boolean strand, boolean useful, Contig contig, int score){ + this.readID = readID; + this.contig = contig; + this.refStart = refStart; + this.refEnd = refEnd; + this.readLength = readLength; + this.readStart = readStart;//1-index + this.readEnd = readEnd;//1-index + this.strand = strand; + this.useful = useful; + this.contig = contig; + this.score = score; } public AlignmentRecord(SAMRecord sam, Contig ctg) { + if(!sam.getReferenceName().equals(ctg.getName())){ + System.err.println("Reference in SAM file doesn't agree with contigs name: " + + sam.getReferenceName() + " != " + ctg.getName()); + System.err.println("Hint: SAM file must resulted from alignment between long reads and contigs!"); + 
System.exit(1); + } + // readID = Integer.parseInt(sam.getReadName().split("_")[0]); readID = sam.getReadName(); @@ -139,7 +157,6 @@ public AlignmentRecord(SAMRecord sam, Contig ctg) { ) useful = true; - contig.addRange(refStart,refEnd,score); } @@ -176,38 +193,21 @@ public String pos() { ; } // return same alignment but with reversed read + //TODO: change to object self-editing function? public AlignmentRecord reverseRead(){ - AlignmentRecord revAlign = new AlignmentRecord(); - revAlign.readID = readID; - revAlign.contig = contig; - revAlign.refStart = refStart; - revAlign.refEnd = refEnd; + AlignmentRecord revAlign = new AlignmentRecord(readID, refStart, refEnd, readLength, + readLength - readStart + 1, readLength - readEnd + 1, !strand, useful, contig, score); - revAlign.readLength = readLength; - revAlign.readStart = readLength - readStart + 1;//1-index - revAlign.readEnd = readLength - readEnd + 1;//1-index - revAlign.strand = !strand; - revAlign.useful = useful; revAlign.alignmentCigars = alignmentCigars; - revAlign.contig = contig; - revAlign.score = score; + return revAlign; } public AlignmentRecord clones(){ - AlignmentRecord align = new AlignmentRecord(); - align.readID = readID; - align.contig = contig; - align.refStart = refStart; - align.refEnd = refEnd; + AlignmentRecord align = new AlignmentRecord(readID, refStart, refEnd, readLength, + readStart, readEnd, strand, useful, contig, score); - align.readLength = readLength; - align.readStart = readStart;//1-index - align.readEnd = readEnd;//1-index - align.strand = strand; - align.useful = useful; align.alignmentCigars = alignmentCigars; - align.contig = contig; - align.score = score; + return align; } public void copy(AlignmentRecord rec){ diff --git a/src/main/java/japsa/bio/hts/scaffold/Contig.java b/src/main/java/japsa/bio/hts/scaffold/Contig.java index 26b5a6b..51f5f26 100644 --- a/src/main/java/japsa/bio/hts/scaffold/Contig.java +++ b/src/main/java/japsa/bio/hts/scaffold/Contig.java @@ -35,42 
+35,112 @@ package japsa.bio.hts.scaffold; import java.util.ArrayList; -import java.util.Collections; - import japsa.seq.Sequence; +import japsa.seq.JapsaFeature; public class Contig{ int index; ScaffoldVector myVector;//relative position to the head contig of my scaffold Sequence contigSequence;//the sequence of the contig - double coverage = 1.0; - double portionUsed = 0.0; - //int used = 0; + double coverage = 1.0; int head = -1; //point to the index of its head contig in the scaffold double prevScore=0, nextScore=0; - boolean isCircular = false; - //for depth first search - ArrayList bridges; - + int cirProb = 0; //measure how likely the contig itself is circular + + //for annotation + ArrayList genes, //genes list + oriRep, //origin of replication: indicator of plasmid for bacteria + insertSeq, //Insertion Sequence + resistanceGenes; //list of antibiotic resistance genes found in this contig + //a contig is composed of edges from assembly graph + + static Graph asGraph=null; + public static void setGraph(Graph g){ + asGraph=g; + } + public static boolean hasGraph(){ + return asGraph!=null; + } + + ArrayList paths; + public Contig(int index, Sequence seq){ this.index = index; contigSequence = seq; myVector = new ScaffoldVector(0,1); - bridges = new ArrayList(); - usedRanges = new ArrayList(); + + genes = new ArrayList(); + oriRep = new ArrayList(); + insertSeq = new ArrayList(); + resistanceGenes = new ArrayList(); + + paths = new ArrayList(); } + public Contig clone(){ Contig ctg = new Contig(this.index, this.contigSequence); - //ctg.used = used++; //TODO: replace by static array usage[nContigs] in ScaffoldGraphDFS?? 
ctg.coverage = coverage; - ctg.portionUsed = portionUsed; - ctg.bridges = this.bridges; + ctg.head = this.head; //update later - ctg.isCircular = this.isCircular; - ctg.usedRanges = this.usedRanges; + ctg.cirProb = this.cirProb; + + ctg.genes = this.genes; + ctg.oriRep = this.oriRep; + ctg.insertSeq = this.insertSeq; + ctg.resistanceGenes = this.resistanceGenes; + + ctg.paths = new ArrayList(); + for(Path p:paths) + ctg.paths.add(p); + return ctg; } + // Get features in an interval of contig + public ArrayList getFeatures(ArrayList features, int start, int end){ + + ArrayList remainFeatures = new ArrayList(); + boolean isReverse= (start>end)?true:false; + for(JapsaFeature feature:features){ + JapsaFeature cutFeature=feature.cloneFeature(); + int fstart = feature.getStart(), + fend = feature.getEnd(); + + //find overlap + if(Integer.signum(fstart-start)*Integer.signum(fstart-end) <= 0){ + if(Integer.signum(fend-start)*Integer.signum(fend-end) > 0){ + fend = (Math.abs(fend-start) < Math.abs(fend-end))?start:end; + } + }else{ + fstart = (Math.abs(fstart-start) < Math.abs(fstart-end))?start:end; + if(Integer.signum(start-fend)*Integer.signum(start-fstart) <= 0 && Integer.signum(end-fend)*Integer.signum(end-fstart) <= 0) + fend = (Math.abs(fend-start) < Math.abs(fend-end))?start:end; + else if(Integer.signum(start-fend)*Integer.signum(start-fstart) > 0 && Integer.signum(end-fend)*Integer.signum(end-fstart) > 0) + continue; + } + //if the contig is reversed complement + if(isReverse){ + int ostart = fstart; + fstart= this.length() - fend; + fend = this.length() - ostart; + if(cutFeature.getStrand() == '+') + cutFeature.setStrand('-'); + else + cutFeature.setStrand('+'); + } + + cutFeature.setStart(fstart); + cutFeature.setEnd(fend); + double cutRate=(float) Math.abs(cutFeature.getLength())/Math.abs(feature.getLength()); + if(cutRate > .9){ + cutFeature.setScore(feature.getScore()*cutRate); + remainFeatures.add(cutFeature); + } + } + + return remainFeatures; + } + public 
String getName(){ return contigSequence.getName(); @@ -127,6 +197,9 @@ public int rightMost(){ return rightMost(myVector); } + public boolean isCircular(){ + return (cirProb > 0); + } public ScaffoldVector getVector(){ return myVector; @@ -139,59 +212,23 @@ public int length(){ public double getCoverage(){ return coverage; } + + /* + * Operators related to Path + */ + public ArrayList getPaths(){ + return paths; + } + public void setPath(Path path){ + this.paths.add(path); + } + public void setCoverage(double cov){ coverage = cov; } public String toString(){ return new String(" contig" + getIndex()); } - ////////////////for tracing the used part//////////////////// - ArrayList usedRanges; - class Range implements Comparable { - int start, end, score; - Range(){ - start = end = score = 0; - } - Range(int start, int end, int score){ - this.start = start " + end + ": " + score); - } - @Override - public int compareTo(Range rg) { - // TODO Auto-generated method stub - if(this.start!=rg.start) - return (this.start-rg.start); - else if(this.end != rg.end) - return (this.end-rg.end); - else - return (this.score-rg.score); - } - } - public void addRange(int start, int end, int score){ - usedRanges.add(new Range(start,end,score)); - } - public void display(){ - Collections.sort(usedRanges); - System.out.println("Contig " + this.getName()); - for(Range rg:usedRanges) - System.out.println("used " + rg); - - int prevEnd = 0, minLen = 100; - for (Range rg:usedRanges){ - if(rg.start > prevEnd + minLen){ - System.out.println("\tuncovered: " + prevEnd + " --> " + rg.start + " ( " + (rg.start-prevEnd+1) +" )"); - } - if(prevEnd < rg.end) - prevEnd = rg.end; - } - if (prevEnd + minLen < length()-1) - System.out.println("\tuncovered: " + prevEnd + " --> " + (length()-1) + " ( " + (length()-prevEnd) +" )"); - } + + } \ No newline at end of file diff --git a/src/main/java/japsa/bio/hts/scaffold/ContigBridge.java b/src/main/java/japsa/bio/hts/scaffold/ContigBridge.java index 
e237df6..2033903 100644 --- a/src/main/java/japsa/bio/hts/scaffold/ContigBridge.java +++ b/src/main/java/japsa/bio/hts/scaffold/ContigBridge.java @@ -43,17 +43,17 @@ import japsa.seq.Sequence; import japsa.seq.SequenceBuilder; import japsa.seq.SequenceOutputStream; - import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; + import japsa.bio.np.ErrorCorrection; /** - * Create a bridge that connects two contigs. The bridged can be ranked based - * on the confidence so that more confident bridge is used before. - * Note that two contigs can have more than one bridges from circular + * Create a bridge that connects two contigs. The bridge can be ranked based + * on the confidence so that more confident bridge is used priorly. + * Note that two contigs can have more than one bridge from circular * sequence or false positives. * @author minhduc * @@ -61,13 +61,26 @@ public class ContigBridge implements Comparable{ - final Contig firstContig, secondContig; + Contig firstContig, secondContig; final String hashKey; final int orderIndex; - + private double score = 0;//more is better private ScaffoldVector transVector = null; private ArrayList connections;//a list of connections that make up this + private Path bridgePath=null; + + + final static int SEARCH_THRES=300; + protected static boolean forceFill=false; + + public static void forceFilling(){ + forceFill=true; + } + public static void relaxFilling(){ + forceFill=false; + } + public ContigBridge(Contig c1, Contig c2, int ind){ firstContig = c1; @@ -76,8 +89,22 @@ public ContigBridge(Contig c1, Contig c2, int ind){ hashKey = makeHash(c1.index,c2.index, orderIndex); connections = new ArrayList(); + } + /** + * Re-assign the two contigs + * @param first + * @param second + */ + public ContigBridge clone(Contig first, Contig second){ + ContigBridge dolly=new ContigBridge(first,second,orderIndex); + dolly.bridgePath=bridgePath; + dolly.transVector=transVector; + 
dolly.score=score; + dolly.connections=connections; + return dolly; + } public static String makeHash(int aIndex, int bIndex, int order){ return aIndex+"#"+bIndex + "#" + order; } @@ -106,39 +133,312 @@ public double addConnection(ReadFilling readSequence, connections.add(new Connection(readSequence, firstAlignment,secondAlignment,trans)); }else{ Connection newConnect = new Connection(readSequence, firstAlignment,secondAlignment,trans); - connections.add(newConnect); - transVector.magnitude = (transVector.magnitude * connections.size() + trans.magnitude) / (connections.size() + 1); - + connections.add(newConnect); + //the metric for bridge score is important! score += sc; + //score = score>sc?score:sc; } return score; } + /* + * Try to find a path that connect firstContig to secondContig, + * based on the transVector. + * Only being invoked from Scaffold.viewSequence()?? Yes! + */ + public Connection updatePath(){ + ArrayList firstPathList=firstContig.getPaths(), + secondPathList=secondContig.getPaths(), + candidates=new ArrayList(); + int d=transVector.distance(firstContig, secondContig); + + //if the distance is too long, we should wait for more long reads coming in + if(!forceFill && d>SEARCH_THRES) + return null; + + //not doing it again + //FIXME: remove this when implement progressive taxa-typing based on long reads... 
+ if(bridgePath!=null) + return path2Connection(bridgePath); + + + for(Path p1:firstPathList) + for(Path p2:secondPathList){ + System.out.println("Trying to find path that connect " + firstContig.getName() + "("+ (firstContig.getRelDir()>0?"F":"R") + ")" + + " to " + secondContig.getName() + "("+ (secondContig.getRelDir()>0?"F":"R") + ")"); + System.out.print((firstContig.getRelDir()>0?p1:p1.rc()) + " =====> "); + System.out.println(secondContig.getRelDir()>0?p2:p2.rc()); + + + //because we go from left->right of a Scaffold when invoking Scaffold.viewSequence() + Node tip1=firstContig.getRelDir()>0?p1.getEnd():p1.rc().getEnd(), + tip2=secondContig.getRelDir()>0?p2.getStart():p2.rc().getStart(); + + // Find a path from tip1 -> tip2 with distance as close to d as possible + candidates.addAll(Contig.asGraph.DFS(tip1, tip2, d)); + } + Collections.sort(candidates); + + /** + * Using poa to find best candidate regarding long reads data. + */ + String bestMatch=null; + + if(candidates.isEmpty()) + return null; +// else if(candidates.size()==1) +// bridgePath=candidates.get(0); +// else{ +// ArrayList allSeq=new ArrayList(); +// Collections.sort(connections); +// allSeq.addAll(connections); +// for(Path p:candidates) +// allSeq.add(path2Connection(p)); +// +// ArrayList readList = new ArrayList(connections.size()); +// // locate the offset points on two contigs. 
Note: 1-based due to the fuking htsjdk.samtools +// int cutOnFirstContig=firstContig.getRelDir()>0?(firstContig.length()):1, +// cutOnSecondContig=secondContig.getRelDir()>0?1:(secondContig.length()); +// +// for (Connection connection:allSeq){ +// int firstCutOnRead=mapToRead(cutOnFirstContig, connection.firstAlignment), +// secondCutOnRead=mapToRead(cutOnSecondContig, connection.secondAlignment); +// Sequence tmp = null; +// try{ +// ReadFilling tmpRead = connection.read; +// if (firstCutOnRead > secondCutOnRead){ +// connection.read = connection.read.reverse(); +// connection.firstAlignment=connection.firstAlignment.reverseRead(); +// connection.secondAlignment=connection.secondAlignment.reverseRead(); +// firstCutOnRead = tmpRead.readSequence.length()-firstCutOnRead+1; +// secondCutOnRead = tmpRead.readSequence.length()-secondCutOnRead+1; +// } +// +// tmp = connection.read.readSequence.subSequence(firstCutOnRead-1, secondCutOnRead-1); +// tmp.setName(tmpRead.readSequence.getName()); +// tmp.setDesc(tmpRead.readSequence.getDesc()); +// readList.add(tmp); +// } +// catch(Exception e){ +// e.printStackTrace(); +// System.err.println("Failed attempt to extract (" + firstCutOnRead + ", " + secondCutOnRead +// + ") from sequence with length " + connection.read.readSequence.length()); +// } +// } +// +// try { +// +// String faiFile = hashKey + "_ai.fasta";//name of input fasta file +// String faoFile = hashKey + "_ao_pir.fasta";//name of output +// { +// SequenceOutputStream faiSt = SequenceOutputStream.makeOutputStream(faiFile); +// for (Sequence seq:readList){ +// Logging.info(seq.getName() + " " + seq.length()); +// seq.writeFasta(faiSt); +// } +// faiSt.close(); +// } +// +// //2.0 Run multiple alignment +// { +// String cmd = "/home/s.hoangnguyen/Tools/poaV2/poa -read_fasta " + faiFile + " -pir " + faoFile + " -hb -best /home/s.hoangnguyen/Tools/poaV2/blosum80.mat"; +// //String cmd = "/home/s.hoangnguyen/Tools/poaV2/poa -read_fasta " + faiFile + " -clustal 
clustal_" + faoFile + " -hb -best /home/s.hoangnguyen/Tools/poaV2/blosum80.mat"; +// +// +// Logging.info("Running " + cmd); +// Process process = Runtime.getRuntime().exec(cmd); +// process.waitFor(); +// Logging.info("Done " + cmd); +// } +// +// FastaReader reader = new FastaReader(faoFile); +// bestMatch=reader.nextSequence(Alphabet.DNA()).getName(); +// reader.close(); +// +// +// } catch (Exception e) { +// e.printStackTrace(); +// System.err.println("Can not generate consensus sequence!"); +// } +// for(Path p:candidates){ +// if(p.getID().equals(bestMatch)){ +// bridgePath=p; +// break; +// } +// +// } +// } + + + if(bestMatch==null){ + System.out.println("Not found a stand-out path! Pick the first one."); + bridgePath=candidates.get(0); + } + + //now make change to the transVector to fit the bridgePath + int newDistance=bridgePath.length-bridgePath.getStart().getSeq().length()-bridgePath.getEnd().getSeq().length(); //distance between two closest tips of two connecting Nodes + transVector.setMagnitute(transVector.getMagnitute()+(newDistance-d)*Integer.signum(firstContig.getRelDir())); + //check if this is not the close bridge of the scaffold + if(consistentWith(ScaffoldVector.composition(secondContig.myVector, ScaffoldVector.reverse(firstContig.myVector)))) + secondContig.myVector=ScaffoldVector.composition(transVector, firstContig.myVector); + return path2Connection(bridgePath); + + } + /** + * Get an artifact connection out of current bridgePath + * @return Connection: corresponding connection + */ + private Connection path2Connection(Path p){ + Node tip1=p.getStart(), + tip2=p.getEnd(); + int d=transVector.distance(firstContig, secondContig); + + + Connection retval=null; + if(!p.isEmpty()){ + // Convert bridgePath to a Connection + String readID=p.getID(); + Sequence seq=p.spelling(); + int refStart, refEnd, readLength=seq.length(), readStart, readEnd, score=Integer.MAX_VALUE; + boolean strand, useful=true; + + 
refStart=firstContig.length()-tip1.getSeq().length()+1; refEnd=firstContig.length(); + readStart=1; readEnd=tip1.getSeq().length(); + strand=true; + if(firstContig.getRelDir()<0){ + refStart=1; refEnd=tip1.getSeq().length(); + int tmp=readStart; + readStart=readEnd; + readEnd=tmp; + strand=false; + } + AlignmentRecord firstAlignment=new AlignmentRecord(readID, refStart, refEnd, readLength, readStart, readEnd, strand, useful, firstContig, score); + + refStart=1; refEnd=tip2.getSeq().length(); + readStart=readLength-tip2.getSeq().length()+1; readEnd=readLength; + strand=true; + if(secondContig.getRelDir()<0){ + refStart=secondContig.length()-tip2.getSeq().length()+1; refEnd=secondContig.length(); + int tmp=readStart; + readStart=readEnd; + readEnd=tmp; + strand=false; + } + AlignmentRecord secondAlignment=new AlignmentRecord(readID, refStart, refEnd, readLength, readStart, readEnd, strand, useful, secondContig, score); + ArrayList list = new ArrayList(); + list.add(firstAlignment); + list.add(secondAlignment); + ReadFilling read = new ReadFilling(seq,list); + + //now make change to the transVector to fit the bridgePath + int newDistance=p.length-tip1.getSeq().length()-tip2.getSeq().length(); //distance between two closest tips of two connecting Nodes + + retval=new Connection( read, firstAlignment, secondAlignment, + new ScaffoldVector(transVector.getDirection(), transVector.getMagnitute()+(newDistance-d)*Integer.signum(firstContig.getRelDir()))); + } + return retval; + + } + /** * @return the score */ public double getScore() { return score; } - - public void setScore(double s) { - score = s; - } + + //NOTE: magnitude usually doesn't help for bridges with repeat. + // E.g. 
<--===---------------> prev not next for the both public void setContigScores(){ + int firstPointer = 0, + secondPointer = 0; + if(transVector.magnitude < 0){ - firstContig.prevScore = score; + firstPointer=-1; if(transVector.direction < 0) - secondContig.prevScore = score; + secondPointer=-1; else - secondContig.nextScore = score; + secondPointer=1; + } + // special case: magnitude < firstContig.length() && transVector.direction < 0; + else if(transVector.magnitude < firstContig.length() && transVector.direction < 0){ + firstPointer = secondPointer = -1; } else{ + + firstPointer=1; + if(transVector.direction > 0) + secondPointer=-1; + else + secondPointer=1; + } + //reset based on the pointers + if(firstPointer > 0){ firstContig.nextScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set nextScore of %s to %.2f\n", firstContig.getName(), score); + }else{ + firstContig.prevScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set prevScore of %s to %.2f\n", firstContig.getName(), score); + } + + if(secondPointer > 0){ + secondContig.nextScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set nextScore of %s to %.2f\n", secondContig.getName(), score); + }else{ + secondContig.prevScore = score; + if(ScaffoldGraph.verbose) + System.out.printf("...set prevScore of %s to %.2f\n", secondContig.getName(), score); + } + + } + // when contig bridge is removed, reset the scores + public void resetContigScores(){ + int firstPointer = 0, + secondPointer = 0; + if(ScaffoldGraph.verbose) + System.out.print("Trans vector " + transVector + " :" ); + if(transVector.magnitude < 0){ + firstPointer=-1; + if(transVector.direction < 0) + secondPointer=-1; + else + secondPointer=1; + } + // special case: magnitude < firstContig.length() && transVector.direction < 0; + else if(transVector.magnitude < firstContig.length() && transVector.direction < 0){ + firstPointer = secondPointer = -1; + } + else{ + firstPointer=1; if(transVector.direction > 0) 
- secondContig.prevScore = score; + secondPointer=-1; else - secondContig.nextScore = score; + secondPointer=1; + } + //reset based on the pointers + if(firstPointer > 0){ + firstContig.nextScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("...reset nextScore of %s to 0, ", firstContig.getName()); + }else{ + firstContig.prevScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("...reset prevScore of %s to 0, ", firstContig.getName()); + } + + if(secondPointer > 0){ + secondContig.nextScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("reset nextScore of %s to 0\n", secondContig.getName()); + }else{ + secondContig.prevScore = .0; + if(ScaffoldGraph.verbose) + System.out.printf("reset prevScore of %s to 0\n", secondContig.getName()); } + } /** * @return the transVector @@ -152,27 +452,42 @@ public ScaffoldVector getTransVector() { public ArrayList getConnections() { return connections; } + /* + * @return the equivalent path + */ + public Path getBridgePath(){ + return bridgePath; + } public Connection fewestGapConnection() throws IOException{ - Collections.sort(connections); + if(ScaffoldGraph.verbose) + System.out.println("Finding best connection for bridge "+ hashKey + ":"); - //Find the best connections (has fewest gaps) - Connection gapsBestConnection = null; - int gapsBest = Integer.MAX_VALUE; - for (Connection connection:connections){ - int gapsBt = connection.gapsBetween(); - if (gapsBt < gapsBest){ - gapsBest = gapsBt; - gapsBestConnection = connection; - } - } + Connection gapsBestConnection = updatePath(); + if(gapsBestConnection==null){ + if(ScaffoldGraph.verbose) + System.out.println("Path not found! 
Use connection with fewest gaps instead..."); + //Collections.sort(connections); + //Find the best connections (has fewest gaps) + int gapsBest = Integer.MAX_VALUE; + for (Connection connection:connections){ + int gapsBt = connection.gapsBetween(); + if (gapsBt < gapsBest){ + gapsBest = gapsBt; + gapsBestConnection = connection; + } + } + } else if(ScaffoldGraph.verbose) + System.out.println("Found path("+bridgePath.length+"): "+bridgePath); + + if(ScaffoldGraph.verbose) + gapsBestConnection.display(); return gapsBestConnection; } /** * Try to connect contigs with consensus sequence from involved reads - * TODO optimized the code * @return * @throws IOException */ @@ -292,53 +607,60 @@ else if (idx >= rplStart+consensus.length()) * @param record * @return */ - static int positionOnRef(int posOnRead, AlignmentRecord record){ - if (posOnRead < record.readAlignmentStart() || posOnRead > record.readAlignmentEnd()) + static int positionOnRef(int readLookingPositon, AlignmentRecord record){ + if(ScaffoldGraph.verbose) + System.out.println("...locating position on reference of read's position " + readLookingPositon + + "(" + record.readAlignmentStart() + "," + record.readAlignmentEnd() + ")"); + if (readLookingPositon < record.readAlignmentStart() || readLookingPositon > record.readAlignmentEnd()) return 0; if (!record.strand) - posOnRead = record.readLength - posOnRead + 1; // use direction of ref (forward) + readLookingPositon = record.readLength - readLookingPositon + 1; // use direction of ref (forward) - int pos = record.strand?record.readStart:(record.readLength + 1 - record.readStart); + int posOnRead = record.strand?record.readStart:(record.readLength + 1 - record.readStart); int posOnRef = record.refStart; - //assert pos <= posOnRead - for (final CigarElement e : record.alignmentCigars) { - final int length = e.getLength(); - switch (e.getOperator()) { - case H : - case S : - case P : - break; // ignore pads and clips - case I : - //insert - if (pos + length < 
posOnRead){ - pos += length; - }else{ - return posOnRef; - } - break; - case M ://match or mismatch - case EQ://match - case X ://mismatch - if (pos + length < posOnRead){ - pos += length; - posOnRef += length; - }else{ - return posOnRef + posOnRead - pos; - } - break; - case D : - posOnRef += length; - break; - case N : - posOnRef += length; - break; - default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); - }//casse - }//for + if(record.alignmentCigars.isEmpty()){ //perfect alignment made by overlapped EDGES (when using assembly graph) + return posOnRef + readLookingPositon - posOnRead; + + }else{ + for (final CigarElement e : record.alignmentCigars) { + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + case S : + case P : + break; // ignore pads and clips + case I : + //insert + if (posOnRead + length < readLookingPositon){ + posOnRead += length; + }else{ + return posOnRef; + } + break; + case M ://match or mismatch + case EQ://match + case X ://mismatch + if (posOnRead + length < readLookingPositon){ + posOnRead += length; + posOnRef += length; + }else{ + return posOnRef + readLookingPositon - posOnRead; + } + break; + case D : + posOnRef += length; + break; + case N : + posOnRef += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//casse + }//for + } return 0; } /** @@ -522,8 +844,7 @@ public ReadFilling consensusRead() throws IOException{ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ int tS = 1, tE = firstContig.length(), fS, fE, tC, fC; - AlignmentRecord tAlign = new AlignmentRecord(), - fAlign = new AlignmentRecord(); + AlignmentRecord tAlign=null, fAlign=null; if (transVector.direction > 0){ fS = transVector.magnitude; fE = transVector.magnitude + secondContig.length(); @@ -633,7 +954,6 @@ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord 
ffAlign){ if (contig == fromContig) continue; - contig.addRange(record.refStart,record.refEnd,record.score); if (posReadEnd >= posReadFinal -1) //continue;//I can break here, but want to get portionUsed of other contigs break; @@ -743,7 +1063,7 @@ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ // first.readID = second.readID = -1; // first.strand=fromContig.getRelDir()>0; // second.strand = toContig.getRelDir()>0; -// // |--|------------|----> +// // |--|------------|----|> // // -----|--|-- --|----|----- // first.readLength = second.readLength = ligateToStart.length() + consensus.length() + ligateToEnd.length(); // @@ -762,6 +1082,10 @@ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ // } + /* + * Connections are linking structure retrieved by 2 contigs aligned to the same nanopore read + * each common read makes up a Connection + */ public class Connection implements Comparable{ ReadFilling read; String readID; @@ -769,8 +1093,9 @@ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ ScaffoldVector trans; AlignmentRecord firstAlignment, secondAlignment; - int distanceOnRead = 0; - + Connection(){ + + } Connection(ReadFilling mRead, AlignmentRecord a, AlignmentRecord b, ScaffoldVector trans){ this.read = mRead; this.readID = a.readID; @@ -782,19 +1107,16 @@ public Sequence fillConsensus(AlignmentRecord ttAlign, AlignmentRecord ffAlign){ score = aAlign * bAlign / (aAlign +bAlign); this.trans = trans; - - distanceOnRead = Math.max(Math.min(b.readStart, b.readEnd) - Math.max(a.readEnd,a.readStart), - Math.min(a.readStart, a.readEnd) - Math.max(b.readEnd,b.readStart)); + } void display (){ - System.out.printf("[%6d %6d] -> [%6d %6d] : [%6d %6d] -> [%6d %6d] (%s) score=%d Read %s ==> %d [%d]\n", + System.out.printf("[%6d %6d] -> [%6d %6d] : [%6d %6d] -> [%6d %6d] (%s) score=%d Read %s ==> %d\n", firstAlignment.refStart, firstAlignment.refEnd, secondAlignment.refStart, 
secondAlignment.refEnd, firstAlignment.readStart, firstAlignment.readEnd, secondAlignment.readStart, secondAlignment.readEnd, trans.toString(), score, read.readSequence.getName(), - trans.distance(firstContig, secondContig), - distanceOnRead); + trans.distance(firstContig, secondContig)); } /** @@ -846,7 +1168,7 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ int posReadFinal = toAlignment.readAlignmentStart();// I need as far as posReadFinal // locate the last position being extended... int lastExtendedPosition = posReadFinal; - if(posReadEnd > posReadFinal -1 ){ + if(posReadEnd >= posReadFinal ){ lastExtendedPosition = Math.min(posReadEnd,toAlignment.readAlignmentEnd()); return positionOnRef(lastExtendedPosition, toAlignment); } @@ -855,19 +1177,16 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ for (AlignmentRecord record:readFilling.alignments){ Contig contig = record.contig; - if (contig.getIndex() == fromContig.getIndex()) - continue; +// if (contig.getIndex() == fromContig.getIndex()) +// continue; - contig.portionUsed += (1.0 + record.refEnd - record.refStart) / contig.length(); - //contig.addRange(record.refStart,record.refEnd,record.score); if (posReadEnd >= posReadFinal -1) - continue;//I can break here, but want to get portionUsed of other contigs + break; - if (record.readAlignmentEnd() < posReadEnd) + if (record.readAlignmentEnd() <= posReadEnd) continue; - - //assert: posReadEnd < readEnd + if (record.readAlignmentStart() > posReadEnd){ //Really need to fill in using read information int newPosReadEnd = Math.min(posReadFinal - 1, record.readAlignmentStart() -1); @@ -881,13 +1200,16 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ anno.add(feature); seqBuilder.append(readFilling.readSequence.subSequence(posReadEnd, newPosReadEnd)); posReadEnd = newPosReadEnd; + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); + } if (posReadEnd + 1 >= 
posReadFinal) continue;//Done //Now get information on the contig from start if (contig.getIndex() == toContig.getIndex()) - continue;//could break + continue;//tandem if (record.strand){ int refLeft = record.refStart; int refRight = record.refEnd; @@ -898,14 +1220,23 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ }else{ posReadEnd = record.readAlignmentEnd(); } - + if(refLeft > refRight) + continue; + + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refLeft-1) + " to " + refRight + " out of " + contig.getName()); JapsaFeature feature = new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() + refRight - refLeft +1, "CONTIG",contig.getName(),'+',""); feature.addDesc(contig.getName() + "+("+(refLeft ) +"," + refRight+")"); anno.add(feature); + seqBuilder.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); }else{//neg strain int refRight = record.refStart; @@ -917,7 +1248,11 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ }else{ posReadEnd = record.readAlignmentEnd(); } + if(refLeft < refRight) + continue; + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refRight-1) + " to " + refLeft + " out of " + contig.getName()); JapsaFeature feature = new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() - refRight + refLeft +1, "CONTIG",contig.getName(),'+',""); @@ -925,12 +1260,16 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ anno.add(feature); seqBuilder.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); - + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); } }//if record.readAlignmentStart() > posReadEnd else{//Now get information on the 
contig from start if (contig.getIndex() == toContig.getIndex()) - continue;//could break + continue;//tandem if (record.strand){ int refLeft = positionOnRef(posReadEnd, record) + 1; int refRight = record.refEnd; @@ -941,7 +1280,11 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ }else{ posReadEnd = record.readAlignmentEnd(); } + if(refLeft > refRight) + continue; + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refLeft-1) + " to " + refRight + " out of " + contig.getName()); JapsaFeature feature = new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() + refRight - refLeft +1, "CONTIG",contig.getName(),'+',""); @@ -949,7 +1292,11 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ anno.add(feature); seqBuilder.append(contig.contigSequence.subSequence(refLeft - 1, refRight)); - + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); }else{//neg strand int refLeft = positionOnRef(posReadEnd, record) + 1; int refRight = record.refStart; @@ -960,7 +1307,11 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ }else{ posReadEnd = record.readAlignmentEnd(); } + if(refLeft < refRight) + continue; + if(ScaffoldGraph.verbose) + System.out.println("+++from " + (refRight-1) + " to " + refLeft + " out of " + contig.getName()); JapsaFeature feature = new JapsaFeature(seqBuilder.length() + 1, seqBuilder.length() - refRight + refLeft +1, "CONTIG",contig.getName(),'+',""); @@ -968,7 +1319,11 @@ public int filling(SequenceBuilder seqBuilder, JapsaAnnotation anno){ anno.add(feature); seqBuilder.append(Alphabet.DNA.complement(contig.contigSequence.subSequence(refRight - 1, refLeft))); - + //count the appearance by 1 more + ScaffoldGraph.oneMore(contig); + + if(ScaffoldGraph.verbose) + System.out.println("Append to fill: " + feature.getDesc()); } } } @@ -982,4 +1337,5 @@ public int 
compareTo(Connection o) { return o.score - score; } } + } diff --git a/src/main/java/japsa/bio/hts/scaffold/Edge.java b/src/main/java/japsa/bio/hts/scaffold/Edge.java new file mode 100644 index 0000000..ac7b42c --- /dev/null +++ b/src/main/java/japsa/bio/hts/scaffold/Edge.java @@ -0,0 +1,180 @@ +package japsa.bio.hts.scaffold; + +/** + * This class models an bidirected Edge in my Graph implementation. + * An Edge contains two vertices and a weight (distance between them). + * A certain edge (v1,v2) can take one among 4 types: ++, --, +- and -+. Each + * type corresponds to the way we read the DNA sequence in each read when traversing + * this edge. + * For example: v1->---<-v2 or (v1,v2)+- spells out (v1 v2') and/or (v2 v1') as in SPAdes output. + * This class also deviates from the expectations of the Comparable interface + * in that a return value of 0 does not indicate that this.equals(other). The + * equals() method only compares the vertices, while the compareTo() method + * compares the edge weights. This provides more efficient implementation for + * checking uniqueness of edges, as well as the fact that two edges of equal weight + * should be considered equitably in a path finding or spanning tree algorithm. + * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Edge implements Comparable { + + private Vertex one, two; + private boolean dOne, dTwo; + private int weight; + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex in the Edge + */ + public Edge(Vertex one, Vertex two, boolean d1, boolean d2){ + this(one, two, d1, d2, -127); + } + + /** + * + * @param one The first vertex in the Edge + * @param two The second vertex of the Edge + * @param weight The weight of this Edge + */ + public Edge(Vertex one, Vertex two, boolean dOne, boolean dTwo, int weight){ + //this.one = (one.getLabel().compareTo(two.getLabel()) <= 0) ? one : two; + //this.two = (this.one == one) ? 
two : one; + this.one=one; + this.two=two; + this.weight = weight; + this.dOne=dOne; + this.dTwo=dTwo; + } + + + /** + * + * @param current + * @return The neighbor of current along this Edge + */ + public Vertex getNeighbor(Vertex current){ + if(!(current.equals(one) || current.equals(two))){ + return null; + } + + return (current.equals(one)) ? two : one; + } + /** + * Return the same Edge but reading the other way around + * just swap the order of its vertices upside down + * @param + * @return the identical Edge + */ + public Edge getReversedRead(){ + return new Edge(this.two, this.one, !this.dTwo, !this.dOne, this.weight); + } + /** + * + * @param current + * @return The direction to spell *current* along this Edge + */ + public boolean getDirection(Vertex current){ + assert (current.equals(one) || current.equals(two)):"Vertex doesn't belong to this Edge!"; + + return (current.equals(one)) ? dOne : !dTwo; + } + + /** + * + * @return Vertex this.one + */ + public Vertex getOne(){ + return this.one; + } + + /** + * + * @return Vertex this.two + */ + public Vertex getTwo(){ + return this.two; + } + + /** + * + * @return boolean this.dOne + */ + public boolean getDOne(){ + return this.dOne; + } + + /** + * + * @return boolean this.dTwo + */ + public boolean getDTwo(){ + return this.dTwo; + } + /** + * + * @return int The weight of this Edge + */ + public int getWeight(){ + return this.weight; + } + + + /** + * + * @param weight The new weight of this Edge + */ + public void setWeight(int weight){ + this.weight = weight; + } + + + /** + * Note that the compareTo() method deviates from + * the specifications in the Comparable interface. A + * return value of 0 does not indicate that this.equals(other). 
+ * The equals() method checks the Vertex endpoints, while the + * compareTo() is used to compare Edge weights + * + * @param other The Edge to compare against this + * @return int this.weight - other.weight + */ + public int compareTo(Edge other){ + return this.weight - other.weight; + } + + /** + * + * @return String A String representation of this Edge + */ + public String toString(){ + return "({" + one + (dOne?"":"'") + ", " + two + (dTwo?"":"'") + "}, " + weight + ")"; + } + + /** + * + * @return int The hash code for this Edge + */ + public int hashCode(){ + return (one.getLabel() + (dOne?"":"'") + two.getLabel() + (dTwo?"":"'")).hashCode(); + } + + /** + * + * @param other The Object to compare against this + * @return true iff other is an Edge with the same Vertices as this + */ + public boolean equals(Object other){ + if(!(other instanceof Edge)){ + return false; + } + + Edge e = (Edge)other; + + return (e.one.equals(this.one) && e.two.equals(this.two) && (e.getDOne()==this.dOne) && (e.getDTwo()==this.dTwo)) + || (e.one.equals(this.two) && e.two.equals(this.one) && (e.getDOne()!=this.dOne) && (e.getDTwo()!=this.dTwo)); + } +} + + diff --git a/src/main/java/japsa/bio/hts/scaffold/Graph.java b/src/main/java/japsa/bio/hts/scaffold/Graph.java new file mode 100644 index 0000000..6559b1f --- /dev/null +++ b/src/main/java/japsa/bio/hts/scaffold/Graph.java @@ -0,0 +1,317 @@ +package japsa.bio.hts.scaffold; + +import java.io.IOException; +import java.util.*; + +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + + +/** + * This class models a simple, bidirected graph using an + * incidence list representation. Vertices are identified + * uniquely by their labels, and only unique vertices are allowed. + * At most one unique Edge per vertex pair is allowed in this Graph. 
+ * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Graph { + + private HashMap vertices; + private HashMap edges; + private int kmer; + + static final int TOLERATE=500; + + + public Graph(){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + this.kmer=127;//default kmer size used by SPAdes to assembly MiSeq data + } + + + public Graph(String graphFile) throws IOException{ + this(); + //1. next iterate over again to read the connections + SequenceReader reader = new FastaReader(graphFile); + Sequence seq; + int shortestLen = 10000; + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + if(seq.length() 1){ + String[] nbList = adjList[1].split(","); + for(int i=0; i < nbList.length; i++){ + // create list of bridges here (distance=-kmer overlapped) + String neighbor = nbList[i]; + boolean dir2=neighbor.contains("'")?false:true; + neighbor=neighbor.replaceAll("[^a-zA-Z0-9_.]", "").trim(); + + Vertex nbVertex=new Vertex(neighbor); + if(getVertex(nbVertex.getLabel())!=null) + nbVertex=getVertex(nbVertex.getLabel()); + + addVertex(nbVertex, false); + + addEdge(current, nbVertex, dir1, dir2); + } + } + + } + //rough estimation of kmer used + if((shortestLen-1) != getKmerSize()) + setKmerSize(shortestLen-1); + + reader.close(); + } + /** + * This constructor accepts an ArrayList and populates + * this.vertices. If multiple Vertex objects have the same label, + * then the last Vertex with the given label is used. 
+ * + * @param vertices The initial Vertices to populate this Graph + */ + public Graph(ArrayList vertices){ + this.vertices = new HashMap(); + this.edges = new HashMap(); + + for(Vertex v: vertices){ + this.vertices.put(v.getLabel(), v); + } + this.kmer=127;//default kmer size used by SPAdes to assembly MiSeq data + + } + + public int getKmerSize(){ + return this.kmer; + } + public void setKmerSize(int kmer){ + this.kmer=kmer; + } + /** + * This method adds am edge between Vertices one and two + * and their corresponding direction of weight kmer, + * if no Edge between these Vertices already exists in the Graph. + * + * @param one The first vertex to add + * @param two The second vertex to add + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @return true iff no Edge relating one and two exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2){ + return addEdge(one, two, d1, d2, -kmer); + } + + + /** + * Accepts two vertices, their directions and a weight, and adds the edge + * ({one, two}, {d1, d2}, weight) iff no Edge relating one and two + * exists in the Graph. 
+ * + * @param one The first Vertex of the Edge + * @param two The second Vertex of the Edge + * @param d1 The direction on the side of vertex one + * @param d2 The direction on the side of vertex two + * @param weight The weight of the Edge + * @return true iff no Edge already exists in the Graph + */ + public boolean addEdge(Vertex one, Vertex two, boolean d1, boolean d2, int weight){ + + //ensures the Edge is not in the Graph + Edge e = new Edge(one, two, d1, d2, weight); + if(edges.containsKey(e.hashCode()) || edges.containsKey(e.getReversedRead().hashCode())){ + return false; + } + + //and that the Edge isn't already incident to one of the vertices + else if(one.containsNeighbor(e) || two.containsNeighbor(e.getReversedRead())){ + return false; + } + + edges.put(e.hashCode(), e); + one.addNeighbor(e); + two.addNeighbor(e.getReversedRead()); + return true; + } + + /** + * + * @param e The Edge to look up + * @return true iff this Graph contains the Edge e + */ + public boolean containsEdge(Edge e){ + if(e.getOne() == null || e.getTwo() == null){ + return false; + } + + return this.edges.containsKey(e.hashCode()) + || this.edges.containsKey(e.getReversedRead().hashCode()); + } + + + /** + * This method removes the specified Edge from the Graph, + * including as each vertex's incidence neighborhood. 
+ * + * @param e The Edge to remove from the Graph + * @return Edge The Edge removed from the Graph + */ + public Edge removeEdge(Edge e){ + e.getOne().removeNeighbor(e); + e.getTwo().removeNeighbor(e.getReversedRead()); + Edge rmEdge = this.edges.remove(e.hashCode()); + if (rmEdge==null) + rmEdge = this.edges.remove(e.getReversedRead().hashCode()); + return rmEdge; + } + + /** + * + * @param vertex The Vertex to look up + * @return true iff this Graph contains vertex + */ + public boolean containsVertex(Vertex vertex){ + return this.vertices.get(vertex.getLabel()) != null; + } + + /** + * + * @param label The specified Vertex label + * @return Vertex The Vertex with the specified label + */ + public Vertex getVertex(String label){ + return vertices.get(label); + } + + /** + * This method adds a Vertex to the graph. If a Vertex with the same label + * as the parameter exists in the Graph, the existing Vertex is overwritten + * only if overwriteExisting is true. If the existing Vertex is overwritten, + * the Edges incident to it are all removed from the Graph. 
+ * + * @param vertex + * @param overwriteExisting + * @return true iff vertex was added to the Graph + */ + public boolean addVertex(Vertex vertex, boolean overwriteExisting){ + Vertex current = this.vertices.get(vertex.getLabel()); + if(current != null){ + if(!overwriteExisting){ + return false; + } + + while(current.getNeighborCount() > 0){ + this.removeEdge(current.getNeighbor(0)); + } + } + + + vertices.put(vertex.getLabel(), vertex); + return true; + } + + /** + * + * @param label The label of the Vertex to remove + * @return Vertex The removed Vertex object + */ + public Vertex removeVertex(String label){ + Vertex v = vertices.remove(label); + + while(v.getNeighborCount() > 0){ + this.removeEdge(v.getNeighbor((0))); + } + + return v; + } + + /** + * + * @return Set All Graph's Vertex objects + */ + public Set getVertices(){ + return new HashSet(this.vertices.values()); + } + + /** + * + * @return Set The Edges of this graph + */ + public Set getEdges(){ + return new HashSet(this.edges.values()); + } + + /** + * Find a path between two nodes within a given distance + */ + public ArrayList DFS(Node source, Node dest, int distance){ + System.out.println("Looking for path between " + source.toString() + " to " + dest.toString() + " with distance " + distance); + Path tmp = new Path(this); + ArrayList retval = new ArrayList(); + tmp.addNode(source); + + //traverse(tmp, dest, retval, distance+source.getSeq().length()+dest.getSeq().length()); + traverse(tmp, dest, retval, distance); + + return retval; + } + + public void traverse(Path path, Node dest, ArrayList curResult, int distance){ + Node source=path.getEnd(); + assert source!=null:"Path null fault!"; + + ArrayList nList = source.getVertex().getNeighbors(); + for(Edge e:nList){ + if(e.getDOne()==source.getDirection()){ + path.addNode(e.getTwo(), e.getDTwo()); + + if(e.getTwo()==dest.getVertex() && e.getDTwo()==dest.getDirection() && Math.abs(distance+getKmerSize()) < TOLERATE){ + + Path 
curPath=curResult.isEmpty()?new Path(this):curResult.get(0), //the best path saved among all possible paths from the list curResult + tmpPath=new Path(this); + tmpPath.setComp(path.getComp()); + tmpPath.setDeviation(Math.abs(distance+getKmerSize())); + if( Math.abs(distance+getKmerSize()) < curPath.getDeviation() ) + curResult.add(0, tmpPath); + else + curResult.add(tmpPath); + + System.out.println("Hit added: "+path+"(candidate deviation: "+Math.abs(distance+getKmerSize())+")"); + }else{ + int newDistance=distance-e.getTwo().getSequence().length()+getKmerSize(); + if (newDistance+getKmerSize()<-TOLERATE){ + System.out.println("Stop following path with distance "+newDistance+" already! : "+path); + }else + traverse(path, dest, curResult, newDistance); + } + path.removeLast(); + } + } + } +} + + diff --git a/src/main/java/japsa/bio/hts/scaffold/Node.java b/src/main/java/japsa/bio/hts/scaffold/Node.java new file mode 100644 index 0000000..fe14c8e --- /dev/null +++ b/src/main/java/japsa/bio/hts/scaffold/Node.java @@ -0,0 +1,34 @@ +package japsa.bio.hts.scaffold; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + + +public class Node{ + Vertex v; + boolean dir; + Node(Vertex v, boolean dir){ + this.v=v; + this.dir=dir; + } + public Vertex getVertex(){ + return v; + } + public void setVertex(Vertex v){ + this.v = v; + } + public boolean getDirection(){ + return dir; + } + public void setDirection(boolean dir){ + this.dir=dir; + } + public Node getRC(){ + return new Node(v,!dir); + } + public Sequence getSeq(){ + return dir?v.getSequence():Alphabet.DNA.complement(v.getSequence()); + } + public String toString(){ + return v.getLabel()+ (dir?"+":"-"); + } +} diff --git a/src/main/java/japsa/bio/hts/scaffold/Path.java b/src/main/java/japsa/bio/hts/scaffold/Path.java new file mode 100644 index 0000000..ac963f5 --- /dev/null +++ b/src/main/java/japsa/bio/hts/scaffold/Path.java @@ -0,0 +1,149 @@ +package japsa.bio.hts.scaffold; + +import java.util.ArrayList; + +import 
japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; + +public class Path implements Comparable{ + ArrayList nodes; + Graph graph; + int length, deviation; //how this path differ to long read data (todo: by multiple-alignment??) + public Path(){ + this.nodes=new ArrayList(); + graph=new Graph(); + length=0; + deviation=Integer.MAX_VALUE; + } + + public Path(Graph graph){ + this(); + associate(graph); + } + public Path(Path p){ + this(p.graph); + for(Node node:p.nodes) + this.nodes.add(node); + this.length=p.length; + } + /* + * @param String: a path as in contigs.paths of SPAdes output + * For example: 1+,2-,3+ + */ + public Path(Graph graph, String paths){ + this(graph); + paths=paths.replace(";", ""); //optimized it! + String[] comps = paths.split(","); + for(int i=0; i getComp(){ + return nodes; + } + public void setComp(ArrayList nodes){ + this.length=0; + for(Node node:nodes) + this.addNode(node); + } + + public Path rc(){ + Path retval=new Path(graph); + for(Node node:nodes){ + retval.nodes.add(0, node.getRC()); + } + return retval; + } + + public String toString(){ + return "P"+getID(); + } + public String getID(){ + String retval=""; + for(Node node:nodes){ + retval+=node.toString(); + } + return retval.trim(); + } + public Node removeLast(){ + Node retval=nodes.remove(nodes.size()-1); + length-=retval.getSeq().length()-graph.getKmerSize(); + return retval; + } + + public Sequence spelling(){ + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, this.toString()); + + for(int i=0;i= end){ - System.out.println("========FILLED=========="); - return 0; - } - - sortAlignment(); - - int gaps = 0; - int myStart = start; - //myEnd = end; - - for (AlignmentRecord record:alignments){ - if (record.readAlignmentEnd() < start) - continue; - - if (record.readAlignmentStart() > end) - break; - - System.out.printf("============== %5d -> %5d Contig %d: %5d %5d\n", - record.readAlignmentStart(), - record.readAlignmentEnd(), - 
record.contig.index, - record.strand?record.refStart:record.refEnd, - record.strand?record.refEnd:record.refStart - ); - - if (record.readAlignmentStart() > myStart) - gaps += record.readAlignmentStart() - myStart; - - myStart = Math.max(myStart, record.readAlignmentEnd()); - } - if (myStart < end) - gaps += end - myStart; - - System.out.printf("Left %d bases\n",gaps); - return gaps; - } } \ No newline at end of file diff --git a/src/main/java/japsa/bio/hts/scaffold/RealtimeScaffolding.java b/src/main/java/japsa/bio/hts/scaffold/RealtimeScaffolding.java index d74f9a0..80dbc18 100644 --- a/src/main/java/japsa/bio/hts/scaffold/RealtimeScaffolding.java +++ b/src/main/java/japsa/bio/hts/scaffold/RealtimeScaffolding.java @@ -1,14 +1,9 @@ package japsa.bio.hts.scaffold; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.SamInputResource; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; import java.io.File; import java.io.IOException; +import java.lang.ProcessBuilder.Redirect; import java.util.ArrayList; import java.util.Date; @@ -16,29 +11,211 @@ import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; -import japsa.util.Logging; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + + //Simulate fastq realtime generator: jsa.np.timeEmulate -i -output - public class RealtimeScaffolding { + private static final Logger LOG = LoggerFactory.getLogger(RealtimeScaffolding.class); + RealtimeScaffolder scaffolder; - public ScaffoldGraph graph; + public ScaffoldGraphDFS graph; int currentReadCount = 0; long currentBaseCount = 0; - - public RealtimeScaffolding(String 
seqFile, String output)throws IOException{ + + public RealtimeScaffolding(String seqFile, String genesFile, String resistFile, String isFile, String oriFile, String output)throws IOException, InterruptedException{ scaffolder = new RealtimeScaffolder(this, output); - graph = new ScaffoldGraphDFS(seqFile); + graph = new ScaffoldGraphDFS(seqFile, genesFile, resistFile, isFile, oriFile); } - + + + /** + * MDC tried to include BWA as part + * @param inFile + * @param readNumber + * @param timeNumber + * @param minCov + * @param qual + * @throws IOException + * @throws InterruptedException + */ + public void scaffolding2(String inFile, int readNumber, int timeNumber, double minCov, int qual, String format, String bwaExe, int bwaThread, String bwaIndex) + throws IOException, InterruptedException{ + scaffolder.setReadPeriod(readNumber); + scaffolder.setTimePeriod(timeNumber * 1000); + + LOG.info("Scaffolding ready at " + new Date()); + + //... + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = null; + + Process bwaProcess = null; + + if (format.endsWith("am")){//bam or sam + if ("-".equals(inFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(inFile)); + }else{ + LOG.info("Starting bwa at " + new Date()); + ProcessBuilder pb = null; + if ("-".equals(inFile)){ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", + "-K", + "20000", + bwaIndex, + "-" + ). 
+ redirectInput(Redirect.INHERIT); + }else{ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", + "-K", + "20000", + bwaIndex, + inFile + ); + } + + bwaProcess = pb.redirectError(ProcessBuilder.Redirect.to(new File("/dev/null"))).start(); + + LOG.info("bwa started x"); + + //SequenceReader seqReader = SequenceReader.getReader(inFile); + + //SequenceOutputStream + //outStrs = new SequenceOutputStream(bwaProcess.getOutputStream()); + //LOG.info("set up output from bwa"); + + //Start a new thread to feed the inFile into bwa input + //Thread thread = new Thread(){ + // public void run(){ + // Sequence seq; + // Alphabet dna = Alphabet.DNA16(); + // try { + // LOG.info("Thread to feed bwa started"); + // while ( (seq = seqReader.nextSequence(dna)) !=null){ + // seq.writeFasta(outStrs); + // } + // outStrs.close();//as well as signaling + // seqReader.close(); + // } catch (IOException e) { // + + // }finally{ + + // } + // } + //}; + + //thread.start(); + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(bwaProcess.getInputStream())); + + } + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + ArrayList samList = null;// alignment record of the same read; + + Thread thread = new Thread(scaffolder); + thread.start(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + if (rec.getReadUnmappedFlag() || rec.getMappingQuality() < qual){ + if (!readID.equals(rec.getReadName())){ + readID = rec.getReadName(); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + continue; + } + AlignmentRecord myRec = new AlignmentRecord(rec, graph.contigs.get(rec.getReferenceIndex())); + + if (readID.equals(myRec.readID)) { + + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + //...update with synchronized + synchronized(this.graph){ + 
graph.addBridge(readFilling, s, myRec, minCov); + //Collections.sort(graph.bridgeList); + } + } + } + } + } else { + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + synchronized(this){ + currentReadCount ++; + currentBaseCount += rec.getReadLength(); + } + } + + samList.add(myRec); + + }// while + scaffolder.stopWaiting(); + thread.join(); + iter.close(); + reader.close(); + + if (bwaProcess != null){ + bwaProcess.waitFor(); + } + + } + public void scaffolding(String bamFile, int readNumber, int timeNumber, double minCov, int qual) - throws IOException, InterruptedException{ + throws IOException, InterruptedException{ scaffolder.setReadPeriod(readNumber); scaffolder.setTimePeriod(timeNumber * 1000); - Logging.info("Scaffolding ready at " + new Date()); + LOG.info("Scaffolding ready at " + new Date()); //... SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); - + SamReader reader; if ("-".equals(bamFile)) reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); @@ -69,7 +246,7 @@ public void scaffolding(String bamFile, int readNumber, int timeNumber, double m AlignmentRecord myRec = new AlignmentRecord(rec, graph.contigs.get(rec.getReferenceIndex())); if (readID.equals(myRec.readID)) { - + if (myRec.useful){ for (AlignmentRecord s : samList) { if (s.useful){ @@ -90,15 +267,13 @@ public void scaffolding(String bamFile, int readNumber, int timeNumber, double m currentBaseCount += rec.getReadLength(); } } - + samList.add(myRec); }// while scaffolder.stopWaiting(); thread.join(); iter.close(); - - //outOS.close(); reader.close(); } @@ -108,12 +283,16 @@ public void scaffolding(String bamFile, int readNumber, int timeNumber, double m RealtimeScaffolder(RealtimeScaffolding scf, String output) throws IOException{ scaffolding = scf; outOS = SequenceOutputStream.makeOutputStream(output); - 
outOS.print("time\tstep\treads\tbases\tscaffolds\n"); } @Override protected void close() { - // TODO Auto-generated method stub + //if SPAdes assembly graph is involved + if(Contig.hasGraph()){ + ContigBridge.forceFilling(); + analysis(); + } + try{ outOS.close(); }catch (Exception e){ @@ -126,7 +305,7 @@ protected void analysis() { long step = (lastTime - startTime)/1000;//convert to second scaffolding.graph.connectBridges(); int scfCount = 0, - cirCount = 0; + cirCount = 0; for (int i = 0; i < scaffolding.graph.scaffolds.length;i++){ if (scaffolding.graph.scaffolds[i].size() > 0){ int len = scaffolding.graph.scaffolds[i].getLast().rightMost() - scaffolding.graph.scaffolds[i].getFirst().leftMost(); @@ -136,13 +315,19 @@ protected void analysis() { continue; } if (scaffolding.graph.contigs.get(i).head == i - && !ScaffoldGraph.isRepeat(scaffolding.graph.contigs.get(i)) - && len > ScaffoldGraph.maxRepeatLength) + && !ScaffoldGraph.isRepeat(scaffolding.graph.contigs.get(i)) + && len > ScaffoldGraph.maxRepeatLength) scfCount++; } } try { - outOS.print(timeNow + "\t" + step + "\t" + lastReadNumber + "\t" + scaffolding.currentBaseCount + "\t" + scfCount + "\t" + cirCount); + // This function is for the sake of real-time annotation experiments being more readable + //scaffolding.graph.printRT(scaffolding.currentBaseCount); + scaffolding.graph.printSequences(); + outOS.print("Time |\tStep |\tRead count |\tBase count|\tNumber of scaffolds|\tCircular scaffolds |\tN50 | \tBreaks (maxlen)\n"); + outOS.print(timeNow + " |\t" + step + " |\t" + lastReadNumber + " |\t" + scaffolding.currentBaseCount + " |\t" + scfCount + + " |\t" + cirCount + " |\t" + scaffolding.graph.getN50() + " |\t" + scaffolding.graph.getGapsInfo()); + outOS.println(); outOS.flush(); } catch (IOException e) { @@ -155,6 +340,6 @@ protected int getCurrentRead() { // TODO Auto-generated method stub return scaffolding.currentReadCount; } - + } } diff --git a/src/main/java/japsa/bio/hts/scaffold/Scaffold.java 
b/src/main/java/japsa/bio/hts/scaffold/Scaffold.java index e3269a2..41dec89 100644 --- a/src/main/java/japsa/bio/hts/scaffold/Scaffold.java +++ b/src/main/java/japsa/bio/hts/scaffold/Scaffold.java @@ -41,6 +41,8 @@ import japsa.seq.SequenceOutputStream; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; import java.util.LinkedList; import java.util.Iterator; import java.util.ListIterator; @@ -57,6 +59,7 @@ private static final long serialVersionUID = -4310125261868862931L; LinkedList bridges; int scaffoldIndex; + int len=-1; //boolean closed = false; /** * invariant: the direction of the decque is the same as the main (the longest one) @@ -83,10 +86,24 @@ public void setCloseBridge(ContigBridge bridge){ if(this.peekFirst().getIndex() == bridge.firstContig.getIndex()) last2first = ScaffoldVector.reverse(last2first); circle = ScaffoldVector.composition(last2first, circle); + +// change magnitute of vector to positive for convenience, a.k.a the direction of head contig + circle.magnitude = Math.abs(circle.magnitude); + //bridge.setContigScores(); //closed = true; } - + + /** + * Return the vector of a contig after move it forward or backward 1 circular length + * @param ScaffoldVector v of the contig + * @param boolean direction to move: true to move forward, false for backward (w.r.t. head contig) + * @return ScaffoldVector of contig after moving + */ + public ScaffoldVector rotate(ScaffoldVector v, boolean direction){ + return (direction && (v.getDirection()>0))?ScaffoldVector.composition(circle, v):ScaffoldVector.composition(ScaffoldVector.reverse(circle), v); + } + /** * Return 1 or -1 if the contig is at the first or last of the list. 
* Otherwise, return 0 @@ -116,8 +133,8 @@ public boolean isLast(Contig ctg){ * @param bridge */ public void addFront(Contig contig, ContigBridge bridge){ + assert bridge.firstContig.getIndex() == contig.getIndex(): "Front prob: "+ bridge.hashKey + " not connect " + contig.getIndex() + " and " + this.getFirst().getIndex(); this.addFirst(contig); - assert bridge.firstContig.getIndex() == contig.getIndex(): "Front: "+ bridge.hashKey + " " + contig.getIndex(); bridges.addFirst(bridge); if(ScaffoldGraph.verbose) System.out.printf("...adding contig %d to scaffold %d backward!\n", contig.getIndex(), scaffoldIndex); @@ -130,8 +147,8 @@ public void addFront(Contig contig, ContigBridge bridge){ * @param bridge */ public void addRear(Contig contig, ContigBridge bridge){ + assert bridge.secondContig.getIndex() == contig.getIndex():"Rear prob: "+ bridge.hashKey + " not connect " + this.getLast().getIndex() + " and " + contig.getIndex(); this.addLast(contig); - assert bridge.secondContig.getIndex() == contig.getIndex():"Rear: "+ bridge.hashKey + " " + contig.getIndex(); bridges.addLast(bridge); if(ScaffoldGraph.verbose) System.out.printf("...adding contig %d to scaffold %d forward!\n", contig.getIndex(), scaffoldIndex); @@ -178,7 +195,7 @@ public Contig nearestMarker(Contig ctg, boolean forward){ return marker; } - + //reset prevScore or nextScore to 0 according to removed bridges. 
public void trim(){ if(ScaffoldGraph.verbose) System.out.println("Trimming scaffold: " + scaffoldIndex); @@ -190,7 +207,6 @@ public void trim(){ if(ScaffoldGraph.verbose) System.out.println("...removing contig " + rightmost.getIndex()); this.removeLast(); - //bridges.removeLast(); rightmost=this.peekLast(); } @@ -205,14 +221,15 @@ public void trim(){ else{ if(ScaffoldGraph.verbose) System.out.println("...removing bridge " + bridges.peekLast().hashKey); - bridges.removeLast(); + //bridges.peekLast(); + bridges.removeLast().resetContigScores(); } } if(bridges.size() > 1){ if(bridges.get(bridges.size() - 2).isContaining(rightmost)){ if(ScaffoldGraph.verbose) System.out.println("...removing bridge " + bridges.peekLast().hashKey); - this.bridges.removeLast(); + bridges.removeLast().resetContigScores(); } } @@ -222,7 +239,6 @@ public void trim(){ if(ScaffoldGraph.verbose) System.out.println("...removing contig " + leftmost.getIndex()); this.removeFirst(); - //bridges.removeFirst(); leftmost=this.peekFirst(); } if(this.size() <=1){ @@ -236,14 +252,15 @@ public void trim(){ else{ if(ScaffoldGraph.verbose) System.out.println("...removing bridge " + bridges.peekFirst().hashKey); - bridges.removeFirst(); + //bridges.peekFirst(); + bridges.removeFirst().resetContigScores(); } } if(bridges.size() > 1){ if(bridges.get(1).isContaining(leftmost)){ if(ScaffoldGraph.verbose) - System.out.println("...removing bridge " + bridges.peekFirst().hashKey); - this.bridges.removeFirst(); + System.out.println("...removing bridge " + bridges.peekFirst().hashKey); + bridges.removeFirst().resetContigScores(); } } @@ -278,15 +295,21 @@ public void view(){ } /** - * FIXME: return the length of this scaffold - * May maintain a length variable, and update it every time the scaffold is bridged + * Return the length of this scaffold * Check out quast (https://github.com/ablab/quast) */ public int length(){ - return 0; + if(isEmpty()) + return 0; + if(len > 0) + return len; + int len = 
getLast().rightMost() - getFirst().leftMost(); + if(circle!=null) + len = circle.getMagnitute(); + return len; } - public void viewSequence(SequenceOutputStream out) throws IOException{ + public synchronized void viewSequence(SequenceOutputStream fout, SequenceOutputStream jout) throws IOException{ System.out.println("========================== START ============================="); @@ -306,6 +329,16 @@ public void viewSequence(SequenceOutputStream out) throws IOException{ System.out.println("Size = " + size() + " sequence"); + // Synchronize positions of 2 contigs (myVector) of a bridge based on the real list of contigs + // TODO: do the same with viewAnnotation() + assert this.size()==bridges.size()+1:"Number of contigs ("+this.size()+")" + " doesn't agree with number of bridges ("+bridges.size()+"!"; + for(int i=0;i | | leftContig | | | | rightContig * Contigs: ... ~~~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*~~~ ... ~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * startLeft endLeft + * startLeft(1) endLeft(1) startLeft(2) ... * * that's what happens below! 
* @@ -328,13 +361,25 @@ public void viewSequence(SequenceOutputStream out) throws IOException{ rightContig = getFirst(); //startLeft: the leftPoint of leftContig, endLeft: rightPoint of left Contig - int startLeft = (rightContig.getRelDir() > 0)?1:rightContig.length(); //starting point after the last fillFrom - int endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + int startLeft = (rightContig.getRelDir() > 0)?0:rightContig.length(); //starting point after the last fillFrom + int endLeft = (rightContig.getRelDir() < 0)?0:rightContig.length(); + //TODO: re-check the coordinate of two ends (0 or 1, inclusive/exclusive) +// int startLeft = (rightContig.getRelDir() > 0)?1:rightContig.length(); //starting point after the last fillFrom +// int endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + + /* uncomment for illumina-based */ + //int closeDis = 0; if (closeBridge != null){ bestCloseConnection = closeBridge.fewestGapConnection(); leftContig = closeBridge.firstContig; - startLeft = bestCloseConnection.filling(null, null); //adjust the starting point + /* uncomment for longread-based */ + startLeft = bestCloseConnection.filling(null, null); + /* uncomment for illumina-based */ +// closeDis =closeBridge.getTransVector().distance(closeBridge.firstContig, closeBridge.secondContig); +// if(closeDis > 0) +// startLeft = bestCloseConnection.filling(null, null); //adjust the starting point anno.addDescription("Circular"); @@ -342,49 +387,78 @@ public void viewSequence(SequenceOutputStream out) throws IOException{ anno.addDescription("Linear"); - bridIter = bridges.iterator(); Iterator ctgIter = this.iterator(); leftContig = ctgIter.next();//The first for (ContigBridge bridge:bridges){ - //System.out.println("------------------------------------ START ------------------------------------"); rightContig = ctgIter.next(); - ContigBridge.Connection connection = bridge.fewestGapConnection(); - - endLeft = 
(leftContig.getRelDir()>0)?(connection.firstAlignment.refEnd): - (connection.firstAlignment.refStart); - + /* uncomment for longread-based */ + endLeft = (leftContig.getRelDir()>0)?(connection.firstAlignment.refEnd): + (connection.firstAlignment.refStart); + /* uncomment for illumina-based */ +// int distance = bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); +// if(distance < 0){ +// endLeft = (leftContig.getRelDir()>0)?(leftContig.length()-Math.abs(distance)):Math.abs(distance); +// }else{ +// endLeft = (leftContig.getRelDir()>0)?(connection.firstAlignment.refEnd): +// (connection.firstAlignment.refStart); +// } + if (startLeft 0)?1:rightContig.length(); +// }else{ +// //Fill in the connection +// startLeft = connection.filling(seq, anno); +// } leftContig = rightContig; }//for //leftContig = lastContig in the queue if (bestCloseConnection != null){ + /* uncomment for longread-based */ endLeft = (leftContig.getRelDir()>0)?(bestCloseConnection.firstAlignment.refEnd): - (bestCloseConnection.firstAlignment.refStart); + (bestCloseConnection.firstAlignment.refStart); + /* uncomment for illumina-based */ +// if(closeDis > 0) +// endLeft = (leftContig.getRelDir()>0)?(bestCloseConnection.firstAlignment.refEnd): +// (bestCloseConnection.firstAlignment.refStart); +// else{ +// endLeft = (rightContig.getRelDir() < 0)?Math.abs(closeDis):rightContig.length()-Math.abs(closeDis); +// } } else endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); @@ -393,28 +467,189 @@ public void viewSequence(SequenceOutputStream out) throws IOException{ JapsaFeature feature = new JapsaFeature(seq.length() + 1, seq.length() + endLeft - startLeft, "CONTIG",leftContig.getName(),'+',""); - feature.addDesc(leftContig.getName() + "+("+startLeft +"," + endLeft+")"); + feature.addDesc(leftContig.getName() + "+["+startLeft +"," + endLeft+")"); anno.add(feature); - seq.append(leftContig.contigSequence.subSequence(startLeft - 1, endLeft)); - leftContig.portionUsed += 
(1.0 + endLeft - startLeft + 1) / leftContig.length(); + if(ScaffoldGraph.verbose) + System.out.println("Append " + leftContig.getName() + ": " + startLeft + " to " + (endLeft-1)); + + seq.append(leftContig.contigSequence.subSequence(startLeft, endLeft)); + + //seq.append(leftContig.contigSequence.subSequence(startLeft - 1, endLeft)); }else{ JapsaFeature feature = new JapsaFeature(seq.length() + 1, seq.length() + startLeft - endLeft, "CONTIG",leftContig.getName(),'+',""); - feature.addDesc(leftContig.getName() + "-("+endLeft +"," + startLeft+")"); + feature.addDesc(leftContig.getName() + "-["+endLeft +"," + startLeft+")"); anno.add(feature); + + if(ScaffoldGraph.verbose) + System.out.println("Append RC of " + leftContig.getName() + ": " + endLeft + " to " + (startLeft-1)); + seq.append(Alphabet.DNA.complement(leftContig.contigSequence.subSequence(endLeft, startLeft))); - seq.append(Alphabet.DNA.complement(leftContig.contigSequence.subSequence(endLeft - 1, startLeft))); - leftContig.portionUsed += (1.0 - endLeft + startLeft + 1) / leftContig.length(); + //seq.append(Alphabet.DNA.complement(leftContig.contigSequence.subSequence(endLeft - 1, startLeft))); } + //count the appearance by 1 more + ScaffoldGraph.oneMore(leftContig); + if (bestCloseConnection != null){ System.out.printf("Append bridge %d -- %d\n",closeBridge.firstContig.index, closeBridge.secondContig.index); + /* uncomment for longread-based */ bestCloseConnection.filling(seq, anno); + /* uncomment for illumina-based */ +// if(closeDis >0 ) +// bestCloseConnection.filling(seq, anno); } System.out.println("============================ END ==========================="); - //JapsaAnnotation.write(seq.toSequence(), anno, out); //uncomment this line and comment next line for debug - seq.writeFasta(out); + len = seq.length(); + JapsaAnnotation.write(seq.toSequence(), anno, jout); + seq.writeFasta(fout); + } + /* Output annotation of this scaffold + * TODO: output annotations from the filling sequences + */ + 
public synchronized void viewAnnotation(SequenceOutputStream out) throws IOException{ + + SequenceBuilder seq = new SequenceBuilder(Alphabet.DNA16(), 1024*1024, "Scaffold" + scaffoldIndex); + JapsaAnnotation anno = new JapsaAnnotation(); + + ContigBridge.Connection bestCloseConnection = null; + Contig leftContig, rightContig; +/* + * Nanopore reads: + * ==================================== ========================================== + * | | | | | | + * | | | | | | + * | | leftContig | | | | rightContig + * Contigs: ... ~~~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*~~~ ... ~~~*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * startLeft endLeft + * + * that's what happens below! + * + */ + + rightContig = getFirst(); + + //startLeft: the leftPoint of leftContig, endLeft: rightPoint of left Contig + int startLeft = (rightContig.getRelDir() > 0)?1:rightContig.length(); //starting point after the last fillFrom + int endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + if (closeBridge != null){ + bestCloseConnection = closeBridge.fewestGapConnection(); + leftContig = closeBridge.firstContig; + startLeft = bestCloseConnection.filling(null, null); + + anno.addDescription("Circular"); + + }else + anno.addDescription("Linear"); + + Iterator ctgIter = this.iterator(); + leftContig = ctgIter.next();//The first + + JapsaFeature lastGene=null; + if(leftContig.resistanceGenes.size()>0) + lastGene = startLeft0)?(connection.firstAlignment.refEnd): + (connection.firstAlignment.refStart); + + /**********************************************************************************************/ + ArrayList resistLeft = leftContig.getFeatures(leftContig.resistanceGenes, startLeft, endLeft), + insertLeft = leftContig.getFeatures(leftContig.insertSeq, startLeft, endLeft), + oriLeft = leftContig.getFeatures(leftContig.oriRep, startLeft, endLeft); + + for(JapsaFeature resist:resistLeft){ + resist.setStart(resist.getStart()+seq.length()); + resist.setEnd(resist.getEnd()+seq.length()); + 
anno.add(resist); + } + Collections.sort(resistLeft); + if(resistLeft.size() > 0){ + JapsaFeature leftGene = startLeftendLeft?resistLeft.get(0):resistLeft.get(resistLeft.size()-1); + //extract first and last gene here + if(lastGene!=null) + System.out.println(lastGene.getID() + "...next to..." + leftGene.getID()); + lastGene = rightGene; + } + for(JapsaFeature insert:insertLeft){ + insert.setStart(insert.getStart()+seq.length()); + insert.setEnd(insert.getEnd()+seq.length()); + anno.add(insert); + } + for(JapsaFeature ori:oriLeft){ + ori.setStart(ori.getStart()+seq.length()); + ori.setEnd(ori.getEnd()+seq.length()); + anno.add(ori); + } + /**********************************************************************************************/ + + if (startLeft0)?(bestCloseConnection.firstAlignment.refEnd): + (bestCloseConnection.firstAlignment.refStart); + + } + else + endLeft = (rightContig.getRelDir() < 0)?1:rightContig.length(); + + /**********************************************************************************************/ + ArrayList resistLeft = leftContig.getFeatures(leftContig.resistanceGenes, startLeft, endLeft), + insertLeft = leftContig.getFeatures(leftContig.insertSeq, startLeft, endLeft), + oriLeft = leftContig.getFeatures(leftContig.oriRep, startLeft, endLeft); + + for(JapsaFeature resist:resistLeft){ + resist.setStart(resist.getStart()+seq.length()); + resist.setEnd(resist.getEnd()+seq.length()); + anno.add(resist); + } + if(resistLeft.size() > 0){ + JapsaFeature leftGene = startLeft contigs; - //int nScaffolds=0; + public static boolean reportAll = false; + public boolean annotation = false; + public static HashMap countOccurence; + + public String prefix = "out"; public static double estimatedCov = 0; - double estimatedLength = 0; - //ArrayList bridgeList = new ArrayList(); - HashMap bridgeMap= new HashMap(); + private static double estimatedLength = 0; + //below maps contain avatar of contigs and bridges only, + //not the actual ones in used (because 
of the repeats that need to be cloned) + ArrayList contigs; + HashMap bridgeMap= new HashMap(); + static HashMap> bridgesFromContig = new HashMap>(); + + Scaffold [] scaffolds; // DNA translator, previous image of sequence is stored for real-time processing + // Constructor for the graph with contigs FASTA file (contigs.fasta from SPAdes output) public ScaffoldGraph(String sequenceFile) throws IOException{ //1. read in contigs SequenceReader reader = SequenceReader.getReader(sequenceFile); @@ -93,35 +106,287 @@ public ScaffoldGraph(String sequenceFile) throws IOException{ ctg.setCoverage(mycov); contigs.add(ctg); + bridgesFromContig.put(ctg.getIndex(), new ArrayList()); index ++; } reader.close(); - + estimatedCov /= estimatedLength; if(verbose) System.out.println("Cov " + estimatedCov + " Length " + estimatedLength); //2. Initialise scaffold graph - scaffolds = new Scaffold[contigs.size()]; - //nScaffolds = contigs.size(); - //head = new int[contigs.size()]; - + scaffolds = new Scaffold[contigs.size()]; + for (int i = 0; i < contigs.size();i++){ scaffolds[i] = new Scaffold(contigs.get(i)); - //head[i] = i;//pointer contig -> scaffold //point to the head of the scaffold contigs.get(i).head = i; }//for + + }//constructor - - - public int n50(){ - return 0; + + /* Read short-read assembly information from SPAdes output: assembly graph (assembly_graph.fastg) and + ** traversed paths (contigs.pahth) to make up the contigs + */ + public void readMore(String assemblyGraph, String paths) throws IOException{ + //1. Read assembly graph and store in a string graph + Graph g = new Graph(assemblyGraph); + + // for(Vertex v:g.getVertices()){ + // System.out.println("Neighbors of vertex " + v.getLabel() + " (" + v.getNeighborCount() +"):"); + // for(Edge e:v.getNeighbors()) + // System.out.println(e + "; "); + // System.out.println(); + // } + + Contig.setGraph(g); + + //2. 
read file contigs.paths from SPAdes + BufferedReader pathReader = new BufferedReader(new FileReader(paths)); + + String s; + //Read contigs from contigs.paths and refer themselves to contigs.fasta + Contig curContig = null; + while((s=pathReader.readLine()) != null){ + if(s.contains("NODE")) + curContig=getSPadesContig(s); + else if(curContig!=null) + curContig.setPath(new Path(g,s)); + + } + pathReader.close(); + + } + + public Contig getSPadesContig(String name){ + if(name.contains("'")){ + if(verbose) + System.out.println("Ignored (redundant) reversed sequence: " + name); + return null; + } + + Contig res = null; + + for(Contig ctg:contigs){ + // Extract to find contig named NODE_x_ + //because sometimes there are disagreement between contig name (_length_) in contigs.paths and contigs.fasta in SPAdes!!! + // + if(ctg.getName().contains("NODE_"+name.split("_")[1]+"_")){ + res = ctg; + break; + } + } + + if(res==null && verbose){ + System.out.println("Contig not found:" + name); + } + + return res; + } + + public synchronized int getN50(){ + int [] lengths = new int[scaffolds.length]; + int count=0; + double sum = 0; + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + int len = scaffolds[i].length(); + if ((contigs.get(i).head == i + && !isRepeat(contigs.get(i)) + && len > maxRepeatLength + ) + || scaffolds[i].closeBridge != null) + { + lengths[count] = len; + sum+=len; + count++; + } + } + + Arrays.sort(lengths); + + int index = lengths.length; + double contains = 0; + while (contains < sum/2){ + index --; + contains += lengths[index]; + } + + return lengths[index]; } + public synchronized String getGapsInfo(){ + int gapCount=0, + gapMaxLen=0; + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + int len = scaffolds[i].length(); + if ((contigs.get(i).head == i + && !isRepeat(contigs.get(i)) + && len > maxRepeatLength + ) + || scaffolds[i].closeBridge != null) + { + for(ContigBridge 
brg:scaffolds[i].bridges){ + if(brg.getBridgePath()==null){ + gapCount++; + if(brg.getTransVector().distance(brg.firstContig, brg.secondContig) > gapMaxLen) + gapMaxLen=brg.getTransVector().distance(brg.firstContig, brg.secondContig); + } + } + if(scaffolds[i].closeBridge!=null){ + ContigBridge brg=scaffolds[i].closeBridge; + if(brg.getBridgePath()==null){ + gapCount++; + if(brg.getTransVector().distance(brg.firstContig, brg.secondContig) > gapMaxLen) + gapMaxLen=brg.getTransVector().distance(brg.firstContig, brg.secondContig); + } + } + + } + } + + + return gapCount+" ("+gapMaxLen+")"; + } /** - * Make connections between any two uniquely (non-repeat) contigs + * MDC added second version that include bwa + * @param bamFile + * @param minCov + * @param qual + * @throws IOException + * @throws InterruptedException + */ + public void makeConnections2(String inFile, double minCov, int qual, String format, String bwaExe, int bwaThread, String bwaIndex) throws IOException, InterruptedException{ + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + + SamReader reader = null; + Process bwaProcess = null; + + if (format.endsWith("am")){//bam or sam + if ("-".equals(inFile)) + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + else + reader = SamReaderFactory.makeDefault().open(new File(inFile)); + }else{ + Logging.info("Starting bwa at " + new Date()); + + ProcessBuilder pb = null; + if ("-".equals(inFile)){ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", +// "-K", +// "20000", + bwaIndex, + "-" + ). 
+ redirectInput(Redirect.INHERIT); + }else{ + pb = new ProcessBuilder(bwaExe, + "mem", + "-t", + "" + bwaThread, + "-k11", + "-W20", + "-r10", + "-A1", + "-B1", + "-O1", + "-E1", + "-L0", + "-a", + "-Y", +// "-K", +// "20000", + bwaIndex, + inFile + ); + } + bwaProcess = pb.redirectError(ProcessBuilder.Redirect.to(new File("/dev/null"))).start(); + + //Logging.info("bwa started x"); + reader = SamReaderFactory.makeDefault().open(SamInputResource.of(bwaProcess.getInputStream())); + } + + + //SamReader reader; + //if ("-".equals(bamFile)) + // reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); + //else + // reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + SAMRecordIterator iter = reader.iterator(); + + String readID = ""; + ReadFilling readFilling = null; + ArrayList samList = null;// alignment record of the same read; + + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + if (rec.getReadUnmappedFlag()) + continue; + if (rec.getMappingQuality() < qual) + continue; + + AlignmentRecord myRec = new AlignmentRecord(rec, contigs.get(rec.getReferenceIndex())); + + + ////////////////////////////////////////////////////////////////// + // make bridge of contigs that align to the same (Nanopore) read. + // Note that SAM file MUST be sorted based on readID (samtools sort -n) + + //not the first occurrance + if (readID.equals(myRec.readID)) { + if (myRec.useful){ + for (AlignmentRecord s : samList) { + if (s.useful){ + this.addBridge(readFilling, s, myRec, minCov); //stt(s) < stt(myRec) -> (s,myRec) appear once only! 
+ //...update with synchronized + + } + } + } + } else { + + samList = new ArrayList(); + readID = myRec.readID; + readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); + } + samList.add(myRec); + + }// while + iter.close(); + + //outOS.close(); + reader.close(); + if (bwaProcess != null){ + bwaProcess.waitFor(); + } + + Logging.info("Sort list of bridges"); + //Collections.sort(bridgeList); + } + + + + /** + * Forming bridges based on alignments * * @param bamFile * @param minCov @@ -130,9 +395,9 @@ public int n50(){ * @param qual * @throws IOException */ - public void makeConnections(String bamFile, double minCov, int qual, SequenceOutputStream connectStr, SequenceOutputStream statStr) throws IOException{ + public void makeConnections(String bamFile, double minCov, int qual) throws IOException{ SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); - + SamReader reader; if ("-".equals(bamFile)) reader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); @@ -143,11 +408,8 @@ public void makeConnections(String bamFile, double minCov, int qual, SequenceOut String readID = ""; ReadFilling readFilling = null; - ArrayList samList = null;// alignment record of the same read; - BitSet bitSet =null; - int readScore = 0, readLength = 0; - if(statStr != null) - statStr.print("#readID\tlength\tcovered\tscore\n"); + ArrayList samList = null;// alignment record of the same read; + while (iter.hasNext()) { SAMRecord rec = iter.next(); if (rec.getReadUnmappedFlag()) @@ -157,49 +419,28 @@ public void makeConnections(String bamFile, double minCov, int qual, SequenceOut AlignmentRecord myRec = new AlignmentRecord(rec, contigs.get(rec.getReferenceIndex())); - ////////////////////////////////////////////////////////////////// - // Just to save this - if (connectStr != null && readID.equals(myRec.readID)) { - for (AlignmentRecord s : samList) { - connectStr.print(myRec.contig.index + " " + 
s.contig.index + " " + readID + " " + myRec.useful + " " + s.useful + " " + myRec.pos() + " " + s.pos()); - connectStr.println(); - connectStr.print(s.contig.index + " " + myRec.contig.index + " " + readID + " " + s.useful + " " + myRec.useful + " " + s.pos() + " " + myRec.pos()); - connectStr.println(); - } - } + ////////////////////////////////////////////////////////////////// // make bridge of contigs that align to the same (Nanopore) read. // Note that SAM file MUST be sorted based on readID (samtools sort -n) - + //not the first occurrance if (readID.equals(myRec.readID)) { if (myRec.useful){ for (AlignmentRecord s : samList) { if (s.useful){ - this.addBridge(readFilling, s, myRec, minCov); + this.addBridge(readFilling, s, myRec, minCov); //stt(s) < stt(myRec) -> (s,myRec) appear once only! //...update with synchronized - + } } } } else { - //samList.clear(); - if(statStr != null && bitSet !=null) - statStr.print(readID + "\t" + readLength + "\t" + bitSet.cardinality() + "\t" + readScore + "\n"); - bitSet = new BitSet(myRec.readLength); - readScore = 0; - readLength = 0; - + samList = new ArrayList(); readID = myRec.readID; readFilling = new ReadFilling(new Sequence(Alphabet.DNA5(), rec.getReadString(), "R" + readID), samList); - } - bitSet.set(myRec.readAlignmentStart(), myRec.readAlignmentEnd()); - //statStr.print(readID + "\t" + myRec.readAlignmentStart() + "->" + myRec.readAlignmentEnd() + ": " + myRec.refStart + "->" + myRec.refEnd + "\n"); - - readScore += myRec.score; - readLength = myRec.readLength; - + } samList.add(myRec); }// while @@ -239,18 +480,22 @@ protected void addBridge(ReadFilling readSequence, AlignmentRecord a, AlignmentR && (Math.abs(gP)*1.0 / a.contig.length()) < 1.1 && (Math.abs(gP)*1.0 / a.contig.length()) > 0.9 && a.readLength < 1.1* a.contig.length() - ) + ) { if(verbose) System.out.printf("Potential CIRCULAR or TANDEM contig %s map to read %s(length=%d): (%d,%d)\n" - , a.contig.getName(), a.readID, a.readLength, gP, alignD); - 
a.contig.isCircular = true; + , a.contig.getName(), a.readID, a.readLength, gP, alignD); + a.contig.cirProb ++; } + else{ + a.contig.cirProb--; + b.contig.cirProb--; + } // overlap length on aligned read (<0 if not overlap) int overlap = Math.min( a.readAlignmentEnd() - b.readAlignmentStart(), b.readAlignmentEnd() - a.readAlignmentStart()); if ( overlap > Math.min( .5 * Math.min(a.readAlignmentEnd()-a.readAlignmentStart(), b.readAlignmentEnd()-b.readAlignmentStart()), - minContigLength) + minContigLength) || a.contig.getCoverage() < minCov // filter out contigs with inappropriate cov || b.contig.getCoverage() < minCov ){ @@ -259,75 +504,88 @@ protected void addBridge(ReadFilling readSequence, AlignmentRecord a, AlignmentR ScaffoldVector trans = new ScaffoldVector(gP, alignD); - int bridgeID = 0; + int count = 0; ContigBridge bridge, bridge_rev; while (true){ - String hash = ContigBridge.makeHash(a.contig.index, b.contig.index, bridgeID), - hash_rev = ContigBridge.makeHash(b.contig.index, a.contig.index, bridgeID); - if(a.contig.getIndex()==b.contig.getIndex()) - hash_rev = ContigBridge.makeHash(b.contig.index, a.contig.index, ++bridgeID); + int brgID = count, revID = count; + if(a.contig.getIndex()==b.contig.getIndex()){ + brgID = 2*count; + revID = brgID+1; + } + String hash = ContigBridge.makeHash(a.contig.index, b.contig.index, brgID), + hash_rev = ContigBridge.makeHash(b.contig.index, a.contig.index, revID); + bridge = bridgeMap.get(hash); bridge_rev = bridgeMap.get(hash_rev); if (bridge == null){ assert bridge_rev==null:hash_rev + " not null!"; - bridge = new ContigBridge(a.contig, b.contig, bridgeID); - bridge.addConnection(readSequence, a, b, trans, score); + bridge = new ContigBridge(a.contig, b.contig, brgID); + bridge_rev = new ContigBridge(b.contig, a.contig, revID); - bridge_rev = new ContigBridge(b.contig, a.contig, bridgeID); + bridge.addConnection(readSequence, a, b, trans, score); bridge_rev.addConnection(readSequence, b, a, 
ScaffoldVector.reverse(trans), score); - a.contig.bridges.add(bridge); - b.contig.bridges.add(bridge_rev); - + // a.contig.bridges.add(bridge); + // b.contig.bridges.add(bridge_rev); + bridgesFromContig.get(a.contig.getIndex()).add(bridge); + bridgesFromContig.get(b.contig.getIndex()).add(bridge_rev); + bridgeMap.put(hash, bridge); bridgeMap.put(hash_rev, bridge_rev); - + break; - }else if ((a.contig.getIndex() != b.contig.getIndex()) && bridge.consistentWith(trans)){ + } + if ((a.contig.getIndex() != b.contig.getIndex()) && bridge.consistentWith(trans)){ assert bridge_rev!=null:hash_rev + "is null!"; bridge.addConnection(readSequence, a, b, trans, score); bridge_rev.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); break; - }else if(a.contig.getIndex() == b.contig.getIndex()){ + } + if(a.contig.getIndex() == b.contig.getIndex()){ assert bridge_rev!=null:hash_rev + "is null"; if(bridge.consistentWith(trans)){ bridge.addConnection(readSequence, a, b, trans, score); bridge_rev.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); + break; } if(bridge.consistentWith(ScaffoldVector.reverse(trans))){ bridge_rev.addConnection(readSequence, b, a, trans, score); bridge.addConnection(readSequence, b, a, ScaffoldVector.reverse(trans), score); - } - break; - }else{ - bridgeID ++; - //continue; + break; + } } - } + count ++; + }//while } + public static ArrayList getListOfBridgesFromContig(Contig ctg){ + return bridgesFromContig.get(ctg.getIndex()); + } /**********************************************************************************************/ public ContigBridge getReversedBridge(ContigBridge bridge){ String hash = ContigBridge.makeHash(bridge.secondContig.index, bridge.firstContig.index, bridge.orderIndex); return bridgeMap.get(hash); } /**********************************************************************************************/ - // check if it's possible to extend from *contig* with *bridge* to another extended-already 
contig (contigF) - // use for markers and unique bridge only + /* + * Check if it's possible to extend from *contig* with *bridge* to another extended-already contig (contigF) + * use for markers and unique bridge only. + * This is a pre-step to join 2 scaffolds: scaffoldT going to scaffoldF + * @param Contig: a contig to start with + * ContigBridge: a bridge from given contig to a candidate unique contig for the extension + * @return int: direction on targeted scaffold (scaffoldF) that can be traversed + */ protected int extendDirection(Contig contig, ContigBridge bridge){ Contig contigF = bridge.secondContig; - ScaffoldVector trans = bridge.getTransVector(); //contigF -> contig -// if(contig.getIndex() == bridge.secondContig.getIndex()){ -// contigF = bridge.firstContig; -// trans = ScaffoldVector.reverse(trans); -// } + ScaffoldVector trans = bridge.getTransVector(); //contig->contigF + int pointer = Integer.signum(trans.magnitude * trans.direction); //pointer < 0 => tail of contigF on bridge assert scaffolds[contigF.head].size() > 1 : contigF.head; - + int headF = contigF.head; int direction = 0; //direction of extension on scaffoldT (we need to return direction on scaffoldF) ScaffoldVector headT2contigF = ScaffoldVector.composition(trans, contig.getVector()); int rEnd = contig.rightMost(), rEndF = contigF.rightMost(headT2contigF), - lEnd = contig.leftMost(), lEndF = contigF.leftMost(headT2contigF); + lEnd = contig.leftMost(), lEndF = contigF.leftMost(headT2contigF); if(rEndF > rEnd){ direction = 1; } @@ -339,57 +597,78 @@ else if(lEndF < lEnd){ if(verbose) System.out.println("Examining extending direction from contig " + contig.getIndex() + " to " + bridge.hashKey); Scaffold scaffoldF = scaffolds[headF]; - Contig prevMarker = scaffoldF.nearestMarker(contigF, false), // previous marker of contigF on corresponding scaffold - nextMarker = scaffoldF.nearestMarker(contigF, true); // next marker of contigF on corresponding scaffold - + // Get order-based (order on 
scaffold other than orientation-based of contig) previous and next marker(unique contig) + Contig prevMarker = scaffoldF.nearestMarker(contigF, false), // previous marker of contigF on *corresponding scaffold* + nextMarker = scaffoldF.nearestMarker(contigF, true); // next marker of contigF on *corresponding scaffold* + ScaffoldVector rev = ScaffoldVector.reverse(contigF.getVector()); //rev = contigF->headF if(prevMarker != null){ ScaffoldVector toPrev = ScaffoldVector.composition(prevMarker.getVector(),rev); //contigF->prevMarker if(scaffoldF.indexOf(prevMarker) > scaffoldF.indexOf(contigF) && scaffoldF.closeBridge != null) - toPrev = ScaffoldVector.composition(ScaffoldVector.reverse(scaffoldF.circle), toPrev); + toPrev = scaffoldF.rotate(toPrev, false); ScaffoldVector headT2Prev = ScaffoldVector.composition(toPrev, headT2contigF); int rEndPrev = prevMarker.rightMost(headT2Prev), - lEndPrev = prevMarker.leftMost(headT2Prev); + lEndPrev = prevMarker.leftMost(headT2Prev); if(verbose){ System.out.printf("Extending from contigT %d to targeted contig (contigF) %d with previous contig (prevMarker) %d \n", contig.getIndex(), contigF.getIndex(), prevMarker.getIndex()); System.out.println("...headT->contig, contigF and prevMarker: " + contig.getVector() + headT2contigF + headT2Prev); } if ((direction > 0?rEndPrev > rEndF: lEndPrev < lEndF)){ - if((rev.direction>=0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + //check if the candidate ContigBridge is more confident than the current or not + if((pointer<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ if(verbose) System.out.printf("=> go from %d to %d to %d \n", contig.getIndex(), contigF.getIndex(), prevMarker.getIndex()); return -1; } - else + else{ + if(verbose) + System.out.printf("Bridge score not strong enough: %.2f < %.2f (%.2f)\n", + bridge.getScore(), pointer<0?contigF.nextScore:contigF.prevScore, + pointer<0?contigF.prevScore:contigF.nextScore); + return 0; + } + }else{ + if(verbose) + 
System.out.printf("Direction conflict: %d, %d %d or %d %d. Checking otherway... \n", direction, rEndPrev, rEndF, lEndPrev, lEndF); } } if(nextMarker != null){ ScaffoldVector toNext = ScaffoldVector.composition(nextMarker.getVector(),rev); //contigF->nextMarker if(scaffoldF.indexOf(nextMarker) < scaffoldF.indexOf(contigF) && scaffoldF.closeBridge != null) - toNext = ScaffoldVector.composition(scaffoldF.circle, toNext); + toNext = scaffoldF.rotate(toNext, true); ScaffoldVector headT2Next = ScaffoldVector.composition(toNext, headT2contigF); int rEndNext = nextMarker.rightMost(headT2Next), - lEndNext = nextMarker.leftMost(headT2Next); + lEndNext = nextMarker.leftMost(headT2Next); if(verbose){ System.out.printf("Extending from contigT %d to targeted contig (contigF) %d with next contig (nextMarker) %d \n", contig.getIndex(), contigF.getIndex(), nextMarker.getIndex()); System.out.println("...headT->contig, contigF and nextMarker: " + contig.getVector() + headT2contigF + headT2Next); } - + if ((direction > 0? rEndNext > rEndF : lEndNext < lEndF)){ - if((rev.direction<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + //if((rev.direction<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ + if((pointer<0?contigF.nextScore:contigF.prevScore) < bridge.getScore()){ if(verbose) System.out.printf("=> go from %d to %d to %d \n", contig.getIndex(), contigF.getIndex(), nextMarker.getIndex()); return 1; } - else + else{ + if(verbose) + System.out.printf("Bridge score not strong enough: %.2f < %.2f (%.2f)\n", + bridge.getScore(), pointer<0?contigF.nextScore:contigF.prevScore, + pointer<0?contigF.prevScore:contigF.nextScore); return 0; + + } + }else{ + if(verbose) + System.out.printf("Direction conflict: %d, %d %d or %d %d. End searching! 
\n", direction, rEndNext, rEndF, lEndNext, lEndF); } } return 0; } /*********************************************************************************/ - public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, int extendDir){ + public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, boolean firstDir, int secondDir){ if(verbose) { System.out.println("PROCEED TO CONNECT " + bridge.hashKey + " with score " + bridge.getScore() + ", size " + bridge.getConnections().size() + @@ -398,18 +677,14 @@ public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, int bridge.display(); } - + Contig contigF = bridge.secondContig, contigT = contig; ScaffoldVector trans = bridge.getTransVector(); -// if(contig.getIndex() == bridge.secondContig.getIndex()){ -// contigF = bridge.firstContig; -// trans = ScaffoldVector.reverse(trans); -// } - + int headF = contigF.head, - headT = contigT.head; + headT = contigT.head; Scaffold scaffoldF = scaffolds[headF], - scaffoldT = scaffolds[headT]; + scaffoldT = scaffolds[headT]; int posT = scaffoldT.isEnd(contigT); if (posT == 0){ if(verbose) @@ -419,21 +694,22 @@ public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, int if(verbose) System.out.println("Before joining " + contigF.index + " (" + headF +") to " + contigT.index - + " (" + headT +") " - + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost()) - + " " + (scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost()) - + " " + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost() + scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost())); + + " (" + headT +") " + + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost()) + + " " + (scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost()) + + " " + (scaffoldT.getLast().rightMost() - scaffoldT.getFirst().leftMost() + scaffoldF.getLast().rightMost() - scaffoldF.getFirst().leftMost())); 
//=================================================================================================== int index = scaffoldF.indexOf(contigF), - count = index; + count = index; ScaffoldVector rev = ScaffoldVector.reverse(contigF.getVector()); //rev = contigF->headF - //int extendDir = extendDirection(contigT, bridge); + int addScf=-1; - if(extendDir == -1){ + if(secondDir == -1){ if(headF==headT){ - if(posT!=1) + //if(posT!=1) + if(firstDir) return false; else{ Contig nextMarker = scaffoldF.nearestMarker(contigF, true); @@ -463,40 +739,44 @@ public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, int ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT - + ctg.head = headT; - if (posT == 1){ + //if (posT == 1){ + if(!firstDir){ scaffoldT.addFront(ctg,brg); }else{ scaffoldT.addRear(ctg,getReversedBridge(brg)); } - if(count==0) break; + if(count<1) break; ctg = scaffoldF.remove(--count); brg = scaffoldF.bridges.remove(count); - + } if(scaffoldF.closeBridge!=null && !scaffoldF.isEmpty()){ + count = scaffoldF.size()-1; ctg = scaffoldF.removeLast(); brg = scaffoldF.closeBridge; - count = scaffoldF.size()-1; + while(true){ + ctg.myVector = scaffoldF.rotate(ctg.myVector, false); ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT - ctg.composite(ScaffoldVector.reverse(scaffoldF.circle)); + //ctg.composite(ScaffoldVector.reverse(scaffoldF.circle)); //composite co tinh giao hoan k ma de day??? 
ctg.head = headT; - if (posT == 1){ + //if (posT == 1){ + if(!firstDir){ scaffoldT.addFront(ctg,brg); }else{ scaffoldT.addRear(ctg,getReversedBridge(brg)); } - if(count==0) break; + if(count<1) break; brg = scaffoldF.bridges.remove(count--); ctg = scaffoldF.remove(count); - + } } - + //set the remaining scaffoldT.trim(); if(scaffoldF.size() > 0){ @@ -504,34 +784,35 @@ public synchronized boolean joinScaffold(Contig contig, ContigBridge bridge, int addScf=scaffoldF.getFirst().getIndex(); changeHead(scaffoldF, scaffoldF.getFirst()); } - + } scaffoldF = new Scaffold(contigs.get(headF)); } - else if(extendDir == 1){ + else if(secondDir == 1){ if(headF==headT){ - if(posT!=-1) - return false; - else{ - Contig prevMarker = scaffoldF.nearestMarker(contigF, false); - if(prevMarker!=null){ - Contig ctg = scaffoldF.remove(--count); - Scaffold newScf = new Scaffold(ctg); - ContigBridge brg = scaffoldF.bridges.remove(count); - while(true){ - if(count<1) break; - ctg= scaffoldF.remove(--count); - brg = scaffoldF.bridges.remove(count); - newScf.addFront(ctg,brg); - } - newScf.trim(); - changeHead(newScf, prevMarker); - addScf=prevMarker.getIndex(); + //if(posT!=-1) + if(!firstDir) + return false; + else{ + Contig prevMarker = scaffoldF.nearestMarker(contigF, false); + if(prevMarker!=null){ + Contig ctg = scaffoldF.remove(--count); + Scaffold newScf = new Scaffold(ctg); + ContigBridge brg = scaffoldF.bridges.remove(count); + while(true){ + if(count<1) break; + ctg= scaffoldF.remove(--count); + brg = scaffoldF.bridges.remove(count); + newScf.addFront(ctg,brg); } - scaffoldF.setCloseBridge(bridge); - changeHead(scaffoldF, contigF); - + newScf.trim(); + changeHead(newScf, prevMarker); + addScf=prevMarker.getIndex(); } + scaffoldF.setCloseBridge(bridge); + changeHead(scaffoldF, contigF); + + } }else{ Contig ctg = scaffoldF.remove(index); ContigBridge brg = bridge; @@ -540,9 +821,10 @@ else if(extendDir == 1){ ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg 
ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT - + ctg.head = headT; - if (posT == 1){ + //if (posT == 1){ + if(!firstDir){ scaffoldT.addFront(ctg,getReversedBridge(brg)); }else{ scaffoldT.addRear(ctg,brg); @@ -555,12 +837,14 @@ else if(extendDir == 1){ ctg = scaffoldF.removeFirst(); brg = scaffoldF.closeBridge; while(true){ + ctg.myVector = scaffoldF.rotate(ctg.myVector, true); ctg.composite(rev); // contigF->headF + headF->ctg = contigF->ctg ctg.composite(trans); // contigT->contigF + contigF->ctg = contigT->ctg ctg.composite(contigT.getVector()); //headT->contigT + contigT->ctg = headT->ctg : relative position of this ctg w.r.t headT - ctg.composite(scaffoldF.circle); + //ctg.composite(scaffoldF.circle); ctg.head = headT; - if (posT == 1){ + //if (posT == 1){ + if(!firstDir){ scaffoldT.addFront(ctg,getReversedBridge(brg)); }else{ scaffoldT.addRear(ctg,brg); @@ -577,13 +861,13 @@ else if(extendDir == 1){ addScf=scaffoldF.getLast().getIndex(); changeHead(scaffoldF, scaffoldF.getLast()); } - //scaffoldF = new Scaffold(contigs.get(headF)); here? 
+ } scaffoldF = new Scaffold(contigs.get(headF)); } else return false; - + //=================================================================================================== if(verbose){ System.out.println("After Joining: " + (addScf<0?1:2) + " scaffolds!"); @@ -610,7 +894,7 @@ public void changeHead(Scaffold scf, Contig newHead){ } Scaffold newScf = new Scaffold(newHead.getIndex()); ScaffoldVector rev = ScaffoldVector.reverse(newHead.getVector()); //rev = newHead->head - + if(newHead.getRelDir() == 0){ if(verbose) System.out.printf("Contig %d of scaffold %d got direction 0!\n" , newHead.getIndex(), scfIndex); @@ -633,7 +917,7 @@ else if(newHead.getRelDir() > 0){ newScf.bridges.add(getReversedBridge(scf.bridges.removeLast())); if(scf.closeBridge != null){ newScf.closeBridge = getReversedBridge(scf.closeBridge); - newScf.circle = ScaffoldVector.reverse(scf.circle); + newScf.circle = scf.circle; } } @@ -642,49 +926,137 @@ else if(newHead.getRelDir() > 0){ } newScf.setHead(newHead.getIndex()); scaffolds[newHead.getIndex()] = newScf; - //scaffolds[scfIndex] = new Scaffold(contigs.get(scfIndex)); + } - public synchronized void printSequences(SequenceOutputStream out) throws IOException{ - //System.out.println(nScaffolds); - for (int i = 0; i < scaffolds.length;i++){ - if(scaffolds[i].isEmpty()) continue; - int len = scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(); - if ((contigs.get(i).head == i - && !isRepeat(contigs.get(i)) - && len > maxRepeatLength - ) - || scaffolds[i].closeBridge != null - ) - { - if(verbose) - System.out.println("Scaffold " + i + " length " + len); - scaffolds[i].viewSequence(out); + public synchronized void printSequences() throws IOException{ + countOccurence=new HashMap(); + if(annotation){ + SequenceOutputStream aout = SequenceOutputStream.makeOutputStream(prefix+".anno.japsa"); + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + int len = scaffolds[i].getLast().rightMost() - 
scaffolds[i].getFirst().leftMost(); + + if(contigs.get(i).head == i || scaffolds[i].closeBridge != null){ + if ( (!isRepeat(contigs.get(i)) && len > maxRepeatLength) //here are the big ones + || (reportAll && needMore(contigs.get(i)) && contigs.get(i).coverage > .5*estimatedCov)) //short/repeat sequences here if required + { + if(verbose) + System.out.println("Scaffold " + i + " estimated length " + len); + scaffolds[i].viewAnnotation(aout); + } + } + } + aout.close(); + } else{ + SequenceOutputStream fout = SequenceOutputStream.makeOutputStream(prefix+".fin.fasta"), + jout = SequenceOutputStream.makeOutputStream(prefix+".fin.japsa"); + for (int i = 0; i < scaffolds.length;i++){ + if(scaffolds[i].isEmpty()) continue; + int len = scaffolds[i].getLast().rightMost() - scaffolds[i].getFirst().leftMost(); + + if(contigs.get(i).head == i || scaffolds[i].closeBridge != null){ + if ( (!isRepeat(contigs.get(i)) && len > maxRepeatLength) //here are the big ones + || (reportAll && needMore(contigs.get(i)) && contigs.get(i).coverage > .5*estimatedCov)) //short/repeat sequences here if required + { + if(verbose) + System.out.println("Scaffold " + i + " estimated length " + len); + + scaffolds[i].viewSequence(fout, jout); + } + } } + fout.close(); + jout.close(); } + } + public static void oneMore(Contig ctg){ + if(countOccurence.get(ctg.getIndex())==null) + countOccurence.put(ctg.getIndex(), 1); + else + countOccurence.put(ctg.getIndex(), countOccurence.get(ctg.getIndex())+1); + } + private boolean needMore(Contig ctg) { + // TODO Auto-generated method stub + Integer count = countOccurence.get(ctg.getIndex()); + if(count==null) return true; + int estimatedOccurence = (int) Math.floor(ctg.coverage/estimatedCov); + if(estimatedOccurence <= Math.floor(.75*count)) + return true; + else + return false; + } + public synchronized void printRT(long tpoint) throws IOException{ for (Contig contig:contigs){ - contig.display(); + if(contig.oriRep.size() > 0){ + String fname = 
contig.getName() + ".rtout"; + File f = new File(fname); + if(!f.exists()) + f.createNewFile(); + + //BufferedWriter out = new BufferedWriter(new FileWriter(f.getPath(), true)); + FileWriter fw = new FileWriter(f,true); + BufferedWriter bw = new BufferedWriter(fw); + PrintWriter pw = new PrintWriter(bw); + + ArrayList ctgList = new ArrayList(), + origList = new ArrayList(), + resList = new ArrayList(), + genesList = new ArrayList(); + + for(Contig ctg:scaffolds[contig.head]){ + ctgList.add(ctg.getName()); + if(ctg.oriRep.size()>0) + for(JapsaFeature ori:ctg.oriRep) + origList.add(ori.getID()); + for (JapsaFeature feature:ctg.genes) + genesList.add(feature.toString()); + for (JapsaFeature feature:ctg.resistanceGenes) + resList.add(feature.toString()); + } + float streamData=tpoint/1000000; + pw.print(">"); + for(String ctg:ctgList) + pw.printf("%s\t", ctg); + + pw.printf("\n>%.2fMpb\t%d genes\t", streamData, genesList.size()); + + for(String ori:origList) + pw.printf("+%s", ori); + + for(String genes:genesList) + pw.print(" \n\t"+genes); + pw.println(""); + + for(String res:resList) + pw.print(" \n\t"+res); + pw.println(""); + + pw.close(); + + } } - + + } // To check if this contig is likely a repeat or a singleton. If FALSE: able to be used as a milestone. 
public static boolean isRepeat(Contig ctg){ //for the case of AbySS when no coverage information of contigs is found - if(estimatedCov == 1.0 && ctg.coverage == 1.0){ + if(estimatedCov == 1.0 && ctg.getCoverage() == 1.0){ if(ctg.length() > maxRepeatLength) return false; else return true; } - - if (ctg.length() < minContigLength) return true; - else if (ctg.length() > maxRepeatLength || ctg.coverage < 1.3 * estimatedCov) + + if (ctg.length() < minContigLength || ctg.getCoverage() < .3 * estimatedCov) return true; + else if (ctg.length() > maxRepeatLength || ctg.getCoverage() < 1.3 * estimatedCov) return false; - else if (ctg.coverage > 1.5 * estimatedCov) + else if (ctg.getCoverage() > 1.5 * estimatedCov) return true; else{ - for(ContigBridge bridge:ctg.bridges){ + for(ContigBridge bridge:getListOfBridgesFromContig(ctg)){ Contig other = bridge.firstContig.getIndex()==ctg.getIndex()?bridge.secondContig:bridge.firstContig; if(other.getIndex()==ctg.getIndex()) continue; int dist=bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); @@ -693,9 +1065,9 @@ else if (ctg.coverage > 1.5 * estimatedCov) return true; } } - //return false; + } - if(ctg.coverage < .5 * estimatedCov || ctg.length() < 2*minContigLength) // second filter: maybe not repeat but insignificant contig + if(ctg.length() < 2*minContigLength) // further filter: maybe not repeat but insignificant contig return true; else return false; diff --git a/src/main/java/japsa/bio/hts/scaffold/ScaffoldGraphDFS.java b/src/main/java/japsa/bio/hts/scaffold/ScaffoldGraphDFS.java index 8ab1b7a..7eb0d43 100644 --- a/src/main/java/japsa/bio/hts/scaffold/ScaffoldGraphDFS.java +++ b/src/main/java/japsa/bio/hts/scaffold/ScaffoldGraphDFS.java @@ -33,7 +33,13 @@ ****************************************************************************/ package japsa.bio.hts.scaffold; +import japsa.seq.JapsaFeature; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import java.io.BufferedReader; +import 
java.io.FileReader; import java.io.IOException; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -47,11 +53,157 @@ * @param sequenceFile * @throws IOException */ - public ScaffoldGraphDFS(String sequenceFile) throws IOException { + public ScaffoldGraphDFS(String sequenceFile, String genesFile, String resistFile, String isFile, String oriFile) throws IOException, InterruptedException { super(sequenceFile); + if(resistFile != null){ + readDb(resistFile, "Resistance genes", .9, 1.0); + annotation = true; + } + if(isFile != null){ + readDb(isFile, "Insertion sites", .8, .9); + annotation = true; + } + if(oriFile != null){ + readDb(oriFile, "Origin of replication", .8, .9); + annotation = true; + } + if(genesFile != null){ + readGFF(genesFile); + annotation = true; + } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.genes) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.oriRep) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } +// for (Contig contig:contigs){ +// for(JapsaFeature feature:contig.insertSeq) +// System.out.println(contig.getName() + "\t" + feature.getStrand() + "\t" + feature); +// } + } + + private void readDb(String data, String type, double minCov, double minID) throws IOException, InterruptedException{ + type = type.toLowerCase(); + + String blastn = "blastn"; + + ProcessBuilder pb = new ProcessBuilder(blastn, + "-subject", + "-", + "-query", + data, + "-outfmt", + "7 qseqid qlen qstart qend sseqid slen sstart send length frames pident nident gaps mismatch score bitscore sstrand"); + ///// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + + //6 qseqid qlen length pident nident gaps mismatch + // 0 1 2 3 4 5 6 + Process process = pb.start(); + //Pass on the genome to blastn + SequenceOutputStream out = new 
SequenceOutputStream(process.getOutputStream()); + for (Contig ctg:contigs){ + Sequence seq=ctg.contigSequence; + seq.writeFasta(out); + } + out.close(); + + //Read the output of blastn + BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream())); + String line; + + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) + continue; + + String [] toks = line.trim().split("\t"); + int length = Integer.parseInt(toks[8]); + int qlen = Integer.parseInt(toks[1]); + double cov = (float)length/qlen; + if (minCov > cov){ + continue; + } + + if (Double.parseDouble(toks[10]) < minID * 100){ + continue; + } + //pass + + Contig ctg = getSPadesContig(toks[4]); + if(ctg != null){ + char strand = toks[16].equals("plus")?'+':'-'; + JapsaFeature feature = new JapsaFeature(Integer.parseInt(toks[6]), Integer.parseInt(toks[7]), type, toks[0], strand, ctg.getName()); + + feature.addDesc(toks[0]+ ":" + (int)(cov*100) + "% cover, " + toks[10] + "% identity"); + + switch (type.toLowerCase()){ + case "resistance genes": + ctg.resistanceGenes.add(feature); + break; + case "insertion sites": + ctg.insertSeq.add(feature); + break; + case "origin of replication": + ctg.oriRep.add(feature); + break; + default: + System.err.println(type + " has not yet included in our analysis!"); + break; + } + //Rewritten the below with the above + //switch (type.toLowerCase()){ + // case "resistance genes": + // ctg.resistanceGenes.add(feature); + // break; + // case "insertion sites": + // ctg.insertSeq.add(feature); + // break; + // case "origin of replication": + // ctg.oriRep.add(feature); + // break; + // default: + // System.err.println(type + " has not yet included in our analysis!"); + // break; + //} + } + Collections.sort(ctg.resistanceGenes); + Collections.sort(ctg.insertSeq); + Collections.sort(ctg.oriRep); + } + br.close(); + //process.waitFor();//Do i need this??? 
+ } + private void readGFF(String fileName) throws IOException, InterruptedException{ + BufferedReader br = new BufferedReader(new FileReader(fileName)); + String line; + + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) + continue; + if (line.startsWith(">")) + break; + String [] toks = line.trim().split("\t"); + Contig ctg = getSPadesContig(toks[0]+"_"); //get the contig from its shorten name + if(ctg != null){ + int start = Integer.parseInt(toks[3]), + end = Integer.parseInt(toks[4]); + String [] des = toks[8].trim().split(";"); + String [] id = des[0].trim().split("="); + String ID = "undefined"; + if(id[0].equals("ID")) + ID = id[1]; + if(!toks[2].equals("gene")){ + JapsaFeature feature = new JapsaFeature(start, end, toks[2], ID, toks[6].charAt(0), ctg.getName()); + feature.addDesc(toks[8]); + ctg.genes.add(feature); + } + } + } + br.close(); } - public ScaffoldGraphDFS(String sequenceFile, String graphFile) throws IOException { super(sequenceFile); //TODO: implement fastg reader for SequenceReader to have pre-assembled bridges @@ -64,6 +216,23 @@ public ScaffoldGraphDFS(String sequenceFile, String graphFile) throws IOExceptio */ @Override public synchronized void connectBridges(){ +// for(Contig ctg:contigs){ +// //System.out.println(ctg.getName()); +// if(ctg.isCircular){ +// if(ctg.bridges.size() != 2){ +// ctg.isCircular = false; +// break; +// } +// for(ContigBridge brg:ctg.bridges){ +// //System.out.println("\t"+brg.hashKey+" : "+brg.getTransVector()); +// if(brg.firstContig.getIndex() != brg.secondContig.getIndex()){ +// ctg.isCircular = false; +// break; +// } +// } +// } +// } + // Start scaffolding if(verbose) System.out.println("Starting scaffolding......."); @@ -80,7 +249,7 @@ public synchronized void connectBridges(){ //Now extend scaffold i if( isRepeat(scaffolds[i].element()) || scaffolds[i].element().length() < minContigLength){ - if(!scaffolds[i].element().isCircular) + if(!scaffolds[i].element().isCircular()) continue; } 
//1.a extend to the first @@ -123,7 +292,7 @@ private ContigBridge checkHang(Contig marker, ContigBridge toPrev, ContigBridge ScaffoldVector prevToCur = ScaffoldVector.composition(mark2Cur,ScaffoldVector.reverse(mark2Prev)); if(verbose) System.out.printf("\texamining %s to %s:\n ",prevContig.getName(),curContig.getName()); - for(ContigBridge brg:prevContig.bridges){ + for(ContigBridge brg:getListOfBridgesFromContig(prevContig)){ if(brg.secondContig.getIndex() == curContig.getIndex()){ if(brg.consistentWith(prevToCur)){ if(verbose) @@ -147,20 +316,22 @@ private ContigBridge checkHang(Contig marker, ContigBridge toPrev, ContigBridge private boolean walk2(int i, boolean direction ){ Scaffold scaffold = scaffolds[i]; boolean extended = true; - boolean closed = false; + boolean closed = scaffold.closeBridge!=null; /*****************************************************************/ while (extended && (!closed) && scaffold.size() > 0){ - Contig ctg = direction?scaffold.getLast():scaffold.getFirst(); + Contig ctg = direction?scaffold.getLast():scaffold.getFirst(); + ArrayList bridges=getListOfBridgesFromContig(ctg); + if(verbose) { System.out.printf(" Last of scaffold %d extention is on contig %d (%s): ",i,ctg.getIndex(),ctg.getName()); - System.out.printf("iterating among %d bridges\n",ctg.bridges.size()); + System.out.printf("iterating among %d bridges\n",bridges.size()); } int ctgEnd = direction?ctg.rightMost():ctg.leftMost(); extended = false; //only continue the while loop if extension is on the move (line 122) - int maxLink = ctg.bridges.size(), - extendDir = 0, + int maxLink = bridges.size(), + extendDir = 0, //direction to go on the second scaffold: ScaffoldT (realtime mode) curStep = Integer.MAX_VALUE; //distance between singleton1 -> singleton2 double curScore = 0.0; //score between singleton1 -> singleton2 ContigBridge stepBridge = null; @@ -169,10 +340,10 @@ private boolean walk2(int i, boolean direction ){ ArrayList extendableContigBridge = new 
ArrayList(maxLink); ArrayList extendableVector = new ArrayList(maxLink); ArrayList distances = new ArrayList(maxLink); - Collections.sort(ctg.bridges); - for (ContigBridge bridge:ctg.bridges){ + Collections.sort(bridges); + for (ContigBridge bridge:bridges){ if (bridge.firstContig == bridge.secondContig) //2 identical markers ??! - if(!bridge.firstContig.isCircular) + if(!bridge.firstContig.isCircular()) continue; Contig nextContig = bridge.secondContig; ScaffoldVector trans = bridge.getTransVector(); @@ -189,7 +360,7 @@ private boolean walk2(int i, boolean direction ){ //only take one next singleton (with highest score possible sorted) as the marker for the next extension int distance = bridge.getTransVector().distance(bridge.firstContig, bridge.secondContig); if (direction?(newEnd > ctgEnd):(newEnd < ctgEnd)){ - if(!isRepeat(nextContig) || (ctg.isCircular && ctg.getIndex() == nextContig.getIndex())){ + if(!isRepeat(nextContig) || (ctg.isCircular() && ctg.getIndex() == nextContig.getIndex())){ //check quality of the bridge connected 2 markers int aDir = 0; if(scaffolds[nextContig.head].size() > 1){ @@ -254,7 +425,7 @@ else if(verbose) ScaffoldVector curVector = extendableVector.get(index); if(verbose) System.out.println("Checking contig " + curContig.getName() + "..."); - if( isRepeat(curContig) && !curContig.isCircular) + if( isRepeat(curContig) && !curContig.isCircular()) if(checkHang(ctg, curContigBridge, stepBridge)==null) continue; prevVector = prevContig.getVector(); @@ -289,7 +460,7 @@ else if(verbose) if(extendable){ // if extension is circularized if(curContig.getIndex() == (direction?scaffold.getFirst().getIndex():scaffold.getLast().getIndex()) - && (!isRepeat(curContig) || curContig.isCircular) + && (!isRepeat(curContig) || curContig.isCircular()) ){ if(verbose) System.out.printf(" *****************SCAFFOLD %d CLOSED AFTER CONNECT %d ***********************\n", i,curContig.index); @@ -300,14 +471,13 @@ else if(verbose) if(isRepeat(curContig)){ 
curContig.head = i; //must be here! - - curContig.isCircular = false; // tandem! curContig = curContig.clone(); }else{ //check to join 2 scaffolds and stop this round if (scaffolds[curContig.head].size() > 1){ - if(!joinScaffold(prevContig,confirmedBridge,extendDir)){ - System.out.printf(" Skip to connect contig %d of %d to contig %d of %d\n", ctg.index,i,curContig.index, curContig.head); + if(!joinScaffold(prevContig,confirmedBridge,direction,extendDir)){ + if(verbose) + System.out.printf(" Skip to connect contig %d of %d to contig %d of %d\n", ctg.index,i,curContig.index, curContig.head); continue; } else{ @@ -327,12 +497,15 @@ else if(verbose) i,ctg.index, ctgEnd, curContig.index, curContig.rightMost(curVector), curContigBridge.getScore()); System.out.printf(" curContigBridge %d -> %d\n", confirmedBridge.firstContig.getIndex(), curContigBridge.secondContig.getIndex()); } - + curContig.myVector = ScaffoldVector.composition(prevToCur,prevContig.getVector());//from the head contig + //confirmedBridge=confirmedBridge.clone(prevContig,curContig); + if(direction) scaffolds[i].addRear(curContig, confirmedBridge); else scaffolds[i].addFront(curContig, getReversedBridge(confirmedBridge)); - curContig.myVector = ScaffoldVector.composition(prevToCur,prevContig.getVector());//from the head contig + + curEnd = direction?curContig.rightMost(curVector):curContig.leftMost(curVector); extended = true; //scaffold extension is really on the move... 
@@ -356,6 +529,8 @@ else if(verbose) }//while return closed; } + + class LengthIndex implements Comparable{ int length, index; public LengthIndex(int len, int index){ diff --git a/src/main/java/japsa/bio/hts/scaffold/ScaffoldVector.java b/src/main/java/japsa/bio/hts/scaffold/ScaffoldVector.java index 77bb6ae..f926887 100644 --- a/src/main/java/japsa/bio/hts/scaffold/ScaffoldVector.java +++ b/src/main/java/japsa/bio/hts/scaffold/ScaffoldVector.java @@ -34,6 +34,7 @@ package japsa.bio.hts.scaffold; + /** * Implementation of a vector of relative position of a contig in its scaffold * @author minhduc @@ -90,6 +91,7 @@ public int distance(Contig tContig, Contig fContig){ //FIXME: not handle the case that contig A contain contigB and via verse return Math.max(fS - tE, tS - fE); } + /** * Compose two vectors: a -> b is v2, b -> c is v1. returned a -> c is v1 * v2 * Warning: the parameters' order doesn't follow normal intuition. USE WITH CARE!!! @@ -122,5 +124,12 @@ public int getDirection() { return direction; } - + /** + * Set the new magnitude + * @param magnitude + */ + public void setMagnitute(int magnitude){ + this.magnitude=magnitude; + + } } \ No newline at end of file diff --git a/src/main/java/japsa/bio/hts/scaffold/Vertex.java b/src/main/java/japsa/bio/hts/scaffold/Vertex.java new file mode 100644 index 0000000..d396e30 --- /dev/null +++ b/src/main/java/japsa/bio/hts/scaffold/Vertex.java @@ -0,0 +1,173 @@ +package japsa.bio.hts.scaffold; + +import java.util.ArrayList; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; + +/** + * This class models a vertex for my string graph which actually corresponds to an edge in SPAdes's assembly graph. + * Label for this vertex is extracted from its full name and used as its index. + * For example, vertex named 'EDGE_1_length_1000_cov_50' is labeled as vertex 1. + * This vertex's neighborhood is described by the Edges incident to it. 
+ * + * @author Son Nguyen + * @date August 20, 2016 + */ +public class Vertex { + + private ArrayList neighborhood; + private String fullName, label; + private Sequence seq=null; + + /** + * + * @param label The unique label associated with this Vertex + */ + public Vertex(String name){ + this.fullName=name; + this.label=getID(name); + this.neighborhood = new ArrayList(); + this.seq=new Sequence(Alphabet.DNA5(), 0); + } + + public Vertex(String name, Sequence seq){ + this(name); + this.seq = seq; + + } + /** + * + * @param name The name of Edge in assembly graph that correspond to this Vertex + */ + private String getID(String name){ + return name.split("_")[1]; + } + /** + * This method adds an Edge to the incidence neighborhood of this graph iff + * the edge is not already present. + * + * @param edge The edge to add + */ + public void addNeighbor(Edge edge){ + if(this.neighborhood.contains(edge)){ + return; + } + this.neighborhood.add(edge); + } + + + /** + * + * @param other The edge for which to search + * @return true iff other is contained in this.neighborhood + */ + public boolean containsNeighbor(Edge other){ + return this.neighborhood.contains(other); + } + + /** + * + * @param index The index of the Edge to retrieve + * @return Edge The Edge at the specified index in this.neighborhood + */ + public Edge getNeighbor(int index){ + return this.neighborhood.get(index); + } + + + /** + * + * @param index The index of the edge to remove from this.neighborhood + * @return Edge The removed Edge + */ + Edge removeNeighbor(int index){ + return this.neighborhood.remove(index); + } + + /** + * + * @param e The Edge to remove from this.neighborhood + */ + public void removeNeighbor(Edge e){ + this.neighborhood.remove(e); + } + + + /** + * + * @return int The number of neighbors of this Vertex + */ + public int getNeighborCount(){ + return this.neighborhood.size(); + } + /** + * + * @return String The label of this Vertex + */ + public String getLabel(){ + return 
this.label; + } + /** + * + * @return String The full name of this Vertex + */ + public String getName(){ + return this.fullName; + } + /** + * + * @param Sequence A sequence + */ + public void setSequence(Sequence seq){ + this.seq = seq; + } + /** + * + * @return Sequence The sequence of this Vertex + */ + public Sequence getSequence(){ + return this.seq; + } + /** + * + * @return String A String representation of this Vertex + */ + public String toString(){ + return "Vertex " + label; + } + + /** + * + * @return The hash code of this Vertex's label + */ + public int hashCode(){ + return this.label.hashCode(); + } + + /** + * + * @param other The object to compare + * @return true iff other instanceof Vertex and the two Vertex objects have the same label + */ + public boolean equals(Object other){ + if(!(other instanceof Vertex)){ + return false; + } + + Vertex v = (Vertex)other; + return this.label.equals(v.label); + } + + /** + * + * @return ArrayList A copy of this.neighborhood. Modifying the returned + * ArrayList will not affect the neighborhood of this Vertex + */ + public ArrayList getNeighbors(){ + return new ArrayList(this.neighborhood); + } + +} + + diff --git a/src/main/java/japsa/bio/misc/alignCompress/AlignCompress.java b/src/main/java/japsa/bio/misc/alignCompress/AlignCompress.java new file mode 100755 index 0000000..b406e2a --- /dev/null +++ b/src/main/java/japsa/bio/misc/alignCompress/AlignCompress.java @@ -0,0 +1,648 @@ +/* + * Copyright (c) David Powell + * + * This file is part of AlignCompress. + * + * AlignCompress aligns two sequences using a modified DPA + * that allows the use of a _model_ for each sequence. + * + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + */ + +package japsa.bio.misc.alignCompress; + +import japsa.bio.misc.common.*; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; + +class BlendModel implements Seq_Model { + private static final long serialVersionUID = 1L; + + Seq_Model models[]; + double m_lens[]; + + public BlendModel(char[] alpha) { + MarkovN_exact m0 = new MarkovN_exact(-1, alpha); + MarkovN_exact m1 = new MarkovN_exact(1, alpha); + + m1.setProb("aa", 0.28); + m1.setProb("at", 0.7); + m1.setProb("ag", 0.01); + m1.setProb("ac", 0.01); + + m1.setProb("ta", 0.7); + m1.setProb("tt", 0.28); + m1.setProb("tg", 0.01); + m1.setProb("tc", 0.01); + + m1.setProb("ga", 0.49); + m1.setProb("gt", 0.49); + m1.setProb("gg", 0.01); + m1.setProb("gc", 0.01); + + m1.setProb("ca", 0.49); + m1.setProb("ct", 0.49); + m1.setProb("cg", 0.01); + m1.setProb("cc", 0.01); + + models = new Seq_Model[2]; + m_lens = new double[2]; + models[0] = m0; + models[1] = m1; + m_lens[0] = 1; + m_lens[1] = 1; + } + + public double encodeLen(char a, int i) { + double sum = Double.POSITIVE_INFINITY; + double weights[] = new double[models.length]; + + for (int j = 0; j < models.length; j++) + sum = MyMath.logplus(sum, m_lens[j]); + + for (int j = 0; j < models.length; j++) + weights[j] = MyMath.exp2(sum - m_lens[j]); + + double p = 0; + for (int j = 0; j < models.length; j++) + p += weights[j] * MyMath.exp2(-models[j].encodeLen(a, i)); + + return -MyMath.log2(p); + } + + public double update(char a, int i) { + double res = encodeLen(a, i); + + for 
(int j = 0; j < models.length; j++) { + m_lens[j] += models[j].update(a, i); + m_lens[j] /= 1.5; + } + + return res; + } +} + +/** + * BufferModel takes a model and a sequence and the sequence dna. It + * precomputes the encoding length of every dna character for every + * position in the sequence. The cumulative encoding length is also computed + * along the sequence. + **/ +class BufferModel implements Seq_Model { + private static final long serialVersionUID = 1L; + + String str; + char[] alphabet; + double[][] enc; + double[] encCumulative; + + BufferModel(Seq_Model model, String str, char[] alpha) { + this.str = str; + this.alphabet = alpha; + + enc = new double[str.length()][alphabet.length]; + encCumulative = new double[str.length() + 1]; + encCumulative[0] = 0; + + for (int i = 0; i < str.length(); i++) { + for (int j = 0; j < alphabet.length; j++) { + enc[i][j] = model.encodeLen(alphabet[j], i); + } + model.update(str.charAt(i), i); + encCumulative[i + 1] = encCumulative[i] + + enc[i][char2int(str.charAt(i))]; + } + + // Test 'model' is efficient and doesn't cheat. + for (int i = 0; i < str.length(); i++) { + double s = Double.POSITIVE_INFINITY; + for (int j = 0; j < alphabet.length; j++) { + s = MyMath.logplus(s, enc[i][j]); + } + Misc.my_assert(Math.abs(s) < 1E-10, "logplus() != 0. 
s=" + s); + } + } + + public String toString() { + StringBuffer s = new StringBuffer(); + for (int i = 0; i < str.length(); i++) { + s.append(str.charAt(i) + " "); + for (int j = 0; j < alphabet.length; j++) { + double n = Math.rint(enc[i][j] * 100) / 100; + s.append(alphabet[j] + ": " + n + " "); + } + s.append("\n"); + } + return s.toString(); + } + + int char2int(char a) { + for (int i = 0; i < alphabet.length; i++) + if (a == alphabet[i]) + return i; + Misc.my_assert(false, "Charater '" + a + "' not in defined dna"); + return -1; + } + + public double encodeLen(char a, int i) { + return enc[i][char2int(a)]; + } + + public double update(char a, int i) { + return enc[i][char2int(a)]; + } + + public double encodeCumulative(int i) { + return encCumulative[i]; + } +} + +class AlignCompress { + char[] alphabet; + String seqA; + String seqB; + + String paramString; + int markovOrder; + int maxIterations; + int verbose; + boolean useBlendModel; + boolean linearCosts; + boolean localAlign; + boolean sumAlignments; + + boolean doTraceBack; + + // Encode length l: 0..infinity + static private double encode_length(double l) { + Misc.my_assert(l >= 0, "Bad length to encode:" + l); + // return MyMath.logstar_continuous(l+1); + // return -(l*MyMath.log2(PcharEncode) + MyMath.log2(1-PcharEncode)); // + // Geometric distribution + // return 0.104808 * l; // This is to match SW costs. (TESTING!) + return 0; + } + + public AlignCompress() { + } + + // Return the appropriate cell of D[][]. 
Use i%2 if only keeping 2 rows + private Mutation_FSM cell(Mutation_FSM D[][], int i, int j) { + if (doTraceBack) { + return D[i][j]; + } else { + return D[i % 2][j]; + } + } + + private String getAlignment(Mutation_FSM D[][], + Mutation_FSM.TraceBack_Info fcell) { + Mutation_FSM.TraceBack_Data d = fcell.get_tbdata(); + int i = d.i; + int j = d.j; + + StringBuffer resA = new StringBuffer(); + StringBuffer resB = new StringBuffer(); + + if (i < 0 && j < 0) { + d = fcell.get_from(d); + i = d.i; + j = d.j; + } + + int end_i = i, end_j = j; + + String endA = seqA.substring(i); + String endB = seqB.substring(j); + + while (true) { + // System.err.println("\n\n(i,j)="+i+","+j+"\nA="+resA+"\nB="+resB+"\n"); + + d = ((Mutation_FSM.TraceBack_Info) D[i][j]).get_from(d); + + if (d == null) + break; + + resA.append((d.i == i) ? '-' : seqA.charAt(d.i)); + resB.append((d.j == j) ? '-' : seqB.charAt(d.j)); + + i = d.i; + j = d.j; + } + + System.out.println("ALIGNMENT CUTS: A[" + i + ".." + end_i + "] B[" + j + + ".." + end_j + "]"); + + resA.reverse(); + resB.reverse(); + + StringBuffer res = new StringBuffer(); + + // Pretty up the alignment. For local alignments, put the front and end + // bits on. + for (int x = 0; x < j - (i < j ? i : j); x++) + res.append(" "); + res.append(seqA.substring(0, i)); + if (i > 0 || j > 0) + res.append(" "); + res.append(resA.toString().toUpperCase()); + res.append(" "); + res.append(endA); + + res.append("\n"); + + for (int x = 0; x < i - (i < j ? i : j); x++) + res.append(" "); + res.append(seqB.substring(0, j)); + if (i > 0 || j > 0) + res.append(" "); + res.append(resB.toString().toUpperCase()); + res.append(" "); + res.append(endB); + + return res.toString(); + } + + public double doAlign() { + System.out.println("# SeqA = " + + seqA + + "\n# SeqB = " + + seqB + + "\n# japsa.seq model = " + + (useBlendModel ? 
"blend model" : "markov order " + + markovOrder) + "\n# max iterations=" + maxIterations + + "\n# linearCosts=" + linearCosts + "\n# sumAlignments=" + + sumAlignments + "\n# local Alignment=" + localAlign + + "\n# verbosity=" + verbose + "\n# params=" + paramString + + "\n"); + + Params p = new Params(); + p.fromString(paramString); + + BufferModel modelA, modelB; + { + Seq_Model mdlA, mdlB; + if (useBlendModel) { + mdlA = new BlendModel(alphabet); + mdlB = new BlendModel(alphabet); + } else { + mdlA = new MarkovN_fitted(markovOrder, alphabet, seqA); + mdlB = new MarkovN_fitted(markovOrder, alphabet, seqB); + } + + modelA = new BufferModel(mdlA, seqA, alphabet); + modelB = new BufferModel(mdlB, seqB, alphabet); + } + + // System.err.println("modelA\n"+modelA+"\n"); + // System.err.println("modelB\n"+modelB+"\n"); + + int mdlCounts = Model_SeqAB.required_counts(); + + int fsmCounts = -1; + if (!linearCosts && !sumAlignments) + fsmCounts = Mutation_1State.One.required_counts(); + if (!linearCosts && sumAlignments) + fsmCounts = Mutation_1State.All.required_counts(); + if (linearCosts && !sumAlignments) + fsmCounts = Mutation_3State.One.required_counts(); + if (linearCosts && sumAlignments) + fsmCounts = Mutation_3State.All.required_counts(); + + Misc.my_assert(fsmCounts >= 0, "Bad number of fsmCounts"); + + int totCounts = mdlCounts + fsmCounts; + + double bestDiff = Double.NEGATIVE_INFINITY; + double lastAlignment = 0; + + int iter = 0; + while (maxIterations < 0 || iter < maxIterations) { + int countPos = 0; + Two_Seq_Model_Counts model = new Model_SeqAB(p, modelA, modelB, + countPos); + countPos += mdlCounts; + Mutation_FSM fsmType = null; + + if (!linearCosts && !sumAlignments) + fsmType = new Mutation_1State.One(model, p, totCounts, countPos); + if (!linearCosts && sumAlignments) + fsmType = new Mutation_1State.All(model, p, totCounts, countPos); + if (linearCosts && !sumAlignments) + fsmType = new Mutation_3State.One(model, p, totCounts, countPos); + if 
(linearCosts && sumAlignments) + fsmType = new Mutation_3State.All(model, p, totCounts, countPos); + + Misc.my_assert(fsmType != null, "Unable to construct fsmType"); + + countPos += fsmCounts; + Misc.my_assert(countPos == totCounts, + "Internal error: countPos!=totCounts"); + + Counts initialCounts = new Counts(totCounts); + for (int i = 0; i < totCounts; i++) + initialCounts.inc(i, 0.5); // Initialise all counts. + + // Determine if fsmType supports traceBack. ie. Does it implement + // TraceBack_Info + doTraceBack = false; + if (fsmType instanceof Mutation_FSM.TraceBack_Info) + doTraceBack = true; + + // Setup the DPA matrix + Mutation_FSM D[][]; + Mutation_FSM final_cell; + if (!doTraceBack) { + // No traceback info, only keep 2 rows in D[][] + D = new Mutation_FSM[2][seqB.length() + 1]; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < seqB.length() + 1; j++) { + D[i][j] = (Mutation_FSM) fsmType.clone(); + } + } + final_cell = (Mutation_FSM) fsmType.clone(); + } else { + // Keep all traceback info, so keep all rows in D[][] + D = new Mutation_FSM[seqA.length() + 1][seqB.length() + 1]; + for (int i = 0; i < seqA.length() + 1; i++) { + for (int j = 0; j < seqB.length() + 1; j++) { + D[i][j] = (Mutation_FSM) fsmType.clone(); + Mutation_FSM.TraceBack_Info c = (Mutation_FSM.TraceBack_Info) D[i][j]; + c.set_tbdata(new Mutation_FSM.TraceBack_Data(i, j)); + } + } + final_cell = (Mutation_FSM) fsmType.clone(); + ((Mutation_FSM.TraceBack_Info) final_cell) + .set_tbdata(new Mutation_FSM.TraceBack_Data(-1, -1)); + } + + // Initialise the first cell of the DPA matrix + cell(D, 0, 0).init_counts(initialCounts); // Initialise counts + if (localAlign) + cell(D, 0, 0).init_val(Double.POSITIVE_INFINITY); + else + cell(D, 0, 0).init_val(0); + + final_cell.init_val(Double.POSITIVE_INFINITY); + + if (verbose >= 1) { + System.out.println("\n\nIteration: " + iter); + System.out.println(model); + System.out.println(cell(D, 0, 0).paramsToString()); + } + + // Do the DPA! + // NB. 
The Mutation_FSM calc function calculates the value of this + // cell + // on the three neighbouring cell. This is the reverse of the way + // the + // DPA is usually expressed. + for (int i = 0; i < seqA.length() + 1; i++) { + + // Reset the next row of the DPA matrix + if (!doTraceBack) + for (int j = 0; j < seqB.length() + 1; j++) { + cell(D, i + 1, j).reset(); + } + + for (int j = 0; j < seqB.length() + 1; j++) { + Mutation_FSM v = (i == seqA.length() ? null : cell(D, + i + 1, j)); + Mutation_FSM h = (j == seqB.length() ? null : cell(D, i, + j + 1)); + Mutation_FSM d = (i == seqA.length() || j == seqB.length() ? null + : cell(D, i + 1, j + 1)); + char aChar = (i == seqA.length() ? '-' : seqA.charAt(i)); + char bChar = (j == seqB.length() ? '-' : seqB.charAt(j)); + + if (localAlign) { + double val; + + // Compute contribution of a local alignment that starts + // at (i,j) + val = modelA.encodeCumulative(i) + + modelB.encodeCumulative(j); + val += encode_length(i); + val += encode_length(j); + cell(D, i, j).or(val, initialCounts); + + // Compute contribution of a local alignment that ends + // at (i,j) + val = cell(D, i, j).get_val() + + (modelA.encodeCumulative(seqA.length()) - modelA + .encodeCumulative(i)) + + (modelB.encodeCumulative(seqB.length()) - modelB + .encodeCumulative(j)); + + val += encode_length(seqA.length() - i); + val += encode_length(seqB.length() - j); + + if (doTraceBack) + ((Mutation_FSM.TraceBack_Info) final_cell).or(val, + cell(D, i, j).get_counts(), cell(D, i, j)); + else + final_cell.or(val, cell(D, i, j).get_counts()); + } + + cell(D, i, j).calc(h, v, d, aChar, bChar, i, j); + + if (verbose >= 3) { + System.err.println("Calc outputs from D[" + i + "][" + + j + "]"); + System.err.println(cell(D, i, j)); + } + + } + } + + // If global alignment, then the final cell is really the bottom + // right of D[][] + if (!localAlign) + final_cell = cell(D, seqA.length(), seqB.length()); + + double encAlignModel = final_cell.encode_params(); + 
double encAlignment = encAlignModel + final_cell.get_val(); + if (localAlign) { + // Add a cost for the length of the alignment. + // Assume we know the length of the sequences. Need to encode + // the start and end + // of the alignment. Assume uniform over all positions. Have 4 + // cut-points to + // encode, but only need to encode 3 (kind of). + // So encode 2 from the shorter sequence, and 1 from the longer. + double l1 = (seqA.length() < seqB.length() ? seqA.length() + : seqB.length()); + double l2 = (seqA.length() > seqB.length() ? seqA.length() + : seqB.length()); + encAlignment += MyMath.log2(l1) * 2 - 1; + encAlignment += MyMath.log2(l2); + } + + double encA = modelA.encodeCumulative(seqA.length()); + double encB = modelB.encodeCumulative(seqB.length()); + double encNull = encA + encB; + + if (localAlign) { + // Encode lengths for the null theory + // Note that global alignments ignore lengths, ignore for null + // when doing global. + encNull += encode_length(seqA.length()); + encNull += encode_length(seqB.length()); + } + + if (doTraceBack) { + String s = getAlignment(D, + (Mutation_FSM.TraceBack_Info) final_cell); + System.out.println("ALIGNMENT:\n" + s); + } + + // Get new parameters for next iteration + p = fsmType.counts_to_params(final_cell.get_counts()); + /* + * if (localAlign) { double n = final_cell.get_counts().get(0); + * p.put("PalignChar", (n-1)/n); } + */ + + if (verbose >= 2) { + System.out.println("encA=" + encA + " encB=" + encB + + " encNull=" + (encNull)); + + System.out.print("Mutual Encoding = " + encAlignment + " bits"); + System.out.println(" (model=" + encAlignModel + " data=" + + (encAlignment - encAlignModel) + ")"); + System.out.println("\nCounts:\n" + final_cell.get_counts()); + System.out.println("\nEstimated Parameters:\n" + p); + } + System.out.println((encAlignment < encNull ? 
"related" + : "unrelated") + + " (" + + (encNull - encAlignment) + + ")" + + " log odds ratio = " + + (encNull - encAlignment) + + " bits"); + + if (bestDiff < encNull - encAlignment) + bestDiff = encNull - encAlignment; + + if (iter > 0 && verbose >= 1 && encAlignment > lastAlignment) { + System.err.println("NON-CONVERGENCE: this=" + encAlignment + + " last=" + lastAlignment); + } + + // Done we change little in alignment length? + if (iter > 0 && encAlignment <= lastAlignment + && lastAlignment - encAlignment < 0.1) + break; + + lastAlignment = encAlignment; + iter++; + } // End iterations + + return bestDiff; + } + + public static void printLicense() { + System.err.println(""); + System.err + .println(" AlignCompress, Copyright (C) 2004 David Powell "); + System.err + .println(" AlignCompress comes with ABSOLUTELY NO WARRANTY; and is provided"); + System.err + .println(" under the GNU Public License v2, for details see file COPYRIGHT"); + System.err.println(""); + System.err.println("Please cite:"); + System.err.println(" L. Allison, D. R. Powell and T. I. 
Dix"); + System.err.println(" \"Compression and Approximate Matching\""); + System.err.println(" The Computer Journal, 1999 42:1 pp1-10"); + System.err.println(""); + } + + public static void main(String args[]) throws Exception { + printLicense(); + + CommandLine cmdLine = new CommandLine(); + cmdLine.addInt("markov", 0, + "Order of Markov Model to use for sequence models."); + cmdLine.addInt("maxIterations", -1, "Maximum number of iterations."); + cmdLine.addBoolean("blendModel", false, + "Use blend model (a fixed model)."); + cmdLine.addBoolean("linearCosts", true, "Use linear gap costs."); + cmdLine.addBoolean("sumAlignments", false, "Sum over all alignments."); + cmdLine.addBoolean("local", true, "Compute using local alignments."); + cmdLine.addInt("verbose", 0, + "Display verbose output (larger num means more verbosity)."); + cmdLine.addBoolean("exSeq", false, + "Command line options are explicit sequence, not filenames"); + cmdLine.addBoolean("protein", false, "Sequences are protein data"); + cmdLine.addString("params", "", + "Params to pass to all classes (comma separated)"); + + args = cmdLine.parseLine(args); + + AlignCompress a = new AlignCompress(); + boolean readFile = !cmdLine.getBooleanVal("exSeq"); + + if (args == null || args.length != 2) { + System.err + .println("Usage: java AlignCompress [options] \n" + + cmdLine.usageMessage()); + System.exit(1); + } else if (readFile && args.length == 2) { + // Two filenames on commandline + Sequence d = SequenceReader.getReader(args[0]).nextSequence(null);// (filename)IOTools.read(args[0]); + a.seqA = d.toString(); + d = SequenceReader.getReader(args[1]).nextSequence(null);// (filename)IOTools.read(args[0]); + a.seqB = d.toString(); + } else if (args.length == 2) { + // Two strings on commandline, assume they are sequences + a.seqA = args[0]; + a.seqB = args[1]; + } + + if (a.seqA == null || a.seqB == null) { + System.err.println("Unable to read both sequences"); + System.exit(1); + } + + a.seqA = 
a.seqA.toLowerCase(); + a.seqB = a.seqB.toLowerCase(); + + a.markovOrder = cmdLine.getIntVal("markov"); + a.maxIterations = cmdLine.getIntVal("maxIterations"); + a.useBlendModel = cmdLine.getBooleanVal("blendModel"); + a.linearCosts = cmdLine.getBooleanVal("linearCosts"); + a.sumAlignments = cmdLine.getBooleanVal("sumAlignments"); + a.localAlign = cmdLine.getBooleanVal("local"); + a.verbose = cmdLine.getIntVal("verbose"); + a.paramString = cmdLine.getStringVal("params"); + + if (cmdLine.getBooleanVal("protein")) { + a.alphabet = new char[] { 'a', 'r', 'n', 'd', 'c', 'q', 'e', 'g', + 'h', 'i', 'l', 'k', 'm', 'f', 'p', 's', 't', 'w', 'y', 'v', + 'b', 'z', 'x', '*' }; + } else { + a.alphabet = new char[] { 'a', 't', 'g', 'c' }; + // a.alphabet = new char[] {'a', 't', 'g', 'c', + // 'y', 'r', 'w', 'n'}; + } + + System.out.println("-log odds ratio = " + a.doAlign() + " bits"); + + } // End main() +} diff --git a/src/main/java/japsa/bio/misc/common/Alignment_viewer.java b/src/main/java/japsa/bio/misc/common/Alignment_viewer.java new file mode 100755 index 0000000..35ae5ac --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Alignment_viewer.java @@ -0,0 +1,132 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +import java.io.*; + +import japsa.bio.misc.dnaPlatform.gui.MainFrame; + +import java.awt.*; +import java.awt.event.*; + +class Alignment_viewer extends Panel { + + static final long serialVersionUID = MainFrame.serialVersionUID; + Label l1, l2; + ScrollPane sp; + Panel p; + + public Alignment_viewer(String a1) { + int p_ = a1.indexOf("\n"); + init(a1.substring(0, p_), a1.substring(p_ + 1)); + } + + public Alignment_viewer(String a1, String a2) { + init(a1, a2); + } + + private void init(String a1, String a2) { + StringBuffer mat = new StringBuffer(); + for (int i = 0; i < a1.length() && i < a2.length(); i++) { + char c1 = a1.charAt(i); + char c2 = a2.charAt(i); + mat.append((!Character.isLowerCase(c1) && Character.isLetter(c1) && c1 == c2) ? "|" + : " "); + } + + setFont(new Font("Fixed", Font.PLAIN, 12)); + setLayout(new GridLayout(1, 1)); + + p = new Panel(); + p.setLayout(new GridLayout(3, 1)); + + l1 = new Label(a1); + l2 = new Label(a2); + p.add(l1); + p.add(new Label(mat.toString())); + p.add(l2); + + sp = new ScrollPane(); + sp.add(p); + add(sp); + } + + public void init_size() { + sp.setSize((int) p.getPreferredSize().getWidth(), (int) p + .getPreferredSize().getHeight() + 80); + setSize(sp.getPreferredSize()); + } + + public static void main(String args[]) { + Alignment_viewer t = null; + + if (args.length == 0) { + // No args, read sequence from stdin. 
+ String a1 = null, a2 = null; + try { + BufferedReader in = new BufferedReader(new InputStreamReader( + System.in)); + a1 = in.readLine(); + a2 = in.readLine(); + } catch (Exception e) { + System.err.println("Error stdin: " + e); + } + t = new Alignment_viewer(a1, a2); + } else if (args.length == 1) { + // One arg, assume it is a filename to read the sequences from + String a1 = null, a2 = null; + try { + BufferedReader in = new BufferedReader(new FileReader(args[0])); + a1 = in.readLine(); + a2 = in.readLine(); + in.close(); + } catch (Exception e) { + System.err.println("Error reading '" + args[0] + "': " + e); + } + t = new Alignment_viewer(a1, a2); + } else if (args.length == 2) + t = new Alignment_viewer(args[0], args[1]); + else { + System.err.println("Usage: java Alignment_viewer "); + System.exit(-1); + } + + Frame f = new Frame("Sequence Alignment"); + f.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + System.exit(0); + } + }); + + f.add("Center", t); + // f.setSize((int)t.getPreferredSize().getWidth(), + // (int)t.getMinimumSize().getHeight()); + // f.show(); + f.setVisible(true); + t.init_size(); + f.setSize(t.getPreferredSize().getWidth() < 800 ? (int) t + .getPreferredSize().getWidth() : 800, (int) t + .getPreferredSize().getHeight()); + } +} diff --git a/src/main/java/japsa/bio/misc/common/CombineGetMin.java b/src/main/java/japsa/bio/misc/common/CombineGetMin.java new file mode 100755 index 0000000..d3f2a64 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/CombineGetMin.java @@ -0,0 +1,93 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/**
 * Combines two whitespace separated text files line by line: for every pair
 * of non-comment lines the last column of each is parsed as a double and the
 * smaller of the two values is printed, prefixed with a running index.
 */
public class CombineGetMin {
    String inFile1, inFile2;

    /**
     * @param in1
     *            path of the first input file
     * @param in2
     *            path of the second input file
     */
    public CombineGetMin(String in1, String in2) {
        this.inFile1 = in1;
        this.inFile2 = in2;
    }

    /**
     * Returns the next line that does not start with '#'.
     *
     * @param in
     *            reader to pull lines from
     * @return the next non-comment line, or null at end of input
     * @throws IOException
     *             if the underlying read fails
     */
    String readLn(BufferedReader in) throws IOException {
        String ln;
        do {
            ln = in.readLine();
        } while (ln != null && ln.startsWith("#"));
        return ln;
    }

    /**
     * Reads both input files in lock step and writes "index TAB min" to
     * stdout for each pair of data lines, stopping as soon as either file is
     * exhausted. Any failure is reported with a stack trace on stderr; both
     * readers are closed whether or not a failure occurred.
     */
    public void combineMin() {
        BufferedReader in1 = null;
        BufferedReader in2 = null;
        try {
            in1 = new BufferedReader(new FileReader(inFile1));
            in2 = new BufferedReader(new FileReader(inFile2));

            String line1, line2;
            int count = 0;
            while (((line1 = readLn(in1)) != null)
                    && (line2 = readLn(in2)) != null) {

                // Columns are separated by a single space or a tab; only the
                // last column carries the value.
                String arr1[] = line1.split(" |\t");
                String arr2[] = line2.split(" |\t");

                double value1 = Double.parseDouble(arr1[arr1.length - 1]);
                double value2 = Double.parseDouble(arr2[arr2.length - 1]);
                System.out.println(count + "\t" + Math.min(value1, value2));
                count++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeReader(in1);
            closeReader(in2);
        }
    }

    /** Closes a reader if it was opened, reporting (not throwing) failures. */
    private static void closeReader(BufferedReader reader) {
        if (reader == null)
            return;
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @param args
     *            the two input file names
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: CombineGetMin inFile1 inFile2");
            System.exit(1);
        }

        CombineGetMin sm = new CombineGetMin(args[0], args[1]);
        sm.combineMin();
    }
}
Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +public final class Counts {// implements Serializable { + public double counts[]; + public int num; + + public Counts(int num) { + counts = new double[num]; + this.num = num; + } + + public double get(int i) { + return counts[i]; + }; + + public void zero() { + for (int i = 0; i < num; i++) { + counts[i] = 0; + } + } + + public void combine_with_lens(double myLen, Counts c, double otherLen) { + double min = MyMath.min2(myLen, otherLen); + double w1 = (min == myLen ? 1 : MyMath.exp2(min - myLen)); + double w2 = (min == otherLen ? 
1 : MyMath.exp2(min - otherLen)); + scale(w1); + linearWeight(c, w2); + scale(1.0 / (w1 + w2)); + } + + public void linearWeight(Counts c, double w) { + for (int i = 0; i < num; i++) { + counts[i] += w * c.counts[i]; + } + } + + public void inc(int index, double w) { + counts[index] += w; + } + + public void scale(double w) { + for (int i = 0; i < num; i++) + counts[i] *= w; + } + + public void duplicate(Counts c) { + for (int i = 0; i < num; i++) + counts[i] = c.counts[i]; + } + + public Object clone() { + Counts c = new Counts(num); + c.duplicate(this); + return c; + } + + public String toString() { + String r = new String(""); + for (int i = 0; i < num; i++) { + r = Misc.sprintf("%s %d:%.3f", new Object[] { r, new Integer(i), + new Double(counts[i]) }); + } + return r; + } +} diff --git a/src/main/java/japsa/bio/misc/common/Has_Value.java b/src/main/java/japsa/bio/misc/common/Has_Value.java new file mode 100755 index 0000000..b40b246 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Has_Value.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
/**
 * Contract for objects that expose a single numeric value and can be cloned
 * through a public {@code clone()} (Object's own clone() is protected, so it
 * is re-declared here to make it callable through this interface).
 */
public interface Has_Value {

    /** @return the object produced by the implementation's clone operation */
    Object clone();

    /** @return the numeric value carried by this object */
    double get_val();
}
+ + + */ + +package japsa.bio.misc.common; + +/** + * An arbitrary order, adaptive Markov Model for an arbitrary dna of + * characters + * + * If order == -1 then we are a uniform model over the dna + */ +public class MarkovN implements Seq_Model { + /** + * + */ + private static final long serialVersionUID = 1L; + char[] chars; + int[] charCounts; + int[] countTotal; + int order; + + StringBuffer past; + + public MarkovN(int order, char[] chars) { + Misc.my_assert(order >= -1, "Bad order=" + order); + + this.chars = chars; + this.order = order; + + if (order >= 0) { + charCounts = new int[(int) Math.pow(chars.length, order + 1)]; + countTotal = new int[(int) Math.pow(chars.length, order)]; + for (int i = 0; i < charCounts.length; i++) + charCounts[i] = 1; + for (int i = 0; i < countTotal.length; i++) + countTotal[i] = chars.length; + } + + past = new StringBuffer(); + } + + int chars2Num(String c) { + int res = 0, i; + for (i = 0; i < c.length(); i++) { + int j; + for (j = 0; j < chars.length; j++) + if (chars[j] == c.charAt(i)) + break; + Misc.my_assert(j < chars.length, "Character '" + c.charAt(i) + + "' is unexpected"); + res = (res * chars.length) + j; + } + // System.out.println("char2Num("+c+")="+res); + return res; + } + + public double encodeLen(char a, int i) { + if (past.length() < order || order < 0) + return -MyMath.log2((double) 1.0 / chars.length); + int n = charCounts[chars2Num(past.substring(i - order) + a)]; + int d = countTotal[chars2Num(past.substring(i - order))]; + return -MyMath.log2((double) n / d); + } + + public double update(char a, int i) { + if (past.length() < order || order < 0) { + past.append(a); + return -MyMath.log2((double) 1.0 / chars.length); + } + double res = encodeLen(a, i); + charCounts[chars2Num(past.substring(i - order) + a)]++; + countTotal[chars2Num(past.substring(i - order))]++; + past.append(a); + return res; + } + + /** + * Minh Duc Added + */ + public double probability(char a, int i) { + if (past.length() < order 
|| order < 0) + return 1.0 / chars.length; + int n = charCounts[chars2Num(past.substring(i - order) + a)]; + int d = countTotal[chars2Num(past.substring(i - order))]; + return ((double) n) / d; + } + + public static void main(String args[]) { + String s = args[0]; + char[] a = { 'a', 't', 'g', 'c' }; + MarkovN m = new MarkovN(0, a); + + double tot = 0; + for (int i = 0; i < s.length(); i++) { + double r = m.update(s.charAt(i), i); + tot += r; + System.out.println(r); + } + System.out.println("Total entropy = " + tot + " bits/ch"); + } +} diff --git a/src/main/java/japsa/bio/misc/common/MarkovN_exact.java b/src/main/java/japsa/bio/misc/common/MarkovN_exact.java new file mode 100755 index 0000000..98dbba3 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/MarkovN_exact.java @@ -0,0 +1,148 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +/** + * An arbitrary order Markov Model for an arbitrary dna of characters. This + * meant to be used when the transition probabilities are known. 
+ * + * If order == -1 then we are a uniform model over the dna + */ + +public class MarkovN_exact implements Seq_Model { + private static final long serialVersionUID = Seq_Model.serialVersionUID; + + char[] chars; + double[] probs; + int order; + + StringBuffer past; + + public MarkovN_exact(int order, char[] chars) { + Misc.my_assert(order >= -1, "Bad order=" + order); + + this.chars = chars; + this.order = order; + + if (order >= 0) { + probs = new double[(int) Math.pow(chars.length, order + 1)]; + for (int i = 0; i < probs.length; i++) + probs[i] = 1.0 / chars.length; + } + + past = new StringBuffer(); + } + + protected String num2Chars(int n) { + StringBuffer res = new StringBuffer(); + for (int i = 0; i < order + 1; i++) { + res.append(chars[n % chars.length]); + n = n / chars.length; + } + return res.reverse().toString(); + } + + protected int chars2Num(String c) { + int res = 0, i; + for (i = 0; i < c.length(); i++) { + int j; + for (j = 0; j < chars.length; j++) + if (chars[j] == c.charAt(i)) + break; + Misc.my_assert(j < chars.length, "Character '" + c.charAt(i) + + "' is unexpected"); + res = (res * chars.length) + j; + } + // System.out.println("char2Num("+c+")="+res); + return res; + } + + public void setProb(String c, double p) { + Misc.my_assert(c.length() - 1 == order, + "Bad length of chars passed to setProb: c=" + c); + if (order < 0) + return; + probs[chars2Num(c)] = p; + } + + public void normalise() { + for (int n = 0; n < probs.length; n += chars.length) { + double sum = 0; + for (int i = 0; i < chars.length; i++) + sum += probs[n + i]; + for (int i = 0; i < chars.length; i++) + probs[n + i] /= sum; + } + } + + public double encodeLen(char a, int i) { + if (past.length() < order || order < 0) + return -MyMath.log2((double) 1.0 / chars.length); + double p = probs[chars2Num(past.substring(i - order) + a)]; + return -MyMath.log2(p); + } + + public double update(char a, int i) { + double res = encodeLen(a, i); + past.append(a); + return res; + } + 
+ public String toString() { + StringBuffer res = new StringBuffer(); + res.append("Markov_exact: probs:\n"); + if (order < 0) + return res.toString(); + for (int i = 0; i < probs.length; i++) { + String s = num2Chars(i); + res.append("p[" + s.substring(order) + " | " + + s.substring(0, order) + "]"); + res.append(" = " + probs[i] + "\n"); + } + return res.toString(); + } + + public static void main(String args[]) { + String s = args[0]; + char[] a = { 'a', 't', 'g', 'c' }; + MarkovN_exact m = new MarkovN_exact(1, a); + + m.setProb("aa", 1); + m.setProb("at", 1); + m.setProb("ag", 2); + m.setProb("ac", 1); + m.normalise(); + + System.out.println(m); + + double tot = 0; + for (int i = 0; i < s.length(); i++) { + double r = m.update(s.charAt(i), i); + tot += r; + System.out.println(r); + } + System.out.println("Total entropy = " + tot + " bits/ch"); + } + +} diff --git a/src/main/java/japsa/bio/misc/common/MarkovN_fitted.java b/src/main/java/japsa/bio/misc/common/MarkovN_fitted.java new file mode 100755 index 0000000..098884f --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/MarkovN_fitted.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +/** + * An arbitrary order Markov Model for an arbitrary dna of characters. The + * model is first fitted to the sequence. This is _not_ a proper model because + * the model cost is not accounted for. Thus the entropy of a sequence + * calculated by this class is below its real value. The discrepency become + * worse for higher order models. + * + * If order == -1 then we are a uniform model over the dna + */ +public class MarkovN_fitted implements Seq_Model { + /** + * + */ + private static final long serialVersionUID = 1L; + char[] chars; + int[] charCounts; + int[] countTotal; + int order; + + StringBuffer past; + + public MarkovN_fitted(int order, char[] chars, String s) { + Misc.my_assert(order >= -1, "Bad order=" + order); + + this.chars = chars; + this.order = order; + + if (order >= 0) { + charCounts = new int[(int) Math.pow(chars.length, order + 1)]; + countTotal = new int[(int) Math.pow(chars.length, order)]; + for (int i = 0; i < charCounts.length; i++) + charCounts[i] = 1; + for (int i = 0; i < countTotal.length; i++) + countTotal[i] = chars.length; + } + + past = new StringBuffer(); + + // Iterate over sequence to fill in all counts + for (int i = 0; i < s.length(); i++) { + if (i < order || order < 0) + continue; + charCounts[chars2Num(s.substring(i - order, i) + s.charAt(i))]++; + countTotal[chars2Num(s.substring(i - order, i))]++; + } + } + + private int chars2Num(String c) { + int res = 0, i; + for (i = 0; i < c.length(); i++) { + int j; + for (j = 0; j < chars.length; j++) + if (chars[j] == c.charAt(i)) + break; + Misc.my_assert(j < chars.length, "Character '" + c.charAt(i) + + "' is unexpected"); + res = (res * chars.length) + j; + } + // System.out.println("char2Num("+c+")="+res); + return res; + } + + public double encodeLen(char a, int i) { + if (past.length() < order || order < 0) + return -MyMath.log2((double) 1.0 / chars.length); + int n = charCounts[chars2Num(past.substring(i - order) + a)]; + 
int d = countTotal[chars2Num(past.substring(i - order))]; + return -MyMath.log2((double) n / d); + } + + public double update(char a, int i) { + double res = encodeLen(a, i); + past.append(a); + return res; + } + + public static void main(String args[]) { + String s = args[0]; + char[] a = { 'a', 't', 'g', 'c' }; + MarkovN_fitted m = new MarkovN_fitted(0, a, s); + + double tot = 0; + for (int i = 0; i < s.length(); i++) { + double r = m.update(s.charAt(i), i); + tot += r; + System.out.println(r); + } + System.out.println("Total entropy = " + tot + " bits/ch"); + } +} diff --git a/src/main/java/japsa/bio/misc/common/Misc.java b/src/main/java/japsa/bio/misc/common/Misc.java new file mode 100755 index 0000000..1f304a0 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Misc.java @@ -0,0 +1,325 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +import java.io.*; +import java.util.*; + +public final class Misc { + public static void my_assert(boolean e, String s) { + if (!e) + error(s); + } + + public static void error(String s) { + System.err.println("ERROR: " + s); + + // VNTRReadDepth o = (VNTRReadDepth) (new Object()); // Force a crash. want stack trace. 
+ throw new RuntimeException("Assertion wrong"); + // System.exit(1); + } + + static String readString(InputStream in) { + StringBuffer s = new StringBuffer(); + try { + byte[] buf = new byte[1024]; + int n; + while ((n = in.read(buf)) >= 0) { + s.append(new String(buf, 0, n)); + } + } catch (Exception e) { + System.err.println("Unable to read stdin"); + System.exit(1); + } + return s.toString(); + } + + private static String pad(String s, int width, boolean pre0, boolean right) { + if (s.length() >= width) + return s; + StringBuffer prefix = new StringBuffer(); + while (prefix.length() + s.length() < width) + prefix.append((pre0 ? '0' : ' ')); + + if (right || pre0) + return prefix.toString() + s; + else + return s + prefix.toString(); + } + + static String numFmt(long num, int width, boolean pre0, boolean right) { + return pad(Long.toString(num), width, pre0, right); + } + + static String numFmt(float num, int prec, int width, boolean pre0, + boolean right) { + return numFmt((double) num, prec, width, pre0, right); + } + + static String numFmt(double num, int prec, int width, boolean pre0, + boolean right) { + if (prec < 0) + return pad(Double.toString(num), width, pre0, right); + + // Round num first. + double mul = Math.pow(10, prec); + num = Math.round(num * mul) / mul; + + long i = (long) num; + StringBuffer res; + if (i == 0 && num < 0) + res = new StringBuffer("-0"); + else + res = new StringBuffer(Long.toString(i)); + if (prec == 0) + return pad(res.toString(), width, pre0, right); + + res.append("."); + + num = Math.abs(num - i); + long dec = Math.round(num * Math.pow(10, prec)); + res.append(numFmt(dec, prec, true, false)); + return pad(res.toString(), width, pre0, right); + } + + /** + * An extremely limited and poorly implemented sprintf function. 
Only + * handles %s %d %f with width and precision + */ + public static String sprintf(String fmt, Object[] objs) { + StringBuffer res = new StringBuffer(); + int obj_i = 0; + int i = 0; + while (i < fmt.length()) { + int p = fmt.indexOf('%', i); + if (p < 0 || p + 1 == fmt.length()) { + res.append(fmt.substring(i)); + break; + } + + res.append(fmt.substring(i, p)); + + if (fmt.charAt(p + 1) == '%') { + res.append("%"); + i = p + 2; + continue; + } + + boolean rightAlign = true; + boolean pre0 = false; + int width = -1; + int prec = -1; + p++; + while (true) { + char c = fmt.charAt(p); + if (c == 'd') { + Object o = objs[obj_i++]; + if (o instanceof Integer) { + res.append(numFmt(((Integer) o).intValue(), width, + pre0, rightAlign)); + } else { + System.err.println("Format %d not passed an integer"); + res.append(o); + } + p++; + break; + } + if (c == 'f') { + Object o = objs[obj_i++]; + if (o instanceof Double) { + res.append(numFmt(((Double) o).doubleValue(), prec, + width, pre0, rightAlign)); + } else if (o instanceof Float) { + res.append(numFmt(((Float) o).floatValue(), prec, + width, pre0, rightAlign)); + } else { + System.err + .println("Format %d not passed an double or float"); + res.append(o); + } + p++; + break; + } + if (c == 's') { + res.append(pad(objs[obj_i++].toString(), width, false, + rightAlign)); + p++; + break; + } + + if (c == '.') { + prec = -2; + p++; + continue; + } + + if (c == '-') { + rightAlign = false; + p++; + continue; + } + + if (c >= '0' && c <= '9') { + int j; + for (j = p + 1; j < fmt.length() && c >= '0' && c <= '9'; j++) { + c = fmt.charAt(j); + } + int v = Integer.parseInt(fmt.substring(p, j - 1)); + if (prec == -2) + prec = v; + else { + width = v; + pre0 = fmt.charAt(p) == '0'; + } + p = j - 1; + continue; + } + System.err.println("Unexpected character in % format '" + c + + "'"); + break; + } + i = p; + } + return res.toString(); + } + + /** + * Can use like this: sprintf( fmt, new + * VarArgs(i1).add(i2).add(i3).add(i4)); 
+ */ + public static String sprintf(String fmt, VarArgs v) { + return sprintf(fmt, v.toArray()); + } + + // public static void printf(String fmt) { + // System.out.print(fmt); + // } + + public static void printf(String fmt, Object[] objs) { + System.out.print(sprintf(fmt, objs)); + } + + // public static void printf(String fmt, VarArgs v) { + // printf(fmt, v.toArray()); + // } + + // public static void printf(String fmt, Object v1) { + // printf(fmt, new Object[] {v1}); + // } + + // public static void printf(String fmt, int v1) { + // printf(fmt, new Object[] {new Integer(v1)}); + // } + + // public static void printf(String fmt, int v1, int v2) { + // printf(fmt, new Object[] {new Integer(v1), new Integer(v2)}); + // } + + public static void printf(String fmt, int v1, int v2, int v3) { + printf(fmt, new Object[] { new Integer(v1), new Integer(v2), + new Integer(v3), }); + } + + public static void printf(String fmt, int v1, int v2, int v3, int v4) { + printf(fmt, new Object[] { new Integer(v1), new Integer(v2), + new Integer(v3), new Integer(v4) }); + } + + public static void printf(String fmt, double v1) { + printf(fmt, new Object[] { new Double(v1) }); + } + + public static void printf(String fmt, double v1, double v2) { + printf(fmt, new Object[] { new Double(v1), new Double(v2) }); + } + + public static void printf(String fmt, double v1, double v2, double v3) { + printf(fmt, new Object[] { new Double(v1), new Double(v2), + new Double(v3), }); + } + + public static void printf(String fmt, double v1, double v2, double v3, + double v4) { + printf(fmt, new Object[] { new Double(v1), new Double(v2), + new Double(v3), new Double(v4) }); + } + + public static class VarArgs { + Vector o = new Vector(); + + public VarArgs(char v) { + add(v); + } + + public VarArgs(int v) { + add(v); + } + + public VarArgs(double v) { + add(v); + } + + public VarArgs(float v) { + add(v); + } + + public VarArgs(Object v) { + add(v); + } + + public VarArgs add(char v) { + return add(new 
Character(v)); + } + + public VarArgs add(int v) { + return add(new Integer(v)); + } + + public VarArgs add(double v) { + return add(new Double(v)); + } + + public VarArgs add(float v) { + return add(new Float(v)); + } + + public VarArgs add(Object v) { + o.add(v); + return this; + } + + public Object[] toArray() { + return o.toArray(); + } + } + + public static void main(String args[]) { + System.out.println(sprintf(args[0], new Object[] { new Double(5), + new String("abc") })); + // System.out.println( numFmt(10, 5, true) ); + // System.out.println( numFmt(10.1099, -2) ); + // System.out.println( numFmt(-10.9999, 3) ); + } + +} diff --git a/src/main/java/japsa/bio/misc/common/Model_SeqAB.java b/src/main/java/japsa/bio/misc/common/Model_SeqAB.java new file mode 100755 index 0000000..5c37a4c --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Model_SeqAB.java @@ -0,0 +1,138 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +/** + * A model of 2 sequences (for alignments) with counts. + * + * Characters from each sequence are encoded with a sequence specific model. + * Matches/changes average the probabilites from these models. 
+ * + * Uses two parameters: match_cost, change_cost + **/ +public class Model_SeqAB implements Two_Seq_Model_Counts { + double match_cost, change_cost; + Seq_Model modelA, modelB; + + int countIndex; + final private int matchIndex = 0, changeIndex = 1; + + public Model_SeqAB(Params p, Seq_Model modelA, Seq_Model modelB, + int countIndex) { + this.countIndex = countIndex; + this.modelA = modelA; + this.modelB = modelB; + + if (!p.exists("match_cost")) { + set_default_costs(); + } else { + match_cost = p.get("match_cost"); + change_cost = p.get("change_cost"); + } + + normalize_costs(); + } + + public String toString() { + return this.getClass() + ": match_cost=" + match_cost + " change_cost=" + + change_cost; + } + + void set_default_costs() { + match_cost = -MyMath.log2(9.0); + change_cost = -MyMath.log2(1.0); + } + + void normalize_costs() { + double sum = MyMath.exp2(-match_cost) + MyMath.exp2(-change_cost); + match_cost = match_cost + MyMath.log2(sum); + change_cost = change_cost + MyMath.log2(sum); + } + + public double encA(char a, int i) { + return modelA.encodeLen(a, i); + } + + public double encB(char a, int i) { + return modelB.encodeLen(a, i); + } + + public double encBoth(char a, char b, int i, int j) { + double A_cost = encA(a, i); + double B_cost = encB(b, j); + if (a == b) { + // Match + // Do: P(match) * ( P(char a) + P(char b) ) / 2 + // System.err.println("enc match = " + ( match_cost + + // MyMath.logplus(A_cost, B_cost) + 1)); + return match_cost + MyMath.logplus(A_cost, B_cost) + 1; + } else { + // Change + // Do: P(change) * P(char a) * P(char b) * 0.5 * (1/(1-P(char b)) + + // 1/(1-P(char a))) + double aN = MyMath.exp2(-encA(b, i)); + double bN = MyMath.exp2(-encB(a, j)); + double norm = -MyMath.log2(1 / (1 - aN) + 1 / (1 - bN)); + // System.err.println("enc change = " + (change_cost + A_cost + + // B_cost + 1 + norm)); + return change_cost + A_cost + B_cost + 1 + norm; + } + } + + public static int required_counts() { + return 2; + } + + 
public Params counts_to_params(Counts counts) { + double sum = counts.counts[countIndex + matchIndex] + + counts.counts[countIndex + changeIndex]; + Params par = new Params(); + par.put("match_cost", + -MyMath.log2(counts.counts[countIndex + matchIndex] / sum)); + par.put("change_cost", + -MyMath.log2(counts.counts[countIndex + changeIndex] / sum)); + return par; + } + + public void update_count_encA(Counts c, double w, char a, int i) { + }; + + public void update_count_encB(Counts c, double w, char a, int i) { + }; + + public void update_count_encBoth(Counts c, double w, char a, char b, int i, + int j) { + if (a == b) { + c.inc(countIndex + matchIndex, w); + } else { + c.inc(countIndex + changeIndex, w); + } + } + + public double encode_params(double N) { + return Multinomial.MMLparameter_cost( + new double[] { MyMath.exp2(-match_cost), + MyMath.exp2(-change_cost) }, N); + } +} diff --git a/src/main/java/japsa/bio/misc/common/Multinomial.java b/src/main/java/japsa/bio/misc/common/Multinomial.java new file mode 100755 index 0000000..da0f730 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Multinomial.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +public class Multinomial { + /** + * Compute the MML estimate for encoding the parameters to the multinomial. + * p[] are the probabilites (should sum to 1) N is the number of data items + * that will be encoded. + * + * Encoding length of the Multinomial parameters are returned in bits. + */ + static public double MMLparameter_cost(double[] p, double N) { + + double h = MyMath.factorial(p.length - 1); // h(theta) - prior + // probabilty density = + // (K-1)! + double F = 1.0 / p[0]; // F will be the Fischer = N^(K-1)/(p1*p2*...*pk) + for (int i = 1; i < p.length - 1; i++) { + F *= N / p[i]; + } + + double cost = 0.5 * MyMath.log2(1 + F + / (h * h * Math.pow(12, p.length - 1))); + + cost += 0.5 * (p.length - 1) * MyMath.log2e; + + return cost; + } +} diff --git a/src/main/java/japsa/bio/misc/common/Mutation_1State.java b/src/main/java/japsa/bio/misc/common/Mutation_1State.java new file mode 100755 index 0000000..4d857cf --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Mutation_1State.java @@ -0,0 +1,335 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +import java.io.*; + +abstract public class Mutation_1State extends Mutation_FSM { + + /** + * + */ + private static final long serialVersionUID = 1L; + + private class FSM_Params implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + double indel_cost, diag_cost; + Two_Seq_Model_Counts s; + + FSM_Params(Two_Seq_Model_Counts s, double indel_cost, double diag_cost) { + this.s = s; + this.indel_cost = indel_cost; + this.diag_cost = diag_cost; + + normalize_costs(); + } + + FSM_Params(Two_Seq_Model_Counts s, Params p) { + this.s = s; + + if (!p.exists("indel_cost")) { + set_default_costs(); + } else { + indel_cost = p.get("indel_cost"); + diag_cost = p.get("diag_cost"); + } + + normalize_costs(); + } + + void set_default_costs() { + indel_cost = 1; + diag_cost = 0; + } + + void normalize_costs() { + double sum = 2 * MyMath.exp2(-indel_cost) + MyMath.exp2(-diag_cost); + indel_cost = indel_cost + MyMath.log2(sum); + diag_cost = diag_cost + MyMath.log2(sum); + // VNTRReadDepth.printf("ins_cost=%.5f del_cost=%.5f diag_cost=%.5f\n", + // ins_cost, del_cost, diag_cost); + } + + public String toString() { + return "diag_cost=" + diag_cost + " indel_cost=" + indel_cost; + } + } + + FSM_Params p; + + double val; + + final protected int indelIndex = 0, diagIndex = 1; + + public Mutation_1State(Two_Seq_Model_Counts s, double indel_cost, + double diag_cost, int numCounts, int countIndex) { + super(numCounts, countIndex); + p = new FSM_Params(s, indel_cost, diag_cost); + reset(); + } + + public Mutation_1State(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(numCounts, countIndex); + p = new FSM_Params(s, par); + reset(); + } + + public Mutation_1State(FSM_Params p, int numCounts, int countIndex) { + super(numCounts, countIndex); + this.p = p; + reset(); + } + + public void reset() { + super.reset(); + val = MyMath.Big_Double; + counts.zero(); + } + + public static int 
required_counts() { + return 2; + }; + + // counts_to_params - convert counts into parameters. + // Call the Two_Seq_Model to convert its own. + // Note the 0.5* in the indel computation. This is cause there are 2 indel + // arcs out + // of each state, but we only have one count for them combined. + public Params counts_to_params(Counts c) { + double sum = c.counts[countIndex + indelIndex] + + c.counts[countIndex + diagIndex]; + Params par = new Params(); + par.put("indel_cost", + -MyMath.log2(0.5 * c.counts[countIndex + indelIndex] / sum)); + par.put("diag_cost", + -MyMath.log2(c.counts[countIndex + diagIndex] / sum)); + par.join(p.s.counts_to_params(c)); + return par; + }; + + public double encode_params() { + Counts c = get_counts(); + + // First encode match/change paramters. Pass the number of these events + // to encode_params + double len = p.s.encode_params(c.counts[countIndex + diagIndex]); + + double dataLen = c.counts[countIndex + diagIndex] + + c.counts[countIndex + indelIndex]; + + len += Multinomial.MMLparameter_cost( + new double[] { MyMath.exp2(-p.indel_cost), + MyMath.exp2(-p.diag_cost) }, dataLen); + return len; + } + + public double alignmentLength() { + Counts c = get_counts(); + return c.counts[countIndex + indelIndex] + + c.counts[countIndex + diagIndex]; + } + + public void init_val(double v) { + val = v; + }; + + public double get_val() { + return val; + }; + + public void normalise(double v) { + val -= v; + } + + public String paramsToString() { + return this.getClass() + ": " + p + "\n"; + } + + public String toString() { + return "val=" + val + " counts=" + counts + "\n"; + } + + public void add(double v, int cIndex) { + val += v; + if (cIndex >= 0) + counts.inc(cIndex, 1); + } + + public static class One extends Mutation_1State implements TraceBack_Info { + /** + * + */ + private static final long serialVersionUID = 1L; + TraceBack_Data id; + TraceBack_Data from; + + public One(FSM_Params p, int numCounts, int countIndex) { + super(p, 
numCounts, countIndex); + id = null; + from = null; + } + + public One(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(s, par, numCounts, countIndex); + id = null; + from = null; + } + + public void set_tbdata(TraceBack_Data id) { + this.id = id; + } + + public TraceBack_Data get_tbdata() { + return id; + } + + public TraceBack_Data get_from(TraceBack_Data td) { + Misc.my_assert(td.i == id.i && td.j == id.j, + "Not my traceback data!"); + return from; + } + + public Object clone() { + return new Mutation_1State.One(p, numCounts, countIndex); + } + + public void calc(Mutation_FSM h, Mutation_FSM v, Mutation_FSM d, + char a, char b, int i, int j) { + Mutation_1State.One hcell = (Mutation_1State.One) h; + Mutation_1State.One vcell = (Mutation_1State.One) v; + Mutation_1State.One dcell = (Mutation_1State.One) d; + + Counts tcounts = (Counts) counts.clone(); + if (hcell != null) { + double w = val + p.indel_cost + p.s.encB(b, j); + + tcounts.duplicate(counts); + tcounts.inc(countIndex + indelIndex, 1); + p.s.update_count_encB(tcounts, 1, b, j); + + hcell.or(w, tcounts, this); + } + + if (vcell != null) { + double w = val + p.indel_cost + p.s.encA(a, i); + + tcounts.duplicate(counts); + tcounts.inc(countIndex + indelIndex, 1); + p.s.update_count_encA(tcounts, 1, a, i); + + vcell.or(w, tcounts, this); + } + + if (dcell != null) { + double w = val + p.diag_cost + p.s.encBoth(a, b, i, j); + + tcounts.duplicate(counts); + tcounts.inc(countIndex + diagIndex, 1); + p.s.update_count_encBoth(tcounts, 1, a, b, i, j); + + dcell.or(w, tcounts, this); + } + }; + + public void or(double d, Counts c) { + or(d, c, null); + } + + public void or(double d, Counts c, Mutation_FSM f) { + if (d < val) { + counts.duplicate(c); + val = d; + from = (f == null ? 
null : ((One) f).id); + } + } + + } + + public static class All extends Mutation_1State { + private static final long serialVersionUID = 1L; + + public All(FSM_Params p, int numCounts, int countIndex) { + super(p, numCounts, countIndex); + } + + public All(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(s, par, numCounts, countIndex); + } + + public Object clone() { + return new Mutation_1State.All(p, numCounts, countIndex); + } + + public void calc(Mutation_FSM h, Mutation_FSM v, Mutation_FSM d, + char a, char b, int i, int j) { + Mutation_1State.All hcell = (Mutation_1State.All) h; + Mutation_1State.All vcell = (Mutation_1State.All) v; + Mutation_1State.All dcell = (Mutation_1State.All) d; + + if (hcell != null) { + double w = val + p.indel_cost + p.s.encB(b, j); + double count_inc = 1.0 / (1.0 + MyMath.exp2(w - hcell.val)); + if (Double.isNaN(count_inc)) + count_inc = 0; + hcell.or(w, counts); + hcell.p.s.update_count_encB(hcell.counts, count_inc, b, j); + hcell.counts.inc(countIndex + indelIndex, count_inc); + } + + if (vcell != null) { + double w = val + p.indel_cost + p.s.encA(a, i); + double count_inc = 1.0 / (1.0 + MyMath.exp2(w - vcell.val)); + if (Double.isNaN(count_inc)) + count_inc = 0; + vcell.or(w, counts); + vcell.p.s.update_count_encA(vcell.counts, count_inc, a, i); + vcell.counts.inc(countIndex + indelIndex, count_inc); + } + + if (dcell != null) { + double w = val + p.diag_cost + p.s.encBoth(a, b, i, j); + double count_inc = 1.0 / (1.0 + MyMath.exp2(w - dcell.val)); + if (Double.isNaN(count_inc)) + count_inc = 0; + dcell.or(w, counts); + dcell.p.s.update_count_encBoth(dcell.counts, count_inc, a, b, + i, j); + dcell.counts.inc(countIndex + diagIndex, count_inc); + } + }; + + public void or(double d, Counts c) { + // Update the counts first + counts.combine_with_lens(val, c, d); + val = MyMath.logplus(val, d); + } + } + +} diff --git a/src/main/java/japsa/bio/misc/common/Mutation_3State.java 
b/src/main/java/japsa/bio/misc/common/Mutation_3State.java new file mode 100755 index 0000000..14aac83 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Mutation_3State.java @@ -0,0 +1,690 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +import java.io.*; + +/** + * Despite its name, this class does _not_ implement a full 3 state FSM. It + * implements a 3 state FSM for the purpose of Linear gapped costs for a DPA. 
+ * There are therefore 5 parameters: given the last operation was a diagonal + * (match or change): diag, start gap given the last operation was a gap: diag, + * start new gap, continue current gap + * + * So, the start_fromD encodes the start of a gap _and_ the first character + * 2**-diag_fromD + 2 * 2**-start_fromD = 1 (2 possible ways to start a gap: ins + * or del) 2**-diag_fromI + 2**-start_fromI + 2**-cont_fromI = 1 + * + * The match/mismatch costs are part of the Two_Seq_Model + */ + +public abstract class Mutation_3State extends Mutation_FSM { + + /** + * + */ + private static final long serialVersionUID = 1L; + + private class FSM_Params implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + + Two_Seq_Model_Counts s; + + double diag_fromD, start_fromD; + + double diag_fromI, start_fromI, cont_fromI; + + /* + * FSM_Params(Two_Seq_Model_Counts s, double diag_cost, double + * start_gap, double cont_gap) { this.s = s; + * + * normalize_costs(diag_cost, start_gap, cont_gap); } + */ + + FSM_Params(Two_Seq_Model_Counts s, Params p) { + this.s = s; + + if (!p.exists("diag_fromD")) { + set_default_costs(); + } else { + diag_fromD = p.get("diag_fromD"); + start_fromD = p.get("start_fromD"); + diag_fromI = p.get("diag_fromI"); + start_fromI = p.get("start_fromI"); + cont_fromI = p.get("cont_fromI"); + + normalize_costs(); + } + } + + void set_default_costs() { + /* + * diag_fromD = 0; start_fromD = 3; + * + * diag_fromI = 0; start_fromI = 3; cont_fromI = 1; + */ + + // These costs are equivalent to SW scores (provided local char cost + // = 0.104808) + // Provided we are using DNA at 2 bits per char + diag_fromD = MyMath.logplus(0.709616, 1.824653); + start_fromD = 4.235037; + + diag_fromI = MyMath.logplus(1.379387, 2.494424); + start_fromI = 4.904808; + cont_fromI = 1.304808; + + if (s instanceof Model_SeqAB) { + ((Model_SeqAB) s).match_cost = 0.709616; + ((Model_SeqAB) s).change_cost = 1.824653; + ((Model_SeqAB) 
s).normalize_costs(); + } + + normalize_costs(); + } + + void normalize_costs() { + double sum; + + sum = MyMath.exp2(-diag_fromD) + 2 * MyMath.exp2(-start_fromD); + diag_fromD = diag_fromD + MyMath.log2(sum); + start_fromD = start_fromD + MyMath.log2(sum); + + sum = MyMath.exp2(-diag_fromI) + MyMath.exp2(-start_fromI) + + MyMath.exp2(-cont_fromI); + diag_fromI = diag_fromI + MyMath.log2(sum); + start_fromI = start_fromI + MyMath.log2(sum); + cont_fromI = cont_fromI + MyMath.log2(sum); + } + + public String toString() { + return "diag_fromD=" + diag_fromD + " start_fromD=" + start_fromD + + "\n" + "diag_fromI=" + diag_fromI + " start_fromI=" + + start_fromI + " cont_fromI=" + cont_fromI; + } + } + + FSM_Params p; + + double dval, hval, vval; + Counts d_counts, h_counts, v_counts; + + final protected int diag_fromD = 0, start_fromD = 1, diag_fromI = 2, + start_fromI = 3, cont_fromI = 4; + + public Mutation_3State(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(numCounts, countIndex); + p = new FSM_Params(s, par); + reset(); + } + + public Mutation_3State(FSM_Params p, int numCounts, int countIndex) { + super(numCounts, countIndex); + this.p = p; + reset(); + } + + public void reset() { + super.reset(); + dval = MyMath.Big_Double; + hval = MyMath.Big_Double; + vval = MyMath.Big_Double; + + if (d_counts == null) + d_counts = new Counts(numCounts); + if (h_counts == null) + h_counts = new Counts(numCounts); + if (v_counts == null) + v_counts = new Counts(numCounts); + d_counts.zero(); + h_counts.zero(); + v_counts.zero(); + + // counts = null; + } + + public static int required_counts() { + return 5; + }; + + // counts_to_params - convert counts into parameters. + // Call the Two_Seq_Model to convert its own. + // Note the 0.5* in the start_fromD computation. This is cause there are 2 + // start_fromD + // arcs out of each state, but we only have one count for them combined. 
+ public Params counts_to_params(Counts c) { + Params par = new Params(); + double sum1 = c.counts[countIndex + diag_fromD] + + c.counts[countIndex + start_fromD]; + double sum2 = c.counts[countIndex + diag_fromI] + + c.counts[countIndex + start_fromI] + + c.counts[countIndex + cont_fromI]; + + par.put("diag_fromD", + -MyMath.log2(c.counts[countIndex + diag_fromD] / sum1)); + par.put("start_fromD", + -MyMath.log2(0.5 * c.counts[countIndex + start_fromD] / sum1)); + par.put("diag_fromI", + -MyMath.log2(c.counts[countIndex + diag_fromI] / sum2)); + par.put("start_fromI", + -MyMath.log2(c.counts[countIndex + start_fromI] / sum2)); + par.put("cont_fromI", + -MyMath.log2(c.counts[countIndex + cont_fromI] / sum2)); + par.join(p.s.counts_to_params(c)); + + return par; + }; + + public double encode_params() { + Counts c = get_counts(); + double[] probs; + double dataLen; + + // First encode match/change paramters. Pass the number of these events + // to encode_params + double len = p.s.encode_params(c.counts[countIndex + diag_fromD] + + c.counts[countIndex + diag_fromI]); + + // 'probs' is the paramaters of the multinomial distribution. + // 'dataLen' is the number of things in this multinomial. 
+ + probs = new double[] { MyMath.exp2(-p.diag_fromD), + 2 * MyMath.exp2(-p.start_fromD) }; + dataLen = c.counts[countIndex + diag_fromD] + + c.counts[countIndex + start_fromD]; + + len += Multinomial.MMLparameter_cost(probs, dataLen); + + probs = new double[] { MyMath.exp2(-p.diag_fromI), + MyMath.exp2(-p.start_fromI), MyMath.exp2(-p.cont_fromI) }; + dataLen = c.counts[countIndex + diag_fromI] + + c.counts[countIndex + start_fromI] + + c.counts[countIndex + start_fromI]; + + len += Multinomial.MMLparameter_cost(probs, dataLen); + + return len; + } + + public double alignmentLength() { + Counts c = get_counts(); + return c.counts[countIndex + diag_fromD] + + c.counts[countIndex + start_fromD] + + c.counts[countIndex + diag_fromI] + + c.counts[countIndex + start_fromI] + + c.counts[countIndex + cont_fromI]; + } + + public void init_val(double v) { + dval = v; + hval = vval = MyMath.Big_Double; + }; + + public void normalise(double v) { + dval -= v; + hval -= v; + vval -= v; + } + + /** Initialise diag counts */ + public void init_counts(Counts c) { + d_counts.duplicate(c); + } + + public String paramsToString() { + return this.getClass() + ": " + p + "\n"; + } + + public String toString() { + return "dval=" + dval + " hval=" + hval + " vval=" + vval + "\n" + + "d_counts=" + d_counts + "\n" + "h_counts=" + h_counts + "\n" + + "v_counts=" + v_counts + "\n"; + } + + /** + * add() - something extra must be encoded in this state. 
Update all three + * states + */ + public void add(double v, int cIndex) { + dval += v; + hval += v; + vval += v; + if (cIndex >= 0) { + d_counts.inc(cIndex, 1); + h_counts.inc(cIndex, 1); + v_counts.inc(cIndex, 1); + } + } + + public static class One extends Mutation_3State implements TraceBack_Info { + + /** + * + */ + private static final long serialVersionUID = 1L; + + private static class TB_Data extends TraceBack_Data { + int state; + + // TB_Data() { + // super(); + // state = -1; + // } + + TB_Data(TraceBack_Data td, int s) { + super(); + i = td.i; + j = td.j; + state = s; + } + } + + TB_Data d_id, v_id, h_id; + TB_Data d_from, v_from, h_from; + + public One(FSM_Params p, int numCounts, int countIndex) { + super(p, numCounts, countIndex); + d_id = v_id = h_id = null; + d_from = v_from = h_from = null; + } + + public One(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(s, par, numCounts, countIndex); + d_id = v_id = h_id = null; + d_from = v_from = h_from = null; + } + + public void set_tbdata(TraceBack_Data id) { + d_id = new TB_Data(id, 0); + v_id = new TB_Data(id, 1); + h_id = new TB_Data(id, 2); + } + + public TraceBack_Data get_tbdata() { + if (dval <= hval && dval <= vval) { + return d_id; + } else if (vval <= dval && vval <= hval) { + return v_id; + } else if (hval <= dval && hval <= vval) { + return h_id; + } + Misc.my_assert(false, "Bad vals"); + return null; + } + + public TraceBack_Data get_from(TraceBack_Data td) { + TB_Data t = (TB_Data) td; + Misc.my_assert(t.i == d_id.i && t.j == d_id.j, + "Not my traceback data!"); + switch (t.state) { + case 0: + return d_from; + case 1: + return v_from; + case 2: + return h_from; + default: + Misc.my_assert(false, "Bad trackback data. 
t.state=" + t.state); + } + return null; + } + + public Object clone() { + return new Mutation_3State.One(p, numCounts, countIndex); + } + + public double get_val() { + return MyMath.min3(dval, vval, hval); + }; + + public Counts get_counts() { + // Get counts from state with smallest val + if (dval <= hval && dval <= vval) + return d_counts; + + if (vval <= dval && vval <= hval) + return v_counts; + + if (hval <= dval && hval <= vval) + return h_counts; + + Misc.my_assert(false, "Bad vals!"); + return null; + }; + + public void calc(Mutation_FSM h, Mutation_FSM v, Mutation_FSM d, + char a, char b, int i, int j) { + Mutation_3State.One hcell = (Mutation_3State.One) h; + Mutation_3State.One vcell = (Mutation_3State.One) v; + Mutation_3State.One dcell = (Mutation_3State.One) d; + + Counts tcounts = (Counts) counts.clone(); + if (hcell != null) { + double char_cost = p.s.encB(b, j); + double w; + // double count_inc; + + // From 'd' state + w = dval + char_cost + p.start_fromD; + tcounts.duplicate(d_counts); + tcounts.inc(countIndex + start_fromD, 1); + p.s.update_count_encB(tcounts, 1, b, j); + hcell.or_h(w, tcounts, d_id); + + // From 'v' state + w = vval + char_cost + p.start_fromI; + tcounts.duplicate(v_counts); + tcounts.inc(countIndex + start_fromI, 1); + p.s.update_count_encB(tcounts, 1, b, j); + hcell.or_h(w, tcounts, v_id); + + // From 'h' state + w = hval + char_cost + p.cont_fromI; + tcounts.duplicate(h_counts); + tcounts.inc(countIndex + cont_fromI, 1); + p.s.update_count_encB(tcounts, 1, b, j); + hcell.or_h(w, tcounts, h_id); + } + + if (vcell != null) { + double char_cost = p.s.encA(a, i); + double w; + // double count_inc; + + // From 'd' state + w = dval + char_cost + p.start_fromD; + tcounts.duplicate(d_counts); + tcounts.inc(countIndex + start_fromD, 1); + p.s.update_count_encA(tcounts, 1, a, i); + vcell.or_v(w, tcounts, d_id); + + // From 'v' state + w = vval + char_cost + p.cont_fromI; + tcounts.duplicate(v_counts); + tcounts.inc(countIndex + 
cont_fromI, 1); + p.s.update_count_encA(tcounts, 1, a, i); + vcell.or_v(w, tcounts, v_id); + + // From 'h' state + w = hval + char_cost + p.start_fromI; + tcounts.duplicate(h_counts); + tcounts.inc(countIndex + start_fromI, 1); + p.s.update_count_encA(tcounts, 1, a, i); + vcell.or_v(w, tcounts, h_id); + } + + if (dcell != null) { + double char_cost = p.s.encBoth(a, b, i, j); + double w; + // double count_inc; + + // From 'd' state + w = dval + char_cost + p.diag_fromD; + tcounts.duplicate(d_counts); + tcounts.inc(countIndex + diag_fromD, 1); + p.s.update_count_encBoth(tcounts, 1, a, b, i, j); + dcell.or_d(w, tcounts, d_id); + + // From 'v' state + w = vval + char_cost + p.diag_fromI; + tcounts.duplicate(v_counts); + tcounts.inc(countIndex + diag_fromI, 1); + p.s.update_count_encBoth(tcounts, 1, a, b, i, j); + dcell.or_d(w, tcounts, v_id); + + // From 'h' state + w = hval + char_cost + p.diag_fromI; + tcounts.duplicate(h_counts); + tcounts.inc(countIndex + diag_fromI, 1); + p.s.update_count_encBoth(tcounts, 1, a, b, i, j); + dcell.or_d(w, tcounts, h_id); + } + } + + public void or(double d, Counts c) { + or(d, c, null); + } + + /** + * or() - a new transition into this cell. 
All new transitions start in + * the diag state, so just call or_d + */ + public void or(double d, Counts c, Mutation_FSM from) { + TB_Data f = null; + if (from != null) + f = (TB_Data) (((One) from).get_tbdata()); + or_d(d, c, f); + } + + public void or_d(double d, Counts c, TB_Data from_id) { + if (d < dval) { + d_counts.duplicate(c); + dval = d; + d_from = from_id; + } + } + + public void or_h(double d, Counts c, TB_Data from_id) { + if (d < hval) { + h_counts.duplicate(c); + hval = d; + h_from = from_id; + } + } + + public void or_v(double d, Counts c, TB_Data from_id) { + if (d < vval) { + v_counts.duplicate(c); + vval = d; + v_from = from_id; + } + } + } + + public static class All extends Mutation_3State { + /** + * + */ + private static final long serialVersionUID = 1L; + + public All(FSM_Params p, int numCounts, int countIndex) { + super(p, numCounts, countIndex); + } + + public All(Two_Seq_Model_Counts s, Params par, int numCounts, + int countIndex) { + super(s, par, numCounts, countIndex); + } + + public Object clone() { + return new Mutation_3State.All(p, numCounts, countIndex); + } + + public double get_val() { + double v = MyMath.logplus(dval, hval); + v = MyMath.logplus(v, vval); + return v; + }; + + public Counts get_counts() { + // Combine counts from each of the three states weighted by their + // val + Counts c = (Counts) d_counts.clone(); + double val = dval; + + c.combine_with_lens(val, h_counts, hval); + val = MyMath.logplus(val, hval); + + c.combine_with_lens(val, v_counts, vval); + + return c; + }; + + public void calc(Mutation_FSM h, Mutation_FSM v, Mutation_FSM d, + char a, char b, int i, int j) { + Mutation_3State.All hcell = (Mutation_3State.All) h; + Mutation_3State.All vcell = (Mutation_3State.All) v; + Mutation_3State.All dcell = (Mutation_3State.All) d; + + if (hcell != null) { + double char_cost = p.s.encB(b, j); + double w; + double count_inc; + + // From 'd' state + w = dval + char_cost + p.start_fromD; + count_inc = 1.0 / (1.0 + 
MyMath.exp2(w - hcell.hval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + hcell.or_h(w, d_counts); + hcell.p.s.update_count_encB(hcell.h_counts, count_inc, b, j); + hcell.h_counts.inc(countIndex + start_fromD, count_inc); + + // From 'v' state + w = vval + char_cost + p.start_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - hcell.hval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + hcell.or_h(w, v_counts); + hcell.p.s.update_count_encB(hcell.h_counts, count_inc, b, j); + hcell.h_counts.inc(countIndex + start_fromI, count_inc); + + // From 'h' state + w = hval + char_cost + p.cont_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - hcell.hval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + hcell.or_h(w, h_counts); + hcell.p.s.update_count_encB(hcell.h_counts, count_inc, b, j); + hcell.h_counts.inc(countIndex + cont_fromI, count_inc); + } + + if (vcell != null) { + double char_cost = p.s.encA(a, i); + double w; + double count_inc; + + // From 'd' state + w = dval + char_cost + p.start_fromD; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - vcell.vval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + vcell.or_v(w, d_counts); + vcell.p.s.update_count_encA(vcell.v_counts, count_inc, a, i); + vcell.v_counts.inc(countIndex + start_fromD, count_inc); + + // From 'v' state + w = vval + char_cost + p.cont_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - vcell.vval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + vcell.or_v(w, v_counts); + vcell.p.s.update_count_encA(vcell.v_counts, count_inc, a, i); + vcell.v_counts.inc(countIndex + cont_fromI, count_inc); + + // From 'h' state + w = hval + char_cost + p.start_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - vcell.vval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + vcell.or_v(w, h_counts); + vcell.p.s.update_count_encA(vcell.v_counts, count_inc, a, i); + vcell.v_counts.inc(countIndex + start_fromI, count_inc); + } + + if (dcell != null) { + double char_cost = p.s.encBoth(a, b, i, j); + double w; + 
double count_inc; + + // From 'd' state + w = dval + char_cost + p.diag_fromD; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - dcell.dval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + dcell.or_d(w, d_counts); + dcell.p.s.update_count_encBoth(dcell.d_counts, count_inc, a, b, + i, j); + dcell.d_counts.inc(countIndex + diag_fromD, count_inc); + + // From 'v' state + w = vval + char_cost + p.diag_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - dcell.dval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + dcell.or_d(w, v_counts); + dcell.p.s.update_count_encBoth(dcell.d_counts, count_inc, a, b, + i, j); + dcell.d_counts.inc(countIndex + diag_fromI, count_inc); + + // From 'h' state + w = hval + char_cost + p.diag_fromI; + count_inc = 1.0 / (1.0 + MyMath.exp2(w - dcell.dval)); + if (Double.isNaN(count_inc)) + count_inc = 0; + dcell.or_d(w, h_counts); + dcell.p.s.update_count_encBoth(dcell.d_counts, count_inc, a, b, + i, j); + dcell.d_counts.inc(countIndex + diag_fromI, count_inc); + } + + } + + /** + * or() - a new transition into this cell. All new transitions start in + * the diag state, so just call or_d + */ + public void or(double d, Counts c) { + or_d(d, c); + + /* + * if (d + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +import java.io.Serializable; + + + +public abstract class Mutation_FSM implements Has_Value, Serializable { + /** + * + */ + private static final long serialVersionUID = 4340122370640374306L; + Counts counts; + int numCounts; + int countIndex; + + class FSM_Params { + Two_Seq_Model s; + + FSM_Params(Two_Seq_Model s) { + this.s = s; + }; + } + + public Mutation_FSM() { + this(0, 0); + } + + public Mutation_FSM(int numCounts, int countIndex) { + this.numCounts = numCounts; + counts = new Counts(numCounts); + this.countIndex = countIndex; + } + + public abstract void init_val(double v); + + public abstract double get_val(); + + public abstract void normalise(double v); + + public abstract void calc(Mutation_FSM h, Mutation_FSM v, Mutation_FSM d, + char a, char b, int i, int j); + + public void reset() { + counts.zero(); + } + + /** Initialise all counts */ + public void init_counts(Counts c) { + counts.duplicate(c); + } + + public static int required_counts() { + return 0; + }; + + public abstract Params counts_to_params(Counts c); + + public Counts get_counts() { + return counts; + }; + + public double alignmentLength() { + System.err.println("WARNING: alignmentLength() not implemented in " + + this.getClass()); + return 0; + }; + + public abstract void add(double d, int cIndex); + + public abstract void or(double d, Counts c); + + public String paramsToString() { + return this.getClass() + ": paramsToString() not defined" + "\n"; + } + + public double encode_params() { + System.err.println("WARNING: encode_params() not implemented in " + + this.getClass()); + return 0; + } + + public abstract Object clone(); + + public static class TraceBack_Data { + public int i, j; + + public TraceBack_Data() { + i = -1; + j = -1; + } + + public TraceBack_Data(int i, int j) { + this.i = i; + this.j = j; + } + + public String toString() { + return "i=" + i + " j=" + j; + }; + } + + public interface TraceBack_Info { + public void 
set_tbdata(TraceBack_Data id); + + public TraceBack_Data get_tbdata(); + + public TraceBack_Data get_from(TraceBack_Data td); + + public void or(double d, Counts c, Mutation_FSM from); + } +} diff --git a/src/main/java/japsa/bio/misc/common/MyMath.java b/src/main/java/japsa/bio/misc/common/MyMath.java new file mode 100755 index 0000000..c4631e0 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/MyMath.java @@ -0,0 +1,133 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +public final class MyMath { + + public static final double Big_Double = Double.POSITIVE_INFINITY; + + public static final double loge2 = Math.log(2); + public static final double log2e = 1.0 / loge2; + + public static double log2(double a) { + // return Math.log(a); + return Math.log(a) * log2e; + } + + public static double exp2(double a) { + // return Math.exp(a); + return Math.exp(a * loge2); + } + + public static double logplus(double a, double b) { + // return -log(exp(-a) + exp(-b)); But this would lose accuracy + if (Double.isInfinite(a)) + return b; + if (Double.isInfinite(b)) + return a; + if (a > b) { + double t = b; + b = a; + a = t; + } + ; // make b>a + if (b > a + 50) + return a; // Approx if b >> a + return a - log2(1 + exp2(a - b)); + // return a-Math.log(1+Math.exp(a-b)); + } + + // logstar(x) - a prior over integers 1..infinity + // returns length in bits for encoding that integer + // This is for integers only! + public static double logstar_discrete(int x) { + if (x > 1) { + x = (int) log2(x); + return 1 + x + logstar_discrete(x); + } else + return 1; + } + + // logstar(x) - a prior over integers 1..infinity + // returns length in bits for encoding that integer + // This is Rissanen's continuous approximation to logstar_discrete + // according to Rissanen (1983) + public static double logstar_continuous(double x) { + if (x > 1) { + x = log2(x); + return x + logstar_continuous(x); + } else + return log2(2.865); + } + + public static double factorial(int N) { + Misc.my_assert(N >= 0, "Bad paramater to factorial"); + double res = 1; + for (int i = 2; i <= N; i++) + res *= i; + return res; + } + + public static int min2(int a, int b) { + return (a < b ? a : b); + } + + public static int max2(int a, int b) { + return (a > b ? a : b); + } + + public static double max2(double a, double b) { + return (a > b ? a : b); + } + + public static double min2(double a, double b) { + return (a < b ? 
a : b); + } + + public static double min3(double a, double b, double c) { + return (a < b ? (a < c ? a : c) : (b < c ? b : c)); + } + + public static double min4(double a, double b, double c, double d) { + return min2(min3(a, b, c), d); + } + + public static void main(String args[]) { + for (double i = 1; i < 100; i += 1) { + double v1 = MyMath.logstar_continuous(i); + // System.out.println("logstar("+i+") = "+v1); + System.out.println(i + " " + v1 + " " + + (-i * log2(0.9) - log2(0.1))); + } + + /* + * double s = Double.POSITIVE_INFINITY; for (double i=1; i<400; i+=0.01) + * { double v1 = MyMath.logstar_continuous(i); double v2 = + * MyMath.logstar_discrete((int)i); System.out.println(i+" "+v1+" "+v2); + * //s = MyMath.logplus(v,s); + * //System.out.println("i="+i+" v="+v+" sum=" + * +s+" sum_p="+MyMath.exp2(-s)); + */ + } +} diff --git a/src/main/java/japsa/bio/misc/common/NumericalSequence.java b/src/main/java/japsa/bio/misc/common/NumericalSequence.java new file mode 100755 index 0000000..fcf7af1 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/NumericalSequence.java @@ -0,0 +1,183 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.common; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.BitSet; +import java.util.Vector; + +/** + * This class represents a numerical sequence and various operations on it. + * + * @author Minh Duc Cao 
+ */ +public class NumericalSequence { + private double[] seqs; + + /** + * Create a numerical sequence that hold the data + * + * @param aSeqs + */ + public NumericalSequence(double[] aSeqs) { + seqs = new double[aSeqs.length]; + for (int index = 0; index < seqs.length; index++) { + seqs[index] = aSeqs[index];// + } + } + + public double[] getSeqs() { + return seqs; + } + + public static double[] read(String fileName) throws IOException { + + BufferedReader bufRdr = new BufferedReader(new FileReader(fileName)); + String line = null; + Vector v = new Vector(); + while ((line = bufRdr.readLine()) != null) { + if (line.startsWith("#")) + continue;// Comment lines + String[] tokens = line.trim().split("\\s");// Broken up + + v.add(Double.parseDouble(tokens[tokens.length - 1]));// .. and get + // the last + // token + } + + bufRdr.close(); + // Convert from vector to array + double[] data = new double[v.size()]; + for (int i = 0; i < v.size(); i++) + data[i] = v.get(i); + + return data; + } + + public double[] smooth(int wSize) { + double[] out = new double[seqs.length]; + double[] his = new double[wSize]; + int index = 0; + double sum = 0.0; + + for (int i = 0; i < wSize; i++) { + his[i] = 0.0; + } + + for (int i = 0; i < seqs.length; i++) { + index = i % wSize; + sum = sum - his[index] + seqs[i]; + his[index] = seqs[i]; + + if (i < wSize) + out[i / 2] = sum / (i + 1); + else + out[i - wSize / 2] = sum / wSize; + } + + for (int i = wSize - 1; i > 0; i--) { + index = (index + 1) % wSize; + sum -= his[index]; + out[out.length - i / 2 - 1] = sum / (i); + } + return out; + } + + public double getSum(BitSet bs) { + double sum = 0; + for (int index = 0; index < seqs.length; index++) { + if (bs.get(index)) + sum += seqs[index]; + } + + return sum; + } + + public double getSum() { + double sum = 0; + for (int index = 0; index < seqs.length; index++) { + sum += seqs[index]; + } + return sum; + } + + NumericalSequence difference(NumericalSequence aSeq) { + // This may throw 
exception if the sizes are not match + double[] anoSeq = aSeq.seqs; + double[] newSeq = new double[seqs.length]; + + for (int i = 0; i < newSeq.length; i++) { + newSeq[i] = seqs[i] - anoSeq[i]; + } + return new NumericalSequence(newSeq); + } + + public boolean writeDataToFile(File file) { + try { + PrintWriter pw = new PrintWriter(new FileOutputStream(file)); + pw.println("# Double data written by DNAGraphTool"); + for (int i = 0; i < seqs.length; i++) { + pw.println(i + "\t" + seqs[i]); + } + pw.close(); + } catch (Exception e) { + e.printStackTrace(); + } + return true; + } + + static double[] difference(double[] seq1, double[] seq2) { + double[] seqResult = new double[seq1.length]; + + for (int i = 0; i < seqResult.length; i++) { + seqResult[i] = seq1[i] - seq2[i]; + } + + return seqResult; + } + + public static void main(String[] args) { + try { + double[] seq0 = read(args[0]), seq1 = read(args[1]); + + NumericalSequence output = new NumericalSequence(difference(seq0, + seq1)); + output.writeDataToFile(new File(args[2])); + + } catch (Exception e) { + e.printStackTrace(); + } + + } +} diff --git a/src/main/java/japsa/bio/misc/common/Params.java b/src/main/java/japsa/bio/misc/common/Params.java new file mode 100755 index 0000000..7542837 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Params.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + + */ + +package japsa.bio.misc.common; + +import java.io.Serializable; + +public final class Params implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + private String names[]; + private double vals[]; + private int num, room; + + private void alloc(int r) { + if (room >= r) + return; + + String[] n = new String[r]; + double[] v = new double[r]; + + if (room > 0) { // Copy over any data + for (int i = 0; i < num; i++) { + n[i] = names[i]; + v[i] = vals[i]; + } + } + + room = r; + names = n; + vals = v; + } + + public Params() { + num = 0; + room = 0; + alloc(10); + } + + public Params put(String s, double v) { + int id = get_id(s); + if (id < 0) { + if (num == room) + alloc(room * 2); + names[num] = new String(s); + id = num; + num++; + } + vals[id] = v; + + return this; + } + + private int get_id(String s) { + for (int i = 0; i < num; i++) { + if (s.equalsIgnoreCase(names[i])) + return i; + } + return -1; + } + + public boolean exists(String s) { + return get_id(s) >= 0; + } + + public double get(String s) { + int id = get_id(s); + Misc.my_assert(id >= 0, "Attempt Params.get() with non-existent key"); + return vals[id]; + } + + public int get_num() { + return num; + } + + public String get_name_by_id(int id) { + return names[id]; + } + + public void join(Params p) { + for (int i = 0; i < p.num; i++) { + put(p.names[i], p.vals[i]); + } + } + + public String toString() { + StringBuffer r = new StringBuffer(); + for (int i = 0; i < num; i++) { + r.append(names[i] + "=" + vals[i] + "\n"); + } + return r.toString(); + } + + public void fromString(String str) { + int s = 0; + int e; + do { + int p_comma, p_newline; + p_comma = str.indexOf(',', s); + p_newline = str.indexOf('\n', s); + + if (p_comma < 0) + p_comma = 
str.length(); + if (p_newline < 0) + p_newline = str.length(); + + e = (p_comma < p_newline ? p_comma : p_newline); + + int m = str.indexOf('=', s); + if (s < m && m < e) { + String p = str.substring(s, m); + String v = str.substring(m + 1, e); + double d = Double.parseDouble(v); + + put(p, d); + } + s = e + 1; + } while (s < str.length()); + } + + public static void main(String args[]) { + Params p = new Params(); + p.fromString(args[0]); + System.out.println("p=" + p); + } + +} diff --git a/src/main/java/japsa/bio/misc/common/Seq_Model.java b/src/main/java/japsa/bio/misc/common/Seq_Model.java new file mode 100755 index 0000000..807846d --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Seq_Model.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +import java.io.*; + +/** + * The interface for any left to right model of sequence + */ +public interface Seq_Model extends Serializable { + public static final long serialVersionUID = 1234567890; + + /** + * Calculate what it would cost to encode a character + * + * @see #update + * @param a + * The character to encode + * @param i + * The character a is the ith character + * of the sequence + * @return The length to encode the character a + */ + double encodeLen(char a, int i); + + /** + * Update the internal model for encoding a character. Like + * {@link #encodeLen} but actually updates the internal state of the model + * as required. + * + * @see #encodeLen + * @param a + * The character to encode + * @param i + * The character a is the ith character + * of the sequence + * @return The length to encode the character a + */ + double update(char a, int i); +} diff --git a/src/main/java/japsa/bio/misc/common/Two_Seq_Model.java b/src/main/java/japsa/bio/misc/common/Two_Seq_Model.java new file mode 100755 index 0000000..f7be290 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Two_Seq_Model.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +import java.io.*; + +public interface Two_Seq_Model extends Serializable { + public abstract double encA(char a, int i); + + public abstract double encB(char a, int i); + + public abstract double encBoth(char a, char b, int i, int j); + + public abstract double encode_params(double N); +} diff --git a/src/main/java/japsa/bio/misc/common/Two_Seq_Model_Counts.java b/src/main/java/japsa/bio/misc/common/Two_Seq_Model_Counts.java new file mode 100755 index 0000000..e36c076 --- /dev/null +++ b/src/main/java/japsa/bio/misc/common/Two_Seq_Model_Counts.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) David Powell + * + * + * This file is used by both FuzzyLZ and AlignCompress + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + + */ + +package japsa.bio.misc.common; + +public interface Two_Seq_Model_Counts extends Two_Seq_Model { + public abstract void update_count_encA(Counts c, double w, char a, int i); + + public abstract void update_count_encB(Counts c, double w, char a, int i); + + public abstract void update_count_encBoth(Counts c, double w, char a, + char b, int i, int j); + + public abstract Params counts_to_params(Counts c); +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/OptionsHandle.java b/src/main/java/japsa/bio/misc/dnaPlatform/OptionsHandle.java new file mode 100755 index 0000000..162ed52 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/OptionsHandle.java @@ -0,0 +1,684 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform; + +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: OptionsHandle + *

+ * + *

+ * Description: A FunctionHandle holds options for functions. Options for + * functions include integers, boolean, doubles, strings and SequenceData. An + * OptionsHandle can also have other OptionsHandle as option + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +public class OptionsHandle { + + // this variable holds a reference to the owner of options + private Object owner; + + private int MAXOPTIONS = 100; + Option[] myOptions; + private int numberOptions; + + private String newline; + + public OptionsHandle(Object optionsOwner) { + owner = optionsOwner; + myOptions = new Option[MAXOPTIONS]; + numberOptions = 0; + + newline = System.getProperty("line.separator"); + } + + public OptionsHandle(Object optionsOwner, int options) { + owner = optionsOwner; + + MAXOPTIONS = options; + myOptions = new Option[MAXOPTIONS]; + numberOptions = 0; + + newline = System.getProperty("line.separator"); + } + + /** + * This function returns the class of the object that created OptionsHandle + * + * @return Class + */ + public Object getOwner() { + return owner; + } + + /* getNumberOfOptions: this function returns the number of options */ + public int getNumberOptions() { + return numberOptions; + } + + /* getOptionAt: returns the option name at given index */ + public String getOptionAt(int index) { + if (index < 0 || index >= numberOptions) + return null; + else + return myOptions[index].getOption(); + + } + + /* + * optionSet: this function returns true if the value of a given option has + * been changed + */ + public boolean optionSet(String option) { + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option)) { + return myOptions[i].valueChanged(); + } + } + return false; + } + + /* getHelp: given an option it returns a string with help */ + public String getHelp(String option) { + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option)) { + return myOptions[i].getHelp(); + } + } + return null; + } + + /*** Functions to add options ***/ + + /** + * Function to add an option of any type. 
+ * + * @param option + * String + * @param defaultVal + * Object + * @param help + * String + */ + public void addOption(String option, Object defaultVal, String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new Option(option, defaultVal, help); + numberOptions++; + } + } + + /** + * Function to add option holding booleans + * + * @param option + * String + * @param defaultVal + * boolean + * @param help + * String + */ + public void addBooleanOption(String option, boolean defaultVal, String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new BooleanOption(option, defaultVal, + help); + numberOptions++; + } + } + + /** + * Function to add option holding integers + * + * @param option + * String + * @param defaultVal + * int + * @param help + * String + */ + public void addIntOption(String option, int defaultVal, String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new IntOption(option, defaultVal, help); + numberOptions++; + } + } + + /** + * Function to add option holding doubles + * + * @param option + * String + * @param defaultVal + * double + * @param help + * String + */ + public void addDoubleOption(String option, double defaultVal, String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new DoubleOption(option, defaultVal, + help); + numberOptions++; + } + } + + /** + * Function to add options to hold strings + * + * @param option + * String + * @param defaultVal + * String + * @param help + * String + */ + public void addStringOption(String option, String defaultVal, String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new StringOption(option, defaultVal, + help); + numberOptions++; + } + } + + /** + * Function to add options holding SequenceData + * + * @param option + * String + * @param defaultVal + * SequenceData + * @param help + * String + */ + public void addSequenceDataOption(String option, SequenceData defaultVal, + 
String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new SequenceDataOption(option, + defaultVal, help); + numberOptions++; + } + } + + /** + * Function to add options holding OptionsHandle + * + * @param option + * String + * @param defaultVal + * SequenceData + * @param help + * String + */ + public void addOptionsHandleOption(String option, OptionsHandle defaultVal, + String help) { + if (numberOptions < MAXOPTIONS) { + myOptions[numberOptions] = new OptionsHandleOption(option, + defaultVal, help); + numberOptions++; + } + } + + /*** function to get values of options ***/ + public Object getOptionValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option)) + o = myOptions[i]; + } + return o.getValue(); + } + + public boolean getBooleanValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof Boolean) + o = myOptions[i]; + } + return ((Boolean) o.getValue()).booleanValue(); + } + + public int getIntValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof Integer) + o = myOptions[i]; + + } + return ((Integer) o.getValue()).intValue(); + } + + public double getDoubleValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof Double) + o = myOptions[i]; + + } + return ((Double) o.getValue()).doubleValue(); + } + + public String getStringValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof String) + o = myOptions[i]; + + } + return (String) o.getValue(); + } + + @SuppressWarnings({ "unused", "null" }) + public 
SequenceData getSequenceDataValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof SequenceData) + o = myOptions[i]; + return (SequenceData) o.getValue(); + + } + return null; + } + + public OptionsHandle getOptionsHandleValue(String option) { + Option o = null; + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue() instanceof OptionsHandle) + o = myOptions[i]; + + } + return (OptionsHandle) o.getValue(); + } + + /*** functions to set values of options ***/ + public void setOptionValue(String option, Object val) { + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i].getValue().getClass() + .equals(val.getClass())) + myOptions[i].setValue(val); + } + } + + public void setSequenceDataValue(String option, SequenceData val) { + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i] instanceof SequenceDataOption) + myOptions[i].setValue(val); + } + } + + public void setOptionsHandleValue(String option, OptionsHandle val) { + for (int i = 0; i < numberOptions; i++) { + if (myOptions[i].getOption().equals(option) + && myOptions[i] instanceof OptionsHandleOption) + myOptions[i].setValue(val); + } + } + + /** + * Function to create string representing an OptionsHandle object given as a + * parameter. 
+ * + * @param ops + * OptionsHandle + * @return String + */ + private String optionsToString(OptionsHandle ops) { + String optionString = ""; + for (int i = 0; i < ops.getNumberOptions(); i++) { + + String o = ops.getOptionAt(i); + Object val = ops.getOptionValue(o); + + if (val instanceof Boolean) + optionString += "\n" + o + " (boolean) = " + + ((Boolean) val).booleanValue(); + else if (val instanceof Integer) + optionString += "\n" + o + " (integer) = " + + ((Integer) val).intValue(); + else if (val instanceof Double) + optionString += "\n" + o + " (double) = " + + ((Double) val).doubleValue(); + else if (val instanceof String) + optionString += "\n" + o + " (string) = " + val; + // else if(val instanceof SequenceData) + // optionString += "\n"+ o +" = " + val; + else + optionString += "\n" + o + " = " + val; + + } + + return optionString; + } + + public String toString() { + String options = owner + "{"; + options += optionsToString(this); + options += "\n}"; + return options; + } + + /** + *

+ * Title: Option + *

+ * + *

+ * Description: This class holds options and ModelHandle + *

+ * + */ + private class Option { + + protected String option; // holds name of option + protected String help; + protected Object defaultVal; + protected Object value; + + public Option(String option, Object defaultVal, String help) { + this.option = option; + this.help = help; + this.defaultVal = value = defaultVal; + } + + public String getOption() { + return option; + } + + @SuppressWarnings("unused") + public Object getDefaultVal() { + return defaultVal; + } + + public Object getValue() { + return value; + } + + public void setValue(Object val) { + value = val; + } + + // if value != defaultVal it means value has been changed + public boolean valueChanged() { + return value != defaultVal; + } + + public String getHelp() { + return help + newline + "(default = " + defaultVal + ")"; + } + + } + + /** + *

+ * Title: BooleanOption + *

+ * + *

+ * Description: This class holds boolean options + *

+ * + */ + private class BooleanOption extends Option { + + public BooleanOption(String option, boolean defaultVal, String help) { + super(option, new Boolean(defaultVal), help); + } + + } + + /** + *

+ * Title: IntOption + *

+ * + *

+ * Description: This class holds integer options + *

+ * + */ + private class IntOption extends Option { + public IntOption(String option, int defaultVal, String help) { + super(option, new Integer(defaultVal), help); + } + + } + + /** + *

+ * Title: DoubleOption + *

+ * + *

+ * Description: This class holds double options + *

+ * + */ + private class DoubleOption extends Option { + public DoubleOption(String option, double defaultVal, String help) { + super(option, new Double(defaultVal), help); + } + } + + /** + *

+ * Title: StringOption + *

+ * + *

+ * Description: This class holds string options and ModelHandle + *

+ * + */ + private class StringOption extends Option { + public StringOption(String option, String defaultVal, String help) { + super(option, defaultVal, help); + } + + } + + /** + *

+ * Title: SequenceDataOption + *

+ * + *

+ * Description: This class holds SequenceData options + *

+ * + */ + private class SequenceDataOption extends Option { + + public SequenceDataOption(String option, SequenceData defaultVal, + String help) { + super(option, defaultVal, help); + } + + } + + /** + *

+ * Title: OptionsHandleOption + *

+ * + *

+ * Description: This class holds OptionHandle options + *

+ * + */ + private class OptionsHandleOption extends Option { + + public OptionsHandleOption(String option, OptionsHandle defaultVal, + String help) { + super(option, defaultVal, help); + } + + } + + /* + * Main function for testing + */ + public static void main(String args[]) { + + String owner = "Master"; + OptionsHandle func = new OptionsHandle(owner); + + System.out.println("Testing function handle"); + System.out.println(); + + // adding options into ModelHandle + System.out.println("Creating options ... "); + func.addIntOption("foo", 7, "Some integer"); + func.addBooleanOption("fom", false, "A bool"); + func.addDoubleOption("doub", 8.42, "A double"); + func.addStringOption("str", "cthulu", "A string"); + CharSequenceData s = new CharSequenceData(); + s.setSequenceName("char_1"); + func.addSequenceDataOption("aSequence", s, + "This is a character sequence"); + + OptionsHandle h = new OptionsHandle("Slave", 2); + h.addBooleanOption("poor", true, "very unfortunate"); + h.addDoubleOption("money", 0.0, "All his money"); + + func.addOptionsHandleOption("slave", h, "his worker"); + + // printing option values + System.out.println("These are the options: "); + System.out.println("foo = " + func.getOptionValue("foo")); + System.out.println("fom = " + func.getOptionValue("fom")); + System.out.println("doub = " + func.getOptionValue("doub")); + System.out.println("str = " + func.getOptionValue("str")); + + // setting option values + /* + * System.out.println("Setting option values ..."); + * func.setOptionValue("foo",new Integer(4)); + * func.setOptionValue("fom",new Boolean(true)); + * func.setOptionValue("doub",new Double(4.5)); + * func.setOptionValue("str","hello hello"); + * + * //printing new option values System.out.println("New values:"); + * System.out.println("foo = " + func.getOptionValue("foo")); + * System.out.println("fom = " + func.getOptionValue("fom")); + * System.out.println("doub = " + func.getOptionValue("doub")); + * System.out.println("str = " + 
func.getOptionValue("str")); + * + * //attempting to set wrong value for options + * System.out.println("Setting options with wrong value type ..."); + * func.setOptionValue("foo",new Boolean(false)); + * func.setOptionValue("fom",new Integer(666)); + * func.setOptionValue("doub","this is a string!"); + * func.setOptionValue("str",new Double(0.0)); + * + * //printing new option values + * System.out.println("Values should be same as above:"); + * System.out.println("foo = " + func.getOptionValue("foo")); + * System.out.println("fom = " + func.getOptionValue("fom")); + * System.out.println("doub = " + func.getOptionValue("doub")); + * System.out.println("str = " + func.getOptionValue("str")); + * + * + * //getting types of options + * System.out.println("These are the types of the options"); + * System.out.println("foo type = " + + * func.getOptionValue("foo").getClass()); + * System.out.println("fom type = " + + * func.getOptionValue("fom").getClass()); + * System.out.println("doub type = " + + * func.getOptionValue("doub").getClass()); + * System.out.println("str type = " + + * func.getOptionValue("str").getClass()); + * + * + * System.out.println("Checking for types of options"); + * if(func.getOptionValue("foo") instanceof Integer) + * System.out.println("foo is an integer!"); + * if(func.getOptionValue("fom") instanceof Boolean) + * System.out.println("fom is a boolean!"); + * if(func.getOptionValue("doub") instanceof Double) + * System.out.println("doub is a double!"); + */ + + // printing toString method of options handle dialog: + System.out.println("toString(): "); + System.out.println(func); + + System.out.println(); + System.out.println("Attempting to get all children out of string"); + String description = func.toString(); + + System.out.println("\t" + description + "\n"); + + // if toString() returns a string with curly braces + // add all lines in between as child nodes + if (description.indexOf('{') > -1) { + System.out.println("There are children!"); 
+ + // add anything before brakets into tree + String desStart = description.substring(0, + description.indexOf('{') + 1); + if (desStart.matches("\\S+")) + System.out.println("\t" + desStart); + + // add children to a vector and call function to add children to + // tree + description = description.substring((description.indexOf('{') + 1), + (description.lastIndexOf('}'))); + + String[] children = description.split("[\\n]"); + + for (int i = 0; i < children.length; i++) + System.out.println("\t\t\t" + children[i]); + + // add anything after children + System.out.println("The end bit is:"); + String end = description.substring(description.lastIndexOf('}'), + description.length() - 1); + if (end.matches("\\S+")) + System.out.println("\t" + end); + } + + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/compModel/CompressionModel.java b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/CompressionModel.java new file mode 100755 index 0000000..93c3ded --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/CompressionModel.java @@ -0,0 +1,68 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.compModel; + +import japsa.bio.misc.dnaPlatform.function.Function; + +/** + *

+ * Title: Compression Model + *

+ * + *

+ * Description: CompressionModel inherits from Function interface and defines + * other methods for compression models within the DNAPlatform + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + * @author Julie Bernal + * @version 1.0 + */ + +public interface CompressionModel extends Function { + + /** + * Returns the names of files containing other output of the model. + * + * @return String[] + */ + public String[] getOtherGraphs(); + + /* + * Returns the length of the sequence scanned by the model. + * + * @return int + */ + // public int getSequenceLength(); + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/compModel/ExpertCompressionModel.java b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/ExpertCompressionModel.java new file mode 100755 index 0000000..dc01e0d --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/ExpertCompressionModel.java @@ -0,0 +1,174 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.compModel; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.xm.ExpertModel; + +import java.io.*; + +/** + *

+ * Title: Expert Compression Model

+ * + *

+ * Description: Class to use ExpertModel in DNAPlatform

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class ExpertCompressionModel implements CompressionModel { + public ExpertCompressionModel() { + } + + /** + * Returns an OptionsHandle with options to run expert model. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOptions = new OptionsHandle(this, 20); + myOptions.addIntOption("hashSize", 11, + "Hash size, should not exceed 13 as limitation of Java"); + myOptions.addIntOption("maxExpert", 100, "Expert Limit"); + myOptions.addIntOption("context", 20, "Context Length"); + + myOptions.addDoubleOption("listenThreshold", .1, "Listen Threshold"); + + myOptions.addIntOption("chances", 4, + "Number of chances before expert to be removed"); + + myOptions.addStringOption("dna", "ATGC", + "Alphabet used by the sequence."); + + // myOptions.addDoubleOption() + return myOptions; + } + + /** + * Returns a Class array with one element, DNA_SEQUENCE indicating this + * model can only be applied to DNA sequences. + * + * @return Class[] + */ + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] types = { DNASequenceData.class }; + return types; + } + + /** + * Function to execute fuzzyLZ model using FuzzyDriver class. 
+ * + * @param options + * OptionsHandle + * @param data + * SequenceData + * @return SequenceData + * @throws IOException + * @throws RuntimeException + */ + public SequenceData execute(OptionsHandle options, SequenceData data) { + // throws IOException, RuntimeException { + + if (options == null) { + System.out.println("options is null!"); + return null; + } + + else { + System.out.println("Owner of options is " + options.getOwner()); + } + + DoubleSequenceData outputData = new DoubleSequenceData(data); + if (data instanceof DNASequenceData) { + + int hashSize = options.getIntValue("hashSize"); + int context = options.getIntValue("context"); + int maxExpert = options.getIntValue("maxExpert"); + double listenThreshold = options.getDoubleValue("listenThreshold"); + int chances = options.getIntValue("chances"); + + ExpertModel expertModel = new ExpertModel(hashSize, + Alphabet.DNA4(), context, maxExpert, listenThreshold, + chances, false); + + + try{ + expertModel.printParams(); + DNASequenceData dnaData = (DNASequenceData) data; + + char [] ssss = dnaData.getCharData(); + Sequence dna = new Sequence(Alphabet.DNA4(), dnaData.getCharData(), + dnaData.getSequenceName()); + + System.out.println(ssss[0] + " " + ssss[1]); + + Sequence[] dnaArray = new Sequence[1]; + + dnaArray[0] = dna; + + double[] costs = expertModel.encode(dnaArray); + + outputData.addHistory(this); + outputData.addHistory(options); + outputData.setDoubleData(costs); + }catch (Exception e){ + e.printStackTrace(System.out); + throw e; + } + } + return outputData; + } + + /** + * Returns an array of Strings with names for all other files created by + * fuzzy. 
+ * + * @return String[] + */ + public String[] getOtherGraphs() { + // other graphs created with fuzzyLZ are: + return null; + } + + public String toString() { + return "Expert Model"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/compModel/FuzzyModel.java b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/FuzzyModel.java new file mode 100755 index 0000000..4aa2f98 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/FuzzyModel.java @@ -0,0 +1,205 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.compModel; + +import java.io.*; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.bio.misc.fuzzyLZ.FuzzyDriver; + +/** + *

+ * Title: Fuzzy Model + *

+ * + *

+ * Description: Class to use FuzzyDriver in DNAPlatform

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class FuzzyModel implements CompressionModel { + + String sequenceFile; + String[] imageFiles; + + public FuzzyModel() { + } + + /** + * Returns an OptionsHandle with options to run fuzzy model. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOptions = new OptionsHandle(this, 20); + myOptions.addIntOption("maxIterations", 3, + "Max number of iterations. (0 means until convergence)"); + myOptions.addStringOption("preFile", "", + "Prepend 'preFile' to sequence."); + myOptions + .addStringOption( + "seqModel", + "markov(2)", + "Base sequence model. Use 'markov(n)' for a n-th order markov model, n=-1 for uniform"); + myOptions.addStringOption("dna", "atgc", + "Alphabet used by the sequence."); + + myOptions.addStringOption("fwdMach", "3state", + "Comma separated list of machines to use for forward matches.\n" + + "(Use an empty string '' for no forward machines.)\n" + + "Supported: 1state,3state"); + + myOptions.addStringOption("revMach", "3state", + "Comma separated list of machines to use for reverse matches.\n" + + "(Use an empty string '' for no reverse machines.)\n" + + "Supported: 1state,3state"); + + myOptions.addBooleanOption("overwrite", true, "Overwrite msglen file."); + myOptions.addIntOption("debug", 2, + "Debug level (higher gives more verbose )"); + myOptions.addIntOption("imageSize", 1024, + "Maximum Image size in pixels"); + myOptions.addIntOption("imageFreq", 0, + "Save an image every seconds. (0 - to disable)"); + myOptions.addIntOption("checkFreq", 0, + "Save a checkpoint every seconds. (0 - to disable)"); + myOptions.addIntOption("statsFreq", 300, + "Display some stats every seconds. (0 - to disable)"); + + myOptions.addStringOption("msgFile", "", + "Output file for encode length of each character.\n" + + "(The default is based on the input file name)"); + + myOptions.addStringOption("outDir", "." 
+ File.separatorChar, + "Directory to save output files in."); + + // These options are for Matches_Sparse + myOptions + .addIntOption("hashSize", 10, + "Window size to use for constructing hashtable (0 - for full N^2 algorithm)"); + myOptions.addIntOption("computeWin", 10, + "Number of cells to activate on a hashtable hit"); + myOptions + .addIntOption("cutML", 4, + "When (cell_value - base_cell > cutML) then cell is killed. (in bits)"); + // myOptions.addBoolean("plotActive", false, "true: plot only active + // cells, false: plot cell values"); + + myOptions + .addStringOption("paramFile", "", + "Parameter file to read for various model parameters (see docs)"); + + myOptions.addStringOption("resume", "", + "Filename to resume from checkpoint"); + + return myOptions; + } + + /** + * Returns a Class array with one element, DNA_SEQUENCE indicating this + * model can only be applied to DNA sequences. + * + * @return Class[] + */ + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] types = { DNASequenceData.class }; + return types; + } + + /** + * Function to execute fuzzyLZ model using FuzzyDriver class. 
+ * + * @param options + * OptionsHandle + * @param data + * SequenceData + * @return SequenceData + * @throws IOException + * @throws RuntimeException + */ + public SequenceData execute(OptionsHandle options, SequenceData data) + throws IOException, RuntimeException { + + if (options == null) { + System.out.println("options is null!"); + return null; + } + + else { + System.out.println("Owner of options is " + options.getOwner()); + } + + System.out.println(options.getStringValue("resume")); + + DoubleSequenceData outputData = new DoubleSequenceData(data); + if (data instanceof DNASequenceData) { + + FuzzyDriver fuz = new FuzzyDriver(); + String infoContentFile = new String(); + + // run fuzzyLZ + if (data instanceof DNASequenceData) { + infoContentFile = fuz.start(options, data.toString(), + ((DNASequenceData) data).getCharData()); + + // add this model to the history of outputData + outputData.addHistory(this); + outputData.addHistory(options); + + // read double data obtained from Fuzzy + outputData.readDataFromFile(infoContentFile); + + // set image files from fuzzy; + imageFiles = fuz.getImageFileNames(); + } + } + return outputData; + } + + /** + * Returns an array of Strings with names for all other files created by + * fuzzy. + * + * @return String[] + */ + public String[] getOtherGraphs() { + // other graphs created with fuzzyLZ are: + return imageFiles; + } + + public String toString() { + return "ARM"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/compModel/MarkovModel.java b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/MarkovModel.java new file mode 100755 index 0000000..ef3154e --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/compModel/MarkovModel.java @@ -0,0 +1,311 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.compModel; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: MarkovModel + *

+ * + *

+ * Description: Class to use Markov model in DNAPlatform + *

+ * + * @author Julie Bernal + * @version 1.0 + */ + +public class MarkovModel implements CompressionModel { + + public MarkovModel() { + } + + /** + * Return all options needed to run Markov model in OptionsHandle object. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + myOps.addStringOption("dna", "atgc", + "Alphabet used by the sequence."); + myOps.addIntOption("order", 0, "Order of Markov Model"); + + return myOps; + } + + /** + * Returns a Class array with one element, DNA_SEQUENCE indicating this + * model can be applied to DNA sequences and any other type of char + * sequences. + * + * @return Class[] + */ + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] types = { CharSequenceData.class }; + return types; + } + + /** + * Runs markov model with inputSequenceData to create outputSequenceData. + * The output data for this model is a InfoContentSequenceData of type + * INFORMATION_CONTENT + * + * @throws IOException + * @throws RuntimeException + */ + + public SequenceData execute(OptionsHandle options, SequenceData data) + throws RuntimeException { + + DoubleSequenceData outputData = new DoubleSequenceData(data); + if (data instanceof CharSequenceData) { + // get corresponding fields according to options + String alphabet = options.getStringValue("dna"); + int order = options.getIntValue("order"); + + Markov mark = new Markov(alphabet.length()); + + // add this model to the history of outputData + outputData.addHistory(options); + + // create double data in outputData by reading information content + // file + // created with markov + outputData.setDoubleData(mark.calcInfoContent( + ((CharSequenceData) data).getCharData(), order)); + } + return outputData; + } + + /** + * Returns the name of the file where the information content calculated by + * model is stored. 
The name of the file returned is the filename given in + * MarkovDriver + * + * @return String + */ + /* + * public String getInfoContentFile() { String prefix = (new + * File(sequenceFile)).getName(); return prefix + "_" + + * myOptions.getIntValue("order") + "MM-msgLen.txt"; } + */ + + /** + * Returns null as Markov model does not create other files. + * + * @return String[] + */ + + public String[] getOtherGraphs() { + return null; + } + + public String toString() { + return "Markov"; + } + +} + +/** + * Markov implements a k-order Markov Model, the constructor for this model + * takes as arguments the number of characters in sequence. + * + * @author Julie Bernal + * @version 1.0 + */ +class Markov { + + private int myCharSet; // number of possible characters in myString + private String nl; + + public Markov(int numCharSet) { + myCharSet = numCharSet; + nl = System.getProperty("line.separator"); + } + + /* + * printInfoContent: this class iterates through string s predicting the + * probability of every character according to a Markov Model. The + * parameter order indicates the number of characters to consider before + * current character. + */ + public void printInfoContent(String s, final int order, String fileName) { + try { + + BufferedWriter out = new BufferedWriter(new FileWriter(fileName)); + HashMap charCount = new HashMap(); + String seq, totalSeq; + int count = 1, totalCount = 1; + + double probability;// , bitsToCode = 0; + + // give probability of 1/charSet to all chars that don't have enough + // characters before them + for (int i = 0; i < order; i++) { + out.write(s.charAt(i) + "\t" + (-1 * log2(1.0 / myCharSet)) + + nl); + // bitsToCode += (-1 * log2(1.0/myCharSet)); + } + + for (int index = 0; index < s.length() - order; index++) { + + // 1. 
Look at current subsequence in s + seq = s.substring(index, index + order + 1); + totalSeq = seq.substring(0, seq.length() - 1) + "-"; + + // Get counters to predict probability: + // counter for all the characters + if (charCount.containsKey(totalSeq)) + totalCount = ((Integer) charCount.get(totalSeq)).intValue(); + else + // assume all chars seen once at the start + totalCount = myCharSet; + + // conunter for current character + if (charCount.containsKey(seq)) + count = ((Integer) charCount.get(seq)).intValue(); + else + // assume all chars seen once at start + count = 1; + + // 2. Predict probability of current character according to + // preceeding sequence (InfoContent) + probability = (double) count / (double) totalCount; + + // output information content of each character in sequence + out.write(seq + "\t" + (-1 * log2(probability)) + nl); + + // DecimalFormat probFormat = new DecimalFormat("#0.00"); + // bitsToCode += (-1 * log2(probability)); + // System.out.println("\tBits: " + bitsToCode); + + // 3. Update counters: + count++; + totalCount++; + charCount.put(seq, new Integer(count)); + charCount.put(totalSeq, new Integer(totalCount)); + } + out.close(); + + } catch (IOException e) { + } + } + + /** + * calcInfoContent: this function calculates the information content of a + * string, given an order. 
+ * + * @param s + * String + * @param order + * int + * @return double[] information content sequence + */ + + public double[] calcInfoContent(char[] sequence, int order) { + + double[] infoCont = new double[sequence.length]; + int index = 0; + + HashMap charCount = new HashMap(); + String seq, totalSeq; + int count = 1, totalCount = 1; + + double probability; + + String s = String.valueOf(sequence); + + // give probability of 1/charSet to all chars that don't have enough + // characters before them + for (int i = 0; i < order; i++) { + // out.write(s.charAt(i)+ "\t" + (-1*log2(1.0/myCharSet)) + nl); + infoCont[index] = -1 * log2(1.0 / myCharSet); + index++; + } + + for (int i = 0; i < s.length() - order; i++) { + // 1. Look at current subsequence in s + seq = s.substring(i, i + order + 1); + totalSeq = seq.substring(0, seq.length() - 1) + "-"; + + // Get counters to predict probability: + // counter for all the characters + if (charCount.containsKey(totalSeq)) + totalCount = ((Integer) charCount.get(totalSeq)).intValue(); + else + // assume all chars seen once at the start + totalCount = myCharSet; + + // conunter for current character + if (charCount.containsKey(seq)) + count = ((Integer) charCount.get(seq)).intValue(); + else + // assume all chars seen once at start + count = 1; + + // 2. Predict probability of current character according to + // preceeding sequence (InfoContent) + probability = (double) count / (double) totalCount; + + // output information content of each character in sequence + infoCont[index] = (-1 * log2(probability)); + index++; + + // 3. Update counters: + count++; + totalCount++; + charCount.put(seq, new Integer(count)); + charCount.put(totalSeq, new Integer(totalCount)); + } + + return infoCont; + } + + /** + * Calculates log2 of argument given. 
+ * + * @param n + * double + * @return double + */ + private double log2(double n) { + return Math.log(n) / Math.log(2); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/AppendFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/AppendFunction.java new file mode 100755 index 0000000..3c03e92 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/AppendFunction.java @@ -0,0 +1,148 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: AppendFunction + *

+ * + *

+ * Description: This function takes an input sequence and another sequence as a
+ * parameter and appends these sequences if they are of the same SequenceData
+ * type
+ *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class AppendFunction implements Function { + public AppendFunction() { + } + + public Class[] getTypeSequenceData() { + Class[] typeData = { SequenceData.class }; + return typeData; + } + + /** + * Returns an OptionsHandle object for a particular function which has + * parameters to run function. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + + CharSequenceData temp = new CharSequenceData(); + temp.setCharData(new char[0]); + + myOps.addSequenceDataOption("Sequence", temp, "Sequence to append"); + + return myOps; + } + + /** + * Creates a new SequenceData from input SequenceData, cuts new SequenceData + * and adds itself to history of new SequenceData + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + SequenceData appendSeq = myOptions.getSequenceDataValue("Sequence"); + + int appendIndex = seqData.getData().length; + + // 1. Create output SequenceData from given SequenceData + SequenceData output = seqData.getNewSequenceData(); + + // 2. Cut new sequence and set output data to be new sequence + output.setData(append(output.getData(), appendSeq.getData())); + + // 3. let output SequenceData know about this function + output.addHistory(this + " [" + appendIndex + "] "); + output.addHistory(appendSeq.getHistory()); + + return output; + } + + /** + * This function cuts a given object array from firstIndex to lastIndex and + * returns created array of objects. An array is only cut when the indexes + * are bigger than 0 and less than the lenght of array. 
+ * + * @param firstIndex + * int + * @param lastIndex + * int + * @param data + * Object[] + * @return Object[] + */ + private Object[] append(Object[] data1, Object[] data2) + throws RuntimeException { + + if (!data1.getClass().equals(data2.getClass())) { + throw new RuntimeException( + "Sequences to append must be of same type"); + } + + System.out.println("Appending sequences ... "); + + Object[] temp = new Object[data1.length + data2.length]; + + int index = 0; + for (int i = 0; i < data1.length; i++, index++) + temp[index] = data1[i]; + + for (int i = 0; i < data2.length; i++, index++) + temp[index] = data2[i]; + + return temp; + } + + /* fixing toString() method */ + public String toString() { + return "Append"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/CombineMinFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/CombineMinFunction.java new file mode 100755 index 0000000..961d2b4 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/CombineMinFunction.java @@ -0,0 +1,148 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: CombineMinFunction
+ *

+ * + *

+ * Description: This function takes two DoubleSequenceData sequences and
+ * calculates the element-wise minimum of their values.
+ *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class CombineMinFunction implements Function { + public CombineMinFunction() { + } + + public Class[] getTypeSequenceData() { + Class[] typeData = { DoubleSequenceData.class }; + return typeData; + } + + /** + * Returns an OptionsHandle object for a particular function which has + * parameters to run function. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + + DoubleSequenceData temp = new DoubleSequenceData(); + temp.setDoubleData(new double[0]); + + myOps.addSequenceDataOption("Sequence", temp, "Combine two sequence"); + + return myOps; + } + + /** + * Creates a new DoubleSequenceData containing the difference between + * sequence given as parameter and sequence in OptionsHandle + * + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + SequenceData diffData = myOptions.getSequenceDataValue("Sequence"); + + if (!(seqData instanceof DoubleSequenceData && diffData instanceof DoubleSequenceData)) { + throw new RuntimeException( + "Incorrect type of sequence, this function operates on sequences of doubles"); + } + + // 1. Create output SequenceData from given SequenceData + DoubleSequenceData output = new DoubleSequenceData(seqData); + + // 2. Subtract sequence given as a parameter from input sequence and set + // it to output + output.setDoubleData(getMin( + ((DoubleSequenceData) seqData).getDoubleData(), + ((DoubleSequenceData) diffData).getDoubleData())); + + // 3. let output SequenceData know about this function + output.addHistory(this); + output.addHistory(diffData.getHistory()); + + return output; + } + + /** + * This function cuts a given object array from firstIndex to lastIndex and + * returns created array of objects. 
An array is only cut when the indexes + * are bigger than 0 and less than the lenght of array. + * + * @param firstIndex + * int + * @param lastIndex + * int + * @param data + * Object[] + * @return Object[] + */ + private double[] getMin(double[] data1, double[] data2) { + + System.out.println("Calculating get min of sequences ... "); + + // set length for output to the shortest sequence + double[] temp = new double[Math.min(data1.length, data2.length)]; + + for (int i = 0; i < temp.length; i++) + if (data1[i] > data2[i]) + temp[i] = data2[i]; + else + temp[i] = data1[i]; + + return temp; + } + + /* fixing toString() method */ + public String toString() { + return "CombineMin"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/DNAComplementFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/DNAComplementFunction.java new file mode 100755 index 0000000..96d6117 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/DNAComplementFunction.java @@ -0,0 +1,159 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: DNAComplementFunction + *

+ * + *

+ * Description: This function transforms a DNA sequence stored in a + * DNASequenceData object and creates a new DNASequenceData object containing + * the complementary DNA sequence, in which A is matched with T, G is matched + * with C and vice versa. + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class DNAComplementFunction implements Function { + public DNAComplementFunction() { + } + + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] typeData = { DNASequenceData.class }; + return typeData; + } + + /** + * Returns null as the DNAComplement function has no options + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * Creates a new SequenceData from input SequenceData, smooths new + * SequenceData and adds this function to its history. + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + if (!(seqData instanceof DNASequenceData)) { + throw new RuntimeException( + "Incorrect type of sequence, this function operates on DNA sequences"); + } + + // 1. Create output SequenceData from given SequenceData + SequenceData output = seqData.getNewSequenceData(); + + // 2. Smooth new sequence and set its data to be new smoothed data + if (output instanceof DNASequenceData) + ((DNASequenceData) output) + .setCharData(complement(((DNASequenceData) output) + .getCharData())); + + // 3. let output SequenceData know about this function + output.addHistory(this); + + return output; + + } + + /** + * This functon smooths array of doubles. It takes a window size as + * parameter, calculates the average of the information content of elements + * in array in window size and gives this value to the last element in the + * array. 
+ * + * @param data + * Object[] + * @return Object[] + */ + private char[] complement(char[] data) throws RuntimeException { + + char[] newData = new char[data.length]; + + for (int i = 0; i < data.length; i++) { + switch (data[i]) { + case 'a': + newData[i] = 't'; + break; + case 't': + newData[i] = 'a'; + break; + case 'g': + newData[i] = 'c'; + break; + case 'c': + newData[i] = 'g'; + break; + case 'A': + newData[i] = 'T'; + break; + case 'T': + newData[i] = 'A'; + break; + case 'G': + newData[i] = 'C'; + break; + case 'C': + newData[i] = 'G'; + break; + default: + newData[i] = data[i]; + break; + } + } + + return newData; + } + + /* fixing toString() method */ + public String toString() { + return "DNA Complement"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/DifferenceFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/DifferenceFunction.java new file mode 100755 index 0000000..d751da0 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/DifferenceFunction.java @@ -0,0 +1,144 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: Difference + *

+ * + *

+ * Description: This function takes two DoubleSequenceData sequences and + * calculates the difference between their values. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class DifferenceFunction implements Function { + public DifferenceFunction() { + } + + public Class[] getTypeSequenceData() { + Class[] typeData = { DoubleSequenceData.class }; + return typeData; + } + + /** + * Returns an OptionsHandle object for a particular function which has + * parameters to run function. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + + DoubleSequenceData temp = new DoubleSequenceData(); + temp.setDoubleData(new double[0]); + + myOps.addSequenceDataOption("Sequence", temp, + "Substract this sequence from input"); + + return myOps; + } + + /** + * Creates a new DoubleSequenceData containing the difference between + * sequence given as parameter and sequence in OptionsHandle + * + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + SequenceData diffData = myOptions.getSequenceDataValue("Sequence"); + + if (!(seqData instanceof DoubleSequenceData && diffData instanceof DoubleSequenceData)) { + throw new RuntimeException( + "Incorrect type of sequence, this function operates on sequences of doubles"); + } + + // 1. Create output SequenceData from given SequenceData + DoubleSequenceData output = new DoubleSequenceData(seqData); + + // 2. Subtract sequence given as a parameter from input sequence and set + // it to output + output.setDoubleData(difference( + ((DoubleSequenceData) seqData).getDoubleData(), + ((DoubleSequenceData) diffData).getDoubleData())); + + // 3. let output SequenceData know about this function + output.addHistory(this); + output.addHistory(diffData.getHistory()); + + return output; + } + + /** + * This function cuts a given object array from firstIndex to lastIndex and + * returns created array of objects. 
An array is only cut when the indexes + * are bigger than 0 and less than the lenght of array. + * + * @param firstIndex + * int + * @param lastIndex + * int + * @param data + * Object[] + * @return Object[] + */ + private double[] difference(double[] data1, double[] data2) { + + System.out.println("Calculating difference of sequences ... "); + + // set length for output to the shortest sequence + double[] temp = new double[Math.max(data1.length, data2.length)]; + + int index = 0; + for (int i = 0; i < temp.length; i++, index++) + temp[index] = data1[i] - data2[i]; + + return temp; + } + + /* fixing toString() method */ + public String toString() { + return "Difference"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/FilterFeatureFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/FilterFeatureFunction.java new file mode 100755 index 0000000..c1cb638 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/FilterFeatureFunction.java @@ -0,0 +1,140 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.JapsaFeature; + +import java.util.Iterator; + +/** + *

+ * Title: FilterFeatureFunction + *

+ * + * + * @author Minh Duc Cao + * @version 1.0 + */ +public class FilterFeatureFunction implements Function { + public FilterFeatureFunction() { + } + + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] typeData = { AnnotationSequenceData.class }; + return typeData; + } + + /** + * Returns an OptionsHandle object for the smoothing function which has an + * option to change window size + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + + myOps.addBooleanOption("Include", true, "Include or exclude"); + myOps.addStringOption("Features", "", + "List of features separated by comma ','"); + + return myOps; + } + + /** + * Creates a new SequenceData from input SequenceData, smooths new + * SequenceData and adds this function to its history. + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + if (!(seqData instanceof AnnotationSequenceData)) { + throw new RuntimeException( + "Incorrect type of sequence, this function operates on sequences of doubles"); + } + + AnnotationSequenceData annoData = (AnnotationSequenceData) seqData; + Iterator iter = annoData.iterator(); + + String[] list = myOptions.getStringValue("Features").split(","); + boolean include = myOptions.getBooleanValue("Include"); + + for (int x = 0; x < list.length; x++) { + list[x] = list[x].trim().toUpperCase(); + } + + // 1. 
Create output SequenceData from given SequenceData + AnnotationSequenceData output = new AnnotationSequenceData(seqData); + + if (include) {// Include + while (iter.hasNext()) { + JapsaFeature feature = iter.next(); + for (int x = 0; x < list.length; x++) { + if (list[x].equals(feature.getType().toUpperCase().trim())) { + output.addFeature(feature.cloneFeature()); + break; + } + } + + } + } else {// Exclude + while (iter.hasNext()) { + boolean added = true; + JapsaFeature feature = iter.next(); + for (int x = 0; x < list.length; x++) { + if (list[x].equals(feature.getType().toUpperCase().trim())) { + added = false; + break; + } + if (added) + output.addFeature(feature.cloneFeature()); + } + } + + } + + // 3. let output SequenceData know about this function + output.addHistory(myOptions); + return output; + + } + + /* fixing toString() method */ + public String toString() { + return "Feature Filter"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/Function.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/Function.java new file mode 100755 index 0000000..553db60 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/Function.java @@ -0,0 +1,84 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import java.io.IOException; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: Function + *

+ * + *

+ * Description: This interface defines the methods all functions within the + * DNAPlatform must implement + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public interface Function { + + /** + * Returns an OptionsHandle object for a particular function which contains + * function options. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle(); + + /** + * Returns an array of classes a model can be applied to + * + * @return Class[] + */ + public Class[] getTypeSequenceData(); + + /** + * This method is used to execute funcions, which map a SequenceData object + * to another SequenceData object + * + * @param data + * SequenceData + * @return SequenceData + * @throws IOException + * @throws RuntimeException + */ + public SequenceData execute(OptionsHandle options, SequenceData data) + throws IOException, RuntimeException; + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/NegateFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/NegateFunction.java new file mode 100755 index 0000000..bce37fa --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/NegateFunction.java @@ -0,0 +1,127 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: NegateFunction + *

+ * + *

+ * Description: This class implements a function that negates all numbers in a + * double sequence. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +public class NegateFunction implements Function { + public NegateFunction() { + } + + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] typeData = { DoubleSequenceData.class }; + return typeData; + } + + /** + * Returns an OptionsHandle object for the smoothing function which has an + * option to change window size + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * Creates a new DoubleSequenceData from input SequenceData, negates new + * SequenceData and adds this function to its history. + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws RuntimeException { + + if (!(seqData instanceof DoubleSequenceData)) { + throw new RuntimeException( + "Incorrect type of sequence, this function operates on numeric sequences"); + } + + // 1. Create output SequenceData from given SequenceData + SequenceData output = seqData.getNewSequenceData(); + + // 2. Smooth new sequence and set its data to be new smoothed data + if (output instanceof DoubleSequenceData) + ((DoubleSequenceData) output) + .setDoubleData(negate(((DoubleSequenceData) output) + .getDoubleData())); + + // 3. 
let output SequenceData know about this function + output.addHistory(this); + + return output; + + } + + /** + * This functon negates an array of doubles + * + * @param data + * Object[] + * @return Object[] + */ + private double[] negate(double[] data) throws RuntimeException { + + double[] newData = new double[data.length]; + + for (int i = 0; i < data.length; i++) { + newData[i] = data[i] * -1; + } + + return newData; + } + + /* fixing toString() method */ + public String toString() { + return "Negate"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFileFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFileFunction.java new file mode 100755 index 0000000..e5e834d --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFileFunction.java @@ -0,0 +1,99 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import java.io.IOException; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: ReadFileFunction + *

+ * + *

+ * Description: This function reads a sequence from a file. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +public class ReadFileFunction implements Function { + + String filename = ""; + + public ReadFileFunction() { + } + + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] typeData = { SequenceData.class }; + return typeData; + } + + /** + * ReadFromFile does not have options therefore return null + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * OptionsHandle must be null as this function doesn't have options. seqData + * is sequence to read from a file. + * + * @param myOptions + * OptionsHandle + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws IOException, RuntimeException { + if (filename != "" && filename != null) { + // read a file using data's method to read sequence from file + seqData.readDataFromFile(filename); + } + return seqData; + } + + public void setFile(String sequenceFile) { + filename = sequenceFile; + } + + public String toString() { + return "Read"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFormatFileFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFormatFileFunction.java new file mode 100755 index 0000000..0ad385a --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReadFormatFileFunction.java @@ -0,0 +1,190 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFileFormat; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; + +import java.util.ArrayList; +import java.util.Iterator; + +/** + *

+ * Title: ReadFileFunction + *

+ * + *

+ * Description: This function reads a sequence from a file. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +public class ReadFormatFileFunction implements Function { + + String filename = ""; + + public ReadFormatFileFunction() { + } + + public ReadFormatFileFunction(String f) { + filename = f; + } + + @SuppressWarnings("rawtypes") + public Class[] getTypeSequenceData() { + Class[] typeData = { SequenceData.class }; + return typeData; + } + + /** + * ReadFromFile does not have options therefore return null + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * OptionsHandle must be null as this function doesn't have options. seqData + * is sequence to read from a file. + * + * @param myOptions + * OptionsHandle + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) { + // never get called + // seqData = nummy + // if (filename != "" && filename != null) { + // return guessFormat(filename); + // } + return null; + } + + public void setFile(String sequenceFile) { + filename = sequenceFile; + } + + public String toString() { + return "Read"; + } + + public Iterator guessFormat() { + ArrayList seqList = new ArrayList(); + try { + SequenceReader reader = SequenceReader.getReader(filename); + if (reader instanceof JapsaFileFormat) { + JapsaFileFormat bff = (JapsaFileFormat) reader; + JapsaAnnotation annoRead; + while ((annoRead = bff.readAnnotation()) != null) { + AnnotationSequenceData anno = new AnnotationSequenceData( + annoRead); + if (anno.size() > 0) { + seqList.add(anno); + anno.addHistory(filename); + } + + Sequence seq = annoRead.getSequence(); + if (seq != null && seq.length() > 0 + && seq.alphabet() == Alphabet.DNA4()) { + DNASequenceData dna = new DNASequenceData(); + dna.setCharData(seq.charSequence()); + + seqList.add(dna); + dna.addHistory(filename); + } + } + } else { + throw new RuntimeException("Unknown format"); + } + + 
/************************************************** + * BufferedReader in = SequenceReader.openFile(filename); if (in == + * null) return null; + * + * in.mark(10); + * + * char[] buf = new char[10]; in.read(buf, 0, 10); in.reset(); + * String format = new String(buf); + * + * int mode = -1; if (format.startsWith(JapseFileFormat.HEADER)) { + * mode = 0; } else if (format.startsWith("LOCUS")) {// Genbank mode + * = 1;// Biojava System.out.println("Read as Genbank"); // seqsIt = + * SeqIOTools.readGenbank(in); // seqsIt = + * IOTools.readGenbankDNA(in,null); throw new RuntimeException( + * "Genbank format has not been supported"); } else if + * (format.startsWith(">")) { // Fasta mode = 1;// Biojava + * System.out.println("Read as Fasta"); // seqsIt = + * IOTools.readFastaDNA(in,null); throw new RuntimeException( + * "fasta format has not been supported"); } else if + * (format.startsWith("ID")) { // Fasta mode = 1;// Biojava + * System.out.println("Read as EMBL"); // seqsIt = + * IOTools.readEMBLDNA(in,null); throw new + * RuntimeException("EMBL format has not been supported"); } else + * throw new Exception("Unknown file format"); + * + * {// mode == 0 = my format BioCompFileFormat fileFormat = new + * BioCompFileFormat(in); try { Iterator iterSeq = + * fileFormat.getSequenceIterator(); while (iterSeq.hasNext()) { + * Sequence japsa.seq = iterSeq.next(); if (japsa.seq.length() > 0 + * && japsa.seq.alphabet() == Alphabet.DNA4()) { DNASequenceData dna + * = new DNASequenceData(); + * dna.setCharData(japsa.seq.charSequence()); + * + * seqList.add(dna); dna.addHistory(filename); } } + * + * Iterator iter = + * fileFormat.getAnnotationIterator(); while (iter.hasNext()) { + * AnnotationSequenceData anno = new AnnotationSequenceData( + * iter.next()); if (anno.size() > 0) { seqList.add(anno); + * anno.addHistory(filename); } } + * + * } catch (Exception e) { e.printStackTrace(); } } / + **************************************************/ + return seqList.iterator(); + } 
catch (Exception e) { + // System.err.println("Error reading '"+filename+"' "+e); + e.printStackTrace(); + } + return null; + + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/ReverseFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReverseFunction.java new file mode 100755 index 0000000..1b50166 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/ReverseFunction.java @@ -0,0 +1,196 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: ReverseFunction + *

+ * + *

+ * Description: This class is used to reverse SequenceData. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class ReverseFunction implements Function { + + public ReverseFunction() { + } + + public Class[] getTypeSequenceData() { + Class[] typeData = { SequenceData.class }; + return typeData; + } + + /** + * This method retusrns null as this function does not have any options. + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * Creates a new SequenceData from input SequenceData, reverses new + * SequenceData and adds this function to its history. + * + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) { + + boolean reversed = false; + + // 1. Create output SequenceData from given SequenceData + SequenceData output = seqData.getNewSequenceData(); + + // 2. Reverse new sequence and set output data to be reversed sequence + if (output instanceof CharSequenceData) { + ((CharSequenceData) output) + .setCharData(reverse(((CharSequenceData) output) + .getCharData())); + reversed = true; + } + + else if (output instanceof DoubleSequenceData) { + ((DoubleSequenceData) output) + .setDoubleData(reverse(((DoubleSequenceData) output) + .getDoubleData())); + reversed = true; + } + + // reverse SequenceData as a sequence of objects if it isn't a + // double or char sequence or if it wasn't reversed before + else if (!reversed) + output.setData(reverse(output.getData())); + + // 3. let output SequenceData know about this function + output.addHistory(this); + + return output; + + } + + /** + * This functon reverses an array of Objects + * + * @param data + * Object[] + * @return Object[] + */ + private Object[] reverse(Object[] data) { + Object temp; + int last = data.length - 1; + int first = 0; + + System.out.println("Reversing sequence ... 
"); + + while (first < last) { + temp = data[first]; + data[first] = data[last]; + data[last] = temp; + + first++; + last--; + } + + return data; + } + + /** + * This function reverses an array holding chars + * + * @param data + * Object[] + * @return Object[] + */ + private char[] reverse(char[] data) { + char temp; + int last = data.length - 1; + int first = 0; + + System.out.println("Reversing sequence ... "); + + while (first < last) { + temp = data[first]; + data[first] = data[last]; + data[last] = temp; + + first++; + last--; + } + + return data; + } + + /** + * This function reverses an array holding doubles + * + * @param data + * Object[] + * @return Object[] + */ + private double[] reverse(double[] data) { + double temp; + int last = data.length - 1; + int first = 0; + + System.out.println("Reversing sequence ... "); + + while (first < last) { + temp = data[first]; + data[first] = data[last]; + data[last] = temp; + + first++; + last--; + } + + return data; + } + + /* fixing toString() method */ + public String toString() { + return "Reverse"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFileFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFileFunction.java new file mode 100755 index 0000000..6c84cfa --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFileFunction.java @@ -0,0 +1,112 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import java.io.*; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +//import org.biojavax.bio.seq.RichSequence.IOTools; +/** + * + * @author hoangnguyen + */ +@SuppressWarnings("rawtypes") +public class SaveFileFunction implements Function { + // public static int NUM_OF_CHARS_IN_LINE=60; + private File file; + + /** + * Creates a new instance of saveFileFunction + */ + // non-argument constructor + public SaveFileFunction() { + } + + // constructor with argument + public SaveFileFunction(File aFile) { + file = aFile; + } + + // override the method form super class + public Class[] getTypeSequenceData() { + Class[] typeData = { SequenceData.class }; + return typeData; + } + + /** + * ReadFromFile does not have options therefore return null + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + return null; + } + + /** + * OptionsHandle must be null as this function doesn't have options. seqData + * is sequence to read from a file. 
precondition: SequenceData is a + * CharSequenceData + * + * @param myOptions + * OptionsHandle + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) { + seqData.writeDataToFile(file); + + return null; + } + + /** + * method returns the string representation of this function + * + * @return String + */ + + public String toString() { + return "Save"; + } + + /** + * method sets the file object + * + * @param fi + * a File + * + */ + public void setFile(File fi) { + file = fi; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFormatFileFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFormatFileFunction.java new file mode 100755 index 0000000..5aa74f7 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/SaveFormatFileFunction.java @@ -0,0 +1,133 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. 
* + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.Alphabet; +import japsa.seq.JapsaAnnotation; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; + +import java.io.*; + +/** + *

+ * Title: Save one or more sequecce to a file + *

+ * + *

+ * Description: This function saves sequences to a file. + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class SaveFormatFileFunction implements Function { + private File file; + Sequence charSeq = null; + JapsaAnnotation annoSeq = null; + + public SaveFormatFileFunction(File f) { + file = f; + } + + public void setCharSequence(CharSequenceData charData) { + if (charData instanceof DNASequenceData) { + charSeq = new Sequence(Alphabet.DNA4(), charData.getCharData(), + charData.getSequenceName()); + } + + } + + public void setAnnotationSequence(AnnotationSequenceData annoData) { + if (annoData != null) + this.annoSeq = annoData.getAnnotation(); + } + + public Class[] getTypeSequenceData() { + Class[] typeData = { CharSequenceData.class, + AnnotationSequenceData.class }; + return typeData; + } + + /** + * ReadFromFile does not have options therefore return null + * + * @return OptionsHandle + */ + public OptionsHandle getOptionsHandle() { + OptionsHandle myOps = new OptionsHandle(this, 2); + DoubleSequenceData temp = new DoubleSequenceData(); + temp.setDoubleData(new double[0]); + myOps.addSequenceDataOption("Annotation", temp, + "The Annotation to save with"); + return myOps; + + } + + /** + * OptionsHandle must be null as this function doesn't have options. seqData + * is sequence to read from a file. 
+ * + * @param myOptions + * OptionsHandle + * @param seqData + * SequenceData + * @return SequenceData + */ + public SequenceData execute(OptionsHandle myOptions, SequenceData seqData) + throws IOException { + if (seqData instanceof DNASequenceData) + charSeq = new Sequence(Alphabet.DNA4(), + ((DNASequenceData) seqData).getCharData(), + seqData.getSequenceName()); + + // Get from option + SequenceData annoData = myOptions.getSequenceDataValue("Annotation"); + if (annoData instanceof AnnotationSequenceData) { + setAnnotationSequence((AnnotationSequenceData) annoData); + } + + SequenceOutputStream out = new SequenceOutputStream( + new FileOutputStream(file)); + JapsaAnnotation.write(charSeq, annoSeq, out); + out.close(); + + return null; + } + + public String toString() { + return "Save japsa.seq with annotation"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/SelectFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/SelectFunction.java new file mode 100755 index 0000000..6dd33bb --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/SelectFunction.java @@ -0,0 +1,157 @@ +/***************************************************************************** + * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: CutFunction + *

+ * + *

+ * Description: This class is used to cut SequenceData given a starting and + * ending index in a SequenceData object + *

+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+@SuppressWarnings("rawtypes")
+public class SelectFunction implements Function {
+    public SelectFunction() {
+    }
+
+    /**
+     * Returns the sequence types this function accepts.
+     *
+     * @return array containing SequenceData.class
+     */
+    public Class[] getTypeSequenceData() {
+        Class[] typeData = { SequenceData.class };
+        return typeData;
+    }
+
+    /**
+     * Returns an OptionsHandle object for a particular function which has
+     * parameters to run function.
+     *
+     * @return OptionsHandle with the integer options "Start position" and
+     *         "Last position", both defaulting to 0
+     */
+    public OptionsHandle getOptionsHandle() {
+        OptionsHandle myOps = new OptionsHandle(this, 2);
+
+        myOps.addIntOption("Start position", 0,
+                "Start position to cut (positions range from 1 to sequence length).");
+        myOps.addIntOption("Last position", 0,
+                "Last position to cut (positions range from 1 to sequence length).");
+
+        return myOps;
+    }
+
+    /**
+     * Creates a new SequenceData from input SequenceData, cuts new SequenceData
+     * and adds itself to history of new SequenceData
+     *
+     * @param myOptions
+     *            options providing the 1-based "Start position" and
+     *            "Last position" values
+     * @param seqData
+     *            SequenceData
+     * @return SequenceData
+     * @throws RuntimeException
+     *             if the start position lies after the last position
+     */
+    public SequenceData execute(OptionsHandle myOptions, SequenceData seqData)
+            throws RuntimeException {
+
+        // Options are 1-based for the user; arrays are 0-based.
+        int firstIndex = myOptions.getIntValue("Start position") - 1;
+        int lastIndex = myOptions.getIntValue("Last position") - 1;
+
+        // 1. Create output SequenceData from given SequenceData
+        SequenceData output = seqData.getNewSequenceData();
+
+        // 2. Cut new sequence and set output data to be new sequence
+        output.setData(cut(firstIndex, lastIndex, output.getData()));
+
+        // 3. let output SequenceData know about this function
+        output.addHistory(myOptions);
+
+        return output;
+    }
+
+    /**
+     * This function cuts a given object array from firstIndex to lastIndex
+     * (both inclusive, 0-based) and returns the created array of objects.
+     * Indexes outside the array are clamped to the nearest valid index, i.e.
+     * to the range 0 .. data.length - 1. An empty input yields an empty
+     * result.
+     *
+     * @param firstIndex
+     *            int
+     * @param lastIndex
+     *            int
+     * @param data
+     *            Object[]
+     * @return Object[]
+     * @throws RuntimeException
+     *             if, after clamping, firstIndex is greater than lastIndex
+     */
+    private Object[] cut(int firstIndex, int lastIndex, Object[] data)
+            throws RuntimeException {
+
+        // Nothing to select from; without this guard the clamping below
+        // would produce the index -1.
+        if (data.length == 0) {
+            return new Object[0];
+        }
+
+        // Clamp to the last valid index. Clamping to data.length (as was done
+        // before) made data[lastIndex] run past the end of the array.
+        int maxIndex = data.length - 1;
+        firstIndex = Math.max(0, Math.min(firstIndex, maxIndex));
+        lastIndex = Math.max(0, Math.min(lastIndex, maxIndex));
+
+        if (firstIndex > lastIndex) {
+            throw new RuntimeException(
+                    "First position must be smaller than last position");
+        }
+
+        System.out.println("Selecting sequence ... [" + (firstIndex + 1) + "-"
+                + (lastIndex + 1) + "]");
+
+        /*
+         * The new array ranges from firstIndex to lastIndex, including
+         * lastIndex
+         */
+        Object[] temp = new Object[lastIndex - firstIndex + 1];
+        System.arraycopy(data, firstIndex, temp, 0, temp.length);
+
+        return temp;
+    }
+
+    /* fixing toString() method */
+    public String toString() {
+        return "Select";
+    }
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/Smooth.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/Smooth.java
new file mode 100755
index 0000000..42b59ec
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/Smooth.java
@@ -0,0 +1,131 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+package japsa.bio.misc.dnaPlatform.function;
+
+import java.io.*;
+import java.util.Vector;
+
+/**
+ * Sliding-window mean smoothing of a numeric series, usable as a library call
+ * ({@link #smooth(double[], int)}) or as a file-to-file command line tool.
+ */
+public class Smooth {
+    String inFile, outFile;
+    int wSize;
+
+    /**
+     * @param wSize
+     *            window size
+     * @param inFile
+     *            path of the file to read values from
+     * @param outFile
+     *            path of the file to write smoothed values to
+     */
+    public Smooth(int wSize, String inFile, String outFile) {
+        this.inFile = inFile;
+        this.outFile = outFile;
+        this.wSize = wSize;
+    }
+
+    /**
+     * Smooth the data with a sliding-window mean. Once a full window has been
+     * read, its mean is written at index i - wSize / 2. Positions at the start
+     * and at the end of the series are filled from the partially filled
+     * (growing, then shrinking) window.
+     *
+     * @param in
+     *            values to smooth; not modified
+     * @param wSize
+     *            window size, at least 1
+     * @return a new array with the same length as in
+     * @throws IllegalArgumentException
+     *             if wSize is less than 1
+     */
+
+    public static double[] smooth(double[] in, int wSize) {
+        // A window of 0 would divide by zero in "i % wSize" and a negative
+        // one cannot be allocated; reject both with a clear message.
+        if (wSize < 1) {
+            throw new IllegalArgumentException(
+                    "Window size must be at least 1: " + wSize);
+        }
+
+        double[] out = new double[in.length];
+        // Circular buffer of the last wSize values (zero-initialised by Java).
+        double[] his = new double[wSize];
+        int index = 0;
+        double sum = 0.0;
+
+        for (int i = 0; i < in.length; i++) {
+            index = i % wSize;
+            // Replace the oldest value in the window by the new one.
+            sum = sum - his[index] + in[i];
+            his[index] = in[i];
+
+            if (i < wSize)
+                out[i / 2] = sum / (i + 1);
+            else
+                out[i - wSize / 2] = sum / wSize;
+        }
+
+        // Drain the window to fill the trailing positions.
+        for (int i = wSize - 1; i > 0; i--) {
+            index = (index + 1) % wSize;
+            sum -= his[index];
+            int pos = out.length - i / 2 - 1;
+            // When the input is much shorter than the window the target
+            // position falls before the start of the array; skip it instead
+            // of failing with an ArrayIndexOutOfBoundsException.
+            if (pos >= 0)
+                out[pos] = sum / (i);
+        }
+        return out;
+    }
+
+    /**
+     * Reads inFile, smooths and writes the result to outFile. The last
+     * space/tab separated field of every input line is parsed as a double;
+     * every output line is the 0-based position, a tab and the smoothed
+     * value. Any exception is caught and its stack trace printed.
+     */
+    public void smooth() {
+        try {
+            // This approach is rather not memory efficient
+            BufferedReader in = new BufferedReader(new FileReader(inFile));
+            // The element type is required: a raw Vector returns Object and
+            // "data[i] = v.get(i)" below would not compile.
+            Vector<Double> v = new Vector<Double>();
+            String line;
+            while ((line = in.readLine()) != null) {
+                String arr[] = line.split(" |\t");
+                double value = Double.parseDouble(arr[arr.length - 1]);
+                v.add(value);
+            }
+            in.close();
+            // Make an array of double
+            double[] data = new double[v.size()];
+            for (int i = 0; i < v.size(); i++)
+                data[i] = v.get(i);
+
+            double[] outData = smooth(data, wSize);
+
+            PrintStream out = new PrintStream(new FileOutputStream(outFile));
+            for (int i = 0; i < v.size(); i++) {
+                out.println(i + "\t" + outData[i]);
+            }
+
+            out.close();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Command line entry point.
+     *
+     * @param args
+     *            window size, input file, output file
+     */
+    public static void main(String[] args) {
+        if (args.length < 3) {
+            System.err.println("Smooth smooth inFile outFile");
+            System.exit(1);
+        }
+        int s = Integer.parseInt(args[0]);
+        Smooth sm = new Smooth(s, args[1], args[2]);
+        sm.smooth();
+    }
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction.java
new file mode 100755
index 0000000..b9190ee
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction.java
@@ -0,0 +1,152 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the name of Monash University nor the names of its contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: SmoothingFunction + *

+ * + *

+ * Description: This function takes a DoubleSequenceData object and a window + * size and slides a window over data calculating average and setting this + * average to element in the middle of the window + *

+ *
+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+@SuppressWarnings("rawtypes")
+public class SmoothingFunction implements Function {
+    public SmoothingFunction() {
+    }
+
+    /**
+     * Lists the sequence types this function accepts.
+     *
+     * @return array containing DoubleSequenceData.class
+     */
+    public Class[] getTypeSequenceData() {
+        return new Class[] { DoubleSequenceData.class };
+    }
+
+    /**
+     * Builds the options of the smoothing function: a single integer option
+     * "Window size" with default 1.
+     *
+     * @return OptionsHandle
+     */
+    public OptionsHandle getOptionsHandle() {
+        OptionsHandle handle = new OptionsHandle(this, 2);
+        handle.addIntOption("Window size", 1, "Sliding window size");
+        return handle;
+    }
+
+    /**
+     * Derives a new SequenceData from the input, replaces its double data by
+     * the smoothed values, prints the length of the resulting data and records
+     * the options in the history of the new SequenceData.
+     *
+     * @param myOptions
+     *            options providing "Window size"
+     * @param seqData
+     *            SequenceData, must be a DoubleSequenceData
+     * @return SequenceData
+     * @throws RuntimeException
+     *             if seqData is not a DoubleSequenceData
+     */
+    public SequenceData execute(OptionsHandle myOptions, SequenceData seqData)
+            throws RuntimeException {
+
+        boolean isDoubleSequence = seqData instanceof DoubleSequenceData;
+        if (!isDoubleSequence) {
+            throw new RuntimeException(
+                    "Incorrect type of sequence, this function operates on sequences of doubles");
+        }
+
+        final int window = myOptions.getIntValue("Window size");
+
+        // Work on a fresh SequenceData obtained from the input.
+        final SequenceData result = seqData.getNewSequenceData();
+
+        // Only a double sequence can take the smoothed values.
+        if (result instanceof DoubleSequenceData) {
+            DoubleSequenceData doubles = (DoubleSequenceData) result;
+            doubles.setDoubleData(smooth(window, doubles.getDoubleData()));
+        }
+
+        System.out.println(result.getData().length);
+
+        // Let the result know which function produced it.
+        result.addHistory(myOptions);
+
+        return result;
+
+    }
+
+    /**
+     * Smooths an array of doubles by delegating to
+     * Smooth.smooth(double[], int) of this package.
+     *
+     * @param winSize
+     *            sliding window size
+     * @param data
+     *            values to smooth
+     * @return smoothed values
+     */
+    private double[] smooth(int winSize, double[] data) throws RuntimeException {
+        return Smooth.smooth(data, winSize);
+    }
+
+    /* fixing toString() method */
+    public String toString() {
+        return "Smoothing";
+    }
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction2.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction2.java
new file mode 100755
index 0000000..b5b244c
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/SmoothingFunction2.java
@@ -0,0 +1,147 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. 
Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +/** + *

+ * Title: SmoothingFunction2 + *

+ * + *

+ * Description: This function takes a DoubleSequenceData object and a window + * size and slides a window over data calculating average and setting this + * average to the last element of the window. The first elements, before the + * first full window is read, are smoothed by setting their values to the + * average of the elements seen so far + *

+ *
+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+@SuppressWarnings("rawtypes")
+public class SmoothingFunction2 implements Function {
+    public SmoothingFunction2() {
+    }
+
+    /**
+     * Returns the sequence types this function accepts.
+     *
+     * @return array containing DoubleSequenceData.class
+     */
+    public Class[] getTypeSequenceData() {
+        Class[] typeData = { DoubleSequenceData.class };
+        return typeData;
+    }
+
+    /**
+     * Returns an OptionsHandle object for the smoothing2 function which has
+     * option to set window size.
+     *
+     * @return OptionsHandle with the integer option "Window size" (default 1)
+     */
+    public OptionsHandle getOptionsHandle() {
+        OptionsHandle myOps = new OptionsHandle(this, 1);
+
+        myOps.addIntOption("Window size", 1, "Sliding window size");
+        return myOps;
+    }
+
+    /**
+     * Creates a new SequenceData from input SequenceData, smooths new
+     * SequenceData and adds this function to its history.
+     *
+     * @param myOptions
+     *            options providing "Window size"
+     * @param seqData
+     *            SequenceData
+     * @return SequenceData
+     * @throws RuntimeException
+     *             if the window size is not usable for the data
+     */
+    public SequenceData execute(OptionsHandle myOptions, SequenceData seqData)
+            throws RuntimeException {
+
+        int winSize = myOptions.getIntValue("Window size");
+
+        // 1. Create output SequenceData from given SequenceData
+        SequenceData output = seqData.getNewSequenceData();
+
+        // 2. Smooth new sequence and set its data to be new smoothed data
+        if (output instanceof DoubleSequenceData)
+            ((DoubleSequenceData) output).setDoubleData(smooth(winSize,
+                    ((DoubleSequenceData) output).getDoubleData()));
+
+        // 3. let output SequenceData know about this function
+        output.addHistory(myOptions);
+
+        return output;
+
+    }
+
+    /**
+     * This function smooths an array of doubles with a trailing window: each
+     * element becomes the average of itself and the winSize - 1 elements
+     * before it. The first winSize - 1 elements, for which no full window
+     * exists yet, become the average of all elements up to and including
+     * themselves.
+     *
+     * @param winSize
+     *            window size, between 1 and data.length
+     * @param data
+     *            values to smooth; not modified
+     * @return new array with the smoothed values, same length as data
+     * @throws RuntimeException
+     *             if winSize is not positive or is larger than a non-empty
+     *             data array
+     */
+    private double[] smooth(int winSize, double[] data) throws RuntimeException {
+
+        double[] newData = new double[data.length];
+        double tally = 0;
+
+        // The two conditions are alternatives: the previous "&&" could
+        // practically never be true, so bad sizes slipped through and failed
+        // later with a division by zero or an index error.
+        if (winSize <= 0 || (data.length > 0 && winSize > data.length)) {
+            throw new RuntimeException("Bad window size " + winSize);
+        }
+
+        // Nothing to smooth; data[0] below would not exist.
+        if (data.length == 0) {
+            return newData;
+        }
+        System.out.println("Smoothing double sequence data ... ");
+
+        int index = 0;
+
+        // first values: running average of everything seen so far
+        newData[0] = tally = data[0];
+        for (index = 1; index < winSize; index++) {
+            tally += data[index];
+            newData[index] = tally / (index + 1);
+        }
+
+        // calculating rest of smoothed values and put into newData:
+        // add the element entering the window, drop the one leaving it
+        while (index < data.length) {
+            tally = tally + data[index] - data[index - winSize];
+            newData[index] = tally / winSize;
+            index++;
+        }
+
+        return newData;
+    }
+
+    /* fixing toString() method */
+    public String toString() {
+        return "Smoothing II";
+    }
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/function/ThresholdFunction.java b/src/main/java/japsa/bio/misc/dnaPlatform/function/ThresholdFunction.java
new file mode 100755
index 0000000..704ea15
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/function/ThresholdFunction.java
@@ -0,0 +1,156 @@
+/*****************************************************************************
+ * Copyright (c) 2010 Minh Duc Cao, Monash University. All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. 
Neither the name of Monash University nor the names of its contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.function; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.JapsaFeature; + +/** + *

+ * Title: ThresholdFunction + *

+ * + *

+ * Description: This function takes a DoubleSequenceData object and a + * threshold and annotates each run of consecutive values lying on the chosen + * side of the threshold (at or above it, or below it) as a feature + *

+ *
+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+@SuppressWarnings("rawtypes")
+public class ThresholdFunction implements Function {
+    public ThresholdFunction() {
+    }
+
+    /**
+     * Returns the sequence types this function accepts.
+     *
+     * @return array containing DoubleSequenceData.class
+     */
+    public Class[] getTypeSequenceData() {
+        Class[] typeData = { DoubleSequenceData.class };
+        return typeData;
+    }
+
+    /**
+     * Returns an OptionsHandle object for the threshold function: a double
+     * option "Threshold" (default 0.0) and a boolean option "Up" (default
+     * true) selecting which side of the threshold is annotated.
+     *
+     * @return OptionsHandle
+     */
+    public OptionsHandle getOptionsHandle() {
+        OptionsHandle myOps = new OptionsHandle(this, 2);
+
+        // The description used to read "Sliding window size", copied from the
+        // smoothing function.
+        myOps.addDoubleOption("Threshold", 0.0, "Threshold value");
+        myOps.addBooleanOption("Up", true, "From threshold up?");
+
+        return myOps;
+    }
+
+    /**
+     * Creates an AnnotationSequenceData from the input sequence and adds one
+     * feature for every run of consecutive values on the selected side of the
+     * threshold: values that are not below the threshold when "Up" is true,
+     * values below the threshold otherwise. Each feature records the summed
+     * distance of its values from the threshold. The number of features is
+     * printed and the options are added to the history of the result.
+     *
+     * @param myOptions
+     *            options providing "Threshold" and "Up"
+     * @param seqData
+     *            SequenceData, must be a DoubleSequenceData
+     * @return SequenceData
+     * @throws RuntimeException
+     *             if seqData is not a DoubleSequenceData
+     */
+    public SequenceData execute(OptionsHandle myOptions, SequenceData seqData)
+            throws RuntimeException {
+
+        if (!(seqData instanceof DoubleSequenceData)) {
+            throw new RuntimeException(
+                    "Incorrect type of sequence, this function operates on sequences of doubles");
+        }
+
+        DoubleSequenceData doubleData = (DoubleSequenceData) seqData;
+        double data[] = doubleData.getDoubleData();
+
+        double thres = myOptions.getDoubleValue("Threshold");
+        boolean upSide = myOptions.getBooleanValue("Up");
+
+        // 1. Create output SequenceData from given SequenceData
+        AnnotationSequenceData output = new AnnotationSequenceData(seqData);
+
+        boolean in = false;
+        int start = 0;
+        double sum = 0.0;
+        // 2. Go through the numerical sequence
+        for (int i = 0; i < data.length; i++) {
+            if (data[i] < thres == upSide && in) {// Outside
+                in = false;
+                // NOTE(review): here the second argument is i + 1 - start,
+                // which counts the first element outside the run, while the
+                // end-of-data case below passes data.length - start. Left
+                // unchanged; confirm against the JapsaFeature constructor.
+                output.addFeature(createFeature(start, i + 1 - start, sum));
+
+            } else if (data[i] < thres != upSide) {
+                if (!in) {
+                    // Start to be in the range
+                    in = true;
+                    start = i;
+                    sum = 0.0;// data[i];
+                }
+                sum += upSide ? (data[i] - thres) : (thres - data[i]);
+
+            }
+        }
+
+        // A run that is still open at the end of the data
+        if (in) {
+            output.addFeature(createFeature(start, data.length - start, sum));
+        }
+
+        System.out.println(output.size());
+
+        // 3. let output SequenceData know about this function
+        output.addHistory(myOptions);
+
+        return output;
+
+    }
+
+    /**
+     * Builds the "Threshold" feature describing one run.
+     *
+     * @param start
+     *            index at which the run starts; also used for the ID
+     * @param second
+     *            second argument passed to the JapsaFeature constructor
+     * @param sum
+     *            summed distance from the threshold over the run
+     * @return the new feature
+     */
+    private JapsaFeature createFeature(int start, int second, double sum) {
+        JapsaFeature f = new JapsaFeature(start, second);
+        f.setType("Threshold");
+        f.addDesc("Length: " + f.getLength());
+        f.addDesc("Significant:" + sum);
+        f.setID("S" + start);
+        return f;
+    }
+
+    /* fixing toString() method */
+    public String toString() {
+        return "Thresholding";
+    }
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/DataTreePanel.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/DataTreePanel.java
new file mode 100755
index 0000000..56a54c6
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/DataTreePanel.java
@@ -0,0 +1,289 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; +import java.awt.*; +import javax.swing.tree.*; +import java.util.*; + +/** + *

+ * Title: DataTreePanel + *

+ * + *

+ * Description: This is a Panel that displays a JTree given a group of objects + * inside a Vector. toString method is called for every object in the tree and + * if this method returns a string with new line characters each line is added + * as a new node in the tree. If the string has curly braces, all lines between + * the curly braces are added as child nodes + *

+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+@SuppressWarnings("rawtypes")
+public class DataTreePanel extends JPanel {
+    public static final long serialVersionUID = MainFrame.serialVersionUID;
+
+    private JTree tree;
+    JScrollPane treeView;
+
+    private DefaultMutableTreeNode rootNode;
+    private DefaultTreeModel treeModel;
+
+    public DataTreePanel() {
+
+        try {
+            jbInit();
+        } catch (Exception ex) {
+            ex.printStackTrace();
+        }
+    }
+
+    /** Lays out the panel: titled border and the tree inside a scroll pane. */
+    private void jbInit() throws Exception {
+        setLayout(new BorderLayout());
+        setBorder(BorderFactory.createTitledBorder("Data"));
+
+        createTree();
+        treeView = new JScrollPane(tree);
+        this.add(treeView, BorderLayout.CENTER);
+    }
+
+    /**
+     * Creates the tree with a hidden root node, single selection and no node
+     * icons.
+     */
+    private void createTree() {
+        rootNode = new DefaultMutableTreeNode("Data");
+        treeModel = new DefaultTreeModel(rootNode);
+
+        tree = new JTree(treeModel);
+        tree.getSelectionModel().setSelectionMode(
+                TreeSelectionModel.SINGLE_TREE_SELECTION);
+        tree.setRootVisible(false);
+        tree.setShowsRootHandles(true);
+        tree.putClientProperty("JTree.lineStyle", "Angled");
+
+        DefaultTreeCellRenderer renderer2 = new DefaultTreeCellRenderer();
+        renderer2.setOpenIcon(null);
+        renderer2.setClosedIcon(null);
+        renderer2.setLeafIcon(null);
+        tree.setCellRenderer(renderer2);
+
+    }
+
+    /**
+     * Returns the JTree displayed in this panel
+     *
+     * @return JTree
+     */
+    public JTree getTree() {
+        return tree;
+    }
+
+    /**
+     * Splits a description at new line characters and returns the lines that
+     * are not blank.
+     */
+    private Vector createVectorString(String description) {
+        Vector v = new Vector();
+        String[] children = description.split("[\\n]");
+
+        for (int i = 0; i < children.length; i++) {
+            if (!children[i].matches("^\\s*$"))
+                v.add(children[i]);
+        }
+        return v;
+    }
+
+    /**
+     * Given a vector of vectors and strings describing the history of a
+     * SequenceData object it adds this history to the rootNode of tree to be
+     * displayed. Nodes added directly under the root are scrolled into view.
+     *
+     * @param history
+     *            Vector
+     */
+    public void addDataHistory(Vector history) {
+        addHistoryEntries(history, rootNode, true);
+    }
+
+    /**
+     * Given a vector of vectors and strings describing the history of a
+     * SequenceData object it adds this history to a parent
+     * DefaultMutableTreeNode to be displayed.
+     *
+     * @param history
+     *            Vector
+     * @param parent
+     *            DefaultMutableTreeNode
+     */
+    public void addDataHistory(Vector history, DefaultMutableTreeNode parent) {
+        addHistoryEntries(history, parent, false);
+    }
+
+    /**
+     * Shared implementation of both addDataHistory methods. Every element of
+     * history that is a Vector is added below the most recently added node;
+     * every other element is added through its toString() text. Text of the
+     * form "before { lines } after" adds "before", the non-blank lines
+     * between the braces as children, and "after".
+     *
+     * @param history
+     *            Vector of Vectors and other objects
+     * @param parent
+     *            node receiving the new nodes
+     * @param shouldBeVisible
+     *            whether nodes added directly to parent are scrolled into view
+     */
+    private void addHistoryEntries(Vector history,
+            DefaultMutableTreeNode parent, boolean shouldBeVisible) {
+        DefaultMutableTreeNode current = parent;
+
+        for (Object o : history) {
+            if (o instanceof Vector) {
+                // if element is a vector then call function to add elements
+                // of vector as children to current node
+                addDataHistory((Vector) o, current);
+                continue;
+            }
+
+            String description = o.toString();
+            int open = description.indexOf('{');
+            int close = description.lastIndexOf('}');
+
+            // Only treat the text as a braced block when the last '}' really
+            // follows the first '{'; otherwise substring() below would throw.
+            if (open > -1 && close > open) {
+
+                // add anything before brackets into tree
+                // NOTE(review): "\\S+" must match the whole text, so a
+                // prefix containing any whitespace is skipped - confirm this
+                // is intended.
+                String desStart = description.substring(0, open);
+                if (desStart.matches("\\S+"))
+                    current = addObject(parent, desStart, shouldBeVisible);
+
+                // add children to a vector and call function to add
+                // children to tree
+                String des = description.substring(open + 1, close);
+                addDataHistory(createVectorString(des), current);
+
+                // add anything after children: the text following the last
+                // '}'. The previous bounds included the brace itself and
+                // dropped the last character of the description.
+                String end = description.substring(close + 1);
+                if (end.matches("\\S+"))
+                    current = addObject(parent, end, shouldBeVisible);
+            } else {
+                current = addObject(parent, description, shouldBeVisible);
+            }
+        }
+    }
+
+    /**
+     * Adds child object to given DefaultMutableTreeNode parent.
+     *
+     * @param parent
+     *            DefaultMutableTreeNode
+     * @param child
+     *            Object
+     * @param shouldBeVisible
+     *            boolean
+     * @return DefaultMutableTreeNode
+     */
+    public DefaultMutableTreeNode addObject(DefaultMutableTreeNode parent,
+            Object child, boolean shouldBeVisible) {
+        DefaultMutableTreeNode childNode = new DefaultMutableTreeNode(child);
+
+        treeModel.insertNodeInto(childNode, parent, parent.getChildCount());
+
+        // Make sure the user can see the lovely new node.
+        if (shouldBeVisible) {
+            tree.scrollPathToVisible(new TreePath(childNode.getPath()));
+        }
+
+        return childNode;
+    }
+
+    /**
+     * Returns selected sequence data name if a path has been selected: the
+     * text of the second component of the selected path, i.e. the node
+     * directly below the root.
+     *
+     * @return String, or null when nothing is selected
+     */
+    public String getFirstSelectedPath() {
+        if (!tree.isSelectionEmpty()) {
+            Object[] path = tree.getSelectionPath().getPath();
+            return path[1].toString();
+        } else
+            return null;
+    }
+
+    /**
+     * Removes the node directly below the root on the selected path.
+     *
+     * @return boolean true if a node was removed
+     */
+    public boolean removeSelectedPath() {
+        TreePath currentSelection = tree.getSelectionPath();
+        if (currentSelection != null) {
+            DefaultMutableTreeNode currentNode = (DefaultMutableTreeNode) (currentSelection
+                    .getPathComponent(1));
+            MutableTreeNode parent = (MutableTreeNode) (currentNode.getParent());
+            if (parent != null) {
+                treeModel.removeNodeFromParent(currentNode);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /*
+     * // Adding child node to selected path in tree. // Might need this later.
+     *
+     * public DefaultMutableTreeNode addObject(Object child) {
+     * DefaultMutableTreeNode parentNode = null; TreePath parentPath =
+     * tree.getSelectionPath();
+     *
+     * if (parentPath == null) { //There's no selection. Default to the root
+     * node. parentNode = rootNode; } else { parentNode =
+     * (DefaultMutableTreeNode) (parentPath.getLastPathComponent()); }
+     *
+     * return addObject(parentNode, child, true); }
+     */
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/DnaGraphApplet.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/DnaGraphApplet.java
new file mode 100755
index 0000000..cfab91e
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/DnaGraphApplet.java
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version. This program is distributed in the hope that it will be *
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General *
+ * Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License along with*
+ * this program; if not, write to the Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ******************************************************************************/
+
+//This class is written by Julie Bernal and subsequently modified and maintained
+//by Minh Duc Cao
+
+package japsa.bio.misc.dnaPlatform.gui;
+
+import java.awt.*;
+import javax.swing.*;
+
+/**
+ *

+ * Title: DnaGraphApplet + *

+ * + *

+ * Description: This is the applet version of the DnaGUI tool. It creates a + * MainPanel, adds it to the applet, installs the panel's mainMenu as the menu + * bar and disables the menuFile_exit item. The models, data and display panels + * of the tool are presumably provided by the embedded MainPanel. + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class DnaGraphApplet extends JApplet { + public static final long serialVersionUID = MainFrame.serialVersionUID; + + MainPanel mainPanel; + + public DnaGraphApplet() { + init(); + } + + public void init() { + mainPanel = new MainPanel(); + this.add(mainPanel); + this.setJMenuBar(mainPanel.mainMenu); + this.setSize(new Dimension(900, 500)); + + mainPanel.mainMenu.menuFile_exit.setEnabled(false); + } + + public static void main(String[] args) { + try { + UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); + } catch (Exception e) { + e.printStackTrace(); + } + + DnaGraphApplet frame = new DnaGraphApplet(); + // Center the window + Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize(); + Dimension frameSize = frame.getSize(); + if (frameSize.height > screenSize.height) { + frameSize.height = screenSize.height; + } + if (frameSize.width > screenSize.width) { + frameSize.width = screenSize.width; + } + frame.setVisible(true); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/GridRangeDialog.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/GridRangeDialog.java new file mode 100755 index 0000000..ac5ecef --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/GridRangeDialog.java @@ -0,0 +1,286 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. 
* + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.MouseEvent; +import java.awt.event.MouseAdapter; +import java.awt.event.KeyAdapter; +import java.awt.event.KeyEvent; + +/** + *

+ * Title: GridRangeDialog + *

+ * + *

+ * Description: This is a dialog used to enter minimum and maximum values for + * both x and y axis + *

+ * + * + * @author Julie Bernal + * @version 1.0 + */ +public class GridRangeDialog extends JDialog { + public static final long serialVersionUID = MainFrame.serialVersionUID; + + JPanel contentPane; + JPanel titlePane = new JPanel(); + JPanel coordPane = new JPanel(); + JPanel buttonPane = new JPanel(); + + JLabel titleLabel = new JLabel(" Plot Title: "); + JTextField titleField = new JTextField(); + + JLabel yMinLabel = new JLabel("Min y: "); + JTextField yMinField = new DoubleField(); + JLabel yMaxLabel = new JLabel(" Max y: "); + JTextField yMaxField = new DoubleField(); + + JLabel xMinLabel = new JLabel("Min x: "); + JTextField xMinField = new DoubleField(); + JLabel xMaxLabel = new JLabel(" Max x: "); + JTextField xMaxField = new DoubleField(); + + JButton okButton = new JButton("OK"); + JButton cancelButton = new JButton("Cancel"); + + // to hold new grid range entered by user + double yMin; + double yMax; + double xMin; + double xMax; + + // to hold title entered by user + String title; + + /** + * Constructor takes as parameters current range values for a plot + * + * @param xMinVal + * double + * @param xMaxVal + * double + * @param yMinVal + * double + * @param yMaxVal + * double + */ + public GridRangeDialog(String title, double xMinVal, double xMaxVal, + double yMinVal, double yMaxVal) { + + this.title = title; + xMin = xMinVal; + xMax = xMaxVal; + yMin = yMinVal; + yMax = yMaxVal; + + enableEvents(AWTEvent.WINDOW_EVENT_MASK); + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + pack(); + } + + /** + * Creating the GUI of GridRangeDialog + * + * @throws Exception + */ + private void jbInit() throws Exception { + this.setTitle("Grid Range"); + this.setResizable(false); + + /* creating coordPane */ + yMinField.setText(yMin + ""); + yMaxField.setText(yMax + ""); + xMinField.setText(xMin + ""); + xMaxField.setText(xMax + ""); + + coordPane.setLayout(new GridLayout(2, 4)); + coordPane.setBorder(BorderFactory + 
.createTitledBorder("Enter Grid Range")); + titleLabel.setHorizontalAlignment(SwingConstants.RIGHT); + coordPane.add(xMinLabel); + coordPane.add(xMinField); + coordPane.add(xMaxLabel); + coordPane.add(xMaxField); + coordPane.add(yMinLabel); + coordPane.add(yMinField); + coordPane.add(yMaxLabel); + coordPane.add(yMaxField); + + /* creating buttonPane */ + buttonPane.add(okButton); + buttonPane.add(cancelButton); + + cancelButton + .addMouseListener(new GridRangeDialog_cancelButton_mouseAdapter( + this)); + okButton.addMouseListener(new GridRangeDialog_okButton_mouseAdapter( + this)); + + /* creating titlePane */ + titleField.setText(title); + titlePane.setLayout(new BorderLayout()); + titlePane.add(titleLabel, BorderLayout.WEST); + titlePane.add(titleField, BorderLayout.CENTER); + + /* adding all panels to contentPane */ + contentPane = (JPanel) this.getContentPane(); + contentPane.setLayout(new BorderLayout(2, 10)); + + contentPane.add(titlePane, BorderLayout.NORTH); + contentPane.add(coordPane, BorderLayout.CENTER); + contentPane.add(buttonPane, BorderLayout.SOUTH); + + } + + /** + * Returns title of plot + * + * @return String + */ + public String getTitle() { + return title; + } + + /** + * Returns minimum y value entered + * + * @return double + */ + public double getYminValue() { + return yMin; + } + + /** + * Returns maximum y value entered + * + * @return double + */ + public double getYmaxValue() { + return yMax; + } + + /** + * Returns minimum x value entered + * + * @return double + */ + public double getXminValue() { + return xMin; + } + + /** + * Returns maximum x value entered + * + * @return double + */ + public double getXmaxValue() { + return xMax; + } + + /** + * Gets minumum and maximum values for x and y axis + * + * @param e + * MouseEvent + */ + public void okButton_mouseClicked(MouseEvent e) { + xMin = ((DoubleField) xMinField).getDouble(); + xMax = ((DoubleField) xMaxField).getDouble(); + yMin = ((DoubleField) yMinField).getDouble(); + yMax = 
((DoubleField) yMaxField).getDouble(); + + title = titleField.getText(); + } + + /** + * DoubleField is a JTextField for doubles + * */ + private class DoubleField extends JTextField { + // public static final long serialVersionUID = + // MainFrame.serialVersionUID; + + /** + * + */ + private static final long serialVersionUID = 1L; + + DoubleField() { + this.addKeyListener(new KeyAdapter() { + public void keyTyped(KeyEvent e) { + char c = e.getKeyChar(); + if (!(Character.isDigit(c) || c == KeyEvent.VK_BACK_SPACE + || c == KeyEvent.VK_DELETE + || (c == '.' && getText().indexOf('.') == -1) || (c == '-' && getText() + .length() == 0))) { + getToolkit().beep(); + e.consume(); + } + } + }); + } + + /** + * Returns double in text box, if the DoubleField is empty then 0 is + * returned + * + * @return double + */ + double getDouble() { + if (super.getText().length() < 0) + return 0; + + return Double.parseDouble(super.getText()); + } + } + +} + +class GridRangeDialog_cancelButton_mouseAdapter extends MouseAdapter { + private GridRangeDialog adaptee; + + GridRangeDialog_cancelButton_mouseAdapter(GridRangeDialog adaptee) { + this.adaptee = adaptee; + } + + public void mouseClicked(MouseEvent e) { + adaptee.dispose(); + } +} + +class GridRangeDialog_okButton_mouseAdapter extends MouseAdapter { + private GridRangeDialog adaptee; + + GridRangeDialog_okButton_mouseAdapter(GridRangeDialog adaptee) { + this.adaptee = adaptee; + } + + public void mouseClicked(MouseEvent e) { + adaptee.okButton_mouseClicked(e); + adaptee.dispose(); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPanel.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPanel.java new file mode 100755 index 0000000..759ab77 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPanel.java @@ -0,0 +1,354 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is 
free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; +import java.awt.*; +import java.awt.event.*; +import javax.swing.border.TitledBorder; + +import japsa.bio.misc.dnaPlatform.sequence.*; + +import javax.swing.border.Border; + +/** + *

+ * Title: InfoContentPanel + *

+ * + *

+ * Description: This panel contains an InfoContentPlot with a scrollbar and + * buttons to manipulate the plot. + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + *

+ * Company: Monash + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class InfoContentPanel extends JPanel { + public static final long serialVersionUID = MainFrame.serialVersionUID; + + InfoContentPlot plot; + JScrollBar plotScrollBar; + + Border border1 = BorderFactory + .createLineBorder(new Color(100, 150, 200), 4); + Border highlightBorder = new TitledBorder(border1, ""); + Border border = BorderFactory.createTitledBorder(""); + + PlotPopupMenu graphPopup = new PlotPopupMenu(this); + + // tobe implemented later + // PlotPopupMenu featurePopup= new PlotPopupMenu(this); + + // storing this reference to call method + MainPanel myContainer; + + public InfoContentPanel(String name, MainPanel container) { + setPlotTitle(name); + myContainer = container; + plot = new InfoContentPlot(myContainer);// Panel to draw the plots + plotScrollBar = new JScrollBar(Scrollbar.HORIZONTAL, -20, 200, -20, 200); + + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + public void setPlotTitle(String title) { + border = BorderFactory.createTitledBorder(title); + highlightBorder = new TitledBorder(border1, title); + } + + private void jbInit() throws Exception { + this.setLayout(new BorderLayout()); + plot.setBorder(border); + + plotScrollBar + .addAdjustmentListener(new InfoContentPanel_plotScrollBar_adjustmentAdapter( + this)); + + this.addMouseMotionListener(new InfoContentPanel_this_mouseMotionAdapter( + this)); + + this.addMouseListener(new InfoContentPanel_this_mouseAdapter(this)); + + add(plot, BorderLayout.CENTER); + add(plotScrollBar, BorderLayout.SOUTH); + } + + public void highlight(boolean b) { + if (b) + plot.setBorder(highlightBorder); + else + plot.setBorder(border); + } + + /** + * This function returns the graph being displayed in this panel. This can + * be used to print the graph from main. 
+ * + * @return JPanel + */ + public JPanel getInfoContentGraph() { + return plot; + } + + /** + * Sets sequence of characters to be displayed in the plot under x + * coordinates. + * + * @param sequence + * char[] + */ + public void setSequence(char[] sequence) { + plot.setSequence(sequence);// ,proteinConvertUtilities.convertToProtein(sequence)); + } + + /** + * This function adjusts the plotScrollBar according to the plot current + * minimum, maximum and limit x values + */ + private void adjustPlotScroll() { + int extent = (int) (plot.getMaxXval() - plot.getMinXval()); + + plotScrollBar.setValues((int) plot.getMinXval(), extent, + (int) plot.getMinLimitX(), (int) plot.getMaxLimitX()); + } + + /** + * Is given a DoubleSequenceData object to display as a graph. First it gets + * or sets the name for DoubleSequenceData object. Then it adds this + * object's name to the graphBox and the object to the plot. Finally, the + * scroll bar of the plot is adjusted. + * + * @param data + * DoubleSequenceData + * @return boolean - indicating whether data was drawn in plot + */ + public boolean paintInfoContent(DoubleSequenceData data) { + + /* add graph to popup menu */ + if (graphPopup.addGraphToMenu(data.toString())) { + /* displaying information content in plot */ + plot.addGraph(data); + plot.selectGraph(data); + adjustPlotScroll(); + + return true; + } + return false; + } + + public boolean addFeatures(AnnotationSequenceData data) { + + /* add graph to popup menu */ + + plot.addFeatures(data); + adjustPlotScroll(); + + return true; + } + + public boolean removeFeatures(AnnotationSequenceData data) { + + /* add graph to popup menu */ + + plot.removeFeatures(data); + adjustPlotScroll(); + + return true; + } + + /** + * Calls function zoomIn() of InfoContentPlot and adjusts the scrollbar for + * the information content plot. 
+ * + * @param e + * ActionEvent + */ + public void zoomIn_actionPerformed() { + plot.zoomIn(); + adjustPlotScroll(); + } + + /** + * Calls the function zoomOut() of InfoContentPlot and adjusts the scrollbar + * for the information content plot. + * + * @param e + * ActionEvent + */ + public void zoomOut_actionPerformed() { + plot.zoomOut(); + adjustPlotScroll(); + } + + /** + * Whenever the value of the scrollbar for the plot is changed it calls + * function in InfoContentPlot to move display window of plot. + * + * @param e + * AdjustmentEvent + */ + public void plotScrollBar_adjustmentValueChanged(AdjustmentEvent e) { + plot.moveXaxis(e.getValue()); + } + + /** + * Function to remove a graph from the graph box and plot + * + * @param graphName + * String + */ + public void removeGraph(String graphName) { + if (graphName.equals("")) + return; + + // remove graph from the plot and popup menu + plot.removeGraph(graphName); + graphPopup.removeGraphFromMenu(graphName); + + adjustPlotScroll(); + } + + /** + * When the gridRamgeButton is pressed then the GridRangeDialog is displayed + * for the user to change the grid range + * + */ + public void showGridRangeDialog() { + GridRangeDialog dlg = new GridRangeDialog(this.toString(), + plot.getMinXval(), plot.getMaxXval(), plot.getMinYval(), + plot.getMaxYval()); + Dimension dlgSize = dlg.getPreferredSize(); + Dimension frmSize = myContainer.getSize(); + java.awt.Point loc = getLocation(); + dlg.setLocation((frmSize.width - dlgSize.width) / 2 + loc.x, + (frmSize.height - dlgSize.height) / 2 + loc.y); + dlg.setModal(true); + dlg.setVisible(true); + + /* update title if it has been changed */ + if (!dlg.getTitle().equals("") + && !dlg.getTitle().equals(this.toString())) { + + setPlotTitle(dlg.getTitle()); + plot.setBorder(highlightBorder); + + // if name changes then we have to update menu in MainFrame + myContainer.updateInfoContentPlotMenu(); + } + + double yMin = dlg.getYminValue(); + double yMax = dlg.getYmaxValue(); + + 
double xMin = dlg.getXminValue(); + double xMax = dlg.getXmaxValue(); + + if (yMin < yMax) + plot.setYrange(yMin, yMax); + if (xMin < xMax) + plot.setXrange(xMin, xMax); + + adjustPlotScroll(); + + } + + /** + * When the user checks box to display coordinates then tell the plot to + * display mouse coordinates + * + * @param e + * ActionEvent + */ + public void showCoorBox_actionPerformed(ActionEvent e) { + // plot.displayMouseCoord(showCoorBox.isSelected()); + } + + public String toString() { + return ((TitledBorder) border).getTitle(); + } + + public void this_mouseMoved(MouseEvent e) { + plot.mouseMoved(e.getX() - plot.getX(), e.getY() - plot.getY()); + } + + public void this_mousePressed(MouseEvent e) { + myContainer.plotToolBar.selectInfoContentPanel(this); + // Container.selectInformationContentPanel(this);Select info + } + +} + +class InfoContentPanel_this_mouseAdapter extends MouseAdapter { + private InfoContentPanel adaptee; + + InfoContentPanel_this_mouseAdapter(InfoContentPanel adaptee) { + this.adaptee = adaptee; + } + + public void mousePressed(MouseEvent e) { + adaptee.this_mousePressed(e); + maybeShowPopup(e); + } + + public void maybeShowPopup(MouseEvent e) { + /* show popup menu if data in sequenceData */ + if (e.isPopupTrigger()) { + adaptee.graphPopup.show(e.getComponent(), e.getX(), e.getY()); + } + } +} + +class InfoContentPanel_this_mouseMotionAdapter extends MouseMotionAdapter { + private InfoContentPanel adaptee; + + InfoContentPanel_this_mouseMotionAdapter(InfoContentPanel adaptee) { + this.adaptee = adaptee; + } + + public void mouseMoved(MouseEvent e) { + adaptee.this_mouseMoved(e); + } +} + +class InfoContentPanel_plotScrollBar_adjustmentAdapter implements + AdjustmentListener { + private InfoContentPanel adaptee; + + InfoContentPanel_plotScrollBar_adjustmentAdapter(InfoContentPanel adaptee) { + this.adaptee = adaptee; + } + + public void adjustmentValueChanged(AdjustmentEvent e) { + adaptee.plotScrollBar_adjustmentValueChanged(e); + } + 
+} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPlot.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPlot.java new file mode 100755 index 0000000..fdb2100 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/InfoContentPlot.java @@ -0,0 +1,1336 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import japsa.bio.misc.dnaPlatform.sequence.*; +import japsa.seq.JapsaFeature; + +import javax.swing.*; + +import java.awt.*; +import java.awt.event.MouseAdapter; +import java.awt.event.MouseEvent; + +import java.util.*; +import java.text.DecimalFormat; + +/** + *

+ * Title: InfoContentPlot + *

+ * + *

+ * Description: This is a JPanel with fixed length used to plot long arrays of + * doubles, the index of arrays are x coordinates and the values stored in + * arrays are y coordinates. This class uses paint to calculate the section of + * the plot that is visible in screen and display it. + *

+ * + * + * @author Julie Bernal This class is modified by Hoang Anh Nguyen to add + * proteins and annotations + * + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public class InfoContentPlot extends JPanel { + public static final long serialVersionUID = MainFrame.serialVersionUID; + + Grid plot; + Yaxis yAxis; + Xaxis xAxis; + JPanel south = new JPanel(); + JPanel nothing = new JPanel(); + + Vector graphs = new Vector(); + Graph selectedGraph = null; + Vector featureList; + JPanel mainPanel; + + /* + * string that represents the protein + */ + + // char[] aminoAcidSequence=null; + + /* Just some colours for graphs */ + Color[] plotColors = { new Color(7, 61, 149), new Color(195, 98, 3), + new Color(9, 114, 13), new Color(11, 79, 16), + new Color(90, 8, 132), new Color(204, 31, 2) }; + + Color[] featureColors = { Color.RED, Color.BLUE, Color.YELLOW, Color.GREEN }; + + public InfoContentPlot(JPanel panel) { + try { + mainPanel = panel; + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + private void jbInit() throws Exception { + /* initializing InfoContent objects */ + plot = new Grid(0, 4, 0.5, 0, 100, 1, 0, 200); + yAxis = new Yaxis(0, 4, 0.5); + xAxis = new Xaxis(0, 10, 1); + + setXrange(0, 200); + + /* arranging all the panels */ + setLayout(new BorderLayout()); + south.setLayout(new BorderLayout()); + + nothing.setMaximumSize(new Dimension((int) yAxis.getMaximumSize() + .getWidth(), (int) xAxis.getMaximumSize().getHeight())); + nothing.setMinimumSize(new Dimension((int) yAxis.getMaximumSize() + .getWidth(), (int) xAxis.getMinimumSize().getHeight())); + nothing.setPreferredSize(new Dimension((int) yAxis.getPreferredSize() + .getWidth(), (int) xAxis.getPreferredSize().getHeight())); + + nothing.setBackground(yAxis.getBackground()); + + add(yAxis, BorderLayout.WEST); + add(plot, BorderLayout.CENTER); + add(south, BorderLayout.SOUTH); + + south.add(nothing, BorderLayout.WEST); + south.add(xAxis, BorderLayout.CENTER); + + featureList = 
new Vector(); + } + + /** + * Calculates the coordinate in pixels of a value given the width and the + * range of x values displayed. + * + * @param width + * int, the length where values are displayed + * @param rangeValues + * double, the number of values displayed + * @param value + * double, value to calculate coordinate + * @return int, the coordinate of value given as parameter + */ + public int calculateXcoord(int width, double minXvalue, double maxXvalue, + double value) { + int xCoor = (int) ((value - minXvalue) / (maxXvalue - minXvalue) * width); + if (xCoor < 0) + return 0; + + else if (xCoor > width) + return width; + + return xCoor; + } + + /** + * Calculates value of given x coordinate given the width and the range of x + * values displayed. + * + * @param width + * int, the length where values are displayed + * @param minXvalue + * double, minimum x value being displayed in plot + * @param maxXvalue + * double, maximum x value being displayed in plot + * @param coordinate + * int, coordinate we want to finc value for + * @return int, the value of coordinate given as a parameter + */ + public double calculateXvalue(int width, double minXvalue, + double maxXvalue, int coordinate) { + return (coordinate * (maxXvalue - minXvalue) / width) + minXvalue; + } + + /** + * Calculates the y coordinate in pixels of a value given the height and the + * number of y values displayed. 
+ * + * @param height + * int, the length where values are displayed + * @param minYvalue + * double, minimum y value being displayed in plot + * @param maxYvalue + * double, maximum y value being displayed in plot + * @param value + * double, value to calculate coordinate + * @return int, the coordinate of value given as parameter + */ + public int calculateYcoord(int height, double minYvalue, double maxYvalue, + double value) { + int yCoor = height + - (int) ((value - minYvalue) / (maxYvalue - minYvalue) * height); + if (yCoor < 0) + return 0; + else if (yCoor > height) + return height; + + return yCoor; + } + + /** + * Calculates value of given y coordinate given the height and the range of + * y values displayed. + * + * @param height + * int, the length where values are displayed + * @param minYvalue + * double, the minimum y value being displayed in plot + * @param maxYvalue + * double, the maximum y value being displayed in plot + * @param coordinate + * int, coordinate we want to finc value for + * @return int, the value of coordinate given as a parameter + */ + public double calculateYvalue(int height, double minYvalue, + double maxYvalue, int coordinate) { + return ((height - coordinate) * (maxYvalue - minYvalue) / height) + + minYvalue; + } + + /** + * This function takes argument indicating whether or not to display mouse + * coordinates in plot + * + * @param bool + * boolean + */ + public void displayMouseCoord(boolean bool) { + plot.showMouseCoordinates(bool); + repaint(); + } + + /** + * Returns minimum x value that is displayed in the InfoContentPlot + * + * @return double + */ + public double getMinXval() { + return plot.getMinX(); + } + + /** + * Returns maximum x value that is displayed in the InfoContentPlot + * + * @return double + */ + public double getMaxXval() { + return plot.getMaxX(); + } + + /** + * Returns the minimum y value that is displayed in the InfoContentPlot + * + * @return double + */ + public double getMinYval() { + return 
plot.getMinY(); + } + + /** + * Returns the maximum y value that is diplayed in the InfoContentPlot + * + * @return double + */ + public double getMaxYval() { + return plot.getMaxY(); + } + + /** + * Changes the range of y values to be displated in the InfoContentPlot. The + * range of values for the Grid and the y axis have to be changed. + * + * @param yMin + * double + * @param yMax + * double + */ + public void setYrange(double yMin, double yMax) { + /* return if invalid y range of values */ + if (yMin > yMax) + return; + + plot.setYrange(yMin, yMax); + yAxis.setRange(yMin, yMax); + + /* fix the y scale */ + double divisions = (yMax - yMin) / 15; + + int i = 1; + double scale = 0.1; + while (scale < divisions) { + if (i == 2) { + i = 0; + scale = (scale / 2) + (scale * 2); + } else { + i++; + scale *= 2; + } + } + + plot.setYscale((double) scale); + yAxis.setScale((double) scale); + + repaint(); + } + + /** + * Changes the range of x values to be displayed in the InfoContentPlot. The + * range of values for the Grid and the x axis have to be changed. + * + * @param xMin + * double + * @param xMax + * double + */ + public void setXrange(double xMin, double xMax) { + + /* Return if new range is invalid */ + if (xMin > xMax || xMin < plot.getMinLimitX()) // || xMax > + // plot.getMaxLimitX()) + return; + + plot.setXrange(xMin, xMax); + xAxis.setRange(xMin, xMax); + + /* fix the y scale */ + double divisions = (xMax - xMin) / 15; + + int i = 1; + double scale = 1; + while (scale < divisions) { + if (i == 2) { + i = 0; + scale = (scale / 2) + (scale * 2); + } else { + i++; + scale *= 2; + } + } + + plot.setXscale(scale); + xAxis.setScale(scale); + + repaint(); + } + + /** + * This function zooms in the plot. This is achieved by decreasing the range + * of values displayed in the plot and in the x axis. 
+ */ + public void zoomIn() { + double decrease = (plot.getMaxX() - plot.getMinX()) / 2; + if (decrease > 1) + setXrange(plot.getMinX(), plot.getMaxX() - decrease); + } + + /** + * This function zooms out of the plot. This is achieved by increasing the + * range of values displayed in the plot and x axis. + */ + public void zoomOut() { + double increase = (plot.getMaxX() - plot.getMinX()) / 2; + setXrange(plot.getMinX(), plot.getMaxX() + increase); + } + + /** + * Returns minimum limit x value for the plot + * + * @return double + */ + public double getMinLimitX() { + return plot.getMinLimitX(); + } + + /** + * Returns maximum limit x value for the plot + * + * @return double + */ + public double getMaxLimitX() { + return plot.getMaxLimitX(); + } + + /** + * Moves the plot to the value indicated + * + * @param xValue + * double + */ + public void moveXaxis(double value) { + double xRange = plot.getMaxX() - plot.getMinX(); + setXrange(value, value + xRange); + } + + /** + * This function is used to set character sequence of the plot It also set + * the aminoAcid array + * + * public void setSequence (char[] japsa.seq, char[] aminoSeq) { + * this.aminoAcidSequence = aminoSeq; int seqLen = + * xAxis.setSequence(japsa.seq); + * + * + * if(plot.getMaxLimitX() < seqLen) plot.setMaxLimitX(seqLen); + * + * repaint(); } + */ + + /** + * This function is used to set character sequence of the plot + */ + public void setSequence(char[] seq) { + int seqLen = xAxis.setSequence(seq); + + /* Change the x limit in the plot */ + if (plot.getMaxLimitX() < seqLen) + plot.setMaxLimitX(seqLen); + + repaint(); + } + + /** + * Calculates maximum x value in the plot to be displayed. 
This value is the + * maxixmum lenght of all graphs and char sequence of x axis + */ + private int calculateMaxXvalue() { + Iterator it = graphs.iterator(); + int maxLen = 0; + + while (it.hasNext()) { + Graph gr = (Graph) it.next(); + SequenceData d = gr.getData(); + if (d.getData().length > maxLen) + maxLen = d.getData().length; + } + + for (int lidx = 0; lidx < featureList.size(); lidx++) { + AnnotationSequenceData featureData = featureList.get(lidx); + if (featureData.size() > 0) { + int x = featureData.getFeature(featureData.size() - 1).getEnd(); + if (x > maxLen) + maxLen = x; + } + } + + if (xAxis.getSequenceLenght() > maxLen) + maxLen = xAxis.getSequenceLenght(); + + return maxLen; + + } + + /** + * creates a graph with given DoubleSequenceData object and returns the + * index of this graph in vector graphs. + * + * @param data + * SequenceData + * @return int + */ + public void addGraph(DoubleSequenceData data) { + Graph gr = new Graph(plotColors[graphs.size() % plotColors.length], + data); + graphs.add(gr); + + selectedGraph = gr; + + if (plot.getMaxLimitX() < data.getData().length) + plot.setMaxLimitX(data.getData().length); + + repaint(); + } + + /** + * Res the graph in graphs at given index and repaints the plot + * + * @param index + * int + * @return Graph + */ + + public boolean removeFeatures(AnnotationSequenceData seqData) { + if (featureList.remove(seqData)) { + + // calculate new maximum x value + plot.setMaxLimitX(calculateMaxXvalue()); + + repaint(); + return true; + } + return false; + } + + public boolean addFeatures(AnnotationSequenceData seqData) { + if (featureList.contains(seqData)) { + System.out.println("The features already added"); + return false; + } + if (featureList.size() >= featureColors.length) { + System.out.println("No more space"); + return false; + } + + featureList.add(seqData); + + // calculate new maximum x value + plot.setMaxLimitX(calculateMaxXvalue()); + repaint(); + + return true; + } + + /* TODO: Eliminate string 
comparisons with SequenceData names and types */ + + public void removeGraph(String name) { + Iterator it = graphs.iterator(); + while (it.hasNext()) { + Graph gr = (Graph) it.next(); + String grName = (gr.getData()).toString(); + if (name.equals(grName)) { + graphs.removeElement(gr); + if (selectedGraph == gr) + selectedGraph = null; + break; + } + } + + // calculate new maximum x value + plot.setMaxLimitX(calculateMaxXvalue()); + + repaint(); + } + + /** + * Sets the selected graph to be the graph corresponding to the + * DoubleSequenceData object with the given name. + * + * @param readID + * String + */ + public void selectGraph(DoubleSequenceData data) { + Iterator it = graphs.iterator(); + while (it.hasNext()) { + Graph gr = (Graph) it.next(); + if (data == gr.getData()) + selectedGraph = gr; + } + + repaint(); + } + + /** + * + *

+ * Title: Grid + *

+ * + *

+ * Description: This is the grid where graphs get drawn in the + * InfoContentPlot + *

+     *
+     *
+     * @author Julie Bernal
+     * @version 1.0
+     */
+    private class Grid extends JPanel {
+        private static final long serialVersionUID = 1L;
+
+        /* minimum and maximum values to be displayed */
+        double minY, maxY;
+        double minX, maxX;
+
+        /* distance (in data units) between two consecutive grid lines */
+        double scaleY, scaleX;
+
+        /* width and height of the plot, refreshed at every paint */
+        int width, height;
+
+        /* mouse coordinates, as stored by mouseMoved of the enclosing class */
+        int mouseX, mouseY;
+        boolean showMouseCoord = false;
+
+        /* limits for x values */
+        double minLimitX, maxLimitX;
+
+        /**
+         * Creates a grid. A range is only stored when its minimum is not
+         * greater than its maximum; otherwise the corresponding fields keep
+         * their default value of 0.
+         *
+         * @param minYvalue
+         *            minimum y value to display
+         * @param maxYvalue
+         *            maximum y value to display
+         * @param yScale
+         *            distance between horizontal grid lines
+         * @param minXvalue
+         *            minimum x value to display
+         * @param maxXvalue
+         *            maximum x value to display
+         * @param xScale
+         *            distance between vertical grid lines
+         * @param minXlimit
+         *            minimum value allowed in x axis
+         * @param maxXlimit
+         *            maximum value allowed in x axis
+         */
+        public Grid(double minYvalue, double maxYvalue, double yScale,
+                double minXvalue, double maxXvalue, double xScale,
+                double minXlimit, double maxXlimit) {
+            super();
+            if (minYvalue <= maxYvalue) {
+                minY = minYvalue;
+                maxY = maxYvalue;
+            }
+
+            if (minXvalue <= maxXvalue) {
+                minX = minXvalue;
+                maxX = maxXvalue;
+            }
+
+            scaleY = yScale;
+            scaleX = xScale;
+
+            minLimitX = minXlimit;
+            maxLimitX = maxXlimit;
+
+            setBackground(Color.white);
+            setBorder(BorderFactory.createLineBorder(Color.black));
+        }
+
+        /**
+         * Clears the panel, then paints the grid, every graph, the selected
+         * graph and (when enabled) the mouse coordinates.
+         *
+         * @param g
+         *            Graphics
+         */
+        public void paint(Graphics g) {
+            width = getWidth();
+            height = getHeight();
+
+            g.setColor(getBackground());
+            g.fillRect(0, 0, width, height);
+
+            paintGrid(g);
+
+            // painting graphs in the grid; the explicit cast keeps this
+            // compiling whether or not the collection is declared with a
+            // type argument
+            Iterator<?> i = graphs.iterator();
+            while (i.hasNext()) {
+                Graph gr = (Graph) i.next();
+                g.setColor(gr.getColor());
+                paintGraph(g, gr.getData(), false);
+            }
+
+            // painting selected graph
+            if (selectedGraph != null) {
+                g.setColor(selectedGraph.getColor());
+                paintGraph(g, selectedGraph.getData(), false);
+            }
+
+            if (showMouseCoord)
+                paintMouseCoordinates(g);
+        }
+
+        /**
+         * Returns the smallest multiple of step that is not below start.
+         * Must only be called with step > 0.
+         */
+        private double firstGridValue(double start, double step) {
+            return Math.ceil(start / step) * step;
+        }
+
+        /**
+         * Paints a grid
+         *
+         * @param g
+         *            Graphics
+         */
+        private void paintGrid(Graphics g) {
+            int x0 = calculateXcoord(width, minX, maxX, 0);
+            int y0 = calculateYcoord(height, minY, maxY, 0);
+
+            if (x0 < 0 || x0 > width)
+                x0 = 0;
+
+            if (y0 < 0 || y0 > height)
+                y0 = 0;
+
+            g.setColor(Color.lightGray);
+
+            /*
+             * horizontal grid lines; a non-positive scale would never
+             * advance the loop, so no lines are drawn in that case
+             */
+            if (scaleY > 0) {
+                for (double y = minY; y < maxY; y += scaleY) {
+                    int yCoor = calculateYcoord(height, minY, maxY, y);
+                    g.drawLine(0, yCoor, width, yCoor);
+                }
+            }
+
+            /*
+             * vertical grid lines at x values that are multiples of scaleX.
+             * The first line is computed directly: stepping by 1 until the
+             * remainder is 0 never terminates for fractional values.
+             */
+            if (scaleX > 0) {
+                for (double x = firstGridValue(minX, scaleX); x < maxX; x += scaleX) {
+                    int xCoor = calculateXcoord(width, minX, maxX, x);
+                    g.drawLine(xCoor, 0, xCoor, height);
+                }
+            }
+
+            /* draw axis */
+            g.setColor(Color.black);
+            if (x0 > 0 && x0 < width)
+                g.drawLine(x0, 0, x0, height);
+
+            if (y0 > 0 && y0 < height)
+                g.drawLine(0, y0, width, y0);
+        }
+
+        /**
+         * Paints mouse coordinates
+         *
+         * @param g
+         *            Graphics
+         */
+        private void paintMouseCoordinates(Graphics g) {
+            g.setColor(Color.black);
+
+            // move the label away from the right and top borders
+            int mX = mouseX, mY = mouseY;
+            if (mouseX + 50 > width)
+                mX = mouseX - 50;
+            if (mouseY - 50 < 0)
+                mY = mouseY + 50;
+
+            double x = calculateXvalue(width, minX, maxX, mouseX);
+            double y = calculateYvalue(height, minY, maxY, mouseY);
+
+            // x is shown without its fractional part
+            String xVal = String.valueOf(x);
+            int xDot = xVal.indexOf('.');
+            if (xDot > -1)
+                xVal = xVal.substring(0, xDot);
+
+            // y is cut to two decimals when it has more than three
+            String yVal = String.valueOf(y);
+            int yDot = yVal.indexOf('.');
+            if (yDot > -1 && yDot < yVal.length() - 4)
+                yVal = yVal.substring(0, yDot + 3);
+
+            g.drawString("(" + xVal + "," + yVal + ")", mX, mY - 20);
+        }
+
+        /**
+         * Sets whether or not mouse coordinates should be displayed in the plot
+         *
+         * @param bool
+         *            boolean
+         */
+        public void showMouseCoordinates(boolean bool) {
+            showMouseCoord = bool;
+        }
+
+        /**
+         * Draws one segment of a graph, either as a plain line or, when
+         * highlighted, as a thin filled polygon around that line.
+         */
+        private void drawSegment(Graphics g, int x1, int y1, int x2, int y2,
+                boolean highlight) {
+            if (highlight) {
+                int xPoints[] = { x1 - 1, x1 + 1, x2 + 1, x2 - 1 };
+                int yPoints[] = { y1 - 1, y1 + 1, y2 + 1, y2 - 1 };
+
+                g.fillPolygon(xPoints, yPoints, 4);
+            } else
+                g.drawLine(x1, y1, x2, y2);
+        }
+
+        /**
+         * Paints a DoubleSequenceData double array. Indexes in array are
+         * treated as x coordinates and values stored in array are treated as y
+         * coordinates. Only points visible in the panel are drawn. Graphs are
+         * drawn in the plot from point 1 while in array points start [0] so
+         * when x coordinates are calculated, they are calculated as x+1 for any
+         * x value.
+         *
+         * @param g
+         *            Graphics
+         * @param data
+         *            SequenceData
+         * @param highlight
+         *            true to draw thick (polygon) segments instead of lines
+         */
+        public void paintGraph(Graphics g, DoubleSequenceData data,
+                boolean highlight) {
+            double infoContent[] = data.getDoubleData();
+
+            // minYval and maxYval are used to draw only one line at each
+            // pixel even if there are many points to be plotted at that pixel:
+            // a vertical line is drawn from the lowest to the highest y value
+            double minYval = Double.POSITIVE_INFINITY;
+            double maxYval = Double.NEGATIVE_INFINITY;
+
+            int xCoor, lastXcoor = -1;
+
+            // a negative minX must not be used as an array index
+            int x = Math.max((int) minX, 0);
+
+            /* draw first line if minX > 0 */
+            if (x > 0 && x < infoContent.length) {
+                int xC1 = calculateXcoord(width, minX, maxX, x + 1);
+                int xC2 = calculateXcoord(width, minX, maxX, x);
+                int yC1 = calculateYcoord(height, minY, maxY, infoContent[x]);
+                int yC2 = calculateYcoord(height, minY, maxY,
+                        infoContent[x - 1]);
+
+                drawSegment(g, xC1, yC1, xC2, yC2, highlight);
+            }
+
+            lastXcoor = calculateXcoord(width, minX, maxX, x + 1);
+            x++;
+            /*
+             * draw graph lines while x is still visible in the screen and a
+             * point in infoContent[]
+             */
+            while (x < maxX && x < infoContent.length) {
+                xCoor = calculateXcoord(width, minX, maxX, x + 1);
+
+                // Only draw one line at each pixel
+                if (xCoor == lastXcoor) {
+                    if (infoContent[x] < minYval)
+                        minYval = infoContent[x];
+                    if (infoContent[x] > maxYval)
+                        maxYval = infoContent[x];
+                } else {
+                    // If minYval and maxYval have been set draw a line at
+                    // lastXcoor
+                    if (minYval < maxYval) {
+                        g.drawLine(lastXcoor,
+                                calculateYcoord(height, minY, maxY, minYval),
+                                lastXcoor,
+                                calculateYcoord(height, minY, maxY, maxYval));
+
+                        minYval = Double.POSITIVE_INFINITY;
+                        maxYval = Double.NEGATIVE_INFINITY;
+                    }
+
+                    // drawing lines
+                    int lastYcoor = calculateYcoord(height, minY, maxY,
+                            infoContent[x - 1]);
+                    int yCoor = calculateYcoord(height, minY, maxY,
+                            infoContent[x]);
+                    drawSegment(g, lastXcoor, lastYcoor, xCoor, yCoor,
+                            highlight);
+                }
+
+                lastXcoor = xCoor;
+                x++;
+            }
+        }
+
+        /**
+         * returns minimum y value displayed in the plot
+         *
+         * @return double
+         */
+        public double getMinY() {
+            return minY;
+        }
+
+        /**
+         * returns maximum y value displayed in the plot
+         *
+         * @return double
+         */
+        public double getMaxY() {
+            return maxY;
+        }
+
+        /**
+         * Returns minimum x value displayed in the plot
+         *
+         * @return double
+         */
+        public double getMinX() {
+            return minX;
+        }
+
+        /**
+         * Returns maximum x value displayed in the plot
+         *
+         * @return double
+         */
+        public double getMaxX() {
+            return maxX;
+        }
+
+        /**
+         * Changes the range of y values to display in the plot. The request
+         * is ignored when yMin is greater than yMax.
+         *
+         * @param yMin
+         *            double, minimum y value to display
+         * @param yMax
+         *            double, maximum y value to display
+         */
+        public void setYrange(double yMin, double yMax) {
+            // validate the NEW range, not the one currently stored
+            if (yMin <= yMax) {
+                minY = yMin;
+                maxY = yMax;
+            }
+        }
+
+        /**
+         * Changes the range of x values to display in the plot. The request
+         * is ignored when xMin is greater than xMax.
+         *
+         * @param xMin
+         *            double, minimum x value to display
+         * @param xMax
+         *            double, maximum x value to display
+         */
+        public void setXrange(double xMin, double xMax) {
+            // validate the NEW range, not the one currently stored
+            if (xMin <= xMax) {
+                minX = xMin;
+                maxX = xMax;
+            }
+        }
+
+        /**
+         * Returns the x scale of the plot. The x scale is the increase value to
+         * draw lines in the grid and the x axis
+         *
+         * @return double
+         */
+        @SuppressWarnings("unused")
+        public double getXscale() {
+            return scaleX;
+        }
+
+        /**
+         * Returns the y scale of the plot. The y scale is the increase value to
+         * draw lines in the grid and the y axis
+         *
+         * @return double
+         */
+        @SuppressWarnings("unused")
+        public double getYscale() {
+            return scaleY;
+        }
+
+        /**
+         * Sets the x scale of the plot. The x scale is the increase value to
+         * draw lines in the grid and the x axis
+         *
+         * @param scale
+         *            double
+         */
+        public void setXscale(double scale) {
+            scaleX = scale;
+        }
+
+        /**
+         * Sets the y scale of the plot. The y scale is the increase value to
+         * draw lines in the grid and the y axis
+         *
+         * @param scale
+         *            double
+         */
+        public void setYscale(double scale) {
+            scaleY = scale;
+        }
+
+        /**
+         * Returns the minimum allowed value in x axis
+         *
+         * @return double
+         */
+        public double getMinLimitX() {
+            return minLimitX;
+        }
+
+        /**
+         * Returns the maximum value allowed in x axis
+         *
+         * @return double
+         */
+        public double getMaxLimitX() {
+            return maxLimitX;
+        }
+
+        /**
+         * Sets the maximum value allowed in x axis
+         *
+         * @param maxLimit
+         *            double
+         */
+        public void setMaxLimitX(double maxLimit) {
+            maxLimitX = maxLimit;
+        }
+
+    }
+
+    /**
+     * Stores the mouse position in the plot and repaints, but only while the
+     * plot is set to show mouse coordinates.
+     *
+     * @param x
+     *            int, x position of the mouse
+     * @param y
+     *            int, y position of the mouse
+     */
+    public void mouseMoved(int x, int y) {
+        if (plot.showMouseCoord) {
+            plot.mouseX = x - (plot.getX()) + 2;
+            plot.mouseY = y - (plot.getY()) - 3;
+
+            repaint();
+        }
+    }
+
+    /**
+     *
+     *

+ * Title: Yaxis + *

+ * + *

+ * Description: A Yaxis is a JPanel of width 30 that displays Y values of an + * InfoContentPlot + *

+     *
+     * @author Julie Bernal
+     * @version 1.0
+     */
+    private class Yaxis extends JPanel {
+        private static final long serialVersionUID = 1L;
+
+        /* range of y values displayed and distance between two labels */
+        double minY, maxY, scaleY;
+
+        /* size of the panel, refreshed at every paint */
+        int width, height;
+
+        /**
+         * Creates a y axis. The range is only stored when minYvalue is not
+         * greater than maxYvalue. The preferred height reserves 10 pixels
+         * for every label.
+         *
+         * @param minYvalue
+         *            double, minimum y value to display
+         * @param maxYvalue
+         *            double, maximum y value to display
+         * @param yScale
+         *            double, distance between two labels
+         */
+        public Yaxis(double minYvalue, double maxYvalue, double yScale) {
+            super();
+            if (minYvalue <= maxYvalue) {
+                minY = minYvalue;
+                maxY = maxYvalue;
+            }
+            scaleY = yScale;
+            setBackground(Color.lightGray);
+
+            // a non-positive scale yields no labels; dividing by it would
+            // give a meaningless (possibly overflowing) height
+            int labels = yScale > 0 ? (int) ((maxY - minY) / yScale) : 0;
+            int preferredHeight = labels * 10;
+
+            setMaximumSize(new Dimension(30, 32767));
+            setMinimumSize(new Dimension(30, preferredHeight));
+            setPreferredSize(new Dimension(30, preferredHeight));
+        }
+
+        /**
+         * Writes y values in y axis. Values written depend in the minimum and
+         * maximum y value to display and the y scale
+         *
+         * @param g
+         *            Graphics
+         */
+        public void paint(Graphics g) {
+            height = getHeight();
+            width = getWidth();
+
+            g.setColor(getBackground());
+            g.fillRect(0, 0, width, height);
+
+            g.setColor(Color.black);
+
+            /* painting y values */
+            Font oldFont = g.getFont();
+            g.setFont(new Font("Dialog", Font.PLAIN, 9));
+
+            // scientific notation for big numbers, one optional decimal for
+            // scales of at least 1, one fixed decimal otherwise
+            DecimalFormat df;
+            if (scaleY >= 1000)
+                df = new DecimalFormat("#0.#####E0");
+            else if (scaleY >= 1)
+                df = new DecimalFormat("#0.#");
+            else
+                df = new DecimalFormat("#0.0");
+
+            // a non-positive scale would never advance the loop
+            if (scaleY > 0) {
+                for (double y = minY; y < maxY; y += scaleY) {
+                    int i = calculateYcoord(height, minY, maxY, y);
+                    String label = df.format(y);
+                    // right-align the label in the panel
+                    int x = width - g.getFontMetrics().stringWidth(label);
+
+                    g.drawString(label, x, i);
+                }
+            }
+
+            g.setFont(oldFont);
+        }
+
+        /**
+         * Changes the range of values being displayed in y axis. The request
+         * is ignored when yMinValue is greater than yMaxValue.
+         *
+         * @param yMinValue
+         *            double, minimum y value to display
+         * @param yMaxValue
+         *            double, maximum y value to display
+         */
+        public void setRange(double yMinValue, double yMaxValue) {
+            if (yMinValue <= yMaxValue) {
+                minY = yMinValue;
+                maxY = yMaxValue;
+            }
+        }
+
+        /**
+         * Sets the scale for the y axis
+         *
+         * @param scale
+         *            double
+         */
+        public void setScale(double scale) {
+            scaleY = scale;
+        }
+
+    }
+
+    /**
+     *
+     *

+ * Title: Xaxis + *

+ * + *

+ * Description: An Xaxis is a JPanel of height 85 that displays the x values, + * the sequence and the annotation of an InfoContentPlot + *

+     *
+     * @author Julie Bernal
+     * @version 1.0 Modified by Minh Duc Cao
+     */
+    private class Xaxis extends JPanel {
+        private static final long serialVersionUID = 1L;
+
+        /*
+         * layout of the feature rows: y position of the first row, height of
+         * a feature rectangle and vertical distance between two rows
+         */
+        static final int START = 25, HEIGHT = 8, DISTANCE = 12;
+
+        /* range of x values displayed and distance between two labels */
+        double minX, maxX, scaleX;
+
+        /* size of the panel, refreshed at every paint */
+        int width, height;
+
+        /* optional characters drawn under the x values */
+        private char[] sequence;
+
+        /**
+         * Creates an x axis. The range is only stored when minXvalue is not
+         * greater than maxXvalue. A mouse listener is installed that shows
+         * the property of every visible feature under the mouse pointer.
+         *
+         * @param minXvalue
+         *            double, minimum x value to display
+         * @param maxXvalue
+         *            double, maximum x value to display
+         * @param xScale
+         *            double, distance between two labels
+         */
+        public Xaxis(double minXvalue, double maxXvalue, double xScale) {
+            super();
+            if (minXvalue <= maxXvalue) {
+                minX = minXvalue;
+                maxX = maxXvalue;
+            }
+            scaleX = xScale;
+            setBackground(Color.lightGray);
+            // the axis is 85 pixels high so that the x values, the sequence
+            // and the feature rows all fit
+            setMaximumSize(new Dimension(32767, 85));
+            setMinimumSize(new Dimension(10, 85));
+            setPreferredSize(new Dimension(10, 85));
+
+            // If a feature is clicked, display the feature property
+            addMouseListener(new MouseAdapter() {
+                public void mousePressed(MouseEvent e) {
+                    int index = -1;
+                    int x_mouse = e.getX();
+                    int y_mouse = e.getY();
+
+                    // find the feature row that contains the mouse pointer
+                    for (int lidx = 0; lidx < featureList.size(); lidx++) {
+                        if (START + DISTANCE * lidx <= y_mouse
+                                && START + DISTANCE * lidx >= y_mouse - HEIGHT) {
+                            index = lidx;
+                            break;
+                        }
+                    }
+
+                    if (index >= 0) {
+                        AnnotationSequenceData featureData = featureList
+                                .get(index);
+
+                        for (int i = 0; i < featureData.size(); i++) {
+                            JapsaFeature feature = featureData.getFeature(i);
+                            if (feature.getEnd() > minX
+                                    && feature.getStart() < maxX) {// overlap
+                                int begin_drawing = calculateXcoord(width,
+                                        minX, maxX, feature.getStart());
+                                // width of the rectangle
+                                int length_drawing = calculateXcoord(width,
+                                        minX, maxX, feature.getEnd())
+                                        - begin_drawing;
+                                if (begin_drawing <= x_mouse
+                                        && begin_drawing + length_drawing >= x_mouse) {
+                                    // pop up
+                                    JOptionPane.showMessageDialog(mainPanel,
+                                            feature.getProperty(),
+                                            "Feature info",
+                                            JOptionPane.PLAIN_MESSAGE);
+                                }
+                            }
+                        }
+                    }
+                }
+            });
+
+        }
+
+        /**
+         * Returns the smallest multiple of step that is not below start.
+         * Must only be called with step > 0.
+         */
+        private double firstTick(double start, double step) {
+            return Math.ceil(start / step) * step;
+        }
+
+        /**
+         * Writes the values in x axis, these values depend in the minimum and
+         * maximum x values to display and the scale of x axis. Below the
+         * values the sequence (when zoomed in enough) and one row of
+         * rectangles per feature list are drawn.
+         *
+         * @param g
+         *            Graphics
+         */
+        public void paint(Graphics g) {
+            height = getHeight();
+            width = getWidth();
+
+            g.setColor(getBackground());
+            g.fillRect(0, 0, width, height);
+
+            g.setColor(Color.black);
+
+            /* painting x values */
+            int pixelSpace = (int) (width / ((maxX - minX) / scaleX));
+            Font oldFont = g.getFont();
+            g.setFont(new Font("Dialog", Font.PLAIN, 9));
+
+            DecimalFormat df = new DecimalFormat("#0.0");
+            // decimal format for big numbers:
+            if (scaleX >= 10000)
+                df = new DecimalFormat("#0.#####E0");
+            else if (scaleX >= 1)
+                df = new DecimalFormat("#0.#");
+
+            /*
+             * labels are written at x values that are multiples of scaleX.
+             * The first label is computed directly: stepping by 1 until the
+             * remainder is 0 never terminates for fractional values. A
+             * non-positive scale would never advance the loop.
+             */
+            if (scaleX > 0) {
+                for (double x = firstTick(minX, scaleX); x < maxX; x += scaleX) {
+                    int i = calculateXcoord(width, minX, maxX, x);
+                    String label = df.format(x);
+                    int labelWidth = g.getFontMetrics().stringWidth(label);
+                    // only draw numbers if they fit
+                    if (labelWidth < (pixelSpace - 5)) {
+                        if (x == 0)
+                            g.drawString("0", i, 10);
+                        else
+                            // centre the label on its x coordinate
+                            g.drawString(label, i - labelWidth / 2, 10);
+                    }
+                }
+            }
+
+            g.setFont(oldFont);
+
+            /*
+             * draw sequence if it exists and the range of x values displayed in
+             * the screen is small enough to display characters in string
+             * sequence[0] should be painted as sequence[1]
+             */
+            // changes made by Hoang Nguyen
+
+            int pixelsUnit = (int) (width / (maxX - minX));
+            if (sequence != null && pixelsUnit > 5) {
+                // a negative minX must not be used as an array index
+                for (int x = Math.max((int) minX, 0); x < maxX
+                        && x < sequence.length; x++)
+                    g.drawString("" + sequence[x],
+                            calculateXcoord(width, minX, maxX, x + 1), 20);
+            }
+
+            // one row of rectangles per feature list
+            for (int lidx = 0; lidx < featureList.size(); lidx++) {
+                AnnotationSequenceData featureData = featureList.get(lidx);
+
+                for (int i = 0; i < featureData.size(); i++) {
+                    JapsaFeature feature = featureData.getFeature(i);
+                    if (feature.getEnd() > minX && feature.getStart() < maxX) {// overlap
+                        g.setColor(featureColors[lidx]);
+                        int begin_drawing = calculateXcoord(width, minX, maxX,
+                                feature.getStart());
+                        // width of the rectangle
+                        int length_drawing = calculateXcoord(width, minX, maxX,
+                                feature.getEnd()) - begin_drawing;
+                        g.drawRect(begin_drawing, START + DISTANCE * lidx,
+                                length_drawing, HEIGHT);
+                    }
+                }
+
+            }
+
+        }
+
+        /**
+         * Changes the range of x values being displayed in x axis. The
+         * request is ignored when xMinValue is greater than xMaxValue.
+         *
+         * @param xMinValue
+         *            double, minimum x value to display
+         * @param xMaxValue
+         *            double, maximum x value to display
+         */
+        public void setRange(double xMinValue, double xMaxValue) {
+            if (xMinValue <= xMaxValue) {
+                minX = xMinValue;
+                maxX = xMaxValue;
+            }
+        }
+
+        /**
+         * Sets the scale for the x axis
+         *
+         * @param scale
+         *            double
+         */
+        public void setScale(double scale) {
+            scaleX = scale;
+        }
+
+        /**
+         * This function is used to give a character sequence to be displayed
+         * as x coordinate. Passing null removes the sequence.
+         *
+         * @param seq
+         *            char[]
+         * @return int the length of the sequence added to x axis, 0 when seq
+         *         is null
+         */
+        public int setSequence(char[] seq) {
+            sequence = seq;
+            // null is a legal "no sequence" state (see getSequenceLenght)
+            return seq == null ? 0 : seq.length;
+        }
+
+        /**
+         * Returns the length of character sequence in x axis or 0 if the
+         * sequence has not been set
+         *
+         * @return int
+         */
+        public int getSequenceLenght() {
+            if (sequence == null)
+                return 0;
+            return sequence.length;
+        }
+    }
+
+    /**
+     *
+     *

+ * Title: Graph + *

+ * + *

+ * Description: A graph contains SequenceData and other information about + * how it is displayed. For example the colour of the graph + *

+     */
+    private class Graph {
+        /* colour used to draw this graph */
+        private Color myColor;
+        /* values plotted by this graph */
+        DoubleSequenceData myData;
+
+        /**
+         * Creates a graph that draws the given data in the given colour.
+         *
+         * @param color
+         *            Color
+         * @param data
+         *            DoubleSequenceData
+         */
+        public Graph(Color color, DoubleSequenceData data) {
+            this.myData = data;
+            this.myColor = color;
+        }
+
+        /**
+         * @return Color, the colour of this graph
+         */
+        public Color getColor() {
+            return this.myColor;
+        }
+
+        /**
+         * @return DoubleSequenceData, the data of this graph
+         */
+        public DoubleSequenceData getData() {
+            return this.myData;
+        }
+    }
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainFrame.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainFrame.java
new file mode 100755
index 0000000..116aae6
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainFrame.java
@@ -0,0 +1,83 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version. This program is distributed in the hope that it will be *
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General *
+ * Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License along with*
+ * this program; if not, write to the Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ******************************************************************************/
+
+//This class is written by Julie Bernal and subsequently modified and maintained
+//by Minh Duc Cao
+
+package japsa.bio.misc.dnaPlatform.gui;
+
+import java.awt.*;
+import javax.swing.*;
+
+/**
+ *

+ * Title: MainFrame + *

+ * + *

+ * Description: This is the main Frame of the DnaGUI tool. This class contains + * CompressionModels, SequenceData objects and Functions. It also contains + * Panels to display information content, information about SequenceData and a + * panel to display other graphs created by compression models. + *

+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+public class MainFrame extends JFrame {
+	public static final long serialVersionUID = 1234567890;
+
+	MainPanel mainPanel;
+
+	/**
+	 * Builds the frame content, centres the frame on the screen and shows it.
+	 * Closing the frame exits the application.
+	 */
+	public MainFrame() {
+		init();
+
+		// Centre the window; the size used for centring is capped at the
+		// screen size (getSize returns a copy, the frame itself is not resized)
+		Dimension screen = Toolkit.getDefaultToolkit().getScreenSize();
+		Dimension frame = getSize();
+		frame.height = Math.min(frame.height, screen.height);
+		frame.width = Math.min(frame.width, screen.width);
+
+		int left = (screen.width - frame.width) / 2;
+		int top = (screen.height - frame.height) / 2;
+		setLocation(left, top);
+		setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
+
+		// the frame has a preset size (see init), so it is validated rather
+		// than packed
+		validate();
+
+		setVisible(true);
+	}
+
+	/**
+	 * Creates the main panel, installs its menu bar and sets the title and
+	 * the initial size of the frame.
+	 */
+	public void init() {
+		mainPanel = new MainPanel();
+		add(mainPanel);
+		setJMenuBar(mainPanel.mainMenu);
+		setTitle("DNA Platform");
+		setSize(new Dimension(900, 500));
+	}
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainMenuBar.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainMenuBar.java
new file mode 100755
index 0000000..d1f55e5
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainMenuBar.java
@@ -0,0 +1,625 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version.
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; + +import japsa.bio.misc.dnaPlatform.compModel.*; +import japsa.bio.misc.dnaPlatform.function.*; +import japsa.bio.misc.dnaPlatform.sequence.*; + +import java.awt.event.*; +import java.io.File; + +/** + *

+ * Title: MainMenuBar + *

+ * + *

+ * Description: This is the menu bar of the DnaGUI tool. It holds the File menu + * (import sequences, print graph, exit), the Run menu (models and functions), + * the Plots menu and the Help menu, and forwards the chosen actions to the + * MainPanel. + *

+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+public class MainMenuBar extends JMenuBar {
+	public static final long serialVersionUID = MainFrame.serialVersionUID;
+
+	// The panel whose actions are triggered by the menu items
+	MainPanel mainPanel;
+
+	// File
+	JMenu menuFile = new JMenu("File");
+	JMenuItem menuFile_exit = new JMenuItem("Exit");
+	JMenuItem menuFile_print = new JMenuItem("Print graph");
+	JMenu menuFile_import = new JMenu("Import Sequence");
+	// one import item per sequence type, see createJMenuItemsImport
+	JMenuItemImport[] menuFile_import_sequences;
+
+	// Run
+	JMenu menuRun = new JMenu("Run");
+	JMenu menuRun_model = new JMenu("Run Model");
+	JMenu menuRun_function = new JMenu("Run Function");
+	JMenuItemFunction[] menuRun_runFunction;
+
+	// View
+	JMenu menuView = new JMenu("Plots");
+	JMenu menuView_plots = new JMenu("Number of Plots");
+	JMenuItem menuView_plots_add = new JMenuItem("Add plot");
+	JMenu menuView_plots_remove = new JMenu("Remove plot");
+	JMenu menuView_selectPlot = new JMenu("Select plot to work with");
+
+	// Help
+	JMenu menuHelp = new JMenu("Help");
+	JMenuItem menuHelp_about = new JMenuItem("About");
+
+	/**
+	 * Creates the menu bar for the given main panel.
+	 *
+	 * @param mainPanel
+	 *            MainPanel
+	 */
+	public MainMenuBar(MainPanel mainPanel) {
+		this.mainPanel = mainPanel;
+		initMenu();
+	}
+
+	/**
+	 * Creates one JMenuItemImport for every type of sequence that can be read
+	 * from a file into the tool.
+	 *
+	 * @return JMenuItemImport[]
+	 */
+	private JMenuItemImport[] createJMenuItemsImport() {
+		// all the types of sequences that can be imported from files
+		SequenceData[] importable = { new DNASequenceData(),
+				new DoubleSequenceData(), new AnnotationSequenceData() };
+
+		JMenuItemImport[] items = new JMenuItemImport[importable.length];
+		int next = 0;
+		for (SequenceData type : importable) {
+			items[next] = new JMenuItemImport(type, mainPanel);
+			next++;
+		}
+		return items;
+	}
+
+	/** Builds the four menus and adds them to the bar. */
+	private void initMenu() {
+		buildFileMenu();
+		buildRunMenu();
+		buildPlotsMenu();
+		buildHelpMenu();
+
+		add(menuFile);
+		add(menuRun);
+		add(menuView);
+		add(menuHelp);
+	}
+
+	/** Fills the File menu: import submenu, print and exit. */
+	private void buildFileMenu() {
+		// import submenu: rich format, one item per sequence type, paste
+		menuFile_import.add(new JMenuItemFormatedImport(mainPanel));
+		menuFile_import_sequences = createJMenuItemsImport();
+		for (JMenuItemImport item : menuFile_import_sequences) {
+			menuFile_import.add(item);
+		}
+
+		JMenuItem pasteSequence = new JMenuItem("Paste DNA");
+		pasteSequence.addActionListener(
+				new MainMenu_pasteSequence_actionAdapter(mainPanel));
+		menuFile_import.add(pasteSequence);
+
+		menuFile_print.addActionListener(
+				new MainMenu_menuFile_print_actionAdapter(mainPanel));
+		menuFile_exit.addActionListener(
+				new MainMenu_menuFile_exit_actionAdapter(mainPanel));
+
+		menuFile.add(menuFile_import);
+		menuFile.add(menuFile_print);
+		menuFile.addSeparator();
+		menuFile.add(menuFile_exit);
+	}
+
+	/**
+	 * Fills the Run menu with one item per function of the main panel;
+	 * compression models go to "Run Model", the others to "Run Function".
+	 */
+	private void buildRunMenu() {
+		Function[] functions = mainPanel.functions;
+		menuRun_runFunction = new JMenuItemFunction[functions.length];
+
+		for (int i = 0; i < functions.length; i++) {
+			JMenuItemFunction item = new JMenuItemFunction(functions[i],
+					mainPanel);
+			menuRun_runFunction[i] = item;
+
+			JMenu target = (functions[i] instanceof CompressionModel)
+					? menuRun_model
+					: menuRun_function;
+			target.add(item);
+		}
+
+		menuRun.add(menuRun_model);
+		menuRun.add(menuRun_function);
+	}
+
+	/** Fills the Plots menu: plot selection and add/remove plot. */
+	private void buildPlotsMenu() {
+		menuView_plots_add.addActionListener(
+				new MainMenu_menuView_plots_add_actionAdapter(mainPanel));
+
+		menuView_plots.add(menuView_plots_add);
+		menuView_plots.add(menuView_plots_remove);
+		menuView.add(menuView_selectPlot);
+		menuView.add(menuView_plots);
+	}
+
+	/** Fills the Help menu. */
+	private void buildHelpMenu() {
+		menuHelp_about.addActionListener(
+				new MainMenu_helpAbout_actionAdapter(mainPanel));
+		menuHelp.add(menuHelp_about);
+	}
+
+	/**
+	 * Exits the application after the user has confirmed.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void menuFile_exit_actionPerformed(ActionEvent e) {
+		int answer = JOptionPane.showConfirmDialog(this, "Do you want to exit?",
+				"DNA Tool", JOptionPane.OK_CANCEL_OPTION);
+
+		if (answer != JOptionPane.OK_OPTION) {
+			return;
+		}
+		System.exit(0);
+	}
+
+	/**
+	 * Print InfoContentGraph
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void menuFile_print_actionPerformed(ActionEvent e) {
+		PrintUtilities
+				.printComponent(mainPanel.infoContent.getInfoContentGraph());
+	}
+}
+
+/**
+ *

+ * Title: JMenuItemImport + *

+ * + *

+ * Description: This class extends JMenuItem and has an action listener that + * reads sequence data from a file and creates a SequenceData object using the + * ReadFileFunction, which runs on the RunFunction thread. + * + *

+ */
+
+class JMenuItemImport extends JMenuItem {
+	public static final long serialVersionUID = MainFrame.serialVersionUID;
+
+	// prototype of the sequence type this item imports
+	private SequenceData sequenceType;
+	MainPanel mainPanel;
+
+	/**
+	 * Creates an import item labelled with the string form of sequenceClass.
+	 *
+	 * @param sequenceClass
+	 *            SequenceData, prototype of the type of sequence to import
+	 * @param mainPanel
+	 *            MainPanel that receives the imported sequence
+	 */
+	JMenuItemImport(SequenceData sequenceClass, MainPanel mainPanel) {
+		super(sequenceClass.toString());
+		sequenceType = sequenceClass;
+		this.mainPanel = mainPanel;
+
+		this.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				importSequence(e);
+			}
+		});
+	}
+
+	/**
+	 * Asks the user for a file and reads it, in a separate RunFunction
+	 * thread, into a new SequenceData of the type of this item. When no file
+	 * is chosen only the status bar is updated.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	private void importSequence(ActionEvent e) {
+		JFileChooser fileChooser = new JFileChooser(mainPanel.currentDir);
+
+		String sequenceFile = "";
+		fileChooser.setDialogTitle("Import " + sequenceType.getProperty());
+
+		if (JFileChooser.APPROVE_OPTION == fileChooser.showOpenDialog(this)) {
+			File file = fileChooser.getSelectedFile().getAbsoluteFile();
+			mainPanel.currentDir = file.getAbsoluteFile();
+
+			// set file of current model:
+			sequenceFile = file.getAbsolutePath();
+			mainPanel.statusBar.setText("sequence file: " + file.toString());
+		}
+
+		// strings are compared by content; the former reference comparison
+		// with "" only worked because of string interning
+		if (sequenceFile == null || sequenceFile.isEmpty()) {
+			mainPanel.statusBar.setText("couldn't open file: " + sequenceFile);
+			return;
+		}
+
+		// create SequenceData of the same type as sequenceType
+		SequenceData data = sequenceType.getNewSequenceData();
+		mainPanel.currentData = data;
+
+		// reading file in separate thread with readFun
+		ReadFileFunction readFun = new ReadFileFunction();
+		readFun.setFile(sequenceFile);
+		RunFunction runFun = new RunFunction(readFun, data, mainPanel);
+		runFun.start();
+
+		System.out.println("Created a sequence data of type"
+				+ data.getClass());
+	}
+}
+
+/**
+ *

+ * Title: JMenuItemFormatedImport + *

+ * + *

+ * Description: This class extends JMenuItem and has an action listener that + * delegates to MainPanel.importSequence in order to import a sequence in rich + * format. + * + *

+ */
+
+class JMenuItemFormatedImport extends JMenuItem {
+	public static final long serialVersionUID = MainFrame.serialVersionUID;
+
+	MainPanel mainPanel;
+
+	/**
+	 * Creates the "Import Rich Format" item; selecting it calls
+	 * importSequence on the given main panel.
+	 *
+	 * @param mainPanel
+	 *            MainPanel
+	 */
+	JMenuItemFormatedImport(MainPanel mainPanel) {
+		super("Import Rich Format");
+		this.mainPanel = mainPanel;
+
+		addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent event) {
+				// read the field, not the constructor argument, when the
+				// event fires
+				JMenuItemFormatedImport.this.mainPanel.importSequence(event);
+			}
+		});
+	}
+}
+
+/**
+ *

+ * Title: MainMenu_pasteSequence_actionAdapter + *

+ * This class implements ActionListener. When the user chooses to paste a
+ * sequence, the event is forwarded to
+ * MainPanel.pasteSequence_actionPerformed.
+ *
+ */
+class MainMenu_pasteSequence_actionAdapter implements ActionListener {
+	// panel that handles the event
+	private MainPanel panel;
+
+	/**
+	 * @param adaptee
+	 *            MainPanel that handles the event
+	 */
+	public MainMenu_pasteSequence_actionAdapter(MainPanel adaptee) {
+		panel = adaptee;
+	}
+
+	/**
+	 * Forwards the event to the main panel.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void actionPerformed(ActionEvent e) {
+		panel.pasteSequence_actionPerformed(e);
+	}
+}
+
+/**
+ *

+ * Title: MainMenu_readFormated_actionAdapter + *

+ * This class implements ActionListener and forwards the event to
+ * MainPanel.importSequence, which imports a sequence in rich format.
+ *
+ */
+
+class MainMenu_readFormated_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	// constructor
+	public MainMenu_readFormated_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	// actionPerformed
+	public void actionPerformed(ActionEvent e) {
+		// this adapter used to call pasteSequence_actionPerformed, a
+		// copy-paste slip from MainMenu_pasteSequence_actionAdapter; reading
+		// a formatted sequence is what JMenuItemFormatedImport does through
+		// importSequence
+		adaptee.importSequence(e);
+	}
+}
+
+/* Forwards the "Add plot" action to MainPanel.createPlot. */
+class MainMenu_menuView_plots_add_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_menuView_plots_add_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.createPlot();
+	}
+}
+
+/* Forwards the "Exit" action to MainPanel.menuFile_exit_actionPerformed. */
+class MainMenu_menuFile_exit_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_menuFile_exit_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.menuFile_exit_actionPerformed(e);
+	}
+}
+
+/* Forwards the "Print graph" action to MainPanel.menuFile_print_actionPerformed. */
+class MainMenu_menuFile_print_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_menuFile_print_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.menuFile_print_actionPerformed(e);
+	}
+}
+
+/* Same behaviour as MainMenu_menuFile_exit_actionAdapter. */
+class MenuFile_exit_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MenuFile_exit_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.menuFile_exit_actionPerformed(e);
+	}
+}
+
+/* Same behaviour as MainMenu_menuFile_print_actionAdapter. */
+class MenuFile_print_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MenuFile_print_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.menuFile_print_actionPerformed(e);
+	}
+}
+
+/*
+ * This class implements ActionListener. When the event occurs (the user has
+ * chosen to set the x axis) actionPerformed calls
+ * MainPanel.popup_setXaxis_actionPerformed.
+ */
+class MainMenu_popup_setXaxis_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_setXaxis_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.popup_setXaxis_actionPerformed(e);
+	}
+}
+
+/*
+ * Forwards the event to MainPanel.popup_property_actionPerformed when the
+ * user chooses to show the sequence information. author: Hoang Nguyen
+ */
+class MainMenu_popup_property_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_property_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.popup_property_actionPerformed(e);
+	}
+}
+
+/* Forwards the event to MainPanel.popup_redraw_actionPerformed. */
+class MainMenu_popup_redraw_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_redraw_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.popup_redraw_actionPerformed(e);
+	}
+}
+
+/* Forwards the event to MainPanel.popup_remove_actionPerformed. */
+class MainMenu_popup_remove_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_remove_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.popup_remove_actionPerformed(e);
+	}
+}
+
+/* Forwards the event to MainPanel.popup_save_actionPerformed. */
+class MainMenu_popup_save_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_save_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		adaptee.popup_save_actionPerformed(e);
+	}
+
+}
+
+/* Forwards the event to MainPanel.popup_saveAnno_actionPerformed. */
+class MainMenu_popup_saveAnnotation_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	MainMenu_popup_saveAnnotation_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	public void actionPerformed(ActionEvent e) {
+		// TODO: Ask for annotation
+
+		adaptee.popup_saveAnno_actionPerformed(e);
+	}
+
+}
+
+/**
+ *

+ * Title: MainMenu_helpAbout_actionAdapter + *

+ * This class implements ActionListener: when the user chooses the help menu
+ * it forwards the event to the panel, which shows the "about" window.
+ *
+ */
+class MainMenu_helpAbout_actionAdapter implements ActionListener {
+	private MainPanel adaptee;
+
+	// constructor
+	public MainMenu_helpAbout_actionAdapter(MainPanel adaptee) {
+		this.adaptee = adaptee;
+	}
+
+	// actionPerformed: forwards to helpMenu_about_actionPerformed
+	public void actionPerformed(ActionEvent e) {
+		adaptee.helpMenu_about_actionPerformed(e);
+	}
+}
+
+/**
+ * Menu entry that selects one plot. Its label is plot.toString(); activating
+ * it selects the plot through the panel's plot tool bar.
+ */
+class JMenuItemSelectPlot extends JMenuItem {
+	private static final long serialVersionUID = 1L;
+	// read by JMenuItemRemovePlot to find the entry that belongs to a plot
+	InfoContentPanel me;
+	MainPanel mainPanel;
+
+	JMenuItemSelectPlot(InfoContentPanel plot, MainPanel mainPanel) {
+		super(plot.toString());
+		me = plot;
+		this.mainPanel = mainPanel;
+
+		this.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				selectInformationContentPanel();
+			}
+		});
+	}
+
+	private void selectInformationContentPanel() {
+		mainPanel.plotToolBar.selectInfoContentPanel(me);
+	}
+
+}
+
+/**
+ * Menu entry that removes one plot from the panel, from the plot menus and
+ * from the plot list, then rebuilds the nested split panes.
+ */
+class JMenuItemRemovePlot extends JMenuItem {
+	private static final long serialVersionUID = 1L;
+	private InfoContentPanel me;
+	MainPanel mainPanel;
+
+	JMenuItemRemovePlot(InfoContentPanel plot, MainPanel mainPanel) {
+		super(plot.toString());
+		me = plot;
+		this.mainPanel = mainPanel;
+
+		this.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				removePlot();
+			}
+
+		});
+	}
+
+	private void removePlot() {
+		// Walk backwards: removing an entry shifts the following ones down,
+		// so a forward index loop would skip the entry after each removal.
+		for (int i = mainPanel.mainMenu.menuView_selectPlot.getItemCount() - 1; i >= 0; i--) {
+			Object entry = mainPanel.mainMenu.menuView_selectPlot.getItem(i);
+			// getItem may yield something other than a select-plot entry
+			// (e.g. null for a separator); ignore those instead of casting.
+			if (!(entry instanceof JMenuItemSelectPlot))
+				continue;
+			JMenuItemSelectPlot item = (JMenuItemSelectPlot) entry;
+			if (item.me == me) {
+				mainPanel.mainMenu.menuView_selectPlot.remove(item);
+				System.out.println("Removing " + item.me);
+			}
+		}
+
+		mainPanel.mainMenu.menuView_plots_remove.remove(this);
+		mainPanel.plotList.remove(me);
+
+		// If the removed plot was the selected one, fall back to the first
+		// remaining plot, or to no selection at all.
+		if (mainPanel.infoContent == me) {
+			if (mainPanel.plotList.size() > 0) {
+				mainPanel.infoContent = (InfoContentPanel) mainPanel.plotList
+						.firstElement();
+				mainPanel.infoContent.highlight(true);
+			} else {
+				mainPanel.infoContent = null;
+			}
+		}
+
+		// Rebuild the split-pane chain from the remaining plots.
+		mainPanel.plotSplitPanel.remove(mainPanel.plotsPanel);
+		mainPanel.plotsPanel = mainPanel
+				.addPlots(mainPanel.plotList.size() - 1);
+		mainPanel.plotSplitPanel.add(mainPanel.plotsPanel);
+	}
+}
+
+/**
+ * Menu entry bound to one Function. Activating it starts a RunFunction
+ * thread for that function on the panel's current data.
+ */
+class JMenuItemFunction extends JMenuItem {
+	private static final long serialVersionUID = 1L;
+	private Function fun;
+	private MainPanel mainPanel;
+
+	JMenuItemFunction(Function function, MainPanel mainPanel) {
+		super(function.toString());
+		fun = function;
+		this.mainPanel = mainPanel;
+
+		this.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				runFunction(e);
+			}
+		});
+	}
+
+	private void runFunction(ActionEvent e) {
+		RunFunction runFunction = new RunFunction(fun, mainPanel.currentData,
+				mainPanel);
+		runFunction.start();
+	}
+
+	/** @return the function this entry runs */
+	public Function getFunction() {
+		return fun;
+	}
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainPanel.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainPanel.java
new file mode 100755
index 0000000..800a156
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/MainPanel.java
@@ -0,0 +1,1016 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import java.awt.*; + +import javax.swing.*; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.compModel.*; +import japsa.bio.misc.dnaPlatform.function.*; +import japsa.bio.misc.dnaPlatform.sequence.*; + +import java.awt.event.*; +import java.io.*; +import java.net.URL; +import java.util.*; + +/** + *

+ * Title: MainPanel + *

+ * + *

+ * Description: This is the main Frame of the DnaGUI tool. This class contains + * CompressionModels, SequenceData objects and Functions. It also contains + * Panels to display information content, information about SequenceData and a + * panel to display other graphs created by compression models. + *

+ *
+ * @author Julie Bernal
+ * @version 1.0
+ */
+public class MainPanel extends JPanel {
+	public static final long serialVersionUID = 1234567890;
+
+	// starting directory for the file choosers; updated after each selection
+	File currentDir = new File(".");
+	MainMenuBar mainMenu;
+
+	/*-- array to hold all functions that can be executed on SequenceData
+	 including compression models --*/
+	Function[] functions;
+
+	/*-- vector to hold SequenceData objects --*/
+	// Element type restored: the methods of this class read the elements
+	// as SequenceData without a cast.
+	Vector<SequenceData> sequenceData = new Vector<SequenceData>();
+	SequenceData currentData = null; // used to select data to be modified
+
+	/*-- Panels --*/
+	JPanel statusPane = new JPanel();// The panel to hold the status bar
+	JPanel rightPanel = new JPanel();
+
+	JSplitPane plotSplitPanel = new JSplitPane();
+	JSplitPane plotsPanel = new JSplitPane();// Panel to contain plot
+	JSplitPane centrePanel = new JSplitPane();
+	JScrollPane extrasPane = new JScrollPane();
+
+	DataTreePanel dataTree = new DataTreePanel();
+	// currently selected plot; null while no plot exists
+	InfoContentPanel infoContent = null;
+
+	// all plots currently shown (element type restored, see sequenceData)
+	Vector<InfoContentPanel> plotList = new Vector<InfoContentPanel>();
+
+	/* plot toolbar */
+	PlotToolBar plotToolBar = new PlotToolBar(this);
+
+	/*-- popup menu for SequenceData in dataTree --*/
+	JPopupMenu popup = new JPopupMenu();
+	JMenu popup_model = new JMenu("Run Model");
+	JMenu popup_function = new JMenu("Run Function");
+	JMenuItemFunction[] popup_runFunction;
+	JMenuItem popup_remove = new JMenuItem("Remove");
+	JMenuItem popup_redraw = new JMenuItem("Redraw");
+	JMenuItem popup_save = new JMenuItem("Save");
+	JMenuItem popup_save_ann = new JMenuItem("Save with annotation");
+	JMenuItem popup_setXaxis = new JMenuItem("Set as x-axis");
+	JMenuItem popup_property = new JMenuItem("Properties");
+
+	JPanel buttonsPanel = new JPanel();
+
+	/*-- progress label and bar --*/
+	JLabel statusBar = new JLabel("");
+	JProgressBar progressBar = new JProgressBar();
+
+	/** Builds the panel; all set-up happens in initPanel. */
+	public MainPanel() {
+		initPanel();
+	}
+
+	/**
+	 * This method is where all compression models and functions to be used in
+	 * the GUI are created. 
New functions and compression models that implement
+	 * the Function or CompressionModel interface can be added to the GUI by
+	 * creating them and adding them to the vector of functions this method
+	 * returns.
+	 *
+	 * @return Function[] containing all Functions and CompressionModels to be
+	 *         used in the GUI
+	 */
+
+	private Function[] createFunctionsModels() {
+		// maybe don't add converter here/add in xaxis
+		return new Function[] {
+				// compression models
+				new FuzzyModel(),
+				new MarkovModel(),
+				new ExpertCompressionModel(),
+				// functions
+				new AppendFunction(),
+				new ReverseFunction(),
+				new SelectFunction(),
+				new DNAComplementFunction(),
+				new SmoothingFunction(),
+				new DifferenceFunction(),
+				new NegateFunction(),
+				new ThresholdFunction(),
+				new FilterFeatureFunction() };
+	}
+
+	/**
+	 * Lays out the centre split pane: data tree on the left; plot tool bar,
+	 * plots and the extras pane on the right.
+	 */
+	private void initCentrePane() {
+		// Extra pane to cover up unused space?
+		extrasPane.setMaximumSize(new Dimension(32767, 2));
+		extrasPane.setMinimumSize(new Dimension(19, 2));
+
+		// the current plot panel
+		plotsPanel = addPlots(0);
+
+		plotSplitPanel.add(plotsPanel, JSplitPane.LEFT);// = top
+		plotSplitPanel.add(extrasPane, JSplitPane.RIGHT);// = down
+		plotSplitPanel.setOrientation(JSplitPane.VERTICAL_SPLIT);
+		plotSplitPanel.setDividerLocation(800);
+
+		rightPanel.setLayout(new BorderLayout());
+		rightPanel.add(plotToolBar, BorderLayout.NORTH);
+		rightPanel.add(plotSplitPanel, BorderLayout.CENTER);
+
+		centrePanel.add(dataTree, JSplitPane.LEFT);
+		centrePanel.add(rightPanel, JSplitPane.RIGHT);
+		centrePanel.setDividerLocation(180);
+	}
+
+	/** Puts the status label and the progress bar into the status pane. */
+	private void initStatusPane() {
+		progressBar.setPreferredSize(new Dimension(200, 14));
+		statusPane.add(statusBar);
+		statusPane.add(progressBar);
+	}
+
+	/**
+	 * Wires the popup entries to their listeners and fills the popup menu,
+	 * one entry per function or model.
+	 */
+	private void initPopups() {
+		popup_remove
+				.addActionListener(new MainMenu_popup_remove_actionAdapter(this));
+		popup_redraw
+				.addActionListener(new MainMenu_popup_redraw_actionAdapter(this));
+		popup_save
+				.addActionListener(new MainMenu_popup_save_actionAdapter(this));
+		popup_save_ann
+				.addActionListener(new MainMenu_popup_saveAnnotation_actionAdapter(
+						this));
+		popup_setXaxis
+				.addActionListener(new MainMenu_popup_setXaxis_actionAdapter(
+						this));
+		popup_property
+				.addActionListener(new MainMenu_popup_property_actionAdapter(
+						this));
+
+		// compression models go under "Run Model", the rest under
+		// "Run Function"
+		popup_runFunction = new JMenuItemFunction[functions.length];
+		for (int k = 0; k < functions.length; k++) {
+			JMenuItemFunction entry = new JMenuItemFunction(functions[k], this);
+			popup_runFunction[k] = entry;
+			JMenu owner = (functions[k] instanceof CompressionModel) ? popup_model
+					: popup_function;
+			owner.add(entry);
+		}
+
+		popup.add(popup_model);
+		popup.add(popup_function);
+		popup.add(popup_remove);
+		popup.add(popup_redraw);
+		popup.add(popup_save);
+		popup.add(popup_save_ann);
+		popup.add(popup_setXaxis);
+		popup.add(popup_property);
+	}
+
+	/** Builds the button row: open file, paste sequence, add plot. */
+	private void initButtonPanel() {
+		JPanel row = new JPanel();
+		row.setLayout(new GridLayout(1, 3));
+		buttonsPanel.setLayout(new BorderLayout());
+
+		JButton openFileButton = new JButton(loadImage("images/Open16.gif",
+				"open new DNA file"));
+		openFileButton.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				importSequence(e);
+			}
+		});
+
+		JButton pasteADNSequenceButton = new JButton(loadImage(
+				"images/Import16.gif", "open window to paste DNA file"));
+		pasteADNSequenceButton
+				.addActionListener(new MainMenu_pasteSequence_actionAdapter(
+						this));
+
+		// this button is used to create a new plot
+		JButton addNewPlotButton = new JButton("+");
+		addNewPlotButton
+				.addActionListener(new MainMenu_menuView_plots_add_actionAdapter(
+						this));
+
+		row.add(openFileButton);
+		row.add(pasteADNSequenceButton);
+		row.add(addNewPlotButton);
+
+		buttonsPanel.add(row, BorderLayout.WEST);
+	}
+
+	/**
+	 * Sets up the whole panel: layout, functions, sub-panels, popup menu,
+	 * tree mouse listener and the menu bar.
+	 */
+	private void initPanel() {
+		this.setSize(new Dimension(900, 500));
+		this.setName(this.getName() + ".contentPane");
+		this.setLayout(new BorderLayout() {
+			private static final long serialVersionUID = 1L;
+
+			/**
+			 * This BorderLayout subclass maps a null constraint to CENTER.
+			 * Although the reference BorderLayout also does this, some VMs
+			 * throw an IllegalArgumentException.
+			 */
+			public void addLayoutComponent(Component comp, Object constraints) {
+				Object where = (constraints == null) ? BorderLayout.CENTER
+						: constraints;
+				super.addLayoutComponent(comp, where);
+			}
+		});
+
+		// functions must exist before the popup entries are built
+		functions = createFunctionsModels();
+
+		initButtonPanel();
+		initCentrePane();
+		initStatusPane();
+		initPopups();
+
+		this.add(buttonsPanel, BorderLayout.NORTH);
+		this.add(centrePanel, BorderLayout.CENTER);
+		this.add(statusPane, java.awt.BorderLayout.SOUTH);
+
+		// mouse listener on the tree of the dataTree panel
+		dataTree.getTree().addMouseListener(new tree_popupListener());
+
+		mainMenu = new MainMenuBar(this);
+	}
+
+	/**
+	 * Loads an icon relative to this class; if the resource is not found,
+	 * the path is handed to ImageIcon as a file name instead.
+	 */
+	protected ImageIcon loadImage(String imgPath, String desc) {
+		URL location = this.getClass().getResource(imgPath);
+		return (location == null) ? new ImageIcon(imgPath, desc)
+				: new ImageIcon(location, desc);
+	}
+
+	/**
+	 * Exit application
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void menuFile_exit_actionPerformed(ActionEvent e) {
+		int answer = JOptionPane.showConfirmDialog(this,
+				"Do you want to exit?", "DNA Tool",
+				JOptionPane.OK_CANCEL_OPTION);
+		if (answer != JOptionPane.OK_OPTION)
+			return;
+		System.exit(0);
+	}
+
+	/**
+	 * Opens a small "about" frame holding a read-only text field.
+	 *
+	 */
+	public void helpMenu_about_actionPerformed(ActionEvent e) {
+		JTextField about = new JTextField();
+		about.setText("From Monash University ");
+		about.setEditable(false);
+
+		JFrame aboutFrame = new JFrame("about");
+		aboutFrame.add(about);
+		aboutFrame.setSize(200, 100);
+		aboutFrame.setLocation(300, 400);
+		aboutFrame.setDefaultCloseOperation(WindowConstants.HIDE_ON_CLOSE);
+		aboutFrame.setVisible(true);
+	}
+
+	/**
+	 * create a new frame window contains TextArea so that the user can cut and
+	 * paste the sequence to the program instead of 
selecting the file
+	 *
+	 * @param e
+	 *            ActionEvent
+	 *
+	 */
+	public void pasteSequence_actionPerformed(ActionEvent e) {
+		final JFrame pasteSequenceFrame = new JFrame("Paste DNA Sequence");
+		JPanel textPanel = new JPanel();
+		// just a variable holder for the anonymous class
+		final MainPanel mainPanel = this;
+		JPanel buttonPanel = new JPanel();
+		pasteSequenceFrame.setLayout(new BorderLayout());
+		final JTextArea paste = new JTextArea(40, 52);
+		paste.setEditable(true);
+		JScrollPane scrollText = new JScrollPane(paste,
+				JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
+				JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED);
+		textPanel.add(scrollText);
+
+		JButton okieButton = new JButton("OK");
+		JButton cancelButton = new JButton("Cancel");
+		okieButton.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				// turn the pasted text into a DNA sequence and register it
+				String text = paste.getText();
+				DNASequenceData data = new DNASequenceData();
+				data.readDataFromString(text);
+				currentData = data;
+				// the RunFunction is only used for attachSequence here;
+				// it is never started as a thread
+				(new RunFunction(null, null, mainPanel)).attachSequence(data);
+
+				System.out.println("Created a sequence data of type"
+						+ data.getClass());
+				pasteSequenceFrame.dispose();
+			}
+		});
+
+		cancelButton.addActionListener(new ActionListener() {
+			public void actionPerformed(ActionEvent e) {
+				pasteSequenceFrame.dispose();
+			}
+		});
+		buttonPanel.add(okieButton);
+		buttonPanel.add(cancelButton);
+		pasteSequenceFrame.add(textPanel, BorderLayout.NORTH);
+		pasteSequenceFrame.add(buttonPanel, BorderLayout.SOUTH);
+		pasteSequenceFrame.setSize(600, 900);
+		pasteSequenceFrame.setLocation(0, 0);
+		// Dispose on close, as both buttons do: a new frame is built on
+		// every call, so a merely hidden frame would never be released.
+		pasteSequenceFrame
+				.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);
+		pasteSequenceFrame.setVisible(true);
+
+	}
+
+	/**
+	 * Print InfoContentGraph of the selected plot. Does nothing but report
+	 * in the status bar when no plot is selected.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void menuFile_print_actionPerformed(ActionEvent e) {
+		// infoContent is null until a plot has been created and selected
+		if (infoContent == null) {
+			statusBar.setText("No plot to print");
+			return;
+		}
+		PrintUtilities.printComponent(infoContent.getInfoContentGraph());
+	}
+
+	/**
+	 * Delete selected data from dataTree and information content. 
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+
+	public void popup_remove_actionPerformed(ActionEvent e) {
+		// TODO: Convert this to make it generic: remove, draw
+		if (currentData == null)
+			return;
+
+		if (currentData instanceof DoubleSequenceData) {
+			// remove the graph of this data from every plot
+			Iterator<InfoContentPanel> it = plotList.iterator();
+			while (it.hasNext()) {
+				it.next().removeGraph(currentData.toString());
+			}
+		} else if (currentData instanceof AnnotationSequenceData) {
+			// remove the features of this data from every plot
+			Iterator<InfoContentPanel> it = plotList.iterator();
+			while (it.hasNext()) {
+				it.next().removeFeatures((AnnotationSequenceData) currentData);
+			}
+		}
+
+		sequenceData.removeElement(currentData);
+		currentData = null;
+
+		/* remove history of sequenceData object from dataTree */
+		dataTree.removeSelectedPath();
+
+	}
+
+	/**
+	 * Asks the user for a file to save to.
+	 *
+	 * @return the chosen file (absolute), or null if the dialog was cancelled
+	 */
+	private File chooseSaveFile() {
+		JFileChooser fileChooser = new JFileChooser(currentDir);
+		fileChooser.setDialogTitle("Save File");
+
+		// a save dialog, so the approve button reads "Save", not "Open"
+		if (JFileChooser.APPROVE_OPTION != fileChooser.showSaveDialog(this))
+			return null;
+
+		File file = fileChooser.getSelectedFile().getAbsoluteFile();
+		currentDir = file.getAbsoluteFile();
+		return file;
+	}
+
+	/**
+	 * Save selected data from dataTree
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void popup_save_actionPerformed(ActionEvent e) {
+		if (currentData == null)
+			return;
+
+		File file = chooseSaveFile();
+		if (file == null)
+			return;
+
+		SaveFileFunction saveFunction = new SaveFileFunction(file);
+		RunFunction runFun = new RunFunction(saveFunction, currentData, this);
+
+		System.out.println("ready to run");
+		runFun.start();
+	}
+
+	/**
+	 * Save selected data from dataTree together with its annotation
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void popup_saveAnno_actionPerformed(ActionEvent e) {
+		if (currentData == null)
+			return;
+
+		File file = chooseSaveFile();
+		if (file == null)
+			return;
+
+		SaveFormatFileFunction saveFunction = new SaveFormatFileFunction(file);
+		RunFunction runFun = new RunFunction(saveFunction, currentData, this);
+
+		System.out.println("ready to run");
+		runFun.start();
+	}
+
+	/**
+	 * Function to redraw data in plot
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void popup_redraw_actionPerformed(ActionEvent e) {
+		if (currentData == null || infoContent == null)
+			return;
+		statusBar.setText("");
+		if (currentData instanceof DoubleSequenceData) {
+			if (!infoContent.paintInfoContent((DoubleSequenceData) currentData))
+				statusBar.setText("Graph is already in plot");
+		} else if (currentData instanceof AnnotationSequenceData) {
+			if (!infoContent.addFeatures((AnnotationSequenceData) currentData))
+				statusBar.setText("Cant draw feature list");
+		}
+	}
+
+	/**
+	 * Uses the current data as x-axis of the selected plot: character data
+	 * becomes the plot's sequence, annotation data is added as features.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	public void popup_setXaxis_actionPerformed(ActionEvent e) {
+		if (currentData == null || infoContent == null)
+			return;
+
+		if (currentData instanceof CharSequenceData) {
+			System.out.println("Changing sequence of plot");
+			infoContent.setSequence(((CharSequenceData) currentData)
+					.getCharData());
+		}
+
+		if (currentData instanceof AnnotationSequenceData) {
+			if (!infoContent.addFeatures((AnnotationSequenceData) currentData))
+				statusBar.setText("Cant draw feature list");
+		}
+	}
+
+	/**
+	 * Shows the property text of the current sequence in a message dialog.
+	 *
+	 * @param e
+	 *            ActionEvent (author: Hoang Nguyen)
+	 */
+	public void popup_property_actionPerformed(ActionEvent e) {
+
+		if (currentData == null)
+			return;
+
+		String message = currentData.getProperty();
+		JOptionPane.showMessageDialog(this, message, "sequence info",
+				JOptionPane.PLAIN_MESSAGE);
+	}
+
+	/**
+	 * Displays a dialog to update options to run a function model.
+	 *
+	 * @param handle
+	 *            OptionsHandle
+	 * @return boolean Whether the options for function were set in dialog
+	 */
+	protected boolean setParameters(OptionsHandle handle) {
+		OptionsHandleDialog dlg = new OptionsHandleDialog((Frame) null, handle,
+				sequenceData);
+		// centre the dialog over this panel
+		Dimension dlgSize = dlg.getPreferredSize();
+		Dimension frmSize = getSize();
+		java.awt.Point loc = getLocation();
+		dlg.setLocation((frmSize.width - dlgSize.width) / 2 + loc.x,
+				(frmSize.height - dlgSize.height) / 2 + loc.y);
+		dlg.setModal(true);
+		dlg.setVisible(true);
+
+		return dlg.isOptionsHandleSet();
+	}
+
+	/**
+	 * This is a recursive function to add all plots from index to JScrollPanes
+	 * and returns the Panel where all plots were added. 
When the index is equal
+	 * to zero this function returns the first InfoContentPanel
+	 *
+	 * @param index
+	 *            Index to add plots from
+	 * @return a vertical split pane holding plot index at the bottom and the
+	 *         plots below that index, nested, at the top; an out-of-range
+	 *         index yields a pane holding an empty panel
+	 */
+	JSplitPane addPlots(int index) {
+		JSplitPane pan = new JSplitPane(JSplitPane.VERTICAL_SPLIT);
+
+		if (index < 0 || index >= plotList.size()) {
+			pan.add(new JPanel(), JSplitPane.BOTTOM);
+			return pan;
+		}
+
+		pan.add((Component) plotList.elementAt(index), JSplitPane.BOTTOM);
+		if (index > 0)
+			pan.add(addPlots(index - 1), JSplitPane.TOP);
+
+		return pan;
+	}
+
+	public static final int MAX_NUMBER_PLOT = 4;
+
+	/**
+	 * This method creates a new InformationContentPanel to be displayed by this
+	 * tool, selects it and adds it to the plot menus. Nothing is created once
+	 * MAX_NUMBER_PLOT plots exist.
+	 */
+	public void createPlot() {
+		if (plotList.size() >= MAX_NUMBER_PLOT) {
+			statusBar.setText("A maximum of four plots can be displayed");
+			return;
+		}
+
+		// find right name for the plot: bump the number past every existing
+		// "Plot <n>" that already uses it
+		int number = 1;
+		Iterator<InfoContentPanel> it = plotList.iterator();
+		while (it.hasNext()) {
+			String name = it.next().toString();
+			if (name.indexOf("Plot ") == -1)
+				continue;
+
+			String suffix = name.substring(name.lastIndexOf(' ') + 1);
+			try {
+				if (number == Integer.parseInt(suffix))
+					number++;
+			} catch (NumberFormatException ex) {
+				// suffix is not a number: this name cannot clash, skip it
+			}
+		}
+
+		InfoContentPanel panel = new InfoContentPanel("Plot " + number, this);
+		plotList.add(panel);
+
+		// rebuild the split-pane chain so that it includes the new plot
+		plotSplitPanel.remove(plotsPanel);
+		plotsPanel = addPlots(plotList.size() - 1);
+		plotSplitPanel.add(plotsPanel);
+
+		// select plot
+		plotToolBar.selectInfoContentPanel(panel);
+
+		/* update menus */
+		mainMenu.menuView_plots_remove.add(new JMenuItemRemovePlot(panel,
+				this));
+		mainMenu.menuView_selectPlot.add(new JMenuItemSelectPlot(panel, this));
+		plotToolBar.updateInfoContentPanels(plotList);
+	}
+
+	/** Rebuilds the select-plot and remove-plot menus from plotList. */
+	public void updateInfoContentPlotMenu() {
+		mainMenu.menuView_selectPlot.removeAll();
+		mainMenu.menuView_plots_remove.removeAll();
+
+		Iterator<InfoContentPanel> it = plotList.iterator();
+		while (it.hasNext()) {
+			InfoContentPanel panel = it.next();
+
+			mainMenu.menuView_plots_remove.add(new JMenuItemRemovePlot(panel,
+					this));
+			mainMenu.menuView_selectPlot.add(new JMenuItemSelectPlot(panel,
+					this));
+		}
+	}
+
+	/**
+	 * This method selects the informationContentPanel to work with
+	 *
+	 * @param panel
+	 *            InfoContentPanel; null is ignored
+	 */
+	public void selectInformationContentPanel(InfoContentPanel panel) {
+		if (panel == null)
+			return;
+
+		// highlight only the selected plot
+		Iterator<InfoContentPanel> it = plotList.iterator();
+		while (it.hasNext()) {
+			InfoContentPanel p = it.next();
+			p.highlight(p == panel);
+		}
+
+		infoContent = panel;
+	}
+
+	/**
+	 * Lets the user pick a sequence file and reads it in a RunFunction
+	 * thread. When no file is chosen only the status bar is updated.
+	 *
+	 * @param e
+	 *            ActionEvent
+	 */
+	protected void importSequence(ActionEvent e) {
+		JFileChooser fileChooser = new JFileChooser(currentDir);
+
+		String sequenceFile = "";
+		fileChooser.setDialogTitle("Import Rich Format File");
+
+		if (JFileChooser.APPROVE_OPTION == fileChooser.showOpenDialog(this)) {
+			File file = fileChooser.getSelectedFile().getAbsoluteFile();
+			currentDir = file.getAbsoluteFile();
+
+			// set file of current model:
+			sequenceFile = file.getAbsolutePath();
+			statusBar.setText("Sequence file: " + file.toString());
+		}
+
+		// compare by content: != on strings only compares references
+		if (sequenceFile != null && !sequenceFile.isEmpty()) {
+			// reading file in separate thread with readFun
+			ReadFormatFileFunction readFun = new ReadFormatFileFunction();
+			readFun.setFile(sequenceFile);
+			RunFunction runFun = new RunFunction(readFun, null, this);
+			runFun.start();
+		} else
+			this.statusBar.setText("couldn't open file: " + sequenceFile);
+	}
+
+	/**
+	 *

+ * Title: JMenuItemFunction + *

+ * + *

+ * Description: This class extends JMenuItem and has an action listener that + * calls a RunFunction thread to run functions. + *

+ */ + + /** + * + *

+ * Title: tree_popupListener + *

+ * + *

+ * Description: This is a MouseAdapter for the JTree contained in + * DataTreePanel. It has been added in this MainFrame instead of + * DataTreePanel to handle mouse clicks in this class. + *

+	 *
+	 * @author Julie Bernal
+	 * @version 1.0
+	 */
+	@SuppressWarnings("rawtypes")
+	class tree_popupListener extends MouseAdapter {
+
+		/** Updates the selection/popup, then the main menu entries. */
+		public void mousePressed(MouseEvent e) {
+			maybeShowPopup(e);
+			updateMainMenuFunctions();
+		}
+
+		/** Same handling as mousePressed. */
+		public void mouseReleased(MouseEvent e) {
+			maybeShowPopup(e);
+			updateMainMenuFunctions();
+		}
+
+		/**
+		 * Tells whether a function lists a type the given data is an
+		 * instance of.
+		 *
+		 * @param fun
+		 *            function whose accepted types are checked
+		 * @param data
+		 *            data to test; null never matches
+		 */
+		@SuppressWarnings("unchecked")
+		private boolean accepts(Function fun, SequenceData data) {
+			Class[] classes = fun.getTypeSequenceData();
+			if (data == null)
+				return false;
+			for (int j = 0; j < classes.length; j++) {
+				if (classes[j].isAssignableFrom(data.getClass()))
+					return true;
+			}
+			return false;
+		}
+
+		/**
+		 * Enables the main menu entries whose function can run on the
+		 * current data and disables all others.
+		 */
+		private void updateMainMenuFunctions() {
+			for (int i = 0; i < mainMenu.menuRun_runFunction.length; i++) {
+				Function fun = mainMenu.menuRun_runFunction[i].getFunction();
+				mainMenu.menuRun_runFunction[i].setEnabled(accepts(fun,
+						currentData));
+			}
+		}
+
+		/**
+		 * Makes the SequenceData whose name is selected in the tree the
+		 * current data and enables the popup entries that apply to its
+		 * type. The popup itself is shown only for a popup-trigger event.
+		 *
+		 * @param e
+		 *            MouseEvent
+		 */
+		private void maybeShowPopup(MouseEvent e) {
+			if (dataTree == null)
+				return;
+			String data = dataTree.getFirstSelectedPath();
+			if (data == null)
+				return;
+
+			// get SequenceData with the same name as selected SequenceData
+			Iterator<SequenceData> it = sequenceData.iterator();
+			while (it.hasNext()) {
+				SequenceData d = it.next();
+				if (!data.equals(d.toString()))
+					continue;
+
+				// select data from sequenceData using name in tree
+				currentData = d;
+
+				// enable functions and models that can be applied to
+				// current data
+				for (int i = 0; i < popup_runFunction.length; i++) {
+					popup_runFunction[i].setEnabled(accepts(
+							popup_runFunction[i].getFunction(), currentData));
+				}
+
+				popup_property.setEnabled(true);
+				popup_save_ann.setEnabled(d instanceof DNASequenceData);
+
+				if (d instanceof CharSequenceData) {
+					popup_redraw.setEnabled(false);
+					popup_setXaxis.setEnabled(true);
+				} else if (d instanceof DoubleSequenceData) {
+					popup_redraw.setEnabled(true);
+					popup_setXaxis.setEnabled(false);
+				} else if (d instanceof AnnotationSequenceData) {
+					popup_redraw.setEnabled(true);
+					popup_setXaxis.setEnabled(false);
+				}
+
+				/* show popup menu if data in sequenceData */
+				if (e.isPopupTrigger()) {
+					popup.show(e.getComponent(), e.getX(), e.getY());
+				}
+				break;
+			} // while
+		}
+	}
+}
+
+/**
+ *

+ * Title: RunFunction + *

+ * + *

+ * Description: This class is a Thread that runs functions. + *

+ */
+class RunFunction extends Thread {
+	Function myFunction;
+	SequenceData myInputData;
+	MainPanel mainPanel;
+
+	/**
+	 * @param function
+	 *            function to run; may be null when the object is only used
+	 *            to call attachSequence
+	 * @param inputData
+	 *            data handed to the function; may be null
+	 * @param mainPanel
+	 *            panel whose data list, tree, plot and status bar are updated
+	 */
+	public RunFunction(Function function, SequenceData inputData,
+			MainPanel mainPanel) {
+		myFunction = function;
+		myInputData = inputData;
+		this.mainPanel = mainPanel;
+	}
+
+	/**
+	 * Attaches every sequence of the iterator, printing a running count.
+	 *
+	 * @param itrData
+	 *            sequences to attach
+	 */
+	public void attachMultiSequence(Iterator<? extends SequenceData> itrData) {
+		int i = 1;
+		while (itrData.hasNext()) {
+			System.out.println(i++);
+			SequenceData data = itrData.next();
+			attachSequence(data);
+		}
+
+	}
+
+	/**
+	 * Gives the data a numeric name, adds it to the panel's data list and
+	 * data tree, and paints it if it is DoubleSequenceData and a plot is
+	 * selected.
+	 *
+	 * @param data
+	 *            sequence to attach
+	 */
+	public void attachSequence(SequenceData data) {
+		Iterator<SequenceData> it = mainPanel.sequenceData.iterator();
+		int i = 1;
+		while (it.hasNext()) {
+			SequenceData d = it.next();
+			if (!d.getClass().equals(data.getClass()))
+				continue;
+
+			i++;
+			String name = d.getSequenceName();
+			String subName[] = name.split("\\D");
+			if (subName.length > 0) {
+				try {
+					int n = Integer.parseInt(subName[0]);
+					if (i == n)
+						i = n + 1;
+				} catch (NumberFormatException ex) {
+					// name does not start with a number (the first piece
+					// is empty then): it cannot clash with a numeric name
+				}
+			}
+		}
+		data.setSequenceName(i + "");
+		// Attached into sequence
+		mainPanel.sequenceData.add(data);
+		mainPanel.dataTree.addDataHistory(data.getHistory());
+
+		// Draw if need to
+		if (data instanceof DoubleSequenceData && mainPanel.infoContent != null) {
+			mainPanel.statusBar.setText("Creating graph");
+			mainPanel.infoContent.paintInfoContent((DoubleSequenceData) data);
+		}
+	}
+
+	/**
+	 * Runs a function. The steps required to run a function are: 1. Set
+	 * parameters for function 2. Call method to execute function 3. If
+	 * the output is DoubleSequenceData paint it in infoContent 4. Display
+	 * information about output data in dataTree panel
+	 *
+	 * The SequenceData object created by the function is stored in the
+	 * sequenceData vector.
+	 *
+	 * The progressBar is turned on and off to indicate execution of thread.
+	 * After a failure the status bar keeps the error text.
+	 *
+	 */
+	public void run() {
+		// if there is another thread stop this one
+		if (mainPanel.progressBar.isIndeterminate()) {
+			mainPanel.statusBar
+					.setText("Cannot run more than one funcion/model at the same time.");
+			System.err
+					.println("Cannot run more than one function and model at the same time.");
+			return;
+		}
+
+		else if (mainPanel.sequenceData == null) {
+			mainPanel.statusBar.setText("Select a sequence");
+			System.err
+					.println("Select sequence before executing function/model.");
+			return;
+		}
+
+		// a RunFunction built only for attachSequence has no function
+		if (myFunction == null) {
+			mainPanel.statusBar.setText("No function to run");
+			return;
+		}
+
+		// indicate task is being performed
+		mainPanel.progressBar.setIndeterminate(true);
+
+		try {
+			// 1. Set parameters for function if function has options
+			OptionsHandle options = myFunction.getOptionsHandle();
+			if (myFunction instanceof ReadFormatFileFunction) {
+				attachMultiSequence(((ReadFormatFileFunction) myFunction)
+						.guessFormat());
+
+			} else if (options == null || mainPanel.setParameters(options)) {
+
+				// 2. Call method to execute function for inputData
+				mainPanel.statusBar.setText("Running " + myFunction);
+				SequenceData data = myFunction.execute(options, myInputData);
+				if (data != null) {
+					attachSequence(data);
+				}
+			}
+
+			// clear the status only on success, so that an error message
+			// set below stays visible
+			mainPanel.statusBar.setText("");
+		} catch (Exception e) {
+			System.err.println("Error executing function " + myFunction);
+			System.err.println(e);
+			mainPanel.statusBar.setText("Error running function");
+			System.out.println(e.getMessage());
+		} finally {
+			mainPanel.progressBar.setIndeterminate(false);
+		}
+	}
+
+}
diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/OptionsHandleDialog.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/OptionsHandleDialog.java
new file mode 100755
index 0000000..e457931
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/OptionsHandleDialog.java
@@ -0,0 +1,433 @@
+/******************************************************************************
+ * Copyright (C) 2006-2010 Minh Duc Cao *
+ * *
+ * This program is free software; you can redistribute it and/or modify it *
+ * under the terms of the GNU General Public License as published by the Free *
+ * Software Foundation; either version 2 of the License, or (at your option) *
+ * any later version. This program is distributed in the hope that it will be *
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General *
+ * Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License along with*
+ * this program; if not, write to the Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
* + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; + +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.bio.misc.dnaPlatform.sequence.*; + +import java.awt.*; +import java.awt.event.KeyEvent; +import java.awt.event.KeyAdapter; +import java.awt.event.WindowEvent; +import java.awt.event.ActionListener; +import java.awt.event.ActionEvent; +import java.util.*; + +/** + *

+ * Title: OptionsHandleDialog + *

+ * + *

+ * Description: This is a dialog that displays options in a OptionsHandle object + * and lets user set options + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +class OptionsHandleDialog extends JDialog { + /** + * + */ + private static final long serialVersionUID = -6467708920193257046L; + + OptionsHandle myHandle; + + private JLabel[] options; + private Object[] values; + + JPanel contentPane; + JPanel northPane = new JPanel(); + JPanel southPane = new JPanel(); + JScrollPane centerPane = new JScrollPane(); + JPanel optionPane = new JPanel(); + + JLabel title_lbl = new JLabel(); + JLabel subtitle_lbl = new JLabel("Options: "); + JButton cancel_btn = new JButton("Cancel"); + JButton set_btn = new JButton("Run"); + + Vector sequenceData; + + // used to indicate whether user clicked on ok or cancel + private boolean accepted = false; + + public OptionsHandleDialog(Frame parent, OptionsHandle options, Vector data) { + super(parent); + enableEvents(AWTEvent.WINDOW_EVENT_MASK); + myHandle = options; + sequenceData = data; + + if (myHandle != null) { + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + pack(); + } + + public OptionsHandleDialog(JDialog parent, OptionsHandle options, + Vector data) { + super(parent); + enableEvents(AWTEvent.WINDOW_EVENT_MASK); + myHandle = options; + sequenceData = data; + + if (myHandle != null) { + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + pack(); + } + + /* + * jbInit: in this function we create a ParamPanel depending on its + * Parameters, four types of parameters are recognized: boolean, integer, + * double and string + */ + private void jbInit() throws Exception { + int numberParams = myHandle.getNumberOptions(); + + this.setTitle("Set Options"); + this.setSize(500, 300); + contentPane = (JPanel) this.getContentPane(); + contentPane.setLayout(new BorderLayout()); + + title_lbl.setFont(new java.awt.Font("Dialog", Font.BOLD, 14)); + + title_lbl.setText(myHandle.getOwner().toString()); + northPane.add(title_lbl); + northPane.add(subtitle_lbl); + + 
cancel_btn + .addActionListener(new OptionsHandleDialog_cancel_btn_actionAdapter( + this)); + set_btn.addActionListener(new OptionsHandleDialog_set_btn_actionAdapter( + this)); + southPane.add(cancel_btn); + southPane.add(set_btn); + + // Creating the option panel: + + optionPane.setLayout(new GridLayout(numberParams, 2, 4, 4)); + createMyOptionsValues(numberParams); + + // adding options and values into this panel + for (int i = 0; i < numberParams; i++) { + String o = myHandle.getOptionAt(i); + Object val = myHandle.getOptionValue(o); + + optionPane.add(options[i]); + + if (val instanceof Boolean) + optionPane.add((JCheckBox) values[i]); + + else if (val instanceof Integer) + optionPane.add((IntField) values[i]); + + else if (val instanceof Double) + optionPane.add((DoubleField) values[i]); + + else if (val instanceof String) + optionPane.add((JTextField) values[i]); + + else if (val instanceof SequenceData) + optionPane.add((JComboBox) values[i]); + + else if (val instanceof OptionsHandle) + optionPane.add((HandleButton) values[i]); + + } + + centerPane.getViewport().add(optionPane); + contentPane.add(northPane, BorderLayout.NORTH); + contentPane.add(southPane, BorderLayout.SOUTH); + contentPane.add(centerPane, BorderLayout.CENTER); + } + + @SuppressWarnings("unchecked") + private void createMyOptionsValues(int numberParams) { + options = new JLabel[numberParams]; + values = new Object[numberParams]; + + // creating options[] and values[] from Options + for (int i = 0; i < numberParams; i++) { + String option = myHandle.getOptionAt(i); + + options[i] = new JLabel(option); + Object val = myHandle.getOptionValue(option); + + if (val instanceof Boolean) { + JCheckBox checkBox = new JCheckBox(); + checkBox.setSelected(((Boolean) val).booleanValue()); + values[i] = checkBox; + } + + else if (val instanceof Integer) { + IntField intField = new IntField(); + intField.setText(((Integer) val).intValue()); + values[i] = intField; + } + + else if (val instanceof Double) { 
+ DoubleField doubleField = new DoubleField(); + doubleField.setText(((Double) val).doubleValue()); + values[i] = doubleField; + + } + + else if (val instanceof String) { + values[i] = new JTextField((String) val); + } + + else if (val instanceof SequenceData) { + JComboBox seqBox = new JComboBox(sequenceData); + values[i] = seqBox; + } + + else if (val instanceof OptionsHandle) { + OptionsHandle subHandle = (OptionsHandle) val; + HandleButton handleButton = new HandleButton( + option + "_Handle", subHandle); + handleButton.addActionListener(new HandleButton_actionAdapter( + this)); + values[i] = handleButton; + } + + } + } + + public void openSubHandleDialog(ActionEvent e) { + + // get option of button + HandleButton button = (HandleButton) e.getSource(); + OptionsHandle handle = button.getOptionsHandle(); + + OptionsHandleDialog dlg = new OptionsHandleDialog(this, handle, + sequenceData); + Dimension dlgSize = dlg.getPreferredSize(); + Dimension frmSize = getSize(); + Point loc = getLocation(); + dlg.setLocation((frmSize.width - dlgSize.width) / 2 + loc.x, + (frmSize.height - dlgSize.height) / 2 + loc.y); + dlg.setModal(true); + + dlg.setVisible(true); + + } + + /* + * setOptionsHandle: this function changes the option values in + * OptionsHandle + */ + public void setOptionsHandle() { + int numberParams = myHandle.getNumberOptions(); + + for (int i = 0; i < numberParams; i++) { + String option = myHandle.getOptionAt(i); + Object val = myHandle.getOptionValue(option); + + if (val instanceof Boolean) { + JCheckBox checkBox = (JCheckBox) values[i]; + myHandle.setOptionValue(option, + new Boolean(checkBox.isSelected())); + } + + else if (val instanceof Integer) { + IntField intField = (IntField) values[i]; + if (intField.getText().length() > 0) + myHandle.setOptionValue(option, + new Integer(intField.getText())); + } + + else if (val instanceof Double) { + DoubleField doubleField = (DoubleField) values[i]; + if (doubleField.getText().length() > 0) + 
myHandle.setOptionValue(option, + new Double(doubleField.getText())); + + } + + else if (val instanceof String) { + JTextField txt = (JTextField) values[i]; + if (txt.getText().length() > 0) + myHandle.setOptionValue(option, txt.getText()); + } + + else if (val instanceof SequenceData) { + JComboBox seqBox = (JComboBox) values[i]; + SequenceData data = (SequenceData) seqBox.getSelectedItem(); + if (data instanceof SequenceData) + myHandle.setSequenceDataValue(option, data); + } + + else if (val instanceof OptionsHandle) { + HandleButton handleButton = (HandleButton) values[i]; + OptionsHandle subhandle = handleButton.getOptionsHandle(); + if (subhandle instanceof OptionsHandle) + myHandle.setOptionsHandleValue(option, subhandle); + + } + } + + accepted = true; + + } + + /* IntField is a JTextField that only allows user to enter integers */ + private class IntField extends JTextField { + /** + * + */ + private static final long serialVersionUID = -4791170332146987317L; + + IntField() { + this.addKeyListener(new KeyAdapter() { + public void keyTyped(KeyEvent e) { + char c = e.getKeyChar(); + if (!(Character.isDigit(c) || c == KeyEvent.VK_BACK_SPACE + || c == KeyEvent.VK_DELETE || (c == '-' && getText() + .length() == 0))) { + getToolkit().beep(); + e.consume(); + } + } + }); + } + + void setText(int number) { + super.setText(String.valueOf(number)); + } + + } + + /* DoubleField is a JTextField that only allows user to enter doubles */ + private class DoubleField extends JTextField { + /** + * + */ + private static final long serialVersionUID = -7249595281088991083L; + + DoubleField() { + this.addKeyListener(new KeyAdapter() { + public void keyTyped(KeyEvent e) { + char c = e.getKeyChar(); + if (!(Character.isDigit(c) || c == KeyEvent.VK_BACK_SPACE + || c == KeyEvent.VK_DELETE + || (c == '.' 
&& getText().indexOf('.') == -1) || (c == '-' && getText() + .length() == 0))) { + getToolkit().beep(); + e.consume(); + } + } + }); + } + + void setText(double number) { + super.setText(String.valueOf(number)); + } + } + + /* HandleButton is a JButton that contains a OptionsHandle object */ + private class HandleButton extends JButton { + /** + * + */ + private static final long serialVersionUID = 9207613471442424795L; + private OptionsHandle myHandle; + + HandleButton(String text, OptionsHandle handle) { + super(text); + myHandle = handle; + } + + public OptionsHandle getOptionsHandle() { + return myHandle; + } + + } + + /** + * This function returns a boolean indicating whether user clicked on button + * to set options + * + * @return boolean + */ + public boolean isOptionsHandleSet() { + return accepted; + } + + protected void processWindowEvent(WindowEvent e) { + if (e.getID() == WindowEvent.WINDOW_CLOSING) { + dispose(); + } + super.processWindowEvent(e); + } + +} + +/*--------------*/ +class HandleButton_actionAdapter implements ActionListener { + private OptionsHandleDialog adaptee; + + HandleButton_actionAdapter(OptionsHandleDialog adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.openSubHandleDialog(e); + } +} + +/*--------------*/ +class OptionsHandleDialog_cancel_btn_actionAdapter implements ActionListener { + private OptionsHandleDialog adaptee; + + OptionsHandleDialog_cancel_btn_actionAdapter(OptionsHandleDialog adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.dispose(); + } +} + +/*--------------*/ +class OptionsHandleDialog_set_btn_actionAdapter implements ActionListener { + private OptionsHandleDialog adaptee; + + OptionsHandleDialog_set_btn_actionAdapter(OptionsHandleDialog adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.setOptionsHandle(); + adaptee.dispose(); + } +} diff --git 
a/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotPopupMenu.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotPopupMenu.java new file mode 100755 index 0000000..76fb247 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotPopupMenu.java @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; + +/** + *

+ * Title: DnaPlatform + *

+ * + *

+ * Description: This is a java platform for visualization of compression results + * obtained with different models + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + *

+ * Company: Monash University + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class PlotPopupMenu extends JPopupMenu { + + /** + * + */ + private static final long serialVersionUID = 1L; + + InfoContentPanel plot; + + /* GUI elements for InfoContentPlot menu */ + JMenuItem zoomIn = new JMenuItem("zoom in"); + JMenuItem zoomOut = new JMenuItem("zoom out"); + JMenuItem gridRange = new JMenuItem("plot view"); + JMenu graph = new JMenu("graphs"); + JMenu graph_remove = new JMenu("remove"); + + public PlotPopupMenu(InfoContentPanel plot) { + this.plot = plot; + + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + private void jbInit() throws Exception { + gridRange.addActionListener(new MyActionAdapter(this)); + zoomIn.addActionListener(new MyActionAdapter(this)); + zoomOut.addActionListener(new MyActionAdapter(this)); + + /* setting viewMenu */ + graph.add(graph_remove); + add(gridRange); + add(zoomIn); + add(zoomOut); + add(graph); + + } + + public void gridRange_actionPerformed(ActionEvent e) { + plot.showGridRangeDialog(); + } + + public void zoomIn_actionPerformed(ActionEvent e) { + plot.zoomIn_actionPerformed(); + } + + public void zoomOut_actionPerformed(ActionEvent e) { + plot.zoomOut_actionPerformed(); + } + + /** + * This method adds a graph name to the popup menu if this name is not + * already in menu. It returns a boolean indicating whether or not new graph + * was added. 
+ * + * @param data + * SequenceData + * @return String + */ + public boolean addGraphToMenu(String graphName) { + + for (int i = 0; i < graph_remove.getItemCount(); i++) { + if ((graph_remove.getItem(i).getText()).equals(graphName)) { + return false; + } + } + + JMenuItem item = new JMenuItem(graphName); + item.addActionListener(new MyRemoveActionAdapter(this, item)); + graph_remove.add(item); + return true; + } + + /** + * This method removes given graph name from the menu + * + * @param graphName + * String + */ + public void removeGraphFromMenu(String graphName) { + for (int i = 0; i < graph_remove.getItemCount(); i++) { + if ((graph_remove.getItem(i).getText()).equals(graphName)) { + graph_remove.remove(i); + } + } + } + +} + +class MyActionAdapter implements ActionListener { + private PlotPopupMenu adaptee; + + MyActionAdapter(PlotPopupMenu adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + if (e.getSource().equals(adaptee.gridRange)) + adaptee.gridRange_actionPerformed(e); + + else if (e.getSource().equals(adaptee.zoomIn)) + adaptee.zoomIn_actionPerformed(e); + + else if (e.getSource().equals(adaptee.zoomOut)) + adaptee.zoomOut_actionPerformed(e); + + } +} + +class MyRemoveActionAdapter implements ActionListener { + private PlotPopupMenu adaptee; + private JMenuItem removeItem; + + MyRemoveActionAdapter(PlotPopupMenu adaptee, JMenuItem removeGraphItem) { + this.adaptee = adaptee; + removeItem = removeGraphItem; + } + + public void actionPerformed(ActionEvent e) { + adaptee.plot.removeGraph(removeItem.getText()); + adaptee.graph_remove.remove(removeItem); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotToolBar.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotToolBar.java new file mode 100755 index 0000000..c9d934a --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PlotToolBar.java @@ -0,0 +1,252 @@ +/****************************************************************************** 
+ * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import javax.swing.*; + +import java.util.*; +import java.awt.*; +import java.awt.event.ItemEvent; +import java.awt.event.ItemListener; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; + +/** + *

+ * Title: DnaPlatform + *

+ * + *

+ * Description: This is a java platform for visualization of compression results + * obtained with different models + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + *

+ * Company: Monash University + *

+ * + * @author Julie Bernal + * @version 1.0 + */ + +public class PlotToolBar extends JToolBar { + public static final long serialVersionUID = MainFrame.serialVersionUID; + /* GUI elements for PlotMenuBar */ + JLabel plotLabel = new JLabel("Working with plot: "); + @SuppressWarnings("rawtypes") + JComboBox plotBox = new JComboBox(); + + JButton removeButton = new JButton("Remove Plot"); + + // JButton viewButton = new JButton("View"); + + JPanel zoomPanel = new JPanel(); + + MainPanel myContainer; + + ImageIcon zoomInIcon; + ImageIcon zoomOutIcon; + JButton zoomOut; + JButton zoomIn; + + JLabel coordLabel = new JLabel("(0,0)"); + + // private InfoContentPanel selectedPanel =null; + public PlotToolBar(MainPanel panel) { + + super(); + myContainer = panel; + + try { + jbInit(); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + + private void jbInit() throws Exception { + this.setFloatable(false); + + zoomInIcon = myContainer.loadImage("images/ZoomIn16.gif", "zoom in"); + zoomOutIcon = myContainer.loadImage("images/ZoomOut16.gif", "zoom out"); + + zoomOut = new JButton(zoomOutIcon); + zoomIn = new JButton(zoomInIcon); + + zoomIn.setFont(new java.awt.Font("Monospaced", Font.BOLD, 12)); + + zoomIn.setMargin(new Insets(0, 4, 0, 4)); + zoomIn.addActionListener(new PlotToolBar_zoomIn_actionAdapter(this)); + + zoomOut.setFont(new java.awt.Font("Monospaced", Font.BOLD, 12)); + zoomOut.setMargin(new Insets(0, 4, 0, 4)); + zoomOut.addActionListener(new PlotToolBar_zoomOut_actionAdapter(this)); + + coordLabel.setBorder(BorderFactory.createLoweredBevelBorder()); + coordLabel.setMaximumSize(new Dimension(200, 20)); + coordLabel.setMinimumSize(new Dimension(100, 20)); + coordLabel.setPreferredSize(new Dimension(100, 24)); + coordLabel.setHorizontalAlignment(SwingConstants.CENTER); + + plotBox.setMaximumSize(new Dimension(100, 24)); + plotBox.setMinimumSize(new Dimension(100, 24)); + plotBox.setPreferredSize(new Dimension(100, 24)); + plotBox.addItemListener(new 
PlotToolBar_plotBox_itemAdapter(this)); + + plotLabel.setHorizontalAlignment(SwingConstants.RIGHT); + zoomPanel.setMinimumSize(new Dimension(53, 24)); + zoomPanel.setPreferredSize(new Dimension(53, 24)); + + zoomPanel.setLayout(new FlowLayout(FlowLayout.CENTER, 5, 1)); + zoomPanel.add(zoomIn); + zoomPanel.add(zoomOut); + + this.setLayout(new GridLayout(1, 0)); + + // removeButton.setMargin(new Insets(0, 4, 0, 4)); + removeButton.setMinimumSize(new Dimension(30, 24)); + removeButton.setPreferredSize(new Dimension(30, 24)); + removeButton + .addActionListener(new PlotToolBar_removeButton_actionAdapter( + this)); + // removeButton.setFont(new java.awt.Font("Monospaced", Font.BOLD, 12)); + + add(plotLabel); + add(plotBox); + add(removeButton); + add(zoomPanel); + add(coordLabel); + + // Load images + + } + + /** + * method sets the selected panel to a given panel + * + * @param InfoContentPanel + * + */ + // public void setSelectedPanel(InfoContentPanel p){ + // this.selectedPanel=p; + // System.out.println(">>>>"); + // } + + @SuppressWarnings("unchecked") + public void updateInfoContentPanels(Vector panels) { + plotBox.removeAllItems(); + + Iterator it = panels.iterator(); + while (it.hasNext()) { + plotBox.addItem((InfoContentPanel) it.next()); + } + } + + // select InfocontentPanel + // cause alot of exception + public void selectInfoContentPanel(InfoContentPanel panel) { + plotBox.setSelectedItem(panel); + } + + public void plotBox_itemStateChanged(ItemEvent e) { + myContainer.selectInformationContentPanel((InfoContentPanel) plotBox + .getSelectedItem()); + } + + public void zoomIn_actionPerformed(ActionEvent e) { + ((InfoContentPanel) plotBox.getSelectedItem()).zoomIn_actionPerformed(); + // this.selectedPanel.zoomIn_actionPerformed(); + } + + public void removeButton_actionPerformed(ActionEvent e) { + System.out.println(myContainer.plotList.size()); + myContainer.plotList.remove(plotBox.getSelectedIndex()); + System.out.println(myContainer.plotList.size()); + + 
myContainer.repaint(); + // Remove the item from the drop down list + plotBox.removeItem((plotBox.getSelectedItem())); + // myContainer.plotList.remove(plotBox.getSelectedIndex()); + + // + // ((InfoContentPanel)).zoomIn_actionPerformed(); + // this.selectedPanel.zoomIn_actionPerformed(); + } + + public void zoomOut_actionPerformed(ActionEvent e) { + ((InfoContentPanel) plotBox.getSelectedItem()) + .zoomOut_actionPerformed(); + // this.selectedPanel.zoomOut_actionPerformed(); + } + +} + +class PlotToolBar_plotBox_itemAdapter implements ItemListener { + private PlotToolBar adaptee; + + PlotToolBar_plotBox_itemAdapter(PlotToolBar adaptee) { + this.adaptee = adaptee; + } + + public void itemStateChanged(ItemEvent e) { + adaptee.plotBox_itemStateChanged(e); + } +} + +class PlotToolBar_zoomIn_actionAdapter implements ActionListener { + private PlotToolBar adaptee; + + PlotToolBar_zoomIn_actionAdapter(PlotToolBar adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.zoomIn_actionPerformed(e); + } +} + +class PlotToolBar_zoomOut_actionAdapter implements ActionListener { + private PlotToolBar adaptee; + + PlotToolBar_zoomOut_actionAdapter(PlotToolBar adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.zoomOut_actionPerformed(e); + } + +} + +class PlotToolBar_removeButton_actionAdapter implements ActionListener { + private PlotToolBar adaptee; + + PlotToolBar_removeButton_actionAdapter(PlotToolBar adaptee) { + this.adaptee = adaptee; + } + + public void actionPerformed(ActionEvent e) { + adaptee.removeButton_actionPerformed(e); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/PrintUtilities.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PrintUtilities.java new file mode 100755 index 0000000..4a14ee5 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/PrintUtilities.java @@ -0,0 +1,102 @@ 
+/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.gui; + +import java.awt.*; +import javax.swing.*; +import java.awt.print.*; + +/** + * A simple utility class that lets you very simply print an arbitrary + * component. Just pass the component to the PrintUtilities.printComponent. The + * component you want to print doesn't need a print method and doesn't have to + * implement any interface or do anything special at all. + *

+ * If you are going to be printing many times, it is marginally more efficient + * to first do the following: + * + *

+ * PrintUtilities printHelper = new PrintUtilities(theComponent);
+ * 
+ * + * then later do printHelper.print(). But this is a very tiny difference, so in + * most cases just do the simpler + * PrintUtilities.printComponent(componentToBePrinted). + * + * 7/99 Marty Hall, http://www.apl.jhu.edu/~hall/java/ May be freely used or + * adapted. + */ + +public class PrintUtilities implements Printable { + private Component componentToBePrinted; + + public static void printComponent(Component c) { + new PrintUtilities(c).print(); + } + + public PrintUtilities(Component componentToBePrinted) { + this.componentToBePrinted = componentToBePrinted; + } + + public void print() { + PrinterJob printJob = PrinterJob.getPrinterJob(); + printJob.setPrintable(this); + if (printJob.printDialog()) + try { + printJob.print(); + } catch (PrinterException pe) { + System.out.println("Error printing: " + pe); + } + } + + public int print(Graphics g, PageFormat pageFormat, int pageIndex) { + if (pageIndex > 0) { + return (NO_SUCH_PAGE); + } else { + pageFormat.setOrientation(PageFormat.LANDSCAPE); + Graphics2D g2d = (Graphics2D) g; + g2d.translate(pageFormat.getImageableX(), + pageFormat.getImageableY()); + disableDoubleBuffering(componentToBePrinted); + componentToBePrinted.paint(g2d); + enableDoubleBuffering(componentToBePrinted); + return (PAGE_EXISTS); + } + } + + /** + * The speed and quality of printing suffers dramatically if any of the + * containers have double buffering turned on. So this turns if off + * globally. + * + * @see enableDoubleBuffering + */ + public static void disableDoubleBuffering(Component c) { + RepaintManager currentManager = RepaintManager.currentManager(c); + currentManager.setDoubleBufferingEnabled(false); + } + + /** Re-enables double buffering globally. 
*/ + + public static void enableDoubleBuffering(Component c) { + RepaintManager currentManager = RepaintManager.currentManager(c); + currentManager.setDoubleBufferingEnabled(true); + } +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Import16.gif b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Import16.gif new file mode 100755 index 0000000..0fc47e1 Binary files /dev/null and b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Import16.gif differ diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Open16.gif b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Open16.gif new file mode 100755 index 0000000..fabd567 Binary files /dev/null and b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/Open16.gif differ diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomIn16.gif b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomIn16.gif new file mode 100755 index 0000000..2329426 Binary files /dev/null and b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomIn16.gif differ diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomOut16.gif b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomOut16.gif new file mode 100755 index 0000000..f9f7565 Binary files /dev/null and b/src/main/java/japsa/bio/misc/dnaPlatform/gui/images/ZoomOut16.gif differ diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/gui/proteinConvertUtilities.java b/src/main/java/japsa/bio/misc/dnaPlatform/gui/proteinConvertUtilities.java new file mode 100755 index 0000000..e4a2b48 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/gui/proteinConvertUtilities.java @@ -0,0 +1,140 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of 
//This class is written by Hoang Anh Nguyen and is subsequently modified and maintained
//by Minh Duc Cao

/**
 * Static helpers that translate nucleotide triplets (A, C, G, T in either
 * case) into one-letter amino-acid codes and mark the protein-coding
 * stretches of the translated sequence.
 *
 * @author hoangnguyen
 */
public class proteinConvertUtilities {

    public static int A = 0;
    public static int C = 1;
    public static int G = 2;
    public static int T = 3;
    public static char STOP_CONDON = '<';
    public static char START_CODON = '>';
    public static char Methionie = 'M';// also can be start codon
    public static int NUMBER_OF_PROTEIN = 64;
    /** Returned for a triplet that contains a base other than A, C, G, T. */
    public static char UNKNOWN_AMINO_ACID = 'X';
    // Translation table indexed by 16 * first + 4 * second + third, where
    // each base contributes its getOrder value; '<' marks a stop codon.
    public static char[] proteinTable = new char[] { 'K', 'N', 'K', 'N', 'T',
            'T', 'T', 'T', 'R', 'S', 'R', 'S', 'I', 'I', 'M', 'I', 'Q', 'H',
            'Q', 'H', 'P', 'P', 'P', 'P', 'R', 'R', 'R', 'R', 'L', 'L', 'L',
            'L', 'E', 'D', 'E', 'D', 'A', 'A', 'A', 'A', 'G', 'G', 'G', 'G',
            'V', 'V', 'V', 'V', '<', 'Y', '<', 'Y', 'S', 'S', 'S', 'S', '<',
            'C', 'W', 'C', 'L', 'F', 'L', 'F' };

    /**
     * Translates every overlapping triplet of the sequence: element i of
     * the result is the code for bases i, i + 1 and i + 2.
     *
     * @param seqData
     *            sequence of A, C, G, T characters; may be null
     * @return array of length seqData.length - 2, or an empty array when
     *         the input is null or shorter than three bases
     */
    public static char[] convertToProtein(char[] seqData)
            throws RuntimeException {
        // Fewer than three bases contain no triplet; without this guard the
        // array size below would be negative.
        if (seqData == null || seqData.length < 3) {
            return new char[0];
        }

        char[] proteinSeq = new char[seqData.length - 2];
        for (int i = 0; i < proteinSeq.length; i++) {
            proteinSeq[i] = getCodon(seqData[i], seqData[i + 1],
                    seqData[i + 2]);
        }
        return proteinSeq;
    }

    /**
     * Returns the amino-acid code for one triplet.
     *
     * @param c1
     *            first base
     * @param c2
     *            second base
     * @param c3
     *            third base
     * @return the table entry for the triplet, or UNKNOWN_AMINO_ACID when
     *         any base is not one of A, C, G, T
     */
    public static char getCodon(char c1, char c2, char c3) {
        int first = getOrder(c1);
        int second = getOrder(c2);
        int third = getOrder(c3);
        // A -1 from getOrder would otherwise shift the index onto a
        // neighbouring table entry or outside the table.
        if (first < 0 || second < 0 || third < 0) {
            return UNKNOWN_AMINO_ACID;
        }
        return proteinTable[4 * 4 * first + 4 * second + third];
    }

    /**
     * Maps a base to its table order, ignoring case.
     *
     * @param c
     *            'a'/'A' gives A, 'c'/'C' gives C, 'g'/'G' gives G,
     *            't'/'T' gives T
     * @return the order of the base, or -1 for any other character
     */
    public static int getOrder(char c) {
        switch (Character.toLowerCase(c)) {
        case 'a':
            return A;
        case 'c':
            return C;
        case 'g':
            return G;
        case 't':
            return T;
        default:
            return -1;
        }
    }

    /**
     * Marks protein stretches in place. Scanning left to right, the first
     * Methionie outside a protein is replaced by START_CODON and opens a
     * protein; the next STOP_CONDON closes it; every character outside a
     * protein, other than such an opening one, is replaced by a space.
     * Characters inside a protein are left unchanged.
     *
     * @param aminoAcidCodons
     *            translated sequence, modified in place; null is ignored
     */
    public static void proteinFilter(char[] aminoAcidCodons) {
        if (aminoAcidCodons == null) {
            return;
        }
        boolean isInProtein = false;
        for (int i = 0; i < aminoAcidCodons.length; i++) {
            char current = aminoAcidCodons[i];
            if (isInProtein) {
                // the stop codon itself is kept and ends the protein
                if (current == STOP_CONDON) {
                    isInProtein = false;
                }
            } else if (current == Methionie) {
                aminoAcidCodons[i] = START_CODON;
                isInProtein = true;
            } else {
                aminoAcidCodons[i] = ' ';
            }
        }
    }

}
a/src/main/java/japsa/bio/misc/dnaPlatform/sequence/AnnotationSequenceData.java b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/AnnotationSequenceData.java new file mode 100755 index 0000000..6317921 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/AnnotationSequenceData.java @@ -0,0 +1,177 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.sequence; + +import japsa.bio.misc.dnaPlatform.function.ReadFormatFileFunction; +import japsa.seq.JapsaAnnotation; +import japsa.seq.JapsaFeature; +import japsa.seq.SequenceOutputStream; + +import java.io.*; +import java.util.Iterator; + +/** + *

+ * Title: FeatureSequenceData + *

+ * + *

+ * Description: FeatureSequenceData is a DiscreteSequence data that stores + * actual features + *

+ * The features are in increasing order of starting points. + * + * @author Minh Duc + * @version 1.0 + */ +public class AnnotationSequenceData extends SequenceData { + JapsaAnnotation annotation; + + /** + * Initializes current CharSequenceData instance to hold 100,000 characters. + * Number of characters stored can be changed by reading a DNA file with + * method readDataFromDNAfile() + */ + public AnnotationSequenceData() { + super(); + annotation = new JapsaAnnotation(); + } + + public AnnotationSequenceData(JapsaAnnotation x) { + super(); + annotation = x; + } + + /** + * Takes a parent SequenceData object to get information about how it was + * constructed. + * + * @param parent + * SequenceData + */ + + public AnnotationSequenceData(SequenceData parent) { + super(parent); + annotation = new JapsaAnnotation(); + } + + /** + * Function to return data stored in FeatureSequenceData as an array of + * Character objects + * + * @return Character[] + */ + public Object[] getData() { + return annotation.getFeatureList().toArray(); + } + + public int size() { + return annotation.numFeatures(); + } + + /** + * Function to set the data stored in a FeatureSequenceData object given an + * array of Features objects + * + * @param newData + * Character[] + */ + public void setData(Object[] newData) { + annotation = new JapsaAnnotation(); + + for (int i = 0; i < newData.length; i++) + annotation.add((JapsaFeature) newData[i]); + } + + public Iterator iterator() { + return annotation.iterator(); + } + + public boolean writeDataToFile(File file) { + try { + SequenceOutputStream ps = new SequenceOutputStream( + new FileOutputStream(file)); + annotation.writeAnnotation(ps); + ps.close(); + + } catch (Exception e) { + } + return true; + } + + public String getProperty() { + return "Annotation containing " + size() + " features\n" + + annotation.getDescription(); + } + + public JapsaFeature getFeature(int idx) { + return annotation.getFeature(idx); + } + + public void 
addFeature(JapsaFeature f) { + annotation.add(f); + } + + public void addDescription(String desc) { + annotation.addDescription(desc); + } + + /** + * This function reads alphanumeric characters from file + * + * @param sequenceFile + * String + * @return int + */ + public int readDataFromFile(String sequenceFile) throws IOException { + + annotation = null; + Iterator iterSeq = (new ReadFormatFileFunction( + sequenceFile)).guessFormat(); + while (iterSeq.hasNext()) { + SequenceData seq = iterSeq.next(); + if (seq instanceof AnnotationSequenceData) { + annotation = ((AnnotationSequenceData) seq).annotation; + return annotation.numFeatures(); + } + } + throw new RuntimeException("No annotation found "); + + } + + public JapsaAnnotation getAnnotation() { + return annotation; + } + + public void setAnnotation(JapsaAnnotation annotation) { + this.annotation = annotation; + } + + /* + * need to modify toString() method so that every sequenceData object is + * unique + */ + public String toString() { + if (sequenceName == null) + return "Annotation sequence"; + return sequenceName + "(Annotation Sequence)"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/sequence/CharSequenceData.java b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/CharSequenceData.java new file mode 100755 index 0000000..d2d2847 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/CharSequenceData.java @@ -0,0 +1,250 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.sequence; + +import java.io.*; +import java.util.Vector; + +/** + *

+ * Title: CharSequenceData + *

+ * + *

+ * Description: CharSequenceData is a DiscreteSequence data that stores actual + * sequence data as characters + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class CharSequenceData extends SequenceData { + /** + * data: the sequence of characters + */ + protected char data[]; + + /** + * Initializes current CharSequenceData instance to hold 100,000 characters. + * Number of characters stored can be changed by reading a DNA file with + * method readDataFromDNAfile() + */ + public CharSequenceData() { + super(); + data = new char[0]; + } + + /** + * Takes a parent SequenceData object to get information about how it was + * constructed. + * + * @param parent + * SequenceData + */ + + public CharSequenceData(SequenceData parent) { + super(parent); + data = new char[0]; + } + + public String getProperty() { + return "Char sequence of " + data.length + " charators"; + } + + /** + * Returns array of characters with data stored in object + * + * @return char[] + */ + public char[] getCharData() { + return (char[]) data.clone(); + } + + /** + * Sets data[] to be a copy of given charData array + * + * @param charData + * char[] + */ + public void setCharData(char[] charData) { + data = charData; + } + + /** + * method returns the string representation of data + * + * @return String + * + * + */ + public String getStringData() { + String s = ""; + for (int i = 0; i < data.length; i++) + s = s + data[i]; + return s; + } + + /** + * Function to return data stored in CharSequenceData as an array of + * Character objects + * + * @return Character[] + */ + public Object[] getData() { + Character[] tempData = new Character[data.length]; + for (int i = 0; i < data.length; i++) + tempData[i] = new Character(data[i]); + + return tempData; + } + + /** + * Function to set the data stored in a SequenceData object given an array + * of Character objects + * + * @param newData + * Character[] + */ + public void setData(Object[] newData) { + data = new char[newData.length]; + for (int i = 0; i < newData.length; i++) + data[i] = ((Character) newData[i]).charValue(); + } + + public int 
size() { + return data.length; + } + + /** + * Reads a file using a BufferedReader and stores characters found in file + * matching charRegex into char array + * + * @param sequenceFile + * name of the file contaning the sequence of characters to read + * @param charRegex + * a regular expression describing the type of characters to be + * read from the file + * @return the number of characters read from file + * + * public int readDataFromFile(String sequenceFile, String + * charRegex) { + * + * int sequenceLength = + * getSequenceLengthFromFile(sequenceFile,charRegex,""); data = new + * char[sequenceLength]; + * + * int totalLen = 0; if (sequenceFile == null) { return 0; } + * + * try { // Open a file of the given name. File file = new + * File(sequenceFile); + * + * //store file in constructor vector addHistory(file); + * + * FileReader fr = new FileReader(file); BufferedReader bufRdr = new + * BufferedReader(fr); + * + * String line = null; while ( (line = bufRdr.readLine()) != null) { + * String[] words = line.split(""); for (int j = 0; j < + * words.length; j++) { if (words[j].matches(charRegex)) { // if + * there is an element in data equal to new element // point to that + * same element in data array data[totalLen] = words[j].charAt(0); + * totalLen++; //if totalLength = sequenceLength, stop looping if + * (totalLen == sequenceLength) { break; } } } } //while + * + * bufRdr.close(); } catch (IOException ioex) { + * System.err.println(ioex); } + * + * System.out.println("graph created!"); + * + * sequenceLength = totalLen; return totalLen; } + */ + + /** + * read a file specify which type the file is if the file is FASTA or + * GENBANK, use the appropriate from biojava otherwise use the normal + * method(read by BufferedReader) + * + */ + public int readDataFromFile(String sequenceFile, String charRegex) { + + if (sequenceFile == null) { + return 0; + } + Vector v = new Vector(); + + try { + // Open a file of the given name. 
+ File file = new File(sequenceFile); + + // store file in constructor vector + addHistory(file); + + FileReader fr = new FileReader(file); + BufferedReader bufRdr = new BufferedReader(fr); + + String line = null; + while ((line = bufRdr.readLine()) != null) { + String[] words = line.split(""); + for (int j = 0; j < words.length; j++) { + if (words[j].matches(charRegex)) { + v.add(words[j].charAt(0)); + } + } + } + data = new char[v.size()]; + for (int i = 0; i < v.size(); i++) + data[i] = v.get(i); + + bufRdr.close(); + } catch (IOException ioex) { + ioex.printStackTrace(); + } + System.out.println("graph created!"); + return data.length; + + } + + public boolean writeDataToFile(File file) { + return true; + } + + /** + * This function reads alphanumeric characters from file + * + * @param sequenceFile + * String + * @return int + * @throws Exception + */ + public int readDataFromFile(String sequenceFile) throws IOException { + return readDataFromFile(sequenceFile, "[acgtACGT]"); + } + + /* + * need to modify toString() method so that every sequenceData object is + * unique + */ + public String toString() { + if (sequenceName == null) + return "Character sequence"; + return sequenceName + "(Character sequence)"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DNASequenceData.java b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DNASequenceData.java new file mode 100755 index 0000000..372cefa --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DNASequenceData.java @@ -0,0 +1,153 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.sequence; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +/** + *

+ * Title: DNASequenceData + *

+ * + *

+ * Description: This class holds instances of DNA sequences + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class DNASequenceData extends CharSequenceData { + + public DNASequenceData() { + super(); + + } + + /** + * Creates an InfoContentSequenceData object from parent SequenceData. + * + * @param parent + * SequenceData + */ + + public DNASequenceData(SequenceData parent) { + super(parent); + } + + public void readDataFromString(String str) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < str.length(); i++) { + char c = Character.toLowerCase(str.charAt(i)); + if (c >= 'a' && c <= 'z') + sb.append(c); + } + + data = sb.toString().toCharArray(); + } + + public boolean writeDataToFile(File file) { + try { + Sequence aDNA = new Sequence(Alphabet.DNA4(), data, sequenceName); + SequenceOutputStream out = new SequenceOutputStream( + new FileOutputStream(file)); + + aDNA.print(out); + + out.close(); + + return true; + } catch (Exception e) { + e.printStackTrace(); + } + + return false; + + } + + /** + * Reads DNA sequence stored in file given as a parameter and stores that + * sequence in a character array. 
+ * + * @param sequenceFile + * String + * @return int + */ + public int readDataFromFile(String sequenceFile) throws IOException { + File file = new File(sequenceFile); + addHistory(file); + + // set sequence of infoContent panel + Sequence seq = SequenceReader.getReader(sequenceFile) + .nextSequence(null);// (filename)IOTools.read(args[0]); + if (seq == null) { + System.err.println("Unable to read sequence file"); + } + + data = seq.charSequence(); + return data.length; + } + + /** + * Function to return data stored in CharSequenceData as an array of + * Character objects, DNA sequences are composed of characters A, G, T, C + * + * @return Character[] + */ + public Object[] getData() { + + Character[] tempData = new Character[data.length]; + for (int i = 0; i < data.length; i++) { + if (data[i] == 'a' || data[i] == 'A') + tempData[i] = 'a'; + else if (data[i] == 'g' || data[i] == 'G') + tempData[i] = 'g'; + else if (data[i] == 't' || data[i] == 'T') + tempData[i] = 't'; + else if (data[i] == 'c' || data[i] == 'C') + tempData[i] = 'c'; + else + tempData[i] = new Character(data[i]); + } + + return tempData; + } + + public String getProperty() { + return "DNA sequence of " + data.length + " nucleotides"; + } + + /* + * need to modify toString() method so that every sequenceData object is + * unique + */ + public String toString() { + if (sequenceName == null) + return "DNA Sequence"; + return sequenceName + "(DNA)"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DoubleSequenceData.java b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DoubleSequenceData.java new file mode 100755 index 0000000..4fa56a4 --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/DoubleSequenceData.java @@ -0,0 +1,168 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of 
the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.sequence; + +import java.io.*; + +import japsa.bio.misc.common.NumericalSequence; + +/** + *

+ * Title: DoubleSequenceData + *

+ * + *

+ * Description: This class holds values assigned to bases throughout a sequence. + * Values are represented as doubles and are stored in an array of fixed length. + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + * @author Julie Bernal + * @version 1.0 + */ +public class DoubleSequenceData extends SequenceData { + private double data[]; + + /** + * Creates a DoubleSequenceData to hols a maximum of given doubles. + * + * @param sequenceLen + * int + */ + public DoubleSequenceData() { + super(); + data = new double[0]; + } + + /** + * Creates a DoubleSequenceData object from parent SequenceData. + * DoubleSequenceData is set to hold the same number of objects as parent. + * + * @param parent + * SequenceData + */ + + public DoubleSequenceData(SequenceData parent) { + super(parent); + data = new double[0]; + } + + public String getProperty() { + return "Numerical sequence of " + data.length + " numbers"; + } + + /** + * Returns an array with data stored in DoubleSequenceData object. + * + * @return double[] + */ + public double[] getDoubleData() { + return (double[]) data.clone(); + } + + /** + * Sets data[] to be a copy of given doubleData array + * + * @param doubleData + * double[] + */ + public void setDoubleData(double[] doubleData) { + data = doubleData; + } + + /** + * Function to return data stored in DoubleSequenceData as an array of + * Double objects + * + * @return Double[] + */ + public Object[] getData() { + Double[] tempData = new Double[data.length]; + for (int i = 0; i < data.length; i++) + tempData[i] = new Double(data[i]); + + return tempData; + } + + /** + * Function to set the data stored in a DoubleSequenceData object given an + * array of Double objects + * + * @param newData + * Double[] + */ + public void setData(Object[] newData) { + + data = new double[newData.length]; + for (int i = 0; i < newData.length; i++) + data[i] = ((Double) newData[i]).doubleValue(); + } + + /** + * readDataFromFile: This function reads output in a file from a compression + * model on a sequence. Points are stored in an array of fixed length, + * infoContent, to access elements of the array an integer is used. 
+ */ + public int readDataFromFile(String filename) { + + try { + // Open a file of the given name. + File file = new File(filename); + addHistory(file); + data = NumericalSequence.read(filename); + } catch (Exception e) { + e.printStackTrace(); + } + System.out.println("graph created!" + data.length); + return data.length; + } + + public boolean writeDataToFile(File file) { + return (new NumericalSequence(data)).writeDataToFile(file); + /******************************************* + * try{ PrintWriter pw = new PrintWriter(new FileOutputStream(file)); + * pw.println("# Double data written by DNAGraphTool"); for (int i = 0; + * i < data.length; i++){ pw.println(i + "\t" + data[i]); } + * + * pw.close(); }catch (Exception e){ e.printStackTrace(); } + * + * return true; / + *******************************************/ + + } + + public int size() { + return data.length; + } + + /* + * need to modify toString() method so that every sequenceData object is + * unique + */ + public String toString() { + if (sequenceName == null) + return "Numerical sequence"; + return sequenceName + "(Numerical sequence)"; + } + +} diff --git a/src/main/java/japsa/bio/misc/dnaPlatform/sequence/SequenceData.java b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/SequenceData.java new file mode 100755 index 0000000..56562cd --- /dev/null +++ b/src/main/java/japsa/bio/misc/dnaPlatform/sequence/SequenceData.java @@ -0,0 +1,196 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.bio.misc.dnaPlatform.sequence; + +import java.util.*; +import java.io.*; + +/** + *

+ * Title: SequenceData + *

+ * + *

+ * Description: This is an abstract class to represent all types of sequences + * within the DNAPlatform. Sequences hold actual sequence data and a vector + * containing references to objects used to create them. + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + * Walked through my Minh Duc Cao 14/11/2007 + * + * @author Julie Bernal + * @version 1.0 + */ +@SuppressWarnings("rawtypes") +public abstract class SequenceData implements Cloneable { + + // Vector to hold references to objects used to create current sequence data + // Objects used to create data include sequenceFiles, modelHandles and + // Functions + protected Vector constructor; + protected String sequenceName; + protected SequenceData myParent; + + /** + * Creates an instance of SequenceData. + * + */ + public SequenceData() { + constructor = new Vector(); + myParent = null; + } + + /** + * Creates an instance of SequenceData from parent SequenceData, parent + * history is added to new instance. + * + * @param parent + * SequenceData + */ + + public SequenceData(SequenceData parent) { + // constructor = new Vector(); + constructor = parent.getHistory(); + myParent = parent; + } + + /** + * method returns the sequenceName of the SequenceData + * + * @return String sequence name + * + */ + public String getSequenceName() { + return sequenceName; + } + + /** + * method sets the sequenceName + * + * @param String + * sequence name + * + */ + public void setSequenceName(String name) { + sequenceName = name; + } + + public SequenceData getParentSequence() { + return myParent; + } + + /* + * The name of sequence file is a String in Vector constructor + * + * @return String + */ + public File getSequenceFile() { + Iterator i = constructor.iterator(); + while (i.hasNext()) { + Object o = i.next(); + if (o instanceof File) { + return (File) o; + } + } + return null; + } + + /** + * This method adds a new object to the history of sequence + * + * @param o + * object + */ + public void addHistory(Object o) { + constructor.add(o); + } + + /** + * This method returns a vector holding references to objects used to create + * the sequence. 
+ * + * @return Vector + */ + public Vector getHistory() { + Vector history = new Vector(); + history.add(this); + history.add(constructor); + return history; + } + + /** + * Returns clone of current sequence data object. + * + * @return SequenceData + */ + // Do i need to clone??? + public SequenceData getNewSequenceData() { + SequenceData newSequenceData = null; + try { + newSequenceData = (SequenceData) this.clone(); + newSequenceData.constructor = this.getHistory(); + } catch (Exception ex) { + System.err.println(ex); + ex.printStackTrace(); + } + return newSequenceData; + } + + /** + * Function to return data stored in SequenceData object. + * + * @return Object[] + */ + public abstract Object[] getData(); + + /** + * Function to set the data stored in a SequenceData object. + * + * @param newData + * Object[] + */ + public abstract void setData(Object[] newData); + + /* + * need to modify toString() method so that every sequenceData object is + * unique + */ + public String toString() { + if (sequenceName == null) + return "Sequence"; + return sequenceName + "(Sequence)"; + } + + /* + * All sequence data objects should know how to read sequences from files + */ + public abstract int readDataFromFile(String file) throws IOException; + + public abstract boolean writeDataToFile(File file); + + public abstract String getProperty(); + + public abstract int size(); + +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/ExactMatches.java b/src/main/java/japsa/bio/misc/fuzzyLZ/ExactMatches.java new file mode 100755 index 0000000..3b964bf --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/ExactMatches.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the + * compression of DNA sequeces. It can be viewed as a + * compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain + * inserts/deletes/mismatches. 
+ * + */ + +package japsa.bio.misc.fuzzyLZ; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.Serializable; + +/** + * This class implements a hashing scheme to find matches of a fixed size within + * a String + * + * @author David Powell + * @version 24/5/2001 + */ + +public class ExactMatches implements Serializable { + private static final long serialVersionUID = 1L; + + // public static class Convert implements Serializable { + public static class Convert implements Serializable { + private static final long serialVersionUID = 1L; + + public String conv(String s) { + return s; + } + + public char conv(char a) { + return a; + } + } + + public static class Reverse_Complement_DNA extends Convert { + private static final long serialVersionUID = 1L; + + public String conv(String s) { + // return (new StringBuffer(s)).reverse().toString(); + char c[] = new char[s.length()]; + for (int i = 0; i < s.length(); i++) { + char a = s.charAt(i); + a = conv(a); + c[s.length() - i - 1] = a; + } + return new String(c); + }; + + public char conv(char a) { + switch (a) { + case 'A': + a = 'T'; + break; + case 'T': + a = 'A'; + break; + case 'G': + a = 'C'; + break; + case 'C': + a = 'G'; + break; + case 'a': + a = 't'; + break; + case 't': + a = 'a'; + break; + case 'g': + a = 'c'; + break; + case 'c': + a = 'g'; + break; + default: + System.err.println("WARNING: unknown DNA character " + a + + " in Reverse_Complement_DNA (char unchanged)"); + return a; + } + return a; + } + } + + // Internal representation of a linked-list of ints. 
+ public static class MyList implements Serializable { + private static final long serialVersionUID = 1L; + + public static class L implements Serializable { + private static final long serialVersionUID = 1L; + L next; + + int val; + + L(int v) { + next = null; + val = v; + } + } + + L start, end; + + MyList(int v) { + L n = new L(v); + start = end = n; + } + + void add(int v) { + L n = new L(v); + end.next = n; + end = n; + } + } + + private static class MyHash implements Serializable { + private static final long serialVersionUID = 1L; + + private static class HashChain implements Serializable { + private static final long serialVersionUID = 1L; + HashChain next; + + MyList intList; + } + + int hSize = 1048573; + + HashChain hTable[]; + + char[] str; + + int winSize; + + MyHash(char[] str, int winSize) { + this.str = str; + this.winSize = winSize; + hTable = new HashChain[hSize]; + } + + private int hash(char[] s, int p) { + int res = 0; + for (int i = 0; i < winSize; i++) { + res = (res << 1) + res + s[p + i]; + } + return Math.abs(res) % hSize; + } + + private boolean equal(char[] s1, int p1, char[] s2, int p2) { + for (int i = 0; i < winSize; i++) + if (s1[p1 + i] != s2[p2 + i]) + return false; + return true; + } + + MyList get(String s) { + return get(s.toCharArray(), 0); + } + + MyList get(char[] s, int ipos) { + int i = hash(s, ipos); + HashChain c = hTable[i]; + // Check if any of the strings at this hash bucket match the string + // s + while (c != null) { + int pos = c.intList.start.val; // Get the string posSrc of the + // first. + if (equal(s, ipos, str, pos)) { + // A match! 
+ return c.intList; + } + c = c.next; + } + return null; + } + + void put(int pos) { + MyList l = get(str, pos); + if (l == null) { + // First occurance of this string + HashChain c = new HashChain(); + c.intList = new MyList(pos); + + int i = hash(str, pos); + c.next = hTable[i]; + hTable[i] = c; + } else { + l.add(pos); + } + + } + } + + char[] str; + + int winSize; + + int strLen; + + MyHash h; + + ExactMatches(char[] str, int winSize) { + this.str = str; + this.winSize = winSize; + + strLen = str.length; + + if (FuzzyLZ.DEBUG >= 2) + System.err.println("Constructing table of repeats..."); + + h = new MyHash(str, winSize); + for (int i = 0; i < strLen + 1 - winSize; i++) { + h.put(i); + } + + if (FuzzyLZ.DEBUG >= 2) + System.err.println("Done constructing table of repeats."); + } + + // Write our own serization handler. We will _not_ save the hash table. + // Only the string, and recompute the hash table on reload + private void writeObject(java.io.ObjectOutputStream out) throws IOException { + out.writeObject(str); + out.writeInt(winSize); + out.writeInt(strLen); + } + + private void readObject(java.io.ObjectInputStream in) throws IOException, + ClassNotFoundException { + str = (char[]) in.readObject(); + winSize = in.readInt(); + strLen = in.readInt(); + + if (FuzzyLZ.DEBUG >= 2) + System.err.println("Re-Constructing table of repeats..."); + + h = new MyHash(str, winSize); + for (int i = 0; i < strLen + 1 - winSize; i++) { + h.put(i); + } + + if (FuzzyLZ.DEBUG >= 2) + System.err.println("Done re-constructing table of repeats."); + } + + MyList get(char[] s, int pos) { + return h.get(s, pos); + } + + MyList get(String s) { + return h.get(s); + } + + public long count_hits(Convert convert) { + long count = 0; + for (int i = 0; i < strLen + 1 - winSize; i++) { + String s = new String(str, i, winSize); + MyList l = h.get(convert.conv(s)); + if (l != null) { + for (MyList.L l2 = l.start; l2 != null && l2.val < i; l2 = l2.next) { + count++; + } + } + } + return 
count; + } + + public void dispAll(Convert convert) { + for (int i = 0; i < strLen + 1 - winSize; i++) { + String s = new String(str, i, winSize); + MyList l = h.get(convert.conv(s)); + if (l != null) { + for (MyList.L l2 = l.start; l2 != null && l2.val < i; l2 = l2.next) { + System.out.println("String at " + i + " has a match at " + + l2.val); + System.out.println(" :" + s + ":" + + new String(str, l2.val, winSize)); + } + } + } + } + + @SuppressWarnings("unused") + public static void main(String args[]) { + int len = Integer.valueOf(args[0]).intValue(); + char[] input = null; + + try { + StringBuffer s = new StringBuffer(); + byte[] buf = new byte[1024]; + int n; + while ((n = System.in.read(buf)) >= 0) { + s.append(new String(buf, 0, n)); + } + input = s.toString().toCharArray(); + } catch (Exception e) { + System.err.println("Unable to read stdin"); + System.exit(1); + } + + System.out.println("String length = " + input.length); + + ExactMatches m = new ExactMatches(input, len); + // System.out.println("Number of matches = "+m.count()); + + if (true) { + // m.dispAll(new Convert()); + m.dispAll(new Reverse_Complement_DNA()); + } else { + try { + File f = new File("ExactMatches.store"); + ObjectOutputStream oos = new ObjectOutputStream( + new FileOutputStream(f)); + oos.writeObject(m); + oos.close(); + } catch (Exception e) { + System.err.println("Error writing file: " + e); + } + } + } +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyDriver.java b/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyDriver.java new file mode 100755 index 0000000..02a7ebc --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyDriver.java @@ -0,0 +1,940 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the compression of DNA sequeces. + * It can be viewed as a compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain inserts/deletes/mismatches. 
+ * + */ + +package japsa.bio.misc.fuzzyLZ; + +import japsa.bio.misc.common.MarkovN; +import japsa.bio.misc.common.Misc; +import japsa.bio.misc.common.Params; +import japsa.bio.misc.common.Seq_Model; +import japsa.bio.misc.dnaPlatform.OptionsHandle; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.PrintStream; +import java.io.Serializable; +import java.util.Vector; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class FuzzyDriver implements Serializable { + private static final long serialVersionUID = 1L; + + static final String VERSION = "1.3"; + + private static final double CONVERGE_CUTOFF = 0.0001; + + // If any object variables are added here, be use to think about + // serialisation of them... + // if the variable is needed after a 'resume', add the variable to the + // writeObject() and readObject() functions. + + int maxIterations; + + char alphabet[]; + + int DEBUG; + + String fname; + + int imageFreq; + + int checkpointFreq; + + int statsFreq; + + String msgFname; + + String outDir; + + String fprefix; + + // FileWriter msgFile; + PrintStream msgOut; + + Params params; + + char[] str; + + char[] preStr; + + char[] joinedStr; // Simple concatenation of preStr + str + + String seqModelStr; // Sequence model for base state. + + boolean overwrite; // Overwrite files + + static CommandLine defaultCmdLine = new CommandLine(); + static { + defaultCmdLine.addInt("maxIterations", 1, + "Max number of iterations. 
(0 means until convergence)"); + defaultCmdLine.addString("preFile", "", + "Prepend 'preFile' to sequence."); + defaultCmdLine + .addString( + "seqModel", + "markov(2)", + "Base sequence model. Use 'markov(n)' for a n-th order markov model, n=-1 for uniform"); + defaultCmdLine.addString("dna", "atgc", + "Alphabet used by the sequence."); + + defaultCmdLine.addString("fwdMach", "3state", + "Comma separated list of machines to use for forward matches.\n" + + "(Use an empty string '' for no forward machines.)\n" + + "Supported: 1state,3state"); + + defaultCmdLine.addString("revMach", "3state", + "Comma separated list of machines to use for reverse matches.\n" + + "(Use an empty string '' for no reverse machines.)\n" + + "Supported: 1state,3state"); + + defaultCmdLine.addBoolean("overwrite", false, "Overwrite msglen file."); + defaultCmdLine.addInt("debug", 2, + "Debug level (higher gives more verbose )"); + defaultCmdLine + .addInt("imageSize", 1024, "Maximum Image size in pixels"); + defaultCmdLine.addInt("imageFreq", 0, + "Save an image every seconds. (0 - to disable)"); + defaultCmdLine.addInt("checkFreq", 0, + "Save a checkpoint every seconds. (0 - to disable)"); + defaultCmdLine.addInt("statsFreq", 300, + "Display some stats every seconds. (0 - to disable)"); + + defaultCmdLine.addString("msgFile", "", + "Output file for encode length of each character.\n" + + "(The default is based on the input file name)"); + + defaultCmdLine.addString("outDir", "." + File.separatorChar, + "Directory to save output files in."); + + // These options are for Matches_Sparse + defaultCmdLine + .addInt("hashSize", 10, + "Window size to use for constructing hashtable (0 - for full N^2 algorithm)"); + defaultCmdLine.addInt("computeWin", 10, + "Number of cells to activate on a hashtable hit"); + defaultCmdLine + .addInt("cutML", 4, + "When (cell_value - base_cell > cutML) then cell is killed. 
(in bits)"); + // defaultCmdLine.addBoolean("plotActive", false, "true: plot only + // active + // cells, false: plot cell values"); + + defaultCmdLine + .addString("paramFile", "", + "Parameter file to read for various model parameters (see docs)"); + + defaultCmdLine.addBoolean("resume", false, "Resume from a checkpoint"); + } + + @SuppressWarnings("unchecked") + public static void main(String args[]) throws Exception { + CommandLine cmdLine = FuzzyDriver.defaultCmdLine; + + args = cmdLine.parseLine(args); + + System.out + .println("Approximate Repeat Model for DNA sequence compression"); + + System.out.println(" L. Allison, T. Edgoose, T. I. Dix."); + System.out + .println(" Compression of Strings with Approximate Repeats"); + System.out.println(" Intell. Sys. in Molec. Biol., pp.8-16, 1998\n"); + + if (args == null || args.length != 1) { + System.err + .println("Usage: java " + FuzzyDriver.class.getName() + + " [options] \n" + + cmdLine.usageMessage()); + System.exit(1); + } + + boolean resume = cmdLine.getBooleanVal("resume"); + + FuzzyDriver me = null; + + // Start compression + if (resume) { + // Resuming from a checkpoint + try { + System.out.println("Reloading from checkpoint..."); + File f = new File(args[0]); + ObjectInputStream is = new ObjectInputStream( + new FileInputStream(f)); + me = (FuzzyDriver) is.readObject(); + System.out.println("Successfully loaded checkpoint..."); + is.close(); + } catch (Exception e) { + System.err.println("Unable to resume from checkpoint: " + e); + System.exit(1); + } + + // use defaults. 
+ if (cmdLine.optionSet("maxIterations")) + me.maxIterations = cmdLine.getIntVal("maxIterations"); + if (cmdLine.optionSet("imageFreq")) + me.imageFreq = cmdLine.getIntVal("imageFreq"); + if (cmdLine.optionSet("checkFreq")) + me.checkpointFreq = cmdLine.getIntVal("checkFreq"); + if (cmdLine.optionSet("statsFreq")) + me.statsFreq = cmdLine.getIntVal("statsFreq"); + if (cmdLine.optionSet("imageSize")) + FuzzyLZ.img_width = FuzzyLZ.img_height = cmdLine + .getIntVal("imageSize"); + if (cmdLine.optionSet("debug")) { + me.DEBUG = cmdLine.getIntVal("debug"); + FuzzyLZ.DEBUG = me.DEBUG; + } + + }// end if resume + else {// start from scratch + // Starting anew (no checkpoint) + me = new FuzzyDriver(); + me.params = new Params(); + + me.maxIterations = cmdLine.getIntVal("maxIterations"); + me.alphabet = cmdLine.getStringVal("dna").toCharArray(); + me.DEBUG = cmdLine.getIntVal("debug"); + FuzzyLZ.DEBUG = me.DEBUG; + FuzzyLZ.img_width = FuzzyLZ.img_height = cmdLine + .getIntVal("imageSize"); + me.imageFreq = cmdLine.getIntVal("imageFreq"); + me.checkpointFreq = cmdLine.getIntVal("checkFreq"); + me.statsFreq = cmdLine.getIntVal("statsFreq"); + me.msgFname = cmdLine.getStringVal("msgFile"); + me.outDir = cmdLine.getStringVal("outDir"); + me.overwrite = cmdLine.getBooleanVal("overwrite"); + Matches_Sparse.def_winSize = cmdLine.getIntVal("hashSize"); + Matches_Sparse.def_computeWin = cmdLine.getIntVal("computeWin"); + Matches_Sparse.def_cutML = cmdLine.getIntVal("cutML"); + + // Matches_Sparse.def_plotActive = + // cmdLine.getBooleanVal("plotActive"); + + String paramFile = cmdLine.getStringVal("paramFile"); + if (paramFile.length() > 0) { + try { + BufferedReader rdr = new BufferedReader(new FileReader( + paramFile)); + String line; + while ((line = rdr.readLine()) != null) { + int i = line.indexOf('='); + if (i >= 0) { + String key = line.substring(0, i); + String valStr = line.substring(i + 1); + double val = Double.parseDouble(valStr); + me.params.put(key, val); + } else { + 
if (line.length() > 0 && !line.startsWith("#")) + System.err + .println("WARNING: Ignoring param line '" + + line + "'"); + } + } + rdr.close(); + } catch (FileNotFoundException e) { + System.err.println("ERROR: Unable to read file '" + + paramFile + "'. Ignoring..."); + } catch (NumberFormatException e) { + System.err + .println("ERROR: Converting string from paramFile to number :" + + e); + me.params = new Params(); + } catch (IOException e) { + System.err.println("ERROR: Reading file '" + paramFile + + "'. Ignoring..."); + me.params = new Params(); + } + } + + me.seqModelStr = cmdLine.getStringVal("seqModel"); + + @SuppressWarnings("rawtypes") + Vector machs = new Vector(); + try { + FuzzyLZ.def_numFwd = parseMachineNames( + cmdLine.getStringVal("fwdMach"), machs); + FuzzyLZ.def_numRev = parseMachineNames( + cmdLine.getStringVal("revMach"), machs); + } catch (Exception e) { + System.err.println(e); + System.exit(1); + } + + FuzzyLZ.MutationModels = + (String[]) machs.toArray(new String[machs.size()]); + + String preFile = cmdLine.getStringVal("preFile"); + if (preFile == "") { + me.preStr = new char[0]; + } else { + Sequence seq = SequenceReader.getReader(preFile).nextSequence( + null);// (filename)IOTools.read(args[0]); + if (seq == null) { + System.err.println("Unable to read prefix file: '" + + preFile + "'"); + System.exit(1); + } + me.preStr = seq.charSequence(); + + for (int i = 0; i < me.preStr.length; i++) + me.preStr[i] = Character.toLowerCase(me.preStr[i]); + + Misc.printf("Pre-sequence length = %d\n", me.preStr.length); + } + + me.fname = args[0]; + Sequence seq = SequenceReader.getReader(me.fname) + .nextSequence(null);// (filename)IOTools.read(args[0]); + if (seq == null) { + System.err.println("Unable to read sequence file"); + System.exit(1); + } + me.str = seq.charSequence(); + for (int i = 0; i < me.str.length; i++) + me.str[i] = Character.toLowerCase(me.str[i]); + + Misc.printf("Sequence length = %d\n", me.str.length); + + // Display the 
machines we are going to use + for (int i = 0; i < FuzzyLZ.def_numFwd; i++) + System.err.println("FwdMachine[" + i + "]: " + + FuzzyLZ.MutationModels[i]); + for (int i = 0; i < FuzzyLZ.def_numRev; i++) + System.err.println("RevMachine[" + i + "]: " + + FuzzyLZ.MutationModels[i + FuzzyLZ.def_numFwd]); + + // Create joinedStr + me.joinedStr = new char[me.preStr.length + me.str.length]; + System.arraycopy(me.preStr, 0, me.joinedStr, 0, me.preStr.length); + System.arraycopy(me.str, 0, me.joinedStr, me.preStr.length, + me.str.length); + + // Get the file prefix to use for output filenames + me.fprefix = me.outDir; + if (me.fprefix.length() > 0) + me.fprefix += File.separatorChar; + me.fprefix += (new File(me.fname)).getName(); + + // Open the file for msglen output + // try { + // File f; + if (me.msgFname.equals("")) + me.msgFname = me.fprefix + "-msglen.txt"; + // f = new File(me.msgFname); + // if (me.overwrite) + // f.delete(); + + // if (f.exists()) { + // System.err.println("Output file '" + me.msgFname + // + "' already exists."); + // System.exit(1); + // } + // if (!f.createNewFile()) { + // System.err.println("Unable to create Output file '" + // + me.msgFname); + // System.exit(1); + // } + // + // me.msgFile = new FileWriter(f); + // Create new file output stream + // me.msgOut = new PrintStream(f); + // } + // catch (IOException e) { + // System.err.println("Error creating msglen output file: " + e); + // System.exit(1); + // } + } + try { + me.go(resume); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * This Function is called by FuzzyModel to run FuzzyLZ given a char[] + * instead of reading sequence from a file. 
+ * + * @param myOptions + * OptionsHandle + * @param myFile + * String + * @param sequence + * char[] + * @throws RuntimeException + * @throws IOException + * @return filename of Information content sequence created + */ + @SuppressWarnings("unchecked") + public String start(OptionsHandle myOptions, String myFile, char[] sequence) + throws RuntimeException, IOException { + + String resume = myOptions.getStringValue("resume"); + + FuzzyDriver me = null; + + if (resume.length() > 0) { + // Resuming from a checkpoint + try { + System.out.println("Reloading from checkpoint..."); + File f = new File(myOptions.getStringValue("resume")); + ObjectInputStream is = new ObjectInputStream( + new FileInputStream(f)); + me = (FuzzyDriver) is.readObject(); + System.out.println("Successfully loaded checkpoint..."); + is.close(); + } catch (Exception e) { + throw new IOException("Unable to resume from checkpoint: " + e); + } + // We can re-set some of the parameters here. + // Only reset ones that are defined on this commandline, ie. don't + // use defaults. 
+ if (myOptions.optionSet("maxIterations")) + me.maxIterations = myOptions.getIntValue("maxIterations"); + if (myOptions.optionSet("imageFreq")) + me.imageFreq = myOptions.getIntValue("imageFreq"); + if (myOptions.optionSet("checkFreq")) + me.checkpointFreq = myOptions.getIntValue("checkFreq"); + if (myOptions.optionSet("statsFreq")) + me.statsFreq = myOptions.getIntValue("statsFreq"); + if (myOptions.optionSet("imageSize")) + FuzzyLZ.img_width = FuzzyLZ.img_height = myOptions + .getIntValue("imageSize"); + if (myOptions.optionSet("debug")) { + me.DEBUG = myOptions.getIntValue("debug"); + FuzzyLZ.DEBUG = me.DEBUG; + } + + } else { + // Starting anew (no checkpoint) + me = new FuzzyDriver(); + me.params = new Params(); + + me.maxIterations = myOptions.getIntValue("maxIterations"); + me.alphabet = myOptions.getStringValue("dna").toCharArray(); + me.DEBUG = myOptions.getIntValue("debug"); + FuzzyLZ.DEBUG = me.DEBUG; + FuzzyLZ.img_width = FuzzyLZ.img_height = myOptions + .getIntValue("imageSize"); + me.imageFreq = myOptions.getIntValue("imageFreq"); + me.checkpointFreq = myOptions.getIntValue("checkFreq"); + me.statsFreq = myOptions.getIntValue("statsFreq"); + me.msgFname = myOptions.getStringValue("msgFile"); + me.outDir = myOptions.getStringValue("outDir"); + me.overwrite = myOptions.getBooleanValue("overwrite"); + Matches_Sparse.def_winSize = myOptions.getIntValue("hashSize"); + Matches_Sparse.def_computeWin = myOptions.getIntValue("computeWin"); + Matches_Sparse.def_cutML = myOptions.getIntValue("cutML"); + // Matches_Sparse.def_plotActive = + // myOptions.getBooleanValue("plotActive"); + + String paramFile = myOptions.getStringValue("paramFile"); + if (paramFile.length() > 0) { + try { + BufferedReader rdr = new BufferedReader(new FileReader( + paramFile)); + String line; + while ((line = rdr.readLine()) != null) { + int i = line.indexOf('='); + if (i >= 0) { + String key = line.substring(0, i); + String valStr = line.substring(i + 1); + double val = 
Double.parseDouble(valStr); + me.params.put(key, val); + } else { + if (line.length() > 0 && !line.startsWith("#")) + System.err + .println("WARNING: Ignoring param line '" + + line + "'"); + } + } + rdr.close(); + } catch (FileNotFoundException e) { + System.err.println("ERROR: Unable to read file '" + + paramFile + "'. Ignoring..."); + } catch (NumberFormatException e) { + System.err + .println("ERROR: Converting string from paramFile to number :" + + e); + me.params = new Params(); + } catch (IOException e) { + System.err.println("ERROR: Reading file '" + paramFile + + "'. Ignoring..."); + me.params = new Params(); + } + } + + me.seqModelStr = myOptions.getStringValue("seqModel"); + + @SuppressWarnings("rawtypes") + Vector machs = new Vector(); + FuzzyLZ.def_numFwd = parseMachineNames( + myOptions.getStringValue("fwdMach"), machs); + FuzzyLZ.def_numRev = parseMachineNames( + myOptions.getStringValue("revMach"), machs); + FuzzyLZ.MutationModels = + (String[]) machs.toArray(new String[machs.size()]); + + String preFile = myOptions.getStringValue("preFile"); + if (preFile == "") { + me.preStr = new char[0]; + } else { + Sequence seq = SequenceReader.getReader(preFile).nextSequence( + null);// (filename)IOTools.read(args[0]); + if (seq == null) { + throw new IOException("Unable to read prefix file: '" + + preFile + "' "); + } + me.preStr = seq.charSequence(); + + for (int i = 0; i < me.preStr.length; i++) + me.preStr[i] = Character.toLowerCase(me.preStr[i]); + + Misc.printf("Pre-sequence length = %d\n", me.preStr.length); + } + + me.fname = myFile; + /* + * DNA japsa.seq = DNA.guess_format(me.fname); if (japsa.seq == + * null) { System.err.println("Unable to read sequence file"); + * System.exit(1); } + */ + me.str = sequence; + Misc.printf("Sequence length = %d\n", me.str.length); + + // Display the machines we are going to use + for (int i = 0; i < FuzzyLZ.def_numFwd; i++) + System.err.println("FwdMachine[" + i + "]: " + + FuzzyLZ.MutationModels[i]); + for (int i 
= 0; i < FuzzyLZ.def_numRev; i++) + System.err.println("RevMachine[" + i + "]: " + + FuzzyLZ.MutationModels[i + FuzzyLZ.def_numFwd]); + + // Create joinedStr + me.joinedStr = new char[me.preStr.length + me.str.length]; + System.arraycopy(me.preStr, 0, me.joinedStr, 0, me.preStr.length); + System.arraycopy(me.str, 0, me.joinedStr, me.preStr.length, + me.str.length); + + // Get the file prefix to use for output filenames + me.fprefix = me.outDir; + if (me.fprefix.length() > 0) + me.fprefix += File.separatorChar; + me.fprefix += (new File(me.fname)).getName(); + + // Open the file for msglen output + try { + File f; + if (me.msgFname.equals("")) + me.msgFname = me.fprefix + "-msglen.txt"; + f = new File(me.msgFname); + if (me.overwrite) + f.delete(); + if (f.exists()) { + throw new IOException("Output file '" + me.msgFname + + "' already exists."); + } + if (!f.createNewFile()) { + throw new IOException("Unable to create Output file '" + + me.msgFname); + } + // me.msgFile = new FileWriter(f); + me.msgOut = new PrintStream(f); + } catch (IOException e) { + throw new IOException("Error creating msglen output file: " + e); + } + } + me.go(resume.length() > 0); + + // return filename where information content was written + return me.msgFname; + + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + static int parseMachineNames(String l, Vector res) throws IOException { + String s[] = l.split(","); + int num = 0; + for (int i = 0; i < s.length; i++) { + if (s[i].equals("")) + continue; + if (s[i].compareToIgnoreCase("1state") == 0) { + res.add("japsa.bio.misc.common.Mutation_1State"); + } else if (s[i].compareToIgnoreCase("3state") == 0) { + res.add("japsa.bio.misc.common.Mutation_3State"); + } else { + throw new IOException("Unknown machine: '" + s[i] + "'"); + } + num++; + } + return num; + } + + // Note: these object variables must also be saved/loaded in the + // writeObject/readObject functions. 
+ + FuzzyLZ mdl; + + double tot_msglen; + double last_msglen = -1; + + int iteration; + + int inner_i; + + long iterationStartTime; // Start time + + void init_iteration() throws IOException { + if (DEBUG >= 3) { + // TODO check this pls + // S.printf("Starting (unnormalized) costs:\n%s\n", params); + } + + // Ensure Params 'p' has no funky numbers + int numParams = params.get_num(); + for (int i = 0; i < numParams; i++) { + String name = params.get_name_by_id(i); + double v = params.get(name); + if (Double.isInfinite(v) || Double.isNaN(v)) { + System.err.println("Potential parameter problem: '" + name + + "' has bad value=" + v); + } + } + + Seq_Model seqModel = parseSeqModel(seqModelStr); + if (seqModel == null) { + throw new IOException("ERROR: Unable to parse seqModel : '" + + seqModelStr + "'"); + } + + // I don't think the Buffered_Seq_Model actually + // saves any computation! + // seqModel = new Buffered_Seq_Model(seqModel); // reduces calls to + // encodeLen + + // Train the seqModel up on preStr if it has been specified + for (int i = 0; i < preStr.length; i++) { + seqModel.update(preStr[i], i); + } + + mdl = new FuzzyLZ(params, seqModel, joinedStr, alphabet.length, + preStr.length); + + tot_msglen = 0; + iterationStartTime = System.currentTimeMillis(); + } + + /** + * Parse a string representing a sequence model, and create the model. + * + * @param tandemRepeat + * @return The sequence model. null on error + */ + private Seq_Model parseSeqModel(String str_) { + Pattern p = Pattern.compile("markov\\((-?\\d+)\\)", + Pattern.CASE_INSENSITIVE); + Matcher m = p.matcher(str_); + if (m.matches()) { + int n = Integer.parseInt(m.group(1)); + return new MarkovN(n, alphabet); + } + + return null; + } + + void inner_loop() throws IOException { + char c = str[inner_i]; + double d = mdl.update(c, preStr.length + inner_i); + + tot_msglen += d; + Misc.my_assert(d > 0, "Bugger! 
-ve bits to encode char"); + + if (DEBUG >= 3) + System.out.printf("%c %03d: m=%.2f tot(m)=%.2f\n", c, inner_i, d, + tot_msglen); + + // try { + msgOut.printf("%c\t%7d\t%f\n", c, inner_i, d); + + // msgFile.write(VNTRReadDepth.sprintf("%s %03d %f\n", new + // VNTRReadDepth.VarArgs(c).add(inner_i).add(d))); + // msgFile.flush(); + // } + // catch (IOException e) { + // throw new IOException("Error writing to msglen output file: " + e); + // } + } + + /** + * The main function to compress + * + * @param resume + */ + void go(boolean resume) throws IOException { + if (resume) { + System.out.println("Resuming from iteration=" + iteration + + " at character=" + inner_i + " of " + str.length); + } else { + iteration = 0; + } + + long last_checkpoint = System.currentTimeMillis(); + long last_image = System.currentTimeMillis(); + long last_stats = System.currentTimeMillis(); + + for (; maxIterations == 0 || iteration < maxIterations; iteration++) { + + // Re initilise the new file to write + msgOut = new PrintStream(new FileOutputStream(this.msgFname + + iteration)); + msgOut.println("# Information content generated by Approximate Repeat Model"); + msgOut.println("# Compression of Strings with Approximate Repeats " + + "\n# L. Allison, T. Edgoose, T. I. Dix., Intell. Sys. in Molec. Biol., pp.8-16, 1998."); + + if (DEBUG >= 1) + Misc.printf("\n\nIteration %d\nParams:\n%s\n", new Object[] { + new Integer(iteration), params }); + + if (!resume) { + try { + init_iteration(); + inner_i = 0; + } catch (Exception e) { + System.err.println(e); + return; + } + } + + resume = false; + + for (; inner_i < str.length; inner_i++) { + // Display some stats? 
+ if (statsFreq > 0 + && (System.currentTimeMillis() - last_stats) / 1000 >= statsFreq) { + last_stats = System.currentTimeMillis(); + System.out.println("Stats as at " + new java.util.Date()); + System.out.println("Current compression: " + + (tot_msglen / inner_i) + " bits/char."); + + System.out.println("At " + inner_i + " of " + str.length + + " (" + (100.0 * inner_i / str.length) + "%)"); + double eta = 1.0 + * (System.currentTimeMillis() - iterationStartTime) + * str.length * str.length + / (1.0 * inner_i * inner_i); + int secs = (int) (eta / 1000); + int mins = secs / 60; + int hours = mins / 60; + System.out.println("iteration ETA: " + hours + " hrs " + + (mins % 60) + " mins " + (secs % 60) + " secs."); + mdl.display_stats(); + } + + // Save a checkpoint? + if (checkpointFreq > 0 + && (System.currentTimeMillis() - last_checkpoint) / 1000 >= checkpointFreq) { + try { + String f = fprefix + "-checkpoint-" + iteration + "-" + + inner_i + ".obj"; + if (DEBUG >= 1) + System.out.println("Saving checkpoint : " + f); + ObjectOutputStream o = new ObjectOutputStream( + new FileOutputStream(new File(f))); + o.writeObject(this); + o.close(); + } catch (IOException e) { + System.err.println("Failed to save checkpoint: " + e); + } + last_checkpoint = System.currentTimeMillis(); + } + + // Save an image? 
+ if (imageFreq > 0 + && (System.currentTimeMillis() - last_image) / 1000 >= imageFreq) { + mdl.plot.save(Misc.sprintf(fprefix + "-tmp-%02d-%07d.ppm", + new Misc.VarArgs(iteration).add(inner_i)), "Seq: " + + fname); + mdl.plotActive.save(Misc.sprintf(fprefix + + "-tmpActive-%02d-%07d.ppm", new Misc.VarArgs( + iteration).add(inner_i)), "Seq: " + fname); + mdl.plotHits.save(Misc.sprintf(fprefix + + "-tmpHits-%02d-%07d.ppm", new Misc.VarArgs( + iteration).add(inner_i)), "Seq: " + fname); + last_image = System.currentTimeMillis(); + } + // Do the work here + try { + inner_loop(); + } catch (Exception e) { + System.err.println(e); + return; + } + } + + if (DEBUG >= 0) + Misc.printf("Iteration " + iteration + + " : Total for mdl = %.4f (%.4f bits/char)\n", + tot_msglen, tot_msglen / str.length); + + if (DEBUG >= 1 || statsFreq > 0) { + System.out.println("Stats as at " + new java.util.Date()); + mdl.display_stats(); + } + + if (DEBUG >= 2) + mdl.display(); + + params = mdl.counts_to_params(); + + mdl.plot.save(Misc.sprintf(fprefix + "-final-iter%02d.ppm", + new Misc.VarArgs(iteration)), "Seq: " + fname); + mdl.plotActive.save( + Misc.sprintf(fprefix + "-finalActive-iter%02d.ppm", + new Misc.VarArgs(iteration)), "Seq: " + fname); + mdl.plotHits.save(Misc.sprintf(fprefix + "-finalHits-iter%02d.ppm", + new Misc.VarArgs(iteration)), "Seq: " + fname); + + if (last_msglen > 0) { + if (last_msglen < tot_msglen) + System.err.println("WARNING: Problem with convergence"); + else if (last_msglen - tot_msglen < CONVERGE_CUTOFF) { + System.err + .println("Converged to within " + CONVERGE_CUTOFF); + break; + } + } + last_msglen = tot_msglen; + System.out.println("On average " + (tot_msglen * 1.0 / str.length)); + this.msgOut.close(); + File f = new File(this.msgFname + iteration), f2 = new File( + this.msgFname); + copy(f, f2); + + System.out.println("=============" + + f.renameTo(new File(this.msgFname))); + // Copy this into the file + } + } + + // Copies src file to dst file. 
+ // If the dst file does not exist, it is created + static void copy(File src, File dst) throws IOException { + InputStream in = new FileInputStream(src); + OutputStream out = new FileOutputStream(dst); + + // Transfer bytes from in to out + byte[] buf = new byte[1024]; + int len; + while ((len = in.read(buf)) > 0) { + out.write(buf, 0, len); + } + in.close(); + out.close(); + } + + /** + * Used by FuzzyModel to get the image file names + * + * @return String[] + */ + public String[] getImageFileNames() { + String[] graphs = new String[3]; + graphs[0] = fprefix + "-final-iter%02d.ppm"; + graphs[1] = fprefix + "-finalActive-iter%02d.ppm"; + graphs[2] = fprefix + "-finalHits_iter%02d.ppm"; + + return graphs; + } + + // Write our own serization handler. + // Just store everything. Must re-open the msglen output file + private void writeObject(java.io.ObjectOutputStream out) throws IOException { + out.writeInt(maxIterations); + out.writeObject(alphabet); + out.writeInt(DEBUG); + out.writeObject(fname); + out.writeInt(imageFreq); + out.writeInt(checkpointFreq); + out.writeInt(statsFreq); + out.writeObject(msgFname); + out.writeObject(outDir); + + out.writeObject(fprefix); + out.writeObject(params); + out.writeObject(str); + out.writeObject(preStr); + out.writeObject(seqModelStr); + + out.writeObject(mdl); + out.writeDouble(tot_msglen); + out.writeInt(iteration); + out.writeInt(inner_i); + } + + private void readObject(java.io.ObjectInputStream in) throws IOException, + ClassNotFoundException { + maxIterations = in.readInt(); + alphabet = (char[]) in.readObject(); + DEBUG = in.readInt(); + fname = (String) in.readObject(); + imageFreq = in.readInt(); + checkpointFreq = in.readInt(); + statsFreq = in.readInt(); + msgFname = (String) in.readObject(); + outDir = (String) in.readObject(); + + fprefix = (String) in.readObject(); + params = (Params) in.readObject(); + str = (char[]) in.readObject(); + preStr = (char[]) in.readObject(); + seqModelStr = (String) 
in.readObject(); + + mdl = (FuzzyLZ) in.readObject(); + tot_msglen = in.readDouble(); + iteration = in.readInt(); + inner_i = in.readInt(); + + // re-Create joinedStr + joinedStr = new char[preStr.length + str.length]; + System.arraycopy(preStr, 0, joinedStr, 0, preStr.length); + System.arraycopy(str, 0, joinedStr, preStr.length, str.length); + + // Open the file for msglen output + try { + File f; + f = new File(msgFname); + if (!f.exists()) { + throw new IOException("Output file '" + msgFname + + "' does not exist. Unable to resume"); + } + // msgFile = new FileWriter(f, true); + msgOut = new PrintStream(f); + msgOut.println("# Resuming from checkpoint...\n"); + // msgFile.write("# Resuming from checkpoint...\n"); + } catch (IOException e) { + throw new IOException("Error creating msglen output file: " + e); + } + } +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyLZ.java b/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyLZ.java new file mode 100755 index 0000000..9d0d84e --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/FuzzyLZ.java @@ -0,0 +1,645 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the + * compression of DNA sequeces. It can be viewed as a + * compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain + * inserts/deletes/mismatches. 
+ * + */ + +package japsa.bio.misc.fuzzyLZ; + + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import japsa.bio.misc.common.Counts; +import japsa.bio.misc.common.Misc; +import japsa.bio.misc.common.Mutation_FSM; +import japsa.bio.misc.common.MyMath; +import japsa.bio.misc.common.Params; +import japsa.bio.misc.common.Seq_Model; +import japsa.bio.misc.common.Two_Seq_Model_Counts; + + + +/** + * A model of 2 DNA sequences (for alignments) with counts. Has a seperate cost + * for every pair of DNA characters. Thus 16 counts and parameters. + */ +class Model_SeqA_DNA implements Two_Seq_Model_Counts { + + /** + * + */ + private static final long serialVersionUID = 1L; + + static final int alphaSize = 4; + + char chars[] = { 'a', 't', 'g', 'c' }; + + double costs[]; + + int countIndex; + + public Model_SeqA_DNA(Params p, int alphaSize, int countIndex) { + this.countIndex = countIndex; + + Misc.my_assert(chars.length == Model_SeqA_DNA.alphaSize, + "Internal class error: char array and alphaSize don't match"); + Misc.my_assert(Model_SeqA_DNA.alphaSize == alphaSize, + "Model_SeqA_DNA only works for an dna of " + + Model_SeqA_DNA.alphaSize); + + costs = new double[alphaSize * alphaSize]; + + if (!p.exists("match_" + chars[0] + chars[0])) { + set_default_costs(); + } else { + for (int i = 0; i < alphaSize; i++) + for (int j = 0; j < alphaSize; j++) { + String s = "match_" + chars[i] + "|" + chars[j]; + costs[pair_to_int(chars[i], chars[j])] = p.get(s); + } + } + + normalize_costs(); + } + + private int pair_to_int(char a, char b) { + int i, r = 0; + for (i = 0; i < alphaSize; i++) + if (a == chars[i]) { + r = i; + break; + } + Misc.my_assert(i < alphaSize, "Unknown character in DNA sequence"); + r *= alphaSize; + + for (i = 0; i < alphaSize; i++) + if (b == 
chars[i]) { + r += i; + break; + } + Misc.my_assert(i < alphaSize, "Unknown character in DNA sequence"); + + return r; + } + + void set_default_costs() { + for (int i = 0; i < alphaSize; i++) { + for (int j = 0; j < alphaSize; j++) { + costs[pair_to_int(chars[i], chars[j])] = (i == j ? 1 : 4); + } + } + } + + void normalize_costs() { + for (int j = 0; j < alphaSize; j++) { + double sum = 0; + for (int i = 0; i < alphaSize; i++) + sum += MyMath.exp2(-costs[pair_to_int(chars[i], chars[j])]); + for (int i = 0; i < alphaSize; i++) + costs[pair_to_int(chars[i], chars[j])] += MyMath.log2(sum); + } + } + + public double encA(char a, int i) { + return MyMath.log2(alphaSize); + } + + public double encB(char a, int i) { + return 0; + } + + public double encBoth(char a, char b, int i, int j) { + // if (a==b) return -MyMath.log2(0.8); + // else return -MyMath.log2(0.2) + MyMath.log2(alphaSize-1); + return costs[pair_to_int(a, b)]; + } + + public static int required_counts() { + return alphaSize * alphaSize; + } + + public Params counts_to_params(Counts counts) { + Params par = new Params(); + + for (int j = 0; j < alphaSize; j++) { + double sum = 0; + for (int i = 0; i < alphaSize; i++) + sum += counts.counts[countIndex + + pair_to_int(chars[i], chars[j])]; + for (int i = 0; i < alphaSize; i++) + par.put("match_" + chars[i] + "|" + chars[j], + -MyMath.log2(counts.counts[countIndex + + pair_to_int(chars[i], chars[j])] + / sum)); + } + + return par; + } + + public void update_count_encA(Counts c, double w, char a, int i) { + }; + + public void update_count_encB(Counts c, double w, char a, int i) { + }; + + public void update_count_encBoth(Counts c, double w, char a, char b, int i, + int j) { + c.inc(countIndex + pair_to_int(a, b), w); + } + + public double encode_params(double N) { + return 0; + } +} + +/** + * Implements the FuzzyLZ algorithm of "Compression of Strings with Approximate + * Repeats" by L. Allison, T. Edgoose, T.I. Dix in Intell. Sys. in Mol. Biol. 
+ * '98 + */ +@SuppressWarnings("rawtypes") +public class FuzzyLZ implements Seq_Model { + /** + * + */ + private static final long serialVersionUID = 1L; + + static int DEBUG = 5; + + /** + * Parent class for forward/reverse matches. Has parameters to continue a + * match (cont_cost) and to end a match (end_cost) + */ + static abstract class Matches implements Serializable { + /** + * + */ + private static final long serialVersionUID = 1L; + + Mutation_FSM fsmType; + + Plot plot; + + Plot plotActive; + + Plot plotHits; + + int seqLen; + + char[] sequence; + + double encEnd, encContinue; + + int countIndex; + + protected final int contIndex = 0, endIndex = 1; + + Matches(Mutation_FSM fsm, Params p, int countIndex, char[] seq) { + fsmType = fsm; + this.countIndex = countIndex; + sequence = seq; + seqLen = sequence.length; + + if (!p.exists("end_cost")) { + set_default_costs(); + } else { + encEnd = p.get("end_cost"); + encContinue = p.get("cont_cost"); + } + normalize_costs(); + + plot = null; + } + + // These three really oughtn't be here. 
+ void constructHash() { + }; + + void setHash(ExactMatches h) { + }; + + ExactMatches getHash() { + return null; + } + + void setPlot(Plot p) { + plot = p; + }; + + void setPlotActive(Plot p) { + plotActive = p; + }; + + void setPlotHits(Plot p) { + plotHits = p; + }; + + public void plotVals(int i, double base) { + return; + }; + + void set_default_costs() { + + // encEnd = -MyMath.log2(0.1); + // encContinue = -MyMath.log2(0.9); + // mdc changed to the values after a few iteration + encEnd = 8.052274663541743; + encContinue = 0.0054452480179498315; + } + + void normalize_costs() { + double sum = MyMath.exp2(-encEnd) + MyMath.exp2(-encContinue); + encEnd = encEnd + MyMath.log2(sum); + encContinue = encContinue + MyMath.log2(sum); + if (DEBUG >= 2) + System.out.printf("end_cost=%f cont_cost=%f\n", encEnd, + encContinue); + System.out.println("==end_cost=" + encEnd + " cont_cost=" + + encContinue); + } + + public abstract void beginLinks(int i, double startLen, + Counts startCounts); + + public abstract double update(char a, int i, Counts retCounts); + + public abstract double msgLen(); + + public abstract double normalise(double base); + + public static int required_counts() { + return 2; + }; + + public Params counts_to_params(Counts c) { + Params p = fsmType.counts_to_params(c); + double sum = c.get(countIndex + endIndex) + + c.get(countIndex + contIndex); + p.put("end_cost", -MyMath.log2(c.get(countIndex + endIndex) / sum)); + p.put("cont_cost", + -MyMath.log2(c.get(countIndex + contIndex) / sum)); + return p; + } + } + + static int img_width = 800, img_height = 800; + + static String[] MutationModels = { "common.Mutation_3State$All" }; + + static int def_numFwd = 1, def_numRev = 1; + + Seq_Model seqModel; + + int seqLen, alphaSize, totCounts; + + char[] sequence; + + int numFwd, numRev; + + double encNoStart; + + double[] encStartMachines; + + Matches[] machines; + + double base[]; + + Counts baseCounts[]; + + double last_msgLen; + + private final int 
noStartIndex = 0; + + private int myCounts = 1; + + public Plot plot; + + public Plot plotActive; + + public Plot plotHits; + + @SuppressWarnings("unchecked") + FuzzyLZ(Params params, Seq_Model seqModel, char[] sequence, int alphaSize, + int plotStart) throws RuntimeException { + this.seqModel = seqModel; + this.sequence = sequence; + this.alphaSize = alphaSize; + seqLen = sequence.length; + numFwd = def_numFwd; + numRev = def_numRev; + + Class[] M_class = new Class[numFwd + numRev]; + Method[] M_counts = new Method[numFwd + numRev]; + Constructor[] M_new = new Constructor[numFwd + numRev]; + for (int i = 0; i < M_class.length; i++) { + try { + M_class[i] = Class.forName(MutationModels[i + % MutationModels.length]); + M_counts[i] = M_class[i].getMethod("required_counts", null); + M_new[i] = M_class[i].getConstructor(new Class[] { + Class.forName("japsa.bio.misc.common.Two_Seq_Model_Counts"), + Class.forName("japsa.bio.misc.common.Params"), Integer.TYPE, + Integer.TYPE }); + } catch (Exception e) { + throw new RuntimeException( + "Error finding class/method for class '" + + MutationModels[i % MutationModels.length] + + "' : " + e); + } + } + + encStartMachines = new double[numFwd + numRev]; + machines = new Matches[numFwd + numRev]; + + myCounts += numFwd + numRev; + int countPos = myCounts; + + int mdlCounts = Model_SeqA.required_counts(); + int matCounts = Matches.required_counts(); + totCounts = myCounts; + + int fsmCounts[] = new int[numFwd + numRev]; + + for (int i = 0; i < numFwd + numRev; i++) { + try { + fsmCounts[i] = ((Integer) M_counts[i].invoke(null, null)) + .intValue(); + } catch (Exception e) { + System.err.println("required_counts Exception : " + e); + } + totCounts += (mdlCounts + fsmCounts[i] + matCounts); + } + + if (!params.exists("nostart_cost")) { + set_default_costs(); + } else { + encNoStart = params.get("nostart_cost"); + for (int i = 0; i < numFwd + numRev; i++) + encStartMachines[i] = params.get("start" + i + "_cost"); + } + 
normalize_costs(); + + plot = new Plot(seqLen + 1, seqLen + 1, img_width, img_height, + plotStart); + plotActive = new Plot(seqLen + 1, seqLen + 1, img_width, img_height, + plotStart); + plotHits = new Plot(seqLen + 1, seqLen + 1, img_width, img_height, + plotStart); + + // Now initialise the various mutation machines + for (int m = 0; m < numFwd + numRev; m++) { + Params p = new Params(); + for (int i = 0; i < params.get_num(); i++) { + String n = params.get_name_by_id(i); + if (n.startsWith("MACHINE" + m + "_")) + p.put(n.substring(9), params.get(n)); + } + Two_Seq_Model_Counts model = new Model_SeqA(p, alphaSize, countPos); + // Two_Seq_Model_Counts model = new Model_SeqA_SubModel(seqModel, p, + // alphaSize, countPos); + + countPos += mdlCounts; + Mutation_FSM fsmType = null; + try { + fsmType = (Mutation_FSM) M_new[m] + .newInstance(new Object[] { model, p, + new Integer(totCounts), new Integer(countPos) }); + } catch (Exception e) { + System.err.println("newInstance Exception : " + e); + } + countPos += fsmCounts[m]; + machines[m] = new Matches_Sparse((m < numFwd), fsmType, p, + countPos, sequence); + countPos += matCounts; + + if (m == 0) + machines[m].constructHash(); + else + machines[m].setHash(machines[0].getHash()); + + machines[m].setPlot(plot); + machines[m].setPlotActive(plotActive); + machines[m].setPlotHits(plotHits); + + // VNTRReadDepth.printf("Starting costs:\n%s:\n", fsmType.paramsToString()); + } + Misc.my_assert(countPos == totCounts, + "Internal error: countPos!=totCounts"); + + // Setup the base states. 
+ base = new double[2]; + base[0] = 0; + + baseCounts = new Counts[2]; + baseCounts[0] = new Counts(totCounts); + baseCounts[1] = new Counts(totCounts); + + last_msgLen = 0; + } + + void set_default_costs() { + // for (int i = 0; i < numFwd + numRev; i++) + // encStartMachines[i] = -MyMath.log2(0.1 / (numFwd + numRev)); + // encNoStart = -MyMath.log2(0.9); + + // mdc changed to an good param + encNoStart = 0.001602767095846256;// -MyMath.log2(0.9) + + for (int i = 0; i < numFwd + numRev; i++) { + if (i == 0) + encStartMachines[i] = 10.422286251724662; + // else if (i == 1) encStartMachines[i] = 11.355628428955441; + else + encStartMachines[i] = 11.355705924947152;// -MyMath.log2(0.1 / + // (numFwd + + // numRev)); + } + + } + + void normalize_costs() { + double sum = MyMath.exp2(-encNoStart); + for (int i = 0; i < numFwd + numRev; i++) + sum += MyMath.exp2(-encStartMachines[i]); + + encNoStart += MyMath.log2(sum); + for (int i = 0; i < numFwd + numRev; i++) + encStartMachines[i] += MyMath.log2(sum); + + if (DEBUG >= 2) { + System.out.printf("nostart_cost=%f ", encNoStart); + for (int i = 0; i < numFwd + numRev; i++) { + System.out.printf("start" + i + "_cost=%f ", + encStartMachines[i]); + System.out.print(" == start" + i + "_cost=" + + encStartMachines[i]); + } + + System.out.println(); + } + } + + public double encodeLen(char a, int i) throws RuntimeException { + throw new RuntimeException("encodeLen not implemented here"); + } + + public double update(char aChar, int i) { + // Start base[i+1] as base[i] with noStart and the char (also the + // counts) + base[1] = base[0] + seqModel.encodeLen(aChar, i) + + (i == 0 ? 
0 : encNoStart); + baseCounts[1].duplicate(baseCounts[0]); + baseCounts[1].inc(noStartIndex, 1); + + for (int m = 0; m < numFwd + numRev; m++) { + Counts tmpCounts = new Counts(totCounts); + tmpCounts.duplicate(baseCounts[0]); + tmpCounts.inc(noStartIndex + 1 + m, 1); + + machines[m].beginLinks(i, base[0] + encStartMachines[m] + + encStartPos(i), tmpCounts); + } + + // update matches and calc their contribution to base + double msgLen = base[1]; + for (int m = 0; m < numFwd + numRev; m++) { + Counts retCounts = new Counts(totCounts); + double len = machines[m].update(aChar, i, retCounts); + + baseCounts[1].combine_with_lens(base[1], retCounts, len); + base[1] = MyMath.logplus(base[1], len); + + msgLen = MyMath.logplus(msgLen, machines[m].msgLen()); + } + + // Update the sequence model with the new character. + seqModel.update(aChar, i); + + double p = MyMath.exp2(msgLen - base[1]); + p = Math.pow(p, 0.3); + plot.putMax(i + 1, i + 1, p, p, p); + plotActive.putMax(i + 1, i + 1, p, p, p); + plotHits.putMax(i + 1, i + 1, p, p, p); + for (int m = 0; m < numFwd + numRev; m++) { + machines[m].plotVals(i + 1, msgLen); + if (DEBUG >= 4) + ((Matches_Sparse) machines[m]).display_stats(); + } + + if (msgLen > 100) { // Renormalise values so we don't lose accuracy + double factor = last_msgLen; + base[1] -= factor; + + msgLen = base[1]; + for (int m = 0; m < numFwd + numRev; m++) { + msgLen = MyMath.logplus(msgLen, machines[m].normalise(factor)); + } + last_msgLen -= factor; + // System.err.println("Normalising. 
factor="+factor+" + // last_msgLen="+last_msgLen); + } + + base[0] = base[1]; + Counts t = baseCounts[0]; + baseCounts[0] = baseCounts[1]; + baseCounts[1] = t; + + double tm = msgLen; + msgLen -= last_msgLen; + last_msgLen = tm; + + return msgLen; + } + + /** + * The start position of a copy is simply encoded from a uniform prior over + * possible start positions from [0..n] + */ + double encStartPos(int n) { + return MyMath.log2(n + 1); + } + + public Params counts_to_params() { + Counts counts = baseCounts[0]; + + Params res = new Params(); + + double sum = counts.get(noStartIndex); + // Prepend the parameters for the matches with 'MACHINEx_' + for (int m = 0; m < numFwd + numRev; m++) { + Params p = machines[m].counts_to_params(counts); + for (int i = 0; i < p.get_num(); i++) { + String n = p.get_name_by_id(i); + res.put("MACHINE" + m + "_" + n, p.get(n)); + } + + sum += counts.get(noStartIndex + 1 + m); + } + + // Finally parameters from us... + res.put("nostart_cost", -MyMath.log2(counts.get(noStartIndex) / sum)); + for (int m = 0; m < numFwd + numRev; m++) { + res.put("start" + m + "_cost", + -MyMath.log2(counts.get(noStartIndex + 1 + m) / sum)); + } + + return res; + } + + public void display() { + // VNTRReadDepth.printf("Final counts:\n%s\n", baseCounts[seqLen]); + System.out.printf("Final costs:\n%s\n", counts_to_params()); + // TODO: check this pls + } + + public void display_stats() { + for (int m = 0; m < numFwd + numRev; m++) { + ((Matches_Sparse) machines[m]).display_stats(); + } + System.out.println(""); + } + + public void saveState(String fname) { + try { + File f = new File(fname); + ObjectOutputStream oos = new ObjectOutputStream( + new FileOutputStream(f)); + oos.writeObject(this); + oos.close(); + } catch (Exception e) { + System.err.println("Error writing file: " + e); + } + } + + public static FuzzyLZ readState(String fname) { + FuzzyLZ r = null; + Object o = null; + try { + FileInputStream istream = new FileInputStream(fname); + 
ObjectInputStream p = new ObjectInputStream(istream); + o = p.readObject(); + istream.close(); + } catch (Exception e) { + System.err.println("Error reading file: " + e); + } + + r = (FuzzyLZ) o; + return r; + } + +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/Matches_Sparse.java b/src/main/java/japsa/bio/misc/fuzzyLZ/Matches_Sparse.java new file mode 100755 index 0000000..f06a610 --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/Matches_Sparse.java @@ -0,0 +1,309 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the compression of DNA sequeces. + * It can be viewed as a compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain inserts/deletes/mismatches. + * + */ + +package japsa.bio.misc.fuzzyLZ; + +import japsa.bio.misc.common.*; + +class Matches_Sparse extends FuzzyLZ.Matches { + /** + * + */ + private static final long serialVersionUID = 1L; + + static int def_winSize = 25; + + static int def_computeWin = 10; + + static int def_cutML = 4; + + Sparse b, e; + + ExactMatches hash; + + int winSize; // What window size to use for constructing the hashtable + + int computeWin; // How many cells do we activate on a hashtable hit + + double cutML; // At what message length to we kill active cells + + boolean fullN2; // Do the full N^2 algorithm + + int debug; + + boolean fwd; + + ExactMatches.Convert converter; + + long hashHits; + + double plotBrightness; + + double beginCost; + + Matches_Sparse(boolean fwd, Mutation_FSM fsm, Params p, int countIndex, + char[] seq) { + super(fsm, p, countIndex, seq); + + this.fwd = fwd; + + b = new Sparse(fsmType); + e = new Sparse(fsmType); + hash = null; + + winSize = def_winSize; + computeWin = def_computeWin; + cutML = def_cutML; + + plotBrightness = 0.5; + + fullN2 = (winSize == 0); + + converter = (fwd ? 
new ExactMatches.Convert() + : new ExactMatches.Reverse_Complement_DNA()); + + debug = 0; + hashHits = 0; + } + + void constructHash() { + if (!fullN2) { + hash = new ExactMatches(sequence, winSize); + if (FuzzyLZ.DEBUG >= 2) { + System.out.println("fwd: total hash hits=" + + hash.count_hits(new ExactMatches.Convert())); + if (!fwd) + System.out + .println("rev: total hash hits=" + + hash.count_hits(new ExactMatches.Reverse_Complement_DNA())); + } + } + } + + void setHash(ExactMatches h) { + hash = h; + } + + ExactMatches getHash() { + return hash; + } + + public void beginLinks(int i, double startLen, Counts startCounts) { + beginCost = startLen; + + // Do begin links into b, (startLen & startCounts) + if (debug > 1) + System.err.print("Active at "); + for (Sparse.Iterate iter = b.moveFwd(null); iter.o != null; iter = b + .moveFwd(iter)) { + Mutation_FSM cell = (Mutation_FSM) (iter.o); + cell.or(startLen, startCounts); + + if (plotActive != null) + plotActive.putMax(i, iter.i, (fwd ? 0.5 : 0), (fwd ? 0 : 0.5), + 0); + + if (debug > 1) + System.err.print("(" + i + "," + iter.i + ") "); + } + if (debug > 1) + System.err.println(); + } + + public double update(char aChar, int i, Counts retCounts) { + if (hash == null && !fullN2) + constructHash(); + + // Duplicate 'b' into 'e'. Discard cells with bad msgLen, and chop where + // appropriate. + if (debug > 1) + System.err.println("Before copy_cull_cut. Sparse = " + b); + e.copy_cull_cut(b, (fwd ? 1 : -1), winSize, beginCost - cutML, + computeWin); + if (debug > 1) + System.err.println("After copy_cull_cut. Sparse = " + e); + + // Add any new hash hits into 'e'. 'e' must not have useful data in it + // for this to work. 
+ if (i + 1 + winSize <= seqLen && !fullN2) { + ExactMatches.MyList l; + if (fwd) + l = hash.get(sequence, i + 1); + else { + String str = new String(sequence, i + 1, winSize); + l = hash.get(converter.conv(str)); + } + if (l != null) + for (ExactMatches.MyList.L l2 = l.start; l2 != null + && l2.val < i; l2 = l2.next) { + int hit = l2.val; + int n1 = MyMath.max2(0, hit - computeWin); + int n2 = MyMath.min2(i, hit + 1 + computeWin); + e.add(n1, n2); + + if (plotActive != null) + for (int j = n1; j < n1 + 1; j++) + plotActive.putMax(i, j, (fwd ? 1 : 0), + (fwd ? 0 : 1), 0); + if (plotHits != null) + for (int j = n1; j < n1 + 1; j++) + plotHits.putMax(i, j, (fwd ? 1 : 0), (fwd ? 0 : 1), + 0); + if (debug > 0) + System.err.println("New hash " + i + " = " + hit + " (" + + n1 + "," + n2 + ")"); + + hashHits++; + } + } + + if (fullN2) + e.add(0, i); + + if (debug > 1) + System.err.println("Before join. Sparse = " + e); + e.join(); + if (debug > 1) + System.err.println("After join. Sparse = " + e); + e.checkAlloc(); + + Misc.my_assert(e.tail == null || e.tail.end <= i, + "Cheating! Sparse end past current character"); + Misc.my_assert(e.head == null || e.head.start >= 0, + "Bugger. Sparse start < 0"); + + // Reset all 'e' cells. + for (Sparse.Iterate eIter = e.moveFwd(0, null); eIter.o != null; eIter = e + .moveFwd(eIter)) + ((Mutation_FSM) (eIter.o)).reset(); + + { + // Compute 'e' from 'b' using FSM + Sparse.Iterate eIter = null; + Sparse.Iterate bIter = fwd ? b.moveFwd(null) : b.moveRev(null); + while (bIter.o != null) { + Mutation_FSM cell, hcell, vcell, dcell; + + cell = (Mutation_FSM) (bIter.o); + hcell = (Mutation_FSM) (fwd ? b.getNext(bIter) : b + .getPrev(bIter)); + + eIter = fwd ? e.moveFwd(bIter.i, eIter, true) : e.moveRev( + bIter.i, eIter, true); + vcell = (Mutation_FSM) eIter.o; + dcell = (Mutation_FSM) (fwd ? e.getNext(eIter) : e + .getPrev(eIter)); + + int j = bIter.i; + char bChar; + if (fwd) + bChar = sequence[j]; + else { + bChar = (j > 0 ? 
converter.conv(sequence[j - 1]) : '-'); + } + + cell.calc(hcell, vcell, dcell, aChar, bChar, i, j); + + bIter = fwd ? b.moveFwd(bIter) : b.moveRev(bIter); + } + } + // Compute return base & Counts from 'e' and END_COPY + double ret_msgLen = Double.POSITIVE_INFINITY; + retCounts.zero(); + for (Sparse.Iterate eIter = e.moveFwd(0, null); eIter.o != null; eIter = e + .moveFwd(eIter)) { + Mutation_FSM cell = (Mutation_FSM) eIter.o; + + double endLen = cell.get_val() + encEnd; + + // Hack: Increment the endCopy count in the _cell_ (will undo after + // next statement) + Counts c = cell.get_counts(); + c.inc(countIndex + endIndex, 1); + retCounts.combine_with_lens(ret_msgLen, c, endLen); + c.inc(countIndex + endIndex, -1); + + ret_msgLen = MyMath.logplus(ret_msgLen, endLen); + } + + // Compute 'e' to include CONT_COPY + for (Sparse.Iterate eIter = e.moveFwd(0, null); eIter.o != null; eIter = e + .moveFwd(eIter)) { + Mutation_FSM cell = (Mutation_FSM) eIter.o; + + cell.add(encContinue, countIndex + contIndex); + } + + // Increase the age of all active elements. + e.incAge(); + + // Swap 'e' and 'b' + Sparse t = b; + b = e; + e = t; + + return ret_msgLen; + } + + public double msgLen() { + double res = Double.POSITIVE_INFINITY; + for (Sparse.Iterate bIter = b.moveFwd(0, null); bIter.o != null; bIter = b + .moveFwd(bIter)) { + Mutation_FSM cell = (Mutation_FSM) bIter.o; + + res = MyMath.logplus(res, cell.get_val()); + } + return res; + } + + public double normalise(double base) { + // Note: base must be less than _all_ vals in the cells. 
+ // Use the result of msgLen() or something less + double res = Double.POSITIVE_INFINITY; + for (Sparse.Iterate bIter = b.moveFwd(0, null); bIter.o != null; bIter = b + .moveFwd(bIter)) { + Mutation_FSM cell = (Mutation_FSM) bIter.o; + cell.normalise(base); + res = MyMath.logplus(res, cell.get_val()); + } + return res; + } + + public void plotVals(int i, double base) { + for (Sparse.Iterate bIter = b.moveFwd(0, null); bIter.o != null; bIter = b + .moveFwd(bIter)) { + Mutation_FSM cell = (Mutation_FSM) bIter.o; + double v = MyMath.exp2(base - cell.get_val()); + v = Math.pow(v, plotBrightness); + plot.putMax(i, bIter.i, (fwd ? v : 0), (fwd ? 0 : v), 0); + } + } + + public String toString() { + StringBuffer r = new StringBuffer(); + r.append((fwd ? "fwd: " : "rev: ")); + for (Sparse.Iterate bIter = b.moveFwd(0, null); bIter.o != null; bIter = b + .moveFwd(bIter)) { + Mutation_FSM cell = (Mutation_FSM) bIter.o; + r.append(bIter.i + ": " + cell.get_val() + " "); + } + return r.toString(); + } + + public void display_stats() { + String pre = (fwd ? "fwd: " : "rev: "); + System.out.println(pre + "Number of hash hits = " + hashHits); + b.display_stats(pre); + System.out.println(""); + } + +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA.java b/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA.java new file mode 100755 index 0000000..58b3213 --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the + * compression of DNA sequeces. It can be viewed as a + * compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain + * inserts/deletes/mismatches. + * + */ + +package japsa.bio.misc.fuzzyLZ; + +import japsa.bio.misc.common.*; + +/** + * A model of 2 sequences (for alignments) with counts. 
Uses two parameters: + * match_cost, change_cost + * + * This is really a pseudo 2-sequence model. It assumes that sequence B is + * already known to the receiver. So, it encodes a character from sequence A + * using a cost 'match_cost' if it is the same as the character from B. If it is + * different it is has a cost 'change_cost' plus a cost for the character using + * a uniform model over the rest of the dna + */ + +class Model_SeqA implements Two_Seq_Model_Counts { + /** + * + */ + private static final long serialVersionUID = 1L; + + int alphaSize; + + double match_cost, change_cost; + + int countIndex; + + final private int matchIndex = 0, changeIndex = 1; + + public Model_SeqA(Params p, int alphaSize, int countIndex) { + this.alphaSize = alphaSize; + this.countIndex = countIndex; + + if (!p.exists("match_cost")) { + set_default_costs(); + } else { + match_cost = p.get("match_cost"); + change_cost = p.get("change_cost"); + } + + normalize_costs(); + } + + void set_default_costs() { + // match_cost = -MyMath.log2(9.0); + // change_cost = -MyMath.log2(1.0); + // mdc changed to a good set of variable + + match_cost = 0.20554840023089077;// + change_cost = 2.912770498714653;// -MyMath.log2(1.0); + } + + void normalize_costs() { + double sum = MyMath.exp2(-match_cost) + MyMath.exp2(-change_cost); + match_cost = match_cost + MyMath.log2(sum); + change_cost = change_cost + MyMath.log2(sum); + + if (FuzzyLZ.DEBUG >= 2) { + System.out.printf("match_cost=%f change_cost=%f\n", match_cost, + change_cost); + System.out.println(" ==match_cost=" + match_cost + " change_cost=" + + change_cost); + } + + } + + public double encA(char a, int i) { + return MyMath.log2(alphaSize); + } + + public double encB(char a, int i) { + return 0; + } + + public double encBoth(char a, char b, int i, int j) { + return ((a == b) ? 
match_cost : change_cost + + MyMath.log2(alphaSize - 1)); + } + + public static int required_counts() { + return 2; + } + + public Params counts_to_params(Counts counts) { + double sum = counts.counts[countIndex + matchIndex] + + counts.counts[countIndex + changeIndex]; + Params par = new Params(); + par.put("match_cost", + -MyMath.log2(counts.counts[countIndex + matchIndex] / sum)); + par.put("change_cost", + -MyMath.log2(counts.counts[countIndex + changeIndex] / sum)); + return par; + } + + public void update_count_encA(Counts c, double w, char a, int i) { + }; + + public void update_count_encB(Counts c, double w, char a, int i) { + }; + + public void update_count_encBoth(Counts c, double w, char a, char b, int i, + int j) { + if (a == b) { + c.inc(countIndex + matchIndex, w); + } else { + c.inc(countIndex + changeIndex, w); + } + } + + public double encode_params(double N) { + return 0; + } +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA_SubModel.java b/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA_SubModel.java new file mode 100755 index 0000000..3f63c07 --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/Model_SeqA_SubModel.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the + * compression of DNA sequeces. It can be viewed as a + * compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain + * inserts/deletes/mismatches. + * + */ + +package japsa.bio.misc.fuzzyLZ; + +import japsa.bio.misc.common.MyMath; +import japsa.bio.misc.common.Params; +import japsa.bio.misc.common.Seq_Model; + +/** + * + * This is a simple extension to {@link Model_SeqA}. The difference is that this + * model takes a parameter, a {@link japsa.bio.misc.common.Seq_Model}, that is used to + * encode the characters from Sequence A, which do not match characters + * from Sequence B. 
This model can be seen as a generalisation of + * {@link Model_SeqA} which uses a uniform model for characters from unmatched + * characters from sequence A. + */ +class Model_SeqA_SubModel extends Model_SeqA { + + /** + * + */ + private static final long serialVersionUID = 1L; + Seq_Model subMdl; + + public Model_SeqA_SubModel(Seq_Model subMdl, Params p, int alphaSize, + int countIndex) { + super(p, alphaSize, countIndex); + this.subMdl = subMdl; + } + + public double encA(char a, int i) { + return subMdl.encodeLen(a, i); + } + + public double encB(char a, int i) { + return 0; + } + + public double encBoth(char a, char b, int i, int j) { + if (a == b) + return match_cost; + + // Encode char 'a', but normalise since we know 'a' is different to 'b' + double costB = subMdl.encodeLen(b, i); + double norm = MyMath.log2(1 - MyMath.exp2(-costB)); + return change_cost + subMdl.encodeLen(a, i) - norm; + } + +} diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/Plot.java b/src/main/java/japsa/bio/misc/fuzzyLZ/Plot.java new file mode 100755 index 0000000..9e50726 --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/Plot.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the compression of DNA sequeces. + * It can be viewed as a compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain inserts/deletes/mismatches. 
+ * + */ + +package japsa.bio.misc.fuzzyLZ; + +import java.io.*; + +import japsa.bio.misc.common.*; + +class Plot implements Serializable { + private static final long serialVersionUID = 1L; + + int img[][]; + + double scale; + + int numRows, numCols; + + int startRow; + + Plot(int rows, int columns, int maxColumns, int maxRows) { + this(rows, columns, maxColumns, maxRows, 0); + } + + Plot(int rows, int columns, int maxColumns, int maxRows, int startRow) { + this.startRow = startRow; + + scale = MyMath.min2(1.0 * MyMath.min2(columns, maxColumns) / columns, + 1.0 * MyMath.min2(rows - startRow, maxRows) / rows); + + numRows = (int) (scale * (rows - startRow)); + numCols = (int) (scale * columns); + img = new int[numRows + 1][numCols + 1]; + + } + + void put(int row, int col, double r, double g, double b) { + row = (int) (scale * (row - startRow)); + col = (int) (scale * col); + img[row][col] = ((byte) (r * 255)) << 16 | ((byte) (g * 255)) << 8 + | ((byte) (b * 255)); + } + + void putMax(int row, int col, double r, double g, double b) { + row = (int) (scale * (row - startRow)); + col = (int) (scale * col); + + byte r1 = (byte) MyMath.max2((img[row][col] >> 16) & 255, r * 255); + byte g1 = (byte) MyMath.max2((img[row][col] >> 8) & 255, g * 255); + byte b1 = (byte) MyMath.max2((img[row][col]) & 255, b * 255); + + img[row][col] = (r1) << 16 | (g1) << 8 | (b1); + } + + void save(String fname, String comments) { + if (FuzzyLZ.DEBUG >= 1) + System.out.println("Writing image '" + fname + "'"); + try { + File f = new File(fname); + DataOutputStream out = new DataOutputStream( + new BufferedOutputStream(new FileOutputStream(f))); + + out.writeBytes("P6\n"); + out.writeBytes("#Created by FuzzyLZ\n"); + if (comments.length() > 0) + out.writeBytes("#" + comments + "\n"); + out.writeBytes(numCols + " " + numRows + "\n"); + out.writeBytes("255\n"); + + for (int r = 0; r < numRows; r++) { + for (int c = 0; c < numCols; c++) { + out.writeByte((img[r][c] >> 16) & 255); + 
out.writeByte((img[r][c] >> 8) & 255); + out.writeByte((img[r][c]) & 255); + } + } + + out.close(); + } catch (Exception e) { + System.err.println("Error writing file: " + e); + } + if (FuzzyLZ.DEBUG >= 1) + System.out.println("Done image"); + } + + public static void main(String args[]) { + Plot p = new Plot(100, 100, 50, 50); + for (int i = 0; i < 100; i++) { + p.put(i, i, 0, 255, 0); + p.put(i, 99 - i, 0, 0, 255); + p.put(50, i, 255, 0, 0); + p.put(i, 50, 255, 0, 0); + } + p.save("out.ppm", ""); + } +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/misc/fuzzyLZ/Sparse.java b/src/main/java/japsa/bio/misc/fuzzyLZ/Sparse.java new file mode 100755 index 0000000..6fe669e --- /dev/null +++ b/src/main/java/japsa/bio/misc/fuzzyLZ/Sparse.java @@ -0,0 +1,569 @@ +/* + * Copyright (c) David Powell + * + * + * This file is part of FuzzyLZ + * + * FuzzyLZ is a program orginally intended for the compression of DNA sequeces. + * It can be viewed as a compression model like Lempel-Ziv 77, but instead of + * exact matches, allowing matches that contain inserts/deletes/mismatches. 
+ *
+ */
+
+/*
+ * This class implements a sparse array to keep track of the active cells in the
+ * DPA
+ *
+ * The array is stored as a doubly-linked list of Link objects. Each Link
+ * covers the half-open index range [start, end) and, once realloc() has run,
+ * holds one Has_Value object per index in that range.
+ */
+
+package japsa.bio.misc.fuzzyLZ;
+
+import java.io.*;
+
+import java.util.*;
+
+import japsa.bio.misc.common.*;
+import japsa.bio.misc.dnaPlatform.gui.MainFrame;
+
+class Sparse implements Serializable {
+    // Serialization version, shared with MainFrame.
+    static final long serialVersionUID = MainFrame.serialVersionUID;
+
+    /**
+     * One contiguous run of cells, [start, end), in the sparse array.
+     *
+     * Link is an inner (non-static) class because realloc() draws objects
+     * from, and returns them to, the enclosing Sparse via newObj()/delObj().
+     */
+    class Link implements Serializable {
+        /** Serialization version of Link. */
+        private static final long serialVersionUID = 1L;
+
+        // start/end: covered index range, end exclusive.
+        // age: set to 0 on creation and incremented by Sparse.incAge().
+        int start, end, age;
+
+        // Cell objects; element k corresponds to index (start + k). Its size
+        // only matches (end - start) after realloc() has been called.
+        @SuppressWarnings("rawtypes")
+        ArrayList o;
+
+        // Neighbours in the doubly-linked list (null at either end).
+        Link next;
+
+        Link prev;
+
+        /**
+         * Creates an unlinked Link for [start, end) with age 0. The cell list
+         * is created empty (with some spare capacity); no cell objects are
+         * allocated until realloc() is called.
+         */
+        Link(int start, int end) {
+            this.start = start;
+            this.end = end;
+            age = 0;
+            next = null;
+            prev = null;
+            o = new ArrayList(end - start + 10);
+        }
+
+        /**
+         * Makes the cell list exactly (end - start) long. Surplus objects are
+         * taken off the END of the list and handed back to the pool with
+         * delObj(); missing ones are appended from newObj(). Objects are not
+         * shifted, so this is only index-correct when 'start' is unchanged or
+         * the existing cell contents are not needed.
+         */
+        @SuppressWarnings("unchecked")
+        void realloc() {
+            o.ensureCapacity(end - start);
+
+            while (o.size() > end - start)
+                delObj((Has_Value) o.remove(o.size() - 1));
+
+            while (o.size() < end - start)
+                o.add(newObj());
+        }
+
+        /** Returns "l(start->end age) " (note the trailing space). */
+        public String toString() {
+            return "l(" + start + "->" + end + " " + age + ") ";
+        }
+    }
+
+    /**
+     * Class Iterate is used by a number of functions to store state information
+     * about the current position: the Link being visited (curr), the array
+     * index (i) and the object found there (o, null when nothing was found).
+     */
+    static class Iterate {
+        Link curr;
+
+        int i;
+
+        Has_Value o;
+
+        public String toString() {
+            return "Link:" + curr + " i=" + i + " o=" + o;
+        }
+    }
+
+    // Prototype object; newObj() clones it whenever the pool is empty.
+    final Has_Value type;
+
+    Link head; // Keep these sorted by link.start
+
+    Link tail;
+
+    // NOTE(review): numLinks and numObj are never read or written in this
+    // class -- check for outside users before relying on or removing them.
+    int numLinks;
+
+    int numObj;
+
+    // Statistics: number of insert() calls and number of objects cloned.
+    long links_created = 0, newObj = 0;
+
+    // Pool of released Has_Value objects (the original comment calls them
+    // FSMs), kept so that we don't have to keep destroying/creating new ones.
+    // Objects are pushed and popped at the front and are NOT reset on reuse.
+    LinkedList unusedObj;
+
+    /**
+     * Creates an empty sparse array.
+     *
+     * @param type prototype that is cloned to create each new cell object
+     */
+    Sparse(Has_Value type) {
+        this.type = type;
+        unusedObj = new LinkedList();
+    }
+
+    /*
+     * void clear() { Link l = head; while (l!=null) { l.clear(); Link t =
+     * l.next; l.next = null; l.prev = null; l = t; } }
+     */
+
+    /**
+     * Links 'ins' into the list directly after 'prev', or at the head when
+     * 'prev' is null. Updates 'tail' when 'ins' ends up last and counts the
+     * insertion in links_created.
+     *
+     * @return ins
+     */
+    private Link insert(Link ins, Link prev) {
+        links_created++;
+        if (prev == null) {
+            if (head != null)
+                head.prev = ins;
+            ins.next = head;
+            head = ins;
+        } else {
+            // Insert ins into double-linked list.
+            if (prev.next != null)
+                prev.next.prev = ins;
+            ins.next = prev.next;
+            ins.prev = prev;
+            prev.next = ins;
+        }
+        if (ins.next == null)
+            tail = ins;
+
+        return ins;
+    }
+
+    /**
+     * Gives link 'l' the range [start, end). When 'l' is null a new Link is
+     * inserted after 'prev' instead; otherwise 'l' is reused in place (its
+     * cell list is left untouched).
+     *
+     * @return the link that now carries the range
+     */
+    private Link copy(Link l, Link prev, int start, int end) {
+        if (l == null)
+            l = insert(new Link(start, end), prev);
+        else {
+            l.start = start;
+            l.end = end;
+        }
+        return l;
+    }
+
+    /**
+     * Rebuilds the layout of this sparse array from the cell values of 's',
+     * keeping only the "significant" parts of each old link.
+     *
+     * For every link of 's':
+     *  - if its age is below min_age the whole range is copied;
+     *  - otherwise its cells are scanned and a cell counts as significant
+     *    when get_val() <= cutOff. The first significant cell at index i
+     *    opens a region starting at max(i - maxGap, link start); the scan
+     *    then jumps ahead to min(i + maxGap, link end). Later significant
+     *    cells extend the region; a non-significant cell more than maxGap
+     *    past the region's end closes it and emits it.
+     *
+     * Each emitted range is widened by one cell before being stored: at the
+     * end when dir == 1, otherwise at the start (clamped at 0). Existing
+     * links of this array are reused in list order, new ones are inserted
+     * when they run out, and leftover links are removed. Ages are copied
+     * from the source link.
+     *
+     * Only start/end/age are written here; cell lists of reused links are
+     * not resized, so presumably checkAlloc() must follow -- confirm with
+     * the callers. Emitted ranges are not merged here either.
+     *
+     * NOTE(review): duplicateFrom() leaves ranges unchanged for dir == 0,
+     * whereas here any dir other than 1 widens the start. Confirm whether
+     * dir == 0 is ever passed in.
+     *
+     * @param s       source array; its links must have allocated cells
+     * @param dir     1 to widen ranges at the end, otherwise at the start
+     * @param min_age links younger than this are copied unconditionally
+     * @param cutOff  largest get_val() still considered significant
+     * @param maxGap  margin kept around, and gap tolerated inside, a region
+     */
+    void copy_cull_cut(Sparse s, int dir, int min_age, double cutOff, int maxGap) {
+        // l walks this array's existing links (to be reused); prev is the
+        // last link written so far.
+        Link l = head, prev = null;
+
+        sanity();
+        for (Link ol = s.head; ol != null; ol = ol.next) {
+            if (ol.age < min_age) {
+                // This link is young, copy it completely
+                int start = ol.start;
+                int end = ol.end;
+                if (dir == 1)
+                    end++;
+                else
+                    start = (start > 0 ? start - 1 : 0);
+                l = copy(l, prev, start, end);
+                l.age = ol.age;
+                prev = l;
+                l = l.next;
+                continue;
+            }
+
+            // start < 0 means "no region currently open".
+            int start = -1, end = -1;
+            for (int i = ol.start; i < ol.end; i++) {
+                Has_Value o = (Has_Value) ol.o.get(i - ol.start);
+                if (o.get_val() <= cutOff) {
+                    if (start < 0) {
+                        // Start of a new significant part
+                        start = MyMath.max2(i - maxGap, ol.start);
+                        // Skip ahead: the loop counter itself is advanced.
+                        i = MyMath.min2(i + maxGap, ol.end);
+                        end = i;
+                    } else {
+                        end = i + 1;
+                    }
+                    continue;
+                }
+
+                // This Obj has a high msgLen
+
+                if (start >= 0 && i - end > maxGap) {
+                    // We have a significant region with a big gap after it.
+                    if (dir == 1)
+                        end++;
+                    else
+                        start = (start > 0 ? start - 1 : 0);
+                    l = copy(l, prev, start, end);
+                    l.age = ol.age;
+                    prev = l;
+                    l = l.next;
+
+                    start = end = -1;
+                }
+            }
+
+            // Any remaining bit at the end?
+            if (start >= 0) {
+                if (dir == 1)
+                    end++;
+                else
+                    start = (start > 0 ? start - 1 : 0);
+                l = copy(l, prev, start, end);
+                l.age = ol.age;
+                prev = l;
+                l = l.next;
+            }
+        }
+
+        // Must remove anything left at the end of l (this sparse list).
+        while (l != null) {
+            Link n = l.next;
+            remove(l);
+            l = n;
+        }
+    }
+
+    /**
+     * This copies the format of sparse array 's' into this sparse array.
+     * It is assumed no object data is in the array. And checkAlloc() must be
+     * called before the array is used.
+     *
+     * Existing links are reused in order, extra ones are inserted, leftovers
+     * are removed; ages are copied from 's'.
+     *
+     * @param s   array whose link ranges are copied
+     * @param dir -1, 0 or 1: -1 lowers each start by one (not below 0),
+     *            1 raises each end by one, 0 copies the ranges unchanged
+     */
+    void duplicateFrom(Sparse s, int dir) {
+        Link l;
+        Link l2 = head, prev = null;
+        for (l = s.head; l != null; l = l.next) {
+            int ns = l.start - (dir == -1 && l.start > 0 ? 1 : 0);
+            int ne = l.end + (dir == 1 ? 1 : 0);
+            // System.err.println("Duplicate link ("+ns+" -> +"+ne+") l2="+l2);
+            if (l2 == null) {
+                l2 = insert(new Link(ns, ne), prev);
+            } else {
+                l2.start = ns;
+                l2.end = ne;
+            }
+            l2.age = l.age;
+
+            prev = l2;
+            l2 = l2.next;
+        }
+
+        // Must remove anything left in l2.
+        while (l2 != null) {
+            Link n = l2.next;
+            remove(l2);
+            l2 = n;
+        }
+
+        sanity();
+    }
+
+    /**
+     * Join any touching or overlapping sparse elements. Assumes no useful data
+     * in the array (cell lists are not merged, only the ranges are).
+     *
+     * The absorbed link is removed and the merged link is then compared with
+     * its new successor, so chains of overlapping links collapse into one.
+     * The age of the earlier link is kept.
+     */
+    void join() {
+        // p is the link that l is compared against.
+        Link l = head, p = null;
+
+        while (l != null) {
+            if (p == null) {
+                p = l;
+                l = l.next;
+                continue;
+            }
+
+            // Join any overlapping regions
+            if (p.end >= l.start) {
+                p.start = MyMath.min2(p.start, l.start);
+                p.end = MyMath.max2(p.end, l.end);
+
+                remove(l);
+                l = p.next;
+            } else {
+                p = l;
+                l = l.next;
+            }
+        }
+        sanity();
+        sanity2();
+    }
+
+    /**
+     * Remove a link from the sparse list. The link's range is shrunk to
+     * empty so that realloc() returns all of its cell objects to the pool.
+     * The removed link's own next/prev fields are left as they were.
+     */
+    void remove(Link l) {
+        if (l.prev == null) {
+            Misc.my_assert(head == l, "head!=l in Sparse.remove()");
+            head = l.next;
+        } else
+            l.prev.next = l.next;
+
+        if (l.next == null) {
+            Misc.my_assert(tail == l, "tail!=l in Sparse.remove()");
+            tail = l.prev;
+        } else
+            l.next.prev = l.prev;
+
+        // Now set its size to 0, and realloc() so all Object are reclaimed in
+        // our temp list
+        l.end = l.start;
+        l.realloc();
+    }
+
+    /**
+     * Add a new range [n1, n2). A negative n1 is clamped to 0 and an empty
+     * range is ignored.
+     *
+     * NOTE: This function assumes that no useful object data is currently in
+     * the array.
+     * This function does _not_ allocate room in links.
+     * After using this function, checkAlloc() must be called before using this
+     * sparse array
+     *
+     * The range is merged into the first link it reaches into (n1 < link end
+     * and n2 >= link start) or inserted as a new link with age 0. A merge can
+     * leave that link overlapping its successor, and a range that merely
+     * touches the end of a link is stored separately; join() resolves both
+     * cases. checkAlloc() asserts there are no overlaps, so presumably join()
+     * is expected to run first -- confirm with the callers.
+     */
+    void add(int n1, int n2) {
+        if (n1 < 0)
+            n1 = 0;
+        if (n2 <= n1)
+            return;
+
+        Link l = head, p = null;
+        while (l != null) {
+            if (n2 < l.start) {
+                // Put new one before l
+                Link nl = insert(new Link(n1, n2), p);
+                nl.age = 0;
+                return;
+            }
+
+            if (n1 < l.end) {
+                // Join new one into l
+                l.start = MyMath.min2(n1, l.start);
+                l.end = MyMath.max2(n2, l.end);
+                // l.age=0;
+                return;
+            }
+
+            p = l;
+            l = l.next;
+        }
+
+        // Put the new one at the end.
+        Link nl = insert(new Link(n1, n2), p);
+        nl.age = 0;
+    }
+
+    /**
+     * Checks that no two links overlap (sanity2) and then resizes every
+     * link's cell list to match its range.
+     */
+    void checkAlloc() {
+        sanity2();
+        for (Link l = head; l != null; l = l.next) {
+            l.realloc();
+        }
+    }
+
+    /**
+     * Remove elements from the start and end of each sparse list that are >
+     * cutoff. A link with no element <= cutoff is removed entirely.
+     * Skip any sparse lists that have an age < min_age.
+     * Unlike add()/duplicateFrom(), this keeps the surviving cell objects at
+     * their correct indices.
+     * Not used by current implementation.
+     */
+    void cull(int min_age, double cutoff) {
+        Link l = head;
+        while (l != null) {
+            if (l.age < min_age) {
+                l = l.next;
+                continue;
+            }
+
+            int len = l.end - l.start;
+            int s, e;
+            // Find first element <= cutoff
+            for (s = 0; s < len; s++) {
+                Has_Value o = (Has_Value) l.o.get(s);
+                if (o.get_val() <= cutoff)
+                    break;
+            }
+
+            // No elements found <= cutoff. Throw away sparse list
+            if (s >= len) {
+                Link n = l.next;
+                remove(l);
+                l = n;
+                continue;
+            }
+
+            // Find last element that is <= cutoff (e is one past it)
+            for (e = len; e > s; e--) {
+                Has_Value o = (Has_Value) l.o.get(e - 1);
+                if (o.get_val() <= cutoff)
+                    break;
+            }
+
+            // end must be computed before start is moved.
+            l.end = e + l.start;
+            l.start = s + l.start;
+
+            // Remove elements from the start of the list.
+            for (int i = 0; i < s; i++) {
+                delObj((Has_Value) l.o.remove(0));
+            }
+
+            // This will remove any elements from the end of the list
+            l.realloc();
+
+            l = l.next;
+        }
+        sanity();
+    }
+
+    /**
+     * Returns a cell object: the most recently released one from the pool if
+     * there is any (its old contents are not cleared), otherwise a fresh
+     * clone of the prototype 'type' (counted in the newObj statistic).
+     */
+    private Has_Value newObj() {
+        if (unusedObj.size() > 0)
+            return (Has_Value) unusedObj.removeFirst();
+        newObj++;
+        return (Has_Value) ((Has_Value) type).clone();
+    }
+
+    /** Returns a cell object to the front of the pool for later reuse. */
+    private void delObj(Has_Value o) {
+        unusedObj.addFirst(o);
+    }
+
+    /**
+     * moveFwd(int, Iterate) - find the element at 'n', or the first element
+     * after it when 'n' falls in a gap. iterate.o==null if none there.
+     * Iterate position updated.
+     */
+    Iterate moveFwd(int n, Iterate res) {
+        return moveFwd(n, res, false);
+    }
+
+    /**
+     * moveFwd(Iterate) - find the next element. iterate.o==null if at end.
+     * Iterate position updated. Pass null to start from index 0.
+     */
+    Iterate moveFwd(Iterate res) {
+        return moveFwd((res == null ? 0 : res.i + 1), res, false);
+    }
+
+    /**
+     * Forward search for index 'n', starting from the link stored in 'res'
+     * (or from the head, with a new Iterate, when 'res' is null). The search
+     * only ever follows 'next' pointers, so 'n' must not lie before the
+     * position already reached by 'res'.
+     *
+     * @param exact when true, 'n' falling in a gap is reported as not found
+     *              (res.o == null); when false, the position snaps forward
+     *              to the start of the next link
+     * @return 'res' (or the new Iterate); res.o is null when nothing matched
+     */
+    Iterate moveFwd(int n, Iterate res, boolean exact) {
+        if (res == null) {
+            res = new Iterate();
+            res.curr = head;
+        }
+
+        res.o = null;
+        while (res.curr != null) {
+            if (exact && n < res.curr.start)
+                return res; // Didn't find an element at n
+
+            if (n < res.curr.end) {
+                if (n < res.curr.start)
+                    n = res.curr.start; // Update n to next available item
+                res.i = n;
+                res.o = (Has_Value) res.curr.o.get(n - res.curr.start);
+                return res;
+            }
+            res.curr = res.curr.next;
+        }
+        return res;
+    }
+
+    /**
+     * Mirror image of moveFwd(int, Iterate): find the element at 'n', or the
+     * closest element before it when 'n' falls in a gap.
+     */
+    Iterate moveRev(int n, Iterate res) {
+        return moveRev(n, res, false);
+    }
+
+    /**
+     * Find the previous element. Pass null to start from the last element of
+     * the array. iterate.o==null when there is none.
+     */
+    Iterate moveRev(Iterate res) {
+        int end = (tail == null ? 0 : tail.end);
+        return moveRev((res == null ? end : res.i - 1), res, false);
+    }
+
+    /**
+     * Backward search for index 'n', starting from the link stored in 'res'
+     * (or from the tail, with a new Iterate, when 'res' is null). Only
+     * 'prev' pointers are followed.
+     *
+     * @param exact when true, 'n' falling in a gap is reported as not found
+     *              (res.o == null); when false, the position snaps back to
+     *              the last index of the preceding link
+     * @return 'res' (or the new Iterate); res.o is null when nothing matched
+     */
+    Iterate moveRev(int n, Iterate res, boolean exact) {
+        if (res == null) {
+            res = new Iterate();
+            res.curr = tail;
+        }
+
+        res.o = null;
+        while (res.curr != null) {
+            if (exact && n >= res.curr.end)
+                return res; // Didn't find an element at n
+
+            if (n >= res.curr.start) {
+                if (n >= res.curr.end)
+                    n = res.curr.end - 1; // Snap back to last available item
+                res.i = n;
+                res.o = (Has_Value) res.curr.o.get(n - res.curr.start);
+                return res;
+            }
+            res.curr = res.curr.prev;
+        }
+        return res;
+    }
+
+    /**
+     * Get the object at iterate position + 1. Expected to be in same link:
+     * returns null when that index is outside the current link (or there is
+     * no current link). The Iterate itself is not modified.
+     */
+    Has_Value getNext(Iterate res) {
+        if (res.curr == null)
+            return null;
+
+        int n = res.i + 1;
+        if (n < res.curr.start || n >= res.curr.end)
+            return null;
+        return (Has_Value) res.curr.o.get(n - res.curr.start);
+    }
+
+    /**
+     * Get the object at iterate position - 1. Expected to be in same link:
+     * returns null when that index is outside the current link (or there is
+     * no current link). The Iterate itself is not modified.
+     */
+    Has_Value getPrev(Iterate res) {
+        if (res.curr == null)
+            return null;
+
+        int n = res.i - 1;
+        if (n < res.curr.start || n >= res.curr.end)
+            return null;
+        return (Has_Value) res.curr.o.get(n - res.curr.start);
+    }
+
+    /** Increments the age of every link by one. */
+    void incAge() {
+        for (Link l = head; l != null; l = l.next) {
+            l.age++;
+        }
+    }
+
+    /** Concatenates Link.toString() of every link, head to tail. */
+    public String toString() {
+        StringBuffer r = new StringBuffer();
+        for (Link l = head; l != null; l = l.next) {
+            r.append(l);
+        }
+        return r.toString();
+    }
+
+    /**
+     * Structural self-check (via Misc.my_assert): every link has a
+     * non-negative start and a non-empty range, next/prev pointers agree
+     * with each other, head has no predecessor, tail has no successor and
+     * the last link reached from head is the tail. Overlaps between links
+     * are NOT checked here; see sanity2().
+     */
+    public void sanity() {
+        Link p = null;
+        for (Link l = head; l != null; l = l.next) {
+            Misc.my_assert(l.start >= 0 && l.start < l.end,
+                    "Insane. start/end index stuffed.");
+
+            if (l.prev != null)
+                Misc.my_assert(l == l.prev.next, "Insane 1");
+            if (l.next != null)
+                Misc.my_assert(l == l.next.prev, "Insane 2");
+
+            p = l;
+        }
+
+        if (head != null)
+            Misc.my_assert(head.prev == null, "Insane 4");
+
+        if (tail != null)
+            Misc.my_assert(tail.next == null, "Insane 5");
+
+        if (p != null)
+            Misc.my_assert(p == tail, "Insane 6");
+    }
+
+    /**
+     * Range self-check: every link has a valid non-empty range and no link
+     * starts before its predecessor ends. Links that merely touch
+     * (p.end == l.start) are accepted. On overlap the whole array is printed
+     * to System.err before the assertion fires.
+     */
+    public void sanity2() {
+        Link p = null;
+        for (Link l = head; l != null; l = l.next) {
+            Misc.my_assert(l.start >= 0 && l.start < l.end,
+                    "Insane. start/end index stuffed.");
+            if (p != null) {
+                if (p.end > l.start) {
+                    System.err.println("Insane. Regions overlap");
+                    System.err.println("Sparse = " + this);
+                    Misc.my_assert(false, "");
+                }
+            }
+            p = l;
+        }
+    }
+
+    /** Prints the statistics to System.out without a line prefix. */
+    public void display_stats() {
+        display_stats("");
+    }
+
+    /**
+     * Prints five statistics lines to System.out: number of links, total
+     * cells covered by their ranges, cell-list entries beyond the ranges
+     * ("waste"), links created so far and objects cloned so far.
+     *
+     * @param pre text placed at the start of every printed line
+     */
+    public void display_stats(String pre) {
+        long waste = 0;
+        long num = 0;
+        long numActive = 0;
+        for (Link l = head; l != null; l = l.next) {
+            numActive += (l.end - l.start);
+            waste += l.o.size() - (l.end - l.start);
+            num++;
+        }
+        System.out.println(pre + "Number of active lists = " + num);
+        System.out.println(pre + "Total active cells = " + numActive);
+        System.out.println(pre + "Wasted array elements = " + waste);
+        System.out.println(pre + "Total links created = " + links_created);
+        System.out.println(pre + "Total Obj created = " + newObj);
+        // System.out.println(this);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/japsa/bio/misc/resources/Dis2Priors b/src/main/java/japsa/bio/misc/resources/Dis2Priors
new file mode 100755
index 0000000..4c41160
--- /dev/null
+++ b/src/main/java/japsa/bio/misc/resources/Dis2Priors
@@ -0,0 +1,3500 @@
+1200 0 1.307354 0.003654 0.005271 15065.000000 0.003727 +1200 1 1.307354 0.013850 0.019981 5065.000000 0.013851 +1200 2 1.307354 0.015078 0.021754 2170.109995 0.015026 +1200 3 1.307354 0.021314 0.030751 801.136466 0.021040 +1200 4 1.307354 0.026860 0.038754 468.857612 0.026220 +1200 5 1.307354 0.034344 0.049552 308.742620 0.033168 +1200 6 1.307354 0.041182 0.059421 239.519516 0.039552 +1200 7 1.307354 0.044112 0.063651 217.605299 0.042249 +1200 8 1.307354 0.050692 0.073149 185.445829 0.048477 +1200 9 1.307354 0.055235 0.079708 167.974517 0.052783 +1200 10 1.307354 0.059772 0.086259 151.997223 0.057003 +1200 11 1.307354 0.065964 0.095201 130.835028 0.062625 +1200 12 1.307354 0.072082 0.104038 118.745870 0.068497 +1200 13 1.307354 0.079955 0.115412 104.068738 0.075871 +1200 14 1.307354 0.087425 0.126208 92.398531 0.082780 +1200 15 1.307354 0.089979 0.129900 89.132950 0.085173 +1200 16 1.307354 0.090836 0.131139 87.718872 0.085950 +1200 17 
1.307354 0.092331 0.133300 85.811073 0.087315 +1200 18 1.307354 0.096360 0.139125 80.816248 0.090981 +1200 19 1.307354 0.099712 0.143973 76.112158 0.093929 +1200 20 1.307354 0.107824 0.155707 68.051446 0.101265 +1200 21 1.307354 0.111853 0.161538 65.188656 0.105009 +1200 22 1.307354 0.116476 0.168229 61.578572 0.109191 +1200 23 1.307354 0.121099 0.174923 58.811472 0.113490 +1200 24 1.307354 0.127414 0.184068 54.509495 0.119054 +1200 25 1.307354 0.131101 0.189410 52.583001 0.122413 +1200 26 1.307354 0.136500 0.197235 50.370938 0.127500 +1200 27 1.307354 0.140956 0.203694 48.590710 0.131647 +1200 28 1.307354 0.143905 0.207971 47.533910 0.134378 +1200 29 1.307354 0.148553 0.214712 45.262001 0.138434 +1200 30 1.307354 0.150800 0.217972 44.366196 0.140466 +1200 31 1.307354 0.157540 0.227754 41.658662 0.146413 +1200 32 1.307354 0.160408 0.231918 40.428068 0.148888 +1200 33 1.307354 0.164028 0.237176 39.233827 0.152138 +1200 34 1.307354 0.169182 0.244663 37.696200 0.156872 +1200 35 1.307354 0.170677 0.246836 37.358624 0.158309 +1200 36 1.307354 0.177800 0.257190 35.325029 0.164813 +1200 37 1.307354 0.179942 0.260305 34.625893 0.166602 +1200 38 1.307354 0.185376 0.268212 33.535942 0.171677 +1200 39 1.307354 0.188822 0.273226 32.708345 0.174763 +1200 40 1.307354 0.192333 0.278338 31.742143 0.177777 +1200 41 1.307354 0.197801 0.286303 30.467658 0.182579 +1200 42 1.307354 0.198572 0.287426 30.406814 0.183380 +1200 43 1.307354 0.202213 0.292732 29.567645 0.186512 +1200 44 1.307354 0.210320 0.304554 28.522655 0.194316 +1200 45 1.307354 0.213665 0.309434 27.763222 0.197095 +1200 46 1.307354 0.216256 0.313216 27.322763 0.199413 +1200 47 1.307354 0.219674 0.318206 26.728520 0.202441 +1200 48 1.307354 0.225100 0.326133 25.732381 0.207141 +1200 49 1.307354 0.231806 0.335935 24.600644 0.212965 +1200 50 1.307354 0.233825 0.338888 24.404722 0.214915 +1200 51 1.307354 0.236499 0.342799 24.186174 0.217552 +1200 52 1.307354 0.240420 0.348538 23.731200 0.221183 +1200 53 1.307354 0.242423 
0.351470 23.401442 0.222878 +1200 54 1.307354 0.246501 0.357443 23.053212 0.226816 +1200 55 1.307354 0.251719 0.365090 22.372222 0.231427 +1200 56 1.307354 0.259073 0.375875 21.495424 0.237972 +1200 57 1.307354 0.260298 0.377673 21.409657 0.239149 +1200 58 1.307354 0.263499 0.382372 21.175558 0.242217 +1200 59 1.307354 0.265382 0.385137 21.133270 0.244191 +1200 60 1.307354 0.267894 0.388826 20.923095 0.246592 +1200 61 1.307354 0.271073 0.393497 20.508995 0.249266 +1200 62 1.307354 0.274289 0.398225 20.143317 0.252073 +1200 63 1.307354 0.281319 0.408566 19.431405 0.258281 +1200 64 1.307354 0.286363 0.415993 19.104025 0.263125 +1200 65 1.307354 0.287710 0.417977 18.914031 0.264173 +1200 66 1.307354 0.293840 0.427012 18.428844 0.269783 +1200 67 1.307354 0.297461 0.432353 18.136474 0.272989 +1200 68 1.307354 0.303069 0.440630 17.618325 0.277822 +1200 69 1.307354 0.308849 0.449169 17.012647 0.282467 +1200 70 1.307354 0.312531 0.454611 16.742745 0.285817 +1200 71 1.307354 0.316816 0.460951 16.427792 0.289599 +1200 72 1.307354 0.322334 0.469121 15.958458 0.294199 +1200 73 1.307354 0.324211 0.471901 15.815548 0.295850 +1200 74 1.307354 0.329737 0.480092 15.487047 0.300871 +1200 75 1.307354 0.331570 0.482810 15.394449 0.302690 +1200 76 1.307354 0.334588 0.487288 15.195715 0.305334 +1200 77 1.307354 0.336354 0.489910 15.059635 0.306807 +1200 78 1.307354 0.339925 0.495213 14.909864 0.310269 +1200 79 1.307354 0.345773 0.503906 14.571018 0.315483 +1200 80 1.307354 0.349401 0.509303 14.282635 0.318369 +1200 81 1.307354 0.353518 0.515432 14.056044 0.321999 +1200 82 1.307354 0.353564 0.515501 14.056044 0.322419 +1200 83 1.307354 0.356459 0.519813 13.972002 0.324984 +1200 84 1.307354 0.358759 0.523241 13.805422 0.326887 +1200 85 1.307354 0.361623 0.527511 13.736602 0.329778 +1200 86 1.307354 0.366908 0.535397 13.451282 0.334443 +1200 87 1.307354 0.369907 0.539876 13.330824 0.337262 +1200 88 1.307354 0.374016 0.546017 13.106227 0.340761 +1200 89 1.307354 0.376854 0.550260 12.975882 
0.343367 +1200 90 1.307354 0.382028 0.558003 12.706363 0.347803 +1200 91 1.307354 0.386013 0.563973 12.442443 0.350954 +1200 92 1.307354 0.390522 0.570733 12.281817 0.355140 +1200 93 1.307354 0.393420 0.575080 12.099056 0.357418 +1200 94 1.307354 0.393994 0.575942 12.074894 0.357898 +1200 95 1.307354 0.396825 0.580193 11.930933 0.360280 +1200 96 1.307354 0.399146 0.583679 11.824089 0.362348 +1200 97 1.307354 0.401415 0.587090 11.753392 0.364483 +1200 98 1.307354 0.404633 0.591929 11.624877 0.367368 +1200 99 1.307354 0.407919 0.596873 11.463343 0.370111 +1200 100 1.307354 0.411291 0.601951 11.326673 0.373094 +1200 101 1.307354 0.413024 0.604561 11.236466 0.374511 +1200 102 1.307354 0.415277 0.607958 11.169283 0.376603 +1200 103 1.307354 0.415838 0.608804 11.158124 0.377174 +1200 104 1.307354 0.419182 0.613848 11.003076 0.379966 +1200 105 1.307354 0.420320 0.615566 10.992084 0.381198 +1200 106 1.307354 0.426995 0.625648 10.720822 0.386838 +1200 107 1.307354 0.429796 0.629884 10.593005 0.389166 +1200 108 1.307354 0.433596 0.635633 10.477178 0.392661 +1200 109 1.307354 0.435759 0.638908 10.393736 0.394482 +1200 110 1.307354 0.437978 0.642270 10.290368 0.396225 +1200 111 1.307354 0.440258 0.645726 10.218623 0.398293 +1200 112 1.307354 0.443520 0.650674 10.106890 0.401060 +1200 113 1.307354 0.444603 0.652317 10.076630 0.402056 +1200 114 1.307354 0.446320 0.654924 10.026397 0.403681 +1200 115 1.307354 0.448483 0.658209 9.966449 0.405606 +1200 116 1.307354 0.450674 0.661537 9.906859 0.407620 +1200 117 1.307354 0.452320 0.664040 9.867330 0.409189 +1200 118 1.307354 0.452904 0.664929 9.837787 0.409646 +1200 119 1.307354 0.455100 0.668269 9.759438 0.411397 +1200 120 1.307354 0.456724 0.670740 9.701086 0.412784 +1200 121 1.307354 0.458928 0.674096 9.623825 0.414615 +1200 122 1.307354 0.462622 0.679723 9.575850 0.418444 +1200 123 1.307354 0.465316 0.683831 9.480616 0.420757 +1200 124 1.307354 0.466889 0.686230 9.423931 0.421987 +1200 125 1.307354 0.468997 0.689448 9.386329 
0.424075 +1200 126 1.307354 0.470585 0.691873 9.330208 0.425439 +1200 127 1.307354 0.474798 0.698311 9.163849 0.428707 +1200 128 1.307354 0.478994 0.704729 9.036512 0.432305 +1200 129 1.307354 0.484268 0.712807 8.928776 0.437325 +1200 130 1.307354 0.484755 0.713552 8.910945 0.437744 +1200 131 1.307354 0.486333 0.715971 8.848817 0.438947 +1200 132 1.307354 0.490097 0.721746 8.760814 0.442383 +1200 133 1.307354 0.493809 0.727447 8.656364 0.445480 +1200 134 1.307354 0.496993 0.732340 8.553160 0.448086 +1200 135 1.307354 0.498000 0.733889 8.519033 0.448897 +1200 136 1.307354 0.500680 0.738013 8.459637 0.451372 +1200 137 1.307354 0.501749 0.739658 8.442743 0.452326 +1200 138 1.307354 0.505327 0.745169 8.325427 0.455188 +1200 139 1.307354 0.506862 0.747535 8.292208 0.456524 +1200 140 1.307354 0.508448 0.749980 8.242628 0.457870 +1200 141 1.307354 0.510509 0.753160 8.201539 0.459727 +1200 142 1.307354 0.513098 0.757157 8.128093 0.461788 +1200 143 1.307354 0.515712 0.761195 8.063359 0.464093 +1200 144 1.307354 0.519832 0.767566 7.967225 0.467683 +1200 145 1.307354 0.524449 0.774715 7.840827 0.471463 +1200 146 1.307354 0.528548 0.781069 7.739606 0.474892 +1200 147 1.307354 0.531702 0.785963 7.654979 0.477447 +1200 148 1.307354 0.535233 0.791446 7.571277 0.480472 +1200 149 1.307354 0.536736 0.793783 7.563713 0.482086 +1200 150 1.307354 0.538771 0.796948 7.518489 0.483897 +1200 151 1.307354 0.539797 0.798544 7.510978 0.485006 +1200 152 1.307354 0.541824 0.801700 7.458611 0.486660 +1200 153 1.307354 0.543359 0.804090 7.428851 0.487933 +1200 154 1.307354 0.544399 0.805709 7.391818 0.488706 +1200 155 1.307354 0.546421 0.808861 7.362325 0.490649 +1200 156 1.307354 0.549949 0.814366 7.296394 0.493780 +1200 157 1.307354 0.552395 0.818184 7.231053 0.495751 +1200 158 1.307354 0.555365 0.822825 7.187819 0.498519 +1200 159 1.307354 0.556803 0.825073 7.144842 0.499503 +1200 160 1.307354 0.557791 0.826618 7.130574 0.500516 +1200 161 1.307354 0.561822 0.832928 7.052606 0.504063 +1200 162 
1.307354 0.564867 0.837700 6.975491 0.506347 +1200 163 1.307354 0.567759 0.842236 6.899219 0.508579 +1200 164 1.307354 0.569214 0.844519 6.864826 0.509675 +1200 165 1.307354 0.570729 0.846898 6.823781 0.510844 +1200 166 1.307354 0.574685 0.853115 6.742425 0.514206 +1200 167 1.307354 0.577212 0.857090 6.695417 0.516338 +1200 168 1.307354 0.577724 0.857897 6.688728 0.516844 +1200 169 1.307354 0.581529 0.863889 6.615591 0.520018 +1200 170 1.307354 0.583470 0.866948 6.582612 0.521804 +1200 171 1.307354 0.587326 0.873033 6.504132 0.524912 +1200 172 1.307354 0.590233 0.877626 6.458785 0.527533 +1200 173 1.307354 0.594196 0.883891 6.362674 0.530378 +1200 174 1.307354 0.597593 0.889270 6.293102 0.533057 +1200 175 1.307354 0.600967 0.894618 6.236747 0.535931 +1200 176 1.307354 0.603895 0.899263 6.162390 0.537935 +1200 177 1.307354 0.604362 0.900004 6.156234 0.538421 +1200 178 1.307354 0.607226 0.904555 6.107205 0.540893 +1200 179 1.307354 0.608218 0.906132 6.095009 0.541806 +1200 180 1.307354 0.610625 0.909959 6.040427 0.543554 +1200 181 1.307354 0.614924 0.916804 5.980353 0.547359 +1200 182 1.307354 0.616895 0.919946 5.944597 0.548954 +1200 183 1.307354 0.620765 0.926119 5.891362 0.552566 +1200 184 1.307354 0.621288 0.926955 5.885476 0.553136 +1200 185 1.307354 0.622741 0.929276 5.861993 0.554361 +1200 186 1.307354 0.623711 0.930825 5.838604 0.554938 +1200 187 1.307354 0.625611 0.933863 5.797896 0.556331 +1200 188 1.307354 0.628980 0.939254 5.734501 0.558954 +1200 189 1.307354 0.630922 0.942364 5.711620 0.560682 +1200 190 1.307354 0.634284 0.947753 5.654816 0.563449 +1200 191 1.307354 0.637485 0.952891 5.604177 0.566113 +1200 192 1.307354 0.638425 0.954400 5.587398 0.566793 +1200 193 1.307354 0.641150 0.958780 5.548442 0.569065 +1200 194 1.307354 0.643545 0.962632 5.509758 0.571083 +1200 195 1.307354 0.645951 0.966506 5.465877 0.572849 +1200 196 1.307354 0.647892 0.969634 5.433197 0.574297 +1200 197 1.307354 0.648861 0.971196 5.416929 0.575198 +1200 198 1.307354 0.650286 
0.973495 5.395316 0.576369 +1200 199 1.307354 0.652146 0.976496 5.368420 0.577966 +1200 200 1.307354 0.656350 0.983288 5.309720 0.581458 +1200 201 1.307354 0.658682 0.987060 5.272700 0.583367 +1200 202 1.307354 0.660991 0.990798 5.235939 0.585225 +1200 203 1.307354 0.661942 0.992338 5.220262 0.585947 +1200 204 1.307354 0.663306 0.994549 5.204633 0.587119 +1200 205 1.307354 0.666036 0.998976 5.158024 0.589218 +1200 206 1.307354 0.668387 1.002793 5.122062 0.590951 +1200 207 1.307354 0.671563 1.007954 5.076193 0.593526 +1200 208 1.307354 0.675222 1.013908 5.025709 0.596530 +1200 209 1.307354 0.679865 1.021475 4.955874 0.600150 +1200 210 1.307354 0.681256 1.023744 4.936100 0.601267 +1200 211 1.307354 0.683944 1.028133 4.891897 0.603207 +1200 212 1.307354 0.686673 1.032595 4.857790 0.605417 +1200 213 1.307354 0.688017 1.034794 4.833574 0.606412 +1200 214 1.307354 0.689819 1.037743 4.809478 0.607855 +1200 215 1.307354 0.692569 1.042249 4.771175 0.610056 +1200 216 1.307354 0.695747 1.047462 4.737910 0.612818 +1200 217 1.307354 0.700279 1.054907 4.681423 0.616664 +1200 218 1.307354 0.702974 1.059339 4.653432 0.619016 +1200 219 1.307354 0.706110 1.064506 4.611760 0.621487 +1200 220 1.307354 0.708350 1.068199 4.584186 0.623194 +1200 221 1.307354 0.709210 1.069617 4.565895 0.623713 +1200 222 1.307354 0.713716 1.077060 4.515970 0.627486 +1200 223 1.307354 0.715505 1.080019 4.493458 0.628863 +1200 224 1.307354 0.715960 1.080771 4.484485 0.629183 +1200 225 1.307354 0.719561 1.086736 4.439885 0.632004 +1200 226 1.307354 0.722261 1.091212 4.408930 0.634234 +1200 227 1.307354 0.724529 1.094977 4.391338 0.636277 +1200 228 1.307354 0.726309 1.097934 4.373817 0.637839 +1200 229 1.307354 0.727577 1.100042 4.360722 0.639024 +1200 230 1.307354 0.729828 1.103786 4.334649 0.640783 +1200 231 1.307354 0.730721 1.105272 4.321671 0.641399 +1200 232 1.307354 0.732867 1.108848 4.300127 0.643273 +1200 233 1.307354 0.735479 1.113202 4.274416 0.645612 +1200 234 1.307354 0.738994 1.119070 4.236138 
0.648598 +1200 235 1.307354 0.741209 1.122773 4.219236 0.650658 +1200 236 1.307354 0.743896 1.127271 4.194009 0.652946 +1200 237 1.307354 0.746115 1.130986 4.168933 0.654854 +1200 238 1.307354 0.749148 1.136073 4.139867 0.657399 +1200 239 1.307354 0.751750 1.140443 4.106896 0.659362 +1200 240 1.307354 0.753472 1.143336 4.094600 0.660811 +1200 241 1.307354 0.757409 1.149962 4.061990 0.664543 +1200 242 1.307354 0.759630 1.153704 4.037703 0.666210 +1200 243 1.307354 0.764368 1.161701 3.985579 0.669867 +1200 244 1.307354 0.767456 1.166921 3.949887 0.672319 +1200 245 1.307354 0.768796 1.169190 3.934127 0.673252 +1200 246 1.307354 0.770530 1.172125 3.914515 0.674493 +1200 247 1.307354 0.774843 1.179439 3.863981 0.677619 +1200 248 1.307354 0.775657 1.180822 3.860121 0.678399 +1200 249 1.307354 0.778693 1.185981 3.829379 0.680779 +1200 250 1.307354 0.780402 1.188887 3.806482 0.681782 +1200 251 1.307354 0.781660 1.191029 3.795086 0.682823 +1200 252 1.307354 0.784162 1.195292 3.768626 0.684916 +1200 253 1.307354 0.785896 1.198249 3.753589 0.686213 +1200 254 1.307354 0.788001 1.201842 3.731146 0.687909 +1200 255 1.307354 0.790079 1.205394 3.708838 0.689482 +1200 256 1.307354 0.791735 1.208225 3.697733 0.690898 +1200 257 1.307354 0.795073 1.213940 3.668284 0.693767 +1200 258 1.307354 0.796810 1.216918 3.646351 0.694946 +1200 259 1.307354 0.798480 1.219783 3.628174 0.696074 +1200 260 1.307354 0.798918 1.220534 3.628174 0.696503 +1200 261 1.307354 0.801449 1.224883 3.602878 0.698366 +1200 262 1.307354 0.802694 1.227023 3.588503 0.699222 +1200 263 1.307354 0.803972 1.229221 3.577759 0.700310 +1200 264 1.307354 0.806548 1.233656 3.559923 0.702697 +1200 265 1.307354 0.809530 1.238797 3.535103 0.705289 +1200 266 1.307354 0.811677 1.242504 3.510456 0.706726 +1200 267 1.307354 0.815079 1.248384 3.479019 0.709365 +1200 268 1.307354 0.817996 1.253435 3.454763 0.711711 +1200 269 1.307354 0.819264 1.255632 3.444420 0.712651 +1200 270 1.307354 0.820959 1.258572 3.427249 0.713997 +1200 271 
1.307354 0.821378 1.259299 3.427249 0.714433 +1200 272 1.307354 0.823425 1.262853 3.406758 0.715765 +1200 273 1.307354 0.826321 1.267887 3.383005 0.718192 +1200 274 1.307354 0.827153 1.269335 3.372877 0.718780 +1200 275 1.307354 0.830393 1.274978 3.349361 0.721602 +1200 276 1.307354 0.832485 1.278627 3.332664 0.723331 +1200 277 1.307354 0.835357 1.283643 3.302819 0.725231 +1200 278 1.307354 0.838178 1.288575 3.279792 0.727391 +1200 279 1.307354 0.841023 1.293559 3.256925 0.729516 +1200 280 1.307354 0.842253 1.295715 3.247174 0.730641 +1200 281 1.307354 0.844301 1.299309 3.224534 0.731855 +1200 282 1.307354 0.846352 1.302911 3.208460 0.733437 +1200 283 1.307354 0.848062 1.305917 3.195658 0.734741 +1200 284 1.307354 0.848865 1.307329 3.189276 0.735617 +1200 285 1.307354 0.849693 1.308787 3.182907 0.736170 +1200 286 1.307354 0.850899 1.310911 3.170207 0.736835 +1200 287 1.307354 0.855416 1.318878 3.135543 0.740452 +1200 288 1.307354 0.857854 1.323186 3.116796 0.742564 +1200 289 1.307354 0.859060 1.325319 3.107464 0.743368 +1200 290 1.307354 0.860320 1.327547 3.098160 0.744285 +1200 291 1.307354 0.861908 1.330360 3.082716 0.745434 +1200 292 1.307354 0.864369 1.334723 3.061223 0.747022 +1200 293 1.307354 0.866327 1.338200 3.045963 0.748435 +1200 294 1.307354 0.867131 1.339626 3.039880 0.748924 +1200 295 1.307354 0.868328 1.341754 3.030778 0.750072 +1200 296 1.307354 0.871581 1.347543 3.009647 0.752777 +1200 297 1.307354 0.874007 1.351866 2.991653 0.754875 +1200 298 1.307354 0.874789 1.353261 2.985678 0.755447 +1200 299 1.307354 0.876006 1.355432 2.976739 0.756409 +1200 300 1.307354 0.876437 1.356202 2.973765 0.756682 +1200 301 1.307354 0.877634 1.358339 2.964862 0.757576 +1200 302 1.307354 0.880064 1.362684 2.947135 0.759487 +1200 303 1.307354 0.881670 1.365560 2.938311 0.760976 +1200 304 1.307354 0.883731 1.369251 2.920743 0.762441 +1200 305 1.307354 0.884509 1.370646 2.914910 0.763105 +1200 306 1.307354 0.885727 1.372831 2.906183 0.763865 +1200 307 1.307354 0.887345 
1.375737 2.891695 0.764918 +1200 308 1.307354 0.889336 1.379315 2.877280 0.766339 +1200 309 1.307354 0.890542 1.381486 2.865800 0.766977 +1200 310 1.307354 0.891681 1.383535 2.857219 0.767700 +1200 311 1.307354 0.892886 1.385706 2.848665 0.768620 +1200 312 1.307354 0.894043 1.387792 2.840136 0.769532 +1200 313 1.307354 0.894825 1.389202 2.831633 0.769974 +1200 314 1.307354 0.897584 1.394182 2.814702 0.772241 +1200 315 1.307354 0.898767 1.396320 2.806275 0.773045 +1200 316 1.307354 0.902729 1.403490 2.775590 0.775738 +1200 317 1.307354 0.905506 1.408525 2.756239 0.777877 +1200 318 1.307354 0.907867 1.412813 2.739759 0.779685 +1200 319 1.307354 0.910220 1.417092 2.723378 0.781336 +1200 320 1.307354 0.910975 1.418465 2.717939 0.781819 +1200 321 1.307354 0.912539 1.421313 2.707094 0.783134 +1200 322 1.307354 0.914887 1.425595 2.690909 0.784676 +1200 323 1.307354 0.917592 1.430534 2.672147 0.786695 +1200 324 1.307354 0.920313 1.435509 2.656170 0.789022 +1200 325 1.307354 0.923788 1.441876 2.632384 0.791590 +1200 326 1.307354 0.924938 1.443986 2.624503 0.792481 +1200 327 1.307354 0.926098 1.446116 2.619261 0.793478 +1200 328 1.307354 0.926883 1.447557 2.614031 0.793982 +1200 329 1.307354 0.929569 1.452497 2.593212 0.795730 +1200 330 1.307354 0.930369 1.453970 2.588034 0.796375 +1200 331 1.307354 0.932348 1.457615 2.575132 0.797774 +1200 332 1.307354 0.934260 1.461142 2.562295 0.799139 +1200 333 1.307354 0.937767 1.467619 2.539349 0.801742 +1200 334 1.307354 0.941615 1.474744 2.516609 0.804634 +1200 335 1.307354 0.943171 1.477629 2.506568 0.805765 +1200 336 1.307354 0.945911 1.482718 2.489092 0.807853 +1200 337 1.307354 0.947039 1.484814 2.481639 0.808716 +1200 338 1.307354 0.948168 1.486914 2.474209 0.809556 +1200 339 1.307354 0.949296 1.489014 2.469268 0.810427 +1200 340 1.307354 0.950794 1.491805 2.456959 0.811379 +1200 341 1.307354 0.951521 1.493161 2.452052 0.811863 +1200 342 1.307354 0.953359 1.496590 2.442269 0.813321 +1200 343 1.307354 0.956395 1.502264 2.425241 
0.815742 +1200 344 1.307354 0.959808 1.508653 2.403522 0.818019 +1200 345 1.307354 0.961685 1.512175 2.393932 0.819739 +1200 346 1.307354 0.963583 1.515738 2.381999 0.821141 +1200 347 1.307354 0.966201 1.520661 2.365391 0.822887 +1200 348 1.307354 0.966958 1.522085 2.360667 0.823389 +1200 349 1.307354 0.969531 1.526935 2.346553 0.825296 +1200 350 1.307354 0.971428 1.530514 2.334855 0.826729 +1200 351 1.307354 0.974389 1.536109 2.318576 0.828863 +1200 352 1.307354 0.976281 1.539691 2.307018 0.830189 +1200 353 1.307354 0.977400 1.541812 2.300111 0.831071 +1200 354 1.307354 0.980482 1.547659 2.284074 0.833544 +1200 355 1.307354 0.983074 1.552585 2.270418 0.835273 +1200 356 1.307354 0.984912 1.556084 2.259100 0.836732 +1200 357 1.307354 0.986404 1.558927 2.252336 0.837829 +1200 358 1.307354 0.987915 1.561809 2.243349 0.838920 +1200 359 1.307354 0.989767 1.565345 2.234398 0.840492 +1200 360 1.307354 0.991198 1.568080 2.225483 0.841377 +1200 361 1.307354 0.993448 1.572386 2.212176 0.843183 +1200 362 1.307354 0.994933 1.575232 2.207759 0.844707 +1200 363 1.307354 0.998299 1.581693 2.190176 0.847633 +1200 364 1.307354 1.000116 1.585185 2.181437 0.849028 +1200 365 1.307354 1.000846 1.586591 2.177081 0.849571 +1200 366 1.307354 1.002319 1.589427 2.168394 0.850624 +1200 367 1.307354 1.003408 1.591525 2.161902 0.851138 +1200 368 1.307354 1.003761 1.592206 2.159742 0.851363 +1200 369 1.307354 1.005545 1.595650 2.148976 0.852411 +1200 370 1.307354 1.007371 1.599177 2.140401 0.854099 +1200 371 1.307354 1.008805 1.601949 2.133993 0.855185 +1200 372 1.307354 1.010709 1.605636 2.123355 0.856620 +1200 373 1.307354 1.011441 1.607055 2.119115 0.857085 +1200 374 1.307354 1.013633 1.611307 2.106444 0.858542 +1200 375 1.307354 1.015770 1.615459 2.095944 0.860147 +1200 376 1.307354 1.017229 1.618298 2.087581 0.861006 +1200 377 1.307354 1.017976 1.619752 2.083412 0.861626 +1200 378 1.307354 1.019440 1.622604 2.077174 0.862697 +1200 379 1.307354 1.022039 1.627673 2.062692 0.864383 +1200 380 
1.307354 1.023519 1.630565 2.054462 0.865309 +1200 381 1.307354 1.023524 1.630575 2.054462 0.865360 +1200 382 1.307354 1.024621 1.632720 2.050359 0.866487 +1200 383 1.307354 1.026410 1.636222 2.040138 0.867603 +1200 384 1.307354 1.028216 1.639761 2.029968 0.868665 +1200 385 1.307354 1.030336 1.643920 2.017830 0.870067 +1200 386 1.307354 1.031078 1.645378 2.013801 0.870507 +1200 387 1.307354 1.033286 1.649721 2.003762 0.872262 +1200 388 1.307354 1.035136 1.653364 1.993773 0.873404 +1200 389 1.307354 1.036962 1.656964 1.983834 0.874400 +1200 390 1.307354 1.038762 1.660519 1.973944 0.875508 +1200 391 1.307354 1.041638 1.666206 1.960182 0.877506 +1200 392 1.307354 1.043439 1.669773 1.950410 0.878869 +1200 393 1.307354 1.045558 1.673976 1.940688 0.880389 +1200 394 1.307354 1.047330 1.677497 1.932944 0.881634 +1200 395 1.307354 1.048801 1.680422 1.927157 0.882949 +1200 396 1.307354 1.049444 1.681703 1.923308 0.883309 +1200 397 1.307354 1.050890 1.684582 1.917550 0.884597 +1200 398 1.307354 1.052679 1.688150 1.907991 0.885613 +1200 399 1.307354 1.054438 1.691663 1.900378 0.886907 +1200 400 1.307354 1.056515 1.695816 1.889015 0.888294 +1200 401 1.307354 1.057565 1.697918 1.883360 0.888747 +1200 402 1.307354 1.059010 1.700813 1.875845 0.889574 +1200 403 1.307354 1.061830 1.706473 1.862767 0.891349 +1200 404 1.307354 1.064994 1.712836 1.847931 0.893606 +1200 405 1.307354 1.066040 1.714943 1.842399 0.894190 +1200 406 1.307354 1.066369 1.715607 1.842399 0.894541 +1200 407 1.307354 1.067800 1.718493 1.835047 0.895654 +1200 408 1.307354 1.069941 1.722816 1.825900 0.897022 +1200 409 1.307354 1.072021 1.727024 1.814982 0.898386 +1200 410 1.307354 1.073067 1.729143 1.809548 0.898994 +1200 411 1.307354 1.075850 1.734786 1.798729 0.901521 +1200 412 1.307354 1.078633 1.740442 1.786188 0.903375 +1200 413 1.307354 1.081711 1.746711 1.773735 0.905926 +1200 414 1.307354 1.083089 1.749524 1.768424 0.906919 +1200 415 1.307354 1.085182 1.753800 1.757851 0.908156 +1200 416 1.307354 1.085486 
1.754421 1.756094 0.908338 +1200 417 1.307354 1.085496 1.754442 1.756094 0.908339 +1200 418 1.307354 1.086896 1.757306 1.750837 0.909382 +1200 419 1.307354 1.087914 1.759390 1.747340 0.910209 +1200 420 1.307354 1.089916 1.763496 1.736893 0.911397 +1200 421 1.307354 1.090963 1.765646 1.733424 0.912162 +1200 422 1.307354 1.092699 1.769214 1.726508 0.913466 +1200 423 1.307354 1.094086 1.772067 1.719619 0.914362 +1200 424 1.307354 1.094775 1.773485 1.716185 0.914880 +1200 425 1.307354 1.096470 1.776979 1.709337 0.915908 +1200 426 1.307354 1.099254 1.782729 1.697420 0.917730 +1200 427 1.307354 1.099953 1.784174 1.694030 0.918136 +1200 428 1.307354 1.100623 1.785559 1.690647 0.918524 +1200 429 1.307354 1.101960 1.788328 1.685585 0.919446 +1200 430 1.307354 1.104060 1.792682 1.675507 0.921025 +1200 431 1.307354 1.106036 1.796786 1.667154 0.922311 +1200 432 1.307354 1.108105 1.801089 1.658844 0.923746 +1200 433 1.307354 1.109116 1.803195 1.653877 0.924217 +1200 434 1.307354 1.111415 1.807989 1.643988 0.925749 +1200 435 1.307354 1.112136 1.809494 1.642346 0.926302 +1200 436 1.307354 1.113493 1.812330 1.635793 0.927049 +1200 437 1.307354 1.114532 1.814502 1.632526 0.927783 +1200 438 1.307354 1.115862 1.817288 1.626013 0.928801 +1200 439 1.307354 1.118563 1.822951 1.614676 0.930277 +1200 440 1.307354 1.120648 1.827333 1.606627 0.931404 +1200 441 1.307354 1.123002 1.832287 1.597021 0.933050 +1200 442 1.307354 1.124039 1.834475 1.592239 0.933783 +1200 443 1.307354 1.125442 1.837433 1.587472 0.934838 +1200 444 1.307354 1.127519 1.841822 1.579558 0.936145 +1200 445 1.307354 1.129882 1.846825 1.568545 0.937589 +1200 446 1.307354 1.131221 1.849665 1.563849 0.938629 +1200 447 1.307354 1.133270 1.854014 1.556053 0.940032 +1200 448 1.307354 1.134617 1.856878 1.551395 0.940760 +1200 449 1.307354 1.137250 1.862486 1.540578 0.942572 +1200 450 1.307354 1.139922 1.868190 1.531367 0.944326 +1200 451 1.307354 1.141927 1.872476 1.522211 0.945585 +1200 452 1.307354 1.142283 1.873239 1.520690 
0.945788 +1200 453 1.307354 1.143566 1.875988 1.516137 0.946654 +1200 454 1.307354 1.146242 1.881732 1.507072 0.948826 +1200 455 1.307354 1.146882 1.883107 1.504062 0.949147 +1200 456 1.307354 1.147585 1.884618 1.501059 0.949513 +1200 457 1.307354 1.148239 1.886025 1.499559 0.950028 +1200 458 1.307354 1.150881 1.891717 1.489104 0.951482 +1200 459 1.307354 1.153544 1.897467 1.478722 0.953385 +1200 460 1.307354 1.154826 1.900240 1.474295 0.954404 +1200 461 1.307354 1.158464 1.908126 1.461092 0.956506 +1200 462 1.307354 1.159120 1.909550 1.458174 0.956598 +1200 463 1.307354 1.160424 1.912385 1.452356 0.957298 +1200 464 1.307354 1.161376 1.914457 1.449456 0.957777 +1200 465 1.307354 1.161691 1.915143 1.448008 0.957935 +1200 466 1.307354 1.163007 1.918009 1.442230 0.958653 +1200 467 1.307354 1.164324 1.920883 1.437912 0.959664 +1200 468 1.307354 1.166236 1.925057 1.430744 0.960890 +1200 469 1.307354 1.167882 1.928659 1.425035 0.961756 +1200 470 1.307354 1.168183 1.929318 1.423612 0.961787 +1200 471 1.307354 1.169141 1.931416 1.419350 0.962342 +1200 472 1.307354 1.170468 1.934326 1.415100 0.963194 +1200 473 1.307354 1.172437 1.938649 1.408046 0.964703 +1200 474 1.307354 1.172804 1.939457 1.406639 0.964767 +1200 475 1.307354 1.174448 1.943074 1.401027 0.966193 +1200 476 1.307354 1.176396 1.947368 1.394042 0.967379 +1200 477 1.307354 1.178336 1.951652 1.387093 0.968675 +1200 478 1.307354 1.178662 1.952371 1.385707 0.968814 +1200 479 1.307354 1.179972 1.955270 1.381558 0.969626 +1200 480 1.307354 1.180614 1.956690 1.378799 0.969951 +1200 481 1.307354 1.182188 1.960177 1.373298 0.970943 +1200 482 1.307354 1.183155 1.962324 1.370556 0.971858 +1200 483 1.307354 1.186074 1.968810 1.362361 0.974155 +1200 484 1.307354 1.186717 1.970241 1.359640 0.974469 +1200 485 1.307354 1.187067 1.971021 1.358282 0.974668 +1200 486 1.307354 1.189332 1.976071 1.350161 0.976260 +1200 487 1.307354 1.190591 1.978883 1.346118 0.977088 +1200 488 1.307354 1.191547 1.981021 1.343430 0.977700 +1200 489 
1.307354 1.193170 1.984653 1.338070 0.978533 +1200 490 1.307354 1.193836 1.986146 1.335398 0.978781 +1200 491 1.307354 1.194140 1.986826 1.334064 0.978976 +1200 492 1.307354 1.196368 1.991827 1.326087 0.980463 +1200 493 1.307354 1.198298 1.996166 1.319477 0.981469 +1200 494 1.307354 1.199890 1.999753 1.314212 0.982378 +1200 495 1.307354 1.200850 2.001918 1.311587 0.982900 +1200 496 1.307354 1.202156 2.004864 1.306354 0.983594 +1200 497 1.307354 1.204739 2.010707 1.297246 0.985266 +1200 498 1.307354 1.205363 2.012120 1.295950 0.985756 +1200 499 1.307354 1.206285 2.014210 1.293362 0.986510 +1200 500 1.307354 1.207218 2.016327 1.289490 0.987136 +1200 501 1.307354 1.208806 2.019933 1.284345 0.988053 +1200 502 1.307354 1.209124 2.020657 1.284345 0.988224 +1200 503 1.307354 1.210679 2.024194 1.279220 0.989622 +1200 504 1.307354 1.211607 2.026309 1.275390 0.990168 +1200 505 1.307354 1.212849 2.029141 1.271572 0.990918 +1200 506 1.307354 1.214023 2.031821 1.267765 0.991524 +1200 507 1.307354 1.215307 2.034756 1.263969 0.992461 +1200 508 1.307354 1.216877 2.038351 1.258926 0.993399 +1200 509 1.307354 1.219366 2.044058 1.251398 0.995111 +1200 510 1.307354 1.220936 2.047667 1.246405 0.995913 +1200 511 1.307354 1.221261 2.048413 1.245160 0.995931 +1200 512 1.307354 1.224109 2.054976 1.235244 0.997435 +1200 513 1.307354 1.225369 2.057883 1.231545 0.998329 +1200 514 1.307354 1.228153 2.064322 1.222959 0.999898 +1200 515 1.307354 1.228849 2.063618 1.222959 0.999761 +1200 516 1.307354 1.229094 2.066504 1.219297 1.000368 +1200 517 1.307354 1.230664 2.070146 1.214432 1.001303 +1200 518 1.307354 1.232539 2.074503 1.209587 1.002266 +1200 519 1.307354 1.233739 2.077295 1.204760 1.002839 +1200 520 1.307354 1.235330 2.081004 1.199953 1.003932 +1200 521 1.307354 1.236597 2.083962 1.196361 1.004704 +1200 522 1.307354 1.237830 2.086842 1.192779 1.005604 +1200 523 1.307354 1.238759 2.089016 1.190397 1.006480 +1200 524 1.307354 1.239678 2.091169 1.188019 1.007170 +1200 525 1.307354 1.242156 
2.096981 1.180916 1.008496 +1200 526 1.307354 1.243084 2.099161 1.177381 1.008972 +1200 527 1.307354 1.244339 2.102114 1.173855 1.009623 +1200 528 1.307354 1.245234 2.104222 1.170341 1.010113 +1200 529 1.307354 1.247092 2.108602 1.165671 1.011002 +1200 530 1.307354 1.248607 2.112182 1.161020 1.011754 +1200 531 1.307354 1.250475 2.116603 1.155232 1.013103 +1200 532 1.307354 1.251730 2.119576 1.151774 1.013862 +1200 533 1.307354 1.253576 2.123959 1.146032 1.014955 +1200 534 1.307354 1.256028 2.129793 1.139180 1.016478 +1200 535 1.307354 1.256351 2.130561 1.138042 1.016598 +1200 536 1.307354 1.257285 2.132788 1.135769 1.017043 +1200 537 1.307354 1.258187 2.134942 1.132369 1.017507 +1200 538 1.307354 1.259147 2.137234 1.130107 1.018239 +1200 539 1.307354 1.259150 2.137243 1.130107 1.018252 +1200 540 1.307354 1.261621 2.143155 1.123350 1.019603 +1200 541 1.307354 1.261889 2.145236 1.121107 1.020293 +1200 542 1.307354 1.262163 2.144455 1.121107 1.019989 +1200 543 1.307354 1.263712 2.148170 1.117750 1.020892 +1200 544 1.307354 1.265214 2.151782 1.113291 1.021743 +1200 545 1.307354 1.265824 2.153250 1.111067 1.022177 +1200 546 1.307354 1.266090 2.153888 1.111067 1.022440 +1200 547 1.307354 1.266090 2.153888 1.111067 1.022432 +1200 548 1.307354 1.266681 2.155314 1.108849 1.022697 +1200 549 1.307354 1.267908 2.158270 1.105529 1.023289 +1200 550 1.307354 1.269455 2.162005 1.101118 1.024159 +1200 551 1.307354 1.271238 2.166317 1.095628 1.025305 +1200 552 1.307354 1.273047 2.170699 1.091257 1.026336 +1200 553 1.307354 1.274868 2.175122 1.085817 1.027380 +1200 554 1.307354 1.276301 2.178606 1.081484 1.028380 +1200 555 1.307354 1.276629 2.179405 1.081484 1.028669 +1200 556 1.307354 1.278084 2.182951 1.077169 1.029387 +1200 557 1.307354 1.278966 2.185102 1.075018 1.029959 +1200 558 1.307354 1.279568 2.186572 1.072871 1.030532 +1200 559 1.307354 1.280473 2.188784 1.070729 1.031119 +1200 560 1.307354 1.281652 2.191667 1.067523 1.031673 +1200 561 1.307354 1.283184 2.195420 1.063264 
1.032642 +1200 562 1.307354 1.284371 2.198334 1.060080 1.033211 +1200 563 1.307354 1.284956 2.199771 1.057963 1.033470 +1200 564 1.307354 1.286743 2.204166 1.053742 1.034542 +1200 565 1.307354 1.287340 2.205634 1.052689 1.035059 +1200 566 1.307354 1.288210 2.207778 1.049537 1.035486 +1200 567 1.307354 1.290267 2.212856 1.044306 1.036652 +1200 568 1.307354 1.291432 2.215736 1.041179 1.037493 +1200 569 1.307354 1.292621 2.218681 1.038062 1.038273 +1200 570 1.307354 1.294722 2.223893 1.032887 1.039413 +1200 571 1.307354 1.296765 2.228972 1.026711 1.040790 +1200 572 1.307354 1.296775 2.228997 1.026711 1.040785 +1200 573 1.307354 1.297111 2.229833 1.026711 1.040948 +1200 574 1.307354 1.299414 2.235577 1.020572 1.042188 +1200 575 1.307354 1.300287 2.237755 1.017517 1.042577 +1200 576 1.307354 1.300884 2.239249 1.016500 1.042841 +1200 577 1.307354 1.301449 2.240662 1.014470 1.043019 +1200 578 1.307354 1.302089 2.242264 1.013457 1.043320 +1200 579 1.307354 1.303257 2.245191 1.010423 1.044015 +1200 580 1.307354 1.305066 2.249730 1.005386 1.045319 +1200 581 1.307354 1.305077 2.249758 1.005386 1.045322 +1200 582 1.307354 1.306550 2.253463 1.001374 1.046128 +1200 583 1.307354 1.308602 2.258634 0.996382 1.047158 +1200 584 1.307354 1.308894 2.259371 0.995387 1.047306 +1200 585 1.307354 1.309493 2.260884 0.994392 1.047695 +1200 586 1.307354 1.310057 2.262306 0.992407 1.047976 +1200 587 1.307354 1.310348 2.263042 0.992407 1.048107 +1200 588 1.307354 1.312101 2.267479 0.987459 1.049092 +1200 589 1.307354 1.313271 2.270443 0.984503 1.049752 +1200 590 1.307354 1.314688 2.274039 0.981555 1.050534 +1200 591 1.307354 1.315844 2.276978 0.978617 1.051349 +1200 592 1.307354 1.316983 2.279879 0.975687 1.052011 +1200 593 1.307354 1.317271 2.280613 0.974712 1.052267 +1200 594 1.307354 1.319927 2.287391 0.967916 1.054217 +1200 595 1.307354 1.320214 2.288124 0.966949 1.054348 +1200 596 1.307354 1.320820 2.289674 0.965983 1.054754 +1200 597 1.307354 1.321674 2.291859 0.964054 1.055405 +1200 598 
1.307354 1.323086 2.295480 0.960207 1.056040 +1200 599 1.307354 1.324509 2.299133 0.956376 1.056830 +1200 600 1.307354 1.324485 2.299070 0.956376 1.056841 +1200 601 1.307354 1.324764 2.299788 0.956376 1.056987 +1200 602 1.307354 1.325629 2.302012 0.953513 1.057376 +1200 603 1.307354 1.326167 2.303398 0.952560 1.057740 +1200 604 1.307354 1.327032 2.305626 0.950658 1.058390 +1200 605 1.307354 1.330474 2.314516 0.942145 1.060526 +1200 606 1.307354 1.331332 2.316736 0.940263 1.061037 +1200 607 1.307354 1.331599 2.317430 0.939324 1.061309 +1200 608 1.307354 1.333043 2.321173 0.935576 1.062276 +1200 609 1.307354 1.333595 2.322608 0.933708 1.062695 +1200 610 1.307354 1.334748 2.325604 0.931843 1.063407 +1200 611 1.307354 1.337307 2.332272 0.925346 1.064706 +1200 612 1.307354 1.338447 2.335249 0.922576 1.065752 +1200 613 1.307354 1.338998 2.336689 0.920733 1.066134 +1200 614 1.307354 1.340438 2.340457 0.917977 1.066894 +1200 615 1.307354 1.341026 2.341997 0.916143 1.067161 +1200 616 1.307354 1.342138 2.344915 0.913400 1.067710 +1200 617 1.307354 1.342995 2.347164 0.911576 1.068230 +1200 618 1.307354 1.344423 2.350920 0.908847 1.068822 +1200 619 1.307354 1.345842 2.354659 0.905221 1.069460 +1200 620 1.307354 1.347848 2.359954 0.900708 1.070709 +1200 621 1.307354 1.348371 2.361336 0.898909 1.071057 +1200 622 1.307354 1.350351 2.366581 0.894428 1.072185 +1200 623 1.307354 1.352065 2.371128 0.890860 1.073278 +1200 624 1.307354 1.353187 2.374112 0.888192 1.073937 +1200 625 1.307354 1.353751 2.375613 0.887305 1.074187 +1200 626 1.307354 1.355952 2.381481 0.882000 1.075422 +1200 627 1.307354 1.357942 2.386801 0.877603 1.076331 +1200 628 1.307354 1.358762 2.388998 0.875850 1.076695 +1200 629 1.307354 1.359075 2.389838 0.874975 1.076846 +1200 630 1.307354 1.360206 2.392871 0.872356 1.077476 +1200 631 1.307354 1.360745 2.394318 0.871484 1.077722 +1200 632 1.307354 1.361036 2.395099 0.870614 1.077869 +1200 633 1.307354 1.362732 2.399661 0.866274 1.078783 +1200 634 1.307354 1.363870 
2.402728 0.863680 1.079510 +1200 635 1.307354 1.364740 2.405075 0.861955 1.079954 +1200 636 1.307354 1.365863 2.408109 0.859375 1.080680 +1200 637 1.307354 1.365844 2.408059 0.859375 1.080343 +1200 638 1.307354 1.366719 2.410425 0.857658 1.080873 +1200 639 1.307354 1.367843 2.413471 0.855090 1.081742 +1200 640 1.307354 1.369733 2.418598 0.850828 1.082675 +1200 641 1.307354 1.370350 2.420275 0.849129 1.083028 +1200 642 1.307354 1.372015 2.424808 0.845741 1.083779 +1200 643 1.307354 1.373137 2.427868 0.843209 1.084288 +1200 644 1.307354 1.374790 2.432384 0.839844 1.085142 +1200 645 1.307354 1.375619 2.434655 0.838167 1.085660 +1200 646 1.307354 1.375905 2.435437 0.837330 1.085755 +1200 647 1.307354 1.376123 2.436034 0.836493 1.086009 +1200 648 1.307354 1.377778 2.440572 0.833155 1.087139 +1200 649 1.307354 1.377794 2.440617 0.833155 1.087012 +1200 650 1.307354 1.378642 2.442947 0.831492 1.087496 +1200 651 1.307354 1.380044 2.446805 0.828174 1.088219 +1200 652 1.307354 1.381437 2.450643 0.825694 1.088928 +1200 653 1.307354 1.381977 2.452134 0.824046 1.089140 +1200 654 1.307354 1.383073 2.455163 0.821578 1.089745 +1200 655 1.307354 1.383902 2.457457 0.819938 1.090369 +1200 656 1.307354 1.384451 2.458978 0.819119 1.090589 +1200 657 1.307354 1.384994 2.460481 0.817483 1.090910 +1200 658 1.307354 1.385033 2.460589 0.817483 1.091044 +1200 659 1.307354 1.386400 2.464383 0.815035 1.091843 +1200 660 1.307354 1.387210 2.466634 0.812595 1.092340 +1200 661 1.307354 1.387743 2.468114 0.811783 1.092774 +1200 662 1.307354 1.388575 2.470430 0.810162 1.093247 +1200 663 1.307354 1.390197 2.474954 0.806929 1.093974 +1200 664 1.307354 1.390456 2.475678 0.806123 1.094072 +1200 665 1.307354 1.392339 2.480941 0.802105 1.095053 +1200 666 1.307354 1.392584 2.481629 0.801304 1.095157 +1200 667 1.307354 1.392876 2.482445 0.801304 1.095287 +1200 668 1.307354 1.393415 2.483956 0.799703 1.095743 +1200 669 1.307354 1.392890 2.482486 0.799703 1.095520 +1200 670 1.307354 1.394510 2.487028 0.797309 
1.096480 +1200 671 1.307354 1.395293 2.489227 0.795717 1.097049 +1200 672 1.307354 1.398017 2.496898 0.790169 1.098499 +1200 673 1.307354 1.398568 2.498454 0.788591 1.098868 +1200 674 1.307354 1.398844 2.499234 0.787803 1.098981 +1200 675 1.307354 1.399145 2.500083 0.787803 1.099122 +1200 676 1.307354 1.400230 2.503151 0.785444 1.099677 +1200 677 1.307354 1.401564 2.506930 0.782311 1.100491 +1200 678 1.307354 1.402630 2.509954 0.780748 1.100932 +1200 679 1.307354 1.403430 2.512226 0.778411 1.101387 +1200 680 1.307354 1.403978 2.513784 0.777633 1.101738 +1200 681 1.307354 1.403727 2.513070 0.777633 1.101516 +1200 682 1.307354 1.404536 2.515371 0.776856 1.101957 +1200 683 1.307354 1.405081 2.516924 0.775305 1.102319 +1200 684 1.307354 1.405627 2.518479 0.774530 1.102550 +1200 685 1.307354 1.406160 2.520000 0.772984 1.102892 +1200 686 1.307354 1.408063 2.525435 0.769130 1.104064 +1200 687 1.307354 1.409650 2.529981 0.766061 1.104976 +1200 688 1.307354 1.410182 2.531506 0.764532 1.105323 +1200 689 1.307354 1.410182 2.531506 0.764532 1.105458 +1200 690 1.307354 1.410670 2.532909 0.763768 1.105656 +1200 691 1.307354 1.410671 2.532911 0.763768 1.105760 +1200 692 1.307354 1.411461 2.535181 0.762243 1.106322 +1200 693 1.307354 1.412277 2.537526 0.759960 1.106879 +1200 694 1.307354 1.412306 2.537609 0.759960 1.106883 +1200 695 1.307354 1.412061 2.536905 0.759960 1.106781 +1200 696 1.307354 1.413677 2.541560 0.757685 1.107447 +1200 697 1.307354 1.414481 2.543879 0.756172 1.108027 +1200 698 1.307354 1.416302 2.549142 0.752402 1.109048 +1200 699 1.307354 1.418160 2.554526 0.748652 1.109925 +1200 700 1.307354 1.418691 2.556065 0.747904 1.110148 +1200 701 1.307354 1.419476 2.558347 0.745664 1.110645 +1200 702 1.307354 1.420292 2.560723 0.744175 1.111000 +1200 703 1.307354 1.421607 2.564552 0.741947 1.111685 +1200 704 1.307354 1.422391 2.566838 0.740466 1.112117 +1200 705 1.307354 1.422904 2.568336 0.738987 1.112326 +1200 706 1.307354 1.424199 2.572123 0.736774 1.112971 +1200 707 
1.307354 1.426039 2.577516 0.733102 1.113957 +1200 708 1.307354 1.426568 2.579071 0.732369 1.114188 +1200 709 1.307354 1.428157 2.583742 0.728718 1.115335 +1200 710 1.307354 1.428697 2.585332 0.727990 1.115431 +1200 711 1.307354 1.429242 2.586938 0.726537 1.115886 +1200 712 1.307354 1.430849 2.591683 0.723638 1.116758 +1200 713 1.307354 1.431919 2.594847 0.721471 1.117438 +1200 714 1.307354 1.432449 2.596417 0.720750 1.117757 +1200 715 1.307354 1.432736 2.597267 0.720030 1.117889 +1200 716 1.307354 1.434037 2.601126 0.717157 1.118660 +1200 717 1.307354 1.435610 2.605801 0.714296 1.119287 +1200 718 1.307354 1.436087 2.607223 0.713582 1.119596 +1200 719 1.307354 1.436341 2.607980 0.712869 1.119824 +1200 720 1.307354 1.436342 2.607981 0.712869 1.119710 +1200 721 1.307354 1.436852 2.609503 0.712157 1.119816 +1200 722 1.307354 1.437102 2.610247 0.712157 1.119929 +1200 723 1.307354 1.437331 2.610931 0.711446 1.120026 +1200 724 1.307354 1.438637 2.614831 0.709316 1.120561 +1200 725 1.307354 1.439402 2.617118 0.707899 1.120883 +1200 726 1.307354 1.440451 2.620261 0.705780 1.121430 +1200 727 1.307354 1.441754 2.624170 0.703667 1.121936 +1200 728 1.307354 1.443084 2.628170 0.700859 1.122576 +1200 729 1.307354 1.443335 2.628925 0.700859 1.122562 +1200 730 1.307354 1.444096 2.631219 0.699459 1.122968 +1200 731 1.307354 1.445648 2.635902 0.696669 1.123679 +1200 732 1.307354 1.446707 2.639102 0.694583 1.124339 +1200 733 1.307354 1.449297 2.646956 0.689740 1.125507 +1200 734 1.307354 1.449822 2.648553 0.689051 1.125732 +1200 735 1.307354 1.450360 2.650189 0.687675 1.125838 +1200 736 1.307354 1.450899 2.651828 0.686988 1.126141 +1200 737 1.307354 1.451639 2.654082 0.685616 1.126523 +1200 738 1.307354 1.452674 2.657242 0.683563 1.127148 +1200 739 1.307354 1.453976 2.661221 0.680836 1.127890 +1200 740 1.307354 1.454984 2.664308 0.679476 1.128293 +1200 741 1.307354 1.455494 2.665871 0.678797 1.128486 +1200 742 1.307354 1.455998 2.667418 0.677442 1.128825 +1200 743 1.307354 1.456016 
2.667473 0.677442 1.128813 +1200 744 1.307354 1.457280 2.671357 0.675414 1.129424 +1200 745 1.307354 1.458513 2.675152 0.673391 1.129886 +1200 746 1.307354 1.458491 2.675085 0.673391 1.130111 +1200 747 1.307354 1.459509 2.678225 0.671375 1.130511 +1200 748 1.307354 1.460573 2.681512 0.669365 1.130910 +1200 749 1.307354 1.460317 2.680720 0.669365 1.130690 +1200 750 1.307354 1.460594 2.681575 0.669365 1.130901 +1200 751 1.307354 1.460600 2.681594 0.669365 1.130889 +1200 752 1.307354 1.461390 2.684037 0.668028 1.131219 +1200 753 1.307354 1.461371 2.683980 0.668028 1.131278 +1200 754 1.307354 1.461602 2.684694 0.668028 1.131493 +1200 755 1.307354 1.462906 2.688736 0.665363 1.132092 +1200 756 1.307354 1.463184 2.689598 0.664698 1.132314 +1200 757 1.307354 1.465975 2.698282 0.660064 1.133634 +1200 758 1.307354 1.466243 2.699119 0.659405 1.133734 +1200 759 1.307354 1.467245 2.702246 0.658088 1.134155 +1200 760 1.307354 1.467471 2.702951 0.657430 1.134366 +1200 761 1.307354 1.468433 2.705960 0.655462 1.134848 +1200 762 1.307354 1.468669 2.706699 0.654807 1.135156 +1200 763 1.307354 1.468695 2.706781 0.654807 1.135174 +1200 764 1.307354 1.468938 2.707543 0.654807 1.135266 +1200 765 1.307354 1.469155 2.708222 0.654153 1.135368 +1200 766 1.307354 1.469670 2.709835 0.653499 1.135668 +1200 767 1.307354 1.470144 2.711323 0.652194 1.135866 +1200 768 1.307354 1.472134 2.717580 0.648943 1.136893 +1200 769 1.307354 1.473648 2.722352 0.646354 1.137790 +1200 770 1.307354 1.474099 2.723776 0.645063 1.138167 +1200 771 1.307354 1.475387 2.727850 0.643132 1.138766 +1200 772 1.307354 1.476117 2.730161 0.641847 1.139176 +1200 773 1.307354 1.477353 2.734082 0.639926 1.139667 +1200 774 1.307354 1.477858 2.735685 0.638648 1.139976 +1200 775 1.307354 1.478343 2.737227 0.638010 1.140157 +1200 776 1.307354 1.478861 2.738874 0.636736 1.140564 +1200 777 1.307354 1.479108 2.739660 0.636100 1.140858 +1200 778 1.307354 1.478865 2.738886 0.636100 1.140771 +1200 779 1.307354 1.479396 2.740580 0.635464 
1.141131 +1200 780 1.307354 1.479652 2.741394 0.634829 1.141247 +1200 781 1.307354 1.479643 2.741365 0.634829 1.141374 +1200 782 1.307354 1.479894 2.742166 0.634195 1.141603 +1200 783 1.307354 1.480638 2.744538 0.632929 1.141921 +1200 784 1.307354 1.481876 2.748494 0.631034 1.142732 +1200 785 1.307354 1.482611 2.750848 0.629144 1.143246 +1200 786 1.307354 1.483611 2.754053 0.627261 1.143875 +1200 787 1.307354 1.484617 2.757284 0.626008 1.144350 +1200 788 1.307354 1.486076 2.761978 0.623510 1.145252 +1200 789 1.307354 1.487065 2.765165 0.621643 1.145851 +1200 790 1.307354 1.488030 2.768280 0.619782 1.146428 +1200 791 1.307354 1.489495 2.773021 0.616693 1.147330 +1200 792 1.307354 1.489478 2.772968 0.616693 1.147328 +1200 793 1.307354 1.490978 2.777832 0.614846 1.147891 +1200 794 1.307354 1.491970 2.781055 0.613005 1.148340 +1200 795 1.307354 1.492722 2.783503 0.611781 1.148739 +1200 796 1.307354 1.493013 2.784452 0.611170 1.148718 +1200 797 1.307354 1.494966 2.790825 0.608123 1.149552 +1200 798 1.307354 1.495662 2.793104 0.607516 1.149749 +1200 799 1.307354 1.497185 2.798095 0.604487 1.150630 +1200 800 1.307354 1.497643 2.799600 0.603883 1.150786 +1200 801 1.307354 1.498670 2.802973 0.602075 1.151395 +1200 802 1.307354 1.499663 2.806245 0.600273 1.151790 +1200 803 1.307354 1.500166 2.807901 0.599673 1.152088 +1200 804 1.307354 1.500463 2.808882 0.599074 1.152061 +1200 805 1.307354 1.501206 2.811335 0.597878 1.152430 +1200 806 1.307354 1.502384 2.815233 0.596088 1.153190 +1200 807 1.307354 1.503078 2.817530 0.594897 1.153551 +1200 808 1.307354 1.504049 2.820753 0.593116 1.154039 +1200 809 1.307354 1.504780 2.823182 0.591932 1.154434 +1200 810 1.307354 1.506173 2.827821 0.589570 1.155126 +1200 811 1.307354 1.506902 2.830251 0.588392 1.155348 +1200 812 1.307354 1.506855 2.830264 0.588392 1.155341 +1200 813 1.307354 1.506609 2.830273 0.588392 1.155246 +1200 814 1.307354 1.506883 2.830188 0.588392 1.155343 +1200 815 1.307354 1.508582 2.835864 0.586045 1.156139 +1200 816 
1.307354 1.509064 2.837480 0.584874 1.156406 +1200 817 1.307354 1.510029 2.840714 0.583706 1.156872 +1200 818 1.307354 1.510746 2.843121 0.581959 1.157464 +1200 819 1.307354 1.511729 2.846428 0.580216 1.157928 +1200 820 1.307354 1.511753 2.846507 0.580216 1.157928 +1200 821 1.307354 1.512752 2.849874 0.578479 1.158474 +1200 822 1.307354 1.514183 2.854706 0.576747 1.159098 +1200 823 1.307354 1.514687 2.856210 0.576171 1.159049 +1200 824 1.307354 1.514671 2.856355 0.576171 1.158856 +1200 825 1.307354 1.515643 2.859645 0.574446 1.159647 +1200 826 1.307354 1.516320 2.861941 0.573299 1.159883 +1200 827 1.307354 1.516080 2.861926 0.573299 1.159809 +1200 828 1.307354 1.516334 2.861989 0.573299 1.160016 +1200 829 1.307354 1.516804 2.863584 0.572154 1.160394 +1200 830 1.307354 1.517748 2.866792 0.571011 1.160821 +1200 831 1.307354 1.518478 2.869277 0.569871 1.161090 +1200 832 1.307354 1.518723 2.870114 0.569302 1.161191 +1200 833 1.307354 1.519205 2.871754 0.568733 1.161476 +1200 834 1.307354 1.519690 2.873409 0.567597 1.161770 +1200 835 1.307354 1.521135 2.878348 0.565333 1.162503 +1200 836 1.307354 1.521628 2.880037 0.564204 1.162759 +1200 837 1.307354 1.522096 2.881640 0.563640 1.162938 +1200 838 1.307354 1.523057 2.884938 0.561952 1.163476 +1200 839 1.307354 1.523789 2.887456 0.560830 1.163744 +1200 840 1.307354 1.524548 2.890067 0.559710 1.164358 +1200 841 1.307354 1.524318 2.889276 0.559710 1.164239 +1200 842 1.307354 1.525536 2.893471 0.558034 1.164853 +1200 843 1.307354 1.526053 2.895256 0.556920 1.165262 +1200 844 1.307354 1.527940 2.901782 0.554144 1.166091 +1200 845 1.307354 1.527465 2.900137 0.554144 1.165798 +1200 846 1.307354 1.527912 2.901688 0.554144 1.165898 +1200 847 1.307354 1.529064 2.905681 0.552485 1.166320 +1200 848 1.307354 1.529566 2.907427 0.551933 1.166492 +1200 849 1.307354 1.530044 2.909090 0.551381 1.166660 +1200 850 1.307354 1.530249 2.909803 0.550831 1.166841 +1200 851 1.307354 1.530453 2.910513 0.550280 1.167100 +1200 852 1.307354 1.530705 
2.911390 0.550280 1.167182 +1200 853 1.307354 1.530972 2.912318 0.549730 1.167287 +1200 854 1.307354 1.532665 2.918229 0.546990 1.168100 +1200 855 1.307354 1.533379 2.920726 0.545898 1.168448 +1200 856 1.307354 1.534832 2.925818 0.543720 1.169161 +1200 857 1.307354 1.535312 2.927502 0.542634 1.169414 +1200 858 1.307354 1.535789 2.929178 0.542092 1.169650 +1200 859 1.307354 1.537192 2.934120 0.539389 1.170551 +1200 860 1.307354 1.538577 2.939008 0.537237 1.171329 +1200 861 1.307354 1.539752 2.943163 0.535629 1.171752 +1200 862 1.307354 1.540473 2.945720 0.534559 1.172199 +1200 863 1.307354 1.540710 2.946562 0.534025 1.172295 +1200 864 1.307354 1.540925 2.947324 0.534025 1.172295 +1200 865 1.307354 1.541402 2.949018 0.532959 1.172542 +1200 866 1.307354 1.542315 2.952266 0.531894 1.172885 +1200 867 1.307354 1.542290 2.952178 0.531894 1.172959 +1200 868 1.307354 1.542760 2.953851 0.531363 1.173117 +1200 869 1.307354 1.543701 2.957207 0.529772 1.173633 +1200 870 1.307354 1.544173 2.958890 0.528714 1.173813 +1200 871 1.307354 1.544609 2.960450 0.528186 1.173958 +1200 872 1.307354 1.545107 2.962231 0.527658 1.174240 +1200 873 1.307354 1.545576 2.963910 0.526604 1.174501 +1200 874 1.307354 1.546782 2.968234 0.525028 1.175128 +1200 875 1.307354 1.547547 2.970983 0.523456 1.175557 +1200 876 1.307354 1.547994 2.972589 0.522933 1.175725 +1200 877 1.307354 1.549190 2.976900 0.520846 1.176450 +1200 878 1.307354 1.549661 2.978597 0.520326 1.176611 +1200 879 1.307354 1.551046 2.983604 0.518250 1.177197 +1200 880 1.307354 1.551935 2.986827 0.517215 1.177515 +1200 881 1.307354 1.551923 2.986782 0.517215 1.177598 +1200 882 1.307354 1.552616 2.989295 0.516182 1.177842 +1200 883 1.307354 1.553481 2.992441 0.514637 1.178168 +1200 884 1.307354 1.553720 2.993309 0.514637 1.178259 +1200 885 1.307354 1.553974 2.994235 0.514123 1.178349 +1200 886 1.307354 1.554669 2.996765 0.513096 1.178571 +1200 887 1.307354 1.555121 2.998415 0.512583 1.178849 +1200 888 1.307354 1.555828 3.000997 0.511560 
1.179207 +1200 889 1.307354 1.556938 3.005054 0.509518 1.179723 +1200 890 1.307354 1.556945 3.005082 0.509518 1.179718 +1200 891 1.307354 1.558555 3.010987 0.507485 1.180355 +1200 892 1.307354 1.559474 3.014364 0.505966 1.180761 +1200 893 1.307354 1.559930 3.016043 0.505461 1.181006 +1200 894 1.307354 1.559910 3.015969 0.505461 1.181006 +1200 895 1.307354 1.561300 3.021098 0.503444 1.181756 +1200 896 1.307354 1.561997 3.023673 0.502438 1.181973 +1200 897 1.307354 1.562253 3.024621 0.501937 1.182066 +1200 898 1.307354 1.562917 3.027078 0.500934 1.182292 +1200 899 1.307354 1.563586 3.029560 0.500434 1.182515 +1200 900 1.307354 1.564308 3.032242 0.499434 1.182751 +1200 901 1.307354 1.564800 3.034070 0.498935 1.182915 +1200 902 1.307354 1.565018 3.034881 0.498437 1.183083 +1200 903 1.307354 1.565926 3.038264 0.496448 1.183780 +1200 904 1.307354 1.566124 3.039004 0.496448 1.183937 +1200 905 1.307354 1.566573 3.040678 0.495457 1.184175 +1200 906 1.307354 1.567059 3.042492 0.494962 1.184321 +1200 907 1.307354 1.568390 3.047471 0.492987 1.184968 +1200 908 1.307354 1.569049 3.049946 0.492002 1.185283 +1200 909 1.307354 1.569752 3.052583 0.491020 1.185719 +1200 910 1.307354 1.570673 3.056049 0.489550 1.186133 +1200 911 1.307354 1.570681 3.056079 0.489550 1.186034 +1200 912 1.307354 1.572532 3.063060 0.487109 1.186829 +1200 913 1.307354 1.572354 3.062385 0.487109 1.186644 +1200 914 1.307354 1.572588 3.063268 0.487109 1.186805 +1200 915 1.307354 1.572580 3.063239 0.487109 1.186807 +1200 916 1.307354 1.572823 3.064157 0.486623 1.186965 +1200 917 1.307354 1.573473 3.066617 0.485651 1.187278 +1200 918 1.307354 1.575050 3.072599 0.483230 1.188181 +1200 919 1.307354 1.575310 3.073586 0.483230 1.188166 +1200 920 1.307354 1.575539 3.074457 0.482265 1.188405 +1200 921 1.307354 1.576211 3.077016 0.481783 1.188624 +1200 922 1.307354 1.576874 3.079539 0.480821 1.188947 +1200 923 1.307354 1.578439 3.085518 0.478424 1.189688 +1200 924 1.307354 1.579087 3.087997 0.477469 1.190009 +1200 925 
1.307354 1.580150 3.092074 0.476039 1.190260 +1200 926 1.307354 1.579753 3.090551 0.476039 1.190144 +1200 927 1.307354 1.580432 3.093156 0.475564 1.190443 +1200 928 1.307354 1.581115 3.095785 0.475089 1.190654 +1200 929 1.307354 1.580650 3.093994 0.475089 1.190426 +1200 930 1.307354 1.581763 3.098278 0.474140 1.190987 +1200 931 1.307354 1.581976 3.099099 0.473666 1.191129 +1200 932 1.307354 1.581971 3.099078 0.473666 1.191048 +1200 933 1.307354 1.582431 3.100851 0.472720 1.191293 +1200 934 1.307354 1.583786 3.106088 0.471305 1.191839 +1200 935 1.307354 1.584689 3.109586 0.469894 1.192302 +1200 936 1.307354 1.585319 3.112028 0.468487 1.192690 +1200 937 1.307354 1.586375 3.116131 0.467084 1.193196 +1200 938 1.307354 1.586797 3.117772 0.466618 1.193339 +1200 939 1.307354 1.587496 3.120498 0.465686 1.193658 +1200 940 1.307354 1.588838 3.125736 0.463828 1.194264 +1200 941 1.307354 1.589750 3.129309 0.462439 1.194840 +1200 942 1.307354 1.590245 3.131249 0.461515 1.195082 +1200 943 1.307354 1.590028 3.130398 0.461515 1.195020 +1200 944 1.307354 1.589803 3.129516 0.461515 1.194759 +1200 945 1.307354 1.590032 3.130412 0.461515 1.194838 +1200 946 1.307354 1.590024 3.130382 0.461515 1.195012 +1200 947 1.307354 1.590246 3.131252 0.461515 1.195189 +1200 948 1.307354 1.591571 3.136458 0.459674 1.195760 +1200 949 1.307354 1.592193 3.138907 0.458756 1.195946 +1200 950 1.307354 1.592607 3.140538 0.458298 1.196162 +1200 951 1.307354 1.593291 3.143238 0.457383 1.196559 +1200 952 1.307354 1.593273 3.143167 0.457383 1.196455 +1200 953 1.307354 1.594557 3.148243 0.455558 1.197059 +1200 954 1.307354 1.594984 3.149936 0.454648 1.197272 +1200 955 1.307354 1.595898 3.153559 0.453740 1.197659 +1200 956 1.307354 1.596373 3.155448 0.452834 1.197831 +1200 957 1.307354 1.597060 3.158181 0.451929 1.198196 +1200 958 1.307354 1.597478 3.159847 0.451478 1.198409 +1200 959 1.307354 1.598228 3.162839 0.450126 1.198900 +1200 960 1.307354 1.599332 3.167251 0.448779 1.199363 +1200 961 1.307354 1.600632 
3.172460 0.446541 1.199956 +1200 962 1.307354 1.601275 3.175041 0.445650 1.200250 +1200 963 1.307354 1.602574 3.180268 0.444315 1.200758 +1200 964 1.307354 1.603245 3.182977 0.443428 1.201057 +1200 965 1.307354 1.603666 3.184677 0.442543 1.201190 +1200 966 1.307354 1.604093 3.186402 0.442100 1.201323 +1200 967 1.307354 1.604724 3.188956 0.441218 1.201730 +1200 968 1.307354 1.605381 3.191618 0.440336 1.201938 +1200 969 1.307354 1.605584 3.192442 0.440336 1.201939 +1200 970 1.307354 1.605590 3.192467 0.440336 1.201946 +1200 971 1.307354 1.606035 3.194275 0.439897 1.201980 +1200 972 1.307354 1.606909 3.197828 0.438579 1.202358 +1200 973 1.307354 1.608190 3.203048 0.437266 1.202878 +1200 974 1.307354 1.609885 3.209980 0.435087 1.203477 +1200 975 1.307354 1.610739 3.213483 0.433784 1.203831 +1200 976 1.307354 1.610927 3.214255 0.433351 1.203999 +1200 977 1.307354 1.610965 3.214413 0.433351 1.204007 +1200 978 1.307354 1.611643 3.217200 0.432485 1.204220 +1200 979 1.307354 1.612497 3.220717 0.431622 1.204655 +1200 980 1.307354 1.613099 3.223202 0.430329 1.205026 +1200 981 1.307354 1.613716 3.225749 0.429470 1.205481 +1200 982 1.307354 1.614585 3.229347 0.428184 1.205764 +1200 983 1.307354 1.614565 3.229261 0.428184 1.205733 +1200 984 1.307354 1.614544 3.229177 0.428184 1.205652 +1200 985 1.307354 1.615180 3.231813 0.427756 1.205926 +1200 986 1.307354 1.616222 3.236140 0.426476 1.206333 +1200 987 1.307354 1.617507 3.241491 0.424350 1.206973 +1200 988 1.307354 1.618960 3.247562 0.422656 1.207597 +1200 989 1.307354 1.619171 3.248443 0.422234 1.207663 +1200 990 1.307354 1.619822 3.251172 0.421391 1.207943 +1200 991 1.307354 1.620684 3.254791 0.420549 1.208196 +1200 992 1.307354 1.620897 3.255686 0.420129 1.208273 +1200 993 1.307354 1.621945 3.260097 0.418453 1.208940 +1200 994 1.307354 1.622336 3.261745 0.418035 1.209135 +1200 995 1.307354 1.623191 3.265355 0.416783 1.209572 +1200 996 1.307354 1.624681 3.271667 0.415120 1.210038 +1200 997 1.307354 1.625293 3.274264 0.414291 
1.210326 +1200 998 1.307354 1.626131 3.277825 0.413051 1.210644 +1200 999 1.307354 1.626369 3.278841 0.412638 1.210798 +1200 1000 1.307354 1.627426 3.283349 0.411814 1.211045 +1200 1001 1.307354 1.628260 3.286912 0.410581 1.211394 +1200 1002 1.307354 1.628464 3.287785 0.410581 1.211378 +1200 1003 1.307354 1.628239 3.286822 0.410581 1.211299 +1200 1004 1.307354 1.628654 3.288600 0.410171 1.211521 +1200 1005 1.307354 1.629274 3.291256 0.409352 1.211810 +1200 1006 1.307354 1.629898 3.293932 0.408535 1.212084 +1200 1007 1.307354 1.630509 3.296562 0.407719 1.212421 +1200 1008 1.307354 1.631381 3.300317 0.406498 1.212777 +1200 1009 1.307354 1.631574 3.301148 0.406092 1.212841 +1200 1010 1.307354 1.632018 3.303065 0.405281 1.213128 +1200 1011 1.307354 1.633026 3.307420 0.404472 1.213448 +1200 1012 1.307354 1.633480 3.309390 0.403664 1.213583 +1200 1013 1.307354 1.634126 3.312189 0.402858 1.213945 +1200 1014 1.307354 1.634758 3.314933 0.402053 1.214130 +1200 1015 1.307354 1.635571 3.318472 0.401250 1.214369 +1200 1016 1.307354 1.636790 3.323791 0.399649 1.214810 +1200 1017 1.307354 1.637626 3.327446 0.398851 1.215218 +1200 1018 1.307354 1.637453 3.326689 0.398851 1.215263 +1200 1019 1.307354 1.637245 3.325779 0.398851 1.215201 +1200 1020 1.307354 1.638497 3.331261 0.397260 1.215644 +1200 1021 1.307354 1.638918 3.333113 0.396863 1.215858 +1200 1022 1.307354 1.639752 3.336780 0.395675 1.216252 +1200 1023 1.307354 1.639557 3.335921 0.395675 1.216101 +1200 1024 1.307354 1.639560 3.335935 0.395675 1.216000 +1200 1025 1.307354 1.639756 3.336793 0.395675 1.216131 +1200 1026 1.307354 1.639984 3.337799 0.395675 1.216194 +1200 1027 1.307354 1.640615 3.340581 0.394885 1.216451 +1200 1028 1.307354 1.641665 3.345217 0.393702 1.216920 +1200 1029 1.307354 1.642067 3.346997 0.392916 1.217132 +1200 1030 1.307354 1.642474 3.348799 0.392524 1.217253 +1200 1031 1.307354 1.643106 3.351600 0.391740 1.217421 +1200 1032 1.307354 1.643941 3.355312 0.390957 1.217686 +1200 1033 1.307354 1.645167 
3.360774 0.389008 1.218438 +1200 1034 1.307354 1.645379 3.361718 0.388620 1.218587 +1200 1035 1.307354 1.646831 3.368213 0.387069 1.219014 +1200 1036 1.307354 1.647471 3.371084 0.386296 1.219181 +1200 1037 1.307354 1.648494 3.375679 0.385140 1.219569 +1200 1038 1.307354 1.648943 3.377700 0.384755 1.219702 +1200 1039 1.307354 1.648961 3.377784 0.384755 1.219708 +1200 1040 1.307354 1.649573 3.380542 0.383603 1.220060 +1200 1041 1.307354 1.650169 3.383232 0.383220 1.220222 +1200 1042 1.307354 1.650986 3.386930 0.382072 1.220635 +1200 1043 1.307354 1.651426 3.388922 0.381309 1.220821 +1200 1044 1.307354 1.653258 3.397251 0.378651 1.221757 +1200 1045 1.307354 1.653477 3.398245 0.378273 1.221894 +1200 1046 1.307354 1.654318 3.402084 0.377517 1.222194 +1200 1047 1.307354 1.654762 3.404116 0.376763 1.222397 +1200 1048 1.307354 1.655159 3.405931 0.376387 1.222498 +1200 1049 1.307354 1.655775 3.408752 0.376011 1.222664 +1200 1050 1.307354 1.655981 3.409700 0.375635 1.222723 +1200 1051 1.307354 1.656212 3.410760 0.375260 1.222786 +1200 1052 1.307354 1.657030 3.414517 0.374511 1.223095 +1200 1053 1.307354 1.657628 3.417273 0.373763 1.223286 +1200 1054 1.307354 1.658049 3.419215 0.373389 1.223388 +1200 1055 1.307354 1.659870 3.427635 0.370786 1.224213 +1200 1056 1.307354 1.660669 3.431341 0.370046 1.224518 +1200 1057 1.307354 1.661266 3.434119 0.369307 1.224748 +1200 1058 1.307354 1.662294 3.438907 0.368201 1.225050 +1200 1059 1.307354 1.662669 3.440659 0.367833 1.225164 +1200 1060 1.307354 1.662479 3.439769 0.367833 1.225114 +1200 1061 1.307354 1.662675 3.440688 0.367833 1.225171 +1200 1062 1.307354 1.663671 3.445344 0.366365 1.225609 +1200 1063 1.307354 1.664643 3.449903 0.365634 1.225866 +1200 1064 1.307354 1.666023 3.456394 0.363811 1.226396 +1200 1065 1.307354 1.667228 3.462085 0.362722 1.226741 +1200 1066 1.307354 1.668029 3.465878 0.361997 1.226958 +1200 1067 1.307354 1.668412 3.467691 0.361274 1.227127 +1200 1068 1.307354 1.669018 3.470570 0.360553 1.227371 +1200 1069 
1.307354 1.669847 3.474512 0.359474 1.227713 +1200 1070 1.307354 1.670459 3.477433 0.359114 1.227886 +1200 1071 1.307354 1.671052 3.480261 0.358397 1.228074 +1200 1072 1.307354 1.671855 3.484102 0.357682 1.228310 +1200 1073 1.307354 1.672252 3.486008 0.356967 1.228502 +1200 1074 1.307354 1.672240 3.485951 0.356967 1.228503 +1200 1075 1.307354 1.672240 3.485948 0.356967 1.228586 +1200 1076 1.307354 1.672240 3.485950 0.356967 1.228587 +1200 1077 1.307354 1.673423 3.491628 0.355543 1.229084 +1200 1078 1.307354 1.673433 3.491676 0.355543 1.229083 +1200 1079 1.307354 1.674014 3.494473 0.354478 1.229386 +1200 1080 1.307354 1.674214 3.495440 0.354478 1.229279 +1200 1081 1.307354 1.675183 3.500116 0.353064 1.229833 +1200 1082 1.307354 1.676165 3.504869 0.352007 1.230246 +1200 1083 1.307354 1.677104 3.509431 0.350953 1.230576 +1200 1084 1.307354 1.677882 3.513215 0.349902 1.230863 +1200 1085 1.307354 1.678662 3.517015 0.349204 1.231074 +1200 1086 1.307354 1.679645 3.521825 0.348158 1.231439 +1200 1087 1.307354 1.680981 3.528378 0.346422 1.231941 +1200 1088 1.307354 1.681577 3.531308 0.346076 1.232102 +1200 1089 1.307354 1.681979 3.533290 0.345385 1.232284 +1200 1090 1.307354 1.682748 3.537083 0.344696 1.232409 +1200 1091 1.307354 1.683155 3.539095 0.344351 1.232525 +1200 1092 1.307354 1.684145 3.543996 0.343320 1.232810 +1200 1093 1.307354 1.684145 3.543996 0.343320 1.232810 +1200 1094 1.307354 1.684137 3.543954 0.343320 1.232817 +1200 1095 1.307354 1.684506 3.545787 0.342977 1.232934 +1200 1096 1.307354 1.684323 3.544879 0.342977 1.232965 +1200 1097 1.307354 1.684289 3.544710 0.342977 1.233063 +1200 1098 1.307354 1.684498 3.545748 0.342635 1.233122 +1200 1099 1.307354 1.684908 3.547783 0.341950 1.233369 +1200 1100 1.307354 1.685275 3.549609 0.341609 1.233545 +1200 1101 1.307354 1.685264 3.549555 0.341267 1.233685 +1200 1102 1.307354 1.686427 3.555348 0.339906 1.234134 +1200 1103 1.307354 1.686838 3.557404 0.339566 1.234245 +1200 1104 1.307354 1.686833 3.557378 0.339566 
1.234244 +1200 1105 1.307354 1.687784 3.562138 0.338211 1.234631 +1200 1106 1.307354 1.688368 3.565067 0.337536 1.234871 +1200 1107 1.307354 1.688762 3.567047 0.337199 1.235054 +1200 1108 1.307354 1.689127 3.568881 0.336525 1.235223 +1200 1109 1.307354 1.689713 3.571832 0.335853 1.235405 +1200 1110 1.307354 1.689311 3.569808 0.335853 1.235383 +1200 1111 1.307354 1.690304 3.574814 0.335183 1.235738 +1200 1112 1.307354 1.691090 3.578787 0.333845 1.236081 +1200 1113 1.307354 1.691102 3.578848 0.333845 1.236153 +1200 1114 1.307354 1.692069 3.583749 0.332846 1.236480 +1200 1115 1.307354 1.692636 3.586630 0.331849 1.236782 +1200 1116 1.307354 1.693224 3.589621 0.331186 1.237025 +1200 1117 1.307354 1.693978 3.593471 0.330195 1.237308 +1200 1118 1.307354 1.694543 3.596352 0.329535 1.237533 +1200 1119 1.307354 1.694905 3.598207 0.328877 1.237782 +1200 1120 1.307354 1.695287 3.600162 0.328221 1.238020 +1200 1121 1.307354 1.695864 3.603121 0.327893 1.238116 +1200 1122 1.307354 1.696463 3.606199 0.327238 1.238356 +1200 1123 1.307354 1.697222 3.610110 0.326258 1.238700 +1200 1124 1.307354 1.697408 3.611071 0.325932 1.238752 +1200 1125 1.307354 1.697991 3.614077 0.325281 1.239044 +1200 1126 1.307354 1.698546 3.616951 0.324632 1.239196 +1200 1127 1.307354 1.699135 3.620001 0.323660 1.239497 +1200 1128 1.307354 1.699526 3.622030 0.323013 1.239826 +1200 1129 1.307354 1.700477 3.626972 0.322046 1.240091 +1200 1130 1.307354 1.701240 3.630949 0.321082 1.240423 +1200 1131 1.307354 1.701636 3.633017 0.320761 1.240515 +1200 1132 1.307354 1.702013 3.634989 0.320121 1.240756 +1200 1133 1.307354 1.702946 3.639878 0.318844 1.241089 +1200 1134 1.307354 1.703318 3.641831 0.318844 1.241125 +1200 1135 1.307354 1.703671 3.643687 0.318207 1.241297 +1200 1136 1.307354 1.704213 3.646537 0.317889 1.241433 +1200 1137 1.307354 1.704576 3.648452 0.317571 1.241508 +1200 1138 1.307354 1.704788 3.649567 0.316937 1.241716 +1200 1139 1.307354 1.705942 3.655669 0.315673 1.242092 +1200 1140 1.307354 1.705956 
3.655741 0.315673 1.242107 +1200 1141 1.307354 1.706492 3.658585 0.315042 1.242255 +1200 1142 1.307354 1.706861 3.660543 0.314413 1.242492 +1200 1143 1.307354 1.707807 3.665574 0.313472 1.242863 +1200 1144 1.307354 1.707999 3.666593 0.313159 1.242904 +1200 1145 1.307354 1.708400 3.668732 0.312846 1.243017 +1200 1146 1.307354 1.708965 3.671753 0.311909 1.243316 +1200 1147 1.307354 1.709353 3.673827 0.311597 1.243425 +1200 1148 1.307354 1.709906 3.676790 0.310975 1.243634 +1200 1149 1.307354 1.710122 3.677949 0.310664 1.243666 +1200 1150 1.307354 1.710863 3.681930 0.309734 1.243996 +1200 1151 1.307354 1.711422 3.684937 0.309116 1.244259 +1200 1152 1.307354 1.712137 3.688796 0.308190 1.244542 +1200 1153 1.307354 1.713054 3.693755 0.306961 1.244899 +1200 1154 1.307354 1.713962 3.698679 0.306042 1.245194 +1200 1155 1.307354 1.714340 3.700735 0.305736 1.245280 +1200 1156 1.307354 1.714154 3.699722 0.305736 1.245248 +1200 1157 1.307354 1.714515 3.701688 0.305736 1.245340 +1200 1158 1.307354 1.715064 3.704679 0.305125 1.245531 +1200 1159 1.307354 1.715801 3.708699 0.304212 1.245790 +1200 1160 1.307354 1.716531 3.712695 0.303301 1.246061 +1200 1161 1.307354 1.717263 3.716710 0.302393 1.246385 +1200 1162 1.307354 1.718374 3.722820 0.301186 1.246717 +1200 1163 1.307354 1.719124 3.726955 0.300285 1.247026 +1200 1164 1.307354 1.719682 3.730041 0.299685 1.247302 +1200 1165 1.307354 1.720062 3.732148 0.299086 1.247452 +1200 1166 1.307354 1.720058 3.732128 0.298788 1.247518 +1200 1167 1.307354 1.720433 3.734205 0.298489 1.247610 +1200 1168 1.307354 1.720599 3.735126 0.298489 1.247566 +1200 1169 1.307354 1.721125 3.738048 0.298191 1.247708 +1200 1170 1.307354 1.721680 3.741140 0.297595 1.247856 +1200 1171 1.307354 1.722788 3.747324 0.296408 1.248252 +1200 1172 1.307354 1.723135 3.749262 0.295816 1.248407 +1200 1173 1.307354 1.723304 3.750209 0.295816 1.248444 +1200 1174 1.307354 1.723303 3.750202 0.295816 1.248454 +1200 1175 1.307354 1.723674 3.752286 0.295521 1.248541 +1200 1176 
1.307354 1.723688 3.752359 0.295521 1.248542 +1200 1177 1.307354 1.724216 3.755327 0.294636 1.248751 +1200 1178 1.307354 1.725273 3.761276 0.293754 1.249084 +1200 1179 1.307354 1.725790 3.764193 0.292874 1.249415 +1200 1180 1.307354 1.725797 3.764231 0.292874 1.249411 +1200 1181 1.307354 1.726170 3.766343 0.292582 1.249495 +1200 1182 1.307354 1.726693 3.769303 0.292289 1.249530 +1200 1183 1.307354 1.727430 3.773484 0.291706 1.249714 +1200 1184 1.307354 1.727424 3.773448 0.291706 1.249713 +1200 1185 1.307354 1.728331 3.778607 0.290542 1.250002 +1200 1186 1.307354 1.729020 3.782539 0.289961 1.250219 +1200 1187 1.307354 1.729204 3.783592 0.289672 1.250327 +1200 1188 1.307354 1.729570 3.785679 0.289382 1.250350 +1200 1189 1.307354 1.729748 3.786703 0.289093 1.250515 +1200 1190 1.307354 1.730288 3.789795 0.288516 1.250640 +1200 1191 1.307354 1.730435 3.790638 0.288228 1.250733 +1200 1192 1.307354 1.731322 3.795739 0.287365 1.251081 +1200 1193 1.307354 1.731492 3.796716 0.286791 1.251196 +1200 1194 1.307354 1.731655 3.797656 0.286791 1.251231 +1200 1195 1.307354 1.732009 3.799694 0.286218 1.251397 +1200 1196 1.307354 1.732376 3.801814 0.285647 1.251613 +1200 1197 1.307354 1.732717 3.803788 0.285361 1.251696 +1200 1198 1.307354 1.732730 3.803862 0.285361 1.251685 +1200 1199 1.307354 1.732726 3.803839 0.285361 1.251682 +1200 1200 1.307354 1.732729 3.803858 0.285361 1.251743 +1200 1201 1.307354 1.732736 3.803895 0.285361 1.251680 +1200 1202 1.307354 1.734140 3.812039 0.283655 1.252203 +1200 1203 1.307354 1.735254 3.818523 0.282523 1.252546 +1200 1204 1.307354 1.735239 3.818438 0.282241 1.252734 +1200 1205 1.307354 1.735964 3.822672 0.281677 1.252927 +1200 1206 1.307354 1.736488 3.825745 0.280834 1.253183 +1200 1207 1.307354 1.737185 3.829832 0.279993 1.253415 +1200 1208 1.307354 1.737182 3.829813 0.279993 1.253340 +1200 1209 1.307354 1.737736 3.833075 0.279714 1.253463 +1200 1210 1.307354 1.738078 3.835090 0.279155 1.253690 +1200 1211 1.307354 1.738810 3.839409 0.278319 
1.253928 +1200 1212 1.307354 1.739014 3.840612 0.278041 1.253982 +1200 1213 1.307354 1.739550 3.843789 0.277486 1.254180 +1200 1214 1.307354 1.739919 3.845971 0.277209 1.254326 +1200 1215 1.307354 1.739756 3.845006 0.277209 1.254287 +1200 1216 1.307354 1.739772 3.845101 0.277209 1.254295 +1200 1217 1.307354 1.740307 3.848277 0.276655 1.254482 +1200 1218 1.307354 1.740860 3.851568 0.276103 1.254601 +1200 1219 1.307354 1.740696 3.850590 0.276103 1.254618 +1200 1220 1.307354 1.740862 3.851576 0.276103 1.254659 +1200 1221 1.307354 1.741382 3.854675 0.275276 1.254917 +1200 1222 1.307354 1.742435 3.860969 0.274178 1.255292 +1200 1223 1.307354 1.742954 3.864081 0.273630 1.255425 +1200 1224 1.307354 1.742967 3.864159 0.273630 1.255358 +1200 1225 1.307354 1.743674 3.868403 0.273084 1.255572 +1200 1226 1.307354 1.743668 3.868366 0.273084 1.255508 +1200 1227 1.307354 1.743856 3.869496 0.273084 1.255554 +1200 1228 1.307354 1.744045 3.870634 0.272811 1.255604 +1200 1229 1.307354 1.744899 3.875785 0.271994 1.255887 +1200 1230 1.307354 1.745445 3.879085 0.271451 1.256032 +1200 1231 1.307354 1.746524 3.885631 0.270368 1.256342 +1200 1232 1.307354 1.746296 3.884245 0.270368 1.256240 +1200 1233 1.307354 1.746976 3.888380 0.269828 1.256516 +1200 1234 1.307354 1.748173 3.895683 0.268214 1.257039 +1200 1235 1.307354 1.748852 3.899842 0.267411 1.257265 +1200 1236 1.307354 1.749201 3.901979 0.267144 1.257360 +1200 1237 1.307354 1.749395 3.903172 0.266877 1.257461 +1200 1238 1.307354 1.749945 3.906551 0.266344 1.257645 +1200 1239 1.307354 1.751131 3.913867 0.265017 1.257979 +1200 1240 1.307354 1.751489 3.916079 0.264487 1.258183 +1200 1241 1.307354 1.751870 3.918443 0.264223 1.258260 +1200 1242 1.307354 1.752557 3.922707 0.263169 1.258648 +1200 1243 1.307354 1.752537 3.922584 0.263169 1.258640 +1200 1244 1.307354 1.753207 3.926750 0.262381 1.258845 +1200 1245 1.307354 1.754059 3.932070 0.261334 1.259162 +1200 1246 1.307354 1.754242 3.933213 0.261334 1.259200 +1200 1247 1.307354 1.754409 
3.934254 0.261073 1.259241 +1200 1248 1.307354 1.754755 3.936426 0.260812 1.259386 +1200 1249 1.307354 1.754765 3.936483 0.260552 1.259454 +1200 1250 1.307354 1.754945 3.937615 0.260291 1.259494 +1200 1251 1.307354 1.755463 3.940869 0.259772 1.259626 +1200 1252 1.307354 1.755988 3.944167 0.259512 1.259745 +1200 1253 1.307354 1.755987 3.944165 0.259512 1.259748 +1200 1254 1.307354 1.756148 3.945178 0.259512 1.259704 +1200 1255 1.307354 1.757024 3.950706 0.258735 1.259885 +1200 1256 1.307354 1.757564 3.954120 0.258218 1.260077 +1200 1257 1.307354 1.758079 3.957387 0.257703 1.260254 +1200 1258 1.307354 1.758906 3.962644 0.256931 1.260457 +1200 1259 1.307354 1.758916 3.962710 0.256675 1.260521 +1200 1260 1.307354 1.758941 3.962868 0.256675 1.260461 +1200 1261 1.307354 1.759130 3.964072 0.256675 1.260503 +1200 1262 1.307354 1.759646 3.967362 0.256418 1.260612 +1200 1263 1.307354 1.759623 3.967218 0.256162 1.260654 +1200 1264 1.307354 1.759590 3.967009 0.256162 1.260670 +1200 1265 1.307354 1.759770 3.968155 0.256162 1.260717 +1200 1266 1.307354 1.760629 3.973654 0.254630 1.261149 +1200 1267 1.307354 1.761321 3.978096 0.254122 1.261321 +1200 1268 1.307354 1.761145 3.976967 0.254122 1.261351 +1200 1269 1.307354 1.761635 3.980113 0.253614 1.261463 +1200 1270 1.307354 1.761621 3.980028 0.253614 1.261461 +1200 1271 1.307354 1.762313 3.984482 0.253108 1.261602 +1200 1272 1.307354 1.762461 3.985441 0.253108 1.261649 +1200 1273 1.307354 1.762788 3.987554 0.252350 1.261831 +1200 1274 1.307354 1.763469 3.991964 0.251595 1.262101 +1200 1275 1.307354 1.763291 3.990809 0.251595 1.262066 +1200 1276 1.307354 1.763808 3.994160 0.251343 1.262183 +1200 1277 1.307354 1.764335 3.997584 0.250841 1.262343 +1200 1278 1.307354 1.764335 3.997584 0.250841 1.262343 +1200 1279 1.307354 1.764823 4.000766 0.250090 1.262513 +1200 1280 1.307354 1.764987 4.001834 0.249840 1.262599 +1200 1281 1.307354 1.765497 4.005161 0.249341 1.262821 +1200 1282 1.307354 1.765655 4.006195 0.249092 1.262859 +1200 1283 
1.307354 1.766014 4.008542 0.248844 1.262886 +1200 1284 1.307354 1.765838 4.007395 0.248844 1.262781 +1200 1285 1.307354 1.766360 4.010813 0.248844 1.262909 +1200 1286 1.307354 1.766531 4.011933 0.248844 1.262886 +1200 1287 1.307354 1.766348 4.010730 0.248844 1.262914 +1200 1288 1.307354 1.766362 4.010825 0.248595 1.262971 +1200 1289 1.307354 1.767041 4.015284 0.247851 1.263189 +1200 1290 1.307354 1.767530 4.018500 0.247356 1.263401 +1200 1291 1.307354 1.767836 4.020522 0.246862 1.263534 +1200 1292 1.307354 1.767856 4.020654 0.246862 1.263500 +1200 1293 1.307354 1.768179 4.022786 0.246615 1.263570 +1200 1294 1.307354 1.768497 4.024884 0.246369 1.263643 +1200 1295 1.307354 1.769010 4.028281 0.245877 1.263825 +1200 1296 1.307354 1.769811 4.033599 0.244651 1.264153 +1200 1297 1.307354 1.771128 4.042384 0.243431 1.264536 +1200 1298 1.307354 1.771629 4.045733 0.243188 1.264589 +1200 1299 1.307354 1.772098 4.048880 0.242703 1.264707 +1200 1300 1.307354 1.772286 4.050143 0.242460 1.264806 +1200 1301 1.307354 1.772106 4.048933 0.242460 1.264771 +1200 1302 1.307354 1.772914 4.054368 0.241493 1.265109 +1200 1303 1.307354 1.773402 4.057662 0.240770 1.265339 +1200 1304 1.307354 1.773860 4.060753 0.240529 1.265448 +1200 1305 1.307354 1.773815 4.060452 0.240289 1.265496 +1200 1306 1.307354 1.774653 4.066123 0.239330 1.265765 +1200 1307 1.307354 1.775313 4.070605 0.238614 1.266008 +1200 1308 1.307354 1.775512 4.071964 0.238137 1.266155 +1200 1309 1.307354 1.776011 4.075361 0.237661 1.266258 +1200 1310 1.307354 1.775841 4.074207 0.237661 1.266204 +1200 1311 1.307354 1.775820 4.074063 0.237661 1.266195 +1200 1312 1.307354 1.776189 4.076575 0.237424 1.266323 +1200 1313 1.307354 1.777009 4.082187 0.236713 1.266555 +1200 1314 1.307354 1.778310 4.091127 0.235533 1.266883 +1200 1315 1.307354 1.778148 4.090007 0.235533 1.266844 +1200 1316 1.307354 1.778316 4.091164 0.235533 1.266885 +1200 1317 1.307354 1.778968 4.095662 0.234828 1.267076 +1200 1318 1.307354 1.779610 4.100107 0.234125 
1.267315 +1200 1319 1.307354 1.779933 4.102344 0.233891 1.267399 +1200 1320 1.307354 1.780109 4.103567 0.233657 1.267409 +1200 1321 1.307354 1.780105 4.103538 0.233657 1.267450 +1200 1322 1.307354 1.780964 4.109517 0.232725 1.267676 +1200 1323 1.307354 1.781596 4.113921 0.232260 1.267855 +1200 1324 1.307354 1.782109 4.117513 0.231565 1.268022 +1200 1325 1.307354 1.781953 4.116418 0.231565 1.267981 +1200 1326 1.307354 1.782912 4.123146 0.230641 1.268299 +1200 1327 1.307354 1.783881 4.129966 0.229721 1.268595 +1200 1328 1.307354 1.784345 4.133245 0.229262 1.268701 +1200 1329 1.307354 1.784325 4.133105 0.229262 1.268701 +1200 1330 1.307354 1.785108 4.138654 0.228804 1.268852 +1200 1331 1.307354 1.785437 4.140989 0.228347 1.268974 +1200 1332 1.307354 1.785613 4.142236 0.228119 1.269053 +1200 1333 1.307354 1.785930 4.144490 0.227664 1.269170 +1200 1334 1.307354 1.786403 4.147868 0.227209 1.269331 +1200 1335 1.307354 1.786594 4.149229 0.227209 1.269320 +1200 1336 1.307354 1.786599 4.149266 0.227209 1.269321 +1200 1337 1.307354 1.786597 4.149249 0.227209 1.269333 +1200 1338 1.307354 1.787411 4.155072 0.226302 1.269553 +1200 1339 1.307354 1.787555 4.156103 0.225850 1.269676 +1200 1340 1.307354 1.787864 4.158322 0.225625 1.269748 +1200 1341 1.307354 1.788488 4.162814 0.224949 1.269930 +1200 1342 1.307354 1.788810 4.165133 0.224500 1.270057 +1200 1343 1.307354 1.788947 4.166121 0.224500 1.270087 +1200 1344 1.307354 1.789918 4.173140 0.223828 1.270279 +1200 1345 1.307354 1.790088 4.174374 0.223604 1.270296 +1200 1346 1.307354 1.789784 4.172173 0.223604 1.270127 +1200 1347 1.307354 1.790262 4.175634 0.223604 1.270268 +1200 1348 1.307354 1.790577 4.177926 0.223381 1.270374 +1200 1349 1.307354 1.791366 4.183673 0.222712 1.270543 +1200 1350 1.307354 1.791373 4.183723 0.222490 1.270593 +1200 1351 1.307354 1.791837 4.187110 0.222045 1.270781 +1200 1352 1.307354 1.792166 4.189517 0.221823 1.270792 +1200 1353 1.307354 1.792339 4.190788 0.221823 1.270831 +1200 1354 1.307354 1.792508 
4.192021 0.221602 1.270864 +1200 1355 1.307354 1.792963 4.195365 0.221380 1.270945 +1200 1356 1.307354 1.792813 4.194263 0.221380 1.270865 +1200 1357 1.307354 1.792952 4.195282 0.221380 1.270940 +1200 1358 1.307354 1.793897 4.202245 0.220057 1.271290 +1200 1359 1.307354 1.793897 4.202247 0.220057 1.271287 +1200 1360 1.307354 1.794848 4.209280 0.219179 1.271593 +1200 1361 1.307354 1.795167 4.211641 0.218960 1.271649 +1200 1362 1.307354 1.796234 4.219582 0.217651 1.271971 +1200 1363 1.307354 1.796692 4.223003 0.217216 1.272112 +1200 1364 1.307354 1.797183 4.226681 0.216782 1.272252 +1200 1365 1.307354 1.797344 4.227883 0.216566 1.272288 +1200 1366 1.307354 1.797695 4.230521 0.216349 1.272359 +1200 1367 1.307354 1.798023 4.232987 0.216133 1.272379 +1200 1368 1.307354 1.798534 4.236835 0.215702 1.272485 +1200 1369 1.307354 1.798872 4.239389 0.215486 1.272598 +1200 1370 1.307354 1.799521 4.244298 0.214841 1.272723 +1200 1371 1.307354 1.799207 4.241919 0.214841 1.272666 +1200 1372 1.307354 1.799668 4.245409 0.214626 1.272808 +1200 1373 1.307354 1.800266 4.249946 0.213984 1.273014 +1200 1374 1.307354 1.800434 4.251224 0.213770 1.273049 +1200 1375 1.307354 1.800577 4.252312 0.213556 1.273113 +1200 1376 1.307354 1.801208 4.257124 0.212917 1.273302 +1200 1377 1.307354 1.801363 4.258309 0.212704 1.273334 +1200 1378 1.307354 1.801664 4.260616 0.212492 1.273439 +1200 1379 1.307354 1.801964 4.262908 0.212280 1.273490 +1200 1380 1.307354 1.802725 4.268752 0.211433 1.273692 +1200 1381 1.307354 1.802706 4.268605 0.211433 1.273688 +1200 1382 1.307354 1.803187 4.272309 0.211221 1.273785 +1200 1383 1.307354 1.803342 4.273510 0.211010 1.273815 +1200 1384 1.307354 1.803816 4.277170 0.210589 1.273959 +1200 1385 1.307354 1.803803 4.277065 0.210589 1.273899 +1200 1386 1.307354 1.804302 4.280931 0.210379 1.273990 +1200 1387 1.307354 1.804311 4.280997 0.210379 1.273989 +1200 1388 1.307354 1.805203 4.287926 0.209539 1.274231 +1200 1389 1.307354 1.806120 4.295084 0.208703 1.274419 +1200 1390 
1.307354 1.806446 4.297636 0.208495 1.274525 +1200 1391 1.307354 1.806868 4.300943 0.208078 1.274615 +1200 1392 1.307354 1.807776 4.308086 0.207248 1.274846 +1200 1393 1.307354 1.807776 4.308086 0.207248 1.274857 +1200 1394 1.307354 1.807925 4.309261 0.207041 1.274885 +1200 1395 1.307354 1.808093 4.310589 0.207041 1.274923 +1200 1396 1.307354 1.808686 4.315282 0.206421 1.275087 +1200 1397 1.307354 1.809296 4.320114 0.205803 1.275249 +1200 1398 1.307354 1.809624 4.322722 0.205187 1.275408 +1200 1399 1.307354 1.809786 4.324013 0.204982 1.275487 +1200 1400 1.307354 1.809634 4.322808 0.204982 1.275455 +1200 1401 1.307354 1.810368 4.328652 0.204368 1.275614 +1200 1402 1.307354 1.810675 4.331106 0.203960 1.275730 +1200 1403 1.307354 1.810801 4.332118 0.203960 1.275752 +1200 1404 1.307354 1.811561 4.338210 0.203146 1.275975 +1200 1405 1.307354 1.812023 4.341930 0.202740 1.276056 +1200 1406 1.307354 1.812312 4.344257 0.202336 1.276194 +1200 1407 1.307354 1.812903 4.349030 0.201730 1.276386 +1200 1408 1.307354 1.813210 4.351520 0.201327 1.276493 +1200 1409 1.307354 1.813365 4.352769 0.201126 1.276527 +1200 1410 1.307354 1.813989 4.357842 0.200724 1.276659 +1200 1411 1.307354 1.814445 4.361551 0.200323 1.276757 +1200 1412 1.307354 1.814860 4.364938 0.199923 1.276884 +1200 1413 1.307354 1.814714 4.363750 0.199923 1.276813 +1200 1414 1.307354 1.814857 4.364918 0.199923 1.276798 +1200 1415 1.307354 1.815313 4.368646 0.199524 1.276974 +1200 1416 1.307354 1.816099 4.375101 0.198927 1.277113 +1200 1417 1.307354 1.816251 4.376352 0.198728 1.277183 +1200 1418 1.307354 1.816713 4.380151 0.198133 1.277309 +1200 1419 1.307354 1.818051 4.391232 0.196752 1.277699 +1200 1420 1.307354 1.818349 4.393708 0.196555 1.277748 +1200 1421 1.307354 1.818198 4.392449 0.196555 1.277724 +1200 1422 1.307354 1.817754 4.388766 0.196555 1.277651 +1200 1423 1.307354 1.818194 4.392417 0.196555 1.277734 +1200 1424 1.307354 1.818486 4.394850 0.196359 1.277790 +1200 1425 1.307354 1.819079 4.399793 0.195967 
1.277932 +1200 1426 1.307354 1.819075 4.399756 0.195771 1.277980 +1200 1427 1.307354 1.819226 4.401024 0.195575 1.278056 +1200 1428 1.307354 1.819374 4.402258 0.195380 1.278079 +1200 1429 1.307354 1.819813 4.405936 0.195185 1.278152 +1200 1430 1.307354 1.819801 4.405837 0.194990 1.278204 +1200 1431 1.307354 1.819810 4.405909 0.194795 1.278250 +1200 1432 1.307354 1.819820 4.405998 0.194795 1.278253 +1200 1433 1.307354 1.820433 4.411144 0.194212 1.278420 +1200 1434 1.307354 1.820708 4.413460 0.193824 1.278518 +1200 1435 1.307354 1.821165 4.417312 0.193243 1.278655 +1200 1436 1.307354 1.821587 4.420878 0.193050 1.278725 +1200 1437 1.307354 1.821614 4.421105 0.193050 1.278724 +1200 1438 1.307354 1.821612 4.421091 0.193050 1.278678 +1200 1439 1.307354 1.821621 4.421171 0.193050 1.278680 +1200 1440 1.307354 1.821458 4.419789 0.193050 1.278641 +1200 1441 1.307354 1.821769 4.422422 0.192858 1.278754 +1200 1442 1.307354 1.821619 4.421146 0.192858 1.278618 +1200 1443 1.307354 1.821769 4.422423 0.192858 1.278689 +1200 1444 1.307354 1.822164 4.425773 0.192858 1.278761 +1200 1445 1.307354 1.822615 4.429600 0.192280 1.278895 +1200 1446 1.307354 1.822894 4.431982 0.191896 1.278979 +1200 1447 1.307354 1.823160 4.434249 0.191704 1.279062 +1200 1448 1.307354 1.823150 4.434161 0.191704 1.279066 +1200 1449 1.307354 1.823568 4.437735 0.191130 1.279189 +1200 1450 1.307354 1.823433 4.436580 0.191130 1.279172 +1200 1451 1.307354 1.823574 4.437786 0.191130 1.279210 +1200 1452 1.307354 1.824190 4.443064 0.190558 1.279334 +1200 1453 1.307354 1.824353 4.444468 0.190368 1.279422 +1200 1454 1.307354 1.824486 4.445605 0.190368 1.279403 +1200 1455 1.307354 1.824483 4.445584 0.190368 1.279357 +1200 1456 1.307354 1.824322 4.444200 0.190368 1.279371 +1200 1457 1.307354 1.824620 4.446757 0.190178 1.279468 +1200 1458 1.307354 1.825238 4.452083 0.189608 1.279613 +1200 1459 1.307354 1.825383 4.453334 0.189608 1.279577 +1200 1460 1.307354 1.826139 4.459883 0.189041 1.279769 +1200 1461 1.307354 1.826269 
4.461010 0.188663 1.279874 +1200 1462 1.307354 1.826708 4.464824 0.188286 1.279964 +1200 1463 1.307354 1.827288 4.469886 0.187723 1.280118 +1200 1464 1.307354 1.827121 4.468421 0.187723 1.280042 +1200 1465 1.307354 1.827105 4.468283 0.187723 1.280083 +1200 1466 1.307354 1.827416 4.470997 0.187348 1.280186 +1200 1467 1.307354 1.827868 4.474954 0.186600 1.280394 +1200 1468 1.307354 1.828006 4.476165 0.186600 1.280415 +1200 1469 1.307354 1.828115 4.477117 0.186414 1.280432 +1200 1470 1.307354 1.828377 4.479415 0.186042 1.280567 +1200 1471 1.307354 1.828688 4.482151 0.186042 1.280578 +1200 1472 1.307354 1.828976 4.484691 0.185670 1.280638 +1200 1473 1.307354 1.828416 4.479757 0.185670 1.280453 +1200 1474 1.307354 1.829150 4.486221 0.185670 1.280622 +1200 1475 1.307354 1.829421 4.488613 0.185484 1.280672 +1200 1476 1.307354 1.829728 4.491333 0.185484 1.280679 +1200 1477 1.307354 1.830040 4.494096 0.185299 1.280740 +1200 1478 1.307354 1.830173 4.495267 0.185114 1.280765 +1200 1479 1.307354 1.830461 4.497825 0.184929 1.280815 +1200 1480 1.307354 1.830899 4.501730 0.184191 1.280989 +1200 1481 1.307354 1.830915 4.501871 0.184191 1.280998 +1200 1482 1.307354 1.831452 4.506657 0.183640 1.281142 +1200 1483 1.307354 1.831580 4.507807 0.183640 1.281168 +1200 1484 1.307354 1.832586 4.516828 0.182907 1.281342 +1200 1485 1.307354 1.832577 4.516741 0.182907 1.281296 +1200 1486 1.307354 1.833152 4.521925 0.182177 1.281528 +1200 1487 1.307354 1.833416 4.524311 0.181632 1.281670 +1200 1488 1.307354 1.833702 4.526892 0.181632 1.281674 +1200 1489 1.307354 1.834137 4.530841 0.181088 1.281788 +1200 1490 1.307354 1.834175 4.531186 0.181088 1.281782 +1200 1491 1.307354 1.834044 4.529999 0.181088 1.281727 +1200 1492 1.307354 1.834312 4.532423 0.181088 1.281818 +1200 1493 1.307354 1.834046 4.530015 0.181088 1.281691 +1200 1494 1.307354 1.834337 4.532654 0.181088 1.281826 +1200 1495 1.307354 1.834178 4.531213 0.181088 1.281756 +1200 1496 1.307354 1.834337 4.532656 0.180907 1.281839 +1200 1497 
1.307354 1.834627 4.535286 0.180907 1.281857 +1200 1498 1.307354 1.834478 4.533932 0.180907 1.281737 +1200 1499 1.307354 1.834606 4.535098 0.180907 1.281800 +1200 1500 1.307354 1.834590 4.534949 0.180907 1.281754 +1200 1501 1.307354 1.835287 4.541309 0.180365 1.281968 +1200 1502 1.307354 1.835416 4.542483 0.180365 1.281955 +1200 1503 1.307354 1.835552 4.543729 0.180185 1.282027 +1200 1504 1.307354 1.835672 4.544829 0.180005 1.282055 +1200 1505 1.307354 1.835818 4.546170 0.180005 1.282045 +1200 1506 1.307354 1.835972 4.547577 0.180005 1.282062 +1200 1507 1.307354 1.835855 4.546503 0.180005 1.281996 +1200 1508 1.307354 1.836004 4.547871 0.180005 1.282074 +1200 1509 1.307354 1.836565 4.553026 0.179466 1.282178 +1200 1510 1.307354 1.836701 4.554274 0.179287 1.282239 +1200 1511 1.307354 1.836984 4.556888 0.179108 1.282289 +1200 1512 1.307354 1.836557 4.552951 0.179108 1.282221 +1200 1513 1.307354 1.836696 4.554232 0.179108 1.282251 +1200 1514 1.307354 1.837133 4.558265 0.178750 1.282377 +1200 1515 1.307354 1.837580 4.562400 0.178393 1.282452 +1200 1516 1.307354 1.837580 4.562400 0.178393 1.282452 +1200 1517 1.307354 1.837844 4.564838 0.178037 1.282547 +1200 1518 1.307354 1.838415 4.570144 0.177682 1.282657 +1200 1519 1.307354 1.838432 4.570300 0.177504 1.282702 +1200 1520 1.307354 1.838870 4.574388 0.177150 1.282784 +1200 1521 1.307354 1.838543 4.571338 0.177150 1.282720 +1200 1522 1.307354 1.839246 4.577897 0.176619 1.282923 +1200 1523 1.307354 1.839368 4.579038 0.176443 1.282990 +1200 1524 1.307354 1.839217 4.577624 0.176443 1.282967 +1200 1525 1.307354 1.839209 4.577550 0.176443 1.282934 +1200 1526 1.307354 1.839784 4.582938 0.176266 1.283043 +1200 1527 1.307354 1.839823 4.583302 0.175914 1.283108 +1200 1528 1.307354 1.840083 4.585738 0.175739 1.283169 +1200 1529 1.307354 1.840475 4.589433 0.175388 1.283273 +1200 1530 1.307354 1.841100 4.595328 0.174688 1.283418 +1200 1531 1.307354 1.841101 4.595337 0.174688 1.283416 +1200 1532 1.307354 1.841235 4.596601 0.174688 
1.283445 +1200 1533 1.307354 1.841370 4.597887 0.174513 1.283470 +1200 1534 1.307354 1.841375 4.597928 0.174513 1.283437 +1200 1535 1.307354 1.841555 4.599632 0.174339 1.283527 +1200 1536 1.307354 1.841413 4.598287 0.174339 1.283499 +1200 1537 1.307354 1.841842 4.602360 0.173817 1.283643 +1200 1538 1.307354 1.842086 4.604677 0.173643 1.283686 +1200 1539 1.307354 1.842495 4.608577 0.173297 1.283794 +1200 1540 1.307354 1.842931 4.612735 0.172951 1.283866 +1200 1541 1.307354 1.843356 4.616803 0.172605 1.283968 +1200 1542 1.307354 1.843927 4.622284 0.172088 1.284071 +1200 1543 1.307354 1.844065 4.623619 0.171917 1.284098 +1200 1544 1.307354 1.844468 4.627503 0.171745 1.284176 +1200 1545 1.307354 1.844867 4.631358 0.171060 1.284313 +1200 1546 1.307354 1.844994 4.632581 0.171060 1.284333 +1200 1547 1.307354 1.845540 4.637884 0.170377 1.284501 +1200 1548 1.307354 1.845964 4.642007 0.170037 1.284604 +1200 1549 1.307354 1.845696 4.639394 0.170037 1.284528 +1200 1550 1.307354 1.846237 4.644663 0.169697 1.284661 +1200 1551 1.307354 1.846799 4.650162 0.169189 1.284797 +1200 1552 1.307354 1.847347 4.655531 0.168851 1.284891 +1200 1553 1.307354 1.847494 4.656984 0.168683 1.284913 +1200 1554 1.307354 1.847516 4.657199 0.168683 1.284921 +1200 1555 1.307354 1.847382 4.655877 0.168683 1.284893 +1200 1556 1.307354 1.847658 4.658598 0.168514 1.284944 +1200 1557 1.307354 1.847672 4.658734 0.168514 1.284945 +1200 1558 1.307354 1.847795 4.659941 0.168346 1.284981 +1200 1559 1.307354 1.848377 4.665687 0.167842 1.285122 +1200 1560 1.307354 1.848666 4.668545 0.167674 1.285166 +1200 1561 1.307354 1.848795 4.669824 0.167507 1.285186 +1200 1562 1.307354 1.849769 4.679521 0.166671 1.285410 +1200 1563 1.307354 1.849640 4.678226 0.166671 1.285386 +1200 1564 1.307354 1.849782 4.679651 0.166505 1.285447 +1200 1565 1.307354 1.850190 4.683728 0.166172 1.285519 +1200 1566 1.307354 1.850591 4.687746 0.166006 1.285582 +1200 1567 1.307354 1.850714 4.688974 0.165675 1.285647 +1200 1568 1.307354 1.851009 
4.691936 0.165344 1.285745 +1200 1569 1.307354 1.851582 4.697716 0.164520 1.285958 +1200 1570 1.307354 1.852147 4.703428 0.164027 1.286048 +1200 1571 1.307354 1.852417 4.706169 0.163863 1.286095 +1200 1572 1.307354 1.853355 4.715719 0.162721 1.286373 +1200 1573 1.307354 1.852943 4.711512 0.162721 1.286306 +1200 1574 1.307354 1.852959 4.711672 0.162721 1.286278 +1200 1575 1.307354 1.852689 4.708932 0.162721 1.286234 +1200 1576 1.307354 1.852951 4.711597 0.162721 1.286234 +1200 1577 1.307354 1.853363 4.715793 0.162721 1.286337 +1200 1578 1.307354 1.853640 4.718627 0.162721 1.286382 +1200 1579 1.307354 1.853502 4.717216 0.162721 1.286358 +1200 1580 1.307354 1.854048 4.722813 0.162396 1.286452 +1200 1581 1.307354 1.854052 4.722846 0.162396 1.286378 +1200 1582 1.307354 1.853914 4.721431 0.162396 1.286358 +1200 1583 1.307354 1.854178 4.724147 0.162396 1.286443 +1200 1584 1.307354 1.854176 4.724121 0.162396 1.286372 +1200 1585 1.307354 1.854445 4.726888 0.162234 1.286484 +1200 1586 1.307354 1.854451 4.726946 0.162072 1.286527 +1200 1587 1.307354 1.854587 4.728353 0.161910 1.286558 +1200 1588 1.307354 1.855279 4.735484 0.161264 1.286708 +1200 1589 1.307354 1.855673 4.739572 0.160942 1.286769 +1200 1590 1.307354 1.855680 4.739644 0.160942 1.286793 +1200 1591 1.307354 1.855696 4.739809 0.160942 1.286793 +1200 1592 1.307354 1.855834 4.741237 0.160781 1.286816 +1200 1593 1.307354 1.855587 4.738676 0.160781 1.286757 +1200 1594 1.307354 1.855987 4.742824 0.160781 1.286831 +1200 1595 1.307354 1.856647 4.749707 0.159979 1.287019 +1200 1596 1.307354 1.857194 4.755426 0.159341 1.287147 +1200 1597 1.307354 1.857175 4.755229 0.159341 1.287100 +1200 1598 1.307354 1.856897 4.752319 0.159341 1.287015 +1200 1599 1.307354 1.857031 4.753717 0.159341 1.287083 +1200 1600 1.307354 1.857570 4.759368 0.159023 1.287221 +1200 1601 1.307354 1.857829 4.762092 0.158864 1.287258 +1200 1602 1.307354 1.858107 4.765015 0.158705 1.287300 +1200 1603 1.307354 1.858124 4.765202 0.158705 1.287296 +1200 1604 
1.307354 1.858392 4.768025 0.158547 1.287342 +1200 1605 1.307354 1.858784 4.772166 0.158072 1.287436 +1200 1606 1.307354 1.858779 4.772116 0.158072 1.287474 +1200 1607 1.307354 1.859577 4.780595 0.157127 1.287677 +1200 1608 1.307354 1.859438 4.779115 0.157127 1.287621 +1200 1609 1.307354 1.859693 4.781832 0.156970 1.287702 +1200 1610 1.307354 1.859942 4.784480 0.156813 1.287741 +1200 1611 1.307354 1.860198 4.787217 0.156500 1.287816 +1200 1612 1.307354 1.860065 4.785800 0.156500 1.287794 +1200 1613 1.307354 1.859928 4.784328 0.156500 1.287839 +1200 1614 1.307354 1.860057 4.785710 0.156344 1.287895 +1200 1615 1.307354 1.859899 4.784021 0.156344 1.287797 +1200 1616 1.307354 1.859898 4.784017 0.156344 1.287805 +1200 1617 1.307354 1.860536 4.790837 0.156031 1.287951 +1200 1618 1.307354 1.860549 4.790972 0.156031 1.287911 +1200 1619 1.307354 1.860544 4.790919 0.156031 1.287912 +1200 1620 1.307354 1.860514 4.790605 0.156031 1.287849 +1200 1621 1.307354 1.860626 4.791804 0.156031 1.287835 +1200 1622 1.307354 1.861031 4.796155 0.155876 1.287972 +1200 1623 1.307354 1.861801 4.804455 0.155099 1.288166 +1200 1624 1.307354 1.861940 4.805954 0.154944 1.288195 +1200 1625 1.307354 1.862181 4.808572 0.154325 1.288347 +1200 1626 1.307354 1.862300 4.809865 0.154325 1.288360 +1200 1627 1.307354 1.862850 4.815840 0.153710 1.288486 +1200 1628 1.307354 1.862706 4.814267 0.153710 1.288433 +1200 1629 1.307354 1.862826 4.815580 0.153710 1.288455 +1200 1630 1.307354 1.862954 4.816968 0.153710 1.288480 +1200 1631 1.307354 1.863360 4.821409 0.153403 1.288550 +1200 1632 1.307354 1.863502 4.822954 0.153403 1.288566 +1200 1633 1.307354 1.863384 4.821662 0.153403 1.288546 +1200 1634 1.307354 1.863879 4.827084 0.152791 1.288690 +1200 1635 1.307354 1.864264 4.831320 0.152485 1.288756 +1200 1636 1.307354 1.864279 4.831480 0.152485 1.288766 +1200 1637 1.307354 1.864666 4.835753 0.152029 1.288864 +1200 1638 1.307354 1.864798 4.837203 0.152029 1.288879 +1200 1639 1.307354 1.865192 4.841559 0.151725 
1.288947 +1200 1640 1.307354 1.865206 4.841718 0.151725 1.288912 +1200 1641 1.307354 1.865590 4.845979 0.151422 1.288996 +1200 1642 1.307354 1.866088 4.851517 0.150818 1.289136 +1200 1643 1.307354 1.866339 4.854313 0.150667 1.289173 +1200 1644 1.307354 1.866850 4.860030 0.149916 1.289362 +1200 1645 1.307354 1.866813 4.859624 0.149916 1.289314 +1200 1646 1.307354 1.866932 4.860951 0.149916 1.289295 +1200 1647 1.307354 1.867566 4.868080 0.149318 1.289463 +1200 1648 1.307354 1.867819 4.870928 0.149020 1.289553 +1200 1649 1.307354 1.867945 4.872362 0.148722 1.289602 +1200 1650 1.307354 1.868459 4.878177 0.148425 1.289693 +1200 1651 1.307354 1.869130 4.885801 0.147833 1.289796 +1200 1652 1.307354 1.869257 4.887246 0.147833 1.289814 +1200 1653 1.307354 1.869122 4.885713 0.147833 1.289763 +1200 1654 1.307354 1.868753 4.881510 0.147833 1.289703 +1200 1655 1.307354 1.868482 4.878439 0.147833 1.289582 +1200 1656 1.307354 1.868613 4.879921 0.147833 1.289602 +1200 1657 1.307354 1.868601 4.879791 0.147833 1.289598 +1200 1658 1.307354 1.869229 4.886926 0.147833 1.289733 +1200 1659 1.307354 1.869246 4.887127 0.147833 1.289703 +1200 1660 1.307354 1.869615 4.891345 0.147833 1.289799 +1200 1661 1.307354 1.869477 4.889761 0.147833 1.289748 +1200 1662 1.307354 1.869339 4.888188 0.147833 1.289718 +1200 1663 1.307354 1.869864 4.894188 0.147686 1.289838 +1200 1664 1.307354 1.869981 4.895537 0.147538 1.289860 +1200 1665 1.307354 1.870019 4.895968 0.147538 1.289836 +1200 1666 1.307354 1.870284 4.899009 0.147243 1.289940 +1200 1667 1.307354 1.870554 4.902118 0.146949 1.289975 +1200 1668 1.307354 1.870426 4.900649 0.146949 1.289963 +1200 1669 1.307354 1.870810 4.905073 0.146656 1.290054 +1200 1670 1.307354 1.870418 4.900557 0.146656 1.289930 +1200 1671 1.307354 1.870561 4.902199 0.146656 1.289993 +1200 1672 1.307354 1.870669 4.903448 0.146656 1.290066 +1200 1673 1.307354 1.870930 4.906451 0.146217 1.290173 +1200 1674 1.307354 1.871181 4.909352 0.146071 1.290190 +1200 1675 1.307354 1.871309 
4.910834 0.145925 1.290237 +1200 1676 1.307354 1.871432 4.912263 0.145779 1.290258 +1200 1677 1.307354 1.871699 4.915367 0.145488 1.290337 +1200 1678 1.307354 1.872083 4.919827 0.145052 1.290430 +1200 1679 1.307354 1.872370 4.923179 0.144763 1.290504 +1200 1680 1.307354 1.872257 4.921860 0.144763 1.290455 +1200 1681 1.307354 1.872383 4.923324 0.144763 1.290474 +1200 1682 1.307354 1.872255 4.921839 0.144763 1.290424 +1200 1683 1.307354 1.871992 4.918765 0.144763 1.290351 +1200 1684 1.307354 1.872124 4.920302 0.144763 1.290401 +1200 1685 1.307354 1.872382 4.923321 0.144763 1.290472 +1200 1686 1.307354 1.872638 4.926311 0.144618 1.290507 +1200 1687 1.307354 1.872760 4.927740 0.144474 1.290563 +1200 1688 1.307354 1.872747 4.927591 0.144474 1.290561 +1200 1689 1.307354 1.872874 4.929075 0.144185 1.290606 +1200 1690 1.307354 1.873019 4.930776 0.144041 1.290654 +1200 1691 1.307354 1.873534 4.936830 0.143466 1.290770 +1200 1692 1.307354 1.873787 4.939816 0.143180 1.290838 +1200 1693 1.307354 1.873931 4.941518 0.143180 1.290822 +1200 1694 1.307354 1.874192 4.944603 0.142894 1.290892 +1200 1695 1.307354 1.874330 4.946238 0.142751 1.290915 +1200 1696 1.307354 1.874322 4.946138 0.142751 1.290937 +1200 1697 1.307354 1.874327 4.946203 0.142751 1.290944 +1200 1698 1.307354 1.874932 4.953386 0.142039 1.291102 +1200 1699 1.307354 1.874929 4.953354 0.142039 1.291101 +1200 1700 1.307354 1.875179 4.956327 0.141614 1.291181 +1200 1701 1.307354 1.875412 4.959117 0.141473 1.291218 +1200 1702 1.307354 1.875678 4.962298 0.141190 1.291260 +1200 1703 1.307354 1.875949 4.965546 0.141049 1.291297 +1200 1704 1.307354 1.876032 4.966541 0.140908 1.291333 +1200 1705 1.307354 1.876547 4.972740 0.140346 1.291473 +1200 1706 1.307354 1.876181 4.968331 0.140346 1.291416 +1200 1707 1.307354 1.876169 4.968192 0.140346 1.291437 +1200 1708 1.307354 1.876908 4.977103 0.139507 1.291636 +1200 1709 1.307354 1.877297 4.981811 0.139089 1.291731 +1200 1710 1.307354 1.877290 4.981729 0.139089 1.291730 +1200 1711 
1.307354 1.877661 4.986232 0.138534 1.291850 +1200 1712 1.307354 1.877286 4.981672 0.138534 1.291788 +1200 1713 1.307354 1.877402 4.983079 0.138534 1.291831 +1200 1714 1.307354 1.877528 4.984609 0.138534 1.291849 +1200 1715 1.307354 1.877763 4.987473 0.138396 1.291882 +1200 1716 1.307354 1.878023 4.990641 0.138396 1.291895 +1200 1717 1.307354 1.877881 4.988906 0.138396 1.291842 +1200 1718 1.307354 1.878623 4.997978 0.137844 1.292010 +1200 1719 1.307354 1.878619 4.997933 0.137844 1.292008 +1200 1720 1.307354 1.878615 4.997879 0.137844 1.292007 +1200 1721 1.307354 1.878485 4.996290 0.137844 1.291951 +1200 1722 1.307354 1.878620 4.997936 0.137844 1.291947 +1200 1723 1.307354 1.878491 4.996356 0.137844 1.291890 +1200 1724 1.307354 1.878607 4.997781 0.137844 1.291901 +1200 1725 1.307354 1.878731 4.999301 0.137844 1.291917 +1200 1726 1.307354 1.878969 5.002231 0.137844 1.291945 +1200 1727 1.307354 1.879824 5.012771 0.136883 1.292200 +1200 1728 1.307354 1.880084 5.015990 0.136609 1.292246 +1200 1729 1.307354 1.879858 5.013185 0.136609 1.292210 +1200 1730 1.307354 1.880220 5.017676 0.136336 1.292324 +1200 1731 1.307354 1.880584 5.022211 0.135521 1.292497 +1200 1732 1.307354 1.880959 5.026881 0.135251 1.292556 +1200 1733 1.307354 1.881189 5.029762 0.134980 1.292591 +1200 1734 1.307354 1.881677 5.035875 0.134711 1.292670 +1200 1735 1.307354 1.881924 5.038989 0.134308 1.292734 +1200 1736 1.307354 1.881793 5.037336 0.134308 1.292681 +1200 1737 1.307354 1.882166 5.042037 0.133905 1.292820 +1200 1738 1.307354 1.882281 5.043482 0.133638 1.292898 +1200 1739 1.307354 1.882907 5.051408 0.132839 1.293043 +1200 1740 1.307354 1.883020 5.052850 0.132839 1.293063 +1200 1741 1.307354 1.882770 5.049675 0.132839 1.292999 +1200 1742 1.307354 1.882889 5.051186 0.132839 1.293042 +1200 1743 1.307354 1.883000 5.052596 0.132706 1.293095 +1200 1744 1.307354 1.883335 5.056851 0.132309 1.293168 +1200 1745 1.307354 1.883316 5.056617 0.132309 1.293167 +1200 1746 1.307354 1.883447 5.058284 0.132309 
1.293148 +1200 1747 1.307354 1.883330 5.056792 0.132309 1.293139 +1200 1748 1.307354 1.883818 5.063019 0.132045 1.293219 +1200 1749 1.307354 1.883934 5.064495 0.131913 1.293238 +1200 1750 1.307354 1.884204 5.067962 0.131649 1.293306 +1200 1751 1.307354 1.884899 5.076899 0.130862 1.293466 +1200 1752 1.307354 1.885017 5.078432 0.130601 1.293515 +1200 1753 1.307354 1.885258 5.081544 0.130471 1.293551 +1200 1754 1.307354 1.885514 5.084859 0.130210 1.293612 +1200 1755 1.307354 1.885875 5.089558 0.129950 1.293661 +1200 1756 1.307354 1.886108 5.092594 0.129690 1.293719 +1200 1757 1.307354 1.886070 5.092097 0.129690 1.293678 +1200 1758 1.307354 1.885955 5.090598 0.129690 1.293668 +1200 1759 1.307354 1.886307 5.095180 0.129561 1.293743 +1200 1760 1.307354 1.886539 5.098215 0.129431 1.293752 +1200 1761 1.307354 1.886505 5.097779 0.129431 1.293726 +1200 1762 1.307354 1.886611 5.099156 0.129302 1.293775 +1200 1763 1.307354 1.887112 5.105733 0.128915 1.293867 +1200 1764 1.307354 1.887475 5.110511 0.128529 1.293946 +1200 1765 1.307354 1.887615 5.112362 0.128272 1.293991 +1200 1766 1.307354 1.887714 5.113675 0.128144 1.294005 +1200 1767 1.307354 1.887834 5.115259 0.128016 1.294048 +1200 1768 1.307354 1.887711 5.113627 0.128016 1.294027 +1200 1769 1.307354 1.888076 5.118470 0.127633 1.294131 +1200 1770 1.307354 1.887975 5.117126 0.127633 1.294116 +1200 1771 1.307354 1.888453 5.123475 0.126997 1.294243 +1200 1772 1.307354 1.888579 5.125155 0.126997 1.294258 +1200 1773 1.307354 1.888458 5.123540 0.126997 1.294217 +1200 1774 1.307354 1.888799 5.128082 0.126616 1.294322 +1200 1775 1.307354 1.889140 5.132645 0.126364 1.294371 +1200 1776 1.307354 1.889519 5.137728 0.125859 1.294482 +1200 1777 1.307354 1.889874 5.142491 0.125608 1.294524 +1200 1778 1.307354 1.889532 5.137901 0.125608 1.294428 +1200 1779 1.307354 1.889408 5.136232 0.125608 1.294411 +1200 1780 1.307354 1.889635 5.139282 0.125608 1.294491 +1200 1781 1.307354 1.889881 5.142594 0.125232 1.294612 +1200 1782 1.307354 1.889757 
5.140924 0.125232 1.294586 +1200 1783 1.307354 1.889887 5.142672 0.125232 1.294609 +1200 1784 1.307354 1.890595 5.152237 0.124483 1.294770 +1200 1785 1.307354 1.890738 5.154177 0.124235 1.294821 +1200 1786 1.307354 1.890844 5.155621 0.124110 1.294838 +1200 1787 1.307354 1.891096 5.159046 0.123863 1.294873 +1200 1788 1.307354 1.891445 5.163811 0.123492 1.294945 +1200 1789 1.307354 1.891691 5.167167 0.123245 1.295006 +1200 1790 1.307354 1.891564 5.165439 0.123245 1.294966 +1200 1791 1.307354 1.891554 5.165300 0.123245 1.294966 +1200 1792 1.307354 1.891895 5.169972 0.122876 1.295088 +1200 1793 1.307354 1.892112 5.172950 0.122631 1.295120 +1200 1794 1.307354 1.891986 5.171212 0.122631 1.295103 +1200 1795 1.307354 1.892224 5.174490 0.122386 1.295170 +1200 1796 1.307354 1.892350 5.176224 0.122264 1.295187 +1200 1797 1.307354 1.892682 5.180798 0.121776 1.295287 +1200 1798 1.307354 1.892920 5.184096 0.121654 1.295318 +1200 1799 1.307354 1.892688 5.180884 0.121654 1.295290 +1200 1800 1.307354 1.892698 5.181020 0.121654 1.295316 +1200 1801 1.307354 1.893275 5.189008 0.120927 1.295449 +1200 1802 1.307354 1.893490 5.192001 0.120806 1.295479 +1200 1803 1.307354 1.893951 5.198426 0.120444 1.295544 +1200 1804 1.307354 1.894217 5.202153 0.120084 1.295607 +1200 1805 1.307354 1.894675 5.208580 0.119485 1.295723 +1200 1806 1.307354 1.895406 5.218903 0.118889 1.295841 +1200 1807 1.307354 1.895305 5.217470 0.118771 1.295859 +1200 1808 1.307354 1.895546 5.220884 0.118533 1.295917 +1200 1809 1.307354 1.895778 5.224185 0.118297 1.295945 +1200 1810 1.307354 1.895773 5.224112 0.118297 1.295945 +1200 1811 1.307354 1.896125 5.229119 0.117943 1.296015 +1200 1812 1.307354 1.896365 5.232547 0.117707 1.296067 +1200 1813 1.307354 1.896475 5.234118 0.117472 1.296102 +1200 1814 1.307354 1.896699 5.237320 0.117355 1.296130 +1200 1815 1.307354 1.896816 5.239006 0.117120 1.296172 +1200 1816 1.307354 1.896835 5.239280 0.117120 1.296176 +1200 1817 1.307354 1.896946 5.240878 0.117003 1.296187 +1200 1818 
1.307354 1.897520 5.249140 0.116536 1.296282 +1200 1819 1.307354 1.897865 5.254128 0.116187 1.296353 +1200 1820 1.307354 1.897867 5.254165 0.116187 1.296328 +1200 1821 1.307354 1.898189 5.258840 0.116071 1.296372 +1200 1822 1.307354 1.898061 5.256971 0.116071 1.296363 +1200 1823 1.307354 1.898630 5.265271 0.115724 1.296434 +1200 1824 1.307354 1.898962 5.270118 0.115608 1.296456 +1200 1825 1.307354 1.899058 5.271523 0.115377 1.296492 +1200 1826 1.307354 1.899405 5.276616 0.114917 1.296567 +1200 1827 1.307354 1.899292 5.274958 0.114917 1.296504 +1200 1828 1.307354 1.899512 5.278202 0.114917 1.296557 +1200 1829 1.307354 1.899617 5.279743 0.114802 1.296597 +1200 1830 1.307354 1.900055 5.286215 0.114230 1.296705 +1200 1831 1.307354 1.900164 5.287838 0.114116 1.296722 +1200 1832 1.307354 1.900501 5.292828 0.113888 1.296766 +1200 1833 1.307354 1.900610 5.294460 0.113661 1.296803 +1200 1834 1.307354 1.900368 5.290855 0.113661 1.296769 +1200 1835 1.307354 1.900593 5.294195 0.113661 1.296800 +1200 1836 1.307354 1.900598 5.294274 0.113661 1.296800 +1200 1837 1.307354 1.900603 5.294357 0.113661 1.296802 +1200 1838 1.307354 1.900834 5.297792 0.113320 1.296858 +1200 1839 1.307354 1.900934 5.299286 0.113207 1.296891 +1200 1840 1.307354 1.901048 5.300996 0.113094 1.296903 +1200 1841 1.307354 1.901272 5.304336 0.112868 1.296958 +1200 1842 1.307354 1.901490 5.307612 0.112643 1.296985 +1200 1843 1.307354 1.901591 5.309122 0.112418 1.297027 +1200 1844 1.307354 1.901571 5.308827 0.112305 1.297047 +1200 1845 1.307354 1.901471 5.307320 0.112305 1.297055 +1200 1846 1.307354 1.901479 5.307447 0.112305 1.297057 +1200 1847 1.307354 1.901728 5.311195 0.111969 1.297116 +1200 1848 1.307354 1.901963 5.314729 0.111634 1.297185 +1200 1849 1.307354 1.902173 5.317905 0.111300 1.297232 +1200 1850 1.307354 1.902380 5.321037 0.111077 1.297283 +1200 1851 1.307354 1.902492 5.322731 0.110966 1.297295 +1200 1852 1.307354 1.903025 5.330833 0.110193 1.297435 +1200 1853 1.307354 1.902607 5.324482 0.110193 
1.297336 +1200 1854 1.307354 1.902589 5.324207 0.110193 1.297335 +1200 1855 1.307354 1.902831 5.327880 0.110193 1.297409 +1200 1856 1.307354 1.903161 5.332917 0.110083 1.297455 +1200 1857 1.307354 1.903185 5.333270 0.110083 1.297435 +1200 1858 1.307354 1.903397 5.336524 0.109863 1.297505 +1200 1859 1.307354 1.903169 5.333038 0.109863 1.297456 +1200 1860 1.307354 1.903258 5.334389 0.109863 1.297445 +1200 1861 1.307354 1.903393 5.336451 0.109863 1.297489 +1200 1862 1.307354 1.903253 5.334309 0.109863 1.297470 +1200 1863 1.307354 1.903686 5.340942 0.109534 1.297547 +1200 1864 1.307354 1.903815 5.342923 0.109425 1.297564 +1200 1865 1.307354 1.903807 5.342808 0.109425 1.297562 +1200 1866 1.307354 1.904212 5.349043 0.108988 1.297652 +1200 1867 1.307354 1.904335 5.350945 0.108988 1.297643 +1200 1868 1.307354 1.904241 5.349489 0.108988 1.297628 +1200 1869 1.307354 1.904331 5.350880 0.108988 1.297639 +1200 1870 1.307354 1.904660 5.355965 0.108770 1.297679 +1200 1871 1.307354 1.904897 5.359655 0.108662 1.297706 +1200 1872 1.307354 1.904906 5.359783 0.108662 1.297706 +1200 1873 1.307354 1.904781 5.357852 0.108662 1.297711 +1200 1874 1.307354 1.905093 5.362700 0.108336 1.297768 +1200 1875 1.307354 1.904643 5.355711 0.108336 1.297719 +1200 1876 1.307354 1.905213 5.364568 0.107904 1.297845 +1200 1877 1.307354 1.905334 5.366456 0.107904 1.297836 +1200 1878 1.307354 1.905755 5.373048 0.107473 1.297907 +1200 1879 1.307354 1.905969 5.376407 0.107366 1.297934 +1200 1880 1.307354 1.906055 5.377757 0.107366 1.297936 +1200 1881 1.307354 1.906389 5.383006 0.107152 1.297971 +1200 1882 1.307354 1.906724 5.388305 0.106831 1.298018 +1200 1883 1.307354 1.907686 5.403620 0.105663 1.298217 +1200 1884 1.307354 1.908026 5.409068 0.105346 1.298277 +1200 1885 1.307354 1.908444 5.415789 0.104926 1.298336 +1200 1886 1.307354 1.908222 5.412224 0.104926 1.298269 +1200 1887 1.307354 1.908656 5.419224 0.104926 1.298336 +1200 1888 1.307354 1.908656 5.419224 0.104926 1.298336 +1200 1889 1.307354 1.908768 
5.421022 0.104717 1.298373 +1200 1890 1.307354 1.908766 5.420998 0.104717 1.298372 +1200 1891 1.307354 1.908877 5.422796 0.104403 1.298431 +1200 1892 1.307354 1.908982 5.424494 0.104299 1.298452 +1200 1893 1.307354 1.909086 5.426192 0.104090 1.298488 +1200 1894 1.307354 1.909191 5.427884 0.103986 1.298504 +1200 1895 1.307354 1.909183 5.427757 0.103986 1.298500 +1200 1896 1.307354 1.909476 5.432523 0.103675 1.298552 +1200 1897 1.307354 1.909479 5.432585 0.103675 1.298531 +1200 1898 1.307354 1.909611 5.434740 0.103572 1.298573 +1200 1899 1.307354 1.910047 5.441863 0.103261 1.298633 +1200 1900 1.307354 1.910267 5.445484 0.103055 1.298664 +1200 1901 1.307354 1.910472 5.448867 0.102747 1.298705 +1200 1902 1.307354 1.910691 5.452468 0.102439 1.298753 +1200 1903 1.307354 1.910677 5.452243 0.102439 1.298752 +1200 1904 1.307354 1.910982 5.457284 0.102234 1.298787 +1200 1905 1.307354 1.911177 5.460532 0.102132 1.298813 +1200 1906 1.307354 1.911628 5.468042 0.101623 1.298891 +1200 1907 1.307354 1.911822 5.471273 0.101420 1.298935 +1200 1908 1.307354 1.912026 5.474693 0.101117 1.298984 +1200 1909 1.307354 1.912327 5.479744 0.100814 1.299038 +1200 1910 1.307354 1.912905 5.489502 0.100211 1.299122 +1200 1911 1.307354 1.913123 5.493209 0.100011 1.299167 +1200 1912 1.307354 1.913374 5.497473 0.099512 1.299243 +1200 1913 1.307354 1.913582 5.501011 0.099214 1.299284 +1200 1914 1.307354 1.914000 5.508164 0.098720 1.299374 +1200 1915 1.307354 1.913900 5.506449 0.098720 1.299362 +1200 1916 1.307354 1.913997 5.508120 0.098621 1.299378 +1200 1917 1.307354 1.914221 5.511955 0.098523 1.299405 +1200 1918 1.307354 1.914534 5.517352 0.098326 1.299423 +1200 1919 1.307354 1.914848 5.522780 0.098130 1.299456 +1200 1920 1.307354 1.914712 5.520418 0.098130 1.299424 +1200 1921 1.307354 1.914613 5.518704 0.098130 1.299392 +1200 1922 1.307354 1.914714 5.520459 0.098130 1.299423 +1200 1923 1.307354 1.914818 5.522261 0.098130 1.299441 +1200 1924 1.307354 1.914620 5.518831 0.098130 1.299416 +1200 1925 
1.307354 1.914696 5.520137 0.098130 1.299427 +1200 1926 1.307354 1.914978 5.525018 0.097836 1.299513 +1200 1927 1.307354 1.915089 5.526958 0.097738 1.299527 +1200 1928 1.307354 1.915108 5.527273 0.097738 1.299529 +1200 1929 1.307354 1.915328 5.531107 0.097445 1.299576 +1200 1930 1.307354 1.915424 5.532774 0.097348 1.299590 +1200 1931 1.307354 1.915529 5.534604 0.097251 1.299600 +1200 1932 1.307354 1.916256 5.547344 0.096380 1.299735 +1200 1933 1.307354 1.916519 5.551983 0.096091 1.299789 +1200 1934 1.307354 1.916830 5.557480 0.095900 1.299822 +1200 1935 1.307354 1.916824 5.557376 0.095900 1.299805 +1200 1936 1.307354 1.916949 5.559583 0.095900 1.299817 +1200 1937 1.307354 1.916407 5.549999 0.095900 1.299712 +1200 1938 1.307354 1.916408 5.550023 0.095900 1.299688 +1200 1939 1.307354 1.916436 5.550524 0.095900 1.299694 +1200 1940 1.307354 1.916562 5.552732 0.095900 1.299707 +1200 1941 1.307354 1.916627 5.553895 0.095900 1.299732 +1200 1942 1.307354 1.916927 5.559198 0.095900 1.299782 +1200 1943 1.307354 1.917013 5.560721 0.095900 1.299791 +1200 1944 1.307354 1.917211 5.564252 0.095900 1.299811 +1200 1945 1.307354 1.917207 5.564172 0.095900 1.299811 +1200 1946 1.307354 1.917207 5.564169 0.095900 1.299811 +1200 1947 1.307354 1.917620 5.571545 0.095612 1.299860 +1200 1948 1.307354 1.917519 5.569729 0.095612 1.299850 +1200 1949 1.307354 1.917635 5.571815 0.095421 1.299886 +1200 1950 1.307354 1.918210 5.582126 0.094851 1.299967 +1200 1951 1.307354 1.918111 5.580345 0.094851 1.299958 +1200 1952 1.307354 1.918121 5.580517 0.094851 1.299960 +1200 1953 1.307354 1.918532 5.587931 0.094662 1.300008 +1200 1954 1.307354 1.918927 5.595099 0.094472 1.300034 +1200 1955 1.307354 1.918942 5.595363 0.094472 1.300037 +1200 1956 1.307354 1.918670 5.590436 0.094472 1.299988 +1200 1957 1.307354 1.919066 5.597629 0.094472 1.300033 +1200 1958 1.307354 1.919060 5.597506 0.094472 1.300031 +1200 1959 1.307354 1.919347 5.602749 0.094190 1.300081 +1200 1960 1.307354 1.920303 5.620297 0.093160 
1.300238 +1200 1961 1.307354 1.920601 5.625816 0.092881 1.300270 +1200 1962 1.307354 1.920706 5.627756 0.092788 1.300283 +1200 1963 1.307354 1.921015 5.633503 0.092325 1.300355 +1200 1964 1.307354 1.921104 5.635162 0.092141 1.300384 +1200 1965 1.307354 1.921188 5.636726 0.091957 1.300412 +1200 1966 1.307354 1.921603 5.644506 0.091682 1.300461 +1200 1967 1.307354 1.921813 5.648453 0.091590 1.300465 +1200 1968 1.307354 1.922108 5.654011 0.091316 1.300512 +1200 1969 1.307354 1.922125 5.654327 0.091316 1.300497 +1200 1970 1.307354 1.922288 5.657416 0.090952 1.300566 +1200 1971 1.307354 1.922409 5.659710 0.090770 1.300591 +1200 1972 1.307354 1.922538 5.662159 0.090498 1.300628 +1200 1973 1.307354 1.922436 5.660227 0.090498 1.300578 +1200 1974 1.307354 1.922528 5.661957 0.090498 1.300586 +1200 1975 1.307354 1.922742 5.666028 0.090498 1.300592 +1200 1976 1.307354 1.922953 5.670060 0.090498 1.300613 +1200 1977 1.307354 1.923154 5.673907 0.090317 1.300655 +1200 1978 1.307354 1.923452 5.679612 0.089957 1.300705 +1200 1979 1.307354 1.923247 5.675690 0.089957 1.300658 +1200 1980 1.307354 1.923134 5.673521 0.089957 1.300629 +1200 1981 1.307354 1.923125 5.673339 0.089957 1.300611 +1200 1982 1.307354 1.923347 5.677596 0.089957 1.300618 +1200 1983 1.307354 1.923341 5.677492 0.089957 1.300617 +1200 1984 1.307354 1.923732 5.685006 0.089957 1.300660 +1200 1985 1.307354 1.923542 5.681353 0.089957 1.300625 +1200 1986 1.307354 1.923819 5.686680 0.089957 1.300669 +1200 1987 1.307354 1.923937 5.688965 0.089957 1.300704 +1200 1988 1.307354 1.923921 5.688656 0.089957 1.300703 +1200 1989 1.307354 1.924500 5.699889 0.089419 1.300781 +1200 1990 1.307354 1.924685 5.703490 0.089241 1.300815 +1200 1991 1.307354 1.924766 5.705072 0.089152 1.300824 +1200 1992 1.307354 1.925275 5.715058 0.088530 1.300916 +1200 1993 1.307354 1.925372 5.716973 0.088353 1.300948 +1200 1994 1.307354 1.925666 5.722772 0.088001 1.300995 +1200 1995 1.307354 1.925743 5.724305 0.087913 1.301005 +1200 1996 1.307354 1.925767 
5.724771 0.087913 1.300991 +1200 1997 1.307354 1.925754 5.724511 0.087913 1.300992 +1200 1998 1.307354 1.925965 5.728693 0.087825 1.301019 +1200 1999 1.307354 1.926156 5.732485 0.087737 1.301038 +1200 2000 1.307354 1.926248 5.734322 0.087650 1.301047 +1200 2001 1.307354 1.926531 5.739986 0.087300 1.301093 +1200 2002 1.307354 1.926525 5.739862 0.087300 1.301096 +1200 2003 1.307354 1.926712 5.743609 0.086865 1.301153 +1200 2004 1.307354 1.926644 5.742235 0.086865 1.301135 +1200 2005 1.307354 1.926639 5.742130 0.086865 1.301154 +1200 2006 1.307354 1.926908 5.747542 0.086605 1.301199 +1200 2007 1.307354 1.926898 5.747332 0.086518 1.301211 +1200 2008 1.307354 1.926811 5.745596 0.086518 1.301173 +1200 2009 1.307354 1.927388 5.757210 0.086259 1.301245 +1200 2010 1.307354 1.927469 5.758861 0.086087 1.301269 +1200 2011 1.307354 1.927690 5.763336 0.085829 1.301309 +1200 2012 1.307354 1.927792 5.765411 0.085572 1.301335 +1200 2013 1.307354 1.927691 5.763363 0.085572 1.301306 +1200 2014 1.307354 1.927585 5.761203 0.085572 1.301328 +1200 2015 1.307354 1.927879 5.767184 0.085487 1.301357 +1200 2016 1.307354 1.928387 5.777576 0.084806 1.301444 +1200 2017 1.307354 1.928559 5.781109 0.084552 1.301479 +1200 2018 1.307354 1.929140 5.793096 0.083962 1.301569 +1200 2019 1.307354 1.929329 5.797021 0.083544 1.301624 +1200 2020 1.307354 1.929219 5.794744 0.083544 1.301613 +1200 2021 1.307354 1.929512 5.800835 0.083127 1.301679 +1200 2022 1.307354 1.929584 5.802337 0.082961 1.301706 +1200 2023 1.307354 1.929678 5.804296 0.082878 1.301716 +1200 2024 1.307354 1.929568 5.801989 0.082878 1.301685 +1200 2025 1.307354 1.929557 5.801770 0.082878 1.301685 +1200 2026 1.307354 1.929645 5.803600 0.082878 1.301695 +1200 2027 1.307354 1.929727 5.805318 0.082878 1.301703 +1200 2028 1.307354 1.929811 5.807074 0.082796 1.301725 +1200 2029 1.307354 1.929949 5.809957 0.082548 1.301760 +1200 2030 1.307354 1.930050 5.812070 0.082383 1.301785 +1200 2031 1.307354 1.929956 5.810117 0.082383 1.301775 +1200 2032 
1.307354 1.929932 5.809601 0.082383 1.301755 +1200 2033 1.307354 1.929935 5.809662 0.082383 1.301741 +1200 2034 1.307354 1.929848 5.807837 0.082383 1.301714 +1200 2035 1.307354 1.930391 5.819257 0.082301 1.301798 +1200 2036 1.307354 1.930658 5.824902 0.081972 1.301839 +1200 2037 1.307354 1.930658 5.824902 0.081972 1.301839 +1200 2038 1.307354 1.930669 5.825126 0.081972 1.301841 +1200 2039 1.307354 1.930837 5.828689 0.081808 1.301858 +1200 2040 1.307354 1.930954 5.831180 0.081645 1.301884 +1200 2041 1.307354 1.931164 5.835646 0.081482 1.301907 +1200 2042 1.307354 1.931429 5.841315 0.081157 1.301949 +1200 2043 1.307354 1.931238 5.837223 0.081157 1.301938 +1200 2044 1.307354 1.931519 5.843237 0.080752 1.301998 +1200 2045 1.307354 1.931522 5.843294 0.080752 1.301997 +1200 2046 1.307354 1.931526 5.843393 0.080752 1.301984 +1200 2047 1.307354 1.931712 5.847389 0.080511 1.302033 +1200 2048 1.307354 1.931606 5.845104 0.080511 1.302021 +1200 2049 1.307354 1.931685 5.846806 0.080511 1.302033 +1200 2050 1.307354 1.931986 5.853279 0.080109 1.302083 +1200 2051 1.307354 1.931908 5.851599 0.080109 1.302084 +1200 2052 1.307354 1.931858 5.850528 0.080109 1.302073 +1200 2053 1.307354 1.932138 5.856559 0.080029 1.302101 +1200 2054 1.307354 1.932322 5.860562 0.079630 1.302148 +1200 2055 1.307354 1.932517 5.864782 0.079392 1.302184 +1200 2056 1.307354 1.932628 5.867197 0.079233 1.302206 +1200 2057 1.307354 1.932791 5.870754 0.078996 1.302236 +1200 2058 1.307354 1.932314 5.860385 0.078996 1.302143 +1200 2059 1.307354 1.932326 5.860647 0.078996 1.302143 +1200 2060 1.307354 1.932625 5.867140 0.078996 1.302186 +1200 2061 1.307354 1.932725 5.869319 0.078996 1.302182 +1200 2062 1.307354 1.933000 5.875323 0.078996 1.302224 +1200 2063 1.307354 1.933190 5.879489 0.078917 1.302243 +1200 2064 1.307354 1.933397 5.884049 0.078681 1.302279 +1200 2065 1.307354 1.933680 5.890292 0.078445 1.302305 +1200 2066 1.307354 1.933790 5.892720 0.078210 1.302331 +1200 2067 1.307354 1.933824 5.893471 0.078132 
1.302348 +1200 2068 1.307354 1.934208 5.902016 0.077588 1.302415 +1200 2069 1.307354 1.934030 5.898046 0.077588 1.302396 +1200 2070 1.307354 1.933998 5.897332 0.077588 1.302373 +1200 2071 1.307354 1.933905 5.895276 0.077588 1.302368 +1200 2072 1.307354 1.933998 5.897332 0.077588 1.302376 +1200 2073 1.307354 1.934051 5.898531 0.077588 1.302379 +1200 2074 1.307354 1.934134 5.900377 0.077588 1.302389 +1200 2075 1.307354 1.934497 5.908481 0.077433 1.302439 +1200 2076 1.307354 1.934419 5.906732 0.077433 1.302437 +1200 2077 1.307354 1.934791 5.915083 0.077047 1.302484 +1200 2078 1.307354 1.934804 5.915364 0.077047 1.302485 +1200 2079 1.307354 1.935163 5.923466 0.076816 1.302517 +1200 2080 1.307354 1.935187 5.924009 0.076663 1.302533 +1200 2081 1.307354 1.935360 5.927920 0.076509 1.302548 +1200 2082 1.307354 1.935444 5.929832 0.076509 1.302556 +1200 2083 1.307354 1.935696 5.935569 0.076128 1.302597 +1200 2084 1.307354 1.935790 5.937713 0.076052 1.302607 +1200 2085 1.307354 1.935890 5.940001 0.075900 1.302630 +1200 2086 1.307354 1.935971 5.941844 0.075900 1.302627 +1200 2087 1.307354 1.936348 5.950504 0.075522 1.302679 +1200 2088 1.307354 1.936618 5.956741 0.075296 1.302708 +1200 2089 1.307354 1.936860 5.962342 0.074920 1.302754 +1200 2090 1.307354 1.937326 5.973206 0.074175 1.302841 +1200 2091 1.307354 1.937343 5.973605 0.074175 1.302845 +1200 2092 1.307354 1.937326 5.973204 0.074175 1.302845 +1200 2093 1.307354 1.937505 5.977390 0.073879 1.302874 +1200 2094 1.307354 1.937895 5.986565 0.073364 1.302936 +1200 2095 1.307354 1.937808 5.984509 0.073364 1.302928 +1200 2096 1.307354 1.937691 5.981776 0.073364 1.302919 +1200 2097 1.307354 1.937693 5.981802 0.073364 1.302917 +1200 2098 1.307354 1.937962 5.988142 0.073364 1.302941 +1200 2099 1.307354 1.937712 5.982249 0.073364 1.302915 +1200 2100 1.307354 1.937621 5.980118 0.073364 1.302894 +1200 2101 1.307354 1.937733 5.982751 0.073364 1.302922 +1200 2102 1.307354 1.937738 5.982865 0.073364 1.302932 +1200 2103 1.307354 1.937649 
5.980767 0.073364 1.302911 +1200 2104 1.307354 1.937567 5.978845 0.073364 1.302904 +1200 2105 1.307354 1.937560 5.978686 0.073364 1.302902 +1200 2106 1.307354 1.937733 5.982765 0.073364 1.302904 +1200 2107 1.307354 1.937736 5.982824 0.073364 1.302902 +1200 2108 1.307354 1.938002 5.989095 0.073364 1.302925 +1200 2109 1.307354 1.938276 5.995589 0.073144 1.302969 +1200 2110 1.307354 1.938382 5.998108 0.073071 1.302977 +1200 2111 1.307354 1.938373 5.997890 0.073071 1.302970 +1200 2112 1.307354 1.938293 5.995984 0.073071 1.302950 +1200 2113 1.307354 1.938378 5.998017 0.073071 1.302957 +1200 2114 1.307354 1.938263 5.995271 0.073071 1.302945 +1200 2115 1.307354 1.938094 5.991286 0.073071 1.302930 +1200 2116 1.307354 1.937863 5.985828 0.073071 1.302879 +1200 2117 1.307354 1.937776 5.983768 0.073071 1.302874 +1200 2118 1.307354 1.937874 5.986079 0.073071 1.302857 +1200 2119 1.307354 1.938129 5.992093 0.073071 1.302896 +1200 2120 1.307354 1.938433 5.999330 0.073071 1.302925 +1200 2121 1.307354 1.938640 6.004267 0.073071 1.302971 +1200 2122 1.307354 1.938618 6.003738 0.073071 1.302954 +1200 2123 1.307354 1.938620 6.003767 0.073071 1.302955 +1200 2124 1.307354 1.938720 6.006165 0.072998 1.302979 +1200 2125 1.307354 1.938793 6.007908 0.072780 1.303009 +1200 2126 1.307354 1.939227 6.018333 0.072489 1.303047 +1200 2127 1.307354 1.939372 6.021848 0.072345 1.303063 +1200 2128 1.307354 1.939351 6.021328 0.072345 1.303063 +1200 2129 1.307354 1.939708 6.029978 0.071697 1.303135 +1200 2130 1.307354 1.939976 6.036511 0.071411 1.303171 +1200 2131 1.307354 1.940268 6.043639 0.071126 1.303206 +1200 2132 1.307354 1.940439 6.047856 0.070984 1.303222 +1200 2133 1.307354 1.940530 6.050087 0.070842 1.303240 +1200 2134 1.307354 1.940524 6.049934 0.070842 1.303240 +1200 2135 1.307354 1.940527 6.050009 0.070842 1.303237 +1200 2136 1.307354 1.940430 6.047615 0.070842 1.303217 +1200 2137 1.307354 1.940528 6.050031 0.070842 1.303239 +1200 2138 1.307354 1.940594 6.051663 0.070489 1.303278 +1200 2139 
1.307354 1.940928 6.059910 0.070067 1.303333 +1200 2140 1.307354 1.941162 6.065725 0.069927 1.303349 +1200 2141 1.307354 1.941451 6.072943 0.069718 1.303366 +1200 2142 1.307354 1.941410 6.071911 0.069718 1.303361 +1200 2143 1.307354 1.941598 6.076609 0.069648 1.303378 +1200 2144 1.307354 1.941529 6.074879 0.069648 1.303371 +1200 2145 1.307354 1.941549 6.075380 0.069648 1.303374 +1200 2146 1.307354 1.941634 6.077519 0.069648 1.303382 +1200 2147 1.307354 1.941733 6.080010 0.069648 1.303377 +1200 2148 1.307354 1.941887 6.083883 0.069509 1.303392 +1200 2149 1.307354 1.941698 6.079131 0.069509 1.303365 +1200 2150 1.307354 1.941697 6.079095 0.069509 1.303364 +1200 2151 1.307354 1.941699 6.079146 0.069509 1.303364 +1200 2152 1.307354 1.941958 6.085676 0.069509 1.303387 +1200 2153 1.307354 1.942129 6.089993 0.069440 1.303402 +1200 2154 1.307354 1.941889 6.083925 0.069440 1.303383 +1200 2155 1.307354 1.941968 6.085931 0.069440 1.303390 +1200 2156 1.307354 1.941847 6.082876 0.069440 1.303365 +1200 2157 1.307354 1.942122 6.089810 0.069440 1.303397 +1200 2158 1.307354 1.941961 6.085749 0.069440 1.303371 +1200 2159 1.307354 1.941895 6.084079 0.069440 1.303363 +1200 2160 1.307354 1.941996 6.086627 0.069440 1.303373 +1200 2161 1.307354 1.942402 6.096911 0.069094 1.303443 +1200 2162 1.307354 1.942668 6.103696 0.068887 1.303468 +1200 2163 1.307354 1.942903 6.109700 0.068612 1.303494 +1200 2164 1.307354 1.943349 6.121178 0.068134 1.303549 +1200 2165 1.307354 1.943362 6.121508 0.068134 1.303552 +1200 2166 1.307354 1.943348 6.121144 0.068134 1.303549 +1200 2167 1.307354 1.943863 6.134521 0.067726 1.303598 +1200 2168 1.307354 1.944048 6.139352 0.067456 1.303626 +1200 2169 1.307354 1.944113 6.141052 0.067321 1.303643 +1200 2170 1.307354 1.944386 6.148219 0.067120 1.303667 +1200 2171 1.307354 1.944402 6.148619 0.067053 1.303668 +1200 2172 1.307354 1.944578 6.153256 0.066986 1.303681 +1200 2173 1.307354 1.944661 6.155458 0.066785 1.303700 +1200 2174 1.307354 1.944748 6.157766 0.066719 
1.303706 +1200 2175 1.307354 1.944938 6.162794 0.066452 1.303737 +1200 2176 1.307354 1.944954 6.163222 0.066452 1.303737 +1200 2177 1.307354 1.945218 6.170251 0.066187 1.303764 +1200 2178 1.307354 1.945232 6.170641 0.066187 1.303765 +1200 2179 1.307354 1.945063 6.166136 0.066187 1.303740 +1200 2180 1.307354 1.945332 6.173308 0.066187 1.303754 +1200 2181 1.307354 1.945418 6.175601 0.066121 1.303773 +1200 2182 1.307354 1.945348 6.173741 0.066121 1.303767 +1200 2183 1.307354 1.945457 6.176665 0.065857 1.303800 +1200 2184 1.307354 1.945738 6.184217 0.065529 1.303840 +1200 2185 1.307354 1.945810 6.186163 0.065463 1.303844 +1200 2186 1.307354 1.945719 6.183700 0.065463 1.303839 +1200 2187 1.307354 1.945965 6.190341 0.065072 1.303885 +1200 2188 1.307354 1.946311 6.199746 0.064683 1.303926 +1200 2189 1.307354 1.946240 6.197801 0.064683 1.303930 +1200 2190 1.307354 1.946318 6.199933 0.064618 1.303936 +1200 2191 1.307354 1.946330 6.200258 0.064618 1.303925 +1200 2192 1.307354 1.946325 6.200103 0.064618 1.303926 +1200 2193 1.307354 1.946395 6.202019 0.064618 1.303933 +1200 2194 1.307354 1.946136 6.194974 0.064618 1.303904 +1200 2195 1.307354 1.946123 6.194609 0.064618 1.303888 +1200 2196 1.307354 1.946123 6.194618 0.064618 1.303889 +1200 2197 1.307354 1.945862 6.187563 0.064618 1.303869 +1200 2198 1.307354 1.945959 6.190190 0.064618 1.303867 +1200 2199 1.307354 1.946059 6.192880 0.064618 1.303897 +1200 2200 1.307354 1.946292 6.199227 0.064489 1.303948 +1200 2201 1.307354 1.946211 6.197007 0.064489 1.303942 +1200 2202 1.307354 1.946468 6.204025 0.064232 1.303975 +1200 2203 1.307354 1.946552 6.206321 0.063848 1.304018 +1200 2204 1.307354 1.946368 6.201287 0.063848 1.304005 +1200 2205 1.307354 1.946440 6.203261 0.063848 1.304009 +1200 2206 1.307354 1.946376 6.201495 0.063848 1.304006 +1200 2207 1.307354 1.946304 6.199550 0.063848 1.304003 +1200 2208 1.307354 1.946243 6.197889 0.063848 1.304001 +1200 2209 1.307354 1.946310 6.199702 0.063848 1.304003 +1200 2210 1.307354 1.946398 
6.202114 0.063848 1.304010 +1200 2211 1.307354 1.946578 6.207024 0.063657 1.304036 +1200 2212 1.307354 1.946587 6.207266 0.063657 1.304037 +1200 2213 1.307354 1.946586 6.207245 0.063593 1.304047 +1200 2214 1.307354 1.946494 6.204731 0.063593 1.304017 +1200 2215 1.307354 1.946846 6.214369 0.063339 1.304070 +1200 2216 1.307354 1.947018 6.219113 0.063276 1.304079 +1200 2217 1.307354 1.947018 6.219113 0.063276 1.304076 +1200 2218 1.307354 1.947195 6.224007 0.063150 1.304093 +1200 2219 1.307354 1.947203 6.224245 0.063087 1.304094 +1200 2220 1.307354 1.947364 6.228695 0.063087 1.304096 +1200 2221 1.307354 1.947355 6.228440 0.063087 1.304097 +1200 2222 1.307354 1.947537 6.233507 0.062898 1.304114 +1200 2223 1.307354 1.947454 6.231197 0.062898 1.304107 +1200 2224 1.307354 1.947447 6.230992 0.062898 1.304096 +1200 2225 1.307354 1.947707 6.238256 0.062772 1.304127 +1200 2226 1.307354 1.947963 6.245438 0.062459 1.304159 +1200 2227 1.307354 1.948083 6.248794 0.062459 1.304160 +1200 2228 1.307354 1.948095 6.249150 0.062459 1.304161 +1200 2229 1.307354 1.948184 6.251648 0.062272 1.304179 +1200 2230 1.307354 1.948338 6.255991 0.061962 1.304210 +1200 2231 1.307354 1.948656 6.265012 0.061591 1.304250 +1200 2232 1.307354 1.948824 6.269815 0.061468 1.304262 +1200 2233 1.307354 1.948709 6.266542 0.061468 1.304262 +1200 2234 1.307354 1.948855 6.270688 0.061284 1.304282 +1200 2235 1.307354 1.949240 6.281728 0.060857 1.304327 +1200 2236 1.307354 1.949234 6.281581 0.060857 1.304307 +1200 2237 1.307354 1.949399 6.286314 0.060857 1.304308 +1200 2238 1.307354 1.949317 6.283956 0.060857 1.304303 +1200 2239 1.307354 1.949074 6.276964 0.060857 1.304282 +1200 2240 1.307354 1.949058 6.276499 0.060857 1.304281 +1200 2241 1.307354 1.949294 6.283284 0.060857 1.304291 +1200 2242 1.307354 1.949518 6.289772 0.060857 1.304320 +1200 2243 1.307354 1.949746 6.296373 0.060796 1.304328 +1200 2244 1.307354 1.950000 6.303784 0.060493 1.304358 +1200 2245 1.307354 1.950424 6.316229 0.059832 1.304429 +1200 2246 
1.307354 1.950588 6.321055 0.059712 1.304440 +1200 2247 1.307354 1.950599 6.321392 0.059712 1.304417 +1200 2248 1.307354 1.950764 6.326286 0.059712 1.304437 +1200 2249 1.307354 1.950932 6.331261 0.059593 1.304449 +1200 2250 1.307354 1.951014 6.333715 0.059534 1.304455 +1200 2251 1.307354 1.951077 6.335575 0.059415 1.304465 +1200 2252 1.307354 1.950905 6.330464 0.059415 1.304449 +1200 2253 1.307354 1.951157 6.337968 0.059118 1.304494 +1200 2254 1.307354 1.951304 6.342370 0.059059 1.304502 +1200 2255 1.307354 1.951430 6.346172 0.058765 1.304530 +1200 2256 1.307354 1.951554 6.349900 0.058706 1.304538 +1200 2257 1.307354 1.951404 6.345397 0.058706 1.304528 +1200 2258 1.307354 1.951583 6.350762 0.058530 1.304555 +1200 2259 1.307354 1.951801 6.357368 0.058414 1.304562 +1200 2260 1.307354 1.951892 6.360118 0.058297 1.304577 +1200 2261 1.307354 1.952206 6.369675 0.057891 1.304613 +1200 2262 1.307354 1.952283 6.372041 0.057717 1.304631 +1200 2263 1.307354 1.952353 6.374174 0.057602 1.304644 +1200 2264 1.307354 1.952559 6.380521 0.057372 1.304662 +1200 2265 1.307354 1.952475 6.377940 0.057372 1.304645 +1200 2266 1.307354 1.952617 6.382285 0.057258 1.304677 +1200 2267 1.307354 1.952844 6.389313 0.056972 1.304705 +1200 2268 1.307354 1.952678 6.384188 0.056972 1.304689 +1200 2269 1.307354 1.952772 6.387087 0.056915 1.304706 +1200 2270 1.307354 1.953067 6.396235 0.056745 1.304726 +1200 2271 1.307354 1.953343 6.404821 0.056462 1.304749 +1200 2272 1.307354 1.953396 6.406510 0.056349 1.304763 +1200 2273 1.307354 1.953468 6.408766 0.056293 1.304766 +1200 2274 1.307354 1.953560 6.411640 0.056237 1.304772 +1200 2275 1.307354 1.953305 6.403652 0.056237 1.304720 +1200 2276 1.307354 1.953220 6.400985 0.056237 1.304707 +1200 2277 1.307354 1.953437 6.407770 0.056237 1.304734 +1200 2278 1.307354 1.953522 6.410433 0.056237 1.304742 +1200 2279 1.307354 1.953392 6.406377 0.056237 1.304732 +1200 2280 1.307354 1.953479 6.409108 0.056237 1.304731 +1200 2281 1.307354 1.953404 6.406754 0.056237 
1.304735 +1200 2282 1.307354 1.953461 6.408544 0.056237 1.304740 +1200 2283 1.307354 1.953526 6.410584 0.056237 1.304744 +1200 2284 1.307354 1.953541 6.411055 0.056237 1.304765 +1200 2285 1.307354 1.953644 6.414294 0.056012 1.304793 +1200 2286 1.307354 1.953800 6.419206 0.055789 1.304812 +1200 2287 1.307354 1.953885 6.421877 0.055677 1.304825 +1200 2288 1.307354 1.954130 6.429653 0.055455 1.304841 +1200 2289 1.307354 1.954284 6.434567 0.055400 1.304847 +1200 2290 1.307354 1.954271 6.434174 0.055400 1.304847 +1200 2291 1.307354 1.954435 6.439394 0.055179 1.304867 +1200 2292 1.307354 1.954766 6.450043 0.054904 1.304894 +1200 2293 1.307354 1.954752 6.449580 0.054849 1.304898 +1200 2294 1.307354 1.954892 6.454106 0.054575 1.304925 +1200 2295 1.307354 1.955132 6.461904 0.054358 1.304942 +1200 2296 1.307354 1.955206 6.464320 0.054249 1.304950 +1200 2297 1.307354 1.955347 6.468903 0.054141 1.304961 +1200 2298 1.307354 1.955513 6.474346 0.053871 1.304985 +1200 2299 1.307354 1.955664 6.479289 0.053656 1.305004 +1200 2300 1.307354 1.955729 6.481436 0.053656 1.305002 +1200 2301 1.307354 1.955915 6.487578 0.053442 1.305025 +1200 2302 1.307354 1.955823 6.484530 0.053442 1.305021 +1200 2303 1.307354 1.955893 6.486857 0.053335 1.305033 +1200 2304 1.307354 1.955963 6.489162 0.053282 1.305040 +1200 2305 1.307354 1.956296 6.500250 0.052963 1.305066 +1200 2306 1.307354 1.956609 6.510734 0.052541 1.305101 +1200 2307 1.307354 1.956872 6.519603 0.052227 1.305132 +1200 2308 1.307354 1.956821 6.517858 0.052227 1.305120 +1200 2309 1.307354 1.956893 6.520297 0.052175 1.305133 +1200 2310 1.307354 1.957276 6.533326 0.051708 1.305173 +1200 2311 1.307354 1.957359 6.536155 0.051553 1.305189 +1200 2312 1.307354 1.957274 6.533244 0.051553 1.305186 +1200 2313 1.307354 1.957214 6.531189 0.051553 1.305183 +1200 2314 1.307354 1.957455 6.539445 0.051399 1.305199 +1200 2315 1.307354 1.957596 6.544290 0.051194 1.305217 +1200 2316 1.307354 1.957741 6.549269 0.051040 1.305229 +1200 2317 1.307354 1.957888 
6.554339 0.050786 1.305250 +1200 2318 1.307354 1.957733 6.549014 0.050786 1.305231 +1200 2319 1.307354 1.957727 6.548802 0.050786 1.305232 +1200 2320 1.307354 1.957797 6.551217 0.050786 1.305236 +1200 2321 1.307354 1.957708 6.548127 0.050786 1.305237 +1200 2322 1.307354 1.957695 6.547678 0.050786 1.305234 +1200 2323 1.307354 1.957674 6.546981 0.050786 1.305229 +1200 2324 1.307354 1.957895 6.554610 0.050786 1.305245 +1200 2325 1.307354 1.958027 6.559188 0.050583 1.305268 +1200 2326 1.307354 1.958150 6.563468 0.050533 1.305275 +1200 2327 1.307354 1.958378 6.571414 0.050281 1.305293 +1200 2328 1.307354 1.958526 6.576618 0.050180 1.305301 +1200 2329 1.307354 1.958813 6.586728 0.049980 1.305321 +1200 2330 1.307354 1.959082 6.596287 0.049880 1.305329 +1200 2331 1.307354 1.959383 6.607051 0.049434 1.305366 +1200 2332 1.307354 1.959653 6.616780 0.049138 1.305389 +1200 2333 1.307354 1.959679 6.617726 0.048942 1.305402 +1200 2334 1.307354 1.959550 6.613046 0.048942 1.305396 +1200 2335 1.307354 1.959736 6.619773 0.048504 1.305439 +1200 2336 1.307354 1.959802 6.622172 0.048407 1.305446 +1200 2337 1.307354 1.959867 6.624506 0.048359 1.305453 +1200 2338 1.307354 1.960004 6.629484 0.048262 1.305459 +1200 2339 1.307354 1.960149 6.634787 0.048069 1.305476 +1200 2340 1.307354 1.960218 6.637321 0.048021 1.305480 +1200 2341 1.307354 1.960085 6.632462 0.048021 1.305477 +1200 2342 1.307354 1.960159 6.635163 0.047878 1.305491 +1200 2343 1.307354 1.960287 6.639854 0.047782 1.305497 +1200 2344 1.307354 1.960044 6.630960 0.047782 1.305479 +1200 2345 1.307354 1.960128 6.634019 0.047782 1.305484 +1200 2346 1.307354 1.960063 6.631660 0.047782 1.305467 +1200 2347 1.307354 1.960257 6.638738 0.047782 1.305481 +1200 2348 1.307354 1.960325 6.641245 0.047782 1.305483 +1200 2349 1.307354 1.960595 6.651205 0.047544 1.305514 +1200 2350 1.307354 1.960678 6.654285 0.047496 1.305520 +1200 2351 1.307354 1.960700 6.655078 0.047449 1.305524 +1200 2352 1.307354 1.960671 6.653989 0.047449 1.305518 +1200 2353 
1.307354 1.960614 6.651882 0.047449 1.305510 +1200 2354 1.307354 1.960933 6.663761 0.047212 1.305543 +1200 2355 1.307354 1.960919 6.663210 0.047024 1.305558 +1200 2356 1.307354 1.961050 6.668113 0.046883 1.305568 +1200 2357 1.307354 1.960971 6.665168 0.046883 1.305562 +1200 2358 1.307354 1.960979 6.665469 0.046883 1.305560 +1200 2359 1.307354 1.961137 6.671378 0.046743 1.305578 +1200 2360 1.307354 1.961189 6.673321 0.046603 1.305589 +1200 2361 1.307354 1.961211 6.674140 0.046603 1.305589 +1200 2362 1.307354 1.961268 6.676278 0.046463 1.305598 +1200 2363 1.307354 1.961348 6.679307 0.046324 1.305611 +1200 2364 1.307354 1.961173 6.672739 0.046324 1.305600 +1200 2365 1.307354 1.961181 6.673018 0.046324 1.305602 +1200 2366 1.307354 1.961123 6.670831 0.046324 1.305598 +1200 2367 1.307354 1.961344 6.679165 0.046093 1.305628 +1200 2368 1.307354 1.961659 6.691085 0.045726 1.305656 +1200 2369 1.307354 1.961970 6.702955 0.045498 1.305674 +1200 2370 1.307354 1.961876 6.699325 0.045498 1.305666 +1200 2371 1.307354 1.961720 6.693388 0.045498 1.305649 +1200 2372 1.307354 1.961728 6.693678 0.045498 1.305648 +1200 2373 1.307354 1.961733 6.693887 0.045498 1.305650 +1200 2374 1.307354 1.961364 6.679896 0.045498 1.305618 +1200 2375 1.307354 1.961710 6.693029 0.045498 1.305647 +1200 2376 1.307354 1.962000 6.704093 0.045498 1.305672 +1200 2377 1.307354 1.962415 6.720074 0.044911 1.305717 +1200 2378 1.307354 1.962396 6.719358 0.044911 1.305716 +1200 2379 1.307354 1.962552 6.725399 0.044776 1.305725 +1200 2380 1.307354 1.962688 6.730698 0.044598 1.305741 +1200 2381 1.307354 1.962609 6.727615 0.044598 1.305729 +1200 2382 1.307354 1.962517 6.724041 0.044598 1.305729 +1200 2383 1.307354 1.962749 6.733082 0.044464 1.305750 +1200 2384 1.307354 1.962811 6.735486 0.044420 1.305751 +1200 2385 1.307354 1.963037 6.744380 0.044154 1.305770 +1200 2386 1.307354 1.963041 6.744520 0.044154 1.305771 +1200 2387 1.307354 1.962969 6.741685 0.044154 1.305759 +1200 2388 1.307354 1.963091 6.746517 0.043978 
1.305785 +1200 2389 1.307354 1.963205 6.751001 0.043890 1.305791 +1200 2390 1.307354 1.963218 6.751538 0.043759 1.305800 +1200 2391 1.307354 1.963448 6.760681 0.043367 1.305826 +1200 2392 1.307354 1.963583 6.766051 0.043194 1.305841 +1200 2393 1.307354 1.963837 6.776259 0.042979 1.305857 +1200 2394 1.307354 1.963973 6.781707 0.042850 1.305865 +1200 2395 1.307354 1.964114 6.787450 0.042636 1.305881 +1200 2396 1.307354 1.964171 6.789764 0.042466 1.305891 +1200 2397 1.307354 1.964237 6.792442 0.042297 1.305903 +1200 2398 1.307354 1.964235 6.792345 0.042297 1.305903 +1200 2399 1.307354 1.964012 6.783282 0.042297 1.305872 +1200 2400 1.307354 1.964401 6.799138 0.042170 1.305911 +1200 2401 1.307354 1.964499 6.803126 0.042002 1.305925 +1200 2402 1.307354 1.964264 6.793520 0.042002 1.305912 +1200 2403 1.307354 1.964417 6.799785 0.042002 1.305915 +1200 2404 1.307354 1.964182 6.790175 0.042002 1.305902 +1200 2405 1.307354 1.964451 6.801172 0.042002 1.305922 +1200 2406 1.307354 1.964719 6.812174 0.041626 1.305949 +1200 2407 1.307354 1.964647 6.809234 0.041626 1.305944 +1200 2408 1.307354 1.964780 6.814708 0.041626 1.305951 +1200 2409 1.307354 1.964955 6.821954 0.041377 1.305965 +1200 2410 1.307354 1.964928 6.820851 0.041377 1.305958 +1200 2411 1.307354 1.964772 6.814354 0.041377 1.305936 +1200 2412 1.307354 1.964788 6.815025 0.041377 1.305947 +1200 2413 1.307354 1.964943 6.821457 0.041377 1.305955 +1200 2414 1.307354 1.964939 6.821295 0.041377 1.305956 +1200 2415 1.307354 1.964858 6.817931 0.041377 1.305940 +1200 2416 1.307354 1.965161 6.830545 0.041377 1.305963 +1200 2417 1.307354 1.965292 6.836006 0.041336 1.305970 +1200 2418 1.307354 1.965306 6.836604 0.041294 1.305971 +1200 2419 1.307354 1.965488 6.844266 0.041212 1.305976 +1200 2420 1.307354 1.965428 6.841744 0.041212 1.305973 +1200 2421 1.307354 1.965358 6.838801 0.041212 1.305961 +1200 2422 1.307354 1.965244 6.833996 0.041212 1.305953 +1200 2423 1.307354 1.965435 6.842032 0.041212 1.305967 +1200 2424 1.307354 1.965525 
6.845831 0.041212 1.305969 +1200 2425 1.307354 1.965671 6.852003 0.041171 1.305979 +1200 2426 1.307354 1.965915 6.862352 0.040802 1.306004 +1200 2427 1.307354 1.965907 6.862034 0.040802 1.306002 +1200 2428 1.307354 1.965962 6.864393 0.040720 1.306011 +1200 2429 1.307354 1.966146 6.872259 0.040517 1.306024 +1200 2430 1.307354 1.965940 6.863439 0.040517 1.306014 +1200 2431 1.307354 1.966010 6.866425 0.040517 1.306015 +1200 2432 1.307354 1.966083 6.869583 0.040517 1.306022 +1200 2433 1.307354 1.966082 6.869512 0.040517 1.306023 +1200 2434 1.307354 1.966264 6.877361 0.040235 1.306042 +1200 2435 1.307354 1.966384 6.882525 0.040034 1.306055 +1200 2436 1.307354 1.966622 6.892870 0.039676 1.306079 +1200 2437 1.307354 1.966583 6.891158 0.039676 1.306069 +1200 2438 1.307354 1.966638 6.893563 0.039676 1.306079 +1200 2439 1.307354 1.966762 6.898968 0.039478 1.306091 +1200 2440 1.307354 1.966630 6.893197 0.039478 1.306074 +1200 2441 1.307354 1.966812 6.901180 0.039478 1.306080 +1200 2442 1.307354 1.966846 6.902645 0.039478 1.306080 +1200 2443 1.307354 1.967070 6.912548 0.039478 1.306091 +1200 2444 1.307354 1.967185 6.917614 0.039399 1.306097 +1200 2445 1.307354 1.967167 6.916801 0.039321 1.306103 +1200 2446 1.307354 1.967193 6.917984 0.039242 1.306106 +1200 2447 1.307354 1.967179 6.917349 0.039242 1.306106 +1200 2448 1.307354 1.967389 6.926678 0.039085 1.306117 +1200 2449 1.307354 1.967386 6.926555 0.039007 1.306123 +1200 2450 1.307354 1.967472 6.930389 0.038813 1.306134 +1200 2451 1.307354 1.967595 6.935896 0.038542 1.306151 +1200 2452 1.307354 1.967645 6.938133 0.038427 1.306158 +1200 2453 1.307354 1.967695 6.940391 0.038427 1.306159 +1200 2454 1.307354 1.967836 6.946764 0.038197 1.306173 +1200 2455 1.307354 1.967993 6.953881 0.037817 1.306196 +1200 2456 1.307354 1.968072 6.957453 0.037742 1.306202 +1200 2457 1.307354 1.968058 6.956847 0.037742 1.306201 +1200 2458 1.307354 1.968129 6.960058 0.037742 1.306200 +1200 2459 1.307354 1.968188 6.962778 0.037704 1.306203 +1200 2460 
1.307354 1.968257 6.965935 0.037704 1.306205 +1200 2461 1.307354 1.968319 6.968760 0.037554 1.306214 +1200 2462 1.307354 1.968429 6.973820 0.037366 1.306225 +1200 2463 1.307354 1.968566 6.980168 0.037255 1.306232 +1200 2464 1.307354 1.968626 6.982930 0.037255 1.306225 +1200 2465 1.307354 1.968838 6.992777 0.036995 1.306246 +1200 2466 1.307354 1.968825 6.992200 0.036995 1.306236 +1200 2467 1.307354 1.968954 6.998228 0.036995 1.306246 +1200 2468 1.307354 1.968972 6.999048 0.036921 1.306251 +1200 2469 1.307354 1.968864 6.993995 0.036921 1.306237 +1200 2470 1.307354 1.968779 6.990034 0.036921 1.306225 +1200 2471 1.307354 1.968678 6.985329 0.036921 1.306210 +1200 2472 1.307354 1.968939 6.997507 0.036921 1.306229 +1200 2473 1.307354 1.969074 7.003826 0.036921 1.306242 +1200 2474 1.307354 1.968939 6.997511 0.036921 1.306236 +1200 2475 1.307354 1.969074 7.003830 0.036921 1.306242 +1200 2476 1.307354 1.968965 6.998734 0.036921 1.306232 +1200 2477 1.307354 1.968882 6.994852 0.036921 1.306229 +1200 2478 1.307354 1.969031 7.001832 0.036921 1.306240 +1200 2479 1.307354 1.968987 6.999757 0.036921 1.306237 +1200 2480 1.307354 1.969050 7.002688 0.036921 1.306240 +1200 2481 1.307354 1.969301 7.014555 0.036810 1.306258 +1200 2482 1.307354 1.969436 7.020938 0.036664 1.306266 +1200 2483 1.307354 1.969367 7.017673 0.036664 1.306261 +1200 2484 1.307354 1.969674 7.032299 0.036263 1.306291 +1200 2485 1.307354 1.969766 7.036749 0.036190 1.306296 +1200 2486 1.307354 1.970070 7.051422 0.035938 1.306310 +1200 2487 1.307354 1.970285 7.061912 0.035652 1.306327 +1200 2488 1.307354 1.970353 7.065223 0.035509 1.306336 +1200 2489 1.307354 1.970324 7.063811 0.035509 1.306325 +1200 2490 1.307354 1.970498 7.072376 0.035332 1.306346 +1200 2491 1.307354 1.970516 7.073271 0.035297 1.306347 +1200 2492 1.307354 1.970399 7.067483 0.035297 1.306335 +1200 2493 1.307354 1.970260 7.060672 0.035297 1.306327 +1200 2494 1.307354 1.970195 7.057531 0.035297 1.306324 +1200 2495 1.307354 1.970140 7.054856 0.035297 
1.306321 +1200 2496 1.307354 1.970469 7.070925 0.035262 1.306349 +1200 2497 1.307354 1.970564 7.075623 0.035086 1.306359 +1200 2498 1.307354 1.970527 7.073779 0.035051 1.306361 +1200 2499 1.307354 1.970729 7.083783 0.034841 1.306374 +1200 2500 1.307354 1.970696 7.082153 0.034841 1.306373 +1200 2501 1.307354 1.970767 7.085691 0.034702 1.306382 +1200 2502 1.307354 1.970753 7.085001 0.034702 1.306382 +1200 2503 1.307354 1.970698 7.082231 0.034702 1.306368 +1200 2504 1.307354 1.970641 7.079448 0.034702 1.306360 +1200 2505 1.307354 1.970659 7.080338 0.034702 1.306362 +1200 2506 1.307354 1.970663 7.080499 0.034702 1.306362 +1200 2507 1.307354 1.970646 7.079680 0.034702 1.306365 +1200 2508 1.307354 1.970656 7.080144 0.034702 1.306364 +1200 2509 1.307354 1.970557 7.075282 0.034702 1.306359 +1200 2510 1.307354 1.970611 7.077955 0.034702 1.306367 +1200 2511 1.307354 1.970661 7.080425 0.034702 1.306379 +1200 2512 1.307354 1.970665 7.080632 0.034702 1.306377 +1200 2513 1.307354 1.970948 7.094720 0.034392 1.306399 +1200 2514 1.307354 1.970872 7.090896 0.034392 1.306392 +1200 2515 1.307354 1.971049 7.099747 0.034186 1.306410 +1200 2516 1.307354 1.971116 7.103148 0.034118 1.306414 +1200 2517 1.307354 1.971009 7.097748 0.034118 1.306409 +1200 2518 1.307354 1.971044 7.099511 0.034118 1.306415 +1200 2519 1.307354 1.970984 7.096525 0.034118 1.306412 +1200 2520 1.307354 1.970994 7.096998 0.034118 1.306412 +1200 2521 1.307354 1.970940 7.094317 0.034118 1.306414 +1200 2522 1.307354 1.970968 7.095678 0.034118 1.306414 +1200 2523 1.307354 1.971008 7.097683 0.034050 1.306417 +1200 2524 1.307354 1.970903 7.092440 0.034050 1.306414 +1200 2525 1.307354 1.970961 7.095336 0.034050 1.306416 +1200 2526 1.307354 1.971151 7.104886 0.033914 1.306425 +1200 2527 1.307354 1.971149 7.104815 0.033880 1.306428 +1200 2528 1.307354 1.971213 7.107994 0.033880 1.306426 +1200 2529 1.307354 1.971230 7.108886 0.033880 1.306427 +1200 2530 1.307354 1.971082 7.101417 0.033880 1.306419 +1200 2531 1.307354 1.971120 
7.103324 0.033880 1.306416 +1200 2532 1.307354 1.971317 7.113258 0.033812 1.306432 +1200 2533 1.307354 1.971374 7.116177 0.033677 1.306438 +1200 2534 1.307354 1.971377 7.116317 0.033677 1.306438 +1200 2535 1.307354 1.971367 7.115836 0.033677 1.306438 +1200 2536 1.307354 1.971447 7.119860 0.033610 1.306442 +1200 2537 1.307354 1.971441 7.119556 0.033610 1.306442 +1200 2538 1.307354 1.971468 7.120937 0.033576 1.306443 +1200 2539 1.307354 1.971563 7.125781 0.033342 1.306456 +1200 2540 1.307354 1.971628 7.129140 0.033342 1.306456 +1200 2541 1.307354 1.971584 7.126856 0.033342 1.306452 +1200 2542 1.307354 1.971763 7.136038 0.033309 1.306460 +1200 2543 1.307354 1.971831 7.139567 0.033209 1.306464 +1200 2544 1.307354 1.971977 7.147082 0.032813 1.306485 +1200 2545 1.307354 1.971906 7.143432 0.032813 1.306482 +1200 2546 1.307354 1.971787 7.137302 0.032813 1.306472 +1200 2547 1.307354 1.971753 7.135515 0.032813 1.306469 +1200 2548 1.307354 1.971937 7.145022 0.032813 1.306483 +1200 2549 1.307354 1.971931 7.144720 0.032813 1.306483 +1200 2550 1.307354 1.971987 7.147630 0.032813 1.306482 +1200 2551 1.307354 1.971958 7.146136 0.032813 1.306484 +1200 2552 1.307354 1.971959 7.146191 0.032813 1.306478 +1200 2553 1.307354 1.971821 7.139047 0.032813 1.306468 +1200 2554 1.307354 1.971839 7.139980 0.032813 1.306470 +1200 2555 1.307354 1.972018 7.149210 0.032813 1.306479 +1200 2556 1.307354 1.972048 7.150775 0.032813 1.306479 +1200 2557 1.307354 1.972151 7.156164 0.032781 1.306487 +1200 2558 1.307354 1.972254 7.161553 0.032617 1.306496 +1200 2559 1.307354 1.972313 7.164634 0.032617 1.306494 +1200 2560 1.307354 1.972435 7.171058 0.032422 1.306507 +1200 2561 1.307354 1.972531 7.176106 0.032261 1.306516 +1200 2562 1.307354 1.972398 7.169102 0.032261 1.306497 +1200 2563 1.307354 1.972380 7.168142 0.032261 1.306493 +1200 2564 1.307354 1.972508 7.174919 0.032261 1.306503 +1200 2565 1.307354 1.972548 7.176991 0.032261 1.306510 +1200 2566 1.307354 1.972805 7.190660 0.032068 1.306525 +1200 2567 
1.307354 1.972842 7.192665 0.032068 1.306526 +1200 2568 1.307354 1.972843 7.192691 0.031940 1.306532 +1200 2569 1.307354 1.972862 7.193747 0.031940 1.306533 +1200 2570 1.307354 1.972844 7.192756 0.031908 1.306535 +1200 2571 1.307354 1.973002 7.201258 0.031717 1.306543 +1200 2572 1.307354 1.972913 7.196441 0.031717 1.306539 +1200 2573 1.307354 1.972892 7.195311 0.031717 1.306535 +1200 2574 1.307354 1.972907 7.196105 0.031717 1.306541 +1200 2575 1.307354 1.972815 7.191182 0.031717 1.306538 +1200 2576 1.307354 1.973003 7.201299 0.031590 1.306550 +1200 2577 1.307354 1.972898 7.195661 0.031590 1.306541 +1200 2578 1.307354 1.973060 7.204365 0.031496 1.306556 +1200 2579 1.307354 1.973344 7.219731 0.031152 1.306572 +1200 2580 1.307354 1.973457 7.225904 0.030996 1.306581 +1200 2581 1.307354 1.973590 7.233213 0.030780 1.306591 +1200 2582 1.307354 1.973628 7.235313 0.030780 1.306584 +1200 2583 1.307354 1.973679 7.238102 0.030780 1.306584 +1200 2584 1.307354 1.973619 7.234808 0.030780 1.306580 +1200 2585 1.307354 1.973420 7.223894 0.030780 1.306570 +1200 2586 1.307354 1.973225 7.213290 0.030780 1.306557 +1200 2587 1.307354 1.973230 7.213539 0.030780 1.306557 +1200 2588 1.307354 1.973166 7.210073 0.030780 1.306548 +1200 2589 1.307354 1.973224 7.213237 0.030780 1.306544 +1200 2590 1.307354 1.973152 7.209328 0.030780 1.306550 +1200 2591 1.307354 1.973227 7.213372 0.030780 1.306556 +1200 2592 1.307354 1.973282 7.216354 0.030780 1.306564 +1200 2593 1.307354 1.973302 7.217461 0.030780 1.306560 +1200 2594 1.307354 1.973330 7.218984 0.030780 1.306566 +1200 2595 1.307354 1.973134 7.208357 0.030780 1.306548 +1200 2596 1.307354 1.973473 7.226812 0.030780 1.306583 +1200 2597 1.307354 1.973499 7.228243 0.030780 1.306581 +1200 2598 1.307354 1.973494 7.227931 0.030780 1.306585 +1200 2599 1.307354 1.973726 7.240735 0.030627 1.306599 +1200 2600 1.307354 1.973773 7.243315 0.030596 1.306600 +1200 2601 1.307354 1.973990 7.255373 0.030322 1.306614 +1200 2602 1.307354 1.974125 7.262935 0.030141 
1.306623 +1200 2603 1.307354 1.974056 7.259082 0.030141 1.306617 +1200 2604 1.307354 1.974131 7.263245 0.030141 1.306619 +1200 2605 1.307354 1.974088 7.260869 0.030141 1.306614 +1200 2606 1.307354 1.974144 7.263981 0.030141 1.306617 +1200 2607 1.307354 1.974027 7.257420 0.030141 1.306611 +1200 2608 1.307354 1.974126 7.262975 0.030141 1.306623 +1200 2609 1.307354 1.974101 7.261565 0.030111 1.306624 +1200 2610 1.307354 1.974015 7.256751 0.030111 1.306620 +1200 2611 1.307354 1.974129 7.263167 0.030111 1.306625 +1200 2612 1.307354 1.974245 7.269696 0.029781 1.306640 +1200 2613 1.307354 1.974359 7.276140 0.029692 1.306645 +1200 2614 1.307354 1.974281 7.271731 0.029692 1.306641 +1200 2615 1.307354 1.974284 7.271887 0.029692 1.306645 +1200 2616 1.307354 1.974289 7.272140 0.029603 1.306649 +1200 2617 1.307354 1.974438 7.280608 0.029485 1.306655 +1200 2618 1.307354 1.974534 7.286054 0.029192 1.306669 +1200 2619 1.307354 1.974439 7.280681 0.029192 1.306665 +1200 2620 1.307354 1.974379 7.277252 0.029192 1.306662 +1200 2621 1.307354 1.974487 7.283409 0.029192 1.306668 +1200 2622 1.307354 1.974540 7.286401 0.028960 1.306679 +1200 2623 1.307354 1.974708 7.296015 0.028786 1.306687 +1200 2624 1.307354 1.974995 7.312589 0.028386 1.306706 +1200 2625 1.307354 1.974979 7.311645 0.028386 1.306705 +1200 2626 1.307354 1.974982 7.311820 0.028386 1.306706 +1200 2627 1.307354 1.975194 7.324196 0.028076 1.306721 +1200 2628 1.307354 1.975191 7.324031 0.028076 1.306720 +1200 2629 1.307354 1.975185 7.323649 0.028076 1.306720 +1200 2630 1.307354 1.975109 7.319230 0.028076 1.306715 +1200 2631 1.307354 1.975239 7.326829 0.027964 1.306726 +1200 2632 1.307354 1.975308 7.330882 0.027852 1.306731 +1200 2633 1.307354 1.975409 7.336822 0.027686 1.306738 +1200 2634 1.307354 1.975405 7.336613 0.027686 1.306738 +1200 2635 1.307354 1.975686 7.353294 0.027328 1.306754 +1200 2636 1.307354 1.975757 7.357499 0.027274 1.306757 +1200 2637 1.307354 1.975862 7.363836 0.027165 1.306762 +1200 2638 1.307354 1.975934 
7.368153 0.027084 1.306765 +1200 2639 1.307354 1.976103 7.378388 0.026868 1.306774 +1200 2640 1.307354 1.976301 7.390441 0.026574 1.306787 +1200 2641 1.307354 1.976015 7.373059 0.026574 1.306770 +1200 2642 1.307354 1.975950 7.369121 0.026574 1.306767 +1200 2643 1.307354 1.975852 7.363237 0.026574 1.306761 +1200 2644 1.307354 1.975968 7.370214 0.026574 1.306769 +1200 2645 1.307354 1.975923 7.367503 0.026574 1.306764 +1200 2646 1.307354 1.975999 7.372068 0.026574 1.306765 +1200 2647 1.307354 1.976251 7.387419 0.026574 1.306781 +1200 2648 1.307354 1.976241 7.386761 0.026574 1.306781 +1200 2649 1.307354 1.976392 7.396037 0.026574 1.306787 +1200 2650 1.307354 1.976530 7.404574 0.026284 1.306799 +1200 2651 1.307354 1.976648 7.411841 0.026100 1.306808 +1200 2652 1.307354 1.976646 7.411769 0.026100 1.306804 +1200 2653 1.307354 1.976549 7.405696 0.026100 1.306800 +1200 2654 1.307354 1.976467 7.400658 0.026100 1.306794 +1200 2655 1.307354 1.976603 7.409097 0.026100 1.306798 +1200 2656 1.307354 1.976615 7.409818 0.026100 1.306802 +1200 2657 1.307354 1.976791 7.420779 0.026022 1.306811 +1200 2658 1.307354 1.976791 7.420779 0.026022 1.306811 +1200 2659 1.307354 1.976806 7.421699 0.026022 1.306808 +1200 2660 1.307354 1.976811 7.422033 0.026022 1.306806 +1200 2661 1.307354 1.976936 7.429874 0.026022 1.306809 +1200 2662 1.307354 1.977027 7.435629 0.025789 1.306821 +1200 2663 1.307354 1.977042 7.436540 0.025789 1.306814 +1200 2664 1.307354 1.977121 7.441565 0.025789 1.306817 +1200 2665 1.307354 1.977117 7.441299 0.025789 1.306817 +1200 2666 1.307354 1.977170 7.444694 0.025763 1.306822 +1200 2667 1.307354 1.977196 7.446324 0.025686 1.306825 +1200 2668 1.307354 1.977193 7.446135 0.025686 1.306823 +1200 2669 1.307354 1.977205 7.446864 0.025686 1.306824 +1200 2670 1.307354 1.977439 7.461843 0.025405 1.306837 +1200 2671 1.307354 1.977385 7.458379 0.025405 1.306834 +1200 2672 1.307354 1.977411 7.460084 0.025405 1.306834 +1200 2673 1.307354 1.977472 7.464019 0.025279 1.306841 +1200 2674 
1.307354 1.977441 7.462020 0.025254 1.306843 +1200 2675 1.307354 1.977575 7.470619 0.025128 1.306848 +1200 2676 1.307354 1.977691 7.478192 0.025103 1.306849 +1200 2677 1.307354 1.977701 7.478807 0.025103 1.306849 +1200 2678 1.307354 1.977964 7.496008 0.024828 1.306860 +1200 2679 1.307354 1.978111 7.505731 0.024508 1.306873 +1200 2680 1.307354 1.978162 7.509095 0.024288 1.306882 +1200 2681 1.307354 1.978139 7.507563 0.024288 1.306881 +1200 2682 1.307354 1.978064 7.502653 0.024288 1.306876 +1200 2683 1.307354 1.978087 7.504129 0.024288 1.306878 +1200 2684 1.307354 1.978201 7.511688 0.024143 1.306887 +1200 2685 1.307354 1.978227 7.513438 0.024143 1.306885 +1200 2686 1.307354 1.978254 7.515244 0.024143 1.306884 +1200 2687 1.307354 1.978251 7.515056 0.024143 1.306884 +1200 2688 1.307354 1.978431 7.527063 0.024071 1.306890 +1200 2689 1.307354 1.978541 7.534526 0.023831 1.306899 +1200 2690 1.307354 1.978457 7.528822 0.023831 1.306890 +1200 2691 1.307354 1.978515 7.532785 0.023831 1.306895 +1200 2692 1.307354 1.978602 7.538624 0.023736 1.306903 +1200 2693 1.307354 1.978646 7.541622 0.023594 1.306909 +1200 2694 1.307354 1.978608 7.539066 0.023594 1.306905 +1200 2695 1.307354 1.978723 7.546879 0.023500 1.306912 +1200 2696 1.307354 1.978811 7.552882 0.023406 1.306916 +1200 2697 1.307354 1.978573 7.536653 0.023406 1.306897 +1200 2698 1.307354 1.978573 7.536653 0.023406 1.306897 +1200 2699 1.307354 1.978577 7.536981 0.023406 1.306897 +1200 2700 1.307354 1.978574 7.536723 0.023406 1.306897 +1200 2701 1.307354 1.978633 7.540721 0.023406 1.306896 +1200 2702 1.307354 1.978677 7.543738 0.023406 1.306901 +1200 2703 1.307354 1.978718 7.546557 0.023406 1.306902 +1200 2704 1.307354 1.978789 7.551368 0.023406 1.306911 +1200 2705 1.307354 1.978927 7.560870 0.023243 1.306921 +1200 2706 1.307354 1.978928 7.560893 0.023243 1.306921 +1200 2707 1.307354 1.978987 7.564997 0.023104 1.306927 +1200 2708 1.307354 1.979172 7.577774 0.022943 1.306933 +1200 2709 1.307354 1.979441 7.596683 0.022692 
1.306942 +1200 2710 1.307354 1.979497 7.600598 0.022647 1.306944 +1200 2711 1.307354 1.979799 7.622141 0.022355 1.306954 +1200 2712 1.307354 1.979770 7.620036 0.022355 1.306954 +1200 2713 1.307354 1.979728 7.617035 0.022355 1.306954 +1200 2714 1.307354 1.979782 7.620936 0.022243 1.306958 +1200 2715 1.307354 1.979849 7.625746 0.022176 1.306961 +1200 2716 1.307354 1.979688 7.614196 0.022176 1.306956 +1200 2717 1.307354 1.979733 7.617402 0.022176 1.306954 +1200 2718 1.307354 1.979557 7.604827 0.022176 1.306945 +1200 2719 1.307354 1.979457 7.597775 0.022176 1.306939 +1200 2720 1.307354 1.979497 7.600593 0.022176 1.306940 +1200 2721 1.307354 1.979562 7.605190 0.022176 1.306945 +1200 2722 1.307354 1.979515 7.601854 0.022176 1.306938 +1200 2723 1.307354 1.979475 7.599047 0.022176 1.306938 +1200 2724 1.307354 1.979622 7.609486 0.022176 1.306945 +1200 2725 1.307354 1.979465 7.598312 0.022176 1.306940 +1200 2726 1.307354 1.979428 7.595734 0.022176 1.306943 +1200 2727 1.307354 1.979612 7.608745 0.022176 1.306956 +1200 2728 1.307354 1.979623 7.609557 0.022176 1.306953 +1200 2729 1.307354 1.979738 7.617745 0.022176 1.306959 +1200 2730 1.307354 1.979879 7.627860 0.022044 1.306966 +1200 2731 1.307354 1.979928 7.631416 0.022000 1.306967 +1200 2732 1.307354 1.980033 7.639042 0.022000 1.306967 +1200 2733 1.307354 1.980008 7.637193 0.022000 1.306966 +1200 2734 1.307354 1.980037 7.639305 0.022000 1.306965 +1200 2735 1.307354 1.980184 7.650053 0.021890 1.306971 +1200 2736 1.307354 1.980257 7.655376 0.021738 1.306976 +1200 2737 1.307354 1.980197 7.650945 0.021738 1.306974 +1200 2738 1.307354 1.980263 7.655823 0.021738 1.306976 +1200 2739 1.307354 1.980219 7.652606 0.021738 1.306973 +1200 2740 1.307354 1.980201 7.651295 0.021738 1.306971 +1200 2741 1.307354 1.980083 7.642638 0.021738 1.306969 +1200 2742 1.307354 1.980161 7.648339 0.021738 1.306971 +1200 2743 1.307354 1.980241 7.654195 0.021738 1.306976 +1200 2744 1.307354 1.980152 7.647664 0.021738 1.306973 +1200 2745 1.307354 1.980232 
7.653528 0.021651 1.306979 +1200 2746 1.307354 1.980279 7.656965 0.021608 1.306981 +1200 2747 1.307354 1.980384 7.664755 0.021478 1.306985 +1200 2748 1.307354 1.980210 7.651932 0.021478 1.306974 +1200 2749 1.307354 1.980152 7.647693 0.021478 1.306972 +1200 2750 1.307354 1.980108 7.644479 0.021478 1.306970 +1200 2751 1.307354 1.980181 7.649792 0.021478 1.306972 +1200 2752 1.307354 1.980239 7.654071 0.021478 1.306979 +1200 2753 1.307354 1.980307 7.659083 0.021478 1.306985 +1200 2754 1.307354 1.980274 7.656623 0.021478 1.306984 +1200 2755 1.307354 1.980286 7.657508 0.021478 1.306985 +1200 2756 1.307354 1.980187 7.650229 0.021478 1.306977 +1200 2757 1.307354 1.980339 7.661424 0.021478 1.306984 +1200 2758 1.307354 1.980541 7.676357 0.021307 1.306991 +1200 2759 1.307354 1.980440 7.668904 0.021307 1.306984 +1200 2760 1.307354 1.980369 7.663629 0.021307 1.306982 +1200 2761 1.307354 1.980432 7.668260 0.021307 1.306987 +1200 2762 1.307354 1.980501 7.673438 0.021307 1.306989 +1200 2763 1.307354 1.980701 7.688370 0.021159 1.306996 +1200 2764 1.307354 1.980852 7.699753 0.020782 1.307009 +1200 2765 1.307354 1.980878 7.701733 0.020782 1.307008 +1200 2766 1.307354 1.980874 7.701412 0.020782 1.307009 +1200 2767 1.307354 1.980876 7.701563 0.020782 1.307005 +1200 2768 1.307354 1.980884 7.702134 0.020782 1.307006 +1200 2769 1.307354 1.980737 7.691080 0.020782 1.306999 +1200 2770 1.307354 1.980773 7.693747 0.020782 1.307001 +1200 2771 1.307354 1.980759 7.692700 0.020782 1.307000 +1200 2772 1.307354 1.980798 7.695647 0.020782 1.307004 +1200 2773 1.307354 1.980773 7.693795 0.020782 1.307002 +1200 2774 1.307354 1.980669 7.685923 0.020782 1.306992 +1200 2775 1.307354 1.980850 7.699615 0.020782 1.306999 +1200 2776 1.307354 1.980903 7.703591 0.020782 1.307003 +1200 2777 1.307354 1.980909 7.704096 0.020782 1.307002 +1200 2778 1.307354 1.980956 7.707601 0.020782 1.307003 +1200 2779 1.307354 1.981067 7.716130 0.020782 1.307008 +1200 2780 1.307354 1.981234 7.728955 0.020678 1.307013 +1200 2781 
1.307354 1.981197 7.726070 0.020678 1.307011 +1200 2782 1.307354 1.981209 7.727023 0.020678 1.307011 +1200 2783 1.307354 1.981275 7.732128 0.020678 1.307012 +1200 2784 1.307354 1.981351 7.738005 0.020616 1.307015 +1200 2785 1.307354 1.981406 7.742294 0.020493 1.307018 +1200 2786 1.307354 1.981488 7.748672 0.020411 1.307022 +1200 2787 1.307354 1.981534 7.752293 0.020350 1.307023 +1200 2788 1.307354 1.981437 7.744748 0.020188 1.307029 +1200 2789 1.307354 1.981540 7.752783 0.019987 1.307035 +1200 2790 1.307354 1.981547 7.753362 0.019987 1.307032 +1200 2791 1.307354 1.981644 7.760964 0.019967 1.307036 +1200 2792 1.307354 1.981709 7.766142 0.019848 1.307039 +1200 2793 1.307354 1.981530 7.751985 0.019848 1.307032 +1200 2794 1.307354 1.981840 7.776491 0.019828 1.307040 +1200 2795 1.307354 1.981901 7.781408 0.019788 1.307041 +1200 2796 1.307354 1.982003 7.789572 0.019690 1.307045 +1200 2797 1.307354 1.981872 7.779057 0.019690 1.307039 +1200 2798 1.307354 1.981985 7.788183 0.019631 1.307047 +1200 2799 1.307354 1.982037 7.792364 0.019474 1.307051 +1200 2800 1.307354 1.982202 7.805702 0.019146 1.307061 +1200 2801 1.307354 1.982200 7.805544 0.019146 1.307061 +1200 2802 1.307354 1.982138 7.800471 0.019146 1.307057 +1200 2803 1.307354 1.982102 7.797595 0.019146 1.307053 +1200 2804 1.307354 1.982155 7.801881 0.019146 1.307055 +1200 2805 1.307354 1.982170 7.803135 0.019146 1.307055 +1200 2806 1.307354 1.982180 7.803937 0.019146 1.307058 +1200 2807 1.307354 1.982266 7.810887 0.019127 1.307062 +1200 2808 1.307354 1.982166 7.802772 0.019127 1.307057 +1200 2809 1.307354 1.982282 7.812218 0.019127 1.307060 +1200 2810 1.307354 1.982338 7.816857 0.019127 1.307062 +1200 2811 1.307354 1.982496 7.829870 0.018956 1.307068 +1200 2812 1.307354 1.982552 7.834527 0.018805 1.307072 +1200 2813 1.307354 1.982508 7.830871 0.018805 1.307071 +1200 2814 1.307354 1.982398 7.821718 0.018805 1.307068 +1200 2815 1.307354 1.982697 7.846622 0.018433 1.307083 +1200 2816 1.307354 1.982746 7.850683 0.018414 
1.307084 +1200 2817 1.307354 1.982729 7.849232 0.018414 1.307080 +1200 2818 1.307354 1.982863 7.860572 0.018396 1.307084 +1200 2819 1.307354 1.982758 7.851677 0.018396 1.307079 +1200 2820 1.307354 1.982814 7.856419 0.018396 1.307081 +1200 2821 1.307354 1.983052 7.876637 0.018377 1.307085 +1200 2822 1.307354 1.982979 7.870388 0.018377 1.307078 +1200 2823 1.307354 1.983004 7.872495 0.018377 1.307077 +1200 2824 1.307354 1.982924 7.865688 0.018377 1.307075 +1200 2825 1.307354 1.982965 7.869186 0.018377 1.307078 +1200 2826 1.307354 1.983075 7.878567 0.018377 1.307083 +1200 2827 1.307354 1.983058 7.877167 0.018377 1.307083 +1200 2828 1.307354 1.983089 7.879804 0.018377 1.307084 +1200 2829 1.307354 1.983150 7.884975 0.018322 1.307086 +1200 2830 1.307354 1.983280 7.896264 0.018086 1.307093 +1200 2831 1.307354 1.983244 7.893095 0.018086 1.307092 +1200 2832 1.307354 1.983342 7.901625 0.018050 1.307094 +1200 2833 1.307354 1.983358 7.902993 0.017924 1.307098 +1200 2834 1.307354 1.983266 7.895051 0.017924 1.307094 +1200 2835 1.307354 1.983373 7.904289 0.017924 1.307096 +1200 2836 1.307354 1.983469 7.912726 0.017870 1.307099 +1200 2837 1.307354 1.983534 7.918395 0.017799 1.307102 +1200 2838 1.307354 1.983537 7.918650 0.017799 1.307102 +1200 2839 1.307354 1.983695 7.932679 0.017640 1.307106 +1200 2840 1.307354 1.983647 7.928411 0.017640 1.307105 +1200 2841 1.307354 1.983563 7.920994 0.017640 1.307102 +1200 2842 1.307354 1.983478 7.913511 0.017640 1.307101 +1200 2843 1.307354 1.983572 7.921801 0.017640 1.307103 +1200 2844 1.307354 1.983619 7.925877 0.017640 1.307103 +1200 2845 1.307354 1.983625 7.926476 0.017640 1.307101 +1200 2846 1.307354 1.983785 7.940690 0.017569 1.307108 +1200 2847 1.307354 1.983829 7.944593 0.017534 1.307109 +1200 2848 1.307354 1.983836 7.945231 0.017429 1.307112 +1200 2849 1.307354 1.983841 7.945673 0.017360 1.307114 +1200 2850 1.307354 1.983758 7.938290 0.017360 1.307113 +1200 2851 1.307354 1.983723 7.935156 0.017342 1.307114 +1200 2852 1.307354 1.983671 
7.930549 0.017342 1.307113 +1200 2853 1.307354 1.983722 7.935055 0.017342 1.307114 +1200 2854 1.307354 1.983709 7.933851 0.017342 1.307110 +1200 2855 1.307354 1.983728 7.935610 0.017342 1.307112 +1200 2856 1.307354 1.983807 7.942603 0.017342 1.307113 +1200 2857 1.307354 1.983732 7.935918 0.017342 1.307111 +1200 2858 1.307354 1.983757 7.938169 0.017342 1.307113 +1200 2859 1.307354 1.983576 7.922103 0.017342 1.307104 +1200 2860 1.307354 1.983752 7.937752 0.017342 1.307112 +1200 2861 1.307354 1.983925 7.953262 0.017291 1.307116 +1200 2862 1.307354 1.983894 7.950443 0.017273 1.307116 +1200 2863 1.307354 1.983854 7.946851 0.017256 1.307117 +1200 2864 1.307354 1.983842 7.945757 0.017256 1.307114 +1200 2865 1.307354 1.983825 7.944275 0.017256 1.307115 +1200 2866 1.307354 1.983868 7.948057 0.017256 1.307117 +1200 2867 1.307354 1.983893 7.950382 0.017187 1.307119 +1200 2868 1.307354 1.983848 7.946346 0.017187 1.307118 +1200 2869 1.307354 1.983969 7.957229 0.017084 1.307122 +1200 2870 1.307354 1.984095 7.968599 0.016948 1.307125 +1200 2871 1.307354 1.984224 7.980468 0.016696 1.307132 +1200 2872 1.307354 1.984263 7.983997 0.016514 1.307137 +1200 2873 1.307354 1.984205 7.978679 0.016514 1.307136 +1200 2874 1.307354 1.984252 7.982980 0.016497 1.307137 +1200 2875 1.307354 1.984252 7.982953 0.016497 1.307137 +1200 2876 1.307354 1.984458 8.002045 0.016284 1.307143 +1200 2877 1.307354 1.984562 8.011766 0.016171 1.307146 +1200 2878 1.307354 1.984619 8.017124 0.016074 1.307148 +1200 2879 1.307354 1.984689 8.023711 0.015978 1.307151 +1200 2880 1.307354 1.984577 8.013179 0.015978 1.307146 +1200 2881 1.307354 1.984628 8.017985 0.015978 1.307149 +1200 2882 1.307354 1.984590 8.014423 0.015978 1.307146 +1200 2883 1.307354 1.984613 8.016623 0.015978 1.307146 +1200 2884 1.307354 1.984727 8.027326 0.015835 1.307155 +1200 2885 1.307354 1.984684 8.023308 0.015835 1.307152 +1200 2886 1.307354 1.984699 8.024727 0.015835 1.307154 +1200 2887 1.307354 1.984688 8.023626 0.015835 1.307151 +1200 2888 
1.307354 1.984680 8.022895 0.015835 1.307154 +1200 2889 1.307354 1.984641 8.019217 0.015740 1.307157 +1200 2890 1.307354 1.984561 8.011718 0.015740 1.307156 +1200 2891 1.307354 1.984575 8.013018 0.015740 1.307157 +1200 2892 1.307354 1.984461 8.002363 0.015740 1.307154 +1200 2893 1.307354 1.984524 8.008267 0.015740 1.307155 +1200 2894 1.307354 1.984557 8.011312 0.015740 1.307156 +1200 2895 1.307354 1.984649 8.019981 0.015630 1.307160 +1200 2896 1.307354 1.984731 8.027688 0.015568 1.307161 +1200 2897 1.307354 1.984947 8.048327 0.015352 1.307167 +1200 2898 1.307354 1.985034 8.056726 0.015184 1.307171 +1200 2899 1.307354 1.985018 8.055197 0.015184 1.307169 +1200 2900 1.307354 1.985019 8.055289 0.015184 1.307167 +1200 2901 1.307354 1.984981 8.051629 0.015184 1.307168 +1200 2902 1.307354 1.984930 8.046759 0.015184 1.307167 +1200 2903 1.307354 1.984991 8.052642 0.015184 1.307168 +1200 2904 1.307354 1.984974 8.050985 0.015184 1.307168 +1200 2905 1.307354 1.985037 8.057028 0.015184 1.307170 +1200 2906 1.307354 1.984954 8.049076 0.015184 1.307167 +1200 2907 1.307354 1.984901 8.043950 0.015184 1.307168 +1200 2908 1.307354 1.985028 8.056220 0.015184 1.307170 +1200 2909 1.307354 1.985321 8.084758 0.014794 1.307180 +1200 2910 1.307354 1.985257 8.078518 0.014794 1.307180 +1200 2911 1.307354 1.985304 8.083088 0.014794 1.307180 +1200 2912 1.307354 1.985249 8.077665 0.014794 1.307179 +1200 2913 1.307354 1.985261 8.078932 0.014750 1.307181 +1200 2914 1.307354 1.985339 8.086528 0.014632 1.307184 +1200 2915 1.307354 1.985433 8.095874 0.014559 1.307186 +1200 2916 1.307354 1.985378 8.090440 0.014559 1.307184 +1200 2917 1.307354 1.985433 8.095859 0.014559 1.307185 +1200 2918 1.307354 1.985479 8.100503 0.014545 1.307186 +1200 2919 1.307354 1.985715 8.124161 0.014186 1.307194 +1200 2920 1.307354 1.985637 8.116346 0.014186 1.307192 +1200 2921 1.307354 1.985838 8.136718 0.013975 1.307199 +1200 2922 1.307354 1.985927 8.145810 0.013891 1.307201 +1200 2923 1.307354 1.985900 8.143033 0.013864 
1.307201 +1200 2924 1.307354 1.985853 8.138278 0.013864 1.307199 +1200 2925 1.307354 1.985868 8.139737 0.013864 1.307199 +1200 2926 1.307354 1.986030 8.156513 0.013808 1.307203 +1200 2927 1.307354 1.986059 8.159520 0.013781 1.307203 +1200 2928 1.307354 1.986100 8.163797 0.013657 1.307206 +1200 2929 1.307354 1.985958 8.149073 0.013657 1.307201 +1200 2930 1.307354 1.985992 8.152544 0.013657 1.307202 +1200 2931 1.307354 1.986097 8.163417 0.013657 1.307204 +1200 2932 1.307354 1.986215 8.175791 0.013657 1.307203 +1200 2933 1.307354 1.986108 8.164550 0.013657 1.307201 +1200 2934 1.307354 1.986195 8.173694 0.013657 1.307202 +1200 2935 1.307354 1.986329 8.187767 0.013657 1.307206 +1200 2936 1.307354 1.986259 8.180456 0.013657 1.307204 +1200 2937 1.307354 1.986362 8.191262 0.013657 1.307206 +1200 2938 1.307354 1.986368 8.191981 0.013603 1.307207 +1200 2939 1.307354 1.986417 8.197163 0.013589 1.307207 +1200 2940 1.307354 1.986307 8.185498 0.013589 1.307204 +1200 2941 1.307354 1.986283 8.182970 0.013589 1.307204 +1200 2942 1.307354 1.986208 8.175024 0.013589 1.307203 +1200 2943 1.307354 1.986241 8.178481 0.013589 1.307204 +1200 2944 1.307354 1.986128 8.166654 0.013589 1.307203 +1200 2945 1.307354 1.986072 8.160821 0.013589 1.307202 +1200 2946 1.307354 1.986154 8.169395 0.013589 1.307206 +1200 2947 1.307354 1.986251 8.179538 0.013481 1.307210 +1200 2948 1.307354 1.986201 8.174265 0.013481 1.307209 +1200 2949 1.307354 1.986041 8.157651 0.013481 1.307205 +1200 2950 1.307354 1.986174 8.171476 0.013481 1.307209 +1200 2951 1.307354 1.986106 8.164379 0.013481 1.307206 +1200 2952 1.307354 1.986152 8.169148 0.013481 1.307205 +1200 2953 1.307354 1.986220 8.176351 0.013481 1.307207 +1200 2954 1.307354 1.986331 8.187997 0.013441 1.307210 +1200 2955 1.307354 1.986470 8.202856 0.013320 1.307213 +1200 2956 1.307354 1.986418 8.197230 0.013320 1.307212 +1200 2957 1.307354 1.986328 8.187641 0.013320 1.307211 +1200 2958 1.307354 1.986420 8.197420 0.013307 1.307213 +1200 2959 1.307354 1.986453 
8.200949 0.013294 1.307214 +1200 2960 1.307354 1.986397 8.194970 0.013294 1.307212 +1200 2961 1.307354 1.986437 8.199312 0.013294 1.307214 +1200 2962 1.307354 1.986481 8.204004 0.013280 1.307214 +1200 2963 1.307354 1.986571 8.213659 0.013201 1.307216 +1200 2964 1.307354 1.986503 8.206343 0.013201 1.307215 +1200 2965 1.307354 1.986549 8.211256 0.013161 1.307217 +1200 2966 1.307354 1.986769 8.235180 0.012722 1.307226 +1200 2967 1.307354 1.986814 8.240098 0.012684 1.307226 +1200 2968 1.307354 1.986925 8.252350 0.012608 1.307228 +1200 2969 1.307354 1.987017 8.262529 0.012420 1.307232 +1200 2970 1.307354 1.987022 8.263101 0.012420 1.307232 +1200 2971 1.307354 1.986969 8.257188 0.012420 1.307231 +1200 2972 1.307354 1.987080 8.269571 0.012408 1.307232 +1200 2973 1.307354 1.987270 8.291044 0.012102 1.307238 +1200 2974 1.307354 1.987210 8.284275 0.012102 1.307236 +1200 2975 1.307354 1.987275 8.291596 0.012102 1.307238 +1200 2976 1.307354 1.987105 8.272348 0.012102 1.307235 +1200 2977 1.307354 1.987003 8.261012 0.012102 1.307232 +1200 2978 1.307354 1.987085 8.270144 0.012102 1.307236 +1200 2979 1.307354 1.987238 8.287383 0.012041 1.307239 +1200 2980 1.307354 1.987180 8.280799 0.012041 1.307237 +1200 2981 1.307354 1.987211 8.284315 0.012041 1.307239 +1200 2982 1.307354 1.987112 8.273156 0.012041 1.307234 +1200 2983 1.307354 1.987252 8.288973 0.012029 1.307239 +1200 2984 1.307354 1.987427 8.308951 0.011815 1.307243 +1200 2985 1.307354 1.987457 8.312415 0.011791 1.307244 +1200 2986 1.307354 1.987415 8.307617 0.011791 1.307244 +1200 2987 1.307354 1.987538 8.321836 0.011639 1.307247 +1200 2988 1.307354 1.987533 8.321290 0.011604 1.307247 +1200 2989 1.307354 1.987596 8.328599 0.011454 1.307250 +1200 2990 1.307354 1.987523 8.320117 0.011454 1.307248 +1200 2991 1.307354 1.987607 8.329852 0.011454 1.307250 +1200 2992 1.307354 1.987494 8.316699 0.011454 1.307249 +1200 2993 1.307354 1.987545 8.322616 0.011454 1.307250 +1200 2994 1.307354 1.987526 8.320396 0.011454 1.307248 +1200 2995 
1.307354 1.987526 8.320425 0.011454 1.307248 +1200 2996 1.307354 1.987601 8.329160 0.011454 1.307249 +1200 2997 1.307354 1.987619 8.331266 0.011454 1.307249 +1200 2998 1.307354 1.987732 8.344530 0.011397 1.307251 +1200 2999 1.307354 1.987722 8.343328 0.011397 1.307250 +1200 3000 1.307354 1.987757 8.347482 0.011397 1.307250 +1200 3001 1.307354 1.987701 8.340915 0.011397 1.307249 +1200 3002 1.307354 1.987752 8.346829 0.011397 1.307250 +1200 3003 1.307354 1.987664 8.336573 0.011397 1.307248 +1200 3004 1.307354 1.987809 8.353603 0.011397 1.307250 +1200 3005 1.307354 1.987872 8.361167 0.011397 1.307251 +1200 3006 1.307354 1.987829 8.356021 0.011397 1.307249 +1200 3007 1.307354 1.987848 8.358221 0.011397 1.307250 +1200 3008 1.307354 1.988090 8.387381 0.011083 1.307257 +1200 3009 1.307354 1.988080 8.386203 0.011083 1.307256 +1200 3010 1.307354 1.988161 8.396084 0.011038 1.307257 +1200 3011 1.307354 1.988385 8.423639 0.010788 1.307262 +1200 3012 1.307354 1.988501 8.438196 0.010511 1.307267 +1200 3013 1.307354 1.988477 8.435234 0.010511 1.307266 +1200 3014 1.307354 1.988513 8.439692 0.010511 1.307266 +1200 3015 1.307354 1.988551 8.444536 0.010469 1.307267 +1200 3016 1.307354 1.988498 8.437783 0.010469 1.307267 +1200 3017 1.307354 1.988523 8.440923 0.010469 1.307267 +1200 3018 1.307354 1.988490 8.436866 0.010469 1.307267 +1200 3019 1.307354 1.988585 8.448808 0.010406 1.307268 +1200 3020 1.307354 1.988643 8.456143 0.010344 1.307269 +1200 3021 1.307354 1.988681 8.461074 0.010251 1.307271 +1200 3022 1.307354 1.988682 8.461159 0.010251 1.307271 +1200 3023 1.307354 1.988651 8.457167 0.010251 1.307269 +1200 3024 1.307354 1.988606 8.451465 0.010251 1.307268 +1200 3025 1.307354 1.988506 8.438778 0.010251 1.307266 +1200 3026 1.307354 1.988516 8.440028 0.010251 1.307268 +1200 3027 1.307354 1.988709 8.464628 0.010200 1.307272 +1200 3028 1.307354 1.988693 8.462565 0.010200 1.307271 +1200 3029 1.307354 1.988699 8.463295 0.010200 1.307271 +1200 3030 1.307354 1.988659 8.458187 0.010200 
1.307271 +1200 3031 1.307354 1.988645 8.456425 0.010200 1.307270 +1200 3032 1.307354 1.988747 8.469460 0.010149 1.307272 +1200 3033 1.307354 1.988834 8.480671 0.010149 1.307272 +1200 3034 1.307354 1.988949 8.495743 0.010028 1.307274 +1200 3035 1.307354 1.988971 8.498624 0.010028 1.307274 +1200 3036 1.307354 1.989076 8.512429 0.009850 1.307277 +1200 3037 1.307354 1.989033 8.506741 0.009850 1.307276 +1200 3038 1.307354 1.989125 8.518969 0.009801 1.307278 +1200 3039 1.307354 1.989158 8.523362 0.009732 1.307279 +1200 3040 1.307354 1.989100 8.515622 0.009732 1.307279 +1200 3041 1.307354 1.989097 8.515211 0.009732 1.307278 +1200 3042 1.307354 1.989088 8.513941 0.009732 1.307277 +1200 3043 1.307354 1.989105 8.516293 0.009732 1.307277 +1200 3044 1.307354 1.989048 8.508704 0.009732 1.307276 +1200 3045 1.307354 1.989115 8.517621 0.009732 1.307278 +1200 3046 1.307354 1.989158 8.523281 0.009732 1.307277 +1200 3047 1.307354 1.989192 8.527878 0.009732 1.307278 +1200 3048 1.307354 1.989195 8.528294 0.009723 1.307279 +1200 3049 1.307354 1.989100 8.515567 0.009723 1.307278 +1200 3050 1.307354 1.989137 8.520459 0.009713 1.307279 +1200 3051 1.307354 1.989235 8.533680 0.009693 1.307280 +1200 3052 1.307354 1.989241 8.534464 0.009693 1.307280 +1200 3053 1.307354 1.989288 8.540807 0.009693 1.307279 +1200 3054 1.307354 1.989247 8.535195 0.009693 1.307278 +1200 3055 1.307354 1.989403 8.556309 0.009674 1.307280 +1200 3056 1.307354 1.989544 8.575713 0.009416 1.307284 +1200 3057 1.307354 1.989608 8.584612 0.009304 1.307286 +1200 3058 1.307354 1.989707 8.598503 0.009202 1.307287 +1200 3059 1.307354 1.989711 8.599014 0.009202 1.307287 +1200 3060 1.307354 1.989596 8.582931 0.009202 1.307284 +1200 3061 1.307354 1.989682 8.594905 0.009202 1.307286 +1200 3062 1.307354 1.989706 8.598384 0.009202 1.307287 +1200 3063 1.307354 1.989803 8.612082 0.009084 1.307289 +1200 3064 1.307354 1.989891 8.624523 0.008842 1.307292 +1200 3065 1.307354 1.989875 8.622293 0.008842 1.307292 +1200 3066 1.307354 1.989941 
8.631785 0.008736 1.307294 +1200 3067 1.307354 1.990035 8.645330 0.008589 1.307296 +1200 3068 1.307354 1.990116 8.657127 0.008344 1.307299 +1200 3069 1.307354 1.990128 8.658880 0.008327 1.307299 +1200 3070 1.307354 1.990126 8.658585 0.008327 1.307299 +1200 3071 1.307354 1.989993 8.639162 0.008327 1.307296 +1200 3072 1.307354 1.990039 8.645898 0.008327 1.307297 +1200 3073 1.307354 1.989984 8.637990 0.008327 1.307296 +1200 3074 1.307354 1.990022 8.643448 0.008327 1.307296 +1200 3075 1.307354 1.990099 8.654684 0.008327 1.307297 +1200 3076 1.307354 1.990164 8.664159 0.008327 1.307299 +1200 3077 1.307354 1.990167 8.664657 0.008327 1.307298 +1200 3078 1.307354 1.990235 8.674617 0.008327 1.307299 +1200 3079 1.307354 1.990243 8.675818 0.008327 1.307299 +1200 3080 1.307354 1.990363 8.693736 0.008138 1.307302 +1200 3081 1.307354 1.990456 8.707808 0.008009 1.307303 +1200 3082 1.307354 1.990544 8.721083 0.007913 1.307305 +1200 3083 1.307354 1.990524 8.718017 0.007913 1.307303 +1200 3084 1.307354 1.990465 8.709123 0.007913 1.307302 +1200 3085 1.307354 1.990474 8.710510 0.007913 1.307303 +1200 3086 1.307354 1.990415 8.701534 0.007913 1.307301 +1200 3087 1.307354 1.990453 8.707356 0.007913 1.307302 +1200 3088 1.307354 1.990466 8.709265 0.007913 1.307302 +1200 3089 1.307354 1.990407 8.700307 0.007913 1.307301 +1200 3090 1.307354 1.990346 8.691170 0.007913 1.307302 +1200 3091 1.307354 1.990210 8.670878 0.007913 1.307300 +1200 3092 1.307354 1.990207 8.670431 0.007913 1.307300 +1200 3093 1.307354 1.990228 8.673574 0.007913 1.307301 +1200 3094 1.307354 1.990231 8.674110 0.007913 1.307302 +1200 3095 1.307354 1.990231 8.673976 0.007913 1.307302 +1200 3096 1.307354 1.990285 8.682050 0.007913 1.307303 +1200 3097 1.307354 1.990418 8.702069 0.007890 1.307305 +1200 3098 1.307354 1.990382 8.696583 0.007890 1.307304 +1200 3099 1.307354 1.990381 8.696470 0.007890 1.307304 +1200 3100 1.307354 1.990383 8.696722 0.007890 1.307303 +1200 3101 1.307354 1.990372 8.695137 0.007890 1.307302 +1200 3102 
1.307354 1.990306 8.685266 0.007890 1.307300 +1200 3103 1.307354 1.990399 8.699152 0.007890 1.307301 +1200 3104 1.307354 1.990551 8.722224 0.007890 1.307304 +1200 3105 1.307354 1.990643 8.736322 0.007764 1.307306 +1200 3106 1.307354 1.990615 8.732068 0.007764 1.307306 +1200 3107 1.307354 1.990564 8.724135 0.007764 1.307306 +1200 3108 1.307354 1.990540 8.720511 0.007764 1.307305 +1200 3109 1.307354 1.990518 8.717170 0.007764 1.307305 +1200 3110 1.307354 1.990464 8.709008 0.007764 1.307305 +1200 3111 1.307354 1.990493 8.713310 0.007764 1.307306 +1200 3112 1.307354 1.990508 8.715655 0.007764 1.307306 +1200 3113 1.307354 1.990552 8.722440 0.007764 1.307306 +1200 3114 1.307354 1.990522 8.717843 0.007764 1.307306 +1200 3115 1.307354 1.990632 8.734633 0.007634 1.307308 +1200 3116 1.307354 1.990668 8.740273 0.007558 1.307309 +1200 3117 1.307354 1.990627 8.733894 0.007558 1.307307 +1200 3118 1.307354 1.990500 8.714376 0.007558 1.307305 +1200 3119 1.307354 1.990494 8.713547 0.007558 1.307304 +1200 3120 1.307354 1.990491 8.713010 0.007558 1.307305 +1200 3121 1.307354 1.990551 8.722162 0.007558 1.307305 +1200 3122 1.307354 1.990507 8.715562 0.007558 1.307305 +1200 3123 1.307354 1.990391 8.697907 0.007558 1.307303 +1200 3124 1.307354 1.990455 8.707644 0.007558 1.307303 +1200 3125 1.307354 1.990443 8.705846 0.007558 1.307303 +1200 3126 1.307354 1.990415 8.701588 0.007558 1.307303 +1200 3127 1.307354 1.990463 8.708739 0.007558 1.307303 +1200 3128 1.307354 1.990449 8.706657 0.007558 1.307303 +1200 3129 1.307354 1.990339 8.690162 0.007558 1.307303 +1200 3130 1.307354 1.990353 8.692265 0.007558 1.307303 +1200 3131 1.307354 1.990401 8.699406 0.007558 1.307305 +1200 3132 1.307354 1.990449 8.706676 0.007558 1.307307 +1200 3133 1.307354 1.990490 8.712933 0.007558 1.307307 +1200 3134 1.307354 1.990497 8.713909 0.007558 1.307307 +1200 3135 1.307354 1.990543 8.720971 0.007558 1.307308 +1200 3136 1.307354 1.990550 8.722052 0.007558 1.307308 +1200 3137 1.307354 1.990580 8.726603 0.007558 
1.307309 +1200 3138 1.307354 1.990594 8.728779 0.007558 1.307309 +1200 3139 1.307354 1.990549 8.721932 0.007558 1.307307 +1200 3140 1.307354 1.990547 8.721550 0.007558 1.307307 +1200 3141 1.307354 1.990544 8.721143 0.007558 1.307308 +1200 3142 1.307354 1.990603 8.730244 0.007558 1.307308 +1200 3143 1.307354 1.990617 8.732380 0.007558 1.307308 +1200 3144 1.307354 1.990518 8.717128 0.007558 1.307308 +1200 3145 1.307354 1.990499 8.714345 0.007558 1.307307 +1200 3146 1.307354 1.990563 8.724110 0.007558 1.307308 +1200 3147 1.307354 1.990646 8.736880 0.007483 1.307310 +1200 3148 1.307354 1.990684 8.742667 0.007386 1.307311 +1200 3149 1.307354 1.990652 8.737687 0.007386 1.307310 +1200 3150 1.307354 1.990735 8.750683 0.007342 1.307311 +1200 3151 1.307354 1.990749 8.752782 0.007342 1.307311 +1200 3152 1.307354 1.990744 8.752053 0.007342 1.307311 +1200 3153 1.307354 1.990739 8.751321 0.007342 1.307311 +1200 3154 1.307354 1.990734 8.750494 0.007342 1.307309 +1200 3155 1.307354 1.990814 8.763099 0.007342 1.307310 +1200 3156 1.307354 1.990859 8.770103 0.007342 1.307311 +1200 3157 1.307354 1.990883 8.773906 0.007342 1.307310 +1200 3158 1.307354 1.990889 8.774810 0.007342 1.307311 +1200 3159 1.307354 1.990921 8.779987 0.007342 1.307310 +1200 3160 1.307354 1.990897 8.776141 0.007342 1.307310 +1200 3161 1.307354 1.990938 8.782757 0.007342 1.307311 +1200 3162 1.307354 1.990929 8.781228 0.007342 1.307310 +1200 3163 1.307354 1.991023 8.796357 0.007283 1.307312 +1200 3164 1.307354 1.991004 8.793267 0.007283 1.307312 +1200 3165 1.307354 1.991021 8.795914 0.007276 1.307312 +1200 3166 1.307354 1.991092 8.807530 0.007168 1.307313 +1200 3167 1.307354 1.991067 8.803452 0.007168 1.307313 +1200 3168 1.307354 1.991108 8.810009 0.007161 1.307314 +1200 3169 1.307354 1.991204 8.825700 0.007012 1.307315 +1200 3170 1.307354 1.991175 8.820940 0.007012 1.307315 +1200 3171 1.307354 1.991178 8.821572 0.007012 1.307315 +1200 3172 1.307354 1.991256 8.834307 0.006970 1.307316 +1200 3173 1.307354 1.991248 
8.833046 0.006928 1.307316 +1200 3174 1.307354 1.991329 8.846468 0.006846 1.307317 +1200 3175 1.307354 1.991416 8.861031 0.006724 1.307318 +1200 3176 1.307354 1.991461 8.868568 0.006690 1.307319 +1200 3177 1.307354 1.991390 8.856674 0.006690 1.307317 +1200 3178 1.307354 1.991402 8.858751 0.006690 1.307317 +1200 3179 1.307354 1.991494 8.874232 0.006690 1.307318 +1200 3180 1.307354 1.991483 8.872381 0.006677 1.307319 +1200 3181 1.307354 1.991552 8.884109 0.006545 1.307320 +1200 3182 1.307354 1.991739 8.916453 0.006244 1.307323 +1200 3183 1.307354 1.991693 8.908535 0.006244 1.307323 +1200 3184 1.307354 1.991630 8.897531 0.006244 1.307322 +1200 3185 1.307354 1.991608 8.893817 0.006244 1.307322 +1200 3186 1.307354 1.991583 8.889494 0.006244 1.307321 +1200 3187 1.307354 1.991525 8.879525 0.006244 1.307321 +1200 3188 1.307354 1.991529 8.880117 0.006244 1.307321 +1200 3189 1.307354 1.991497 8.874713 0.006244 1.307321 +1200 3190 1.307354 1.991412 8.860322 0.006244 1.307320 +1200 3191 1.307354 1.991467 8.869672 0.006244 1.307321 +1200 3192 1.307354 1.991495 8.874463 0.006244 1.307322 +1200 3193 1.307354 1.991484 8.872537 0.006244 1.307322 +1200 3194 1.307354 1.991417 8.861172 0.006244 1.307321 +1200 3195 1.307354 1.991327 8.846189 0.006244 1.307320 +1200 3196 1.307354 1.991306 8.842583 0.006244 1.307320 +1200 3197 1.307354 1.991382 8.855314 0.006244 1.307321 +1200 3198 1.307354 1.991420 8.861774 0.006244 1.307321 +1200 3199 1.307354 1.991475 8.870995 0.006244 1.307322 +1200 3200 1.307354 1.991519 8.878526 0.006244 1.307322 +1200 3201 1.307354 1.991500 8.875273 0.006244 1.307322 +1200 3202 1.307354 1.991584 8.889667 0.006244 1.307323 +1200 3203 1.307354 1.991601 8.892542 0.006232 1.307323 +1200 3204 1.307354 1.991554 8.884542 0.006232 1.307323 +1200 3205 1.307354 1.991570 8.887183 0.006232 1.307323 +1200 3206 1.307354 1.991613 8.894686 0.006145 1.307324 +1200 3207 1.307354 1.991572 8.887595 0.006145 1.307323 +1200 3208 1.307354 1.991577 8.888378 0.006145 1.307323 +1200 3209 
1.307354 1.991609 8.893842 0.006145 1.307324 +1200 3210 1.307354 1.991641 8.899488 0.006133 1.307324 +1200 3211 1.307354 1.991788 8.925056 0.006030 1.307325 +1200 3212 1.307354 1.991780 8.923668 0.006030 1.307325 +1200 3213 1.307354 1.991867 8.939125 0.005910 1.307326 +1200 3214 1.307354 1.991902 8.945316 0.005875 1.307327 +1200 3215 1.307354 1.992022 8.966958 0.005741 1.307328 +1200 3216 1.307354 1.992034 8.969053 0.005736 1.307328 +1200 3217 1.307354 1.992036 8.969366 0.005730 1.307328 +1200 3218 1.307354 1.991918 8.948144 0.005730 1.307327 +1200 3219 1.307354 1.991966 8.956820 0.005730 1.307328 +1200 3220 1.307354 1.991973 8.958067 0.005730 1.307327 +1200 3221 1.307354 1.992021 8.966682 0.005730 1.307327 +1200 3222 1.307354 1.992015 8.965668 0.005730 1.307327 +1200 3223 1.307354 1.992073 8.976211 0.005713 1.307328 +1200 3224 1.307354 1.992167 8.993380 0.005622 1.307329 +1200 3225 1.307354 1.992219 9.003038 0.005583 1.307329 +1200 3226 1.307354 1.992241 9.007087 0.005583 1.307329 +1200 3227 1.307354 1.992181 8.996066 0.005583 1.307329 +1200 3228 1.307354 1.992196 8.998764 0.005583 1.307329 +1200 3229 1.307354 1.992245 9.007893 0.005583 1.307329 +1200 3230 1.307354 1.992259 9.010395 0.005583 1.307328 +1200 3231 1.307354 1.992254 9.009477 0.005583 1.307329 +1200 3232 1.307354 1.992208 9.000892 0.005583 1.307327 +1200 3233 1.307354 1.992288 9.015855 0.005583 1.307328 +1200 3234 1.307354 1.992237 9.006372 0.005583 1.307327 +1200 3235 1.307354 1.992319 9.021717 0.005583 1.307328 +1200 3236 1.307354 1.992409 9.038708 0.005583 1.307329 +1200 3237 1.307354 1.992389 9.034988 0.005583 1.307329 +1200 3238 1.307354 1.992369 9.031094 0.005583 1.307329 +1200 3239 1.307354 1.992332 9.024135 0.005583 1.307328 +1200 3240 1.307354 1.992312 9.020444 0.005583 1.307327 +1200 3241 1.307354 1.992287 9.015686 0.005583 1.307327 +1200 3242 1.307354 1.992241 9.007139 0.005583 1.307327 +1200 3243 1.307354 1.992265 9.011681 0.005583 1.307327 +1200 3244 1.307354 1.992287 9.015670 0.005583 
1.307328 +1200 3245 1.307354 1.992247 9.008267 0.005583 1.307327 +1200 3246 1.307354 1.992213 9.001950 0.005583 1.307327 +1200 3247 1.307354 1.992147 8.989628 0.005583 1.307326 +1200 3248 1.307354 1.992264 9.011370 0.005583 1.307327 +1200 3249 1.307354 1.992182 8.996140 0.005583 1.307327 +1200 3250 1.307354 1.992204 9.000314 0.005583 1.307327 +1200 3251 1.307354 1.992168 8.993544 0.005583 1.307327 +1200 3252 1.307354 1.992130 8.986525 0.005583 1.307327 +1200 3253 1.307354 1.992140 8.988424 0.005583 1.307327 +1200 3254 1.307354 1.992185 8.996735 0.005583 1.307326 +1200 3255 1.307354 1.992322 9.022233 0.005583 1.307328 +1200 3256 1.307354 1.992393 9.035763 0.005583 1.307329 +1200 3257 1.307354 1.992353 9.028214 0.005583 1.307329 +1200 3258 1.307354 1.992299 9.017965 0.005583 1.307328 +1200 3259 1.307354 1.992342 9.026104 0.005583 1.307328 +1200 3260 1.307354 1.992353 9.028161 0.005583 1.307329 +1200 3261 1.307354 1.992244 9.007710 0.005583 1.307328 +1200 3262 1.307354 1.992267 9.011874 0.005583 1.307329 +1200 3263 1.307354 1.992094 8.979939 0.005583 1.307326 +1200 3264 1.307354 1.992086 8.978607 0.005583 1.307325 +1200 3265 1.307354 1.992147 8.989781 0.005583 1.307326 +1200 3266 1.307354 1.992197 8.998860 0.005583 1.307327 +1200 3267 1.307354 1.992182 8.996192 0.005583 1.307327 +1200 3268 1.307354 1.992153 8.990794 0.005583 1.307326 +1200 3269 1.307354 1.992252 9.009148 0.005583 1.307327 +1200 3270 1.307354 1.992366 9.030577 0.005583 1.307329 +1200 3271 1.307354 1.992309 9.019781 0.005583 1.307328 +1200 3272 1.307354 1.992308 9.019583 0.005583 1.307328 +1200 3273 1.307354 1.992292 9.016570 0.005583 1.307328 +1200 3274 1.307354 1.992225 9.004149 0.005583 1.307328 +1200 3275 1.307354 1.992235 9.006064 0.005583 1.307328 +1200 3276 1.307354 1.992241 9.007130 0.005583 1.307328 +1200 3277 1.307354 1.992312 9.020471 0.005583 1.307328 +1200 3278 1.307354 1.992337 9.025070 0.005583 1.307329 +1200 3279 1.307354 1.992358 9.029052 0.005583 1.307329 +1200 3280 1.307354 1.992388 
9.034716 0.005478 1.307330 +1200 3281 1.307354 1.992382 9.033534 0.005478 1.307330 +1200 3282 1.307354 1.992428 9.042381 0.005445 1.307331 +1200 3283 1.307354 1.992415 9.039934 0.005445 1.307330 +1200 3284 1.307354 1.992457 9.047875 0.005445 1.307330 +1200 3285 1.307354 1.992543 9.064493 0.005407 1.307331 +1200 3286 1.307354 1.992611 9.077737 0.005306 1.307332 +1200 3287 1.307354 1.992756 9.106340 0.005047 1.307334 +1200 3288 1.307354 1.992740 9.103211 0.005047 1.307334 +1200 3289 1.307354 1.992886 9.132511 0.004854 1.307335 +1200 3290 1.307354 1.992901 9.135540 0.004758 1.307336 +1200 3291 1.307354 1.992816 9.118474 0.004758 1.307336 +1200 3292 1.307354 1.992840 9.123322 0.004758 1.307336 +1200 3293 1.307354 1.992724 9.100016 0.004758 1.307334 +1200 3294 1.307354 1.992752 9.105651 0.004758 1.307334 +1200 3295 1.307354 1.992773 9.109869 0.004758 1.307335 +1200 3296 1.307354 1.992766 9.108447 0.004758 1.307335 +1200 3297 1.307354 1.992770 9.109249 0.004758 1.307335 +1200 3298 1.307354 1.992829 9.121017 0.004758 1.307336 +1200 3299 1.307354 1.992892 9.133789 0.004758 1.307336 +1200 3300 1.307354 1.993006 9.157066 0.004706 1.307337 +1200 3301 1.307354 1.992934 9.142252 0.004706 1.307336 +1200 3302 1.307354 1.992925 9.140416 0.004706 1.307336 +1200 3303 1.307354 1.992925 9.140416 0.004706 1.307336 +1200 3304 1.307354 1.992987 9.153193 0.004659 1.307337 +1200 3305 1.307354 1.993005 9.156854 0.004622 1.307337 +1200 3306 1.307354 1.992978 9.151273 0.004617 1.307337 +1200 3307 1.307354 1.992959 9.147423 0.004617 1.307337 +1200 3308 1.307354 1.992969 9.149604 0.004617 1.307337 +1200 3309 1.307354 1.993062 9.168669 0.004490 1.307338 +1200 3310 1.307354 1.993035 9.163087 0.004490 1.307338 +1200 3311 1.307354 1.993061 9.168583 0.004490 1.307338 +1200 3312 1.307354 1.993014 9.158745 0.004490 1.307338 +1200 3313 1.307354 1.992965 9.148775 0.004490 1.307337 +1200 3314 1.307354 1.993009 9.157802 0.004490 1.307338 +1200 3315 1.307354 1.992971 9.149983 0.004490 1.307338 +1200 3316 
1.307354 1.992865 9.128324 0.004490 1.307337 +1200 3317 1.307354 1.992932 9.141860 0.004490 1.307337 +1200 3318 1.307354 1.993077 9.171895 0.004388 1.307339 +1200 3319 1.307354 1.993026 9.161203 0.004388 1.307338 +1200 3320 1.307354 1.993080 9.172451 0.004388 1.307338 +1200 3321 1.307354 1.993128 9.182555 0.004388 1.307338 +1200 3322 1.307354 1.993244 9.207261 0.004366 1.307339 +1200 3323 1.307354 1.993315 9.222490 0.004310 1.307339 +1200 3324 1.307354 1.993394 9.239644 0.004241 1.307340 +1200 3325 1.307354 1.993312 9.221879 0.004241 1.307339 +1200 3326 1.307354 1.993334 9.226495 0.004241 1.307339 +1200 3327 1.307354 1.993380 9.236568 0.004241 1.307339 +1200 3328 1.307354 1.993299 9.219029 0.004241 1.307339 +1200 3329 1.307354 1.993301 9.219468 0.004241 1.307339 +1200 3330 1.307354 1.993313 9.222088 0.004241 1.307339 +1200 3331 1.307354 1.993220 9.202011 0.004241 1.307338 +1200 3332 1.307354 1.993224 9.202839 0.004241 1.307338 +1200 3333 1.307354 1.993280 9.214920 0.004241 1.307338 +1200 3334 1.307354 1.993386 9.237841 0.004241 1.307339 +1200 3335 1.307354 1.993470 9.256401 0.004241 1.307340 +1200 3336 1.307354 1.993516 9.266561 0.004199 1.307340 +1200 3337 1.307354 1.993525 9.268571 0.004157 1.307340 +1200 3338 1.307354 1.993549 9.273956 0.004149 1.307340 +1200 3339 1.307354 1.993608 9.287167 0.004091 1.307341 +1200 3340 1.307354 1.993614 9.288598 0.004075 1.307341 +1200 3341 1.307354 1.993645 9.295492 0.004043 1.307341 +1200 3342 1.307354 1.993640 9.294564 0.004043 1.307341 +1200 3343 1.307354 1.993673 9.301987 0.004043 1.307341 +1200 3344 1.307354 1.993679 9.303416 0.004043 1.307341 +1200 3345 1.307354 1.993709 9.310314 0.003982 1.307341 +1200 3346 1.307354 1.993753 9.320281 0.003834 1.307342 +1200 3347 1.307354 1.993837 9.340000 0.003684 1.307343 +1200 3348 1.307354 1.993874 9.348734 0.003636 1.307344 +1200 3349 1.307354 1.993895 9.353495 0.003625 1.307344 +1200 3350 1.307354 1.993868 9.347162 0.003622 1.307344 +1200 3351 1.307354 1.993881 9.350346 0.003607 
1.307344 +1200 3352 1.307354 1.993844 9.341519 0.003607 1.307343 +1200 3353 1.307354 1.993838 9.340251 0.003607 1.307343 +1200 3354 1.307354 1.993838 9.340064 0.003607 1.307343 +1200 3355 1.307354 1.993871 9.348018 0.003607 1.307343 +1200 3356 1.307354 1.993898 9.354258 0.003568 1.307344 +1200 3357 1.307354 1.993955 9.367850 0.003487 1.307344 +1200 3358 1.307354 1.993920 9.359575 0.003487 1.307344 +1200 3359 1.307354 1.993908 9.356551 0.003487 1.307344 +1200 3360 1.307354 1.993940 9.364244 0.003487 1.307344 +1200 3361 1.307354 1.994050 9.390793 0.003347 1.307345 +1200 3362 1.307354 1.993960 9.369061 0.003347 1.307345 +1200 3363 1.307354 1.994060 9.393233 0.003330 1.307345 +1200 3364 1.307354 1.994099 9.402643 0.003277 1.307346 +1200 3365 1.307354 1.994154 9.416120 0.003206 1.307346 +1200 3366 1.307354 1.994144 9.413798 0.003206 1.307346 +1200 3367 1.307354 1.994113 9.406091 0.003206 1.307346 +1200 3368 1.307354 1.994135 9.411443 0.003206 1.307346 +1200 3369 1.307354 1.994105 9.404085 0.003206 1.307346 +1200 3370 1.307354 1.994217 9.431909 0.003161 1.307346 +1200 3371 1.307354 1.994150 9.415126 0.003161 1.307345 +1200 3372 1.307354 1.994234 9.436197 0.003161 1.307346 +1200 3373 1.307354 1.994320 9.457734 0.003062 1.307347 +1200 3374 1.307354 1.994294 9.451298 0.003062 1.307346 +1200 3375 1.307354 1.994340 9.462999 0.003056 1.307347 +1200 3376 1.307354 1.994392 9.476161 0.002945 1.307347 +1200 3377 1.307354 1.994371 9.470920 0.002945 1.307347 +1200 3378 1.307354 1.994476 9.498062 0.002796 1.307348 +1200 3379 1.307354 1.994471 9.496662 0.002793 1.307348 +1200 3380 1.307354 1.994458 9.493378 0.002793 1.307348 +1200 3381 1.307354 1.994517 9.508763 0.002743 1.307348 +1200 3382 1.307354 1.994540 9.514957 0.002738 1.307348 +1200 3383 1.307354 1.994575 9.524224 0.002719 1.307348 +1200 3384 1.307354 1.994550 9.517671 0.002719 1.307348 +1200 3385 1.307354 1.994559 9.519940 0.002719 1.307348 +1200 3386 1.307354 1.994598 9.530260 0.002670 1.307348 +1200 3387 1.307354 1.994644 
9.542785 0.002633 1.307348 +1200 3388 1.307354 1.994689 9.554846 0.002602 1.307349 +1200 3389 1.307354 1.994675 9.551128 0.002602 1.307349 +1200 3390 1.307354 1.994656 9.545825 0.002602 1.307348 +1200 3391 1.307354 1.994597 9.530110 0.002602 1.307348 +1200 3392 1.307354 1.994714 9.561570 0.002602 1.307348 +1200 3393 1.307354 1.994717 9.562626 0.002602 1.307348 +1200 3394 1.307354 1.994687 9.554453 0.002602 1.307348 +1200 3395 1.307354 1.994690 9.555211 0.002602 1.307348 +1200 3396 1.307354 1.994756 9.573091 0.002602 1.307348 +1200 3397 1.307354 1.994772 9.577628 0.002602 1.307348 +1200 3398 1.307354 1.994808 9.587740 0.002602 1.307349 +1200 3399 1.307354 1.994792 9.583179 0.002602 1.307348 +1200 3400 1.307354 1.994810 9.588132 0.002602 1.307348 +1200 3401 1.307354 1.994724 9.564534 0.002602 1.307348 +1200 3402 1.307354 1.994637 9.540913 0.002602 1.307347 +1200 3403 1.307354 1.994707 9.559678 0.002602 1.307348 +1200 3404 1.307354 1.994622 9.536684 0.002602 1.307347 +1200 3405 1.307354 1.994602 9.531450 0.002602 1.307347 +1200 3406 1.307354 1.994568 9.522409 0.002602 1.307347 +1200 3407 1.307354 1.994541 9.515096 0.002602 1.307347 +1200 3408 1.307354 1.994628 9.538500 0.002602 1.307347 +1200 3409 1.307354 1.994725 9.564629 0.002602 1.307348 +1200 3410 1.307354 1.994699 9.557680 0.002602 1.307348 +1200 3411 1.307354 1.994751 9.571973 0.002602 1.307348 +1200 3412 1.307354 1.994695 9.556617 0.002602 1.307348 +1200 3413 1.307354 1.994728 9.565461 0.002602 1.307348 +1200 3414 1.307354 1.994783 9.580562 0.002602 1.307348 +1200 3415 1.307354 1.994876 9.606653 0.002586 1.307349 +1200 3416 1.307354 1.994860 9.602125 0.002586 1.307348 +1200 3417 1.307354 1.994821 9.591373 0.002586 1.307348 +1200 3418 1.307354 1.994840 9.596574 0.002586 1.307348 +1200 3419 1.307354 1.994856 9.600936 0.002586 1.307348 +1200 3420 1.307354 1.994764 9.575398 0.002586 1.307348 +1200 3421 1.307354 1.994772 9.577712 0.002586 1.307348 +1200 3422 1.307354 1.994687 9.554265 0.002586 1.307347 +1200 3423 
1.307354 1.994686 9.554166 0.002586 1.307347 +1200 3424 1.307354 1.994684 9.553467 0.002586 1.307348 +1200 3425 1.307354 1.994624 9.537435 0.002586 1.307348 +1200 3426 1.307354 1.994708 9.559926 0.002586 1.307348 +1200 3427 1.307354 1.994778 9.579216 0.002586 1.307349 +1200 3428 1.307354 1.994770 9.577134 0.002586 1.307349 +1200 3429 1.307354 1.994812 9.588755 0.002576 1.307349 +1200 3430 1.307354 1.994816 9.589826 0.002576 1.307349 +1200 3431 1.307354 1.994867 9.604011 0.002558 1.307349 +1200 3432 1.307354 1.994898 9.612923 0.002482 1.307349 +1200 3433 1.307354 1.994920 9.619151 0.002482 1.307349 +1200 3434 1.307354 1.994944 9.625880 0.002448 1.307349 +1200 3435 1.307354 1.995040 9.653614 0.002275 1.307350 +1200 3436 1.307354 1.995097 9.670364 0.002239 1.307350 +1200 3437 1.307354 1.995172 9.692514 0.002130 1.307350 +1200 3438 1.307354 1.995144 9.684350 0.002130 1.307350 +1200 3439 1.307354 1.995091 9.668569 0.002130 1.307350 +1200 3440 1.307354 1.995108 9.673553 0.002130 1.307350 +1200 3441 1.307354 1.995102 9.671963 0.002130 1.307350 +1200 3442 1.307354 1.995109 9.673838 0.002130 1.307350 +1200 3443 1.307354 1.995080 9.665204 0.002130 1.307350 +1200 3444 1.307354 1.995042 9.654136 0.002130 1.307350 +1200 3445 1.307354 1.995077 9.664564 0.002130 1.307350 +1200 3446 1.307354 1.995001 9.642269 0.002130 1.307349 +1200 3447 1.307354 1.995015 9.646268 0.002130 1.307349 +1200 3448 1.307354 1.995039 9.653416 0.002130 1.307350 +1200 3449 1.307354 1.995092 9.668739 0.002130 1.307350 +1200 3450 1.307354 1.995097 9.670248 0.002130 1.307350 +1200 3451 1.307354 1.995082 9.666039 0.002130 1.307350 +1200 3452 1.307354 1.995086 9.667037 0.002130 1.307350 +1200 3453 1.307354 1.995038 9.652973 0.002130 1.307350 +1200 3454 1.307354 1.995100 9.671270 0.002130 1.307350 +1200 3455 1.307354 1.995166 9.690937 0.002041 1.307351 +1200 3456 1.307354 1.995192 9.698754 0.002020 1.307351 +1200 3457 1.307354 1.995228 9.709504 0.002020 1.307351 +1200 3458 1.307354 1.995303 9.732210 0.001929 
1.307351 +1200 3459 1.307354 1.995331 9.741036 0.001884 1.307351 +1200 3460 1.307354 1.995267 9.721193 0.001884 1.307351 +1200 3461 1.307354 1.995234 9.711305 0.001884 1.307351 +1200 3462 1.307354 1.995227 9.709128 0.001884 1.307351 +1200 3463 1.307354 1.995228 9.709430 0.001884 1.307351 +1200 3464 1.307354 1.995175 9.693471 0.001884 1.307351 +1200 3465 1.307354 1.995200 9.700934 0.001884 1.307351 +1200 3466 1.307354 1.995159 9.688607 0.001884 1.307351 +1200 3467 1.307354 1.995180 9.694923 0.001884 1.307351 +1200 3468 1.307354 1.995158 9.688510 0.001884 1.307351 +1200 3469 1.307354 1.995224 9.708189 0.001878 1.307351 +1200 3470 1.307354 1.995245 9.714681 0.001808 1.307351 +1200 3471 1.307354 1.995140 9.683201 0.001808 1.307351 +1200 3472 1.307354 1.995107 9.673351 0.001808 1.307351 +1200 3473 1.307354 1.995074 9.663657 0.001808 1.307351 +1200 3474 1.307354 1.994948 9.627032 0.001808 1.307350 +1200 3475 1.307354 1.994930 9.621969 0.001808 1.307350 +1200 3476 1.307354 1.994888 9.609990 0.001808 1.307350 +1200 3477 1.307354 1.994939 9.624655 0.001808 1.307350 +1200 3478 1.307354 1.994907 9.615481 0.001808 1.307350 +1200 3479 1.307354 1.994950 9.627793 0.001808 1.307350 +1200 3480 1.307354 1.994975 9.634897 0.001808 1.307350 +1200 3481 1.307354 1.994947 9.626749 0.001808 1.307350 +1200 3482 1.307354 1.994922 9.619749 0.001808 1.307350 +1200 3483 1.307354 1.994885 9.609211 0.001808 1.307350 +1200 3484 1.307354 1.994852 9.599942 0.001808 1.307350 +1200 3485 1.307354 1.994872 9.605575 0.001808 1.307350 +1200 3486 1.307354 1.994863 9.603092 0.001808 1.307350 +1200 3487 1.307354 1.994783 9.580634 0.001808 1.307350 +1200 3488 1.307354 1.994765 9.575694 0.001808 1.307350 +1200 3489 1.307354 1.994740 9.568862 0.001808 1.307350 +1200 3490 1.307354 1.994713 9.561465 0.001808 1.307350 +1200 3491 1.307354 1.994704 9.559092 0.001808 1.307350 +1200 3492 1.307354 1.994692 9.555772 0.001808 1.307350 +1200 3493 1.307354 1.994695 9.556434 0.001808 1.307350 +1200 3494 1.307354 1.994704 
9.558978 0.001808 1.307350 +1200 3495 1.307354 1.994750 9.571524 0.001808 1.307350 +1200 3496 1.307354 1.994792 9.583126 0.001808 1.307350 +1200 3497 1.307354 1.994887 9.609729 0.001808 1.307351 +1200 3498 1.307354 1.994915 9.617676 0.001808 1.307351 +1200 3499 1.307354 1.994924 9.620217 0.001808 1.307351 diff --git a/src/main/java/japsa/bio/misc/resources/PfSummary b/src/main/java/japsa/bio/misc/resources/PfSummary new file mode 100755 index 0000000..89037f7 --- /dev/null +++ b/src/main/java/japsa/bio/misc/resources/PfSummary @@ -0,0 +1,81 @@ +#PF sumarry +# 0 A +0.000012408 0.000012408 0.999962777 0.000012408 +0.000012408 0.999962777 0.000012408 0.000012408 +0.424914697 0.105924685 0.054705627 0.414454991 +# 1 R +0.767459248 0.232522291 0.000009230 0.000009230 +0.000009230 0.000009230 0.999972309 0.000009230 +0.694400857 0.016236224 0.175229375 0.114133545 +# 2 N +0.999994909 0.000001697 0.000001697 0.000001697 +0.999994909 0.000001697 0.000001697 0.000001697 +0.000001697 0.138915619 0.000001697 0.861080987 +# 3 D +0.000003773 0.000003773 0.999988682 0.000003773 +0.999988682 0.000003773 0.000003773 0.000003773 +0.000003773 0.134622567 0.000003773 0.865369888 +# 4 C +0.000013801 0.000013801 0.000013801 0.999958596 +0.000013801 0.000013801 0.999958596 0.000013801 +0.000013801 0.130134150 0.000013801 0.869838247 +# 5 Q +0.000008820 0.999973540 0.000008820 0.000008820 +0.999973540 0.000008820 0.000008820 0.000008820 +0.867128241 0.000008820 0.132854119 0.000008820 +# 6 E +0.000003419 0.000003419 0.999989744 0.000003419 +0.999989744 0.000003419 0.000003419 0.000003419 +0.854974019 0.000003419 0.145019144 0.000003419 +# 7 G +0.000008614 0.000008614 0.999974158 0.000008614 +0.000008614 0.000008614 0.999974158 0.000008614 +0.437069834 0.047075950 0.098174676 0.417679539 +# 8 H +0.000010048 0.999969857 0.000010048 0.000010048 +0.999969857 0.000010048 0.000010048 0.000010048 +0.000010048 0.142915993 0.000010048 0.857063912 +# 9 I +0.999992090 0.000002637 0.000002637 
0.000002637 +0.000002637 0.000002637 0.000002637 0.999992090 +0.543372566 0.067774579 0.000002637 0.388850218 +# 10 L +0.000003222 0.237330627 0.000003222 0.762662929 +0.000003222 0.000003222 0.000003222 0.999990333 +0.703578391 0.023680861 0.157939001 0.114801746 +# 11 K +0.999993760 0.000002080 0.000002080 0.000002080 +0.999993760 0.000002080 0.000002080 0.000002080 +0.816996129 0.000002080 0.182999711 0.000002080 +# 12 M +0.999966641 0.000011120 0.000011120 0.000011120 +0.000011120 0.000011120 0.000011120 0.999966641 +0.000011120 0.000011120 0.999966641 0.000011120 +# 13 F +0.000005599 0.000005599 0.000005599 0.999983202 +0.000005599 0.000005599 0.000005599 0.999983202 +0.000005599 0.162683793 0.000005599 0.837305009 +# 14 P +0.000012251 0.999963246 0.000012251 0.000012251 +0.000012251 0.999963246 0.000012251 0.000012251 +0.454173456 0.103684011 0.047682638 0.394459895 +# 15 S +0.385272927 0.000003816 0.000003816 0.614719441 +0.000003816 0.614719441 0.385272927 0.000003816 +0.258929662 0.140604011 0.046999023 0.553467304 +# 16 T +0.999982103 0.000005966 0.000005966 0.000005966 +0.000005966 0.999982103 0.000005966 0.000005966 +0.531465695 0.116962065 0.092252441 0.259319799 +# 17 W +0.000049380 0.000049380 0.000049380 0.999851859 +0.000049380 0.000049380 0.999851859 0.000049380 +0.000049380 0.000049380 0.999851859 0.000049380 +# 18 Y +0.000004270 0.000004270 0.000004270 0.999987189 +0.999987189 0.000004270 0.000004270 0.000004270 +0.000004270 0.108422621 0.000004270 0.891568838 +# 19 V +0.000006358 0.000006358 0.999980925 0.000006358 +0.000006358 0.000006358 0.000006358 0.999980925 +0.410640030 0.062928392 0.125398985 0.401032593 diff --git a/src/main/java/japsa/bio/misc/resources/PfSummary.2 b/src/main/java/japsa/bio/misc/resources/PfSummary.2 new file mode 100755 index 0000000..142b70f --- /dev/null +++ b/src/main/java/japsa/bio/misc/resources/PfSummary.2 @@ -0,0 +1,84 @@ +# 0 A +0.000012408 0.000012408 0.999962777 0.000012408 +0.000012408 0.999962777 
0.000012408 0.000012408 +0.424914697 0.105924685 0.054705627 0.414454991 +# 1 R +0.767459248 0.232522291 0.000009230 0.000009230 +0.000009230 0.000009230 0.999972309 0.000009230 +0.694400857 0.016236224 0.175229375 0.114133545 +# 2 N +0.999994909 0.000001697 0.000001697 0.000001697 +0.999994909 0.000001697 0.000001697 0.000001697 +0.000001697 0.138915619 0.000001697 0.861080987 +# 3 D +0.000003773 0.000003773 0.999988682 0.000003773 +0.999988682 0.000003773 0.000003773 0.000003773 +0.000003773 0.134622567 0.000003773 0.865369888 +# 4 C +0.000013801 0.000013801 0.000013801 0.999958596 +0.000013801 0.000013801 0.999958596 0.000013801 +0.000013801 0.130134150 0.000013801 0.869838247 +# 5 Q +0.000008820 0.999973540 0.000008820 0.000008820 +0.999973540 0.000008820 0.000008820 0.000008820 +0.867128241 0.000008820 0.132854119 0.000008820 +# 6 E +0.000003419 0.000003419 0.999989744 0.000003419 +0.999989744 0.000003419 0.000003419 0.000003419 +0.854974019 0.000003419 0.145019144 0.000003419 +# 7 G +0.000008614 0.000008614 0.999974158 0.000008614 +0.000008614 0.000008614 0.999974158 0.000008614 +0.437069834 0.047075950 0.098174676 0.417679539 +# 8 H +0.000010048 0.999969857 0.000010048 0.000010048 +0.999969857 0.000010048 0.000010048 0.000010048 +0.000010048 0.142915993 0.000010048 0.857063912 +# 9 I +0.999992090 0.000002637 0.000002637 0.000002637 +0.000002637 0.000002637 0.000002637 0.999992090 +0.543372566 0.067774579 0.000002637 0.388850218 +# 10 L +0.000003222 0.237330627 0.000003222 0.762662929 +0.000003222 0.000003222 0.000003222 0.999990333 +0.703578391 0.023680861 0.157939001 0.114801746 +# 11 K +0.999993760 0.000002080 0.000002080 0.000002080 +0.999993760 0.000002080 0.000002080 0.000002080 +0.816996129 0.000002080 0.182999711 0.000002080 +# 12 M +0.999966641 0.000011120 0.000011120 0.000011120 +0.000011120 0.000011120 0.000011120 0.999966641 +0.000011120 0.000011120 0.999966641 0.000011120 +# 13 F +0.000005599 0.000005599 0.000005599 0.999983202 +0.000005599 
0.000005599 0.000005599 0.999983202 +0.000005599 0.162683793 0.000005599 0.837305009 +# 14 P +0.000012251 0.999963246 0.000012251 0.000012251 +0.000012251 0.999963246 0.000012251 0.000012251 +0.454173456 0.103684011 0.047682638 0.394459895 +# 15 S +0.385272927 0.000003816 0.000003816 0.614719441 +0.000003816 0.614719441 0.385272927 0.000003816 +0.258929662 0.140604011 0.046999023 0.553467304 +# 16 T +0.999982103 0.000005966 0.000005966 0.000005966 +0.000005966 0.999982103 0.000005966 0.000005966 +0.531465695 0.116962065 0.092252441 0.259319799 +# 17 W +0.000049380 0.000049380 0.000049380 0.999851859 +0.000049380 0.000049380 0.999851859 0.000049380 +0.000049380 0.000049380 0.999851859 0.000049380 +# 18 Y +0.000004270 0.000004270 0.000004270 0.999987189 +0.999987189 0.000004270 0.000004270 0.000004270 +0.000004270 0.108422621 0.000004270 0.891568838 +# 19 V +0.000006358 0.000006358 0.999980925 0.000006358 +0.000006358 0.000006358 0.000006358 0.999980925 +0.410640030 0.062928392 0.125398985 0.401032593 +# 20 O +0.000171174 0.000171174 0.000171174 0.999486477 +0.789626840 0.000171174 0.210030811 0.000171174 +0.889934954 0.000171174 0.109722698 0.000171174 diff --git a/src/main/java/japsa/bio/misc/resources/PvSummary b/src/main/java/japsa/bio/misc/resources/PvSummary new file mode 100755 index 0000000..7f57d30 --- /dev/null +++ b/src/main/java/japsa/bio/misc/resources/PvSummary @@ -0,0 +1,81 @@ +#PV codon +# 0 A +0.000005215 0.000005215 0.999984356 0.000005215 +0.000005215 0.999984356 0.000005215 0.000005215 +0.255352766 0.306346144 0.277905825 0.160395265 +# 1 R +0.603342173 0.396646638 0.000005595 0.000005595 +0.000005595 0.000005595 0.999983216 0.000005595 +0.344707014 0.127890974 0.470550390 0.056851622 +# 2 N +0.999988674 0.000003775 0.000003775 0.000003775 +0.999988674 0.000003775 0.000003775 0.000003775 +0.000003775 0.530231873 0.000003775 0.469760576 +# 3 D +0.000004998 0.000004998 0.999985006 0.000004998 +0.999985006 0.000004998 0.000004998 0.000004998 
+0.000004998 0.534956018 0.000004998 0.465033986 +# 4 C +0.000014916 0.000014916 0.000014916 0.999955251 +0.000014916 0.000014916 0.999955251 0.000014916 +0.000014916 0.652565632 0.000014916 0.347404535 +# 5 Q +0.000008354 0.999974937 0.000008354 0.000008354 +0.999974937 0.000008354 0.000008354 0.000008354 +0.451331278 0.000008354 0.548652013 0.000008354 +# 6 E +0.000003552 0.000003552 0.999989343 0.000003552 +0.999989343 0.000003552 0.000003552 0.000003552 +0.523227766 0.000003552 0.476765129 0.000003552 +# 7 G +0.000004154 0.000004154 0.999987537 0.000004154 +0.000004154 0.000004154 0.999987537 0.000004154 +0.255085890 0.251978481 0.337939057 0.154996573 +# 8 H +0.000010118 0.999969646 0.000010118 0.000010118 +0.999969646 0.000010118 0.000010118 0.000010118 +0.000010118 0.624434664 0.000010118 0.375545100 +# 9 I +0.999985310 0.000004897 0.000004897 0.000004897 +0.000004897 0.000004897 0.000004897 0.999985310 +0.304749780 0.314337479 0.000004897 0.380907844 +# 10 L +0.000003344 0.644744189 0.000003344 0.355249123 +0.000003344 0.000003344 0.000003344 0.999989969 +0.268198977 0.185639434 0.452541052 0.093620536 +# 11 K +0.999990888 0.000003037 0.000003037 0.000003037 +0.999990888 0.000003037 0.000003037 0.000003037 +0.518777603 0.000003037 0.481216322 0.000003037 +# 12 M +0.999959170 0.000013610 0.000013610 0.000013610 +0.000013610 0.000013610 0.000013610 0.999959170 +0.000013610 0.000013610 0.999959170 0.000013610 +# 13 F +0.000006437 0.000006437 0.000006437 0.999980688 +0.000006437 0.000006437 0.000006437 0.999980688 +0.000006437 0.462801671 0.000006437 0.537185454 +# 14 P +0.000007625 0.999977124 0.000007625 0.000007625 +0.000007625 0.999977124 0.000007625 0.000007625 +0.249839866 0.414343450 0.183872198 0.151944487 +# 15 S +0.503680526 0.000003394 0.000003394 0.496312687 +0.000003394 0.496312687 0.503680526 0.000003394 +0.087837127 0.516451220 0.117115038 0.278596615 +# 16 T +0.999980864 0.000006379 0.000006379 0.000006379 +0.000006379 0.999980864 0.000006379 
0.000006379 +0.211676830 0.338064195 0.300232184 0.150026790 +# 17 W +0.000044589 0.000044589 0.000044589 0.999866233 +0.000044589 0.000044589 0.999866233 0.000044589 +0.000044589 0.000044589 0.999866233 0.000044589 +# 18 Y +0.000006828 0.000006828 0.000006828 0.999979517 +0.999979517 0.000006828 0.000006828 0.000006828 +0.000006828 0.627680711 0.000006828 0.372305634 +# 19 V +0.000005231 0.000005231 0.999984306 0.000005231 +0.000005231 0.000005231 0.000005231 0.999984306 +0.180753396 0.198392978 0.416392465 0.204461161 diff --git a/src/main/java/japsa/bio/misc/resources/PvSummary.2 b/src/main/java/japsa/bio/misc/resources/PvSummary.2 new file mode 100755 index 0000000..c3b059c --- /dev/null +++ b/src/main/java/japsa/bio/misc/resources/PvSummary.2 @@ -0,0 +1,84 @@ +# 0 A +0.000005215 0.000005215 0.999984356 0.000005215 +0.000005215 0.999984356 0.000005215 0.000005215 +0.255352766 0.306346144 0.277905825 0.160395265 +# 1 R +0.603342173 0.396646638 0.000005595 0.000005595 +0.000005595 0.000005595 0.999983216 0.000005595 +0.344707014 0.127890974 0.470550390 0.056851622 +# 2 N +0.999988674 0.000003775 0.000003775 0.000003775 +0.999988674 0.000003775 0.000003775 0.000003775 +0.000003775 0.530231873 0.000003775 0.469760576 +# 3 D +0.000004998 0.000004998 0.999985006 0.000004998 +0.999985006 0.000004998 0.000004998 0.000004998 +0.000004998 0.534956018 0.000004998 0.465033986 +# 4 C +0.000014916 0.000014916 0.000014916 0.999955251 +0.000014916 0.000014916 0.999955251 0.000014916 +0.000014916 0.652565632 0.000014916 0.347404535 +# 5 Q +0.000008354 0.999974937 0.000008354 0.000008354 +0.999974937 0.000008354 0.000008354 0.000008354 +0.451331278 0.000008354 0.548652013 0.000008354 +# 6 E +0.000003552 0.000003552 0.999989343 0.000003552 +0.999989343 0.000003552 0.000003552 0.000003552 +0.523227766 0.000003552 0.476765129 0.000003552 +# 7 G +0.000004154 0.000004154 0.999987537 0.000004154 +0.000004154 0.000004154 0.999987537 0.000004154 +0.255085890 0.251978481 0.337939057 
0.154996573 +# 8 H +0.000010118 0.999969646 0.000010118 0.000010118 +0.999969646 0.000010118 0.000010118 0.000010118 +0.000010118 0.624434664 0.000010118 0.375545100 +# 9 I +0.999985310 0.000004897 0.000004897 0.000004897 +0.000004897 0.000004897 0.000004897 0.999985310 +0.304749780 0.314337479 0.000004897 0.380907844 +# 10 L +0.000003344 0.644744189 0.000003344 0.355249123 +0.000003344 0.000003344 0.000003344 0.999989969 +0.268198977 0.185639434 0.452541052 0.093620536 +# 11 K +0.999990888 0.000003037 0.000003037 0.000003037 +0.999990888 0.000003037 0.000003037 0.000003037 +0.518777603 0.000003037 0.481216322 0.000003037 +# 12 M +0.999959170 0.000013610 0.000013610 0.000013610 +0.000013610 0.000013610 0.000013610 0.999959170 +0.000013610 0.000013610 0.999959170 0.000013610 +# 13 F +0.000006437 0.000006437 0.000006437 0.999980688 +0.000006437 0.000006437 0.000006437 0.999980688 +0.000006437 0.462801671 0.000006437 0.537185454 +# 14 P +0.000007625 0.999977124 0.000007625 0.000007625 +0.000007625 0.999977124 0.000007625 0.000007625 +0.249839866 0.414343450 0.183872198 0.151944487 +# 15 S +0.503680526 0.000003394 0.000003394 0.496312687 +0.000003394 0.496312687 0.503680526 0.000003394 +0.087837127 0.516451220 0.117115038 0.278596615 +# 16 T +0.999980864 0.000006379 0.000006379 0.000006379 +0.000006379 0.999980864 0.000006379 0.000006379 +0.211676830 0.338064195 0.300232184 0.150026790 +# 17 W +0.000044589 0.000044589 0.000044589 0.999866233 +0.000044589 0.000044589 0.999866233 0.000044589 +0.000044589 0.000044589 0.999866233 0.000044589 +# 18 Y +0.000006828 0.000006828 0.000006828 0.999979517 +0.999979517 0.000006828 0.000006828 0.000006828 +0.000006828 0.627680711 0.000006828 0.372305634 +# 19 V +0.000005231 0.000005231 0.999984306 0.000005231 +0.000005231 0.000005231 0.000005231 0.999984306 +0.180753396 0.198392978 0.416392465 0.204461161 +# 20 O +0.000168663 0.000168663 0.000168663 0.999494012 +0.583572272 0.000168663 0.416090403 0.000168663 +0.690841626 
0.000168663 0.308821049 0.000168663 diff --git a/src/main/java/japsa/bio/misc/resources/codon b/src/main/java/japsa/bio/misc/resources/codon new file mode 100755 index 0000000..76a9b58 --- /dev/null +++ b/src/main/java/japsa/bio/misc/resources/codon @@ -0,0 +1,64 @@ +AAA K +AAC N +AAG K +AAU N +ACA T +ACC T +ACG T +ACU T +AGA R +AGC S +AGG R +AGU S +AUA I +AUC I +AUG M +AUU I +CAA Q +CAC H +CAG Q +CAU H +CCA P +CCC P +CCG P +CCU P +CGA R +CGC R +CGG R +CGU R +CUA L +CUC L +CUG L +CUU L +GAA E +GAC D +GAG E +GAU D +GCA A +GCC A +GCG A +GCU A +GGA G +GGC G +GGG G +GGU G +GUA V +GUC V +GUG V +GUU V +UAA X +UAC Y +UAG X +UAU Y +UCA S +UCC S +UCG S +UCU S +UGA X +UGC C +UGG W +UGU C +UUA L +UUC F +UUG L +UUU F diff --git a/src/main/java/japsa/bio/np/ErrorCorrection.java b/src/main/java/japsa/bio/np/ErrorCorrection.java index 8a813fb..990cd14 100644 --- a/src/main/java/japsa/bio/np/ErrorCorrection.java +++ b/src/main/java/japsa/bio/np/ErrorCorrection.java @@ -40,7 +40,8 @@ import japsa.seq.SequenceBuilder; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.FileReader; @@ -57,6 +58,7 @@ * */ public class ErrorCorrection { + private static final Logger LOG = LoggerFactory.getLogger(ErrorCorrection.class); public static String prefix = "tmp"; public static String msa = "kalign"; @@ -69,10 +71,10 @@ public static double needle(Sequence seq1, Sequence seq2, String prefix) throws String needleOut = prefix + "_alignment.needle"; String cmd = "needle -gapopen 10 -gapextend 0.5 -asequence " + seq1File + " -bsequence " + seq2File + " -outfile " + needleOut; - Logging.info("Running " + cmd); + LOG.info("Running " + cmd); Process process = Runtime.getRuntime().exec(cmd); process.waitFor(); - Logging.info("Run'ed " + cmd ); + LOG.info("Run'ed " + cmd ); BufferedReader scoreBf = new BufferedReader(new FileReader(needleOut)); String 
scoreLine = null; @@ -88,7 +90,14 @@ public static double needle(Sequence seq1, Sequence seq2, String prefix) throws return score; } - public static Sequence consensusSequence(ArrayList readList, String prefix, String msa) throws IOException, InterruptedException{ + public static Sequence consensusSequence(ArrayList readList, String prefix, String msa) throws IOException, InterruptedException{ + if (readList != null && readList.size() > 0) + return consensusSequence(readList, readList.size(), prefix, msa); + else + return null; + + } + public static Sequence consensusSequence(ArrayList readList, int max, String prefix, String msa) throws IOException, InterruptedException{ //String faiFile = prefix + "_" + this.currentReadCount; Sequence consensus = null; if (readList != null && readList.size() > 0){ @@ -101,9 +110,13 @@ public static Sequence consensusSequence(ArrayList readList, String pr String faoFile = prefix + "_ao.fasta";//name of fasta files of reads mapped to the gene { SequenceOutputStream faiSt = SequenceOutputStream.makeOutputStream(faiFile); + int count = 0; for (Sequence seq:readList){ - Logging.info(seq.getName() + " " + seq.length()); + LOG.info(seq.getName() + " " + seq.length()); seq.writeFasta(faiSt); + count ++; + if (count >= max) + break;//for } faiSt.close(); } @@ -124,13 +137,14 @@ public static Sequence consensusSequence(ArrayList readList, String pr }else if (msa.startsWith("mafft")){ cmd = "mafft_wrapper.sh " + faiFile + " " + faoFile; }else{ - Logging.exit("Unknown msa function " + msa, 1); + LOG.error("Unknown msa function " + msa); + return null; } - Logging.info("Running " + cmd); + LOG.info("Running " + cmd); Process process = Runtime.getRuntime().exec(cmd); process.waitFor(); - Logging.info("Done " + cmd); + LOG.info("Done " + cmd); } @@ -149,7 +163,7 @@ public static Sequence consensusSequence(ArrayList readList, String pr }//if }//while sb.setName("consensus"); - Logging.info(sb.getName() + " " + sb.length()); + 
LOG.info(sb.getName() + " " + sb.length()); return sb.toSequence(); } @@ -196,14 +210,11 @@ public static Sequence consensusSequence(ArrayList readList, String pr }//if }//for x sb.setName("consensus"); - Logging.info(sb.getName() + " " + sb.length()); + LOG.info(sb.getName() + " " + sb.length()); consensus = sb.toSequence(); } - } - + } } return consensus; } - - } diff --git a/src/main/java/japsa/bio/np/Gene.java b/src/main/java/japsa/bio/np/Gene.java deleted file mode 100755 index 0fc422a..0000000 --- a/src/main/java/japsa/bio/np/Gene.java +++ /dev/null @@ -1,34 +0,0 @@ -package japsa.bio.np; - -class Gene implements Comparable{ - String name; - int st,end; - int length; - int cnt; //number of times in genome; - Gene(String[] str){ - this(str[0], Integer.parseInt(str[1]), Integer.parseInt(str[2])); - } - Gene(String name, int st, int end){ - this.st = st; - this.end = end; - this.name = name; - this.length = st - end; - } - - @Override - public boolean equals(Object obj){ - if(obj instanceof Gene){ - return ((Gene)obj).st == st && ((Gene)obj).end == end && ((Gene)obj).name == name; - }else return false; - } - - @Override - public int compareTo(Object obj) { - int st1 = ((Gene)obj).st; - if(st1 profileList){ + @SuppressWarnings("unchecked") PresenceAbsence(int len){ spl = new SpeciesLikelihood[len]; bg = new SpeciesLikelihood(); @@ -194,7 +195,7 @@ public static void calcPosterior(double[] sumLogL, double[] posterior){ int index1 = nrep - index0 -1; if (index1 <= index0) throw new RuntimeException("Confident interval of " + confidentInterval + " does not work with sample = " + nrep); - //Logging.info("YYY " + index0 + " " + index1 + " from " + nrep + " and " +confidentInterval ); + //LOG.info("YYY " + index0 + " " + index1 + " from " + nrep + " and " +confidentInterval ); res[k][0] = samples[index0]; res[k][1] = samples[index1]; } diff --git a/src/main/java/japsa/bio/np/RealtimeAnalysis.java b/src/main/java/japsa/bio/np/RealtimeAnalysis.java index 050be7e..2e6936a 
100644 --- a/src/main/java/japsa/bio/np/RealtimeAnalysis.java +++ b/src/main/java/japsa/bio/np/RealtimeAnalysis.java @@ -36,7 +36,8 @@ import java.util.Date; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Real time analysis in a thread that runs in parallel with the thread conecting @@ -46,6 +47,8 @@ * */ public abstract class RealtimeAnalysis implements Runnable { + private static final Logger LOG = LoggerFactory.getLogger(RealtimeAnalysis.class); + private int readPeriod = 0;//Min number of reads before a new analysis private int timePeriod = 0;//Min number of miliseconds before a new analysis @@ -62,7 +65,7 @@ protected RealtimeAnalysis(){ protected String timeNow; public void stopWaiting(){ - Logging.info("All reads received at " + new Date()); + LOG.info("All reads received at " + new Date()); waiting = false; //TODO: implement notifies if I am sleeping } @@ -73,7 +76,7 @@ public void stopWaiting(){ @Override public void run() { startTime = System.currentTimeMillis(); - Logging.info("Start analysing data at " + new Date(startTime)); + LOG.info("Start analysing data at " + new Date(startTime)); try { Thread.sleep(timePeriod); @@ -86,7 +89,7 @@ public void run() { long timeSleep = timePeriod - (System.currentTimeMillis() - lastTime); if (timeSleep > 0){ try { - //Logging.info("Not due time, sleep for " + timeSleep/1000.0 + " seconds"); + //LOG.info("Not due time, sleep for " + timeSleep/1000.0 + " seconds"); Thread.sleep(timeSleep); } catch (InterruptedException e) { e.printStackTrace(); @@ -97,7 +100,7 @@ public void run() { int currentRead = getCurrentRead(); if (currentRead - lastReadNumber < readPeriod){ try { - //Logging.info("Not due read (" + currentRead + "), sleep for " + powerNap/1000.0 + " minutes"); + //LOG.info("Not due read (" + currentRead + "), sleep for " + powerNap/1000.0 + " minutes"); Thread.sleep(powerNap); } catch (InterruptedException e) { e.printStackTrace(); @@ -109,7 +112,7 @@ public void 
run() { //assert: read number satisfied timeNow = new Date(lastTime).toString(); analysis(); - Logging.info("RUNTIME\t" + timeNow + "\t" + (this.lastTime - this.startTime)/1000.0 + "\t" + this.lastReadNumber + "\t" + (System.currentTimeMillis() - lastTime)/1000.0); + LOG.info("RUNTIME\t" + timeNow + "\t" + (this.lastTime - this.startTime)/1000.0 + "\t" + this.lastReadNumber + "\t" + (System.currentTimeMillis() - lastTime)/1000.0); }//while //perform the final analysis @@ -117,10 +120,10 @@ public void run() { lastReadNumber = getCurrentRead(); timeNow = new Date(lastTime).toString(); analysis(); - Logging.info("RUNTIME\t" + timeNow + "\t" + (this.lastTime - this.startTime)/1000.0 + "\t" + this.lastReadNumber + "\t" + (System.currentTimeMillis() - lastTime)/1000.0); + LOG.info("RUNTIME\t" + timeNow + "\t" + (this.lastTime - this.startTime)/1000.0 + "\t" + this.lastReadNumber + "\t" + (System.currentTimeMillis() - lastTime)/1000.0); //.. and close it close(); - Logging.info("Real time analysis done"); + LOG.info("Real time analysis done"); } abstract protected void close(); diff --git a/src/main/java/japsa/bio/np/RealtimeMLST.java b/src/main/java/japsa/bio/np/RealtimeMLST.java index b8eb35f..19e1b3a 100644 --- a/src/main/java/japsa/bio/np/RealtimeMLST.java +++ b/src/main/java/japsa/bio/np/RealtimeMLST.java @@ -36,22 +36,22 @@ import japsa.bio.alignment.ProbFSM; import japsa.bio.alignment.ProbFSM.Emission; -import japsa.bio.alignment.ProbFSM.ProbOneSM; import japsa.bio.alignment.ProbFSM.ProbThreeSM; -import japsa.bio.bac.MLSTyping; -import japsa.bio.bac.MLSTyping.MLSType; +import japsa.bio.amra.MLSTyping; +import japsa.bio.amra.MLSTyping.MLSType; import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.HTSUtilities; -import japsa.util.Logging; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; import 
htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -67,11 +67,12 @@ * @author minhduc * */ -public class RealtimeMLST{ +public class RealtimeMLST{ + private static final Logger LOG = LoggerFactory.getLogger(RealtimeMLST.class); ///////////////////////////////////////////////////////////////////////////// private double minQual = 0; private boolean twoDOnly = false; - private int numThread = 16; + //private int numThread = 16; ArrayList [] alignmentLists; @@ -83,6 +84,7 @@ RealtimeMLSTyper typer; + @SuppressWarnings("unchecked") public RealtimeMLST(String mlstDir, String output, int minRead, int minTime) throws IOException{ typer = new RealtimeMLSTyper(this, mlstDir, output); typer.setReadPeriod(minRead); @@ -115,7 +117,7 @@ public void setTwoDOnly(boolean twoDOnly) { } /** * @param bamFile - * @param geneFile + * @param top * @throws IOException * @throws InterruptedException */ @@ -173,7 +175,7 @@ public void typing(String bamFile, int top) throws IOException, InterruptedExcep Sequence readSeq = HTSUtilities.spanningSequence(record, readSequence, refLength, 0); if (readSeq == null){ - Logging.warn("Read sequence is NULL sequence "); + LOG.warn("Read sequence is NULL sequence "); }else{ //MLFSMThread mlFSM = new MLFSMThread(this.typer, record.getReferenceIndex(), readSeq); //executor.execute(mlFSM); @@ -256,7 +258,7 @@ private void makeTypingConsensus() throws IOException, InterruptedException{ for (int i = 0; i < typing.alignmentLists.length;i++){ Sequence consensus = ErrorCorrection.consensusSequence(typing.alignmentLists[i], prefix + mlstScheme.getGeneName(i) + "kalign" + count, "kalign"); if (consensus == null){ - Logging.warn("No read found for " + mlstScheme.getGeneName(i)); + LOG.warn("No read found for " + mlstScheme.getGeneName(i)); continue; } @@ -310,7 +312,7 @@ private void 
makeTypingMLwithFSM(int top) throws IOException{ */ @Override public void run() { - Logging.info("Running thread " + geneIndex + " on " + readSeq.getName()); + LOG.info("Running thread " + geneIndex + " on " + readSeq.getName()); ArrayList alleles = typer.mlstScheme.alleles(geneIndex); int numAlleles = alleles.size(); double [] myScore = new double[numAlleles]; @@ -335,13 +337,13 @@ public void run() { //} cost = retState.myCost; int emitCount = tsm.updateCount(retState); - Logging.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + readSeq.length() + "bp " + readSeq.getName() + " by " + seq.getName()); + LOG.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + readSeq.length() + "bp " + readSeq.getName() + " by " + seq.getName()); tsm.reEstimate(); }//for (iteration) - //Logging.info(" Saving " + (readSeq.length() * 2 - cost) + " on " + readSeq.getName() + " by " + seq.getName()); + //LOG.info(" Saving " + (readSeq.length() * 2 - cost) + " on " + readSeq.getName() + " by " + seq.getName()); //if (cost < readSeq.length() * 2){ myScore[x] = (readSeq.length() * 2 - cost); - Logging.info("Score for " + seq.getName() + " " + myScore[x]); + LOG.info("Score for " + seq.getName() + " " + myScore[x]); //}else // myScore[x] = 0; }//for @@ -350,7 +352,7 @@ public void run() { typer.typerScoreMatrix[geneIndex][x] = myScore[x]; } } - Logging.info("Done thread " + geneIndex + " on " + readSeq.getName()); + LOG.info("Done thread " + geneIndex + " on " + readSeq.getName()); }//run } } \ No newline at end of file diff --git a/src/main/java/japsa/bio/np/RealtimeResistanceGene.java b/src/main/java/japsa/bio/np/RealtimeResistanceGene.java index b51a154..1bc03ee 100644 --- a/src/main/java/japsa/bio/np/RealtimeResistanceGene.java +++ b/src/main/java/japsa/bio/np/RealtimeResistanceGene.java @@ -44,13 +44,14 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.HTSUtilities; -import japsa.util.Logging; 
import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.File; @@ -71,6 +72,7 @@ */ public class RealtimeResistanceGene { + private static final Logger LOG = LoggerFactory.getLogger(RealtimeResistanceGene.class); ResistanceGeneFinder resistFinder; private HashMap> alignmentMap; @@ -92,15 +94,14 @@ public RealtimeResistanceGene(int read, int time, String output, String resDB, S /** * @param bamFile - * @param geneFile * @throws IOException * @throws InterruptedException */ public void typing(String bamFile) throws IOException, InterruptedException{ //DateFormat df = new SimpleDateFormat("dd/MM/yy HH:mm:ss"); - //Logging.info("START : " + df.format(Calendar.getInstance().getTime())); + //LOG.info("START : " + df.format(Calendar.getInstance().getTime())); - Logging.info("Resistance identification ready at " + new Date()); + LOG.info("Resistance identification ready at " + new Date()); alignmentMap = new HashMap> (); SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); SamReader samReader; @@ -170,7 +171,7 @@ public void typing(String bamFile) throws IOException, InterruptedException{ samIter.close(); samReader.close(); - Logging.info("END : " + new Date()); + LOG.info("END : " + new Date()); } //TODO: way to improve performance: @@ -202,10 +203,10 @@ public ResistanceGeneFinder(RealtimeResistanceGene resistGene, String output, St readGeneInformation(resDB + "/geneList"); } - Logging.info("geneList = " + geneList.size()); - Logging.info("geneMap = " + geneMap.size()); - Logging.info("gene2Group = " + gene2Group.size()); - Logging.info("gene2GeneName = " + gene2GeneName.size()); + LOG.info("geneList = " + geneList.size()); + LOG.info("geneMap = " + geneMap.size()); 
+ LOG.info("gene2Group = " + gene2Group.size()); + LOG.info("gene2GeneName = " + gene2GeneName.size()); sos = SequenceOutputStream.makeOutputStream(output); prefix = tmp; @@ -278,12 +279,12 @@ private void antiBioticsProfile() throws IOException, InterruptedException{ /***************************************************************** if (resistGene.global.equals("hmm")){ double score = fsmAlignment(consensus, gene); - Logging.info("SGF: " + score + " " + geneID + " " + alignmentList.size() + " " + (geneID) + " " + gene2Group.get(geneID)); + LOG.info("SGF: " + score + " " + geneID + " " + alignmentList.size() + " " + (geneID) + " " + gene2Group.get(geneID)); if (score >= resistGene.scoreThreshold){ addPreditedGene(geneID); - Logging.info("ADDF " + geneID);// - //Logging.info("ADDF " + geneID + " " + resistGene.gene2Group.get(geneID)+ " " + resistGene.gene2PrimaryGroup.get(geneID) + " " + geneID); + LOG.info("ADDF " + geneID);// + //LOG.info("ADDF " + geneID + " " + resistGene.gene2Group.get(geneID)+ " " + resistGene.gene2PrimaryGroup.get(geneID) + " " + geneID); continue;//for gene } }else{ @@ -292,11 +293,11 @@ private void antiBioticsProfile() throws IOException, InterruptedException{ consensus.writeFasta(consensusFile); { double score = checkNeedle(consensusFile, gene); - Logging.info("SGF: " + score + " " + geneID + " " + alignmentList.size() + " " + (geneID) + " " + gene2Group.get(geneID)); + LOG.info("SGF: " + score + " " + geneID + " " + alignmentList.size() + " " + (geneID) + " " + gene2Group.get(geneID)); if (score >= resistGene.scoreThreshold){ addPreditedGene(geneID); - Logging.info("ADDF " + geneID); + LOG.info("ADDF " + geneID); continue;//for gene } } @@ -305,7 +306,7 @@ private void antiBioticsProfile() throws IOException, InterruptedException{ } executor.shutdown(); executor.awaitTermination(3, TimeUnit.DAYS); - Logging.info("===Found " + predictedGenes.size() + " vs " + geneMap.size() + " " + alignmentMapSnap.size() + " with " + jobNo); + 
LOG.info("===Found " + predictedGenes.size() + " vs " + geneMap.size() + " " + alignmentMapSnap.size() + " with " + jobNo); } private void addPreditedGene(String geneID) throws IOException{ @@ -317,7 +318,7 @@ private void addPreditedGene(String geneID) throws IOException{ private static double fsmAlignment(Sequence consensus, Sequence gene){ //if (gene.length() > 2700 || consensus.length() > 4000 || gene.length() * consensus.length() > 6000000){ - // Logging.info("SKIP " + gene.getName() + " " + gene.length() + " vs " + consensus.length()); + // LOG.info("SKIP " + gene.getName() + " " + gene.length() + " vs " + consensus.length()); // return 0; //} //ProbThreeSM tsmF = new ProbThreeSM(gene); @@ -331,7 +332,7 @@ private static double fsmAlignment(Sequence consensus, Sequence gene){ cost = retState.myCost; int emitCount = tsmF.updateCount(retState); - Logging.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + consensus.length() + "bp " + consensus.getName() + " by " + gene.getName()); + LOG.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + consensus.length() + "bp " + consensus.getName() + " by " + gene.getName()); tsmF.reEstimate(); } return (consensus.length() * 2 - cost) / gene.length(); @@ -346,10 +347,10 @@ private double checkNeedle(String consensusFile, Sequence gene) throws IOExcepti String cmd = "needle -gapopen 10 -gapextend 0.5 -asequence " + faAFile + " -bsequence " + consensusFile + " -outfile " + needleOut; - Logging.info("Running " + cmd); + LOG.info("Running " + cmd); Process process = Runtime.getRuntime().exec(cmd); process.waitFor(); - Logging.info("Run'ed " + cmd ); + LOG.info("Run'ed " + cmd ); BufferedReader scoreBf = new BufferedReader(new FileReader(needleOut)); String scoreLine = null; @@ -460,13 +461,13 @@ protected int getCurrentRead() { @Override public void run() { double score = ResistanceGeneFinder.fsmAlignment(consensus, gene); - Logging.info("SGF: " + score + " " + geneID + " " + " " + 
(geneID) + " " + resGeneFinder.gene2Group.get(geneID)); + LOG.info("SGF: " + score + " " + geneID + " " + " " + (geneID) + " " + resGeneFinder.gene2Group.get(geneID)); if (score >= resGeneFinder.resistGene.scoreThreshold){ synchronized(resGeneFinder){ try { resGeneFinder.addPreditedGene(geneID); - Logging.info("ADDF " + geneID);// + LOG.info("ADDF " + geneID);// } catch (IOException e) { e.printStackTrace(); } diff --git a/src/main/java/japsa/bio/np/RealtimeSpeciesTyping.java b/src/main/java/japsa/bio/np/RealtimeSpeciesTyping.java index 3466115..a6736cf 100644 --- a/src/main/java/japsa/bio/np/RealtimeSpeciesTyping.java +++ b/src/main/java/japsa/bio/np/RealtimeSpeciesTyping.java @@ -34,10 +34,11 @@ package japsa.bio.np; +import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.DoubleArray; -import japsa.util.Logging; + import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; @@ -46,26 +47,34 @@ import htsjdk.samtools.ValidationStringency; import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; import java.io.IOException; +import java.lang.ProcessBuilder.Redirect; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import org.rosuda.JRI.REXP; import org.rosuda.JRI.Rengine; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * @author minhduc + * @author Minh Duc Cao, Son Hoang Nguyen * */ public class RealtimeSpeciesTyping { + private static final Logger LOG = LoggerFactory.getLogger(RealtimeSpeciesTyping.class); + + public static boolean JSON=false; RealtimeSpeciesTyper typer; /** * Minimum quality of alignment */ - private double minQual = 0; + private double minQual = 1; private boolean twoDOnly = false; @@ -78,7 +87,10 @@ HashMap seq2Species = new HashMap(); HashMap species2Count = new HashMap(); ArrayList speciesList = new ArrayList(); - + + //to 
output binned sequences + public static boolean OUTSEQ=false; + HashMap> species2Seqs = new HashMap>(); public RealtimeSpeciesTyping(String indexFile, String output)throws IOException{ typer = new RealtimeSpeciesTyper(this, output); @@ -86,13 +98,6 @@ public RealtimeSpeciesTyping(String indexFile, String output)throws IOException{ } - - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ static class SpeciesCount implements Comparable{ String species; int count = 0; @@ -131,7 +136,7 @@ private void preTyping(String indexFile)throws IOException{ } }//while bf.close(); - Logging.info(seq2Species.size() + " " + species2Count.size()); + LOG.info(seq2Species.size() + " " + species2Count.size()); speciesList.addAll(species2Count.keySet()); //Write header @@ -159,9 +164,9 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce typer.setReadPeriod(readNumber); typer.setTimePeriod(timeNumber * 1000); - Logging.info("Species typing ready at " + new Date()); + LOG.info("Species typing ready at " + new Date()); - String readName = ""; + String readName = "", refName = ""; //Read the bam file SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); SamReader samReader; @@ -175,6 +180,7 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce Thread thread = new Thread(typer); thread.start(); + boolean changedFlag = false; while (samIter.hasNext()){ SAMRecord sam = samIter.next(); //if (firstReadTime <=0) @@ -186,13 +192,17 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce if (!sam.getReadName().equals(readName)){ readName = sam.getReadName(); - + changedFlag = true; synchronized(this){ currentReadCount ++; currentBaseCount += sam.getReadLength(); } + } else if(!refName.equals(sam.getReferenceName())){ + changedFlag = true; } - + else + changedFlag = false; + if (sam.getReadUnmappedFlag()){ continue; } @@ -200,20 
+210,29 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce if (sam.getMappingQuality() < this.minQual) continue; - String refSequence = sam.getReferenceName(); - String species = seq2Species.get(refSequence); + refName = sam.getReferenceName(); + String species = seq2Species.get(refName); if (species == null){ - throw new RuntimeException(" Can find species with ref " + refSequence + " line " + currentReadCount ); + throw new RuntimeException(" Can't find species with ref " + refName + " line " + currentReadCount ); } SpeciesCount sCount = species2Count.get(species); if (sCount == null){ - throw new RuntimeException(" Can find record with species " + species + " line " + currentReadCount ); + throw new RuntimeException(" Can't find record with species " + species + " line " + currentReadCount ); } synchronized(this) { currentReadAligned ++; sCount.count ++; + + if(OUTSEQ && changedFlag){ + ArrayList readList = species2Seqs.get(species); + if( readList == null){ + readList = new ArrayList(); + species2Seqs.put(species, readList); + } + readList.add(readName); + } } }//while @@ -225,6 +244,130 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce samReader.close(); } +// public void typing(String inFile, String format, String bwaExe, int bwaThread, String bwaIndex, int readNumber, int timeNumber, int qual) throws IOException, InterruptedException{ +// typer.setReadPeriod(readNumber); +// typer.setTimePeriod(timeNumber * 1000); +// +// LOG.info("Species typing ready at " + new Date()); +// +// SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); +// SamReader samReader = null; +// +// Process bwaProcess = null; +// +// if (format.endsWith("am")){//bam or sam +// if ("-".equals(inFile)) +// samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); +// else +// samReader = SamReaderFactory.makeDefault().open(new File(inFile)); +// }else{//fastq or fasta file 
+// LOG.info("Starting bwa at " + new Date()); +// ProcessBuilder pb = null; +// if ("-".equals(inFile)){ +// pb = new ProcessBuilder(bwaExe, +// "mem", +// "-t", +// "" + bwaThread, +// "-k11", +// "-W20", +// "-r10", +// "-A1", +// "-B1", +// "-O1", +// "-E1", +// "-L0", +// "-a", +// "-Y", +// "-K", +// "20000", +// bwaIndex, +// "-" +// ). +// redirectInput(Redirect.INHERIT); +// }else{ +// pb = new ProcessBuilder(bwaExe, +// "mem", +// "-t", +// "" + bwaThread, +// "-k11", +// "-W20", +// "-r10", +// "-A1", +// "-B1", +// "-O1", +// "-E1", +// "-L0", +// "-a", +// "-Y", +// "-K", +// "20000", +// bwaIndex, +// inFile +// ); +// } +// +// bwaProcess = pb.redirectError(ProcessBuilder.Redirect.to(new File("/dev/null"))).start(); +// +// LOG.info("bwa started x"); +// samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(bwaProcess.getInputStream())); +// +// } +// +// SAMRecordIterator samIter = samReader.iterator(); +// +// Thread thread = new Thread(typer); +// thread.start(); +// String readID = ""; +// while (samIter.hasNext()){ +// SAMRecord sam = samIter.next(); +// //if (firstReadTime <=0) +// // firstReadTime = System.currentTimeMillis(); +// +// if (this.twoDOnly && !sam.getReadName().contains("twodim")){ +// continue; +// } +// +// if (!sam.getReadName().equals(readID)){ +// readID = sam.getReadName(); +// +// synchronized(this){ +// currentReadCount ++; +// currentBaseCount += sam.getReadLength(); +// } +// } +// +// if (sam.getReadUnmappedFlag()){ +// continue; +// } +// +// if (sam.getMappingQuality() < this.minQual) +// continue; +// +// String refSequence = sam.getReferenceName(); +// String species = seq2Species.get(refSequence); +// if (species == null){ +// throw new RuntimeException(" Can find species with ref " + refSequence + " line " + currentReadCount ); +// } +// +// SpeciesCount sCount = species2Count.get(species); +// if (sCount == null){ +// throw new RuntimeException(" Can find record with species " + species + " line " + 
currentReadCount ); +// } +// +// synchronized(this) { +// currentReadAligned ++; +// sCount.count ++; +// } +// }//while +// +// //final run +// //typer.simpleAnalysisCurrent(); +// +// typer.stopWaiting();//Tell typer to stop +// samIter.close(); +// samReader.close(); +// } + public static class RealtimeSpeciesTyper extends RealtimeAnalysis{ Rengine rengine; RealtimeSpeciesTyping typing; @@ -233,16 +376,24 @@ public void typing(String bamFile, int readNumber, int timeNumber) throws IOExce public RealtimeSpeciesTyper(RealtimeSpeciesTyping t, String output) throws IOException{ typing = t; //Set up Rengine + if (!Rengine.versionCheck()) { + LOG.error("** JRI R-Engine: Version mismatch - Java files don't match library version."); + System.exit(1); + } + //Rengine.DEBUG=1; rengine = new Rengine (new String [] {"--no-save"}, false, null); if (!rengine.waitForR()){ - Logging.exit("Cannot load R",1); + LOG.error("Cannot load R"); + System.exit(1); } rengine.eval("library(MultinomialCI)"); rengine.eval("alpha<-0.05"); - Logging.info("REngine ready"); + LOG.info("REngine ready"); countsOS = SequenceOutputStream.makeOutputStream(output); - countsOS.print("time\tstep\treads\tbases\tspecies\tprob\terr\ttAligned\tsAligned\n"); + if(!JSON) + countsOS.print("time\tstep\treads\tbases\tspecies\tprob\terr\ttAligned\tsAligned\n"); + } @@ -267,7 +418,7 @@ private void simpleAnalysisCurrent() throws IOException{ if (count[i] >= minCount){ countArray.add(count[i]); speciesArray.add(typing.speciesList.get(i)); - Logging.info(step+" : " + typing.speciesList.get(i) + " == " + count[i]); + LOG.info(step+" : " + typing.speciesList.get(i) + " == " + count[i]); } } //if (countArray.size() > 10) return; @@ -279,18 +430,42 @@ private void simpleAnalysisCurrent() throws IOException{ REXP tab = rengine.eval("tab",true); double [][] results = tab.asDoubleMatrix(); + + if(JSON) + countsOS.print("{\n\t\"timestamp\": \"" + timeNow + "\",\n\t\"data\": [\n"); + + boolean toPrintComma=false; for (int i 
= 0; i < results.length;i++){ if (results[i][0] <= 0.00001) continue; double mid = (results[i][0] + results[i][1])/2; double err = mid - results[i][0]; - countsOS.print(timeNow + "\t" + step + "\t" + lastReadNumber + "\t" + typing.currentBaseCount + "\t" + speciesArray.get(i).replaceAll("_"," ") + "\t" + mid +"\t" + err + "\t" + typing.currentReadAligned + "\t" + countArray.get(i)); + if(!JSON) + countsOS.print(timeNow + "\t" + step + "\t" + lastReadNumber + "\t" + typing.currentBaseCount + "\t" + speciesArray.get(i).replaceAll("_"," ") + "\t" + mid +"\t" + err + "\t" + typing.currentReadAligned + "\t" + countArray.get(i)); + else { + if (toPrintComma) + countsOS.print(","); + countsOS.print("\t\t{" + + "\n\t\t\t\"species\": \"" + speciesArray.get(i).replaceAll("_", " ") + "\"" + + ",\n\t\t\t\"step\": " + step + + ",\n\t\t\t\"reads\": " + lastReadNumber + + ",\n\t\t\t\"bases\": " + typing.currentBaseCount + + ",\n\t\t\t\"prob\": " + mid + + ",\n\t\t\t\"err\": " + err + + ",\n\t\t\t\"tAligned\": " + typing.currentReadAligned + + ",\n\t\t\t\"sAligned\": " + countArray.get(i) + + "\n\t\t}"); + } + toPrintComma=true; + countsOS.println(); } + if(JSON) + countsOS.print("\t]\n}\n"); countsOS.flush(); - Logging.info(step+" " + countArray.size()); + LOG.info(step+" " + countArray.size()); } protected void close(){ @@ -300,6 +475,23 @@ protected void close(){ }catch (Exception e){ e.printStackTrace(); } + + //print out + if(OUTSEQ){ + try (BufferedWriter bw = new BufferedWriter(new FileWriter("species2reads.map"))) { + for(String sp:typing.species2Seqs.keySet()){ + bw.write(">"+sp+"\n"); + ArrayList readList = typing.species2Seqs.get(sp); + for(String read:readList) + bw.write(read+"\n"); + } + + } catch (IOException e) { + + e.printStackTrace(); + + } + } } /* (non-Javadoc) @@ -310,7 +502,7 @@ protected void analysis(){ try{ simpleAnalysisCurrent(); }catch (IOException e){ - Logging.warn(e.getMessage()); + LOG.warn(e.getMessage()); } } diff --git 
a/src/main/java/japsa/bio/np/RealtimeStrainTyping.java b/src/main/java/japsa/bio/np/RealtimeStrainTyping.java index 1668ec2..8de1bbd 100644 --- a/src/main/java/japsa/bio/np/RealtimeStrainTyping.java +++ b/src/main/java/japsa/bio/np/RealtimeStrainTyping.java @@ -39,20 +39,19 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.HTSUtilities; -import japsa.util.Logging; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.File; -import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; -import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -64,6 +63,7 @@ * */ public class RealtimeStrainTyping { + private static final Logger LOG = LoggerFactory.getLogger(RealtimeStrainTyping.class); RealtimeStrainTyper typer; private double minQual = 0; private boolean twoDOnly = false; @@ -77,15 +77,6 @@ long currentBaseCount = 0; int currentReadAligned = 0; - /** - * - * @param minRead - * @param minTime: in seconds - * @param geneFile - * @param profileFile - * @param output - * @throws IOException - */ public RealtimeStrainTyping(int minRead, int minTime, String geneDB, String output) throws IOException{ typer = new RealtimeStrainTyper(this, geneDB, output); @@ -123,12 +114,11 @@ public void setTwoOnly(boolean twoOnly) { /** * @param bamFile - * @param geneFile * @throws IOException * @throws InterruptedException */ public void typing(String bamFile) throws IOException, InterruptedException{ - Logging.info("Strain typing ready at " + new Date()); + LOG.info("Strain typing ready at " + new Date()); alignmentMap = new HashMap> (); @@ -195,7 +185,7 @@ public void 
typing(String bamFile) throws IOException, InterruptedException{ Sequence readSeq = HTSUtilities.spanningSequence(record, readSequence, refLength, 0); if (readSeq == null){ - Logging.warn("Read sequence is NULL sequence "); + LOG.warn("Read sequence is NULL sequence "); }else{ alignmentList.add(readSeq); } @@ -216,12 +206,6 @@ public int compareTo(LCTypingResult o) { } } - /** - * @param file - * @param out - * @param profile - * @throws IOException - */ public static class GeneProfile implements Comparable{ String strainID; @@ -255,7 +239,7 @@ public String strainID(){ } } - private static double distance (HashSet s1,HashSet s2){ + protected static double distance (HashSet s1,HashSet s2){ int notIn = 0; int intersect = 0; @@ -345,7 +329,7 @@ private void readKnowProfiles(String profileFile) throws IOException{ out.close(); /*****************************************************************/ - Logging.info("There are " + myProfileList.size() +" strains"); + LOG.info("There are " + myProfileList.size() +" strains"); lcTyping = new PresenceAbsence(myProfileList); } @@ -375,7 +359,7 @@ protected int getCurrentRead() { } private ArrayList makePresenceTyping(int top) throws IOException, InterruptedException{ - Logging.info("Perform an analysis at " + new Date()); + LOG.info("Perform an analysis at " + new Date()); long step = (lastTime - startTime)/1000;//convert to second //int step = typing.currentReadCount; @@ -396,7 +380,7 @@ protected int getCurrentRead() { } } } - Logging.info(timeNow + ": Found " + seenGenes.size() + " " + compute); + LOG.info(timeNow + ": Found " + seenGenes.size() + " " + compute); if (compute){ posterior = lcTyping.calcPosterior(); @@ -424,7 +408,7 @@ protected int getCurrentRead() { } datOS.flush(); - Logging.info("End an analysis at " + new Date()); + LOG.info("End an analysis at " + new Date()); return lcT; } /** diff --git a/src/main/java/japsa/bio/np/SpeciesLikelihood.java b/src/main/java/japsa/bio/np/SpeciesLikelihood.java index 
664fb31..b645079 100755 --- a/src/main/java/japsa/bio/np/SpeciesLikelihood.java +++ b/src/main/java/japsa/bio/np/SpeciesLikelihood.java @@ -97,6 +97,7 @@ public SpeciesLikelihood(String spname, List l, SpeciesLikelihood bg) t while((st = br.readLine())!=null){ l.add(st); } + br.close(); return l.iterator(); } diff --git a/src/main/java/japsa/bio/np/barcode/BarCodeAnalysis.java b/src/main/java/japsa/bio/np/barcode/BarCodeAnalysis.java new file mode 100644 index 0000000..eb14aa6 --- /dev/null +++ b/src/main/java/japsa/bio/np/barcode/BarCodeAnalysis.java @@ -0,0 +1,308 @@ +package japsa.bio.np.barcode; + +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.FastqReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +import java.io.File; +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.Comparator; + + +public class BarCodeAnalysis { + private static final Logger LOG = LoggerFactory.getLogger(BarCodeAnalysis.class); + + int SCAN_WINDOW; + double DIST_THRES, + SCORE_THRES; + public static boolean print=false, //whether to print to files + script=false, //whether invoke downstream analysis + twoends=false; // both-ends-matching + ArrayList barCodesLeft = new ArrayList(); //barcode sequences from left end + ArrayList barCodesRight = new ArrayList(); //barcode from right end + Process[] processes; + int nSamples; + int barcodeLen; + SequenceOutputStream[] streamToScript, streamToFile; + + public BarCodeAnalysis(String barcodeFile, String scriptFile) throws IOException{ + if(scriptFile!=null) + script=true; + + ArrayList allSeq = SequenceReader.readAll(barcodeFile, Alphabet.DNA()); + if(twoends){ + allSeq.sort(Comparator.comparing(Sequence::getName)); + int i = 0; + while(i < allSeq.size()){ + barCodesLeft.add(allSeq.get(i++)); + barCodesRight.add(allSeq.get(i++)); + } + 
}else{ + barCodesLeft = allSeq; + for(Sequence seq:barCodesLeft) + barCodesRight.add(Alphabet.DNA.complement(seq)); + + } + + nSamples = barCodesLeft.size(); + + processes = new Process[nSamples]; + + if(script) + streamToScript = new SequenceOutputStream[nSamples]; + if(print) + streamToFile = new SequenceOutputStream[nSamples+1]; //unknown sequences included + + String id; + for(int i=0;i" + id + ":" + barCode); + if(script){ + ProcessBuilder pb = new ProcessBuilder(scriptFile, id) + .redirectError(new File("/dev/null")) + .redirectOutput(new File("/dev/null")); + //pb.directory(new File(System.getProperty("user.dir"))); + + processes[i] = pb.start(); + + LOG.info("Job for " + id + " started"); + streamToScript[i] = new SequenceOutputStream(processes[i].getOutputStream()); + } + barcodeLen += barCodesLeft.get(i).length(); + } + + barcodeLen /= nSamples; + +// if(print){ +// for(int i=0;i bestScore){ + //LOG.info("Better score=" + myScore); + distance = myScore-bestScore; + bestScore = myScore; + bestIndex = i; + if(twoends){ + if(lf[i] + rr[i] > lr[i] + rf[i]){ + bestLeftAlignment = alignmentLF; + bestRightAlignment = alignmentRR; + }else{ + bestLeftAlignment = alignmentLR; + bestRightAlignment = alignmentRF; + } + + }else{ + if(myScore==lf[i] || myScore==rr[i]){ + bestLeftAlignment = alignmentLF; + bestRightAlignment = alignmentRR; + }else if(myScore==lr[i] || myScore==rf[i]){ + bestLeftAlignment = alignmentLR; + bestRightAlignment = alignmentRF; + } + } + + } else if((bestScore-myScore) < distance){ + distance=bestScore-myScore; + } + + } + + + String retval=""; + DecimalFormat twoDForm = new DecimalFormat("#.##"); + if(bestScore < SCORE_THRES || distance < DIST_THRES ){ + //LOG.info("Unknown sequence " + seq.getName()); + retval = "unknown:"+Double.valueOf(twoDForm.format(bestScore))+":"+Double.valueOf(twoDForm.format(distance))+"|0-0:0-0|"; + seq.setName(retval + seq.getName()); + + if(print){ + seq.print(streamToFile[nSamples]); + } + } + //if the best (sum 
of both ends) alignment in template sequence is greater than in complement + else { +// LOG.info("Sequence " + seq.getName() + " might belongs to sample " + barCodesLeft.get(bestIndex).getName() + " with score=" + bestScore); + if(bestIndex=start2){ + seq1=origSeq1.substring(0, start1) + alnSeq1 + origSeq1.substring(start1+alnSeq1.length()-gap1); + String seq2Filler = start1==start2?"":String.format("%"+(start1-start2)+"s", ""), + markFiller = start1==0?"":String.format("%"+start1+"s", ""); + seq2= seq2Filler + origSeq2.substring(0, start2) + alnSeq2 + origSeq2.substring(start2+alnSeq2.length()-gap2); + mark= markFiller+String.valueOf(alignment.getMarkupLine()); + }else{ + seq2=origSeq2.substring(0, start2) + alnSeq2 + origSeq2.substring(start2+alnSeq2.length()-gap2); + String markFiller = start2==0?"":String.format("%"+start2+"s", ""); + seq1=String.format("%"+(start2-start1)+"s", "") + origSeq1.substring(0, start1) + alnSeq1 + origSeq1.substring(start1+alnSeq1.length()-gap1); + mark=markFiller+String.valueOf(alignment.getMarkupLine()); + } + System.out.println(alignment.getSummary()); + System.out.println(seq1); + System.out.println(mark); + System.out.println(seq2); + } + +} diff --git a/src/main/java/japsa/bio/np/barcode/SWGAlignment.java b/src/main/java/japsa/bio/np/barcode/SWGAlignment.java new file mode 100644 index 0000000..3487d93 --- /dev/null +++ b/src/main/java/japsa/bio/np/barcode/SWGAlignment.java @@ -0,0 +1,875 @@ +package japsa.bio.np.barcode; +/** + * Implement based on jaligner from Ahmed Moustafa + * See license below + */ +import java.text.DecimalFormat; +import japsa.seq.Sequence; + +public class SWGAlignment { + /** + * Gap character + */ + static final char GAP = '-'; + + /** + * Traceback direction stop + */ + static final byte TRACEBACK_STOP = 0; + /** + * Traceback direction left + */ + static final byte TRACEBACK_LEFT = 1; + /** + * Traceback direction diagonal + */ + static final byte TRACEBACK_DIAGONAL = 2; + /** + * Traceback direction 
up + */ + static final byte TRACEBACK_UP = 3; + + /** + * Markup line identity character + */ + static final char MARKUP_IDENTITY = '|'; + + /** + * Markup line similarity character + */ + static final char MARKUP_SIMILARITY = ':'; + + /** + * Markup line gap character + */ + static final char MARKUP_GAP = ' '; + + /** + * Markup line mismatch character + */ + static final char MARKUP_MISMATCH = '.'; + + /** + * Default name for sequence #1 + */ + private static final String SEQUENCE1 = "jaligner_1"; + + /** + * Default name for sequence #2 + */ + private static final String SEQUENCE2 = "jaligner_2"; + + /** + * Scoring matrix + */ + +// //poreFUME's scores +// static float [][] matrix = { +// { 2.7f, -4.5f, -4.5f, -4.5f}, +// { -4.5f, 2.7f, -4.5f, -4.5f}, +// { -4.5f, -4.5f, 2.7f, -4.5f}, +// { -4.5f, -4.5f, -4.5f, 2.7f} +// }; +// /** +// * Gap open cost +// */ +// static float open=4.7f; +// +// /** +// * Gap extend cost +// */ +// static float extend=1.6f; + + //HOXD70 scoring scheme (F Chiaromonte, VB Yap, W Miller, PSB 2002:115-126) + static float [][] matrix = { + { 91f, -114f, -31f, -123f}, + { -114f, 100f, -125f, -31f}, + { -31f, -125f, 100f, -114f}, + { -123f, -31f, -114f, 91f} + }; + /** + * Gap open cost + */ + static float open=400f; + + /** + * Gap extend cost + */ + static float extend=30f; + + + + /** + * Alignment score + */ + private float score; + + /** + * Aligned sequence #1 + */ + private char[] sequence1; + + /** + * Name of sequence #1 + */ + private String name1; + + /** + * Alignment start location in sequence #1 + */ + private int start1; + + /** + * Aligned sequence #2 + */ + private char[] sequence2; + + /** + * Name of sequence #2 + */ + private String name2; + + /** + * Alignment start location in sequence #2 + */ + private int start2; + + /** + * Markup line + */ + private char[] markupLine; + + /** + * Count of identical locations + */ + private int identity; + + /** + * Count of similar locations + */ + private int similarity; + + 
/** + * Count of gap locations + */ + private int gaps; + + + + private Sequence originalSequence1; + + private Sequence originalSequence2; + + /** + * Constructor for Alignment + */ + + public SWGAlignment() { + super(); + } + + /** + * @return Returns the extend. + */ + public float getExtend() { + return extend; + } + + /** + * @param extend + * The extend to set. + */ + public void setExtend(float extend) { + SWGAlignment.extend = extend; + } + + /** + * @return Returns the name1. + */ + public String getName1() { + return name1 == null || name1.trim().length() == 0 ? SEQUENCE1 : name1; + } + + /** + * @param name1 + * The name1 to set. + */ + public void setName1(String name1) { + this.name1 = name1; + } + + /** + * @return Returns the name2. + */ + public String getName2() { + return name2 == null || name2.trim().length() == 0 ? SEQUENCE2 : name2; + } + + /** + * @param name2 + * The name2 to set. + */ + public void setName2(String name2) { + this.name2 = name2; + } + + /** + * @return Returns the open. + */ + public float getOpen() { + return open; + } + + /** + * @param open + * The open to set. + */ + public void setOpen(float open) { + SWGAlignment.open = open; + } + + /** + * @return Returns the score. + */ + public float getScore() { + return score; + } + + /** + * @param score + * The score to set. + */ + public void setScore(float score) { + this.score = score; + } + + /** + * Returns the length of the alignment + * + * @return Alignment length + */ + public int getLength() { + return this.sequence1.length; + } + + /** + * @return Returns the sequence1. + */ + public char[] getSequence1() { + return sequence1; + } + + /** + * @param sequence1 + * The sequence1 to set. + */ + public void setSequence1(char[] sequence1) { + this.sequence1 = sequence1; + } + + /** + * @return Returns the sequence2. + */ + public char[] getSequence2() { + return sequence2; + } + + /** + * @param sequence2 + * The sequence2 to set. 
+ */ + public void setSequence2(char[] sequence2) { + this.sequence2 = sequence2; + } + + /** + * @return Returns the start1. + */ + public int getStart1() { + return start1; + } + + /** + * @param start1 + * The start1 to set. + */ + public void setStart1(int start1) { + this.start1 = start1; + } + + /** + * @return Returns the start2. + */ + public int getStart2() { + return start2; + } + + /** + * @param start2 + * The start2 to set. + */ + public void setStart2(int start2) { + this.start2 = start2; + } + + /** + * @return Returns the gaps. + */ + public int getGaps() { + return gaps; + } + + /** + * @param gaps + * The gaps to set. + */ + public void setGaps(int gaps) { + this.gaps = gaps; + } + + /** + * @return Returns the identity. + */ + public int getIdentity() { + return identity; + } + + /** + * @param identity + * The identity to set. + */ + public void setIdentity(int identity) { + this.identity = identity; + } + + /** + * @return Returns the markupLine. + */ + public char[] getMarkupLine() { + return markupLine; + } + + /** + * @param markupLine + * The markupLine to set. + */ + public void setMarkupLine(char[] markupLine) { + this.markupLine = markupLine; + } + + /** + * @return Returns the similarity. + */ + public int getSimilarity() { + return similarity; + } + + /** + * @param similarity + * The similarity to set. 
+ */ + public void setSimilarity(int similarity) { + this.similarity = similarity; + } + + /** + * Returns a summary for alignment + * + * @return {@link String} alignment summary + */ + public String getSummary() { + StringBuffer buffer = new StringBuffer(); + DecimalFormat f1 = new DecimalFormat("0.00"); + DecimalFormat f2 = new DecimalFormat("0.00%"); + + int length = getSequence1().length; + + buffer.append("Sequence #1: " + getName1()); + buffer.append("\r\n"); + buffer.append("Sequence #2: " + getName2()); + buffer.append("\r\n"); + buffer.append("Length #1: " + getOriginalSequence1().length()); + buffer.append("\r\n"); + buffer.append("Length #2: " + getOriginalSequence2().length()); + buffer.append("\r\n"); + + buffer.append("\r\n"); + buffer.append("Gap open: " + open); + buffer.append("\r\n"); + buffer.append("Gap extend: " + extend); + buffer.append("\r\n"); + buffer.append("Length: " + length); + buffer.append("\r\n"); + buffer.append("Identity: " + identity + "/" + length + " (" + + f2.format(identity / (float) length) + ")"); + buffer.append("\r\n"); + buffer.append("Similarity: " + similarity + "/" + length + " (" + + f2.format(similarity / (float) length) + ")"); + buffer.append("\r\n"); + buffer.append("Gaps: " + gaps + "/" + length + " (" + + f2.format(gaps / (float) length) + ")"); + buffer.append("\r\n"); + buffer.append("Score: " + f1.format(score)); + buffer.append("\r\n"); + + return buffer.toString(); + } + + + /** + * Returns original {@link Sequence} #1 + * + * @return original {@link Sequence} #1 + */ + public Sequence getOriginalSequence1() { + return originalSequence1; + } + + /** + * + * @param originalSequence1 + */ + public void setOriginalSequence1(Sequence originalSequence1) { + this.originalSequence1 = originalSequence1; + } + + /** + * Returns original {@link Sequence} #2 + * + * @return original {@link Sequence} #2 + */ + public Sequence getOriginalSequence2() { + return originalSequence2; + } + + /** + * + * @param 
originalSequence2 + */ + public void setOriginalSequence2(Sequence originalSequence2) { + this.originalSequence2 = originalSequence2; + } + + /** + * Returns the number of gaps of the aligned sequence #1 + * + * @return the number of gaps of the aligned sequence #1 + */ + public int getGaps1() { + int count = 0; + for (int i = 0, n = sequence1.length; i < n; i++) { + if (sequence1[i] == GAP) { + count++; + } + } + return count; + } + + /** + * Returns the number of gaps of the aligned sequence #2 + * + * @return the number of gaps of the aligned sequence #2 + */ + public int getGaps2() { + int count = 0; + for (int i = 0, n = sequence2.length; i < n; i++) { + if (sequence2[i] == GAP) { + count++; + } + } + return count; + } + + /**************************************************************************************** + **************************************************************************************** + ***************************Static functions for the algorithm ************************** + **************************************************************************************** + ****************************************************************************************/ + + /** + * Aligns two sequences by Smith-Waterman (local) + * + * @param s1 + * sequene #1 ({@link Sequence}) + * @param s2 + * sequene #2 ({@link Sequence}) + * @param matrix + * scoring matrix ({@link Matrix}) + * @param o + * open gap penalty + * @param e + * extend gap penalty + * @return alignment object contains the two aligned sequences, the + * alignment score and alignment statistics + * @see Sequence + * @see Matrix + */ + public static SWGAlignment align(Sequence s1, Sequence s2) { + + int m = s1.length() + 1; + int n = s2.length() + 1; + + byte[] pointers = new byte[m * n]; + + // Initializes the boundaries of the traceback matrix to STOP. 
+ for (int i = 0, k = 0; i < m; i++, k += n) { + pointers[k] = TRACEBACK_STOP; + } + for (int j = 1; j < n; j++) { + pointers[j] = TRACEBACK_STOP; + } + + short[] sizesOfVerticalGaps = new short[m * n]; + short[] sizesOfHorizontalGaps = new short[m * n]; + for (int i = 0, k = 0; i < m; i++, k += n) { + for (int j = 0; j < n; j++) { + sizesOfVerticalGaps[k + j] = sizesOfHorizontalGaps[k + j] = 1; + } + } + + Cell cell = construct(s1, s2, pointers, sizesOfVerticalGaps, sizesOfHorizontalGaps); + SWGAlignment alignment = traceback(s1, s2, pointers, cell, sizesOfVerticalGaps, sizesOfHorizontalGaps); + alignment.setOriginalSequence1(s1); + alignment.setOriginalSequence2(s2); + alignment.setOpen(open); + alignment.setExtend(extend); + if (s1.getName() != null) { + alignment.setName1(s1.getName()); + } + if (s2.getName() != null) { + alignment.setName2(s2.getName()); + } + return alignment; + } + + /** + * Constructs directions matrix for the traceback + * + * @param s1 + * sequence #1 + * @param s2 + * sequence #2 + * @param matrix + * scoring matrix + * @param o + * open gap penalty + * @param e + * extend gap penalty + * @return The cell where the traceback starts. 
+ */ + private static Cell construct(Sequence s1, Sequence s2, byte[] pointers, short[] sizesOfVerticalGaps, + short[] sizesOfHorizontalGaps) { + + char[] a1 = s1.charSequence(); + char[] a2 = s2.charSequence(); + + int m = s1.length() + 1; + int n = s2.length() + 1; + + float f; // score of alignment x1...xi to y1...yi if xi aligns to yi + float[] g = new float[n]; // score if xi aligns to a gap after yi + float h; // score if yi aligns to a gap after xi + float[] v = new float[n]; // best score of alignment x1...xi to + // y1...yi + float vDiagonal; + + g[0] = Float.NEGATIVE_INFINITY; + h = Float.NEGATIVE_INFINITY; + v[0] = 0; + + for (int j = 1; j < n; j++) { + g[j] = Float.NEGATIVE_INFINITY; + v[j] = 0; + } + + float similarityScore, g1, g2, h1, h2; + + Cell cell = new Cell(); + + for (int i = 1, k = n; i < m; i++, k += n) { + h = Float.NEGATIVE_INFINITY; + vDiagonal = v[0]; + for (int j = 1, l = k + 1; j < n; j++, l++) { +// similarityScore = matrix[a1[i - 1]][a2[j - 1]]; + similarityScore = matrix[s1.getBase(i-1)][s2.getBase(j-1)]; + + // Fill the matrices + f = vDiagonal + similarityScore; + + g1 = g[j] - extend; + g2 = v[j] - open; + if (g1 > g2) { + g[j] = g1; + sizesOfVerticalGaps[l] = (short) (sizesOfVerticalGaps[l - n] + 1); + } else { + g[j] = g2; + } + + h1 = h - extend; + h2 = v[j - 1] - open; + if (h1 > h2) { + h = h1; + sizesOfHorizontalGaps[l] = (short) (sizesOfHorizontalGaps[l - 1] + 1); + } else { + h = h2; + } + + vDiagonal = v[j]; + v[j] = maximum(f, g[j], h, 0); + + // Determine the traceback direction + if (v[j] == 0) { + pointers[l] = TRACEBACK_STOP; + } else if (v[j] == f) { + pointers[l] = TRACEBACK_DIAGONAL; + } else if (v[j] == g[j]) { + pointers[l] = TRACEBACK_UP; + } else { + pointers[l] = TRACEBACK_LEFT; + } + + // Set the traceback start at the current cell i, j and score + if (v[j] > cell.getScore()) { + cell.set(i, j, v[j]); + } + } + } + + return cell; + } + + /** + * Returns the alignment of two sequences based on the passed 
array of + * pointers + * + * @param s1 + * sequence #1 + * @param s2 + * sequence #2 + * @param cell + * The cell where the traceback starts. + * @return {@link Alignment}with the two aligned sequences and alignment + * score. + * @see Cell + * @see Alignment + */ + private static SWGAlignment traceback(Sequence s1, Sequence s2, byte[] pointers, Cell cell, short[] sizesOfVerticalGaps, + short[] sizesOfHorizontalGaps) { + + char[] a1 = s1.charSequence(); + char[] a2 = s2.charSequence(); + + int n = s2.length() + 1; + + SWGAlignment alignment = new SWGAlignment(); + alignment.setScore(cell.getScore()); + + int maxlen = s1.length() + s2.length(); // maximum length after the + // aligned sequences + + char[] reversed1 = new char[maxlen]; // reversed sequence #1 + char[] reversed2 = new char[maxlen]; // reversed sequence #2 + char[] reversed3 = new char[maxlen]; // reversed markup + + int len1 = 0; // length of sequence #1 after alignment + int len2 = 0; // length of sequence #2 after alignment + int len3 = 0; // length of the markup line + + int identity = 0; // count of identitcal pairs + int similarity = 0; // count of similar pairs + int gaps = 0; // count of gaps + + char c1, c2; + + int i = cell.getRow(); // traceback start row + int j = cell.getCol(); // traceback start col + int k = i * n; + + boolean stillGoing = true; // traceback flag: true -> continue & false + // -> stop + + while (stillGoing) { + switch (pointers[k + j]) { + case TRACEBACK_UP: + for (int l = 0, len = sizesOfVerticalGaps[k + j]; l < len; l++) { + reversed1[len1++] = a1[--i]; + reversed2[len2++] = GAP; + reversed3[len3++] = MARKUP_GAP; + k -= n; + gaps++; + } + break; + case TRACEBACK_DIAGONAL: + c1 = a1[--i]; + c2 = a2[--j]; + k -= n; + reversed1[len1++] = c1; + reversed2[len2++] = c2; + if (c1 == c2) { + reversed3[len3++] = MARKUP_IDENTITY; + identity++; + similarity++; +// } else if (matrix[c1][c2] > 0) { + } else if (matrix[s1.getBase(i)][s2.getBase(j)] > 0) { + reversed3[len3++] = 
MARKUP_SIMILARITY; + similarity++; + } else { + reversed3[len3++] = MARKUP_MISMATCH; + } + break; + case TRACEBACK_LEFT: + for (int l = 0, len = sizesOfHorizontalGaps[k + j]; l < len; l++) { + reversed1[len1++] = SWGAlignment.GAP; + reversed2[len2++] = a2[--j]; + reversed3[len3++] = MARKUP_GAP; + gaps++; + } + break; + case TRACEBACK_STOP: + stillGoing = false; + } + } + + alignment.setSequence1(reverse(reversed1, len1)); + alignment.setStart1(i); + alignment.setSequence2(reverse(reversed2, len2)); + alignment.setStart2(j); + alignment.setMarkupLine(reverse(reversed3, len3)); + alignment.setIdentity(identity); + alignment.setGaps(gaps); + alignment.setSimilarity(similarity); + + return alignment; + } + + /** + * Returns the maximum of 4 float numbers. + * + * @param a + * float #1 + * @param b + * float #2 + * @param c + * float #3 + * @param d + * float #4 + * @return The maximum of a, b, c and d. + */ + private static float maximum(float a, float b, float c, float d) { + if (a > b) { + if (a > c) { + return a > d ? a : d; + } else { + return c > d ? c : d; + } + } else if (b > c) { + return b > d ? b : d; + } else { + return c > d ? c : d; + } + } + + /** + * Reverses an array of chars + * + * @param a + * @param len + * @return the input array of char reserved + */ + private static char[] reverse(char[] a, int len) { + char[] b = new char[len]; + for (int i = len - 1, j = 0; i >= 0; i--, j++) { + b[j] = a[i]; + } + return b; + } + + + static class Cell { + /** + * Row of the cell + */ + private int row; + /** + * Column of the cell + */ + private int col; + /** + * Alignment score at this cell + */ + private float score; + + /** + * Constructor + */ + public Cell() { + super(); + this.row = 0; + this.col = 0; + this.score = Float.NEGATIVE_INFINITY; + } + /** + * @return Returns the col. + */ + public int getCol() { + return this.col; + } + /** + * @param col The col to set. 
+ */ + public void setCol(int col) { + this.col = col; + } + /** + * @return Returns the row. + */ + public int getRow() { + return this.row; + } + /** + * @param row The row to set. + */ + public void setRow(int row) { + this.row = row; + } + /** + * @return Returns the score. + */ + public float getScore() { + return this.score; + } + /** + * @param score The score to set. + */ + public void setScore(float score) { + this.score = score; + } + + /** + * Sets the row, column and score of the cell. + * @param row The row to set. + * @param col The col to set. + * @param score The score to set. + */ + public void set(int row, int col, float score) { + this.row = row; + this.col = col; + this.score = score; + } + } +} +/** + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ \ No newline at end of file diff --git a/src/main/java/japsa/bio/phylo/PhylogenyTree.java b/src/main/java/japsa/bio/phylo/PhylogenyTree.java index 8c2dd5d..60ab697 100755 --- a/src/main/java/japsa/bio/phylo/PhylogenyTree.java +++ b/src/main/java/japsa/bio/phylo/PhylogenyTree.java @@ -70,17 +70,17 @@ public PhylogenyTree(PhylogenyTree parent) { distance = DEFAULT_LENGTH; height = 0; } - + // Create an internal tree public PhylogenyTree(PhylogenyTree parent, PhylogenyTree left, - PhylogenyTree right) { + PhylogenyTree right) { this(parent); children = new PhylogenyTree[2]; setChild(left, 0); setChild(right, 1); } - + /** * Create a new tree who where the the root is moved to the parent of the * node A new node to be created (the root) and returned @@ -96,7 +96,7 @@ public PhylogenyTree moveRootTo(PhylogenyTree node) { return this;// Need not do anything PhylogenyTree newNode = oldParen.addChild(new PhylogenyTree(null), node - .getIndex(), node.distance / 2); + .getIndex(), node.distance / 2); newNode.getChild(1).setName("Tmp"); newNode.modifiedRoot(1); @@ -301,8 +301,8 @@ int latexDrawTree(PrintStream out, boolean len) { int x = EXTRA_WIDTH; INDEX++; out.println("\\node[exnode,label=right:\\emph{" - + formatName(this.name) + "}] at (" + x + "," + CRT_LATEXY - + ") (node" + INDEX + ") {};"); + + formatName(this.name) + "}] at (" + x + "," + CRT_LATEXY + + ") (node" + INDEX + ") {};"); // draw the name of the node int y = CRT_LATEXY; CRT_LATEXY -= LATEXDIS; @@ -318,18 +318,18 @@ int latexDrawTree(PrintStream out, boolean len) { int y = (y0 + y1) / 2; int x = level() * 5; out.println("\\node[innode] at (" + x + "," + y + ") (node" - + INDEX + ") {};"); + + INDEX + ") {};"); // out.println(" ;"); if (len) { out.println("\\node at (" + (x + 2.0) + "," + (y0 + 1.2) - + ") {" + nf.format(children[0].distance) + "};"); + + ") {" + nf.format(children[0].distance) + "};"); out.println("\\node at (" + (x + 2.0) + "," + (y1 + 1.2) - + ") {" + nf.format(children[1].distance) 
+ "};"); + + ") {" + nf.format(children[1].distance) + "};"); } out.println("\\path[edge] (node" + INDEX + ") -- (" + x + "," + y0 - + ") -- (node" + ind0 + "); "); + + ") -- (node" + ind0 + "); "); out.println("\\path[edge] (node" + INDEX + ") -- (" + x + "," + y1 - + ") -- (node" + ind1 + "); "); + + ") -- (node" + ind1 + "); "); return y; } @@ -377,7 +377,7 @@ public static PhylogenyTree parseTree(String treeStr){ break; if (" ".equals(currentToken) || "\t".equals(currentToken) || "\n".equals(currentToken) - || ",".equals(currentToken)) + || ",".equals(currentToken)) continue; if ("(".equals(currentToken)) {// Start a new tree @@ -420,7 +420,7 @@ public PhylogenyTree addChild(PhylogenyTree aTree, int index, double dis) { children[index].distance -= dis; PhylogenyTree newChild = new PhylogenyTree(this, this.getChild(index), - aTree); + aTree); newChild.setDistance(dis); aTree.distance = newChild.height = this.height - dis; @@ -438,11 +438,19 @@ public PhylogenyTree removeGrandChild(int childIndex, int grandChildIndex) { PhylogenyTree child = children[childIndex]; PhylogenyTree theGrandChild = child.children[grandChildIndex]; - this.setChild(child.children[1 - grandChildIndex], childIndex); + PhylogenyTree newChild = child.children[1 - grandChildIndex]; + double newDistance = child.distance + newChild.distance; + + + this.setChild(newChild, childIndex); // Restore the distance - this.getChild(childIndex).distance = this.height - - this.getChild(childIndex).height; + + newChild.distance = newDistance; + //this.height - this.getChild(childIndex).height; + //if (newChild.isLeaf() && newChild.name.endsWith("_54")){ + // System.out.println("==== " + newChild.name + " = " + newChild.distance); + //} return theGrandChild; } @@ -510,7 +518,7 @@ public PhylogenyTree commonAncestor(PhylogenyTree aTree) { int indA = routA.length(), indB = routB.length(); while (indA >= 0 && indB >= 0 - && routA.charAt(indA) == routB.charAt(indB)) { + && routA.charAt(indA) == 
routB.charAt(indB)) { int ind = Integer.parseInt(routA.charAt(indA) + ""); treePtr = treePtr.getChild(ind); @@ -548,7 +556,7 @@ public double distanceTo(PhylogenyTree aTree) { int indA = routA.length() - 1, indB = routB.length() - 1; while (indA >= 0 && indB >= 0 - && routA.charAt(indA) == routB.charAt(indB)) { + && routA.charAt(indA) == routB.charAt(indB)) { int ind = Integer.parseInt(routA.charAt(indA) + ""); treePtr = treePtr.getChild(ind); @@ -575,7 +583,7 @@ public double distanceTo(PhylogenyTree aTree) { //this.addLeafTrees(v); return getLeaves().iterator(); } - + public ArrayList getLeaves() { ArrayList v = new ArrayList(); this.addLeafTrees(v); @@ -605,7 +613,7 @@ public String toString() { return "" + name; } String tree = "(" + children[0] + ":" + children[0].getDistance() + "," - + children[1] + ":" + children[1].getDistance() + ")"; + + children[1] + ":" + children[1].getDistance() + ")"; // if (parent == null) return tree + ";";//If the root // else return tree; @@ -632,7 +640,7 @@ public double sumHops() { return 0; else return this.getChild(0).sumHops() + this.getChild(0).distance - + this.getChild(1).sumHops() + this.getChild(1).distance; + + this.getChild(1).sumHops() + this.getChild(1).distance; } public int numHops() { @@ -640,7 +648,7 @@ public int numHops() { return 0; else return this.getChild(0).numHops() + 1 - + this.getChild(1).numHops() + 1; + + this.getChild(1).numHops() + 1; } /************************* Get and Set ********************************/ @@ -693,7 +701,7 @@ public void setName(String name) { } public static void writeNexus(Sequence[] seqs, PrintStream ps) - throws IOException { + throws IOException { int length = seqs[0].length(); int charPerLine = 60;// @@ -701,7 +709,7 @@ public static void writeNexus(Sequence[] seqs, PrintStream ps) //PrintStream ps = new PrintStream(new FileOutputStream(fileName)); ps.println("#NEXUS\nBEGIN DATA;"); ps.println(" Dimensions NTax=" + seqs.length + " NChar=" - + length + ";"); + + length + 
";"); ps .println(" Format DataType=DNA Interleave=yes Gap=- Missing=?;"); ps.println(" Matrix"); @@ -710,7 +718,7 @@ public static void writeNexus(Sequence[] seqs, PrintStream ps) while (true) { for (int i = 0; i < seqs.length; i++) { ps.printf("%s ", (seqs[i].getName() + " ") - .substring(0, 10)); + .substring(0, 10)); for (int x = count; x < count + charPerLine && x < length; x++) { if (x % 10 == 0 && x > count) @@ -733,7 +741,7 @@ public static void writeNexus(Sequence[] seqs, PrintStream ps) } public static void writePhylip(Sequence[] seqs, PrintStream ps) - throws IOException { + throws IOException { //PrintStream ps = new PrintStream(new FileOutputStream(fileName)); int length = seqs[0].length(); @@ -746,7 +754,7 @@ public static void writePhylip(Sequence[] seqs, PrintStream ps) for (int i = 0; i < seqs.length; i++) { if (count == 0) { ps.printf("%s ", (seqs[i].getName() + " ") - .substring(0, 10)); + .substring(0, 10)); } for (int x = count; x < count + charPerLine && x < length; x++) { diff --git a/src/main/java/japsa/bio/sim/IlluminaSequencing.java b/src/main/java/japsa/bio/sim/IlluminaSequencing.java new file mode 100644 index 0000000..d29b8c2 --- /dev/null +++ b/src/main/java/japsa/bio/sim/IlluminaSequencing.java @@ -0,0 +1,121 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/08/2016 - Minh Duc Cao: Created + ****************************************************************************/ +package japsa.bio.sim; + +import java.io.IOException; +import java.util.Random; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import japsa.seq.SequenceOutputStream; + +/** + * Class represent Illumina sequencing + * + * @author minhduc + *TODO: make full class + */ +public class IlluminaSequencing{ + + /** + * Simulate MiSeq + * @param fragment + * @param o1 + * @param o2 + * @param rnd + * @throws IOException + */ + public static void simulatePaired(Sequence fragment, SequenceOutputStream o1 , SequenceOutputStream o2, Random rnd) throws IOException{ + + double snp = 0.01; + double del = 0.0001; + double ins = 0.0001; + double ext = 0.2; + + //double r = + int len = Math.min(250, fragment.length()); + String name = fragment.getName(); + + SequenceBuilder read1 = SequencingSimulation.simulateRead(fragment, len, snp, del, ins, ext, rnd); + SequenceBuilder read2 = SequencingSimulation.simulateRead(Alphabet.DNA.complement(fragment), len, snp, del, ins, ext, rnd); + + if (rnd.nextBoolean()){ + o1.print("@"); + o1.print(name); + o1.print("\n"); + for (int i = 0; i < read1.length();i++) + o1.print(read1.charAt(i)); + o1.print("\n+\n"); + + for (int i = 0; i < read1.length();i++) + o1.print("I"); + o1.print("\n"); + + o2.print("@"); + o2.print(name); + o2.print("\n"); + for (int i = 0; i < read2.length();i++) + o2.print(read2.charAt(i)); + o2.print("\n+\n"); + + for (int i = 0; i < read2.length();i++) + o2.print("I"); + o2.print("\n"); + }else{ + o2.print("@"); + o2.print(name); + o2.print("\n"); + for (int i = 0; i < read1.length();i++) + o2.print(read1.charAt(i)); + o2.print("\n+\n"); + + for (int i = 0; i < read1.length();i++) + o2.print("I"); + o2.print("\n"); + + o1.print("@"); + o1.print(name); + 
o1.print("\n"); + for (int i = 0; i < read2.length();i++) + o1.print(read2.charAt(i)); + o1.print("\n+\n"); + + for (int i = 0; i < read2.length();i++) + o1.print("I"); + o1.print("\n"); + } + } + + +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/sim/PacBioSequencing.java b/src/main/java/japsa/bio/sim/PacBioSequencing.java new file mode 100644 index 0000000..3f35d16 --- /dev/null +++ b/src/main/java/japsa/bio/sim/PacBioSequencing.java @@ -0,0 +1,91 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 28/08/2016 - Minh Duc Cao: Created + ****************************************************************************/ +package japsa.bio.sim; + +import java.io.IOException; +import java.util.Random; + +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import japsa.seq.SequenceOutputStream; + +/** + * Implement PacBio sequencing + * @author minhduc + * + */ +public class PacBioSequencing{ + static int PACBIO_ADAPTER_LENGTH = 46; + static int READ_ID = 0; + + + public static void simulatePacBio(Sequence fragment, int readLen, SequenceOutputStream o, Random rnd) throws IOException{ + double snp = 0.01; + double ins = 0.1; + double del = 0.04; + double ext = 0.4; + + READ_ID ++; + + //int len = (int) (fragment.length() * .9); + String name = fragment.getName(); + Sequence compStrand = Alphabet.DNA.complement(fragment); + int start = 0; + + boolean template = true; + while (readLen > 0){ + int len = readLen; + Sequence myStrand = template?fragment:compStrand; + + if (len > fragment.length()){ + len = fragment.length(); + } + SequenceBuilder read = SequencingSimulation.simulateRead(myStrand, len, snp, del, ins, ext, rnd); + o.print("@"); + o.print(name); + o.print("/" + READ_ID + "/" + start + "_" + (start + read.length()) + "\n"); + for (int i = 0; i < read.length();i++) + 
o.print(read.charAt(i)); + o.print("\n+\n"); + for (int i = 0; i < read.length();i++) + o.print("E"); + o.print("\n"); + + start += read.length() + PACBIO_ADAPTER_LENGTH; + readLen -= (read.length() + PACBIO_ADAPTER_LENGTH); + + template = !template; + } + } +} \ No newline at end of file diff --git a/src/main/java/japsa/bio/sim/SequencingSimulation.java b/src/main/java/japsa/bio/sim/SequencingSimulation.java new file mode 100644 index 0000000..bca1ff3 --- /dev/null +++ b/src/main/java/japsa/bio/sim/SequencingSimulation.java @@ -0,0 +1,103 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 28/08/2016 - Minh Duc Cao: Created + ****************************************************************************/ +package japsa.bio.sim; + +import java.util.Random; + +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; + +/** + * @author minhduc + * + */ +public class SequencingSimulation { + /** + * Simulate a read from the start of a fragment + * @param fragment + * @param len + * @param snp + * @param indel + * @param ext + * @param rnd + * @return + */ + public static SequenceBuilder simulateRead(Sequence fragment, int len, double snp, double del, double ins, double ext, Random rnd){ + //accumulative prob + double aSNP = snp; + double aDel = aSNP + del; + double aIns = aDel + ins; + + //why - 10?/ + len = Math.min(len, fragment.length() - 10); + + //Sequence read = new Sequence(fragment.alphabet(), len); + SequenceBuilder sb = new SequenceBuilder(fragment.alphabet(), len); + + //int mIndex = 0; + int fIndex = 0; + //mIndex < len && + for (; sb.length() < len && fIndex < fragment.length();){ + byte base = fragment.getBase(fIndex); + double r = rnd.nextDouble(); + if (r < aSNP){ + //simulate a SNP aka mismatch + sb.append((byte) ((base + rnd.nextInt(3)) % 4)); + //read.setBase(mIndex, (byte) ((base + rnd.nextInt(3)) % 4)); + //mIndex ++; + fIndex ++; + }else if (r < aDel){ + do{ + fIndex ++; + 
}while (rnd.nextDouble() < ext); + }else if (r < aIns){ + //insertion + do{ + sb.append((byte) (rnd.nextInt(4))); + //mIndex ++; + }while (rnd.nextDouble() < ext); + }else{ + sb.append(base); + //read.setBase(mIndex, base); + //mIndex ++; + fIndex ++; + }//else + }//for + //for (;mIndex < len;mIndex ++){ + // //pad in random to fill in + // read.setBase(mIndex, (byte) rnd.nextInt(4)); + //} + return sb; + } + +} diff --git a/src/main/java/japsa/bio/tr/TandemRepeat.java b/src/main/java/japsa/bio/tr/TandemRepeat.java index 6eacf92..6a4171b 100755 --- a/src/main/java/japsa/bio/tr/TandemRepeat.java +++ b/src/main/java/japsa/bio/tr/TandemRepeat.java @@ -76,8 +76,8 @@ /** * This field is deprecated, changes to chrom */ - @Deprecated - public static String chrHd = "chr"; //01 + //@Deprecated + //public static String chrHd = "chr"; //01 //Property of a TR -- start, end are with regards to the reference genome private int period = 2; private double unitNo; //the length number of units on the ref @@ -117,7 +117,6 @@ public TandemRepeat(String chr, int start, int end) { /** * Get short tandem repeat from a biofeature - * @param id * @param f */ public TandemRepeat(JapsaFeature f) { @@ -195,8 +194,8 @@ else if (TandemRepeat.pIndelHd.equals(fieldStr)) else if (TandemRepeat.annotationHd.equals(fieldStr)) rec.annotations = reader.getField(i); //TODO: To be removed - else if (TandemRepeat.chrHd.equals(fieldStr)) - rec.setChr(reader.getField(i)); + //else if (TandemRepeat.chrHd.equals(fieldStr)) + // rec.setChr(reader.getField(i)); } return rec; } @@ -214,8 +213,8 @@ public static TandemRepeat read(String line, String [] hds){ String [] toks = line.trim().split("\\t"); for (int i = 0; i < hds.length; i++ ){ - if (TandemRepeat.chrHd.equals(hds[i])) - rec.setChr(toks[i]); + //if (TandemRepeat.chrHd.equals(hds[i])) + // rec.setChr(toks[i]); if (TandemRepeat.chromHd.equals(hds[i])) rec.setChr(toks[i]); else if (TandemRepeat.idHd.equals(hds[i])) @@ -407,7 +406,7 @@ static public void 
writeToFile(SequenceOutputStream out, String[] headers, Array /** * Find the most compact form of japsa.seq (triplet). * This is a slow implementation. - * @param japsa.seq + * @param seq * @return */ public static String collapseForm(String seq){ diff --git a/src/main/java/japsa/bio/tr/TandemRepeatVariant.java b/src/main/java/japsa/bio/tr/TandemRepeatVariant.java index 0c0bdbc..7002e84 100755 --- a/src/main/java/japsa/bio/tr/TandemRepeatVariant.java +++ b/src/main/java/japsa/bio/tr/TandemRepeatVariant.java @@ -38,7 +38,8 @@ import japsa.seq.JapsaFeature; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; @@ -53,41 +54,43 @@ * */ public class TandemRepeatVariant implements Comparable{ + private static final Logger LOG = LoggerFactory.getLogger(TandemRepeatVariant.class); + public static String varHd = "var", //06 var2Hd = "var2", //06 confidenceHd = "confidence", //07 - + meanHd = "mean", //08 stdHd = "std", //09 eviHd = "evidence", //10 evi2Hd = "evidence2" ; //10 - + //public static String[] STANDARD_HEADERS = {TandemRepeat.chrHd, TandemRepeat.idHd, TandemRepeat.startHd, TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, varHd, confidenceHd, meanHd, stdHd,eviHd}; //public static String[] STANDARD_HEADERS2 = {TandemRepeat.chrHd, TandemRepeat.idHd, TandemRepeat.startHd, TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, varHd, meanHd, stdHd}; public static String[] STANDARD_HEADERS = {TandemRepeat.chromHd, TandemRepeat.idHd, TandemRepeat.startHd, - TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, - varHd, confidenceHd, stdHd, eviHd}; + TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, + varHd, confidenceHd, stdHd, eviHd}; public static String[] STANDARD_HEADERS2 = {TandemRepeat.chromHd, TandemRepeat.idHd, TandemRepeat.startHd, - 
TandemRepeat.endHd, TandemRepeat.periodHd, - TandemRepeat.unitNoHd, varHd, stdHd}; - + TandemRepeat.endHd, TandemRepeat.periodHd, + TandemRepeat.unitNoHd, varHd, stdHd}; + public static String[] SIMPLE_HEADERS = {TandemRepeat.chromHd, TandemRepeat.idHd, TandemRepeat.startHd, - TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, - varHd, eviHd}; - + TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, + varHd, eviHd}; + public static String[] SIMPLE_HEADERS2 = {TandemRepeat.chromHd, TandemRepeat.idHd, TandemRepeat.startHd, - TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, - varHd, eviHd, var2Hd, evi2Hd}; - + TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, + varHd, eviHd, var2Hd, evi2Hd}; + TandemRepeat tandemRepeat; /** * @return the tandemRepeat @@ -97,8 +100,9 @@ public TandemRepeat getTandemRepeat() { } /** - * @param tandemRepeat the tandemRepeat to set + * @param tr the tandemRepeat to set */ + public void setTandemRepeat (TandemRepeat tr) { this.tandemRepeat = tr; } @@ -107,8 +111,8 @@ public void setTandemRepeat (TandemRepeat tr) { double var2;//second allele double confidence, evidence;//The confident is in probability (i.e., 1-10^phred double evidence2; - - @Deprecated + + @Deprecated double mean = 0;//a bland distribution @@ -138,7 +142,7 @@ public void setConfidence(double confidence) { * @return the mean * @Deprecated: mean will be removed in the new future */ - @Deprecated + @Deprecated public double getMean() { return mean; } @@ -146,7 +150,7 @@ public double getMean() { * @param mean the mean to set * @Deprecated: mean will be removed */ - @Deprecated + @Deprecated public void setMean(double mean) { this.mean = mean; } @@ -162,7 +166,7 @@ public double getStd() { public void setStd(double std) { this.std = std; } - + public void swapVar(){ double tmp = var;var = var2;var2 = tmp; } @@ -173,7 +177,7 @@ public void swapVar(){ public double getVar() { return var; } - + public double getVar2() 
{ return var2; } @@ -206,7 +210,7 @@ public TandemRepeatVariant tandemRepeatClone(){ return trv; } - + /** * Read from a line and a list of fields * @param line @@ -310,7 +314,7 @@ public void addEvidence(double moreEvi){ public void addEvidence2(double moreEvi){ evidence2 += moreEvi; } - + public double getEvidence(){ return evidence; } @@ -322,7 +326,7 @@ public double getEvidence2(){ public static ArrayList readFromFile(String fileName) throws IOException{ //Start with the default header String[] headers = STANDARD_HEADERS2; - + ArrayList trfList = new ArrayList(); BufferedReader in = SequenceReader.openFile(fileName); String line = ""; @@ -338,8 +342,8 @@ else if (line.startsWith("#")) trfList.add(TandemRepeatVariant.read(line, headers)); }//while - - Logging.info("Read in " + trfList.size() + " TRs"); + + LOG.info("Read in " + trfList.size() + " TRs"); return trfList; } @@ -354,12 +358,12 @@ else if (line.startsWith("#")) for (int idx = 0; idx < annoList.size(); idx ++){ JapsaAnnotation anno = annoList.get(idx); for (int x = 0; x < anno.numFeatures(); x++){ - + JapsaFeature feature = anno.getFeature(x); - + if (!feature.getType().equals("trf")) continue; - + TandemRepeatVariant trv = new TandemRepeatVariant(); trv.tandemRepeat.setChr(anno.getAnnotationID()); trv.tandemRepeat.setStart(Integer.parseInt(feature.getID().substring(1))); @@ -401,14 +405,14 @@ public static void print(ArrayList trvList, SequenceOutputS out.print('\n'); } } - + public static void printHeader(SequenceOutputStream out, String [] headers) throws IOException{ out.print("#H:" + headers[0]); for (int i=1; i < headers.length; i++) out.print("\t"+headers[i]); out.print('\n'); } - + public String toString(){ @@ -423,7 +427,7 @@ public String toString(){ public int compareTo(TandemRepeatVariant o) { return tandemRepeat.compareTo(o.tandemRepeat); } - + /** * Write the variant to bed file. Currently use only the first heterozygous * variant is written out. 
diff --git a/src/main/java/japsa/seq/AbstractSequence.java b/src/main/java/japsa/seq/AbstractSequence.java index cd72c52..f5b88e4 100755 --- a/src/main/java/japsa/seq/AbstractSequence.java +++ b/src/main/java/japsa/seq/AbstractSequence.java @@ -64,10 +64,6 @@ // Some default ID and description of the sequence private String name = "", desc = ""; - /** - * - * @param dna - */ public AbstractSequence(Alphabet alphabet) { this.alphabet = alphabet; } @@ -76,12 +72,6 @@ public AbstractSequence(Alphabet alphabet, String name) { this.alphabet = alphabet; this.name = name; } - /** - * Construct a sequence with given dna, name and descriptions - * @param dna - * @param name - * @param desc - */ public AbstractSequence(Alphabet alphabet, String name, String desc) { this.alphabet = alphabet; @@ -142,6 +132,10 @@ public Alphabet alphabet() { */ public abstract int symbolAt(int loc); public abstract void setSymbol(int loc, int symbol); + + public abstract byte getBase(int loc); + public abstract byte setBase(int loc, byte base); + /** @@ -330,7 +324,8 @@ public void writeJSA(String fileName) throws IOException { * @throws IOException */ public void print(SequenceOutputStream out) throws IOException{ - writeJSA(out); + //writeJSA(out); + writeFasta(out); } diff --git a/src/main/java/japsa/seq/Alphabet.java b/src/main/java/japsa/seq/Alphabet.java index 2e53741..e301682 100755 --- a/src/main/java/japsa/seq/Alphabet.java +++ b/src/main/java/japsa/seq/Alphabet.java @@ -1027,7 +1027,7 @@ public boolean match(int a, int b){ -1, //171 -> 43 => '+' -1, //172 -> 44 => ',' X, //173 -> 45 => '-' - -1, //174 -> 46 => '.' + X, //174 -> 46 => '.' -1, //175 -> 47 => '/' -1, //176 -> 48 => '0' -1, //177 -> 49 => '1' @@ -1323,7 +1323,7 @@ public boolean match(int a, int b){ -1, //171 -> 43 => '+' -1, //172 -> 44 => ',' X, //173 -> 45 => '-' ei gap - -1, //174 -> 46 => '.' + X, //174 -> 46 => '.' 
-1, //175 -> 47 => '/' -1, //176 -> 48 => '0' -1, //177 -> 49 => '1' diff --git a/src/main/java/japsa/seq/FastaReader.java b/src/main/java/japsa/seq/FastaReader.java index d81cca1..70b0b51 100644 --- a/src/main/java/japsa/seq/FastaReader.java +++ b/src/main/java/japsa/seq/FastaReader.java @@ -36,7 +36,8 @@ package japsa.seq; -import japsa.seq.Alphabet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedInputStream; import java.io.FileInputStream; @@ -56,7 +57,7 @@ * */ public class FastaReader extends SequenceReader{ - static int INI_SEQ_SIZE = 8191;//1 << 20-1; + static int INI_SEQ_SIZE = 8191;//1 << 20-1; //private int seqNo = 1;//keep tract of sequence number //Temporary byte array for the sequence @@ -94,7 +95,7 @@ public boolean hasNext() throws IOException{ /** * Read the first sequence from an input stream * @param ins - * @param dna + * @param alphabet * @return * @throws IOException */ @@ -111,8 +112,7 @@ public static Sequence read (InputStream ins, Alphabet alphabet) throws IOExcept * Precondition:it is a fasta file. If some invalid character exists, an * IOException will be thrown. * - * @param in - * @param dna: the dna, if not specified, the default is DNA16 + * @param alphabet: the alphabet * @return * @throws IOException */ @@ -179,14 +179,7 @@ public Sequence nextSequence(Alphabet alphabet) throws IOException{ } } - /** - * This method is created as for reusable - * @param dna - * @param byteArray - * @param length - * @param name - * @return - */ + static private Sequence makeSequence (Alphabet alphabet, byte[] byteArray, int length, String name){ String[] toks = name.split("\\s",2); @@ -307,8 +300,6 @@ public static Sequence read (InputStream ins, Alphabet alphabet) throws IOExcept * Precondition:it is a fasta file. If some invalid character exists, an * IOException will be thrown. 
* - * @param in - * @param dna: the dna, if not specified, the default is DNA16 * @return * @throws IOException */ diff --git a/src/main/java/japsa/seq/FastqReader.java b/src/main/java/japsa/seq/FastqReader.java index 8eb6a0d..058f6d0 100644 --- a/src/main/java/japsa/seq/FastqReader.java +++ b/src/main/java/japsa/seq/FastqReader.java @@ -34,8 +34,9 @@ package japsa.seq; -import japsa.seq.Alphabet; -import japsa.util.Logging; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; @@ -49,7 +50,8 @@ * @author Minh Duc Cao (minhduc \dot cao \at gmail \dot com) * */ -public class FastqReader extends SequenceReader{ +public class FastqReader extends SequenceReader{ + private static final Logger LOG = LoggerFactory.getLogger(FastqReader.class); private byte [] seq = new byte[1024];//estimated max read length private byte [] qual = new byte[1024]; @@ -139,7 +141,7 @@ public FastqSequence nextSequence(Alphabet alphabet) throws IOException { if (seqIndex != qualIndex){ //throw new RuntimeException("Lengths of sequence and quality strings do not match at line " + lineNo + " : " + seqIndex + " vs " + qualIndex); - Logging.warn("Lengths of sequence and quality strings do not match at line " + lineNo + " : " + seqIndex + " vs " + qualIndex); + LOG.warn("Lengths of sequence and quality strings do not match at line " + lineNo + " : " + seqIndex + " vs " + qualIndex); } //Read the next byte from the stream (expecting a @ or eof diff --git a/src/main/java/japsa/seq/FastqSequence.java b/src/main/java/japsa/seq/FastqSequence.java index c0a0d10..39ce251 100644 --- a/src/main/java/japsa/seq/FastqSequence.java +++ b/src/main/java/japsa/seq/FastqSequence.java @@ -33,8 +33,6 @@ ****************************************************************************/ package japsa.seq; -import japsa.seq.Alphabet; - import java.io.IOException; import java.util.Arrays; diff --git a/src/main/java/japsa/seq/Genome.java 
b/src/main/java/japsa/seq/Genome.java new file mode 100755 index 0000000..48e32ff --- /dev/null +++ b/src/main/java/japsa/seq/Genome.java @@ -0,0 +1,166 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/************************** REVISION HISTORY **************************
+ * 16/12/2012 - Minh Duc Cao: Created
+ *
+ ****************************************************************************/
+
+package japsa.seq;
+
+import japsa.seq.Alphabet;
+import japsa.seq.FastaReader;
+import japsa.seq.Sequence;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+/**
+ * A genome is a set of sequences, possibly chromosomes, as well as other
+ * sequences such as mitochondria, contigs, unassembled sequences etc.
+ *
+ * @author minhduc
+ *
+ */
+public class Genome {
+	/** Total number of bases over all the sequences read so far. */
+	long length;
+
+	/** The sequences read, in reading order (list) and by identifier (map). */
+	ArrayList<Sequence> seqList = new ArrayList<Sequence>();
+	HashMap<String, Sequence> seqHash = new HashMap<String, Sequence>();
+	Alphabet.DNA alphabet = Alphabet.DNA();// the most permissive
+
+	public Genome() {
+	}
+
+	/**
+	 * Read every sequence of a fasta file into this genome. The name of each
+	 * sequence is cut at its first whitespace and serves as its identifier.
+	 *
+	 * @param fileName the fasta file to read
+	 * @throws IOException if reading the file fails
+	 */
+	public void read(String fileName) throws IOException {
+		FastaReader.Faster fread = new FastaReader.Faster(fileName);
+		Sequence seq;
+		while ((seq = fread.nextSequence(Alphabet.DNA16())) != null) {
+			String id = seq.getName().split("\\s")[0];
+			seq.setName(id);
+			seqHash.put(id, seq);
+			length += seq.length();
+			seqList.add(seq);
+		}
+	}
+
+	/** @return the total length of the sequences read so far */
+	public long getLength(){
+		return length;
+	}
+
+	/** @return the sequences of this genome, in the order they were read */
+	public ArrayList<Sequence> chrList(){
+		return seqList;
+	}
+
+	public static void main(String[] args) throws IOException {
+		Genome genome = new Genome();
+		genome.read(args[0]);
+		genome.getProteome(args[1], args[2]);
+	}
+
+	/**
+	 * Check the exon structure of canonical genes. For every line of the
+	 * canonical file whose sequence name (column 0) is in this genome, scan
+	 * forward in the annotation file for the record whose columns 1 and 0
+	 * match columns 0 and 4 of that line, print its exon lengths and require
+	 * their sum to be a multiple of 3. Both files are tab-separated and the
+	 * annotation file is read once, so both must list genes in the same order.
+	 *
+	 * @param annotationFile records with the exon count in column 7 and the
+	 *        comma-separated exon starts and ends in columns 8 and 9
+	 * @param canonicalFile the canonical genes to check
+	 * @throws IOException if reading either file fails
+	 * @throws RuntimeException if a record is missing or inconsistent
+	 */
+	public void getProteome(String annotationFile, String canonicalFile)
+			throws IOException {
+		try (BufferedReader canBf = new BufferedReader(new FileReader(canonicalFile));
+				BufferedReader annBf = new BufferedReader(new FileReader(annotationFile))) {
+			String canLine;
+			String annoLine = null;
+			int annoLineNo = 0;
+			Sequence seq = null;
+			while ((canLine = canBf.readLine()) != null) {
+				String[] canToks = canLine.split("\t");
+				if (seq == null || (!seq.getName().equals(canToks[0]))) {
+					seq = seqHash.get(canToks[0]);
+				}
+				if (seq == null)
+					continue;
+				// look for the annotation of this gene
+				while ((annoLine = annBf.readLine()) != null) {
+					annoLineNo++;
+					String[] annToks = annoLine.split("\t");
+					if (!annToks[1].equals(canToks[0])
+							|| !annToks[0].equals(canToks[4]))
+						continue;
+					int exonNumber = Integer.parseInt(annToks[7]);
+					String[] eStart = annToks[8].split(",");
+					String[] eEnd = annToks[9].split(",");
+					if (eStart.length != exonNumber) {
+						throw new RuntimeException(annoLineNo + " : E = "
+								+ exonNumber + " " + eStart.length);
+					}
+					System.out.println("\n " + exonNumber + ":");
+					int myLength = 0;
+					for (int i = 0; i < exonNumber; i++) {
+						int exonLength = Integer.parseInt(eEnd[i])
+								- Integer.parseInt(eStart[i]) + 1;
+						myLength += exonLength;
+						System.out.print(" " + exonLength);
+					}
+					if (myLength % 3 != 0) {
+						// report the offending exon total, not the genome length
+						throw new RuntimeException(annoLineNo
+								+ " : Not division by 3 " + myLength);
+					}
+					break;
+				}
+				if (annoLine == null) {
+					throw new RuntimeException(annoLineNo + " : Null not expected " + canToks[4]);
+				}
+			}
+		}
+	}
+
+}
diff --git a/src/main/java/japsa/seq/Sequence.java b/src/main/java/japsa/seq/Sequence.java
index 78f2b17..88fdb78 100755
--- a/src/main/java/japsa/seq/Sequence.java
+++ b/src/main/java/japsa/seq/Sequence.java
@@ -59,7 +59,7 @@ /** * Create an empty sequence with a specified length * - * @param dna + * @param alphabet * */ public Sequence(Alphabet alphabet, int length) { @@ -74,7 +74,7 @@ public Sequence(Alphabet alphabet, int length, String name) { /** * Construct a sequence
from a sequence of characters. - * @param dna + * @param alphabet * @param charSeq * @param name */ @@ -101,7 +101,7 @@ public Sequence(Alphabet alphabet, ByteArray bArray, String name) { /** * Copy the byte array up to the length * - * @param dna + * @param alphabet * @param byteArray * @param length */ @@ -132,8 +132,8 @@ public Sequence(Alphabet alphabet, byte[] byteArray, String name) { /** * Construct a sequence with an dna from the string represent the * sequence and the name - * @param dna - * @param seqStr + * @param alphabet + * @param seqString * @param name */ @@ -195,8 +195,9 @@ public Sequence concatenate(Sequence anotherSeq) { * @param loc * @param base */ - public void setBase(int loc, byte base) { + public byte setBase(int loc, byte base) { byteSeq[loc] = base; + return base; } public void setSymbol(int loc, int symbol){ @@ -214,7 +215,7 @@ public Sequence clone(){ /** * Create a random sequence with some length and some frequency distribution - * @param dna + * @param alphabet * @param length * @param freqs * @param rand a random generator @@ -254,7 +255,7 @@ public static Sequence random(Alphabet alphabet, int length, double [] freqs, Ra } /** * Create a random sequence - * @param dna + * @param alphabet * @param length * @param freqs * @return diff --git a/src/main/java/japsa/seq/SequenceBuilder.java b/src/main/java/japsa/seq/SequenceBuilder.java index e46a02c..d0db1aa 100644 --- a/src/main/java/japsa/seq/SequenceBuilder.java +++ b/src/main/java/japsa/seq/SequenceBuilder.java @@ -81,6 +81,11 @@ public SequenceBuilder(Alphabet alphabet, ByteArray bArray, String name) { byteSeq = bArray.toArray(); length = byteSeq.length; } + + public SequenceBuilder(Sequence seq, int length){ + this (seq.alphabet(), length); + append(seq); + } /** @@ -230,11 +235,12 @@ public byte getBase(int loc){ * @param loc * @param base */ - public void setBase(int loc, byte base) { + public byte setBase(int loc, byte base) { if (loc < 0 || loc >= length()){ throw new 
RuntimeException("Wrong location (max " + length + "):" + loc); } byteSeq[loc] = base; + return base; } public void setSymbol(int loc, int symbol){ diff --git a/src/main/java/japsa/seq/XAFReader.java b/src/main/java/japsa/seq/XAFReader.java index 5c6df6d..d038846 100644 --- a/src/main/java/japsa/seq/XAFReader.java +++ b/src/main/java/japsa/seq/XAFReader.java @@ -34,7 +34,8 @@ package japsa.seq; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.Closeable; @@ -53,6 +54,8 @@ */ public class XAFReader implements Closeable { + private static final Logger LOG = LoggerFactory.getLogger(XAFReader.class); + String sep = "\t"; public static final String XAF_HEADER = "#XAF"; public static final String ROW_HEADER = "#H:"; @@ -145,13 +148,13 @@ public String getField(String fieldName) { FieldHeader header = headerPool.get(fieldName); if (header == null || header.index < 0) { //throw new RuntimeException("Field " + fieldName + " not found"); - //Logging.warn("Field " + fieldName + " not found"); + //LOG.warn("Field " + fieldName + " not found"); return null; } if (header.index >= fields.length) { //throw new RuntimeException("Only " + fields.length + " fields at line " + lineNo); - //Logging.warn("Only " + fields.length + " fields at line " + lineNo); + //LOG.warn("Only " + fields.length + " fields at line " + lineNo); return null; } return fields[header.index]; @@ -164,7 +167,7 @@ public String getField(String fieldName) { public String getField(int fieldNo) { if (fieldNo >= fields.length) { //throw new RuntimeException("Only " + fields.length + " fields at line " + lineNo); - Logging.warn("Only " + fields.length + " fields at line " + lineNo); + LOG.warn("Only " + fields.length + " fields at line " + lineNo); return null; } return fields[fieldNo]; @@ -226,7 +229,7 @@ else if (currentRecord.startsWith(ROW_HEADER)) { for (int i = 0; i < toks.length; i++) { FieldHeader header = 
headerPool.get(toks[i]); if (header == null) { - Logging.warn("Header " + toks[i] + " not defined"); + LOG.warn("Header " + toks[i] + " not defined"); header = new FieldHeader(); header.headerStr = toks[i]; headerPool.put(toks[i], header);
diff --git a/src/main/java/japsa/seq/nanopore/BreakException.java b/src/main/java/japsa/seq/nanopore/BreakException.java
new file mode 100644
index 0000000..40abf0b
--- /dev/null
+++ b/src/main/java/japsa/seq/nanopore/BreakException.java
@@ -0,0 +1,12 @@
+package japsa.seq.nanopore;
+
+/**
+ * Unchecked exception that carries a message.
+ */
+public class BreakException extends RuntimeException{
+	private static final long serialVersionUID = -5357307685637432497L;
+
+	public BreakException(String msg){
+		super(msg);
+	}
+}
diff --git a/src/main/java/japsa/seq/nanopore/CustomRenderer.java b/src/main/java/japsa/seq/nanopore/CustomRenderer.java
new file mode 100644
index 0000000..beabe66
--- /dev/null
+++ b/src/main/java/japsa/seq/nanopore/CustomRenderer.java
@@ -0,0 +1,38 @@
+package japsa.seq.nanopore;
+
+import java.awt.Paint;
+import org.jfree.chart.renderer.category.BarRenderer;
+
+/**
+ * A bar renderer that colors each item of a series by its category, so that
+ * the bars of a single series get different colors.
+ */
+class CustomRenderer extends BarRenderer {
+
+	/** The palette; category i is painted with color i modulo its length. */
+	private final Paint[] colors;
+
+	/**
+	 * Creates a new renderer.
+	 *
+	 * @param colors the palette to cycle through; kept by reference
+	 */
+	public CustomRenderer(final Paint[] colors) {
+		this.colors = colors;
+	}
+
+	/**
+	 * Returns the paint for an item, chosen by category only. Overrides the
+	 * inherited behaviour.
+	 *
+	 * @param row the series (ignored).
+	 * @param column the category.
+	 *
+	 * @return The item color.
+	 */
+	@Override
+	public Paint getItemPaint(final int row, final int column) {
+		return this.colors[column % this.colors.length];
+	}
+}
+
diff --git a/src/main/java/japsa/seq/nanopore/Demultiplexer.java b/src/main/java/japsa/seq/nanopore/Demultiplexer.java
new file mode 100644
index 0000000..fa7316b
--- /dev/null
+++ b/src/main/java/japsa/seq/nanopore/Demultiplexer.java
@@ -0,0 +1,423 @@
+/*****************************************************************************
+ * Copyright (c) Son Hoang Nguyen, IMB - UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 01/03/2017 - Son Hoang Nguyen: Created + * + ****************************************************************************/ +package japsa.seq.nanopore; + +import java.io.IOException; +import java.text.DecimalFormat; +import java.util.ArrayList; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; + +public class Demultiplexer { + //private static final Logger LOG = LoggerFactory.getLogger(Demultiplexer.class); + + int SCAN_WINDOW, + DIST_THRES, + SCORE_THRES; + + ArrayList barCodes; + ArrayList barCodeComps; + int nSamples; + private int barcodeLen; + + int[] readCount; + public static boolean toPrint=false; + SequenceOutputStream[] streamToFile; + + public Demultiplexer(String barcodeFile) throws IOException{ + barCodes = SequenceReader.readAll(barcodeFile, Alphabet.DNA()); + nSamples = barCodes.size(); + barcodeLen = barCodes.get(0).length(); + + barCodeComps = new ArrayList (barCodes.size()); + + if(toPrint){ + streamToFile = new SequenceOutputStream[nSamples+1]; // plus unknown + for(int i=0;i bestScore){ + //LOG.info("Better score=" + myScore); + distance = myScore-bestScore; + bestScore = myScore; + bestIndex = i; + } 
else if((bestScore-myScore) < distance){ + distance=bestScore-myScore; + } + + } + + + + String retval=""; + DecimalFormat twoDForm = new DecimalFormat("#.##"); + if(bestScore < SCORE_THRES || distance < DIST_THRES){ + //LOG.info("Confounding sequence " + seq.getName() + " with low grouping score " + bestScore); + retval = "unknown:"+Double.valueOf(twoDForm.format(bestScore))+":"+Double.valueOf(twoDForm.format(distance))+"|"; + + if(toPrint) + seq.print(streamToFile[nSamples]); + } + else { + //LOG.info("Sequence " + seq.getName() + " might belongs to sample " + barCodes.get(bestIndex).getName() + " with score=" + bestScore); + retval = barCodes.get(bestIndex).getName()+":"+Double.valueOf(twoDForm.format(bestScore))+":"+Double.valueOf(twoDForm.format(distance))+"|"; + + if(toPrint) + seq.print(streamToFile[bestIndex]); + + readCount[bestIndex]++; + } + +// retval = barCodes.get(bestIndex).getName()+":"+Double.valueOf(twoDForm.format(bestScore))+","+Double.valueOf(twoDForm.format(otherEndOfBest))+"|" +// +barCodes.get(secondBestIndex).getName()+":"+Double.valueOf(twoDForm.format(secondBestScore))+","+Double.valueOf(twoDForm.format(otherEndOfSecondBest)) +// +"|"; +// readCount[bestIndex]++; + + seq.setName(retval + seq.getName()); + + } + + public final class BarcodeAlignment { + + /** + * Traceback direction stop + */ + public static final byte STOP = 0; + /** + * Traceback direction left + */ + public static final byte LEFT = 1; + /** + * Traceback direction diagonal + */ + public static final byte DIAGONAL = 2; + /** + * Traceback direction up + */ + public static final byte UP = 3; + + public BarcodeAlignment(Sequence s1, Sequence s2) { + super(); + this.barcodeSequence = s1; + this.readSequence = s2; + + m = s1.length() + 1; + n = s2.length() + 1; + + //Initialize the arrays + pointers = new byte[m * n]; + sizesOfVerticalGaps = new short[m * n]; + sizesOfHorizontalGaps = new short[m * n]; + } + + Sequence barcodeSequence; + Sequence readSequence; + int m,n; + 
byte[] pointers;
+		short[] sizesOfVerticalGaps;
+		short[] sizesOfHorizontalGaps;
+		//BLOSSOM62
+		//double [][] scores =
+		//	{{4.0,0.0,0.0,0.0},
+		//	{0.0,9.0,-3.0,-1.0},
+		//	{0.0,-3.0,6.0,-2.0},
+		//	{0.0,-1.0,-2.0,5.0}
+		//	};
+
+		//poreFUME's scores
+		double openPenalty = 4.7;
+		double extendPenalty = 1.6;
+
+		double [][] scores = {
+				{ 2.7, -4.5, -4.5, -4.5},
+				{ -4.5, 2.7, -4.5, -4.5},
+				{ -4.5, -4.5, 2.7, -4.5},
+				{ -4.5, -4.5, -4.5, 2.7}
+		};
+
+
+		/**
+		 * Best alignment score found by the most recent alignment run
+		 */
+		private double cellScore;
+		// NOTE(review): the setters keep m, n and the matrices as sized by the
+		// constructor; presumably callers only pass same-length sequences - confirm
+		public void setBarcodeSequence(Sequence seq){
+			barcodeSequence = seq;
+		}
+
+		public void setReadSequence(Sequence seq){
+			readSequence = seq;
+		}
+
+		/** Resets the traceback and gap-length matrices, then runs the alignment.
+		 * @return the best alignment score, as computed by construct() */
+		public double align() {
+			// Initializes the boundaries of the traceback matrix to STOP.
+			for (int i = 0, k = 0; i < m; i++, k += n) {
+				pointers[k] = STOP;
+			}
+			for (int j = 1; j < n; j++) {
+				pointers[j] = STOP;
+			}
+			// every cell starts with gap lengths of 1 in both directions
+			for (int i = 0, k = 0; i < m; i++, k += n) {
+				for (int j = 0; j < n; j++) {
+					sizesOfVerticalGaps[k + j] = sizesOfHorizontalGaps[k + j] = 1;
+				}
+			}
+			return construct();
+		}
+
+		/**
+		 * Fills the score and traceback matrices for barcodeSequence (rows)
+		 * against readSequence (columns), using the scores matrix and the
+		 * openPenalty / extendPenalty gap costs held in the fields.
+		 * Alignment scores are floored at 0, so the result is the score of
+		 * the best-scoring pair of sub-sequences.
+		 *
+		 * Traceback directions are written to pointers and gap lengths to
+		 * sizesOfVerticalGaps / sizesOfHorizontalGaps, each indexed as
+		 * row * n + column.
+		 *
+		 * Takes no parameters: all inputs are read from the fields.
+		 *
+		 * @return the highest cell score found in the matrix
+		 */
+		private double construct() {
+			// Scores are filled row by row; only one row of v and g is kept and
+			// overwritten in place, so the statement order below matters.
+
+			double f; // score of alignment x1...xi to y1...yi if xi aligns to yi
+			double[] g = new double[n]; // score if xi aligns to a gap after yi
+			double h; // score if yi aligns to a gap after xi
+			double[] v = new double[n]; // best score of alignment x1...xi to
+			// y1...yi
+			double vDiagonal;
+
+			g[0] = Float.NEGATIVE_INFINITY;
+			h = Float.NEGATIVE_INFINITY;
+			v[0] = 0;
+
+			for (int j = 1; j < n; j++) {
+				g[j] = Float.NEGATIVE_INFINITY;
+				v[j] = 0;
+			}
+
+			double similarityScore, g1, g2, h1, h2;
+
+			cellScore = Float.NEGATIVE_INFINITY;
+			// cellScore tracks the best cell score seen so far
+
+			for (int i = 1, k = n; i < m; i++, k += n) {
+				h = Float.NEGATIVE_INFINITY;
+				vDiagonal = v[0];
+				for (int j = 1, l = k + 1; j < n; j++, l++) {
+					similarityScore = scores[barcodeSequence.getBase(i-1)][readSequence.getBase(j-1)];
+
+					// Match/mismatch: extend the diagonal neighbour
+					f = vDiagonal + similarityScore;
+
+					g1 = g[j] - extendPenalty;
+					g2 = v[j] - openPenalty;
+					if (g1 > g2) {
+						g[j] = g1;
+						sizesOfVerticalGaps[l] = (short) (sizesOfVerticalGaps[l - n] + 1);
+					} else {
+						g[j] = g2;
+					}
+
+					h1 = h - extendPenalty;
+					h2 = v[j - 1] - openPenalty;
+					if (h1 > h2) {
+						h = h1;
+						sizesOfHorizontalGaps[l] = (short) (sizesOfHorizontalGaps[l - 1] + 1);
+					} else {
+						h = h2;
+					}
+
+					vDiagonal = v[j];
+					v[j] = maximum(f, g[j], h, 0);
+
+					// Determine the traceback direction
+					if (v[j] == 0) {
+						pointers[l] = STOP;
+					} else if (v[j] == f) {
+						pointers[l] = DIAGONAL;
+					} else if (v[j] == g[j]) {
+						pointers[l] = UP;
+					} else {
+						pointers[l] = LEFT;
+					}
+
+					// Remember the best score over all cells
+					if (v[j] > cellScore) {
+						cellScore = v[j];
+						// only the score is recorded, not the cell position
+					}
+				}
+			}
+			return cellScore;
+		}
+
+
+		/**
+		 * Returns the maximum of 4 double values.
+		 *
+		 * @param a
+		 *            value #1
+		 * @param b
+		 *            value #2
+		 * @param c
+		 *            value #3
+		 * @param d
+		 *            value #4
+		 * @return The maximum of a, b, c and d.
+ */ + private double maximum(double a, double b, double c, double d) { + if (a > b) { + if (a > c) { + return a > d ? a : d; + } else { + return c > d ? c : d; + } + } else if (b > c) { + return b > d ? b : d; + } else { + return c > d ? c : d; + } + } + + } +} + diff --git a/src/main/java/japsa/seq/nanopore/Fast5DetailReader.java b/src/main/java/japsa/seq/nanopore/Fast5DetailReader.java new file mode 100644 index 0000000..3288101 --- /dev/null +++ b/src/main/java/japsa/seq/nanopore/Fast5DetailReader.java @@ -0,0 +1,448 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 21/07/2014 - Minh Duc Cao: Created + * + ****************************************************************************/ + +package japsa.seq.nanopore; + +import java.util.List; + + +import ncsa.hdf.object.Group; +import ncsa.hdf.object.HObject; +import ncsa.hdf.object.h5.H5CompoundDS; +import ncsa.hdf.object.h5.H5ScalarDS; +import japsa.util.JapsaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Read detail nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. 
+ * Re-implemented from the previous to aim for faster, which static key + * + * @author minhduc + */ +public class Fast5DetailReader extends Fast5NPReader{ + private static final Logger LOG = LoggerFactory.getLogger(Fast5DetailReader.class); + + static String RAW_PREFIX = "/Raw/Reads"; + static String CHANNEL_ID = "/UniqueGlobalKey/channel_id"; + static String TRACKING_ID = "/UniqueGlobalKey/tracking_id"; + + private double samplingRate = 0;//Default + private int channelNumber = 0; + private long startTime = 0; + + public Fast5DetailReader (String fileName) throws JapsaException, OutOfMemoryError, Exception{ + super(fileName); + readMetaData(); + } + + + public double getSamplingRate() { + return samplingRate; + } + + + public int getChannelNumber() { + return channelNumber; + } + + public long getStartTime() { + return startTime; + } + + + /** + * Extract metadata about the read, including: + * - sampling rate + * - channel number + * @throws Exception + */ + + private void readMetaData() throws Exception{ + HObject data = f5File.get(CHANNEL_ID); + @SuppressWarnings("unchecked") + List aL = (List) data.getMetadata(); + for (ncsa.hdf.object.Attribute att:aL){ + if (att.getName().equals("sampling_rate")){ + samplingRate = ((double[]) att.getValue())[0]; + }else if (att.getName().equals("channel_number")){ + channelNumber = Integer.parseInt(((String[]) att.getValue())[0]); + } + } + } + + /** + * Extract raw event from the read. 
Start time is also extracted in the process + * @return + * @throws Exception + */ + + public RawSignal getRawEvent() throws Exception{ + HObject data = f5File.get(RAW_PREFIX); + //LOG.info("Read 1 " + (data == null)); + if (data !=null){ + Group group = (Group) data; + group = (Group) group.getMemberList().get(0); + @SuppressWarnings("unchecked") + List aL = (List) group.getMetadata(); + for (ncsa.hdf.object.Attribute att:aL){ + if (att.getName().equals("start_time")){ + startTime = ((long[]) att.getValue())[0]; + } + } + H5ScalarDS myDat =((H5ScalarDS) group.getMemberList().get(0)); + //LOG.info("Read 2 " + (myDat == null)); + if (myDat != null){ + short [] rawEvent = (short[])myDat.getData(); + //LOG.info("Read 3 " + (rawEvent == null)); + RawSignal rawSignal = new RawSignal(rawEvent); + return rawSignal; + } + } + return null; + } + + public void readData() throws OutOfMemoryError, Exception{ + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readData(root); + } + + + + /** + * Recursively print a group and its members. Fastq data are read.If all + * flag is turned on, this method will also reads all events and model data. 
+ * @throws OutOfMemoryError + * + * @throws Exception + */ + private void readData(Group g) throws OutOfMemoryError, Exception{ + + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + //String f = member.getFullName(); + if (member instanceof Group) { + readData((Group) member); + }else if (member instanceof H5CompoundDS){ + String fullName = member.getFullName(); + + //LOG.info(member.getClass() +" "); + @SuppressWarnings("unchecked") + List dat = (List) (((H5CompoundDS) member).getData()); + if (dat != null){ + /********************************************************/ + if (fullName.startsWith("/Analyses/EventDetection_000/Reads/") && fullName.endsWith("Events") ){ + LOG.info("Read " + fullName); + detectedEvents = new DetectionEvents(); + detectedEvents.start = (long[]) dat.get(0); + detectedEvents.length = (long[]) dat.get(1); + detectedEvents.mean = (double[]) dat.get(2); + detectedEvents.stdv = (double[]) dat.get(3); + }else if (fullName.endsWith("BaseCalled_template/Events")){ + LOG.info("Read " + fullName); + bcTempEvents = new BaseCallEvents(); + bcTempEvents.mean = (double[]) dat.get(0); + bcTempEvents.start = (double[]) dat.get(1); + bcTempEvents.stdv = (double[]) dat.get(2); + bcTempEvents.length = (double[]) dat.get(3); + bcTempEvents.modelState = (String[]) dat.get(4); + bcTempEvents.move = (long[]) dat.get(5); + bcTempEvents.weight = (float[]) dat.get(6); + bcTempEvents.pModelState = (float[]) dat.get(7); + bcTempEvents.mpState = (String[]) dat.get(8); + bcTempEvents.pMpState = (float[]) dat.get(9); + + bcTempEvents.pA = (float[]) dat.get(10); + bcTempEvents.pC = (float[]) dat.get(11); + bcTempEvents.pG = (float[]) dat.get(12); + bcTempEvents.pT = (float[]) dat.get(13); + }else if (fullName.endsWith("BaseCalled_complement/Events")){ + LOG.info("Read " + fullName); + bcCompEvents = new BaseCallEvents(); + bcCompEvents.mean = (double[]) dat.get(0); + bcCompEvents.start = (double[]) dat.get(1); + 
bcCompEvents.stdv = (double[]) dat.get(2); + bcCompEvents.length = (double[]) dat.get(3); + bcCompEvents.modelState = (String[]) dat.get(4); + bcCompEvents.move = (long[]) dat.get(5); + bcCompEvents.weight = (float[]) dat.get(6); + bcCompEvents.pModelState = (float[]) dat.get(7); + bcCompEvents.mpState = (String[]) dat.get(8); + bcCompEvents.pMpState = (float[]) dat.get(9); + + bcCompEvents.pA = (float[]) dat.get(10); + bcCompEvents.pC = (float[]) dat.get(11); + bcCompEvents.pG = (float[]) dat.get(12); + bcCompEvents.pT = (float[]) dat.get(13); + }else if (fullName.endsWith("BaseCalled_complement/Model")){ + LOG.info("Read " + fullName); + bcCompModel = new BaseCallModel(); + bcCompModel.kmer = (String[]) dat.get(0); + //bcCompModel.variant = (double[]) dat.get(1); + bcCompModel.levelMean = (double[]) dat.get(2); + bcCompModel.levelStdv = (double[]) dat.get(3); + bcCompModel.sdMean = (double[]) dat.get(4); + bcCompModel.sdStdv = (double[]) dat.get(5); + //bcCompModel.weigth = (double[]) dat.get(6); + }else if (fullName.endsWith("BaseCalled_template/Model")){ + LOG.info("Read " + fullName); + bcTempModel = new BaseCallModel(); + bcTempModel.kmer = (String[]) dat.get(0); + //bcTempModel.variant = (double[]) dat.get(1); + bcTempModel.levelMean = (double[]) dat.get(2); + bcTempModel.levelStdv = (double[]) dat.get(3); + bcTempModel.sdMean = (double[]) dat.get(4); + bcTempModel.sdStdv = (double[]) dat.get(5); + //bcTempModel.weigth = (double[]) dat.get(6); + }else if (fullName.endsWith("HairpinAlign/Alignment")){ + LOG.info("Read " + fullName); + bcAlignmentHairpin = new BaseCallAlignmentHairpin(); + bcAlignmentHairpin.template = (long[]) dat.get(0); + bcAlignmentHairpin.complement = (long[]) dat.get(1); + }else + if (fullName.endsWith("BaseCalled_2D/Alignment")){ + LOG.info("Read " + fullName); + bcAlignment2D = new BaseCallAlignment2D(); + bcAlignment2D.template = (long[]) dat.get(0); + bcAlignment2D.complement = (long[]) dat.get(1); + bcAlignment2D.kmer = (String[]) 
dat.get(2); + } + /********************************************************/ + } + } + } + } + + + /** + * Get base call events for complement strand + * @return the bcCompEvents + */ + public BaseCallEvents getBcCompEvents() { + return bcCompEvents; + } + + + /** + * Get base call events for template strand + * @return the bcTempEvents + */ + public BaseCallEvents getBcTempEvents() { + return bcTempEvents; + } + + /** + * Get the events from the pore + * @return the events + */ + public DetectionEvents getEvents() { + return detectedEvents; + } + + + + /** + * Class represent RawSignal, which is an array of short + * @author minhduc + * + */ + public static class RawSignal{ + short [] signal; + + private RawSignal(short [] data){ + signal = data; + } + public short [] getSignal(){ + return signal; + } + } + + /******************************************************************************************************** +H5CompoundDS : /Analyses/EventDetection_000/Reads/Read_12/Events=class java.util.Vector + + + H5ScalarDS : /Analyses/Calibration_Strand_000/Log=class [Ljava.lang.String; + H5ScalarDS : /Analyses/EventDetection_000/Log=class [Ljava.lang.String; + H5CompoundDS : /Analyses/EventDetection_000/Reads/Read_5543/Events=class java.util.Vector + H5ScalarDS : /Raw/Reads/Read_5543/Signal=class [S + + /*******************************************************/ + BaseCallAlignment2D bcAlignment2D = null; + BaseCallAlignmentHairpin bcAlignmentHairpin = null; + BaseCallModel bcCompModel = null, bcTempModel = null; + + DetectionEvents detectedEvents; + BaseCallEvents bcCompEvents = null, bcTempEvents = null; + RawSignal rawSignal = null; + + double seqTime = 0; + + + + + /** + * Get 2D alignment + * @return the bcAlignment2D + */ + public BaseCallAlignment2D getBcAlignment2D() { + return bcAlignment2D; + } + + /** + * Get hairpin alignment + * @return the bcAlignmentHairpin + */ + public BaseCallAlignmentHairpin getBcAlignmentHairpin() { + return bcAlignmentHairpin; + } + 
+ /** + * Get the model for base call of the complement + * @return the bcCompModel + */ + public BaseCallModel getBcCompModel() { + return bcCompModel; + } + + /** + * Get the model for base call of the template + * @return the bcTempModel + */ + public BaseCallModel getBcTempModel() { + return bcTempModel; + } + //String expStart = ""; + + public static class BaseCallModel{ + String [] kmer; + double[] variant; + double[] levelMean, levelStdv, sdMean, sdStdv;//, weigth; + + } + + public static class BaseCallEvents{ + int dim; + double [] mean, start, stdv, length; + float [] pA, pC, pG, pT; + long []move;//, rawIndex; + float [] pModelState; + float [] pMpState; + float [] weight; + //long [] modelLevel; + String [] modelState, mpState; + + public long [] getMove(){ + return move; + } + + public double [] length(){ + return length; + } + + public double [] mean(){ + return mean; + } + + public double [] stdv(){ + return stdv; + } + + public float [] weight(){ + return weight; + } + + public String [] modelState(){ + return modelState; + } + } + + public static class BaseCallAlignment2D{ + int dim; + long [] template, complement; + String [] kmer; + + public String[] getKmer(){ + return kmer; + } + public long[] getComplementKmer(){ + return complement; + } + public long[] getTemplateKmer(){ + return template; + } + } + + public static class BaseCallAlignmentHairpin{ + int dim; + long [] template, complement; + } + + public static class DetectionEvents{ + int dim; + double [] stdv; + double [] mean; + long [] start; + long [] length; + + public double [] getMean(){ + return mean; + } + + /** + * @return the stdv + */ + public double[] getStdv() { + return stdv; + } + + /** + * @return the start + */ + public long[] getStart() { + return start; + } + + /** + * @return the length + */ + public long[] getLength() { + return length; + } + + } + +} diff --git a/src/main/java/japsa/seq/nanopore/Fast5NPReader.java b/src/main/java/japsa/seq/nanopore/Fast5NPReader.java new 
file mode 100644 index 0000000..a5529a0 --- /dev/null +++ b/src/main/java/japsa/seq/nanopore/Fast5NPReader.java @@ -0,0 +1,293 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 23/09/20116 - Minh Duc Cao: Resigned of the reader class + * + ****************************************************************************/ + +package japsa.seq.nanopore; + +import ncsa.hdf.object.FileFormat; +import ncsa.hdf.object.Group; +import ncsa.hdf.object.HObject; +import ncsa.hdf.object.h5.H5ScalarDS; +import japsa.seq.Alphabet.DNA; + +import java.util.ArrayList; +import java.util.List; + +import japsa.seq.Alphabet; +import japsa.seq.FastqSequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.JapsaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Read nanopore data (read sequence, events, alignment, models etc) from a raw + * (fast5) format. + * + * + * @author minhduc + */ +public class Fast5NPReader{ + private static final Logger LOG = LoggerFactory.getLogger(Fast5NPReader.class); + protected FileFormat f5File; + // FastqSequence seqTemplate = null, seqComplement = null, seq2D = null; + + ArrayList seqList = null; + + /** + * Open a fast5 file before reading anything from it. + * + * The file should be closed before gabbage collected. + * + * @param fileName + * @throws OutOfMemoryError + * @throws Exception + */ + public Fast5NPReader (String fileName) throws JapsaException, OutOfMemoryError, Exception{ + FileFormat fileFormat = FileFormat.getFileFormat(FileFormat.FILE_TYPE_HDF5); + + if (fileFormat == null){ + throw new JapsaException("Cannot read HDF5 file, possily because JHI5 is not installed or configured. 
Please refer to npReader installation guide or contact the deverlopers."); + } + + //LOG.info("Open " + fileName); + f5File = fileFormat.createInstance(fileName, FileFormat.READ); + if (f5File == null) + throw new RuntimeException("Unable to open file " + fileName); + + f5File.open(); + } + + + public void close() throws Exception{ + f5File.close(); + } + + public void readFastq() throws OutOfMemoryError, Exception{ + if (seqList !=null) return; + seqList = new ArrayList(); + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readFastq(root); + } + + public ArrayList getFastqList(){ + return seqList; + } + + public void readAllFastq(SequenceOutputStream sos) throws OutOfMemoryError, Exception{ + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readAllFastq(root, sos); + } + + long timeStamp = 0; + public void readTime() throws Exception { + try { + Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); + readTime(root); + }catch (Exception e){ + LOG.error(e.getMessage()); + } + } + + private long getValue(Object v){ + //System.out.println(v.getClass()); + if (v.getClass().toString().startsWith("class [J")) { + // System.out.println("LLL = " + ((long[]) v).length); + return ((long[]) v)[0]; + } + + if (v.getClass().toString().startsWith("class [I")) { + //System.out.println("LLL = " + ((int[]) v).length); + return 0L + ((int[]) v)[0]; + } + + return 0; + } + + private void readTime(Group g) throws OutOfMemoryError, Exception{ + if (timeStamp > 0) return; + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + if (member instanceof Group) { + String name = member.getName(); + String fullName = member.getFullName(); + if (!fullName.startsWith("/Raw")) + continue; + + if (name.startsWith("Read_")) { + //System.out.println("Found " + member.getFullName()); + List 
objs = member.getMetadata(); + for (Object obj : objs) { + ncsa.hdf.object.Attribute attribute = (ncsa.hdf.object.Attribute) obj; + + if (attribute.getName().startsWith("duration")) + timeStamp += getValue(attribute.getValue()); + + if (attribute.getName().startsWith("start_time")) + timeStamp += getValue(attribute.getValue()); + }//for + }else + readTime((Group) member); + + } + } + } + + + + ///** + // * @return the seqTemplate + // */ + //public FastqSequence getSeqTemplate() { + // return seqTemplate; + //} + + + ///** + // * @return the seqComplement + // */ + //public FastqSequence getSeqComplement() { + // return seqComplement; + //} + + ///** + // * @return the seq2D + // */ + //public FastqSequence getSeq2D() { + // return seq2D; + //} + + private void readAllFastq(Group g, SequenceOutputStream out) throws OutOfMemoryError, Exception{ + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + if (member instanceof Group) { + readAllFastq((Group) member, out); + }else if (member instanceof H5ScalarDS){ + String fullName = member.getFullName(); + if (fullName.endsWith("Fastq")){ + Object data = ((H5ScalarDS) member).getData(); + if (data != null){ + //LOG.info(fullName); + //out.print(((String[]) data)[0]); + //out.println(); + //LOG.info("Read " + fullName); + + String [] toks = ((String[]) data)[0].split("\n",2); + if (fullName.contains("BaseCalled_2D")){ + out.print(toks[0] + "_twodimentional path=" + fullName); + }else if (fullName.contains("BaseCalled_complement")){ + out.print(toks[0] + "_complement path=" + fullName); + }else if (fullName.contains("BaseCalled_template")){ + out.print(toks[0] + "_template path=" + fullName); + }else + out.print(toks[0] + "_unknown path=" + fullName); + + out.print('\n'); + out.print(toks[1]); + out.print('\n'); + } + } + } + } + } + + /** + * Recursively print a group and its members. 
Fastq data are read.If all + * flag is turned on, this method will also reads all events and model data. + * @throws OutOfMemoryError + * + * @throws Exception + */ + private void readFastq(Group g) throws OutOfMemoryError, Exception{ + if (g == null) return; + java.util.List members = g.getMemberList(); + + for (HObject member:members) { + if (member instanceof Group) { + readFastq((Group) member); + }else if (member instanceof H5ScalarDS){ + String fullName = member.getFullName(); + if (fullName.endsWith("Fastq")){ + Object data = ((H5ScalarDS) member).getData(); + String group = fullName.split("/")[2]; + if (data != null){ + //LOG.info("Read " + fullName); + String [] toks = ((String[]) data)[0].split("\n"); + if (fullName.contains("BaseCalled_2D")){ + toks[0] = toks[0].substring(1) + "_twodimentional" + " length=" + toks[1].length() + " group=" + group; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.TWODIM)); + }else if (fullName.contains("BaseCalled_complement")){ + toks[0] = toks[0].substring(1) + "_complement" + " length=" + toks[1].length() + " group=" + group ; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.COMPLEMENT)); + }else if (fullName.contains("BaseCalled_template")){ + toks[0] = toks[0].substring(1) + "_template" + " length=" + toks[1].length() + " group=" + group; + seqList.add(new BaseCalledFastq(DNA.DNA16(), toks, BaseCalledFastq.TEMPLATE)); + } + } + } + } + } + } + + public static class BaseCalledFastq extends FastqSequence{ + public static final int UNKNOWN = 4; + public static final int TWODIM = 0; + public static final int TEMPLATE = 1; + public static final int COMPLEMENT = 2; + int myType = 4; + + public BaseCalledFastq(Alphabet alphabet, String [] toks, int type) { + super(alphabet, toks); + myType = type; + } + + public int type(){ + return myType; + } + + public boolean isTwoDim(){ + return myType == TWODIM; + } + public boolean isTemplate(){ + return myType == TEMPLATE; + } + public boolean 
isComplement(){ + return myType == COMPLEMENT; + } + } +} diff --git a/src/main/java/japsa/seq/nanopore/FxDialogs.java b/src/main/java/japsa/seq/nanopore/FxDialogs.java new file mode 100644 index 0000000..d6f47b1 --- /dev/null +++ b/src/main/java/japsa/seq/nanopore/FxDialogs.java @@ -0,0 +1,145 @@ +package japsa.seq.nanopore; + +import javafx.scene.control.*; +import javafx.scene.control.Label; +import javafx.scene.control.TextArea; +import javafx.scene.input.KeyCode; +import javafx.scene.input.KeyEvent; +import javafx.scene.layout.GridPane; +import javafx.scene.layout.Priority; +import javafx.stage.StageStyle; + +import java.awt.*; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class FxDialogs { + + public static void showInformation(String title, String message) { + Alert alert = new Alert(Alert.AlertType.INFORMATION); + alert.initStyle(StageStyle.UTILITY); + alert.setTitle("Information"); + alert.setHeaderText(title); + alert.setContentText(message); + + alert.showAndWait(); + } + + public static void showWarning(String title, String message) { + Alert alert = new Alert(Alert.AlertType.WARNING); + alert.initStyle(StageStyle.UTILITY); + alert.setTitle("Warning"); + alert.setHeaderText(title); + alert.setContentText(message); + + alert.showAndWait(); + } + + public static void showError(String title, String message) { + Alert alert = new Alert(Alert.AlertType.ERROR); + alert.initStyle(StageStyle.UTILITY); + alert.setTitle("Error"); + alert.setHeaderText(title); + alert.setContentText(message); + + alert.showAndWait(); + } + + public static void showException(String title, String message, Exception exception) { + Alert alert = new Alert(Alert.AlertType.ERROR); + alert.initStyle(StageStyle.UTILITY); + alert.setTitle("Exception"); + alert.setHeaderText(title); + alert.setContentText(message); + + StringWriter sw = new StringWriter(); + PrintWriter pw = new 
PrintWriter(sw); + exception.printStackTrace(pw); + String exceptionText = sw.toString(); + + Label label = new Label("Details:"); + + TextArea textArea = new TextArea(exceptionText); + textArea.setEditable(false); + textArea.setWrapText(true); + + textArea.setMaxWidth(Double.MAX_VALUE); + textArea.setMaxHeight(Double.MAX_VALUE); + GridPane.setVgrow(textArea, Priority.ALWAYS); + GridPane.setHgrow(textArea, Priority.ALWAYS); + + GridPane expContent = new GridPane(); + expContent.setMaxWidth(Double.MAX_VALUE); + expContent.add(label, 0, 0); + expContent.add(textArea, 0, 1); + + alert.getDialogPane().setExpandableContent(expContent); + + alert.showAndWait(); + } + + public static final String YES = "Yes"; + public static final String NO = "No"; + public static final String OK = "OK"; + public static final String CANCEL = "Cancel"; + + public static String showConfirm(String title, String message, String... options) { + Alert alert = new Alert(Alert.AlertType.CONFIRMATION); + alert.initStyle(StageStyle.UTILITY); + alert.setTitle("Choose an option"); + alert.setHeaderText(title); + alert.setContentText(message); + + //To make enter key press the actual focused button, not the first one. Just like pressing "space". 
+ alert.getDialogPane().addEventFilter(KeyEvent.KEY_PRESSED, event -> { + if (event.getCode().equals(KeyCode.ENTER)) { + event.consume(); + try { + Robot r = new Robot(); + r.keyPress(java.awt.event.KeyEvent.VK_SPACE); + r.keyRelease(java.awt.event.KeyEvent.VK_SPACE); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + + if (options == null || options.length == 0) { + options = new String[]{OK, CANCEL}; + } + + List buttons = new ArrayList<>(); + for (String option : options) { + buttons.add(new ButtonType(option)); + } + + alert.getButtonTypes().setAll(buttons); + + Optional result = alert.showAndWait(); + if (!result.isPresent()) { + return CANCEL; + } else { + return result.get().getText(); + } + } + + public static String showTextInput(String title, String message, String defaultValue) { + TextInputDialog dialog = new TextInputDialog(defaultValue); + dialog.initStyle(StageStyle.UTILITY); + dialog.setTitle("Input"); + dialog.setHeaderText(title); + dialog.setContentText(message); + + Optional result = dialog.showAndWait(); + if (result.isPresent()) { + return result.get(); + } else { + return null; + } + + } + +} diff --git a/src/main/java/japsa/seq/nanopore/ImageButton.java b/src/main/java/japsa/seq/nanopore/ImageButton.java new file mode 100644 index 0000000..75e85f9 --- /dev/null +++ b/src/main/java/japsa/seq/nanopore/ImageButton.java @@ -0,0 +1,47 @@ +package japsa.seq.nanopore; + +import javafx.scene.control.Button; +import javafx.scene.image.Image; +import javafx.scene.image.ImageView; + +public class ImageButton extends Button { + + private final String STYLE_NORMAL = "-fx-background-color: transparent; -fx-padding: 5, 5, 5, 5;"; + private final String STYLE_PRESSED = "-fx-background-color: transparent; -fx-padding: 6 4 4 6;"; + private final String STYLE_HOVER = "-fx-background-color: transparent; " + + "-fx-padding: 0.333333em 0.666667em 0.333333em 0.666667em; " + + "-fx-text-fill: -fx-text-base-color; " + + "-fx-alignment: CENTER; " + + 
"-fx-content-display: LEFT;"; + + + public ImageButton(String imgUrl) { + Image folderImage = new Image(getClass().getResourceAsStream(imgUrl)); + ImageView viewFolder = new ImageView(folderImage); + viewFolder.setFitWidth(25); + viewFolder.setFitHeight(25); + setGraphic(viewFolder); + setStyle(STYLE_NORMAL); + + setOnMousePressed((event) -> { + setStyle(STYLE_PRESSED); + }); + + setOnMouseReleased((event) -> { + setStyle(STYLE_NORMAL); + + }); + + setOnMouseEntered((event) -> { + setStyle(STYLE_HOVER); + }); + + setOnMouseExited((event) -> { + setStyle(STYLE_NORMAL); + + }); + + + } + +} diff --git a/src/main/java/japsa/seq/nanopore/NanoporeReader.java b/src/main/java/japsa/seq/nanopore/NanoporeReader.java deleted file mode 100644 index 3c9d102..0000000 --- a/src/main/java/japsa/seq/nanopore/NanoporeReader.java +++ /dev/null @@ -1,907 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. 
* - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/************************** REVISION HISTORY ************************** - * 21/07/2014 - Minh Duc Cao: Created - * - ****************************************************************************/ - -package japsa.seq.nanopore; - -import java.io.BufferedReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import ncsa.hdf.object.FileFormat; -import ncsa.hdf.object.Group; -import ncsa.hdf.object.HObject; -import ncsa.hdf.object.h5.H5CompoundDS; -import ncsa.hdf.object.h5.H5ScalarDS; -import japsa.seq.Alphabet.DNA; -import japsa.seq.FastqSequence; -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.CommandLine; -import japsa.util.IntArray; -import japsa.util.JapsaException; -import japsa.util.Logging; -import japsa.util.deploy.Deployable; - -/** - * Read nanopore data (read sequence, events, alignment, models etc) from a raw - * (fast5) format. 
- * @author minhduc - * - */ -@Deployable( - scriptName = "jsa.np.f5reader2", - scriptDesc = "Extract nanopore data (fastq/fasta and native data) from h5 files" - ) -public class NanoporeReader// implements Closeable -{ - public static void main(String[] args) throws OutOfMemoryError, Exception { - /*********************** Setting up script ****************************/ - Deployable annotation = NanoporeReader.class.getAnnotation(Deployable.class); - CommandLine cmdLine = new CommandLine("\nUsage: " - + annotation.scriptName() + " [options] f1.fast5 f2.fast5 ...", - annotation.scriptDesc()); - - cmdLine.addString("output", "-", - "Name of the output file, - for stdout"); - cmdLine.addString("type", "fastq", - "Type of data to be extracted:" - + "\nfastq: sequence read in fastq format" - + "\nevents: get events" - + "\nmodels: get models" - + "\nkeys: list all keys" - ); - - cmdLine.addInt("minLength", 0, - "Minimum sequence length"); - - cmdLine.addBoolean("stats", false, "Compute statistics of reads"); - cmdLine.addBoolean("number", false, "Add a unique number to read name"); - cmdLine.addString("f5list",null, "File containing list of fast5 files, one file per line"); - - cmdLine.addStdHelp(); - args = cmdLine.stdParseLine(args); - /**********************************************************************/ - - String type = cmdLine.getStringVal("type"); - String output = cmdLine.getStringVal("output"); - String f5list = cmdLine.getStringVal("f5list"); - int minLength = cmdLine.getIntVal("minLength"); - boolean stats = cmdLine.getBooleanVal("stats"); - boolean number = cmdLine.getBooleanVal("number"); - - ArrayList fileList = new ArrayList(); - if (f5list != null){ - BufferedReader bf = SequenceReader.openFile(f5list); - String line; - while ((line = bf.readLine()) != null){ - fileList.add(line.trim()); - } - bf.close(); - } - - for (int i = 0; i < args.length; i++){ - fileList.add(args[i]); - } - - SequenceOutputStream sos = 
SequenceOutputStream.makeOutputStream(output); - if (type.equals("fastq")) - readFastq(fileList, minLength, sos, stats,number); - else if (type.equals("events")) - readEvents(fileList, sos, stats); - else if (type.equals("models")) - readModels(fileList, sos, stats); - else if (type.equals("keys")) - readKeys(fileList, sos, stats); - - sos.close(); - //int maxLength = 0, minLength = Integer.MAX_VALUE; - }//main - - - public static void readKeys(ArrayList fileList, SequenceOutputStream sos, boolean stats){ - for (String fileName:fileList){ - Logging.info("Open " + fileName); - try{ - NanoporeReader reader = new NanoporeReader(fileName); - reader.readKeys(); - reader.close(); - sos.print("Keys in " + fileName +"\n"); - sos.print(reader.keyList); - }catch (Exception e){ - Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); - } - }//for - - } - - public static void readEvents(ArrayList fileList, SequenceOutputStream sos, boolean stats){ - for (String fileName:fileList){ - Logging.info("Open " + fileName); - try{ - NanoporeReader reader = new NanoporeReader(fileName); - reader.readData(); - reader.close(); - if (reader.events != null){ - int maxIndx = 0, minIndx = 0; - sos.print("Detected Events:\n"); - for (int i = 0; i < reader.events.mean.length;i++){ - sos.print(i); - sos.print('\t'); - sos.print(reader.events.mean[i]); - sos.print('\t'); - sos.print(reader.events.stdv[i]); - sos.print('\t'); - sos.print(reader.events.length[i]); - sos.print('\t'); - sos.print(reader.events.start[i]); - sos.print('\n'); - if (stats){ - if (reader.events.mean[i] < reader.events.mean[minIndx]) - minIndx = i; - - if (reader.events.mean[i] > reader.events.mean[maxIndx]) - maxIndx = i; - } - } - if (stats){ - Logging.info("Min Event = " + reader.events.mean[minIndx] + " at " + minIndx); - Logging.info("Max Event = " + reader.events.mean[maxIndx] + " at " + maxIndx); - } - } - if (reader.bcTempEvents != null){ - int maxIndx = 0, minIndx = 0; - sos.print("Template 
Events:\n"); - for (int i = 0; i < reader.bcTempEvents.mean.length;i++){ - sos.print(i); - sos.print('\t'); - sos.print(reader.bcTempEvents.mean[i]); - sos.print('\t'); - sos.print(reader.bcTempEvents.stdv[i]); - sos.print('\t'); - sos.print(reader.bcTempEvents.length[i]); - sos.print('\t'); - sos.print(reader.bcTempEvents.start[i]); - sos.print('\n'); - - if (stats){ - if (reader.bcTempEvents.mean[i] < reader.bcTempEvents.mean[minIndx]) - minIndx = i; - - if (reader.bcTempEvents.mean[i] > reader.bcTempEvents.mean[maxIndx]) - maxIndx = i; - } - } - if (stats){ - Logging.info("Min Temp = " + reader.bcTempEvents.mean[minIndx] + " at " + minIndx); - Logging.info("Max Temp = " + reader.bcTempEvents.mean[maxIndx] + " at " + maxIndx); - } - } - if (reader.bcCompEvents != null){ - int maxIndx = 0, minIndx = 0; - sos.print("Complement Events:\n"); - for (int i = 0; i < reader.bcCompEvents.mean.length;i++){ - sos.print(i); - sos.print('\t'); - sos.print(reader.bcCompEvents.mean[i]); - sos.print('\t'); - sos.print(reader.bcCompEvents.stdv[i]); - sos.print('\t'); - sos.print(reader.bcCompEvents.length[i]); - sos.print('\t'); - sos.print(reader.bcCompEvents.start[i]); - sos.print('\n'); - - if (stats){ - if (reader.bcCompEvents.mean[i] < reader.bcCompEvents.mean[minIndx]) - minIndx = i; - - if (reader.bcCompEvents.mean[i] > reader.bcCompEvents.mean[maxIndx]) - maxIndx = i; - } - } - if (stats){ - Logging.info("Min Comp = " + reader.bcCompEvents.mean[minIndx] + " at " + minIndx); - Logging.info("Max Comp = " + reader.bcCompEvents.mean[maxIndx] + " at " + maxIndx); - } - } - - - - }catch (Exception e){ - Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); - } - }//for - - } - - - public static void readModels(ArrayList fileList, SequenceOutputStream sos, boolean stats){ - for (String fileName:fileList){ - Logging.info("Open " + fileName); - try{ - NanoporeReader reader = new NanoporeReader(fileName); - reader.readData(); - reader.close(); - - if 
(reader.bcTempModel != null){ - int maxIndx = 0, minIndx = 0; - sos.print("Template model:" + fileName +"\n"); - for (int i = 0; i < reader.bcTempModel.levelMean.length;i++){ - sos.print(reader.bcTempModel.kmer[i]); - sos.print('\t'); - sos.print(reader.bcTempModel.levelMean[i]); - sos.print('\t'); - sos.print(reader.bcTempModel.levelStdv[i]); - sos.print('\t'); - sos.print(reader.bcTempModel.sdMean[i]); - sos.print('\t'); - sos.print(reader.bcTempModel.sdStdv[i]); - sos.print('\t'); - //sos.print(reader.bcTempModel.weigth[i]); - //sos.print('\n'); - if (stats){ - if (reader.bcTempModel.levelMean[i] < reader.bcTempModel.levelMean[minIndx]) - minIndx = i; - - if (reader.bcTempModel.levelMean[i] > reader.bcTempModel.levelMean[maxIndx]) - maxIndx = i; - } - } - if (stats){ - Logging.info("Min Event = " + reader.bcTempModel.levelMean[minIndx] + " at " + minIndx + "(" + reader.bcTempModel.kmer[minIndx] + ")"); - Logging.info("Max Event = " + reader.bcTempModel.levelMean[maxIndx] + " at " + maxIndx + "(" + reader.bcTempModel.kmer[maxIndx] + ")"); - } - } - - if (reader.bcCompModel != null){ - int maxIndx = 0, minIndx = 0; - sos.print("Complement model:\n"); - for (int i = 0; i < reader.bcCompModel.levelMean.length;i++){ - sos.print(reader.bcCompModel.kmer[i]); - sos.print('\t'); - sos.print(reader.bcCompModel.levelMean[i]); - sos.print('\t'); - sos.print(reader.bcCompModel.levelStdv[i]); - sos.print('\t'); - sos.print(reader.bcCompModel.sdMean[i]); - sos.print('\t'); - sos.print(reader.bcCompModel.sdStdv[i]); - sos.print('\t'); - //sos.print(reader.bcCompModel.weigth[i]); - //sos.print('\n'); - if (stats){ - if (reader.bcCompModel.levelMean[i] < reader.bcCompModel.levelMean[minIndx]) - minIndx = i; - - if (reader.bcCompModel.levelMean[i] > reader.bcCompModel.levelMean[maxIndx]) - maxIndx = i; - } - } - if (stats){ - Logging.info("Min Event = " + reader.bcCompModel.levelMean[minIndx] + " at " + minIndx + "(" + reader.bcCompModel.kmer[minIndx] + ")"); - Logging.info("Max 
Event = " + reader.bcCompModel.levelMean[maxIndx] + " at " + maxIndx + "(" + reader.bcCompModel.kmer[maxIndx] + ")"); - } - } - }catch (Exception e){ - Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); - } - }//for - - } - - - - /** - * Read read sequence from a list of fast5 files. - * @param fileList - * @param sos : output stream - * @param stats: print out statistics - */ - public static void readFastq(ArrayList fileList, int minLength, SequenceOutputStream sos, boolean stats,boolean number){ - int tempCount = 0, compCount = 0, twoDCount = 0; - int fileNumber = 0; - IntArray lengths = new IntArray(); - { - for (String fileName:fileList){ - Logging.info("Open " + fileName); - try{ - NanoporeReader reader = new NanoporeReader(fileName); - reader.readFastq(false); - reader.close(); - - - //Get time & date - String log = reader.getLog(); - if (log != null){ - String [] toks = log.split("\n"); - if (toks.length > 0) - toks = toks[toks.length - 1].split(","); - - log = toks[0]; - }else - log = ""; - - FastqSequence fq; - - fq = reader.getSeq2D(); - if (fq != null && fq.length() >= minLength){ - fq.setName((number?(fileNumber *3) + "_":"") + fq.getName() + " " + log); - fq.print(sos); - if (stats){ - lengths.add(fq.length()); - twoDCount ++; - } - } - - fq = reader.getSeqTemplate(); - if (fq != null && fq.length() >= minLength){ - fq.setName((number?(fileNumber *3 + 1) + "_":"") + fq.getName() + " " + log); - fq.print(sos); - if (stats){ - lengths.add(fq.length()); - tempCount ++; - } - } - - fq = reader.getSeqComplement(); - if (fq != null && fq.length() >= minLength){ - fq.setName((number?(fileNumber *3 + 2) + "_":"") + fq.getName() + " " + log); - fq.print(sos); - if (stats){ - lengths.add(fq.length()); - compCount ++; - } - } - - fileNumber ++; - }catch (Exception e){ - Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); - } - - }//for - }//if - else - - - if (stats){ - Logging.info("Getting stats ... 
"); - int [] ls = lengths.toArray(); - Arrays.sort(ls); - - long baseCount = 0; - for (int i = 0; i < ls.length; i++) - baseCount += ls[i]; - - double mean = baseCount / ls.length; - double median = ls[ls.length/2]; - long sum = 0; - int quantile1st = 0, quantile2nd = 0, quantile3rd = 0; - for (int i = 0; i < ls.length; i++){ - sum += ls[i]; - if (quantile1st == 0 && sum >= baseCount / 4) - quantile1st = i; - - if (quantile2nd == 0 && sum >= baseCount / 2) - quantile2nd = i; - - if (quantile3rd == 0 && sum >= baseCount * 3/ 4) - quantile3rd = i; - - } - - Logging.info("Open " + fileNumber + " files from " + fileList.size()); - Logging.info("Read count = " + ls.length + "(" + tempCount + " temppate, " + compCount + " complements and " + twoDCount +" 2D)"); - Logging.info("Base count = " + baseCount); - Logging.info("Longest read = " + ls[ls.length - 1] + ", shortest read = " + ls[0]); - Logging.info("Average read length = " + mean); - Logging.info("Median read length = " + median); - Logging.info("Quantile first = " + ls[quantile1st] + " second = " + ls[quantile2nd] + " third = " + ls[quantile3rd]); - } - } - - - String log = null; - String keyList = "";//List of key - - BaseCallAlignment2D bcAlignment2D = null; - BaseCallAlignmentHairpin bcAlignmentHairpin = null; - BaseCallModel bcCompModel = null, bcTempModel = null; - - DetectedEvents events; - BaseCallEvents bcCompEvents = null, bcTempEvents = null; - - FastqSequence seqTemplate = null, seqComplement = null, seq2D = null; - double seqTime = 0; - - - private FileFormat f5File; - - /** - * Open a fast5 file before reading anything from it. - * - * The file should be closed before gabbage collected. 
- * - * @param fileName - * @throws OutOfMemoryError - * @throws Exception - */ - public NanoporeReader (String fileName) throws JapsaException, OutOfMemoryError, Exception{ - FileFormat fileFormat = FileFormat.getFileFormat(FileFormat.FILE_TYPE_HDF5); - - if (fileFormat == null){ - throw new JapsaException("Cannot read HDF5 file, possily because JHI5 is not installed or configured. Please refer to npReader installation guide or contact the deverlopers."); - } - - //Logging.info("Open " + fileName); - f5File = fileFormat.createInstance(fileName, FileFormat.READ); - if (f5File == null) - throw new RuntimeException("Unable to open file " + fileName); - - f5File.open(); - } - - - public void close() throws Exception{ - f5File.close(); - } - - //public void readFastq() throws OutOfMemoryError, Exception{ - // Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); - // readData(root, false); - //} - - public void readFastq(boolean withTime) throws OutOfMemoryError, Exception{ - Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); - readFastqAndTime(root, withTime); - } - - public void readData() throws OutOfMemoryError, Exception{ - Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); - readData(root, true); - } - - - public void readKeys() throws OutOfMemoryError, Exception{ - Group root = (Group) ((javax.swing.tree.DefaultMutableTreeNode) f5File.getRootNode()).getUserObject(); - readMembers(root); - } - - - /** - * Get base call events for complement strand - * @return the bcCompEvents - */ - public BaseCallEvents getBcCompEvents() { - return bcCompEvents; - } - - - /** - * Get base call events for template strand - * @return the bcTempEvents - */ - public BaseCallEvents getBcTempEvents() { - return bcTempEvents; - } - - /** - * Get 2D alignment - * @return the bcAlignment2D - */ - public BaseCallAlignment2D 
getBcAlignment2D() { - return bcAlignment2D; - } - - /** - * Get hairpin alignment - * @return the bcAlignmentHairpin - */ - public BaseCallAlignmentHairpin getBcAlignmentHairpin() { - return bcAlignmentHairpin; - } - - /** - * Get the model for base call of the complement - * @return the bcCompModel - */ - public BaseCallModel getBcCompModel() { - return bcCompModel; - } - - /** - * Get the model for base call of the template - * @return the bcTempModel - */ - public BaseCallModel getBcTempModel() { - return bcTempModel; - } - - /** - * Get the events from the pore - * @return the events - */ - public DetectedEvents getEvents() { - return events; - } - - /** - * @return the seqTemplate - */ - public FastqSequence getSeqTemplate() { - return seqTemplate; - } - - public String getLog() { - return log; - } - - /** - * @return the seqComplement - */ - public FastqSequence getSeqComplement() { - return seqComplement; - } - - /** - * @return the seq2D - */ - public FastqSequence getSeq2D() { - return seq2D; - } - - /** - * Recursively print its member names and types. 
- * - * @throws OutOfMemoryError - * @throws Exception - */ - private void readMembers(Group g) throws OutOfMemoryError, Exception{ - - if (g == null) return; - java.util.List members = g.getMemberList(); - - - for (HObject member:members) { - String fullName = member.getFullName(); - if (member instanceof Group) { - this.keyList += "Group : " + fullName + "\n"; - readMembers((Group) member); - }else if (member instanceof H5CompoundDS){ - - //Logging.info(member.getClass() +" "); - Object dat = ((H5CompoundDS) member).getData(); - if (dat != null){ - this.keyList += "H5CompoundDS : " + fullName + "=" + dat.getClass() +"\n"; - }else - this.keyList += "H5CompoundDS : " + fullName + "=null\n"; - }else if (member instanceof H5ScalarDS){ - Object dat = ((H5ScalarDS) member).getData(); - if (dat != null){ - this.keyList += "H5ScalarDS : " + fullName + "=" + dat.getClass() +"\n"; - }else - this.keyList += "H5ScalarDS : " + fullName + "=null\n"; - } - } - } - - String expStart = ""; - - - private void readFastqAndTime(Group g, boolean withTime) throws OutOfMemoryError, Exception{ - - if (g == null) return; - java.util.List members = g.getMemberList(); - - for (HObject member:members) { - String f = member.getFullName(); - if (withTime && f.contains("tracking_id")){ - @SuppressWarnings("unchecked") - List aL = (List) member.getMetadata(); - for (ncsa.hdf.object.Attribute att:aL){ - if (att.getName().equals("exp_start_time")){ - expStart = ((String[]) att.getValue())[0]; - break;//for att - } - } - continue;//for memmer - } - - if (member instanceof Group) { - readFastqAndTime((Group) member, withTime); - }else if (withTime && member instanceof H5CompoundDS ){ - String fullName = member.getFullName(); - if ((fullName.startsWith("/Analyses/EventDetection_000/Reads/") && fullName.endsWith("Events") )){ - @SuppressWarnings("unchecked") - List dat = (List) (((H5CompoundDS) member).getData()); - long [] eventsStatTime = (long[]) dat.get(2); - seqTime = 
eventsStatTime[eventsStatTime.length -1] / 5000.0; - } - }else if (member instanceof H5ScalarDS){ - String fullName = member.getFullName(); - if (fullName.endsWith("Fastq")){ - Object data = ((H5ScalarDS) member).getData(); - if (data != null){ - //Logging.info("Read " + fullName); - String [] toks = ((String[]) data)[0].split("\n"); - if (fullName.contains("BaseCalled_2D")){ - //toks[0] = toks[0].substring(1) + "_twodimentional#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_twodimentional" + " length=" + toks[1].length() ; - this.seq2D = new FastqSequence(DNA.DNA16(), toks); - }else if (fullName.contains("BaseCalled_complement")){ - //toks[0] = toks[0].substring(1) + "_complement#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_complement" + " length=" + toks[1].length() ; - this.seqComplement = new FastqSequence(DNA.DNA16(), toks); - }else if (fullName.contains("BaseCalled_template")){ - //toks[0] = toks[0].substring(1) + "_template#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_template" + " length=" + toks[1].length() ; - this.seqTemplate = new FastqSequence(DNA.DNA16(), toks); - } - } - }else if (fullName.endsWith("Log")){ - // Logging.info("Read " + fullName); - Object data = ((H5ScalarDS) member).getData(); - if (data != null){ - log = ((String[]) data)[0]; - //System.out.println("\n\n" + log + "\n\n"); - } - } - } - } - } - - /** - * Recursively print a group and its members. Fastq data are read.If all - * flag is turned on, this method will also reads all events and model data. 
- * @throws OutOfMemoryError - * - * @throws Exception - */ - private void readData(Group g, boolean all) throws OutOfMemoryError, Exception{ - - if (g == null) return; - java.util.List members = g.getMemberList(); - - for (HObject member:members) { - //System.out.println(indent + member + " " + member.getPath() + " " + member.getClass()); - //System.out.println(indent + member + " " + member.getPath() + " " + member.getClass()); - String f = member.getFullName(); - if (f.contains("tracking_id")){ - @SuppressWarnings("unchecked") - List aL = (List) member.getMetadata(); - for (ncsa.hdf.object.Attribute att:aL){ - if (att.getName().equals("exp_start_time")){ - expStart = ((String[]) att.getValue())[0]; - } - } - } - - if (member instanceof Group) { - readData((Group) member, all); - }else if (all && member instanceof H5CompoundDS){ - String fullName = member.getFullName(); - - //Logging.info(member.getClass() +" "); - @SuppressWarnings("unchecked") - List dat = (List) (((H5CompoundDS) member).getData()); - if (dat != null){ - /********************************************************/ - if (fullName.endsWith("BaseCalled_2D/Alignment")){ - //Logging.info("Read " + fullName); - bcAlignment2D = new BaseCallAlignment2D(); - bcAlignment2D.template = (long[]) dat.get(0); - bcAlignment2D.complement = (long[]) dat.get(1); - bcAlignment2D.kmer = (String[]) dat.get(2); - }else if (fullName.endsWith("BaseCalled_complement/Events")){ - //Logging.info("Read " + fullName); - bcCompEvents = new BaseCallEvents(); - bcCompEvents.mean = (double[]) dat.get(0); - bcCompEvents.start = (double[]) dat.get(1); - bcCompEvents.stdv = (double[]) dat.get(2); - bcCompEvents.length = (double[]) dat.get(3); - bcCompEvents.modelState = (String[]) dat.get(4); - bcCompEvents.modelLevel = (double[]) dat.get(5); - bcCompEvents.move = (long[]) dat.get(6); - bcCompEvents.pModelState = (double[]) dat.get(7); - bcCompEvents.mpState = (String[]) dat.get(8); - bcCompEvents.pMpState = (double[]) dat.get(9); - 
- bcCompEvents.pA = (double[]) dat.get(10); - bcCompEvents.pC = (double[]) dat.get(11); - bcCompEvents.pG = (double[]) dat.get(12); - bcCompEvents.pT = (double[]) dat.get(13); - //bcCompEvents.rawIndex = (long[]) dat.get(14); - }else if (fullName.endsWith("BaseCalled_template/Events")){ - //Logging.info("Read " + fullName); - bcTempEvents = new BaseCallEvents(); - bcTempEvents.mean = (double[]) dat.get(0); - bcTempEvents.start = (double[]) dat.get(1); - bcTempEvents.stdv = (double[]) dat.get(2); - bcTempEvents.length = (double[]) dat.get(3); - bcTempEvents.modelState = (String[]) dat.get(4); - bcTempEvents.modelLevel = (double[]) dat.get(5); - bcTempEvents.move = (long[]) dat.get(6); - bcTempEvents.pModelState = (double[]) dat.get(7); - bcTempEvents.mpState = (String[]) dat.get(8); - bcTempEvents.pMpState = (double[]) dat.get(9); - - bcTempEvents.pA = (double[]) dat.get(10); - bcTempEvents.pC = (double[]) dat.get(11); - bcTempEvents.pG = (double[]) dat.get(12); - bcTempEvents.pT = (double[]) dat.get(13); - //bcTempEvents.rawIndex = (long[]) dat.get(14); - }else if (fullName.endsWith("BaseCalled_complement/Model")){ - Logging.info("Read " + fullName); - bcCompModel = new BaseCallModel(); - bcCompModel.kmer = (String[]) dat.get(0); - //bcCompModel.variant = (double[]) dat.get(1); - bcCompModel.levelMean = (double[]) dat.get(2); - bcCompModel.levelStdv = (double[]) dat.get(3); - bcCompModel.sdMean = (double[]) dat.get(4); - bcCompModel.sdStdv = (double[]) dat.get(5); - //bcCompModel.weigth = (double[]) dat.get(6); - }else if (fullName.endsWith("BaseCalled_template/Model")){ - Logging.info("Read " + fullName); - bcTempModel = new BaseCallModel(); - bcTempModel.kmer = (String[]) dat.get(0); - //bcTempModel.variant = (double[]) dat.get(1); - bcTempModel.levelMean = (double[]) dat.get(2); - bcTempModel.levelStdv = (double[]) dat.get(3); - bcTempModel.sdMean = (double[]) dat.get(4); - bcTempModel.sdStdv = (double[]) dat.get(5); - //bcTempModel.weigth = (double[]) 
dat.get(6); - }else if (fullName.startsWith("/Analyses/EventDetection_000/Reads/") && fullName.endsWith("Events") ){ - Logging.info("Read " + fullName); - events = new DetectedEvents(); - events.mean = (double[]) dat.get(0); - events.stdv = (double[]) dat.get(1); - events.start = (long[]) dat.get(2); - events.length = (long[]) dat.get(3); - }else if (fullName.endsWith("HairpinAlign/Alignment")){ - Logging.info("Read " + fullName); - bcAlignmentHairpin = new BaseCallAlignmentHairpin(); - bcAlignmentHairpin.template = (long[]) dat.get(0); - bcAlignmentHairpin.complement = (long[]) dat.get(1); - } - /********************************************************/ - } - }else if (member instanceof H5ScalarDS){ - String fullName = member.getFullName(); - if (fullName.endsWith("Fastq")){ - Object data = ((H5ScalarDS) member).getData(); - if (data != null){ - Logging.info("Read " + fullName); - String [] toks = ((String[]) data)[0].split("\n"); - if (fullName.contains("BaseCalled_2D")){ - //toks[0] = toks[0].substring(1) + "_twodimentional#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_twodimentional" + " length=" + toks[1].length() ; - this.seq2D = new FastqSequence(DNA.DNA16(), toks); - }else if (fullName.contains("BaseCalled_complement")){ - //toks[0] = toks[0].substring(1) + "_complement#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_complement" + " length=" + toks[1].length() ; - this.seqComplement = new FastqSequence(DNA.DNA16(), toks); - }else if (fullName.contains("BaseCalled_template")){ - //toks[0] = toks[0].substring(1) + "_template#" + f5File.getName().replace("imb13_010577_lt", "imb13-010577-lt") + " length=" + toks[1].length() ; - toks[0] = toks[0].substring(1) + "_template" + " length=" + toks[1].length() ; - this.seqTemplate = new FastqSequence(DNA.DNA16(), toks); - } - } - }else if 
(fullName.endsWith("Log")){ - Logging.info("Read " + fullName); - Object data = ((H5ScalarDS) member).getData(); - if (data != null){ - log = ((String[]) data)[0]; - //System.out.println("\n\n" + log + "\n\n"); - } - } - - } - } - } - - public static class BaseCallModel{ - String [] kmer; - double[] variant; - double[] levelMean, levelStdv, sdMean, sdStdv;//, weigth; - - } - - public static class BaseCallEvents{ - int dim; - double [] mean, start, stdv, length, modelLevel, pModelState, pMpState, pA, pC, pG, pT; - long []move;//, rawIndex; - String [] modelState, mpState; - - public long [] getMove(){ - return move; - } - - public double [] length(){ - return length; - } - - public double [] mean(){ - return mean; - } - - public double [] stdv(){ - return stdv; - } - - } - - public static class BaseCallAlignment2D{ - int dim; - long [] template, complement; - String [] kmer; - } - - public static class BaseCallAlignmentHairpin{ - int dim; - long [] template, complement; - } - - public static class DetectedEvents{ - int dim; - double [] mean, stdv; - long [] start; - long [] length; - - public double [] getMean(){ - return mean; - } - - /** - * @return the stdv - */ - public double[] getStdv() { - return stdv; - } - - /** - * @return the start - */ - public long[] getStart() { - return start; - } - - /** - * @return the length - */ - public long[] getLength() { - return length; - } - - } - -} diff --git a/src/main/java/japsa/seq/nanopore/NanoporeReaderStream.java b/src/main/java/japsa/seq/nanopore/NanoporeReaderStream.java index c4c220a..3fe6d04 100644 --- a/src/main/java/japsa/seq/nanopore/NanoporeReaderStream.java +++ b/src/main/java/japsa/seq/nanopore/NanoporeReaderStream.java @@ -34,20 +34,29 @@ package japsa.seq.nanopore; +import java.io.BufferedWriter; import java.io.File; import java.io.IOException; +import java.io.PrintWriter; import java.net.Socket; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import 
java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import japsa.seq.FastqSequence; import japsa.seq.SequenceOutputStream; +import japsa.seq.nanopore.Fast5NPReader.BaseCalledFastq; import japsa.util.DoubleArray; import japsa.util.IntArray; import japsa.util.JapsaException; -import japsa.util.Logging; import japsa.util.net.StreamClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * Read nanopore data (read sequence, events, alignment, models etc) from a raw @@ -56,7 +65,10 @@ * */ public class NanoporeReaderStream{ - public String prepareIO(){ + private static final Logger LOG = LoggerFactory.getLogger(NanoporeReaderStream.class); + + + public String prepareIO(){ String msg = null; try{ sos = SequenceOutputStream.makeOutputStream(output); @@ -81,14 +93,16 @@ public String prepareIO(){ public void close() throws IOException{ - Logging.info("npReader closing"); + LOG.info("npReader closing"); sos.close(); if (networkOS != null){ for (SequenceOutputStream out:networkOS) out.close(); } - Logging.info("npReader closed"); - done = true; + if(dmplx!=null) + dmplx.close(); + LOG.info("npReader closed"); + //done = true; } double tempLength = 0, compLength = 0, twoDLength = 0; @@ -97,26 +111,42 @@ public void close() throws IOException{ DoubleArray qual2D = new DoubleArray(), qualComp = new DoubleArray(), qualTemp = new DoubleArray(); IntArray lengths2D = new IntArray(), lengthsComp = new IntArray(), lengthsTemp = new IntArray(); - int fileNumber = 0; - int passNumber = 0, failNumber = 0; SequenceOutputStream sos; ArrayList networkOS = null; public boolean stats, number; public String folder = null; - public int minLength = 0; - public boolean wait = true; + public int minLength = 1; + public volatile boolean wait = true; public boolean realtime = true; - public int interval = 1, age = 1000; + public int interval = 1, age = 30000; public boolean doFail = false; - public String output = ""; + public 
String output = "-"; public String streamServers = null; - boolean doLow = true; - public boolean getTime = false; - boolean done = false; + public boolean getTimeStamp = false; + private double rps = 4000.0; + public String format = "fastq"; public boolean ready = true; private static final byte MIN_QUAL = '!';//The minimum quality + + public boolean exhaustive = false; + Demultiplexer dmplx = null; + private String bcFile = null; + + public void updateDemultiplexFile(String file){ + if(file==null) + return; + bcFile = file; + try{ + dmplx = new Demultiplexer(file); + }catch(IOException e){ + e.printStackTrace(); + } + } + public String getBCFileName(){ + return bcFile; + } /** * Compute average quality of a read @@ -150,8 +180,7 @@ public void print(FastqSequence fq) throws IOException{ } } - @SuppressWarnings("unused") - private void flush() throws IOException{ + void flush() throws IOException{ sos.flush(); if (networkOS != null){ for (SequenceOutputStream out:networkOS) @@ -159,111 +188,65 @@ private void flush() throws IOException{ } } + public boolean readFastq2(String fileName) throws JapsaException, IOException{ - //Logging.info("Open " + fileName); + //LOG.info("Open " + fileName); try{ - NanoporeReader npReader = new NanoporeReader(fileName); - npReader.readFastq(getTime); + Fast5NPReader npReader = new Fast5NPReader(fileName); + npReader.readFastq(); + if (getTimeStamp) + npReader.readTime(); npReader.close(); - //Get time & date - String log = npReader.getLog(); - if (log != null){ - String [] toks = log.split("\n"); - if (toks.length > 0) - toks = toks[toks.length - 1].split(","); - - log = toks[0]; - }else - log = ""; - - if (getTime){ - log = "ExpStart=" + npReader.expStart + " timestamp=" + npReader.seqTime + " " + log; - } - - FastqSequence fq; - - fq = npReader.getSeq2D(); - if (fq != null && fq.length() >= minLength){ - fq.setName((number?(fileNumber *3) + "_":"") + fq.getName() + " " + log); - print(fq); - if (stats){ - 
lengths.add(fq.length()); - lengths2D.add(fq.length()); - twoDCount ++; - if (fq.length() > 0){ - double sumQual = 0; - for (int p = 0; p < fq.length(); p++){ - sumQual += (fq.getQualByte(p) - MIN_QUAL); - + ArrayList seqList = npReader.getFastqList(); + if(seqList == null || seqList.isEmpty()) + return false; + else{ + for (BaseCalledFastq fq:seqList){ + if (fq.length() >= minLength){ + fq.setName((number?(getTotalFilesNumber() *3 + fq.type()) + "_":"") + fq.getName() + + (getTimeStamp? (" timestamp=" +(npReader.timeStamp/rps)):"") + ); + //do multiplexing here + if(dmplx!=null) + dmplx.clustering(fq); + + print(fq); + if (stats){ + lengths.add(fq.length()); + double sumQual = 0; + for (int p = 0; p < fq.length(); p++){ + sumQual += (fq.getQualByte(p) - MIN_QUAL); + } + if (fq.isTwoDim()){ + lengths2D.add(fq.length()); + twoDCount ++; + qual2D.add(sumQual/fq.length()); + }else if (fq.isComplement()){ + lengthsComp.add(fq.length()); + compCount ++; + qualComp.add(sumQual/fq.length()); + }else if (fq.isTemplate()){ + lengthsTemp.add(fq.length()); + tempCount ++; + qualTemp.add(sumQual/fq.length()); + } } - qual2D.add(sumQual/fq.length()); } } } - fq = npReader.getSeqTemplate(); - if (fq != null && fq.length() >= minLength && this.doLow){ - fq.setName((number?(fileNumber *3 + 1) + "_":"") + fq.getName() + " " + log); - print(fq); - if (stats){ - lengths.add(fq.length()); - lengthsTemp.add(fq.length()); - tempCount ++; - - if (fq.length() > 0){ - double sumQual = 0; - for (int p = 0; p < fq.length(); p++){ - sumQual += (fq.getQualByte(p) - MIN_QUAL); - - } - qualTemp.add(sumQual/fq.length()); - } - } - } - - fq = npReader.getSeqComplement(); - if (fq != null && fq.length() >= minLength && this.doLow){ - fq.setName((number?(fileNumber *3 + 2) + "_":"") + fq.getName() + " " + log); - print(fq); - if (stats){ - lengths.add(fq.length()); - lengthsComp.add(fq.length()); - compCount ++; - - if (fq.length() > 0){ - double sumQual = 0; - for (int p = 0; p < fq.length(); p++){ 
- sumQual += (fq.getQualByte(p) - MIN_QUAL); - - } - qualComp.add(sumQual/fq.length()); - } - - } - } - - fileNumber ++; + //fileNumber ++; }catch (JapsaException e){ throw e; }catch (Exception e){ - Logging.error("Problem with reading " + fileName + ":" + e.getMessage()); + LOG.error("Problem with reading " + fileName + ":" + e.getMessage()); e.printStackTrace(); return false; } return true; } - - public boolean moveFile(File f, String pFolder){ - String fName = f.getName(); - if (f.renameTo(new File(pFolder + fName))){ - Logging.info("Move " + fName + " to " + pFolder); - return true; - } - else - return false; - } - + /*****************************************************************************/ /** * Read read sequence from a list of fast5 files. @@ -272,164 +255,96 @@ public boolean moveFile(File f, String pFolder){ * @param stats: print out statistics * @throws IOException */ - public void readFastq(String pFolder) throws JapsaException, IOException{ - if (pFolder != null ){ - pFolder = pFolder + File.separatorChar; - Logging.info("Copy to " + pFolder); - } - /*********************************************** - if (f5List != null){ - Logging.info("Reading in file " + f5List); - BufferedReader bf = SequenceReader.openFile(f5List); - String fileName; - while ((fileName = bf.readLine())!=null){ - readFastq2(fileName); - - //Move to done folder - if (pFolder != null){ - moveFile(new File(fileName), pFolder); - } - }//while - bf.close(); - }else - /***********************************************/ - {//folder - HashSet filesDone = new HashSet(); - - File mainFolder = new File(folder); - File passFolder = new File(folder + File.separatorChar + "pass"); - File failFolder = new File(folder + File.separatorChar + "fail"); - - while (wait){ - //Do main - long now = System.currentTimeMillis(); - File [] fileList = mainFolder.listFiles(); - Logging.info("Reading in folder " + mainFolder.getAbsolutePath()); - if (fileList!=null){ - for (File f:fileList){ - if (!wait) - 
break; - - //directory - if (!f.isFile()) - continue;//for - - if (!f.getName().endsWith("fast5")) - continue;//for - - //File too new - if (now - f.lastModified() < age) - continue;//for - - //if processed already - String sPath = f.getAbsolutePath(); - if (filesDone.contains(sPath)) - continue;//for - - if (readFastq2(sPath)){ - filesDone.add(sPath); - if (pFolder != null){ - moveFile(f, pFolder); - }//if - }//if - }//for - }//if - else{ - Logging.info("Folder " + mainFolder.getAbsolutePath() + " does not exist, are you sure this is the right folder?"); - } - - //Pass folder - now = System.currentTimeMillis(); - Logging.info("Reading in folder " + passFolder.getAbsolutePath()); - fileList = passFolder.listFiles(); - if (fileList!=null){ - for (File f:fileList){ - if (!wait) - break; - - //directory - if (!f.isFile()) - continue;//for - - if (!f.getName().endsWith("fast5")) - continue;//for - - //File too new - if (now - f.lastModified() < age) - continue;//for - - //if processed already - String sPath = f.getAbsolutePath(); - if (filesDone.contains(sPath)) - continue;//for - - if (readFastq2(sPath)){ - passNumber ++; - filesDone.add(sPath); - if (pFolder != null){ - moveFile(f, pFolder); - }//if - }//if - }//for - }//if - - //Fail folder - if (doFail){ - now = System.currentTimeMillis(); - Logging.info("Reading in folder " + failFolder.getAbsolutePath()); - fileList = failFolder.listFiles(); - if (fileList!=null){ - for (File f:fileList){ - if (!wait) - break; - - //directory - if (!f.isFile()) - continue; - - if (!f.getName().endsWith("fast5")) - continue; - - //File too new - if (now - f.lastModified() < age) - continue; - - //if processed already - String sPath = f.getAbsolutePath(); - if (filesDone.contains(sPath)) - continue; - - if (readFastq2(sPath)){ - failNumber ++; - filesDone.add(sPath); - if (pFolder != null){ - moveFile(f, pFolder); - }//if - }//if - }//for - }//if - } - if (!realtime) - break; - - for (int x = 0; x < interval && wait; x++){ - try { 
- Thread.sleep(1000); - } catch (InterruptedException e) { + HashSet filesOK = new HashSet(), + filesSkipped = new HashSet(); + public void readFast5() throws JapsaException, IOException{ + if (minLength < 1) + minLength = 1; + + LOG.info("Start reading " + folder); + + + //HashSet filesDone = new HashSet(); + + while (wait){ + //Do main + final long now = System.currentTimeMillis(); + + LOG.info("Start reading " + now ); + try{ + Files.walk(Paths.get(folder)) + //is a file + .filter(Files::isRegularFile) + //fail folder + .filter(p -> { + try{ + Path failFolderPath= Paths.get(folder+ File.separator + "fail"); + if(failFolderPath.toFile().isDirectory()) + return doFail || !Files.isSameFile(p.getParent(), failFolderPath); + else + return true; + }catch(IOException e){ e.printStackTrace(); + return false; + } + }) + //fast5 file + .filter(p -> p.toString().endsWith("fast5")) + //age is old enough + .filter(p -> { + try{ + return now - Files.getLastModifiedTime(p).toMillis() > age; + }catch (IOException e1) { + e1.printStackTrace(); + return false; } + }) + //not read before + .filter(p -> !filesOK.contains(p.toString()) && (exhaustive || !filesSkipped.contains(p.toString())) ) + //read + .forEach(p -> { + // System.out.println(p); + try { + if (readFastq2(p.toString())){ + filesOK.add(p.toString()); + filesSkipped.remove(p.toString()); + } + else + filesSkipped.add(p.toString()); + + } catch (JapsaException | IOException e) { + e.printStackTrace(); + } + + if(!wait) + throw new BreakException("Stopping"); + }); + + }catch(BreakException e){ + LOG.info("Stop to read on directory " + folder); + } + /*******************************************************/ + if (!realtime) + break; + + for (int x = 0; x < interval && wait; x++){ + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); } + } + + }//while + LOG.info("EXITING"); - }//while - Logging.info("EXISTING"); - } if (stats){ - Logging.info("Getting stats ... 
"); + LOG.info("Getting stats ... "); int [] ls = lengths.toArray(); if (ls.length ==0){ - Logging.info("Open " + fileNumber + " files"); - Logging.info("Fould 0 reads"); + LOG.info("Open " + getTotalFilesNumber() + " files"); + LOG.info("Found 0 reads"); }else{ Arrays.sort(ls); @@ -453,13 +368,13 @@ public void readFastq(String pFolder) throws JapsaException, IOException{ quantile3rd = i; } - Logging.info("Open " + fileNumber + " files"); - Logging.info("Read count = " + ls.length + "(" + tempCount + " temppate, " + compCount + " complements and " + twoDCount +" 2D)"); - Logging.info("Base count = " + baseCount); - Logging.info("Longest read = " + ls[ls.length - 1] + ", shortest read = " + ls[0]); - Logging.info("Average read length = " + mean); - Logging.info("Median read length = " + median); - Logging.info("Quantile first = " + ls[quantile1st] + " second = " + ls[quantile2nd] + " third = " + ls[quantile3rd]); + LOG.info("Open " + getTotalFilesNumber() + " files"); + LOG.info("Read count = " + ls.length + "(" + tempCount + " templates, " + compCount + " complements and " + twoDCount +" 2D)"); + LOG.info("Base count = " + baseCount); + LOG.info("Longest read = " + ls[ls.length - 1] + ", shortest read = " + ls[0]); + LOG.info("Average read length = " + mean); + LOG.info("Median read length = " + median); + LOG.info("Quantile first = " + ls[quantile1st] + " second = " + ls[quantile2nd] + " third = " + ls[quantile3rd]); if (qual2D.size() > 0){ double sumQual = 0; @@ -473,7 +388,7 @@ public void readFastq(String pFolder) throws JapsaException, IOException{ double meanQual = sumQual / qual2D.size(); double stdQual = Math.sqrt(sumQualSq / qual2D.size() - meanQual * meanQual); - Logging.info("Ave 2D qual " +meanQual + " " + qual2D.size() + " std = " + stdQual); + LOG.info("Ave 2D qual " +meanQual + " " + qual2D.size() + " std = " + stdQual); } if (qualTemp.size() > 0){ @@ -488,7 +403,7 @@ public void readFastq(String pFolder) throws JapsaException, IOException{ double 
meanQual = sumQual / qualTemp.size(); double stdQual = Math.sqrt(sumQualSq / qualTemp.size() - meanQual * meanQual); - Logging.info("Ave Temp qual " +meanQual + " " + qualTemp.size() + " std = " + stdQual); + LOG.info("Ave Temp qual " +meanQual + " " + qualTemp.size() + " std = " + stdQual); } if (qualComp.size() > 0){ @@ -504,10 +419,46 @@ public void readFastq(String pFolder) throws JapsaException, IOException{ double meanQual = sumQual / qualComp.size(); double stdQual = Math.sqrt(sumQualSq / qualComp.size() - meanQual * meanQual); - Logging.info("Ave Comp qual " + meanQual + " " + qualComp.size() + " std = " + stdQual); + LOG.info("Ave Comp qual " + meanQual + " " + qualComp.size() + " std = " + stdQual); } } + printToFile("stats"); } } + public void printToFile(String prefix) throws IOException{ + if(prefix.length() < 1) + prefix = "out"; + BufferedWriter lenFile = new BufferedWriter(new PrintWriter(prefix + ".len")), + qualTempFile = new BufferedWriter(new PrintWriter(prefix + ".temp.qual")), + qualCompFile = new BufferedWriter(new PrintWriter(prefix + ".comp.qual")), + qual2DFile = new BufferedWriter(new PrintWriter(prefix + ".2d.qual")); + for(int i=0; i < lengths.size(); i++){ + lenFile.write(lengths.get(i) + "\n"); + } + for(int i=0; i < qualTemp.size(); i++){ + qualTempFile.write(new DecimalFormat("#0.000").format(qualTemp.get(i)) + "\n"); + } + for(int i=0; i < qualComp.size(); i++){ + qualCompFile.write(new DecimalFormat("#0.000").format(qualComp.get(i)) + "\n"); + } + for(int i=0; i < qual2D.size(); i++){ + qual2DFile.write(new DecimalFormat("#0.000").format(qual2D.get(i)) + "\n"); + } + lenFile.close(); + qualTempFile.close(); + qualCompFile.close(); + qual2DFile.close(); + } + + public synchronized int getTotalFilesNumber(){ + return filesOK.size() + filesSkipped.size(); + } + public synchronized int getOKFilesNumber(){ + return filesOK.size(); + } + public synchronized int getSkippedFilesNumber(){ + return filesSkipped.size(); + } + } diff --git 
a/src/main/java/japsa/seq/nanopore/NanoporeReaderWindowFX.java b/src/main/java/japsa/seq/nanopore/NanoporeReaderWindowFX.java new file mode 100644 index 0000000..5c7d05e --- /dev/null +++ b/src/main/java/japsa/seq/nanopore/NanoporeReaderWindowFX.java @@ -0,0 +1,1192 @@ +/***************************************************************************** + * Copyright (c) Son Hoang Nguyen, IMB - UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/************************** REVISION HISTORY ************************** + * 01/03/2017 - Son Hoang Nguyen: Created + * + ****************************************************************************/ +package japsa.seq.nanopore; + +import japsa.util.DynamicHistogram; +import japsa.util.JapsaException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +import org.jfree.chart.ChartColor; +import org.jfree.chart.ChartFactory; +import org.jfree.chart.ChartPanel; +import org.jfree.chart.JFreeChart; +import org.jfree.chart.axis.DateAxis; +import org.jfree.chart.axis.NumberAxis; +import org.jfree.chart.labels.StandardCategoryItemLabelGenerator; +import org.jfree.chart.plot.CategoryPlot; +import org.jfree.chart.plot.PlotOrientation; +import org.jfree.chart.plot.SeriesRenderingOrder; +import org.jfree.chart.plot.XYPlot; +import org.jfree.chart.renderer.category.BarRenderer; +import org.jfree.chart.renderer.xy.StackedXYAreaRenderer; +import org.jfree.data.category.DefaultCategoryDataset; +import org.jfree.data.statistics.HistogramType; +import org.jfree.data.time.Second; +import org.jfree.data.time.TimeTableXYDataset; + +import javafx.application.Application; +import javafx.application.Platform; +import javafx.embed.swing.SwingNode; +import javafx.geometry.HPos; +import javafx.geometry.Insets; +import javafx.geometry.Pos; +import javafx.scene.Scene; +import javafx.scene.control.Button; +import javafx.scene.control.CheckBox; +import javafx.scene.control.ComboBox; +import javafx.scene.control.Label; +import javafx.scene.control.Separator; +import javafx.scene.control.Tab; +import javafx.scene.control.TabPane; +import javafx.scene.control.TextField; +import 
javafx.scene.image.Image; +import javafx.scene.image.ImageView; +import javafx.scene.input.KeyCode; +import javafx.scene.layout.BorderPane; +import javafx.scene.layout.ColumnConstraints; +import javafx.scene.layout.GridPane; +import javafx.scene.layout.HBox; +import javafx.scene.layout.Priority; +import javafx.scene.layout.RowConstraints; +import javafx.scene.layout.StackPane; +import javafx.scene.layout.VBox; +import javafx.scene.text.Font; +import javafx.scene.text.FontWeight; +import javafx.scene.text.Text; +import javafx.stage.DirectoryChooser; +import javafx.stage.FileChooser; +import javafx.stage.FileChooser.ExtensionFilter; +import javafx.stage.Stage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class NanoporeReaderWindowFX extends Application{ + private static final Logger LOG = LoggerFactory.getLogger(NanoporeReaderWindowFX.class); + + + TimeTableXYDataset allReadsCount = new TimeTableXYDataset(), + demultiplexedStackReadsCount = new TimeTableXYDataset(); + DefaultCategoryDataset demultiplexedBarReadsCount = new DefaultCategoryDataset(); + DynamicHistogram histoLengthDataSet = new DynamicHistogram(), + histoQualDataSet = new DynamicHistogram(); + + static NanoporeReaderStream reader = new NanoporeReaderStream(); + + public static void setReader(NanoporeReaderStream r){ + reader = r; + } + + public void start(Stage primaryStage){ + + running(primaryStage); + } + + private void running(Stage stage){ + //Start with a BorderPane as root + BorderPane border = new BorderPane(); + // Put start/stop button here + HBox hbox = addHBox(); + border.setTop(hbox); + + // All the parameters setting to the left + leftBox=addVBox(stage); + border.setLeft(leftBox); + + // Add a stack to the HBox in the top region with Restart button. 
+ // Uncomment this and line 332 (buttonRestart.setDisability(true)) + // to have the function + // addStackPane(hbox, stage); + + // Here the main content + tabPane = new TabPane(); + Tab mainTab = new Tab("Main",addMainGridPane()), + bcTab = new Tab("Barcode",addBarcodePane()); + tabPane.getTabs().addAll(mainTab,bcTab); + + bcTab.disableProperty().bind(barcodeCB.selectedProperty().not()); + border.setCenter(tabPane); + + + Scene scene = new Scene(border); + stage.setScene(scene); + stage.setTitle("npreader"); + stage.setOnCloseRequest(e -> { + Platform.exit(); + System.exit(0); + }); + stage.show(); + + new Thread(new Runnable(){ + + @Override + public void run() { + while (!reader.ready){ + //LOG.info("NOT READY"); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + // TODO Auto-generated method stub + LOG.info("GO"); + + updateData(); + try{ + reader.readFast5(); + }catch (JapsaException e){ + System.err.println(e.getMessage()); + e.getStackTrace(); + interupt(e); + }catch (Exception e){ + e.printStackTrace(); + } + + } + + }).start(); + + } + + private void reset(){ + buttonStart.setDisable(false); + buttonStop.setDisable(true); + leftBox.setDisable(false); + + allReadsCount = new TimeTableXYDataset(); + demultiplexedStackReadsCount = new TimeTableXYDataset(); + histoLengthDataSet = new DynamicHistogram(); + histoQualDataSet = new DynamicHistogram(); + + NanoporeReaderStream newReader = new NanoporeReaderStream(); + //reader.getTime = time; + newReader.number = reader.number; + newReader.minLength = reader.minLength; + newReader.interval = reader.interval; + newReader.age = reader.age; + newReader.folder = reader.folder; + newReader.doFail = reader.doFail; + newReader.output = reader.output; + newReader.format = reader.format.toLowerCase(); + newReader.realtime = reader.realtime; + newReader.streamServers = reader.streamServers; + newReader.exhaustive = reader.exhaustive; + newReader.realtime = true; + 
newReader.stats = true;//GUI implies stats + newReader.ready = false;//wait for the command from GUI + newReader.updateDemultiplexFile(reader.getBCFileName()); + + setReader(newReader); + + txtCompReads.setText("0"); + txtTempReads.setText("0"); + txt2DReads.setText("0"); + txtPFiles.setText("0"); + txtFFiles.setText("0"); + txtTFiles.setText("0"); + + } + + private void restart(Stage stage){ + reset(); + running(stage); + } + /* + * Components from left pane + */ + private VBox leftBox; + private TabPane tabPane; + private Button buttonStart, buttonStop, buttonRestart, + inputBrowseButton, barcodeBrowseButton, outputBrowseButton; + private TextField inputTF, barcodeTF, bcThresholdTF, outputTF, streamTF, minLenTF; + private CheckBox failCB, exhautiveCB, barcodeCB, serversCB, saveDemultiplexToFilesOptCB, addNumberOptCB; + private ComboBox outputToCombo, outputFormatCombo; + /* + * Creates an HBox with two buttons for the top region + */ + + private HBox addHBox() { + + HBox hbox = new HBox(); + hbox.setPadding(new Insets(15, 12, 15, 12)); + hbox.setSpacing(10); // Gap between nodes + hbox.setStyle("-fx-background-color: #336699;"); + + + Image imageStart = new Image(getClass().getResourceAsStream("icons/start.png")); + ImageView viewStart = new ImageView(imageStart); + viewStart.setFitWidth(20); + viewStart.setFitHeight(20); + buttonStart = new Button("Start", viewStart); + buttonStart.setPrefSize(100, 20); + buttonStart.setOnAction((event) -> { + //1. 
Validate before running + //validate input + String _path = inputTF.getText().trim(); + if (_path.equals("")){ + FxDialogs.showWarning("File not found!", "Please specify download directory"); + inputTF.requestFocus(); + return; + } + + File _file = new File(_path); + if (!_file.isDirectory()){ + FxDialogs.showWarning("File not found!", "Directory \"" + _path + "\" does not exist!"); + inputTF.requestFocus(); + return; + } + reader.folder = _path; + //validate output + if (outputToCombo.getSelectionModel().getSelectedItem().toString().equals("to file")){ + String _foutput = outputTF.getText().trim(); + if (_foutput.equals("")){ + FxDialogs.showWarning("File not found!", "Please specify output file"); + outputTF.requestFocus(); + return; + } else if(new File(_foutput).exists()){ + String confirm = FxDialogs.showConfirm( "Output file already exists!", "Are you sure to overwrite the old file?", "No", "Yes"); + if(confirm.equals("No")){ + outputTF.requestFocus(); + return; + } + } + reader.output = new File(_foutput).getAbsolutePath(); + try{ + System.setProperty("usr.dir", Paths.get(reader.output).getParent().toString()); + } + catch(NullPointerException | IllegalArgumentException | SecurityException e ){ + e.printStackTrace(); + FxDialogs.showWarning("Illegal output folder!", "Please specify another output destination"); + outputTF.requestFocus(); + return; + } + }else + reader.output = "-";//stream + + + //validate stream + if (serversCB.isSelected()){ + if (streamTF.getText().trim().equals("")){ + FxDialogs.showWarning("Server(s) not found!", "Please specify output address of a server"); + streamTF.requestFocus(); + return; + } + reader.streamServers = streamTF.getText().trim(); + } + + //validate barcode analysis + if(barcodeCB.isSelected()){ + if(barcodeTF.getText().trim().equals("")){ + FxDialogs.showWarning("File not found!", "Please specify barcode file for demultiplex"); + barcodeTF.requestFocus(); + return; + } + 
reader.updateDemultiplexFile(barcodeTF.getText().trim()); + } + try{ + reader.minLength = Integer.parseInt(minLenTF.getText().trim()); + }catch(NumberFormatException e){ + reader.minLength = 0; + minLenTF.setText("0"); + } + + if(reader.dmplx!=null) + try{ + reader.dmplx.setThreshold(Integer.parseInt(bcThresholdTF.getText().trim())); + }catch(NumberFormatException e){ + bcThresholdTF.setText(Integer.toString(reader.dmplx.SCORE_THRES)); + } + + String msg = reader.prepareIO(); + if (msg !=null){ + FxDialogs.showWarning("Warning", msg); + return; + } + + //Start running + leftBox.setDisable(true); + + buttonStart.setDisable(true);; + buttonStop.setDisable(false); + + reader.ready = true; + }); + + Image imageStop = new Image(getClass().getResourceAsStream("icons/stop.png")); + ImageView viewStop = new ImageView(imageStop); + viewStop.setFitWidth(20); + viewStop.setFitHeight(20); + buttonStop = new Button("Stop", viewStop); + buttonStop.setPrefSize(100, 20); + buttonStop.setDisable(true); + buttonStop.setOnAction((event) -> { + String confirm = FxDialogs.showConfirm( "STOP button just being hit...", "Do you really want to stop the process?", "No", "Yes"); + if(confirm.equals("No")){ + return; + } + + reader.wait = false; + buttonStop.setDisable(true); + + try { + reader.flush(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + if(!reader.output.equals("-")){ + try { + reader.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + //buttonRestart.setDisable(false); + + }); + + + hbox.getChildren().addAll(buttonStart, buttonStop); + + return hbox; + } + + private final int LeftPaneWidth=360; + /* + * Creates a VBox with a list of parameter settings + */ + private VBox addVBox(Stage stage) { + + VBox vbox = new VBox(); + vbox.setPadding(new Insets(10)); // Set all sides to 10 + vbox.setSpacing(8); // Gap between nodes + + final Text title = new Text("Settings"); + title.setFont(Font.font("Arial", 
FontWeight.BOLD, 15)); + vbox.getChildren().add(title); + final Separator sep1 = new Separator(); + sep1.setMaxWidth(LeftPaneWidth); + vbox.getChildren().add(1, sep1); + + vbox.getChildren().add(addInputPane(stage)); + vbox.setSpacing(5); + final Separator sep2 = new Separator(); + sep2.setMaxWidth(LeftPaneWidth); + vbox.getChildren().add(3, sep2); + + + vbox.getChildren().add(addOutputPane(stage)); + vbox.setSpacing(5); + final Separator sep3 = new Separator(); + sep3.setMaxWidth(LeftPaneWidth); + vbox.getChildren().add(5, sep3); + + vbox.getChildren().add(addOptionPane()); + + + return vbox; + } + private GridPane addInputPane(Stage stage) { + GridPane inputPane = createFixGridPane(LeftPaneWidth, 5); + + final Label inputLabel = new Label("Input:"); + inputLabel.setFont(Font.font("Roman", FontWeight.BOLD, 12)); + inputLabel.setStyle("-fx-underline:true"); + GridPane.setConstraints(inputLabel, 0,0); + inputPane.getChildren().add(inputLabel); + + failCB = new CheckBox("Include fail folder"); + failCB.setSelected(reader.doFail); + failCB.selectedProperty().addListener( + (obs_val,old_val,new_val) -> { + reader.doFail = new_val; + }); + + GridPane.setConstraints(failCB, 2,0,2,1); + inputPane.getChildren().add(failCB); + + + + inputTF = new TextField(reader.folder == null?"":reader.folder); + inputTF.setPromptText("Enter folder of basecalled reads..."); + inputTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + //textField.setPrefWidth(250); + GridPane.setConstraints(inputTF, 0,1,4,1); + inputPane.getChildren().add(inputTF); + + + inputBrowseButton = new ImageButton("icons/folder.png"); + inputBrowseButton.setPrefSize(10, 10); + inputBrowseButton.setOnAction((event) -> { + DirectoryChooser chooser = new DirectoryChooser(); + chooser.setTitle("Select basecalled raw data (fast5) directory"); + File defaultDirectory = new File(inputTF.getText()); + if(defaultDirectory.isDirectory()) + 
chooser.setInitialDirectory(defaultDirectory); + File selectedDirectory = chooser.showDialog(stage); + if(selectedDirectory != null){ + reader.folder = selectedDirectory.getPath(); + inputTF.setText(reader.folder); + } + }); + GridPane.setConstraints(inputBrowseButton, 4,1); + GridPane.setHalignment(inputBrowseButton,HPos.LEFT); + inputPane.getChildren().add(inputBrowseButton); + //inputPane.setGridLinesVisible(true); + + barcodeCB = new CheckBox("Demultiplexing for barcode analysis"); + barcodeCB.setSelected(reader.dmplx!=null); + GridPane.setConstraints(barcodeCB, 0,3,5,1); + inputPane.getChildren().add(barcodeCB); + + barcodeTF = new TextField(reader.getBCFileName() == null?"":reader.getBCFileName()); + barcodeTF.setPromptText("Enter name of barcode sequences file..."); + barcodeTF.setDisable(!barcodeCB.isSelected()); + barcodeTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + GridPane.setConstraints(barcodeTF, 0,4,4,1); + inputPane.getChildren().add(barcodeTF); + + barcodeBrowseButton = new ImageButton("icons/folder.png"); + barcodeBrowseButton.setPrefSize(10, 10); + barcodeBrowseButton.setDisable(!barcodeCB.isSelected()); + barcodeBrowseButton.setOnAction((event) -> { + FileChooser chooser = new FileChooser(); + chooser.setTitle("Select barcode file"); + File defaultFile = new File(barcodeTF.getText()); + if(defaultFile.isFile()) + chooser.setInitialFileName(defaultFile.getName()); + chooser.setInitialDirectory(defaultFile.getParentFile()); + chooser.setSelectedExtensionFilter(new ExtensionFilter("FASTA files", "*.fasta", "*.fna", "*.fa")); + File selectedFile = chooser.showOpenDialog(stage); + if(selectedFile != null){ + reader.updateDemultiplexFile(selectedFile.getPath()); + barcodeTF.setText(reader.getBCFileName()); + } + }); + + final Label label = new Label("Barcode matching threshold"); + label.setDisable(!barcodeCB.isSelected());; + GridPane.setConstraints(label, 0,5,3,1); + 
inputPane.getChildren().add(label); + + bcThresholdTF = new TextField(reader.dmplx!=null?Integer.toString(reader.dmplx.SCORE_THRES):""); + bcThresholdTF.setPromptText("Enter minimum score..."); + barcodeBrowseButton.setDisable(!barcodeCB.isSelected()); + bcThresholdTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + GridPane.setConstraints(bcThresholdTF, 3,5); + inputPane.getChildren().add(bcThresholdTF); + + GridPane.setConstraints(barcodeBrowseButton, 4,4); + GridPane.setHalignment(barcodeBrowseButton,HPos.LEFT); + inputPane.getChildren().add(barcodeBrowseButton); + + barcodeCB.selectedProperty().addListener( + (obs_val,old_val,new_val) -> { + if(!new_val) + reader.dmplx = null; + barcodeTF.setDisable(!new_val); + barcodeBrowseButton.setDisable(!new_val); + bcThresholdTF.setDisable(!new_val); + saveDemultiplexToFilesOptCB.setDisable(!new_val); + label.setDisable(!new_val); + tabPane.getSelectionModel().select(0); + }); + + return inputPane; + } + private GridPane addOutputPane(Stage stage) { + GridPane outputPane = createFixGridPane(LeftPaneWidth, 5); + + final Label outputLabel = new Label("Output:"); + outputLabel.setFont(Font.font("Roman", FontWeight.BOLD, 12)); + outputLabel.setStyle("-fx-underline:true"); + GridPane.setConstraints(outputLabel, 0,0); + outputPane.getChildren().add(outputLabel); + + outputToCombo = new ComboBox(); + outputToCombo.getItems().addAll("to file", "to stdout"); + outputToCombo.setValue(reader.output.equals("-")?"to stdout":"to file"); + outputToCombo.valueProperty().addListener((obs_val, old_val, new_val) -> { + if(new_val.trim().equals("to file")){ + //outputTF.setText(""); + outputTF.setDisable(false); + outputBrowseButton.setDisable(false); + } else{ + outputTF.setText("-"); + outputTF.setDisable(true); + outputBrowseButton.setDisable(true); + } + }); + GridPane.setConstraints(outputToCombo, 1, 0, 2, 1); + outputPane.getChildren().add(outputToCombo); + + outputFormatCombo = 
new ComboBox(); + outputFormatCombo.getItems().addAll("fastq", "fasta"); + outputFormatCombo.setValue(reader.format); + outputFormatCombo.valueProperty().addListener((obs_val, old_val, new_val) -> { + reader.format = new_val; + }); + GridPane.setConstraints(outputFormatCombo, 3, 0, 2, 1); + outputPane.getChildren().add(outputFormatCombo); + + outputTF = new TextField(reader.output); + outputTF.setDisable(reader.output.equals("-")); + outputTF.setPromptText("Enter name for output file..."); + outputTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + GridPane.setConstraints(outputTF, 0,1,4,1); + outputPane.getChildren().add(outputTF); + + + outputBrowseButton = new ImageButton("icons/folder.png"); + outputBrowseButton.setPrefSize(10, 10); + outputBrowseButton.setDisable(reader.output.equals("-")); + outputBrowseButton.setOnAction((event) -> { + FileChooser fileChooser = new FileChooser(); + fileChooser.setTitle("Save output to file"); + File initFolder = new File(inputTF.getText()); + if(initFolder.isDirectory()) + fileChooser.setInitialDirectory(initFolder); + fileChooser.setInitialFileName("output."+reader.format); + File savedFile = fileChooser.showSaveDialog(stage); + if(savedFile != null){ + reader.output = savedFile.getAbsolutePath(); + outputTF.setText(reader.output); + } + }); + GridPane.setConstraints(outputBrowseButton, 4,1); + GridPane.setHalignment(outputBrowseButton, HPos.LEFT); + outputPane.getChildren().add(outputBrowseButton); + + //init +// if(reader.output.equals("-")){ +// outputTF.setText("-"); +// outputToCombo.setValue("to stdout"); +// outputTF.setDisable(true); +// outputBrowseButton.setDisable(true); +// }else{ +// outputTF.setText(reader.output); +// outputToCombo.setValue("to file"); +// outputTF.setDisable(false); +// outputBrowseButton.setDisable(false); +// } + + + serversCB = new CheckBox("Streaming output to server(s)"); + serversCB.selectedProperty().addListener( + 
(obs_val,old_val,new_val) -> { + streamTF.setDisable(!new_val); + }); + GridPane.setConstraints(serversCB, 0,4,3,1); + outputPane.getChildren().add(serversCB); + + streamTF = new TextField(); + streamTF.setPromptText("address1:port1, address2:port2,..."); + streamTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + GridPane.setConstraints(streamTF, 0,5,4,1); + outputPane.getChildren().add(streamTF); + + if(reader.streamServers != null){ + serversCB.setSelected(true); + streamTF.setText(reader.streamServers); + }else{ + streamTF.setDisable(true); + } + //outputPane.setGridLinesVisible(true); + return outputPane; + } + private GridPane addOptionPane() { + GridPane optionPane = createFixGridPane(LeftPaneWidth, 5); + + final Label optLabel = new Label("Other options:"); + optLabel.setFont(Font.font("Roman", FontWeight.BOLD, 12)); + optLabel.setStyle("-fx-underline:true"); + + GridPane.setConstraints(optLabel, 0,0,4,1); + optionPane.getChildren().add(optLabel); + + saveDemultiplexToFilesOptCB = new CheckBox("Save demultiplexed reads to separated files"); + saveDemultiplexToFilesOptCB.setSelected(Demultiplexer.toPrint); + saveDemultiplexToFilesOptCB.setDisable(!barcodeCB.isSelected());; + saveDemultiplexToFilesOptCB.selectedProperty().addListener( + (obs_val,old_val,new_val) -> { + Demultiplexer.toPrint=new_val; + }); + GridPane.setConstraints(saveDemultiplexToFilesOptCB, 0,2,4,1); + optionPane.getChildren().add(saveDemultiplexToFilesOptCB); + + addNumberOptCB = new CheckBox("Assign unique number to every read name"); + addNumberOptCB.setSelected(reader.number); + addNumberOptCB.selectedProperty().addListener( + (obs_val,old_val,new_val) -> { + reader.number=new_val; + }); + GridPane.setConstraints(addNumberOptCB, 0,4,4,1); + optionPane.getChildren().add(addNumberOptCB); + + exhautiveCB = new CheckBox("Exhaustively watch-mode (Albacore)"); + exhautiveCB.setSelected(reader.exhaustive); + 
exhautiveCB.selectedProperty().addListener( + (obs_val,old_val,new_val) -> { + reader.exhaustive = new_val; + }); + + GridPane.setConstraints(exhautiveCB, 0,6,4,1); + optionPane.getChildren().add(exhautiveCB); + + final Label label2 = new Label("Filter out read shorter than "); + GridPane.setConstraints(label2, 0,8,3,1); + optionPane.getChildren().add(label2); + + minLenTF = new TextField(Integer.toString(reader.minLength)); + minLenTF.setPromptText("min."); + minLenTF.setOnKeyPressed(e -> { + if (e.getCode() == KeyCode.ENTER) { + buttonStart.requestFocus(); + } + }); + GridPane.setConstraints(minLenTF, 3,8); + optionPane.getChildren().add(minLenTF); + + final Label label3 = new Label("bp"); + GridPane.setConstraints(label3, 4,8); + optionPane.getChildren().add(label3); + + return optionPane; + } + + private GridPane createFixGridPane(int width, int ncols){ + GridPane gridpane = new GridPane(); + for (int i = 0; i < ncols; i++) { + ColumnConstraints column = new ColumnConstraints(1.0*width/ncols); + gridpane.getColumnConstraints().add(column); + } + gridpane.setPadding(new Insets(10, 10, 10, 10)); + gridpane.setVgap(5); + gridpane.setHgap(5); + return gridpane; + } + private GridPane createAutoresizeGridPane(int ncols, int nrows){ + GridPane gridpane = new GridPane(); + for (int i = 0; i < ncols; i++) { + ColumnConstraints column = new ColumnConstraints(); + column.setPercentWidth(100/ncols); + gridpane.getColumnConstraints().add(column); + } + for (int i = 0; i < nrows; i++) { + RowConstraints row = new RowConstraints(); + row.setPercentHeight(100/nrows); + gridpane.getRowConstraints().add(row); + } + gridpane.setPadding(new Insets(5, 5, 5, 5)); + gridpane.setVgap(5); + gridpane.setHgap(5); + return gridpane; + } + /* + * Restart button locates here + * @param hb HBox to add the stack to + */ + private void addStackPane(HBox hb, Stage stage) { + + StackPane stack = new StackPane(); + + Image imageStart = new 
Image(getClass().getResourceAsStream("icons/restart.png")); + ImageView viewStart = new ImageView(imageStart); + viewStart.setFitWidth(20); + viewStart.setFitHeight(20); + buttonRestart = new Button("Restart", viewStart); + buttonRestart.setOnAction(e ->{ + restart(stage); + }); + buttonRestart.setDisable(true); + + stack.getChildren().add(buttonRestart); + stack.setAlignment(Pos.CENTER_RIGHT); + // Add offset to right for question mark to compensate for RIGHT + // alignment of all nodes + //StackPane.setMargin(buttonRestart, new Insets(0, 10, 0, 0)); + + hb.getChildren().add(stack); + HBox.setHgrow(stack, Priority.ALWAYS); + + } + + /* + * Creates a grid for the center region with 2 columns and 2 rows + */ + private GridPane addMainGridPane() { + + GridPane mainGrid = createAutoresizeGridPane(2,2); + mainGrid.setStyle("-fx-background-color: #C0C0C0;"); + /* + * Read count chart + */ + final JFreeChart chart = ChartFactory.createStackedXYAreaChart( + "Read count", // chart title + "Time", // domain axis label + "Read number", // range axis label + allReadsCount + ); + + final StackedXYAreaRenderer render = new StackedXYAreaRenderer(); + DateAxis domainAxis = new DateAxis(); + domainAxis.setAutoRange(true); + domainAxis.setDateFormatOverride(new SimpleDateFormat("HH:mm:ss")); + + XYPlot plot = (XYPlot) chart.getPlot(); + plot.setRenderer(render); + plot.setDomainAxis(domainAxis); + plot.setSeriesRenderingOrder(SeriesRenderingOrder.FORWARD); + plot.setForegroundAlpha(0.5f); + + NumberAxis rangeAxis = (NumberAxis) plot.getRangeAxis(); + rangeAxis.setNumberFormatOverride(new DecimalFormat("#,###.#")); + rangeAxis.setAutoRange(true); + + ChartPanel chartPanel = new ChartPanel(chart, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + SwingNode chartSwingNode = new SwingNode(); + chartSwingNode.setContent(chartPanel); + GridPane.setConstraints(chartSwingNode, 0,0); + + 
mainGrid.getChildren().add(chartSwingNode); + + /* + * Read length histogram + */ + + //histoLengthDataSet=new DynamicHistogram(); + histoLengthDataSet.prepareSeries("Read Length", 500, 0, 40000); + //histoDataset.prepareSeries("2D", 50, 0, 50000); + //histoDataset.prepareSeries("template", 50, 0, 50000); + //histoDataset.prepareSeries("complement", 50, 0, 50000); + + JFreeChart hisLengths=ChartFactory.createHistogram("Read length histogram","length","count",histoLengthDataSet,PlotOrientation.VERTICAL,true,true,false); + ChartPanel hisPanel = new ChartPanel(hisLengths, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + + XYPlot hisPlot = (XYPlot) hisLengths.getPlot(); + hisPlot.getDomainAxis().setAutoRange(true); + hisPlot.getRangeAxis().setAutoRange(true); + + SwingNode lengthSwingNode = new SwingNode(); + lengthSwingNode.setContent(hisPanel); + GridPane.setConstraints(lengthSwingNode, 1,0); +// GridPane.setHalignment(lengthSwingNode, HPos.CENTER); +// GridPane.setValignment(lengthSwingNode, VPos.CENTER); + mainGrid.getChildren().add(lengthSwingNode); + + /* + * Quality histogram + */ + //histoQualDataSet=new DynamicHistogram(); + histoQualDataSet.setType(HistogramType.SCALE_AREA_TO_1); + histoQualDataSet.prepareSeries("2D", 100, 0, 30); + histoQualDataSet.prepareSeries("template", 100, 0, 30); + histoQualDataSet.prepareSeries("complement", 100, 0, 30); + + + JFreeChart hisQual=ChartFactory.createXYLineChart("Quality","quality","frequency",histoQualDataSet,PlotOrientation.VERTICAL,true,true,false); + ChartPanel hisQualPanel = new ChartPanel(hisQual, + 450, + 280, + 450, + 280, + 450, + 280, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + + XYPlot hisQualPlot = (XYPlot) hisQual.getPlot(); + hisQualPlot.getDomainAxis().setAutoRange(true); + hisQualPlot.getRangeAxis().setAutoRange(true); + 
hisQualPlot.setForegroundAlpha(0.8F); + + SwingNode qualitySwingNode = new SwingNode(); + qualitySwingNode.setContent(hisQualPanel); + GridPane.setConstraints(qualitySwingNode, 1,1); + mainGrid.getChildren().add(qualitySwingNode); + + /* + * Statistics field + */ + GridPane countPane = createAutoresizeGridPane(3, 9); + countPane.setPadding(new Insets(30, 30, 30, 30)); + countPane.setStyle("-fx-background-color: #AABBCC;"); + + + final Label lblFiles = new Label("Total fast5 files"); + GridPane.setConstraints(lblFiles, 0, 0); + countPane.getChildren().add(lblFiles); + + //txtTFiles = new TextField("0"); + txtTFiles.setPrefWidth(100); + GridPane.setConstraints(txtTFiles, 1, 0); + countPane.getChildren().add(txtTFiles); + + final Label lblpFiles = new Label("Good-read fast5"); + GridPane.setConstraints(lblpFiles, 0, 1); + countPane.getChildren().add(lblpFiles); + + //txtPFiles = new TextField("0"); + txtPFiles.setEditable(false); + txtPFiles.setPrefWidth(100); + GridPane.setConstraints(txtPFiles, 1, 1); + countPane.getChildren().add(txtPFiles); + + + final Label lblFFiles = new Label("Invalid fast5"); + GridPane.setConstraints(lblFFiles, 0, 2); + countPane.getChildren().add(lblFFiles); + + //txtFFiles = new TextField("0"); + txtFFiles.setEditable(false); + txtFFiles.setPrefWidth(100); + GridPane.setConstraints(txtFFiles, 1, 2); + countPane.getChildren().add(txtFFiles); + + + + final Label lbl2DReads = new Label("2D reads"); + GridPane.setConstraints(lbl2DReads, 0, 3); + countPane.getChildren().add(lbl2DReads); + + //txt2DReads= new TextField("0"); + txt2DReads.setEditable(false); + txt2DReads.setPrefWidth(100); + GridPane.setConstraints(txt2DReads, 1, 3); + countPane.getChildren().add(txt2DReads); + + final Label lblTempReads = new Label("Template reads"); + GridPane.setConstraints(lblTempReads, 0, 4); + countPane.getChildren().add(lblTempReads); + + //txtTempReads= new TextField("0"); + txtTempReads.setEditable(false); + txtTempReads.setPrefWidth(100); + 
GridPane.setConstraints(txtTempReads, 1, 4); + countPane.getChildren().add(txtTempReads); + + final Label lblCompReads = new Label("Complement reads"); + GridPane.setConstraints(lblCompReads, 0, 5); + countPane.getChildren().add(lblCompReads); + + //txtCompReads= new TextField("0"); + txtCompReads.setEditable(false); + txtCompReads.setPrefWidth(100); + GridPane.setConstraints(txtCompReads, 1, 5); + countPane.getChildren().add(txtCompReads); + + + GridPane.setConstraints(countPane, 0, 1); + mainGrid.getChildren().add(countPane); + +// mainGrid.setGridLinesVisible(true); + return mainGrid; + } + + /* + * Create a Grid Pane for barcode analysis + */ + private GridPane addBarcodePane(){ + GridPane mainGrid = createAutoresizeGridPane(1,2); + mainGrid.setStyle("-fx-background-color: #C0C0C0;"); + mainGrid.setPadding(new Insets(5, 100, 5, 100)); + mainGrid.setVgap(5); + mainGrid.setHgap(5); + /* + * Read count stack chart + */ + final JFreeChart stackChart = ChartFactory.createXYLineChart( + "", // chart title + "Time", // domain axis label + "Over-time read count", // range axis label + demultiplexedStackReadsCount, + PlotOrientation.VERTICAL, + true, + true, + false + ); + + final StackedXYAreaRenderer stackRender = new StackedXYAreaRenderer(); + + DateAxis domainAxis = new DateAxis(); + domainAxis.setAutoRange(true); + domainAxis.setDateFormatOverride(new SimpleDateFormat("HH:mm:ss")); + + XYPlot plot = (XYPlot) stackChart.getPlot(); + plot.setRenderer(stackRender); + plot.setDomainAxis(domainAxis); + plot.setSeriesRenderingOrder(SeriesRenderingOrder.FORWARD); + plot.setForegroundAlpha(0.5f); + + NumberAxis rangeAxis = (NumberAxis) plot.getRangeAxis(); + rangeAxis.setNumberFormatOverride(new DecimalFormat("#,###.#")); + rangeAxis.setAutoRange(true); + + ChartPanel stackChartPanel = new ChartPanel(stackChart, + 500, + 300, + 500, + 300, + 500, + 300, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + SwingNode 
stackChartSwingNode = new SwingNode(); + stackChartSwingNode.setContent(stackChartPanel); + + GridPane.setConstraints(stackChartSwingNode, 0,0); + mainGrid.getChildren().add(stackChartSwingNode); + + /* + * Reads count bar chart + */ + final JFreeChart barChart = ChartFactory.createBarChart( + "", + "Barcode", + "In-time read count", + demultiplexedBarReadsCount, + PlotOrientation.VERTICAL, + false, + false, + false); + final BarRenderer barRender = new CustomRenderer(ChartColor.createDefaultPaintArray()); + barRender.setSeriesItemLabelGenerator(0, new StandardCategoryItemLabelGenerator()); + barRender.setSeriesItemLabelsVisible(0, true); +// barRender.setBarPainter(new StandardBarPainter()); + + CategoryPlot bar = barChart.getCategoryPlot(); + bar.getDomainAxis().setVisible(false); + bar.setRenderer(barRender); + bar.setForegroundAlpha(0.5f); + + + ChartPanel barChartPanel = new ChartPanel(barChart, + 500, + 300, + 500, + 300, + 500, + 300, + true, + true, // properties + true, // save + true, // print + true, // zoom + true // tooltips + ); + + SwingNode barChartSwingNode = new SwingNode(); + barChartSwingNode.setContent(barChartPanel); + + GridPane.setConstraints(barChartSwingNode, 0,1); + mainGrid.getChildren().add(barChartSwingNode); + + return mainGrid; + } + + /****************************************************************************************** + * ** Here are variables and controls for all plots *************************************** + ******************************************************************************************/ + final TextField txtCompReads= new TextField("0"), + txtTempReads= new TextField("0"), + txt2DReads= new TextField("0"); + final TextField txtPFiles= new TextField("0"), + txtFFiles= new TextField("0"), + txtTFiles= new TextField("0"); + + //private static boolean stillRun = true; + + public static void interupt(JapsaException e){ + //stillRun = false; + reader.wait = false; + FxDialogs.showError("Unexpected errors happened!", 
e.getMessage()); + } + + private void updateData(){ + + final ScheduledExecutorService scheduler + = Executors.newScheduledThreadPool(1); + + scheduler.scheduleAtFixedRate( + new Runnable(){ + int lastIndexLengths = 0;//, lastIndexLengths2D = 0, lastIndexLengthsComp = 0, lastIndexLengthsTemp = 0; + int lastIndexQual2D = 0, lastIndexQualComp = 0, lastIndexQualTemp = 0; + @Override + public void run() { + if(!reader.wait) + scheduler.shutdown(); + + Second period = new Second(); + allReadsCount.add(period, reader.twoDCount,"2D"); + allReadsCount.add(period, reader.tempCount,"template"); + allReadsCount.add(period, reader.compCount,"complement"); + + + Demultiplexer myDmplx = reader.dmplx; + if(myDmplx!=null){ + for(int i=0;i lastIndexLengths){ + int index = histoLengthDataSet.getSeriesIndex("Read Length"); + for (int i = lastIndexLengths; i < currentIndex;i++) + histoLengthDataSet.addSeries(index, reader.lengths.get(i)); + + lastIndexLengths = currentIndex; + + histoLengthDataSet.notifyChanged(); + } + + currentIndex = reader.qual2D.size(); + if (currentIndex > lastIndexQual2D){ + int index = histoQualDataSet.getSeriesIndex("2D"); + for (int i = lastIndexQual2D; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qual2D.get(i)); + + lastIndexQual2D = currentIndex; + histoQualDataSet.notifyChanged(); + } + currentIndex = reader.qualTemp.size(); + if (currentIndex > lastIndexQualTemp){ + int index = histoQualDataSet.getSeriesIndex("template"); + for (int i = lastIndexQualTemp; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qualTemp.get(i)); + + lastIndexQualTemp = currentIndex; + histoQualDataSet.notifyChanged(); + } + currentIndex = reader.qualComp.size(); + if (currentIndex > lastIndexQualComp){ + int index = histoQualDataSet.getSeriesIndex("complement"); + for (int i = lastIndexQualComp; i < currentIndex;i++) + histoQualDataSet.addSeries(index, reader.qualComp.get(i)); + + lastIndexQualComp = currentIndex; + 
histoQualDataSet.notifyChanged(); + } + + } + }, + 1, + 1, + TimeUnit.SECONDS); + } + +} diff --git a/src/main/java/japsa/seq/nanopore/icons/folder.png b/src/main/java/japsa/seq/nanopore/icons/folder.png new file mode 100644 index 0000000..0c1b65a Binary files /dev/null and b/src/main/java/japsa/seq/nanopore/icons/folder.png differ diff --git a/src/main/java/japsa/seq/nanopore/icons/house.png b/src/main/java/japsa/seq/nanopore/icons/house.png new file mode 100644 index 0000000..21046da Binary files /dev/null and b/src/main/java/japsa/seq/nanopore/icons/house.png differ diff --git a/src/main/java/japsa/seq/nanopore/icons/restart.png b/src/main/java/japsa/seq/nanopore/icons/restart.png new file mode 100644 index 0000000..3906bd7 Binary files /dev/null and b/src/main/java/japsa/seq/nanopore/icons/restart.png differ diff --git a/src/main/java/japsa/seq/nanopore/icons/start.png b/src/main/java/japsa/seq/nanopore/icons/start.png new file mode 100644 index 0000000..a9ab3d7 Binary files /dev/null and b/src/main/java/japsa/seq/nanopore/icons/start.png differ diff --git a/src/main/java/japsa/seq/nanopore/icons/stop.png b/src/main/java/japsa/seq/nanopore/icons/stop.png new file mode 100644 index 0000000..558c272 Binary files /dev/null and b/src/main/java/japsa/seq/nanopore/icons/stop.png differ diff --git a/src/main/java/japsa/tools/bio/amra/AssemblyPostProcessingCmd.java b/src/main/java/japsa/tools/bio/amra/AssemblyPostProcessingCmd.java new file mode 100644 index 0000000..7e82ad7 --- /dev/null +++ b/src/main/java/japsa/tools/bio/amra/AssemblyPostProcessingCmd.java @@ -0,0 +1,141 @@ +/***************************************************************************** + * Copyright (c) 2017 Minh Duc Cao (minhduc.cao@gmail.com). + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the institutions nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ****************************************************************************/
+/* Revision History
+ * 12-03-2017 - Minh Duc Cao: Created
+ *
+ ****************************************************************************/
+package japsa.tools.bio.amra;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceOutputStream;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+/**
+ * Post-processes an assembly: orders the contigs from longest to shortest,
+ * renames them after the sample, writes them out in FASTA format and reports
+ * total length, contig count and N50.
+ *
+ * @author minhduc
+ *
+ */
+@Deployable(
+    scriptName = "jsa.amra.assppro",
+    scriptDesc = "Sort and rename the contigs of an assembly and report summary statistics"
+    )
+public class AssemblyPostProcessingCmd extends CommandLine {
+    public AssemblyPostProcessingCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+        ///////////////////////////////////////////////////////////////
+
+        addString("sample", null, "Sample ID", true);
+        addString("input", null, "Name of the input file, - for standard input", true);
+        addString("output", null, "Name of the output file, - for standard output", true);
+        addString("summary", null, "Name of the summary file", true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Command-line entry point: runs {@link #postProcessing} and writes the
+     * returned summary to the file given by the summary option. Exits with
+     * status 1 when the input holds no sequences.
+     *
+     * @param args command-line arguments
+     * @throws IOException if reading the input or writing either output fails
+     */
+    public static void main(String[] args) throws IOException {
+        /*********************** Setting up script ****************************/
+        CommandLine cmdLine = new AssemblyPostProcessingCmd();
+        args = cmdLine.stdParseLine(args);
+        /**********************************************************************/
+
+        String sample = cmdLine.getStringVal("sample");
+        String inputFile = cmdLine.getStringVal("input");
+        String outputFile = cmdLine.getStringVal("output");
+        String summaryFile = cmdLine.getStringVal("summary");
+
+        String summaryStr = postProcessing(sample, inputFile, outputFile);
+        if (summaryStr == null){
+            //Say why we are failing instead of exiting silently
+            System.err.println("No sequences found in " + inputFile);
+            System.exit(1);
+        }
+        Files.write(Paths.get(summaryFile), (summaryStr + "\n").getBytes());
+    }
+
+    /**
+     * Sorts the sequences of the input by decreasing length, renames them to
+     * sampleID_Cnnn (zero-padded to three digits, the old name is moved to the
+     * front of the description) and writes them to the output.
+     *
+     * @param sampleID prefix used for the new contig names
+     * @param input    name of the file to read the contigs from
+     * @param output   name of the file to write the renamed contigs to
+     * @return three lines of summary ("length", "contig", "n50"), or null when
+     *         the input contains no sequences
+     * @throws IOException if reading or writing fails
+     */
+    public static String postProcessing( String sampleID, String input, String output) throws IOException{
+        ArrayList<Sequence> seqList = SequenceReader.readAll(input, Alphabet.DNA());
+        if (seqList.size() ==0)
+            return null;
+
+        //Longest contig first: both the numbering and the N50 scan below
+        //rely on this order, so the two arguments must be seq2 vs seq1
+        Collections.sort(seqList,
+            new Comparator<Sequence>(){
+                public int compare(Sequence seq1, Sequence seq2) {
+                    return Integer.compare(seq2.length(), seq1.length());
+                }
+            });
+
+        //long, so that large assemblies cannot overflow the total
+        long sum = 0;
+        //c. Rename contig and write to file
+        int ind = 0;
+        //how many chars needed
+        int fieldSize = 3;//String.valueOf(seqList.size()).length();
+        SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(output);
+        try {
+            for (Sequence seq:seqList){
+                ind ++;
+                String name = String.valueOf(ind);
+                //pad in 0
+                while (name.length() < fieldSize)
+                    name = "0" + name;
+
+                seq.setDesc(seq.getName() +" " + seq.getDesc());
+                seq.setName(sampleID + "_C" + name);
+                seq.writeFasta(sos);
+                sum += seq.length();
+            }
+        } finally {
+            //release the output even when a write fails
+            sos.close();
+        }
+
+        //second round: compute N50, the length of the contig at which the
+        //running total (longest first) reaches half of the assembly
+        int n50 = 0;
+        long sumHalf = 0;
+        for (Sequence seq:seqList){
+            sumHalf += seq.length();
+            if (sumHalf * 2 >= sum){
+                n50 = seq.length();
+                break;
+            }
+        }
+
+        return "length " + sum + "\ncontig " + seqList.size() + "\nn50 " + n50;
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/japsa/tools/bio/bac/Genomes2ResistanceGeneCmd.java b/src/main/java/japsa/tools/bio/amra/Genomes2ResistanceGeneCmd.java
similarity index 97%
rename from src/main/java/japsa/tools/bio/bac/Genomes2ResistanceGeneCmd.java
rename to src/main/java/japsa/tools/bio/amra/Genomes2ResistanceGeneCmd.java
index 322c32a..701b00f 100644
--- a/src/main/java/japsa/tools/bio/bac/Genomes2ResistanceGeneCmd.java
+++ b/src/main/java/japsa/tools/bio/amra/Genomes2ResistanceGeneCmd.java
@@ -32,7 +32,7 @@
  * 7 Sep 2015 - Minh Duc Cao: Created
  *
****************************************************************************/ -package japsa.tools.bio.bac; +package japsa.tools.bio.amra; import java.io.BufferedReader; import java.io.IOException; @@ -40,7 +40,7 @@ import java.util.ArrayList; import java.util.HashSet; -import japsa.bio.bac.ResistanceGeneDB; +import japsa.bio.amra.ResistanceGeneDB; import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; @@ -53,7 +53,7 @@ * */ @Deployable( - scriptName = "jsa.bac.genome2res", + scriptName = "jsa.amra.genome2res", scriptDesc = "Finding resistance genes/classes in a genome" ) public class Genomes2ResistanceGeneCmd extends CommandLine { @@ -103,7 +103,7 @@ public static void main(String[] args) throws IOException, InterruptedException //System.out.println(s); String geneRes = resDB.getRes(s); if (geneRes != null && geneRes.length() > 0){ - //Logging.info(line + " : " + geneRes); + //LOG.info(line + " : " + geneRes); String [] toks = geneRes.split(","); for (String tok:toks){ sos.print(tok + "\t" + resDB.getClass(s) + "\n"); diff --git a/src/main/java/japsa/tools/bio/bac/MLSTCmd.java b/src/main/java/japsa/tools/bio/amra/MLSTCmd.java similarity index 95% rename from src/main/java/japsa/tools/bio/bac/MLSTCmd.java rename to src/main/java/japsa/tools/bio/amra/MLSTCmd.java index ae5bdc5..db04954 100644 --- a/src/main/java/japsa/tools/bio/bac/MLSTCmd.java +++ b/src/main/java/japsa/tools/bio/amra/MLSTCmd.java @@ -31,7 +31,7 @@ * 28/05/2014 - Minh Duc Cao: Created ****************************************************************************/ -package japsa.tools.bio.bac; +package japsa.tools.bio.amra; import java.io.File; @@ -53,8 +53,8 @@ import com.google.common.io.Files; import com.google.common.io.Resources; -import japsa.bio.bac.MLSTyping; -import japsa.bio.bac.MLSTyping.MLSType; +import japsa.bio.amra.MLSTyping; +import japsa.bio.amra.MLSTyping.MLSType; import japsa.seq.Alphabet; import japsa.seq.FastaReader; import japsa.seq.Sequence; 
@@ -62,15 +62,19 @@ import japsa.util.CommandLine; import japsa.util.deploy.Deployable; +//import org.slf4j.Logger; +//import org.slf4j.LoggerFactory; + /** * @author minhduc * */ @Deployable( - scriptName = "jsa.bac.mlst", + scriptName = "jsa.amra.mlst", scriptDesc = "Multi-locus strain typing" ) public class MLSTCmd extends CommandLine{ + //private static final Logger LOG = LoggerFactory.getLogger(MLSTCmd.class); //CommandLine cmdLine; public MLSTCmd(){ super(); @@ -80,7 +84,7 @@ public MLSTCmd(){ addString("build", null, "Build the databases to this directory only"); addString("input", null, "Name of the genome file"); - addString("mlstScheme", null, "Folder contianing the allele files"); + addString("mlstScheme", null, "Folder containing the MLST scheme"); addInt("top", 0, "If > 0, will provide top closest profile"); addStdHelp(); diff --git a/src/main/java/japsa/tools/bio/amra/PlasmidFinderCmd.java b/src/main/java/japsa/tools/bio/amra/PlasmidFinderCmd.java new file mode 100644 index 0000000..4524351 --- /dev/null +++ b/src/main/java/japsa/tools/bio/amra/PlasmidFinderCmd.java @@ -0,0 +1,151 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. 
Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*
+ ****************************************************************************/
+
+/* Revision History
+ * 18/03/2017 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsa.tools.bio.amra;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Calendar;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
+
+/**
+ * Identify plasmids from an assembly.
+ *
+ * With the build option the plasmid database is downloaded (curl) and unpacked
+ * (unzip) into the database folder; otherwise the input is aligned with blastn
+ * against ORI.fasta in that folder and the hits that cover more than 90% of a
+ * database entry at more than 90% identity are printed.
+ *
+ * @author minhduc
+ *
+ */
+@Deployable(
+    scriptName = "jsa.amra.plasmidfinder",
+    scriptDesc = "Identify plasmids from a genome assembly"
+)
+public class PlasmidFinderCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(PlasmidFinderCmd.class);
+
+    //CommandLine cmdLine;
+    public PlasmidFinderCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addBoolean("build", false, "To build the latest from plasmidFinder");
+        addString("input", null, "Name of the genome file");
+        addString("plasmiddb", null, "Folder of the plasmid database",true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Command-line entry point.
+     *
+     * @param args command-line arguments
+     * @throws IOException if blastn cannot be started or its output cannot be read
+     * @throws InterruptedException if interrupted while waiting for a child process
+     */
+    public static void main(String [] args) throws IOException, InterruptedException{
+        PlasmidFinderCmd cmdLine = new PlasmidFinderCmd ();
+        args = cmdLine.stdParseLine(args);
+
+        boolean build = cmdLine.getBooleanVal("build");
+        String input = cmdLine.getStringVal("input");
+        String plasmidFolder = cmdLine.getStringVal("plasmiddb");
+
+        if (build){
+            try {
+                File buildFolder = new File(plasmidFolder);
+                if (!buildFolder.isDirectory()) {
+                    if (!buildFolder.mkdirs()) {
+                        LOG.error("Cannot create folder " + plasmidFolder);
+                        System.exit(1);
+                    }
+                }
+                ProcessBuilder setup = new ProcessBuilder("curl", "-o", plasmidFolder + File.separator + "data.zip",
+                    "--data", "folder=plasmidfinder&filename=plasmidfinder.zip","https://cge.cbs.dtu.dk/cge/download_data.php");
+                Process process = setup.inheritIO().start();
+                int status = process.waitFor();
+
+                if (status != 0) {
+                    LOG.error("Problem downloading the current database from plasmidfinder server");
+                    System.exit(1);
+                }
+
+                setup = new ProcessBuilder("unzip", "-o","-d", plasmidFolder, plasmidFolder + File.separator + "data.zip");
+                process = setup.inheritIO().start();
+                status = process.waitFor();
+                if (status != 0) {
+                    LOG.error("Problem unzip file " + plasmidFolder + File.separator + "data.zip");
+                    System.exit(1);
+                }
+                //Keep a time-stamped copy of every download; ORI.fasta is the one in use
+                long timestamp = Calendar.getInstance().getTimeInMillis();
+                Path actualFile = Paths.get(plasmidFolder + File.separator + "ORI" + timestamp + ".fasta");
+                actualFile = Files.move(Paths.get(plasmidFolder + File.separator + "plasmid_database.fsa"), actualFile, REPLACE_EXISTING);
+                Files.copy(actualFile, Paths.get(plasmidFolder + File.separator + "ORI.fasta"),REPLACE_EXISTING);
+            }catch (IOException e){
+                LOG.error("Problem building the plasmid database in " + plasmidFolder, e);
+                System.exit(1);
+            }
+            System.exit(0);
+        }
+
+        if (input == null){
+            System.err.println("Please specify an input file");
+            System.exit(1);
+        }
+
+        ProcessBuilder pb = new ProcessBuilder("blastn", "-subject", plasmidFolder + File.separator + "ORI.fasta",
+            "-query", input, "-outfmt", "6 qseqid qlen qstart qend sseqid slen sstart send length frames pident nident gaps mismatch score bitscore");
+        //qseqid qlen qstart qend sseqid slen sstart send length frames pident nident gaps mismatch score bitscore
+        //Only stdout is read below: pass stderr through so that blastn's own
+        //messages are visible and an unread pipe cannot stall the child
+        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
+        Process process = pb.start();
+        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
+            String line;
+            while ((line = br.readLine()) != null) {
+                String [] toks = line.trim().split("\t");
+                //need the columns up to pident (index 10)
+                if (toks.length < 11)
+                    continue;
+
+                double oLength = Double.parseDouble(toks[5]);
+                double oCov = Math.abs(Double.parseDouble(toks[7])- Double.parseDouble(toks[6])) + 1;
+
+                double ratio = oCov / oLength;
+                //pident is a percentage (0-100): scale it to a fraction so that
+                //it is comparable with the 0.9 threshold
+                double identity = Double.parseDouble(toks[10]) / 100;
+
+                if (ratio > 0.9 && identity > 0.9){
+                    System.out.println(toks[0] + "\t" + toks[4] + "\t" + ratio + "\t" + identity);
+                }
+
+            }
+        }
+        int status = process.waitFor();
+        if (status != 0){
+            LOG.error("blastn exited with status " + status);
+            System.exit(1);
+        }
+    }
+}
diff --git a/src/main/java/japsa/tools/bio/amra/ResistanceGeneCardCmd.java b/src/main/java/japsa/tools/bio/amra/ResistanceGeneCardCmd.java
new file mode 100644
index 0000000..f8f26ec
--- /dev/null
+++ b/src/main/java/japsa/tools/bio/amra/ResistanceGeneCardCmd.java
@@ -0,0 +1,152 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. *
+ * *
+ * Redistribution and use in source and binary forms, with or without *
+ * modification, are permitted provided that the following conditions *
+ * are met: *
+ * *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ * this list of conditions and the following disclaimer. *
+ * 2. Redistributions in binary form must reproduce the above copyright *
+ * notice, this list of conditions and the following disclaimer in the *
+ * documentation and/or other materials provided with the distribution. *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ * may be used to endorse or promote products derived from this software *
+ * without specific prior written permission. *
+ * *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
+ ****************************************************************************/
+
+/*****************************************************************************
+ * Revision History
+ * 7 Sep 2015 - Minh Duc Cao: Created
+ *
+ ****************************************************************************/
+package japsa.tools.bio.amra;
+
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+
+import javax.json.Json;
+import javax.json.JsonObject;
+import javax.json.JsonReader;
+import javax.json.JsonValue;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.util.Collection;
+import java.util.Set;
+
+
+/**
+ * Reads card.json from the working directory and prints, in FASTA format, the
+ * first protein sequence of every entry, with a header built from the ARO
+ * accession, name and description of the entry.
+ *
+ * @author minhduc
+ */
+@Deployable(
+    scriptName = "jsa.amra.rescard",
+    scriptDesc = "Finding resistance genes/classes in a genome using card database"
+)
+public class ResistanceGeneCardCmd extends CommandLine {
+
+    public ResistanceGeneCardCmd() {
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        //addString("input", null, "Name of the genome file",true);
+        //addString("output", null, "Name of the output file",true);
+        //addString("resDB", null, "Name of the resistance gene database",true);
+
+        //addDouble("identity", 0.85, "Minimum identity");
+        //addDouble("coverage", 0.85, "Minimum coverage of gene");
+
+        addStdHelp();
+    }
+
+    /**
+     * Builds the FASTA header of an entry.
+     *
+     * @param entry one top-level object of card.json
+     * @return "ARO" + accession + " ~~~" + name + "~~~" + description, or null
+     *         when any of the three fields is absent
+     */
+    private static String describe(JsonObject entry) {
+        //The two-argument getString returns the default for a missing key;
+        //the one-argument form throws and never returns null
+        String accession = entry.getString("ARO_accession", null);
+        String name = entry.getString("ARO_name", null);
+        String description = entry.getString("ARO_description", null);
+        if (accession == null || name == null || description == null)
+            return null;
+
+        return "ARO" + accession + " ~~~" + name + "~~~" + description;
+    }
+
+    /**
+     * @param args command-line arguments
+     * @throws FileNotFoundException if card.json is not in the working directory
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        CommandLine cmdLine = new ResistanceGeneCardCmd();
+        args = cmdLine.stdParseLine(args);
+
+        /**********************************************************************/
+        //String input = cmdLine.getStringVal("input");
+        //String output = cmdLine.getStringVal("output");
+        //String resDBPath = cmdLine.getStringVal("resDB");
+
+        //double identity = cmdLine.getDoubleVal("identity");
+        //double coverage = cmdLine.getDoubleVal("coverage");
+
+        //script
+        //curl -o card.tar.bz2 https://card.mcmaster.ca/download/0/broadstreet-v1.1.7.tar.gz
+        //tar jxvf card.tar.bz2
+
+        JsonObject sampleObject;
+        //try-with-resources: the reader is closed even when parsing fails
+        try (JsonReader reader = Json.createReader(new FileReader("card.json"))) {
+            sampleObject = reader.readObject();
+        }
+
+        Set<String> keys = sampleObject.keySet();
+
+        for (String key : keys) {
+            JsonValue jsonValue = sampleObject.get(key);
+
+            //only object-valued entries are processed
+            if (!(jsonValue instanceof JsonObject))
+                continue;
+
+            try {
+                JsonObject jSeq = (JsonObject) jsonValue;
+
+                String desc = describe(jSeq);
+                if (desc == null)
+                    continue;
+
+                //TODO: check this
+
+                jSeq = jSeq.getJsonObject("model_sequences");
+                if (jSeq != null)
+                    jSeq = jSeq.getJsonObject("sequence");
+                if (jSeq == null)
+                    continue;
+
+                Collection<JsonValue> values = jSeq.values();
+
+                for (JsonValue value : values) {
+                    jSeq = (JsonObject) value;
+                    String proteinSeq = jSeq.getJsonObject("protein_sequence").getString("sequence");
+                    System.out.println(">" + desc + "\n" + proteinSeq);
+
+                    //only the first sequence of an entry is printed
+                    break;//for
+                }
+            } catch (Exception e) {
+                //best effort: report the entry that does not have the expected shape and carry on
+                System.err.println(e.getMessage() + '\n' + key);
+            }
+
+        }
+
+    }
+
+}
diff --git a/src/main/java/japsa/tools/bio/hts/AddReadSequence2SamCmd.java b/src/main/java/japsa/tools/bio/hts/AddReadSequence2SamCmd.java
new file mode 100644
index 0000000..c9a8e0d
--- /dev/null
+++ b/src/main/java/japsa/tools/bio/hts/AddReadSequence2SamCmd.java
@@ -0,0 +1,162 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.         *
+ *                                                                           *
+ * Redistribution and use in source and binary forms, with or without        *
+ * modification, are permitted provided that the following conditions        *
+ * are met:                                                                  *
+ *                                                                           *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ *    this list of conditions and the following disclaimer.                  *
+ * 2. Redistributions in binary form must reproduce the above copyright      *
+ *    notice, this list of conditions and the following disclaimer in the    *
+ *    documentation and/or other materials provided with the distribution.   *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ *    may be used to endorse or promote products derived from this software  *
+ *    without specific prior written permission.                             *
+ *                                                                           *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS   *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR    *
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR          *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+ ****************************************************************************/
+
+/**************************     REVISION HISTORY    **************************
+ * 10/01/2017 - Minh Duc Cao: Created
+ ****************************************************************************/
+package japsa.tools.bio.hts;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMFileHeader.SortOrder;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SAMTextWriter;
+import htsjdk.samtools.SamInputResource;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.ValidationStringency;
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Tools such as bwa do not store the read sequence in secondary alignments.
+ * This tool restores it from the first alignment of the same read, which is
+ * useful when a region is selected later on.
+ *
+ * @author Minh Duc Cao (http://www.caominhduc.org/)
+ */
+@Deployable(
+    scriptName = "jsa.hts.fixsam",
+    scriptDesc = "Add read sequences to secondary alignment, applied only for"
+        + "\nsam files by bwa without sorting."
+        + "\nNote it does not support paired-end at this version")
+public class AddReadSequence2SamCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(AddReadSequence2SamCmd.class);
+
+    /**
+     * Registers the command line options of the tool.
+     */
+    public AddReadSequence2SamCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("input", null, "Name of the input file, - for standard input", true);
+        //main() always writes through a SAMTextWriter, so the output is text sam
+        addString("output", null, "Name of the output sam file, - for standard output", true);
+
+        addStdHelp();
+    }
+
+    /**
+     * Copies the alignments from the input to the output. The first record
+     * seen for a read name supplies the read sequence; later records of the
+     * same name whose sequence is shorter than 2 characters receive that
+     * sequence when they have the same strand flag, or the result of
+     * Alphabet.DNA16.complement() on it otherwise.
+     *
+     * @param args command line arguments, see the constructor for the options
+     * @throws IOException propagated from opening, reading or closing the input
+     */
+    public static void main(String[] args) throws IOException{
+        CommandLine cmdLine = new AddReadSequence2SamCmd();
+        args = cmdLine.stdParseLine(args);
+
+        String output = cmdLine.getStringVal("output");
+        String inFile = cmdLine.getStringVal("input");
+
+        SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT);
+        SamReader samReader = "-".equals(inFile)
+            ? SamReaderFactory.makeDefault().open(SamInputResource.of(System.in))
+            : SamReaderFactory.makeDefault().open(new File(inFile));
+
+        SAMTextWriter samWriter = null;
+        //try/finally so that the reader and the writer are released on failure too
+        try{
+            SAMFileHeader samHeader = samReader.getFileHeader();
+
+            samWriter = "-".equals(output)
+                ? new SAMTextWriter(System.out)
+                : new SAMTextWriter(new File(output));
+
+            samWriter.setSortOrder(SortOrder.unsorted, false);
+            samWriter.writeHeader(samHeader.getTextHeader());
+
+            String readID = "";
+            String readSequence = null;
+            String revSequence = null;
+            boolean firstFlag = true;
+
+            SAMRecordIterator samIter = samReader.iterator();
+            while (samIter.hasNext()){
+                SAMRecord sam = samIter.next();
+                if (!readID.equals(sam.getReadName())){
+                    //first record of a new read: remember its sequence and strand
+                    readSequence = sam.getReadString();
+                    if (readSequence.length() < 2){
+                        //nothing to copy from; the record is not written out
+                        LOG.error("Some thing wrong " + sam.getReadName());
+                        continue;
+                    }
+                    readID = sam.getReadName();
+                    revSequence = null;
+                    firstFlag = sam.getReadNegativeStrandFlag();
+                }else if (sam.getReadString().length() < 2){
+                    if (sam.getReadNegativeStrandFlag() == firstFlag)
+                        sam.setReadString(readSequence);
+                    else{
+                        if (revSequence == null){
+                            //computed lazily, at most once per read
+                            //NOTE(review): the sequence is built with DNA6 but complemented with DNA16 - confirm
+                            Sequence seq = new Sequence(Alphabet.DNA6(), readSequence, "somename");
+                            revSequence = Alphabet.DNA16.complement(seq).toString();
+                        }
+                        sam.setReadString(revSequence);
+                    }
+                }
+                samWriter.writeAlignment(sam);
+            }//while
+        }finally{
+            try{
+                samReader.close();
+            }finally{
+                if (samWriter != null)
+                    samWriter.close();
+            }
+        }
+    }
+}
diff --git
a/src/main/java/japsa/tools/bio/hts/AlignmentParamOptCmd.java b/src/main/java/japsa/tools/bio/hts/AlignmentParamOptCmd.java index 06093db..78c9260 100644 --- a/src/main/java/japsa/tools/bio/hts/AlignmentParamOptCmd.java +++ b/src/main/java/japsa/tools/bio/hts/AlignmentParamOptCmd.java @@ -52,8 +52,9 @@ import japsa.util.CommandLine; import japsa.util.HTSUtilities; import japsa.util.JapsaMath; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @author minhduc @@ -62,6 +63,9 @@ @Deployable(scriptName = "jsa.hts.alignOpt", scriptDesc = "Parameter estimation for alignment of erronenous read data") public class AlignmentParamOptCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(AlignmentParamOptCmd.class); + + //static Alphabet alphabet = Alphabet.DNA(); public AlignmentParamOptCmd(){ @@ -183,7 +187,7 @@ public static void main(String [] args) throws IOException, InterruptedException ps.println(bwaCommand); ps.close(); - Logging.info("Running " + bwaCommand); + LOG.info("Running " + bwaCommand); Process process = Runtime.getRuntime().exec("bash ./" + scriptFile); process.waitFor(); @@ -213,7 +217,7 @@ public static void main(String [] args) throws IOException, InterruptedException ps.println(bwaCommand); ps.close(); - Logging.info("Running " + bwaCommand); + LOG.info("Running " + bwaCommand); Process process = Runtime.getRuntime().exec("bash ./" + scriptFile); process.waitFor(); @@ -224,9 +228,7 @@ public static void main(String [] args) throws IOException, InterruptedException /** * Error analysis of a bam file. 
Assume it has been sorted - * @param bamFile - * @param pad - * @throws IOException + */ static double[] paramEst(String bamFile, String refFile, int qual) throws IOException{ @@ -273,7 +275,7 @@ public static void main(String [] args) throws IOException, InterruptedException //make the read seq Sequence readSeq = new Sequence(Alphabet.DNA(), sam.getReadString(), sam.getReadName()); if (readSeq.length() <= 1){ - Logging.warn(sam.getReadName() +" ignored"); + LOG.warn(sam.getReadName() +" ignored"); continue; }
diff --git a/src/main/java/japsa/tools/bio/hts/AlternativeAllelesCmd.java b/src/main/java/japsa/tools/bio/hts/AlternativeAllelesCmd.java
new file mode 100644
index 0000000..27a9bbb
--- /dev/null
+++ b/src/main/java/japsa/tools/bio/hts/AlternativeAllelesCmd.java
@@ -0,0 +1,463 @@
+/*****************************************************************************
+ * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved.         *
+ *                                                                           *
+ * Redistribution and use in source and binary forms, with or without        *
+ * modification, are permitted provided that the following conditions        *
+ * are met:                                                                  *
+ *                                                                           *
+ * 1. Redistributions of source code must retain the above copyright notice, *
+ *    this list of conditions and the following disclaimer.                  *
+ * 2. Redistributions in binary form must reproduce the above copyright      *
+ *    notice, this list of conditions and the following disclaimer in the    *
+ *    documentation and/or other materials provided with the distribution.   *
+ * 3. Neither the names of the institutions nor the names of the contributors*
+ *    may be used to endorse or promote products derived from this software  *
+ *    without specific prior written permission.                             *
+ *                                                                           *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS   *
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, *
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR    *
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR          *
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+ ****************************************************************************/
+
+/*                           Revision History
+ * 28/05/2014 - Minh Duc Cao: Created
+ ****************************************************************************/
+
+package japsa.tools.bio.hts;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedList;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMFileHeader.SortOrder;
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SAMTextWriter;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.ValidationStringency;
+import japsa.seq.Alphabet;
+import japsa.seq.Sequence;
+import japsa.seq.SequenceReader;
+import japsa.util.CommandLine;
+import japsa.util.deploy.Deployable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Finds the reads of one chromosome that show the listed base of at least one
+ * variant at a mismatching position, and writes every alignment of those reads
+ * in sam format.
+ *
+ * @author minhduc
+ */
+@Deployable(
+    scriptName = "jsa.hts.aareads",
+    scriptDesc = "Filter reads supporting alternative alleles"
+    )
+public class AlternativeAllelesCmd extends CommandLine{
+    private static final Logger LOG = LoggerFactory.getLogger(AlternativeAllelesCmd.class);
+
+    //CommandLine cmdLine;
+    /**
+     * Registers the command line options of the tool.
+     */
+    public AlternativeAllelesCmd(){
+        super();
+        Deployable annotation = getClass().getAnnotation(Deployable.class);
+        setUsage(annotation.scriptName() + " [options]");
+        setDesc(annotation.scriptDesc());
+
+        addString("input", null, "Name of the input bam file",true);
+        addString("reference", null, "Reference file",true);
+        addString("vcf", null, "Name of the vcf file",true);
+        addString("output", "-", "Name of the output file");
+        addInt("threshold", 500, "Maximum concordant insert size");
+
+        addStdHelp();
+    }
+
+    /**
+     * Parses the command line and hands the option values to addSequence().
+     *
+     * @param args command line arguments
+     * @throws IOException propagated from addSequence()
+     */
+    public static void main(String [] args) throws IOException{
+        AlternativeAllelesCmd cmdTool = new AlternativeAllelesCmd ();
+        args = cmdTool.stdParseLine(args);
+
+        /**********************************************************************/
+        String input = cmdTool.getStringVal("input");
+        String ref = cmdTool.getStringVal("reference");
+        String output = cmdTool.getStringVal("output");
+        String vcf = cmdTool.getStringVal("vcf");
+        int threshold = cmdTool.getIntVal("threshold");
+
+        addSequence(input, vcf, ref, output,threshold);
+    }
+
+    /**
+     * A variant: chromosome name, 0-based position and base code.
+     */
+    public static class VarRecord{
+        String chrom;
+        int pos;//0-based, see parseLine()
+        int base; //Alphabet.DNA.A, Alphabet.DNA.C, Alphabet.DNA.G, Alphabet.DNA.T;
+
+        VarRecord(String s, int p, int b){
+            chrom = s;
+            pos = p;
+            base = b;
+        }
+
+        /**
+         * Parses a tab separated line whose first field is the chromosome, second
+         * field the 1-based position and fourth field a single base (A, C, G or T,
+         * in either case).
+         *
+         * @param line the line to parse
+         * @return the record, with the position converted to 0-based
+         * @throws RuntimeException if the line has fewer than four fields or the
+         *         fourth field is not exactly one of the accepted bases
+         */
+        static VarRecord parseLine(String line){
+            String [] toks = line.split("\t");
+            if (toks.length < 4)
+                throw new RuntimeException("Line " + line + " unexpected!!");
+
+            //an empty field is rejected here too, charAt(0) below would fail on it
+            if (toks[3].length() != 1)
+                throw new RuntimeException("Field " + toks[3] + " unexpected in line " + line + "!!");
+
+            int b = -1;
+            switch (toks[3].charAt(0)){
+            case 'A':
+            case 'a':
+                b = Alphabet.DNA.A;
+                break;
+
+            case 'C':
+            case 'c':
+                b = Alphabet.DNA.C;
+                break;
+
+            case 'G':
+            case 'g':
+                b = Alphabet.DNA.G;
+                break;
+
+            case 'T':
+            case 't':
+                b = Alphabet.DNA.T;
+                break;
+
+            default:
+                throw new RuntimeException("Field " + toks[3] + " unexpected in line " + line + "!!");
+            }
+
+            int p = Integer.parseInt(toks[1]) - 1;
+            return new VarRecord(toks[0], p, b);
+        }
+    }
+
+    /**
+     * Reads the next variant, skipping empty lines and lines starting with '#'.
+     *
+     * @param br reader positioned in the variant file
+     * @return the next record, or null at the end of the input
+     * @throws IOException if reading fails
+     */
+    static VarRecord nextRecord(BufferedReader br) throws IOException{
+        String line;
+        while ((line = br.readLine()) != null){
+            if (line.isEmpty() || line.charAt(0) == '#')
+                continue;
+            return VarRecord.parseLine(line);
+        }
+        return null;
+    }
+
+    /**
+     * Works on the chromosome of the first variant in vcfFile. In a first pass
+     * over the alignments on that chromosome, a read is selected when one of its
+     * bases in an M or X cigar element differs from the reference and equals the
+     * base of a variant at that position. In a second pass every alignment whose
+     * read name was selected is written to outFile in sam format.
+     *
+     * @param inFile    bam file, queried by reference name
+     * @param vcfFile   variant file; its first line is skipped, the remaining
+     *                  lines are read with nextRecord()
+     * @param reference sequence file containing the chromosome of the variants
+     * @param outFile   output sam file, or - for standard output
+     * @param threshold only referenced by the commented-out statistics code
+     * @throws IOException if one of the files cannot be read
+     * @throws RuntimeException if there is no variant, or the chromosome is
+     *         missing from the reference or from the bam file
+     */
+    static void addSequence(String inFile, String vcfFile, String reference, String outFile, int threshold) throws IOException{
+        //double sumIZ = 0, sumSq = 0;
+        //int countGood = 0, countBad = 0, countUgly = 0;
+        //int countALLGood = 0, countALLBad = 0, countALLUgly = 0;
+        //double sumALLIZ = 0, sumALLSq = 0;
+        //Good: 0 < insert size <= SIZE_THRESHOLD
+        //Bad: insertSize >SIZE_THRESHOLD
+        //Ugly: insertSize=0
+
+        HashSet<String> somaticSet = new HashSet<String>();
+        LinkedList<VarRecord> varList = new LinkedList<VarRecord>();
+
+        BufferedReader bf = SequenceReader.openFile(vcfFile);
+        SamReader samReader = null;
+        SAMTextWriter samWriter = null;
+        //try/finally so that the three resources are released on every path
+        try{
+            bf.readLine();//dont care the first line
+
+            VarRecord fVar = nextRecord(bf);
+            if (fVar == null)
+                throw new RuntimeException("No variant found in " + vcfFile + "!!");
+            varList.add(fVar);
+
+            String myChrom = fVar.chrom;
+            Sequence refSeq = null;
+            {
+                LOG.info("Read reference started");
+                ArrayList<Sequence> seqs = SequenceReader.readAll(reference, Alphabet.DNA());
+                for (Sequence seq:seqs){
+                    if (seq.getName().equals(myChrom)){
+                        refSeq = seq;
+                        break;
+                    }
+                }
+                LOG.info("Read reference done");
+            }
+
+            if(refSeq == null)
+                throw new RuntimeException("Chrom " + myChrom + " not found in the reference!!");
+
+            boolean hasVar = true;
+
+            ///////////////////////////////////////////////////////////
+            SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT);
+            samReader = SamReaderFactory.makeDefault().open(new File(inFile));
+
+            SAMFileHeader samHeader = samReader.getFileHeader();
+
+            //checked before the writer is created, so nothing is written on failure
+            int myChromIndex = samHeader.getSequenceIndex(myChrom);
+            if(myChromIndex < 0)
+                throw new RuntimeException("Chrom " + myChrom + " not found in bam file!!");
+
+            samWriter = outFile.equals("-")?
+                    (new SAMTextWriter(System.out)) :(new SAMTextWriter(new File(outFile)));
+
+            samWriter.setSortOrder(SortOrder.unsorted, false);
+            samWriter.writeHeader( samHeader.getTextHeader());
+            ///////////////////////////////////////////////////////////
+
+            SAMRecordIterator samIter = samReader.query(refSeq.getName(),0,0,false);
+            LOG.info(" " + samIter.hasNext());
+
+            LOG.info("Chrom = " + myChrom + " RefName = " + refSeq.getName() + " chromIndex = " + myChromIndex);
+
+            while (samIter.hasNext()){
+                SAMRecord sam = samIter.next();
+                if (sam.getReadUnmappedFlag())
+                    continue;
+
+                int samRefIndex = sam.getReferenceIndex();
+                if (samRefIndex < myChromIndex)
+                    continue;
+
+                if (samRefIndex > myChromIndex)
+                    break;//while
+
+                //assert samRefIndex == myChromIndex
+
+                //int insertSize = Math.abs(sam.getInferredInsertSize());
+
+                //if (insertSize == 0){
+                //    countALLUgly ++;
+                //}else if (insertSize <= threshold){
+                //    countALLGood ++;
+                //    sumALLIZ += insertSize;
+                //    sumALLSq += insertSize * insertSize;
+                //}else{
+                //    countALLBad ++;
+                //}
+
+                String readName = sam.getReadName();
+                if (somaticSet.contains(readName))
+                    continue;
+
+                Sequence readSeq = new Sequence(Alphabet.DNA(), sam.getReadString(), sam.getReadName());
+                if (readSeq.length() <= 1){
+                    //no read bases stored in this record, nothing to compare
+                    LOG.warn(sam.getReadName() +" ignored");
+                    continue;
+                }
+                boolean support = false;
+
+                int readPos = 0;//start from 0
+                int alignStart = sam.getAlignmentStart() - 1;//convert to 0-based index
+                int refPos = alignStart;
+                for (final CigarElement e : sam.getCigar().getCigarElements()) {
+                    final int length = e.getLength();
+                    switch (e.getOperator()) {
+                    case H :
+                        break; // ignore hard clips
+                    case P :
+                        break; // ignore pads
+                    case S :
+                        readPos += length;
+                        break; // soft clip read bases
+                    case N :
+                        refPos += length;
+                        break; // reference skip
+
+                    case D ://deletion
+                        refPos += length;
+                        break;
+
+                    case I :
+                        readPos += length;
+                        break;
+                    case M :
+                    case X ://every base of an X element is a mismatch, the M logic covers it
+                        for (int i = 0; i < length && !support; i++){
+                            int readBase = readSeq.getBase(readPos + i);
+                            int mismatchPos = refPos + i;
+                            if (refSeq.getBase(mismatchPos) == readBase)
+                                continue;
+
+                            //1. drop the variants located before this alignment;
+                            //both var.pos and alignStart are 0-based
+                            while (!varList.isEmpty() && varList.getFirst().pos < alignStart)
+                                varList.removeFirst();
+
+                            //2. go through the list
+                            int currentVarPos = -1;
+                            for (VarRecord var:varList){
+                                if (var.pos == mismatchPos && var.base == readBase){
+                                    //yay
+                                    support = true;
+                                    break;//for
+                                }
+
+                                currentVarPos = var.pos;
+                                if (currentVarPos > mismatchPos)
+                                    break;//for
+                            }
+
+                            //3. read further variants until the mismatch position is passed
+                            while (!support && hasVar && currentVarPos < mismatchPos){
+                                VarRecord var = nextRecord(bf);
+                                //positions of another chromosome must not be compared
+                                if (var == null || !myChrom.equals(var.chrom)){
+                                    hasVar = false;
+                                    break;
+                                }
+                                varList.add(var);
+
+                                if (var.pos == mismatchPos && var.base == readBase){
+                                    //yay
+                                    support = true;
+                                    break;
+                                }
+
+                                currentVarPos = var.pos;
+                            }
+                        }//for
+
+                        readPos += length;
+                        refPos += length;
+                        break;
+
+                    case EQ :
+                        readPos += length;
+                        refPos += length;
+                        break;
+                    default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
+                    }//case
+                    if (support)
+                        break;
+                }//for
+
+                if (support){
+                    //samWriter.writeAlignment(sam);
+                    //if (insertSize == 0){
+                    //    countUgly ++;
+                    //}else if (insertSize <= threshold){
+                    //    countGood ++;
+                    //    sumIZ += insertSize;
+                    //    sumSq += insertSize * insertSize;
+                    //}else{
+                    //    countBad ++;
+                    //}
+                    somaticSet.add(readName);
+                }
+
+            }//while
+
+            samIter.close();
+
+            //second pass: write every alignment of the selected reads
+            samIter = samReader.query(refSeq.getName(),0,0,false);
+            while (samIter.hasNext()){
+                SAMRecord sam = samIter.next();
+                String readName = sam.getReadName();
+                if (somaticSet.contains(readName)){
+                    samWriter.writeAlignment(sam);
+                }
+            }
+            samIter.close();
+        }finally{
+            try{
+                if (samWriter != null)
+                    samWriter.close();
+                if (samReader != null)
+                    samReader.close();
+            }finally{
+                bf.close();
+            }
+        }
+        /**********************************************************************
+
+        System.out.println("================ ALL DATA===================");
+        System.out.printf("Good insert fragments (0<insert<=%d): %d\n", threshold, countALLGood);
+        if (countALLGood > 0){
+            double mean = sumALLIZ / countALLGood;
+            double stdev = Math.sqrt(sumALLSq/countALLGood - mean * mean );
+            System.out.printf(" mean = %f, std=%f\n",mean, stdev);
+        }
+        System.out.printf("Bad insert fragments (insert>%d): %d\n", threshold,countALLBad);
+        System.out.printf("Ungly insert fragments (insert=0): %d\n", countALLUgly);
+
+
+        System.out.println("================ SELECTED DATA===================");
+        System.out.printf("Good insert fragments (0<insert<=%d): %d\n", threshold, countGood);
+        if (countGood > 0){
+            double mean = sumIZ / countGood;
+            double stdev = Math.sqrt(sumSq/countGood - mean * mean );
+            System.out.printf(" mean = %f, std=%f\n",mean, stdev);
+        }
+        System.out.printf("Bad insert fragments (insert>%d): %d\n", threshold,countBad);
+        System.out.printf("Ungly insert fragments (insert=0): %d\n", countUgly);
+
+        /**********************************************************************/
+    }
+}
diff --git a/src/main/java/japsa/tools/bio/hts/CountReadInRegionCmd.java b/src/main/java/japsa/tools/bio/hts/CountReadInRegionCmd.java index 1b8c473..839c0fa 100644 --- a/src/main/java/japsa/tools/bio/hts/CountReadInRegionCmd.java +++ b/src/main/java/japsa/tools/bio/hts/CountReadInRegionCmd.java @@ -34,53 +34,81 @@ ****************************************************************************/ package japsa.tools.bio.hts; +import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordIterator; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; -import japsa.bio.tr.TandemRepeat; import japsa.seq.JapsaFeature; import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import
org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.List; /** - *A program to count reads overlapping or containted with regions from a bam/sam file - * - *FIXME: Generalise to any kinds of regions, not just STR + *A program to count reads overlapping or containted with regions from a bam/sam file + * */ @Deployable( - scriptName = "jsa.hts.countReads", - scriptDesc = "Count the number of reads in some regions from a sorted, indexed bam file" - ) -public class CountReadInRegionCmd extends CommandLine{ + scriptName = "jsa.hts.countReads", + scriptDesc = "Count the number of reads in some regions from a sorted, indexed bam file" + ) +public class CountReadInRegionCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(CountReadInRegionCmd.class); + public CountReadInRegionCmd(){ super(); - Deployable annotation = getClass().getAnnotation(Deployable.class); - setUsage(annotation.scriptName() + " [options] ..."); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); setDesc(annotation.scriptDesc()); - addString("xafFile", null, "Name of the regions file in xaf"); - addString("bedFile", null, "Name of the regions file in bed\n"+ - "Either xafFile or bedFile has to be specified"); - addString("output", "-", "Name of output file, - for from standard out."); - addInt("flanking", 0, "Size of the flanking regions"); - addInt("qual", 0, "Minimum quality"); + CommandLine.Option bamFileOpt = + addString("bamFile", null, "Name of the bam file",true); + bamFileOpt.setGalaxySetting(new GalaxySetting("data", "bam",false)); + + //addExtraGalaxyCmd("ln -s $bamFile bamFile.bam && ln -s $bamFile.metadata.bam_index bamFile.bai &&"); + + CommandLine.Option bedFileOpt = + addString("bedFile", null, "Name of the regions file in bed format",true); + bedFileOpt.setGalaxySetting(new GalaxySetting("data", 
"bed",false)); + + + CommandLine.Option outputOpt = + addString("output", "-", "Name of output file, - for from standard out."); + + GalaxySetting galaxyOutput = new GalaxySetting("data", "tabular",true); + galaxyOutput.setLabel("countRead.txt"); + outputOpt.setGalaxySetting(galaxyOutput); + + addInt("flanking", 0, "Size of the flanking regions, effectively expand the region by flanking") + .setGalaxySetting(new GalaxySetting("integer", null,false)); + + addInt("qual", 0, "Minimum quality") + .setGalaxySetting(new GalaxySetting("integer", null,false)); + addInt("filterBits", 0, "Filter reads based on flag. Common values:\n 0 no filter\n 256 exclude secondary alignment \n 1024 exclude PCR/optical duplicates\n 2048 exclude supplementary alignments"); - addBoolean("contained", false, "true: Reads contained in the region; false: reads overlap with the region"); + addBoolean("contained", false, "Count reads contained in the region") + .setGalaxySetting(new GalaxySetting("boolean", null,false)); + + addBoolean("overlap", false, "Count number of read overlap with the region") + .setGalaxySetting(new GalaxySetting("boolean", null,false)); + addBoolean("span", false, "Count reads span the region") + .setGalaxySetting(new GalaxySetting("boolean", null,false)); addStdHelp(); + + setGalaxy(annotation.scriptName()); } public static void main(String[] args) throws IOException { @@ -88,74 +116,93 @@ public static void main(String[] args) throws IOException { CommandLine cmdLine = new CountReadInRegionCmd(); args = cmdLine.stdParseLine(args); - String output = cmdLine.getStringVal("output"); int flanking = cmdLine.getIntVal("flanking"); if (flanking < 0) - flanking = 0; - + flanking = 0; int qual = cmdLine.getIntVal("qual"); - int filter = cmdLine.getIntVal("filterBits"); - boolean contained = cmdLine.getBooleanVal("contained"); + int filter = cmdLine.getIntVal("filterBits"); - String strFile = cmdLine.getStringVal("xafFile"); - String bedFile = cmdLine.getStringVal("bedFile"); + 
boolean contained = cmdLine.getBooleanVal("contained"); + boolean overlap = cmdLine.getBooleanVal("overlap"); + boolean span = cmdLine.getBooleanVal("span"); - if (strFile!= null && bedFile != null){ - System.err.println("##ERROR: only one of bedFile and strFile is specified"); - System.err.println(cmdLine.usageString()); - System.exit(-1); - } - if (strFile== null && bedFile == null){ - System.err.println("##ERROR: one of bedFile and xafFile has to be specified"); + String bedFile = cmdLine.getStringVal("bedFile"); + if (!(contained || overlap || span)){ + System.err.print("ERROR: At least one of contained, overlap and span switched on\n"); System.err.println(cmdLine.usageString()); - System.exit(-1); + + System.exit(1); } - /**********************************************************************/ - ArrayList myList; - if(bedFile != null) - myList = JapsaFeature.readBED(bedFile); - else{ - ArrayList list - = TandemRepeat.readFromFile(SequenceReader.openFile(strFile), new ArrayList()); - myList = new ArrayList(list.size()); - for (TandemRepeat str:list){ - myList.add(str); - } - } - + /**********************************************************************/ + ArrayList myList = JapsaFeature.readBED(bedFile); + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); SequenceOutputStream os = SequenceOutputStream.makeOutputStream(output); - char sep = '\t'; + String sep = "\t"; int notCount = 0; - os.print("#H:chr\tID\tstart\tend"); + //os.print("#CMD:" + cmdLine.fullCmd() + '\n'); + os.print("#H:chrom\tID\tstart\tend"); + + /****************************************************************** + * This is to be removed later, will get back to list of bam files + */ + + args = new String[1]; + args[0] = cmdLine.getStringVal("bamFile"); + /****************** End of short cut ***********************/ SamReader [] readers = new SamReader[args.length]; for (int i = 0; i < readers.length; i++){ - File file = new File(args[i]); - os.print("\t" + 
file.getName().replace(".sam", "").replace(".bam","")); - readers[i] = SamReaderFactory.makeDefault().open(file); + File file = new File(args[i]); + + readers[i] = SamReaderFactory.makeDefault().open(file); + String sampleID = null; + //SAMReadGroupRecord groupID = + + List readGroups = readers[i].getFileHeader().getReadGroups(); + + if (readGroups != null && readGroups.size() > 0){ + sampleID = readGroups.get(0).getSample(); + } + + + if (sampleID == null){ + sampleID = file.getName().replace(".sam", "").replace(".bam",""); + } + + if (overlap) + os.print(sep + sampleID + "_overlap"); + + if (contained) + os.print(sep + sampleID + "_contained"); + + if (span) + os.print(sep + sampleID + "_span"); } os.print("\n"); - + os.flush(); for (JapsaFeature str:myList){ int start = str.getStart() - flanking; int end = str.getEnd() + flanking; - if (start < 0 ) - start = 0; + if (start <= 0 ) + start = 1; + //TODO: check if end > chr.length - os.print(str.getParent() + sep + str.getID() + sep + str.getStart() + sep + str.getEnd()); + os.print(str.getParent() + sep + str.getID() + sep + (str.getStart() -1) + sep + str.getEnd()); for (int i = 0; i < readers.length; i++){ - SAMRecordIterator iter = readers[i].query(str.getParent(), start, end, contained); - int count = 0; + SAMRecordIterator iter = readers[i].query(str.getParent(), start, end, false); + + int countOverlap = 0, countContained = 0, countSpan = 0; + while (iter.hasNext()){ SAMRecord rec = iter.next(); @@ -167,24 +214,50 @@ public static void main(String[] args) throws IOException { if ((filter & rec.getFlags()) != 0){ notCount ++; continue; - } - count ++; + } + countOverlap ++; + + int alignmentStart = rec.getAlignmentStart(); + int alignmentEnd = rec.getAlignmentEnd(); + + if (alignmentStart >= start && alignmentEnd <= end) + countContained ++; + + if (alignmentStart < start && alignmentEnd > end) + countSpan ++; + }//while iter.close(); - os.print(sep); - os.print(count); + + if (overlap) + os.print(sep + 
countOverlap); + + if (contained) + os.print(sep + countContained); + + if (span) + os.print(sep + countSpan); + + //os.print(sep); + //os.print(countOverlap); }//for os.print("\n"); }//for for (int i = 0; i < readers.length; i++){ readers[i].close(); } - os.close(); - Logging.info("Ignore " + notCount + " reads"); + LOG.info("Ignore " + notCount + " reads"); } - } +/*RST* +------------------------------------------------ +*jsa.hts.countReads*: Count reads from bam files +------------------------------------------------ + + + +*RST*/ diff --git a/src/main/java/japsa/tools/bio/hts/FastQTrimCmd.java b/src/main/java/japsa/tools/bio/hts/FastQTrimCmd.java index c3e936d..0bdf9cf 100644 --- a/src/main/java/japsa/tools/bio/hts/FastQTrimCmd.java +++ b/src/main/java/japsa/tools/bio/hts/FastQTrimCmd.java @@ -36,8 +36,9 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; @@ -50,7 +51,9 @@ @Deployable( scriptName = "jsa.hts.fqtrim", scriptDesc = "Trim reads from a fastq file and break the file to smaller ones") -public class FastQTrimCmd extends CommandLine{ +public class FastQTrimCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(FastQTrimCmd.class); + public FastQTrimCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -90,7 +93,8 @@ public static void main(String[] args) throws IOException{ boolean trim = cmdLine.getBooleanVal("trim"); if (end > 0 && begin >= end) { - Logging.exit("Begin "+(begin) + " must be smaller than end (" + end +")", -1); + LOG.error("Begin "+(begin) + " must be smaller than end (" + end +")", -1); + System.exit(1); } if (size == 0) @@ -171,6 +175,6 @@ public static void main(String[] args) throws IOException{ outStream.print('\n'); }//while 
outStream.close(); - Logging.info("Write " + countAll + " reads to " + index + " files" ); + LOG.info("Write " + countAll + " reads to " + index + " files" ); } } diff --git a/src/main/java/japsa/tools/bio/hts/GetN50Cmd.java b/src/main/java/japsa/tools/bio/hts/GetN50Cmd.java index 4cd8fae..4735831 100644 --- a/src/main/java/japsa/tools/bio/hts/GetN50Cmd.java +++ b/src/main/java/japsa/tools/bio/hts/GetN50Cmd.java @@ -36,13 +36,13 @@ import java.io.IOException; -import java.util.ArrayList; +import java.util.Arrays; import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.HTSUtilities; +import japsa.util.IntArray; import japsa.util.deploy.Deployable; /** @@ -50,9 +50,9 @@ * */ @Deployable( - scriptName = "jsa.hts.n50", - scriptDesc = "Compute N50 of an assembly" - ) + scriptName = "jsa.hts.n50", + scriptDesc = "Compute N50 of an assembly" + ) public class GetN50Cmd extends CommandLine{ //CommandLine cmdLine; public GetN50Cmd(){ @@ -71,10 +71,48 @@ public static void main(String [] args) throws IOException, InterruptedException /**********************************************************************/ String input = cmdTool.getStringVal("input"); - - - ArrayList seqs = SequenceReader.readAll(input, Alphabet.DNA()); - double n50 = HTSUtilities.n50(seqs); - System.out.println(n50 + "\t" + seqs.size()); + + + //ArrayList seqs = SequenceReader.readAll(input, Alphabet.DNA()); + Alphabet dna = Alphabet.DNA(); + //double n50 = HTSUtilities.n50(seqs); + //System.out.println(n50 + "\t" + seqs.size()); + + IntArray lengthArray = new IntArray(); + SequenceReader reader = SequenceReader.getReader(input); + + Sequence seq ; + while ((seq = reader.nextSequence(dna))!= null){ + lengthArray.add(seq.length()); + } + + int [] lengths = lengthArray.toArray(); + + double sum = 0; + for (int i = 0;i < lengths.length;i++){ + sum += lengths[i]; + } + Arrays.sort(lengths); + + int index = lengths.length; + double 
contains = 0; + while (contains < sum/2){ + index --; + contains += lengths[index]; + } + + int n50 = lengths[index]; + System.out.println(n50 + "\t" + lengths.length + "\t" + sum); + } } + +/*RST* +----------------------------------------- +*jsa.hts.n50*: Compute N50 of an assembly +----------------------------------------- + + + +*RST*/ + diff --git a/src/main/java/japsa/tools/bio/hts/HTSErrorAnalysisCmd.java b/src/main/java/japsa/tools/bio/hts/HTSErrorAnalysisCmd.java index e184a2f..764d66f 100644 --- a/src/main/java/japsa/tools/bio/hts/HTSErrorAnalysisCmd.java +++ b/src/main/java/japsa/tools/bio/hts/HTSErrorAnalysisCmd.java @@ -57,8 +57,10 @@ */ @Deployable( scriptName = "jsa.hts.errorAnalysis", - scriptDesc = "Error analysis of HTS sequencing data") -public class HTSErrorAnalysisCmd extends CommandLine{ + scriptDesc = "Error analysis of sequencing data") +public class HTSErrorAnalysisCmd extends CommandLine{ +// private static final Logger LOG = LoggerFactory.getLogger(HTSErrorAnalysisCmd.class); + public HTSErrorAnalysisCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -93,9 +95,6 @@ public static void main(String [] args) throws IOException, InterruptedException /** * Error analysis of a bam file. 
Assume it has been sorted - * @param bamFile - * @param pad - * @throws IOException */ static void errorAnalysis(String bamFile, String refFile, String pattern, int qual) throws IOException{ @@ -115,7 +114,8 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu totNumIns = 0, totNumDel = 0, totMisMatch = 0, - totMatch = 0; + totMatch = 0, + totClipped = 0; long totReadBase = 0, totRefBase = 0; int numReads = 0; @@ -131,7 +131,8 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu //make the read seq Sequence readSeq = new Sequence(Alphabet.DNA(), sam.getReadString(), sam.getReadName()); if (readSeq.length() <= 1){ - //Logging.warn(sam.getReadName() +" ignored"); + //LOG.warn(sam.getReadName() +" ignored"); + //TODO: This might be secondary alignment, need to do something about it continue; } @@ -143,8 +144,13 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu continue; } - if (sam.getMappingQuality() < qual) + int flag = sam.getFlags(); + + + if (sam.getMappingQuality() < qual) { + numNotAligned ++; continue; + } @@ -169,8 +175,8 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu totMatch += profile.match; totReadBase += profile.readBase; - totRefBase += profile.refBase; - + totRefBase += profile.refBase; + totClipped += profile.readClipped; //numReadsConsidered ++; } samReader.close(); @@ -183,6 +189,8 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu System.out.println("Insertion " + totBaseIns + " " + totNumIns+" " + totBaseIns*1.0/totRefBase); System.out.println("MisMatch " + totMisMatch +" " + totMisMatch*1.0/totRefBase); System.out.println("Match " + totMatch); + System.out.println("Clipped " + totClipped); + System.out.println("ReadBase " + totReadBase); @@ -199,7 +207,7 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu double probDE = (1.0 + totBaseDel - totNumDel) 
/ (2.0 +totBaseDel); double probIE = (1.0 + totBaseIns - totNumIns) / (2.0 +totBaseIns); - System.out.printf("Indentity %f %f %f %f\n",1.0 *totMatch/(totMatch + totMisMatch + totBaseDel +totBaseIns), + System.out.printf("Identity %f %f %f %f\n",1.0 *totMatch/(totMatch + totMisMatch + totBaseDel +totBaseIns), 1.0 *totMisMatch/(totMatch + totMisMatch + totBaseDel +totBaseIns), 1.0 *totBaseIns/(totMatch + totMisMatch + totBaseDel +totBaseIns), 1.0 *totBaseDel/(totMatch + totMisMatch + totBaseDel +totBaseIns )); @@ -220,3 +228,17 @@ static void errorAnalysis(String bamFile, String refFile, String pattern, int qu } } + +/*RST* +---------------------------------------------------------- +*jsa.hts.errorAnalysis*: Error analysis of sequencing data +---------------------------------------------------------- + +*jsa.hts.errorAnalysis* assesses the error profile of sequencing data by getting the numbers +of errors (mismatches, indels etc) from a bam file. Obviously, it does not distinguish +sequencing errors from mutations, and hence consider mutations as errors. It is best to use +with the bam file from aligning sequencing reads to a reliable assembly of the sample. 
+ + + +*RST*/ \ No newline at end of file diff --git a/src/main/java/japsa/tools/bio/hts/SelectReadIntersectCmd.java b/src/main/java/japsa/tools/bio/hts/SelectReadIntersectCmd.java index 7b0e90e..3758ccf 100644 --- a/src/main/java/japsa/tools/bio/hts/SelectReadIntersectCmd.java +++ b/src/main/java/japsa/tools/bio/hts/SelectReadIntersectCmd.java @@ -336,7 +336,7 @@ private static void filterSamPair(String inFile, String outFile, String strFile, samWriter = new SAMTextWriter(new File("2_" + outFile)); samWriter.setSortOrder(SortOrder.unsorted, false); - samWriter.writeHeader( samHeader.getTextHeader()); + samWriter.writeHeader(samHeader.getTextHeader()); samIter = samReader.iterator(); while (samIter.hasNext()){ diff --git a/src/main/java/japsa/tools/bio/hts/VNTRDepthCmd.java b/src/main/java/japsa/tools/bio/hts/VNTRDepthCmd.java index 2b2afae..8b031cd 100644 --- a/src/main/java/japsa/tools/bio/hts/VNTRDepthCmd.java +++ b/src/main/java/japsa/tools/bio/hts/VNTRDepthCmd.java @@ -47,7 +47,6 @@ import java.util.HashMap; import java.util.List; - import japsa.seq.SequenceOutputStream; import japsa.seq.XAFReader; import japsa.util.CommandLine; @@ -325,7 +324,6 @@ else if (alignmentStart < rightBound5) if (depth){ String cmd = "samtools depth -q "+qual+" -r " + chrom + ":" + startSeq + "-" + endRep +" " + bamFiles[i]; - //Logging.info("Run " + cmd); Process process = Runtime.getRuntime().exec(cmd); BufferedReader depthReader = new BufferedReader (new InputStreamReader(process.getInputStream())); String depthLine = ""; diff --git a/src/main/java/japsa/tools/bio/hts/VNTRLongReadsCmd.java b/src/main/java/japsa/tools/bio/hts/VNTRLongReadsCmd.java new file mode 100644 index 0000000..3792f52 --- /dev/null +++ b/src/main/java/japsa/tools/bio/hts/VNTRLongReadsCmd.java @@ -0,0 +1,895 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. 
* + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/**************************************************************************** + * Revision History + * 15/05/2014 - Minh Duc Cao: Started + * + ****************************************************************************/ +package japsa.tools.bio.hts; + +import japsa.bio.alignment.MultipleAlignment; +import japsa.bio.alignment.ProfileDP; +import japsa.bio.alignment.ProfileDP.EmissionState; +import japsa.bio.tr.TandemRepeat; +import japsa.bio.tr.TandemRepeatVariant; +import japsa.seq.Alphabet; +import japsa.seq.FastaReader; +import japsa.seq.SequenceOutputStream; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.seq.XAFReader; +import japsa.util.ByteArray; +import japsa.util.CommandLine; +import japsa.util.DoubleArray; +import japsa.util.IntArray; +import japsa.util.JapsaMath; +import japsa.util.deploy.Deployable; +import japsa.xm.expert.Expert; +import japsa.xm.expert.MarkovExpert; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * VNTR typing using long reads + * + */ + +@Deployable(scriptName = "jsa.tr.longreads", scriptDesc = "VNTR typing using long reads") +public class VNTRLongReadsCmd extends CommandLine { + private static final Logger LOG = LoggerFactory.getLogger(VNTRLongReadsCmd.class); + public VNTRLongReadsCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + CommandLine.Option referenceOpt = + addString("reference", null, "Name of 
the reference genome ", true); + ///addStdInputFile(); + CommandLine.Option bamFileOpt = + addString("bamFile", null, "Name of the bam file", true); + + CommandLine.Option outputOpt = + addString("output", "-", + "Name of the output file, - for stdout"); + + CommandLine.Option xafFileOpt = + addString("xafFile", null, "Name of the regions file in xaf", + true); + + CommandLine.Option flankingOpt = + addInt("flanking", 30, "Size of the flanking regions"); + + CommandLine.Option minQualOpt = + addInt("qual", 0, "Minimum quality"); + + addInt("iteration", 1, "Number of iteration"); + addInt("nploidy",2, + "The ploidy of the genome 1 = happloid, 2 = diploid. Currenly only support up to 2-ploidy"); + addString("prefix", "", + "Prefix of temporary files, if not specified, will be automatically generated"); + + ///////////////Adding galaxy support///////////// + flankingOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + minQualOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + xafFileOpt.setGalaxySetting(new GalaxySetting("data", "tabular",false)); + + GalaxySetting galaxyOutput = new GalaxySetting("data", "text",true); + galaxyOutput.setLabel("countRead.txt"); + outputOpt.setGalaxySetting(galaxyOutput); + bamFileOpt.setGalaxySetting(new GalaxySetting("data", "bam",false)); + referenceOpt.setGalaxySetting(new GalaxySetting("data", "fasta",false)); + setGalaxy(annotation.scriptName()); + + + addStdHelp(); + } + + static Alphabet dna = Alphabet.DNA16(); + static IntArray profilePositions = new IntArray(); + static IntArray seqPositions = new IntArray(); + static DoubleArray costGeneration = new DoubleArray(); + static ByteArray byteArray = new ByteArray(); + + + public static void main(String[] args) throws Exception, + InterruptedException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new VNTRLongReadsCmd(); + args = cmdLine.stdParseLine(args); + 
/**********************************************************************/ + int flanking = cmdLine.getIntVal("flanking"); + if (flanking < 10) + flanking = 10; + + int qual = cmdLine.getIntVal("qual"); + + int np = cmdLine.getIntVal("nploidy"); + if (np > 2) { + System.err.println("The program currenly only support haploid and diployd. Enter nploidy of 1 or 2"); + System.exit(1); + } + + String bamFile = cmdLine.getStringVal("bamFile"); + String prefix = cmdLine.getStringVal("prefix"); + + if (prefix == null || prefix.length() == 0) { + prefix = "p" + System.currentTimeMillis(); + } + /**********************************************************************/ + + SequenceOutputStream outOS = SequenceOutputStream + .makeOutputStream(cmdLine.getStringVal("output")); + + String[] headers = TandemRepeatVariant.SIMPLE_HEADERS; + if (np > 1) { + headers = TandemRepeatVariant.SIMPLE_HEADERS2; + } + + TandemRepeatVariant.printHeader(outOS, headers); + + String strFile = cmdLine.getStringVal("xafFile"); + + LOG.info("Read genome begins"); + HashMap genome = new HashMap (); + SequenceReader seqReader = SequenceReader.getReader(cmdLine.getStringVal("reference")); + Sequence seq; + while ((seq = seqReader.nextSequence(dna)) != null){ + genome.put(seq.getName(), seq); + } + seqReader.close(); + LOG.info("Read genome done"); + + /**********************************************************************/ + XAFReader xafReader = new XAFReader(strFile); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + Expert.setAlphabet(Alphabet.DNA4()); + + ArrayList readSequences = new ArrayList(); + + //Go through the list of repeats + while (xafReader.next() != null){ + TandemRepeat str = TandemRepeat.read(xafReader); + + //start,end = the start and end of the region (including flanks) + int start = Integer.parseInt(xafReader.getField("start")) - flanking; + int end = 
Integer.parseInt(xafReader.getField("end")) + flanking; + String chrom = xafReader.getField("chrom"); + + if (seq == null || !seq.getName().equals(chrom)){ + seq = genome.get(chrom); + } + if (seq == null){ + xafReader.close(); + LOG.error("Chrom in line " + xafReader.lineNo() + " not found!!!"); + System.exit(1); + } + + if (end > seq.length()) + end = seq.length(); + + if (start < 1) + start = 1; + + int hmmFlank = flanking; + int period = str.getPeriod(); + double fraction = str.getUnitNo() - Math.floor(str.getUnitNo()); + int hmmPad = (int)(fraction * period ) ; + + //System.out.println("###" + str.getPeriod() + " " + str.getUnitNo() + " " + hmmPad); + Sequence hmmSeq = new Sequence(dna, hmmFlank * 2 + hmmPad + str.getPeriod()); + int i = 0; + + for (;i < hmmFlank + hmmPad + str.getPeriod(); i++) + hmmSeq.setBase(i, seq.getBase(str.getStart() - hmmFlank + i -1)); + + for (;i < hmmSeq.length();i++){ + byte base = seq.getBase(str.getEnd() + i - (hmmFlank + hmmPad + str.getPeriod()) );//no need to -1 + hmmSeq.setBase(i,base); + } + + ProfileDP dp = new ProfileDP(hmmSeq, hmmFlank + hmmPad, hmmFlank + hmmPad + str.getPeriod() - 1);//-1 for 0-index, inclusive + + //System.out.println("Lengths: " + hmmFlank + ", " + hmmPad + " " + str.getPeriod() + " " + hmmSeq.length() ); + //System.out.println("CHECKING BEGIN"); + + outOS.print("##"+str.getID()+"\n## "); + for (int x = 0; x < hmmSeq.length();x++){ + outOS.print(hmmSeq.charAt(x)); + if (x == hmmFlank + hmmPad -1 || x == hmmFlank + hmmPad + period - 1) + outOS.print("=="); + } + outOS.println(); + + //run on the reference + //if (1==0) + { + Sequence refRepeat = seq.subSequence(start, end); + refRepeat.setName("reference"); + processRead(refRepeat, dp, fraction, hmmFlank, hmmPad, period, outOS ); + + } + + SAMRecordIterator iter = reader.query(str.getParent(), start, end, false); + + String fileName = prefix + "_" + str.getID() + "_i.fasta"; + SequenceOutputStream os = SequenceOutputStream.makeOutputStream(fileName); 
+ + // double var = 0; + TandemRepeatVariant trVar = new TandemRepeatVariant(); + trVar.setTandemRepeat(str); + + int readIndex = 0; + + //Clear the list + readSequences.clear(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + // Check qualilty + if (rec.getMappingQuality() < qual) { + continue; + } + + // Only reads that fully span the repeat and flankings + int currentRefPos = rec.getAlignmentStart(); + if (currentRefPos > start) + continue; + if (rec.getAlignmentEnd() < end) + continue; + + readIndex ++; + //////////////////////////////////////////////////////////////////// + //assert currentRefBase < start + + Sequence readSeq = getReadPosition(rec,start,end); + if (readSeq == null) + continue; + + + String readName = readSeq.getName(); + String [] toks = readName.split("/",4); + + String polymerageRead = (toks.length > 1) ? toks[1] : toks[0]; + String subRead = (toks.length > 2) ? toks[2] : "_"; + //String alignSubRead = (toks.length > 3) ? toks[3] : "_"; + readSeq.setName(polymerageRead + "_" + subRead); + readSeq.writeFasta(os); + + //processRead(readSeq, dp, fraction, hmmFlank, hmmPad, period, outOS ); + readSequences.add(readSeq); + }// while + iter.close(); + os.close(); + //readSequences: an array of reads + + ProfileDP dpBatch = new ProfileDP(hmmSeq, hmmFlank + hmmPad, hmmFlank + hmmPad + str.getPeriod() - 1);//-1 for 0-index, inclusive + processBatch(readSequences, dpBatch, fraction, hmmFlank, hmmPad, period, outOS ); + + outOS.print(trVar.toString(headers)); + outOS.print('\n'); + }// for + + reader.close(); + outOS.close(); + } + + static private void processBatch(ArrayList readBatch, ProfileDP dpBatch, double fraction, int hmmFlank, int hmmPad, int period, SequenceOutputStream outOS ) throws IOException{ + + for (int round = 0; round < 5;round ++){ + double myCost = 0; + int countIns = 0, countDel = 0, countMG = 0, countMB = 0; + for (Sequence readSeq:readBatch){ + EmissionState bestState = dpBatch.align(readSeq); + //TODO: make a filter 
here: select only eligible alignment + double alignScore = bestState.getScore(); + countIns += bestState.getCountIns(); + countDel += bestState.getCountDel(); + countMG += bestState.getCountMG(); + countMB += bestState.getCountMB(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + myCost += alignScore; + }//for readSeq + double sum = 3.0 + countMG + countMB + countIns + countDel; + double insP = (countIns + 1.0) /sum; + double delP = (countDel + 1.0) /sum; + double matP = (countMG + countMB + 1.0) /sum; + double matchP = (countMG + 1.0) / (countMG + countMB + 2.0); + double misMatchP = 1 - matchP; + System.out.printf("Total: %3d %3d %3d %3d %8.4f %8.4f %8.4f %8.4f\n", countMG, countMB, countIns, countDel, insP, delP, misMatchP,myCost); + dpBatch.setTransitionProbability(matP, insP, delP); + dpBatch.setMatchProbability(matchP); + }//round + + for (Sequence readSeq:readBatch){ + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + + EmissionState bestState = dpBatch.align(readSeq); + double alignScore = bestState.getScore(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + + EmissionState lastState = bestState; + bestState = bestState.bwdState; + + while (bestState != null){ + profilePositions.add(bestState.profilePos); + seqPositions.add(bestState.profilePos); + costGeneration.add(lastState.score - bestState.score); + + if 
(bestState.seqPos == lastState.seqPos) + byteArray.add((byte)Alphabet.DNA.GAP); + else + byteArray.add(readSeq.getBase(lastState.seqPos)); + + lastState = bestState; + bestState = bestState.bwdState; + } + + double costL = 0, costR = 0, costCurrentRep = 0, costRep = 0; + int stateL = 0, stateR = 0, stateCurrentRep = 0, stateRep = 0; + int baseL = 0, baseR = 0, baseCurrentRep = 0, baseRep = 0; + int bSeqL = 0, bSeqR = 0, bSeqCurrentRep = 0, bSeqRep = 0; + + int lastProfilePos = -1, lastSeqPos = -1; + + for (int x = profilePositions.size() - 1; x >=0; x--){ + outOS.print(Alphabet.DNA().int2char(byteArray.get(x))); + + int profilePos = profilePositions.get(x); + int seqPos = seqPositions.get(x); + + if (profilePos < hmmFlank + hmmPad){ + stateL ++; + costL += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseL ++; + + if (lastSeqPos != seqPos) + bSeqL ++; + + }else if(profilePos > hmmFlank + hmmPad + period){ + stateR ++; + costR += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseR ++; + + if (lastSeqPos != seqPos) + bSeqR ++; + }else{ + stateCurrentRep ++; + costCurrentRep += costGeneration.get(x); + + stateRep ++; + costRep += costGeneration.get(x); + + if (lastProfilePos != profilePos){ + baseRep ++; + baseCurrentRep ++; + } + + if (lastSeqPos != seqPos){ + bSeqRep ++; + bSeqCurrentRep ++; + } + + } + + //end of a repeat cycle + if (profilePos < lastProfilePos){ + outOS.print("<-----------------REP " + costCurrentRep + + " " + stateCurrentRep + + " " + (stateCurrentRep == 0?"inf": "" + (costCurrentRep/stateCurrentRep)) + + " " + baseCurrentRep + + " " + (baseCurrentRep == 0?"inf": "" + (costCurrentRep/baseCurrentRep)) + + " " + bSeqCurrentRep + + " " + (bSeqCurrentRep == 0?"inf": "" + (costCurrentRep/bSeqCurrentRep)) + ); + + if (costCurrentRep/bSeqCurrentRep > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + costCurrentRep = 0;//restart + stateCurrentRep = 0;//restart + 
baseCurrentRep = 0; + bSeqCurrentRep = 0; + } + + //left + if (profilePos >= hmmFlank + hmmPad && lastProfilePos < hmmFlank + hmmPad){ + outOS.print("<-----------------LEFT " + costL + + " " + stateL + + " " + (stateL == 0?"inf": "" + (costL/stateL)) + + " " + baseL + + " " + (baseL == 0?"inf": "" + (costL/baseL)) + + " " + bSeqL + + " " + (bSeqL == 0?"inf": "" + (costL/bSeqL)) + ); + if (costL/bSeqL > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + + } + + //right + //if (profilePos < hmmFlank + hmmPad + period && lastProfilePos >= hmmFlank + hmmPad + period){ + // outOS.print("<-----------------RIGHT " + costR + // + " " + stateR + // + " " + (stateR == 0?"inf": "" + (costR/stateR)) + // + " " + baseR + // + " " + (baseR == 0?"inf": "" + (costR/baseR)) + // + " " + bSeqR + // + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + // ); + // outOS.println(); + //} + lastProfilePos = profilePos; + lastSeqPos = seqPos; + + }//for x + + //move to out of the loop + outOS.print("<-----------------RIGHT " + costR + + " " + stateR + + " " + (stateR == 0?"inf": "" + (costR/stateR)) + + " " + baseR + + " " + (baseR == 0?"inf": "" + (costR/baseR)) + + " " + bSeqR + + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + ); + + if (costR/bSeqR > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + //outOS.println(); + + outOS.println(); + outOS.print ("L = " + (costL/(hmmFlank + hmmPad)) + " R = " + costR/(dpBatch.getProfileLength() - hmmFlank - hmmPad - period) + "\n"); + + /*****************************************************************/ + outOS.print("##" + readSeq.getName() +"\t"+bestIter+"\t"+readSeq.length() +"\t" +alignScore+"\t" + alignScore/readSeq.length() + '\t' + costM + "\t" + costM / readSeq.length() + "\t" + costL + "\t" + stateL + "\t" + costR + "\t" + stateR + "\t" + (alignScore - costL - costR) + "\t" + stateRep + "\t" + pass + '\n'); + 
outOS.print("==================================================================\n"); + } + } + /*******************************************************************/ + + static private void processRead(Sequence readSeq, ProfileDP dp, double fraction, int hmmFlank, int hmmPad, int period, SequenceOutputStream outOS ) throws IOException{ + + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + + + outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + EmissionState bestState = dp.align(readSeq); + double alignScore = bestState.getScore(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + + /*******************************************************************/ + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + + //double oldCost = bestState.score; + EmissionState lastState = bestState; + bestState = bestState.bwdState; + + while (bestState != null){ + profilePositions.add(bestState.profilePos); + seqPositions.add(bestState.profilePos); + costGeneration.add(lastState.score - bestState.score); + + if (bestState.seqPos == lastState.seqPos) + byteArray.add((byte)Alphabet.DNA.GAP); + else + byteArray.add(readSeq.getBase(lastState.seqPos)); + + lastState = bestState; + bestState = bestState.bwdState; + } + + double costL = 0, costR = 0, costCurrentRep = 0, costRep = 0; + int stateL = 0, stateR = 0, stateCurrentRep = 0, stateRep = 0; + int baseL = 0, baseR = 0, baseCurrentRep = 0, baseRep = 0; + int bSeqL = 0, bSeqR = 0, bSeqCurrentRep = 0, bSeqRep = 0; + + int lastProfilePos = -1, lastSeqPos = -1; + + for (int x = profilePositions.size() - 1; x >=0; x--){ + 
outOS.print(Alphabet.DNA().int2char(byteArray.get(x))); + + int profilePos = profilePositions.get(x); + int seqPos = seqPositions.get(x); + + if (profilePos < hmmFlank + hmmPad){ + stateL ++; + costL += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseL ++; + + if (lastSeqPos != seqPos) + bSeqL ++; + + }else if(profilePos > hmmFlank + hmmPad + period){ + stateR ++; + costR += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseR ++; + + if (lastSeqPos != seqPos) + bSeqR ++; + }else{ + stateCurrentRep ++; + costCurrentRep += costGeneration.get(x); + + stateRep ++; + costRep += costGeneration.get(x); + + if (lastProfilePos != profilePos){ + baseRep ++; + baseCurrentRep ++; + } + + if (lastSeqPos != seqPos){ + bSeqRep ++; + bSeqCurrentRep ++; + } + + } + + //end of a repeat cycle + if (profilePos < lastProfilePos){ + outOS.print("<-----------------REP " + costCurrentRep + + " " + stateCurrentRep + + " " + (stateCurrentRep == 0?"inf": "" + (costCurrentRep/stateCurrentRep)) + + " " + baseCurrentRep + + " " + (baseCurrentRep == 0?"inf": "" + (costCurrentRep/baseCurrentRep)) + + " " + bSeqCurrentRep + + " " + (bSeqCurrentRep == 0?"inf": "" + (costCurrentRep/bSeqCurrentRep)) + ); + + if (costCurrentRep/bSeqCurrentRep > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + costCurrentRep = 0;//restart + stateCurrentRep = 0;//restart + baseCurrentRep = 0; + bSeqCurrentRep = 0; + } + + //left + if (profilePos >= hmmFlank + hmmPad && lastProfilePos < hmmFlank + hmmPad){ + outOS.print("<-----------------LEFT " + costL + + " " + stateL + + " " + (stateL == 0?"inf": "" + (costL/stateL)) + + " " + baseL + + " " + (baseL == 0?"inf": "" + (costL/baseL)) + + " " + bSeqL + + " " + (bSeqL == 0?"inf": "" + (costL/bSeqL)) + ); + if (costL/bSeqL > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + + } + + //right + //if (profilePos < hmmFlank + hmmPad 
+ period && lastProfilePos >= hmmFlank + hmmPad + period){ + // outOS.print("<-----------------RIGHT " + costR + // + " " + stateR + // + " " + (stateR == 0?"inf": "" + (costR/stateR)) + // + " " + baseR + // + " " + (baseR == 0?"inf": "" + (costR/baseR)) + // + " " + bSeqR + // + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + // ); + // outOS.println(); + //} + lastProfilePos = profilePos; + lastSeqPos = seqPos; + + }//for x + + //move to out of the loop + outOS.print("<-----------------RIGHT " + costR + + " " + stateR + + " " + (stateR == 0?"inf": "" + (costR/stateR)) + + " " + baseR + + " " + (baseR == 0?"inf": "" + (costR/baseR)) + + " " + bSeqR + + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + ); + + if (costR/bSeqR > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + //outOS.println(); + + outOS.println(); + outOS.print ("L = " + (costL/(hmmFlank + hmmPad)) + " R = " + costR/(dp.getProfileLength() - hmmFlank - hmmPad - period) + "\n"); + + /*****************************************************************/ + outOS.print("##" + readSeq.getName() +"\t"+bestIter+"\t"+readSeq.length() +"\t" +alignScore+"\t" + alignScore/readSeq.length() + '\t' + costM + "\t" + costM / readSeq.length() + "\t" + costL + "\t" + stateL + "\t" + costR + "\t" + stateR + "\t" + (alignScore - costL - costR) + "\t" + stateRep + "\t" + pass + '\n'); + outOS.print("==================================================================\n"); + } + + public static Sequence getReadPosition(SAMRecord rec, int startRef, int endRef){ + byte[] seqRead = rec.getReadBases();// + if (seqRead.length <= 1) + return null; + + int startRead = -1, endRead = -1; + + int refPos = rec.getAlignmentStart(); + int readPos = 0; + //currentRefPos <= startRead + + for (final CigarElement e : rec.getCigar().getCigarElements()) { + int length = e.getLength(); + switch (e.getOperator()) { + case H: + break; // ignore hard clips + case P: + break; // ignore pads + case S: + readPos += 
e.getLength(); + break; // soft clip read bases + case N: // N ~ D + case D: + refPos += length; + + if (startRead < 0 && refPos >= startRef){ + startRead = readPos; + } + + if (endRead < 0 && refPos >= endRef){ + endRead = readPos; + } + + break;// case + case I: + readPos += length; + break; + + case M: + case EQ: + case X: + if ((startRead < 0) && refPos + length >= startRef) { + startRead = readPos + startRef - refPos; + } + + if ((endRead < 0) && (refPos + length >= endRef)){ + endRead = readPos + endRef - refPos; + } + + refPos += length; + readPos += length; + break; + default: + throw new IllegalStateException( + "Case statement didn't deal with cigar op: " + + e.getOperator()); + }// case + if (refPos >= endRef) + break;//for + + }// for + if (startRead < 0 || endRead < 0){ + LOG.warn(" " + refPos + " " + readPos + " " + startRead + " " + endRead); + return null; + } + + Alphabet alphabet = Alphabet.DNA16(); + Sequence retSeq = new Sequence(alphabet, endRead - startRead + 1, rec.getReadName() + "/" + startRead + "_" + endRead); + for (int i = 0; i < retSeq.length();i++){ + retSeq.setBase(i, alphabet.byte2index(seqRead[startRead + i])); + } + return retSeq; + + } + + /** + * + * @param seqList + * @param indexStart + * : the start index of the list (inclusive) + * @param indexEnd + * : the end index of the list (exclusive) + */ + static int call(ArrayList seqList, int indexStart, int indexEnd) { + if (indexEnd <= indexStart) + return 0; + + // Get consensus + int gaps = 0; + Sequence nSeq = new Sequence(Alphabet.DNA6(), seqList.get(0).length(), + "consensus"); + int[] votes = new int[6]; + for (int i = 0; i < nSeq.length(); i++) { + Arrays.fill(votes, 0); + for (int s = indexStart; s < indexEnd; s++) { + votes[seqList.get(s).symbolAt(i)]++; + } + byte best = 0; + for (byte b = 1; b < 6; b++) + if (votes[b] > votes[best]) + best = b; + + nSeq.setBase(i, best); + if (best == 5) + gaps++; + }// for + return gaps; + } + + static int call(ArrayList seqList) { + 
return call(seqList,0,seqList.size()); + + } + + static void aaa(Sequence seq, TandemRepeat str, int flanking, SamReader reader, int qual, String prefix, int np) throws IOException, InterruptedException{ + String cmd = "kalign -gpo 60 -gpe 10 -tgpe 0 -bonus 0 -q -i " + prefix + + "i.fasta -o " + prefix + "o.fasta"; + + //cmd = "clustalo --force -i " + prefix + "i.fasta -o " + prefix + // + "o.fasta"; + + + String chrom = str.getChr(); + int start = str.getStart() - flanking; + int end = str.getEnd() + flanking; + + if (start < 0) start = 0; + if (end > seq.length()) + end = seq.length(); + + SAMRecordIterator iter + = reader.query(chrom, start, end, false); + + + int maxAlign = 300; + + MultipleAlignment ma = new MultipleAlignment(maxAlign, seq); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + // Check qualilty + if (rec.getMappingQuality() < qual) { + continue; + } + + // Only reads that fully span the repeat and flankings + if (rec.getAlignmentStart() > start) + continue; + if (rec.getAlignmentEnd() < end) + continue; + + ma.addRead(rec); + }// while + iter.close(); + // os.close(); + + double var = 0; + TandemRepeatVariant trVar = new TandemRepeatVariant(); + trVar.setTandemRepeat(str); + + if (ma.printFasta(start, end, prefix + "i.fasta") > 0) { + LOG.info("Running " + cmd); + Process process = Runtime.getRuntime().exec(cmd); + process.waitFor(); + LOG.info("Done " + cmd); + + SequenceReader hmmSeqReader + = FastaReader.getReader(prefix + "i.fasta"); + Sequence readSeq; + + SequenceReader msaReader + = FastaReader.getReader(prefix + "o.fasta"); + ArrayList seqList = new ArrayList(); + Sequence nSeq = null; + while ((nSeq = msaReader.nextSequence(Alphabet.DNA16())) != null) { + seqList.add(nSeq); + } + //str.getChr()+"_"+str.getStart()+"_"+str.getEnd(); + + if (np >= 2) { + trVar.setVar2(var); + trVar.addEvidence(seqList.size()); + } else {// nploidy ==1 + int llength = seqList.get(0).length(); + + int gaps = call(seqList); + + var = (llength - gaps 
- end + start) * 1.0 + / str.getPeriod(); + + trVar.setVar(var); + trVar.addEvidence(seqList.size()); + } + }// if + } +} diff --git a/src/main/java/japsa/tools/bio/hts/VNTRLongReadsV2Cmd.java b/src/main/java/japsa/tools/bio/hts/VNTRLongReadsV2Cmd.java new file mode 100644 index 0000000..3da6e73 --- /dev/null +++ b/src/main/java/japsa/tools/bio/hts/VNTRLongReadsV2Cmd.java @@ -0,0 +1,667 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/**************************************************************************** + * Revision History + * 15/05/2014 - Minh Duc Cao: Started + * + ****************************************************************************/ +package japsa.tools.bio.hts; + +import japsa.bio.alignment.ProfileDP; +import japsa.bio.alignment.ProfileDP.EmissionState; +import japsa.bio.alignment.ppfsm.Emission; +import japsa.bio.alignment.ppfsm.ProfilePFSM; +import japsa.bio.alignment.ppfsm.VNTRpThreeSM; +import japsa.bio.alignment.ppfsm.VNTRpOneSM; +import japsa.bio.tr.TandemRepeat; +import japsa.bio.tr.TandemRepeatVariant; +import japsa.seq.Alphabet; +import japsa.seq.SequenceOutputStream; +import japsa.seq.Sequence; +import japsa.seq.SequenceReader; +import japsa.seq.XAFReader; +import japsa.util.ByteArray; +import japsa.util.CommandLine; +import japsa.util.DoubleArray; +import japsa.util.IntArray; +import japsa.util.JapsaMath; +import japsa.util.deploy.Deployable; +import japsa.xm.expert.Expert; +import japsa.xm.expert.MarkovExpert; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; 
+import htsjdk.samtools.ValidationStringency; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * VNTR typing using long reads + * + */ + +@Deployable(scriptName = "jsa.tr.longreadsv2", scriptDesc = "VNTR typing using long reads") +public class VNTRLongReadsV2Cmd extends CommandLine { + private static final Logger LOG = LoggerFactory.getLogger(VNTRLongReadsV2Cmd.class); + public VNTRLongReadsV2Cmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + CommandLine.Option referenceOpt = + addString("reference", null, "Name of the reference genome ", true); + ///addStdInputFile(); + CommandLine.Option bamFileOpt = + addString("bamFile", null, "Name of the bam file", true); + + CommandLine.Option outputOpt = + addString("output", "-", + "Name of the output file, - for stdout"); + + CommandLine.Option xafFileOpt = + addString("xafFile", null, "Name of the regions file in xaf", + true); + + CommandLine.Option flankingOpt = + addInt("flanking", 30, "Size of the flanking regions"); + + CommandLine.Option minQualOpt = + addInt("qual", 0, "Minimum quality"); + + addInt("iteration", 1, "Number of iteration"); + addInt("nploidy",2, + "The ploidy of the genome 1 = happloid, 2 = diploid. 
Currenly only support up to 2-ploidy"); + addString("prefix", "", + "Prefix of temporary files, if not specified, will be automatically generated"); + + ///////////////Adding galaxy support///////////// + flankingOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + minQualOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + xafFileOpt.setGalaxySetting(new GalaxySetting("data", "tabular",false)); + + GalaxySetting galaxyOutput = new GalaxySetting("data", "text",true); + galaxyOutput.setLabel("countRead.txt"); + outputOpt.setGalaxySetting(galaxyOutput); + bamFileOpt.setGalaxySetting(new GalaxySetting("data", "bam",false)); + referenceOpt.setGalaxySetting(new GalaxySetting("data", "fasta",false)); + setGalaxy(annotation.scriptName()); + + + addStdHelp(); + } + + static Alphabet dna = Alphabet.DNA16(); + static IntArray profilePositions = new IntArray(); + static IntArray seqPositions = new IntArray(); + static DoubleArray costGeneration = new DoubleArray(); + static ByteArray byteArray = new ByteArray(); + + + public static void main(String[] args) throws Exception, + InterruptedException { + /*********************** Setting up script ****************************/ + CommandLine cmdLine = new VNTRLongReadsV2Cmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + int flanking = cmdLine.getIntVal("flanking"); + if (flanking < 10) + flanking = 10; + + int qual = cmdLine.getIntVal("qual"); + + int np = cmdLine.getIntVal("nploidy"); + if (np > 2) { + System.err.println("The program currenly only support haploid and diployd. 
Enter nploidy of 1 or 2"); + System.exit(1); + } + + String bamFile = cmdLine.getStringVal("bamFile"); + String prefix = cmdLine.getStringVal("prefix"); + + if (prefix == null || prefix.length() == 0) { + prefix = "p" + System.currentTimeMillis(); + } + /**********************************************************************/ + + SequenceOutputStream outOS = SequenceOutputStream + .makeOutputStream(cmdLine.getStringVal("output")); + + SequenceOutputStream sequenceOut = SequenceOutputStream + .makeOutputStream(prefix + "output.fasta"); + + String[] headers = TandemRepeatVariant.SIMPLE_HEADERS; + if (np > 1) { + headers = TandemRepeatVariant.SIMPLE_HEADERS2; + } + + TandemRepeatVariant.printHeader(outOS, headers); + + String strFile = cmdLine.getStringVal("xafFile"); + + LOG.info("Read genome begins"); + HashMap genome = new HashMap (); + SequenceReader seqReader = SequenceReader.getReader(cmdLine.getStringVal("reference")); + Sequence seq; + while ((seq = seqReader.nextSequence(dna)) != null){ + genome.put(seq.getName(), seq); + } + seqReader.close(); + LOG.info("Read genome done"); + + /**********************************************************************/ + XAFReader xafReader = new XAFReader(strFile); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + SamReader reader = SamReaderFactory.makeDefault().open(new File(bamFile)); + + Expert.setAlphabet(Alphabet.DNA4()); + + ArrayList readSequences = new ArrayList(); + + while (xafReader.next() != null){ + TandemRepeat str = TandemRepeat.read(xafReader); + + //start,end = the start and end of the region (including flanks) + int start = Integer.parseInt(xafReader.getField("start")); + int end = Integer.parseInt(xafReader.getField("end")); + String chrom = xafReader.getField("chrom"); + + if (seq == null || !seq.getName().equals(chrom)){ + seq = genome.get(chrom); + } + if (seq == null){ + xafReader.close(); + LOG.error("Chrom in line " + xafReader.lineNo() + " not found!!!", 1); + 
System.exit(1); + } + + int period = str.getPeriod(); + double fraction = str.getUnitNo() - Math.floor(str.getUnitNo()); + int hmmPad = (int)(fraction * period ) ; + + int hmmFlank = flanking; + + //System.out.println("###" + str.getPeriod() + " " + str.getUnitNo() + " " + hmmPad); + Sequence hmmSeq = new Sequence(dna, hmmFlank * 2 + hmmPad + str.getPeriod()); + int i = 0; + + for (;i < hmmFlank + hmmPad + str.getPeriod(); i++) + hmmSeq.setBase(i, seq.getBase(str.getStart() - hmmFlank + i -1)); + + for (;i < hmmSeq.length();i++){ + byte base = seq.getBase(str.getEnd() + i - (hmmFlank + hmmPad + str.getPeriod()) );//no need to -1 + hmmSeq.setBase(i,base); + } + + ProfileDP dp = new ProfileDP(hmmSeq, hmmFlank + hmmPad, hmmFlank + hmmPad + str.getPeriod() - 1);//-1 for 0-index, inclusive + + + Sequence leftFlank = seq.subSequence(start - flanking, start + hmmPad); + Sequence repUnit = seq.subSequence(start + hmmPad, start + hmmPad + str.getPeriod()); + Sequence rightFlank = seq.subSequence(end, end + flanking); + + + leftFlank.setName("left_" + str.getID() + "_" + chrom + ":" + (start - flanking) + "-" + (start + hmmPad)); + repUnit.setName("rep_" + str.getID() + "_" + chrom + ":" + (start - flanking) + "-" + (start + hmmPad)); + rightFlank.setName("right_" + str.getID() + "_" + chrom + ":" + (start - flanking) + "-" + (start + hmmPad)); + + leftFlank.writeFasta(sequenceOut); + repUnit.writeFasta(sequenceOut); + rightFlank.writeFasta(sequenceOut); + + VNTRpThreeSM rep3FSM = new VNTRpThreeSM(leftFlank, repUnit, rightFlank); + VNTRpOneSM rep1FSM = new VNTRpOneSM(leftFlank, repUnit, rightFlank); + + + outOS.print("##"+str.getID()+"\n## "); + for (int x = 0; x < leftFlank.length();x++){ + outOS.print(leftFlank.charAt(x)); + } + + outOS.print("=="); + for (int x = 0; x < repUnit.length();x++){ + outOS.print(repUnit.charAt(x)); + } + + outOS.print("=="); + for (int x = 0; x < rightFlank.length();x++){ + outOS.print(rightFlank.charAt(x)); + } + outOS.println(); + + //run on 
the reference + //if (1==0) + { + Sequence refRepeat = seq.subSequence(start - flanking, end + flanking); + refRepeat.setName("reference"); + refRepeat.writeFasta(sequenceOut); + processRead(refRepeat, rep3FSM, fraction, outOS ); + processRead(refRepeat, rep1FSM, fraction, outOS ); + + + } + + SAMRecordIterator iter = reader.query(str.getParent(), start - flanking, end + flanking, false); + + //String fileName = prefix + "_" + str.getID() + "_i.fasta"; + //SequenceOutputStream os = SequenceOutputStream.makeOutputStream(fileName); + + //double var = 0; + TandemRepeatVariant trVar = new TandemRepeatVariant(); + trVar.setTandemRepeat(str); + + int readIndex = 0; + + readSequences.clear(); + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + // Check qualilty + if (rec.getMappingQuality() < qual) { + continue; + } + + // Only reads that fully span the repeat and flankings + int currentRefPos = rec.getAlignmentStart(); + if (currentRefPos > start - flanking) + continue; + if (rec.getAlignmentEnd() < end + flanking) + continue; + + readIndex ++; + //////////////////////////////////////////////////////////////////// + //assert currentRefBase < start + + Sequence readSeq = getReadPosition(rec, start - flanking, end + flanking); + if (readSeq == null) + continue; + + + String readName = readSeq.getName(); + String [] toks = readName.split("/",4); + + String polymerageRead = (toks.length > 1) ? toks[1] : toks[0]; + String subRead = (toks.length > 2) ? toks[2] : "_"; + //String alignSubRead = (toks.length > 3) ? 
toks[3] : "_"; + readSeq.setName(polymerageRead + "_" + subRead); + //readSeq.writeFasta(os); + + readSeq.writeFasta(sequenceOut); + //processRead(readSeq, dp, fraction, hmmFlank, hmmPad, period, outOS ); + processRead(readSeq, rep3FSM, fraction, outOS ); + processRead(readSeq, rep1FSM, fraction, outOS ); + //readSequences.add(readSeq); + }// while + iter.close(); + //os.close(); + + + + //ProfileDP dpBatch = new ProfileDP(hmmSeq, hmmFlank + hmmPad, hmmFlank + hmmPad + str.getPeriod() - 1);//-1 for 0-index, inclusive + //processBatch(readSequences, dpBatch, fraction, hmmFlank, hmmPad, period, outOS ); + + outOS.print(trVar.toString(headers)); + outOS.print('\n'); + }// for + + sequenceOut.close(); + reader.close(); + outOS.close(); + } + + static private void processRead(Sequence readSeq, ProfilePFSM dp, double fraction, SequenceOutputStream outOS ) throws IOException{ + + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + + + //outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + Emission bestState = dp.align(readSeq); + double alignScore = bestState.getScore(); + double bestIter = bestState.iteration + fraction; + outOS.print(readSeq.getName() + " " + alignScore + " " + bestIter); + outOS.println(); + + /*******************************************************************/ + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + } + + + + static private void processRead(Sequence readSeq, ProfileDP dp, double fraction, int hmmFlank, int hmmPad, int period, SequenceOutputStream outOS ) throws IOException{ + + MarkovExpert expert = new MarkovExpert(1); + double costM = 0; + for (int x = 0; x< readSeq.length();x++){ + int base = readSeq.getBase(x); + costM -= 
JapsaMath.log2(expert.update(base)); + } + + double backGround = costM / readSeq.length() - 0.1; + boolean pass = true; + + + outOS.print("Markov " + costM + "\t" + (costM / readSeq.length()) + "\n"); + + EmissionState bestState = dp.align(readSeq); + double alignScore = bestState.getScore(); + //System.out.println("Score " + alignScore + " vs " + readSeq.length()*2 + " (" + alignScore/readSeq.length() +")"); + double bestIter = bestState.getIter() + fraction; + + /*******************************************************************/ + profilePositions.clear(); + seqPositions.clear(); + costGeneration.clear(); + byteArray.clear(); + + //double oldCost = bestState.score; + EmissionState lastState = bestState; + bestState = bestState.bwdState; + + while (bestState != null){ + profilePositions.add(bestState.profilePos); + seqPositions.add(bestState.profilePos); + costGeneration.add(lastState.score - bestState.score); + + if (bestState.seqPos == lastState.seqPos) + byteArray.add((byte)Alphabet.DNA.GAP); + else + byteArray.add(readSeq.getBase(lastState.seqPos)); + + lastState = bestState; + bestState = bestState.bwdState; + } + + double costL = 0, costR = 0, costCurrentRep = 0, costRep = 0; + int stateL = 0, stateR = 0, stateCurrentRep = 0, stateRep = 0; + int baseL = 0, baseR = 0, baseCurrentRep = 0, baseRep = 0; + int bSeqL = 0, bSeqR = 0, bSeqCurrentRep = 0, bSeqRep = 0; + + int lastProfilePos = -1, lastSeqPos = -1; + + for (int x = profilePositions.size() - 1; x >=0; x--){ + outOS.print(Alphabet.DNA().int2char(byteArray.get(x))); + + int profilePos = profilePositions.get(x); + int seqPos = seqPositions.get(x); + + if (profilePos < hmmFlank + hmmPad){ + stateL ++; + costL += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseL ++; + + if (lastSeqPos != seqPos) + bSeqL ++; + + }else if(profilePos > hmmFlank + hmmPad + period){ + stateR ++; + costR += costGeneration.get(x); + + if (lastProfilePos != profilePos) + baseR ++; + + if (lastSeqPos != seqPos) + 
bSeqR ++; + }else{ + stateCurrentRep ++; + costCurrentRep += costGeneration.get(x); + + stateRep ++; + costRep += costGeneration.get(x); + + if (lastProfilePos != profilePos){ + baseRep ++; + baseCurrentRep ++; + } + + if (lastSeqPos != seqPos){ + bSeqRep ++; + bSeqCurrentRep ++; + } + + } + + //end of a repeat cycle + if (profilePos < lastProfilePos){ + outOS.print("<-----------------REP " + costCurrentRep + + " " + stateCurrentRep + + " " + (stateCurrentRep == 0?"inf": "" + (costCurrentRep/stateCurrentRep)) + + " " + baseCurrentRep + + " " + (baseCurrentRep == 0?"inf": "" + (costCurrentRep/baseCurrentRep)) + + " " + bSeqCurrentRep + + " " + (bSeqCurrentRep == 0?"inf": "" + (costCurrentRep/bSeqCurrentRep)) + ); + + if (costCurrentRep/bSeqCurrentRep > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + costCurrentRep = 0;//restart + stateCurrentRep = 0;//restart + baseCurrentRep = 0; + bSeqCurrentRep = 0; + } + + //left + if (profilePos >= hmmFlank + hmmPad && lastProfilePos < hmmFlank + hmmPad){ + outOS.print("<-----------------LEFT " + costL + + " " + stateL + + " " + (stateL == 0?"inf": "" + (costL/stateL)) + + " " + baseL + + " " + (baseL == 0?"inf": "" + (costL/baseL)) + + " " + bSeqL + + " " + (bSeqL == 0?"inf": "" + (costL/bSeqL)) + ); + if (costL/bSeqL > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + + outOS.println(); + + } + + //right + //if (profilePos < hmmFlank + hmmPad + period && lastProfilePos >= hmmFlank + hmmPad + period){ + // outOS.print("<-----------------RIGHT " + costR + // + " " + stateR + // + " " + (stateR == 0?"inf": "" + (costR/stateR)) + // + " " + baseR + // + " " + (baseR == 0?"inf": "" + (costR/baseR)) + // + " " + bSeqR + // + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + // ); + // outOS.println(); + //} + lastProfilePos = profilePos; + lastSeqPos = seqPos; + + }//for x + + //move to out of the loop + outOS.print("<-----------------RIGHT " 
+ costR + + " " + stateR + + " " + (stateR == 0?"inf": "" + (costR/stateR)) + + " " + baseR + + " " + (baseR == 0?"inf": "" + (costR/baseR)) + + " " + bSeqR + + " " + (bSeqR == 0?"inf": "" + (costR/bSeqR)) + ); + + if (costR/bSeqR > backGround){ + pass = false; + outOS.print(" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + } + //outOS.println(); + + outOS.println(); + outOS.print ("L = " + (costL/(hmmFlank + hmmPad)) + " R = " + costR/(dp.getProfileLength() - hmmFlank - hmmPad - period) + "\n"); + + /*****************************************************************/ + outOS.print("##" + readSeq.getName() +"\t"+bestIter+"\t"+readSeq.length() +"\t" +alignScore+"\t" + alignScore/readSeq.length() + '\t' + costM + "\t" + costM / readSeq.length() + "\t" + costL + "\t" + stateL + "\t" + costR + "\t" + stateR + "\t" + (alignScore - costL - costR) + "\t" + stateRep + "\t" + pass + '\n'); + outOS.print("==================================================================\n"); + } + + public static Sequence getReadPosition(SAMRecord rec, int startRef, int endRef){ + byte[] seqRead = rec.getReadBases();// + if (seqRead.length <= 1) + return null; + + int startRead = -1, endRead = -1; + + int refPos = rec.getAlignmentStart(); + int readPos = 0; + //currentRefPos <= startRead + + for (final CigarElement e : rec.getCigar().getCigarElements()) { + int length = e.getLength(); + switch (e.getOperator()) { + case H: + break; // ignore hard clips + case P: + break; // ignore pads + case S: + readPos += e.getLength(); + break; // soft clip read bases + case N: // N ~ D + case D: + refPos += length; + + if (startRead < 0 && refPos >= startRef){ + startRead = readPos; + } + + if (endRead < 0 && refPos >= endRef){ + endRead = readPos; + } + + break;// case + case I: + readPos += length; + break; + + case M: + case EQ: + case X: + if ((startRead < 0) && refPos + length >= startRef) { + startRead = readPos + startRef - refPos; + } + + if ((endRead < 0) && (refPos + length >= endRef)){ + endRead = 
readPos + endRef - refPos; + } + + refPos += length; + readPos += length; + break; + default: + throw new IllegalStateException( + "Case statement didn't deal with cigar op: " + + e.getOperator()); + }// case + if (refPos >= endRef) + break;//for + + }// for + if (startRead < 0 || endRead < 0){ + LOG.warn(" " + refPos + " " + readPos + " " + startRead + " " + endRead); + return null; + } + + Alphabet alphabet = Alphabet.DNA16(); + Sequence retSeq = new Sequence(alphabet, endRead - startRead + 1, rec.getReadName() + "/" + startRead + "_" + endRead); + + for (int i = 0; i < retSeq.length();i++){ + retSeq.setBase(i, alphabet.byte2index(seqRead[startRead + i])); + } + return retSeq; + + } + + /******************************************************************************* + + static int call(ArrayList seqList, int indexStart, int indexEnd) { + if (indexEnd <= indexStart) + return 0; + + // Get consensus + int gaps = 0; + Sequence nSeq = new Sequence(Alphabet.DNA6(), seqList.get(0).length(), + "consensus"); + int[] votes = new int[6]; + for (int i = 0; i < nSeq.length(); i++) { + Arrays.fill(votes, 0); + for (int s = indexStart; s < indexEnd; s++) { + votes[seqList.get(s).symbolAt(i)]++; + } + byte best = 0; + for (byte b = 1; b < 6; b++) + if (votes[b] > votes[best]) + best = b; + + nSeq.setBase(i, best); + if (best == 5) + gaps++; + }// for + return gaps; + } + + static int call(ArrayList seqList) { + return call(seqList,0,seqList.size()); + + } + /*******************************************************************************/ + +} diff --git a/src/main/java/japsa/tools/bio/np/BarCodeAnalysisCmd.java b/src/main/java/japsa/tools/bio/np/BarCodeAnalysisCmd.java new file mode 100644 index 0000000..d327b43 --- /dev/null +++ b/src/main/java/japsa/tools/bio/np/BarCodeAnalysisCmd.java @@ -0,0 +1,148 @@ +package japsa.tools.bio.np; + +import java.io.IOException; + +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import 
japsa.bio.np.barcode.BarCodeAnalysis; + +/** Command-line front end for barcode de-multiplexing of nanopore reads: declares the options and, in main, hands the parsed values to BarCodeAnalysis. */ +@Deployable( + scriptName = "jsa.np.barcode", + scriptDesc = "Clustering nanopore sequences based on barcode" + ) +public class BarCodeAnalysisCmd extends CommandLine{ + /* Sets usage/description from the @Deployable annotation and registers every option; bcFile and seqFile pass a trailing `true` (presumably "required" -- TODO confirm against CommandLine.addString). */ public BarCodeAnalysisCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("bcFile", null, "Barcode file",true); + addString("seqFile", null, "Nanopore sequences file",true); + addString("scriptRun", null, "Invoke command script to run npScarf"); + addDouble("threshold", 70, "Minimum identity(%) for barcode alignment"); + addDouble("distance", 4, "Minimum identity(%) distance between the best alignment to others"); + + addBoolean("twoends", false, "Whether a read must contain barcode sequence from both ends or just one end (default)"); + addBoolean("print", false, "Print out demultiplexed reads to corresponding FASTA file or not."); + addStdHelp(); + } + /* Entry point: parses args, reads the option values, sets the static print/twoends switches on BarCodeAnalysis before an instance is built, then runs clustering on seqFile. */ public static void main(String[] args) throws IOException, InterruptedException{ + CommandLine cmdLine = new BarCodeAnalysisCmd (); + args = cmdLine.stdParseLine(args); + + String bcFile = cmdLine.getStringVal("bcFile"); + String script = cmdLine.getStringVal("scriptRun"); + String seqFile = cmdLine.getStringVal("seqFile"); + Double threshold = cmdLine.getDoubleVal("threshold"), + distance = cmdLine.getDoubleVal("distance"); + + /* static fields: assigned on the class, not on the instance created below */ BarCodeAnalysis.print = cmdLine.getBooleanVal("print"); + BarCodeAnalysis.twoends = cmdLine.getBooleanVal("twoends"); + + + BarCodeAnalysis bc = new BarCodeAnalysis(bcFile,script); + bc.setThreshold(threshold); + bc.setDistance(distance); + bc.clustering(seqFile); + + + } +} +/*RST* +--------------------------------------------------------------------------- +*barcode*: real-time de-multiplexing Nanopore reads from barcode sequencing +--------------------------------------------------------------------------- + +*barcode* (jsa.np.barcode) is a
program that demultiplexes the nanopore reads from +Nanopore barcode sequencing. Downstream analysis can be invoked concurrently by an input script. + +*barcode* is included in the `Japsa package `_. + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of *barcode* usage can be obtained by invoking the --help option:: + + jsa.np.barcode --help + +===== +Input +===== + *barcode* takes 2 files as required input:: + + jsa.np.barcode -seq <nanopore reads> -bc <barcode.fasta> + +<*nanopore reads*> is either the long reads in a FASTA/FASTQ file (after MinION sequencing is +finished) or standard input (specified by "-", for real-time analysis). + +<*barcode.fasta*> is the FASTA file of barcode sequences (given by ONT) with names corresponding to the assigned sample ids. + +Missing either file will break the whole pipeline. + +In addition, one can provide <*analysis_script*>, which is the script called for further action on the de-multiplexed reads. It always takes one argument and must be +executable by invoking:: + + ./analysis_script <id> + +in which <*id*> is the identifier of a sample as given in <*barcode.fasta*>. The script should read the standard input +of long-read streams to do further analysis. + +*barcode* allows the user to set the minimum criteria for a hit against a barcode reference to be considered valid. The default value +is 70% for minimum identity. At the same time, a 4% distance between the best hit and the second best is necessary for differentiation. +Decreasing the thresholds will lead to more reads being clustered but with a higher risk of false positives, while more stringent parameters +will generate fewer but more confident demultiplexed reads. + +The user can also control the matching condition for barcode detection, either one-end match or both-end match. In the first case (default), +a legal maximal hit from one end of a read is enough to label it, while in the latter case we take into account a pair from both the 5' and 3' termini. +Thus the input for each use case should be different.
The one-end option can take the simple FASTA file of Nanopore barcodes, while the two-end option needs pairs of +barcodes to be specified (e.g. with _F and _R suffixes). A typical use case for two-end matching is when we want to detect the super-barcode, which also +includes tail and primer sequences in a pre-defined orientation. + +====== +Output +====== +*barcode* output depends on the <*analysis script*> because the de-multiplexed reads are streamed directly to its dedicated process. +If one is only interested in de-multiplexing, the script can be as simple as writing the stream to a file. For example: + +.. code-block:: bash + :linenos: + + #!/bin/bash + while read line + do + echo "$line" + done >> ${1}_script.fasta + +This is equivalent to enabling the *-p* option:: + + jsa.np.barcode -seq <nanopore reads> -bc <barcode.fasta> -script <analysis_script> -p + +which prints the de-multiplexed FASTA sequences to <*id*>\_clustered.fasta + +============================================ +Real-time scaffolding for barcode sequencing +============================================ + +One use case for barcode sequencing is to run *npscarf* on the resulting de-multiplexed reads. This can be done by calling a script +that takes an output folder of long reads from a sample to scaffold its corresponding short-read (e.g. SPAdes) assembly. +E.g. + +.. code-block:: bash + :linenos: + + #!/bin/bash + dirname=`find /coin/barcode/ -maxdepth 1 -type d -name "*${1}*" -print -quit` + + bwa index ${dirname}/contigs.fasta + + bwa mem -t 16 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y -K 10000 ${dirname}/contigs.fasta - 2> /dev/null | \ + jsa.np.npscarf -realtime -read 100 -time 1 -b - -seq ${dirname}/contigs.fasta -spadesDir ${dirname} -prefix ${1} > ${1}.log 2>&1 + +In this scenario, we assume the output SPAdes folders are located in one directory and the folder names contain the ID of the corresponding samples.
+ +*RST*/ \ No newline at end of file diff --git a/src/main/java/japsa/tools/bio/np/FastNanoporeReaderCmd.java b/src/main/java/japsa/tools/bio/np/FastNanoporeReaderCmd.java new file mode 100644 index 0000000..08f0822 --- /dev/null +++ b/src/main/java/japsa/tools/bio/np/FastNanoporeReaderCmd.java @@ -0,0 +1,124 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/***************************************************************************** + * Revision History + * 7 Aug 2015 - Minh Duc Cao: Created + * + ****************************************************************************/ +package japsa.tools.bio.np; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + +import japsa.seq.SequenceOutputStream; +import japsa.seq.nanopore.Fast5NPReader; +import japsa.util.CommandLine; +import japsa.util.JapsaException; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.np.fastnpreader", + scriptDesc = "Fast Extraction of Oxford Nanopore sequencing data in real-time" + ) +public class FastNanoporeReaderCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(FastNanoporeReaderCmd.class); + public FastNanoporeReaderCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + + addString("folder", null,"The folder containing base-called reads"); + addString("output", "-","Name of the output file, - for stdout"); + + addStdHelp(); + } + + public static void main(String[] args) throws OutOfMemoryError, Exception { + 
CommandLine cmdLine = new FastNanoporeReaderCmd(); + args = cmdLine.stdParseLine(args); + /**********************************************************************/ + + String output = cmdLine.getStringVal("output"); + String folder = cmdLine.getStringVal("folder"); + + SequenceOutputStream out = SequenceOutputStream.makeOutputStream(output); + readFastq(folder, out); + out.close(); + + }//main + + + public static void readFastq (String folderPath, SequenceOutputStream out) throws JapsaException, IOException{ + File mainFolder = new File(folderPath); + File passFolder = new File(folderPath + File.separatorChar + "pass"); + File failFolder = new File(folderPath + File.separatorChar + "fail"); + + ArrayList folders = new ArrayList(); + folders.add(mainFolder); + folders.add(passFolder); + folders.add(failFolder); + + for (File folder:folders){ + File [] fileList = folder.listFiles(); + if (fileList!=null){ + for (File f:fileList){ + //directory + if (!f.isFile()) + continue;//for + + if (!f.getName().endsWith("fast5")) + continue;//for + String sPath = f.getAbsolutePath(); + try{ + Fast5NPReader npReader = new Fast5NPReader(sPath); + npReader.readAllFastq(out); + npReader.close(); + }catch (JapsaException e){ + throw e; + }catch (Exception e){ + LOG.error("Problem with reading " + sPath + ":" + e.getMessage()); + e.printStackTrace(); + } + }//for f + }//if + }//for folder + + } +} + diff --git a/src/main/java/japsa/tools/bio/np/GapCloserCmd.java b/src/main/java/japsa/tools/bio/np/GapCloserCmd.java index 554dc88..d61ac0f 100644 --- a/src/main/java/japsa/tools/bio/np/GapCloserCmd.java +++ b/src/main/java/japsa/tools/bio/np/GapCloserCmd.java @@ -34,23 +34,33 @@ package japsa.tools.bio.np; +import japsa.bio.hts.scaffold.ContigBridge; import japsa.bio.hts.scaffold.RealtimeScaffolding; import japsa.bio.hts.scaffold.ScaffoldGraph; import japsa.bio.hts.scaffold.ScaffoldGraphDFS; -import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; import 
japsa.util.CommandLine; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.BufferedReader; +import java.io.File; import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** - * @author minhduc + * @author sonnguyen, minhduc * */ -@Deployable(scriptName = "jsa.np.gapcloser", -scriptDesc = "Scaffold and finish assemblies using Oxford Nanopore sequencing reads") +@Deployable( + scriptName = "jsa.np.npscarf", + scriptDesc = "Experimental Scaffold and finish assemblies using Oxford Nanopore sequencing reads", + seeAlso = "jsa.np.npreader, jsa.util.streamServer, jsa.util.streamClient" + ) public class GapCloserCmd extends CommandLine{ - //static Alphabet alphabet = Alphabet.DNA(); + private static final Logger LOG = LoggerFactory.getLogger(GapCloserCmd.class); public GapCloserCmd(){ super(); @@ -58,25 +68,37 @@ public GapCloserCmd(){ setUsage(annotation.scriptName() + " [options]"); setDesc(annotation.scriptDesc()); - addString("bamFile", null, "Name of the bam file", true); - addString("sequenceFile", null, "Name of the assembly file (sorted by length)",true); - addString("output", "-", "Name of the output file, - for stdout"); - addInt("threshold", 0, "Margin threshold: to limit distance to the contig's ends of the alignment used in bridging."); - addInt("minContig", 300, "Minimum contigs length that are used in scaffolding (default 200)."); - //addInt("minMarker", 300, "Minimum length of markers that are used in scaffolding (default 1000)."); + addString("seqFile", null, "Name of the assembly file (sorted by length)",true); + + addString("input", "-", "Name of the input file, - for stdin", true); + addString("format", "sam", "Format of the input: fastq/fasta or sam/bam", true); + addBoolean("index", true, "Whether to index the contigs sequence by the aligner or not."); + + addString("bwaExe", "bwa", "Path to bwa"); + addInt("bwaThread", 4, "Theads used by bwa"); + 
addBoolean("long", false, "Whether report all sequences, including short/repeat contigs (default) or only long/unique/completed sequences."); + addString("spadesDir", null, "Name of the output folder by SPAdes: assembly graph and paths will be used for better gap-filling."); + addString("prefix", "out", "Prefix for the output files"); + addString("genes", null , "Realtime annotation: name of annotated genes in GFF 3.0 format"); + addString("resistGene", null , "Realtime annotation: name of antibiotic resistance gene fasta file"); + addString("insertSeq", null , "Realtime annotation: name of IS fasta file"); + addString("oriRep", null, "Realtime annotation: name of fasta file containing possible origin of replication"); + //addInt("marginThres", 1000, "Margin threshold: to limit distance to the contig's ends of the alignment used in bridging."); + addInt("minContig", 300, "Minimum contigs length that are used in scaffolding."); + addInt("maxRepeat", 7500, "Maximum length of repeat in considering species."); + addDouble("cov", 0, "Expected average coverage of Illumina, <=0 to estimate"); addInt("qual", 1, "Minimum quality"); - addInt("support", 2, "Minimum supporting long read needed for a link between markers"); - addString("connect", null, "Name of the connection file"); - addString("stat", null, "Name of the stastistic file for Nanopore read alignment"); + addInt("support", 1, "Minimum supporting long read needed for a link between markers"); + addBoolean("realtime", false, "Process in real-time mode. 
Default is batch mode (false)"); addInt("read", 50, "Minimum number of reads between analyses"); - addInt("time", 30, "Minimum number of seconds between analyses"); + addInt("time", 10, "Minimum number of seconds between analyses"); addBoolean("verbose", false, "Turn on debugging mode"); addStdHelp(); - + } //static boolean hardClip = false; @@ -85,63 +107,306 @@ public static void main(String[] args) throws CommandLine cmdLine = new GapCloserCmd(); args = cmdLine.stdParseLine(args); - /**********************************************************************/ - String output = cmdLine.getStringVal("output"); - String bamFile = cmdLine.getStringVal("bamFile"); - String sequenceFile = cmdLine.getStringVal("sequenceFile"); - int threshold = cmdLine.getIntVal("threshold"); - - int minContig = cmdLine.getIntVal("minContig"); - if(minContig != 300) - ScaffoldGraph.minContigLength = minContig; - - int minSupport = cmdLine.getIntVal("support"); - if(minSupport != 2) - ScaffoldGraph.minSupportReads = minSupport; - - if(threshold != 0) - ScaffoldGraph.marginThres = threshold; - boolean verbose = cmdLine.getBooleanVal("verbose"); - if(verbose) - ScaffoldGraph.verbose = verbose; - + /***********************************************************************/ + String prefix = cmdLine.getStringVal("prefix"); + //String bamFile = cmdLine.getStringVal("bamFile"); + + String input = cmdLine.getStringVal("input"); + String bwaExe = cmdLine.getStringVal("bwaExe"); + int bwaThread = cmdLine.getIntVal("bwaThread"); + String format = cmdLine.getStringVal("format").toLowerCase(); + + String sequenceFile = cmdLine.getStringVal("seqFile"), + spadesFolder = cmdLine.getStringVal("spadesDir"), + + genesFile = cmdLine.getStringVal("genes"), + resistFile = cmdLine.getStringVal("resistGene"), + isFile = cmdLine.getStringVal("insertSeq"), + oriFile = cmdLine.getStringVal("oriRep"); + + + File graphFile = new File(spadesFolder+"/assembly_graph.fastg"), + pathFile = new 
File(spadesFolder+"/contigs.paths"); + + + //TODO: need to validate bwa if fastq is the input + if (format.startsWith("fastq") || + format.startsWith("fasta") || + format.startsWith("fq") || + format.startsWith("fa")){ + try{ + ProcessBuilder pb = new ProcessBuilder(bwaExe).redirectErrorStream(true); + Process process = pb.start(); + BufferedReader bf = SequenceReader.openFile(process.getInputStream()); + + + String line; + String version = ""; + Pattern versionPattern = Pattern.compile("^Version:\\s(\\d+\\.\\d+\\.\\d+).*"); + Matcher matcher=versionPattern.matcher(""); + + while ((line = bf.readLine())!=null){ + matcher.reset(line); + if (matcher.find()){ + version = matcher.group(1); + break;//while + } + + + } + bf.close(); + + if (version.length() == 0){ + LOG.error(bwaExe + " is not the right path to bwa. bwa is required"); + System.exit(1); + }else{ + LOG.info("bwa version: " + version); + if (version.compareTo("0.7.11") < 0){ + LOG.error(" Require bwa of 0.7.11 or above"); + System.exit(1); + } + } + + //run indexing + if(cmdLine.getBooleanVal("index")){ + LOG.info("bwa index running..."); + ProcessBuilder pb2 = new ProcessBuilder(bwaExe,"index",sequenceFile); + Process indexProcess = pb2.start(); + indexProcess.waitFor(); + LOG.info("bwa index finished!"); + } + }catch (IOException e){ + System.err.println(e.getMessage()); + System.exit(1); + } + + }else if (format.startsWith("sam") || format.startsWith("bam")){ + // no problem + }else{ + LOG.error("Unrecognized format: " + format); + System.exit(1); + } + + if(spadesFolder !=null && graphFile.exists() && pathFile.exists()) + LOG.info("===> Use assembly graph and path from SPAdes!"); + else{ + LOG.warn("Not found any legal SPAdes output folder, assembly graph thus not included!"); + spadesFolder=null; + } + + //ProcessBuilder pb = new ProcessBuilder(bwaExe, + // "mem", + // + // "-k", "21,33,55,77,99,127", + // "--careful", + // "--pe1-1", String.valueOf(sampleRecord.readFile1), + // "--pe1-2", 
String.valueOf(sampleRecord.readFile2), + // "-m", "60", + // "-t", String.valueOf(threads), + // "-o", assemblyPath + "spades_output_" + sampleID + // ); + + + int //marginThres = cmdLine.getIntVal("marginThres"), + minContig = cmdLine.getIntVal("minContig"), + minSupport = cmdLine.getIntVal("support"), + maxRepeat = cmdLine.getIntVal("maxRepeat"); + //if(marginThres < 0) + // LOG.exit("Marginal threshold must not be negative", 1); + if(minContig <= 0) { + LOG.error("Minimum contig length has to be positive"); + System.exit(1); + + }if(minSupport <= 0) { + LOG.error("Minimum supporting reads has to be positive"); + System.exit(1); + } + if(maxRepeat <= 0) { + LOG.error("Maximal possible repeat length has to be positive", 1); + } + + + ScaffoldGraph.minContigLength = minContig; + ScaffoldGraph.minSupportReads = minSupport; + ScaffoldGraph.maxRepeatLength = maxRepeat; + //ScaffoldGraph.marginThres = marginThres; + ScaffoldGraph.verbose = cmdLine.getBooleanVal("verbose"); + ScaffoldGraph.reportAll = !cmdLine.getBooleanVal("long"); + double cov = cmdLine.getDoubleVal("cov"); int qual = cmdLine.getIntVal("qual"); - boolean rt = cmdLine.getBooleanVal("realtime"); - int number = cmdLine.getIntVal("read"); - int time = cmdLine.getIntVal("time"); - /**********************************************************************/ + if(qual < 0) { + LOG.error("Phred score of quality has to be positive"); + System.exit(1); + } + + int number = cmdLine.getIntVal("read"), + time = cmdLine.getIntVal("time"); - SequenceOutputStream outOS = null, connectOS = null, statOS = null; - if (cmdLine.getStringVal("connect") != null){ - connectOS = SequenceOutputStream.makeOutputStream(cmdLine.getStringVal("connect")); + if(number <= 0) { + LOG.error("Number of reads has to be positive"); + System.exit(1); } - if (cmdLine.getStringVal("stat") != null){ - statOS = SequenceOutputStream.makeOutputStream(cmdLine.getStringVal("stat")); + if(time < 0) { + LOG.error("Sleeping time must not be negative"); 
+ System.exit(1); } + /**********************************************************************/ + ScaffoldGraph graph; + boolean rt = cmdLine.getBooleanVal("realtime"); + ContigBridge.relaxFilling(); if(rt){ - RealtimeScaffolding rtScaffolding = new RealtimeScaffolding(sequenceFile, "-"); - rtScaffolding.scaffolding(bamFile, number, time, cov/1.6, qual); + RealtimeScaffolding rtScaffolding = new RealtimeScaffolding(sequenceFile, genesFile, resistFile, isFile, oriFile, "-"); + graph = rtScaffolding.graph; + if(prefix != null) + graph.prefix = prefix; + if(spadesFolder!=null) + synchronized(graph){ + graph.readMore(spadesFolder+"/assembly_graph.fastg",spadesFolder+"/contigs.paths"); + } + if (cov <=0) + cov = ScaffoldGraph.estimatedCov; + + rtScaffolding.scaffolding2(input, number, time, cov/1.6, qual, format, bwaExe, bwaThread, sequenceFile); + } else{ - graph = new ScaffoldGraphDFS(sequenceFile); + graph = new ScaffoldGraphDFS(sequenceFile, genesFile, resistFile, isFile, oriFile); + if(spadesFolder!=null) + graph.readMore(spadesFolder+"/assembly_graph.fastg",spadesFolder+"/contigs.paths"); + if (cov <=0) - cov = graph.estimatedCov; - graph.makeConnections(bamFile, cov / 1.6, qual, connectOS, statOS); - - if (connectOS != null) - connectOS.close(); - if(statOS != null) - statOS.close(); - + cov = ScaffoldGraph.estimatedCov; + + graph.makeConnections2(input, cov / 1.6, qual, format, bwaExe, bwaThread, sequenceFile); + graph.connectBridges(); + if(prefix != null) + graph.prefix = prefix; + + ContigBridge.forceFilling(); + graph.printSequences(); } - outOS = SequenceOutputStream.makeOutputStream(output); - graph.printSequences(outOS); - //graph.printScaffoldSequence(outOS); - outOS.close(); - + } } + +/*RST* +---------------------------------------------------------------------------------- +*npScarf*: real-time scaffolder using SPAdes contigs and Nanopore sequencing reads +---------------------------------------------------------------------------------- + +*npScarf* 
(jsa.np.npscarf) is a program that connect contigs from a draft genomes +to generate sequences that are closer to finish. These pipelines can run on a single laptop +for microbial datasets. In real-time mode, it can be integrated with simple structural +analyses such as gene ordering, plasmid forming. + +*npScarf* is included in the `Japsa package `_. + + + +~~~~~~~~~~~~~~ +Usage examples +~~~~~~~~~~~~~~ + +A summary of *npScarf* usage can be obtained by invoking the --help option:: + + jsa.np.npscarf --help + +Input +===== + *npScarf* takes two files as required input:: + + jsa.np.npscarf -seq -input + +<*draft*> input is the FASTA file containing the pre-assemblies. Normally this +is the output from running SPAdes on Illumina MiSeq paired end reads. + +<*nanopore*> is either the long reads in FASTA/FASTQ file or SAM/BAM formated alignments +between them to <*draft*> file. We use BWA-MEM as the recommended aligner +with the fixed parameter set as follow:: + + bwa mem -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y > + +The input file format is specified by option --format. The default is FASTA/FASTQ in which +the path to BWA version 0.7.11 or newer is required. Remember to always *INDEXING* the +reference before running BWA:: + + bwa index + +Missing this step would break down the whole pipeline. + +Output +======= +*npScarf* output is specified by *-prefix* option. The default prefix is \'out\'. +Normally the tool generate two files: *prefix*.fin.fasta and *prefix*.fin.japsa which +indicate the result scaffolders in FASTA and JAPSA format. + +In realtime mode, if any annotation analysis is enabled, a file named +*prefix*.anno.japsa is generated instead. This file contains features detected after +scaffolding. + +Real-time scaffolding +===================== +To run *npScarf* in streaming mode:: + + jsa.np.npscarf -realtime [options] + +In this mode, the <*bam*> file will be processed block by block. 
The size of block +(number of BAM/SAM records) can be manipulated through option *-read* and *-time*. + +The idea of streaming mode is when the input <*nanopore*> file is retrieved in stream. +npReader is the module that provides such data from fast5 files returned from the real-time +base-calling cloud service Metrichor. Ones can run:: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npscarf --realtime -bwaExe= -bwaThread=10 -input - -seq > log.out 2>&1 + +For the same purpose, you can also invoke BWA-MEM explicitly as in the old version of *npScarf*, +In this case, option --format=SAM must be presented as follow: + + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ + bwa mem -t 10 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -a -Y -K 3000 - 2> /dev/null | \ + jsa.np.npscarf --realtime -input - -format=SAM -seq > log.out 2>&1 + +or if you have the whole set of Nanopore long reads already and want to emulate the +streaming mode:: + + jsa.np.timeEmulate -s 100 -i -output - | \ + jsa.np.npscarf --realtime -bwaExe= -bwaThread=10 -input - -seq > log.out 2>&1 + +Note that jsa.np.timeEmulate based on the field *timestamp* located in the read name line to +decide the order of streaming data. So if your input <*nanopore*> already contains the field, +you have to sort it:: + + jsa.seq.sort -i -o -sortKey=timestamp + +or if your file does not have the *timestamp* data yet, you can manually make ones. For example:: + + cat | \ + awk 'BEGIN{time=0.0}NR%4==1{printf "%s timestamp=%.2f\n", $0, time; time++}NR%4!=1{print}' \ + > <*nanopore-with-time*> + +Real-time annotation +==================== + +The tool includes usecase for streaming annotation. 
Ones can provides database of antibiotic +resistance genes and/or Origin of Replication in FASTA format for the analysis of gene ordering +and/or plasmid identifying respectively:: + + jsa.np.timeEmulate -s 100 -i -output - | \ + jsa.np.npscarf --realtime -bwaExe= -input - -seq -resistGene -oriRep > log.out 2>&1 + +Assembly graph +============== + +*npScarf* can read the assembly graph info from SPAdes to make the results more precise. +The results might be slightly deviate from the old version in term of number of final contigs:: + + jsa.np.npscarf --spadesFolder= + +where SPAdes_output_directory indicates the result folder of SPAdes, containing files such as contigs.fasta, +contigs.paths and assembly_graph.fastg. + *RST*/ diff --git a/src/main/java/japsa/tools/bio/np/NanoporeReadFilterCmd.java b/src/main/java/japsa/tools/bio/np/NanoporeReadFilterCmd.java index f3b8046..82fcdd1 100644 --- a/src/main/java/japsa/tools/bio/np/NanoporeReadFilterCmd.java +++ b/src/main/java/japsa/tools/bio/np/NanoporeReadFilterCmd.java @@ -50,9 +50,9 @@ * */ @Deployable( - scriptName = "jsa.np.filter", - scriptDesc = "Filter nanopore reads data from fastq file", - seeAlso = "jsa.np.f5reader, jsa.util.streamServer, jsa.util.streamClient") + scriptName = "jsa.np.filter", + scriptDesc = "Filter nanopore reads data from fastq file", + seeAlso = "jsa.np.npreader, jsa.util.streamServer, jsa.util.streamClient") public class NanoporeReadFilterCmd extends CommandLine{ public NanoporeReadFilterCmd(){ super(); @@ -60,19 +60,61 @@ public NanoporeReadFilterCmd(){ setUsage(annotation.scriptName() + " [options]"); setDesc(annotation.scriptDesc()); - addStdInputFile(); - addStdOutputFile(); - - addInt("lenMin", 0, "Minimum sequence length"); + CommandLine.Option inputOpt = + addString("input", null, "Name of the input file, - for standard input", true); + + CommandLine.Option outputOpt = + addString("output", null, "Name of the output file, - for standard output", true); + + CommandLine.Option 
lenMinOpt = + addInt("lenMin", 0, "Minimum sequence length"); + + //CommandLine.Option lenMaxOpt = addInt("lenMax", Integer.MAX_VALUE, "Minimum sequence length"); - addDouble("qualMin", 0, "Minimum average quality"); - addDouble("qualMax", 1000, "Maximum average quality"); - addBoolean("excl2D", false, "Exclude 2D reads"); - addBoolean("exclTemp", false, "Exclude template reads"); - addBoolean("exclComp", false, "Exclude complement reads"); + + CommandLine.Option qualMinOpt = + addDouble("qualMin", 0.0, "Minimum average quality"); + + //CommandLine.Option qualMaxOpt = + addDouble("qualMax", 1000.0, "Maximum average quality"); + + CommandLine.Option groupOpt = + addString("group", "", "Group need to be extracted, leave blank for selecting all groups"); + + CommandLine.Option excl2DOpt = + addBoolean("excl2D", false, "Exclude 2D reads"); + + CommandLine.Option exclTempOpt = + addBoolean("exclTemp", false, "Exclude template reads"); + + CommandLine.Option exclCompOpt = + addBoolean("exclComp", false, "Exclude complement reads"); + + //CommandLine.Option formatOpt = addString("format", "fastq", "Format of the output file"); addStdHelp(); + + + inputOpt.setGalaxySetting(new GalaxySetting("data", "fastqsanger",false)); + groupOpt.setGalaxySetting(new GalaxySetting("text", null, false)); + + lenMinOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + //lenMaxOpt.setGalaxySetting(new GalaxySetting("integer", null,false)); + qualMinOpt.setGalaxySetting(new GalaxySetting("double", null,false)); + //qualMaxOpt.setGalaxySetting(new GalaxySetting("double", null,false)); + + excl2DOpt.setGalaxySetting(new GalaxySetting("boolean", null,false)); + exclTempOpt.setGalaxySetting(new GalaxySetting("boolean", null,false)); + exclCompOpt.setGalaxySetting(new GalaxySetting("boolean", null,false)); + + + GalaxySetting outputGalaxy = new GalaxySetting("data", "fastqsanger",true); + //outputGalaxy.setLabel + outputOpt.setGalaxySetting(outputGalaxy); + + + 
setGalaxy(annotation.scriptName()); } public static void main(String[] args) throws IOException { @@ -90,6 +132,7 @@ public static void main(String[] args) throws IOException { boolean excludeTemplate = cmdLine.getBooleanVal("exclTemp"); boolean excludeComplement = cmdLine.getBooleanVal("exclComp"); + String group = cmdLine.getStringVal("group").trim(); String format = cmdLine.getStringVal("format"); @@ -98,7 +141,7 @@ public static void main(String[] args) throws IOException { SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(output); FastqReader reader = "-".equals(input)? (new FastqReader(System.in) ) - : (new FastqReader(input)); + : (new FastqReader(input)); FastqSequence seq; @@ -115,14 +158,14 @@ public static void main(String[] args) throws IOException { double qual = -1; //min quality - if (qualMin > 0){ + if (qualMin > 0.0){ qual = NanoporeReaderStream.averageQuality(seq); if (qual < qualMin) continue; } //max quality - if (qualMax < 1000){ + if (qualMax < 1000.0){ if (qual < 0) qual = NanoporeReaderStream.averageQuality(seq); if (qual >= qualMax) @@ -138,6 +181,19 @@ public static void main(String[] args) throws IOException { if (exclude2D && seq.getName().contains("twodim")) continue; + if (group.length() > 0){ + String [] toks = seq.getName().split(" "); + boolean match = false; + for (String tok:toks){ + if (tok.startsWith("group=")&&tok.substring(6).equals(group)){ + match = true; + break; + } + } + if (!match) + continue;//while + } + //done all the fitlering if (fastaOutput) seq.writeFasta(sos); @@ -157,9 +213,9 @@ public static void main(String[] args) throws IOException { --------------------------------------- *jsa.np.filter* filters sequencing data based on sequence read type, length and -quality. Examples of its usage can be found on jsa.np.f5reader_. +quality. Examples of its usage can be found on jsa.np.npreader_. 
-*RST*/ + *RST*/ diff --git a/src/main/java/japsa/tools/bio/np/NanoporeReaderCmd.java b/src/main/java/japsa/tools/bio/np/NanoporeReaderCmd.java index f4fb769..b01f235 100644 --- a/src/main/java/japsa/tools/bio/np/NanoporeReaderCmd.java +++ b/src/main/java/japsa/tools/bio/np/NanoporeReaderCmd.java @@ -34,25 +34,26 @@ ****************************************************************************/ package japsa.tools.bio.np; -import org.jfree.data.time.TimeTableXYDataset; - import japsa.seq.nanopore.NanoporeReaderStream; -import japsa.seq.nanopore.NanoporeReaderWindow; +import japsa.seq.nanopore.NanoporeReaderWindowFX; +import javafx.application.Application; import japsa.util.CommandLine; import japsa.util.JapsaException; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @author minhduc * */ @Deployable( - scriptName = "jsa.np.f5reader", - scriptDesc = "Extract and stream Oxford Nanopore sequencing data in real-time", + scriptName = "jsa.np.npreader", + scriptDesc = "Extract and stream Oxford Nanopore sequencing data in real-time. 
Demultiplexe included.", seeAlso = "jsa.np.filter, jsa.util.streamServer, jsa.util.streamClient,jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.np.rtResistGenes" ) -public class NanoporeReaderCmd extends CommandLine{ +public class NanoporeReaderCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(NanoporeReaderCmd.class); public NanoporeReaderCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -66,11 +67,13 @@ public NanoporeReaderCmd(){ addString("output", "-","Name of the output file, - for stdout"); addString("streams", null,"Stream output to some servers, format \"IP:port,IP:port\" (no spaces)"); addString("format", "fastq","Format of sequence reads (fastq or fasta)"); - addInt("minLength", 0,"Minimum read length"); + //addString("group", "","Group of base-called to be extracted ()"); + addInt("minLength", 1,"Minimum read length"); addBoolean("number", false,"Add a unique number to read name"); addBoolean("stats", false,"Generate a report of read statistics"); - addBoolean("time", false,"Extract the sequencing time of each read -- only work with Metrichor > 1.12"); - + addBoolean("time", false,"Extract the sequencing time of each read -- experimental"); + addBoolean("exhaustive", false,"Whether to traverse the input directory exhaustively (albacore) or lazily (metrichor)"); + addString("barcode", null,"The file containing all barcode sequences for demultiplexing."); addStdHelp(); } @@ -91,22 +94,20 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { boolean fail = cmdLine.getBooleanVal("fail"); String format = cmdLine.getStringVal("format"); String streamServers = cmdLine.getStringVal("streams"); - - //String pFolderName = cmdLine.getStringVal("pFolderName"); - //String f5list = cmdLine.getStringVal("f5list"); - //int interval = cmdLine.getIntVal("interval");//in second - //int age = cmdLine.getIntVal("age") * 1000;//in second + boolean exhaustive = 
cmdLine.getBooleanVal("exhaustive"); + String barcode = cmdLine.getStringVal("barcode"); int age = 20 * 1000;//cmdLine.getIntVal("age") * 1000;//in second int interval = 30; - String pFolderName = null; if (!GUI && folder == null){// && f5list == null){ - Logging.exit("Download folder need to be specified", 1); + System.err.println("Download folder need to be specified\n\n" + + cmdLine.usageString()); + System.exit(1); } NanoporeReaderStream reader = new NanoporeReaderStream(); - reader.getTime = time; + reader.getTimeStamp = time; reader.stats = stats; reader.number = number; reader.minLength = minLength; @@ -121,7 +122,12 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { reader.format = format.toLowerCase(); reader.realtime = realtime; reader.streamServers = streamServers; - NanoporeReaderWindow mGUI = null; + reader.exhaustive = exhaustive; + + if(barcode != null) + reader.updateDemultiplexFile(barcode); + + //NanoporeReaderWindowFX mGUI = null; if (GUI){ reader.realtime = true; @@ -129,41 +135,29 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { reader.stats = true;//GUI implies stats reader.ready = false;//wait for the command from GUI - TimeTableXYDataset dataset = new TimeTableXYDataset(); - mGUI = new NanoporeReaderWindow(reader,dataset); + NanoporeReaderWindowFX.setReader(reader); + Application.launch(NanoporeReaderWindowFX.class,args); - while (!reader.ready){ - Logging.info("NOT READY"); - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - Logging.info("GO"); - - new Thread(mGUI).start(); }else{ String msg = reader.prepareIO(); if (msg != null){ - Logging.exit(msg, 1); + LOG.error(msg); + System.exit(1); + } + try{ + LOG.info("Start reading" ); + reader.readFast5(); + }catch (JapsaException e){ + System.err.println(e.getMessage()); + e.getStackTrace(); + }catch (Exception e){ + throw e; + }finally{ + reader.close(); } } - //reader need to wait until 
ready to go - - //reader.sos = SequenceOutputStream.makeOutputStream(reader.output); - try{ - reader.readFastq(pFolderName); - }catch (JapsaException e){ - System.err.println(e.getMessage()); - e.getStackTrace(); - if (mGUI != null) - mGUI.interupt(e); - }catch (Exception e){ - throw e; - }finally{ - reader.close(); - } + + }//main } @@ -174,7 +168,7 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { *npReader*: real-time conversion and analysis of Nanopore sequencing data ------------------------------------------------------------------------- -*npReader* (jsa.np.f5reader) is a program that extracts Oxford Nanopore +*npReader* (jsa.np.npreader) is a program that extracts Oxford Nanopore sequencing data from FAST5 files, performs an initial analysis of the date and streams them to real-time analysis pipelines. These pipelines can run on the same computer or on computing clouds/high performance clusters. @@ -206,6 +200,10 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { The library is typically installed to *#/usr/lib/jni*. Enter this path when prompted for "Path to HDF library" during installation of Japsa. +HDF-View (https://www.hdfgroup.org/products/java/release/download.html) also +contains the neccessary library. Please install HDF-2.10.1 instead of the +latest version. + ~~~~~~~~~~~~~~ @@ -214,16 +212,16 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { A summary of npReader usage can be obtained by invoking the --help option:: - jsa.np.f5reader --help + jsa.np.npreader --help The simplest way to run *npReader* in GUI mode is by typing:: - jsa.np.f5reader -GUI -realtime + jsa.np.npreader -GUI -realtime and specify various options in the GUI. 
All of these options can be specified from the command line:: - jsa.np.f5reader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats + jsa.np.npreader -GUI -realtime -folder c:\Downloads\ -fail -output myrun.fastq --minLength 200 --stats npReader can run natively on a Windows laptop that runs the Metrichor agent. It can stream sequence data to multiple analysis pipelines on the same computer @@ -256,7 +254,7 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { Once these pipelines are ready, npReader can start streaming data off the MinION and the Metrichor agent to these pipelines:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output myrun.fastq \ --minLength 200 --streams server1IP:3456,server2IP:3457 One can run *npReader* on a computing cloud if the download folder (containing @@ -264,22 +262,30 @@ public static void main(String[] args) throws OutOfMemoryError, Exception { direct stream data to the pipelines without the need of *jsa.util.streamServer*:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ bwa mem -t 8 -k11 -W20 -r10 -A1 -B1 -O1 -E1 -L0 -Y -K 10000 index - | \ jsa.np.speciesTyping -bam - --index speciesIndex -output output.dat +*npReader* now supports barcode sequencing demultiplex. For this analysis, it +requires a FASTA file of barcode tag sequences and will classify output sequences +based on alignment. User can specify the threshold for alignment confidence from +the GUI. Demultiplexing results are illustrated as prefix Barcode::| +added to each output sequence name. + + jsa.np.npreader -GUI -barcode barcode.fasta + Japsa also provides *jsa.np.filter*, a tool to bin sequence data in groups of the user's liking. 
Like any other streamline tools, jsa.np.filter can run behind *jsa.util.streamServer* on a remote machine, or can get data directly from npReader via pipe:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ jsa.np.filter -input - -lenMin 2000 --qualMin 10 -output goodreads.fq One can also use *tee* to group data into different bins *in real-time* with *jsa.np.filter*:: - jsa.np.f5reader -realtime -folder c:\Downloads\ -fail -output - | \ + jsa.np.npreader -realtime -folder c:\Downloads\ -fail -output - | \ tee >(jsa.np.filter -input - -lenMax 2000 -output 0k2k.fq) \ >(jsa.np.filter -lenMin 2000 -lenMax 4000 -input - -output 2k4k.fq) \ >(jsa.np.filter -lenMin 4000 -lenMax 6000 -input - -output 4k6k.fq) \ diff --git a/src/main/java/japsa/tools/bio/np/RealtimeMLSTCmd.java b/src/main/java/japsa/tools/bio/np/RealtimeMLSTCmd.java index 592b34b..03006af 100644 --- a/src/main/java/japsa/tools/bio/np/RealtimeMLSTCmd.java +++ b/src/main/java/japsa/tools/bio/np/RealtimeMLSTCmd.java @@ -39,6 +39,8 @@ import japsa.bio.np.RealtimeMLST; import japsa.util.CommandLine; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @author minhduc @@ -47,9 +49,10 @@ @Deployable( scriptName = "jsa.np.rtMLST", scriptDesc = "Realtime Multi-Locus Strain Typing using Nanopore Sequencing data", - seeAlso = "jsa.np.f5reader, jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient" + seeAlso = "jsa.np.npreader, jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient" ) -public class RealtimeMLSTCmd extends CommandLine{ +public class RealtimeMLSTCmd extends CommandLine{ +// private static final Logger LOG = LoggerFactory.getLogger(RealtimeMLSTCmd.class); public RealtimeMLSTCmd(){ super(); Deployable annotation = 
getClass().getAnnotation(Deployable.class); diff --git a/src/main/java/japsa/tools/bio/np/RealtimeResistanceGeneCmd.java b/src/main/java/japsa/tools/bio/np/RealtimeResistanceGeneCmd.java index d449061..9ed9c07 100644 --- a/src/main/java/japsa/tools/bio/np/RealtimeResistanceGeneCmd.java +++ b/src/main/java/japsa/tools/bio/np/RealtimeResistanceGeneCmd.java @@ -47,7 +47,7 @@ @Deployable( scriptName = "jsa.np.rtResistGenes", scriptDesc = "Realtime identification of antibiotic resistance genes from Nanopore sequencing", - seeAlso = "jsa.np.f5reader, jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.util.streamServer, jsa.util.streamClient" + seeAlso = "jsa.np.npreader, jsa.np.rtSpeciesTyping, jsa.np.rtStrainTyping, jsa.util.streamServer, jsa.util.streamClient" ) public class RealtimeResistanceGeneCmd extends CommandLine{ public RealtimeResistanceGeneCmd(){ @@ -118,7 +118,8 @@ public static void main(String[] args) throws IOException, InterruptedException{ Setting up ~~~~~~~~~~ -Refer to real-time analyais page at https://github.com/mdcao/npAnalysis/ +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. 
*RST*/ diff --git a/src/main/java/japsa/tools/bio/np/RealtimeSpeciesTypingCmd.java b/src/main/java/japsa/tools/bio/np/RealtimeSpeciesTypingCmd.java index 0614297..3e490bd 100644 --- a/src/main/java/japsa/tools/bio/np/RealtimeSpeciesTypingCmd.java +++ b/src/main/java/japsa/tools/bio/np/RealtimeSpeciesTypingCmd.java @@ -47,7 +47,7 @@ @Deployable( scriptName = "jsa.np.rtSpeciesTyping", scriptDesc = "Realtime species typing using Nanopore Sequencing data", - seeAlso = "jsa.np.f5reader, jsa.np.rtStrainTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient" + seeAlso = "jsa.np.npreader, jsa.np.rtStrainTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient" ) public class RealtimeSpeciesTypingCmd extends CommandLine { @@ -57,16 +57,18 @@ public RealtimeSpeciesTypingCmd(){ setUsage(annotation.scriptName() + " [options]"); setDesc(annotation.scriptDesc()); - addString("output", "output.dat", "Output file"); + addString("output", "output.dat", "Output file, - for standard output"); addString("bamFile", null, "The bam file",true); addString("indexFile", null, "indexFile ",true); - addDouble("qual", 0, "Minimum alignment quality"); + addDouble("qual", 1, "Minimum alignment quality"); addBoolean("twodonly", false, "Use only two dimentional reads"); addInt("read", 50, "Minimum number of reads between analyses"); addInt("time", 30, "Minimum number of seconds between analyses"); + addBoolean("web", false, "Whether to use Web visualization."); + addBoolean("log", false, "Whether to write mapping details to species2reads.map."); addStdHelp(); } /** @@ -88,8 +90,10 @@ public static void main(String[] args) throws IOException, InterruptedException int time = cmdLine.getIntVal("time"); double qual = cmdLine.getDoubleVal("qual"); boolean twoOnly = cmdLine.getBooleanVal("twodonly"); - - RealtimeSpeciesTyping paTyping = new RealtimeSpeciesTyping(indexFile, output); + RealtimeSpeciesTyping.JSON = cmdLine.getBooleanVal("web"); + 
RealtimeSpeciesTyping.OUTSEQ = cmdLine.getBooleanVal("log"); + + RealtimeSpeciesTyping paTyping = new RealtimeSpeciesTyping(indexFile, output); paTyping.setMinQual(qual); paTyping.setTwoOnly(twoOnly); paTyping.typing(bamFile, number, time); @@ -105,15 +109,11 @@ public static void main(String[] args) throws IOException, InterruptedException using Oxford Nanopore sequencing in real-time. It reads data in SAM/BAM format of the alignments of sequence reads to a collection of species genomes. -We provide a genome collection of nearly 1500 bacterial species http://genomicsresearch.org/public/researcher/npAnalysis/SpeciesTyping.tar.gz -(or https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/SpeciesTyping.tar.gz). -Obtain them by:: - - wget https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/SpeciesTyping.tar.gz.tar.gz - tar zxvf SpeciesTyping.tar.gz - -which will generate three folders for the three species. - +We provide a genome collection of nearly 1500 bacterial species +on http://data.genomicsresearch.org/Projects/npAnalysis/. +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. 
+ ~~~~~~~~~~~~~~ @@ -140,7 +140,7 @@ public static void main(String[] args) throws IOException, InterruptedException and streams data to this pipeline using npReader: :: - jsa.np.f5reader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3456 + jsa.np.npreader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3456 *RST*/ diff --git a/src/main/java/japsa/tools/bio/np/RealtimeStrainTypingCmd.java b/src/main/java/japsa/tools/bio/np/RealtimeStrainTypingCmd.java index 951dce1..604c206 100644 --- a/src/main/java/japsa/tools/bio/np/RealtimeStrainTypingCmd.java +++ b/src/main/java/japsa/tools/bio/np/RealtimeStrainTypingCmd.java @@ -49,7 +49,7 @@ @Deployable( scriptName = "jsa.np.rtStrainTyping", scriptDesc = "Realtime strain typing using Nanopore sequencing data", - seeAlso = "jsa.np.f5reader, jsa.np.rtSpeciesTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient") + seeAlso = "jsa.np.npreader, jsa.np.rtSpeciesTyping, jsa.np.rtResistGenes, jsa.util.streamServer, jsa.util.streamClient") public class RealtimeStrainTypingCmd extends CommandLine{ public RealtimeStrainTypingCmd(){ super(); @@ -109,14 +109,10 @@ public static void main(String[] args) throws IOException, InterruptedException{ interval of 95%. We provide the gene databases for three bacterial species K. pneumoniae, -E. coli and S. aureus on http://genomicsresearch.org/public/researcher/npAnalysis/StrainTyping.tar.gz -(or https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/StrainTyping.tar.gz). -Obtain them by:: +E. coli and S. aureus on http://data.genomicsresearch.org/Projects/npAnalysis/. +Refer to the documentation at https://github.com/mdcao/npAnalysis/ for more +details. - wget https://swift.rc.nectar.org.au:8888/v1/AUTH_15574c7fb24c44b3b34069185efba190/npAnalysis/StrainTyping.tar.gz - tar zxvf StrainTyping.tar.gz - -which will generate three folders for the three species. 
@@ -142,7 +138,7 @@ can read from this file (note, this is not real-time analysis): and streams data to this pipeline using npReader: :: - jsa.np.f5reader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3457 + jsa.np.npreader -GUI -realtime -folder -fail -output data.fastq -stream serverAddress:3457 *RST*/ \ No newline at end of file diff --git a/src/main/java/japsa/tools/bio/np/RegulateTimeCmd.java b/src/main/java/japsa/tools/bio/np/RegulateTimeCmd.java index 8013850..3fbe5be 100644 --- a/src/main/java/japsa/tools/bio/np/RegulateTimeCmd.java +++ b/src/main/java/japsa/tools/bio/np/RegulateTimeCmd.java @@ -39,13 +39,15 @@ import japsa.seq.Alphabet; import japsa.seq.FastqReader; +import japsa.seq.FastaReader; import japsa.seq.FastqSequence; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -57,7 +59,9 @@ scriptDesc = "Regulate time" ) public class RegulateTimeCmd extends CommandLine { - public RegulateTimeCmd(){ + private static final Logger LOG = LoggerFactory.getLogger(RegulateTimeCmd.class); + + public RegulateTimeCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); setUsage(annotation.scriptName() + " [options]"); @@ -90,7 +94,8 @@ public static void main(String[] args) throws IOException { SequenceReader reader = SequenceReader.getReader(input); - boolean isFastq = (reader instanceof FastqReader); + boolean isFastq = (reader instanceof FastqReader), + isFasta = (reader instanceof FastaReader); Sequence seq; @@ -101,12 +106,15 @@ public static void main(String[] args) throws IOException { long numBase = 0; long timeStart = System.currentTimeMillis(); - Logging.info("Time start " + new Date(timeStart)); + LOG.info("Time start " + new Date(timeStart)); long reportTime = timeStart; while ((seq = 
reader.nextSequence(Alphabet.DNA()))!= null){ double cTime = 0; String [] toks = seq.getName().split(" "); + if(isFasta) + toks = seq.getDesc().split(" "); + try{ for (int i = 0; i < toks.length;i++){ if (toks[i].startsWith(sortKeyOptionPrefix)){ @@ -115,10 +123,10 @@ public static void main(String[] args) throws IOException { } } }catch (Exception e){ - Logging.error(e.getMessage()); + LOG.error(e.getMessage()); } if (cTime == 0){ - Logging.info("Not found timing for sequence " + seq.getName()); + LOG.info("Not found timing for sequence " + seq.getName()); continue; } if (firstReadTime == 0){ @@ -128,7 +136,7 @@ public static void main(String[] args) throws IOException { long reportTimeNow = System.currentTimeMillis(); if (reportTimeNow - reportTime >= 60000){ reportTime = reportTimeNow; - Logging.info(new Date(reportTime) + " : " + numRead + " reads " + numBase + " bases"); + LOG.info(new Date(reportTime) + " : " + numRead + " reads " + numBase + " bases"); } cTime = 1000* (cTime - firstReadTime) / scale;//scale and convert to milisecond diff --git a/src/main/java/japsa/tools/bio/phylo/XMDistance2Cmd.java b/src/main/java/japsa/tools/bio/phylo/XMDistance2Cmd.java new file mode 100644 index 0000000..a87ca84 --- /dev/null +++ b/src/main/java/japsa/tools/bio/phylo/XMDistance2Cmd.java @@ -0,0 +1,148 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 08/10/2012 - Minh Duc Cao: Revised + * + ****************************************************************************/ + +package japsa.tools.bio.phylo; + + +import java.io.BufferedReader; +import java.io.FileReader; +import java.util.ArrayList; + +import japsa.seq.Alphabet.DNA; +import japsa.seq.FastaReader; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.tools.bio.xm.ExpertModelCmd; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; +import japsa.xm.ExpertModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +@Deployable(scriptName = "jsa.phylo.xmdist2", +scriptDesc = "Generate a distances bet matrix from genomes (potentially not alignable") +public class XMDistance2Cmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(XMDistance2Cmd.class); + + public XMDistance2Cmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + //addStdInputFile(); + addString("output", "output", "Name of the file for output (distances in phylip format)"); + addInt("index", 0, "Index"); + + addInt("hashSize", 11, "Hash size"); + addInt("context", 15, "Length of the context"); + addInt("limit", 50, "Expert Limit"); + //addInt("thread", 1, "Number of threads"); + addDouble("threshold", 0.15, "Listen threshold"); + addInt("chance", 20, "Chances"); + addBoolean("binaryHash", false, "Use binary hash or not"); + addString("offsetType", "counts", + "Way of update offset/palindrome expert: possible value count, subs"); + addBoolean("optimise", false, + "Running in optimise mode, just report the entropy,recommended for long sequence"); + addInt("checkPoint", 10000000, "Frequency of check point"); + addString("hashType", "hash", + "Type of Hash table: hash=hashtable, 
sft=SuffixTree,sfa = SuffixArray"); + addBoolean("selfRep", true, + "Propose experts from the sequence to compressed?"); + + addStdHelp(); + } + //public static boolean adapt = false; + + + public static void main(String[] args) throws Exception { + CommandLine cmdLine = new XMDistance2Cmd(); + args = cmdLine.stdParseLine(args); + //ExpertModel eModel = ExpertModelCmd.getExpertModel(cmdLine); + + + BufferedReader bf = new BufferedReader(new FileReader("list")); + ArrayList list = new ArrayList(); + String str; + + while ( (str = bf.readLine())!=null){ + list.add(str.trim()); + } + bf.close(); + int index = cmdLine.getIntVal("index"); + + if(index >= list.size()){ + LOG.error("Wrong index (< " + list.size()+") : " + index); + System.exit(1); + } + + FastaReader sin = new FastaReader("data/" + list.get(index)); + Sequence mySeq = sin.nextSequence(DNA.DNA4()); + /**************************************************/ + sin.close(); + + Sequence [] mS = new Sequence[1]; + mS[0] = mySeq; + ExpertModel eModel = ExpertModelCmd.getExpertModel(cmdLine); + double myE = eModel.encode_optimise(mS); + + SequenceOutputStream outStr = + SequenceOutputStream.makeOutputStream(cmdLine.getStringVal("output")); + + outStr.print(mySeq.getName() + "\t" + mySeq.length() + "\t" +myE+"\n"); + + mS = new Sequence[2]; + for (int x = 0;x < index;x++){ + sin = new FastaReader("data/" + list.get(x)); + Sequence mateSeq = sin.nextSequence(DNA.DNA4()); + sin.close(); + + mS[0] = mySeq; + mS[1] = mateSeq; + + eModel = ExpertModelCmd.getExpertModel(cmdLine); + double e_ji = eModel.encode_optimise(mS); + + mS[1] = mySeq; + mS[0] = mateSeq; + eModel = ExpertModelCmd.getExpertModel(cmdLine); + double e_ij = eModel.encode_optimise(mS); + outStr.print(mateSeq.getName() + "\t" + mateSeq.length() + "\t" +e_ji+"\t" + e_ij + "\n"); + outStr.flush(); + } + outStr.close(); + } +} diff --git a/src/main/java/japsa/tools/bio/phylo/XMDistanceCmd.java b/src/main/java/japsa/tools/bio/phylo/XMDistanceCmd.java index 
2a12f69..e8d6b78 100644 --- a/src/main/java/japsa/tools/bio/phylo/XMDistanceCmd.java +++ b/src/main/java/japsa/tools/bio/phylo/XMDistanceCmd.java @@ -37,11 +37,12 @@ import japsa.seq.Alphabet; import japsa.seq.FastaReader; import japsa.seq.Sequence; -import japsa.tools.xm.ExpertModelCmd; +import japsa.tools.bio.xm.ExpertModelCmd; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; import japsa.xm.ExpertModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedOutputStream; import java.io.FileOutputStream; @@ -54,7 +55,8 @@ @Deployable(scriptName = "jsa.phylo.xmdist", scriptDesc = "Generate a distance matrix from genomes (potentially not alignable") -public class XMDistanceCmd extends CommandLine{ +public class XMDistanceCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(XMDistanceCmd.class); public XMDistanceCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -152,7 +154,7 @@ public static void main(String[] args) throws Exception { boolean finished = executor.awaitTermination(3, TimeUnit.DAYS); double [][] mtx = new double[seqs.size()] [seqs.size()]; - Logging.info("ALL DONE " + finished); + LOG.info("ALL DONE " + finished); for (int i = 0; i < seqs.size();i++){ mtx[i][i] = 0; for (int j = i+1; j < seqs.size(); j++){ @@ -231,16 +233,16 @@ public CompressSingle(CommandLine cmdLine, int i1, int i2) throws Exception{ public void run() { try { if (index2 < 0){ - Logging.info("Thread Single " + index1 + " started"); + LOG.info("Thread Single " + index1 + " started"); Sequence [] mS = new Sequence[1]; mS[0] = seqs.get(index1); double score = eModel.encode_optimise(mS); synchronized(resultSingle){ resultSingle[index1] = score; } - Logging.info("Thread Single " + index1 + " done!"); + LOG.info("Thread Single " + index1 + " done!"); }else{ - Logging.info("Thread GB " + index1 + " - " + index2 + " started"); + LOG.info("Thread GB " 
+ index1 + " - " + index2 + " started"); Sequence [] mS = new Sequence[2]; mS[0] = seqs.get(index1); mS[1] = seqs.get(index2); @@ -248,10 +250,10 @@ public void run() { synchronized(resultBG){ resultBG[index1][index2] = e_ij; } - Logging.info("Thread GB " + index1 + " - " + index2 + " done"); + LOG.info("Thread GB " + index1 + " - " + index2 + " done"); - Logging.info("Thread GB2 " + index2 + " - " + index1 + " started"); + LOG.info("Thread GB2 " + index2 + " - " + index1 + " started"); mS[0] = seqs.get(index2); mS[1] = seqs.get(index1); double e_ji = eModel.encode_optimise(mS); @@ -259,7 +261,7 @@ public void run() { synchronized(resultBG){ resultBG[index2][index1] = e_ji; } - Logging.info("Thread GB2 " + index2 + " - " + index1 + " done"); + LOG.info("Thread GB2 " + index2 + " - " + index1 + " done"); } } catch (Exception e) { // TODO Auto-generated catch block diff --git a/src/main/java/japsa/tools/bio/sim/SimHTSWithFSMCmd.java b/src/main/java/japsa/tools/bio/sim/SimHTSWithFSMCmd.java index efd28ba..d819cda 100644 --- a/src/main/java/japsa/tools/bio/sim/SimHTSWithFSMCmd.java +++ b/src/main/java/japsa/tools/bio/sim/SimHTSWithFSMCmd.java @@ -271,6 +271,7 @@ public static void main(String[] args) throws Exception{ static final char sep = '#'; static final String sepSTR = "#"; + static int relax = 20; static void eval(String bamFile){ SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); @@ -278,10 +279,10 @@ static void eval(String bamFile){ SAMRecordIterator samIter = samReader.iterator(); - int countRead = 0; - int TP = 0, FP = 0, FN =0 , dup = 0; + //int countRead = 0; + int TP = 0, FP = 0; while (samIter.hasNext()){ - countRead ++; + //countRead ++; SAMRecord samRecord = samIter.next(); String readName = samRecord.getReadName(); diff --git a/src/main/java/japsa/tools/bio/sim/SimulateCaptureCmd.java b/src/main/java/japsa/tools/bio/sim/SimulateCaptureCmd.java new file mode 100644 index 0000000..b21f8eb --- /dev/null +++ 
b/src/main/java/japsa/tools/bio/sim/SimulateCaptureCmd.java @@ -0,0 +1,544 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 28/05/2016 - Minh Duc Cao: Created + ****************************************************************************/ + +package japsa.tools.bio.sim; + + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Random; + + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; +import japsa.bio.sim.IlluminaSequencing; +import japsa.bio.sim.PacBioSequencing; +import japsa.seq.Genome; +import japsa.seq.Sequence; +import japsa.seq.SequenceOutputStream; +import japsa.util.CommandLine; +import japsa.util.Simulation; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.sim.capsim", + scriptDesc = "Simulate capture sequencing" + ) +public class SimulateCaptureCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(SimulateCaptureCmd.class); + + //CommandLine cmdLine; + public SimulateCaptureCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + //Input/output + addString("reference", null, "Name of genome to be ",true); + addString("probe", null, "File containing probes mapped to the reference in bam format"); + addString("logFile", "-", "Log file"); + addString("ID", "", "A unique ID for the data set"); + + //addString("fragment", null, "Output of fragment file"); + addString("miseq", null, "Name of read file if miseq is simulated"); + addString("pacbio", null, "Name of read file if pacbio is simulated"); + + //Fragment size distribution + addInt("fmedian", 2000 , "Median of fragment size at 
shearing"); + addDouble("fshape", 6, "Shape parameter of the fragment size distribution"); + + //addInt("smedian", 1300 , "Median of fragment size distribution"); + //addDouble("sshape", 6, "Shape parameter of the fragment size distribution"); + + //addInt("tmedian", 0 , "Median of target fragment size (the fragment size of the data).\n If specified, " + + // "will override fmedian and smedian.\n Othersise will be estimated"); + //addDouble("tshape", 0, "Shape parameter of the effective fragment size distribution"); + + addInt("num", 1000000, "Number of fragments "); + + //addDouble("mismatch",0.01,"probability of mismatches"); + //addDouble("deletion",0.01,"probability of deletion"); + //addDouble("insertion",0.01,"probability of insertion"); + //addDouble("extension",0.01,"probability of indel extention"); + //Specific parameter for each sequencing technology + + addInt("pblen", 30000, "PacBio: Average (polymerase) read length"); + + addInt("illen", 300, "Illumina: read length"); + //addString("ilmode", "pe", "Illumina: Sequencing mode: pe = paired-end, mp=mate-paired and se=singled-end"); + + addInt("seed", 0, "Random seed, 0 for a random seed"); + addStdHelp(); + } + public static void main(String [] args) throws IOException{ + CommandLine cmdLine = new SimulateCaptureCmd (); + args = cmdLine.stdParseLine(args); + + /**********************************************************************/ + String logFile = cmdLine.getStringVal("logFile"); + String probe = cmdLine.getStringVal("probe"); + String ID = cmdLine.getStringVal("ID"); + String referenceFile = cmdLine.getStringVal("reference"); + + //int smedian = cmdLine.getIntVal("smedian"); + //double sshape = cmdLine.getDoubleVal("sshape"); + + //int tmedian = cmdLine.getIntVal("tmedian"); + //double tshape = cmdLine.getDoubleVal("tshape"); + + int seed = cmdLine.getIntVal("seed"); + int num = cmdLine.getIntVal("num"); + + int pblen = cmdLine.getIntVal("pblen"); + int pbshape = 6; + + String miseq = 
cmdLine.getStringVal("miseq"); + String pacbio = cmdLine.getStringVal("pacbio"); + + if (miseq == null && pacbio == null){ + System.err.println("One of miseq or pacbio must be set\n" + cmdLine.usageString()); + System.exit(-1); + } + + if (miseq != null && pacbio != null){ + System.err.println("Only one of miseq or pacbio can be set\n" + cmdLine.usageString()); + System.exit(-1); + } + + int fmedian = cmdLine.getIntVal("fmedian"); + double fshape = cmdLine.getDoubleVal("fshape"); + + int flank = fmedian + (fmedian / 4); + double hybridizationRatio = 0.5; + + //double [] dist2 = null; + //if (tmedian <=0){ + // LOG.info("Estimating target distribution"); + // dist2 = new double[fmedian*6]; + //}else{ + // dist2 = new double[tmedian*6]; + // double max = 0; + // for (int i = 0; i < dist2.length;i++){ + // dist2[i] = Simulation.logLogisticPDF(i + 1, smedian, sshape); + // if (dist2[i] > max) + // max = dist2[i]; + // } + //} + + //double [] dist2 = new double[smedian*4]; + //double max = 0.0; + //for (int i = 0; i < dist2.length;i++){ + // dist2[i] = Simulation.logLogisticPDF(i + 1, smedian, sshape); + // if (dist2[i] > max) + // max = dist2[i]; + //} + + //Normalise to 1 + //for (int i = 0; i < dist2.length;i++){ + // //LOG.info("dist2 [" + i + "] = " + dist2[i] + " max = " + max + " after " + dist2[i] / max); + // dist2[i] = dist2[i] / max; + //} + + SequenceOutputStream miSeq1Fq = null, miSeq2Fq = null, pacbioFq = null; + + if (miseq != null){ + miSeq1Fq = SequenceOutputStream.makeOutputStream(miseq + "_1.fastq.gz"); + miSeq2Fq = SequenceOutputStream.makeOutputStream(miseq + "_2.fastq.gz"); + } + + if (pacbio != null){ + pacbioFq = SequenceOutputStream.makeOutputStream(pacbio + ".fastq.gz"); + } + + SequenceOutputStream + logOS = logFile.equals("-")? 
(new SequenceOutputStream(System.err)):(SequenceOutputStream.makeOutputStream(logFile)); + logOS.print("Parameters for simulation \n" + cmdLine.optionValues()); + + seed = Simulation.seed(seed); + Random rnd = new Random(seed); + + logOS.print("#Seed " + seed + "\n"); + + Genome genome = new Genome(); + genome.read(referenceFile); + ArrayList chrList = genome.chrList(); + + logOS.print("Read " + chrList.size() + " chr " + genome.getLength() + "bp\n" ); + + BitSet [] bitSets = null; + GenomicRegion genRegion = null; + SamReader samReader = null; + long [] accLen = null; + + if (probe != null){ + genRegion = new GenomicRegion(); + + SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); + samReader = SamReaderFactory.makeDefault().open(new File(probe)); + SAMRecordIterator samIter = samReader.iterator(); + LOG.info("Mark capturable regions"); + + bitSets = new BitSet[chrList.size()]; + for (int i = 0; i < bitSets.length;i++) + bitSets[i] = new BitSet(); + + //Mark regions from which fragments *may* be captured + while (samIter.hasNext()){ + SAMRecord sam = samIter.next(); + if (sam.getReadUnmappedFlag()) + continue; + + int start = sam.getAlignmentStart(); + int end = sam.getAlignmentEnd(); + + if ((end - start) < hybridizationRatio * sam.getReadLength()) + continue; + + int refIndex = sam.getReferenceIndex(); + //TODO: this part can be improved + bitSets[refIndex].set(Math.max(start - flank,0), end); + } + samIter.close(); + //LOG.info("Mark capturable regions -- done"); + for (int x=0; x < genome.chrList().size();x++){ + Sequence chrom = genome.chrList().get(x); + BitSet myBitSet = bitSets[x]; + int regionStart = -1; + for (int i = 0; i < chrom.length();i++){ + if (myBitSet.get(i) && (regionStart < 0)){ + //start of a new region + regionStart = i; + }else if (!myBitSet.get(i) && (regionStart >= 0)){ + //end of a region + genRegion.addRegion(x, regionStart, i - regionStart); + regionStart = -1; + } + } + if (regionStart >=0){ + 
genRegion.addRegion(x, regionStart, chrom.length() - regionStart); + } + }//for + //LOG.info("Mark capturable regions 2 -- done"); + }else{ + accLen = new long[chrList.size()]; + accLen[0] = chrList.get(0).length(); + LOG.info("Acc 0 " + accLen[0]); + for (int i = 1; i < accLen.length;i++){ + accLen[i] = accLen[i-1] + chrList.get(i).length(); + LOG.info("Acc " +i + " " + accLen[i]); + } + + } + + //if (fragment != null) + // sos = SequenceOutputStream.makeOutputStream(fragment); + + long numFragment = 0; + //actual number of fragments generated, including the non probed + long numGen = 0; + + long + fragmentRej1 = 0, + fragmentRej2 = 0, + fragmentRej3 = 0, + fragmentRej4 = 0; + + while (numFragment < num){ + numGen ++; + if (numGen % 1000000 == 0){ + LOG.info("Generated " + numGen + " selected " + numFragment + + "; reject1 = " + fragmentRej1 + + "; reject2 = " + fragmentRej2 + + "; reject3 = " + fragmentRej3 + + "; reject4 = " + fragmentRej4); + } + + //1. Generate the length of the next fragment + int fragLength = + Math.max((int) Simulation.logLogisticSample(fmedian, fshape, rnd), 50); + + //LOG.info("Gen0 " + fragLength); + + //2. 
Generate the position of the fragment + //toss the coin + double r = rnd.nextDouble(); + + int chrIndex = 0, chrPos = 0; + + if (genRegion != null){ + long p = (long) (r * genRegion.totLength); + int index = 0; + + while (p > genRegion.regions.get(index).accuLength){ + index ++; + } + + if (index > 0){ + p = p - genRegion.regions.get(index - 1).accuLength; + } + + if (p > genRegion.regions.get(index).length){ + LOG.error("Not expecting2 " + p + " vs " + index); + System.exit(1); + } + + chrPos = ((int) p) + genRegion.regions.get(index).position; + chrIndex = genRegion.regions.get(index).chrIndex; + }else{ + + long p = (long) (r * genome.getLength()); + int index = 0; + while (p > accLen[index]) + index ++; + //identify the chroms + if (index > 0){ + p = p - accLen[index - 1]; + } + + chrIndex = index; + chrPos = (int) p; + //LOG.info("Found " + index + " " + myP + " " + p); + } + if (chrPos > chrList.get(chrIndex).length()){ + LOG.error("Not expecting " + chrPos + " vs " + chrIndex); + System.exit(1); + } + + //Take the min of the frag length and the length to the end + fragLength = Math.min(fragLength, chrList.get(chrIndex).length() - chrPos); + if (fragLength < 50){ + LOG.warn("Whoops"); + continue; + } + + if (samReader != null){ + //if probe is provided, see if the fragment is rejected + if (!bitSets[chrIndex].get(chrPos)){ + //LOG.info("Reject0 " + fragLength); + + fragmentRej1 ++; + continue;//while + } + + /******************************************************************************* + SAMRecordIterator iter = samReader.query(chrList.get(chrIndex).getName(), chrPos, chrPos + fragLength, false); + int countProbe = 0; + int myEnd = 0; + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + + int start = sam.getAlignmentStart(); + if (start < myEnd) + continue; + + int end = sam.getAlignmentEnd(); + //probe can only bind if > 80% + //if ((end - start) < hybridizationRatio * sam.getReadLength()) + // continue;//while iter probe + + if (start < chrPos) + 
start = chrPos; + + if (end > chrPos + fragLength) + end = chrPos + fragLength; + + countProbe += (end - start + 1); + myEnd = end; + } + iter.close(); + + if (countProbe <= 0){ + //LOG.info("Reject1 " + fragLength + " " + countProbe); + fragmentRej2 ++; + continue; + } + double myOdd = countProbe * 4.0/ fragLength; + //myOdd = 4 * (myOdd - 0.1) / 0.9; + /*******************************************************************************/ + + /*******************************************************************************/ + SAMRecordIterator iter = samReader.query(chrList.get(chrIndex).getName(), chrPos, chrPos + fragLength, false); + int countProbe = 0; + + while (iter.hasNext()){ + SAMRecord sam = iter.next(); + + int start = sam.getAlignmentStart(); + int end = sam.getAlignmentEnd(); + + if (start < chrPos) + start = chrPos; + + if (end > chrPos + fragLength) + end = chrPos + fragLength; + + countProbe += (end - start + 1); + } + iter.close(); + + if (countProbe <= 0){ + //LOG.info("Reject1 " + fragLength + " " + countProbe); + fragmentRej2 ++; + continue; + } + double myOdd = (countProbe * 0.5 / fragLength) - 0.2; + /*******************************************************************************/ + + r = rnd.nextDouble(); + if (r > myOdd){ + //bad luck, rejected + //LOG.info("Reject2 " + fragLength + " " + countProbe); + fragmentRej3 ++; + //System.out.println("Rejected 3 " + myOdd + " vs " + r); + continue;//while + } + }//if + + //LOG.info("Gen1 " + fragLength); + //here, the fragment is captured + + //another round of selection + //double myOdd = 0; + //if (fragLength >= dist2.length) + // myOdd = dist2[dist2.length - 1]; + //else myOdd = dist2[fragLength]; + + + //if (rnd.nextDouble() > myOdd){ + // fragmentRej4 ++; + // continue; + //} + numFragment ++; + //numFragmentApp += count; + + //LOG.info("Gen2 " + fragLength); + //now that the fragment is to be sequenced + Sequence seq = chrList.get(chrIndex).subSequence(chrPos, chrPos + fragLength); + seq.setName(ID 
+ "_" + chrList.get(chrIndex).getName() + "_" + (chrPos + 1) + "_" +(chrPos + fragLength)); + + //if (sos != null) + // seq.writeFasta(sos); + + if (miSeq1Fq != null){ + IlluminaSequencing.simulatePaired(seq, miSeq1Fq, miSeq2Fq, rnd); + } + + if (pacbioFq != null){ + int readLen = Math.max((int) Simulation.logLogisticSample(pblen, pbshape, rnd), 50); + PacBioSequencing.simulatePacBio(seq, readLen, pacbioFq, rnd); + } + + } + + LOG.info("Generated " + numGen + " selected " + numFragment + + "; reject1 = " + fragmentRej1 + + "; reject2 = " + fragmentRej2 + + "; reject3 = " + fragmentRej3 + + "; reject4 = " + fragmentRej4); + + //if (sos != null) + // sos.close(); + + if (miSeq1Fq != null) + miSeq1Fq.close(); + + if (miSeq2Fq != null) + miSeq2Fq.close(); + + if (pacbioFq != null) + pacbioFq.close(); + + logOS.close(); + } + + /** + * Implement regions that may be capturable + * @author minhduc + * + */ + static class GenomicRegion{ + Genome genome; + long totLength = 0; + static class Region{ + int chrIndex; + int position; + int length; + long accuLength; + } + + ArrayList regions = new ArrayList(); + + Region addRegion(int cIndex, int pos, int length){ + Region region = new Region(); + region.chrIndex = cIndex; + region.position = pos; + region.length = length; + totLength += length; + region.accuLength = totLength; + + regions.add(region); + return region; + } + } + +} + + +/*RST* +---------------------------------------------------------------------------- +*capsim*: Simulating the Dynamics of Targeted Capture Sequencing with CapSim +---------------------------------------------------------------------------- + +*capsim* (jsa.sim.capsim) is a tool to simulate target capture sequencing. 
Its +simulates the dynamics of capture process + + + +~~~~~~~~~~~~~ +Usage samples +~~~~~~~~~~~~~ + +*RST*/ diff --git a/src/main/java/japsa/tools/bio/sim/SimulateGenomeCmd.java b/src/main/java/japsa/tools/bio/sim/SimulateGenomeCmd.java new file mode 100755 index 0000000..49e4601 --- /dev/null +++ b/src/main/java/japsa/tools/bio/sim/SimulateGenomeCmd.java @@ -0,0 +1,295 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * + ****************************************************************************/ + +/* Revision History + * 10/05/2012 - Minh Duc Cao: Created + * 10/06/2016: Revisit + ****************************************************************************/ +package japsa.tools.bio.sim; + +import japsa.seq.AbstractSequence; +import japsa.seq.Alphabet; +import japsa.seq.Sequence; +import japsa.seq.SequenceBuilder; +import japsa.seq.SequenceOutputStream; +import japsa.seq.SequenceReader; +import japsa.util.CommandLine; +import japsa.util.Simulation; +import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Random; + +/** + * Simulate a haploid genome from a reference + * + * @author minhduc + * + */ +@Deployable( + scriptName = "jsa.sim.genome", + scriptDesc = "Simulate genomes with variation from an existing genome" + ) +public class SimulateGenomeCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(SimulateGenomeCmd.class); + public SimulateGenomeCmd(){ + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addString("input", null, "Name of input file", true); + addString("output", null, "Prefix output file", true); + addString("logFile", "-", "Log file, - for stadard 
error"); + addString("sv", "", "List of deletions/insertions eg chr1:1000-1020:-,chr2:1300-1340:+"); + addDouble("snp", 0.0025, "SNP rate"); + addDouble("indel", 0.00028, "indel rate"); + //Defaults parameters from doi:10.1038/nature09534: + //5.9M SNPs, 650K indels (1-50) on 2.3Gb + addDouble("ext", 0.5, "indel extension rate"); + addInt("seed", 0, "Random seed, 0 for a random seed"); + + addStdHelp(); + + } + /** + * @param args + */ + + public static void main(String[] args) throws IOException { + CommandLine cmdLine = new SimulateGenomeCmd(); + args = cmdLine.stdParseLine(args); + + String inFile = cmdLine.getStringVal("input"); + String outFile = cmdLine.getStringVal("output"); + String logFile = cmdLine.getStringVal("logFile"); + + String svOption = cmdLine.getStringVal("sv"); + + double snp = cmdLine.getDoubleVal("snp"); + double indel = cmdLine.getDoubleVal("indel"); + double ext = cmdLine.getDoubleVal("ext"); + int seed = cmdLine.getIntVal("seed"); + + + + ArrayList svs = new ArrayList(); + String [] toks = svOption.trim().split(","); + + for (int i =0; i < toks.length;i++){ + svs.add(StructualVarition.parseSV(toks[i])); + } + + int svsIndex = 0; + + + //generate a random seed if need to + seed = Simulation.seed(seed); + Random rnd = new Random(seed); + + SequenceReader reader = SequenceReader.getReader(inFile); + + SequenceOutputStream logOS = logFile.equals("-")? 
+ (new SequenceOutputStream(System.err)) + : + (SequenceOutputStream.makeOutputStream(logFile)); + + + SequenceOutputStream outFasta = SequenceOutputStream + .makeOutputStream(outFile); + + //SequenceOutputStream outJsa = SequenceOutputStream + // .makeOutputStream(outFile + ".jsa"); + + AbstractSequence seq = null; + + logOS.print("#Seed " + seed + "\n"); + + StructualVarition sv = null; + if (svsIndex < svs.size()){ + sv = svs.get(svsIndex); + } + + while ((seq = reader.nextSequence(Alphabet.DNA())) != null){ + int length = seq.length(); + //Step 0: Introduce any structural variation first + int currentIndex = 0; + SequenceBuilder sb = new SequenceBuilder(Alphabet.DNA(), length + length/5, seq.getName()); + + logOS.print("#Start with " + seq.getName() + " " + seq.length() +"\n"); + + while (sv !=null && sv.chr.equals(seq.getName())){ + int start = sv.start; + int end = sv.end; + for (;currentIndex < start - 1 && currentIndex < seq.length();currentIndex++){ + sb.append(seq.getBase(currentIndex)); + } + + logOS.print("# " + sv.chr + ":" + sv.start + "-" + sv.end + ":" + ((sv.svType == StructualVarition.DELETION)?"-":"+") + "\n"); + + if (sv.svType == StructualVarition.DUPLICATION){ + for (int x = currentIndex;x < end-1;x++){ + sb.append(seq.getBase(x)); + } + //one more time + for (int x = currentIndex;x < end-1;x++){ + sb.append(seq.getBase(x)); + } + + }else if (sv.svType == StructualVarition.DELETION){ + //doing nothing + }else{ + LOG.error("Dont know what to do"); + } + currentIndex = end - 1; + + svsIndex ++; + if (svsIndex < svs.size()){ + sv = svs.get(svsIndex); + }else + sv = null; + + } + for (;currentIndex < seq.length();currentIndex++){ + sb.append(seq.getBase(currentIndex)); + } + + seq = sb; + //step 1: introduce white noise + logOS.print("#Restart with " + seq.getName() + " " + seq.length() +"\n"); + byte[] seqByte = new byte[length + length / 5]; + int currentInx = 0; + int numSNPs = 0; + int numIndels = 0; + + int index = 0; + for (; index < 
seq.length();) { + byte base = seq.getBase(index); + if (base >=4){ + seqByte[currentInx] = (byte) (rnd.nextInt(4)); + currentInx++; + }else{ + double val = rnd.nextDouble(); + if (val <= snp) {// SNPs + // An SNP + // Generate a random number between 0-2, then plus 1 and + // plus the index of the previous char + // to avoid generating the same nucleotide + seqByte[currentInx] = (byte) ((1 + rnd.nextInt(3) + base) % 4); + currentInx++; + numSNPs++; + logOS.print("SNP " + (index + 1) + " at " + currentInx + "\n"); + } else if (val <= snp + indel) {// indel + numIndels++; + val = rnd.nextDouble(); + if (val >= 0.5) { + int size = 1; + while (rnd.nextDouble() < ext) { + size++; + index++; + } + logOS.print("DEL " + (index + 1 - size) + " at " + currentInx + " of " + size + "\n"); + numIndels++; + // A deletion + // currentInx[seqIdx] --; + } else {// an insertion + seqByte[currentInx] = base; + currentInx++; + + int size = 0; + do { + seqByte[currentInx] = (byte) (rnd.nextInt(4)); + currentInx++; + size++; + } while (rnd.nextDouble() < ext); + logOS.print("INS " + (index + 1) + " at " + currentInx + " of " + size + "\n"); + numIndels++; + }// insert vs delete + } else {// Direct copy + seqByte[currentInx] = base; + currentInx++; + } + }// if + index++; + }// for index + // System.out.println("Insert reps done"); + Sequence nSeq = new Sequence(Alphabet.DNA4(), seqByte, currentInx, + seq.getName()); + nSeq.writeFasta(outFasta); + + logOS.print("Sequence " + seq.getName() + " : " + numSNPs + " SNPs " + + numIndels + " indels \n"); + + //JapsaAnnotation.write(nSeq, null, outJsa); + }// for + outFasta.close(); + logOS.close(); + //outJsa.close(); + } + + + + static class StructualVarition{ + static final int DELETION = 0; + static final int DUPLICATION = 1; + + String chr; + int svType; + int start; + int end; + + static StructualVarition parseSV(String str){ + StructualVarition sv = new StructualVarition(); + //example: chr1:1000-1020:-,chr2:1300-1340:+ + String [] toks 
= str.split(":"); + sv.chr = toks[0]; + + if (toks.length<2) + sv.svType = StructualVarition.DELETION; + else if (toks[2].charAt(0) == '-') + sv.svType = StructualVarition.DELETION; + else if (toks[2].charAt(0) == '+') + sv.svType = StructualVarition.DUPLICATION; + else + return null; + + toks = toks[1].split("-"); + + sv.start = (int) Double.parseDouble(toks[0]); + sv.end = (int) Double.parseDouble(toks[1]); + + return sv; + } + + } + +} diff --git a/src/main/java/japsa/tools/bio/tr/Fragment2TRVCmd.java b/src/main/java/japsa/tools/bio/tr/Fragment2TRVCmd.java index 2a39608..31458c3 100755 --- a/src/main/java/japsa/tools/bio/tr/Fragment2TRVCmd.java +++ b/src/main/java/japsa/tools/bio/tr/Fragment2TRVCmd.java @@ -133,10 +133,6 @@ public static void main(String[] args) throws Exception { /** * Assume both inserts and annotations are sorted by the start position - * - * @param iFile - * @param aFile - * @throws IOException */ static int bsearch(JapsaAnnotation anno, int start, int end, int gap){ @@ -270,7 +266,7 @@ static void runAnalysisAAA(ArrayList trvList, String[] ins CompareTRVCmd.compareStr(ansList, trvList,0,10000); }//for iFdx - String[] headers = {TandemRepeat.chrHd, TandemRepeat.startHd, TandemRepeat.endHd, + String[] headers = {TandemRepeat.chromHd, TandemRepeat.startHd, TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, TandemRepeatVariant.varHd, TandemRepeatVariant.confidenceHd, TandemRepeatVariant.meanHd, TandemRepeatVariant.stdHd}; out.write(("#H:" + headers[0]).getBytes()); @@ -396,7 +392,7 @@ static void runAnalysis(ArrayList trvList, String[] insert CompareTRVCmd.compareStr(ansList, trvList,0,10000); }//for iFdx - String[] headers = {TandemRepeat.chrHd, TandemRepeat.startHd, TandemRepeat.endHd, + String[] headers = {TandemRepeat.chromHd, TandemRepeat.startHd, TandemRepeat.endHd, TandemRepeat.periodHd, TandemRepeat.unitNoHd, TandemRepeatVariant.varHd, TandemRepeatVariant.confidenceHd, TandemRepeatVariant.meanHd, 
TandemRepeatVariant.stdHd}; out.print("#H:" + headers[0]); diff --git a/src/main/java/japsa/tools/bio/tr/ParseTRFCmd.java b/src/main/java/japsa/tools/bio/tr/ParseTRFCmd.java index d0234c1..7e51b1f 100644 --- a/src/main/java/japsa/tools/bio/tr/ParseTRFCmd.java +++ b/src/main/java/japsa/tools/bio/tr/ParseTRFCmd.java @@ -43,8 +43,9 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; @@ -58,7 +59,9 @@ */ @Deployable(scriptName = "jsa.trv.parseTRF", scriptDesc = "Parse trf output to jsa, bed or tr format") -public class ParseTRFCmd extends CommandLine{ +public class ParseTRFCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(ParseTRFCmd.class); + public ParseTRFCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -205,10 +208,10 @@ public static void main(String[] args) throws Exception { if (tr.getScore() > lastTR.getScore()){ anno.remove(lastTR); skip ++;retain --; - Logging.warn("Skip [" + lastTR.getStart() + " " + lastTR.getEnd() + "](" +lastTR.getScore()+") because of [" + tr.getStart() + " " + tr.getEnd() + "](" +tr.getScore() +")"); + LOG.warn("Skip [" + lastTR.getStart() + " " + lastTR.getEnd() + "](" +lastTR.getScore()+") because of [" + tr.getStart() + " " + tr.getEnd() + "](" +tr.getScore() +")"); }else{ skip ++; - Logging.warn("Skip [" + tr.getStart() + " " + tr.getEnd() + "](" +tr.getScore()+") because of [" + lastTR.getStart() + " " + lastTR.getEnd() + "](" +lastTR.getScore() +")"); + LOG.warn("Skip [" + tr.getStart() + " " + tr.getEnd() + "](" +tr.getScore()+") because of [" + lastTR.getStart() + " " + lastTR.getEnd() + "](" +lastTR.getScore() +")"); continue; } } @@ -223,7 +226,7 @@ public static void main(String[] args) throws Exception { write(seq, anno, 
out, format); out.close(); - Logging.info(" Retain " + retain + " records, skip " + skip + " because of overlapping and filter "+ filter + " of irrelevant period size"); + LOG.info(" Retain " + retain + " records, skip " + skip + " because of overlapping and filter "+ filter + " of irrelevant period size"); } /** diff --git a/src/main/java/japsa/tools/xm/ExpertModelCmd.java b/src/main/java/japsa/tools/bio/xm/ExpertModelCmd.java similarity index 92% rename from src/main/java/japsa/tools/xm/ExpertModelCmd.java rename to src/main/java/japsa/tools/bio/xm/ExpertModelCmd.java index 2d3a9c7..27dbc07 100755 --- a/src/main/java/japsa/tools/xm/ExpertModelCmd.java +++ b/src/main/java/japsa/tools/bio/xm/ExpertModelCmd.java @@ -32,7 +32,7 @@ * ****************************************************************************/ -package japsa.tools.xm; +package japsa.tools.bio.xm; import japsa.seq.Alphabet; import japsa.seq.Sequence; @@ -40,7 +40,9 @@ import japsa.util.CommandLine; import japsa.util.deploy.Deployable; import japsa.xm.ExpertModel; -import japsa.xm.expert.*; +import japsa.xm.expert.RepeatCountExpert; +import japsa.xm.expert.RepeatExpert; +import japsa.xm.expert.RepeatSubsExpert; import java.io.File; import java.util.Random; @@ -303,6 +305,31 @@ public static ExpertModel getExpertModel(CommandLine cmdLine) eModel.setCheckPoint(cmdLine.getIntVal("checkPoint")); return eModel; } +} -} +/*RST* +--------------------------------------------------------- +*Expert Model*: tool for compression of genomic sequences +--------------------------------------------------------- + +*jsa.xm.compress* in the implementation of the expert model (XM) algorithm for +compression of genomics sequences. The source code is included in the +`Japsa package `_. +Please see check the installation_ page for instructions. + +.. 
_installation: ../install.html + + + +~~~~~~~~ +Citation +~~~~~~~~ + +If you find XM useful for your research, please cite + +Cao MD, Dix TI, Allison L, and Mears C, +*A simple statistical algorithm for biological sequence compression*, +Data Compression Conference, 2007 (DCC'07), Snowbird, UT, pp43-52. + +*RST*/ diff --git a/src/main/java/japsa/tools/seq/AddAnnotationCmd.java b/src/main/java/japsa/tools/seq/AddAnnotationCmd.java index faf00f1..f345255 100644 --- a/src/main/java/japsa/tools/seq/AddAnnotationCmd.java +++ b/src/main/java/japsa/tools/seq/AddAnnotationCmd.java @@ -41,9 +41,9 @@ import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -52,7 +52,8 @@ */ @Deployable(scriptName = "jsa.seq.addanno", scriptDesc = "Add annotations to a Japsa file") -public class AddAnnotationCmd extends CommandLine{ +public class AddAnnotationCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(AddAnnotationCmd.class); public AddAnnotationCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -144,11 +145,11 @@ public static void main(String[] args) throws Exception { //mainAnno.sortFeatures(); mainAnno.write(out); }else{ - Logging.warn("The IDs are not identical, annotation not added!"); + LOG.warn("The IDs are not identical, annotation not added!"); } }else//if anno = null - Logging.warn("Annotations for " + mainAnno.getAnnotationID() + " not found"); + LOG.warn("Annotations for " + mainAnno.getAnnotationID() + " not found"); } out.close(); diff --git a/src/main/java/japsa/tools/seq/AlignmentEMCmd.java b/src/main/java/japsa/tools/seq/AlignmentEMCmd.java index 2b6285c..be59318 100644 --- a/src/main/java/japsa/tools/seq/AlignmentEMCmd.java +++ b/src/main/java/japsa/tools/seq/AlignmentEMCmd.java @@ -60,7 +60,7 @@ public AlignmentEMCmd(){ 
setUsage(annotation.scriptName() + " [options] seq1 seq2"); setDesc(annotation.scriptDesc()); - //addBoolean("reverse",false,"Reverse sort order"); + addInt("iteration",5,"Number of iteration"); addStdHelp(); } @@ -70,6 +70,8 @@ public static void main(String[] args) throws Exception{ CommandLine cmdLine = new AlignmentEMCmd(); args = cmdLine.stdParseLine(args); + int itNum = cmdLine.getIntVal("iteration"); + Alphabet dna = Alphabet.DNA(); if (args.length <2){ @@ -78,32 +80,54 @@ public static void main(String[] args) throws Exception{ System.exit(-1); } - Sequence mSeq = SequenceReader.getReader(args[0]).nextSequence(dna); - Sequence sSeq = SequenceReader.getReader(args[1]).nextSequence(dna); - - ProbFSM eDp = new ProbThreeSM(mSeq); - - int itNum = 10;//number of iteration - - Emission retState = null; - for (int x = 0; x < itNum;x++){ - eDp.resetCount(); - - retState = eDp.alignGenerative(sSeq); - double cost = retState.myCost; - System.out.println(eDp.updateCount(retState) + " states and " + cost + " bits " + sSeq.length() + "bp" ); - eDp.reEstimate(); - System.out.println("----------------------------------------------\n Total cost = " + cost); - eDp.showProb(); - System.out.println("============================================="); - } + SequenceReader readFile = SequenceReader.getReader(args[0]); + SequenceReader barcodeFile = SequenceReader.getReader(args[1]); + //Sequence mSeq = SequenceReader.getReader(args[0]).nextSequence(dna); + //Sequence sSeq = SequenceReader.getReader(args[1]).nextSequence(dna); + SequenceOutputStream out = SequenceOutputStream.makeOutputStream("-"); - eDp.printAlignment(retState, sSeq, out); - out.close(); + Sequence readSeq = null, barcodeSeq = null; + while ((readSeq = readFile.nextSequence(dna)) != null){ + while ((barcodeSeq = barcodeFile.nextSequence(dna)) != null){ + //forward run + ProbFSM eDp = new ProbThreeSM(readSeq); + Emission retState = null; + + for (int x = 0; x < itNum;x++){ + eDp.resetCount(); + retState = 
eDp.alignGenerative(barcodeSeq); + double cost = retState.myCost; + System.out.println(eDp.updateCount(retState) + " states and " + cost + " bits " + barcodeSeq.length() + "bp" ); + eDp.reEstimate(); + System.out.println("----------------------------------------------\n Total cost = " + cost); + eDp.showProb(); + System.out.println("============================================="); + } + + eDp.printAlignment(retState, barcodeSeq, out); + + barcodeSeq = Alphabet.DNA.complement(barcodeSeq); + eDp = new ProbThreeSM(readSeq); + retState = null; + + for (int x = 0; x < itNum;x++){ + eDp.resetCount(); + retState = eDp.alignGenerative(barcodeSeq); + double cost = retState.myCost; + System.out.println(eDp.updateCount(retState) + " states and " + cost + " bits " + barcodeSeq.length() + "bp" ); + eDp.reEstimate(); + System.out.println("----------------------------------------------\n Total cost = " + cost); + eDp.showProb(); + System.out.println("============================================="); + } + eDp.printAlignment(retState, barcodeSeq, out); + } + } - /***************************************************************/ + out.close(); + /***************************************************************/ } } diff --git a/src/main/java/japsa/tools/seq/AnnotateRegionsCmd.java b/src/main/java/japsa/tools/seq/AnnotateRegionsCmd.java index a923a9c..5ed28db 100644 --- a/src/main/java/japsa/tools/seq/AnnotateRegionsCmd.java +++ b/src/main/java/japsa/tools/seq/AnnotateRegionsCmd.java @@ -38,8 +38,9 @@ import japsa.seq.JapsaFileFormat; import japsa.seq.SequenceOutputStream; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; @@ -50,7 +51,9 @@ */ @Deployable(scriptName = "jsa.seq.annotate", scriptDesc = "Annotate a list of regions using some annotation such as RefSeq") -public class AnnotateRegionsCmd extends CommandLine{ +public class AnnotateRegionsCmd 
extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(AnnotateRegionsCmd.class); + public AnnotateRegionsCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -77,7 +80,7 @@ public static void main(String[] args) throws IOException { JapsaAnnotation annoAnno = annoF.readAnnotation(); while (( inputAnno = inputF.readAnnotation())!=null){ //Move the the next annotations if not match - Logging.info(inputAnno.getAnnotationID()); + LOG.info(inputAnno.getAnnotationID()); while (!annoAnno.getAnnotationID().equals(inputAnno.getAnnotationID())){ annoAnno = annoF.readAnnotation(); } diff --git a/src/main/java/japsa/tools/seq/AnnotateVCFCmd.java b/src/main/java/japsa/tools/seq/AnnotateVCFCmd.java index bcfbae7..dc5340a 100644 --- a/src/main/java/japsa/tools/seq/AnnotateVCFCmd.java +++ b/src/main/java/japsa/tools/seq/AnnotateVCFCmd.java @@ -44,7 +44,6 @@ import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; /** * @author minhduc diff --git a/src/main/java/japsa/tools/seq/BuildGeneDatabaseCmd.java b/src/main/java/japsa/tools/seq/BuildGeneDatabaseCmd.java index af3e211..93f9a40 100644 --- a/src/main/java/japsa/tools/seq/BuildGeneDatabaseCmd.java +++ b/src/main/java/japsa/tools/seq/BuildGeneDatabaseCmd.java @@ -37,13 +37,15 @@ import java.io.IOException; import java.util.HashMap; +import japsa.bio.BuildSequenceGroupDatabase; import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -55,6 +57,7 @@ scriptDesc = "Group genes based on their identity and build a database of gene family and their alleles" ) public class BuildGeneDatabaseCmd extends CommandLine{ + private static final Logger LOG = 
LoggerFactory.getLogger(BuildGeneDatabaseCmd.class); //CommandLine cmdLine; public BuildGeneDatabaseCmd(){ super(); @@ -80,7 +83,8 @@ public BuildGeneDatabaseCmd(){ * @throws Exception * @throws OutOfMemoryError */ - public static void main(String[] args) throws IOException, InterruptedException{ + public static void main(String[] args) throws IOException, InterruptedException{ + CommandLine cmdLine = new BuildGeneDatabaseCmd(); args = cmdLine.stdParseLine(args); @@ -98,35 +102,32 @@ public static void main(String[] args) throws IOException, InterruptedException{ double thresholdOption = cmdLine.getDoubleVal("threshold"); - BuildGeneDatabase.ratio = thresholdOption; - BuildGeneDatabase db = new BuildGeneDatabase(prefix); + BuildSequenceGroupDatabase.ratio = thresholdOption; + BuildSequenceGroupDatabase db = new BuildSequenceGroupDatabase(prefix); SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(outOption); - - HashMap myGenes; - //if (listOption == null){ + HashMap myGenes = new HashMap(); SequenceReader reader = SequenceReader.getReader(inputOption); Alphabet.DNA alphabet = Alphabet.DNA(); Sequence seq; - myGenes = new HashMap(); while ((seq = reader.nextSequence(alphabet)) != null){ - if (myGenes.size() >=number){ - Logging.info("BIG TER "); + if (myGenes.size() >= number){ + LOG.trace("BIG TER "); HashMap mapped = db.addGeneMap(myGenes, checkGeneID); for (String key:myGenes.keySet()){ Sequence keySeq = myGenes.get(key); String dbID = mapped.get(key); if (dbID != null) - Logging.info("Added " + key + " as "+ dbID +" G"); + LOG.trace("Added " + key + " as "+ dbID +" G"); else{ dbID = db.addGene(myGenes.get(key)); - Logging.info("Added " + key + " as "+ dbID +" B"); + LOG.trace("Added " + key + " as "+ dbID +" B"); }//else keySeq.setDesc("JSA=" + dbID+";"+keySeq.getDesc()); keySeq.writeFasta(sos); }//for key - Logging.info("BIG TER END " + db.geneDatabase.size()); + LOG.trace("BIG TER END " + db.geneDatabase.size()); myGenes.clear(); } @@ -134,16 
+135,15 @@ public static void main(String[] args) throws IOException, InterruptedException{ } reader.close(); - HashMap mapped = db.addGeneMap(myGenes, checkGeneID); for (String key:myGenes.keySet()){ Sequence keySeq = myGenes.get(key); String dbID = mapped.get(key); if (dbID != null) - Logging.info("Added " + key + " as "+ dbID +" G"); + LOG.info("Added " + key + " as "+ dbID +" G"); else{ dbID = db.addGene(myGenes.get(key)); - Logging.info("Added " + key + " as "+ dbID +" B"); + LOG.info("Added " + key + " as "+ dbID +" B"); }//else keySeq.setDesc("JSA=" + dbID+";"+keySeq.getDesc()); keySeq.writeFasta(sos); diff --git a/src/main/java/japsa/tools/seq/ExtractGeneSequenceCmd.java b/src/main/java/japsa/tools/seq/ExtractGeneSequenceCmd.java index 5882957..a7003a7 100644 --- a/src/main/java/japsa/tools/seq/ExtractGeneSequenceCmd.java +++ b/src/main/java/japsa/tools/seq/ExtractGeneSequenceCmd.java @@ -41,8 +41,9 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.io.IOException; @@ -56,7 +57,9 @@ */ @Deployable(scriptName = "jsa.seq.gff2fasta", scriptDesc = "Extract sequences from a gff annotation") -public class ExtractGeneSequenceCmd extends CommandLine{ +public class ExtractGeneSequenceCmd extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(ExtractGeneSequenceCmd.class); + public ExtractGeneSequenceCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -97,18 +100,18 @@ public static void extractGenes(String sequence, String gff, String type, int fl ArrayList annos = JapsaAnnotation.readMGFF(aReader,0,0,type); aReader.close(); - Logging.info("Read " + annos.size()); + LOG.info("Read " + annos.size()); SequenceReader reader = SequenceReader.getReader(sequence); Sequence seq = 
reader.nextSequence(Alphabet.DNA()); for (JapsaAnnotation anno:annos){ - //Logging.info("Read anno " + anno.numFeatures()); + //LOG.info("Read anno " + anno.numFeatures()); while (seq != null && !seq.getName().equals(anno.getAnnotationID())){ seq = reader.nextSequence(Alphabet.DNA()); } if (seq == null){ - Logging.error("Sequence " + anno.getAnnotationID() + " not found"); + LOG.error("Sequence " + anno.getAnnotationID() + " not found"); reader.close(); out.close(); System.exit(1); @@ -142,19 +145,19 @@ public static void extractGenes(String sequence, String gff, String type, int fl ArrayList annos = JapsaAnnotation.readMGFF(aReader,0,0,type); aReader.close(); - Logging.info("Read " + annos.size()); + LOG.info("Read " + annos.size()); SequenceReader reader = SequenceReader.getReader(sequence); Sequence seq = reader.nextSequence(Alphabet.DNA()); for (JapsaAnnotation anno:annos){ - //Logging.info("Read anno " + anno.numFeatures()); + //LOG.info("Read anno " + anno.numFeatures()); while (seq != null && !seq.getName().equals(anno.getAnnotationID())){ seq = reader.nextSequence(Alphabet.DNA()); } if (seq == null){ - Logging.error("Sequence " + anno.getAnnotationID() + " not found"); + LOG.error("Sequence " + anno.getAnnotationID() + " not found"); reader.close(); System.exit(1); } @@ -189,12 +192,12 @@ public static void extractGenes(String sequence, String gff, String type, int fl /*RST* ------------------------------------------- - *jsa.seq.gff2fasta*: Extract gene sequences +*jsa.seq.gff2fasta*: Extract gene sequences ------------------------------------------- - *jsa.seq.gff2fasta* extract the functional sequences (genes, CDS, etc) from +*jsa.seq.gff2fasta* extract the functional sequences (genes, CDS, etc) from a gff file and a sequence file. 
- *RST*/ +*RST*/ diff --git a/src/main/java/japsa/tools/seq/ExtractRefSeqGenes.java b/src/main/java/japsa/tools/seq/ExtractRefSeqGenes.java index e532f36..63f9c1f 100644 --- a/src/main/java/japsa/tools/seq/ExtractRefSeqGenes.java +++ b/src/main/java/japsa/tools/seq/ExtractRefSeqGenes.java @@ -34,14 +34,8 @@ package japsa.tools.seq; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; +import japsa.bio.BuildSequenceGroupDatabase; import japsa.seq.Alphabet; import japsa.seq.JapsaAnnotation; import japsa.seq.JapsaFeature; @@ -49,9 +43,19 @@ import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + /** * @author minhduc @@ -62,6 +66,8 @@ scriptDesc = "Extract gene sequences from refseq anotation and group them" ) public class ExtractRefSeqGenes extends CommandLine{ + private static final Logger LOG = LoggerFactory.getLogger(ExtractRefSeqGenes.class); + //CommandLine cmdLine; public ExtractRefSeqGenes(){ super(); @@ -105,7 +111,7 @@ public static void main(String[] args) throws IOException, InterruptedException{ prefix = System.currentTimeMillis() + ""; double thresholdOption = cmdLine.getDoubleVal("threshold"); - BuildGeneDatabase.ratio = thresholdOption; + BuildSequenceGroupDatabase.ratio = thresholdOption; geneOS = SequenceOutputStream.makeOutputStream(gene); processDB(db, annoType, familyOption, alleleOption, outOption, prefix); @@ -120,7 +126,7 @@ private static void processDB(String dbFile, String annoType, String family, Str HashSet 
organismSet = new HashSet(); HashSet stSet = new HashSet(); - BuildGeneDatabase geneDB = new BuildGeneDatabase(prefix); + BuildSequenceGroupDatabase geneDB = new BuildSequenceGroupDatabase(prefix); SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(outOption); String line = ""; @@ -132,7 +138,7 @@ private static void processDB(String dbFile, String annoType, String family, Str String strainID = toks[4]; //if (toks[2].equals(toks[1]) && toks[3].equals("")){ - // Logging.info(strainID + " Ignored because of no strain information"); + // LOG.info(strainID + " Ignored because of no strain information"); // continue; //} @@ -145,7 +151,7 @@ private static void processDB(String dbFile, String annoType, String family, Str double n50 = Double.parseDouble(toks[7]); if (n50 < 100000){ - Logging.info(strainID + " Ignored because of low n50 " + n50); + LOG.info(strainID + " Ignored because of low n50 " + n50); continue;//while } @@ -164,19 +170,19 @@ private static void processDB(String dbFile, String annoType, String family, Str strainName = strainName.replaceAll("__*", "_");//Make sure no double hyphen if (!organismSet.add(organismName)){ - Logging.info(strainID + " Ignored because of dupNAME\t" + organismName + "\t" + n50); + LOG.info(strainID + " Ignored because of dupNAME\t" + organismName + "\t" + n50); continue; } if (toks.length >= 14){ if (!toks[13].equals("0")){ - Logging.info(strainID + " Ignored because of not good ST " + toks[13]); + LOG.info(strainID + " Ignored because of not good ST " + toks[13]); continue;//while } String ST = toks[12]; if (!stSet.add(toks[11])){ - Logging.info(strainID + " Ignored because of dupST\t" + organismName + "\t" + n50 + " ST_" + ST + " (" + toks[11] +")"); + LOG.info(strainID + " Ignored because of dupST\t" + organismName + "\t" + n50 + " ST_" + ST + " (" + toks[11] +")"); continue; } @@ -190,15 +196,15 @@ private static void processDB(String dbFile, String annoType, String family, Str Sequence keySeq = 
myGenes.get(key); String dbID = mapped.get(key); if (dbID != null) - Logging.info("Added " + key + " as "+ dbID +" G"); + LOG.info("Added " + key + " as "+ dbID +" G"); else{ dbID = geneDB.addGene(myGenes.get(key)); - Logging.info("Added " + key + " as "+ dbID +" B"); + LOG.info("Added " + key + " as "+ dbID +" B"); }//else keySeq.setDesc("JSA=" + dbID+";"+keySeq.getDesc()); keySeq.writeFasta(sos); }//for key - Logging.info("BIG END " + strainName + " " + geneDB.geneDatabase.size()); + LOG.info("BIG END " + strainName + " " + geneDB.geneDatabase.size()); /*****************************************************/ System.out.println(strainID + "\t" + strainName + "\t" + species + "\t" + strain + "\t" + toks[8] + "\t" + n50); } @@ -269,7 +275,7 @@ private static void processDB(String dbFile, String annoType, String family, Str } if (seq == null){ - Logging.info("ERROR: not found sequence for " + anno.getAnnotationID()); + LOG.info("ERROR: not found sequence for " + anno.getAnnotationID()); continue; } diff --git a/src/main/java/japsa/tools/seq/SequenceExtractCmd.java b/src/main/java/japsa/tools/seq/SequenceExtractCmd.java index ca6cf38..33a6817 100644 --- a/src/main/java/japsa/tools/seq/SequenceExtractCmd.java +++ b/src/main/java/japsa/tools/seq/SequenceExtractCmd.java @@ -3,17 +3,21 @@ */ package japsa.tools.seq; -import java.io.IOException; -import java.util.ArrayList; import japsa.seq.Alphabet; import japsa.seq.Sequence; import japsa.seq.SequenceOutputStream; import japsa.seq.SequenceReader; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +import java.io.IOException; +import java.util.ArrayList; + /** * @author minhduc * @@ -23,6 +27,7 @@ scriptDesc = "Extract subsequences" ) public class SequenceExtractCmd extends CommandLine { + private static final Logger LOG = LoggerFactory.getLogger(SequenceExtractCmd.class); public SequenceExtractCmd(){ super(); 
Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -42,6 +47,7 @@ public SequenceExtractCmd(){ addStdHelp(); } + /** * @param args * @throws IOException @@ -72,7 +78,7 @@ public static void main(String[] args) throws IOException { Sequence seq = findSequence(seqs, chr); if (seq == null){ - Logging.error("Sequence " + chr + " not found"); + LOG.error("Sequence " + chr + " not found"); }else{ Sequence newSequence = seq.subSequence(start - 1, end); if (rev) @@ -89,8 +95,6 @@ public static void main(String[] args) throws IOException { } /** * Find a sequence with ID from a list - * @param seqHash - * @param id * @return */ static Sequence findSequence(ArrayList seqs, String id){ diff --git a/src/main/java/japsa/tools/seq/SequenceSortCmd.java b/src/main/java/japsa/tools/seq/SequenceSortCmd.java index 2e93d69..480ddec 100644 --- a/src/main/java/japsa/tools/seq/SequenceSortCmd.java +++ b/src/main/java/japsa/tools/seq/SequenceSortCmd.java @@ -106,7 +106,7 @@ public static void main(String[] args) throws IOException { if (sortKeyOption.equals("length")) seqL.keyCompare = seq.length(); else{ - String [] toks = seq.getName().split(" "); + String [] toks = (seq.getName() + " " +seq.getDesc()).split(" "); for (int i = 0; i < toks.length;i++){ if (toks[i].startsWith(sortKeyOptionPrefix)){ diff --git a/src/main/java/japsa/tools/util/DnaGraphToolCmd.java b/src/main/java/japsa/tools/util/DnaGraphToolCmd.java new file mode 100755 index 0000000..3c4ec71 --- /dev/null +++ b/src/main/java/japsa/tools/util/DnaGraphToolCmd.java @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (C) 2006-2010 Minh Duc Cao * + * * + * This program is free software; you can redistribute it and/or modify it * + * under the terms of the GNU General Public License as published by the Free * + * Software Foundation; either version 2 of the License, or (at your option) * + * any later version. 
This program is distributed in the hope that it will be * + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General * + * Public License for more details. * + * * + * You should have received a copy of the GNU General Public License along with* + * this program; if not, write to the Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +//This class is written by Julie Bernal and subsequently modified and maintained +//by Minh Duc Cao + +package japsa.tools.util; + +import japsa.bio.misc.dnaPlatform.gui.MainFrame; +import japsa.util.CommandLine; +import japsa.util.deploy.Deployable; + +import javax.swing.*; + +/** + *

+ * Title: DNA Graph Tool + *

+ * + *

+ * Description: This is the main class of the DNAPlatform + *

+ * + *

+ * Copyright: Copyright (c) 2005 + *

+ * + *

+ * Company: Monash + *

+ * + * @author Julie Bernal + * @version 1.0 + */ + +@Deployable(scriptName = "jsa.misc.dnaGraph", scriptDesc = "Visualisation") +public class DnaGraphToolCmd extends CommandLine{ + public DnaGraphToolCmd() { + super(); + Deployable annotation = getClass().getAnnotation(Deployable.class); + setUsage(annotation.scriptName() + " [options]"); + setDesc(annotation.scriptDesc()); + + addStdHelp(); + } + + public void show(){ + @SuppressWarnings("unused") + MainFrame frame = new MainFrame(); + + } + + + public static void main(String[] args){ + DnaGraphToolCmd cmdLine = new DnaGraphToolCmd(); + cmdLine.stdParseLine(args); + + /**********************************************************************/ + System.setProperty("java.awt.headless", "false"); + + try { + UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); + }catch (Exception e) { + e.printStackTrace(); + } + cmdLine.show(); + } + +} diff --git a/src/main/java/japsa/tools/util/StreamClientCmd.java b/src/main/java/japsa/tools/util/StreamClientCmd.java index 2d92997..635098e 100644 --- a/src/main/java/japsa/tools/util/StreamClientCmd.java +++ b/src/main/java/japsa/tools/util/StreamClientCmd.java @@ -34,15 +34,16 @@ package japsa.tools.util; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; import japsa.util.net.StreamClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; - import java.net.Socket; +import java.util.Date; /** * @author minhduc @@ -51,10 +52,11 @@ @Deployable( scriptName = "jsa.util.streamClient", scriptDesc = "Forward data from a stream input or a file over the network to a jsa.util.streamServer", - seeAlso = "jsa.util.streamServer, jsa.np.filter, jsa.np.f5reader" + seeAlso = "jsa.util.streamServer, jsa.np.filter, jsa.np.npreader" ) -public class StreamClientCmd extends CommandLine{ +public class StreamClientCmd extends CommandLine{ + private 
static final Logger LOG = LoggerFactory.getLogger(StreamClientCmd.class); public StreamClientCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -75,13 +77,14 @@ public StreamClientCmd(){ * @throws Exception * @throws OutOfMemoryError */ - public static void main(String[] args) throws IOException{ + public static void main(String[] args) throws IOException{ + CommandLine cmdLine = new StreamClientCmd(); args = cmdLine.stdParseLine(args); /**********************************************************************/ String input = cmdLine.getStringVal("input"); StreamClient client = new StreamClient(cmdLine.getStringVal("server")); - Logging.info("Connection established"); + LOG.info("Connection established at " + new Date()); InputStream ins = input.equals("-")? System.in : new FileInputStream(input); byte[] buffer = new byte[8192]; @@ -98,7 +101,7 @@ public static void main(String[] args) throws IOException{ socket.getOutputStream().write(buffer,0, ret); count ++; } catch (IOException e) { - Logging.info("Connection to " + socket.getRemoteSocketAddress() + " closed"); + LOG.info("Connection to " + socket.getRemoteSocketAddress() + " closed at "+ new Date()); socket.close(); } } diff --git a/src/main/java/japsa/tools/util/StreamServerCmd.java b/src/main/java/japsa/tools/util/StreamServerCmd.java index ff3eea3..769932e 100644 --- a/src/main/java/japsa/tools/util/StreamServerCmd.java +++ b/src/main/java/japsa/tools/util/StreamServerCmd.java @@ -34,14 +34,16 @@ package japsa.tools.util; import japsa.util.CommandLine; -import japsa.util.Logging; import japsa.util.deploy.Deployable; import java.io.IOException; import java.net.ServerSocket; import java.net.Socket; +import java.util.Date; import com.google.common.io.ByteStreams; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** @@ -51,10 +53,12 @@ @Deployable( scriptName = "jsa.util.streamServer", scriptDesc = "Listen for input from a stream and forward the streamed data to the 
standard output", - seeAlso = "jsa.util.streamClient, jsa.np.filter, jsa.np.f5reader" + seeAlso = "jsa.util.streamClient, jsa.np.filter, jsa.np.npreader" ) public class StreamServerCmd extends CommandLine{ - public static int DEFAULT_PORT = 3456; + private static final Logger LOG = LoggerFactory.getLogger(StreamServerCmd.class); + + public static int DEFAULT_PORT = 3456; public StreamServerCmd(){ super(); Deployable annotation = getClass().getAnnotation(Deployable.class); @@ -81,12 +85,12 @@ public static void main(String[] args) throws IOException, InterruptedException{ int port = cmdLine.getIntVal("port"); ServerSocket serverSocket = new ServerSocket(port); - Logging.info("Listen on " + serverSocket.getInetAddress() + ":" + serverSocket.getLocalPort()); - Socket clientSocket = serverSocket.accept(); - Logging.info("Connection establised"); + LOG.info("Listen on " + serverSocket.getInetAddress() + ":" + serverSocket.getLocalPort()); + Socket clientSocket = serverSocket.accept(); + LOG.info("Connection established at " + new Date()); ByteStreams.copy(clientSocket.getInputStream(), System.out); serverSocket.close(); - Logging.info("Connection closed"); + LOG.info("Connection closed at " + new Date()); } } diff --git a/src/main/java/japsa/util/BetaBinomialModel.java b/src/main/java/japsa/util/BetaBinomialModel.java index 66b4647..66d72e4 100644 --- a/src/main/java/japsa/util/BetaBinomialModel.java +++ b/src/main/java/japsa/util/BetaBinomialModel.java @@ -48,10 +48,21 @@ * */ public class BetaBinomialModel { - - + + static int psuedoCount = 1; - + + /*OK, let do the maths + * count_flank = #read in the flanks + * count_repeat = #reads in repeats + * count_all = count_flank + count_repeat + * + * assume count_flank follows Bin(count_all,p), so p would follow the + * beta distribution Beta(alpha,beta) which alpha= count_flank, beta=count_repeat + * + * + * + */ /** * Compute the ratio distribution of ps/pr where ps and pr are the params of * the binomial distributionsm 
from countR vs totR/countS vs totS @@ -65,25 +76,30 @@ * @return */ public static NormalDistribution ratioDistribution(double countR, double totR, double countS, double totS, int numSamples){ + + BetaDistribution betaR = + new BetaDistribution(countR + psuedoCount, totR - countR + psuedoCount); + + BetaDistribution betaS = + new BetaDistribution(countS + psuedoCount, totS - countS + psuedoCount); - BetaDistribution betaR = new BetaDistribution(countR + psuedoCount, totR - countR + psuedoCount); - BetaDistribution betaS = new BetaDistribution(countS + psuedoCount, totS - countS + psuedoCount); - + //betaR models the + double sum = 0, sq = 0; //int count = 0; for (int i = 0; i < numSamples; i++){ double r = betaR.sample(); - + if (r == 0 ){//wont happen{ i --;continue; } - - r = betaS.sample()/ r; - sum += r; - sq += r*r; + + double ratio = betaS.sample()/ r; + sum += ratio; + sq += ratio*ratio; } double mean = sum/numSamples; - + return new NormalDistribution(mean, Math.sqrt(sq/numSamples - mean*mean)); } /** @@ -93,7 +109,7 @@ public static void main(String[] args) { } - + /** * This class is copied from org.apache.commons.math3.distribution.SaddlePointExpansion * so that some methods (such as logBinomialProbability) are accessible from here. @@ -121,159 +137,159 @@ public static void main(String[] args) { */ static public final class SaddlePointExpansion { - /** 1/2 * log(2 π). */ - private static final double HALF_LOG_2_PI = 0.5 * FastMath.log(MathUtils.TWO_PI); + /** 1/2 * log(2 π). */ + private static final double HALF_LOG_2_PI = 0.5 * FastMath.log(MathUtils.TWO_PI); - /** exact Stirling expansion error for certain values. 
*/ - private static final double[] EXACT_STIRLING_ERRORS = { 0.0, /* 0.0 */ - 0.1534264097200273452913848, /* 0.5 */ - 0.0810614667953272582196702, /* 1.0 */ - 0.0548141210519176538961390, /* 1.5 */ - 0.0413406959554092940938221, /* 2.0 */ - 0.03316287351993628748511048, /* 2.5 */ - 0.02767792568499833914878929, /* 3.0 */ - 0.02374616365629749597132920, /* 3.5 */ - 0.02079067210376509311152277, /* 4.0 */ - 0.01848845053267318523077934, /* 4.5 */ - 0.01664469118982119216319487, /* 5.0 */ - 0.01513497322191737887351255, /* 5.5 */ - 0.01387612882307074799874573, /* 6.0 */ - 0.01281046524292022692424986, /* 6.5 */ - 0.01189670994589177009505572, /* 7.0 */ - 0.01110455975820691732662991, /* 7.5 */ - 0.010411265261972096497478567, /* 8.0 */ - 0.009799416126158803298389475, /* 8.5 */ - 0.009255462182712732917728637, /* 9.0 */ - 0.008768700134139385462952823, /* 9.5 */ - 0.008330563433362871256469318, /* 10.0 */ - 0.007934114564314020547248100, /* 10.5 */ - 0.007573675487951840794972024, /* 11.0 */ - 0.007244554301320383179543912, /* 11.5 */ - 0.006942840107209529865664152, /* 12.0 */ - 0.006665247032707682442354394, /* 12.5 */ - 0.006408994188004207068439631, /* 13.0 */ - 0.006171712263039457647532867, /* 13.5 */ - 0.005951370112758847735624416, /* 14.0 */ - 0.005746216513010115682023589, /* 14.5 */ - 0.005554733551962801371038690 /* 15.0 */ - }; + /** exact Stirling expansion error for certain values. 
*/ + private static final double[] EXACT_STIRLING_ERRORS = { 0.0, /* 0.0 */ + 0.1534264097200273452913848, /* 0.5 */ + 0.0810614667953272582196702, /* 1.0 */ + 0.0548141210519176538961390, /* 1.5 */ + 0.0413406959554092940938221, /* 2.0 */ + 0.03316287351993628748511048, /* 2.5 */ + 0.02767792568499833914878929, /* 3.0 */ + 0.02374616365629749597132920, /* 3.5 */ + 0.02079067210376509311152277, /* 4.0 */ + 0.01848845053267318523077934, /* 4.5 */ + 0.01664469118982119216319487, /* 5.0 */ + 0.01513497322191737887351255, /* 5.5 */ + 0.01387612882307074799874573, /* 6.0 */ + 0.01281046524292022692424986, /* 6.5 */ + 0.01189670994589177009505572, /* 7.0 */ + 0.01110455975820691732662991, /* 7.5 */ + 0.010411265261972096497478567, /* 8.0 */ + 0.009799416126158803298389475, /* 8.5 */ + 0.009255462182712732917728637, /* 9.0 */ + 0.008768700134139385462952823, /* 9.5 */ + 0.008330563433362871256469318, /* 10.0 */ + 0.007934114564314020547248100, /* 10.5 */ + 0.007573675487951840794972024, /* 11.0 */ + 0.007244554301320383179543912, /* 11.5 */ + 0.006942840107209529865664152, /* 12.0 */ + 0.006665247032707682442354394, /* 12.5 */ + 0.006408994188004207068439631, /* 13.0 */ + 0.006171712263039457647532867, /* 13.5 */ + 0.005951370112758847735624416, /* 14.0 */ + 0.005746216513010115682023589, /* 14.5 */ + 0.005554733551962801371038690 /* 15.0 */ + }; - /** - * Default constructor. - */ - private SaddlePointExpansion() { - super(); - } + /** + * Default constructor. + */ + private SaddlePointExpansion() { + super(); + } - /** - * Compute the error of Stirling's series at the given value. - *

- * References: - *

    - *
  1. Eric W. Weisstein. "Stirling's Series." From MathWorld--A Wolfram Web - * Resource. - * http://mathworld.wolfram.com/StirlingsSeries.html
  2. - *
- *

- * - * @param z the value. - * @return the Striling's series error. - */ - static double getStirlingError(double z) { - double ret; - if (z < 15.0) { - double z2 = 2.0 * z; - if (FastMath.floor(z2) == z2) { - ret = EXACT_STIRLING_ERRORS[(int) z2]; - } else { - ret = Gamma.logGamma(z + 1.0) - (z + 0.5) * FastMath.log(z) + - z - HALF_LOG_2_PI; - } - } else { - double z2 = z * z; - ret = (0.083333333333333333333 - - (0.00277777777777777777778 - - (0.00079365079365079365079365 - - (0.000595238095238095238095238 - - 0.0008417508417508417508417508 / - z2) / z2) / z2) / z2) / z; - } - return ret; - } + /** + * Compute the error of Stirling's series at the given value. + *

+ * References: + *

    + *
  1. Eric W. Weisstein. "Stirling's Series." From MathWorld--A Wolfram Web + * Resource. + * http://mathworld.wolfram.com/StirlingsSeries.html
  2. + *
+ *

+ * + * @param z the value. + * @return the Striling's series error. + */ + static double getStirlingError(double z) { + double ret; + if (z < 15.0) { + double z2 = 2.0 * z; + if (FastMath.floor(z2) == z2) { + ret = EXACT_STIRLING_ERRORS[(int) z2]; + } else { + ret = Gamma.logGamma(z + 1.0) - (z + 0.5) * FastMath.log(z) + + z - HALF_LOG_2_PI; + } + } else { + double z2 = z * z; + ret = (0.083333333333333333333 - + (0.00277777777777777777778 - + (0.00079365079365079365079365 - + (0.000595238095238095238095238 - + 0.0008417508417508417508417508 / + z2) / z2) / z2) / z2) / z; + } + return ret; + } - /** - * A part of the deviance portion of the saddle point approximation. - *

- * References: - *

    - *
  1. Catherine Loader (2000). "Fast and Accurate Computation of Binomial - * Probabilities.". - * http://www.herine.net/stat/papers/dbinom.pdf
  2. - *
- *

- * - * @param x the x value. - * @param mu the average. - * @return a part of the deviance. - */ - static double getDeviancePart(double x, double mu) { - double ret; - if (FastMath.abs(x - mu) < 0.1 * (x + mu)) { - double d = x - mu; - double v = d / (x + mu); - double s1 = v * d; - double s = Double.NaN; - double ej = 2.0 * x * v; - v = v * v; - int j = 1; - while (s1 != s) { - s = s1; - ej *= v; - s1 = s + ej / ((j * 2) + 1); - ++j; - } - ret = s1; - } else { - ret = x * FastMath.log(x / mu) + mu - x; - } - return ret; - } + /** + * A part of the deviance portion of the saddle point approximation. + *

+ * References: + *

    + *
  1. Catherine Loader (2000). "Fast and Accurate Computation of Binomial + * Probabilities.". + * http://www.herine.net/stat/papers/dbinom.pdf
  2. + *
+ *

+ * + * @param x the x value. + * @param mu the average. + * @return a part of the deviance. + */ + static double getDeviancePart(double x, double mu) { + double ret; + if (FastMath.abs(x - mu) < 0.1 * (x + mu)) { + double d = x - mu; + double v = d / (x + mu); + double s1 = v * d; + double s = Double.NaN; + double ej = 2.0 * x * v; + v = v * v; + int j = 1; + while (s1 != s) { + s = s1; + ej *= v; + s1 = s + ej / ((j * 2) + 1); + ++j; + } + ret = s1; + } else { + ret = x * FastMath.log(x / mu) + mu - x; + } + return ret; + } - /** - * Compute the logarithm of the PMF for a binomial distribution - * using the saddle point expansion. - * - * @param x the value at which the probability is evaluated. - * @param n the number of trials. - * @param p the probability of success. - * @param q the probability of failure (1 - p). - * @return log(p(x)). - */ - static public double logBinomialProbability(int x, int n, double p, double q) { - double ret; - if (x == 0) { - if (p < 0.1) { - ret = -getDeviancePart(n, n * q) - n * p; - } else { - ret = n * FastMath.log(q); - } - } else if (x == n) { - if (q < 0.1) { - ret = -getDeviancePart(n, n * p) - n * q; - } else { - ret = n * FastMath.log(p); - } - } else { - ret = getStirlingError(n) - getStirlingError(x) - - getStirlingError(n - x) - getDeviancePart(x, n * p) - - getDeviancePart(n - x, n * q); - double f = (MathUtils.TWO_PI * x * (n - x)) / n; - ret = -0.5 * FastMath.log(f) + ret; - } - return ret; - } + /** + * Compute the logarithm of the PMF for a binomial distribution + * using the saddle point expansion. + * + * @param x the value at which the probability is evaluated. + * @param n the number of trials. + * @param p the probability of success. + * @param q the probability of failure (1 - p). + * @return log(p(x)). 
+ */ + static public double logBinomialProbability(int x, int n, double p, double q) { + double ret; + if (x == 0) { + if (p < 0.1) { + ret = -getDeviancePart(n, n * q) - n * p; + } else { + ret = n * FastMath.log(q); + } + } else if (x == n) { + if (q < 0.1) { + ret = -getDeviancePart(n, n * p) - n * q; + } else { + ret = n * FastMath.log(p); + } + } else { + ret = getStirlingError(n) - getStirlingError(x) - + getStirlingError(n - x) - getDeviancePart(x, n * p) - + getDeviancePart(n - x, n * q); + double f = (MathUtils.TWO_PI * x * (n - x)) / n; + ret = -0.5 * FastMath.log(f) + ret; + } + return ret; + } } } diff --git a/src/main/java/japsa/util/CommandLine.java b/src/main/java/japsa/util/CommandLine.java index 668e6a0..db8312c 100755 --- a/src/main/java/japsa/util/CommandLine.java +++ b/src/main/java/japsa/util/CommandLine.java @@ -36,6 +36,8 @@ import java.util.ArrayList; +import japsa.util.deploy.Deploy; + /** * An implementation of commandLine utilities. This class was written based * heavily on code from David Powell @@ -44,9 +46,8 @@ */ public class CommandLine { - /** - * String describe the usage of the program (progname -i input -o output f1 f2 ...) + * String describe the usage of the program (program -i input -o output f1 f2 ...) 
*/ private String usage = ""; /** @@ -200,47 +201,49 @@ public String usageMessage() { return res.toString(); } - private void addOption(String opt, char type, Object def, String help, boolean req) { - options.add(new Option(opt, type, def, help, req)); + private Option addOption(String opt, char type, Object def, String help, boolean req) { + Option option = new Option(opt, type, def, help, req); + options.add(option); + return option; } - public void addBoolean(String opt, boolean def, String help, boolean req) { - addOption(opt, 'b', new Boolean(def), help, req); + public Option addBoolean(String opt, boolean def, String help, boolean req) { + return addOption(opt, 'b', new Boolean(def), help, req); } - public void addInt(String opt, int def, String help, boolean req) { - addOption(opt, 'i', new Integer(def), help, req); + public Option addInt(String opt, int def, String help, boolean req) { + return addOption(opt, 'i', new Integer(def), help, req); } - public void addDouble(String opt, double def, String help, boolean req) { - addOption(opt, 'f', new Double(def), help, req); + public Option addDouble(String opt, double def, String help, boolean req) { + return addOption(opt, 'f', new Double(def), help, req); } - public void addString(String opt, String def, String help, boolean req) { - addOption(opt, 's', def, help, req); + public Option addString(String opt, String def, String help, boolean req) { + return addOption(opt, 's', def, help, req); } - public void addBoolean(String opt, boolean def, String help) { - addOption(opt, 'b', new Boolean(def), help, false); + public Option addBoolean(String opt, boolean def, String help) { + return addOption(opt, 'b', new Boolean(def), help, false); } - public void addInt(String opt, int def, String help) { - addOption(opt, 'i', new Integer(def), help, false); + public Option addInt(String opt, int def, String help) { + return addOption(opt, 'i', new Integer(def), help, false); } - public void addDouble(String opt, double 
def, String help) { - addOption(opt, 'f', new Double(def), help, false); + public Option addDouble(String opt, double def, String help) { + return addOption(opt, 'f', new Double(def), help, false); } - public void addString(String opt, String def, String help) { - addOption(opt, 's', def, help, false); + public Option addString(String opt, String def, String help) { + return addOption(opt, 's', def, help, false); } public boolean optionSet(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return false; } return options.get(o).optionSet; @@ -250,7 +253,7 @@ Object getVal(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return null; } return options.get(o).value; @@ -260,13 +263,13 @@ public int getIntVal(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return 0; } if (options.get(o).optType != 'i') { System.err.println("ERROR: Option '" + opt - + "' is not an int option in getIntVal"); + + "' is not an int option in getIntVal"); return 0; } @@ -277,13 +280,13 @@ public double getDoubleVal(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return 0; } if (options.get(o).optType != 'f') { System.err.println("ERROR: Option '" + opt - + "' is not a double option in getDoubleVal"); + + "' is not a double option in getDoubleVal"); return 0; } @@ -294,13 +297,13 @@ public String getStringVal(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return null; } if (options.get(o).optType != 's') { System.err.println("ERROR: Option '" + opt - + "' is not a string option in getStringVal"); + + "' is not a 
string option in getStringVal"); return null; } @@ -311,13 +314,13 @@ public boolean getBooleanVal(String opt) { int o = isOption(opt, 1); if (o < 0) { System.err.println("ERROR: Attempt to lookup non-defined option '" + opt - + "'"); + + "'"); return false; } if (options.get(o).optType != 'b') { System.err.println("ERROR: Option '" + opt - + "' is not a boolean option in getBooleanVal"); + + "' is not a boolean option in getBooleanVal"); return false; } @@ -345,8 +348,8 @@ else if (noDashOk == 0) if (opt.compareToIgnoreCase(optStr) == 0) { if (match >= 0) { addError("ERROR: Ambiguous option '" + opt - + "' could be '" + options.get(i).optName + "' or '" + options.get(match).optName - + "'"); + + "' could be '" + options.get(i).optName + "' or '" + options.get(match).optName + + "'"); return match; } match = i; @@ -360,15 +363,13 @@ public String errorString(){ } public String usageString(){ return desc + "\n\n" + "Usage: " + usage() + "\nOptions:\n" + options(); - } - + } - @Deprecated - public String[] stdParseLine_old(String[] args) { - addStdHelp(); + public String[] stdParseLine(String[] args) { /**********************************************************************/ String[] ret = parseLine(args); - if (getBooleanVal("help")){ + //System.out.println(optionValues()); + if (isOption("help", 1) >=0 && getBooleanVal("help")){ System.out.println(usageString()); System.exit(0); } @@ -381,25 +382,15 @@ public String usageString(){ return ret; } - public String[] stdParseLine(String[] args) { - /**********************************************************************/ - String[] ret = parseLine(args); - - if (isOption("help", 1) >=0 && getBooleanVal("help")){ - System.out.println(usageString()); - System.exit(0); - } + public String optionValues(){ + String ret = ""; + for (Option option:options){ + ret = ret + String.format("%20s = ",option.optName) + ((option.value == null)?"(null)":option.value) + "\n"; + } - if (errors != null) { - 
System.out.println(errorString()); - System.exit(-1); - } - /**********************************************************************/ return ret; } - public String[] parseLine(String[] args) { - //Keep the original command line StringBuilder sb = new StringBuilder(); for (int i = 0; i < args.length;i++){ @@ -433,7 +424,7 @@ public String usageString(){ case 'b': if (args[i].indexOf("=") >= 0) { String s = args[i] - .substring(args[i].indexOf("=") + 1); + .substring(args[i].indexOf("=") + 1); if (s.equalsIgnoreCase("true")) option.value = Boolean.TRUE; else if (s.equalsIgnoreCase("yes")) @@ -453,7 +444,7 @@ else if (s.equalsIgnoreCase("0")) else { System.err .println("ERROR: Unknown boolean option parameter '" - + s + "'"); + + s + "'"); return null; } }else @@ -478,7 +469,7 @@ else if (s.equalsIgnoreCase("0")) case 's': if (args[i].indexOf("=") >= 0) option.value = args[i] - .substring(args[i].indexOf("=") + 1); + .substring(args[i].indexOf("=") + 1); else { option.value = args[i + 1]; i++; @@ -530,6 +521,190 @@ public void addStdHelp(){ addBoolean("help", false, "Display this usage and exit"); } + boolean galaxy = false; + String cmd = null; + + public void setGalaxy(String cmd){ + galaxy = true; + this.cmd = cmd; + } + + public boolean galaxy(){ + return galaxy; + } + + + + public String generateGalaxyWrapper(){ + + String inputs = " \n"; + String outputs = " \n"; + + String wrapper = + "\n" + + "\n" + + " " + desc + "\n" + + " \n" + + " \n" + + " \n" + + " \n"; + + wrapper = wrapper + + " \n" + cmd; + + for (Option option:options){ + GalaxySetting galaxySetting = option.galaxySetting; + if (galaxySetting != null){ + wrapper += " --" + option.optName + " $" + option.optName; + String params = galaxySetting.isOutput? 
+ " \n"; + wrapper = wrapper + inputs + " \n" + outputs + " \n"; + wrapper = wrapper + + " \n" + + "**What it does**\n" + + desc + "\n" + + " \n"; + wrapper = wrapper + + ""; + + /************************************************************************ + + + + + + + + + + + + + + + + + + + + + + **What it does** + +Converts tab delimited data into FASTA formatted sequences. + +----------- + + **Example** + +Suppose this is a sequence file produced by Illumina (Solexa) sequencer:: + + 5 300 902 419 GACTCATGATTTCTTACCTATTAGTGGTTGAACATC + 5 300 880 431 GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT + +Selecting **c3** and **c4** as the **Title column(s)** and **c5** as the **Sequence column** will result in:: + + >902_419 + GACTCATGATTTCTTACCTATTAGTGGTTGAACATC + >880_431 + GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT + + + + + /************************************************************************/ + + + return wrapper; + } + + + /** + * Representing option for galaxy + * + * @author minhduc + * + */ + public static class GalaxySetting{ + //For use in galaxy wraper: + String format = null;//bam, text, etc + boolean isOutput = false; + String type = null; + String label = null; + String help = null; + + public GalaxySetting(String type, String format, boolean isOutput){ + this.format = format; + this.type = type; + this.isOutput = isOutput; + } + + + public GalaxySetting(String type, String format){ + this (type,format,false); + } + + /** + * @return the galaxyFormat + */ + public String getGalaxyFormat() { + return format; + } + /** + * @param galaxyFormat the galaxyFormat to set + */ + public void setGalaxyFormat(String format) { + this.format = format; + } + + public void setLabel(String label){ + this.label = label; + } + + public String getLabel(){ + return label; + } + + + } /** * Represent an option from the command line * @author minhduc @@ -544,10 +719,7 @@ public void addStdHelp(){ boolean required; boolean optionSet = false; - //For use in galaxy wraper: - boolean galaxyUse 
= false; - String galaxyFormat = null;//bam, text, etc - + private GalaxySetting galaxySetting = null; public Option(String opt, char type, Object def, String help, boolean req) { optName = opt; @@ -560,30 +732,14 @@ public Option(String opt, char type, Object def, String help) { this(opt, type, def,help,true); } - // - /** - * @return the galaxyUse - */ - public boolean isGalaxyUse() { - return galaxyUse; + public GalaxySetting getGalaxySetting(){ + return galaxySetting; } - /** - * @param galaxyUse the galaxyUse to set - */ - public void setGalaxyUse(boolean galaxyUse) { - this.galaxyUse = galaxyUse; - } - /** - * @return the galaxyFormat - */ - public String getGalaxyFormat() { - return galaxyFormat; + + public void setGalaxySetting(GalaxySetting galaxy){ + galaxySetting = galaxy; } - /** - * @param galaxyFormat the galaxyFormat to set - */ - public void setGalaxyFormat(String galaxyFormat) { - this.galaxyFormat = galaxyFormat; - } + // + } } diff --git a/src/main/java/japsa/util/DynamicHistogram.java b/src/main/java/japsa/util/DynamicHistogram.java index 43f40de..b6b1289 100644 --- a/src/main/java/japsa/util/DynamicHistogram.java +++ b/src/main/java/japsa/util/DynamicHistogram.java @@ -244,7 +244,8 @@ public int getSeriesCount() { * @throws IndexOutOfBoundsException if series is outside the * specified range. 
*/ - @Override + @SuppressWarnings("rawtypes") + @Override public Comparable getSeriesKey(int series) { return this.mySeriesList.get(series).seriesID; } diff --git a/src/main/java/japsa/util/HTSUtilities.java b/src/main/java/japsa/util/HTSUtilities.java index 60a7f97..ea4cac6 100644 --- a/src/main/java/japsa/util/HTSUtilities.java +++ b/src/main/java/japsa/util/HTSUtilities.java @@ -40,6 +40,8 @@ import htsjdk.samtools.SAMRecord; import japsa.seq.Alphabet; import japsa.seq.Sequence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A collection of utilities to analyse HTS, based on HTS library @@ -47,13 +49,16 @@ * */ public class HTSUtilities { + private static final Logger LOG = LoggerFactory.getLogger(HTSUtilities.class); + + /** * Extract read between start and end (on ref) * @param record * @param readSequence - * @param start - * @param end + * @param fromPos + * @param toPos * @return */ public static Sequence readSequence(SAMRecord record, Sequence readSequence, int fromPos, int toPos){ @@ -118,7 +123,8 @@ public static Sequence readSequence(SAMRecord record, Sequence readSequence, int }//for if (readFrom ==0 || readTo ==0){ - Logging.exit("Error at HTSUtilities.readSequence " + readFrom + " " + readTo, 1); + LOG.error("Error at HTSUtilities.readSequence " + readFrom + " " + readTo, 1); + System.exit(1); } if (record.getReadNegativeStrandFlag()){ //Need to complement the read sequence before calling subsequence = calling sub l-e, l-s then complementing @@ -133,6 +139,82 @@ public static Sequence readSequence(SAMRecord record, Sequence readSequence, int return seq; } } + + + public static Sequence getReadPosition(SAMRecord rec, int startRef, int endRef){ + byte[] seqRead = rec.getReadBases();// + if (seqRead.length <= 1) + return null; + + int startRead = -1, endRead = -1; + + int refPos = rec.getAlignmentStart(); + int readPos = 0; + //currentRefPos <= startRead + + for (final CigarElement e : rec.getCigar().getCigarElements()) { + int length 
= e.getLength(); + switch (e.getOperator()) { + case H: + break; // ignore hard clips + case P: + break; // ignore pads + case S: + readPos += e.getLength(); + break; // soft clip read bases + case N: // N ~ D + case D: + refPos += length; + + if (startRead < 0 && refPos >= startRef){ + startRead = readPos; + } + + if (endRead < 0 && refPos >= endRef){ + endRead = readPos; + } + + break;// case + case I: + readPos += length; + break; + + case M: + case EQ: + case X: + if ((startRead < 0) && refPos + length >= startRef) { + startRead = readPos + startRef - refPos; + } + + if ((endRead < 0) && (refPos + length >= endRef)){ + endRead = readPos + endRef - refPos; + } + + refPos += length; + readPos += length; + break; + default: + throw new IllegalStateException( + "Case statement didn't deal with cigar op: " + + e.getOperator()); + }// case + if (refPos >= endRef) + break;//for + + }// for + if (startRead < 0 || endRead < 0){ + LOG.warn(" " + refPos + " " + readPos + " " + startRead + " " + endRead); + return null; + } + + Alphabet alphabet = Alphabet.DNA16(); + Sequence retSeq = new Sequence(alphabet, endRead - startRead + 1, rec.getReadName() + "/" + startRead + "_" + endRead); + for (int i = 0; i < retSeq.length();i++){ + retSeq.setBase(i, alphabet.byte2index(seqRead[startRead + i])); + } + return retSeq; + + } /** * Get the read subsequence that spans the gene. 
The method look at an alignment, @@ -194,7 +276,7 @@ public static Sequence spanningSequence(SAMRecord record, Sequence readSequence, readEnd = readLength;//1-index if (readLength != readSequence.length()){ - Logging.error("Error0 " + record.getReadName() + " " + readSequence.length() + " vs estimated " + readLength + " Flag = " + record.getFlags()); + LOG.error("Error0 " + record.getReadName() + " " + readSequence.length() + " vs estimated " + readLength + " Flag = " + record.getFlags()); return null; } @@ -204,7 +286,7 @@ public static Sequence spanningSequence(SAMRecord record, Sequence readSequence, start = 1;//I am still live in 1-index world if (readEnd > readSequence.length()){ - Logging.error("Error1 " + record.getReadName() + " " + record.getReadLength() + " vs " + readEnd); + LOG.error("Error1 " + record.getReadName() + " " + record.getReadLength() + " vs " + readEnd); return null; } int end = readEnd + right; @@ -213,7 +295,7 @@ public static Sequence spanningSequence(SAMRecord record, Sequence readSequence, end = readSequence.length(); if (start >= end){ - Logging.error("Error2 " + record.getReadName() + " " + record.getReadLength() + " " + start + " " + end); + LOG.error("Error2 " + record.getReadName() + " " + record.getReadLength() + " " + start + " " + end); return null; } @@ -231,7 +313,7 @@ public static Sequence spanningSequence(SAMRecord record, Sequence readSequence, } }catch(Exception e){ - Logging.warn(e.getMessage()); + LOG.warn(e.getMessage()); e.printStackTrace(); //continue;//while return null; @@ -260,7 +342,7 @@ public static IdentityProfile identity(Sequence refSeq, Sequence readSeq, SAMRe profile.match = 0; profile.mismatch = 0; profile.refBase = 0; - profile.readBase = 0;//the number of bases from ref and read + profile.readBase = 0;//the number of bases from ref and read for (final CigarElement e : sam.getCigar().getCigarElements()) { final int length = e.getLength(); @@ -299,7 +381,7 @@ public static IdentityProfile 
identity(Sequence refSeq, Sequence readSeq, SAMRe profile.numIns ++; break; case M : - for (int i = 0; i < length; i++){ + for (int i = 0; i < length && refPos + i < refSeq.length(); i++){ if (refSeq.getBase(refPos + i) == readSeq.getBase(readPos + i)) profile.match ++; else @@ -338,6 +420,84 @@ public static IdentityProfile identity(Sequence refSeq, Sequence readSeq, SAMRe } + + /** + * Get the list of positions in reads corresponding the the positions in reference + * @param sam + * @param refPositions + * @return + */ + public static int[] positionsInRead(SAMRecord sam, final int [] refPositions){ + int readPos = 0;//start from 0 + int refPos = sam.getAlignmentStart();//convert to 0-based index + int [] readPositions = new int[refPositions.length]; + int index = 0; + + while (index < refPositions.length && refPositions[index] <= refPos) + index ++; + + if (index >= refPositions.length) + return readPositions; + + for (final CigarElement e : sam.getCigar().getCigarElements()) { + //assert: refPositions[index] > refPos + + final int length = e.getLength(); + switch (e.getOperator()) { + case H : + //nothing todo + break; // ignore hard clips + case P : + //pad is a kind of hard clipped ?? 
+ break; // ignore pads + case S : + //soft clip: advance on the reference + readPos += length; + break; // soft clip read bases + case N : + refPos += length; + + //advance index + while (index < refPositions.length && refPositions[index] <= refPos) + index ++; + + if (index >= refPositions.length) + return readPositions; + break; // reference skip + + case D ://deletion + refPos += length; + while (index < refPositions.length && refPositions[index] <= refPos){ + readPositions[index] = readPos; + index ++; + } + if (index >= refPositions.length) + return readPositions; + break; + case I : + readPos += length; + break; + case M : + case EQ: + case X: + while (index < refPositions.length && refPositions[index] <= refPos +length){ + readPositions[index] = readPos + refPositions[index] - refPos; + index ++; + } + if (index >= refPositions.length) + return readPositions; + + readPos += length; + refPos += length; + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); + }//case + }//for + + return readPositions; + + } + public static class IdentityProfile{ public int match, mismatch, baseIns, baseDel, numIns, numDel, refClipped, readClipped, refBase, readBase; @@ -366,5 +526,25 @@ public static double n50(ArrayList seqs){ } return lengths[index]; + } + + public static double n50(ArrayList seqs, long genomeSize){ + int [] lengths = new int[seqs.size()]; + + for (int i = 0;i < lengths.length;i++){ + int l = seqs.get(i).length(); + lengths[i] = l; + } + Arrays.sort(lengths); + + int index = lengths.length; + double contains = 0; + while (contains < genomeSize/2){ + index --; + contains += lengths[index]; + } + + return lengths[index]; } + } diff --git a/src/main/java/japsa/util/JapsaTimer.java b/src/main/java/japsa/util/JapsaTimer.java index c1c5297..15e46cc 100644 --- a/src/main/java/japsa/util/JapsaTimer.java +++ b/src/main/java/japsa/util/JapsaTimer.java @@ -34,10 +34,14 @@ 
****************************************************************************/ package japsa.util; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; //Mon 6 April 2009 public class JapsaTimer { // A Timer object can 'mark' an event by printing the given - // message, + private static final Logger LOG = LoggerFactory.getLogger(JapsaTimer.class); + + // message, // the time since the last event, and the total time the Timer has existed. private final long milliSecs0; long last; @@ -49,14 +53,14 @@ public JapsaTimer() { public void mark(String msg) { final long now = System.currentTimeMillis(); - Logging.info(msg + ":" + " increment=" + (now - last) / 1000.0 + LOG.info(msg + ":" + " increment=" + (now - last) / 1000.0 + " sec" + " (total=" + (now - milliSecs0) / 1000.0 + ")"); last = now; }// mark() static public void systemInfo(){ Runtime runtime = Runtime.getRuntime(); - Logging.info(" CPU=" + runtime.availableProcessors() + LOG.info(" CPU=" + runtime.availableProcessors() + ", maxMem=" + runtime.maxMemory() / 1000000.0 + " MB" + ", freeMem=" + runtime.freeMemory() / 1000000.0 + " MB" + ", totalMem=" + runtime.totalMemory() / 1000000.0 + " MB" @@ -64,6 +68,7 @@ static public void systemInfo(){ ); } + /************************************************************************ // ------------------------------------------------------------------------- public static void main(String[] args) { System.out.println("-- test Timer.java --"); @@ -78,6 +83,7 @@ public static void main(String[] args) { t.mark("tock"); System.out.println("-- done --"); }// main() + /************************************************************************/ }// class Timer //LA, 4/2009 diff --git a/src/main/java/japsa/util/Logging.java b/src/main/java/japsa/util/Logging.java index e0d430b..c6d4c1e 100644 --- a/src/main/java/japsa/util/Logging.java +++ b/src/main/java/japsa/util/Logging.java @@ -34,12 +34,16 @@ package japsa.util; +import java.io.FileNotFoundException; +import 
java.io.FileOutputStream; import java.io.PrintStream; /** * @author minhduc * + * We should use slf4j instead */ +@Deprecated public class Logging { private static String prefix = "#"; @@ -51,13 +55,35 @@ /** * A simple logging system */ - public Logging() { + private Logging() { // TODO Auto-generated constructor stub } + /** * @param args + * @throws FileNotFoundException */ - + public static void setInfoFile(String filePath) throws FileNotFoundException{ + infoStr = new PrintStream(new FileOutputStream(filePath,true)); + } + + public static void setWarnFile(String filePath) throws FileNotFoundException{ + warnStr = new PrintStream(new FileOutputStream(filePath,true)); + } + + public static void setErrorFile(String filePath) throws FileNotFoundException{ + errorStr = new PrintStream(new FileOutputStream(filePath,true)); + } + /** + * Force all three streams to the same file + * @param filePath + * @throws FileNotFoundException + */ + public static void setFile(String filePath) throws FileNotFoundException{ + warnStr = errorStr = infoStr = new PrintStream(new FileOutputStream(filePath,true)); + } + + public static void info(String msg) { synchronized(infoStr){ infoStr.println(prefix+msg); diff --git a/src/main/java/japsa/util/LongArray.java b/src/main/java/japsa/util/LongArray.java new file mode 100644 index 0000000..d4c01b0 --- /dev/null +++ b/src/main/java/japsa/util/LongArray.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 1998 - 2005 Versant Corporation + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Public License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/legal/epl-v10.html + * + * Contributors: + * Versant Corporation - initial API and implementation + */ + +/** + * Growable int[]. This is based com.sosnoski.util.array.IntArray from + * Sosnoski Software Solutions, Inc. 
+ */ + + +/************************** REVISION HISTORY ************************** + * 14/03/2014 - Minh Duc Cao: added + * + ****************************************************************************/ + +package japsa.util; +/** + * A resizeable array of long. This array bypass safe checks + * @author minhduc + * + */ +public final class LongArray { + + private long[] buf; + private int size; + + public LongArray() { + this(64); + } + + public LongArray(int capacity) { + buf = new long[capacity]; + } + + public int size() { + return size; + } + + private void ensureCapacity(int len) { + if (size + len > buf.length) { + int n = buf.length * 3 / 2 + 1; + if (size + len > n) { + n = size + len; + } + long[] a = new long[n]; + System.arraycopy(buf, 0, a, 0, size); + buf = a; + } + } + + public void add(long v) { + ensureCapacity(size + 1); + buf[size++] = v; + } + + /** + * Add a value at a specified index in the array. + */ + public void add(int index, long value) { + ensureCapacity(size + 1); + if (index == size) { + buf[size++] = value; + } else { + System.arraycopy(buf, index, buf, index + 1, size - index); + buf[index] = value; + } + } + + /** + * Constructs and returns a simple array containing the same data as held + * in this growable array. + */ + public long[] toArray() { + long[] a = new long[size]; + System.arraycopy(buf, 0, a, 0, size); + return a; + } + + public void clear() { + size = 0; + } + + /** + * Retrieve the value present at an index position in the array. 
+ */ + public long get(int index) { + return buf[index]; + } + +} + diff --git a/src/main/java/japsa/util/ProcessManagement.java b/src/main/java/japsa/util/ProcessManagement.java index 80edb02..5da5704 100644 --- a/src/main/java/japsa/util/ProcessManagement.java +++ b/src/main/java/japsa/util/ProcessManagement.java @@ -34,6 +34,10 @@ ****************************************************************************/ package japsa.util; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; @@ -43,8 +47,9 @@ * */ public class ProcessManagement { + private static final Logger LOG = LoggerFactory.getLogger(ProcessManagement.class); /** - * To start a process, redirect output and error to Logging.info, print out + * To start a process, redirect output and error to LOG.info, print out * a message and return the status * @param pb * @return @@ -59,7 +64,7 @@ static public int runProcess(ProcessBuilder pb) throws IOException, InterruptedE String outLine = ""; while ((outLine = pbOut.readLine())!=null){ - Logging.info(outLine.trim()); + LOG.info(outLine.trim()); } pbOut.close(); int status = process.waitFor(); diff --git a/src/main/java/japsa/util/Simulation.java b/src/main/java/japsa/util/Simulation.java new file mode 100644 index 0000000..d7c4e43 --- /dev/null +++ b/src/main/java/japsa/util/Simulation.java @@ -0,0 +1,87 @@ +/***************************************************************************** + * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * * + * Redistribution and use in source and binary forms, with or without * + * modification, are permitted provided that the following conditions * + * are met: * + * * + * 1. Redistributions of source code must retain the above copyright notice, * + * this list of conditions and the following disclaimer. * + * 2. 
Redistributions in binary form must reproduce the above copyright * + * notice, this list of conditions and the following disclaimer in the * + * documentation and/or other materials provided with the distribution. * + * 3. Neither the names of the institutions nor the names of the contributors* + * may be used to endorse or promote products derived from this software * + * without specific prior written permission. * + * * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* + ****************************************************************************/ + +/* Revision History + * 26/08/2016 - Minh Duc Cao: Start + * + ****************************************************************************/ + + +package japsa.util; + +import java.util.Random; + +/** + * A library for simulation + * @author minhduc + * + */ +public class Simulation { + + /** + * Generate a random seed if the input <=0 + * @param seed + * @return + */ + public static int seed(int seed){ + if (seed <= 0) + seed = new Random().nextInt(); + + //make sure seed is not negative + if (seed <0 ) + seed = - seed; + + return seed; + } + + /** + * This function return a sample from log-logistic distribution (aka Fisk + * distribution) with a scale parameter alpha and shape parameter beta. + * + * See wikipedia on Fisk Distribution + * + * @param alpha: scale parameter + * @param beta: shape parameter + * @param rnd: random generator + * @return + */ + public static double logLogisticSample(double alpha, double beta, Random rnd){ + double u = rnd.nextDouble(); + + if (u <= 0.5) + return alpha * Math.pow (u / (1.0 - u), 1.0 / beta); + else + return alpha / Math.pow ((1.0 - u)/ u, 1.0 / beta); + } + + public static double logLogisticPDF(double x, double alpha, double beta){ + x = x / alpha; + return Math.pow(beta * x, -beta -1) * Math.pow(( 1 + Math.pow(x, -beta)), -2); + } +} diff --git a/src/main/java/japsa/util/deploy/Deploy.java b/src/main/java/japsa/util/deploy/Deploy.java index 1faffc2..792d1da 100644 --- a/src/main/java/japsa/util/deploy/Deploy.java +++ b/src/main/java/japsa/util/deploy/Deploy.java @@ -1,5 +1,5 @@ /***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * + * Copyright (c) 2017 Minh Duc Cao. All rights reserved. 
* * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * @@ -35,10 +35,10 @@ package japsa.util.deploy; -//import japsa.bio.hts.HTSAlignmentParam; -import japsa.tools.bio.bac.Genomes2ResistanceGeneCmd; -import japsa.tools.bio.bac.MLSTCmd; +import japsa.tools.bio.amra.*; +import japsa.tools.bio.hts.AddReadSequence2SamCmd; import japsa.tools.bio.hts.AlignmentParamOptCmd; +import japsa.tools.bio.hts.AlternativeAllelesCmd; import japsa.tools.bio.hts.BreakBamCmd; import japsa.tools.bio.hts.CountReadInRegionCmd; import japsa.tools.bio.hts.FastQTrimCmd; @@ -48,6 +48,10 @@ import japsa.tools.bio.hts.SelectReadSpanCmd; import japsa.tools.bio.hts.SpeciesMixtureCmd; import japsa.tools.bio.hts.VNTRDepthCmd; +import japsa.tools.bio.hts.VNTRLongReadsCmd; +import japsa.tools.bio.hts.VNTRLongReadsV2Cmd; +import japsa.tools.bio.np.BarCodeAnalysisCmd; +import japsa.tools.bio.np.FastNanoporeReaderCmd; import japsa.tools.bio.np.GapCloserCmd; import japsa.tools.bio.np.NanoporeReadFilterCmd; import japsa.tools.bio.np.NanoporeReaderCmd; @@ -61,6 +65,8 @@ import japsa.tools.bio.phylo.XMDistanceCmd; import japsa.tools.bio.sim.SimHTSWithFSMCmd; import japsa.tools.bio.sim.SimProbFSMCmd; +import japsa.tools.bio.sim.SimulateCaptureCmd; +import japsa.tools.bio.sim.SimulateGenomeCmd; import japsa.tools.bio.tr.Fragment2TRVCmd; import japsa.tools.bio.tr.Japsa2TRCmd; import japsa.tools.bio.tr.ParseTRFCmd; @@ -84,9 +90,10 @@ import japsa.tools.seq.SequenceSortCmd; import japsa.tools.seq.SequenceStatsCmd; import japsa.tools.seq.SplitSequenceFileCmd; +import japsa.tools.util.DnaGraphToolCmd; import japsa.tools.util.StreamClientCmd; import japsa.tools.util.StreamServerCmd; -import japsa.tools.xm.ExpertModelCmd; +import japsa.tools.bio.xm.ExpertModelCmd; import japsa.util.CommandLine; import japsa.util.StringSeparator; @@ -94,11 +101,13 @@ import java.io.FileOutputStream; import java.io.IOException; import 
java.io.PrintStream; +import java.nio.file.Files; import java.util.ArrayList; import java.util.Date; import java.util.Scanner; -import com.google.common.io.Files; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + /** * This class is used to deploy tools: create a makefile to generate scripts @@ -107,9 +116,9 @@ */ public class Deploy { public static ArrayList tools = new ArrayList(); - public static String VERSION = "1.6-01c"; + public static String VERSION = "1.7-05b"; public static final String FIELD_SEP = "\t"; - + public static boolean DEBUG = true; //private static String AUTHORS = "Minh Duc Cao"; @@ -143,17 +152,23 @@ tools.add(new CountReadInRegionCmd()); tools.add(new AlignmentParamOptCmd()); tools.add(new HTSErrorAnalysisCmd()); + tools.add(new AlternativeAllelesCmd()); tools.add(new GetN50Cmd()); tools.add(new SpeciesMixtureCmd()); + tools.add(new AddReadSequence2SamCmd()); - tools.add("Bacterial analysis:"); + tools.add("Bacterial AMR analysis:"); tools.add(new MLSTCmd()); - tools.add(new Genomes2ResistanceGeneCmd()); - + tools.add(new PlasmidFinderCmd()); + tools.add(new Genomes2ResistanceGeneCmd()); + tools.add(new AssemblyPostProcessingCmd()); + tools.add(new ResistanceGeneCardCmd()); + //jsa.np. 
//tools.add(NanoporeReader()); tools.add("Oxford Nanopore sequencing analysis tools:"); tools.add(new NanoporeReaderCmd()); + tools.add(new FastNanoporeReaderCmd()); tools.add(new NanoporeReadFilterCmd()); //tools.add(new SpeciesMixtureTypingCmd()); tools.add(new RealtimeSpeciesTypingCmd()); @@ -162,9 +177,10 @@ tools.add(new RealtimeResistanceGeneCmd()); tools.add(new RegulateTimeCmd()); tools.add(new GapCloserCmd()); + tools.add(new BarCodeAnalysisCmd()); //tools.add(new SpeciesMixtureCmd()); - - + //tools.add(new BaseMethylationCmd()); + //jsa.trv.* tools.add("Tandem repeat variation analysis tools:"); tools.add(new ParseTRFCmd()); @@ -176,6 +192,7 @@ tools.add(new Japsa2TRCmd()); tools.add(new TRV2BedCmd()); tools.add(new VNTRDepthCmd()); + tools.add(new VNTRLongReadsCmd()); tools.add("Utilities:"); tools.add(new StreamServerCmd()); @@ -192,11 +209,18 @@ tools.add("Alignment with Finite State Machines"); tools.add(new SimProbFSMCmd()); tools.add(new SimHTSWithFSMCmd()); + tools.add(new SimulateGenomeCmd()); + tools.add(new SimulateCaptureCmd()); //jsa.xm tools.add("Export Model compression"); tools.add(new ExpertModelCmd()); - //tools.add(.class); + tools.add(new DnaGraphToolCmd()); + + + tools.add("==========Testing==============="); + tools.add(new VNTRLongReadsV2Cmd()); + //tools.add(new ResistanceGeneCardCmd()); } @@ -228,10 +252,11 @@ static private String setupJapsaDir(String japsaJar) throws IOException{ } //Get the classpath now that the directory of installation is known - File from = new File(japsaJar); + File from = new File(japsaJar); + File to = new File (japsaLib.getCanonicalPath() + File.separator + japsaJar); try{ - Files.copy(from,to); + Files.copy(from.toPath(),to.toPath(),REPLACE_EXISTING); }catch (IOException e){ System.err.println(e.getMessage()); return null; @@ -245,7 +270,8 @@ static private String setupJapsaDir(String japsaJar) throws IOException{ from = new File("libs" + File.separator + l); to = new File (japsaLib.getCanonicalPath() + 
File.separator + l); try{ - Files.copy(from,to); + Files.copy(from.toPath(),to.toPath(),REPLACE_EXISTING); + //Files.copy(from,to); }catch (IOException e){ System.err.println(e.getMessage()); return null; @@ -269,6 +295,14 @@ static private String setupJapsaDir(String japsaJar) throws IOException{ private static String classPath = null; private static String javaCommand = null; + private static void guessJapsaPath(){ + if (japsaPath == null){ + boolean isWindows = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0; + japsaPath = isWindows? + "c:\\Japsa" + : System.getProperty("user.home") + "/.usr/local"; + } + } /** * Prepare the directory to copy libraries and scripts for instalation. * This method also set up classpath, java command, library path, and @@ -278,7 +312,10 @@ static private String setupJapsaDir(String japsaJar) throws IOException{ * @throws IOException */ public static void setUpDirectory() throws IOException{ - boolean isWindows = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0; + + boolean isWindows = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0; + boolean isMac = System.getProperty("os.name").toLowerCase().indexOf("mac") >= 0; + classPath = japsaJar; Scanner scanner = new Scanner(System.in); String line = null; @@ -286,10 +323,7 @@ public static void setUpDirectory() throws IOException{ System.out.println("Setting up Japsa Directory and copying libraries"); //////////////////////////////////////////////////////////////////////////// if (japsaPath == null){ - //Get directory to install and create - japsaPath = isWindows? - "c:\\Japsa" - : System.getProperty("user.home") + "/.usr/local"; + guessJapsaPath(); while (true){ System.out.print("Directory to install japsa: [" + japsaPath + "]"); line = scanner.nextLine(); @@ -321,8 +355,8 @@ public static void setUpDirectory() throws IOException{ } javaCommand = isWindows? 
- "java -Xmx%JSA_MEM% -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8" - :"java -Xmx${JSA_MEM} -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8"; + "java -Xmx%JSA_MEM% -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8" + :"java -Xmx${JSA_MEM} -ea -Djava.awt.headless=true -Dfile.encoding=UTF-8"; //Get server mode or client mode @@ -352,8 +386,11 @@ public static void setUpDirectory() throws IOException{ String [] fNix = {"libjhdf.so","libjhdf5.so"}; String [] fWindows = {"jhdf.dll","jhdf5.dll","libhdf.lib","libhdf5.lib"}; - + String [] fMac = {};//"libjhdf5.a"}; + String [] requires = isWindows ? fWindows:fNix; + if (isMac) + requires = fMac; boolean pass = true; for (String rLib:requires){ @@ -390,8 +427,11 @@ public static void setUpDirectory() throws IOException{ String [] fNix = {"libjri.so"}; String [] fWindows = {"jri.dll","libjri.lib"}; + String [] fMac = {}; String [] requires = isWindows?fWindows:fNix; + if (isMac) + requires = fMac; boolean pass = true; for (String rLib:requires){ @@ -428,7 +468,7 @@ public static void setUpDirectory() throws IOException{ * @throws IOException */ public static void setUpScripts(ArrayList toolList, String masterScript) - throws IOException{ + throws IOException{ System.out.println("Set upting scripts in " + masterScript + ":"); boolean isWindows = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0; //Set up differences between windows and the rest @@ -444,7 +484,7 @@ public static void setUpScripts(ArrayList toolList, String masterScript) } outJsaMain.print(echoStr + "Japsa: A Java Package for Statistical Sequence Analysis\n" - + echoStr + "Version " + VERSION + ", Built on " + (new Date())); + + echoStr + "Version " + VERSION + ", Built on " + (new Date())); if (compiler != null){ outJsaMain.println(" with " + compiler); @@ -465,7 +505,7 @@ public static void setUpScripts(ArrayList toolList, String masterScript) } if (obj instanceof CommandLine){ Class tool = obj.getClass(); - + Deployable annotation = 
(Deployable) tool.getAnnotation(Deployable.class); File file = new File(japsaPath + File.separator + "bin" + File.separator + annotation.scriptName() + suffixStr); @@ -484,20 +524,27 @@ public static void setUpScripts(ArrayList toolList, String masterScript) out.println("#!/bin/sh\n"); out.println("case $JSA_MEM in\n '')JSA_MEM="+maxMem +";;\n *);;\nesac\n\n"); out.println("case $JSA_CP in\n '')JSA_CP=" - + classPath - + ";;\n *)echo \"[INFO] Use ${JSA_CP} as path \" 1>&2;;\nesac\n\n"); + + classPath + + ";;\n *)echo \"[INFO] Use ${JSA_CP} as path \" 1>&2;;\nesac\n\n"); //out.println("JSA_CMD=\"`basename $0` $@\"\n"); out.println(javaCommand + " -classpath ${JSA_CP} " - + tool.getCanonicalName() + " \"$@\""); + + tool.getCanonicalName() + " \"$@\""); out.close(); Runtime.getRuntime().exec( - "chmod a+x " + file.getCanonicalPath()); + "chmod a+x " + file.getCanonicalPath()); } System.out.println(" " + file.getCanonicalPath() + " created"); outJsaMain.printf(echoStr + " %-23s %s\n", annotation.scriptName(), annotation.scriptDesc()); + + //CommandLine cmdTool = (CommandLine) obj; + //if (cmdTool.galaxy()){ + // String wrapper = cmdTool.generateGalaxyWrapper(); + // System.out.println(wrapper); + //} + }//if else{ System.err.println("Cannot generate script for " + obj + " " + obj.getClass()); @@ -512,13 +559,61 @@ public static void setUpScripts(ArrayList toolList, String masterScript) outJsaMain.close(); if (!isWindows){ Runtime.getRuntime().exec( - "chmod a+x " + outJsa.getCanonicalPath()); + "chmod a+x " + outJsa.getCanonicalPath()); } System.out.println("Done " + masterScript + "\n"); } - public static boolean uninstallLibraries() throws IOException{ + public static void setUpGalaxyScripts(ArrayList toolList) + throws IOException{ + System.out.println("Set galaxy wrapper :"); + PrintStream masterFile = new PrintStream(new File("galaxy" + File.separator + "japsa.xml")); + masterFile.println("
"); + + for (Object obj : toolList) { + //A string separated + if ((obj instanceof String)){ + continue; + } + if (obj instanceof CommandLine){ + + CommandLine cmdTool = (CommandLine) obj; + if (cmdTool.galaxy()){ + + Class tool = obj.getClass(); + Deployable annotation = (Deployable) tool.getAnnotation(Deployable.class); + + String fileName = "japsa" + File.separator + annotation.scriptName().replace(".", "_") + ".xml"; + File file = new File("galaxy" + File.separator + fileName); + String wrapper = cmdTool.generateGalaxyWrapper(); + + PrintStream ps = new PrintStream(file); + ps.println(wrapper); + ps.close(); + + masterFile.println(" "); + + masterFile.println("
"); + + + System.out.println(" " + file.getCanonicalPath() + " created"); + + } + } + } + masterFile.println(" "); + masterFile.close(); + + System.out.println("Done galaxy \n"); + } + + + + + public static boolean uninstallLibraries() throws IOException{ + guessJapsaPath(); + if (japsaPath.startsWith("~/")) { japsaPath = System.getProperty("user.home") + japsaPath.substring(1); } @@ -553,11 +648,13 @@ public static void uninstallScripts(ArrayList toolList, String masterScr // Delete all the scripts for (Object obj : toolList) { - if (!(obj instanceof Class)){ + if (!(obj instanceof CommandLine)){ continue; } - Class tool = (Class) obj; + + Class tool = obj.getClass(); Deployable annotation = tool.getAnnotation(Deployable.class); + File file = new File(japsaPath + File.separator + "bin" + File.separator + annotation.scriptName()); System.out.println("rm " + file.getCanonicalPath()); file.delete(); @@ -616,7 +713,10 @@ public static void main(String[] args) throws NoSuchFieldException, //japsaPath must have been set if (uninstallLibraries()) uninstallScripts(tools, "jsa"); - } else { + }else if ("galaxy".equals(mode)) { + setUpGalaxyScripts(tools); + } + else { System.err.println("Mode " + mode + " not recognised"); System.err.println(cmdLine.errors() + "\n" + "Usage: " + cmdLine.usage() + "\nOptions:\n" + cmdLine.options()); System.exit(-1); diff --git a/src/main/java/japsa/util/net/StreamClient.java b/src/main/java/japsa/util/net/StreamClient.java index 2b1c4e9..73dcb12 100644 --- a/src/main/java/japsa/util/net/StreamClient.java +++ b/src/main/java/japsa/util/net/StreamClient.java @@ -35,7 +35,8 @@ import japsa.tools.util.StreamServerCmd; -import japsa.util.Logging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.Closeable; import java.io.IOException; @@ -48,6 +49,7 @@ * */ public class StreamClient implements Closeable{ + private static final Logger LOG = LoggerFactory.getLogger(StreamClient.class); private ArrayList sockets; public 
StreamClient(String serverList){ @@ -58,14 +60,14 @@ public StreamClient(String serverList){ int portNumber = StreamServerCmd.DEFAULT_PORT; if (toks.length > 1) portNumber = Integer.parseInt(toks[1]); - Logging.info("Trying to connect " + toks[0] + ":" + portNumber); + LOG.info("Trying to connect " + toks[0] + ":" + portNumber); try { Socket socket = new Socket(toks[0], portNumber); sockets.add(socket); - Logging.info("Connection to " + toks[0] + ":" + portNumber + " established"); + LOG.info("Connection to " + toks[0] + ":" + portNumber + " established"); } catch (UnknownHostException e) { //e.printStackTrace(); - Logging.warn("Could not connect to " + toks[0] + ":" + portNumber); + LOG.warn("Could not connect to " + toks[0] + ":" + portNumber); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -84,7 +86,7 @@ public StreamClient(String serverList){ public void close() throws IOException { for (Socket socket:sockets){ socket.close(); - Logging.info("Connection to " + socket.getRemoteSocketAddress() + " closed"); + LOG.info("Connection to " + socket.getRemoteSocketAddress() + " closed"); } } diff --git a/src/main/java/japsa/xm/ExpertModel.java b/src/main/java/japsa/xm/ExpertModel.java index aec274a..0004e5c 100755 --- a/src/main/java/japsa/xm/ExpertModel.java +++ b/src/main/java/japsa/xm/ExpertModel.java @@ -48,7 +48,7 @@ import japsa.seq.AbstractSequence; import japsa.seq.Alphabet; import japsa.seq.Sequence; -import japsa.tools.xm.ExpertModelCmd; +import japsa.tools.bio.xm.ExpertModelCmd; import japsa.util.IntIterator; import japsa.util.MyBitSet; import japsa.util.JapsaMath; @@ -113,9 +113,9 @@ public void setCheckPoint(int checkPoint) { // Two expert seeds public RepeatExpert offSetSeed; // = new OffsetCountExpert(null,1,null);// a - // dummy one + // dummy one public RepeatExpert palinSeed;// = new PalindromeCountExpert(null,1,null);// - // a dummy one + // a dummy one /* * Parameters of the algorithm */ @@ -123,10 +123,10 @@ 
public void setCheckPoint(int checkPoint) { protected int chances;// Number of chances given to each expert PatternStore myHash; // The hashtable int hashSize; - + boolean selfRep = false; boolean binaryHash = false; - + LinkedList panel; Alphabet alphabet; @@ -151,7 +151,7 @@ public ExpertModel(int hashSize, Alphabet alphabet , int context, finalD = new double[Expert.alphabet().size()]; markovD = new double[Expert.alphabet().size()];// Combination of the markov - // experts + // experts this.hashSize = hashSize; this.binaryHash = binaryHash; } @@ -216,7 +216,8 @@ else if ("sft".equals(hashType)) { + "bps" + "\nChances : " + chances + "\nBinaryHash : " + binaryHash + "\nHashType : " + hashName + "\nExpert Type : " - + offSetSeed.getClass()); + //+ offSetSeed.getClass() + ); } // protected abstract void initilise(BioCompSequence[] seqArray);//should @@ -240,7 +241,7 @@ protected void initiliseHash(AbstractSequence[] seqArray) { store(seqArray); } - + public void store(AbstractSequence[] seqArray) { // Store all back ground sequence in the hash for (int sid = 0; sid < seqArray.length - 1; sid++) { @@ -276,14 +277,14 @@ protected void initilise_optimise(AbstractSequence[] seqArray) { adapMarkovEx = new AdaptiveMarkovExpert(1,256); //markovEx.setNext(adapMarkovEx); - + repEx = new CombinationExpert(); panel = new LinkedList(); - + MyHashtable hash = new MyHashtable(hashSize, (int) Math.ceil(JapsaMath.log2(alphabet.size()))); /*************************************************************************/ - - System.out.println("Run first pass"); + + //System.out.println("Run first pass"); for (int sid = 0; sid < seqArray.length; sid++) { for (int i = 0; i < seqArray[sid].length(); i++) { @@ -293,7 +294,7 @@ protected void initilise_optimise(AbstractSequence[] seqArray) { } hash.reinitialise_optimise(); - System.out.println("Finish first pass"); + //System.out.println("Finish first pass"); myHash = hash; store(seqArray); @@ -327,7 +328,7 @@ protected void 
initiliseCommon(AbstractSequence[] seqArray) { markovEx = new MarkovExpert(2); adapMarkovEx = new AdaptiveMarkovExpert(1, 256); - + panel = new LinkedList(); repEx = new CombinationExpert(); @@ -372,7 +373,7 @@ protected void preCoding(byte nextSym) { if (score > baseRateProb / repeatPriorProb){ repEx.getCombDistribution().addWeight(nextSym, score * ptr.probability(nextSym)); - + repSum += score; ptr.resetCounter(); } else { @@ -438,7 +439,7 @@ protected void preCoding() { repEx.getCombDistribution().addWeight(a, score * ptr.probability(a)); } - + repSum += score; ptr.resetCounter(); } else { @@ -478,10 +479,6 @@ protected void resurrectExpert(RepeatExpert e, AbstractSequence bs, int pos, int /** * Update all experts at this positition - * - * @param seqArray - * @param i - * @param sid */ protected void updateExperts(int c) { markovEx.update(c); @@ -524,41 +521,41 @@ protected void postCoding(AbstractSequence[] seqArray, int sid) { if (position > 0) {// Offset expert int id = position >> posSize; - int pos = position % (1 << posSize); - - if (pos <= hashSize) { - continue; - } - - if (pos > hashSize && // Have enough for resurrect - !bitSet.get(currentInd + accLengths[sid] - - accLengths[id] - pos)// Not in there - && pos < seqArray[id].length() - 3) {// have some - // thing to - // predict - e = offSetSeed.duplicate(seqArray[id], pos, - bitSet); - e.setID(currentInd + accLengths[sid] - accLengths[id] - - pos); - } + int pos = position % (1 << posSize); + + if (pos <= hashSize) { + continue; + } + + if (pos > hashSize && // Have enough for resurrect + !bitSet.get(currentInd + accLengths[sid] + - accLengths[id] - pos)// Not in there + && pos < seqArray[id].length() - 3) {// have some + // thing to + // predict + e = offSetSeed.duplicate(seqArray[id], pos, + bitSet); + e.setID(currentInd + accLengths[sid] - accLengths[id] + - pos); + } } else {// Palindrome expert position = -position; // position = position - hashSize; int id = position >> posSize; - int pos = 
position % (1 << posSize); - - if (pos > hashSize && // Have enough for resurrect - (!pBitSet.get(currentInd + accLengths[sid] - + accLengths[id] + pos))// ? - && (pos + 3 < seqArray[id].length())) {// Have - // something - // to - // predict - e = palinSeed.duplicate(seqArray[id], pos - - hashSize + 1, pBitSet); - e.setID(currentInd + accLengths[sid] + accLengths[id] - + pos); - } + int pos = position % (1 << posSize); + + if (pos > hashSize && // Have enough for resurrect + (!pBitSet.get(currentInd + accLengths[sid] + + accLengths[id] + pos))// ? + && (pos + 3 < seqArray[id].length())) {// Have + // something + // to + // predict + e = palinSeed.duplicate(seqArray[id], pos + - hashSize + 1, pBitSet); + e.setID(currentInd + accLengths[sid] + accLengths[id] + + pos); + } } // Add this expert in only if an identical expert not in the // list @@ -610,7 +607,7 @@ public void decode(AbstractSequence[] seqArray, File encodedFile) AbstractSequence seq = seqArray[sid]; ArithDecoder decoder = new ArithDecoder(new BitInput(fileIn)); - + currentInd = 0; while (!decoder.endOfStream()) { int mid = decoder.getCurrentSymbolCount(total); @@ -632,7 +629,7 @@ public void decode(AbstractSequence[] seqArray, File encodedFile) decoder.removeSymbolFromStream((int) (accu * total), (int) ((accu + finalD[actual]) * total), total); seq.setSymbol(currentInd, actual); - + updateExperts(actual); postCoding(seqArray, sid); @@ -653,7 +650,7 @@ public File realEncode(AbstractSequence[] seqArray, String filename) { // Get the sequence to be encode int sid = seqArray.length - 1; AbstractSequence seq = seqArray[seqArray.length - 1]; - + File file = new File(filename); FileOutputStream fileOut = new FileOutputStream(file); @@ -822,7 +819,7 @@ public void encode(AbstractSequence[] seqArray, String infoFile, String markovFi double totalCost = 0.0, totalMarkovCost = 0.0, cost; PrintStream infoPs = new PrintStream(new BufferedOutputStream(new FileOutputStream(infoFile))); PrintStream markovPs = null; - 
+ if (markovFile != null) { markovPs = new PrintStream(new BufferedOutputStream(new FileOutputStream(markovFile))); markovPs.println("#Information content produced using Markov Model by the eXpert Model (XM,DCC'07, doi:10.1109/DCC.2007.7) "); diff --git a/src/main/java/japsa/xm/expert/RepeatExpert.java b/src/main/java/japsa/xm/expert/RepeatExpert.java index 2d35964..e0ba46b 100755 --- a/src/main/java/japsa/xm/expert/RepeatExpert.java +++ b/src/main/java/japsa/xm/expert/RepeatExpert.java @@ -59,7 +59,7 @@ MyBitSet bitSet; protected int expertType = 1;// Copy or palindrome - AbstractSequence seq; + protected AbstractSequence seq; protected int id;// ID to identify it self public int getID() { @@ -92,7 +92,7 @@ public void incrementCounter() { } - RepeatExpert(AbstractSequence seq, int start, MyBitSet b, int type) { + protected RepeatExpert(AbstractSequence seq, int start, MyBitSet b, int type) { super(); this.seq = seq; bitSet = b; @@ -155,4 +155,8 @@ public int getLength() { public int getExpertType() { return expertType; } + + public AbstractSequence getSeq(){ + return seq; + } }