diff --git a/src/dev/java/obsolete/MLSTStrainTyping.java b/src/dev/java/obsolete/MLSTStrainTyping.java deleted file mode 100644 index c40e498..0000000 --- a/src/dev/java/obsolete/MLSTStrainTyping.java +++ /dev/null @@ -1,579 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/************************** REVISION HISTORY ************************** - * 07/09/2014 - Minh Duc Cao: Created - * - ****************************************************************************/ - -package obsolete; - -import japsa.bio.alignment.ProbFSM.Emission; -import japsa.bio.alignment.ProbFSM.ProbThreeSM; -import japsa.bio.np.ErrorCorrection; -import japsa.seq.Alphabet; -import japsa.seq.FastaReader; -import japsa.seq.Sequence; -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.HTSUtilities; -import japsa.util.IntArray; -import japsa.util.Logging; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.SamInputResource; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; - - -/** - * @author minhduc - * - */ -public class MLSTStrainTyping { - - /** - * @param args - * @throws InterruptedException - * @throws Exception - * @throws OutOfMemoryError - */ - - ///////////////////////////////////////////////////////////////////////////// - - ArrayList profileList; - ArrayList geneList; - HashMap geneMap; - HashMap> alignmentMap; - - - int currentReadCount = 0; - long currentBaseCount = 0; - - public String prefix = "tmp"; - public String msa = "kalign"; - public boolean twoDOnly = false; - public int readNumber = 100; - SequenceOutputStream datOS = null; - - - public IntArray hoursArray = null; - public IntArray readCountArray = null; - int arrayIndex = 0; - - public MLSTStrainTyping(){ - - } - - /** - * TODO: make this private - * @param mlstFile - * @throws IOException - */ - - public void readMLSTProfiles(String mlstFile) throws IOException{ - BufferedReader bf = SequenceReader.openFile(mlstFile); - String line = bf.readLine(); - String [] toks = line.trim().split("\t"); - String [] geneNames = Arrays.copyOfRange(toks, 1, toks.length); - //mlstProfiles = new ArrayList(); - - profileList = new ArrayList(); - //do a check - for (int i = 0; i < geneNames.length;i++){ - if (!geneNames[i].equals(geneList.get(i).getName())){ - bf.close(); - throw new RuntimeException("Gene file does not match MLST profile"); - } - } - while ((line = bf.readLine())!=null){ - toks = line.split("\t"); - - GeneProfile profile = new GeneProfile(toks[0]); - for (int i = 0; i < geneNames.length;i++){ - profile.addGene(geneNames[i] + "_" + toks[1 + i]); - } - profileList.add(profile); - }// - bf.close(); - } - /** - * TODO: make it private - * Read genes from gene file to a list + map: for random access - * @param geneFile - * @throws IOException - */ - public void readGenes(String geneFile) throws IOException{ - geneList = SequenceReader.readAll(geneFile, Alphabet.DNA()); - geneMap = new HashMap(); - - for (Sequence gene:geneList){ - geneMap.put(gene.getName(), gene); - } - } - - - private void makeMLSTTyping(int top) throws IOException, InterruptedException{ - if (msa.equals("hmm")) - makeMLSTTyping2(top); - else - makeMLSTTyping1(top); - - } - /** - * Compute the alignment score between a gene and a list of (errornous) reads - * that were aligned to the gene. The algorithm is: - * - Get all read sequences (which were previously trimmed) - * - Call a MSA method to alignment them - * - Make a consensus sequence of those reads - * - Align the consensus sequence to the gene sequence using needle - * - Return the needle alignment score - * @param gene : the gene sequence - * @param readList: an array list of read sequences aligned to this gene - - * @throws IOException - * @throws InterruptedException - */ - - - private void makeMLSTTyping2(int top) throws IOException, InterruptedException{ - //Get consensus read sequence of reads mapped to each gene - - HashMap scoreMap = new HashMap(); - for (GeneProfile profile: profileList){ - profile.score = 0; - for (int i = 0; i < geneList.size(); i++){ - String geneID = geneList.get(i).getName(); - Logging.info("Compute score for " + profile.genes.get(i)); - Double scoreObj = scoreMap.get(geneID + "_" + profile.genes.get(i)); - - if (scoreObj != null){ - Logging.info(" Found catched " + scoreObj); - profile.score += scoreObj; - }else{ - Logging.info(" FSM alignment :"); - ArrayList alignmentList = alignmentMap.get(geneID); - if (alignmentList != null){ - Sequence fa = FastaReader.getReader("geneAlleles/out_" + profile.genes.get(i) + ".fasta").nextSequence(Alphabet.DNA16()); - double thisScore = 0; - - for (Sequence readSeq:alignmentList){ - ProbThreeSM tsm = new ProbThreeSM(fa); - double cost = 1000000000; - - for (int c = 0; c < 10; c++){ - tsm.resetCount(); - Emission retState = tsm.alignGenerative(readSeq); - if (cost <= retState.myCost) - break; - - //if (cost - retState.myCost < 0.5){ - // cost = retState.myCost; - // break; - //} - cost = retState.myCost; - int emitCount = tsm.updateCount(retState); - Logging.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + readSeq.length() + "bp " + readSeq.getName() + " by " + fa.getName()); - tsm.reEstimate(); - } - Logging.info(" Saving " + (readSeq.length() * 2 - cost) + " on " + readSeq.getName() + " by " + fa.getName()); - if (cost < readSeq.length() * 2){ - thisScore += (readSeq.length() * 2 - cost); - } - } - Logging.info(" Computed " + geneID + "_" + profile.genes.get(i) + " : " + thisScore); - scoreMap.put(geneID + "_" + profile.genes.get(i), thisScore); - profile.score += thisScore; //tsm.showProb(); - }//if not null - }//else (null) - }//for i - }//for profile - - Collections.sort(profileList); - - if (top > profileList.size()) - top = profileList.size(); - - System.out.println("============================================== " + currentReadCount); - for (int i = 0; i < top;i++){ - GeneProfile profile = profileList.get(i); - System.out.println(profile.strainID + "\t" + profile.score); - } - } - - private void makeMLSTTyping1(int top) throws IOException, InterruptedException{ - //Get consensus read sequence of reads mapped to each gene - HashMap consensusMap = new HashMap(); - - for (int i = 0; i < geneList.size();i++){ - String geneID = geneList.get(i).getName(); - ArrayList alignmentList = alignmentMap.get(geneID); - Sequence consensus = ErrorCorrection.consensusSequence(alignmentList, prefix + "_" + geneID + "_" + this.currentReadCount, msa); - if (consensus != null){ - consensusMap.put(geneID, consensus); - } - } - - HashMap scoreMap = new HashMap(); - for (GeneProfile profile: profileList){ - profile.score = 0; - for (int i = 0; i < geneList.size(); i++){ - String geneID = geneList.get(i).getName(); - Sequence consensus = consensusMap.get(geneID); - Logging.info("Compute score for " + profile.genes.get(i)); - if (consensus != null){ - Double scoreObj = scoreMap.get(geneID + "_" + profile.genes.get(i)); - if (scoreObj != null){ - profile.score += scoreObj; - Logging.info(" Found catched " + profile.score); - }else{ - - Logging.info(" FSM alignment :"); - Sequence fa = FastaReader.getReader("geneAlleles/out_" + profile.genes.get(i) + ".fasta").nextSequence(Alphabet.DNA16()); - ProbThreeSM tsm = new ProbThreeSM(fa); - - double cost = 100000000; - for (int c = 0; c < 10; c++){ - tsm.resetCount(); - Emission retState = tsm.alignGenerative(consensus); - if (cost <= retState.myCost) - break; - - cost = retState.myCost; - int emitCount = tsm.updateCount(retState); - Logging.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + consensus.length() + "bp " + consensus.getName() + " by " + fa.getName()); - tsm.reEstimate(); - } - - - double thisScore = consensus.length() * 2 - cost; - profile.score += thisScore; - - scoreMap.put(geneID + "_" + profile.genes.get(i), thisScore); - Logging.info(" Computed " + geneID + "_" + profile.genes.get(i) + " : " + thisScore); - //tsm.showProb(); - } - - } - } - } - - Collections.sort(profileList); - - if (top > profileList.size()) - top = profileList.size(); - - System.out.println("============================================== " + currentReadCount); - for (int i = 0; i < top;i++){ - GeneProfile profile = profileList.get(i); - System.out.println(profile.strainID + "\t" + profile.score); - } - } - - private void makeMLSTTyping3(int top) throws IOException, InterruptedException{ - String [] facFiles = new String[geneList.size()]; - - //Get consensus read sequence of reads mapped to each gene - for (int i = 0; i < geneList.size();i++){ - String geneID = geneList.get(i).getName(); - ArrayList alignmentList = alignmentMap.get(geneID); - Sequence consensus = ErrorCorrection.consensusSequence(alignmentList, prefix + "_" + geneID + "_" + this.currentReadCount, msa); - - - if (consensus != null){ - facFiles[i] = prefix + geneID + "_" + this.currentReadCount + "_consensus.fasta";//name of fasta files of reads mapped to the gene - consensus.writeFasta(facFiles[i]); - }else - facFiles[i] = null; - } - - for (GeneProfile profile: profileList){ - profile.score = 0; - for (int i = 0; i < geneList.size(); i++){ - if (facFiles[i] != null){ - - String faAFile = "geneAlleles/out_" + profile.genes.get(i) + ".fasta"; - String needleOut = prefix + profile.genes.get(i) + "_" + this.currentReadCount + "_consensus.needle";; - - File f = new File(needleOut); - if (!f.exists()){//only run if file not exists (this allele has not done before) - String cmd = "needle -gapopen 10 -gapextend 0.5 -asequence " - + faAFile + " -bsequence " + facFiles[i] + " -outfile " + needleOut; - - Logging.info("Running " + cmd); - Process process = Runtime.getRuntime().exec(cmd); - process.waitFor(); - - Logging.info("Run'ed " + cmd ); - } - BufferedReader scoreBf = new BufferedReader(new FileReader(needleOut)); - String scoreLine = null; - - while ((scoreLine = scoreBf.readLine())!=null){ - String [] scoreToks = scoreLine.split(" "); - if (scoreToks.length == 3 && scoreToks[1].equals("Score:")){ - profile.score += Double.parseDouble(scoreToks[2]); - break;//while - } - }//while - scoreBf.close(); - } - } - } - - Collections.sort(profileList); - - if (top > profileList.size()) - top = profileList.size(); - - System.out.println("============================================== " + currentReadCount); - for (int i = 0; i < top;i++){ - GeneProfile profile = profileList.get(i); - System.out.println(profile.strainID + "\t" + profile.score); - } - } - - /** - * Using blastn - * @param top - * @throws IOException - * @throws InterruptedException - */ - private void makeMLSTTyping4(int top) throws IOException, InterruptedException{ - String [] facFiles = new String[geneList.size()]; - - //Get consensus read sequence of reads mapped to each gene - for (int i = 0; i < geneList.size();i++){ - String geneID = geneList.get(i).getName(); - ArrayList alignmentList = alignmentMap.get(geneID); - Sequence consensus = ErrorCorrection.consensusSequence(alignmentList, prefix + "_" + geneID + "_" + this.currentReadCount, msa); - - - if (consensus != null){ - facFiles[i] = prefix + geneID + "_" + this.currentReadCount + "_consensus.fasta";//name of fasta files of reads mapped to the gene - consensus.writeFasta(facFiles[i]); - }else - facFiles[i] = null; - } - - for (GeneProfile profile: profileList){ - profile.score = 0; - for (int i = 0; i < geneList.size(); i++){ - if (facFiles[i] != null){ - - String faAFile = "geneAlleles/out_" + profile.genes.get(i) + ".fasta"; - String needleOut = prefix + profile.genes.get(i) + "_" + this.currentReadCount + "_consensus.needle";; - - File f = new File(needleOut); - if (!f.exists()){//only run if file not exists (this allele has not done before) - String cmd = "needle -gapopen 10 -gapextend 0.5 -asequence " - + faAFile + " -bsequence " + facFiles[i] + " -outfile " + needleOut; - - Logging.info("Running " + cmd); - Process process = Runtime.getRuntime().exec(cmd); - process.waitFor(); - - Logging.info("Run'ed " + cmd ); - } - BufferedReader scoreBf = new BufferedReader(new FileReader(needleOut)); - String scoreLine = null; - - while ((scoreLine = scoreBf.readLine())!=null){ - String [] scoreToks = scoreLine.split(" "); - if (scoreToks.length == 3 && scoreToks[1].equals("Score:")){ - profile.score += Double.parseDouble(scoreToks[2]); - break;//while - } - }//while - scoreBf.close(); - } - } - } - - Collections.sort(profileList); - - if (top > profileList.size()) - top = profileList.size(); - - System.out.println("============================================== " + currentReadCount); - for (int i = 0; i < top;i++){ - GeneProfile profile = profileList.get(i); - System.out.println(profile.strainID + "\t" + profile.score); - } - } - - - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ - public void typing(String bamFile, int top) throws IOException, InterruptedException{ - alignmentMap = new HashMap> (); - - SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); - SamReader samReader; - if ("-".equals(bamFile)) - samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); - else - samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); - - SAMRecordIterator samIter = samReader.iterator(); - - - String readName = ""; - //A dummy sequence - Sequence readSequence = new Sequence(Alphabet.DNA(),1,""); - while (samIter.hasNext()){ - SAMRecord record = samIter.next(); - if (this.twoDOnly && !record.getReadName().contains("twodim")){ - continue; - } - - if (!record.getReadName().equals(readName)){ - readName = record.getReadName(); - - currentReadCount ++; - currentBaseCount += record.getReadLength(); - - if (hoursArray != null){ - if (arrayIndex < hoursArray.size() && currentReadCount >= this.readCountArray.get(arrayIndex)){ - makeMLSTTyping(top); - arrayIndex ++; - } - }else{ - if (currentReadCount % readNumber == 0){ - makeMLSTTyping(top); - } - } - //Get the read - if (!record.getReadUnmappedFlag()){ - readSequence = new Sequence(Alphabet.DNA(), record.getReadString(), readName); - if (record.getReadNegativeStrandFlag()){ - readSequence = Alphabet.DNA.complement(readSequence); - readSequence.setName(readName); - } - } - } - - if (record.getReadUnmappedFlag()) - continue; - //assert: the read sequence is stored in readSequence with the right direction - String geneID = record.getReferenceName(); - if (!geneMap.containsKey(geneID)) - continue; - - int refLength = geneMap.get(geneID).length(); - - - ArrayList alignmentList = alignmentMap.get(geneID); - if (alignmentList == null){ - alignmentList = new ArrayList(); - alignmentMap.put(geneID, alignmentList); - } - //put the sequence into alignment list - - Sequence readSeq = HTSUtilities.spanningSequence(record, readSequence, refLength, 0); - - if (readSeq == null){ - Logging.warn("Read sequence is NULL sequence "); - }else{ - alignmentList.add(readSeq); - } - }//while - samIter.close(); - samReader.close(); - makeMLSTTyping(top); - } - - private static class GeneProfile implements Comparable{ - String strainID; - double score = 0; - ArrayList genes; - - public GeneProfile(String id){ - strainID = id; - genes = new ArrayList(); - } - - public void addGene(String geneID){ - genes.add(geneID); - } - - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(GeneProfile o) { - double comp = score - o.score; - - if (comp < 0) - return 1; - else if (comp > 0) - return -1; - else - return 0; - } - } - - public static String compare(HashSet s1,HashSet s2){ - String ret = ""; - int count1 = 0,count2=0,count=0; - - for (String st:s1){ - if (s2.contains(st)) - count ++; - else{ count1++; - ret = ret + ";" +st; - } - } - ret = ret + "#"; - - for (String st:s2){ - if (!s1.contains(st)){ - count2++; - ret = ret + ";" +st; - } - } - return "Common" + count + "#"+count1+"#"+count2+"#"+ret; - } - -} \ No newline at end of file diff --git a/src/dev/java/obsolete/MLSTStrainTypingCmd.java b/src/dev/java/obsolete/MLSTStrainTypingCmd.java deleted file mode 100644 index 02281ad..0000000 --- a/src/dev/java/obsolete/MLSTStrainTypingCmd.java +++ /dev/null @@ -1,127 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/***************************************************************************** - * Revision History - * 7 Aug 2015 - Minh Duc Cao: Created - * - ****************************************************************************/ -package obsolete; - -import java.io.BufferedReader; -import java.io.IOException; - -import japsa.seq.SequenceReader; -import japsa.util.CommandLine; -import japsa.util.IntArray; -import japsa.util.deploy.Deployable; - -/** - * @author minhduc - * - */ -@Deprecated -@Deployable( - scriptName = "jsa.np.mlstStrainTyping", - scriptDesc = "Strain typing using MLST system" - ) -public class MLSTStrainTypingCmd extends CommandLine{ - public MLSTStrainTypingCmd(){ - super(); - Deployable annotation = getClass().getAnnotation(Deployable.class); - setUsage(annotation.scriptName() + " [options]"); - setDesc(annotation.scriptDesc()); - - addString("mlst", "MLST_profiles.txt", "MLST file"); - addString("bamFile", null, "The bam file"); - addString("geneFile", null, "The gene file"); - - addInt("top", 10, "The number of top strains"); - addString("msa", "kalign", - "Name of the msa method, support poa, kalign, muscle and clustalo"); - addString("tmp", "tmp/t", "Temporary folder"); - addString("hours", null, "The file containging hours against yields, if set will output acording to tiime"); - - addInt("read", 500, "Number of reads before a typing, NA if timestamp is set"); - addBoolean("twodonly", false, "Use only two dimentional reads"); - - - addStdHelp(); - } - - public static void main(String[] args) throws IOException, InterruptedException{ - CommandLine cmdLine = new MLSTStrainTypingCmd(); - args = cmdLine.stdParseLine(args); - /**********************************************************************/ - - String mlst = cmdLine.getStringVal("mlst"); - String bamFile = cmdLine.getStringVal("bamFile"); - String geneFile = cmdLine.getStringVal("geneFile"); - String msa = cmdLine.getStringVal("msa"); - String tmp = cmdLine.getStringVal("tmp"); - String hours = cmdLine.getStringVal("hours"); - - int top = cmdLine.getIntVal("top"); - int read = cmdLine.getIntVal("read"); - - boolean twodonly = cmdLine.getBooleanVal("twodonly"); - - - MLSTStrainTyping paTyping = new MLSTStrainTyping(); - paTyping.msa = msa; - paTyping.prefix = tmp; - - paTyping.twoDOnly = twodonly; - paTyping.readNumber = read; - if (hours !=null){ - BufferedReader bf = SequenceReader.openFile(hours); - String line = bf.readLine();//first line - paTyping.hoursArray = new IntArray(); - paTyping.readCountArray = new IntArray(); - - while ((line = bf.readLine())!= null){ - String [] tokens = line.split("\\s"); - int hrs = Integer.parseInt(tokens[0]); - int readCount = Integer.parseInt(tokens[2]); - - paTyping.hoursArray.add(hrs); - paTyping.readCountArray.add(readCount); - } - } - - - if (paTyping.readNumber < 1) - paTyping.readNumber = 1; - - paTyping.readGenes(geneFile); - paTyping.readMLSTProfiles(mlst); - paTyping.typing(bamFile, top); - - } -} diff --git a/src/dev/java/obsolete/ResistanceGene.java b/src/dev/java/obsolete/ResistanceGene.java deleted file mode 100644 index 0d9fdf0..0000000 --- a/src/dev/java/obsolete/ResistanceGene.java +++ /dev/null @@ -1,599 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/************************** REVISION HISTORY ************************** - * 07/09/2014 - Minh Duc Cao: Created - * - ****************************************************************************/ - -package obsolete; - - -import japsa.bio.alignment.ProbFSM.Emission; -import japsa.bio.alignment.ProbFSM.ProbThreeSM; -import japsa.bio.np.ErrorCorrection; -import japsa.seq.Alphabet; -import japsa.seq.FastaReader; -import japsa.seq.Sequence; -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.HTSUtilities; -import japsa.util.IntArray; -import japsa.util.Logging; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.SamInputResource; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; - -/** - * @author minhduc - * - */ - -public class ResistanceGene { - //TODO: make the below private - public double ilThreshold = 0.9; - HashMap> aroMap;//map a gene to function/annotations - - //Map every gene to a list of reads that align to this gene - HashMap> alignmentMap; - HashMap> samMap; - - HashSet targetGenes; - HashSet targetGroup; - HashSet targetPrimaryGroup; - - - HashMap maxScores = new HashMap(); - - int currentReadCount = 0; - long currentBaseCount = 0; - - public String prefix = "tmp"; - public String msa = "kalign"; - public String global = "needle"; - - public double scoreThreshold = 2; - - public boolean twoDOnly = false; - - ArrayList functions = new ArrayList(); - ArrayList functionNames = new ArrayList(); - public int readNumber = 100; - public SequenceOutputStream datOS = null; - - public IntArray hoursArray = null; - public IntArray readCountArray = null; - int arrayIndex = 0; - - long firstReadTime = 0; - - public ResistanceGene(){ - } - - HashMap> familyMap; - HashMap gene2Group; - HashMap gene2PrimaryGroup; - HashSet groupSet, primaryGroupSet, geneSet; - - ArrayList geneList; - HashMap geneMap; - - - //TODO: make below protected/private - public void getGeneClassInformation(String mcoordFile) throws IOException{ - ArrayList genes = FastaReader.readAll("geneAlleles90.fasta", Alphabet.DNA()); - HashMap myMap = new HashMap(); - - gene2Group = new HashMap(); - gene2PrimaryGroup = new HashMap(); - - familyMap = new HashMap>(); - geneMap = new HashMap(); - geneList = new ArrayList (); - - for (Sequence g:genes){ - myMap.put(g.getName(), g); - } - - - groupSet = new HashSet(); - primaryGroupSet = new HashSet(); - geneSet = new HashSet(); - - - String fn = "GeneClassification.csv"; - - BufferedReader bf = SequenceReader.openFile(fn); - String line = bf.readLine(); - while ((line = bf.readLine())!= null){ - if (line.startsWith("#")) - continue; - - String [] toks = line.split("\t"); - if (!toks[0].equals("0")) - continue; - - String famID = "JSA_"+toks[5]; - String geneID = "JSA_"+toks[5] + "_" + toks[6]; - Sequence gene = myMap.remove(geneID); - - if (gene != null){ - ArrayList fam = familyMap.get(famID); - if (fam == null){//fam not yet entered - fam = new ArrayList(); - familyMap.put(famID, fam); - Sequence famSeq = myMap.get(famID); - if(famSeq == null){ - Logging.exit(" Gene Family " + famID + " not found",1); - } - geneList.add(famSeq); - geneMap.put(famID, famSeq); - gene2Group.put(famID, toks[3]); - gene2PrimaryGroup.put(famID, toks[2]); - - } - fam.add(gene); - gene2Group.put(geneID, toks[3]); - gene2PrimaryGroup.put(geneID, toks[2]); - } - } - bf.close(); - - - groupSet.addAll(gene2Group.values()); - primaryGroupSet.addAll(gene2PrimaryGroup.values()); - - targetGenes = new HashSet(); - targetGroup = new HashSet(); - targetPrimaryGroup = new HashSet(); - bf = new BufferedReader (new FileReader(mcoordFile)); - - - while ((line = bf.readLine()) != null){ - String [] toks = line.trim().split("\t"); - String [] tt = toks[12].split("_"); - String tGeneID = tt[0] + "_" + tt[1]; - - if (Double.parseDouble(toks[10]) >= this.ilThreshold * 100 - && Double.parseDouble(toks[6]) >= this.ilThreshold * 100 - && geneMap.containsKey(tGeneID)){ - targetGenes.add(tGeneID); - targetGroup.add(gene2Group.get(tGeneID)); - targetPrimaryGroup.add(gene2PrimaryGroup.get(tGeneID)); - } - } - - datOS.print("#Target : " + targetGenes.size() + "\n"); - for (String tGeneID: targetGenes){ - datOS.print("#TG " + tGeneID + " " + gene2Group.get(tGeneID) + " " + gene2PrimaryGroup.get(tGeneID)); - datOS.println(); - } - - for (String group:targetGroup){ - datOS.print("#TC " + group); - datOS.println(); - } - for (String group:targetPrimaryGroup){ - datOS.print("#TP " + group); - datOS.println(); - } - - datOS.flush(); - bf.close(); - } - - - int [] targetProfile = null; - public double checkNeedle(String consensusFile, Sequence gene) throws IOException, InterruptedException{ - //Needle the gene - String geneID = gene.getName(); - String faAFile = "geneAlleles/out_" + geneID + ".fasta"; - String needleOut = prefix + geneID + "_" + this.currentReadCount + "_consensus.needle"; - - String cmd = "needle -gapopen 10 -gapextend 0.5 -asequence " - + faAFile + " -bsequence " + consensusFile + " -outfile " + needleOut; - Logging.info("Running " + cmd); - Process process = Runtime.getRuntime().exec(cmd); - process.waitFor(); - Logging.info("Run'ed " + cmd ); - - BufferedReader scoreBf = new BufferedReader(new FileReader(needleOut)); - String scoreLine = null; - double score = 0; - while ((scoreLine = scoreBf.readLine())!=null){ - String [] scoreToks = scoreLine.split(" "); - if (scoreToks.length == 3 && scoreToks[1].equals("Score:")){ - score += Double.parseDouble(scoreToks[2]); - break;//while - } - }//while - scoreBf.close(); - return score / gene.length(); - } - - public double checkHMM(Sequence consensus, Sequence gene){ - - if (gene.length() > 2700 || consensus.length() > 4000 || gene.length() * consensus.length() > 6000000){ - Logging.info("SKIP " + gene.getName() + " " + gene.length() + " vs " + consensus.length()); - return 0; - } - - ProbThreeSM tsmF = new ProbThreeSM(gene); - double cost = 100000000; - for (int c = 0; c < 10; c++){ - tsmF.resetCount(); - Emission retState = tsmF.alignGenerative(consensus); - if (cost <= retState.myCost) - break;//for c - - cost = retState.myCost; - int emitCount = tsmF.updateCount(retState); - Logging.info("Iter " + c + " : " + emitCount + " states and " + cost + " bits " + consensus.length() + "bp " + consensus.getName() + " by " + gene.getName()); - tsmF.reEstimate(); - } - return (consensus.length() * 2 - cost) / gene.length(); - } - - private void addPreditedGene(String geneID){ - pGenes.add(geneID); - pGroup.add(gene2Group.get(geneID)); - pPrimaryGroup.add(gene2PrimaryGroup.get(geneID)); - } - - - HashSet pGenes = new HashSet(); - HashSet pGroup = new HashSet(); - HashSet pPrimaryGroup = new HashSet(); - - - private void antiBioticsProfile() throws IOException, InterruptedException{ - int step = currentReadCount; - if (hoursArray != null) - step = hoursArray.get(arrayIndex); - - Logging.info("STEP " + step); - //Get list of genes from my - for (Sequence gene:geneList){ - String geneID = gene.getName(); - //TODO: to remove - if (pPrimaryGroup.contains(gene2PrimaryGroup.get(geneID))) - continue; - - if (pGenes.contains(geneID)) - continue; - - ArrayList alignmentList = alignmentMap.get(geneID); - Sequence - consensus = ErrorCorrection.consensusSequence(alignmentList, prefix + "_" + geneID + "_" + this.currentReadCount, msa); - - if (consensus == null){ - //Not consider this gene at all - continue;//gene - } - - if (global.equals("hmm")){ - { - double score = checkHMM(consensus, gene); - - Logging.info("REF: " + geneID + " " + score + " " + gene.length() + " " + consensus.length()+ " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID)); - - Double oldScore = maxScores.get(gene2PrimaryGroup.get(geneID)); - if (oldScore == null || oldScore < score){ - maxScores.put(gene2PrimaryGroup.get(geneID), score); - } - - Logging.info("SGF: " + score + " " + geneID + " " + targetGenes.contains(geneID)); - Logging.info("SCF: " + score + " " + gene2Group.get(geneID) + " " + targetGroup.contains(gene2Group.get(geneID))); - Logging.info("SPF: " + score + " " + gene2PrimaryGroup.get(geneID) + " " + targetPrimaryGroup.contains(gene2PrimaryGroup.get(geneID))); - - if (score >= scoreThreshold){ - addPreditedGene(geneID); - Logging.info("ADDF " + geneID + " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID) + " " + step + " " + geneID); - continue;//for gene - } - } - - /*************************************************/ - ArrayList alleleList = familyMap.get(geneID); - for (Sequence allele:alleleList){ - double score = checkHMM(consensus, allele); - Logging.info("REA: " + allele.getName() + " " + score + " " + allele.length() + " " + consensus.length()+ " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID)); - Double oldScore = maxScores.get(gene2PrimaryGroup.get(geneID)); - if (oldScore == null || oldScore < score){ - maxScores.put(gene2PrimaryGroup.get(geneID), score); - } - - - Logging.info("SGA: " + score + " " + geneID + " " + targetGenes.contains(geneID)); - Logging.info("SCA: " + score + " " + gene2Group.get(geneID) + " " + targetGroup.contains(gene2Group.get(geneID))); - Logging.info("SPA: " + score + " " + gene2PrimaryGroup.get(geneID) + " " + targetPrimaryGroup.contains(gene2PrimaryGroup.get(geneID))); - - - if (score >= scoreThreshold){ - addPreditedGene(geneID); - Logging.info("ADDA " + geneID + " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID) + " " + step + " " + allele.getName()); - break;//for allele - } - } - - }else{ - String consensusFile = prefix + "consensus" + geneID + "_" + this.currentReadCount + ".fasta"; - consensus.writeFasta(consensusFile); - { - double score = checkNeedle(consensusFile, gene); - Logging.info("REF: " + geneID + " " + score + " " + gene.length() + " " + consensus.length()+ " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID)); - Double oldScore = maxScores.get(gene2PrimaryGroup.get(geneID)); - if (oldScore == null || oldScore < score){ - maxScores.put(gene2PrimaryGroup.get(geneID), score); - } - - - Logging.info("SGF: " + score + " " + geneID + " " + targetGenes.contains(geneID)); - Logging.info("SCF: " + score + " " + gene2Group.get(geneID) + " " + targetGroup.contains(gene2Group.get(geneID))); - Logging.info("SPF: " + score + " " + gene2PrimaryGroup.get(geneID) + " " + targetPrimaryGroup.contains(gene2PrimaryGroup.get(geneID))); - if (score >= scoreThreshold){ - Logging.info("ADDF " + geneID + " " + gene2Group.get(geneID) + " " + gene2PrimaryGroup.get(geneID) + " " + step + " " + geneID); - addPreditedGene(geneID); - continue;//for gene - } - } - - //Needle every allele - ArrayList alleleList = familyMap.get(geneID); - for (Sequence allele:alleleList){ - double score = checkNeedle(consensusFile, allele); - Logging.info("REA: " + geneID + " " + score + " " + allele.length() + " " + consensus.length() + " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID) ); - - Double oldScore = maxScores.get(gene2PrimaryGroup.get(geneID)); - if (oldScore == null || oldScore < score){ - maxScores.put(gene2PrimaryGroup.get(geneID), score); - } - - Logging.info("SGA: " + score + " " + geneID + " " + targetGenes.contains(geneID)); - Logging.info("SCA: " + score + " " + gene2Group.get(geneID) + " " + targetGroup.contains(gene2Group.get(geneID))); - Logging.info("SPA: " + score + " " + gene2PrimaryGroup.get(geneID) + " " + targetPrimaryGroup.contains(gene2PrimaryGroup.get(geneID))); - - - if (score >= scoreThreshold){ - Logging.info("ADDA " + geneID + " " + gene2Group.get(geneID)+ " " + gene2PrimaryGroup.get(geneID) + " " + step + " " + allele.getName()); - addPreditedGene(geneID); - break;//for allele - } - }//for allele - } - } - - Logging.info("===Found " + pGenes.size() + " vs " + geneList.size() + " " + alignmentMap.size() + " " + targetGenes.size()); - - /********************************************/ - int TP = 0, FN = 0; - for (String geneID: targetGenes){ - if (pGenes.contains(geneID)) - TP ++; - else{ - FN ++; - //System.out.println("FN " + geneID); - - } - }//for geneID - - int FP = pGenes.size() - TP; - - double precision = (TP * 1.0) / (TP + FP); - double recall = (TP * 1.0) / (TP + FN); - double f1 = 2 * precision * recall/ (precision + recall); - /********************************************/ - datOS.print("O\t" + step + "\t" + currentReadCount + "\t" + currentBaseCount + "\t" + f1 + "\t" + recall + "\t" + precision + "\t" + pGenes.size() + "\t" + targetGenes.size()); - datOS.println(); - - datOS.print("G\t" + step + "\t" + currentReadCount + "\t" + currentBaseCount + "\t" + stats(targetGenes, pGenes, familyMap.keySet() ) + "\t" + pGenes.size() + "\t" + targetGenes.size()); - datOS.println(); - - datOS.print("C\t" + step + "\t" + currentReadCount + "\t" + currentBaseCount + "\t" + stats(targetGroup, pGroup, groupSet) + "\t" + pGroup.size() + "\t" + targetGroup.size()); - datOS.println(); - - datOS.print("P\t" + step + "\t" + currentReadCount + "\t" + currentBaseCount + "\t" + stats(targetPrimaryGroup, pPrimaryGroup, primaryGroupSet) + "\t" + pPrimaryGroup.size() + "\t" + targetPrimaryGroup.size()); - datOS.println(); - - //System.out.printf("%d %6.4f %6.4f %6.4f %d %d, %d\n",step,f1,recall, precision, myGenes.size(), targetGenes.size(), currentReadCount); - - for (String x:pGenes){ - datOS.print("#PG\t" + step + "\t" + x + "\t" + targetGenes.contains(x)); - datOS.println(); - } - - for (String x:pGroup){ - datOS.print("#PC\t" + step + "\t" + x + "\t" + targetGroup.contains(x)); - datOS.println(); - } - for (String x:pPrimaryGroup){ - datOS.print("#PP\t" + step + "\t" + x + "\t" + targetPrimaryGroup.contains(x)); - datOS.println(); - } - - - datOS.println(); - datOS.flush(); - - /**************************************************************** - int [] myProfile = new int[functions.size()]; - for (String geneID: myGenes){ - HashSet fList = aroMap.get(geneID); - if (fList != null){ - for (String geneFunction: fList){ - int index = functions.indexOf(geneFunction); - if (index >= 0) - myProfile[index] ++; - } - } - } - - //antibioticProfiles.add(myProfile); - datOS.print(step + "\t" + currentReadCount + "\t" + currentBaseCount); - for (int i =0 ;i < myProfile.length;i++){ - datOS.print("\t" + myProfile[i]); - } - datOS.println(); - datOS.flush(); - /****************************************************************/ - } - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ - public void typing(String bamFile) throws IOException, InterruptedException{ - alignmentMap = new HashMap> (); - - SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); - SamReader samReader; - if ("-".equals(bamFile)) - samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); - else - samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); - - SAMRecordIterator samIter = samReader.iterator(); - - String readName = ""; - //A dummy sequence - Sequence readSequence = new Sequence(Alphabet.DNA(),1,""); - while (samIter.hasNext()){ - SAMRecord record = samIter.next(); - - if (firstReadTime <=0) - firstReadTime = System.currentTimeMillis(); - - - if (this.twoDOnly && !record.getReadName().contains("twodim")){ - continue; - } - - if (!record.getReadName().equals(readName)){ - readName = record.getReadName(); - - currentReadCount ++; - currentBaseCount += record.getReadLength(); - - if (hoursArray != null){ - if (arrayIndex < hoursArray.size() && currentReadCount >= this.readCountArray.get(arrayIndex)){ - antiBioticsProfile(); - arrayIndex ++; - } - }else{ - if (currentReadCount % readNumber == 0){ - antiBioticsProfile(); - } - } - //Get the read - if (!record.getReadUnmappedFlag()){ - readSequence = new Sequence(Alphabet.DNA(), record.getReadString(), readName); - if (record.getReadNegativeStrandFlag()){ - readSequence = Alphabet.DNA.complement(readSequence); - readSequence.setName(readName); - } - } - } - - if (record.getReadUnmappedFlag()) - continue; - //assert: the read sequence is stored in readSequence with the right direction - String geneID = record.getReferenceName(); - if (!geneMap.containsKey(geneID)) - continue; - - int refLength = geneMap.get(geneID).length(); - - ArrayList alignmentList = alignmentMap.get(geneID); - if (alignmentList == null){ - alignmentList = new ArrayList(); - alignmentMap.put(geneID, alignmentList); - } - //put the sequence into alignment list - - Sequence readSeq = HTSUtilities.spanningSequence(record, readSequence, refLength, 20); - - if (readSeq == null){ - Logging.warn("Read sequence is NULL sequence "); - }else{ - alignmentList.add(readSeq); - } - }//while - samIter.close(); - samReader.close(); - antiBioticsProfile(); - //makeStackedChart(); - - for (String pg:maxScores.keySet()){ - double score = maxScores.get(pg); - Logging.info("SCORE: " + pg + " " + score + " " + targetPrimaryGroup.contains(pg)); - } - - DateFormat df = new SimpleDateFormat("dd/MM/yy HH:mm:ss"); - Logging.info("END : " + df.format(Calendar.getInstance().getTime())); - } - private String stats(HashSet a, HashSet p, Collection universe){ - int TP = 0, FP = 0, FN = 0; - for (K s:p){ - if (a.contains(s)) - TP ++;//True positive - else{ - FP ++; - } - } - FN = a.size() - TP; - - int TN = universe.size() - TP - FP - FN; - - double precision = (TP * 1.0) / (TP + FP); - double recall = (TP * 1.0) / (TP + FN); - double specificity = ( TN * 1.0 ) / (TN + FP); - - - double f1 = 2 * precision * recall/ (precision + recall); - double accuracy = (TP + TN + 0.0)/(universe.size()); - - return f1 +"\t" + precision + "\t" + recall + "\t" + specificity + "\t" + accuracy + "\t" + TP + "\t" + TN + "\t" + FP + "\t" + FN; - } -} \ No newline at end of file diff --git a/src/dev/java/obsolete/ResistanceGeneCmd.java b/src/dev/java/obsolete/ResistanceGeneCmd.java deleted file mode 100644 index 9d73fa1..0000000 --- a/src/dev/java/obsolete/ResistanceGeneCmd.java +++ /dev/null @@ -1,147 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/***************************************************************************** - * Revision History - * 7 Aug 2015 - Minh Duc Cao: Created - * - ****************************************************************************/ -package obsolete; - -import java.io.BufferedReader; -import java.io.IOException; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Calendar; - -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.CommandLine; -import japsa.util.IntArray; -import japsa.util.Logging; -import japsa.util.deploy.Deployable; - -/** - * @author minhduc - * - */ -@Deprecated -@Deployable( - scriptName = "jsa.np.resistGenes", - scriptDesc = "Antibiotic resistance genes identification") -public class ResistanceGeneCmd extends CommandLine{ - public ResistanceGeneCmd(){ - super(); - Deployable annotation = getClass().getAnnotation(Deployable.class); - setUsage(annotation.scriptName() + " [options]"); - setDesc(annotation.scriptDesc()); - - addString("output", "output.dat", "Output file"); - addString("bamFile", null, "The bam file"); - //addString("geneFile", null, "The gene file"); - //addString("figure", null, "Figure file"); - addString("mcoordFile", null, "Alignment with dnadiff between the genome (illumina) and genes database"); - - addDouble("scoreThreshold", 1.5, "The alignment score threshold"); - addString("msa", "kalign", - "Name of the msa method, support poa, kalign, muscle and clustalo"); - addString("global", "needle", - "Name of the global method, support needle and hmm"); - addString("tmp", "tmp/t", "Temporary folder"); - addString("hours", null, "The file containging hours against yields, if set will output acording to time"); - addInt("read", 500, "Number of reads before a typing, NA if timestamp is set"); - addInt("timestamp", 0, "Number of seconds between internval"); - addDouble("il", 0.9, "Threshold for Illumina"); - addBoolean("twodonly", false, "Use only two dimentional reads"); - - - addStdHelp(); - } - - public static void main(String[] args) throws IOException, InterruptedException{ - CommandLine cmdLine = new ResistanceGeneCmd(); - args = cmdLine.stdParseLine(args); - - String output = cmdLine.getStringVal("output"); - String bamFile = cmdLine.getStringVal("bam"); - String mcoordFile = cmdLine.getStringVal("mcoordFile"); - String msa = cmdLine.getStringVal("msa"); - String global = cmdLine.getStringVal("global"); - - //String figure = cmdLine.getStringVal("figure"); - String tmp = cmdLine.getStringVal("tmp"); - String hours = cmdLine.getStringVal("hours"); - - double scoreThreshold = cmdLine.getDoubleVal("scoreThreshold"); - int read = cmdLine.getIntVal("read"); - int timestamp = cmdLine.getIntVal("timestamp"); - - //double np = cmdLine.getDoubleVal("np"); - double il = cmdLine.getDoubleVal("il"); - - boolean twodonly = cmdLine.getBooleanVal("twodonly"); - - ResistanceGene paTyping = new ResistanceGene(); - paTyping.msa = msa; - paTyping.global = global; - - paTyping.prefix = tmp; - paTyping.scoreThreshold = scoreThreshold; - paTyping.twoDOnly = twodonly; - paTyping.readNumber = read; - DateFormat df = new SimpleDateFormat("dd/MM/yy HH:mm:ss"); - Logging.info("START : " + df.format(Calendar.getInstance().getTime())); - - if (paTyping.readNumber < 1) - paTyping.readNumber = 1; - - paTyping.datOS = SequenceOutputStream.makeOutputStream(output); - - paTyping.getGeneClassInformation(mcoordFile); - if (hours !=null){ - BufferedReader bf = SequenceReader.openFile(hours); - String line = bf.readLine();//first line - paTyping.hoursArray = new IntArray(); - paTyping.readCountArray = new IntArray(); - - while ((line = bf.readLine())!= null){ - String [] tokens = line.split("\\s"); - int hrs = Integer.parseInt(tokens[0]); - int readCount = Integer.parseInt(tokens[2]); - - paTyping.hoursArray.add(hrs); - paTyping.readCountArray.add(readCount); - } - } - - paTyping.ilThreshold = il; - paTyping.typing(bamFile); - paTyping.datOS.close(); - } -} diff --git a/src/dev/java/obsolete/SpeciesMixtureTyping.java b/src/dev/java/obsolete/SpeciesMixtureTyping.java deleted file mode 100644 index 762cf63..0000000 --- a/src/dev/java/obsolete/SpeciesMixtureTyping.java +++ /dev/null @@ -1,446 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/************************** REVISION HISTORY ************************** - * 07/09/2014 - Minh Duc Cao: Created - * - ****************************************************************************/ - -package obsolete; - -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.DoubleArray; -import japsa.util.IntArray; -import japsa.util.Logging; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.SamInputResource; -import htsjdk.samtools.SamReader; -import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; - -import java.awt.BorderLayout; -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.HashMap; - -import javax.swing.JFrame; -import javax.swing.JLabel; - -import org.jfree.chart.ChartFactory; -import org.jfree.chart.ChartPanel; -import org.jfree.chart.JFreeChart; -import org.jfree.chart.axis.ValueAxis; -import org.jfree.chart.plot.XYPlot; -import org.jfree.chart.renderer.xy.XYErrorRenderer; -import org.jfree.data.xy.YIntervalSeries; -import org.jfree.data.xy.YIntervalSeriesCollection; -import org.rosuda.JRI.REXP; -import org.rosuda.JRI.Rengine; - - -/** - * @author minhduc - * - */ -@Deprecated -public class SpeciesMixtureTyping { - boolean withGUI = false; - public double qual = 0; - - Rengine rengine; - - public IntArray hoursArray = null; - public IntArray readCountArray = null; - - int currentReadCount = 0; - int currentReadAligned = 0; - long currentBaseCount = 0; - - int arrayIndex = 0; - String prefix; - public SequenceOutputStream countsOS; - YIntervalSeriesCollection dataset = new YIntervalSeriesCollection(); - - long firstReadTime = 0; - JLabel timeLabel; - - ///////////////////////////////////////////////////////////////////////////// - - long startTime; - public SpeciesMixtureTyping(boolean withGUI){ - this.withGUI = withGUI; - rengine = new Rengine (new String [] {"--no-save"}, false, null); - if (!rengine.waitForR()){ - Logging.exit("Cannot load R",1); - } - rengine.eval("library(MultinomialCI)"); - rengine.eval("alpha<-0.05"); - - Logging.info("REngine ready"); - - if (withGUI){ - System.setProperty("java.awt.headless", "false"); - - GUITyping myGen = new GUITyping(this); - new Thread(myGen).start(); - //dataset.addSeries(s1); - //dataset.addSeries(s2); - JFreeChart chart = ChartFactory.createTimeSeriesChart( - "Species Typing", - "Time", - "Value", - dataset, - true, - true, - false - ); - final XYPlot plot = chart.getXYPlot(); - plot.setRenderer(new XYErrorRenderer()); - - //System.out.println(chart.getXYPlot().getRenderer().getClass().getCanonicalName()); - - ValueAxis axis = plot.getDomainAxis(); - axis.setAutoRange(true); - axis.setAutoRangeMinimumSize(6000); - - ValueAxis yAxis = plot.getRangeAxis(); - yAxis.setRange(0.0, 1.0); - //axis.set - - //axis.setFixedAutoRange(6000.0); - - JFrame frame = new JFrame("Species Typing"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - ChartPanel label = new ChartPanel(chart); - frame.getContentPane().add(label,BorderLayout.CENTER); - //Suppose I add combo boxes and buttons here later - - timeLabel = new JLabel("Time lapsed : waiting Total reads: " + currentReadCount + " Aligned reads: " + currentReadAligned); - frame.getContentPane().add(timeLabel, BorderLayout.SOUTH); - frame.pack(); - frame.setVisible(true); - } - - startTime = System.currentTimeMillis(); - } - - public void close(){ - rengine.end(); - } - /** - * @param bamFile - * @param geneFile - * @throws IOException - * @throws InterruptedException - */ - static class SpeciesCount implements Comparable{ - String species; - int count = 0; - - SpeciesCount (String s){ - species = s; - } - - /* (non-Javadoc) - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(SpeciesCount o) { - return o.count - count; - } - - } - - - HashMap seq2Species = new HashMap(); - HashMap species2Count = new HashMap(); - ArrayList speciesList = new ArrayList(); - - - public void preTyping(String indexFile)throws IOException{ - BufferedReader bf = SequenceReader.openFile(indexFile); - String line = ""; - while ( (line = bf.readLine())!=null){ - if (line.startsWith("#")) - continue; - - String [] toks = line.split(" "); - String sp = toks[0]; - String seq = toks[1].substring(1); - - if (seq2Species.put(seq, sp) != null) - throw new RuntimeException("sequence " + seq +" presents multiple time"); - - if (species2Count.get(sp) == null){ - species2Count.put(sp,new SpeciesCount(sp)); - } - }//while - bf.close(); - Logging.info(seq2Species.size() + " " + species2Count.size()); - speciesList.addAll(species2Count.keySet()); - - //Write header - countsOS.print("step\treads\tbases\tspecies\tprob\terr\ttAligned\tsAligned\n"); - //for (String species:speciesList){ - // countsOS.print("\t" + species); - //} - //countsOS.println(); - } - - private void simpleAnalysisCurrent(int currentRead) throws IOException{ - int step = currentRead; - if (hoursArray != null) - step = hoursArray.get(arrayIndex); - - int sum = 0; - double [] count = new double[speciesList.size()]; - for (int i = 0; i < count.length;i++){ - count[i] = species2Count.get(speciesList.get(i)).count; - sum += count[i]; - } - DoubleArray countArray = new DoubleArray(); - ArrayList speciesArray = new ArrayList (); - - int minCount = Math.max(1,sum/50); - - for (int i = 0; i < count.length;i++){ - if (count[i] >= minCount){ - countArray.add(count[i]); - speciesArray.add(speciesList.get(i)); - Logging.info(step+" : " + speciesList.get(i) + " == " + count[i]); - } - } - //if (countArray.size() > 10) return; - countArray.add(1); - speciesArray.add("others"); - - rengine.assign("count", countArray.toArray()); - rengine.eval("tab = multinomialCI(count,alpha)"); - REXP tab = rengine.eval("tab",true); - double [][] results = tab.asDoubleMatrix(); - - //countsOS.print(step); - if (simulation > 0){ - long delay = step * 60 * 1000 / simulation - (System.currentTimeMillis() - startTime); - Logging.info("Step " + step + " delay " + delay/1000); - if(delay > 0){ - Logging.info("Step " + step + " delay " + delay/1000); - try { - Thread.sleep(delay); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - } - - for (int i = 0; i < results.length;i++){ - if (results[i][0] <= 0.00001) - continue; - - double mid = (results[i][0] + results[i][1])/2; - double err = mid - results[i][0]; - - countsOS.print(step + "\t" + currentReadCount + "\t" + currentBaseCount + "\t" + speciesArray.get(i).replaceAll("_"," ") + "\t" + mid +"\t" + err + "\t" + this.currentReadAligned + "\t" + count[i]); - countsOS.println(); - } - - countsOS.flush(); - Logging.info(step+" " + countArray.size()); - } - - public int simulation = 0; - private void guiAnalysisCurrent( HashMap speciesSeries){ - - int sum = 0; - double [] count = new double[speciesList.size()]; - for (int i = 0; i < count.length;i++){ - count[i] = species2Count.get(speciesList.get(i)).count; - sum += count[i]; - } - DoubleArray countArray = new DoubleArray(); - ArrayList speciesArray = new ArrayList (); - - for (int i = 0; i < count.length;i++){ - if (count[i] >= sum/50){ - countArray.add(count[i]); - speciesArray.add(speciesList.get(i)); - } - } - - Calendar cal = Calendar.getInstance(); - SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); - Logging.info(sdf.format(cal.getTime()) + " Found " + countArray.size() + " counts"); - if (countArray.size() > 20) return; - - countArray.add(1); - speciesArray.add("others"); - - rengine.assign("count", countArray.toArray()); - rengine.eval("tab = multinomialCI(count,alpha)"); - REXP tab = rengine.eval("tab",true); - double [][] results = tab.asDoubleMatrix(); - - //countsOS.print(step); - long x = System.currentTimeMillis(); - for (int i = 0; i < results.length;i++){ - String species = speciesArray.get(i); - if (species.equals("others")) - continue; - - double mid = (results[i][0] + results[i][1])/2; - YIntervalSeries series = speciesSeries.get(species); - if (series == null){ - series = new YIntervalSeries(species); - speciesSeries.put(species, series); - dataset.addSeries(series); - } - series.add(x, mid, results[i][0], results[i][1]); - //countsOS.print("\t" + mid +"\t" + err); - } - //countsOS.println(); - //countsOS.flush(); - //Logging.info(step+" " + countArray.size()); - } - - public void typing(String bamFile, int readNumber) throws IOException, InterruptedException{ - if (readNumber <= 0) - readNumber = 1; - - - String readName = ""; - //Read the bam file - SamReaderFactory.setDefaultValidationStringency(ValidationStringency.SILENT); - SamReader samReader; - if ("-".equals(bamFile)) - samReader = SamReaderFactory.makeDefault().open(SamInputResource.of(System.in)); - else - samReader = SamReaderFactory.makeDefault().open(new File(bamFile)); - - SAMRecordIterator samIter = samReader.iterator(); - - while (samIter.hasNext()){ - SAMRecord sam = samIter.next(); - if (firstReadTime <=0) - firstReadTime = System.currentTimeMillis(); - - if (!sam.getReadName().equals(readName)){ - readName = sam.getReadName(); - - currentReadCount ++; - currentBaseCount += sam.getReadLength(); - - if (hoursArray != null){ - if (arrayIndex < hoursArray.size() && currentReadCount >= this.readCountArray.get(arrayIndex)){ - simpleAnalysisCurrent(currentReadCount); - arrayIndex ++; - } - }else{ - if (currentReadCount % readNumber == 0){ - simpleAnalysisCurrent(currentReadCount); - } - } - } - - if (sam.getReadUnmappedFlag()){ - continue; - } - - if (sam.getMappingQuality() < this.qual) - continue; - - currentReadAligned ++; - - String refSequence = sam.getReferenceName(); - String species = seq2Species.get(refSequence); - if (species == null){ - throw new RuntimeException(" Can find species with ref " + refSequence + " line " + currentReadCount ); - } - SpeciesCount sCount = species2Count.get(species); - if (sCount == null){ - throw new RuntimeException(" Can find record with species " + species + " line " + currentReadCount ); - } - - synchronized(this) { - sCount.count ++; - } - - }//while - - //final run - simpleAnalysisCurrent(currentReadCount); - - samIter.close(); - samReader.close(); - } - - static class GUITyping implements Runnable { - SpeciesMixtureTyping typing; - public GUITyping (SpeciesMixtureTyping typ){ - this.typing = typ; - } - HashMap speciesSeries = new HashMap(); - - - public void run() { - long lastRun = 0; - while(true) { - long delay = System.currentTimeMillis() - lastRun; - if (delay < 5000){ - try { - Thread.sleep((5000 - delay)); - } catch (InterruptedException ex) { - System.out.println(ex); - } - } - System.out.println("TICK @ Species " + delay); - synchronized(this.typing) {//avoid concurrent update - lastRun = System.currentTimeMillis(); - typing.guiAnalysisCurrent(speciesSeries); - if (typing.firstReadTime > 0){ - long lapsedTime = (System.currentTimeMillis() - typing.firstReadTime) / 1000; - long hours = lapsedTime / 3600; - lapsedTime = lapsedTime % 3600; - long mins = lapsedTime / 60; - lapsedTime = lapsedTime % 60; - typing.timeLabel.setText("Time lapsed : " + hours + ":" + (mins < 10?"0":"") + mins + ":" + (lapsedTime < 10?"0":"") + lapsedTime - + " Total reads: " + typing.currentReadCount + " Aligned reads: " + typing.currentReadAligned); - } - } - - } - } - } -} \ No newline at end of file diff --git a/src/dev/java/obsolete/SpeciesMixtureTypingCmd.java b/src/dev/java/obsolete/SpeciesMixtureTypingCmd.java deleted file mode 100644 index 6459f59..0000000 --- a/src/dev/java/obsolete/SpeciesMixtureTypingCmd.java +++ /dev/null @@ -1,126 +0,0 @@ -/***************************************************************************** - * Copyright (c) Minh Duc Cao, Monash Uni & UQ, All rights reserved. * - * * - * Redistribution and use in source and binary forms, with or without * - * modification, are permitted provided that the following conditions * - * are met: * - * * - * 1. Redistributions of source code must retain the above copyright notice, * - * this list of conditions and the following disclaimer. * - * 2. Redistributions in binary form must reproduce the above copyright * - * notice, this list of conditions and the following disclaimer in the * - * documentation and/or other materials provided with the distribution. * - * 3. Neither the names of the institutions nor the names of the contributors* - * may be used to endorse or promote products derived from this software * - * without specific prior written permission. * - * * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - ****************************************************************************/ - -/***************************************************************************** - * Revision History - * 7 Aug 2015 - Minh Duc Cao: Created - * - ****************************************************************************/ -package obsolete; - -import java.io.BufferedReader; -import java.io.IOException; - -import japsa.seq.SequenceOutputStream; -import japsa.seq.SequenceReader; -import japsa.util.CommandLine; -import japsa.util.IntArray; -import japsa.util.deploy.Deployable; - - -/** - * @author minhduc - * - */ -@Deployable( - scriptName = "jsa.np.speciesTyping", - scriptDesc = "Species typing using Nanopore Sequencing" - ) -@Deprecated -public class SpeciesMixtureTypingCmd extends CommandLine { - - public SpeciesMixtureTypingCmd(){ - super(); - Deployable annotation = getClass().getAnnotation(Deployable.class); - setUsage(annotation.scriptName() + " [options]"); - setDesc(annotation.scriptDesc()); - - addString("output", "output.dat", "Output file"); - addString("bamFile", null, "The bam file"); - addString("indexFile", null, "indexFile "); - addString("hours", null, "The file containging hours against yields, if set will output acording to tiime"); - addBoolean("GUI", false, "Run on GUI"); - addInt("number", 50, "Number of reads"); - addInt("timestamp", 0, "Timestamp to check, if <=0 then use read number instead"); - addInt("sim", 0, "Scale for simulation"); - addDouble("qual", 0, "Minimum alignment quality"); - - addStdHelp(); - } - /** - * @param args - * @throws IOException - * @throws InterruptedException - */ - public static void main(String[] args) throws IOException, InterruptedException { - CommandLine cmdLine = new SpeciesMixtureTypingCmd(); - args = cmdLine.stdParseLine(args); - - /**********************************************************************/ - - String output = cmdLine.getStringVal("output"); - String bamFile = cmdLine.getStringVal("bamFile"); - String indexFile = cmdLine.getStringVal("indexFile"); - String hours = cmdLine.getStringVal("hours"); - boolean GUI = cmdLine.getBooleanVal("GUI"); - int number = cmdLine.getIntVal("number"); - double qual = cmdLine.getDoubleVal("qual"); - - SpeciesMixtureTyping paTyping = new SpeciesMixtureTyping(GUI); - - paTyping.simulation = cmdLine.getIntVal("sim"); - paTyping.qual = qual; - - if (hours !=null){ - BufferedReader bf = SequenceReader.openFile(hours); - String line = bf.readLine();//first line -> ignore - paTyping.hoursArray = new IntArray(); - paTyping.readCountArray = new IntArray(); - - while ((line = bf.readLine())!= null){ - String [] tokens = line.split("\\s+"); - int hrs = Integer.parseInt(tokens[0]); - int readCount = Integer.parseInt(tokens[2]); - - paTyping.hoursArray.add(hrs); - paTyping.readCountArray.add(readCount); - } - bf.close(); - } - - // paTyping.prefix = prefix; - paTyping.countsOS = SequenceOutputStream.makeOutputStream(output); - paTyping.preTyping(indexFile); - paTyping.typing(bamFile, number); - paTyping.countsOS.close(); - paTyping.close(); - - } - -}