From 7aeef30eaa9cc668e7a7acffec3c0e44c558f056 Mon Sep 17 00:00:00 2001 From: Bhuvan Sankar Date: Sat, 22 Apr 2017 23:21:03 +1000 Subject: [PATCH 1/7] with elapsed time --- .classpath | 26 + .gitignore | 1 + .project | 17 + .../bio/hts/clustering/GettingTreadsFromFasta.java | 332 ++++++------- .../bio/hts/clustering/KmeanClustering.java | 544 +++++++++++---------- .../japsadev/bio/hts/clustering/PairDistance.java | 410 ++++++++-------- src/dev/java/japsadev/tools/VNTRClusteringCmd.java | 31 +- 7 files changed, 712 insertions(+), 649 deletions(-) create mode 100644 .classpath create mode 100644 .gitignore create mode 100644 .project diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..9e92c6b --- /dev/null +++ b/.classpath @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ae3c172 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/bin/ diff --git a/.project b/.project new file mode 100644 index 0000000..c012ce4 --- /dev/null +++ b/.project @@ -0,0 +1,17 @@ + + + japsa + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java index bc03da8..6fa2da5 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java +++ b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java @@ -1,164 +1,168 @@ -package japsadev.bio.hts.clustering; - -import java.io.*; -import java.util.*; - - - -/** - * @author buvan.suji - * - */ - -public class GettingTreadsFromFasta { - - private String[] description; - private String[] sequence; - - static ArrayList Rname = new ArrayList(); - static ArrayList TReads = new ArrayList(); - static String FnamePath; - static int sequenceLength; - - public GettingTreadsFromFasta(String filename) { - readSequenceFromFile(filename); - } - - @SuppressWarnings("resource") - void readSequenceFromFile(String file) { - - ArrayList desc = new ArrayList(); - ArrayList seq = new ArrayList(); - BufferedReader in = null; - - try { - in = new BufferedReader(new FileReader(file)); - StringBuffer buffer = new StringBuffer(); - String line = in.readLine(); - - if (line == null) - throw new IOException(file + " is an empty file"); - - if (line.charAt(0) != '>') - throw new IOException("First line of " + file - + " should start with '>'"); - else - desc.add(line); - - for (line = in.readLine().trim(); line != null; line = in - .readLine()) { - if (line.length() > 0 && line.charAt(0) == '>') { - seq.add(buffer.toString()); - buffer = new StringBuffer(); - desc.add(line); - } else - buffer.append(line.trim()); - } - if (buffer.length() != 0) - seq.add(buffer.toString()); - } catch (IOException e) { - System.out.println("Error when reading " + file); - e.printStackTrace(); - } - - description = new String[desc.size()]; - sequence = new String[seq.size()]; - for (int i = 0; i < seq.size(); i++) { - description[i] = (String) desc.get(i); - sequence[i] = (String) seq.get(i); - } - sequenceLength = seq.size(); - - } - - // return first sequence as a String - public String getSequence() { - return sequence[0]; - } - - // return first xdescription as String - public String getDescription() { - return description[0]; - } - - // return sequence as a String - public String getSequence(int i) { - return sequence[i]; - } - - // return description as String - public String getDescription(int i) { - return description[i]; - } - - public int size() { - return sequence.length; - } - - public static void FileReading(String filename) { - GettingTreadsFromFasta fsf = new GettingTreadsFromFasta( - filename); - StringBuffer buffer1 = new StringBuffer(); - - for (int i = 0; i < fsf.size(); i++) { - String temp = ((buffer1.append(fsf.getDescription(i))) - .deleteCharAt(0)).toString(); - Rname.add(temp); - TReads.add(fsf.getSequence(i)); - buffer1 = new StringBuffer(); - } - } - - // call this method-1 - public static void DestReads() throws Exception { - String fname = ""; - System.out.print("Enter the name of the FastaFile:"); - fname = (new BufferedReader(new InputStreamReader(System.in))) - .readLine(); - FnamePath = new File(fname).getName(); - // System.out.println(FnamePath); - FileReading(fname); - } - - public static void main(String[] args) throws Exception { - DestReads(); - ViewList(); - } - - // call this method-2 - public static ArrayList GetRname() { - return Rname; - } - - // call this method-3 - public static ArrayList GetTReads() { - return TReads; - } - - public static int NumberReads() { - return TReads.size(); - } - - // call this method-4 - public static String GetFileName() { - return FnamePath; - } - - public static int SeqLength() { - return sequenceLength; - } - - public static int NelementsClustering() { - return sequenceLength * (sequenceLength - 1) / 2; - } - - public static void ViewList() { - System.out.println(GetRname()); - // System.out.println(GetTReads()); - System.out.println(SeqLength()); - System.out.println(NelementsClustering()); - } - - - -} +package japsadev.bio.hts.clustering; + +import java.io.*; +import java.util.*; + + + + +/** + * @author buvan.suji + * + */ + +public class GettingTreadsFromFasta { + + private String[] description; + private String[] sequence; + + static ArrayList Rname = new ArrayList(); + static ArrayList TReads = new ArrayList(); + static String FnamePath; + static int sequenceLength; + + public GettingTreadsFromFasta(String filename) { + readSequenceFromFile(filename); + } + + @SuppressWarnings("resource") + void readSequenceFromFile(String file) { + + ArrayList desc = new ArrayList(); + ArrayList seq = new ArrayList(); + BufferedReader in = null; + + try { + in = new BufferedReader(new FileReader(file)); + StringBuffer buffer = new StringBuffer(); + String line = in.readLine(); + + if (line == null) + throw new IOException(file + " is an empty file"); + + if (line.charAt(0) != '>') + throw new IOException("First line of " + file + + " should start with '>'"); + else + desc.add(line); + + for (line = in.readLine().trim(); line != null; line = in + .readLine()) { + if (line.length() > 0 && line.charAt(0) == '>') { + seq.add(buffer.toString()); + buffer = new StringBuffer(); + desc.add(line); + } else + buffer.append(line.trim()); + } + if (buffer.length() != 0) + seq.add(buffer.toString()); + } catch (IOException e) { + System.out.println("Error when reading " + file); + e.printStackTrace(); + } + + description = new String[desc.size()]; + sequence = new String[seq.size()]; + for (int i = 0; i < seq.size(); i++) { + description[i] = (String) desc.get(i); + sequence[i] = (String) seq.get(i); + } + sequenceLength = seq.size(); + + } + + // return first sequence as a String + public String getSequence() { + return sequence[0]; + } + + // return first xdescription as String + public String getDescription() { + return description[0]; + } + + // return sequence as a String + public String getSequence(int i) { + return sequence[i]; + } + + // return description as String + public String getDescription(int i) { + return description[i]; + } + + public int size() { + return sequence.length; + } + + public static void FileReading(String filename) { + GettingTreadsFromFasta fsf = new GettingTreadsFromFasta( + filename); + StringBuffer buffer1 = new StringBuffer(); + + for (int i = 0; i < fsf.size(); i++) { + String temp = ((buffer1.append(fsf.getDescription(i))) + .deleteCharAt(0)).toString(); + Rname.add(temp); + TReads.add(fsf.getSequence(i)); + buffer1 = new StringBuffer(); + } + } + + // call this method-1 + public static void DestReads() throws Exception { + /*String fname = ""; + System.out.print("Enter the name of the FastaFile:"); + fname = (new BufferedReader(new InputStreamReader(System.in))) + .readLine(); + FnamePath = new File(fname).getName();*/ + + // System.out.println(FnamePath); + //FileReading(fname); + FnamePath = "chr2250781022_50785963.fasta"; + FileReading(FnamePath); + } + + public static void main(String[] args) throws Exception { + DestReads(); + ViewList(); + } + + // call this method-2 + public static ArrayList GetRname() { + return Rname; + } + + // call this method-3 + public static ArrayList GetTReads() { + return TReads; + } + + public static int NumberReads() { + return TReads.size(); + } + + // call this method-4 + public static String GetFileName() { + return FnamePath; + } + + public static int SeqLength() { + return sequenceLength; + } + + public static int NelementsClustering() { + return sequenceLength * (sequenceLength - 1) / 2; + } + + public static void ViewList() { + System.out.println(GetRname()); + // System.out.println(GetTReads()); + System.out.println(SeqLength()); + System.out.println(NelementsClustering()); + } + + + +} diff --git a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java index 573422b..825174e 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java +++ b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java @@ -1,265 +1,279 @@ -package japsadev.bio.hts.clustering; - -import java.util.ArrayList; - -/** - * @author buvan.suji - * - */ - -public class KmeanClustering { - - - /** - * @param args - */ - - static int count1,count2,count3; - static int d[]; - static int k[][]; - static int tempk[][]; - static int m[]; - static double diff[]; - static int n; - final static int p =2;//number of clusters - static double max = 0; - static double temp1 = 0; - static int index1; - static int index2; - static ArrayList t = new ArrayList() ; - - - static ArrayList descript = new ArrayList(); - static ArrayList reads = new ArrayList(); - static int Nreads; - static String FileName; - static int NumberElements; - static int ClustElements; - static ArrayList distList1 = new ArrayList(); - static ArrayList> distList2 = new ArrayList>(); - - - public static void Clustering() throws Exception{ - GettingTreadsFromFasta.DestReads(); - descript = GettingTreadsFromFasta.GetRname(); - reads = GettingTreadsFromFasta.GetTReads(); - Nreads = GettingTreadsFromFasta.NumberReads(); - FileName = GettingTreadsFromFasta.GetFileName(); - NumberElements = GettingTreadsFromFasta.SeqLength(); - ClustElements = GettingTreadsFromFasta.NelementsClustering(); - - double[][] table = new double[Nreads][Nreads]; - - for (int i=0; im[i]){ - diff[i] = list[m[i]][a]; - } - else{ - diff[i]=0; - } - } - - int val=0; - double temp=diff[0]; - for(int i=0;i s = new ArrayList(); - - for(int x1 = 0; x1t.get(x2)){ - sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); - } - else{ - sum = sum+0; - } - } - //System.out.println(sum); - - s.add(sum/t.size()); - } - //System.out.println(s.size()); - double min=s.get(0); - int d2 = 0; - for(int x3=1;x3s.get(x3)){ - min = s.get(x3); - d2 = x3; - } - } - - - m[i]=t.get(d2); - s.clear(); - t.clear(); - - } - } - -//This checks if previous k ie. tempk and current k are same.Used as terminating case. - static int VerifyEqual(){ - for(int i=0;i descript = new ArrayList(); + ArrayList reads = new ArrayList(); + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + GettingTreadsFromFasta.DestReads(); + descript = GettingTreadsFromFasta.GetRname(); + reads = GettingTreadsFromFasta.GetTReads(); + Nreads = GettingTreadsFromFasta.NumberReads(); + FileName = GettingTreadsFromFasta.GetFileName(); + NumberElements = GettingTreadsFromFasta.SeqLength(); + ClustElements = GettingTreadsFromFasta.NelementsClustering(); + + double[][] table = new double[Nreads][Nreads]; + + for (int i=0; im[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. + static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j parentMap = new HashMap<>(); - - for (int i = 1; i <= m; ++i) { - d[i][0] = i; - } - - for (int j = 1; j <= n; ++j) { - d[0][j] = j; - } - - //System.out.println("initial="+d[0][0]); - - - for (int j = 1; j <= n; ++j) { - for (int i = 1; i <= m; ++i) { - /*if(s.charAt(j-1)==z.charAt(i-1)){ - final int delta = 0; - } - else{ - if(s.charAt(j-1)== z.charAt(i-1)) - }*/ - final int delta = (s.charAt(j - 1) == z.charAt(i - 1)) ? 0 : 1; - - int tentativeDistance = d[i - 1][j] + 2;//gap penalty - EditOperation editOperation = EditOperation.INSERT; - - if (tentativeDistance > d[i][j - 1] + 2) { - tentativeDistance = d[i][j - 1] + 2; - editOperation = EditOperation.DELETE; - } - - if (tentativeDistance > d[i - 1][j - 1] + delta) { - tentativeDistance = d[i - 1][j - 1] + delta; - editOperation = EditOperation.SUBSTITUTE; - } - - d[i][j] = tentativeDistance; - //System.out.println("inloop d("+i+", "+j+")="+d[i][j]); - - switch (editOperation) { - case SUBSTITUTE: - parentMap.put(new Point(i, j), new Point(i - 1, j - 1)); - break; - - case INSERT: - parentMap.put(new Point(i, j), new Point(i - 1, j)); - break; - - case DELETE: - parentMap.put(new Point(i, j), new Point(i, j - 1)); - break; - default: - break; - } - } - } - - final StringBuilder topLineBuilder = new StringBuilder(n + m); - final StringBuilder bottomLineBuilder = new StringBuilder(n + m); - final StringBuilder editSequenceBuilder = new StringBuilder(n + m); - Point current = new Point(m, n); - - while (true) { - Point predecessor = parentMap.get(current); - - if (predecessor == null) { - break; - } - - if (current.x != predecessor.x && current.y != predecessor.y) { - final char schar = s.charAt(predecessor.y); - final char zchar = z.charAt(predecessor.x); - - topLineBuilder.append(schar); - bottomLineBuilder.append(zchar); - editSequenceBuilder.append(schar != zchar ? - EditOperation.SUBSTITUTE : - EditOperation.MATCH); - } else if (current.x != predecessor.x) { - topLineBuilder.append(GAP); - bottomLineBuilder.append(z.charAt(predecessor.x)); - editSequenceBuilder.append(EditOperation.INSERT); - } else { - topLineBuilder.append(s.charAt(predecessor.y)); - bottomLineBuilder.append(GAP); - editSequenceBuilder.append(EditOperation.DELETE); - } - - current = predecessor; - } - - // Remove the last characters that correspond to the very beginning - // of the alignments and edit sequence (since the path reconstruction - // proceeds from the "end" to the "beginning" of the distance matrix. - topLineBuilder .deleteCharAt(topLineBuilder.length() - 1); - bottomLineBuilder .deleteCharAt(bottomLineBuilder.length() - 1); - editSequenceBuilder.deleteCharAt(editSequenceBuilder.length() - 1); - - // Our result data is backwards, reverse them. - topLineBuilder .reverse(); - bottomLineBuilder .reverse(); - editSequenceBuilder.reverse(); - - return new EditDistanceResult(d[m][n], - editSequenceBuilder.toString(), - topLineBuilder.toString(), - bottomLineBuilder.toString()); - } - - - - public static void main(String[] args) throws IOException { - EditDistanceResult result = compute("ACTAGGTTA", "TAAGGCTCAAT"); - System.out.println("Distance: " + result.getDistance()); - System.out.println("Edit sequence: " + result.getEditSequence()); - System.out.println("Alignment:"); - System.out.println(result.getTopAlignmentRow()); - System.out.println(result.getBottomAlignmentRow()); - - //S1 = "ATTCGGCATCA" - //S2 = "ACTAGGTTA" - //S3 = "GGTAAGGATTGCC" - //S4 = "CTAGGATCCAG" - //S5 = "TAAGGCTCAAT" - //S6 = "CTACTGCAAG" - //S7 = "TTTAGGAGCTCA" - //S8 = "ACCTGACCTTG" - //S9 = "GGTTCAGTTCC" - } - - - - -} +package japsadev.bio.hts.clustering; + +import java.awt.Point; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + + +/** + * @author buvan.suji + * + */ + +public class PairDistance { + + public static final String GAP = "-"; + + public static final class EditDistanceResult { + private final int distance; + private final String editSequence; + private final String topAlignmentRow; + private final String bottomAlignmentRow; + + EditDistanceResult(final int distance, + final String editSequence, + final String topAlignmentRow, + final String bottomAlignmentRow) { + this.distance = distance; + this.editSequence = editSequence; + this.topAlignmentRow = topAlignmentRow; + this.bottomAlignmentRow = bottomAlignmentRow; + } + + public int getDistance() { + return distance; + } + + public String getEditSequence() { + return editSequence; + } + + public String getTopAlignmentRow() { + return topAlignmentRow; + } + + public String getBottomAlignmentRow() { + return bottomAlignmentRow; + } + } + + private static enum EditOperation { + INSERT ("I"), + SUBSTITUTE ("S"), + DELETE ("D"), + MATCH ("M"); + + private final String s; + + private EditOperation(String s) { + this.s = s; + } + + @Override + public String toString() { + return s; + } + } + + public static EditDistanceResult compute(String s, String z) { + // This is required to keep the parent map invariant. + // Otherwise the very first edit operation would not end up in the output. + s = "\u0000" + s; + z = "\u0000" + z; + + final int n = s.length(); + final int m = z.length(); + final int[][] d = new int[m + 1][n + 1]; + final Map parentMap = new HashMap<>(); + + for (int i = 1; i <= m; ++i) { + d[i][0] = i; + } + + for (int j = 1; j <= n; ++j) { + d[0][j] = j; + } + + + + + for (int j = 1; j <= n; ++j) { + for (int i = 1; i <= m; ++i) { + final int delta = (s.charAt(j - 1) == z.charAt(i - 1)) ? 0 : 1; + + int tentativeDistance = d[i - 1][j] + 2;//gap penalty + EditOperation editOperation = EditOperation.INSERT; + + if (tentativeDistance > d[i][j - 1] + 2) { + tentativeDistance = d[i][j - 1] + 2; + editOperation = EditOperation.DELETE; + } + + if (tentativeDistance > d[i - 1][j - 1] + delta) { + tentativeDistance = d[i - 1][j - 1] + delta; + editOperation = EditOperation.SUBSTITUTE; + } + + d[i][j] = tentativeDistance; + + + switch (editOperation) { + case SUBSTITUTE: + parentMap.put(new Point(i, j), new Point(i - 1, j - 1)); + break; + + case INSERT: + parentMap.put(new Point(i, j), new Point(i - 1, j)); + break; + + case DELETE: + parentMap.put(new Point(i, j), new Point(i, j - 1)); + break; + default: + break; + } + } + } + + final StringBuilder topLineBuilder = new StringBuilder(n + m); + final StringBuilder bottomLineBuilder = new StringBuilder(n + m); + final StringBuilder editSequenceBuilder = new StringBuilder(n + m); + Point current = new Point(m, n); + + while (true) { + Point predecessor = parentMap.get(current); + + if (predecessor == null) { + break; + } + + if (current.x != predecessor.x && current.y != predecessor.y) { + final char schar = s.charAt(predecessor.y); + final char zchar = z.charAt(predecessor.x); + + topLineBuilder.append(schar); + bottomLineBuilder.append(zchar); + editSequenceBuilder.append(schar != zchar ? + EditOperation.SUBSTITUTE : + EditOperation.MATCH); + } else if (current.x != predecessor.x) { + topLineBuilder.append(GAP); + bottomLineBuilder.append(z.charAt(predecessor.x)); + editSequenceBuilder.append(EditOperation.INSERT); + } else { + topLineBuilder.append(s.charAt(predecessor.y)); + bottomLineBuilder.append(GAP); + editSequenceBuilder.append(EditOperation.DELETE); + } + + current = predecessor; + } + + // Remove the last characters that correspond to the very beginning + // of the alignments and edit sequence (since the path reconstruction + // proceeds from the "end" to the "beginning" of the distance matrix. + topLineBuilder .deleteCharAt(topLineBuilder.length() - 1); + bottomLineBuilder .deleteCharAt(bottomLineBuilder.length() - 1); + editSequenceBuilder.deleteCharAt(editSequenceBuilder.length() - 1); + + // Our result data is backwards, reverse them. + topLineBuilder .reverse(); + bottomLineBuilder .reverse(); + editSequenceBuilder.reverse(); + + return new EditDistanceResult(d[m][n], + editSequenceBuilder.toString(), + topLineBuilder.toString(), + bottomLineBuilder.toString()); + } + + + + public static void main(String[] args) throws IOException { + EditDistanceResult result = compute("ACTAGGTTA", "TAAGGCTCAAT"); + System.out.println("Distance: " + result.getDistance()); + System.out.println("Edit sequence: " + result.getEditSequence()); + System.out.println("Alignment:"); + System.out.println(result.getTopAlignmentRow()); + System.out.println(result.getBottomAlignmentRow()); + + + } + + + + +} diff --git a/src/dev/java/japsadev/tools/VNTRClusteringCmd.java b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java index 01d77f5..4063864 100644 --- a/src/dev/java/japsadev/tools/VNTRClusteringCmd.java +++ b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java @@ -1,5 +1,5 @@ /***************************************************************************** - * Copyright (c) Bhuvana, email + * Copyright (c) Bhuvaneswari Thirugnanasambandham, buvan.suji@gmail.com * All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * @@ -36,14 +36,15 @@ + import japsa.util.CommandLine; import japsa.util.deploy.Deployable; +import japsadev.bio.hts.clustering.KmeanClustering; -import java.io.IOException; /** - * @author your name + * @author Bhuvaneswari Thirugnanasambandham * */ @Deployable( @@ -53,6 +54,8 @@ public class VNTRClusteringCmd extends CommandLine{ //CommandLine cmdLine; public VNTRClusteringCmd(){ + + super(); Deployable annotation = getClass().getAnnotation(Deployable.class); setUsage(annotation.scriptName() + " [options]"); @@ -62,18 +65,32 @@ public VNTRClusteringCmd(){ addString("output", "-", "Output file"); addStdHelp(); + + + + } - public static void main(String [] args) throws IOException{ + public static void main(String [] args) throws Exception{ CommandLine cmdLine = new VNTRClusteringCmd(); args = cmdLine.stdParseLine(args); + /*String[] s = args; + System.out.println(s);*/ + + /**********************************************************************/ String input = cmdLine.getStringVal("input"); - String output = cmdLine.getStringVal("output"); - ////YOUR CODE GOES HERE - System.out.println("Hello world input is " + input); + String output= cmdLine.getStringVal("output"); + ////YOUR CODE GOES HERE + /*System.out.println("Hello world input is " + input); + System.out.println("Testing this statement");*/ //////////// + + KmeanClustering cluster = new KmeanClustering(); + cluster.Clustering(); + + } } From 52d3fac615b615022743787b5f1dcc994c694cfc Mon Sep 17 00:00:00 2001 From: Minh Duc Cao Date: Mon, 24 Apr 2017 20:50:36 +1000 Subject: [PATCH 2/7] improve the construction of gene database --- .../java/japsa/bio/BuildSequenceGroupDatabase.java | 22 +++--- src/main/java/japsa/bio/gene/GeneDatabase.java | 79 ++++++++-------------- 2 files changed, 40 insertions(+), 61 deletions(-) diff --git a/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java index 19a395f..dc46298 100644 --- a/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java +++ b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java @@ -128,7 +128,8 @@ public void cleanUp() throws IOException, InterruptedException{ * @throws IOException * @throws InterruptedException */ - public HashMap addGeneMap(final Map seqs, boolean checkGeneID) throws IOException, InterruptedException{ + public HashMap addGeneMap(final Map seqs, boolean checkGeneID) + throws IOException, InterruptedException{ HashMap strMap = new HashMap (); //0: initialise grouping within the new sequences @@ -243,6 +244,7 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info("Iteration " + iteration + " of " + countI + " sequences " + new Date()); //JapsaTimer.systemInfo(); + //TODO: 1. think of a way to perform alignment `in memory' instead of write to bwa (may be use jalign) Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA2.sh"); process.waitFor(); @@ -289,25 +291,27 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info(" 2 Grouping " + geneDatabase.size()); //JapsaTimer.systemInfo(); - //2.Try to add to the existing groups + //2.Try to add to the existing groups. Align the rep from each group to each of the family rep + //if match, add the whole group to the family int G = 0, GG = 0; if (geneDatabase.size() > 0){ - geneDatabase.write2File(fFile, false); + //geneDatabase.write2File(fFile, false); //creat a map of current reps - SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fFile); HashMap repMap = new HashMap(); for (GeneDatabase.GeneFamily family:geneDatabase){ Sequence rep = family.represetationSequence(); repMap.put(rep.getName(),rep); + rep.writeFasta(sos); } sos.close(); //Run bwa //LOG.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); - Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); + //TODO 2: Read this directly from stdout of bwa + Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); process.waitFor(); //Read the sam file @@ -338,14 +342,14 @@ public void cleanUp() throws IOException, InterruptedException{ continue; } - Sequence refSeq = family.represetationSequence(); + Sequence refSeq = repMap.get(refName); if (refSeq == null){ LOG.error("ERROR 6: rep for family " + refName + " not found!"); } - + if (isSimilar(refSeq, readSeq, sam)){ + //asign all reads in this set to the family ArrayList readSet = setMap.get(readName); - for (String key:readSet){ //if (strMap.containsKey(key)){ // LOG.error("ERROR 1 : " + key); @@ -367,6 +371,7 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info(" 3 Grouping " + geneDatabase.size() + " " + seqs.size()); //JapsaTimer.systemInfo(); + //IF there are any groups left, add each of them as a family for (Sequence seq:seqs.values()){ String seqName = seq.getName(); ArrayList tSet = setMap.get(seqName); @@ -393,7 +398,6 @@ public void cleanUp() throws IOException, InterruptedException{ } tSet.clear(); } - LOG.info("Manage to add " + G + " and " + GG + " " + new Date()); return strMap; } diff --git a/src/main/java/japsa/bio/gene/GeneDatabase.java b/src/main/java/japsa/bio/gene/GeneDatabase.java index b7edd1e..679f5b6 100644 --- a/src/main/java/japsa/bio/gene/GeneDatabase.java +++ b/src/main/java/japsa/bio/gene/GeneDatabase.java @@ -38,25 +38,27 @@ import japsa.seq.SequenceReader; import japsa.seq.Alphabet.DNA; import japsa.seq.SequenceOutputStream; -import japsa.util.Logging; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -public class GeneDatabase implements Iterable{ - ArrayList geneFamilies; +public class GeneDatabase extends ArrayList{ + private static final Logger LOG = LoggerFactory.getLogger(GeneDatabase.class); String dbID = "JSA"; public GeneDatabase(){ - geneFamilies = new ArrayList(); + super(); } public String addNewFamily(Sequence seq){ GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(size()); String geneID = newFam.addSequence(seq); - geneFamilies.add(newFam); + add(newFam); return geneID; } @@ -68,7 +70,7 @@ public String addNewFamily(Sequence seq){ */ public void write2File(String fileName, boolean includeAlleles) throws IOException{ SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fileName); - for (GeneDatabase.GeneFamily family:geneFamilies){ + for (GeneDatabase.GeneFamily family:this){ Sequence rep = family.represetationSequence(); rep.writeFasta(sos); if (includeAlleles){ @@ -98,13 +100,13 @@ public static GeneDatabase readDB(String fileName) throws IOException{ if (toks.length == 2){ //rep GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(db.size()); - db.geneFamilies.add(newFam); + db.add(newFam); }else if (toks.length == 3){ //a gene GeneDatabase.GeneFamily fam = db.getFamily(Integer.parseInt(toks[1])); - fam.geneAlleles.add(seq); + fam.add(seq); }else{ - Logging.error("Unknown sequence " + seq.getName()); + LOG.error("Unknown sequence " + seq.getName()); } } reader.close(); @@ -131,7 +133,7 @@ public static GeneDatabase readDB(String fileName) throws IOException{ //doing nothing } - return geneFamilies.get(fID); + return get(fID); } /** @@ -141,44 +143,25 @@ public static GeneDatabase readDB(String fileName) throws IOException{ */ public GeneDatabase.GeneFamily getFamily(int famID){ - if (famID >= geneFamilies.size() || famID < 0){ + //TODO: Check if really need to validate famID + if (famID >= size() || famID < 0){ return null; } - return geneFamilies.get(famID); + return get(famID); } - - - /** - * Return the number of families in the database - * @return - */ - public int size(){ - return geneFamilies.size(); - } - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneFamilies.iterator(); - - } ///////////////////////////////////////////////////////////////////////// - public static class GeneFamily implements Iterable{ + public static class GeneFamily extends ArrayList{ private final int fID; - private ArrayList geneAlleles;//known instance of this family - //Sequence rep = null;//The representation of this gene family - + int repIndex = -1; String desc = ""; public GeneFamily(int id){ + super(); fID = id; - geneAlleles = new ArrayList(); } /** @@ -202,14 +185,14 @@ public String familyID(){ } public Sequence represetationSequence(){ - Sequence rep = geneAlleles.get(repIndex).clone(); - rep.setDesc(desc + ";index=" +repIndex + ";size=" + geneAlleles.size()); + Sequence rep = get(repIndex).clone(); + rep.setDesc(desc + ";index=" +repIndex + ";size=" + size()); rep.setName(familyID()); return rep; } private void updateRep(int newIndex){ - if (repIndex < 0 || geneAlleles.get(newIndex).length() > geneAlleles.get(repIndex).length()){ + if (repIndex < 0 || get(newIndex).length() > get(repIndex).length()){ repIndex = newIndex; } } @@ -222,8 +205,9 @@ private void updateRep(int newIndex){ */ public String addSequence(Sequence seq){ //This allele already in the database - for (int i = 0; i < geneAlleles.size(); i++){ - Sequence eSeq = geneAlleles.get(i); + //TODO: Need to see if this feature is really needed + for (int i = 0; i < size(); i++){ + Sequence eSeq = get(i); if (eSeq.match(seq) == 0) return eSeq.getName(); if (eSeq.match(DNA.complement(seq)) == 0) @@ -232,19 +216,10 @@ public String addSequence(Sequence seq){ Sequence nSeq = seq.clone(); nSeq.setDesc(nSeq.getName() + " " + nSeq.getDesc()); - nSeq.setName(familyID() + "_" + (geneAlleles.size())); - geneAlleles.add(nSeq); - updateRep(geneAlleles.size() - 1); + nSeq.setName(familyID() + "_" + (size())); + add(nSeq); + updateRep(size() - 1); return nSeq.getName(); } - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneAlleles.iterator(); - } } - } \ No newline at end of file From 4721f13fbdabc782bf7f6c007e6983562438fd9c Mon Sep 17 00:00:00 2001 From: Bhuvan Sankar Date: Mon, 24 Apr 2017 21:03:14 +1000 Subject: [PATCH 3/7] updating txt file --- .../bio/hts/clustering/GettingTreadsFromFasta.java | 11 +- .../bio/hts/clustering/KmeanClustering.java | 281 +++++++++++---------- 2 files changed, 159 insertions(+), 133 deletions(-) diff --git a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java index 6fa2da5..f175847 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java +++ b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java @@ -15,11 +15,13 @@ private String[] description; private String[] sequence; + static ArrayList Rname = new ArrayList(); static ArrayList TReads = new ArrayList(); static String FnamePath; static int sequenceLength; + static String fle; public GettingTreadsFromFasta(String filename) { readSequenceFromFile(filename); @@ -111,7 +113,8 @@ public static void FileReading(String filename) { } // call this method-1 - public static void DestReads() throws Exception { + //public static void DestReads() throws Exception { + public static void DestReads(String fle1) throws Exception { /*String fname = ""; System.out.print("Enter the name of the FastaFile:"); fname = (new BufferedReader(new InputStreamReader(System.in))) @@ -120,12 +123,14 @@ public static void DestReads() throws Exception { // System.out.println(FnamePath); //FileReading(fname); - FnamePath = "chr2250781022_50785963.fasta"; + fle = fle1; + FnamePath = fle1; + //FnamePath = "chr10134169759_134174717.fasta"; FileReading(FnamePath); } public static void main(String[] args) throws Exception { - DestReads(); + DestReads(fle); ViewList(); } diff --git a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java index 825174e..ddd55f9 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java +++ b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java @@ -15,155 +15,176 @@ public class KmeanClustering { public static void Clustering() throws Exception{ - ArrayList descript = new ArrayList(); - ArrayList reads = new ArrayList(); - int Nreads; - String FileName; - int NumberElements; - int ClustElements; - int n; - int d[]; - int k[][]; - final int p =2;//number of clusters - int tempk[][]; - int m[]; - int Nclusters = 0; + FileInputStream file1 = new FileInputStream("TRfile.fasta"); + BufferedReader br = new BufferedReader(new InputStreamReader(file1)); + String line = null; + while((line = br.readLine())!=null){ + ArrayList descript = new ArrayList(); + ArrayList reads = new ArrayList(); + //ArrayList MaxReadLn = new ArrayList(); + int MaxReadLn = 0; + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); - double max = 0; - double temp1 = 0; - int index1=0; - int index2=0; - int count1=0,count2=0; - long startTime = System.nanoTime(); - GettingTreadsFromFasta.DestReads(); - descript = GettingTreadsFromFasta.GetRname(); - reads = GettingTreadsFromFasta.GetTReads(); - Nreads = GettingTreadsFromFasta.NumberReads(); - FileName = GettingTreadsFromFasta.GetFileName(); - NumberElements = GettingTreadsFromFasta.SeqLength(); - ClustElements = GettingTreadsFromFasta.NelementsClustering(); - - double[][] table = new double[Nreads][Nreads]; - - for (int i=0; i Date: Mon, 24 Apr 2017 23:26:17 +1000 Subject: [PATCH 4/7] adding duplicate clustering --- .../bio/hts/clustering/KmeanClustering1.java | 300 +++++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 src/dev/java/japsadev/bio/hts/clustering/KmeanClustering1.java diff --git a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering1.java b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering1.java new file mode 100644 index 0000000..6e39a5a --- /dev/null +++ b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering1.java @@ -0,0 +1,300 @@ +package japsadev.bio.hts.clustering; + +import java.util.ArrayList; +import java.io.*; + +import japsa.seq.SequenceOutputStream; +import japsadev.bio.hts.clustering.PairDistance; +import japsadev.bio.hts.clustering.GettingTreadsFromFasta; + +/** + * @author buvan.suji + * + */ + +public class KmeanClustering1 { + + public static void Clustering() throws Exception{ + FileInputStream file1 = new FileInputStream("TRfile1.fasta"); + BufferedReader br = new BufferedReader(new InputStreamReader(file1)); + String line = null; + while((line = br.readLine())!=null){ + ArrayList descript = new ArrayList(); + ArrayList reads = new ArrayList(); + //ArrayList MaxReadLn = new ArrayList(); + int MaxReadLn = 0; + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + + + GettingTreadsFromFasta.DestReads(line); + //GettingTreadsFromFasta.DestReads(); + descript = GettingTreadsFromFasta.GetRname(); + reads = GettingTreadsFromFasta.GetTReads(); + Nreads = GettingTreadsFromFasta.NumberReads(); + FileName = GettingTreadsFromFasta.GetFileName(); + NumberElements = GettingTreadsFromFasta.SeqLength(); + ClustElements = GettingTreadsFromFasta.NelementsClustering(); + + double[][] table = new double[Nreads][Nreads]; + + for (int i=0; im[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. + static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j Date: Wed, 26 Apr 2017 12:30:31 +1000 Subject: [PATCH 5/7] Delete .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index ae3c172..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/bin/ From 8d7c4655b6cc8e97166e6ffffc535863f3ee3afd Mon Sep 17 00:00:00 2001 From: hsnguyen Date: Wed, 26 Apr 2017 12:30:41 +1000 Subject: [PATCH 6/7] Delete .classpath --- .classpath | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 .classpath diff --git a/.classpath b/.classpath deleted file mode 100644 index 9e92c6b..0000000 --- a/.classpath +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - From 0348d2db7252f0d565d19c1885fa44aff852af94 Mon Sep 17 00:00:00 2001 From: hsnguyen Date: Wed, 26 Apr 2017 12:30:50 +1000 Subject: [PATCH 7/7] Delete .project --- .project | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .project diff --git a/.project b/.project deleted file mode 100644 index c012ce4..0000000 --- a/.project +++ /dev/null @@ -1,17 +0,0 @@ - - - japsa - - - - - - org.eclipse.jdt.core.javabuilder - - - - - - org.eclipse.jdt.core.javanature - -