diff --git a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java index bc03da8..f175847 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java +++ b/src/dev/java/japsadev/bio/hts/clustering/GettingTreadsFromFasta.java @@ -1,164 +1,173 @@ -package japsadev.bio.hts.clustering; - -import java.io.*; -import java.util.*; - - - -/** - * @author buvan.suji - * - */ - -public class GettingTreadsFromFasta { - - private String[] description; - private String[] sequence; - - static ArrayList Rname = new ArrayList(); - static ArrayList TReads = new ArrayList(); - static String FnamePath; - static int sequenceLength; - - public GettingTreadsFromFasta(String filename) { - readSequenceFromFile(filename); - } - - @SuppressWarnings("resource") - void readSequenceFromFile(String file) { - - ArrayList desc = new ArrayList(); - ArrayList seq = new ArrayList(); - BufferedReader in = null; - - try { - in = new BufferedReader(new FileReader(file)); - StringBuffer buffer = new StringBuffer(); - String line = in.readLine(); - - if (line == null) - throw new IOException(file + " is an empty file"); - - if (line.charAt(0) != '>') - throw new IOException("First line of " + file - + " should start with '>'"); - else - desc.add(line); - - for (line = in.readLine().trim(); line != null; line = in - .readLine()) { - if (line.length() > 0 && line.charAt(0) == '>') { - seq.add(buffer.toString()); - buffer = new StringBuffer(); - desc.add(line); - } else - buffer.append(line.trim()); - } - if (buffer.length() != 0) - seq.add(buffer.toString()); - } catch (IOException e) { - System.out.println("Error when reading " + file); - e.printStackTrace(); - } - - description = new String[desc.size()]; - sequence = new String[seq.size()]; - for (int i = 0; i < seq.size(); i++) { - description[i] = (String) desc.get(i); - sequence[i] = (String) seq.get(i); - } - sequenceLength = seq.size(); - - } - - // return first sequence as a String - public String getSequence() { - return sequence[0]; - } - - // return first xdescription as String - public String getDescription() { - return description[0]; - } - - // return sequence as a String - public String getSequence(int i) { - return sequence[i]; - } - - // return description as String - public String getDescription(int i) { - return description[i]; - } - - public int size() { - return sequence.length; - } - - public static void FileReading(String filename) { - GettingTreadsFromFasta fsf = new GettingTreadsFromFasta( - filename); - StringBuffer buffer1 = new StringBuffer(); - - for (int i = 0; i < fsf.size(); i++) { - String temp = ((buffer1.append(fsf.getDescription(i))) - .deleteCharAt(0)).toString(); - Rname.add(temp); - TReads.add(fsf.getSequence(i)); - buffer1 = new StringBuffer(); - } - } - - // call this method-1 - public static void DestReads() throws Exception { - String fname = ""; - System.out.print("Enter the name of the FastaFile:"); - fname = (new BufferedReader(new InputStreamReader(System.in))) - .readLine(); - FnamePath = new File(fname).getName(); - // System.out.println(FnamePath); - FileReading(fname); - } - - public static void main(String[] args) throws Exception { - DestReads(); - ViewList(); - } - - // call this method-2 - public static ArrayList GetRname() { - return Rname; - } - - // call this method-3 - public static ArrayList GetTReads() { - return TReads; - } - - public static int NumberReads() { - return TReads.size(); - } - - // call this method-4 - public static String GetFileName() { - return FnamePath; - } - - public static int SeqLength() { - return sequenceLength; - } - - public static int NelementsClustering() { - return sequenceLength * (sequenceLength - 1) / 2; - } - - public static void ViewList() { - System.out.println(GetRname()); - // System.out.println(GetTReads()); - System.out.println(SeqLength()); - System.out.println(NelementsClustering()); - } - - - -} +package japsadev.bio.hts.clustering; + +import java.io.*; +import java.util.*; + + + + +/** + * @author buvan.suji + * + */ + +public class GettingTreadsFromFasta { + + private String[] description; + private String[] sequence; + + + static ArrayList Rname = new ArrayList(); + static ArrayList TReads = new ArrayList(); + static String FnamePath; + static int sequenceLength; + static String fle; + + public GettingTreadsFromFasta(String filename) { + readSequenceFromFile(filename); + } + + @SuppressWarnings("resource") + void readSequenceFromFile(String file) { + + ArrayList desc = new ArrayList(); + ArrayList seq = new ArrayList(); + BufferedReader in = null; + + try { + in = new BufferedReader(new FileReader(file)); + StringBuffer buffer = new StringBuffer(); + String line = in.readLine(); + + if (line == null) + throw new IOException(file + " is an empty file"); + + if (line.charAt(0) != '>') + throw new IOException("First line of " + file + + " should start with '>'"); + else + desc.add(line); + + for (line = in.readLine().trim(); line != null; line = in + .readLine()) { + if (line.length() > 0 && line.charAt(0) == '>') { + seq.add(buffer.toString()); + buffer = new StringBuffer(); + desc.add(line); + } else + buffer.append(line.trim()); + } + if (buffer.length() != 0) + seq.add(buffer.toString()); + } catch (IOException e) { + System.out.println("Error when reading " + file); + e.printStackTrace(); + } + + description = new String[desc.size()]; + sequence = new String[seq.size()]; + for (int i = 0; i < seq.size(); i++) { + description[i] = (String) desc.get(i); + sequence[i] = (String) seq.get(i); + } + sequenceLength = seq.size(); + + } + + // return first sequence as a String + public String getSequence() { + return sequence[0]; + } + + // return first xdescription as String + public String getDescription() { + return description[0]; + } + + // return sequence as a String + public String getSequence(int i) { + return sequence[i]; + } + + // return description as String + public String getDescription(int i) { + return description[i]; + } + + public int size() { + return sequence.length; + } + + public static void FileReading(String filename) { + GettingTreadsFromFasta fsf = new GettingTreadsFromFasta( + filename); + StringBuffer buffer1 = new StringBuffer(); + + for (int i = 0; i < fsf.size(); i++) { + String temp = ((buffer1.append(fsf.getDescription(i))) + .deleteCharAt(0)).toString(); + Rname.add(temp); + TReads.add(fsf.getSequence(i)); + buffer1 = new StringBuffer(); + } + } + + // call this method-1 + //public static void DestReads() throws Exception { + public static void DestReads(String fle1) throws Exception { + /*String fname = ""; + System.out.print("Enter the name of the FastaFile:"); + fname = (new BufferedReader(new InputStreamReader(System.in))) + .readLine(); + FnamePath = new File(fname).getName();*/ + + // System.out.println(FnamePath); + //FileReading(fname); + fle = fle1; + FnamePath = fle1; + //FnamePath = "chr10134169759_134174717.fasta"; + FileReading(FnamePath); + } + + public static void main(String[] args) throws Exception { + DestReads(fle); + ViewList(); + } + + // call this method-2 + public static ArrayList GetRname() { + return Rname; + } + + // call this method-3 + public static ArrayList GetTReads() { + return TReads; + } + + public static int NumberReads() { + return TReads.size(); + } + + // call this method-4 + public static String GetFileName() { + return FnamePath; + } + + public static int SeqLength() { + return sequenceLength; + } + + public static int NelementsClustering() { + return sequenceLength * (sequenceLength - 1) / 2; + } + + public static void ViewList() { + System.out.println(GetRname()); + // System.out.println(GetTReads()); + System.out.println(SeqLength()); + System.out.println(NelementsClustering()); + } + + + +} diff --git a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java index 573422b..ddd55f9 100644 --- a/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java +++ b/src/dev/java/japsadev/bio/hts/clustering/KmeanClustering.java @@ -1,265 +1,300 @@ -package japsadev.bio.hts.clustering; - -import java.util.ArrayList; - -/** - * @author buvan.suji - * - */ - -public class KmeanClustering { - - - /** - * @param args - */ - - static int count1,count2,count3; - static int d[]; - static int k[][]; - static int tempk[][]; - static int m[]; - static double diff[]; - static int n; - final static int p =2;//number of clusters - static double max = 0; - static double temp1 = 0; - static int index1; - static int index2; - static ArrayList t = new ArrayList() ; - - - static ArrayList descript = new ArrayList(); - static ArrayList reads = new ArrayList(); - static int Nreads; - static String FileName; - static int NumberElements; - static int ClustElements; - static ArrayList distList1 = new ArrayList(); - static ArrayList> distList2 = new ArrayList>(); - - - public static void Clustering() throws Exception{ - GettingTreadsFromFasta.DestReads(); - descript = GettingTreadsFromFasta.GetRname(); - reads = GettingTreadsFromFasta.GetTReads(); - Nreads = GettingTreadsFromFasta.NumberReads(); - FileName = GettingTreadsFromFasta.GetFileName(); - NumberElements = GettingTreadsFromFasta.SeqLength(); - ClustElements = GettingTreadsFromFasta.NelementsClustering(); - - double[][] table = new double[Nreads][Nreads]; - - for (int i=0; im[i]){ - diff[i] = list[m[i]][a]; - } - else{ - diff[i]=0; - } - } - - int val=0; - double temp=diff[0]; - for(int i=0;i s = new ArrayList(); - - for(int x1 = 0; x1t.get(x2)){ - sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); - } - else{ - sum = sum+0; - } - } - //System.out.println(sum); - - s.add(sum/t.size()); - } - //System.out.println(s.size()); - double min=s.get(0); - int d2 = 0; - for(int x3=1;x3s.get(x3)){ - min = s.get(x3); - d2 = x3; - } - } - - - m[i]=t.get(d2); - s.clear(); - t.clear(); - - } - } - -//This checks if previous k ie. tempk and current k are same.Used as terminating case. - static int VerifyEqual(){ - for(int i=0;i descript = new ArrayList(); + ArrayList reads = new ArrayList(); + //ArrayList MaxReadLn = new ArrayList(); + int MaxReadLn = 0; + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + + + GettingTreadsFromFasta.DestReads(line); + //GettingTreadsFromFasta.DestReads(); + descript = GettingTreadsFromFasta.GetRname(); + reads = GettingTreadsFromFasta.GetTReads(); + Nreads = GettingTreadsFromFasta.NumberReads(); + FileName = GettingTreadsFromFasta.GetFileName(); + NumberElements = GettingTreadsFromFasta.SeqLength(); + ClustElements = GettingTreadsFromFasta.NelementsClustering(); + + double[][] table = new double[Nreads][Nreads]; + + for (int i=0; im[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. + static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j descript = new ArrayList(); + ArrayList reads = new ArrayList(); + //ArrayList MaxReadLn = new ArrayList(); + int MaxReadLn = 0; + int Nreads; + String FileName; + int NumberElements; + int ClustElements; + int n; + int d[]; + int k[][]; + final int p =2;//number of clusters + int tempk[][]; + int m[]; + int Nclusters = 0; + + double max = 0; + double temp1 = 0; + int index1=0; + int index2=0; + int count1=0,count2=0; + long startTime = System.nanoTime(); + + + GettingTreadsFromFasta.DestReads(line); + //GettingTreadsFromFasta.DestReads(); + descript = GettingTreadsFromFasta.GetRname(); + reads = GettingTreadsFromFasta.GetTReads(); + Nreads = GettingTreadsFromFasta.NumberReads(); + FileName = GettingTreadsFromFasta.GetFileName(); + NumberElements = GettingTreadsFromFasta.SeqLength(); + ClustElements = GettingTreadsFromFasta.NelementsClustering(); + + double[][] table = new double[Nreads][Nreads]; + + for (int i=0; im[i]){ + diff[i] = list[m[i]][a]; + } + else{ + diff[i]=0; + } + } + + int val=0; + double temp=diff[0]; + for(int i=0;i<2;++i){ + if(diff[i] t = new ArrayList() ; + + for(int i=0;i<2;++i){ + m[i]=0; // initializing means to 0 + } + + //int cnt=0; + for(int i=0;i<2;++i){ + + for(int j=0;j s = new ArrayList(); + + for(int x1 = 0; x1t.get(x2)){ + sum = sum+(list[t.get(x2)][t.get(x1)]*list[t.get(x2)][t.get(x1)]); + } + else{ + sum = sum+0; + } + } + //System.out.println(sum); + s.add(sum/t.size()); + } + //System.out.println(s.size()); + double min=s.get(0); + int d2 = 0; + for(int x3=1;x3s.get(x3)){ + min = s.get(x3); + d2 = x3; + } + } + + + m[i]=t.get(d2); + s.clear(); + t.clear(); + + } + } + +//This checks if previous k ie. tempk and current k are same.Used as terminating case. + static int VerifyEqual(int n, int k[][], int tempk[][]){ + for(int i=0;i<2;++i){ + for(int j=0;j parentMap = new HashMap<>(); - - for (int i = 1; i <= m; ++i) { - d[i][0] = i; - } - - for (int j = 1; j <= n; ++j) { - d[0][j] = j; - } - - //System.out.println("initial="+d[0][0]); - - - for (int j = 1; j <= n; ++j) { - for (int i = 1; i <= m; ++i) { - /*if(s.charAt(j-1)==z.charAt(i-1)){ - final int delta = 0; - } - else{ - if(s.charAt(j-1)== z.charAt(i-1)) - }*/ - final int delta = (s.charAt(j - 1) == z.charAt(i - 1)) ? 0 : 1; - - int tentativeDistance = d[i - 1][j] + 2;//gap penalty - EditOperation editOperation = EditOperation.INSERT; - - if (tentativeDistance > d[i][j - 1] + 2) { - tentativeDistance = d[i][j - 1] + 2; - editOperation = EditOperation.DELETE; - } - - if (tentativeDistance > d[i - 1][j - 1] + delta) { - tentativeDistance = d[i - 1][j - 1] + delta; - editOperation = EditOperation.SUBSTITUTE; - } - - d[i][j] = tentativeDistance; - //System.out.println("inloop d("+i+", "+j+")="+d[i][j]); - - switch (editOperation) { - case SUBSTITUTE: - parentMap.put(new Point(i, j), new Point(i - 1, j - 1)); - break; - - case INSERT: - parentMap.put(new Point(i, j), new Point(i - 1, j)); - break; - - case DELETE: - parentMap.put(new Point(i, j), new Point(i, j - 1)); - break; - default: - break; - } - } - } - - final StringBuilder topLineBuilder = new StringBuilder(n + m); - final StringBuilder bottomLineBuilder = new StringBuilder(n + m); - final StringBuilder editSequenceBuilder = new StringBuilder(n + m); - Point current = new Point(m, n); - - while (true) { - Point predecessor = parentMap.get(current); - - if (predecessor == null) { - break; - } - - if (current.x != predecessor.x && current.y != predecessor.y) { - final char schar = s.charAt(predecessor.y); - final char zchar = z.charAt(predecessor.x); - - topLineBuilder.append(schar); - bottomLineBuilder.append(zchar); - editSequenceBuilder.append(schar != zchar ? - EditOperation.SUBSTITUTE : - EditOperation.MATCH); - } else if (current.x != predecessor.x) { - topLineBuilder.append(GAP); - bottomLineBuilder.append(z.charAt(predecessor.x)); - editSequenceBuilder.append(EditOperation.INSERT); - } else { - topLineBuilder.append(s.charAt(predecessor.y)); - bottomLineBuilder.append(GAP); - editSequenceBuilder.append(EditOperation.DELETE); - } - - current = predecessor; - } - - // Remove the last characters that correspond to the very beginning - // of the alignments and edit sequence (since the path reconstruction - // proceeds from the "end" to the "beginning" of the distance matrix. - topLineBuilder .deleteCharAt(topLineBuilder.length() - 1); - bottomLineBuilder .deleteCharAt(bottomLineBuilder.length() - 1); - editSequenceBuilder.deleteCharAt(editSequenceBuilder.length() - 1); - - // Our result data is backwards, reverse them. - topLineBuilder .reverse(); - bottomLineBuilder .reverse(); - editSequenceBuilder.reverse(); - - return new EditDistanceResult(d[m][n], - editSequenceBuilder.toString(), - topLineBuilder.toString(), - bottomLineBuilder.toString()); - } - - - - public static void main(String[] args) throws IOException { - EditDistanceResult result = compute("ACTAGGTTA", "TAAGGCTCAAT"); - System.out.println("Distance: " + result.getDistance()); - System.out.println("Edit sequence: " + result.getEditSequence()); - System.out.println("Alignment:"); - System.out.println(result.getTopAlignmentRow()); - System.out.println(result.getBottomAlignmentRow()); - - //S1 = "ATTCGGCATCA" - //S2 = "ACTAGGTTA" - //S3 = "GGTAAGGATTGCC" - //S4 = "CTAGGATCCAG" - //S5 = "TAAGGCTCAAT" - //S6 = "CTACTGCAAG" - //S7 = "TTTAGGAGCTCA" - //S8 = "ACCTGACCTTG" - //S9 = "GGTTCAGTTCC" - } - - - - -} +package japsadev.bio.hts.clustering; + +import java.awt.Point; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + + +/** + * @author buvan.suji + * + */ + +public class PairDistance { + + public static final String GAP = "-"; + + public static final class EditDistanceResult { + private final int distance; + private final String editSequence; + private final String topAlignmentRow; + private final String bottomAlignmentRow; + + EditDistanceResult(final int distance, + final String editSequence, + final String topAlignmentRow, + final String bottomAlignmentRow) { + this.distance = distance; + this.editSequence = editSequence; + this.topAlignmentRow = topAlignmentRow; + this.bottomAlignmentRow = bottomAlignmentRow; + } + + public int getDistance() { + return distance; + } + + public String getEditSequence() { + return editSequence; + } + + public String getTopAlignmentRow() { + return topAlignmentRow; + } + + public String getBottomAlignmentRow() { + return bottomAlignmentRow; + } + } + + private static enum EditOperation { + INSERT ("I"), + SUBSTITUTE ("S"), + DELETE ("D"), + MATCH ("M"); + + private final String s; + + private EditOperation(String s) { + this.s = s; + } + + @Override + public String toString() { + return s; + } + } + + public static EditDistanceResult compute(String s, String z) { + // This is required to keep the parent map invariant. + // Otherwise the very first edit operation would not end up in the output. + s = "\u0000" + s; + z = "\u0000" + z; + + final int n = s.length(); + final int m = z.length(); + final int[][] d = new int[m + 1][n + 1]; + final Map parentMap = new HashMap<>(); + + for (int i = 1; i <= m; ++i) { + d[i][0] = i; + } + + for (int j = 1; j <= n; ++j) { + d[0][j] = j; + } + + + + + for (int j = 1; j <= n; ++j) { + for (int i = 1; i <= m; ++i) { + final int delta = (s.charAt(j - 1) == z.charAt(i - 1)) ? 0 : 1; + + int tentativeDistance = d[i - 1][j] + 2;//gap penalty + EditOperation editOperation = EditOperation.INSERT; + + if (tentativeDistance > d[i][j - 1] + 2) { + tentativeDistance = d[i][j - 1] + 2; + editOperation = EditOperation.DELETE; + } + + if (tentativeDistance > d[i - 1][j - 1] + delta) { + tentativeDistance = d[i - 1][j - 1] + delta; + editOperation = EditOperation.SUBSTITUTE; + } + + d[i][j] = tentativeDistance; + + + switch (editOperation) { + case SUBSTITUTE: + parentMap.put(new Point(i, j), new Point(i - 1, j - 1)); + break; + + case INSERT: + parentMap.put(new Point(i, j), new Point(i - 1, j)); + break; + + case DELETE: + parentMap.put(new Point(i, j), new Point(i, j - 1)); + break; + default: + break; + } + } + } + + final StringBuilder topLineBuilder = new StringBuilder(n + m); + final StringBuilder bottomLineBuilder = new StringBuilder(n + m); + final StringBuilder editSequenceBuilder = new StringBuilder(n + m); + Point current = new Point(m, n); + + while (true) { + Point predecessor = parentMap.get(current); + + if (predecessor == null) { + break; + } + + if (current.x != predecessor.x && current.y != predecessor.y) { + final char schar = s.charAt(predecessor.y); + final char zchar = z.charAt(predecessor.x); + + topLineBuilder.append(schar); + bottomLineBuilder.append(zchar); + editSequenceBuilder.append(schar != zchar ? + EditOperation.SUBSTITUTE : + EditOperation.MATCH); + } else if (current.x != predecessor.x) { + topLineBuilder.append(GAP); + bottomLineBuilder.append(z.charAt(predecessor.x)); + editSequenceBuilder.append(EditOperation.INSERT); + } else { + topLineBuilder.append(s.charAt(predecessor.y)); + bottomLineBuilder.append(GAP); + editSequenceBuilder.append(EditOperation.DELETE); + } + + current = predecessor; + } + + // Remove the last characters that correspond to the very beginning + // of the alignments and edit sequence (since the path reconstruction + // proceeds from the "end" to the "beginning" of the distance matrix. + topLineBuilder .deleteCharAt(topLineBuilder.length() - 1); + bottomLineBuilder .deleteCharAt(bottomLineBuilder.length() - 1); + editSequenceBuilder.deleteCharAt(editSequenceBuilder.length() - 1); + + // Our result data is backwards, reverse them. + topLineBuilder .reverse(); + bottomLineBuilder .reverse(); + editSequenceBuilder.reverse(); + + return new EditDistanceResult(d[m][n], + editSequenceBuilder.toString(), + topLineBuilder.toString(), + bottomLineBuilder.toString()); + } + + + + public static void main(String[] args) throws IOException { + EditDistanceResult result = compute("ACTAGGTTA", "TAAGGCTCAAT"); + System.out.println("Distance: " + result.getDistance()); + System.out.println("Edit sequence: " + result.getEditSequence()); + System.out.println("Alignment:"); + System.out.println(result.getTopAlignmentRow()); + System.out.println(result.getBottomAlignmentRow()); + + + } + + + + +} diff --git a/src/dev/java/japsadev/tools/VNTRClusteringCmd.java b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java index 01d77f5..4063864 100644 --- a/src/dev/java/japsadev/tools/VNTRClusteringCmd.java +++ b/src/dev/java/japsadev/tools/VNTRClusteringCmd.java @@ -1,5 +1,5 @@ /***************************************************************************** - * Copyright (c) Bhuvana, email + * Copyright (c) Bhuvaneswari Thirugnanasambandham, buvan.suji@gmail.com * All rights reserved. * * * * Redistribution and use in source and binary forms, with or without * @@ -36,14 +36,15 @@ + import japsa.util.CommandLine; import japsa.util.deploy.Deployable; +import japsadev.bio.hts.clustering.KmeanClustering; -import java.io.IOException; /** - * @author your name + * @author Bhuvaneswari Thirugnanasambandham * */ @Deployable( @@ -53,6 +54,8 @@ public class VNTRClusteringCmd extends CommandLine{ //CommandLine cmdLine; public VNTRClusteringCmd(){ + + super(); Deployable annotation = getClass().getAnnotation(Deployable.class); setUsage(annotation.scriptName() + " [options]"); @@ -62,18 +65,32 @@ public VNTRClusteringCmd(){ addString("output", "-", "Output file"); addStdHelp(); + + + + } - public static void main(String [] args) throws IOException{ + public static void main(String [] args) throws Exception{ CommandLine cmdLine = new VNTRClusteringCmd(); args = cmdLine.stdParseLine(args); + /*String[] s = args; + System.out.println(s);*/ + + /**********************************************************************/ String input = cmdLine.getStringVal("input"); - String output = cmdLine.getStringVal("output"); - ////YOUR CODE GOES HERE - System.out.println("Hello world input is " + input); + String output= cmdLine.getStringVal("output"); + ////YOUR CODE GOES HERE + /*System.out.println("Hello world input is " + input); + System.out.println("Testing this statement");*/ //////////// + + KmeanClustering cluster = new KmeanClustering(); + cluster.Clustering(); + + } } diff --git a/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java index 19a395f..dc46298 100644 --- a/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java +++ b/src/main/java/japsa/bio/BuildSequenceGroupDatabase.java @@ -128,7 +128,8 @@ public void cleanUp() throws IOException, InterruptedException{ * @throws IOException * @throws InterruptedException */ - public HashMap addGeneMap(final Map seqs, boolean checkGeneID) throws IOException, InterruptedException{ + public HashMap addGeneMap(final Map seqs, boolean checkGeneID) + throws IOException, InterruptedException{ HashMap strMap = new HashMap (); //0: initialise grouping within the new sequences @@ -243,6 +244,7 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info("Iteration " + iteration + " of " + countI + " sequences " + new Date()); //JapsaTimer.systemInfo(); + //TODO: 1. think of a way to perform alignment `in memory' instead of write to bwa (may be use jalign) Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA2.sh"); process.waitFor(); @@ -289,25 +291,27 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info(" 2 Grouping " + geneDatabase.size()); //JapsaTimer.systemInfo(); - //2.Try to add to the existing groups + //2.Try to add to the existing groups. Align the rep from each group to each of the family rep + //if match, add the whole group to the family int G = 0, GG = 0; if (geneDatabase.size() > 0){ - geneDatabase.write2File(fFile, false); + //geneDatabase.write2File(fFile, false); //creat a map of current reps - SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fFile); HashMap repMap = new HashMap(); for (GeneDatabase.GeneFamily family:geneDatabase){ Sequence rep = family.represetationSequence(); repMap.put(rep.getName(),rep); + rep.writeFasta(sos); } sos.close(); //Run bwa //LOG.info("Running bwa for " + seq.getName() + " " + geneFamilies.size() + " family"); - Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); + //TODO 2: Read this directly from stdout of bwa + Process process = Runtime.getRuntime().exec("bash " + prefix + "runBWA.sh"); process.waitFor(); //Read the sam file @@ -338,14 +342,14 @@ public void cleanUp() throws IOException, InterruptedException{ continue; } - Sequence refSeq = family.represetationSequence(); + Sequence refSeq = repMap.get(refName); if (refSeq == null){ LOG.error("ERROR 6: rep for family " + refName + " not found!"); } - + if (isSimilar(refSeq, readSeq, sam)){ + //asign all reads in this set to the family ArrayList readSet = setMap.get(readName); - for (String key:readSet){ //if (strMap.containsKey(key)){ // LOG.error("ERROR 1 : " + key); @@ -367,6 +371,7 @@ public void cleanUp() throws IOException, InterruptedException{ LOG.info(" 3 Grouping " + geneDatabase.size() + " " + seqs.size()); //JapsaTimer.systemInfo(); + //IF there are any groups left, add each of them as a family for (Sequence seq:seqs.values()){ String seqName = seq.getName(); ArrayList tSet = setMap.get(seqName); @@ -393,7 +398,6 @@ public void cleanUp() throws IOException, InterruptedException{ } tSet.clear(); } - LOG.info("Manage to add " + G + " and " + GG + " " + new Date()); return strMap; } diff --git a/src/main/java/japsa/bio/gene/GeneDatabase.java b/src/main/java/japsa/bio/gene/GeneDatabase.java index b7edd1e..679f5b6 100644 --- a/src/main/java/japsa/bio/gene/GeneDatabase.java +++ b/src/main/java/japsa/bio/gene/GeneDatabase.java @@ -38,25 +38,27 @@ import japsa.seq.SequenceReader; import japsa.seq.Alphabet.DNA; import japsa.seq.SequenceOutputStream; -import japsa.util.Logging; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -public class GeneDatabase implements Iterable{ - ArrayList geneFamilies; +public class GeneDatabase extends ArrayList{ + private static final Logger LOG = LoggerFactory.getLogger(GeneDatabase.class); String dbID = "JSA"; public GeneDatabase(){ - geneFamilies = new ArrayList(); + super(); } public String addNewFamily(Sequence seq){ GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(size()); String geneID = newFam.addSequence(seq); - geneFamilies.add(newFam); + add(newFam); return geneID; } @@ -68,7 +70,7 @@ public String addNewFamily(Sequence seq){ */ public void write2File(String fileName, boolean includeAlleles) throws IOException{ SequenceOutputStream sos = SequenceOutputStream.makeOutputStream(fileName); - for (GeneDatabase.GeneFamily family:geneFamilies){ + for (GeneDatabase.GeneFamily family:this){ Sequence rep = family.represetationSequence(); rep.writeFasta(sos); if (includeAlleles){ @@ -98,13 +100,13 @@ public static GeneDatabase readDB(String fileName) throws IOException{ if (toks.length == 2){ //rep GeneDatabase.GeneFamily newFam = new GeneDatabase.GeneFamily(db.size()); - db.geneFamilies.add(newFam); + db.add(newFam); }else if (toks.length == 3){ //a gene GeneDatabase.GeneFamily fam = db.getFamily(Integer.parseInt(toks[1])); - fam.geneAlleles.add(seq); + fam.add(seq); }else{ - Logging.error("Unknown sequence " + seq.getName()); + LOG.error("Unknown sequence " + seq.getName()); } } reader.close(); @@ -131,7 +133,7 @@ public static GeneDatabase readDB(String fileName) throws IOException{ //doing nothing } - return geneFamilies.get(fID); + return get(fID); } /** @@ -141,44 +143,25 @@ public static GeneDatabase readDB(String fileName) throws IOException{ */ public GeneDatabase.GeneFamily getFamily(int famID){ - if (famID >= geneFamilies.size() || famID < 0){ + //TODO: Check if really need to validate famID + if (famID >= size() || famID < 0){ return null; } - return geneFamilies.get(famID); + return get(famID); } - - - /** - * Return the number of families in the database - * @return - */ - public int size(){ - return geneFamilies.size(); - } - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneFamilies.iterator(); - - } ///////////////////////////////////////////////////////////////////////// - public static class GeneFamily implements Iterable{ + public static class GeneFamily extends ArrayList{ private final int fID; - private ArrayList geneAlleles;//known instance of this family - //Sequence rep = null;//The representation of this gene family - + int repIndex = -1; String desc = ""; public GeneFamily(int id){ + super(); fID = id; - geneAlleles = new ArrayList(); } /** @@ -202,14 +185,14 @@ public String familyID(){ } public Sequence represetationSequence(){ - Sequence rep = geneAlleles.get(repIndex).clone(); - rep.setDesc(desc + ";index=" +repIndex + ";size=" + geneAlleles.size()); + Sequence rep = get(repIndex).clone(); + rep.setDesc(desc + ";index=" +repIndex + ";size=" + size()); rep.setName(familyID()); return rep; } private void updateRep(int newIndex){ - if (repIndex < 0 || geneAlleles.get(newIndex).length() > geneAlleles.get(repIndex).length()){ + if (repIndex < 0 || get(newIndex).length() > get(repIndex).length()){ repIndex = newIndex; } } @@ -222,8 +205,9 @@ private void updateRep(int newIndex){ */ public String addSequence(Sequence seq){ //This allele already in the database - for (int i = 0; i < geneAlleles.size(); i++){ - Sequence eSeq = geneAlleles.get(i); + //TODO: Need to see if this feature is really needed + for (int i = 0; i < size(); i++){ + Sequence eSeq = get(i); if (eSeq.match(seq) == 0) return eSeq.getName(); if (eSeq.match(DNA.complement(seq)) == 0) @@ -232,19 +216,10 @@ public String addSequence(Sequence seq){ Sequence nSeq = seq.clone(); nSeq.setDesc(nSeq.getName() + " " + nSeq.getDesc()); - nSeq.setName(familyID() + "_" + (geneAlleles.size())); - geneAlleles.add(nSeq); - updateRep(geneAlleles.size() - 1); + nSeq.setName(familyID() + "_" + (size())); + add(nSeq); + updateRep(size() - 1); return nSeq.getName(); } - - /* (non-Javadoc) - * @see java.lang.Iterable#iterator() - */ - @Override - public Iterator iterator() { - return geneAlleles.iterator(); - } } - } \ No newline at end of file