Skip to content

Commit

Permalink
Several new filters in assembleContigs, cleaner output for noisy data…
Browse files Browse the repository at this point in the history
…. Minor bug fixes.
  • Loading branch information
dbolotin committed Apr 12, 2018
1 parent ebbb796 commit 6d903cb
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public final class FullSeqAssembler {
/** splitting region in global coordinates */
final Range splitRegion;
/** parameters */
FullSeqAssemblerParameters parameters;
final FullSeqAssemblerParameters parameters;
/** aligner parameters */
final VDJCAlignerParameters alignerParameters;
/** nucleotide sequence -> its integer index */
Expand All @@ -81,6 +81,10 @@ public FullSeqAssembler(CloneFactory cloneFactory,
|| alignerParameters.getJAlignerParameters().getScoring() instanceof AffineGapAlignmentScoring)
throw new IllegalArgumentException("Do not support Affine Gap Alignment Scoring.");

// Checking parameters
if (parameters.outputMinimalSumQuality > parameters.branchingMinimalSumQuality)
throw new IllegalArgumentException("Wrong parameters. (branchingMinimalSumQuality must be greater than outputMinimalSumQuality)");

this.cloneFactory = cloneFactory;
this.parameters = parameters;
this.clone = clone;
Expand Down Expand Up @@ -192,7 +196,7 @@ public Clone[] callVariants(RawVariantsData data) {
int[] variantInfos = port.take();
List<VariantBranch> newBranches = new ArrayList<>();
for (VariantBranch branch : branches) {
List<Variant> variants = callVariantsForPoint(variantInfos, branch.reads);
List<Variant> variants = callVariantsForPoint(variantInfos, branch.reads, data.points[i] == positionOfAssemblingFeature);
if (variants.size() == 1 && variants.get(0).variantInfo == ABSENT_PACKED_VARIANT_INFO)
newBranches.add(branch.addAbsentVariant());
else {
Expand Down Expand Up @@ -314,7 +318,10 @@ BranchSequences clean(BranchSequences seq) {
seq = seq.cut(i, trimRange);
} else {
Range trimRange = QualityTrimmer.trim(seq.sequences[i].getQuality(), parameters.trimmingParameters);
seq = seq.cut(i, trimRange);
if (trimRange == null)
seq.without(i);
else
seq = seq.cut(i, trimRange);
}
if (seq.sequences[i].size() < parameters.minimalContigLength && i != seq.assemblingFeatureTargetId)
seq = seq.without(i);
Expand Down Expand Up @@ -811,7 +818,7 @@ static <S extends Sequence<S>> Alignment<S> mergeTwoAlignments(Alignment<S> a1,
/**
* Call variants for a single position
*/
private List<Variant> callVariantsForPoint(int[] pointVariantInfos, BitSet targetReads) {
private List<Variant> callVariantsForPoint(int[] pointVariantInfos, BitSet targetReads, boolean isAssemblingFeature) {
// Pre-calculating number of present variants
int count = 0;
for (int readId = targetReads.nextSetBit(0); readId >= 0; readId = targetReads.nextSetBit(readId + 1))
Expand Down Expand Up @@ -858,9 +865,9 @@ private List<Variant> callVariantsForPoint(int[] pointVariantInfos, BitSet targe
if (currentIndex == count || currentVariant != (int) (targets[currentIndex] >>> 40)) {
// Checking significance conditions
if ((1.0 * nonEdgePoints / (currentIndex - blockBegin) >= parameters.minimalNonEdgePointsFraction)
&& ((variantSumQuality >= parameters.minimalSumQuality
&& variantSumQuality >= parameters.minimalQualityShare * totalSumQuality)
|| variantSumQuality >= parameters.decisiveSumQualityThreshold)) {
&& ((variantSumQuality >= parameters.branchingMinimalSumQuality
&& variantSumQuality >= parameters.branchingMinimalQualityShare * totalSumQuality)
|| variantSumQuality >= parameters.decisiveBranchingSumQualityThreshold)) {
// Variant is significant
BitSet reads = new BitSet();
for (int j = currentIndex - 1; j >= blockBegin; --j)
Expand All @@ -873,7 +880,7 @@ private List<Variant> callVariantsForPoint(int[] pointVariantInfos, BitSet targe
unassignedVariants.set((int) targets[j]);
if (variantSumQuality > bestVariantSumQuality) {
bestVariant = currentVariant;
// totalSumQuality is definitely less than Long because variantSumQuality < decisiveSumQualityThreshold
// totalSumQuality is definitely less than Long because variantSumQuality < decisiveBranchingSumQualityThreshold
bestVariantSumQuality = variantSumQuality;
}
}
Expand All @@ -895,18 +902,26 @@ private List<Variant> callVariantsForPoint(int[] pointVariantInfos, BitSet targe
} while (++currentIndex <= count);

if (variants.isEmpty()) {
// no significant variants
assert bestVariant != -1;
BitSet reads = new BitSet();
for (long target : targets)
reads.set((int) target);
reads.or(unassignedVariants);
double p = 1 - 1.0 * bestVariantSumQuality / totalSumQuality;
long phredQuality = p == 0 ? bestVariantSumQuality : Math.min((long) (-10 * Math.log10(p)), bestVariantSumQuality);
// nSignificant = 1 (will not be practically used, only one variant, don't care)
return Collections.singletonList(
new Variant(bestVariant << 8 | (int) Math.min((long) SequenceQuality.MAX_QUALITY_VALUE, phredQuality),
reads, 1));
// Checking best variant to meet output criteria
// Always assemble assembling feature
if (isAssemblingFeature ||
(bestVariantSumQuality >= parameters.outputMinimalQualityShare * totalSumQuality &&
bestVariantSumQuality >= parameters.outputMinimalSumQuality)) {
// no significant variants
assert bestVariant != -1;
BitSet reads = new BitSet();
for (long target : targets)
reads.set((int) target);
reads.or(unassignedVariants);
double p = 1 - 1.0 * bestVariantSumQuality / totalSumQuality;
long phredQuality = p == 0 ? bestVariantSumQuality : Math.min((long) (-10 * Math.log10(p)), bestVariantSumQuality);
// nSignificant = 1 (will not be practically used, only one variant, don't care)
return Collections.singletonList(
new Variant(bestVariant << 8 | (int) Math.min((long) SequenceQuality.MAX_QUALITY_VALUE, phredQuality),
reads, 1));
} else
// No variants to output (poorly covered or controversial position)
return Collections.singletonList(new Variant(ABSENT_PACKED_VARIANT_INFO, targetReads, 0));
} else {
for (Variant variant : variants)
variant.reads.or(unassignedVariants);
Expand Down Expand Up @@ -1100,6 +1115,19 @@ public String toString(byte qualityThreshold, int readsFrom, int readsTo) {
for (char[] line : result)
Arrays.fill(line, ' ');

char[] positionStrokes = new char[maxLength];
char[] positionValues = new char[maxLength + 10];
Arrays.fill(positionStrokes, ' ');
Arrays.fill(positionValues, ' ');
for (int i = 0; i < len.length; i++) {
positionStrokes[positionMap[i]] = '|';
if (i % 10 == 0) {
String val = "" + (i + minPosition);
for (int j = 0; j < val.length(); j++)
positionValues[positionMap[i] + j] = val.charAt(j);
}
}

port = createPort();
for (int position : points) {
int[] states = port.take();
Expand All @@ -1119,9 +1147,12 @@ public String toString(byte qualityThreshold, int readsFrom, int readsTo) {
}
assert port.take() == null;

return Arrays.stream(result)
.map(String::new)
.collect(Collectors.joining("\n"));

return new String(positionValues) + "\n" +
new String(positionStrokes) + "\n" +
Arrays.stream(result)
.map(String::new)
.collect(Collectors.joining("\n"));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@
public class FullSeqAssemblerParameters {
/**
* Minimal quality fraction (variant may be marked significant if {@code variantQuality > totalSumQuality *
* minimalQualityShare}
* branchingMinimalQualityShare}
*/
double minimalQualityShare;
double branchingMinimalQualityShare;
/**
* Minimal variant quality threshold (variant may be marked significant if {@code variantQuality >
* minimalSumQuality}
* branchingMinimalSumQuality}
*/
long minimalSumQuality;
long branchingMinimalSumQuality;
/**
* Variant quality that guaranties that variant will be marked significant (even if other criteria are not
* satisfied)
*/
long decisiveSumQualityThreshold;
long decisiveBranchingSumQualityThreshold;
/**
* Maximal number of not aligned nucleotides at the edge of sequence so that sequence is still considered aligned
* "to the end"
Expand All @@ -45,9 +45,17 @@ public class FullSeqAssemblerParameters {
*/
int alignmentEdgeRegionSize;
/**
* Minimal fraction of non edge points in variant that should be reached to consider variant as significant
* Minimal fraction of non edge points in variant that must be reached to consider the variant significant
*/
double minimalNonEdgePointsFraction;
/**
* Positions having quality share less then this value, will not be represented in the output
*/
double outputMinimalQualityShare;
/**
* Positions having sum quality less then this value, will not be represented in the output
*/
long outputMinimalSumQuality;
/**
* Region where variants are allowed
*/
Expand All @@ -67,50 +75,54 @@ public class FullSeqAssemblerParameters {

@JsonCreator
public FullSeqAssemblerParameters(
@JsonProperty("minimalQualityShare") double minimalQualityShare,
@JsonProperty("minimalSumQuality") long minimalSumQuality,
@JsonProperty("decisiveSumQualityThreshold") long decisiveSumQualityThreshold,
@JsonProperty("branchingMinimalQualityShare") double branchingMinimalQualityShare,
@JsonProperty("branchingMinimalSumQuality") long branchingMinimalSumQuality,
@JsonProperty("decisiveBranchingSumQualityThreshold") long decisiveBranchingSumQualityThreshold,
@JsonProperty("alignedSequenceEdgeDelta") int alignedSequenceEdgeDelta,
@JsonProperty("alignmentEdgeRegionSize") int alignmentEdgeRegionSize,
@JsonProperty("minimalNonEdgePointsFraction") double minimalNonEdgePointsFraction,
@JsonProperty("outputMinimalQualityShare") double outputMinimalQualityShare,
@JsonProperty("outputMinimalSumQuality") long outputMinimalSumQuality,
@JsonProperty("subCloningRegion") GeneFeature subCloningRegion,
@JsonProperty("trimmingParameters") QualityTrimmerParameters trimmingParameters,
@JsonProperty("minimalContigLength") int minimalContigLength,
@JsonProperty("alignedRegionsOnly") boolean alignedRegionsOnly) {
this.minimalQualityShare = minimalQualityShare;
this.minimalSumQuality = minimalSumQuality;
this.decisiveSumQualityThreshold = decisiveSumQualityThreshold;
this.branchingMinimalQualityShare = branchingMinimalQualityShare;
this.branchingMinimalSumQuality = branchingMinimalSumQuality;
this.decisiveBranchingSumQualityThreshold = decisiveBranchingSumQualityThreshold;
this.alignedSequenceEdgeDelta = alignedSequenceEdgeDelta;
this.alignmentEdgeRegionSize = alignmentEdgeRegionSize;
this.minimalNonEdgePointsFraction = minimalNonEdgePointsFraction;
this.outputMinimalQualityShare = outputMinimalQualityShare;
this.outputMinimalSumQuality = outputMinimalSumQuality;
this.subCloningRegion = subCloningRegion;
this.trimmingParameters = trimmingParameters;
this.minimalContigLength = minimalContigLength;
this.alignedRegionsOnly = alignedRegionsOnly;
}

public double getMinimalQualityShare() {
return minimalQualityShare;
public double getBranchingMinimalQualityShare() {
return branchingMinimalQualityShare;
}

public void setMinimalQualityShare(double minimalQualityShare) {
this.minimalQualityShare = minimalQualityShare;
public void setBranchingMinimalQualityShare(double branchingMinimalQualityShare) {
this.branchingMinimalQualityShare = branchingMinimalQualityShare;
}

public long getMinimalSumQuality() {
return minimalSumQuality;
public long getBranchingMinimalSumQuality() {
return branchingMinimalSumQuality;
}

public void setMinimalSumQuality(long minimalSumQuality) {
this.minimalSumQuality = minimalSumQuality;
public void setBranchingMinimalSumQuality(long branchingMinimalSumQuality) {
this.branchingMinimalSumQuality = branchingMinimalSumQuality;
}

public long getDecisiveSumQualityThreshold() {
return decisiveSumQualityThreshold;
public long getDecisiveBranchingSumQualityThreshold() {
return decisiveBranchingSumQualityThreshold;
}

public void setDecisiveSumQualityThreshold(long decisiveSumQualityThreshold) {
this.decisiveSumQualityThreshold = decisiveSumQualityThreshold;
public void setDecisiveBranchingSumQualityThreshold(long decisiveBranchingSumQualityThreshold) {
this.decisiveBranchingSumQualityThreshold = decisiveBranchingSumQualityThreshold;
}

public int getAlignedSequenceEdgeDelta() {
Expand All @@ -137,6 +149,22 @@ public void setMinimalNonEdgePointsFraction(double minimalNonEdgePointsFraction)
this.minimalNonEdgePointsFraction = minimalNonEdgePointsFraction;
}

public double getOutputMinimalQualityShare() {
return outputMinimalQualityShare;
}

public void setOutputMinimalQualityShare(double outputMinimalQualityShare) {
this.outputMinimalQualityShare = outputMinimalQualityShare;
}

public long getOutputMinimalSumQuality() {
return outputMinimalSumQuality;
}

public void setOutputMinimalSumQuality(long outputMinimalSumQuality) {
this.outputMinimalSumQuality = outputMinimalSumQuality;
}

public boolean isAlignedRegionsOnly() {
return alignedRegionsOnly;
}
Expand All @@ -163,8 +191,9 @@ public void setMinimalContigLength(int minimalContigLength) {

@Override
public FullSeqAssemblerParameters clone() {
return new FullSeqAssemblerParameters(minimalQualityShare, minimalSumQuality, decisiveSumQualityThreshold,
alignedSequenceEdgeDelta, alignmentEdgeRegionSize, minimalNonEdgePointsFraction, subCloningRegion,
return new FullSeqAssemblerParameters(branchingMinimalQualityShare, branchingMinimalSumQuality, decisiveBranchingSumQualityThreshold,
alignedSequenceEdgeDelta, alignmentEdgeRegionSize, minimalNonEdgePointsFraction,
outputMinimalQualityShare, outputMinimalSumQuality, subCloningRegion,
trimmingParameters, minimalContigLength, alignedRegionsOnly);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{
"default": {
"minimalQualityShare": 0.1,
"minimalSumQuality": 80,
"decisiveSumQualityThreshold": 120,
"branchingMinimalQualityShare": 0.1,
"branchingMinimalSumQuality": 80,
"decisiveBranchingSumQualityThreshold": 120,
"outputMinimalQualityShare": 0.5,
"outputMinimalSumQuality": 50,
"alignedSequenceEdgeDelta": 3,
"alignmentEdgeRegionSize": 7,
"minimalNonEdgePointsFraction": 0.25,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
public class FullSeqAssemblerTest {
static final FullSeqAssemblerParameters DEFAULT_PARAMETERS =
new FullSeqAssemblerParameters(0.1, 80, 120,
3, 7, 0.25, GeneFeature.VDJRegion,
3, 7, 0.25, 0.5, 50, GeneFeature.VDJRegion,
new QualityTrimmerParameters(20.0f, 8), 20, false);

static final class MasterSequence {
Expand Down Expand Up @@ -445,7 +445,7 @@ public void testLargeCloneNoMismatches() throws Exception {
align.usedGenes, align.parameters.alignerParameters.getFeaturesToAlignMap());
FullSeqAssembler agg = new FullSeqAssembler(cloneFactory, DEFAULT_PARAMETERS, initialClone, align.parameters.alignerParameters);
FullSeqAssembler.RawVariantsData prep = agg.calculateRawData(() -> CUtils.asOutputPort(alignments));
// System.out.println(prep);
System.out.println(prep);
List<Clone> clones = new ArrayList<>(new CloneSet(Arrays.asList(agg.callVariants(prep))).getClones());
clones.sort(Comparator.comparingDouble(Clone::getCount).reversed());
for (Clone clone : clones) {
Expand Down

0 comments on commit 6d903cb

Please sign in to comment.