Skip to content

Commit

Permalink
Switch to lognorm + beta binom Q score, update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mikessh committed Nov 10, 2016
1 parent c915d1c commit 17a26bf
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -184,30 +184,23 @@ private static double getBinomialQScore(int majorCount, int total, double errorR
return score;
}

private static double getNegativeBinomialQScore(int majorCount, int total, double errorRate) {
errorRate *= total;

// Estimate the parameters of distribution of real error rate (lambda) based on errorRate estimate

// Mean and variance of lambda from linear fitting
double meanA = 5.02, meanB = 0.35,
varA = 27.67, varB = 4.09;

double mean = meanA + meanB * errorRate, var = varA + varB * errorRate;

// Alpha and beta parameters of Gamma distribution
double beta = mean / var, alpha = mean * beta;

// Convert them to Negative Binomial distribution parameters
double p = beta / (1.0 + beta);
int r = (int) Math.round(alpha);
private static double getCompoundQScore(int majorCount, int total, double errorRate,
int countThreshold, double modelSD) {
if (majorCount <= countThreshold) {
double m = errorRate * Math.exp(modelSD * modelSD / 2),
v = errorRate * errorRate * Math.exp(modelSD * modelSD) * (Math.exp(modelSD * modelSD) - 1),
alpha = m * (m * (1 - m) / v - 1), beta = (1 - m) * (m * (1 - m) / v - 1);

if (alpha == 0 || beta == 0) {
return VcfUtil.UNDEF_QUAL;
}

if (r == 0) {
return VcfUtil.UNDEF_QUAL;
return -10 * Math.log10(1.0 - AuxiliaryStats.betaBinomialCdf(majorCount, total, alpha, beta) +
0.5 * AuxiliaryStats.betaBinomialCdf(majorCount, total, alpha, beta));
} else {
return -10 * Math.log10(1.0 - AuxiliaryStats.normalCdf(Math.log10(majorCount / (double) total),
Math.log10(errorRate), modelSD));
}

return -10 * Math.log10(1.0 - AuxiliaryStats.negativeBinomialCdf(majorCount, r, p) +
0.5 * AuxiliaryStats.negativeBinomialPdf(majorCount, r, p));
}

private static double getQScore(int majorCount, int total, double errorRate,
Expand All @@ -216,12 +209,14 @@ private static double getQScore(int majorCount, int total, double errorRate,
return 0;
}

if (Double.isNaN(errorRate)) {
if (Double.isNaN(errorRate) || errorRate == 0) {
return VcfUtil.UNDEF_QUAL;
}

double score = variantCallerParameters.getErrorModelType() == ErrorModelType.MinorBased ?
getNegativeBinomialQScore(majorCount, total, errorRate) :
getCompoundQScore(majorCount, total, errorRate,
variantCallerParameters.getCompoundQScoreThreshold(),
variantCallerParameters.getCompoundQScoreSD()) :
getBinomialQScore(majorCount, total, errorRate);

return Double.isInfinite(score) ? VcfUtil.MAX_QUAL : score;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import org.jdom.Element;

public final class VariantCallerParameters implements ParameterSet {
private final double modelOrder, modelCycles, modelEfficiency;
private final double modelOrder, modelCycles, modelEfficiency, compoundQScoreSD;
private final ErrorModelType errorModelType;
private final int qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
modelCoverageThreshold, modelMinorCountThreshold;
modelCoverageThreshold, modelMinorCountThreshold, compoundQScoreThreshold;
private final String substitutionErrorRateMatrix;
private final SubstitutionErrorMatrix parsedSubstitutionErrorRateMatrix;
private final boolean noIndels, showAbsentVariants;
Expand All @@ -34,13 +34,17 @@ public final class VariantCallerParameters implements ParameterSet {
20, 10000, 100,
ErrorModelType.MinorBased,
1.0, 20.0, 1.8, 100, 10,
SubstitutionErrorMatrix.DEFAULT.toString(), false);
SubstitutionErrorMatrix.DEFAULT.toString(),
5, 0.64,
false);

public VariantCallerParameters(boolean noIndels, int qualityThreshold, int singletonFrequencyThreshold, int coverageThreshold,
ErrorModelType errorModelType,
double modelOrder, double modelCycles, double modelEfficiency,
int modelCoverageThreshold, int modelMinorCountThreshold,
String substitutionErrorRateMatrix, boolean showAbsentVariants) {
String substitutionErrorRateMatrix,
int compoundQScoreThreshold, double compoundQScoreSD,
boolean showAbsentVariants) {
if (modelCycles < 10 || modelCycles > 40)
throw new IllegalArgumentException("(model parameters) Number of PCR cycles should be in [10,40]");

Expand All @@ -67,6 +71,10 @@ public VariantCallerParameters(boolean noIndels, int qualityThreshold, int singl
this.modelCycles = modelCycles;
this.modelEfficiency = modelEfficiency;
this.substitutionErrorRateMatrix = substitutionErrorRateMatrix;

this.compoundQScoreThreshold = compoundQScoreThreshold;
this.compoundQScoreSD = compoundQScoreSD;

this.parsedSubstitutionErrorRateMatrix = SubstitutionErrorMatrix.fromString(substitutionErrorRateMatrix);
this.showAbsentVariants = showAbsentVariants;
}
Expand Down Expand Up @@ -123,13 +131,22 @@ public boolean showAbsentVariants() {
return showAbsentVariants;
}

public double getCompoundQScoreSD() {
return compoundQScoreSD;
}

public int getCompoundQScoreThreshold() {
return compoundQScoreThreshold;
}

public VariantCallerParameters withNoIndels(boolean noIndels) {
return new VariantCallerParameters(noIndels,
qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withOrder(double order) {
Expand All @@ -138,7 +155,8 @@ public VariantCallerParameters withOrder(double order) {
errorModelType,
order, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withModelCycles(double modelCycles) {
Expand All @@ -147,7 +165,8 @@ public VariantCallerParameters withModelCycles(double modelCycles) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withModelEfficiency(double modelEfficiency) {
Expand All @@ -156,7 +175,8 @@ public VariantCallerParameters withModelEfficiency(double modelEfficiency) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withQualityThreshold(int qualityThreshold) {
Expand All @@ -165,7 +185,8 @@ public VariantCallerParameters withQualityThreshold(int qualityThreshold) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withSingletonFrequencyThreshold(int singletonFrequencyThreshold) {
Expand All @@ -174,7 +195,8 @@ public VariantCallerParameters withSingletonFrequencyThreshold(int singletonFreq
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withErrorModelType(ErrorModelType errorModelType) {
Expand All @@ -183,7 +205,8 @@ public VariantCallerParameters withErrorModelType(ErrorModelType errorModelType)
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withSubstitutionErrorRateMatrix(String substitutionErrorRateMatrix) {
Expand All @@ -192,7 +215,8 @@ public VariantCallerParameters withSubstitutionErrorRateMatrix(String substituti
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withCoverageThreshold(int coverageThreshold) {
Expand All @@ -201,7 +225,8 @@ public VariantCallerParameters withCoverageThreshold(int coverageThreshold) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withModelCoverageThreshold(int modelCoverageThreshold) {
Expand All @@ -210,7 +235,8 @@ public VariantCallerParameters withModelCoverageThreshold(int modelCoverageThres
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withModelMinorCountThreshold(int modelMinorCountThreshold) {
Expand All @@ -219,7 +245,8 @@ public VariantCallerParameters withModelMinorCountThreshold(int modelMinorCountT
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withModelOrder(int modelOrder) {
Expand All @@ -228,7 +255,28 @@ public VariantCallerParameters withModelOrder(int modelOrder) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAbsentVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withCompoundQScoreThreshold(int compoundQScoreThreshold) {
return new VariantCallerParameters(noIndels,
qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withCompoundQScoreSD(double compoundQScoreSD) {
return new VariantCallerParameters(noIndels,
qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withShowAllVariants(boolean showAllVariants) {
Expand All @@ -237,7 +285,8 @@ public VariantCallerParameters withShowAllVariants(boolean showAllVariants) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, showAllVariants);
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
showAbsentVariants);
}

@Override
Expand All @@ -254,6 +303,8 @@ public Element toXml() {
e.addContent(new Element("modelCoverageThreshold").setText(Integer.toString(modelCoverageThreshold)));
e.addContent(new Element("modelMinorCountThreshold").setText(Integer.toString(modelMinorCountThreshold)));
e.addContent(new Element("substitutionErrorRateMatrix").setText(substitutionErrorRateMatrix));
e.addContent(new Element("compoundQScoreThreshold").setText(Integer.toString(compoundQScoreThreshold)));
e.addContent(new Element("compoundQScoreSD").setText(Double.toString(compoundQScoreSD)));
e.addContent(new Element("showAbsentVariants").setText(Boolean.toString(showAbsentVariants)));
return e;
}
Expand All @@ -272,6 +323,8 @@ public static VariantCallerParameters fromXml(Element parent) {
Integer.parseInt(e.getChildTextTrim("modelCoverageThreshold")),
Integer.parseInt(e.getChildTextTrim("modelMinorCountThreshold")),
e.getChildTextTrim("substitutionErrorRateMatrix"),
Integer.parseInt(e.getChildTextTrim("compoundQScoreThreshold")),
Double.parseDouble(e.getChildTextTrim("compoundQScoreSD")),
Boolean.parseBoolean(e.getChildTextTrim("showAbsentVariants"))
);
}
Expand All @@ -286,11 +339,13 @@ public boolean equals(Object o) {
if (Double.compare(that.modelOrder, modelOrder) != 0) return false;
if (Double.compare(that.modelCycles, modelCycles) != 0) return false;
if (Double.compare(that.modelEfficiency, modelEfficiency) != 0) return false;
if (Double.compare(that.compoundQScoreSD, compoundQScoreSD) != 0) return false;
if (qualityThreshold != that.qualityThreshold) return false;
if (singletonFrequencyThreshold != that.singletonFrequencyThreshold) return false;
if (coverageThreshold != that.coverageThreshold) return false;
if (modelCoverageThreshold != that.modelCoverageThreshold) return false;
if (modelMinorCountThreshold != that.modelMinorCountThreshold) return false;
if (compoundQScoreThreshold != that.compoundQScoreThreshold) return false;
if (noIndels != that.noIndels) return false;
if (showAbsentVariants != that.showAbsentVariants) return false;
if (errorModelType != that.errorModelType) return false;
Expand All @@ -309,12 +364,15 @@ public int hashCode() {
result = 31 * result + (int) (temp ^ (temp >>> 32));
temp = Double.doubleToLongBits(modelEfficiency);
result = 31 * result + (int) (temp ^ (temp >>> 32));
temp = Double.doubleToLongBits(compoundQScoreSD);
result = 31 * result + (int) (temp ^ (temp >>> 32));
result = 31 * result + errorModelType.hashCode();
result = 31 * result + qualityThreshold;
result = 31 * result + singletonFrequencyThreshold;
result = 31 * result + coverageThreshold;
result = 31 * result + modelCoverageThreshold;
result = 31 * result + modelMinorCountThreshold;
result = 31 * result + compoundQScoreThreshold;
result = 31 * result + substitutionErrorRateMatrix.hashCode();
result = 31 * result + parsedSubstitutionErrorRateMatrix.hashCode();
result = 31 * result + (noIndels ? 1 : 0);
Expand Down
31 changes: 31 additions & 0 deletions src/main/java/com/antigenomics/mageri/misc/AuxiliaryStats.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,17 @@

package com.antigenomics.mageri.misc;

import org.apache.commons.math3.special.Beta;
import org.apache.commons.math3.special.Erf;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.FastMath;

import static org.apache.commons.math3.special.Beta.regularizedBeta;
import static org.apache.commons.math3.util.CombinatoricsUtils.binomialCoefficientLog;

public class AuxiliaryStats {
private static final double SQRT2 = FastMath.sqrt(2.0);

public static double negativeBinomialCdf(int k, int r, double p) {
return 1.0 - regularizedBeta(p, k + 1, r);
}
Expand All @@ -28,4 +35,28 @@ public static double negativeBinomialPdf(int k, int r, double p) {
return Math.exp(binomialCoefficientLog(k + r - 1, k) +
r * Math.log(1.0 - p) + k * Math.log(p));
}

public static double betaBinomialPdf(int k, int n, double alpha, double beta) {
return Math.exp(Beta.logBeta(k + alpha, n - k + beta) - Beta.logBeta(alpha, beta) +
Gamma.logGamma(n + 1) - Gamma.logGamma(k + 1) - Gamma.logGamma(n - k + 1)
);
}

public static double betaBinomialCdf(int k, int n, double alpha, double beta) {
double sum = 0;

for (int i = 0; i < k; i++) {
sum += betaBinomialPdf(k, n, alpha, beta);
}

return sum;
}

public static double normalCdf(double x, double mean, double sd) {
final double dev = x - mean;
if (FastMath.abs(dev) > 40 * sd) {
return dev < 0 ? 0.0d : 1.0d;
}
return 0.5 * Erf.erfc(-dev / (sd * SQRT2));
}
}

0 comments on commit 17a26bf

Please sign in to comment.