Skip to content

Commit

Permalink
WIP on error model. Use overall minor rates instead of reference-base…
Browse files Browse the repository at this point in the history
…d rates for global estimates
  • Loading branch information
mikessh committed Nov 15, 2016
1 parent 77c50bf commit 5839a19
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ public double getGlobalMinorRate(int from, int to) {

public double getGlobalMinorRate(int from, int to, boolean symmetric) {
return symmetric ?
0.5 * (getM1(from, to) / getM(from, to) + getM1(~from & 3, ~to & 3) / getM(~from & 3, ~to & 3)) :
(getM1(from, to) + getM1(~from & 3, ~to & 3)) / (getM(from, to) + getM(~from & 3, ~to & 3)) :
getM1(from, to) / getM(from, to);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,16 @@
import com.antigenomics.mageri.core.variant.filter.QualFilter;
import com.antigenomics.mageri.core.variant.filter.SingletonFilter;
import com.antigenomics.mageri.core.variant.filter.VariantFilter;
import com.antigenomics.mageri.core.variant.model.ErrorModel;
import com.antigenomics.mageri.core.variant.model.ErrorModelProvider;
import com.antigenomics.mageri.core.variant.model.ErrorModelType;
import com.antigenomics.mageri.core.variant.model.ErrorRateEstimate;
import com.antigenomics.mageri.core.variant.model.*;
import com.antigenomics.mageri.misc.AuxiliaryStats;
import com.milaboratory.core.sequence.mutations.Mutations;
import com.milaboratory.core.sequence.nucleotide.NucleotideSequence;
import com.milaboratory.core.sequence.nucleotide.NucleotideSequenceBuilder;
import com.antigenomics.mageri.core.genomic.ReferenceLibrary;
import com.antigenomics.mageri.core.mutations.Mutation;
import com.antigenomics.mageri.core.variant.filter.CoverageFilter;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.BinomialDistribution;
import org.apache.commons.math.distribution.BinomialDistributionImpl;
Expand Down Expand Up @@ -168,7 +167,7 @@ public VariantCaller(ConsensusAligner consensusAligner, MinorCaller minorCaller,
Collections.sort(variants);
}

private static double getBinomialQScore(int majorCount, int total, double errorRate) {
static double getBinomialQScore(int majorCount, int total, double errorRate) {
BinomialDistribution binomialDistribution = new BinomialDistributionImpl(total,
errorRate);

Expand All @@ -184,27 +183,27 @@ private static double getBinomialQScore(int majorCount, int total, double errorR
return score;
}

private static double getCompoundQScore(int majorCount, int total, double errorRate,
int countThreshold, double modelSD) {
if (majorCount <= countThreshold) {
double m = errorRate * Math.exp(modelSD * modelSD / 2),
v = errorRate * errorRate * Math.exp(modelSD * modelSD) * (Math.exp(modelSD * modelSD) - 1),
alpha = m * (m * (1 - m) / v - 1), beta = (1 - m) * (m * (1 - m) / v - 1);
static double getNegBinomialQScore(int majorCount, int total, double errorRate,
VariantCallerParameters variantCallerParameters) {
return getNegBinomialQScore(majorCount, total, errorRate,
variantCallerParameters.getCompoundQScoreMu(),
variantCallerParameters.getCompoundQScoreSD());
}

if (alpha == 0 || beta == 0) {
return VcfUtil.UNDEF_QUAL;
}
static double getNegBinomialQScore(int majorCount, int total, double errorRate,
double modelMu, double modelSD) {
double r = 1.0 / (Math.exp(modelSD * modelSD) - 1),
p = 1 / (1 + r / Math.exp(modelMu + modelSD * modelSD / 2) / errorRate / total);

return -10 * Math.log10(1.0 - AuxiliaryStats.betaBinomialCdf(majorCount, total, alpha, beta) +
0.5 * AuxiliaryStats.betaBinomialCdf(majorCount, total, alpha, beta));
} else {
return -10 * Math.log10(1.0 - AuxiliaryStats.normalCdf(Math.log10(majorCount / (double) total),
Math.log10(errorRate), modelSD));
if (r <= 0 || p >= 1 || p <= 0) {
return VcfUtil.UNDEF_QUAL;
}

return -10 * Math.log10(1.0 - AuxiliaryStats.negativeBinomialCdf(majorCount, r, p));
}

private static double getQScore(int majorCount, int total, double errorRate,
VariantCallerParameters variantCallerParameters) {
static double getQScore(int majorCount, int total, double errorRate,
VariantCallerParameters variantCallerParameters) {
if (majorCount == 0) {
return 0;
}
Expand All @@ -214,9 +213,7 @@ private static double getQScore(int majorCount, int total, double errorRate,
}

double score = variantCallerParameters.getErrorModelType() == ErrorModelType.MinorBased ?
getCompoundQScore(majorCount, total, errorRate,
variantCallerParameters.getCompoundQScoreThreshold(),
variantCallerParameters.getCompoundQScoreSD()) :
getNegBinomialQScore(majorCount, total, errorRate, variantCallerParameters) :
getBinomialQScore(majorCount, total, errorRate);

return Double.isInfinite(score) ? VcfUtil.MAX_QUAL : score;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import org.jdom.Element;

public final class VariantCallerParameters implements ParameterSet {
private final double modelOrder, modelCycles, modelEfficiency, compoundQScoreSD;
private final double modelOrder, modelCycles, modelEfficiency, compoundQScoreSD, compoundQScoreMu;
private final ErrorModelType errorModelType;
private final int qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
modelCoverageThreshold, modelMinorCountThreshold, compoundQScoreThreshold;
modelCoverageThreshold, modelMinorCountThreshold;
private final String substitutionErrorRateMatrix;
private final SubstitutionErrorMatrix parsedSubstitutionErrorRateMatrix;
private final boolean noIndels, showAbsentVariants;
Expand All @@ -35,15 +35,15 @@ public final class VariantCallerParameters implements ParameterSet {
ErrorModelType.MinorBased,
1.0, 20.0, 1.8, 100, 10,
SubstitutionErrorMatrix.DEFAULT.toString(),
5, 0.64,
-0.46, 1.44,
false);

public VariantCallerParameters(boolean noIndels, int qualityThreshold, int singletonFrequencyThreshold, int coverageThreshold,
ErrorModelType errorModelType,
double modelOrder, double modelCycles, double modelEfficiency,
int modelCoverageThreshold, int modelMinorCountThreshold,
String substitutionErrorRateMatrix,
int compoundQScoreThreshold, double compoundQScoreSD,
double compoundQScoreMu, double compoundQScoreSD,
boolean showAbsentVariants) {
if (modelCycles < 10 || modelCycles > 40)
throw new IllegalArgumentException("(model parameters) Number of PCR cycles should be in [10,40]");
Expand Down Expand Up @@ -72,7 +72,7 @@ public VariantCallerParameters(boolean noIndels, int qualityThreshold, int singl
this.modelEfficiency = modelEfficiency;
this.substitutionErrorRateMatrix = substitutionErrorRateMatrix;

this.compoundQScoreThreshold = compoundQScoreThreshold;
this.compoundQScoreMu = compoundQScoreMu;
this.compoundQScoreSD = compoundQScoreSD;

this.parsedSubstitutionErrorRateMatrix = SubstitutionErrorMatrix.fromString(substitutionErrorRateMatrix);
Expand Down Expand Up @@ -135,8 +135,8 @@ public double getCompoundQScoreSD() {
return compoundQScoreSD;
}

public int getCompoundQScoreThreshold() {
return compoundQScoreThreshold;
public double getCompoundQScoreMu() {
return compoundQScoreMu;
}

public VariantCallerParameters withNoIndels(boolean noIndels) {
Expand All @@ -145,7 +145,7 @@ public VariantCallerParameters withNoIndels(boolean noIndels) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -155,7 +155,7 @@ public VariantCallerParameters withOrder(double order) {
errorModelType,
order, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -165,7 +165,7 @@ public VariantCallerParameters withModelCycles(double modelCycles) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -175,7 +175,7 @@ public VariantCallerParameters withModelEfficiency(double modelEfficiency) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -185,7 +185,7 @@ public VariantCallerParameters withQualityThreshold(int qualityThreshold) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -195,7 +195,7 @@ public VariantCallerParameters withSingletonFrequencyThreshold(int singletonFreq
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -205,7 +205,7 @@ public VariantCallerParameters withErrorModelType(ErrorModelType errorModelType)
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -215,7 +215,7 @@ public VariantCallerParameters withSubstitutionErrorRateMatrix(String substituti
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -225,7 +225,7 @@ public VariantCallerParameters withCoverageThreshold(int coverageThreshold) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -235,7 +235,7 @@ public VariantCallerParameters withModelCoverageThreshold(int modelCoverageThres
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -245,7 +245,7 @@ public VariantCallerParameters withModelMinorCountThreshold(int modelMinorCountT
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -255,17 +255,17 @@ public VariantCallerParameters withModelOrder(int modelOrder) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

public VariantCallerParameters withCompoundQScoreThreshold(int compoundQScoreThreshold) {
public VariantCallerParameters withCompoundQScoreMu(double compoundQScoreMu) {
return new VariantCallerParameters(noIndels,
qualityThreshold, singletonFrequencyThreshold, coverageThreshold,
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -275,7 +275,7 @@ public VariantCallerParameters withCompoundQScoreSD(double compoundQScoreSD) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -285,7 +285,7 @@ public VariantCallerParameters withShowAllVariants(boolean showAllVariants) {
errorModelType,
modelOrder, modelCycles, modelEfficiency,
modelCoverageThreshold, modelMinorCountThreshold,
substitutionErrorRateMatrix, compoundQScoreThreshold, compoundQScoreSD,
substitutionErrorRateMatrix, compoundQScoreMu, compoundQScoreSD,
showAbsentVariants);
}

Expand All @@ -303,7 +303,7 @@ public Element toXml() {
e.addContent(new Element("modelCoverageThreshold").setText(Integer.toString(modelCoverageThreshold)));
e.addContent(new Element("modelMinorCountThreshold").setText(Integer.toString(modelMinorCountThreshold)));
e.addContent(new Element("substitutionErrorRateMatrix").setText(substitutionErrorRateMatrix));
e.addContent(new Element("compoundQScoreThreshold").setText(Integer.toString(compoundQScoreThreshold)));
e.addContent(new Element("compoundQScoreMu").setText(Double.toString(compoundQScoreMu)));
e.addContent(new Element("compoundQScoreSD").setText(Double.toString(compoundQScoreSD)));
e.addContent(new Element("showAbsentVariants").setText(Boolean.toString(showAbsentVariants)));
return e;
Expand All @@ -323,7 +323,7 @@ public static VariantCallerParameters fromXml(Element parent) {
Integer.parseInt(e.getChildTextTrim("modelCoverageThreshold")),
Integer.parseInt(e.getChildTextTrim("modelMinorCountThreshold")),
e.getChildTextTrim("substitutionErrorRateMatrix"),
Integer.parseInt(e.getChildTextTrim("compoundQScoreThreshold")),
Double.parseDouble(e.getChildTextTrim("compoundQScoreMu")),
Double.parseDouble(e.getChildTextTrim("compoundQScoreSD")),
Boolean.parseBoolean(e.getChildTextTrim("showAbsentVariants"))
);
Expand All @@ -340,12 +340,12 @@ public boolean equals(Object o) {
if (Double.compare(that.modelCycles, modelCycles) != 0) return false;
if (Double.compare(that.modelEfficiency, modelEfficiency) != 0) return false;
if (Double.compare(that.compoundQScoreSD, compoundQScoreSD) != 0) return false;
if (Double.compare(that.compoundQScoreMu, compoundQScoreMu) != 0) return false;
if (qualityThreshold != that.qualityThreshold) return false;
if (singletonFrequencyThreshold != that.singletonFrequencyThreshold) return false;
if (coverageThreshold != that.coverageThreshold) return false;
if (modelCoverageThreshold != that.modelCoverageThreshold) return false;
if (modelMinorCountThreshold != that.modelMinorCountThreshold) return false;
if (compoundQScoreThreshold != that.compoundQScoreThreshold) return false;
if (noIndels != that.noIndels) return false;
if (showAbsentVariants != that.showAbsentVariants) return false;
if (errorModelType != that.errorModelType) return false;
Expand All @@ -366,13 +366,14 @@ public int hashCode() {
result = 31 * result + (int) (temp ^ (temp >>> 32));
temp = Double.doubleToLongBits(compoundQScoreSD);
result = 31 * result + (int) (temp ^ (temp >>> 32));
temp = Double.doubleToLongBits(compoundQScoreMu);
result = 31 * result + (int) (temp ^ (temp >>> 32));
result = 31 * result + errorModelType.hashCode();
result = 31 * result + qualityThreshold;
result = 31 * result + singletonFrequencyThreshold;
result = 31 * result + coverageThreshold;
result = 31 * result + modelCoverageThreshold;
result = 31 * result + modelMinorCountThreshold;
result = 31 * result + compoundQScoreThreshold;
result = 31 * result + substitutionErrorRateMatrix.hashCode();
result = 31 * result + parsedSubstitutionErrorRateMatrix.hashCode();
result = 31 * result + (noIndels ? 1 : 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ public static ErrorModel create(VariantCallerParameters parameters, MutationsTab
return new MinorBasedErrorModel(parameters.getModelOrder(),
parameters.getModelCycles(), parameters.getModelEfficiency(),
parameters.getModelCoverageThreshold(), parameters.getModelMinorCountThreshold(),
mutationsTable, minorCaller);
mutationsTable,
minorCaller);
case RawData:
return new RawDataErrorModel(mutationsTable);
case Custom:
Expand Down Expand Up @@ -84,7 +85,7 @@ public static String[] getErrorModelStatisticDescriptions(VariantCallerParameter
"Fraction of reads with PCR error within MIG",
"Fraction of reads filtered by sequencing quality",
"Geometric mean of MIG size",
"2 for global, 1 for reference-wide and 0 for local (point) error rate estimate"};
"1 for global (substitution type) and 0 for local (point) error rate estimate"};
default:
return new String[0];
}
Expand Down

0 comments on commit 5839a19

Please sign in to comment.