From 73c13808085da87116854ac76dbff6ce8f987a6d Mon Sep 17 00:00:00 2001 From: Matthew The Date: Tue, 19 Apr 2016 17:00:15 +0200 Subject: [PATCH] Added picked protein FDR approach --- src/Caller.cpp | 43 +- src/FidoInterface.cpp | 17 +- src/FisherInterface.cpp | 265 +++++-- src/FisherInterface.h | 21 +- src/Protein.cpp | 13 +- src/Protein.h | 71 +- src/ProteinProbEstimator.cpp | 1280 +++++++++++++++++----------------- src/ProteinProbEstimator.h | 31 +- src/Scores.cpp | 2 +- src/fisher/FisherCaller.cpp | 6 +- 10 files changed, 958 insertions(+), 791 deletions(-) diff --git a/src/Caller.cpp b/src/Caller.cpp index b8e8e7a2..2f062355 100755 --- a/src/Caller.cpp +++ b/src/Caller.cpp @@ -250,11 +250,11 @@ bool Caller::parseOptions(int argc, char **argv) { "", TRUE_IF_SET); cmd.defineOption("f", - "fisher-protein", - "Use fisher's method to infer protein probabilities, provide the fasta file as the argument to this flag.", + "picked-protein", + "Use the picked protein-level FDR to infer protein probabilities, provide the fasta file as the argument to this flag.", "value"); cmd.defineOption("z", - "fisher-enzyme", + "protein-enzyme", "Type of enzyme \"no_enzyme\",\"elastase\",\"pepsin\",\"proteinasek\",\"thermolysin\",\"trypsinp\",\"chymotrypsin\",\"lys-n\",\"lys-c\",\"arg-c\",\"asp-n\",\"glu-c\",\"trypsin\" default=\"trypsin\"", "", "trypsin"); @@ -263,15 +263,19 @@ bool Caller::parseOptions(int argc, char **argv) { "The p-value cutoff for peptides when inferring proteins with fisher's method. Default = 1.0", "value");*/ cmd.defineOption("c", - "fisher-report-fragments", - "By default, if the peptides associated with protein A are a proper subset of the peptides associated with protein B, then protein A is eliminated and all the peptides are considered as evidence for protein B. Note that this filtering is done based on the complete set of peptides in the database, not based on the identified peptides in the search results. Alternatively, if this option is set and if all of the identified peptides associated with protein B are also associated with protein A, then Percolator will report a comma-separated list of protein IDs, where the full-length protein B is first in the list and the fragment protein A is listed second.", + "protein-report-fragments", + "By default, if the peptides associated with protein A are a proper subset of the peptides associated with protein B, then protein A is eliminated and all the peptides are considered as evidence for protein B. Note that this filtering is done based on the complete set of peptides in the database, not based on the identified peptides in the search results. Alternatively, if this option is set and if all of the identified peptides associated with protein B are also associated with protein A, then Percolator will report a comma-separated list of protein IDs, where the full-length protein B is first in the list and the fragment protein A is listed second. Not available for Fido.", "", TRUE_IF_SET); cmd.defineOption("g", - "fisher-report-duplicates", - "If multiple database proteins contain exactly the same set of peptides, then Percolator will randomly discard all but one of the proteins. If this option is set, then the IDs of these duplicated proteins will be reported as a comma-separated list.", + "protein-report-duplicates", + "If multiple database proteins contain exactly the same set of peptides, then Percolator will randomly discard all but one of the proteins. If this option is set, then the IDs of these duplicated proteins will be reported as a comma-separated list. Not available for Fido.", "", TRUE_IF_SET); + cmd.defineOption("I", + "protein-absence-ratio", + "The ratio of absent proteins, used for calculating protein-level q-values with a null hypothesis of \"Protein P is absent\". This uses the \"classic\" protein FDR in favor of the \"picked\" protein FDR.", + "value"); cmd.defineOption("A", "fido-protein", "Use the Fido algorithm to infer protein probabilities", @@ -290,10 +294,6 @@ bool Caller::parseOptions(int argc, char **argv) { "fido-gamma", "Set Fido's prior probability that a protein is present in the sample. Set by grid search if not specified", "value"); - cmd.defineOption("I", - "fido-protein-level-pi0", - "The pi_0 value when calculating protein-level empirical q-values. The value defaults to 1.0", - "value"); cmd.defineOption("q", "fido-empirical-protein-q", "Estimate empirical p-values and q-values using target-decoy analysis.", @@ -301,7 +301,7 @@ bool Caller::parseOptions(int argc, char **argv) { TRUE_IF_SET); cmd.defineOption("P", "fido-pattern", - "Define the text pattern to identify decoy proteins in the database. Default = \"random\".", + "Define the text pattern to identify decoy proteins in the database. Default = \"random_\".", "value"); cmd.defineOption("d", "fido-gridsearch-depth", @@ -402,9 +402,9 @@ bool Caller::parseOptions(int argc, char **argv) { // Confidence estimation options (general protein prob options) bool protEstimatorOutputEmpirQVal = false; bool protEstimatorTrivialGrouping = true; // cannot be set on cmd line - std::string protEstimatorDecoyPrefix = "random"; - double protEstimatorPi0 = 1.0; - if (cmd.optionSet("I")) protEstimatorPi0 = cmd.getDouble("I", 0.0, 1.0); + std::string protEstimatorDecoyPrefix = "random_"; + double protEstimatorAbsenceRatio = 1.0; + if (cmd.optionSet("I")) protEstimatorAbsenceRatio = cmd.getDouble("I", 0.0, 1.0); protEstimatorOutputEmpirQVal = cmd.optionSet("q"); if (cmd.optionSet("P")) protEstimatorDecoyPrefix = cmd.options["P"]; //if (cmd.optionSet("Q")) protEstimatorTrivialGrouping = false; @@ -442,7 +442,7 @@ bool Caller::parseOptions(int argc, char **argv) { fidoNoClustering, fidoNoPartitioning, fidoNoPruning, fidoGridSearchDepth, fidoGridSearchThreshold, fidoProteinThreshold, fidoMseThreshold, - protEstimatorPi0, protEstimatorOutputEmpirQVal, + protEstimatorAbsenceRatio, protEstimatorOutputEmpirQVal, protEstimatorDecoyPrefix, protEstimatorTrivialGrouping); } else if (cmd.optionSet("f")) { std::string fastaDatabase = cmd.options["f"]; @@ -460,7 +460,7 @@ bool Caller::parseOptions(int argc, char **argv) { protEstimator_ = new FisherInterface(fastaDatabase, fisherPvalueCutoff, fisherReportFragmentProteins, fisherReportDuplicateProteins, - protEstimatorTrivialGrouping, protEstimatorPi0, + protEstimatorTrivialGrouping, protEstimatorAbsenceRatio, protEstimatorOutputEmpirQVal, protEstimatorDecoyPrefix); } } @@ -698,9 +698,12 @@ void Caller::calculateProteinProbabilities(Scores& allScores) { double diff_time = difftime(procStart, startTime); if (VERB > 1) { - cerr << "Estimating Protein Probabilities took : " - << ((double)(procStartClock - startClock)) / (double)CLOCKS_PER_SEC - << " cpu seconds or " << diff_time << " seconds wall time" << endl; + ostringstream timerValues; + timerValues.precision(4); + timerValues << "Estimating Protein Probabilities took : " + << ((double)(procStartClock - startClock)) / (double)CLOCKS_PER_SEC + << " cpu seconds or " << diff_time << " seconds wall time" << endl; + std::cerr << timerValues.str(); } protEstimator_->printOut(proteinResultFN_, decoyProteinResultFN_); diff --git a/src/FidoInterface.cpp b/src/FidoInterface.cpp index 095521d3..74ad6bcc 100644 --- a/src/FidoInterface.cpp +++ b/src/FidoInterface.cpp @@ -77,9 +77,9 @@ FidoInterface::FidoInterface(double alpha, double beta, double gamma, bool noPartitioning, bool noClustering, bool noPruning, unsigned gridSearchDepth, double gridSearchThreshold, double proteinThreshold, double mseThreshold, - double pi0, bool outputEmpirQVal, + double absenceRatio, bool outputEmpirQVal, std::string decoyPattern, bool trivialGrouping) : - ProteinProbEstimator(trivialGrouping, pi0, outputEmpirQVal, decoyPattern), + ProteinProbEstimator(trivialGrouping, absenceRatio, outputEmpirQVal, decoyPattern), alpha_(alpha), beta_(beta), gamma_(gamma), noPartitioning_(noPartitioning), noClustering_(noClustering), noPruning_(noPruning), proteinThreshold_(proteinThreshold), @@ -99,8 +99,8 @@ void FidoInterface::run() { double localPeptidePrior = kPeptidePrior; if (kComputePriors) { - if (pi0_ != 1.0) { - localPeptidePrior = 1 - pi0_; + if (absenceRatio_ != 1.0) { + localPeptidePrior = 1 - absenceRatio_; } else { localPeptidePrior = estimatePriors(); } @@ -500,7 +500,7 @@ void FidoInterface::getEstimated_and_Empirical_FDR( PosteriorEstimator::getPValues(combined, pvals); pi0_ = PosteriorEstimator::estimatePi0(pvals); } - FDRCalculator fdrCalculator(usePi0_, targetDecoyRatio, pi0_, countDecoyQvalue_); + FDRCalculator fdrCalculator(usePi0_, targetDecoyRatio, pi0_ * absenceRatio_, countDecoyQvalue_); //NOTE no need to store more q values since they will not be taken into account while estimating MSE FDR divergence for (unsigned int k = 0; (k < proteinNames.size() && @@ -643,11 +643,8 @@ void FDRCalculator::calcFDRs(double fpChange, double tpChange, double prob, } if (tpCount_ > 0) { - if (usePi0_) { - empFDR = (fpCount_ * pi0_ * targetDecoyRatio_) / tpCount_; - } else { - empFDR = fpCount_ / tpCount_; - } + empFDR = fpCount_ / tpCount_; + if (usePi0_) empFDR *= pi0_ * targetDecoyRatio_; } if (empFDR > 1.0 || std::isnan(empFDR) || std::isinf(empFDR)) empFDR = 1.0; diff --git a/src/FisherInterface.cpp b/src/FisherInterface.cpp index 56c6851f..5e9a1dd8 100644 --- a/src/FisherInterface.cpp +++ b/src/FisherInterface.cpp @@ -19,12 +19,15 @@ FisherInterface::FisherInterface(const std::string& fastaDatabase, double pvalueCutoff, bool reportFragmentProteins, bool reportDuplicateProteins, - bool trivialGrouping, double pi0, bool outputEmpirQval, + bool trivialGrouping, double absenceRatio, bool outputEmpirQval, std::string& decoyPattern) : - ProteinProbEstimator(trivialGrouping, pi0, outputEmpirQval, decoyPattern), - fastaProteinFN_(fastaDatabase), maxPeptidePval_(pvalueCutoff), - reportFragmentProteins_(reportFragmentProteins), - reportDuplicateProteins_(reportDuplicateProteins) {} + ProteinProbEstimator(trivialGrouping, absenceRatio, outputEmpirQval, decoyPattern), + fastaProteinFN_(fastaDatabase), maxPeptidePval_(pvalueCutoff), + reportFragmentProteins_(reportFragmentProteins), + reportDuplicateProteins_(reportDuplicateProteins), + protInferenceMethod_(BESTPEPT) { + if (absenceRatio == 1.0) usePi0_ = false; +} FisherInterface::~FisherInterface() {} @@ -129,8 +132,16 @@ bool FisherInterface::initialize(Scores* fullset) { void FisherInterface::run() { std::map fragment_map, duplicate_map; fisherCaller_.setFastaDatabase(fastaProteinFN_); + + if (VERB > 1) { + std::cerr << "Detecting protein fragments/duplicates in target database" << std::endl; + } bool generateDecoys = false; fisherCaller_.getProteinFragmentsAndDuplicates(fragment_map, duplicate_map, generateDecoys); + + if (VERB > 1) { + std::cerr << "Detecting protein fragments/duplicates in decoy database" << std::endl; + } generateDecoys = true; fisherCaller_.getProteinFragmentsAndDuplicates(fragment_map, duplicate_map, generateDecoys); @@ -173,22 +184,22 @@ void FisherInterface::run() { if (!isShared) { Protein::Peptide *peptide = new Protein::Peptide( - peptideIt->pPSM->getPeptideSequence(),peptideIt->isDecoy(), - peptideIt->p, peptideIt->pep,peptideIt->q,peptideIt->p); - if (proteins.find(lastProteinId) == proteins.end()) { + peptideIt->pPSM->getPeptideSequence(), peptideIt->isDecoy(), + peptideIt->p, peptideIt->pep, peptideIt->q, peptideIt->score); + if (proteins_.find(lastProteinId) == proteins_.end()) { if (proteinsInGroup.size() > 1) { groupProteinIds[lastProteinId] = proteinsInGroup; } - Protein *newprotein = new Protein(lastProteinId,0.0,0.0,0.0,0.0, - peptideIt->isDecoy(),peptide,++numGroups); - proteins.insert(std::make_pair(lastProteinId,newprotein)); + Protein *newprotein = new Protein(lastProteinId, peptideIt->isDecoy(), + peptide, ++numGroups); + proteins_.insert(std::make_pair(lastProteinId,newprotein)); if (lastProteinId.find(decoyPattern_) == std::string::npos) { ++numberTargetProteins_; } else { ++numberDecoyProteins_; } } else { - proteins[lastProteinId]->setPeptide(peptide); + proteins_[lastProteinId]->setPeptide(peptide); if (proteinsInGroup.size() > 1) { groupProteinIds[lastProteinId].insert(proteinsInGroup.begin(), proteinsInGroup.end()); @@ -201,8 +212,8 @@ void FisherInterface::run() { std::map >::iterator groupIt; std::map::iterator representIt; for (groupIt = groupProteinIds.begin(); groupIt != groupProteinIds.end(); ++groupIt) { - representIt = proteins.find(groupIt->first); - if (representIt != proteins.end()) { + representIt = proteins_.find(groupIt->first); + if (representIt != proteins_.end()) { std::string newName = ""; for (std::set::iterator proteinIt = groupIt->second.begin(); proteinIt != groupIt->second.end(); ++proteinIt) { newName += *proteinIt + ","; @@ -215,67 +226,207 @@ void FisherInterface::run() { } void FisherInterface::computeProbabilities(const std::string& fname) { - for (std::map::iterator it = proteins.begin(); - it != proteins.end(); it++) { + for (std::map::iterator it = proteins_.begin(); + it != proteins_.end(); it++) { std::vector peptides = it->second->getPeptides(); - if (true) { - double fisher = 0.0; - int significantPeptides = 0; - for (std::vector::const_iterator itP = peptides.begin(); - itP != peptides.end(); itP++) { - fisher += log((*itP)->p / maxPeptidePval_); + switch (protInferenceMethod_) { + case FISHER: { + double fisher = 0.0; + int significantPeptides = 0; + for (std::vector::const_iterator itP = peptides.begin(); + itP != peptides.end(); itP++) { + fisher += log((*itP)->p / maxPeptidePval_); + } + double proteinPvalue = boost::math::gamma_q(peptides.size(), -1.0*fisher); + if (proteinPvalue == 0.0) proteinPvalue = DBL_MIN; + it->second->setP(proteinPvalue); + it->second->setScore(proteinPvalue); + break; + } case PEPPROD: { // MaxQuant's strategy + double logPepProd = 0.0; + for (std::vector::const_iterator itP = peptides.begin(); + itP != peptides.end(); itP++) { + logPepProd += log((*itP)->pep); + } + it->second->setScore(logPepProd); + break; + } case BESTPEPT: { + double maxScore = -1000.0; + for (std::vector::const_iterator itP = peptides.begin(); + itP != peptides.end(); itP++) { + maxScore = std::max(maxScore, (*itP)->score); + } + it->second->setScore(-1.0*maxScore); // lower scores are better + break; + } + } + } + + std::vector > protIdProtPairs(proteins_.begin(), proteins_.end()); + + if (!usePi0_) pickedProteinStrategy(protIdProtPairs); + + std::vector peps; + estimatePEPs(protIdProtPairs, peps); + + for (size_t i = 0; i < protIdProtPairs.size(); ++i) { + pepProteinMap_.insert(std::make_pair(peps[i], std::vector(1, protIdProtPairs[i].first) )); + } +} + +/* less efficient than pickedProteinStrategy, but more forgiving in specifying the correct decoyPattern */ +void FisherInterface::pickedProteinStrategySubstring( + std::vector >& protIdProtPairs) { + std::sort(protIdProtPairs.begin(), protIdProtPairs.end(), IntCmpScore()); + + std::vector > pickedProtIdProtPairs; + std::vector targetProts, decoyProts; + std::vector >::iterator it = protIdProtPairs.begin(); + size_t numErased = 0; + for (; it != protIdProtPairs.end(); ++it) { + bool isDecoy = it->second->getIsDecoy(); + bool globalErase = false; + std::string proteinName = it->second->getName(); + + std::istringstream ss(proteinName); + std::string proteinId; + while (std::getline(ss, proteinId, ',')) { // split name by comma + bool localErase = false; + if (isDecoy) { + for (size_t j = 0; j < targetProts.size(); ++j) { + if (proteinId.find(targetProts[j]) != std::string::npos) { + localErase = true; + break; + } + } + } else { + for (size_t j = 0; j < decoyProts.size(); ++j) { + if (decoyProts[j].find(proteinId) != std::string::npos) { + localErase = true; + break; + } + } } - double proteinPvalue = boost::math::gamma_q(peptides.size(), -1.0*fisher); - if (proteinPvalue == 0.0) proteinPvalue = DBL_MIN; - it->second->setP(proteinPvalue); - } else { // MaxQuant's strategy - double fisher = 0.0; - for (std::vector::const_iterator itP = peptides.begin(); - itP != peptides.end(); itP++) { - fisher += log((*itP)->pep); + if (!localErase) { + if (isDecoy) decoyProts.push_back(proteinId); + else targetProts.push_back(proteinId); + } else { + globalErase = true; } - it->second->setP(fisher); + } + if (globalErase) { + if (isDecoy) --numberDecoyProteins_; + else --numberTargetProteins_; + proteins_.erase(it->first); // this is where the printed proteins are stored + numErased += 1; + } else { + pickedProtIdProtPairs.push_back(*it); } } + pickedProtIdProtPairs.swap(protIdProtPairs); - std::vector > myvec(proteins.begin(), proteins.end()); - std::sort(myvec.begin(), myvec.end(), IntCmpPvalue()); + if (numErased == 0) { + std::cerr << "Warning: No target-decoy protein pairs found. Check if the " + << "correct decoy pattern was specified with the -P flag, i.e. if the " + << "target protein is called \"protA\" and the decoy protein " + << "\"decoy_protA\", use the option \"-p decoy_\"." << std::endl; + } +} + + +void FisherInterface::pickedProteinStrategy( + std::vector >& protIdProtPairs) { + std::sort(protIdProtPairs.begin(), protIdProtPairs.end(), IntCmpScore()); + std::vector > pickedProtIdProtPairs; + std::set targetProts, decoyProts; + std::vector >::iterator it = protIdProtPairs.begin(); + size_t numErased = 0; + for (; it != protIdProtPairs.end(); ++it) { + bool isDecoy = it->second->getIsDecoy(); + bool globalErase = false; + std::string proteinName = it->second->getName(); + + std::istringstream ss(proteinName); + std::string proteinId; + while (std::getline(ss, proteinId, ',')) { // split name by comma + bool localErase = false; + if (isDecoy) { + std::string targetId = proteinId.substr(decoyPattern_.size()); + if (targetProts.find(targetId) != targetProts.end()) { + localErase = true; + } else { + decoyProts.insert(proteinId); + } + } else { + if (decoyProts.find(decoyPattern_ + proteinId) != decoyProts.end()) { + localErase = true; + } else { + targetProts.insert(proteinId); + } + } + if (localErase) globalErase = true; + } + if (globalErase) { + if (isDecoy) --numberDecoyProteins_; + else --numberTargetProteins_; + proteins_.erase(it->first); // this is where the printed proteins are stored + numErased += 1; + } else { + pickedProtIdProtPairs.push_back(*it); + } + } + pickedProtIdProtPairs.swap(protIdProtPairs); + + if (numErased == 0) { + std::cerr << "Warning: No target-decoy protein pairs found for the picked " + << "protein strategy.\n Check if the correct decoy pattern was specified " + << "with the -P flag;\n if the target protein is called \"protA\" and the " + << "decoy protein \"decoy_protA\", use the option \"-P decoy_\"." + << std::endl << std::endl; + } +} + +void FisherInterface::estimatePEPs( + std::vector >& protIdProtPairs, + std::vector& peps) { std::vector > combined; std::vector pvals; - if (true) { // if we have well calibrated p-values - for (size_t i = 0; i < myvec.size(); ++i) { - double pValue = myvec[i].second->getP(); - bool isDecoy = myvec[i].second->getIsDecoy(); - combined.push_back(make_pair(pValue, !isDecoy)); - if (!isDecoy) { - pvals.push_back(pValue); + switch (protInferenceMethod_) { + case FISHER: { // if we have well calibrated p-values + for (size_t i = 0; i < protIdProtPairs.size(); ++i) { + double pValue = protIdProtPairs[i].second->getP(); + bool isDecoy = protIdProtPairs[i].second->getIsDecoy(); + combined.push_back(make_pair(pValue, !isDecoy)); + if (!isDecoy) { + pvals.push_back(pValue); + } } + break; + } case PEPPROD: + case BESTPEPT: { // if we have some other type of score + for (size_t i = 0; i < protIdProtPairs.size(); ++i) { + double score = protIdProtPairs[i].second->getScore(); + bool isDecoy = protIdProtPairs[i].second->getIsDecoy(); + combined.push_back(make_pair(score, !isDecoy)); + } + PosteriorEstimator::getPValues(combined, pvals); + break; } - } else { // if we have some other type of score - for (size_t i = 0; i < myvec.size(); ++i) { - double pValue = myvec[i].second->getP(); - bool isDecoy = myvec[i].second->getIsDecoy(); - combined.push_back(make_pair(pValue, !isDecoy)); - } - PosteriorEstimator::getPValues(combined, pvals); } - pi0_ = PosteriorEstimator::estimatePi0(pvals); - if (VERB > 1) { - std::cerr << "protein pi0 estimate = " << pi0_ << std::endl; + if (usePi0_) { + pi0_ = PosteriorEstimator::estimatePi0(pvals); + if (VERB > 1) { + std::cerr << "protein pi0 estimate = " << pi0_ << std::endl; + } } + std::sort(combined.begin(), combined.end()); bool includeNegativesInResult = true; - std::vector peps; PosteriorEstimator::setReversed(true); - PosteriorEstimator::estimatePEP(combined, usePi0_, pi0_, peps, includeNegativesInResult); - - for (size_t i = 0; i < myvec.size(); ++i) { - pvalues.push_back(myvec[i].second->getP()); - pepProteinMap_.insert(std::make_pair(peps[i], std::vector(1, myvec[i].first) )); - } + PosteriorEstimator::estimatePEP(combined, usePi0_, pi0_ * absenceRatio_, peps, includeNegativesInResult); } std::ostream& FisherInterface::printParametersXML(std::ostream &os) { diff --git a/src/FisherInterface.h b/src/FisherInterface.h index e718f8c6..466583b7 100644 --- a/src/FisherInterface.h +++ b/src/FisherInterface.h @@ -29,6 +29,16 @@ #include "Enzyme.h" #include "PseudoRandom.h" +/* +* Protein inference methods: +* 1. Fisher's method for combining p-values +* 2. Product of Posterior error probabilities (PEPs) +* 3. Best peptide approach +*/ +enum ProteinInferenceMethod { + FISHER, PEPPROD, BESTPEPT +}; + /* * FisherInterface is a class that computes probabilities and statistics based * on provided proteins from the set of scored peptides from percolator. It @@ -44,7 +54,7 @@ class FisherInterface : public ProteinProbEstimator { public: FisherInterface(const std::string& fastaDatabase, double pvalueCutoff, bool reportFragmentProteins, bool reportDuplicateProteins, - bool trivialGrouping, double pi0, + bool trivialGrouping, double absenceRatio, bool outputEmpirQval, std::string& decoyPattern); virtual ~FisherInterface(); @@ -56,7 +66,16 @@ class FisherInterface : public ProteinProbEstimator { string printCopyright(); private: + void pickedProteinStrategy( + std::vector >& protIdProtPairs); + void pickedProteinStrategySubstring( + std::vector >& protIdProtPairs); + void estimatePEPs( + std::vector >& protIdProtPairs, + std::vector& peps); + /** FISHER PARAMETERS **/ + ProteinInferenceMethod protInferenceMethod_; std::string fastaProteinFN_; bool reportFragmentProteins_, reportDuplicateProteins_; FisherCaller fisherCaller_; diff --git a/src/Protein.cpp b/src/Protein.cpp index 7c3547ae..87921efc 100644 --- a/src/Protein.cpp +++ b/src/Protein.cpp @@ -17,16 +17,15 @@ #include "Protein.h" -Protein::Protein(string namenew, double qnew, double qempnew, double pepnew, - double pnew, bool isdecoy_new, Protein::Peptide* __peptide, +Protein::Protein(std::string name, bool isDecoy, Protein::Peptide *peptide, int groupId) - : name(namenew), q(qnew), qemp(qempnew), pep(pepnew), p(pnew), - isDecoy(isdecoy_new), groupId_(groupId) { - if (__peptide) peptides.push_back(__peptide); + : name_(name), q_(0.0), qemp_(0.0), pep_(0.0), p_(0.0), score_(0.0), + isDecoy_(isDecoy), groupId_(groupId) { + if (peptide) setPeptide(peptide); } Protein::~Protein() { - for (unsigned i = 0; i < peptides.size(); i++) - if (peptides[i]) delete peptides[i]; + for (unsigned i = 0; i < peptides_.size(); i++) + if (peptides_[i]) delete peptides_[i]; } diff --git a/src/Protein.h b/src/Protein.h index 5026b356..f1a0ecff 100644 --- a/src/Protein.h +++ b/src/Protein.h @@ -26,52 +26,55 @@ using namespace std; class Protein { public: struct Peptide { - Peptide(std::string __name, bool __isdecoy, double __p, double __pep, double __q, - double __empq) : name(__name), isdecoy(__isdecoy), p(__p), pep(__pep), - q(__q), empq(__empq) {} + Peptide(std::string _name, bool _isdecoy, double _p, double _pep, double _q, + double _score) : name(_name), isdecoy(_isdecoy), + p(_p), pep(_pep), q(_q), score(_score) {} std::string name; bool isdecoy; - double p, pep, q, empq; + double p, pep, q, score; }; - Protein() : name(""), q(0.0), qemp(0.0), pep(0.0), p(0.0), groupId_(-1), - isDecoy(false) {} - Protein(std::string namenew,double qnew, double qempnew, double pepnew, - double pnew, bool isdecoy_new, Peptide *__peptide, int groupId); + Protein() : name_(""), q_(0.0), qemp_(0.0), pep_(0.0), p_(0.0), score_(0.0), + groupId_(-1), isDecoy_(false) {} + Protein(std::string name, bool isdecoy, Peptide *peptide, int groupId); ~Protein(); - void setName(std::string namenew) { name = namenew; } - std::string getName() const { return name; } + inline void setName(std::string name) { name_ = name; } + inline std::string getName() const { return name_; } - void setQ(double qnew) { q = qnew; } - double getQ() const { return q; } + inline void setQ(double q) { q_ = q; } + inline double getQ() const { return q_; } - void setQemp(double qempnew) { qemp = qempnew; } - double getQemp() const { return qemp; } + inline void setQemp(double qemp) { qemp_ = qemp; } + inline double getQemp() const { return qemp_; } - void setPEP(double pepnew) { pep = pepnew; } - double getPEP() const { return pep; } + inline void setPEP(double pep) { pep_ = pep; } + inline double getPEP() const { return pep_; } - void setP(double pnew) { p = pnew; } - double getP() const { return p; } + inline void setP(double p) { p_ = p; } + inline double getP() const { return p_; } - void setIsDecoy(bool isdecoynew) { isDecoy = isdecoynew; } - bool getIsDecoy() const { return isDecoy; } + inline void setScore(double score) { score_ = score; } + inline double getScore() const { return score_; } - void setGroupId(int groupId) { groupId_ = groupId; } - int getGroupId() const { return groupId_; } + inline void setIsDecoy(bool isdecoy) { isDecoy_ = isdecoy; } + inline bool getIsDecoy() const { return isDecoy_; } - void setPeptide(std::string peptide,bool isdecoy,double p,double pep,double q,double empq) { - peptides.push_back(new Peptide(peptide,isdecoy,p,pep,q,empq)); + inline void setGroupId(int groupId) { groupId_ = groupId; } + inline int getGroupId() const { return groupId_; } + + void setPeptide(std::string peptide, bool isdecoy, double p, double pep, + double q, double empq) { + peptides_.push_back(new Peptide(peptide, isdecoy, p, pep, q, empq)); } - void setPeptide(Peptide *__peptide) { - peptides.push_back(__peptide); + void setPeptide(Peptide *peptide) { + peptides_.push_back(peptide); } - void setPeptides(std::vector peptidesnew) { - peptides = std::vector(peptidesnew); + void setPeptides(std::vector peptides) { + peptides_ = std::vector(peptides); } - std::vector getPeptides() { return peptides; } - std::vector getPeptides() const { return peptides; } + std::vector getPeptides() { return peptides_; } + std::vector getPeptides() const { return peptides_; } /* inline bool operator<(const Protein& a,const Protein& b) { @@ -89,11 +92,11 @@ class Protein { */ private: - std::string name; - double q, qemp, pep, p, pi0; + std::string name_; + double q_, qemp_, pep_, p_, score_; int groupId_; - bool isDecoy; - std::vector peptides; + bool isDecoy_; + std::vector peptides_; }; diff --git a/src/ProteinProbEstimator.cpp b/src/ProteinProbEstimator.cpp index d031a678..a93b2806 100755 --- a/src/ProteinProbEstimator.cpp +++ b/src/ProteinProbEstimator.cpp @@ -1,643 +1,637 @@ -/******************************************************************************* - Copyright 2006-2012 Lukas Käll - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - *******************************************************************************/ - -#include "ProteinProbEstimator.h" - -const double ProteinProbEstimator::target_decoy_ratio = 1.0; -const double ProteinProbEstimator::psmThresholdMayu = 0.90; -const double ProteinProbEstimator::prior_protein = 0.5; -bool ProteinProbEstimator::calcProteinLevelProb = false; -/** Helper functions **/ - -template -void bootstrap(const vector& in, vector& out, size_t max_size = 1000) { - out.clear(); - double n = in.size(); - size_t num_draw = min(in.size(), max_size); - for (size_t ix = 0; ix < num_draw; ++ix) { - size_t draw = (size_t)((double)PseudoRandom::lcg_rand() / ((double)PseudoRandom::kRandMax + (double)1) * n); - out.push_back(in[draw]); - } - // sort in desending order - std::sort(out.begin(), out.end()); -} - - -ProteinProbEstimator::ProteinProbEstimator(bool trivialGrouping, double pi0, - bool outputEmpirQVal, std::string decoyPattern) : - trivialGrouping_(trivialGrouping), pi0_(pi0), peptideScores_(NULL), - numberDecoyProteins_(0u), numberTargetProteins_(0u), usePi0_(true), - outputEmpirQVal_(outputEmpirQVal), decoyPattern_(decoyPattern), fdr_(1.0) {} - -ProteinProbEstimator::~ProteinProbEstimator() { - FreeAll(qvalues); - FreeAll(qvaluesEmp); - FreeAll(pvalues); - - if (mayufdr && fastReader) { - delete fastReader; - } - fastReader = 0; - - for(std::multimap >::iterator it = pepProteinMap_.begin(); - it != pepProteinMap_.end(); it++) { - FreeAll(it->second); - } - - for(std::map::iterator it = proteins.begin(); - it != proteins.end(); it++) { - if(it->second) - delete it->second; - } -} - -bool ProteinProbEstimator::initialize(Scores* fullset) { - peptideScores_ = fullset; - setTargetandDecoysNames(); - return true; -} - -void ProteinProbEstimator::computeFDR() { - if (VERB > 1) - std::cerr << "Estimating Protein FDR ... " << std::endl; - - fastReader = new ProteinFDRestimator(); - fastReader->setDecoyPrefix(decoyPattern_); - fastReader->setTargetDecoyRatio(target_decoy_ratio); - fastReader->setEqualDeepthBinning(binning_equal_deepth); - fastReader->setNumberBins(number_bins); - fastReader->correctIdenticalSequences(targetProteins,decoyProteins); - //These guys are the number of target and decoys proteins but from the subset of PSM with FDR < threshold - std::set numberTP; - std::set numberFP; - getTPandPFfromPeptides(psmThresholdMayu,numberTP,numberFP); - - double fptol = fastReader->estimateFDR(numberTP,numberFP); - - if (fptol == -1) { - fdr_ = 1.0; - - if(VERB > 1) - std::cerr << "There was an error estimating the Protein FDR..\n" << std::endl; - } else { - fdr_ = (fptol/(double)numberTP.size()); - if(fdr_ <= 0 || fdr_ >= 1.0) fdr_ = 1.0; - - if(VERB > 1) { - std::cerr << "Estimated Protein FDR at ( " << psmThresholdMayu << ") PSM FDR is : " - << fdr_ << " with " << fptol << " expected number of false positives proteins\n" << std::endl; - } - } -} - -void ProteinProbEstimator::computeStatistics() { - if (pvalues.size() == 0) { - estimatePValues(); - } - - if (usePi0_ && !mayufdr && outputEmpirQVal_) { - pi0_ = estimatePi0(); - if (pi0_ <= 0.0 || pi0_ > 1.0) pi0_ = *qvalues.rbegin(); - if (VERB > 1) { - std::cerr << "protein pi0 estimate = " << pi0_ << std::endl; - } - /* - } else { - pi0_ = fdr_; - */ - } - - /** computing q values **/ - estimateQValues(); - estimateQValuesEmp(); - updateProteinProbabilities(); - - if (VERB > 1) { - if (trivialGrouping_) { - std::cerr << "Number of protein groups identified at q-value = 0.01: "; - } else { - std::cerr << "Number of proteins identified at q-value = 0.01: "; - } - std::cerr << getQvaluesBelowLevel(0.01) << std::endl; - } -} - -void ProteinProbEstimator::printOut(const std::string &proteinFN, - const std::string &proteinDecoyFN) { - if(!proteinFN.empty() || !proteinDecoyFN.empty()) { - if(!proteinFN.empty()) { - ofstream proteinOut(proteinFN.data(), ios::out); - print(proteinOut,false); - proteinOut.close(); - } - if(!proteinDecoyFN.empty()) { - ofstream proteinOut(proteinDecoyFN.data(), ios::out); - print(proteinOut,true); - proteinOut.close(); - } - } else { - print(std::cout); - } -} - -double ProteinProbEstimator::estimatePriors() { - /* Compute a priori probabilities of peptide presence (correctness?) */ - /* prior = the mean of the probabilities, maybe one prior for each charge * - * prior2 = assuming a peptide is present if only if the protein is present and counting - * the size of protein and prior protein probabily in the computation - * prior3 = the ratio of confident peptides among all the peptides - * prior4 = the ratio of decoy peptides versus target peptides (TDC) */ - - double prior_peptide = 0.0; - double prior_peptide2 = 0.0; - unsigned confident_peptides = 0; - unsigned decoy_peptides = 0; - unsigned total_peptides = 0; - double prior, prior2, prior3, prior4; - prior = prior2 = prior3 = prior4 = 0.0; - for (vector::iterator psm = peptideScores_->begin(); - psm!= peptideScores_->end(); ++psm) { - if(!psm->isDecoy()) { - unsigned size = psm->pPSM->proteinIds.size(); - double prior = prior_protein * size; - double tmp_prior = prior; - // for each protein - for(std::vector::iterator protIt = psm->pPSM->proteinIds.begin(); - protIt != psm->pPSM->proteinIds.end(); protIt++) { - unsigned index = std::distance(psm->pPSM->proteinIds.begin(), protIt); - tmp_prior = (tmp_prior * prior_protein * (size - index)) / (index + 1); - prior += pow(-1.0,(int)index) * tmp_prior; - } - /* update computed prior */ - prior_peptide += (1.0-prior); - if(psm->q <= 0.1) ++confident_peptides; - prior_peptide2 += (1.0-psm->pep); - ++total_peptides; - } else { - ++decoy_peptides; - } - } - - prior = prior_peptide2 / (double)total_peptides; - prior2 = prior_peptide / (double)total_peptides; - prior3 = confident_peptides / (double)total_peptides; - prior4 = (total_peptides - decoy_peptides) / (double)total_peptides; - - if (prior > 0.99) prior = 0.99; - if (prior < 0.01) prior = 0.01; - if (prior2 > 0.99) prior2 = 0.99; - if (prior2 < 0.01) prior2 = 0.01; - if (prior3 > 0.99) prior3 = 0.99; - if (prior3 < 0.01) prior3 = 0.01; - if (prior4 > 0.99) prior4 = 0.99; - if (prior4 < 0.01) prior4 = 0.01; - - return prior4; -} - -void ProteinProbEstimator::getCombinedList(std::vector > &combined) { - for (std::multimap >::const_iterator it = pepProteinMap_.begin(); - it != pepProteinMap_.end(); it++) { - double prob = it->first; - std::vector proteinList = it->second; - for(std::vector::const_iterator itP = proteinList.begin(); - itP != proteinList.end(); itP++) { - std::string proteinName = *itP; - bool isdecoy = proteins[proteinName]->getIsDecoy(); - combined.push_back(std::make_pair(prob,isdecoy)); - } - } -} - -void ProteinProbEstimator::estimatePValues() { - // assuming combined sorted in best hit first order - std::vector > combined; - getCombinedList(combined); - pvalues.clear(); - std::vector >::const_iterator myPair = combined.begin(); - size_t nDecoys = 0, posSame = 0, negSame = 0; - double prevScore = -4711.4711; // number that hopefully never turn up first in sequence - while (myPair != combined.end()) { - if (myPair->first != prevScore) { - for (size_t ix = 0; ix < posSame; ++ix) { - pvalues.push_back((double)nDecoys + (((double)negSame) - / (double)(posSame + 1)) * (ix + 1)); - } - nDecoys += negSame; - negSame = 0; - posSame = 0; - prevScore = myPair->first; - } - if (myPair->second) { - ++negSame; - } else { - ++posSame; - } - ++myPair; - } - std::transform(pvalues.begin(), pvalues.end(), pvalues.begin(), - std::bind2nd(std::divides (),(double)nDecoys)); -} - -void ProteinProbEstimator::getTPandPFfromPeptides(double psm_threshold, - std::set &numberTP, - std::set &numberFP) { - /* The original paper of Mayu describes a protein as : - * FP = if only if all its peptides with q <= threshold are decoy - * TP = at least one of its peptides with q <= threshold is target - * However, the way mayu estimates it on the program is like this : - * FP = any protein that contains a decoy psm with q <= threshold - * TP = any protein that contains a target psm with q <= threshold - * They do not consider protein containing both decoy and target psms - * Also, mayu estimates q as the empirical (target-decoy) q value. - * Percolator estimates q as the empirical (target-decoy) q value and adjusted by pi0 - * Mayu extracts the list of TP and FP proteins from PSM level whereas percolator - * extract the list of TP and FP proteins from peptide level, this avoids redundancy and - * gives a better calibration since peptide level q values are re-adjusted in percolator. - * This creates sometimes a difference in the number of TP and FP proteins between percolator and Mayus - * which causes a slight difference in the estimated protein FDR - */ - for (std::map::const_iterator it = proteins.begin(); - it != proteins.end(); it++) { - unsigned num_target_confident = 0; - unsigned num_decoy_confident = 0; - std::string protname = it->first; - std::vector peptides = it->second->getPeptides(); - for(std::vector::const_iterator itP = peptides.begin(); - itP != peptides.end(); itP++) { - Protein::Peptide *p = *itP; - if(p->q <= psm_threshold && p->isdecoy) - ++num_decoy_confident; - if(p->q <= psm_threshold && !p->isdecoy) - ++num_target_confident; - } - if (num_decoy_confident > 0) { - numberFP.insert(protname); - } - if (num_target_confident > 0) { - numberTP.insert(protname); - } - } -} - -double ProteinProbEstimator::estimatePi0(const unsigned int numBoot) { - std::vector pBoot, lambdas, pi0s, mse; - std::vector::iterator start; - unsigned int numLambda = 100; - double maxLambda = 0.5; - size_t n = pvalues.size(); - // Calculate pi0 for different values for lambda - // N.B. numLambda and maxLambda are global variables. - for (unsigned int ix = 0; ix <= numLambda; ++ix) { - double lambda = ((ix + 1) / (double)numLambda) * maxLambda; - // Find the index of the first element in p that is < lambda. - // N.B. Assumes p is sorted in ascending order. - start = lower_bound(pvalues.begin(), pvalues.end(), lambda); - // Calculates the difference in index between start and end - double Wl = (double)distance(start, pvalues.end()); - double pi0 = Wl / n / (1 - lambda); - if (pi0 > 0.0) { - lambdas.push_back(lambda); - pi0s.push_back(pi0); - } - } - if (pi0s.size()==0) { - cerr << "Error in the input data: too good separation between target " - << "and decoy Proteins.\nImpossible to estimate pi0. Taking the highest estimated q value as pi0.\n"; - return -1; - } - double minPi0 = *min_element(pi0s.begin(), pi0s.end()); - // Initialize the vector mse with zeroes. - fill_n(back_inserter(mse), pi0s.size(), 0.0); - // Examine which lambda level that is most stable under bootstrap - for (unsigned int boot = 0; boot < numBoot; ++boot) { - // Create an array of bootstrapped p-values, and sort in ascending order. - bootstrap (pvalues, pBoot); - n = pBoot.size(); - for (unsigned int ix = 0; ix < lambdas.size(); ++ix) - { - start = lower_bound(pBoot.begin(), pBoot.end(), lambdas[ix]); - double Wl = (double)distance(start, pBoot.end()); - double pi0Boot = Wl / n / (1 - lambdas[ix]); - // Estimated mean-squared error. - mse[ix] += (pi0Boot - minPi0) * (pi0Boot - minPi0); - } - } - // Which index did the iterator get? - unsigned int minIx = distance(mse.begin(), min_element(mse.begin(),mse.end())); - double pi0 = max(min(pi0s[minIx], 1.0), 0.0); - return pi0; -} - -unsigned ProteinProbEstimator::getQvaluesBelowLevel(double level) { - std::set identifiedGroupIds; - unsigned nP = 0; - for (std::map::const_iterator myP = proteins.begin(); - myP != proteins.end(); ++myP) { - if (myP->second->getQ() < level && !myP->second->getIsDecoy()) { - nP++; - identifiedGroupIds.insert(myP->second->getGroupId()); - } - } - - if (trivialGrouping_) { - return identifiedGroupIds.size(); - } else { - return nP; - } -} - -unsigned ProteinProbEstimator::getQvaluesBelowLevelDecoy(double level) { - std::set identifiedGroupIds; - unsigned nP = 0; - for (std::map::const_iterator myP = proteins.begin(); - myP != proteins.end(); ++myP) { - if (myP->second->getQ() < level && myP->second->getIsDecoy()) { - nP++; - identifiedGroupIds.insert(myP->second->getGroupId()); - } - } - - if (trivialGrouping_) { - return identifiedGroupIds.size(); - } else { - return nP; - } -} - - -void ProteinProbEstimator::estimateQValues() { - unsigned nP = 0; - double sum = 0.0; - double qvalue = 0.0; - qvalues.clear(); - - for (std::multimap >::const_iterator it = pepProteinMap_.begin(); - it != pepProteinMap_.end(); it++) { - int ntargets = countTargets(it->second); - int ndecoys = it->second.size() - ntargets; - if (trivialGrouping_) { - if (ntargets > 0) ntargets = 1; - if (ndecoys > 0) ndecoys = 1; - } - - if (ndecoys == 0) { - //NOTE in case I want to count and use target and decoys proteins while estimating qvalue from PEP - if (countDecoyQvalue_) { - sum += (double)(it->first * (ntargets + ndecoys)); - nP += (ntargets + ndecoys); - } else { - sum += (double)(it->first * ntargets); - nP += ntargets; - } - } - qvalue = (sum / (double)nP); - if (std::isnan(qvalue) || std::isinf(qvalue) || qvalue > 1.0) qvalue = 1.0; - - for (size_t i = 0; i < it->second.size(); ++i) { - qvalues.push_back(qvalue); - } - } - std::partial_sum(qvalues.rbegin(),qvalues.rend(),qvalues.rbegin(),myminfunc); -} - -void ProteinProbEstimator::estimateQValuesEmp() { - // assuming combined sorted in decending order - unsigned nDecoys = 0; - unsigned numTarget = 0; - unsigned nTargets = 0; - double qvalue = 0.0; - unsigned numDecoy = 0; - //pvalues.clear(); - qvaluesEmp.clear(); - double targetDecoyRatio = (double)numberTargetProteins_ / (double)numberDecoyProteins_; - - if (VERB > 1) { - if (trivialGrouping_) { - std::cerr << "Number of target protein groups: " << numberTargetProteins_ << std::endl; - std::cerr << "Number of decoy protein groups: " << numberDecoyProteins_ << std::endl; - } else { - std::cerr << "Number of target proteins: " << numberTargetProteins_ << std::endl; - std::cerr << "Number of decoy proteins: " << numberDecoyProteins_ << std::endl; - } - } - - std::multimap >::const_iterator it; - for (it = pepProteinMap_.begin(); it != pepProteinMap_.end(); it++) { - numTarget = countTargets(it->second); - numDecoy = it->second.size() - numTarget; - if (trivialGrouping_) { - if (numTarget > 0) numTarget = 1; - if (numDecoy > 0) numDecoy = 1; - } - nDecoys += numDecoy; - nTargets += numTarget; - - if (nTargets) { - if (usePi0_) { - qvalue = (double)(nDecoys * pi0_ * targetDecoyRatio) / (double)nTargets; - } else { - qvalue = (double)(nDecoys) / (double)nTargets; - } - } - if (std::isnan(qvalue) || std::isinf(qvalue) || qvalue > 1.0) qvalue = 1.0; - - for (size_t i = 0; i < it->second.size(); ++i) { - qvaluesEmp.push_back(qvalue); - /* - if (numDecoy > 0) { - pvalues.push_back((nDecoys)/(double)(numberDecoyProteins_)); - } else { - pvalues.push_back((nDecoys+(double)1)/(numberDecoyProteins_+(double)1)); - } - */ - } - } - std::partial_sum(qvaluesEmp.rbegin(), qvaluesEmp.rend(), qvaluesEmp.rbegin(), myminfunc); -} - -void ProteinProbEstimator::updateProteinProbabilities() { - std::vector peps; // posterior error probabilities, not peptides - std::vector > proteinNames; - std::transform(pepProteinMap_.begin(), pepProteinMap_.end(), std::back_inserter(peps), RetrieveKey()); - std::transform(pepProteinMap_.begin(), pepProteinMap_.end(), std::back_inserter(proteinNames), RetrieveValue()); - unsigned protIdx = 0; - for (unsigned pepIdx = 0; pepIdx < peps.size(); pepIdx++) { - double pep = peps[pepIdx]; - std::vector proteinlist = proteinNames[pepIdx]; - for (unsigned j = 0; j < proteinlist.size(); j++) { - int protGroupId = protIdx + 1; - if (trivialGrouping_) protGroupId = pepIdx + 1; - - std::string proteinName = proteinlist[j]; - proteins[proteinName]->setPEP(pep); - proteins[proteinName]->setQ(qvalues[protIdx]); - proteins[proteinName]->setQemp(qvaluesEmp[protIdx]); - proteins[proteinName]->setP(pvalues[protIdx]); - proteins[proteinName]->setGroupId(protGroupId); - ++protIdx; - } - } - -} - -void ProteinProbEstimator::setTargetandDecoysNames() { - unsigned int numGroups = 0; - for (vector::iterator psm = peptideScores_->begin(); psm!= peptideScores_->end(); ++psm) { - // for each protein - for (std::vector::iterator protIt = psm->pPSM->proteinIds.begin(); protIt != psm->pPSM->proteinIds.end(); protIt++) { - Protein::Peptide *peptide = new Protein::Peptide(psm->pPSM->getPeptideSequence(),psm->isDecoy(),psm->p, - psm->pep,psm->q,psm->p); - if (proteins.find(*protIt) == proteins.end()) { - Protein *newprotein = new Protein(*protIt,0.0,0.0,0.0,0.0,psm->isDecoy(),peptide,++numGroups); - proteins.insert(std::make_pair(*protIt,newprotein)); - - if (psm->isDecoy()) { - falsePosSet.insert(*protIt); - } else { - truePosSet.insert(*protIt); - } - } else { - proteins[*protIt]->setPeptide(peptide); - } - } - } - numberDecoyProteins_ = falsePosSet.size(); - numberTargetProteins_ = truePosSet.size(); -} - -void ProteinProbEstimator::addProteinDb(bool isDecoy, std::string name, std::string sequence, double length) { - if (isDecoy) - decoyProteins.insert(std::make_pair(name,std::make_pair(sequence,length))); - else - targetProteins.insert(std::make_pair(name,std::make_pair(sequence,length))); -} - -unsigned ProteinProbEstimator::countTargets(const std::vector &proteinList) { - unsigned count = 0; - for (std::vector::const_iterator it = proteinList.begin(); it != proteinList.end(); it++) { - if (useDecoyPrefix) { - if ((*it).find(decoyPattern_) == std::string::npos) { - count++; - } - } else { - if (truePosSet.count(*it) != 0) { - count++; - } - } - } - return count; -} - -unsigned ProteinProbEstimator::countDecoys(const std::vector &proteinList) { - unsigned count = 0; - for(std::vector::const_iterator it = proteinList.begin(); it != proteinList.end(); it++) { - if (useDecoyPrefix) { - if((*it).find(decoyPattern_) != std::string::npos) { - count++; - } - } else { - if(falsePosSet.count(*it) != 0) { - count++; - } - } - } - return count; -} - -bool ProteinProbEstimator::isDecoy(const std::string& proteinName) { - //NOTE faster with decoyPrefix but I assume the label that identifies decoys is in decoyPattern_ - return (bool)(useDecoyPrefix ? proteinName.find(decoyPattern_) - != std::string::npos : falsePosSet.count(proteinName) != 0); -} - -bool ProteinProbEstimator::isTarget(const std::string& proteinName) { - //NOTE faster with decoyPrefix but I assume the label that identifies decoys is in decoyPattern_ - return (bool)(useDecoyPrefix ? proteinName.find(decoyPattern_) - == std::string::npos : truePosSet.count(proteinName) != 0); -} - - -void ProteinProbEstimator::writeOutputToXML(string xmlOutputFN, bool outputDecoys) { - std::vector > myvec(proteins.begin(), proteins.end()); - std::sort(myvec.begin(), myvec.end(), IntCmpProb()); - - ofstream os; - os.open(xmlOutputFN.data(), ios::app); - // append PROTEINs tag - os << " " << endl; - for (std::vector > ::const_iterator myP = myvec.begin(); - myP != myvec.end(); myP++) { - if ( (!outputDecoys && !myP->second->getIsDecoy()) || (outputDecoys)) { - os << " second->getName() << "\""; - if (outputDecoys) { - if (myP->second->getIsDecoy()) - os << " p:decoy=\"true\""; - else - os << " p:decoy=\"false\""; - } - os << ">" << endl; - - os << " " << scientific << myP->second->getPEP() << "" << endl; - - if (outputEmpirQVal_) { - os << " " << scientific << myP->second->getQemp() << "\n"; - } - - os << " " << scientific << myP->second->getQ() << "\n"; - - if (outputEmpirQVal_) { - os << " " << scientific << myP->second->getP() << "\n"; - } - - std::vector peptides = myP->second->getPeptides(); - for (std::vector::const_iterator peptIt = peptides.begin(); - peptIt != peptides.end(); peptIt++) { - if ((*peptIt)->name != "") { - os << " name << "\"/>"<" << endl; - } - } - - os << " " << endl << endl; - os.close(); -} - -void ProteinProbEstimator::print(ostream& myout, bool decoy) { - - std::vector > myvec(proteins.begin(), proteins.end()); - std::sort(myvec.begin(), myvec.end(), IntCmpProb()); - - myout << "ProteinId\tProteinGroupId\tq-value\tposterior_error_prob\tpeptideIds" << std::endl; - - for (std::vector > ::const_iterator myP = myvec.begin(); - myP != myvec.end(); myP++) { - if( (decoy && myP->second->getIsDecoy()) || (!decoy && !myP->second->getIsDecoy())) { - myout << myP->second->getName() << "\t" << myP->second->getGroupId() << "\t" - << myP->second->getQ() << "\t" << myP->second->getPEP() << "\t"; - std::vector peptides = myP->second->getPeptides(); - for(std::vector::const_iterator peptIt = peptides.begin(); peptIt != peptides.end(); peptIt++) { - if((*peptIt)->name != "") { - myout << (*peptIt)->name << " "; - } - } - myout << std::endl; - } - } -} +/******************************************************************************* + Copyright 2006-2012 Lukas Käll + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + *******************************************************************************/ + +#include "ProteinProbEstimator.h" + +const double ProteinProbEstimator::target_decoy_ratio = 1.0; +const double ProteinProbEstimator::psmThresholdMayu = 0.90; +const double ProteinProbEstimator::prior_protein = 0.5; +bool ProteinProbEstimator::calcProteinLevelProb = false; +/** Helper functions **/ + +template +void bootstrap(const vector& in, vector& out, size_t max_size = 1000) { + out.clear(); + double n = in.size(); + size_t num_draw = min(in.size(), max_size); + for (size_t ix = 0; ix < num_draw; ++ix) { + size_t draw = (size_t)((double)PseudoRandom::lcg_rand() / ((double)PseudoRandom::kRandMax + (double)1) * n); + out.push_back(in[draw]); + } + // sort in desending order + std::sort(out.begin(), out.end()); +} + + +ProteinProbEstimator::ProteinProbEstimator(bool trivialGrouping, double absenceRatio, + bool outputEmpirQVal, std::string decoyPattern) : + trivialGrouping_(trivialGrouping), absenceRatio_(absenceRatio), peptideScores_(NULL), + numberDecoyProteins_(0u), numberTargetProteins_(0u), usePi0_(true), + outputEmpirQVal_(outputEmpirQVal), decoyPattern_(decoyPattern), fdr_(1.0) {} + +ProteinProbEstimator::~ProteinProbEstimator() { + FreeAll(qvalues); + FreeAll(qvaluesEmp); + FreeAll(pvalues); + + if (mayufdr && fastReader) { + delete fastReader; + } + fastReader = 0; + + for(std::multimap >::iterator it = pepProteinMap_.begin(); + it != pepProteinMap_.end(); it++) { + FreeAll(it->second); + } + + for(std::map::iterator it = proteins_.begin(); + it != proteins_.end(); it++) { + if(it->second) + delete it->second; + } +} + +bool ProteinProbEstimator::initialize(Scores* fullset) { + peptideScores_ = fullset; + setTargetandDecoysNames(); + return true; +} + +void ProteinProbEstimator::computeFDR() { + if (VERB > 1) + std::cerr << "Estimating Protein FDR ... " << std::endl; + + fastReader = new ProteinFDRestimator(); + fastReader->setDecoyPrefix(decoyPattern_); + fastReader->setTargetDecoyRatio(target_decoy_ratio); + fastReader->setEqualDeepthBinning(binning_equal_deepth); + fastReader->setNumberBins(number_bins); + fastReader->correctIdenticalSequences(targetProteins,decoyProteins); + //These guys are the number of target and decoys proteins but from the subset of PSM with FDR < threshold + std::set numberTP; + std::set numberFP; + getTPandPFfromPeptides(psmThresholdMayu,numberTP,numberFP); + + double fptol = fastReader->estimateFDR(numberTP,numberFP); + + if (fptol == -1) { + fdr_ = 1.0; + + if(VERB > 1) + std::cerr << "There was an error estimating the Protein FDR..\n" << std::endl; + } else { + fdr_ = (fptol/(double)numberTP.size()); + if(fdr_ <= 0 || fdr_ >= 1.0) fdr_ = 1.0; + + if(VERB > 1) { + std::cerr << "Estimated Protein FDR at ( " << psmThresholdMayu << ") PSM FDR is : " + << fdr_ << " with " << fptol << " expected number of false positives proteins\n" << std::endl; + } + } +} + +void ProteinProbEstimator::computeStatistics() { + if (pvalues.size() == 0) { + estimatePValues(); + } + + if (usePi0_ && !mayufdr && outputEmpirQVal_) { + pi0_ = estimatePi0(); + if (pi0_ <= 0.0 || pi0_ > 1.0) pi0_ = *qvalues.rbegin(); + if (VERB > 1) { + std::cerr << "protein pi0 estimate = " << pi0_ << std::endl; + } + } + + /** computing q values **/ + estimateQValues(); + estimateQValuesEmp(); + updateProteinProbabilities(); + + if (VERB > 1) { + if (trivialGrouping_) { + std::cerr << "Number of protein groups identified at q-value = 0.01: "; + } else { + std::cerr << "Number of proteins identified at q-value = 0.01: "; + } + std::cerr << getQvaluesBelowLevel(0.01) << std::endl; + } +} + +void ProteinProbEstimator::printOut(const std::string &proteinFN, + const std::string &proteinDecoyFN) { + if(!proteinFN.empty() || !proteinDecoyFN.empty()) { + if(!proteinFN.empty()) { + ofstream proteinOut(proteinFN.data(), ios::out); + print(proteinOut,false); + proteinOut.close(); + } + if(!proteinDecoyFN.empty()) { + ofstream proteinOut(proteinDecoyFN.data(), ios::out); + print(proteinOut,true); + proteinOut.close(); + } + } else { + print(std::cout); + } +} + +double ProteinProbEstimator::estimatePriors() { + /* Compute a priori probabilities of peptide presence (correctness?) */ + /* prior = the mean of the probabilities, maybe one prior for each charge * + * prior2 = assuming a peptide is present if only if the protein is present and counting + * the size of protein and prior protein probabily in the computation + * prior3 = the ratio of confident peptides among all the peptides + * prior4 = the ratio of decoy peptides versus target peptides (TDC) */ + + double prior_peptide = 0.0; + double prior_peptide2 = 0.0; + unsigned confident_peptides = 0; + unsigned decoy_peptides = 0; + unsigned total_peptides = 0; + double prior, prior2, prior3, prior4; + prior = prior2 = prior3 = prior4 = 0.0; + for (vector::iterator psm = peptideScores_->begin(); + psm!= peptideScores_->end(); ++psm) { + if(!psm->isDecoy()) { + unsigned size = psm->pPSM->proteinIds.size(); + double prior = prior_protein * size; + double tmp_prior = prior; + // for each protein + for(std::vector::iterator protIt = psm->pPSM->proteinIds.begin(); + protIt != psm->pPSM->proteinIds.end(); protIt++) { + unsigned index = std::distance(psm->pPSM->proteinIds.begin(), protIt); + tmp_prior = (tmp_prior * prior_protein * (size - index)) / (index + 1); + prior += pow(-1.0,(int)index) * tmp_prior; + } + /* update computed prior */ + prior_peptide += (1.0-prior); + if(psm->q <= 0.1) ++confident_peptides; + prior_peptide2 += (1.0-psm->pep); + ++total_peptides; + } else { + ++decoy_peptides; + } + } + + prior = prior_peptide2 / (double)total_peptides; + prior2 = prior_peptide / (double)total_peptides; + prior3 = confident_peptides / (double)total_peptides; + prior4 = (total_peptides - decoy_peptides) / (double)total_peptides; + + if (prior > 0.99) prior = 0.99; + if (prior < 0.01) prior = 0.01; + if (prior2 > 0.99) prior2 = 0.99; + if (prior2 < 0.01) prior2 = 0.01; + if (prior3 > 0.99) prior3 = 0.99; + if (prior3 < 0.01) prior3 = 0.01; + if (prior4 > 0.99) prior4 = 0.99; + if (prior4 < 0.01) prior4 = 0.01; + + return prior4; +} + +void ProteinProbEstimator::getCombinedList(std::vector > &combined) { + for (std::multimap >::const_iterator it = pepProteinMap_.begin(); + it != pepProteinMap_.end(); it++) { + double prob = it->first; + std::vector proteinList = it->second; + for(std::vector::const_iterator itP = proteinList.begin(); + itP != proteinList.end(); itP++) { + std::string proteinName = *itP; + bool isdecoy = proteins_[proteinName]->getIsDecoy(); + combined.push_back(std::make_pair(prob,isdecoy)); + } + } +} + +void ProteinProbEstimator::estimatePValues() { + // assuming combined sorted in best hit first order + std::vector > combined; + getCombinedList(combined); + pvalues.clear(); + std::vector >::const_iterator myPair = combined.begin(); + size_t nDecoys = 0, posSame = 0, negSame = 0; + double prevScore = -4711.4711; // number that hopefully never turn up first in sequence + while (myPair != combined.end()) { + if (myPair->first != prevScore) { + for (size_t ix = 0; ix < posSame; ++ix) { + pvalues.push_back((double)nDecoys + (((double)negSame) + / (double)(posSame + 1)) * (ix + 1)); + } + nDecoys += negSame; + negSame = 0; + posSame = 0; + prevScore = myPair->first; + } + if (myPair->second) { + ++negSame; + } else { + ++posSame; + } + ++myPair; + } + std::transform(pvalues.begin(), pvalues.end(), pvalues.begin(), + std::bind2nd(std::divides (),(double)nDecoys)); +} + +void ProteinProbEstimator::getTPandPFfromPeptides(double psm_threshold, + std::set &numberTP, + std::set &numberFP) { + /* The original paper of Mayu describes a protein as : + * FP = if only if all its peptides with q <= threshold are decoy + * TP = at least one of its peptides with q <= threshold is target + * However, the way mayu estimates it on the program is like this : + * FP = any protein that contains a decoy psm with q <= threshold + * TP = any protein that contains a target psm with q <= threshold + * They do not consider protein containing both decoy and target psms + * Also, mayu estimates q as the empirical (target-decoy) q value. + * Percolator estimates q as the empirical (target-decoy) q value and adjusted by pi0 + * Mayu extracts the list of TP and FP proteins from PSM level whereas percolator + * extract the list of TP and FP proteins from peptide level, this avoids redundancy and + * gives a better calibration since peptide level q values are re-adjusted in percolator. + * This creates sometimes a difference in the number of TP and FP proteins between percolator and Mayus + * which causes a slight difference in the estimated protein FDR + */ + for (std::map::const_iterator it = proteins_.begin(); + it != proteins_.end(); it++) { + unsigned num_target_confident = 0; + unsigned num_decoy_confident = 0; + std::string protname = it->first; + std::vector peptides = it->second->getPeptides(); + for(std::vector::const_iterator itP = peptides.begin(); + itP != peptides.end(); itP++) { + Protein::Peptide *p = *itP; + if(p->q <= psm_threshold && p->isdecoy) + ++num_decoy_confident; + if(p->q <= psm_threshold && !p->isdecoy) + ++num_target_confident; + } + if (num_decoy_confident > 0) { + numberFP.insert(protname); + } + if (num_target_confident > 0) { + numberTP.insert(protname); + } + } +} + +double ProteinProbEstimator::estimatePi0(const unsigned int numBoot) { + std::vector pBoot, lambdas, pi0s, mse; + std::vector::iterator start; + unsigned int numLambda = 100; + double maxLambda = 0.5; + size_t n = pvalues.size(); + // Calculate pi0 for different values for lambda + // N.B. numLambda and maxLambda are global variables. + for (unsigned int ix = 0; ix <= numLambda; ++ix) { + double lambda = ((ix + 1) / (double)numLambda) * maxLambda; + // Find the index of the first element in p that is < lambda. + // N.B. Assumes p is sorted in ascending order. + start = lower_bound(pvalues.begin(), pvalues.end(), lambda); + // Calculates the difference in index between start and end + double Wl = (double)distance(start, pvalues.end()); + double pi0 = Wl / n / (1 - lambda); + if (pi0 > 0.0) { + lambdas.push_back(lambda); + pi0s.push_back(pi0); + } + } + if (pi0s.size()==0) { + cerr << "Error in the input data: too good separation between target " + << "and decoy Proteins.\nImpossible to estimate pi0. Taking the highest estimated q value as pi0.\n"; + return -1; + } + double minPi0 = *min_element(pi0s.begin(), pi0s.end()); + // Initialize the vector mse with zeroes. + fill_n(back_inserter(mse), pi0s.size(), 0.0); + // Examine which lambda level that is most stable under bootstrap + for (unsigned int boot = 0; boot < numBoot; ++boot) { + // Create an array of bootstrapped p-values, and sort in ascending order. + bootstrap (pvalues, pBoot); + n = pBoot.size(); + for (unsigned int ix = 0; ix < lambdas.size(); ++ix) + { + start = lower_bound(pBoot.begin(), pBoot.end(), lambdas[ix]); + double Wl = (double)distance(start, pBoot.end()); + double pi0Boot = Wl / n / (1 - lambdas[ix]); + // Estimated mean-squared error. + mse[ix] += (pi0Boot - minPi0) * (pi0Boot - minPi0); + } + } + // Which index did the iterator get? + unsigned int minIx = distance(mse.begin(), min_element(mse.begin(),mse.end())); + double pi0 = max(min(pi0s[minIx], 1.0), 0.0); + return pi0; +} + +unsigned ProteinProbEstimator::getQvaluesBelowLevel(double level) { + std::set identifiedGroupIds; + unsigned nP = 0; + for (std::map::const_iterator myP = proteins_.begin(); + myP != proteins_.end(); ++myP) { + if (myP->second->getQemp() < level && !myP->second->getIsDecoy()) { + nP++; + identifiedGroupIds.insert(myP->second->getGroupId()); + } + } + + if (trivialGrouping_) { + return identifiedGroupIds.size(); + } else { + return nP; + } +} + +unsigned ProteinProbEstimator::getQvaluesBelowLevelDecoy(double level) { + std::set identifiedGroupIds; + unsigned nP = 0; + for (std::map::const_iterator myP = proteins_.begin(); + myP != proteins_.end(); ++myP) { + if (myP->second->getQ() < level && myP->second->getIsDecoy()) { + nP++; + identifiedGroupIds.insert(myP->second->getGroupId()); + } + } + + if (trivialGrouping_) { + return identifiedGroupIds.size(); + } else { + return nP; + } +} + + +void ProteinProbEstimator::estimateQValues() { + unsigned nP = 0; + double sum = 0.0; + double qvalue = 0.0; + qvalues.clear(); + + for (std::multimap >::const_iterator it = pepProteinMap_.begin(); + it != pepProteinMap_.end(); it++) { + int ntargets = countTargets(it->second); + int ndecoys = it->second.size() - ntargets; + if (trivialGrouping_) { + if (ntargets > 0) ntargets = 1; + if (ndecoys > 0) ndecoys = 1; + } + + if (ndecoys == 0) { + //NOTE in case I want to count and use target and decoys proteins while estimating qvalue from PEP + if (countDecoyQvalue_) { + sum += (double)(it->first * (ntargets + ndecoys)); + nP += (ntargets + ndecoys); + } else { + sum += (double)(it->first * ntargets); + nP += ntargets; + } + } + qvalue = (sum * absenceRatio_ / (double)nP); + if (std::isnan(qvalue) || std::isinf(qvalue) || qvalue > absenceRatio_) qvalue = absenceRatio_; + + for (size_t i = 0; i < it->second.size(); ++i) { + qvalues.push_back(qvalue); + } + } + std::partial_sum(qvalues.rbegin(),qvalues.rend(),qvalues.rbegin(),myminfunc); +} + +void ProteinProbEstimator::estimateQValuesEmp() { + // assuming combined sorted in decending order + unsigned nDecoys = 0; + unsigned numTarget = 0; + unsigned nTargets = 0; + double qvalue = 0.0; + unsigned numDecoy = 0; + //pvalues.clear(); + qvaluesEmp.clear(); + double targetDecoyRatio = (double)numberTargetProteins_ / (double)numberDecoyProteins_; + + if (VERB > 1) { + if (trivialGrouping_) { + std::cerr << "Number of target protein groups: " << numberTargetProteins_ << std::endl; + std::cerr << "Number of decoy protein groups: " << numberDecoyProteins_ << std::endl; + } else { + std::cerr << "Number of target proteins: " << numberTargetProteins_ << std::endl; + std::cerr << "Number of decoy proteins: " << numberDecoyProteins_ << std::endl; + } + } + + std::multimap >::const_iterator it; + for (it = pepProteinMap_.begin(); it != pepProteinMap_.end(); it++) { + numTarget = countTargets(it->second); + numDecoy = it->second.size() - numTarget; + if (trivialGrouping_) { + if (numTarget > 0) numTarget = 1; + if (numDecoy > 0) numDecoy = 1; + } + nDecoys += numDecoy; + nTargets += numTarget; + + if (nTargets) { + qvalue = (double)(nDecoys * absenceRatio_) / (double)nTargets; + if (usePi0_) qvalue *= pi0_ * targetDecoyRatio; + } + if (std::isnan(qvalue) || std::isinf(qvalue) || qvalue > absenceRatio_) qvalue = absenceRatio_; + + for (size_t i = 0; i < it->second.size(); ++i) { + qvaluesEmp.push_back(qvalue); + /* + if (numDecoy > 0) { + pvalues.push_back((nDecoys)/(double)(numberDecoyProteins_)); + } else { + pvalues.push_back((nDecoys+(double)1)/(numberDecoyProteins_+(double)1)); + } + */ + } + } + std::partial_sum(qvaluesEmp.rbegin(), qvaluesEmp.rend(), qvaluesEmp.rbegin(), myminfunc); +} + +void ProteinProbEstimator::updateProteinProbabilities() { + std::vector peps; // posterior error probabilities, not peptides + std::vector > proteinNames; + std::transform(pepProteinMap_.begin(), pepProteinMap_.end(), std::back_inserter(peps), RetrieveKey()); + std::transform(pepProteinMap_.begin(), pepProteinMap_.end(), std::back_inserter(proteinNames), RetrieveValue()); + unsigned protIdx = 0; + for (unsigned pepIdx = 0; pepIdx < peps.size(); pepIdx++) { + double pep = peps[pepIdx]; + std::vector proteinlist = proteinNames[pepIdx]; + for (unsigned j = 0; j < proteinlist.size(); j++) { + int protGroupId = protIdx + 1; + if (trivialGrouping_) protGroupId = pepIdx + 1; + + std::string proteinName = proteinlist[j]; + proteins_[proteinName]->setPEP(pep); + proteins_[proteinName]->setQ(qvalues[protIdx]); + proteins_[proteinName]->setQemp(qvaluesEmp[protIdx]); + proteins_[proteinName]->setP(pvalues[protIdx]); + proteins_[proteinName]->setGroupId(protGroupId); + ++protIdx; + } + } + +} + +void ProteinProbEstimator::setTargetandDecoysNames() { + unsigned int numGroups = 0; + for (vector::iterator psm = peptideScores_->begin(); psm!= peptideScores_->end(); ++psm) { + // for each protein + for (std::vector::iterator protIt = psm->pPSM->proteinIds.begin(); protIt != psm->pPSM->proteinIds.end(); protIt++) { + Protein::Peptide *peptide = new Protein::Peptide( + psm->pPSM->getPeptideSequence(), psm->isDecoy(), + psm->p, psm->pep, psm->q, psm->score); + if (proteins_.find(*protIt) == proteins_.end()) { + Protein *newprotein = new Protein(*protIt, psm->isDecoy(), peptide, ++numGroups); + proteins_.insert(std::make_pair(*protIt,newprotein)); + + if (psm->isDecoy()) { + falsePosSet.insert(*protIt); + } else { + truePosSet.insert(*protIt); + } + } else { + proteins_[*protIt]->setPeptide(peptide); + } + } + } + numberDecoyProteins_ = falsePosSet.size(); + numberTargetProteins_ = truePosSet.size(); +} + +void ProteinProbEstimator::addProteinDb(bool isDecoy, std::string name, std::string sequence, double length) { + if (isDecoy) + decoyProteins.insert(std::make_pair(name,std::make_pair(sequence,length))); + else + targetProteins.insert(std::make_pair(name,std::make_pair(sequence,length))); +} + +unsigned ProteinProbEstimator::countTargets(const std::vector &proteinList) { + unsigned count = 0; + for (std::vector::const_iterator it = proteinList.begin(); it != proteinList.end(); it++) { + if (useDecoyPrefix) { + if ((*it).find(decoyPattern_) == std::string::npos) { + count++; + } + } else { + if (truePosSet.count(*it) != 0) { + count++; + } + } + } + return count; +} + +unsigned ProteinProbEstimator::countDecoys(const std::vector &proteinList) { + unsigned count = 0; + for(std::vector::const_iterator it = proteinList.begin(); it != proteinList.end(); it++) { + if (useDecoyPrefix) { + if((*it).find(decoyPattern_) != std::string::npos) { + count++; + } + } else { + if(falsePosSet.count(*it) != 0) { + count++; + } + } + } + return count; +} + +bool ProteinProbEstimator::isDecoy(const std::string& proteinName) { + //NOTE faster with decoyPrefix but I assume the label that identifies decoys is in decoyPattern_ + return (bool)(useDecoyPrefix ? proteinName.find(decoyPattern_) + != std::string::npos : falsePosSet.count(proteinName) != 0); +} + +bool ProteinProbEstimator::isTarget(const std::string& proteinName) { + //NOTE faster with decoyPrefix but I assume the label that identifies decoys is in decoyPattern_ + return (bool)(useDecoyPrefix ? proteinName.find(decoyPattern_) + == std::string::npos : truePosSet.count(proteinName) != 0); +} + + +void ProteinProbEstimator::writeOutputToXML(string xmlOutputFN, bool outputDecoys) { + std::vector > myvec(proteins_.begin(), proteins_.end()); + std::sort(myvec.begin(), myvec.end(), IntCmpProb()); + + ofstream os; + os.open(xmlOutputFN.data(), ios::app); + // append PROTEINs tag + os << " " << endl; + for (std::vector > ::const_iterator myP = myvec.begin(); + myP != myvec.end(); myP++) { + if ( (!outputDecoys && !myP->second->getIsDecoy()) || (outputDecoys)) { + os << " second->getName() << "\""; + if (outputDecoys) { + if (myP->second->getIsDecoy()) + os << " p:decoy=\"true\""; + else + os << " p:decoy=\"false\""; + } + os << ">" << endl; + + os << " " << scientific << myP->second->getPEP() << "" << endl; + + if (outputEmpirQVal_) { + os << " " << scientific << myP->second->getQemp() << "\n"; + } + + os << " " << scientific << myP->second->getQ() << "\n"; + + if (outputEmpirQVal_) { + os << " " << scientific << myP->second->getP() << "\n"; + } + + std::vector peptides = myP->second->getPeptides(); + for (std::vector::const_iterator peptIt = peptides.begin(); + peptIt != peptides.end(); peptIt++) { + if ((*peptIt)->name != "") { + os << " name << "\"/>"<" << endl; + } + } + + os << " " << endl << endl; + os.close(); +} + +void ProteinProbEstimator::print(ostream& myout, bool decoy) { + + std::vector > myvec(proteins_.begin(), proteins_.end()); + std::sort(myvec.begin(), myvec.end(), IntCmpProb()); + + myout << "ProteinId\tProteinGroupId\tq-value\tposterior_error_prob\tpeptideIds" << std::endl; + + for (std::vector > ::const_iterator myP = myvec.begin(); + myP != myvec.end(); myP++) { + if( (decoy && myP->second->getIsDecoy()) || (!decoy && !myP->second->getIsDecoy())) { + myout << myP->second->getName() << "\t" << myP->second->getGroupId() << "\t" + << myP->second->getQemp() << "\t" << myP->second->getPEP() << "\t"; + std::vector peptides = myP->second->getPeptides(); + for(std::vector::const_iterator peptIt = peptides.begin(); peptIt != peptides.end(); peptIt++) { + if((*peptIt)->name != "") { + myout << (*peptIt)->name << " "; + } + } + myout << std::endl; + } + } +} diff --git a/src/ProteinProbEstimator.h b/src/ProteinProbEstimator.h index 29e6f338..a45385fe 100755 --- a/src/ProteinProbEstimator.h +++ b/src/ProteinProbEstimator.h @@ -36,22 +36,22 @@ /** set of helper functions to sort data structures and some operations overloaded **/ struct IntCmpProb { bool operator()(const std::pair &lhs, const std::pair &rhs) { - return ( (lhs.second->getPEP() < rhs.second->getPEP()) - || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() < rhs.second->getQ()) ) - || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() == rhs.second->getQ()) - && (lhs.second->getGroupId() < rhs.second->getGroupId()) ) - || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() == rhs.second->getQ()) - && (lhs.second->getGroupId() == rhs.second->getGroupId()) - && (lhs.second->getName() < rhs.second->getName()) ) + return ( (lhs.second->getPEP() < rhs.second->getPEP()) + || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() < rhs.second->getQ()) ) + || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() == rhs.second->getQ()) + && (lhs.second->getGroupId() < rhs.second->getGroupId()) ) + || ( (lhs.second->getPEP() == rhs.second->getPEP()) && (lhs.second->getQ() == rhs.second->getQ()) + && (lhs.second->getGroupId() == rhs.second->getGroupId()) + && (lhs.second->getName() < rhs.second->getName()) ) ); } }; -struct IntCmpPvalue { +struct IntCmpScore { bool operator()(const std::pair &lhs, const std::pair &rhs) { - return ( (lhs.second->getP() < rhs.second->getP()) - || ( (lhs.second->getP() == rhs.second->getP()) - && (lhs.second->getName() < rhs.second->getName()) ) + return ( lhs.second->getScore() < rhs.second->getScore() + || (lhs.second->getScore() == rhs.second->getScore() + && lhs.second->getName() < rhs.second->getName()) ); } }; @@ -117,7 +117,7 @@ class ProteinProbEstimator { /******************************************************************************************************************/ - ProteinProbEstimator(bool trivialGrouping = true, double pi0 = 1.0, + ProteinProbEstimator(bool trivialGrouping = true, double absenceRatio = 1.0, bool outputEmpirQVal = false, std::string decoyPattern = "random"); virtual ~ProteinProbEstimator(); @@ -152,7 +152,7 @@ class ProteinProbEstimator { void setTargetandDecoysNames(); /** return the data structure for the proteins **/ - std::map getProteins() { return proteins; } + std::map getProteins() { return proteins_; } /** add proteins read from the database **/ void addProteinDb(bool isDecoy, std::string name, std::string sequence, double length); @@ -168,6 +168,7 @@ class ProteinProbEstimator { static inline bool getCalcProteinLevelProb() { return calcProteinLevelProb; } inline bool getUsePi0() { return usePi0_; } double getPi0() { return pi0_; } + double getAbsenceRatio() { return absenceRatio_; } double getFDR() { return fdr_; } protected: @@ -212,7 +213,7 @@ class ProteinProbEstimator { /** variables **/ std::set truePosSet, falsePosSet; ProteinFDRestimator *fastReader; - std::map proteins; + std::map proteins_; std::multimap > pepProteinMap_; std::map > targetProteins; std::map > decoyProteins; @@ -224,7 +225,7 @@ class ProteinProbEstimator { /* protein groups are either present or absent and cannot be partially present */ bool trivialGrouping_; bool usePi0_; - double pi0_; + double pi0_, absenceRatio_; bool outputEmpirQVal_; double fdr_; unsigned int numberDecoyProteins_; diff --git a/src/Scores.cpp b/src/Scores.cpp index 7925be1e..26063abd 100755 --- a/src/Scores.cpp +++ b/src/Scores.cpp @@ -689,7 +689,7 @@ void Scores::calcPep() { // Logistic regression on the data PosteriorEstimator::estimatePEP(combined, usePi0_, pi0_, peps, true); for (size_t ix = 0; ix < scores_.size(); ix++) { - (scores_[ix]).pep = peps[ix]; + scores_[ix].pep = peps[ix]; } } diff --git a/src/fisher/FisherCaller.cpp b/src/fisher/FisherCaller.cpp index 72df6087..fae9b2a9 100644 --- a/src/fisher/FisherCaller.cpp +++ b/src/fisher/FisherCaller.cpp @@ -333,7 +333,7 @@ bool FisherCaller::getProteinFragmentsAndDuplicates( clock_t procStartClock = clock(); time(&procStart); double diff = difftime(procStart, startTime); - if (VERB > 1) cerr << "Creating database took " + if (VERB > 3) cerr << "Creating database took " << ((double)(procStartClock - startClock)) / (double)CLOCKS_PER_SEC << " cpu seconds or " << diff << " seconds wall time" << endl; @@ -351,7 +351,7 @@ bool FisherCaller::getProteinFragmentsAndDuplicates( procStartClock = clock(); time(&procStart); diff = difftime(procStart, startTime); - if (VERB > 1) cerr << "Creating protein peptide map took " + if (VERB > 3) cerr << "Creating protein peptide map took " << ((double)(procStartClock - startClock)) / (double)CLOCKS_PER_SEC << " cpu seconds or " << diff << " seconds wall time" << endl; @@ -368,7 +368,7 @@ bool FisherCaller::getProteinFragmentsAndDuplicates( procStartClock = clock(); time(&procStart); diff = difftime(procStart, startTime); - if (VERB > 1) cerr << "Creating fragment protein map took " + if (VERB > 3) cerr << "Creating fragment protein map took " << ((double)(procStartClock - startClock)) / (double)CLOCKS_PER_SEC << " cpu seconds or " << diff << " seconds wall time" << endl;