Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improving BuildDistribution #1597

Merged
merged 4 commits into from
Sep 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
* Deprecated VisualTest::DrawLinearModelResidual(inputSample, outputSample, linearModelResult), use VisualTest::DrawLinearModelResidual(linearModelResult) instead
* Deprecated OptimizationResult::getLagrangeMultipliers
* Moved OptimizationAlgorithm::computeLagrangeMultipliers to OptimizationResult
* Added AIC & BestModelAIC static methods in FittingTest
* Added AICC & BestModelAICC static methods in FittingTest
* Moved BuildDistribution from FunctionalChaosAlgorithm to MetaModelAlgorithm

=== Documentation ===
==== API documentation ====
Expand All @@ -50,6 +53,7 @@
* #1267 (Some CSV files cannot be imported)
* #1377 (The `setKnownParameter` method is not compatible with `buildEstimator`)
* #1407 (GeneralLinearModelAlgorithm mishandles user-specified scale parameter when normalize is True)
* #1415 (The BuildDistribution static method should not use the KS-test)
* #1421 (UserDefinedStationaryCovarianceModel doc suggests input dimension can be >1)
* #1447 (Highly inaccurate result in reliability model when using subset of RandomVector)
* #1465 (The Sample constructor based on a list and an integer should not exist)
Expand Down
6 changes: 5 additions & 1 deletion lib/etc/openturns.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -579,9 +579,13 @@
<CleaningStrategy-DefaultSignificanceFactor value_float="1.0e-4" />
<CleaningStrategy-DefaultMaximumSize value_int="20" />

<!-- OT::MetaModelAlgorithm parameters -->
<MetaModelAlgorithm-PValueThreshold value_float="1.0e-3" />
<MetaModelAlgorithm-ModelSelectionCriterion value_string="BIC" />
<MetaModelAlgorithm-NonParametricModel value_string="Histogram" />

<!-- OT::FunctionalChaosAlgorithm parameters -->
<FunctionalChaosAlgorithm-DefaultMaximumResidual value_float="1.0e-6" />
<FunctionalChaosAlgorithm-PValueThreshold value_float="1.0e-3" />
<FunctionalChaosAlgorithm-QNorm value_float="0.5" />
<FunctionalChaosAlgorithm-LargeSampleSize value_int="10000" />
<FunctionalChaosAlgorithm-MaximumTotalDegree value_int="10" />
Expand Down
6 changes: 5 additions & 1 deletion lib/src/Base/Common/ResourceMap.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1196,9 +1196,13 @@ void ResourceMap::loadDefaultConfiguration()
addAsScalar("CleaningStrategy-DefaultSignificanceFactor", 1.0e-4);
addAsUnsignedInteger("CleaningStrategy-DefaultMaximumSize", 20);

// MetaModelAlgorithm parameters //
addAsScalar("MetaModelAlgorithm-PValueThreshold", 1.0e-3);
addAsString("MetaModelAlgorithm-ModelSelectionCriterion", "BIC");
addAsString("MetaModelAlgorithm-NonParametricModel", "Histogram");

// FunctionalChaosAlgorithm parameters //
addAsScalar("FunctionalChaosAlgorithm-DefaultMaximumResidual", 1.0e-6);
addAsScalar("FunctionalChaosAlgorithm-PValueThreshold", 1.0e-3);
addAsScalar("FunctionalChaosAlgorithm-QNorm", 0.5);
addAsUnsignedInteger("FunctionalChaosAlgorithm-LargeSampleSize", 10000);
addAsUnsignedInteger("FunctionalChaosAlgorithm-MaximumTotalDegree", 10);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,13 @@
#include "openturns/HyperbolicAnisotropicEnumerateFunction.hxx"
#include "openturns/OrthogonalUniVariatePolynomialFamily.hxx"
#include "openturns/OrthogonalProductPolynomialFactory.hxx"
#include "openturns/KernelSmoothing.hxx"
#include "openturns/NormalCopulaFactory.hxx"
#include "openturns/UserDefined.hxx"
#include "openturns/DistributionFactory.hxx"
#include "openturns/ComposedDistribution.hxx"
#include "openturns/StandardDistributionPolynomialFactory.hxx"
#include "openturns/LeastSquaresMetaModelSelectionFactory.hxx"
#include "openturns/LARS.hxx"
#include "openturns/KFold.hxx"
#include "openturns/CorrectedLeaveOneOut.hxx"
#include "openturns/FittingTest.hxx"
#include "openturns/DistributionTransformation.hxx"
#include "openturns/HypothesisTest.hxx"

BEGIN_NAMESPACE_OPENTURNS

Expand Down Expand Up @@ -153,55 +147,6 @@ FunctionalChaosAlgorithm::FunctionalChaosAlgorithm(const Sample & inputSample,
if (inputSample.getSize() != outputSample.getSize()) throw InvalidArgumentException(HERE) << "Error: the input sample and the output sample must have the same size.";
}

Distribution FunctionalChaosAlgorithm::BuildDistribution(const Sample & inputSample)
{
  // Recover a joint input distribution from the sample. Performance matters
  // here, so expensive distribution builds are avoided whenever possible.
  const UnsignedInteger dimension = inputSample.getDimension();
  const Description description(inputSample.getDescription());

  // Marginals: for each component, fit every continuous univariate parametric
  // family, keep the Kolmogorov best model when its p-value is large enough,
  // and otherwise fall back to a kernel smoothing estimate.
  KernelSmoothing ks;
  const Collection< DistributionFactory > factories(DistributionFactory::GetContinuousUniVariateFactories());
  // This threshold is somewhat arbitrary; it is here to avoid an expensive kernel smoothing.
  const Scalar threshold = ResourceMap::GetAsScalar("FunctionalChaosAlgorithm-PValueThreshold");
  Collection< Distribution > marginals(dimension);
  for (UnsignedInteger i = 0; i < dimension; ++ i)
  {
    // Duplicate entries (mostly caused by truncation on file export) are
    // removed, as a continuous distribution is expected.
    const Sample marginalSample(inputSample.getMarginal(i).sortUnique());
    Collection< Distribution > candidates(0);
    for (UnsignedInteger k = 0; k < factories.getSize(); ++ k)
    {
      try
      {
        candidates.add(factories[k].build(marginalSample));
      }
      catch (...)
      {
        // Factories incompatible with this marginal sample are simply skipped
      }
    }
    TestResult bestResult;
    const Distribution bestParametric(FittingTest::BestModelKolmogorov(marginalSample, candidates, bestResult));
    marginals[i] = (bestResult.getPValue() >= threshold) ? bestParametric : ks.build(marginalSample);
    marginals[i].setDescription(Description(1, description[i]));
  }

  // Dependence structure: pairwise Spearman independence tests decide between
  // an independent copula and a Normal copula.
  Bool independent = true;
  for (UnsignedInteger j = 0; j < dimension && independent; ++ j)
  {
    const Sample marginalJ = inputSample.getMarginal(j);
    for (UnsignedInteger i = j + 1; i < dimension && independent; ++ i)
      independent = HypothesisTest::Spearman(inputSample.getMarginal(i), marginalJ).getBinaryQualityMeasure();
  }

  ComposedDistribution joint(marginals);
  if (!independent) joint.setCopula(NormalCopulaFactory().build(inputSample));
  return joint;
}


/* Constructor */
FunctionalChaosAlgorithm::FunctionalChaosAlgorithm(const Sample & inputSample,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ public:
/** Method load() reloads the object from the StorageManager */
void load(Advocate & adv) override;

/** Recover the distribution */
static Distribution BuildDistribution(const Sample & inputSample);

private:

/** Marginal computation */
Expand Down
143 changes: 143 additions & 0 deletions lib/src/Uncertainty/Algorithm/MetaModel/MetaModelAlgorithm.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@

#include "openturns/MetaModelAlgorithm.hxx"
#include "openturns/PersistentObjectFactory.hxx"
#include "openturns/KernelSmoothing.hxx"
#include "openturns/HistogramFactory.hxx"
#include "openturns/NormalCopulaFactory.hxx"
#include "openturns/UserDefined.hxx"
#include "openturns/DistributionFactory.hxx"
#include "openturns/ComposedDistribution.hxx"
#include "openturns/FittingTest.hxx"
#include "openturns/HypothesisTest.hxx"

BEGIN_NAMESPACE_OPENTURNS

Expand Down Expand Up @@ -60,6 +68,141 @@ String MetaModelAlgorithm::__repr__() const
return oss;
}

// Helper bundling a fitted candidate distribution with its goodness-of-fit
// scores, so candidates can be ranked with std::sort according to the
// criterion chosen in ResourceMap ("BIC", "AIC", "AICC", or anything else
// meaning "use the Kolmogorov p-value").
struct TestedDistribution
{
  Distribution distribution_;  // the fitted candidate
  Scalar score_;               // Kolmogorov p-value
  Scalar bic_;                 // Bayesian information criterion
  Scalar aic_;                 // Akaike information criterion
  Scalar aicc_;                // corrected Akaike information criterion
  String criterion_;           // which score drives the ordering

  TestedDistribution(const Distribution & distribution,
                     const Scalar score,
                     const Scalar bic,
                     const Scalar aic,
                     const Scalar aicc,
                     const String & criterion)
    : distribution_(distribution)
    , score_(score)
    , bic_(bic)
    , aic_(aic)
    , aicc_(aicc)
    , criterion_(criterion)
  {
    // Nothing to do
  }

  // Strict weak ordering: "better" candidates sort first. Information
  // criteria are minimized, while the p-value score is maximized.
  bool operator< (const TestedDistribution & other) const
  {
    if (criterion_ == "BIC") return bic_ < other.bic_;
    if (criterion_ == "AIC") return aic_ < other.aic_;
    if (criterion_ == "AICC") return aicc_ < other.aicc_;
    return other.score_ < score_;
  }
};

Distribution MetaModelAlgorithm::BuildDistribution(const Sample & inputSample)
{
  // Recover the distribution, taking into account that we look for performance
  // so we avoid to rebuild expensive distributions as much as possible
  const UnsignedInteger inputDimension = inputSample.getDimension();
  Collection< Distribution > marginals(inputDimension);

  // The strategy for the marginals is to find the best continuous 1-d
  // parametric model, else fall back to a non-parametric one chosen via
  // the "MetaModelAlgorithm-NonParametricModel" ResourceMap key.
  DistributionFactory nonParametricModel;
  const String nonParametric = ResourceMap::GetAsString("MetaModelAlgorithm-NonParametricModel");
  if (nonParametric == "Histogram")
    nonParametricModel = DistributionFactory(HistogramFactory());
  else
    nonParametricModel = DistributionFactory(KernelSmoothing());

  Collection< DistributionFactory > factories(DistributionFactory::GetContinuousUniVariateFactories());
  // Filter factories: remove the histogram factory so the parametric search
  // does not trivially win with the (non-parametric) histogram model
  for (UnsignedInteger i = 0; i < factories.getSize(); ++i)
  {
    if (factories[i].getImplementation()->getClassName() == "HistogramFactory")
    {
      factories.erase(factories.begin() + i);
      break;
    }
  }

  // Level for model qualification using the KS test
  const Scalar level = ResourceMap::GetAsScalar("MetaModelAlgorithm-PValueThreshold");
  // Model selection among qualified candidates using the BIC/AIC/AICC criterion
  const String criterion = ResourceMap::GetAsString("MetaModelAlgorithm-ModelSelectionCriterion");

  const Description inputDescription(inputSample.getDescription());
  for (UnsignedInteger i = 0; i < inputDimension; ++i)
  {
    // Here we remove the duplicate entries in the marginal sample as we are supposed to have a continuous distribution.
    // The duplicates are mostly due to truncation in the file export.
    const Sample marginalSample(inputSample.getMarginal(i).sortUnique());
    // First we estimate each candidate distribution using its factory,
    // then keep it only if its Kolmogorov p-value reaches the level
    std::vector<TestedDistribution> possibleDistributions;
    LOGINFO(OSS() << "In MetaModelAlgorithm::BuildDistribution, estimate distribution for marginal " << i);
    for (UnsignedInteger j = 0; j < factories.getSize(); ++j)
    {
      try
      {
        const Distribution candidateDistribution(factories[j].build(marginalSample));
        const Scalar pValue = FittingTest::Kolmogorov(marginalSample, candidateDistribution, level).getPValue();
        if (pValue >= level)
        {
          const Scalar BIC = FittingTest::BIC(marginalSample, candidateDistribution, candidateDistribution.getParameterDimension());
          const Scalar AIC = FittingTest::AIC(marginalSample, candidateDistribution, candidateDistribution.getParameterDimension());
          const Scalar AICC = FittingTest::AICC(marginalSample, candidateDistribution, candidateDistribution.getParameterDimension());
          TestedDistribution test(candidateDistribution, pValue, BIC, AIC, AICC, criterion);
          LOGINFO(OSS() << "Candidate distribution = " << candidateDistribution.getImplementation()->getClassName() << ", pValue=" << pValue << ", BIC=" << BIC << ", AIC=" << AIC << ", AICC=" << AICC);
          possibleDistributions.push_back(test);
        }
        else
          LOGINFO(OSS() << "Tested distribution & not selected = " << candidateDistribution.getImplementation()->getClassName() << ", pValue=" << pValue);
      }
      catch (...)
      {
        // Just skip the factories incompatible with the current marginal sample
        // or distribution that are not valid according to the KS test
      }
    } //for j in factories
    // If no parametric candidate was qualified, fall back to the
    // non-parametric model
    if (possibleDistributions.empty())
      marginals[i] = nonParametricModel.build(marginalSample);
    else
    {
      // Now we return the "best" model according to a fixed criterion
      // (BIC, AIC or AICC, see TestedDistribution::operator<)
      std::sort(possibleDistributions.begin(), possibleDistributions.end());
      marginals[i] = possibleDistributions[0].distribution_;
    }// else
    marginals[i].setDescription(Description(1, inputDescription[i]));
    LOGINFO(OSS() << "Selected distribution = " << marginals[i].getImplementation()->getClassName());
  }// for i

  // For the dependence structure, we use the Spearman independence test to decide between an independent and a Normal copula.
  Bool isIndependent = true;
  for (UnsignedInteger j = 0; j < inputDimension && isIndependent; ++ j)
  {
    const Sample marginalJ = inputSample.getMarginal(j);
    for (UnsignedInteger i = j + 1; i < inputDimension && isIndependent; ++ i)
    {
      TestResult testResult(HypothesisTest::Spearman(inputSample.getMarginal(i), marginalJ));
      isIndependent = isIndependent && testResult.getBinaryQualityMeasure();
    }
  }

  ComposedDistribution distribution(marginals);
  if (!isIndependent)
    distribution.setCopula(NormalCopulaFactory().build(inputSample));
  return distribution;
}


/* Distribution accessor */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ public:
/** Method load() reloads the object from the StorageManager */
void load(Advocate & adv) override;

/** Recover the distribution */
static Distribution BuildDistribution(const Sample & inputSample);

protected:

/** The input vector distribution */
Expand All @@ -83,7 +86,7 @@ protected:


private:

friend struct TestedDistribution;

}; /* class MetaModelAlgorithm */

Expand Down
Loading