Skip to content

Commit

Permalink
A number of bug fixes/improvements. 1. Moved to CE weka-stable 3.8.5-…
Browse files Browse the repository at this point in the history
…snapshot. 2. Fixed a bug in getFields() that could affect the ordering of class label-specific evaluation metrics. 3. Fixed a bug in PMI scoring that would cause an NPE when there were no incoming rows to score. 4. Fixed a bug, inadvertently introduced in the 1.5 release, that prevented scheme configuration sub-dialogs from displaying correctly. 5. Fixed a bug that prevented the clearing of options for a given scheme when moving from the implementation in one engine (with options) to an implementation in another engine (with no user options); this mainly affected switching from non-Weka naive Bayes multinomial to Weka's naive Bayes multinomial. 6. Fixed a bug that resulted in the Keras engine being incorrectly offered as an option for support vector classifier, logistic regression and linear regression.
  • Loading branch information
Mark Hall committed Apr 24, 2020
1 parent e9ba3dc commit 613c1f9
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 85 deletions.
Binary file added lib/weka-stable-3.8.5.jar
Binary file not shown.
15 changes: 11 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
<maven.compiler.target>1.8</maven.compiler.target>
<maven.compiler.source>1.8</maven.compiler.source>
<kettle.version>8.2.0.0-SNAPSHOT</kettle.version>
<weka.version>3.8.3.1</weka.version>
<!-- <weka.version>3.8.3.1</weka.version> -->
<weka.version>3.8.5</weka.version>
<weka.timeseries.version>1.0.25</weka.timeseries.version>
<weka.kfkettle.version>1.0.5</weka.kfkettle.version>
<xpp-min.version>1.1.3.4.O</xpp-min.version>
Expand Down Expand Up @@ -74,15 +75,21 @@
<version>${weka.version}</version>
</dependency> -->

<!-- <dependency>
<groupId>nz.ac.waikato.cms.weka</groupId>
<artifactId>weka-stable</artifactId>
<version>${weka.version}</version>
</dependency> -->

<!-- Bit of a hack here: this dependency is resolved from a local copy of
the jar (system scope, see systemPath below). Unfortunately, changes to
3.8.1.1 in svn are not being picked up in Pentaho builds of Weka/PDM
3.8.1.1 for some reason -->
<dependency>
<groupId>pdm-ce</groupId>
<artifactId>pdm-ce</artifactId>
<groupId>nz.ac.waikato.cms.weka</groupId>
<artifactId>weka-stable</artifactId>
<version>${weka.version}</version>
<scope>system</scope>
<systemPath>${basedir}/lib/pdm-ce-${weka.version}.jar</systemPath>
<systemPath>${basedir}/lib/weka-stable-${weka.version}.jar</systemPath>
</dependency>

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1237,9 +1237,10 @@ protected static void establishOutputRowMeta( RowMetaInterface outRowMeta, Varia
if ( stepMeta.getOutputIRMetrics() ) {
String classLabels = classArffMeta.getNominalVals();
if ( !Const.isEmpty( classLabels ) ) {
TreeSet<String> ts = new TreeSet<>( ArffMeta.stringToVals( classLabels ) );
// TreeSet<String> ts = new TreeSet<>( ArffMeta.stringToVals( classLabels ) );
ArrayList<String> preOrdered = new ArrayList<>( ArffMeta.stringToVals( classLabels ) );
//String[] labels = classLabels.split( "," );
for ( String label : ts ) {
for ( String label : preOrdered ) {
label = label.trim();
vm =
ValueMetaFactory
Expand Down Expand Up @@ -1283,9 +1284,10 @@ protected static void establishOutputRowMeta( RowMetaInterface outRowMeta, Varia
if ( stepMeta.getOutputAUCMetrics() ) {
String classLabels = classArffMeta.getNominalVals();
if ( !Const.isEmpty( classLabels ) ) {
TreeSet<String> ts = new TreeSet<>( ArffMeta.stringToVals( classLabels ) );
//TreeSet<String> ts = new TreeSet<>( ArffMeta.stringToVals( classLabels ) );
// String[] labels = classLabels.split( "," );
for ( String label : ts ) {
ArrayList<String> preOrdered = new ArrayList<>( ArffMeta.stringToVals( classLabels ) );
for ( String label : preOrdered ) {
label = label.trim();

vm =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ public class PMILifecycleListener implements KettleLifecycleListener {

// TODO replace this by some code that somehow locates the pdm jar file in plugins/steps/pmi/lib
// This allows the Spark engine to locate the main weka.jar file for use in the Spark execution environment
System.setProperty( "weka.jar.filename", "pdm-ce-3.8.3.1.jar" );
//System.setProperty( "weka.jar.filename", "pdm-ce-3.8.3.1.jar" );
System.setProperty( "weka.jar.filename", "weka-stable-3.8.5.jar" );

// check that the required packages are installed (and possibly install if not)
try {
Expand Down
153 changes: 79 additions & 74 deletions src/main/java/org/pentaho/di/trans/steps/pmi/PMIScoring.java
Original file line number Diff line number Diff line change
Expand Up @@ -167,84 +167,15 @@ private PMIScoringModel setModel( String modelFileName ) throws KettleException

Object[] r = getRow();

if ( r == null ) {
if ( !m_meta.getEvaluateRatherThanScore() && m_data.getModel().isBatchPredictor() && !m_meta
.getFileNameFromField() && m_batch.size() > 0 ) {
try {
outputBatchRows( true );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemWhileGettingPredictionsForBatch" ),
ex ); //$NON-NLS-1$
}
}

if ( m_meta.getEvaluateRatherThanScore() && m_data.getModel().isSupervisedLearningModel() ) {
// generate the output row
try {
if ( m_data.getModel().isBatchPredictor() ) {
outputBatchRows( true );
} else {
Object[] outputRow = m_data.evaluateForRow( getInputRowMeta(), m_data.getOutputRowMeta(), null, m_meta );
putRow( m_data.getOutputRowMeta(), outputRow );
}
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemWhileGettingPredictionsForBatch" ),
ex ); //$NON-NLS-1$
}
}

// see if we have an incremental model that is to be saved somewhere.
if ( !m_meta.getFileNameFromField() && m_meta.getUpdateIncrementalModel() ) {
if ( !Const.isEmpty( m_meta.getSavedModelFileName() ) ) {
// try and save that sucker...
try {
String modName = environmentSubstitute( m_meta.getSavedModelFileName() );
File updatedModelFile = null;
if ( modName.startsWith( "file:" ) ) {
try {
modName = modName.replace( " ", "%20" );
updatedModelFile = new File( new java.net.URI( modName ) );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.MalformedURIForUpdatedModelFile" ),
ex );
}
} else {
updatedModelFile = new File( modName );
}
PMIScoringData.saveSerializedModel( m_data.getModel(), updatedModelFile );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemSavingUpdatedModelToFile" ),
ex ); //$NON-NLS-1$
}
}
}

if ( m_meta.getFileNameFromField() ) {
// clear the main model
m_data.getModel().done();
m_data.setModel( null );
m_data.setDefaultModel( null );
if ( m_modelCache != null ) {
m_modelCache.clear();
}
} else {
m_data.getModel().done();
m_data.setModel( null );
m_data.setDefaultModel( null );
}

setOutputDone();
return false;
}

// Handle the first row
if ( first ) {
first = false;

if (r == null) {
setOutputDone();
return false;
}

m_data.setOutputRowMeta( getInputRowMeta().clone() );
if ( m_meta.getFileNameFromField() ) {
RowMetaInterface inputRowMeta = getInputRowMeta();
Expand Down Expand Up @@ -378,6 +309,80 @@ private PMIScoringModel setModel( String modelFileName ) throws KettleException
}
} // end (if first)

if ( r == null ) {
if ( !m_meta.getEvaluateRatherThanScore() && m_data.getModel().isBatchPredictor() && !m_meta
.getFileNameFromField() && m_batch.size() > 0 ) {
try {
outputBatchRows( true );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemWhileGettingPredictionsForBatch" ),
ex ); //$NON-NLS-1$
}
}

if ( m_meta.getEvaluateRatherThanScore() && m_data.getModel().isSupervisedLearningModel() ) {
// generate the output row
try {
if ( m_data.getModel().isBatchPredictor() ) {
outputBatchRows( true );
} else {
Object[] outputRow = m_data.evaluateForRow( getInputRowMeta(), m_data.getOutputRowMeta(), null, m_meta );
putRow( m_data.getOutputRowMeta(), outputRow );
}
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemWhileGettingPredictionsForBatch" ),
ex ); //$NON-NLS-1$
}
}

// see if we have an incremental model that is to be saved somewhere.
if ( !m_meta.getFileNameFromField() && m_meta.getUpdateIncrementalModel() ) {
if ( !Const.isEmpty( m_meta.getSavedModelFileName() ) ) {
// try and save that sucker...
try {
String modName = environmentSubstitute( m_meta.getSavedModelFileName() );
File updatedModelFile = null;
if ( modName.startsWith( "file:" ) ) {
try {
modName = modName.replace( " ", "%20" );
updatedModelFile = new File( new java.net.URI( modName ) );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.MalformedURIForUpdatedModelFile" ),
ex );
}
} else {
updatedModelFile = new File( modName );
}
PMIScoringData.saveSerializedModel( m_data.getModel(), updatedModelFile );
} catch ( Exception ex ) {
throw new KettleException(
BaseMessages.getString( PMIScoringMeta.PKG, "PMIScoring.Error.ProblemSavingUpdatedModelToFile" ),
ex ); //$NON-NLS-1$
}
}
}

if ( m_meta.getFileNameFromField() ) {
// clear the main model
m_data.getModel().done();
m_data.setModel( null );
m_data.setDefaultModel( null );
if ( m_modelCache != null ) {
m_modelCache.clear();
}
} else {
m_data.getModel().done();
m_data.setModel( null );
m_data.setDefaultModel( null );
}

setOutputDone();
return false;
}

// Make prediction for row using model
try {
if ( m_meta.getFileNameFromField() ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,8 @@ protected void setData( BaseSupervisedPMIStepMeta meta ) {
String[] schemeOpts = m_scheme.getSchemeOptions();
if ( schemeOpts != null && schemeOpts.length > 0 ) {
meta.setSchemeCommandLineOptions( Utils.joinOptions( schemeOpts ) );
} else {
meta.setSchemeCommandLineOptions( "" );
}

if ( m_incrementalRowCacheField != null ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ protected static void setValuesOnObject( Object objectToEdit, Map<String, Map<St
final Object value = propDetails.get( "value" );
String category = (String) propDetails.get( "category" );

if ( m_propertyGroupingCategory != null ) {
if ( m_propertyGroupingCategory.length() > 0 ) {
if ( category == null || category.length() == 0 || !category.equalsIgnoreCase( m_propertyGroupingCategory ) ) {
continue;
}
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/pentaho/pmi/engines/KerasScheme.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ public abstract class KerasScheme {
s_excludedSchemes =
Arrays.asList( "Naive Bayes", "Naive Bayes incremental", "Naive Bayes multinomial", "Decision tree classifier",
"Decision tree regressor", "Random forest classifier", "Random forest regressor", "Gradient boosted trees",
"Support vector regressor", "Multi-layer perceptron classifier", "Multi-layer perceptron regressor",
"Support vector regressor", "Support vector classifier", "Logistic regression", "Linear regression",
"Multi-layer perceptron classifier", "Multi-layer perceptron regressor",
"Extreme gradient boosting classifier", "Extreme gradient boosting regressor",
"Multi-layer perceptron classifier", "Multi-layer perceptron regressor" );

Expand Down

0 comments on commit 613c1f9

Please sign in to comment.