Skip to content
This repository has been archived by the owner on Oct 8, 2019. It is now read-only.

Commit

Permalink
Added quantified_features and introduces new package
Browse files Browse the repository at this point in the history
`hivemall.ftvec.trans`.
  • Loading branch information
myui committed Aug 25, 2015
1 parent 645534e commit 22d8897
Show file tree
Hide file tree
Showing 11 changed files with 150 additions and 22 deletions.
25 changes: 16 additions & 9 deletions scripts/ddl/define-all-as-permanent.hive
Original file line number Diff line number Diff line change
Expand Up @@ -252,18 +252,9 @@ CREATE FUNCTION extract_weight as 'hivemall.ftvec.ExtractWeightUDF' USING JAR '$
DROP FUNCTION IF EXISTS add_feature_index;
CREATE FUNCTION add_feature_index as 'hivemall.ftvec.AddFeatureIndexUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS vectorize_features;
CREATE FUNCTION vectorize_features as 'hivemall.ftvec.VectorizeFeaturesUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS feature;
CREATE FUNCTION feature as 'hivemall.ftvec.FeatureUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS categorical_features;
CREATE FUNCTION categorical_features as 'hivemall.ftvec.CategoricalFeaturesUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS indexed_features;
CREATE FUNCTION indexed_features as 'hivemall.ftvec.IndexedFeatures' USING JAR '${hivemall_jar}';

----------------------------------
-- feature converting functions --
----------------------------------
Expand All @@ -288,6 +279,22 @@ CREATE FUNCTION to_sparse as 'hivemall.ftvec.conv.ToSparseFeaturesUDF' USING JAR
DROP FUNCTION IF EXISTS quantify;
CREATE FUNCTION quantify as 'hivemall.ftvec.conv.QuantifyColumnsUDTF' USING JAR '${hivemall_jar}';

--------------------------
-- feature transformers --
--------------------------
-- Functions implemented by classes in the hivemall.ftvec.trans package.
-- Each function is dropped (if it exists) and then re-created from ${hivemall_jar}.

DROP FUNCTION IF EXISTS vectorize_features;
CREATE FUNCTION vectorize_features as 'hivemall.ftvec.trans.VectorizeFeaturesUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS categorical_features;
CREATE FUNCTION categorical_features as 'hivemall.ftvec.trans.CategoricalFeaturesUDF' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS indexed_features;
CREATE FUNCTION indexed_features as 'hivemall.ftvec.trans.IndexedFeatures' USING JAR '${hivemall_jar}';

DROP FUNCTION IF EXISTS quantified_features;
CREATE FUNCTION quantified_features as 'hivemall.ftvec.trans.QuantifiedFeaturesUDTF' USING JAR '${hivemall_jar}';

--------------------------
-- ftvec/text functions --
--------------------------
Expand Down
25 changes: 16 additions & 9 deletions scripts/ddl/define-all.hive
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,9 @@ create temporary function extract_weight as 'hivemall.ftvec.ExtractWeightUDF';
drop temporary function add_feature_index;
create temporary function add_feature_index as 'hivemall.ftvec.AddFeatureIndexUDF';

drop temporary function vectorize_features;
create temporary function vectorize_features as 'hivemall.ftvec.VectorizeFeaturesUDF';

drop temporary function feature;
create temporary function feature as 'hivemall.ftvec.FeatureUDF';

drop temporary function categorical_features;
create temporary function categorical_features as 'hivemall.ftvec.CategoricalFeaturesUDF';

drop temporary function indexed_features;
create temporary function indexed_features as 'hivemall.ftvec.IndexedFeatures';

----------------------------------
-- feature converting functions --
----------------------------------
Expand All @@ -284,6 +275,22 @@ create temporary function to_sparse as 'hivemall.ftvec.conv.ToSparseFeaturesUDF'
drop temporary function quantify;
create temporary function quantify as 'hivemall.ftvec.conv.QuantifyColumnsUDTF';

--------------------------
-- feature transformers --
--------------------------
-- Temporary functions implemented by classes in the hivemall.ftvec.trans package.
-- Each function is dropped and then re-created under the same name.

drop temporary function vectorize_features;
create temporary function vectorize_features as 'hivemall.ftvec.trans.VectorizeFeaturesUDF';

drop temporary function categorical_features;
create temporary function categorical_features as 'hivemall.ftvec.trans.CategoricalFeaturesUDF';

drop temporary function indexed_features;
create temporary function indexed_features as 'hivemall.ftvec.trans.IndexedFeatures';

drop temporary function quantified_features;
create temporary function quantified_features as 'hivemall.ftvec.trans.QuantifiedFeaturesUDTF';

--------------------------
-- ftvec/text functions --
--------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hivemall.ftvec;
package hivemall.ftvec.trans;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.lang.StringUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hivemall.ftvec;
package hivemall.ftvec.trans;

import hivemall.utils.lang.StringUtils;

Expand Down
111 changes: 111 additions & 0 deletions src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Hivemall: Hive scalable Machine Learning Library
*
* Copyright (C) 2015 Makoto YUI
* Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hivemall.ftvec.trans;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.lang.Identifier;

import java.util.ArrayList;
import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;

/**
 * UDTF that emits, for every input row, a single {@code features} column holding a dense
 * {@code array<double>} with one element per argument.
 *
 * <p>Arguments accepted by {@code HiveUtils.isNumberOI} are read as doubles. Every other
 * argument is treated as categorical: its {@code toString()} form is passed to a per-column
 * {@link Identifier} and the returned int is stored in the output array.
 *
 * <p>A NULL in any argument is rejected with a {@link HiveException}.
 *
 * <p>The output list and its {@link DoubleWritable} elements are reused across rows; only the
 * contained values are overwritten before each {@code forward} call.
 */
@Description(name = "quantified_features", value = "_FUNC_(col1, col2, ...) - Returns identified features in a dense array<double>")
public final class QuantifiedFeaturesUDTF extends GenericUDTF {

    /** Per-column inspector for numeric arguments; {@code null} for categorical columns. */
    private PrimitiveObjectInspector[] doubleOIs;
    /** Per-column identifier for categorical arguments; {@code null} for numeric columns. */
    private Identifier<String>[] identifiers;
    /** Reusable output cells, one per argument. */
    private DoubleWritable[] columnValues;

    // Kept transient and built lazily in process(). The original author recorded the
    // following error next to this field, presumably hit when it was serialized:
    // org.apache.hive.com.esotericsoftware.kryo.KryoException: java.lang.NullPointerException
    private transient Object[] forwardObjs;

    /**
     * Classifies each argument as numeric or categorical and declares the output schema.
     *
     * @param argOIs inspectors of the call arguments, one per column
     * @return a struct inspector with a single {@code features} field of type
     *         {@code array<double>}
     * @throws UDFArgumentException declared by the overridden method
     */
    @SuppressWarnings("unchecked")
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        final int size = argOIs.length;
        this.doubleOIs = new PrimitiveObjectInspector[size];
        this.columnValues = new DoubleWritable[size];
        this.identifiers = new Identifier[size];
        this.forwardObjs = null;

        for (int i = 0; i < size; i++) {
            columnValues[i] = new DoubleWritable(Double.NaN);
            if (HiveUtils.isNumberOI(argOIs[i])) {
                doubleOIs[i] = HiveUtils.asDoubleCompatibleOI(argOIs[i]);
            } else {
                identifiers[i] = new Identifier<String>();
            }
        }

        ArrayList<String> fieldNames = new ArrayList<String>(1);
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(1);
        fieldNames.add("features");
        fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector));

        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Converts one input row into the reusable output cells and forwards it.
     *
     * @param args the column values of the current row
     * @throws HiveException if any column value is NULL
     */
    @Override
    public void process(Object[] args) throws HiveException {
        if (forwardObjs == null) {
            // Arrays.asList is a view over columnValues, so updating the cells below
            // updates the forwarded list without reallocating it.
            this.forwardObjs = new Object[] { Arrays.asList(columnValues) };
        }

        final DoubleWritable[] values = this.columnValues;
        for (int i = 0; i < args.length; i++) {
            final Object arg = args[i];
            // Reject NULL for numeric and categorical columns alike; previously only
            // categorical columns were checked and a numeric NULL reached getDouble().
            if (arg == null) {
                throw new HiveException("Found Null in the input: " + Arrays.toString(args));
            }

            final Identifier<String> identifier = identifiers[i];
            if (identifier == null) {
                double v = PrimitiveObjectInspectorUtils.getDouble(arg, doubleOIs[i]);
                values[i].set(v);
            } else {
                int id = identifier.valueOf(arg.toString());
                values[i].set(id);
            }
        }
        forward(forwardObjs);
    }

    /** Releases the per-query state built by {@link #initialize(ObjectInspector[])}. */
    @Override
    public void close() throws HiveException {
        this.doubleOIs = null;
        this.identifiers = null;
        this.columnValues = null;
        this.forwardObjs = null;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hivemall.ftvec;
package hivemall.ftvec.trans;

import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.lang.StringUtils;
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/hivemall/utils/hadoop/HiveUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ public static boolean isNumberOI(@Nonnull final ObjectInspector argOI)
case LONG:
case FLOAT:
case DOUBLE:
case BYTE:
case TIMESTAMP:
return true;
default:
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hivemall.ftvec;
package hivemall.ftvec.trans;

import hivemall.ftvec.trans.VectorizeFeaturesUDF;
import hivemall.utils.hadoop.WritableUtils;

import java.io.IOException;
Expand Down
Binary file modified target/hivemall-fat.jar
Binary file not shown.
Binary file modified target/hivemall-with-dependencies.jar
Binary file not shown.
Binary file modified target/hivemall.jar
Binary file not shown.

0 comments on commit 22d8897

Please sign in to comment.