[SPARK-8235] sha / sha1
tarekbecker committed Jun 23, 2015
1 parent 7b1450b commit e5dad4e
Showing 6 changed files with 124 additions and 1 deletion.
21 changes: 21 additions & 0 deletions python/pyspark/sql/functions.py
@@ -42,6 +42,7 @@
    'monotonicallyIncreasingId',
    'rand',
    'randn',
    'sha1',
    'sparkPartitionId',
    'struct',
    'udf',
@@ -363,6 +364,26 @@ def randn(seed=None):
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def sha1(col):
    """Returns the hex string result of SHA-1.
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.sha1(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def sha(col):
    """Returns the hex string result of SHA-1. Alias for sha1.
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.sha(_to_java_column(col))
    return Column(jc)


@since(1.4)
def sparkPartitionId():
"""A column for partition ID of the Spark task.
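
For reference, a minimal usage sketch of the two PySpark wrappers above, assuming a live SparkContext `sc` and a Spark 1.5-era SQLContext; the "ABC" fixture and the expected digest come from the test suites further down.

# Usage sketch, assuming an existing SparkContext `sc`.
from pyspark.sql import SQLContext
from pyspark.sql.functions import sha, sha1

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([("ABC",)], ["a"])

# Both names resolve to the same Sha1 expression on the JVM side, so they agree:
# SHA-1("ABC") -> "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8" (see the tests below).
df.select(sha1(df.a).alias("h1"), sha(df.a).alias("h2")).collect()
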
@@ -135,6 +135,8 @@ object FunctionRegistry {

    // misc functions
    expression[Md5]("md5"),
    expression[Sha1]("sha1"),
    expression[Sha1]("sha"),

    // aggregate functions
    expression[Average]("avg"),
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.types.{BinaryType, StringType, DataType}
import org.apache.spark.sql.types.{LongType, BinaryType, StringType, DataType}
import org.apache.spark.unsafe.types.UTF8String

/**
@@ -48,3 +48,38 @@ case class Md5(child: Expression)
s"(org.apache.commons.codec.digest.DigestUtils.md5Hex($c))")
}
}

/**
 * A function that calculates a SHA-1 hash value and returns it as a hex string.
 * For input of type [[BinaryType]] or [[StringType]].
 */
case class Sha1(child: Expression) extends UnaryExpression {

  override def dataType: DataType = StringType

  override def eval(input: InternalRow): Any = {
    val value = child.eval(input)
    if (value == null) {
      null
    } else {
      value match {
        case b: Array[Byte] =>
          UTF8String.fromString(DigestUtils.sha1Hex(b))
        case s: UTF8String =>
          UTF8String.fromString(DigestUtils.sha1Hex(s.getBytes))
      }
    }
  }

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    defineCodeGen(ctx, ev, c =>
      child.dataType match {
        case BinaryType =>
          "org.apache.spark.unsafe.types.UTF8String.fromString" +
            s"(org.apache.commons.codec.digest.DigestUtils.sha1Hex($c))"
        case StringType =>
          "org.apache.spark.unsafe.types.UTF8String.fromString" +
            s"(org.apache.commons.codec.digest.DigestUtils.sha1Hex($c.getBytes()))"
      })
  }
}
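
As a quick cross-check of the expression above: both match arms hex-encode a SHA-1 digest, of the raw bytes for BinaryType input and of the string's UTF-8 bytes for StringType input, so the expected values in the suites below line up with any standard SHA-1 implementation. A sketch using Python's hashlib, assuming only the standard library:

# Reference check with hashlib; the digests match the Spark test vectors below.
import hashlib

assert hashlib.sha1(b"ABC").hexdigest() == "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"
assert hashlib.sha1("ABC".encode("utf-8")).hexdigest() == "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"
assert hashlib.sha1(b"").hexdigest() == "da39a3ee5e6b4b0d3255bfef95601890afd80709"
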
@@ -29,4 +29,13 @@ class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
    checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
  }

  test("sha1") {
    checkEvaluation(Sha1(Literal("ABC")), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
    checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
    checkEvaluation(Sha1(Literal.create(null, StringType)), null)
    checkEvaluation(Sha1(Literal.create(null, BinaryType)), null)
    checkEvaluation(Sha1(Literal("")), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
    checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
  }

}
32 changes: 32 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1414,6 +1414,38 @@ object functions {
   */
  def md5(columnName: String): Column = md5(Column(columnName))

  /**
   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
   *
   * @group misc_funcs
   * @since 1.5.0
   */
  def sha1(e: Column): Column = Sha1(e.expr)

  /**
   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
   *
   * @group misc_funcs
   * @since 1.5.0
   */
  def sha1(columnName: String): Column = sha1(Column(columnName))

  /**
   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
   *
   * @group misc_funcs
   * @since 1.5.0
   */
  def sha(e: Column): Column = Sha1(e.expr)

  /**
   * Calculates the SHA-1 digest and returns the value as a 40 character hex string.
   *
   * @group misc_funcs
   * @since 1.5.0
   */
  def sha(columnName: String): Column = sha1(Column(columnName))

  //////////////////////////////////////////////////////////////////////////////////////////////
  // String functions
  //////////////////////////////////////////////////////////////////////////////////////////////
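
Because FunctionRegistry (above) registers the same Sha1 expression under both "sha1" and "sha", the function is also reachable from expression strings, which is what the selectExpr calls in the suite below exercise. A small sketch, again assuming the Spark 1.5-era PySpark setup from the earlier example:

# Expression-string usage sketch, assuming an existing SparkContext `sc`.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([("ABC",)], ["a"])
df.selectExpr("sha1(a)", "sha(a)").collect()
# Both expression-string forms return "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8".
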
@@ -144,6 +144,30 @@ class DataFrameFunctionsSuite extends QueryTest {
Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
}

test("misc sha1 function") {
val df = Seq(("ABC", "ABC".getBytes)).toDF("a", "b")
checkAnswer(
df.select(sha1($"a"), sha1("b")),
Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"))

val dfEmpty = Seq(("", "".getBytes)).toDF("a", "b")
checkAnswer(
dfEmpty.selectExpr("sha1(a)", "sha1(b)"),
Row("da39a3ee5e6b4b0d3255bfef95601890afd80709", "da39a3ee5e6b4b0d3255bfef95601890afd80709"))
}

test("misc sha function") {
val df = Seq(("ABC", "ABC".getBytes)).toDF("a", "b")
checkAnswer(
df.select(sha($"a"), sha("b")),
Row("3c01bdbb26f358bab27f267924aa2c9a03fcfdb8", "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8"))

val dfEmpty = Seq(("", "".getBytes)).toDF("a", "b")
checkAnswer(
dfEmpty.selectExpr("sha(a)", "sha(b)"),
Row("da39a3ee5e6b4b0d3255bfef95601890afd80709", "da39a3ee5e6b4b0d3255bfef95601890afd80709"))
}

test("string length function") {
checkAnswer(
nullStrings.select(strlen($"s"), strlen("s")),