Skip to content

Commit

Permalink
Add Python API for hex and unhex
Browse files Browse the repository at this point in the history
  • Loading branch information
Davies Liu committed Jul 2, 2015
1 parent 377ff4c commit 1a24082
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 73 deletions.
28 changes: 28 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,34 @@ def randn(seed=None):
return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def hex(col):
    """Return a new Column holding the hexadecimal representation of ``col``.

    The input column may be StringType, BinaryType, IntegerType or LongType.

    >>> sqlContext.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()
    [Row(hex(a)=u'414243', hex(b)=u'3')]
    """
    # Delegate to the JVM-side ``functions.hex`` through the active context's gateway.
    jvm_functions = SparkContext._active_spark_context._jvm.functions
    return Column(jvm_functions.hex(_to_java_column(col)))


@ignore_unicode_prefix
@since(1.5)
def unhex(col):
    """Inverse of hex. Interprets each pair of characters as a hexadecimal number
    and converts to the byte representation of number.

    The JVM expression behind this function declares a binary result type, so
    the collected value is expected to be a ``bytearray`` rather than a string.

    >>> sqlContext.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect()
    [Row(unhex(a)=bytearray(b'ABC'))]
    """
    sc = SparkContext._active_spark_context
    # Delegate to the JVM-side ``functions.unhex`` and wrap the resulting Java column.
    jc = sc._jvm.functions.unhex(_to_java_column(col))
    return Column(jc)


@ignore_unicode_prefix
@since(1.5)
def sha1(col):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ object FunctionRegistry {
expression[Substring]("substr"),
expression[Substring]("substring"),
expression[Upper]("ucase"),
expression[UnHex]("unhex"),
expression[Unhex]("unhex"),
expression[Upper]("upper")
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,30 +259,22 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable {
case LongType => hex(num.asInstanceOf[Long])
case IntegerType => hex(num.asInstanceOf[Integer].toLong)
case BinaryType => hex(num.asInstanceOf[Array[Byte]])
case StringType => hex(num.asInstanceOf[UTF8String])
case StringType => hex(num.asInstanceOf[UTF8String].getBytes)
}
}
}

/**
* Converts every character in s to two hex digits.
*/
private def hex(str: UTF8String): UTF8String = {
hex(str.getBytes)
}
private[this] val hexDigits = Array[Char](
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
).map(_.toByte)

private def hex(bytes: Array[Byte]): UTF8String = {
doHex(bytes, bytes.length)
}

private def doHex(bytes: Array[Byte], length: Int): UTF8String = {
private[this] def hex(bytes: Array[Byte]): UTF8String = {
val length = bytes.length
val value = new Array[Byte](length * 2)
var i = 0
while (i < length) {
value(i * 2) = Character.toUpperCase(Character.forDigit(
(bytes(i) & 0xF0) >>> 4, 16)).toByte
value(i * 2 + 1) = Character.toUpperCase(Character.forDigit(
bytes(i) & 0x0F, 16)).toByte
value(i * 2) = hexDigits((bytes(i) & 0xF0) >> 4)
value(i * 2 + 1) = hexDigits((bytes(i) & 0x0F))
i += 1
}
UTF8String.fromBytes(value)
Expand All @@ -303,6 +295,66 @@ case class Hex(child: Expression) extends UnaryExpression with Serializable {
}
}

/**
 * Performs the inverse operation of HEX.
 * Resulting characters are returned as a byte array.
 *
 * Each pair of hexadecimal characters is decoded into one byte. An input with an odd number
 * of characters is treated as if it were left-padded with a single '0', so "F" decodes to
 * the single byte 15. The result is null when the input is null or contains a character
 * that is not a hexadecimal digit.
 */
case class Unhex(child: Expression)
  extends UnaryExpression with AutoCastInputTypes with Serializable {

  override def nullable: Boolean = true
  override def dataType: DataType = BinaryType
  override def inputTypes: Seq[DataType] = Seq(BinaryType)

  override def eval(input: InternalRow): Any = {
    child.eval(input) match {
      case null => null
      // A string child that is evaluated directly yields a UTF8String.
      case str: UTF8String => unhex(str.getBytes)
      // inputTypes declares BinaryType, so the child may also yield raw bytes.
      // NOTE(review): presumably this happens once an implicit cast to BinaryType has been
      // inserted in front of the child -- confirm against AutoCastInputTypes.
      case raw: Array[Byte] => unhex(raw)
    }
  }

  // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15; every other slot holds -1
  private[this] val unhexDigits = {
    val array = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => array('0' + i) = i.toByte)
    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
    array
  }

  /**
   * Decodes ASCII hexadecimal digits into bytes.
   *
   * @param bytes the encoded hexadecimal characters
   * @return the decoded bytes, or null if any input byte is not a hexadecimal digit
   */
  private[this] def unhex(bytes: Array[Byte]): Array[Byte] = {
    val out = new Array[Byte]((bytes.length + 1) >> 1)
    var i = 0
    // Number of output slots already used by the lone leading digit of an odd-length input.
    // Without this offset the first pair would be written to out(1 / 2) == out(0) and
    // overwrite the leading digit, leaving the last output byte unset.
    var oddShift = 0
    if ((bytes.length & 0x01) != 0) {
      // padding with '0': the first character alone forms the first output byte
      if (bytes(0) < 0) {
        return null
      }
      val v = unhexDigits(bytes(0))
      if (v == -1) {
        return null
      }
      out(0) = v
      i += 1
      oddShift = 1
    }
    // two characters form the hex value.
    while (i < bytes.length) {
      // Negative bytes are outside the 128-entry lookup table, so reject them up front.
      if (bytes(i) < 0 || bytes(i + 1) < 0) {
        return null
      }
      val first = unhexDigits(bytes(i))
      val second = unhexDigits(bytes(i + 1))
      if (first == -1 || second == -1) {
        return null
      }
      out(i / 2 + oddShift) = (((first << 4) | second) & 0xFF).toByte
      i += 2
    }
    out
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -351,58 +403,6 @@ case class Pow(left: Expression, right: Expression)
}
}

/**
 * Performs the inverse operation of HEX.
 * Resulting characters are returned as a byte array.
 *
 * Each pair of hexadecimal characters is decoded into one byte; an odd-length input is
 * left-padded with '0' first. The result is null when the input is null or contains a
 * character that is not a hexadecimal digit.
 */
case class UnHex(child: Expression) extends UnaryExpression with Serializable {

  override def dataType: DataType = BinaryType

  override def checkInputDataTypes(): TypeCheckResult = {
    if (child.dataType.isInstanceOf[StringType] || child.dataType == NullType) {
      TypeCheckResult.TypeCheckSuccess
    } else {
      TypeCheckResult.TypeCheckFailure(s"unHex accepts String type, not ${child.dataType}")
    }
  }

  override def eval(input: InternalRow): Any = {
    val num = child.eval(input)
    if (num == null) {
      null
    } else {
      unhex(num.asInstanceOf[UTF8String].getBytes)
    }
  }

  // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15; every other slot holds -1
  private val unhexDigits = {
    val array = Array.fill[Byte](128)(-1)
    (0 to 9).foreach(i => array('0' + i) = i.toByte)
    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
    array
  }

  /**
   * Decodes ASCII hexadecimal digits into bytes.
   *
   * @param inputBytes the encoded hexadecimal characters
   * @return the decoded bytes, or null if any input byte is not a hexadecimal digit
   */
  private def unhex(inputBytes: Array[Byte]): Array[Byte] = {
    // An odd number of digits is decoded as if a leading '0' were present.
    val bytes =
      if ((inputBytes.length & 0x01) != 0) '0'.toByte +: inputBytes else inputBytes
    val out = new Array[Byte](bytes.length >> 1)
    // two characters form the hex value.
    var i = 0
    while (i < bytes.length) {
      // Negative bytes would index outside the 128-entry lookup table and throw
      // ArrayIndexOutOfBoundsException; treat them as invalid input instead.
      if (bytes(i) < 0 || bytes(i + 1) < 0) {
        return null
      }
      val first = unhexDigits(bytes(i))
      val second = unhexDigits(bytes(i + 1))
      if (first == -1 || second == -1) {
        return null
      }
      out(i / 2) = (((first << 4) | second) & 0xFF).toByte
      i += 2
    }
    out
  }
}

case class Hypot(left: Expression, right: Expression)
extends BinaryMathExpression(math.hypot, "HYPOT")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,10 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
}

test("unhex") {
checkEvaluation(UnHex(Literal("737472696E67")), "string".getBytes)
checkEvaluation(UnHex(Literal("")), new Array[Byte](0))
checkEvaluation(UnHex(Literal("0")), Array[Byte](0))
checkEvaluation(Unhex(Literal("737472696E67")), "string".getBytes)
checkEvaluation(Unhex(Literal("")), new Array[Byte](0))
checkEvaluation(Unhex(Literal("F")), Array[Byte](15))
checkEvaluation(Unhex(Literal("ff")), Array[Byte](-1))
}

test("hypot") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1060,7 +1060,7 @@ object functions {
* @group math_funcs
* @since 1.5.0
*/
def unhex(column: Column): Column = UnHex(column.expr)
def unhex(column: Column): Column = Unhex(column.expr)

/**
* Inverse of hex. Interprets each pair of characters as a hexadecimal number
Expand Down

0 comments on commit 1a24082

Please sign in to comment.