# Calling Java Functions in PySpark

### Download and transfer java jar file from repository to spark

Note: It is recommended to place the Java UDF jar under the Spark `jars` folder. Otherwise, the Jupyter kernel must be restarted before calling the Java functions, because dynamic loading requires the jar to already be on the Spark classpath.

In [1]:
# Download the jar directly from the repository link and place it under the user-libs folder
# !wget https://my-repository-link/PySparkJavaUDFDemo-jar-with-dependencies.jar

### Create spark session

In [2]:

from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
import pyspark.sql.types as T

# All jar-related settings below point at the same Java UDF jar.
udf_jar = "./shared/PySparkJavaUDFDemo-jar-with-dependencies.jar"

# Build (or reuse) the session with the jar configured for both the driver
# and the executors.
spark = (
    SparkSession.builder
    .appName('PySpark-Java-UDF-Test')
    .config('spark.executor.memory', '4gb')
    .config("spark.cores.max", "1")
    .config("spark.jars", udf_jar)
    .config("spark.driver.extraClassPath", udf_jar)
    .config("spark.executor.extraClassPath", udf_jar)
    .config("spark.executor.extraLibraryPath", "./shared/")
    .getOrCreate()
)

sc = spark.sparkContext

# Also register the jar with the SparkContext as a file.
# NOTE(review): addFile presumably only distributes the file and does not put
# it on the JVM classpath -- confirm this call is still needed alongside the
# spark.jars / extraClassPath settings above.
sc.addFile(udf_jar)

sqlContext = SQLContext(sc)

## Create test dataframe

In [3]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,DoubleType
# Sample rows; the positional order of each tuple matches the field order of
# `schema` below.
# `zip` is declared as StringType in the schema, so it is supplied as a string
# here (like `ssn` and `race`) rather than as a bare int, keeping the data
# consistent with the declared column type.
data2 = [
    ("James", "", "Smith", "3663636636", "M", 53000, 20, "96941", "1", 1.0, 2.0),
    ("Michael", "Rose", "", "4028840288", "M", 84000, 31, "96970", "2", 3.0, 4.0),
    ("Robert", "", "Williams", "4211442114", "M", 94000, 16, "96940", "3", 5.0, 6.0),
    ("Maria", "Anne", "Jones", "3919239192", "F", 64000, 22, "96960", "4", 7.0, 8.0),
    ("Jen", "Mary", "Brown", "5919233911", "F", 79000, 39, "96939", "5", 0.0, 9.0),
]

# Explicit schema; the third StructField argument marks every column nullable.
# No line-continuation backslashes are needed inside the brackets.
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("middle_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("ssn", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("zip", StringType(), True),
    StructField("race", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
])

# Build the test DataFrame and display its schema and full (untruncated) rows.
df1 = spark.createDataFrame(data=data2, schema=schema)
df1.printSchema()
df1.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- ssn: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- zip: string (nullable = true)
 |-- race: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+
|first_name|middle_name|last_name|ssn       |gender|salary|age|zip  |race|latitude|longitude|
+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+
|James     |           |Smith    |3663636636|M     |53000 |20 |96941|1   |1.0     |2.0      |
|Michael   |Rose       |         |4028840288|M     |84000 |31 |96970|2   |3.0     |4.0      |
|Robert    |           |Williams |4211442114|M     |94000 |16 |96940|3   |5.0     |6.0      |
|Maria     |Anne       |Jones    |3919239192|F     |64000 |22 |96960|4   |7.0     |8.0      |
|Jen       |Mary       |Brown    |5919233911|F     |79000 |39 |96939|5   |0.0     |9.0      |
+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+



                                                                                

## A. Call Java Method Directly

In [4]:
# Reach the static Java method through the SparkContext's JVM handle and
# invoke it directly from Python, without registering a UDF.
ssn = "123456789321"
java_functions = spark.sparkContext._jvm.pyspark.java.udf.JavaTransformFunctions
java_functions.transformFieldValueEnding(ssn, 4, '$')

932156789321


'9321$$$$$$$$'

## B. Call the java UDF with spark

In [5]:
# Register the Java UDF class under the name "transform_fieldvalue" with a
# string return type, so it can be used in SQL expressions.
spark.udf.registerJavaFunction(
    name="transform_fieldvalue",
    javaClassName="pyspark.java.udf.JavaFieldTransformUDF",
    returnType=T.StringType(),
)

In [6]:
# Use the registered Java UDF from the DataFrame API by wrapping the SQL call
# in F.expr, then add the result as a new "ssn_hide" column.
masked_ssn = F.expr("transform_fieldvalue(ssn, 2,'*')")
df1.withColumn("ssn_hide", masked_ssn).show()

+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+----------+
|first_name|middle_name|last_name|       ssn|gender|salary|age|  zip|race|latitude|longitude|  ssn_hide|
+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+----------+
|     James|           |    Smith|3663636636|     M| 53000| 20|96941|   1|     1.0|      2.0|36********|
|   Michael|       Rose|         |4028840288|     M| 84000| 31|96970|   2|     3.0|      4.0|88********|
|    Robert|           | Williams|4211442114|     M| 94000| 16|96940|   3|     5.0|      6.0|14********|
|     Maria|       Anne|    Jones|3919239192|     F| 64000| 22|96960|   4|     7.0|      8.0|92********|
|       Jen|       Mary|    Brown|5919233911|     F| 79000| 39|96939|   5|     0.0|      9.0|11********|
+----------+-----------+---------+----------+------+------+---+-----+----+--------+---------+----------+



## C. Call the java UDF in SQL Statement

In [7]:
# Expose the DataFrame as the temp view "people" so it can be queried with
# Spark SQL, then call the registered Java UDF inside the SELECT list.
df1.createOrReplaceTempView("people")
masking_query = """
    SELECT
        first_name, last_name, transform_fieldvalue(ssn, 4,'*') as ssn, transform_fieldvalue(zip, 2,'*') as zip
    FROM people
"""
spark.sql(masking_query).show()

+----------+---------+----------+-----+
|first_name|last_name|       ssn|  zip|
+----------+---------+----------+-----+
|     James|    Smith|6636******|41***|
|   Michael|         |0288******|70***|
|    Robert| Williams|2114******|40***|
|     Maria|    Jones|9192******|60***|
|       Jen|    Brown|3911******|39***|
+----------+---------+----------+-----+

