# PySpark - Apply Function


In [25]:
%%html
<style>
/* Float every rendered table to the left. */
table {float:left}
</style>

In [26]:
%%html
<style>
/* Keep whitespace in output-area <pre> text as written and break lines only at
   newlines (presumably so wide .show() tables are not wrapped). */
div.output_area pre {
    white-space: pre;
}
</style>

In [27]:
import gc
import getpass
import os
import sys

import numpy as np

In [51]:
# Login name of the user running this kernel; later cells use it to build
# per-user paths. getpass.getuser() replaces the `!whoami` shell call so the
# cell is plain Python and USER is a str from the start (no intermediate list).
USER = getpass.getuser()

# Environment Variables

## Hadoop

In [28]:
# Set HADOOP_CONF_DIR for this kernel process. Spark on YARN is documented to
# read this variable to find the client-side Hadoop configuration files.
os.environ['HADOOP_CONF_DIR'] = "/opt/hadoop/hadoop-3.2.2/etc/hadoop"

In [29]:
%%bash
# List the first five entries of the Hadoop configuration directory.
export HADOOP_CONF_DIR="/opt/hadoop/hadoop-3.2.2/etc/hadoop"
ls $HADOOP_CONF_DIR | head -n 5

capacity-scheduler.xml
configuration.xsl
container-executor.cfg
core-site.xml
core-site.xml.48132.2022-02-15@12:29:41~


## PYTHONPATH

Load the **pyspark** modules from ```$SPARK_HOME/python/lib``` in the Spark installation.

* [PySpark Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)

> Ensure the SPARK_HOME environment variable points to the directory where the tar file has been extracted. Update PYTHONPATH environment variable such that it can find the PySpark and Py4J under SPARK_HOME/python/lib. One example of doing this is shown below:

```
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
```

Alternatively, install **pyspark** locally with pip or conda, which also installs the Spark runtime libraries (standalone only).

* [Can PySpark work without Spark?](https://stackoverflow.com/questions/51728177/can-pyspark-work-without-spark)

> As of v2.2, executing pip install pyspark will install Spark. If you're going to use Pyspark it's clearly the simplest way to get started. On my system Spark is installed inside my virtual environment (miniconda) at lib/python3.6/site-packages/pyspark/jars  
> PySpark has a Spark installation installed. If installed through pip3, you can find it with pip3 show pyspark. Ex. for me it is at ~/.local/lib/python3.8/site-packages/pyspark. This is a standalone configuration so it can't be used for managing clusters like a full Spark installation.

In [30]:
# Make the PySpark and Py4J archives bundled with the Spark distribution
# importable from this kernel (the in-process equivalent of putting them on
# PYTHONPATH before the kernel starts).
SPARK_PYTHON_LIB = "/opt/spark/spark-3.1.2/python/lib"
PYSPARK_ARCHIVES = [
    f"{SPARK_PYTHON_LIB}/py4j-0.10.9-src.zip",
    f"{SPARK_PYTHON_LIB}/pyspark.zip",
]

# Append only the entries that are missing so that re-running this cell does
# not keep growing sys.path with duplicates.
_missing_archives = [path for path in PYSPARK_ARCHIVES if path not in sys.path]
sys.path.extend(_missing_archives)

## PYSPARK_PYTHON

In [53]:
# Python interpreter PySpark should use; the path is built from USER.
# NOTE(review): assumes this virtualenv exists at the same path on every node
# that runs Python workers — confirm.
os.environ['PYSPARK_PYTHON'] = f"/home/{USER}/venv/ml/bin/python3"

## PySpark packages

Execute after the PYTHONPATH setup.

In [32]:
import pyspark.sql 
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col,
    lit,
    isnan,
    lower,
    concat,
    udf,
    array
)

---
# Spark Session


In [33]:
from pyspark.sql import SparkSession

In [34]:
# Start (or reuse) a Spark session on YARN in client deploy mode.
# The executor interpreter is taken from the PYSPARK_PYTHON environment
# variable set earlier in this notebook, so the path is defined in one place
# and cannot drift from it.
# NOTE(review): 'spark.yarn.appMasterEnv.PYSPARK_PYTHON' is not set; it would
# presumably be needed if the deploy mode is switched to 'cluster' — confirm.
spark = (
    SparkSession.builder
    .master('yarn')
    .config('spark.submit.deployMode', 'client')
    .config('spark.debug.maxToStringFields', 100)
    .config('spark.executor.memory', '2g')
    .config('spark.yarn.executorEnv.PYSPARK_PYTHON', os.environ['PYSPARK_PYTHON'])
    .getOrCreate()
)

2022-02-23 11:33:58,531 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [35]:
# Sizing knobs: the partition count applied below is NUM_CORES * NUM_PARTITIONS.
NUM_CORES = 4
NUM_PARTITIONS = 3

# Apply the same count to both settings, in this order.
# NOTE(review): spark.default.parallelism is normally read when the
# SparkContext starts; confirm that setting it on a live session has the
# intended effect.
for conf_key in ("spark.sql.shuffle.partitions", "spark.default.parallelism"):
    spark.conf.set(conf_key, NUM_CORES * NUM_PARTITIONS)

# Apply function on all columns

## upper/lower function

Example of applying a function to every column using functools.reduce().

1. Go through all the columns one by one.
2. Apply function ```pyspark.sql.functions.lower``` on each column. 

In [36]:
from functools import reduce
from pyspark.sql.dataframe import DataFrame

def f(df: DataFrame, column: str) -> DataFrame:
    """Return ``df`` with ``column`` replaced by its lower-cased value.

    The frame comes first and the column name second, so the function can be
    used as the folding step of ``functools.reduce`` over column names.
    """
    lowered = lower(col(column))
    return df.withColumn(column, lowered)


# Two-row sample frame with mixed-case strings, displayed before transformation.
source_df = spark.createDataFrame(
    [("Jose", "BLUE"), ("lI", "BrOwN")],
    ["name", "eye_color"],
)
source_df.show()

# Fold f across the column names, seeding the fold with source_df, so each
# step lower-cases one more column of the running result.
applied = reduce(f, source_df.columns, source_df)
applied.show()

                                                                                

+----+---------+
|name|eye_color|
+----+---------+
|Jose|     BLUE|
|  lI|    BrOwN|
+----+---------+

+----+---------+
|name|eye_color|
+----+---------+
|jose|     blue|
|  li|    brown|
+----+---------+



In [37]:
# Add a "concatenated" column: name, a single space, then eye_color.
(
    source_df
    .withColumn("concatenated", concat(col("name"), lit(" "), col("eye_color")))
    .show()
)

+----+---------+------------+
|name|eye_color|concatenated|
+----+---------+------------+
|Jose|     BLUE|   Jose BLUE|
|  lI|    BrOwN|    lI BrOwN|
+----+---------+------------+



## Concatenate columns

In [38]:
def g(row):
    """Build a 1-tuple holding the row's name and eye_color joined by a space."""
    parts = (row["name"], row["eye_color"])
    return (" ".join(parts),)
              
# Apply g to every row through the RDD API, then convert back to a DataFrame.
# toDF() is called without column names, so the single column shows as "_1".
source_df.rdd.map(g).toDF().show()

+---------+
|       _1|
+---------+
|Jose BLUE|
| lI BrOwN|
+---------+



---
# UDF

* [How to Turn Python Functions into PySpark Functions (UDF)](https://changhsinlee.com/pyspark-udf/)

> Spark UDF doesn’t convert integers to floats, unlike Python function which works for both integers and floats, a Spark UDF will return a column of NULLs if the input data type doesn’t match the output data type

```
## Force the output to be float
square_udf_float2 = udf(lambda z: float(z**2), FloatType())
```

OR 
```
@udf("float")
def square_udf_float(x):
    return float(x**2)
```

## Apply numpy functions on columns

* [apply udf to multiple columns and use numpy operations](https://stackoverflow.com/a/58179373/4281353)

In [44]:
# Sample integer columns used by the UDF examples.
df = spark.createDataFrame(
    [
        (138, 5, 10),
        (128, 4, 10),
        (112, 3, 10),
        (120, 3, 10),
        (189, 1, 10),
    ],
    ["count", "df", "docs"],
)
df.show()

+-----+---+----+
|count| df|docs|
+-----+---+----+
|  138|  5|  10|
|  128|  4|  10|
|  112|  3|  10|
|  120|  3|  10|
|  189|  1|  10|
+-----+---+----+



In [45]:
@udf("float")
def newFunction(count, df, docs):
    """Compute ``(1 + ln(count)) * ln(docs / df)`` for one row.

    Registered as a Spark UDF with return type ``float``.

    Args:
        count: value of the ``count`` column.
        df: value of the ``df`` column (the divisor).
        docs: value of the ``docs`` column.

    Returns:
        The result as a built-in ``float``, or ``None`` (SQL NULL) when any
        input is NULL or ``df`` is zero.
    """
    import numpy as np

    # Propagate NULLs instead of raising TypeError inside the arithmetic.
    if count is None or df is None or docs is None:
        return None
    # A zero divisor would raise ZeroDivisionError and fail the job;
    # return NULL for that row instead.
    if df == 0:
        return None

    returnValue = (1 + np.log(count)) * np.log(docs / df)
    # .item() unwraps the numpy scalar into a built-in Python float.
    return returnValue.item()

In [46]:
# Evaluate the UDF for every row; the input columns are passed by name.
(
    df
    .withColumn("new_function_result", newFunction("count", "df", "docs"))
    .show()
)

+-----+---+----+-------------------+
|count| df|docs|new_function_result|
+-----+---+----+-------------------+
|  138|  5|  10|           4.108459|
|  128|  4|  10|           5.362161|
|  112|  3|  10|          6.8849173|
|  120|  3|  10|           6.967983|
|  189|  1|  10|          14.372153|
+-----+---+----+-------------------+



## Square the column

In [42]:
@udf("float")
def square_udf_float(x):
    """Square one value and return it as a built-in float.

    Registered as a Spark UDF with return type ``float``. The explicit
    ``float(...)`` conversion makes the returned Python type match the declared
    return type even when ``x`` is an integer.

    Args:
        x: the column value for the current row.

    Returns:
        ``float(x ** 2)``, or ``None`` (SQL NULL) when ``x`` is NULL.
    """
    # Propagate NULLs instead of raising TypeError on None ** 2.
    if x is None:
        return None
    return float(x**2)

In [47]:
# Add a "square" column holding the squared count for every row.
(
    df
    .withColumn("square", square_udf_float("count"))
    .show()
)

+-----+---+----+-------+
|count| df|docs| square|
+-----+---+----+-------+
|  138|  5|  10|19044.0|
|  128|  4|  10|16384.0|
|  112|  3|  10|12544.0|
|  120|  3|  10|14400.0|
|  189|  1|  10|35721.0|
+-----+---+----+-------+



---
# Stop Spark Session

In [21]:
# Stop the Spark session created earlier in this notebook.
spark.stop()



# Cleanup

In [22]:
# Remove the notebook's reference to the session object, then run a full
# garbage-collection pass.
del spark
# As the cell's last expression, the value returned by gc.collect() is displayed.
gc.collect()

715