In [None]:
# Upgrade Oracle ADS to pick up latest features and maintain compatibility with Oracle Cloud Infrastructure.

!pip install -U oracle-ads

Oracle Data Science service sample notebook.

Copyright (c) 2019, 2022 Oracle, Inc. All rights reserved. Licensed under the [Universal Permissive License v 1.0](https://oss.oracle.com/licenses/upl).

---

# <font color="red">PySpark</font>
<p style="margin-left:10%; margin-right:10%;">by the <font color="teal">Oracle Cloud Infrastructure Data Science Service.</font></p>

---

# Overview:

This notebook provides Apache Spark operations for customers by bridging the existing local PySpark workflows with cloud-based capabilities. Data scientists can use their familiar local environments with JupyterLab and work with remote data and remote clusters simply by selecting a kernel. The operations that will be demonstrated are: how to use the interactive Spark environment and produce a Spark script; how to prepare and create an application; how to prepare and create a run; how to list existing Data Flow applications; and how to retrieve and display the logs.

The purpose of the `dataflow` module is to provide an efficient and convenient way for users to launch a Spark application and run Spark jobs. The interactive Spark kernel provides a simple and efficient way to edit and build your Spark script, and easy access to read from OCI Object Storage.

Compatible conda pack: [PySpark 2.4 and Data Flow](https://docs.oracle.com/iaas/data-science/using/conda-pyspark-fam.htm) for CPU on Python 3.7 (version 3.0)

---

## Contents:

- <a href='#kernel'>Build a PySpark Script Using an Interactive Spark Kernel</a>
- <a href="#ref">References</a>

---


Datasets are provided as a convenience.  Datasets are considered third-party content and are not considered materials 
under your agreement with Oracle.
    
You can access the `orcl_attrition` dataset license [here](https://oss.oracle.com/licenses/upl).

---


In [None]:
import io
import matplotlib.pyplot as plt
import os
import pandas as pd
import tempfile
import uuid

from ads.dataflow.dataflow import DataFlow
from os import path
from pyspark.sql import SparkSession

<a id='kernel'></a>
# Build a PySpark Script Using an Interactive Spark Kernel 

Set up a Spark session in your PySpark conda environment.

In [None]:
# Build the Spark session used throughout the notebook (an existing session is reused if there is one).
session_builder = SparkSession.builder.appName("Python Spark SQL basic example")
for conf_key, conf_value in (
    ("spark.driver.cores", "4"),
    ("spark.executor.cores", "4"),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Restrict Spark logging to errors to keep the notebook output readable.
spark.sparkContext.setLogLevel("ERROR")

Load the Employee Attrition data file from Oracle Cloud Infrastructure Object Storage into an Apache Spark DataFrame

In [None]:
# Object Storage location of the employee attrition CSV file.
attrition_csv_uri = (
    "oci://hosted-ds-datasets@bigdatadatasciencelarge/synthetic/orcl_attrition.csv"
)

# Configure the CSV reader: header row, inferred column types, multi-line records.
csv_reader = spark.read.format("csv")
for option_name, option_value in (
    ("header", "true"),
    ("inferSchema", "true"),
    ("multiLine", "true"),
):
    csv_reader = csv_reader.option(option_name, option_value)

# Cache the dataset to increase computing speed.
emp_attrition = csv_reader.load(attrition_csv_uri).cache()

# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
emp_attrition.createOrReplaceTempView("emp_attrition")

Explore the dataframe

In [None]:
# Fetch the first five rows and convert them to pandas so the notebook displays them as a table.
preview_rows = spark.sql("select * from emp_attrition limit 5")
preview_rows.toPandas()

Visualize how monthly income and age relate to one another in the context of years in industry

In [None]:
# Create the figure and axes that the scatter plot is drawn on.
fig, ax = plt.subplots()

# Query the three columns used by the chart and bring them into pandas.
income_by_age = spark.sql(
    """
          SELECT 
              Age,
              MonthlyIncome,
              YearsInIndustry
          FROM
            emp_attrition 
          """
).toPandas()

# Age on the x-axis, monthly income on the y-axis, colour encodes years in industry.
scatter_options = {
    "x": "Age",
    "y": "MonthlyIncome",
    "title": "Age vs Monthly Income",
    "c": "YearsInIndustry",
    "cmap": "viridis",
    "figsize": (12, 12),
    "ax": ax,
}
plot = income_by_age.plot.scatter(**scatter_options)

# Use readable axis labels instead of the raw column names.
plot.set_xlabel("Age")
plot.set_ylabel("Monthly Income")
plot

View all of the columns in the table

In [None]:
# Print the column names of the emp_attrition view.
column_listing = spark.sql("show columns from emp_attrition")
column_listing.show()

Select a few columns using Apache Spark and convert the result into a pandas DataFrame.

In [None]:
# Select three columns with Spark SQL.
selected_columns = spark.sql(
    """
         SELECT
            Age,
            MonthlyIncome,
            YearsInIndustry
          FROM
            emp_attrition """
)

# Keep the first ten rows and convert them to a pandas DataFrame for display.
df = selected_columns.limit(10).toPandas()
df

We can also work with different compression formats within Data Flow. For example, Snappy-compressed Parquet:

In [None]:
# Write the pandas DataFrame to a snappy-compressed parquet file in the working directory,
# then read it back with pandas to display what was written.
snappy_parquet_name = "emp_attrition.parquet.snappy"
df.to_parquet(snappy_parquet_name, compression="snappy")
pd.read_parquet(snappy_parquet_name)

In [None]:
# The snappy parquet file in the working directory can also be read into an Apache Spark DataFrame.
# NOTE(review): `spark.io.compression.codec` appears to control compression of Spark's internal data
# rather than Parquet decoding — confirm whether this setting is actually required here.
snappy_session = (
    SparkSession.builder.appName("Snappy Compression Loading Example")
    .config("spark.io.compression.codec", "org.apache.spark.io.SnappyCompressionCodec")
    .getOrCreate()
)
snappy_parquet_path = f"{os.getcwd()}/emp_attrition.parquet.snappy"
read_snappy_df = snappy_session.read.format("parquet").load(snappy_parquet_path)

# Display the first row as a quick check that the file was loaded.
read_snappy_df.first()

Note: The compression formats that Data Flow supports today include Snappy Parquet (example above) and gzip on both CSV and Parquet.

Our earlier exploration has produced a query that we now want to run in Data Flow. Refer to the `dataflow.ipynb` notebook for details on how to submit a job to Data Flow.

In [None]:
# Create a fresh temporary directory and use it as the base folder of the ADS DataFlow client.
dataflow_base_folder = tempfile.mkdtemp()
data_flow = DataFlow(dataflow_base_folder=dataflow_base_folder)
print(f"Data flow directory: {dataflow_base_folder}")

In [None]:
# Give the script a short random suffix (last six characters of a UUID) and place it in the
# Data Flow base folder.
script_name = f"example-{str(uuid.uuid4())[-6:]}.py"
pyspark_file_path = path.join(dataflow_base_folder, script_name)

# Source of the PySpark script; the doubled backslashes become single backslashes in the file.
script = '''
from pyspark.sql import SparkSession

def main():
    
    # Create a Spark session
    spark = SparkSession \\
        .builder \\
        .appName("Python Spark SQL basic example") \\
        .getOrCreate()
    
    # Load a csv file from dataflow public storage
    df = spark \\
        .read \\
        .format("csv") \\
        .option("header", "true") \\
        .option("multiLine", "true") \\
        .load("oci://hosted-ds-datasets@bigdatadatasciencelarge/synthetic/orcl_attrition.csv")
    
    # Create a temp view and do some SQL operations
    df.createOrReplaceTempView("emp_attrition")
    query_result_df = spark.sql("""
        SELECT 
            Age,
            MonthlyIncome,
            YearsInIndustry
        FROM emp_attrition 
    """)
    
    # Convert the filtered Apache Spark DataFrame into JSON format
    # Note: we are writing to the Spark stdout log so that we can retrieve the log later at the end of the notebook.
    print('\\n'.join(query_result_df.toJSON().collect()))
    
if __name__ == '__main__':
    main()
'''

# Write the script without its surrounding blank lines, terminated by a single newline.
with open(pyspark_file_path, "w") as script_file:
    script_file.write(script.strip() + "\n")

print(f"Script path: {pyspark_file_path}")

In [None]:
# Bucket names and display names for the Data Flow application and its run.
script_bucket = "test"  # Update the value
logs_bucket = "dataflow-log"  # Update the value
display_name = "sample_Data_Flow_app"
run_display_name = "sample_Data_Flow_run"

# Prepare the application configuration, then create the application from it.
app_settings = {
    "display_name": display_name,
    "script_bucket": script_bucket,
    "pyspark_file_path": pyspark_file_path,
    "logs_bucket": logs_bucket,
}
app_config = data_flow.prepare_app(**app_settings)
app = data_flow.create_app(app_config)

# Prepare the run configuration and start the run, asking for the log to be saved locally.
run_config = app.prepare_run(run_display_name=run_display_name)
run = app.run(run_config, save_log_to_local=True)

In [None]:
# Display the `status` attribute of the run (presumably its current lifecycle state — confirm against the ADS docs).
run.status

In [None]:
# Display the `config` attribute of the run (presumably the configuration the run was started with — confirm).
run.config

In [None]:
# Display the `oci_link` attribute of the run (presumably a link to the run in the OCI Console — confirm).
run.oci_link

<a id="ref"></a>
# References

- [ADS Library Documentation](https://accelerated-data-science.readthedocs.io/en/latest/index.html)
- [Data Science YouTube Videos](https://www.youtube.com/playlist?list=PLKCk3OyNwIzv6CWMhvqSB_8MLJIZdO80L)
- [OCI Data Science Documentation](https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm)
- [Oracle Data & AI Blog](https://blogs.oracle.com/datascience/)