### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import json
import ads
import os

compartment_id = os.environ.get("NB_SESSION_COMPARTMENT_OCID")
logs_bucket_uri = "oci://dataflow_app@frqap2zhtzbe/log_pystudio"
#metastore_id = "ocid1.datacatalogmetastore.oc1.eu-frankfurt-1.amaaaaaangencdyadlqoeypyt3hks3g5j34axfyfl3rof5ug2z7vokyury3a"

def prepare_command(command: dict) -> str:
    """Converts dictionary command to the string formatted commands."""
    return f"'{json.dumps(command)}'"

ads.set_auth("resource_principal")  # Supported values: resource_principal, api_key

In [2]:
%load_ext dataflow.magics

In [6]:
command = prepare_command(
    {
        "compartmentId": compartment_id,
        "displayName": "TestS3Adw",
        "language": "PYTHON",
        "sparkVersion": "3.2.1",
        "numExecutors": 1,
        "driverShape": "VM.Standard.E4.Flex",
        "executorShape": "VM.Standard.E4.Flex",
        "driverShapeConfig": {"ocpus": 1, "memoryInGBs": 16},
        "executorShapeConfig": {"ocpus": 1, "memoryInGBs": 16},
        "type": "SESSION",
        "logsBucketUri": logs_bucket_uri,
        "configuration": {
            "fs.oci.client.hostname": "https://objectstorage.eu-frankfurt-1.oraclecloud.com",
            "spark.oracle.datasource.enabled":"true",
            "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.2.1,com.amazonaws:aws-java-sdk-s3:1.11.655,com.amazonaws:aws-java-sdk-core:1.11.655,org.apache.spark:spark-hadoop-cloud_2.12:3.2.1,org.postgresql:postgresql:42.7.1",
            "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
            "spark.hadoop.fs.s3a.access.key":"AKIAXWR7W6DM7ZX22DVB",
            "spark.hadoop.fs.s3a.secret.key":"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",
        },
    }
)

#Create a new DF app
%create_session -l python -c $command

Setting up the Cluster..


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Cluster is ready..
Starting Spark application..


Session ID,Kind,State,Current session
ocid1.dataflowapplication.oc1.eu-frankfurt-1.antheljsngencdyakjp64jlseztjqnu7vdknkryoq5guwzt6hgi2bupxm55q,pyspark,IN_PROGRESS,Dataflow Run


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.
SparkContext available as 'sc'.


In [5]:
%use_session -f -s 'ocid1.dataflowapplication.oc1.eu-frankfurt-1.antheljrngencdyasspcpi6yeyiaz362ru5ijzxpebnwgaipfkhi6pqeaipa'

No active session for provided sessionId.Please create a new session.


In [None]:
%%spark
print(spark.sparkContext._jsc.sc().listJars())
#print(spark.sparkContext._jsc.sc().jarOfClass(jvm.java.lang.Class.forName("com.google.protobuf.AbstractMessage")))
print(sc.version)


In [6]:
%%spark

file = "s3a://samplesdata/AssetSensorData.csv"

# Load our data.
print("Reading data from object store")
# Load our data.
df = (spark.read.format("csv")
      .option("inferSchema", "true")
      .option("header","true")
      .option("multiLine", "false")
      .option("delimiter",";")
      .option("dateFormat","dd.MM.yyyy")
      .load(file)
      .cache())

df.show()
df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Reading data from object store
+----------+--------+----+---------------------+-------------------+--------------+
|  DATE_KEY|PRESSURE| RPM|OPERATING_TEMPERATURE|BEARING_TEMPERATURE|MACHINE_STATUS|
+----------+--------+----+---------------------+-------------------+--------------+
|07.08.2016|    3700|5715|                   84|                 57|             0|
|09.08.2016|    3315|5582|                  116|                 69|             0|
|09.08.2016|    3179|2471|                   82|                 67|             0|
|07.01.2017|    4280|4793|                80,66|                 71|             1|
|07.01.2017|    4480|3086|                  120|                 71|             1|
|07.01.2017|    4280|2522|                 94,6|              76,86|             1|
|08.01.2017|    4320|4732|               121,98|              59,36|             1|
|08.01.2017|    4200|3105|                  112|              68,88|             1|
|08.01.2017|    4640|4436|                  1

In [8]:
%%spark

# save to s3 again.

file_out = "s3a://samplesdata/AssetSensorParquet"
df.write.mode('overwrite').parquet(file_out)

In [19]:
%%spark
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+--------+----+---------------------+-------------------+--------------+
|  DATE_KEY|PRESSURE| RPM|OPERATING_TEMPERATURE|BEARING_TEMPERATURE|MACHINE_STATUS|
+----------+--------+----+---------------------+-------------------+--------------+
|07.08.2016|    3700|5715|                   84|                 57|             0|
|09.08.2016|    3315|5582|                  116|                 69|             0|
|09.08.2016|    3179|2471|                   82|                 67|             0|
|07.01.2017|    4280|4793|                80,66|                 71|             1|
|07.01.2017|    4480|3086|                  120|                 71|             1|
|07.01.2017|    4280|2522|                 94,6|              76,86|             1|
|08.01.2017|    4320|4732|               121,98|              59,36|             1|
|08.01.2017|    4200|3105|                  112|              68,88|             1|
|08.01.2017|    4640|4436|                  119|              76,88|        

In [26]:
%%spark
# read some json from s3


file_name = "s3a://samplesdata/balance_transaction.json"

dj = spark.read.option("multiline", "true").json(file_name)

dj.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- action: string (nullable = true)
 |-- amount_cents: string (nullable = true)
 |-- api_version: string (nullable = true)
 |-- balance: string (nullable = true)
 |-- balance_before: string (nullable = true)
 |-- bonus: string (nullable = true)
 |-- bonus_amount_cents: string (nullable = true)
 |-- client_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- game_info: string (nullable = true)
 |-- id: string (nullable = true)
 |-- inserted_at: string (nullable = true)
 |-- msg_id: string (nullable = true)
 |-- reference_id: string (nullable = true)
 |-- reference_type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- version: string (nullable = true)

In [16]:
%%spark

spark.conf.set("spark.hadoop.fs.s3a.access.key", '2c46ed9ff31e07c806cd783f150eb85536c7169c')
spark.conf.set('spark.hadoop.fs.s3a.secret.key', 'AJI9Bg3wf57bM7Nbfnka4RwPpvCew71wQ3oHI3O+2AY=')
   
    
# read some json from s3
file_name = "s3a://frqap2zhtzbe.compat.objectstorage.eu-frankfurt-1.oraclecloud.com/dataflow_app/*.parquet"
df = spark.read.parquet(file_name)
df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o216.parquet.
: org.apache.hadoop.fs.s3a.UnknownStoreException: s3a://frqap2zhtzbe.compat.objectstorage.eu-frankfurt-1.oraclecloud.com/dataflow_app
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:257)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:117)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.listStatus(S3AFileSystem.java:2810)
	at org.apache.hadoop.fs.Globber.listStatus(Globber.java:128)
	at org.apache.hadoop.fs.Globber.doGlob(Globber.java:291)
	at org.apache.hadoop.fs.Globber.glob(Globber.java:202)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4253)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.globStatus(S3AFileSystem.java:4233)
	at org.apache.spark.deploy.SparkHadoopUtil.globPath(SparkHadoopUtil.scala:253)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:760)
	at org.apache.spark.util.ThreadUtils$.$anonf

In [30]:
%%spark

schema = dj.schema.json()
print(schema)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{"fields":[{"metadata":{},"name":"_id","nullable":true,"type":"string"},{"metadata":{},"name":"account_id","nullable":true,"type":"string"},{"metadata":{},"name":"action","nullable":true,"type":"string"},{"metadata":{},"name":"amount_cents","nullable":true,"type":"string"},{"metadata":{},"name":"api_version","nullable":true,"type":"string"},{"metadata":{},"name":"balance","nullable":true,"type":"string"},{"metadata":{},"name":"balance_before","nullable":true,"type":"string"},{"metadata":{},"name":"bonus","nullable":true,"type":"string"},{"metadata":{},"name":"bonus_amount_cents","nullable":true,"type":"string"},{"metadata":{},"name":"client_name","nullable":true,"type":"string"},{"metadata":{},"name":"created_at","nullable":true,"type":"string"},{"metadata":{},"name":"currency","nullable":true,"type":"string"},{"metadata":{},"name":"game_info","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"inserted_at","nullable":true

In [7]:
%%spark

plsql_block = """
BEGIN
      ADMIN.test_proc;
END;
"""

ds = spark.read.format("oracle") \
    .option("adbId","ocid1.autonomousdatabase.oc1.eu-frankfurt-1.antheljsngencdyase2z3keeufsr4r5wtnskyurvhbd4txb63nb64qu5tvlq") \
    .option("dbtable", "testdataflow") \
    .option("user", "admin")\
    .option("password", "WelcomeBack123#2xxe2")\
    .option("sessionInitStatement", plsql_block) \
    .load()

ds.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0

In [7]:
%stop_session

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Session has been stopped successfully.


In [11]:
%%spark
df = spark.read.parquet('oci://dataflow_app@frqap2zhtzbe/testdata_year=2024_month=2024-05_day=2024-05-06_hour=09_part-00000-d6d45e02-0c9b-401f-914c-588781770fb2.c000.snappy.parquet')
df.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

695

In [12]:
%%spark
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- kafka_time: timestamp (nullable = true)
 |-- account_id: long (nullable = true)
 |-- action: string (nullable = true)
 |-- amount_cents: long (nullable = true)
 |-- api_version: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- balance_before: long (nullable = true)
 |-- amount_locked_cents: long (nullable = true)
 |-- amount_wager_cents: long (nullable = true)
 |-- amount_wager_requirement_cents: long (nullable = true)
 |-- bonus_issue_id: long (nullable = true)
 |-- bonus_amount_cents: long (nullable = true)
 |-- client_name: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- currency: string (nullable = true)
 |-- id: long (nullable = true)
 |-- msg_id: string (nullable = true)
 |-- reference_id: long (nullable = true)
 |-- reference_type: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- version: long (nullable = true)
 |-- game_table_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- tx_number: lo