### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests

# Probe a public site to confirm outbound internet access from the session.
# The timeout makes the check fail fast: without it, requests waits
# indefinitely when outbound traffic is silently dropped.
response = requests.get("https://oracle.com", timeout=10)
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

import logging
# Root logger: emit "<LEVEL>:<message>" lines, and only for ERROR and above.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os

# Print each environment variable listed below, in this order.
# A missing variable raises KeyError, exactly as a direct lookup would.
for env_name in (
    "NB_SESSION_COMPARTMENT_OCID",
    "PROJECT_OCID",
    "USER_OCID",
    "TENANCY_OCID",
    "NB_REGION",
):
    print(os.environ[env_name])
```
</details>

In [63]:
import os

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf


# Create a Spark session with your AWS credentials.
#
# The keys are read from the standard AWS environment variables instead of
# being typed into the notebook, so no secret is stored in the saved file.
# A missing variable raises KeyError naming the variable that is not set.
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

# Maven coordinates resolved at session start-up; joined into the single
# comma-separated string that spark.jars.packages expects.
SPARK_JAR_PACKAGES = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.2.1",
        "com.amazonaws:aws-java-sdk-s3:1.11.655",
        "com.amazonaws:aws-java-sdk-core:1.11.655",
        "org.apache.spark:spark-hadoop-cloud_2.12:3.2.1",
    ]
)

conf = (
    SparkConf()
    .setAppName("MY_APP")  # replace with your desired name
    .set("spark.jars.packages", SPARK_JAR_PACKAGES)
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    # Route the plain s3:// scheme through the S3A connector as well.
    .set("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()


In [54]:
import os

# Apply the S3A settings on the running session.
# Credentials come from the standard AWS environment variables so that no
# secret is stored in the notebook; a missing variable raises KeyError.
spark.conf.set('spark.hadoop.fs.s3a.access.key', os.environ["AWS_ACCESS_KEY_ID"])
spark.conf.set('spark.hadoop.fs.s3a.secret.key', os.environ["AWS_SECRET_ACCESS_KEY"])
spark.conf.set('spark.hadoop.fs.s3a.path.style.access', 'true')
spark.conf.set("spark.hadoop.fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")
    

In [64]:
# Read back the filesystem implementation class configured under the fs.s3.impl key.
spark.conf.get('spark.hadoop.fs.s3.impl')

'org.apache.hadoop.fs.s3a.S3AFileSystem'

In [65]:
# Semicolon-delimited CSV with a header row, read from the object store.
file = "s3a://samplesdata/AssetSensorData.csv"

print("Reading data from object store")

# All reader options are passed in one call; values are kept as the same strings.
csv_reader = spark.read.format("csv").options(
    inferSchema="true",
    header="true",
    multiLine="false",
    delimiter=";",
    dateFormat="dd.MM.yyyy",
)
df = csv_reader.load(file)

# Preview the first rows, then report the total row count.
df.show()
df.count()

Reading data from object store
+----------+--------+----+---------------------+-------------------+--------------+
|  DATE_KEY|PRESSURE| RPM|OPERATING_TEMPERATURE|BEARING_TEMPERATURE|MACHINE_STATUS|
+----------+--------+----+---------------------+-------------------+--------------+
|07.08.2016|    3700|5715|                   84|                 57|             0|
|09.08.2016|    3315|5582|                  116|                 69|             0|
|09.08.2016|    3179|2471|                   82|                 67|             0|
|07.01.2017|    4280|4793|                80,66|                 71|             1|
|07.01.2017|    4480|3086|                  120|                 71|             1|
|07.01.2017|    4280|2522|                 94,6|              76,86|             1|
|08.01.2017|    4320|4732|               121,98|              59,36|             1|
|08.01.2017|    4200|3105|                  112|              68,88|             1|
|08.01.2017|    4640|4436|                  1

1981

In [59]:
# Load a single Parquet part file from the object store and count its rows.
file = "s3a://baltrans/testdata_year=2024_month=2024-05_day=2024-05-06_hour=09_part-00000-d6d45e02-0c9b-401f-914c-588781770fb2.c000.snappy.parquet"
parquet_reader = spark.read.format("parquet")
df = parquet_reader.load(file)
df.count()


                                                                                

695

In [66]:
# Load the balance-transaction JSON file from the object store and count its rows.
file = "s3a://samplesdata/balance_transaction.json"
json_reader = spark.read.format("json")
df = json_reader.load(file)
df.count()


                                                                                

110002

In [56]:
# Shut down the Spark session.
# NOTE(review): a later cell in this notebook still calls spark.read, so in
# top-to-bottom order it would run against a stopped session — confirm whether
# this cell should be the last one.
spark.stop()


In [69]:
# Load the small sample JSON file from the object store and count its rows.
file = "s3a://samplesdata/smalldata.json"
json_reader = spark.read.format("json")
df = json_reader.load(file)
df.count()


                                                                                

528