In [None]:
Oracle AI Data Platform v1.0

Copyright © 2025, Oracle and/or its affiliates.

Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

# Access Object Storage Data
 **Accessing External Data in OCI from AI Data Platform**
 
This notebook demonstrates how to efficiently read and write data from OCI Object Storage using AI Data Platform. You’ll learn three different approaches:
 
 1. **Direct Read/Write to OCI Object Storage**
 2. **Accessing OCI Object Storage via an External Volume**
    - Creating an external volume
    - Reading and writing data
 3. **Using an External Table Referencing OCI Object Storage**
    - Creating an external table
    - Reading and writing data

 **Prerequisites**

Before you begin, ensure you have:
 - An OCI Object Storage bucket created in your tenancy.
 - The necessary IAM policies for accessing AI Data Platform. Learn more about permissions.
 - A configured AI Data Platform environment with a compute cluster created.


 **Parameters**
 
Change the values in the parameters section before executing the parameter cell and the remaining cells:
 - OCI Object Storage bucket name and namespace - you must change these to your own bucket name and namespace
 - Folder and file name
 - Catalog name
 - Schema name
 - External volume name
 - External table name

 **Next Steps**

Follow the step-by-step instructions in this notebook to implement each approach. You can also refer to the following resources for additional guidance:
 - Working with OCI Object Storage
 - Mounting OCI Object Storage as an External Volume
 - Using External Tables in AI Data Platform

By the end of this notebook, you’ll be able to choose the best method for accessing Object Storage data in your OCI environment.

In [None]:
# Notebook parameters (these can also be supplied by a workflow job).
# Each call passes a parameter name and a second value that appears to be the
# fallback default -- NOTE(review): confirm against the oidlUtils documentation.

# Object Storage location: replace the bucket name and namespace with your own.
oci_bucket = oidlUtils.parameters.getParameter(
    "OCI_BUCKET", "oci://replace_bucket_name@replace_namespace"
)

# Folder and file name used for the sample data inside the bucket.
folder_name = oidlUtils.parameters.getParameter("FOLDER_NAME", "new_data_folder")
file_name = oidlUtils.parameters.getParameter("FILE_NAME", "new_data")

# Catalog and schema that will hold the external volume and external table.
catalog_name = oidlUtils.parameters.getParameter("CATALOG_NAME", "default")
schema_name = oidlUtils.parameters.getParameter("SCHEMA_NAME", "default")

# Names of the objects this notebook creates and later drops.
external_volume_name = oidlUtils.parameters.getParameter(
    "EXTERNAL_VOLUME_NAME", "ext_volume"
)
external_table_name = oidlUtils.parameters.getParameter(
    "EXTERNAL_TABLE_NAME", "ext_table"
)

# End of the parameters that need to be set; the remaining cells can be run as-is.

In [1]:
# Define paths
# Folder/file suffix shared by all three access methods.
relative_file_path = f"{folder_name}/{file_name}"

# Direct Object Storage URI of the sample data.
oci_file_path = f"{oci_bucket}/{relative_file_path}"
# The same folder/file addressed under /Volumes through the external volume.
external_volume_path = (
    f"/Volumes/{catalog_name}/{schema_name}/{external_volume_name}/{relative_file_path}"
)
# Location referenced by the external table (same URI as the direct path).
external_table_path = f"{oci_bucket}/{relative_file_path}"

In [None]:
# Display parameter values so the effective configuration is visible before
# any data is written or any catalog object is created.
print(f"Using OCI Bucket: {oci_bucket}")
print(f"Folder Name: {folder_name}")
print(f"File Name: {file_name}")
print(f"External Volume Name: {external_volume_name}")
print(f"External Volume Path: {external_volume_path}")
print(f"External Table Name: {external_table_name}")
print(f"External Table Path: {external_table_path}")
# The catalog is part of every fully qualified volume/table name used below,
# so show it alongside the schema.
print(f"Catalog: {catalog_name}")
print(f"Schema: {schema_name}")


# **1. Directly Read from and Write to OCI Object Storage**
We will:
 - Create a sample DataFrame.
 - Write the DataFrame directly to an OCI Object Storage bucket in CSV format (you can experiment with other formats).
 - Read the data back from OCI Object Storage.


In [1]:
# Build a small three-row DataFrame to use as sample data.
data = [("Alice", 30), ("Bob", 35), ("Charlie", 25)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

# Save the sample as CSV with a header row at the Object Storage path,
# replacing whatever is already stored there.
(
    df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save(oci_file_path)
)
print(f"Data written directly to {oci_file_path}")

# Load the same location back to confirm the round trip.
df_read = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(oci_file_path)
)
print("Data read directly from OCI:")
df_read.show()

# **2. Accessing OCI Object Storage via an External Volume**
We will:
 - Create an **external volume** referencing OCI Object Storage.
 - Write data to the external volume.
 - Read data back from the external volume.

# 2-1. Create External Volume referencing Object Storage


In [1]:
# Fully qualified volume name: <catalog>.<schema>.<volume>.
volume_identifier = f"{catalog_name}.{schema_name}.{external_volume_name}"

# The volume is rooted at the bucket itself; "if not exists" keeps the cell re-runnable.
create_volume_sql = (
    f"create external volume if not exists {volume_identifier} "
    f"location '{oci_bucket}/'"
)
spark.sql(create_volume_sql)
print(f"External volume '{external_volume_name}' created.")

# 2-2. Read/Write via External Volume

In [None]:
# Write the sample DataFrame through the external volume path (CSV with header),
# replacing any existing content at that location.
(
    df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save(external_volume_path)
)
print(f"Data written to external volume at {external_volume_path}")

# Read the data back through the same volume path.
df_volume = (
    spark.read
    .format("csv")
    .option("header", True)
    .load(external_volume_path)
)
print("Data read from external volume:")
df_volume.show()


In [1]:
# Cleanup: drop the external volume created in step 2-1.
volume_to_drop = f"{catalog_name}.{schema_name}.{external_volume_name}"
drop_volume_sql = f"drop volume {volume_to_drop}"
spark.sql(drop_volume_sql)
print(f"External volume '{external_volume_name}' dropped.")

# **3. Using an External Table Referencing OCI Object Storage**
We will:
- Create an **external table** referencing OCI Object Storage.
- Write data to the external table location.
- Read data from the external table using SQL.


# 3-1. Create External Table

In [None]:
# Define a CSV-backed table whose data lives at external_table_path in Object Storage.
# The DDL uses external_table_path (not oci_file_path) so the table always points at
# the same location that the later "write to external table" cell writes to.
create_table_sql = (
    f"CREATE TABLE IF NOT EXISTS {catalog_name}.{schema_name}.{external_table_name} "
    "(name STRING, age INT) USING CSV "
    f"OPTIONS (path='{external_table_path}',delimiter=',',header='true')"
)
# Echo the statement so the resolved names and path can be checked.
print(create_table_sql)
spark.sql(create_table_sql)
print(f"External table '{catalog_name}.{schema_name}.{external_table_name}' created.")


# 3-2. Query External Table

In [1]:
# Query the external table through SQL using its fully qualified name.
select_sql = f"SELECT * FROM {catalog_name}.{schema_name}.{external_table_name}"
df_table = spark.sql(select_sql)
print("Data read from external table:")
df_table.show()

# 3-3. Write Data to External Table

In [1]:
# Replace the files at the external table's location with the sample DataFrame.
# A plain overwrite is all a CSV write needs; the CSV writer has no
# schema-evolution options.
(
    df.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save(external_table_path)
)
print(f"Data written to external table location {external_table_path}")

# The files were replaced by path, outside the table, so the file listing Spark
# cached when the table was queried earlier may be stale. Refresh it before
# reading through the table again.
spark.sql(f"REFRESH TABLE {catalog_name}.{schema_name}.{external_table_name}")

# Read from External Table
df_table = spark.sql(f"SELECT * FROM {catalog_name}.{schema_name}.{external_table_name}")
print("Data read from external table:")
df_table.show()

In [1]:
# Cleanup: drop the external table created in step 3-1.
table_to_drop = f"{catalog_name}.{schema_name}.{external_table_name}"
drop_table_sql = f"drop table {table_to_drop}"
spark.sql(drop_table_sql)
print(f"External table '{external_table_name}' dropped.")