# Reproduction of mounting an S3 bucket to Databricks

- Mounting creates a link between a Databricks workspace and cloud object storage, which lets you
interact with cloud object storage using familiar file paths relative to the Databricks file system.
- To open a new notebook: `New > Notebook`
- Ref: [Databricks notebook](https://dbc-b54c5c54-233d.cloud.databricks.com/?o=1865928197306450#notebook/627262318697111/command/627262318697120)

In [None]:
# List the contents of the FileStore tables folder; the CSV files you
# uploaded earlier should now appear in the output.
dbutils.fs.ls('/FileStore/tables')

In [None]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing (the `parse` submodule must be imported explicitly;
# `import urllib` alone does not guarantee `urllib.parse` is loaded)
import urllib
import urllib.parse

In [None]:
# Reader settings for the credentials file:
#   - the file is a CSV,
#   - its first row holds the column names,
#   - fields are separated by a comma (the delimiter).
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Load the CSV file into a Spark dataframe
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

aws_keys_df

In [None]:
# Get the AWS access key and secret key from the Spark dataframe.
# Both columns are fetched with a single query, so only one Spark job runs
# and both values are taken from the same row.
CREDENTIALS_USER = 'databricks-user'
credentials_row = aws_keys_df.where(col('User name') == CREDENTIALS_USER) \
                             .select('Access key ID', 'Secret access key') \
                             .first()

# first() returns None when no row matches; fail with a clear message
# instead of an opaque IndexError from indexing an empty collect() result.
if credentials_row is None:
    raise ValueError(
        f"No credentials row found for user name '{CREDENTIALS_USER}'"
    )

ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']

# URL-encode the secret key; safe="" means "/" is percent-encoded too,
# so the key can be embedded in a URL without being split on "/".
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
# CredentialsReader comes from the project-local `database_utils` module.
from database_utils import CredentialsReader

# Load the credentials named 'credentials'. The result is indexed by string
# key (e.g. 'IAM_USER_NAME') further down the notebook.
# NOTE(review): presumably this reads a local credentials file and returns a
# dict - confirm against database_utils.
db_creds = CredentialsReader.read_db_creds('credentials')

In [None]:
# Mounting creates a link between a workspace and cloud object storage,
# which enables you to interact with cloud object storage using familiar
# file paths relative to the Databricks file system.

IAM_USER_NAME = db_creds['IAM_USER_NAME']

# AWS S3 bucket name
AWS_S3_BUCKET = f'user-{IAM_USER_NAME}-bucket'
# Mount name for the bucket
MOUNT_NAME = f'/mnt/{IAM_USER_NAME}-mount'
# Source URL with the access key and URL-encoded secret key embedded.
# NOTE(review): the s3n scheme is legacy; current Databricks examples use
# s3a - confirm before switching.
SOURCE_URL = f"s3n://{ACCESS_KEY}:{ENCODED_SECRET_KEY}@{AWS_S3_BUCKET}"

# Mount the drive only once: dbutils.fs.mount raises if the mount point is
# already in use, so skip the call when it is already registered. This keeps
# the cell safe to re-run.
if not any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

# To unmount, run: dbutils.fs.unmount(MOUNT_NAME)

In [None]:
# Check if the S3 bucket was mounted successfully by listing, in order,
# the directory two levels above the mount point, its parent directory,
# and the mount point itself.
for mount_path in (
    f'{MOUNT_NAME}/../..',
    f'{MOUNT_NAME}/..',
    f'{MOUNT_NAME}/',
):
    display(dbutils.fs.ls(mount_path))

In [None]:
# Read the JSON format dataset from S3 into Databricks
# S3 filepath to pin topic:
# s3://user-<IAM_USER_NAME>-bucket/topics/<IAM_USER_NAME>.pin/partition=0/

# Reader settings: JSON input, and ask Spark to infer the schema
file_type = "json"
infer_schema = "true"
# The asterisk (*) selects every file with a .json extension in the folder
file_location = f'{MOUNT_NAME}/topics/{IAM_USER_NAME}.pin/partition=0/*.json'

# Read in the JSON files from the mounted S3 bucket
df_pin = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
# Display the Spark dataframe to check its content
display(df_pin)

In [None]:
# S3 filepath to geo topic:
# s3://user-<IAM_USER_NAME>-bucket/topics/<IAM_USER_NAME>.geo/partition=0/

# Reader settings: JSON input, and ask Spark to infer the schema
file_type = "json"
infer_schema = "true"
# Every .json file in the geo topic's partition folder
file_location = f'{MOUNT_NAME}/topics/{IAM_USER_NAME}.geo/partition=0/*.json'

# Read in the JSON files from the mounted S3 bucket
df_geo = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Display the Spark dataframe to check its content
display(df_geo)

In [None]:
# S3 filepath to user topic:
# s3://user-<IAM_USER_NAME>-bucket/topics/<IAM_USER_NAME>.user/partition=0/

# Reader settings: JSON input, and ask Spark to infer the schema
file_type = "json"
infer_schema = "true"
# Every .json file in the user topic's partition folder
file_location = f'{MOUNT_NAME}/topics/{IAM_USER_NAME}.user/partition=0/*.json'

# Read in the JSON files from the mounted S3 bucket
df_user = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Display the Spark dataframe to check its content
display(df_user)