In [None]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing (`urllib.parse` is imported explicitly: `import urllib` alone
# does not guarantee that the `parse` submodule is loaded)
import urllib
import urllib.parse
import pandas as pd

## Read the credentials and prepare to connect to S3 Bucket

In [None]:
# The credentials file is a CSV ...
file_type = "csv"
# ... whose first row is the header ...
first_row_is_header = "true"
# ... and whose fields are separated by commas
delimiter = ","
# Read the CSV file into a Spark DataFrame
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

# Fetch both keys for the 'databricks-user' row with a single Spark job
# (the same filter used to be collected twice, once per key).
# NOTE(review): consider a Databricks secret scope instead of a CSV stored in DBFS.
credentials_row = (
    aws_keys_df
    .where(col('User name') == 'databricks-user')
    .select('Access key ID', 'Secret access key')
    .first()
)
# first() returns None when no row matches; fail with an explicit message
# instead of the opaque IndexError that collect()[0] would raise.
if credentials_row is None:
    raise ValueError(
        "No row with 'User name' == 'databricks-user' found in "
        "/FileStore/tables/authentication_credentials.csv"
    )
ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key (safe="" also escapes '/') so it can be embedded in a URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

## Connect and Mount S3 Bucket to Databricks File System

In [None]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-0a1d8948160f-bucket"
# Mount point for the bucket inside the Databricks File System
MOUNT_NAME = "/mnt/pin_pipe"
# Source URL with the access key and the URL-encoded secret key embedded
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the bucket only when the mount point is not already in use:
# dbutils.fs.mount raises if the directory is already mounted, which made this
# cell fail whenever the notebook was re-run.
if not any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# Directory of each topic (pin, geo, user) under the mounted bucket
pin_data_location = MOUNT_NAME + "/topics/0a1d8948160f.pin/partition=0/"
geo_data_location = MOUNT_NAME + "/topics/0a1d8948160f.geo/partition=0/"
user_data_location = MOUNT_NAME + "/topics/0a1d8948160f.user/partition=0/"
# Show the file listing of every topic directory, in pin / geo / user order
for topic_location in (pin_data_location, geo_data_location, user_data_location):
    display(dbutils.fs.ls(topic_location))


## Read the JSON data files for the pin, geo, and user topics

In [None]:
def prepare_dataframe(data_location, infer_schema="true"):
    """Load every file listed under ``data_location`` as JSON into one Spark DataFrame.

    Args:
        data_location: Directory passed to ``dbutils.fs.ls``; the path of every
            entry it returns is handed to the reader.
        infer_schema: Value for the reader's ``inferSchema`` option. Defaults to
            ``"true"``. This used to be read from a global ``infer_schema`` that
            the notebook never defines, so the call raised ``NameError``.

    Returns:
        The DataFrame produced by ``spark.read`` over all listed paths.
    """
    file_paths = [entry.path for entry in dbutils.fs.ls(data_location)]
    return (
        spark.read
        .format('json')
        .option("inferSchema", infer_schema)
        .load(file_paths)
    )

In [None]:
# Build one DataFrame per topic, in pin / geo / user order
df_pin, df_geo, df_user = [
    prepare_dataframe(topic_dir)
    for topic_dir in (pin_data_location, geo_data_location, user_data_location)
]

In [None]:
# Render the three topic DataFrames one after another
for topic_df in (df_pin, df_geo, df_user):
    display(topic_df)