In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Extra JVM packages for Spark, given as Maven coordinates
# (coordinates can be looked up at
#  https://mvnrepository.com/artifact/org.postgresql/postgresql).
# spark_submit() publishes them through PYSPARK_SUBMIT_ARGS.
from spark_libs import spark_submit

packages = [
    "com.databricks:spark-csv_2.11:1.5.0",                 # CSV data source
    "org.postgresql:postgresql:42.2.9",                    # PostgreSQL JDBC driver
    "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1",  # MongoDB Spark connector
]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from spark_connections import postgresUrlProperties, createMongoURI

In [5]:
# Reuse the active Spark session if one exists; otherwise create a new one
# under this application name.
app_name = "spark-mongo-postgres-save-parquet"

spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

## Load from MongoDB

In [7]:
# Where the MongoDB instance lives and which database/collection to read.
mongo_connection = dict(hostname="host.docker.internal", port="27017")
mongo_database = "users"
collection = "user_data"
sample_size = 1000  # number of documents sampled to infer the schema

# Build the connection URI for the collection and show it for reference.
mongoURI = createMongoURI(mongo_connection, mongo_database, collection)
print(mongoURI)

# Load the collection into a DataFrame through the MongoDB Spark connector,
# then display the schema that was inferred from the sampled documents.
df_user_data = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongoURI)
    .option("sampleSize", sample_size)
    .load()
)
df_user_data.printSchema()

mongodb://host.docker.internal:27017/users.user_data
root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- active_user: boolean (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- username: string (nullable = true)



## Save data as Parquet

Parquet is an open-source file format for Hadoop. It stores nested data structures in a flat columnar format. Compared with the traditional row-oriented approach to storing data, Parquet is more efficient in terms of both storage and performance.

In [8]:
# Write the MongoDB user data out as Parquet at the path "user_data.parquet".
#
# Save modes accepted by DataFrameWriter.mode():
#   "error" / "errorifexists" - fail if the target already exists (default)
#   "append"                  - add the rows to any existing data
#   "overwrite"               - replace any existing data
#   "ignore"                  - silently skip the write if the target exists
save_mode = "overwrite"

# The original chain also called .options() with no arguments, which sets
# nothing; it has been dropped as dead code.
df_user_data.write \
    .format("parquet") \
    .mode(save_mode) \
    .save("user_data.parquet")

In [9]:
# Read the Parquet data back and print its schema to confirm that the
# structure written above survived the round trip.
df = spark.read.load(path="user_data.parquet", format="parquet")
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- active_user: boolean (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- street_address: string (nullable = true)
 |-- username: string (nullable = true)

