In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Extra JVM packages (Maven coordinates) handed to Spark at start-up.
# The PostgreSQL driver is listed at:
# https://mvnrepository.com/artifact/org.postgresql/postgresql
from spark_libs import spark_submit

packages = [
    "com.databricks:spark-csv_2.11:1.5.0",
    "org.postgresql:postgresql:42.2.9",
    "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1",
]

# Per the cell output, this sets PYSPARK_SUBMIT_ARGS with a --packages list.
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from spark_connections import postgresUrlProperties, createMongoURI

In [4]:
# Reuse the active Spark session if there is one; otherwise create it
# under the application name below.
app_name = "spark-mongo-postgres-save-file"

spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

## Load from MongoDB

In [13]:
# Where the MongoDB instance lives and which database/collection to read.
mongo_connection = {
    "hostname": "host.docker.internal",
    "port": "27017",
}
mongo_database = "customers"
collection = "customer_data"

# how many rows to use to determine the schema
sample_size = 1000

# Build the connection URI from the settings above and show it.
mongoURI = createMongoURI(mongo_connection, mongo_database, collection)
print(mongoURI)

# Load the collection through the MongoDB Spark connector, then display
# the schema of the resulting DataFrame.
df_customer_data = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .options(uri=mongoURI, sampleSize=sample_size)
    .load()
)
df_customer_data.printSchema()

mongodb://host.docker.internal:27017/customers.customer_data
root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- car: string (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)



## Save as CSV

In [14]:
# CSV output requires flat columns, so replace the `_id` struct with its
# inner `oid` string before writing.
df_customer_data_flat = df_customer_data.withColumn(
    "_id",
    F.col("_id.oid"),
)

In [15]:
# What to do when the target path already exists.
# options: error, append, overwrite
save_mode = "overwrite"

# Write the flattened DataFrame as CSV with a header row.
# Other writer options that could be added here, e.g. codec="gzip"
(
    df_customer_data_flat.write
    .mode(save_mode)
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .save("customer_data.csv")
)

In [16]:
# Show the first two lines (header + first record) of the CSV file(s)
# inside the customer_data.csv output directory.
!head -n 2 customer_data.csv/*csv

_id,car,email,first_name,gender,id,last_name
5df292055be4f706164a5324,Scion,bcancott0@studiopress.com,Benetta,Female,1,Cancott


## Save as JSON

JSON can store complex (nested) column types, so the DataFrame can be saved without flattening it first.

In [23]:
# What to do when the target path already exists.
# options: error, append, overwrite
save_mode = "overwrite"

# Write the original (nested) DataFrame as JSON.
# .options() is an empty placeholder; writer options go there,
# e.g. compression="gzip"
(
    df_customer_data.write
    .mode(save_mode)
    .format("json")
    .options()
    .save("customer_data.json")
)

In [25]:
# Show the first two records of the JSON file(s) inside the
# customer_data.json output directory.
!head -n 2 customer_data.json/*json

{"_id":{"oid":"5df292055be4f706164a5324"},"car":"Scion","email":"bcancott0@studiopress.com","first_name":"Benetta","gender":"Female","id":1,"last_name":"Cancott"}
{"_id":{"oid":"5df292055be4f706164a5325"},"car":"Chrysler","email":"lcherry1@deliciousdays.com","first_name":"Lilyan","gender":"Female","id":2,"last_name":"Cherry"}


## Save data as Parquet

Parquet is an open source file format for Hadoop. It stores nested data structures in a flat columnar format. Compared to the traditional row-oriented approach to storing data, Parquet is more efficient in terms of storage and performance.

In [26]:
# What to do when the target path already exists.
# options: error, append, overwrite
save_mode = "overwrite"

# Write the original (nested) DataFrame as Parquet.
# .options() is an empty placeholder for writer options.
(
    df_customer_data.write
    .mode(save_mode)
    .format("parquet")
    .options()
    .save("customer_data.parquet")
)

In [27]:
# Read the Parquet output back and print its schema to check that the
# nested `_id` struct is still present.
df = spark.read.load("customer_data.parquet", format="parquet")
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- car: string (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)



## Save as ORC

The Optimized Row Columnar (ORC) file format provides a highly efficient way to store Hive data. It was designed to overcome limitations of the other Hive file formats. Using ORC files improves performance when Hive is reading, writing, and processing data.

In [28]:
# What to do when the target path already exists.
# options: error, append, overwrite
save_mode = "overwrite"

# Write the original (nested) DataFrame as ORC.
# .options() is an empty placeholder for writer options.
(
    df_customer_data.write
    .mode(save_mode)
    .format("orc")
    .options()
    .save("customer_data.orc")
)

In [29]:
# Read the ORC output back and print its schema to check that the
# nested `_id` struct is still present.
df = spark.read.load("customer_data.orc", format="orc")
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- car: string (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)

