In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Need postgres
# https://mvnrepository.com/artifact/org.postgresql/postgresql
from spark_libs import spark_submit
packages = ["com.databricks:spark-csv_2.11:1.5.0", 
            "org.postgresql:postgresql:42.2.9",
            "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1"]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell


In [10]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from spark_connections import postgresUrlProperties, createMongoURI

In [4]:
# get or create Spark session

app_name = "spark-mongo-postgres-read"
spark = SparkSession.builder.appName(app_name).getOrCreate()

## Load from MongoDB

In [9]:
mongo_connection = {
    "hostname": "host.docker.internal",
    "port": "27017"
}
mongo_database = "customers"
sample_size = 1000 # how many rows to use to determine the schema

collection = "customer_data"
mongoURI = createMongoURI(mongo_connection, mongo_database, collection)
print(mongoURI)

df_customer_data = spark.read \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri",mongoURI) \
    .option("sampleSize", sample_size) \
    .load()
df_customer_data.printSchema()

mongodb://host.docker.internal:27017/customers.customer_data
root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- car: string (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- last_name: string (nullable = true)



## Load from Postgres

In [8]:
postgres_connection = {
    "host": "host.docker.internal",
    "port": "5433", # should be 5432 for you
    "database": "customers",
    "dialect": "postgresql",
    "username": "postgres",
    "password": "changeme"
}

postgres_url, postgres_props = postgresUrlProperties(postgres_connection)
print(postgres_url, "\n", postgres_props)
table = "customer_location"

df_customer_location = spark.read.jdbc(url=postgres_url, table=table, properties=postgres_props)
df_customer_location.printSchema()

jdbc:postgresql://host.docker.internal:5433/customers 
 {'user': 'postgres', 'password': 'changeme', 'driver': 'org.postgresql.Driver'}
root
 |-- _corrupt_record: string (nullable = true)
 |-- address: string (nullable = true)
 |-- id: long (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- us_state: string (nullable = true)

