In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Need postgres
# https://mvnrepository.com/artifact/org.postgresql/postgresql
from spark_libs import spark_submit
packages = ["com.databricks:spark-csv_2.11:1.5.0", 
            "org.postgresql:postgresql:42.2.9",
            "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1"]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.postgresql:postgresql:42.2.9,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [4]:
# get or create Spark session

app_name = "spark-mongo-postgres"
spark = SparkSession.builder.appName(app_name).getOrCreate()

In [8]:
customer_location_file ="../Resources/customer_location.json"
customer_data_file ="../Resources/customer_data.csv"

spark.sparkContext.addFile(customer_location_file)
spark.sparkContext.addFile(customer_data_file)

df_location = spark.read \
    .format("json") \
    .load(SparkFiles.get("customer_location.json"))
df_location.printSchema()


df_data = spark.read \
    .format("com.databricks.spark.csv") \
    .options(header='true', inferSchema="true") \
    .load(SparkFiles.get("customer_data.csv"))
df_data.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- address: string (nullable = true)
 |-- id: long (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- us_state: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- car: string (nullable = true)



## Postgres URL

In [14]:
def postgresUrlProperties(connection):
    # Postgres credentials
    jdbcHostname = connection['host']
    jdbcPort = connection['port']
    jdbcDatabase = connection['database']
    dialect = connection['dialect']
    jdbcUsername = connection['username']
    jdbcPassword = connection['password']
    
    jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"
    # for mysql driver = com.mysql.jdbc.Driver
    connectionProperties = {
      "user" : jdbcUsername,
      "password" : jdbcPassword,
      "driver" : "org.postgresql.Driver" 
    }
    return (jdbcUrl, connectionProperties)

In [15]:
connection = {
    "host": "host.docker.internal",
    "port": "5433", # should be 5432 for you
    "database": "customers",
    "dialect": "postgresql",
    "username": "postgres",
    "password": "changeme"
}

url, props = postgresUrlProperties(connection)
print(url, "\n", props)

jdbc:postgresql://host.docker.internal:5433/customers 
 {'user': 'postgres', 'password': 'changeme', 'driver': 'org.postgresql.Driver'}


In [17]:
table = "customer_data"
mode = "overwrite" # options are: error, append, overwrite

df_data.write.jdbc(url, table, mode, props)

In [19]:
table = "customer_location"
mode = "overwrite" # options are: error, append, overwrite

df_location.write.jdbc(url, table, mode, props)

## MongoDB

In [20]:
# Mongo credentials
def createMongoURI(connection, database, collection):
    mongoHostname = connection['hostname']
    mongoPort = connection['port']
    mongoDatabase = database
    mongoCollection = collection
    mongoUsername = connection.get('username')
    mongoPassword = connection.get('password')
    mongoReplica = connection.get('replica')
    
    baseURI = "mongodb://"
    if bool(mongoUsername) and bool(mongoPassword):
        baseURI += f"{mongoUsername}:{mongoPassword}@"
    mongoHosts = ",".join([
        f"{host}:{mongoPort}"
        for host in mongoHostname.split(",")
    ])
    mongoURI = f"{baseURI}{mongoHosts}/{mongoDatabase}"
    if bool(mongoReplica):
        mongoURI = f"{mongoURI}.{mongoCollection}?replicaSet={mongoReplica}"
    else:
        mongoURI = f"{mongoURI}.{mongoCollection}"
    return mongoURI

In [21]:
connection = {
    "hostname": "host.docker.internal",
    "port": "27017"
}
database = "customers"

In [22]:
collection = "customer_data"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

collection = "customer_location"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

mongodb://host.docker.internal:27017/customers.customer_data
mongodb://host.docker.internal:27017/customers.customer_location


In [23]:
mode = "overwrite" # options are: error, append, overwrite
replaceDocument = False # this is for updates so documents don't get replaced on update

collection = "customer_data"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

df_data \
    .write \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode(mode) \
    .option("uri",mongoURI) \
    .option("collection", collection) \
    .option("replaceDocument", replaceDocument) \
    .save()

mongodb://host.docker.internal:27017/customers.customer_data


In [24]:
mode = "overwrite" # options are: error, append, overwrite
replaceDocument = False # this is for updates so documents don't get replaced on update

collection = "customer_location"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

df_location \
    .write \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode(mode) \
    .option("uri",mongoURI) \
    .option("collection", collection) \
    .option("replaceDocument", replaceDocument) \
    .save()

mongodb://host.docker.internal:27017/customers.customer_location
