## Docs

MongoDB Spark connector

https://docs.mongodb.com/spark-connector/master/python-api/

In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Maven coordinates of the extra JVM packages Spark should load:
# the spark-csv reader and the MongoDB Spark connector.
# NOTE(review): the earlier comment here said "Need postgres" and linked
# https://mvnrepository.com/artifact/org.postgresql/postgresql, but no
# PostgreSQL driver is in the list below -- confirm whether one is still needed.
from spark_libs import spark_submit

packages = [
    "com.databricks:spark-csv_2.11:1.5.0",
    "org.mongodb.spark:mongo-spark-connector_2.11:2.4.1",
]

# Per the cell output, this exports PYSPARK_SUBMIT_ARGS with a --packages list;
# presumably it has to run before the SparkSession is created to take effect.
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [4]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under this application name.
app_name = "spark-mongodb"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [5]:
user_payment_file = "../Resources/user_payment.csv"
user_data_file = "../Resources/user_data.csv"

# Register both CSVs with the SparkContext so they can be resolved by file
# name through SparkFiles.get() below.
for local_path in (user_payment_file, user_data_file):
    spark.sparkContext.addFile(local_path)


def _read_registered_csv(file_name):
    """Load a CSV previously registered via addFile into a DataFrame.

    The first row is treated as the header and column types are inferred.
    """
    reader = spark.read.format("com.databricks.spark.csv")
    reader = reader.options(header='true', inferSchema="true")
    return reader.load(SparkFiles.get(file_name))


df_payment = _read_registered_csv("user_payment.csv")
df_payment.printSchema()


df_data = _read_registered_csv("user_data.csv")
df_data.printSchema()

root
 |-- billing_id: integer (nullable = true)
 |-- username: string (nullable = true)
 |-- cc_encrypted: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- active_user: boolean (nullable = true)
 |-- street_address: string (nullable = true)
 |-- state: string (nullable = true)
 |-- username: string (nullable = true)



In [7]:
# Connection settings consumed by createMongoURI: `hostname` and `port` are
# required; `username`, `password` and `replica` are optional keys it also reads.
connection = dict(
    hostname="host.docker.internal",
    port="27017",
)

# Name of the MongoDB database the collections are written to.
database = "users"

In [8]:
# Mongo credentials
def createMongoURI(connection, database, collection):
    """Build a MongoDB URI of the form ``mongodb://[user:pass@]hosts/db.collection``.

    Args:
        connection: Mapping with the required keys ``hostname`` (one host, or
            several separated by commas) and ``port``, plus the optional keys
            ``username``, ``password`` and ``replica`` (replica-set name).
        database: Database name placed in the URI path.
        collection: Collection name, appended to the database as
            ``<database>.<collection>``.

    Returns:
        The URI string. ``?replicaSet=<replica>`` is appended when ``replica``
        is set; credentials are included only when both username and password
        are non-empty.

    Raises:
        KeyError: If ``hostname`` or ``port`` is missing from ``connection``.

    Note:
        Pass raw (not already percent-encoded) credentials: they are
        percent-encoded here so characters such as ``@``, ``:`` and ``/``
        cannot corrupt the URI.
    """
    # Local import keeps this cell self-contained on a fresh kernel.
    from urllib.parse import quote_plus

    mongoHostname = connection['hostname']
    mongoPort = connection['port']
    mongoDatabase = database
    mongoCollection = collection
    mongoUsername = connection.get('username')
    mongoPassword = connection.get('password')
    mongoReplica = connection.get('replica')

    def _withPort(host):
        # Keep an explicit "host:port"; a bracketed IPv6 literal such as
        # "[::1]" contains colons but still has no port of its own.
        hasPort = ":" in host and not host.endswith("]")
        return host if hasPort else f"{host}:{mongoPort}"

    baseURI = "mongodb://"
    if bool(mongoUsername) and bool(mongoPassword):
        # Reserved characters in credentials must be percent-encoded.
        baseURI += f"{quote_plus(str(mongoUsername))}:{quote_plus(str(mongoPassword))}@"

    # Tolerate whitespace and stray commas in the comma-separated host list.
    mongoHosts = ",".join(
        _withPort(host.strip())
        for host in mongoHostname.split(",")
        if host.strip()
    )

    mongoURI = f"{baseURI}{mongoHosts}/{mongoDatabase}.{mongoCollection}"
    if bool(mongoReplica):
        mongoURI = f"{mongoURI}?replicaSet={mongoReplica}"
    return mongoURI

In [9]:
# Preview the URI generated for each target collection. After the loop,
# `collection` and `mongoURI` hold the values for "user_payment".
for collection in ("user_data", "user_payment"):
    mongoURI = createMongoURI(connection, database, collection)
    print(mongoURI)

mongodb://host.docker.internal:27017/users.user_data
mongodb://host.docker.internal:27017/users.user_payment


## Loading into Mongo

When updating, the default behavior is to replace the original document in MongoDB. Example:


Starting collection (a single document):
```
{
 "_id":1, 
 "age": 10,
 "foo": 20
}
```

If we have a Spark dataframe like

```
_id | val
 1  | 'a'
 2  | 'b'
```

If replaceDocument = True (default) then you get

```
[   
    {
     "_id":1, 
     "val": 'a'
    },
   {
     "_id":2, 
     "val": 'b'
    }
]
```

replaceDocument = False gives you

```
[   
    {
     "_id":1, 
     "age": 10,
     "foo": 20,
     "val": 'a'
    },
   {
     "_id":2, 
     "val": 'b'
    }
]
```

In [12]:
# Spark save mode for the write (the options noted here are: error, append, overwrite).
mode = "overwrite"
# Forwarded to the connector as `replaceDocument`; False is intended to stop
# existing documents from being replaced wholesale on update.
# NOTE(review): with mode="overwrite" the collection is presumably rewritten
# entirely, so this flag would only matter for "append" -- confirm.
replaceDocument = False

collection = "user_data"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

# Write the user data DataFrame to MongoDB through the Spark connector.
(
    df_data.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode(mode)
    .option("uri", mongoURI)
    .option("collection", collection)
    .option("replaceDocument", replaceDocument)
    .save()
)

mongodb://host.docker.internal:27017/users.user_data


In [13]:
# Spark save mode for the write (the options noted here are: error, append, overwrite).
mode = "overwrite"
# Forwarded to the connector as `replaceDocument`; False is intended to stop
# existing documents from being replaced wholesale on update.
# NOTE(review): with mode="overwrite" the collection is presumably rewritten
# entirely, so this flag would only matter for "append" -- confirm.
replaceDocument = False

collection = "user_payment"
mongoURI = createMongoURI(connection, database, collection)
print(mongoURI)

# Write the payment DataFrame to MongoDB through the Spark connector.
(
    df_payment.write
    .format("com.mongodb.spark.sql.DefaultSource")
    .mode(mode)
    .option("uri", mongoURI)
    .option("collection", collection)
    .option("replaceDocument", replaceDocument)
    .save()
)

mongodb://host.docker.internal:27017/users.user_payment
