In [6]:

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll")
conf.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.coll")

spark = SparkSession.builder \
    .appName("test-mongo") \
    .master('local[*]') \
    .config(conf=conf) \
    .getOrCreate()


In [7]:

people = spark.createDataFrame([
    ("Bilbo Baggins",  50), ("Gandalf", 1000), ("Thorin", 195),
    ("Balin", 178), ("Kili", 77), ("Dwalin", 169), ("Oin", 167),
    ("Gloin", 158), ("Fili", 82), ("Bombur", None)], ["name", "age"])

people.show()


+-------------+----+
|         name| age|
+-------------+----+
|Bilbo Baggins|  50|
|      Gandalf|1000|
|       Thorin| 195|
|        Balin| 178|
|         Kili|  77|
|       Dwalin| 169|
|          Oin| 167|
|        Gloin| 158|
|         Fili|  82|
|       Bombur|null|
+-------------+----+



In [8]:

print("WRITE/READ")

people.write.format("com.mongodb.spark.sql.DefaultSource").mode("append")\
    .option("database", "people") \
    .option("collection", "contacts") \
    .save()

people.printSchema()

df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
    .load()

df.show()


WRITE/READ
root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+--------------------+----+-------------+
|                 _id| age|         name|
+--------------------+----+-------------+
|[5cc6ac7d78312462...|  82|         Fili|
|[5cc6ac7d78312462...|null|       Bombur|
|[5cc6ac7d78312462...|  50|Bilbo Baggins|
|[5cc6ac7d78312462...| 195|       Thorin|
|[5cc6ac7d78312462...| 158|        Gloin|
|[5cc6ac7d78312462...|1000|      Gandalf|
|[5cc6ac7d78312462...| 167|          Oin|
|[5cc6ac7d78312462...| 178|        Balin|
|[5cc6ac7d78312462...|  77|         Kili|
|[5cc6ac7d78312462...| 169|       Dwalin|
|[5cc6bc4c78312462...| 167|          Oin|
|[5cc6bc4c78312462...|1000|      Gandalf|
|[5cc6bc4c78312462...| 169|       Dwalin|
|[5cc6bc4c78312462...| 195|       Thorin|
|[5cc6bc4c78312462...| 178|        Balin|
|[5cc6bc4c78312462...|  77|         Kili|
|[5cc6bc4c78312462...|  50|Bilbo Baggins|
|[5cc6bc4c78312462...|  82|         Fili|
|[5cc6bc4c78312462...|null|     

In [9]:

print("CHANGE COLLECTION TO people")

df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("uri", "mongodb://127.0.0.1/people.contacts")\
    .load()

df.show()


CHANGE COLLECTION
+--------------------+----+-------------+
|                 _id| age|         name|
+--------------------+----+-------------+
|[5cc6bc9578312462...|  82|         Fili|
|[5cc6bc9578312462...|null|       Bombur|
|[5cc6bc9578312462...| 167|          Oin|
|[5cc6bc9578312462...|  50|Bilbo Baggins|
|[5cc6bc9578312462...| 158|        Gloin|
|[5cc6bc9578312462...| 195|       Thorin|
|[5cc6bc9578312462...| 178|        Balin|
|[5cc6bc9578312462...|  77|         Kili|
|[5cc6bc9578312462...|1000|      Gandalf|
|[5cc6bc9578312462...| 169|       Dwalin|
|[5cc6bd1178312462...|1000|      Gandalf|
|[5cc6bd1178312462...|  50|Bilbo Baggins|
|[5cc6bd1178312462...| 178|        Balin|
|[5cc6bd1178312462...|  77|         Kili|
|[5cc6bd1178312462...| 167|          Oin|
|[5cc6bd1178312462...| 158|        Gloin|
|[5cc6bd1178312462...| 169|       Dwalin|
|[5cc6bd1178312462...| 195|       Thorin|
|[5cc6bd1178312462...|  82|         Fili|
|[5cc6bd1178312462...|null|       Bombur|
+---------------

In [12]:

print("AGGREGATION")

pipeline = "{'$match': {'age': { '$gte': 150 }}}"
df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("pipeline", pipeline)\
    .load()

df.show()


AGGREGATION
+--------------------+----+-------+
|                 _id| age|   name|
+--------------------+----+-------+
|[5cc6ac7d78312462...| 195| Thorin|
|[5cc6ac7d78312462...| 158|  Gloin|
|[5cc6ac7d78312462...|1000|Gandalf|
|[5cc6ac7d78312462...| 167|    Oin|
|[5cc6ac7d78312462...| 178|  Balin|
|[5cc6ac7d78312462...| 169| Dwalin|
|[5cc6bc4c78312462...| 167|    Oin|
|[5cc6bc4c78312462...|1000|Gandalf|
|[5cc6bc4c78312462...| 169| Dwalin|
|[5cc6bc4c78312462...| 195| Thorin|
|[5cc6bc4c78312462...| 178|  Balin|
|[5cc6bc4c78312462...| 158|  Gloin|
+--------------------+----+-------+



In [11]:

print("FILTER")
df.filter(df['age'] >= 1000)\
    .show()


FILTER
+--------------------+----+-------+
|                 _id| age|   name|
+--------------------+----+-------+
|[5cc6ac7d78312462...|1000|Gandalf|
|[5cc6bc4c78312462...|1000|Gandalf|
+--------------------+----+-------+



In [15]:

print("SQL")

df.createOrReplaceTempView("temp")
df = spark.sql("SELECT name, age FROM temp WHERE name LIKE '%Gandalf%'")
df.show()


SQL


Py4JJavaError: An error occurred while calling o291.showString.
: java.lang.IllegalStateException: SparkContext has been shutdown
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2053)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [14]:
spark.stop()
