In [9]:

from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/test.coll")
conf.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.coll")

spark = SparkSession.builder \
    .appName("test-mongo") \
    .master('local[*]') \
    .config(conf=conf) \
    .getOrCreate()


In [10]:

people = spark.createDataFrame([
    ("Bilbo Baggins",  50), ("Gandalf", 1000), ("Thorin", 195),
    ("Balin", 178), ("Kili", 77), ("Dwalin", 169), ("Oin", 167),
    ("Gloin", 158), ("Fili", 82), ("Bombur", None)], ["name", "age"])

people.show()


+-------------+----+
|         name| age|
+-------------+----+
|Bilbo Baggins|  50|
|      Gandalf|1000|
|       Thorin| 195|
|        Balin| 178|
|         Kili|  77|
|       Dwalin| 169|
|          Oin| 167|
|        Gloin| 158|
|         Fili|  82|
|       Bombur|null|
+-------------+----+



In [14]:

print("WRITE")

people.write.format("com.mongodb.spark.sql.DefaultSource").mode("append")\
    .option("database", "people") \
    .option("collection", "contacts") \
    .save()

people.printSchema()


WRITE
root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [17]:

print("READ")

df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("uri", "mongodb://127.0.0.1/people.contacts")\
    .load()

df.show()


READ
+--------------------+----+-------------+
|                 _id| age|         name|
+--------------------+----+-------------+
|[5cde6e7860462413...|1000|      Gandalf|
|[5cde6e7860462413...| 158|        Gloin|
|[5cde6e7860462413...|  82|         Fili|
|[5cde6e7860462413...|null|       Bombur|
|[5cde6e7860462413...| 167|          Oin|
|[5cde6e7860462413...| 178|        Balin|
|[5cde6e7860462413...|  77|         Kili|
|[5cde6e7860462413...| 169|       Dwalin|
|[5cde6e7860462413...|  50|Bilbo Baggins|
|[5cde6e7860462413...| 195|       Thorin|
|[5cde6ea960462413...| 169|       Dwalin|
|[5cde6ea960462413...| 158|        Gloin|
|[5cde6ea960462413...| 167|          Oin|
|[5cde6ea960462413...|1000|      Gandalf|
|[5cde6ea960462413...| 178|        Balin|
|[5cde6ea960462413...|  77|         Kili|
|[5cde6ea960462413...|  82|         Fili|
|[5cde6ea960462413...|null|       Bombur|
|[5cde6ea960462413...|  50|Bilbo Baggins|
|[5cde6ea960462413...| 195|       Thorin|
+--------------------+----+--

In [19]:

print("AGGREGATION")

pipeline = "{'$match': {'age': { '$gte': 150 }}}"
df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("uri", "mongodb://127.0.0.1/people.contacts")\
    .option("pipeline", pipeline)\
    .load()

df.show()


AGGREGATION
+--------------------+----+-------+
|                 _id| age|   name|
+--------------------+----+-------+
|[5cde6e7860462413...|1000|Gandalf|
|[5cde6e7860462413...| 158|  Gloin|
|[5cde6e7860462413...| 167|    Oin|
|[5cde6e7860462413...| 178|  Balin|
|[5cde6e7860462413...| 169| Dwalin|
|[5cde6e7860462413...| 195| Thorin|
|[5cde6ea960462413...| 169| Dwalin|
|[5cde6ea960462413...| 158|  Gloin|
|[5cde6ea960462413...| 167|    Oin|
|[5cde6ea960462413...|1000|Gandalf|
|[5cde6ea960462413...| 178|  Balin|
|[5cde6ea960462413...| 195| Thorin|
+--------------------+----+-------+



In [21]:

print("FILTER")
df.filter(df['age'] >= 1000)\
    .show()


FILTER
+--------------------+----+-------+
|                 _id| age|   name|
+--------------------+----+-------+
|[5cde6e7860462413...|1000|Gandalf|
|[5cde6ea960462413...|1000|Gandalf|
+--------------------+----+-------+



In [22]:

print("SQL")

df.createOrReplaceTempView("temp")
df = spark.sql("SELECT name, age FROM temp WHERE name LIKE '%Gandalf%'")
df.show()


SQL
+-------+----+
|   name| age|
+-------+----+
|Gandalf|1000|
|Gandalf|1000|
+-------+----+



In [6]:
spark.stop()
