# Example: Reading from and Writing to Cassandra with Spark

Documentation: https://github.com/datastax/spark-cassandra-connector/blob/master/doc/15_python.md


In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Cassandra connection settings.
# Host name of the Cassandra node that the rest of the notebook connects to.
cassandra_host: str = "cassandra"


In [3]:
# Create (or reuse) a local SparkSession configured for Cassandra access:
#   - spark.cassandra.connection.host points the connector at cassandra_host
#   - spark.jars.packages names the Spark Cassandra Connector assembly to load
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .config("spark.cassandra.connection.host", cassandra_host)
    .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict its logging to errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [4]:
# Load the sample stock quotes from the local JSON file
# (the reader's multiline option is switched on for this file).
df = (
    spark.read
    .option("multiline","true")
    .json("/home/jovyan/datasets/json-samples/stocks.json")
)
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [5]:
# WE NEED A TABLE BEFORE WE CAN WRITE, Using Plain old Python
!pip install -q cassandra-driver
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute("CREATE KEYSPACE IF NOT EXISTS example WITH replication={ 'class': 'SimpleStrategy', 'replication_factor' : 1 };")
    session.execute("CREATE TABLE IF NOT EXISTS example.stocks (symbol text, price decimal, primary key (symbol));")



In [6]:
# Save the DataFrame into the example.stocks Cassandra table.
# Append mode is okay here because of Cassandra's default upsert behavior.
(
    df.write
    .format("org.apache.spark.sql.cassandra")
    .mode("Append")
    .option("table", "stocks")
    .option("keyspace","example")
    .save()
)

In [7]:
# Load the rows back out of the example.stocks Cassandra table and display them.
df1 = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="stocks", keyspace="example")
    .load()
)
df1.toPandas()

Unnamed: 0,symbol,price
0,MSFT,212.55
1,NFLX,497.0
2,NET,78.0
3,TSLA,823.8
4,TWTR,45.11
5,IBM,128.39
6,AMZN,3098.12
7,GOOG,1725.05
8,FB,251.11
9,AAPL,126.82
