In [None]:
-- Set up the Cassandra keyspace, table and seed rows used by the tests below.
-- Run these statements in cqlsh; they are CQL, not Python.
-- The CREATE statements use IF NOT EXISTS, so the script can be re-run safely.

-- Keyspace with a replication factor of 1.
CREATE KEYSPACE IF NOT EXISTS test
   WITH REPLICATION =
   { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
USE test;

-- Test table keyed by id.
CREATE TABLE IF NOT EXISTS testing123 (id int, name text, city text, PRIMARY KEY (id));

-- Two sample rows that are read back later in the notebook.
INSERT INTO testing123 (id, name, city) VALUES (1, 'Amanda', 'Bay Area');
INSERT INTO testing123 (id, name, city) VALUES (2, 'Toby', 'NYC');

In [None]:
# Configure Spark to include DataStax Cassandra:
# https://www.datastax.com/blog/install-all-things-especially-apache-cassandra-apache-spark-and-jupyter
#cd $SPARK_HOME/conf
#vim spark-defaults.conf
# Add the following spark.jars.packages line to spark-defaults.conf:
#spark.jars.packages     com.datastax.spark:spark-cassandra-connector_2.12:3.0.0

In [None]:
# Working standalone PySpark call for use with DataStax Cassandra Spark connector
# Check out: https://github.com/datastax/spark-cassandra-connector/blob/master/doc/15_python.md
#pyspark --conf spark.cassandra.connection.host=cassandra --packages com.datastax.spark:spark-cassandra-connector_2.12:3.0.0 --conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions

# See the following post for query examples:
# http://rustyrazorblade.com/post/2015/2015-07-30-python-dataframes-revisited/

In [1]:
# Configure how PySpark is launched: Cassandra contact host, the DataStax
# connector package, and the connector's Spark SQL extensions.
import os

# One fragment per option; joined with single spaces into the final argument
# string, which ends with the 'pyspark-shell' token.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--conf spark.cassandra.connection.host=cassandra',
    '--packages com.datastax.spark:spark-cassandra-connector_2.12:3.0.0',
    '--conf spark.sql.extensions=com.datastax.spark.connector.CassandraSparkExtensions',
    'pyspark-shell',
))

In [3]:
# Create (or reuse) a Spark session named 'demo' on the standalone master.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName('demo')
    .getOrCreate()
)
# Bare expression so the notebook displays the session summary.
spark

In [6]:
# Load test.testing123 from Cassandra through the DataStax Spark Cassandra
# Connector, then report the row count and show the rows.
df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "test")
    .option("table", "testing123")
    .load()
)
print("Table Row Count: ", df.count())
df.show()

Table Row Count:  2
+---+--------+------+
| id|    city|  name|
+---+--------+------+
|  2|     NYC|  Toby|
|  1|Bay Area|Amanda|
+---+--------+------+



In [10]:
# Terminate the Spark session now that the Cassandra read is finished.
# Re-run the session-creation cell before issuing further Spark queries.
spark.stop()

In [8]:
# Install the DataStax Python driver
!pip3 install cassandra-driver

Collecting cassandra-driver
  Downloading cassandra_driver-3.24.0-cp38-cp38-manylinux1_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 5.5 MB/s eta 0:00:01
Collecting geomet<0.3,>=0.1
  Downloading geomet-0.2.1.post1-py3-none-any.whl (18 kB)
Installing collected packages: geomet, cassandra-driver
Successfully installed cassandra-driver-3.24.0 geomet-0.2.1.post1


In [9]:
# Straight connection to Cassandra using the DataStax Python driver
# (no Spark involved): connect, read every row of the test table, print them.
import cassandra

from cassandra.cluster import Cluster

cluster = Cluster(['cassandra'])  # contact points: include all nodes in here
try:
    session = cluster.connect('test')  # connect to this keyspace

    rows = session.execute('SELECT * FROM testing123')
    for row in rows:
        print(row)
finally:
    # Always release the driver's connections, even if the query fails,
    # mirroring the explicit spark.stop() used for the Spark session.
    cluster.shutdown()

Row(id=1, city='Bay Area', name='Amanda')
Row(id=2, city='NYC', name='Toby')
