# Unit G
# Wide-Column Database Model

- Examples From Video Lecture 

In [17]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Cassandra connection settings: hostname the Spark connector will contact.
cassandra_host = "cassandra"

# Build (or reuse) a local Spark session that points the Cassandra connector
# at cassandra_host and pulls the connector assembly in via spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .config("spark.cassandra.connection.host", cassandra_host)
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

## CASSANDRA CQL

- SQL-Like Syntax for Cassandra


### Connecting to the Cassandra client

`PS> docker-compose exec cassandra cqlsh`


### Keyspaces

```
Cqlsh> describe keyspaces;
Cqlsh> create keyspace sysmon with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };
Cqlsh> Use sysmon;
Cqlsh> HELP

```

### Table Basics

All from the `cqlsh:sysmon>` prompt

```
# let's make a table - every table MUST have a primary key!! Here name is the partition key (a single-column primary key has no clustering columns)

CREATE table users (name text, age tinyint, primary key (name));

# take a look at it 
DESCRIBE table users;

# you can drop tables 
Drop table users;

# now re-create the table

# Some inserts – you can insert the same thing
Insert into users (name, age) values ('mike', 47);
Insert into users (name, age) values ('mike', 47);
Insert into users (name, age) values ('mike', 47);

# show data
Select * from users;

# what happens when you insert the same key with different values? – it's like an update!
Insert into users (name, age) values ('mike', 48);


# but it's only there once – there are no integrity constraints!
# every INSERT is an UPSERT: it inserts the row, or updates it if the key already exists


```


### Understanding Partitioning

Let's look at a more realistic example:

```
CREATE TABLE system_utilization (
	hostname TEXT,
	os TEXT,
	measured_on TIMESTAMP,
	cpu_pct TINYINT,
	PRIMARY KEY (hostname, measured_on)
);

#
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ('Saturn', 'windows', '2018-07-19 09:00', 90);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'Saturn', 'windows', '2018-07-19 10:00', 5);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'Saturn', 'windows', '2018-07-19 11:00', 10);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'venus', 'osx', '2018-07-19 09:00', 5);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'venus', 'osx', '2018-07-19 10:00', 0);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'venus', 'osx', '2018-07-19 11:00', 15);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'mars', 'windows', '2018-07-19 09:00', 5);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'mars', 'windows', '2018-07-19 10:00', 50);
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'mars', 'windows', '2018-07-19 11:00', 75);

# you can filter by partition key
Select * from system_utilization where hostname ='mars';

# you can filter by partition key and cluster key
Select * from system_utilization where hostname ='mars' and measured_on > '2018-07-19 9:30';

# you cannot filter by non key
Select * from system_utilization where os = 'windows';

# or even just by cluster key
Select * from system_utilization where measured_on >'2018-07-19 9:30';

# or a combination of partition key plus a non-key
Select * from system_utilization where hostname ='mars' and cpu_pct=5;

# you can add ALLOW FILTERING to override this, but you need to consider what you are doing!

# This is why the partitioning scheme is so important to Cassandra queries. This data was partitioned by hostname, so all rows for a given hostname are stored together in the same partition. When we omit hostname from the query, Cassandra must ask every node for its data. This can be quite time consuming when there are dozens of nodes!
```

### Secondary Indexes

Indexes permit querying non-key columns when the partition key is specified.

```
# Needs ALLOW FILTERING
Select * from system_utilization where os='windows' and hostname='Saturn';

# Create the index 
Create index ix_system_utilization_os ON system_utilization (os);

# now these queries work without the ALLOW FILTERING
Select * from system_utilization where os='windows' and hostname='Saturn';

# You can leave off the hostname and it works, but this is a bad idea since indexes are distributed.
Select * from system_utilization where os='windows';

# to show the index on the table; it's attached to it!
Describe system_utilization;

# drop the index
Drop index ix_system_utilization_os;
```

### Materialized Views

Materialized views re-write the partition key so the same data may be queried in different ways.


```
# Create materialized view
Create materialized view system_utilization_by_os 
	as 
	select * from system_utilization where os is not null and hostname is not null and measured_on is not null primary key (os, hostname, measured_on);
#let’s see the view attached to the table
Desc system_utilization;

# Can’t filter on the os column without ALLOW FILTERING
Select * from system_utilization where os = 'osx';

#but you can filter by os on the MV since it has a new partition key!
Select * from system_utilization_by_os where os = 'osx';

#insert to show it works!
Insert into system_utilization (hostname, os, measured_on, cpu_pct) values ( 'earth', 'osx', '2018-07-19 9:00', 100);

# Select from table / and materialized view as proof
Select * from system_utilization_by_os where os = 'osx';

# This can be costly as we are duplicating a lot of data. at least the API allows us to insert once as opposed to separate tables!
```


### Updates and Deletes 

```
#alter table
Alter table system_utilization add applications list<text>;

Update system_utilization set applications = ['word', 'excel'] where hostname ='mars' and measured_on = '2018-07-19 09:00';

Update system_utilization set applications = ['calc', 'word'] where hostname ='mars' and measured_on = '2018-07-19 10:00';

Update system_utilization set applications = ['calc', 'solitaire', 'doom'] where hostname ='mars' and measured_on = '2018-07-19 11:00';

Select * from system_utilization where applications contains 'word' allow filtering;

#set it to null
Delete applications from system_utilization where hostname ='mars' and measured_on = '2018-07-19 11:00';

Select * from system_utilization;

# delete row
Delete from system_utilization where hostname ='mars' and measured_on = '2018-07-19 11:00';
```

### Consistency Levels

```
Consistency
Consistency all
Consistency quorum
Consistency one;
```


## Loading Sample Data

- Run this code to load some sample data into Cassandra
- Cassandra requires a schema. You cannot create this schema in Spark, so we will use plain-old-python to run the CQL DDL code to make the table
- The CSV file does not understand dates, so we must use `withColumn()` to cast the string type to date type prior to loading in cassandra.
- Because Cassandra is key based, and supports UPSERT we can use the "Append" mode to write data to the table.



In [27]:
# WE NEED A TABLE BEFORE WE CAN WRITE, Using Plain old Python
!pip install -q cassandra-driver
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute("CREATE KEYSPACE IF NOT EXISTS gdemo WITH replication={ 'class': 'SimpleStrategy', 'replication_factor' : 1 };")
    table = '''
    CREATE TABLE IF NOT EXISTS gdemo.fudgemart_order_details (
        customer_id int,
        customer_email text,
        customer_name text,
        customer_address text,
        customer_city text,
        customer_state text,
        customer_zip text,
        order_id int,
        order_date date,
        creditcard_number text,
        creditcard_exp_date text, 
        order_total decimal ,
        ship_via text,
        shipped_date date,
        product_id int,
        order_item_id int,
        order_qty int,
        product_name text,
        product_retail_price decimal,
    primary key ((customer_id, order_id), order_item_id) 
    );
    '''
    session.execute(table)

# NOTE: CSV File format does not understand dates, but Cassandra does, so we must cast the string columns to date before loading into the table
od = spark.read.option("inferSchema",True).option("header",True).csv("file:///home/jovyan/datasets/fudgemart/fudgemart-order-details.csv")\
    .withColumn("order_date", col("order_date").cast("date")).withColumn("shipped_date", col("shipped_date").cast("date")) 
    
od.write.format("org.apache.spark.sql.cassandra")\
  .mode("Append")\
  .option("table", "fudgemart_order_details")\
  .option("keyspace","gdemo")\
  .save()

In [23]:
# Read the gdemo.fudgemart_order_details Cassandra table into a Spark DataFrame.
df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "gdemo")
    .option("table", "fudgemart_order_details")
    .load()
)

In [29]:
# Print each column's name, Spark type, and nullability for the Cassandra-backed DataFrame.
df.printSchema()

root
 |-- customer_id: integer (nullable = false)
 |-- order_id: integer (nullable = false)
 |-- order_item_id: integer (nullable = true)
 |-- creditcard_exp_date: string (nullable = true)
 |-- creditcard_number: string (nullable = true)
 |-- customer_address: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zip: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_qty: integer (nullable = true)
 |-- order_total: decimal(38,18) (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_retail_price: decimal(38,18) (nullable = true)
 |-- ship_via: string (nullable = true)
 |-- shipped_date: date (nullable = true)



In [36]:
# Pushdown case: the predicate constrains both partition-key columns
# (customer_id, order_id), so the plan lists it under "Cassandra Filters".
df.where("customer_id=13 and order_id=1843").explain()

== Physical Plan ==
*(1) Project [customer_id#694, order_id#695, order_item_id#696, creditcard_exp_date#697, creditcard_number#698, customer_address#699, customer_city#700, customer_email#701, customer_name#702, customer_state#703, customer_zip#704, order_date#705, order_qty#706, order_total#707, product_id#708, product_name#709, product_retail_price#710, ship_via#711, shipped_date#712]
+- BatchScan[customer_id#694, order_id#695, order_item_id#696, creditcard_exp_date#697, creditcard_number#698, customer_address#699, customer_city#700, customer_email#701, customer_name#702, customer_state#703, customer_zip#704, order_date#705, order_qty#706, order_total#707, product_id#708, product_name#709, product_retail_price#710, ship_via#711, shipped_date#712] Cassandra Scan: gdemo.fudgemart_order_details
 - Cassandra Filters: [["customer_id" = ?, 13],["order_id" = ?, 1843]]
 - Requested Columns: [customer_id,order_id,order_item_id,creditcard_exp_date,creditcard_number,customer_address,customer_ci

In [35]:
# ANTI-PATTERN: ship_via is not a key column, so Cassandra cannot apply this
# predicate ("Cassandra Filters" is empty in the plan) and Spark filters instead.
# Every row of the table is pulled from the Cassandra cluster into Spark first.
# It works, but don't do it!
df.where("ship_via='Postal Service'").explain()


== Physical Plan ==
*(1) Filter (ship_via#711 = Postal Service)
+- BatchScan[customer_id#694, order_id#695, order_item_id#696, creditcard_exp_date#697, creditcard_number#698, customer_address#699, customer_city#700, customer_email#701, customer_name#702, customer_state#703, customer_zip#704, order_date#705, order_qty#706, order_total#707, product_id#708, product_name#709, product_retail_price#710, ship_via#711, shipped_date#712] Cassandra Scan: gdemo.fudgemart_order_details
 - Cassandra Filters: []
 - Requested Columns: [customer_id,order_id,order_item_id,creditcard_exp_date,creditcard_number,customer_address,customer_city,customer_email,customer_name,customer_state,customer_zip,order_date,order_qty,order_total,product_id,product_name,product_retail_price,ship_via,shipped_date]


