In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# CASSANDRA CONFIGURATION
cassandra_host = "cassandra"
spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.cassandra.connection.host", cassandra_host) \
      .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [7]:
w = spark.read.json("file:///home/jovyan/datasets/weather/weather.json")
w.printSchema()
w.show()

root
 |-- 2020census: long (nullable = true)
 |-- city: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- date: string (nullable = true)
 |-- description: string (nullable = true)
 |-- dew_point: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- moon_phase: double (nullable = true)
 |-- pct_clouds: long (nullable = true)
 |-- pct_humidity: long (nullable = true)
 |-- pressure: long (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- snowfall: double (nullable = true)
 |-- state: string (nullable = true)
 |-- temperature.day: double (nullable = true)
 |-- temperature.eve: double (nullable = true)
 |-- temperature.max: double (nullable = true)
 |-- temperature.min: double (nullable = true)
 |-- temperature.morn: double (nullable = true)
 |-- temperature.night: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- uv_index: double (nullable = true)
 |-- wind.direction_deg: long (nu

In [12]:
print(w.count())
print(w.select("date","city","state").distinct().count())

1600
1600


In [18]:
!pip install -q cassandra-driver

sql = '''
create table glab.daily_city_weather (
    city2020census int, 
    city text,
    condition text,
    date text, 
    description text, 
    dew_point text,
    latitude double,
    longitude double,
    moon_phase double,
    pct_clouds int,
    pct_humidity int,
    pressure int, 
    rainfall double,
    snowfall double,
    state text,
    temp_day double,
    temp_eve double,
    temp_max double,
    temp_min double,
    temp_morn double,
    temp_night double,
    timezone text,
    uv_index double,
    wind_dir_deg int,
    wind_gust double, 
    wind_speed double,
    primary key( (city,state), date)
);

'''
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute(sql)

In [33]:
w.toDF("city2020census","city","condition","date","description","dew_point","latitude","longitude","moon_phase", "pct_clouds",
       "pct_humidity","pressure","rainfall","snowfall","state","temp_day","temp_eve","temp_max","temp_min","temp_morn","temp_night",
       "timezone","uv_index","wind_dir_deg","wind_gust", "wind_speed").write.format("org.apache.spark.sql.cassandra")\
    .mode("Append")\
    .option("table", "daily_city_weather")\
    .option("keyspace","glab")\
    .save()

w2 = spark.read.format("org.apache.spark.sql.cassandra")\
    .option("table", "daily_city_weather")\
    .option("keyspace","glab")\
    .load()

print(w.count())
print(w2.count())

1600
1600


In [35]:
w2.createOrReplaceTempView("daily_city_weather")
spark.sql("select * from daily_city_weather where city='Syracuse' and state='New York'").explain()

== Physical Plan ==
*(1) Project [city#1516, state#1517, date#1518, city2020census#1519, condition#1520, description#1521, dew_point#1522, latitude#1523, longitude#1524, moon_phase#1525, pct_clouds#1526, pct_humidity#1527, pressure#1528, rainfall#1529, snowfall#1530, temp_day#1531, temp_eve#1532, temp_max#1533, temp_min#1534, temp_morn#1535, temp_night#1536, timezone#1537, uv_index#1538, wind_dir_deg#1539, ... 2 more fields]
+- BatchScan[city#1516, state#1517, date#1518, city2020census#1519, condition#1520, description#1521, dew_point#1522, latitude#1523, longitude#1524, moon_phase#1525, pct_clouds#1526, pct_humidity#1527, pressure#1528, rainfall#1529, snowfall#1530, temp_day#1531, temp_eve#1532, temp_max#1533, temp_min#1534, temp_morn#1535, temp_night#1536, timezone#1537, uv_index#1538, wind_dir_deg#1539, ... 2 more fields] Cassandra Scan: glab.daily_city_weather
 - Cassandra Filters: [["city" = ?, Syracuse],["state" = ?, New York]]
 - Requested Columns: [city,state,date,city2020censu

In [43]:
query = '''
select city,state,date,condition,description
    from daily_city_weather
    where condition = 'Rain'
    and date = '2021-10-20'
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Filter (condition#1520 = Rain)
+- BatchScan[city#1516, state#1517, date#1518, condition#1520, description#1521] Cassandra Scan: glab.daily_city_weather
 - Cassandra Filters: [["date" = ?, 2021-10-20]]
 - Requested Columns: [city,state,date,condition,description]




In [9]:
spark.read.format("org.apache.spark.sql.cassandra")\
    .option("table", "daily_city_weather_by_date_condition")\
    .option("keyspace","glab")\
    .option("pushdown",True)\
    .load()\
    .createOrReplaceTempView("daily_city_weather_by_date_condition")

AnalysisException: Couldn't find table daily_city_weather_by_date_condition in glab - Found similar tables in that keyspace:
glab.daily_city_weather_by_date_condition
glab.daily_city_weather

In [5]:
spark.sql('''
CREATE TEMPORARY TABLE daily_city_weather_by_date_condition 
USING org.apache.spark.sql.cassandra 
OPTIONS ( table "daily_city_weather_by_date_condition", 
keyspace "glab",   
cluster "cassandra",
pushdown "true")
''')  

AnalysisException: org.apache.spark.sql.cassandra is not a valid Spark SQL Data Source.