In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# CASSANDRA CONFIGURATION
cassandra_host = "cassandra"
spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.cassandra.connection.host", cassandra_host) \
      .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars


:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


com.datastax.spark#spark-cassandra-connector-assembly_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a1d3e988-d467-4738-9fe1-77689f85d7db;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector-assembly_2.12;3.1.0 in central
downloading https://repo1.maven.org/maven2/com/datastax/spark/spark-cassandra-connector-assembly_2.12/3.1.0/spark-cassandra-connector-assembly_2.12-3.1.0.jar ...
	[SUCCESSFUL ] com.datastax.spark#spark-cassandra-connector-assembly_2.12;3.1.0!spark-cassandra-connector-assembly_2.12.jar (1959ms)
:: resolution report :: resolve 782ms :: artifacts dl 1962ms
	:: modules in use:
	com.datastax.spark#spark-cassandra-connector-assembly_2.12;3.1.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-----------------------------

In [2]:
weather = spark.read.json("file:///home/jovyan/datasets/weather/weather.json")
weather.printSchema()

root
 |-- 2020census: long (nullable = true)
 |-- city: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- date: string (nullable = true)
 |-- description: string (nullable = true)
 |-- dew_point: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- moon_phase: double (nullable = true)
 |-- pct_clouds: long (nullable = true)
 |-- pct_humidity: long (nullable = true)
 |-- pressure: long (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- snowfall: double (nullable = true)
 |-- state: string (nullable = true)
 |-- temperature.day: double (nullable = true)
 |-- temperature.eve: double (nullable = true)
 |-- temperature.max: double (nullable = true)
 |-- temperature.min: double (nullable = true)
 |-- temperature.morn: double (nullable = true)
 |-- temperature.night: double (nullable = true)
 |-- timezone: string (nullable = true)
 |-- uv_index: double (nullable = true)
 |-- wind.direction_deg: long (nu

                                                                                

In [4]:
print(weather.count())

1600


In [5]:
print(weather.select("date").distinct().count())



8


                                                                                

In [6]:
print(weather.select("city","state").distinct().count())



200


                                                                                

In [7]:
print(weather.select("date","city","state").distinct().count())



1600


                                                                                

In [4]:
weather.select("description").distinct().show(10)

+--------------------+
|         description|
+--------------------+
|          few clouds|
|          light rain|
|       broken clouds|
|           clear sky|
|     overcast clouds|
|          light snow|
|    scattered clouds|
|heavy intensity rain|
|       moderate rain|
|       rain and snow|
+--------------------+



In [None]:
# Data Profiling. Understanding your data set.
# - what is the nat / business key? date, city,state
# - what does "one row mean? Weather report for {city}, {state} on that {date}
# - for each column / attribute what are the data deps?

# CASSANDRA
# - Part Key: State, City Clus: Date




In [5]:
!pip install -q cassandra-driver

In [13]:
sql = '''
create table if not exists glab.weather (
  census2020 int,
  city text,
  condition text,
  weatherdate date,
  description text,
  dew_point decimal,
  latitude decimal,
  longitude decimal, 
  moon_phase decimal,
  pct_clouds int, 
  pct_humidity int, 
  pressure int, 
  rainfall decimal, 
  snowfall decimal, 
  state text,
  temperature_day decimal,
  temperature_eve decimal,
  temperature_max decimal,
  temperature_min decimal,
  temperature_morn decimal,
  temperature_night decimal,
  timezone text,
  uv_index decimal,
  wind_direction_deg int,
  wind_gust decimal,
  wind_speed decimal,
primary key ( (state,city),weatherdate )
);
'''
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute(sql)
    


In [8]:
sql = '''
create table if not exists glab.weather_by_date (
  census2020 int,
  city text,
  condition text,
  weatherdate date,
  description text,
  dew_point decimal,
  latitude decimal,
  longitude decimal, 
  moon_phase decimal,
  pct_clouds int, 
  pct_humidity int, 
  pressure int, 
  rainfall decimal, 
  snowfall decimal, 
  state text,
  temperature_day decimal,
  temperature_eve decimal,
  temperature_max decimal,
  temperature_min decimal,
  temperature_morn decimal,
  temperature_night decimal,
  timezone text,
  uv_index decimal,
  wind_direction_deg int,
  wind_gust decimal,
  wind_speed decimal,
primary key ( weatherdate,state,city )
);
'''
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute(sql)

In [9]:
w = weather.toDF("census2020","city","condition","weatherdate","description","dew_point",
                 "latitude","longitude","moon_phase","pct_clouds","pct_humidity","pressure",
                 "rainfall","snowfall","state","temperature_day","temperature_eve","temperature_max",
                 "temperature_min","temperature_morn","temperature_night","timezone","uv_index",
                 "wind_direction_deg","wind_gust","wind_speed")

In [12]:
w.write.format("org.apache.spark.sql.cassandra")\
  .mode("Append")\
  .options(table="weather_by_date", keyspace = "glab")\
  .option("table", "weather_by_date")\
  .option("keyspace","glab")\
  .save()

                                                                                

In [19]:
w.write.format("org.apache.spark.sql.cassandra")\
  .mode("Append")\
  .option("table", "weather")\
  .option("keyspace","glab")\
  .save()

In [15]:
w2 = spark.read.format("org.apache.spark.sql.cassandra")\
  .option("table", "weather_by_date")\
  .option("keyspace","glab")\
  .load()
w2.createOrReplaceTempView("daily_city_weather_by_date")
w2.count()

1600

In [22]:
w2 = spark.read.format("org.apache.spark.sql.cassandra")\
  .option("table", "weather")\
  .option("keyspace","glab")\
  .load()
w2.createOrReplaceTempView("daily_city_weather")
w2.count()

1600

In [21]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where city='Syracuse' and state='New York';
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Project [city#671, state#670, weatherdate#672, condition#674, description#675, temperature_day#685]
+- BatchScan[state#670, city#671, weatherdate#672, condition#674, description#675, temperature_day#685] Cassandra Scan: glab.weather
 - Cassandra Filters: [["state" = ?, New York],["city" = ?, Syracuse]]
 - Requested Columns: [state,city,weatherdate,condition,description,temperature_day]




In [24]:
w.select("moon_phase").distinct().show()

+----------+
|moon_phase|
+----------+
|      0.66|
|      0.54|
|       0.6|
|      0.63|
|      0.68|
|      0.57|
|       0.5|
|      0.62|
|      0.56|
|      0.65|
|      0.47|
|      0.53|
|      0.59|
|      0.69|
|      0.48|
+----------+



In [27]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where moon_phase=0.5;
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Project [city#766, state#765, weatherdate#767, condition#769, description#770, temperature_day#780]
+- *(1) Filter (moon_phase#774 = 0.500000000000000000)
   +- BatchScan[state#765, city#766, weatherdate#767, condition#769, description#770, moon_phase#774, temperature_day#780] Cassandra Scan: glab.weather
 - Cassandra Filters: []
 - Requested Columns: [state,city,weatherdate,condition,description,moon_phase,temperature_day]




In [23]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition='Rain';
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Project [city#766, state#765, weatherdate#767, condition#769, description#770, temperature_day#780]
+- BatchScan[state#765, city#766, weatherdate#767, condition#769, description#770, temperature_day#780] Cassandra Scan: glab.weather
 - Cassandra Filters: [["condition" = ?, Rain]]
 - Requested Columns: [state,city,weatherdate,condition,description,temperature_day]




In [49]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition = 'Rain'
union 
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition = 'Cloudy';
'''
spark.sql(query).show()


+--------------+--------------+-----------+---------+-------------+--------------------+
|          city|         state|weatherdate|condition|  description|     temperature_day|
+--------------+--------------+-----------+---------+-------------+--------------------+
| Moreno Valley|    California| 2021-10-23|     Rain|   light rain|61.93000000000000...|
|        Dallas|         Texas| 2021-10-26|     Rain|moderate rain|72.57000000000000...|
|   Baton Rouge|     Louisiana| 2021-10-26|     Rain|   light rain|82.87000000000000...|
|Virginia Beach|      Virginia| 2021-10-26|     Rain|   light rain|61.45000000000000...|
|       Hayward|    California| 2021-10-21|     Rain|moderate rain|67.78000000000000...|
|Pembroke Pines|       Florida| 2021-10-26|     Rain|   light rain|83.70000000000000...|
|    Pittsburgh|  Pennsylvania| 2021-10-22|     Rain|   light rain|53.62000000000000...|
|    Cincinnati|          Ohio| 2021-10-26|     Rain|   light rain|82.08000000000000...|
|        Mobile|     

                                                                                

In [45]:
w.select("condition").distinct().toJSON().collect()


['{"condition":"Clear"}',
 '{"condition":"Clouds"}',
 '{"condition":"Rain"}',
 '{"condition":"Snow"}']

In [56]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition='Rain' and weatherdate='2021-10-22'
'''
spark.sql(query).toPandas()

Unnamed: 0,city,state,weatherdate,condition,description,temperature_day
0,Mobile,Alabama,2021-10-22,Rain,light rain,82.710000000000000000
1,Montgomery,Alabama,2021-10-22,Rain,moderate rain,78.240000000000000000
2,Little Rock,Arkansas,2021-10-22,Rain,light rain,68.810000000000000000
3,Bakersfield,California,2021-10-22,Rain,light rain,73.690000000000000000
4,Elk Grove,California,2021-10-22,Rain,heavy intensity rain,57.880000000000000000
...,...,...,...,...,...,...
75,Bellevue,Washington,2021-10-22,Rain,moderate rain,55.630000000000000000
76,Seattle,Washington,2021-10-22,Rain,moderate rain,55.000000000000000000
77,Spokane,Washington,2021-10-22,Rain,moderate rain,51.370000000000000000
78,Tacoma,Washington,2021-10-22,Rain,moderate rain,55.240000000000000000


In [29]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition='Rain' and weatherdate='2021-10-22'
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Project [city#363, state#362, weatherdate#364, condition#366, description#367, temperature_day#377]
+- BatchScan[state#362, city#363, weatherdate#364, condition#366, description#367, temperature_day#377] Cassandra Scan: glab.weather
 - Cassandra Filters: [["weatherdate" = ?, 2021-10-22],["condition" = ?, Rain]]
 - Requested Columns: [state,city,weatherdate,condition,description,temperature_day]




In [23]:
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather_by_date 
    where condition='Rain' and weatherdate='2021-10-22'
'''
spark.sql(query).explain()

AnalysisException: Table or view not found: daily_city_weather_by_date; line 3 pos 9;
'Project ['city, 'state, 'weatherdate, 'condition, 'description, 'temperature_day]
+- 'Filter (('condition = Rain) AND ('weatherdate = 2021-10-22))
   +- 'UnresolvedRelation [daily_city_weather_by_date], [], false


Create cassandra indexes:
    
    create index ix_condition_by_date on weather_by_date(condition);
    create index ix_condition on weather(condition);
    
**IMPORTANT** Now to see the indexes in spark, we must reload the dataframes...

In [6]:
w2 = spark.read.format("org.apache.spark.sql.cassandra")\
  .options(table="weather",keyspace="glab")\
  .load()

w2.createOrReplaceTempView("daily_city_weather")
query = '''
select city,state,weatherdate,condition,description,temperature_day 
    from daily_city_weather 
    where condition='Rain' and weatherdate='2021-10-22'
'''
spark.sql(query).explain()

== Physical Plan ==
*(1) Project [city#252, state#251, weatherdate#253, condition#255, description#256, temperature_day#266]
+- BatchScan[state#251, city#252, weatherdate#253, condition#255, description#256, temperature_day#266] Cassandra Scan: glab.weather
 - Cassandra Filters: [["weatherdate" = ?, 2021-10-22],["condition" = ?, Rain]]
 - Requested Columns: [state,city,weatherdate,condition,description,temperature_day]




Help on built-in function print in module builtins:

print(...)
    print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)
    
    Prints the values to a stream, or to sys.stdout by default.
    Optional keyword arguments:
    file:  a file-like object (stream); defaults to the current sys.stdout.
    sep:   string inserted between values, default a space.
    end:   string appended after the last value, default a newline.
    flush: whether to forcibly flush the stream.



In [7]:
w2 = spark.read.format("org.apache.spark.sql.cassandra")\
  .option("table", "weather")\
  .option("keyspace","glab")\
  .load()
w2.where("weatherdate='2021-10-22' and condition='Rain'").explain()

== Physical Plan ==
*(1) Project [state#341, city#342, weatherdate#343, census2020#344, condition#345, description#346, dew_point#347, latitude#348, longitude#349, moon_phase#350, pct_clouds#351, pct_humidity#352, pressure#353, rainfall#354, snowfall#355, temperature_day#356, temperature_eve#357, temperature_max#358, temperature_min#359, temperature_morn#360, temperature_night#361, timezone#362, uv_index#363, wind_direction_deg#364, ... 2 more fields]
+- BatchScan[state#341, city#342, weatherdate#343, census2020#344, condition#345, description#346, dew_point#347, latitude#348, longitude#349, moon_phase#350, pct_clouds#351, pct_humidity#352, pressure#353, rainfall#354, snowfall#355, temperature_day#356, temperature_eve#357, temperature_max#358, temperature_min#359, temperature_morn#360, temperature_night#361, timezone#362, uv_index#363, wind_direction_deg#364, ... 2 more fields] Cassandra Scan: glab.weather
 - Cassandra Filters: [["weatherdate" = ?, 2021-10-22],["condition" = ?, Rain]]
