# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark SQL: Soluciones de almacenamiento para Big Data (Cassandra)** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
# Bootstrap step executed before the `pyspark` import in the following cell.
# findspark.init() is expected to locate the local Spark installation and put
# its Python bindings on sys.path (see the findspark docs) — confirm if the
# kernel image already exposes pyspark on its own.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the session that every later cell relies on.
# The Cassandra connector is pulled in through `spark.jars.packages`, and the
# two `spark.cassandra.connection.*` settings tell it which node to contact.
spark = (
    SparkSession.builder
    .appName("SparkSQL-Storage-Solutions-Cassandra-JJAE")
    .master("spark://spark-master:7077")
    # Cassandra endpoint used by the connector
    .config("spark.cassandra.connection.host", "cassandra-iteso")
    .config("spark.cassandra.connection.port", "9042")
    # Connector artifact resolved at session start-up
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.13:3.5.0")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext, kept as a short alias for later cells.
sc = spark.sparkContext


:: loading settings :: url = jar:file:/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f661edfc-3ce5-4a5d-ac16-367991f9f3ad;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.13;3.5.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.13;3.5.0 in central
	found org.scala-lang.modules#scala-collection-compat_2.13;2.11.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found com.datastax.oss#java-driver-core-shaded;4.13.0 in central
	found com.datastax.oss#native-protocol;1.5.0 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.

In [3]:
# Connectivity smoke test: read the target table and print its schema and rows.
# Any failure is reported as a message rather than a traceback so the notebook
# keeps running; an empty table is expected to render as an empty result.
try:
    df = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .option("keyspace", "my_keyspace")
        .option("table", "netflix")
        .load()
    )
    df.printSchema()
    df.show()
except Exception as e:
    print(f"Connection or read error: {e}")


root
 |-- show_id: string (nullable = false)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- director: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)



                                                                                

+-------+-------+----------+--------+--------+---------+------+------------+-----+----+
|show_id|country|date_added|director|duration|listed_in|rating|release_year|title|type|
+-------+-------+----------+--------+--------+---------+------+------------+-----+----+
+-------+-------+----------+--------+--------+---------+------+------------+-----+----+



In [5]:
# Print the version string reported by the active SparkSession.
print(spark.version)


3.5.4


#### Cargamos el DF de Netflix

In [4]:
from team_name.spark_utils import SparkUtils

# (column name, Spark type name) pairs, in the order the CSV reader should
# apply them to the file's columns.
columns_info = [
    ("show_id", "string"),
    ("type", "string"),
    ("title", "string"),
    ("director", "string"),
    ("country", "string"),
    ("date_added", "string"),
    ("release_year", "integer"),
    ("rating", "string"),
    ("duration", "string"),
    ("listed_in", "string"),
]

# Turn the pairs into a schema object via the project helper.
netflix_schema = SparkUtils.generate_schema(columns_info)

# Read the CSV with the explicit schema; the first line is treated as a header.
netflix_df = (
    spark.read
    .schema(netflix_schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/netflix1.csv")
)

netflix_df.printSchema()

# Preview the first five rows without truncating long cell values.
netflix_df.show(n=5, truncate=False)

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

+-------+-------+--------------------------------+---------------+-------------+----------+------------+------+--------+-------------------------------------------------------------+
|show_id|type   |title                           |director       |country      |date_added|release_year|rating|duration|listed_in                                                    |
+-------+-------+--------------------------------+---------------+-------------+----------+------------+------+--------+-------------------------------------------------------------+
|s1     |Movie  |Dick Johnson Is Dead           

                                                                                

#### Escribir un DF a Cassandra

In [5]:
# Persist the Netflix DataFrame into the `my_keyspace.netflix` table through
# the Cassandra data source, using append mode.
(
    netflix_df.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("keyspace", "my_keyspace")
    .option("table", "netflix")
    .save()
)

# Simple completion marker once save() has returned.
print("Done")

[Stage 4:>                                                          (0 + 1) / 1]

Done


                                                                                

#### Leer info desde Cassandra

In [6]:
# Load the `my_keyspace.netflix` table back from Cassandra as a DataFrame.
cassandra_df = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="netflix", keyspace="my_keyspace")
    .load()
)

# Preview the first five rows.
cassandra_df.show(n=5)


+-------+--------------+----------+--------------------+--------+--------------------+------+------------+---------------+-----+
|show_id|       country|date_added|            director|duration|           listed_in|rating|release_year|          title| type|
+-------+--------------+----------+--------------------+--------+--------------------+------+------------+---------------+-----+
|  s6541| United States| 10/1/2019|         Paul Haggis| 112 min|Dramas, Independe...|     R|        2004|          Crash|Movie|
|  s7516| United States|  4/9/2019|Peter Farrelly, W...|  94 min|            Comedies|     R|        2013|       Movie 43|Movie|
|   s586|United Kingdom|  7/1/2021|     Claire McCarthy| 106 min|Dramas, Romantic ...| PG-13|        2018|        Ophelia|Movie|
|  s8265|         India|  1/1/2018|       Satish Manwar|  99 min|Dramas, Internati...| TV-14|        2009|The Damned Rain|Movie|
|  s7834|      Pakistan|  8/1/2018|      Amir Mohiuddin| 126 min|Dramas, Internati...| TV-14|    

25/03/21 14:40:23 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/blockmgr-a5655a2d-d23a-4df8-a4d6-383dc7e294ff. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/blockmgr-a5655a2d-d23a-4df8-a4d6-383dc7e294ff
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:174)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1(DiskBlockManager.scala:368)
	at org.apache.spark.storage.DiskBlockManager.$anonfun$doStop$1$adapted(DiskBlockManager.scala:364)
	at scala.collection.ArrayOps$.foreach$

In [10]:
# Shut down through the session rather than the bare SparkContext alias:
# SparkSession.stop() stops the same underlying context and also clears the
# session's cached active/default references, so nothing stale is left behind
# if a new session is created later in this kernel.
spark.stop()

25/03/21 05:05:19 WARN ChannelPool: [s0|cassandra-iteso/172.18.0.2:9042]  Error while opening new channel (ConnectionInitException: [s0|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=8f58954a-3fb6-4df6-942f-7fad6939c982, APPLICATION_NAME=Spark-Cassandra-Connector-app-20250321043324-0000}): failed to send request (com.datastax.oss.driver.shaded.netty.channel.StacklessClosedChannelException))
25/03/21 05:05:19 WARN ControlConnection: [s0] Error connecting to Node(endPoint=cassandra-iteso/172.18.0.2:9042, hostId=703cfdb6-e72b-4e40-957c-73c00437f99d, hashCode=35fb762c), trying next node (ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (STARTUP {CQL_VERSION=3.0.0, DRIVER_NAME=DataStax Java driver for Apache Cassandra(R), DRIVER_VERSION=4.13.0, CLIENT_ID=8f58954a-3fb6-4df6-942f-7fad6939c982, APPLICATION_NAME=Spark-Cassandr