In [1]:
# prompt: create a spark session and load a loal data via parallelize into an rdd.

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create a SparkSession
spark = SparkSession.builder \
    .appName("LoadLocalData") \
    .getOrCreate()


In [2]:
spark

In [None]:

# Sample data
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]

# Create an RDD from the local data
rdd = spark.sparkContext.parallelize(data)

# Print the RDD contents
print("RDD Contents:")
for item in rdd.collect():
    print(item)

# Infer schema (optional)
schema = StructType([StructField("Name", StringType(), True), StructField("Age", IntegerType(), True)])
df = spark.createDataFrame(rdd, schema)

# Show DataFrame (optional)
print("\nDataFrame Contents:")
df.show()


In [None]:
df.collect()

[Row(Name='Alice', Age=25),
 Row(Name='Bob', Age=30),
 Row(Name='Charlie', Age=35)]

In [None]:
spark

In [None]:
!pip install -q pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.config('spark.ui.port', '4050').getOrCreate()

!wget -qnc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -n -q ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!sleep 5
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

In [None]:
# prompt: give me a way to access Spark UI on my laptop directly

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create a SparkSession with UI settings
spark = SparkSession.builder \
    .appName("LoadLocalData") \
    .config("spark.ui.port", "4040") \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()

# Sample data
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]

# Create an RDD from the local data
rdd = spark.sparkContext.parallelize(data)

# Print the RDD contents
print("RDD Contents:")
for item in rdd.collect():
    print(item)

# Infer schema (optional)
schema = StructType([StructField("Name", StringType(), True), StructField("Age", IntegerType(), True)])
df = spark.createDataFrame(rdd, schema)

# Show DataFrame (optional)
print("\nDataFrame Contents:")
df.show()

# Print the Spark UI URL
print(f"\nSpark UI URL: http://localhost:4040")

spark

RDD Contents:
('Alice', 25)
('Bob', 30)
('Charlie', 35)

DataFrame Contents:
+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+


Spark UI URL: http://localhost:4040


In [None]:
spark.stop()

In [None]:
from google.colab import output
output.serve_kernel_port_as_window(4040, path='/jobs/index.html')


Try `serve_kernel_port_as_iframe` instead. [0m


<IPython.core.display.Javascript object>

In [None]:
!pip install pyspark



In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create a SparkSession
spark = SparkSession.builder \
    .appName("LoadLocalData") \
    .getOrCreate()

In [None]:
spark

In [None]:
spark.sql('create database customers_db')

DataFrame[]

In [None]:
spark.sql('show databases').show()


+------------+
|   namespace|
+------------+
|customers_db|
|     default|
+------------+



In [None]:

spark.sql('show databases').filter("namespace like 'customers%'").show()

+------------+
|   namespace|
+------------+
|customers_db|
+------------+



In [None]:
spark.sql('use customers_db')

DataFrame[]

In [None]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [None]:
data = [(1, "Alice", "Mumbai", "2023-01-15", True),
(2, "Bob", "Delhi", "2023-03-25", False),
(3, "Charlie", "Chennai", "2023-05-10", True)]

columns = ["customer_id", "name", "city", "registration_date", "is_active"]

In [None]:
df = spark.createDataFrame(data, columns)

In [None]:
df.write.saveAsTable("customers_db.customers")

In [None]:
df.repartition(10).write.saveAsTable("customers_db.customers_2")

In [None]:
spark.sql('show tables').show()

+------------+---------+-----------+
|   namespace|tableName|isTemporary|
+------------+---------+-----------+
|customers_db|customers|      false|
+------------+---------+-----------+



In [None]:
spark.sql('describe exhibtended customers').show(truncate=False)

+----------------------------+-------------------------------------------------------+-------+
|col_name                    |data_type                                              |comment|
+----------------------------+-------------------------------------------------------+-------+
|customer_id                 |bigint                                                 |NULL   |
|name                        |string                                                 |NULL   |
|city                        |string                                                 |NULL   |
|registration_date           |string                                                 |NULL   |
|is_active                   |boolean                                                |NULL   |
|                            |                                                       |       |
|# Detailed Table Information|                                                       |       |
|Catalog                     |spark_catalog       

In [None]:
spark.sql('drop table customers')

DataFrame[]

In [None]:
spark.sql('describe extended customers').show(truncate=False)

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `customers` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 18;
'DescribeRelation true, [col_name#418, data_type#419, comment#420]
+- 'UnresolvedTableOrView [customers], DESCRIBE TABLE, true


In [None]:
spark.sql('''
create table if not exists managed_customers (
customer_id int,
name string,
city string,
registration_date date,
is_active boolean
) using csv
''')

DataFrame[]

In [None]:
spark.sql('show tables').show()

+------------+-----------------+-----------+
|   namespace|        tableName|isTemporary|
+------------+-----------------+-----------+
|customers_db|      customers_2|      false|
|customers_db|managed_customers|      false|
+------------+-----------------+-----------+



In [None]:
spark.sql('select * from managed_customers').show()

+-----------+----+----+-----------------+---------+
|customer_id|name|city|registration_date|is_active|
+-----------+----+----+-----------------+---------+
+-----------+----+----+-----------------+---------+



In [None]:
df - 10gb  2gb
10gb ---> 1000 partitions
200 paritions ->

In [None]:
df.write.mode('overwrite').saveAsTable("managed_customers")

In [None]:
spark.sql('select * from managed_customers').show()

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          1|  Alice| Mumbai|       2023-01-15|     true|
|          2|    Bob|  Delhi|       2023-03-25|    false|
|          3|Charlie|Chennai|       2023-05-10|     true|
+-----------+-------+-------+-----------------+---------+



In [None]:
spark.sql('describe extended managed_customers').show(truncate=False)

+----------------------------+---------------------------------------------------------------+-------+
|col_name                    |data_type                                                      |comment|
+----------------------------+---------------------------------------------------------------+-------+
|customer_id                 |bigint                                                         |NULL   |
|name                        |string                                                         |NULL   |
|city                        |string                                                         |NULL   |
|registration_date           |string                                                         |NULL   |
|is_active                   |boolean                                                        |NULL   |
|                            |                                                               |       |
|# Detailed Table Information|                                           

In [None]:
# prompt: save df as a csv in external table folder

# Assuming 'df' is your DataFrame and you have a folder named 'external_table_folder' in your desired location.
# Replace 'external_table_folder' with the actual path.

df.repartition(1).write.format("csv").mode("overwrite").save("external_table")


In [None]:
spark.sql('drop table external_customers')

DataFrame[]

In [None]:
spark.sql('show tables').show()

+------------+------------------+-----------+
|   namespace|         tableName|isTemporary|
+------------+------------------+-----------+
|customers_db|       customers_2|      false|
|customers_db|external_customers|      false|
|customers_db| managed_customers|      false|
+------------+------------------+-----------+



In [None]:
!ls /content/external_table

data.csv


In [None]:
spark.sql('''
create table if not exists external_customers(
  customer_id int,
  name string,
  city string,
  registration_date date,
  is_active boolean
) using csv location '/content/external_table'
''')

DataFrame[]

In [None]:
spark.sql('select * from external_customers').show()

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          1|  Alice| Mumbai|       2023-01-15|     true|
|          2|    Bob|  Delhi|       2023-03-25|    false|
|          3|Charlie|Chennai|       2023-05-10|     true|
+-----------+-------+-------+-----------------+---------+



In [None]:
# prompt: insert a row in external customer using spark sql insert into

# Insert a new row into the external_customers table
spark.sql("""
INSERT INTO external_customers (customer_id, name, city, is_active)
VALUES (4, 'David', 'Bangalore' , True)
""")

# # Verify the insertion
# spark.sql("SELECT * FROM external_customers").show()


DataFrame[]

In [None]:
spark.sql('describe extended external_customers').show(truncate = False)

+----------------------------+------------------------------+-------+
|col_name                    |data_type                     |comment|
+----------------------------+------------------------------+-------+
|customer_id                 |int                           |NULL   |
|name                        |string                        |NULL   |
|city                        |string                        |NULL   |
|registration_date           |date                          |NULL   |
|is_active                   |boolean                       |NULL   |
|                            |                              |       |
|# Detailed Table Information|                              |       |
|Catalog                     |spark_catalog                 |       |
|Database                    |customers_db                  |       |
|Table                       |external_customers            |       |
|Created Time                |Sat Feb 01 04:40:40 UTC 2025  |       |
|Last Access        

In [None]:
spark.sql('drop table managed_customers')

DataFrame[]

In [None]:
spark.sql('show tables').show()

+------------+-----------+-----------+
|   namespace|  tableName|isTemporary|
+------------+-----------+-----------+
|customers_db|customers_2|      false|
+------------+-----------+-----------+



In [None]:
spark.sql('drop table external_customers')

DataFrame[]

In [None]:
where is this data stored for Managed table ? in HDFS ?