# Start Spark Session

In [1]:
from pyspark.sql import SparkSession

# Configure SparkSession to connect to the cluster
spark = SparkSession.builder \
    .appName("Jupyter-Spark") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "2g") \
    .config("spark.cores.max", "4") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://hdfs-namenode:8020") \
    .getOrCreate()

print(spark.version)

3.5.0


# Read csv from local

In [3]:
data = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]

# Parallelize the data and convert it to a DataFrame
df = spark.sparkContext.parallelize(data).toDF(["id", "name"])# Read a text file

In [4]:
df.count()

3

# Reading From Web

In [6]:
import pandas as pd
# Read CSV using pandas
pdf = pd.read_csv("https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv")

# Convert pandas DataFrame to Spark DataFrame
df = spark.createDataFrame(pdf)

df.show(5)

+-----+-------+-------+-------+
|Month| "1958"| "1959"| "1960"|
+-----+-------+-------+-------+
|  JAN|    340|    360|    417|
|  FEB|    318|    342|    391|
|  MAR|    362|    406|    419|
|  APR|    348|    396|    461|
|  MAY|    363|    420|    472|
+-----+-------+-------+-------+
only showing top 5 rows



# Read from hdfs

In [5]:
# Read from HDFS 
df = spark.read.csv("hdfs://hdfs-namenode:8020/sample_data.csv", header=True, inferSchema=True)

### Option 2 - with hdfs specified during session creation

In [6]:
df = spark.read.csv("/sample_data.csv", header=True, inferSchema=True)

In [7]:
df.show()

+---+---+
|  A|  B|
+---+---+
|  1|  2|
|  3|  4|
+---+---+



In [9]:
df.write.csv("/sample_data_2.csv")

# Read Content From Streaming Data

In [18]:
df = spark.read.csv("/tmp/data/*.csv")

In [19]:
df.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
|NULL|{"id": 2, "name":...|
|NULL|{"id": 2, "name":...|
|NULL|{"id": 3, "name":...|
|NULL|{"id": 2, "name":...|
|NULL|{"id": 2, "name":...|
+----+--------------------+



# Stop Spark

In [None]:
spark.stop()