In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.postgresql:postgresql:42.2.19,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, from_json, date_format, to_timestamp
from pyspark.sql.types import *
from pyspark.sql import functions as F


# Create (or reuse) the notebook's Spark session, running against a local master.
spark = (
    SparkSession.builder
    .appName("Data Sources")
    .master("local")
    .getOrCreate()
)

# Number of partitions Spark SQL uses for shuffles (joins, aggregations).
spark.conf.set("spark.sql.shuffle.partitions", 14)

# config("spark.jars", "jars/postgresql-42.2.19.jar,jars/spark-sql-kafka-0-10_2.12-3.3.1.jar")

```python
"""
config("spark.python.worker.memory", "8g"). \
config("spark.driver.memory", "8g"). \
config("spark.executor.memory", "8g"). \
"""
```

# Read/Write DataFrame with file system, HDFS, S3, FTP

In [2]:
# Read every JSON file under data/cars. With mode "failFast" the read is
# expected to raise on a malformed record instead of silently skipping it.
cars_df = (
    spark.read
    .format("json")
    .option("inferSchema", "true")
    .option("mode", "failFast")
    .option("path", "data/cars")
    .load()
)

cars_df.show()

In [3]:
# HDFS
# option("path", "hdfs://nn1home:8020/sources/cars"). \

# FTP
# option("path", "ftp://user:pwd@192.168.1.5/sources/cars"). \

# S3
# option("path", "s3://bucket-name/sources/cars")


In [4]:
# Same read as cars_df, but all reader options are passed in a single
# options(...) call instead of chained option(...) calls.
cars_df_v2 = (
    spark.read
    .format("json")
    .options(mode="failFast", path="data/cars", inferSchema="true")
    .load()
)

cars_df_v2.show()

# /sources/cars
# 10.1.1.1 node1 -> block1     S3 NETWORK                              -> partition1 -> task1
# 10.1.1.2 node2 -> block2 -> Spark Driver -> Name Node -> ip address -> partition2 -> task2
# 10.1.1.3 node3 -> block3                                            -> partition3 -> task3


In [5]:
# Show the first 10 rows without truncating long cell values.
cars_df.show(n=10, truncate=False)

In [6]:
# Redistribute the data into 3 partitions and write it as snappy-compressed
# Parquet, replacing whatever is already at the target path.
(
    cars_df
    .repartition(3)
    .write
    .mode("overwrite")
    .option("compression", "snappy")
    .parquet("../sources/parquet")
)

In [7]:
!ls -l ../sources/parquet

## Round Robin

In [8]:
# repartition(3) with no column, then write CSV into one sub-directory per
# distinct "Year" value, replacing any previous output.
(
    cars_df
    .repartition(3)
    .write
    .partitionBy("Year")
    .mode("overwrite")
    .format("csv")
    .save('../sources/csv')
)

In [9]:
!wc -l ../sources/csv/Year\=1973-01-01/*.csv

In [10]:
# Round-robin partitioning: print the plan produced by repartition(3)
# when no partitioning column is given.
(
    cars_df
    .repartition(3)
    .explain()
)

## Hash partitioning

In [11]:
# Repartition into 3 partitions keyed on "Horsepower", then write CSV into
# one sub-directory per distinct "Year" value, replacing any previous output.
(
    cars_df
    .repartition(3, "Horsepower")
    .write
    .partitionBy("Year")
    .mode("overwrite")
    .format("csv")
    .save('../sources/csv')
)


In [12]:
!wc -l ../sources/csv/Year\=1973-01-01/*.csv

In [13]:
# Hash partitioning: print the plan produced by repartition(3, "Horsepower").
(
    cars_df
    .repartition(3, "Horsepower")
    .explain()
)

## Text file format

In [14]:
# The text source produces a DataFrame with a SINGLE column ("value"),
# one row per line of input.
text_df = spark.read.format("text").load("data/lipsum")
text_df.count()

In [15]:
# Print the column names and types of the DataFrame read from data/lipsum.
text_df.printSchema()

In [16]:
# Keep only non-empty lines and show 10 of them, truncating each value
# to 120 characters.
(
    text_df
    .where("length(value) > 0")
    .show(n=10, truncate=120)
)

## Hadoop HDFS

### Writing to hdfs

In [17]:
# Write the cars data to HDFS as CSV, one sub-directory per distinct "Year"
# value, replacing any previous output at that location.
(
    cars_df
    .write
    .partitionBy("Year")
    .mode("overwrite")
    .format("csv")
    .save("hdfs://hadoop:9000/user/hdfs/cars")
)

### Reading From HDFS

In [18]:
# Read back the CSV files written to HDFS and show the first 5 rows untruncated.
(
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("mode", "failFast")
    .option("path", "hdfs://hadoop:9000/user/hdfs/cars")
    .load()
    .show(n=5, truncate=False)
)

# JDBC Postgres

In [19]:
# JDBC connection settings for the Postgres instance used in this notebook.
# The URL, user and password can be overridden through environment variables
# (SPARK_PG_URL / SPARK_PG_USER / SPARK_PG_PASSWORD) so real credentials never
# have to be written into the notebook; the fallbacks are the values that were
# previously hard-coded here, so the default behaviour is unchanged.
driver = "org.postgresql.Driver"
url = os.environ.get("SPARK_PG_URL", "jdbc:postgresql://postgres:5432/spark")
user = os.environ.get("SPARK_PG_USER", "docker")
password = os.environ.get("SPARK_PG_PASSWORD", "docker")

# Connection properties passed to spark.read.jdbc(...).
DBPARAMS = {
    "user": user,
    "password": password,
    "driver": driver
}

# Full table name used for the read below.
employees = "public.employees"
# A parenthesised, aliased sub-query in the same "table" format;
# it is defined here but not used by the read below.
employees_pruned = """(select e.first_name, e.last_name, e.hire_date from public.employees e where e.gender = 'F') as new_emp"""

# Plain JDBC read with no partitioning options.
df = spark.read.jdbc(url=url, table=employees, properties=DBPARAMS)

# Report how many partitions the un-partitioned read produced.
print("Общее колличество партиций:", df.rdd.getNumPartitions())

df.show()

In [20]:
# Print the column names and types of the DataFrame currently bound to df.
df.printSchema()

In [21]:
# Largest and smallest emp_no in the table.
(
    df
    .agg(F.max("emp_no"), F.min("emp_no"))
    .show()
)

In [22]:
# Bounds used for the partitioned read (presumably the min/max emp_no
# reported by the aggregation above):
#   lowerBound = 10010
#   upperBound = 499990
#
# column, lowerBound, upperBound and numPartitions must be supplied
# together: either all four options or none of them.
df = spark.read.jdbc(
    url=url,
    table="public.employees",
    properties=DBPARAMS,
    column="emp_no",
    lowerBound=10010,
    upperBound=499990,
    numPartitions=10,
)

print("Колличество партиций:", df.rdd.getNumPartitions())
df.show(n=10)

In [23]:
# Predicates: a list of WHERE-clause fragments passed to the JDBC reader.

pred1 = ["gender = 'F'", "gender = 'M'", "gender = 'O'"]

# Be careful with the boundaries! As written, emp_no = 50000 satisfies both
# conditions, and emp_no = 10010 satisfies neither.
pred2 = [
    "emp_no > 10010 and emp_no <= 50000",
    "emp_no >= 50000 and emp_no <= 100000",
]

# Only pred1 is used for this read; pred2 is kept as an illustration.
df = spark.read.jdbc(
    url=url, table="public.employees", properties=DBPARAMS, predicates=pred1
)

print("Колличество партиций:", df.rdd.getNumPartitions())

In [24]:
# Show the first 10 rows of the predicate-based read without truncation.
df.show(n=10, truncate=False)

In [25]:
# The same partitioned read of public.employees, expressed through the
# generic format("jdbc") reader and individual options.
employees_df = (
    spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("user", user)
    .option("password", password)
    .option("dbtable", "public.employees")
    .option("partitionColumn", "emp_no")
    .option("lowerBound", 10010)
    .option("upperBound", 499990)
    .option("numPartitions", "10")
    .load()
)

print("Колличество партиций:", employees_df.rdd.getNumPartitions())

In [26]:
# Sub-query pushed down to Postgres: joins departments to dept_emp and
# exposes emp_no, dept_no and dept_name. It is passed as "dbtable", so it
# is wrapped in parentheses and given an alias.
depts_prunned = """(
  select de.emp_no
       , d.dept_no
       , d.dept_name
    from public.departments d
    join public.dept_emp de using (dept_no)
) as new_emp"""

department_df = (
    spark.read
    .format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("user", user)
    .option("password", password)
    .option("dbtable", depts_prunned)
    .load()
)

In [27]:
# Uncomment to hint a broadcast join of the department side:
# department_df = F.broadcast(department_df)

# Inner-join employees to their departments on emp_no, then count rows per dept_no.
emp_dept_df = (
    employees_df
    .join(department_df, on=employees_df.emp_no == department_df.emp_no, how="inner")
    .select(
        employees_df.emp_no,
        employees_df.first_name,
        employees_df.last_name,
        department_df.dept_name,
        department_df.dept_no,
    )
    .groupBy("dept_no")
    .count()
)

print("Колличество партиций:", emp_dept_df.rdd.getNumPartitions())

emp_dept_df.explain()

In [28]:
emp_dept_df.show()

# Persist the per-department counts as table "employee_bucketed":
# 4 buckets on dept_no, sorted by count, stored at the given HDFS path
# and replacing any existing table data.
(
    emp_dept_df
    .write
    .bucketBy(4, "dept_no")
    .sortBy("count")
    .mode("overwrite")
    .option("path", "hdfs://hadoop:9000/user/hdfs/spark-warehouse")
    .saveAsTable("employee_bucketed")
)

In [29]:
# Query the bucketed table through Spark SQL, largest departments first.
(
    spark
    .sql("SELECT * FROM employee_bucketed order by count desc")
    .show(n=10, truncate=False)
)

# Kafka Consumer

In [30]:
"""
kafka-topics.sh --bootstrap-server localhost:9092 --topic my-pyspark-topic --create --partitions 3 --replication-factor 1
"""

# Streaming source: subscribe to topic "my-pyspark-topic" on the Kafka broker
# at kafka:9092, starting from the earliest available offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "my-pyspark-topic")
    .load()
)

print("Is kafka_df a sream based DataFrame? ", kafka_df.isStreaming)
kafka_df.printSchema()

In [31]:
"""
kafka-console-producer.sh --bootstrap-server localhost:9092 --topic my-pyspark-topic
"""

# Cast the Kafka key and value to strings and start a streaming query that
# writes them as CSV files, tracking progress in the checkpoint directory.
my_pyspark_topic = (
    kafka_df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .writeStream
    .format("csv")
    .option("path", "../out/kafka/my-pyspark-message")
    .option("checkpointLocation", "../out/kafka/checkpoint")
    .start()
)

In [32]:
# Let the streaming query run for up to 5 seconds, then shut it down.
# stop() sits in a finally block so the query is also stopped when
# awaitTermination raises or the cell is interrupted; otherwise it would
# keep running in the background.
try:
    my_pyspark_topic.awaitTermination(timeout=5)
finally:
    my_pyspark_topic.stop()

Exercise: read the movies DF, then write it as
- tab-separated "CSV"
- parquet
- table "public.movies" in the Postgres DB

Exercise #2: find a way to read the people-1m DataFrame. Then write it as JSON.