Zadanie 2: Schemat Danych

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Obtain the Spark session for this notebook under the application name
# "Schematy"; getOrCreate() hands back an already-active session if one exists.
spark = (
    SparkSession.builder
    .appName("Schematy")
    .getOrCreate()
)

In [0]:
# Location of the actors CSV on DBFS.
file_path = "dbfs:/FileStore/tables/Files/actors.csv"

# Explicit schema for the file: every column is a nullable string except
# "ordering", which is a nullable integer.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in [
            ("imdb_title_id", StringType()),
            ("ordering", IntegerType()),
            ("imdb_name_id", StringType()),
            ("category", StringType()),
            ("job", StringType()),
            ("characters", StringType()),
        ]
    ]
)

# Read the CSV with the hand-written schema (first line is the header,
# schema inference switched off).
csv_reader = spark.read.options(header=True, inferSchema=False).schema(schema)
dataFrame = csv_reader.csv(file_path)

# Preview the first 10 rows and confirm the column types that were applied.
dataFrame.show(10)
dataFrame.printSchema()

+-------------+--------+------------+--------+--------+--------------------+
|imdb_title_id|ordering|imdb_name_id|category|     job|          characters|
+-------------+--------+------------+--------+--------+--------------------+
|    tt0000009|       1|   nm0063086| actress|    null|[Miss Geraldine H...|
|    tt0000009|       2|   nm0183823|   actor|    null|      [Mr. Hamilton]|
|    tt0000009|       3|   nm1309758|   actor|    null|[Chauncey Depew -...|
|    tt0000009|       4|   nm0085156|director|    null|                null|
|    tt0000574|       1|   nm0846887| actress|    null|        [Kate Kelly]|
|    tt0000574|       2|   nm0846894|   actor|    null|     [School Master]|
|    tt0000574|       3|   nm3002376|   actor|    null|        [Steve Hart]|
|    tt0000574|       4|   nm0170118| actress|    null|                null|
|    tt0000574|       5|   nm0846879|director|    null|                null|
|    tt0000574|       6|   nm0317210|producer|producer|                null|

Zadanie 3: Read Modes

In [0]:
from pyspark.sql.functions import when

# getOrCreate() returns the session that is already running when there is one,
# so this line mainly keeps the cell's `spark` name bound.
spark = SparkSession.builder.appName("ReadModes").getOrCreate()

# Deliberately corrupt the data: every row whose ordering equals 5 gets the
# non-numeric marker "??" instead. The untouched values are cast to string
# explicitly so that both branches of the expression share one type, rather
# than relying on Spark's implicit string/integer coercion.
df = dataFrame.withColumn(
    "ordering",
    when(dataFrame["ordering"] == 5, "??").otherwise(
        dataFrame["ordering"].cast("string")
    ),
)

df.show()

# Persist the corrupted copy as CSV (with header), replacing any previous
# output; the read-mode examples load this path with the integer schema.
error_file_path = "dbfs:/FileStore/tables/Files/corrupted_actors.csv"
df.write.csv(error_file_path, header=True, mode="overwrite")

+-------------+--------+------------+---------------+----------+--------------------+
|imdb_title_id|ordering|imdb_name_id|       category|       job|          characters|
+-------------+--------+------------+---------------+----------+--------------------+
|    tt0000009|       1|   nm0063086|        actress|      null|[Miss Geraldine H...|
|    tt0000009|       2|   nm0183823|          actor|      null|      [Mr. Hamilton]|
|    tt0000009|       3|   nm1309758|          actor|      null|[Chauncey Depew -...|
|    tt0000009|       4|   nm0085156|       director|      null|                null|
|    tt0000574|       1|   nm0846887|        actress|      null|        [Kate Kelly]|
|    tt0000574|       2|   nm0846894|          actor|      null|     [School Master]|
|    tt0000574|       3|   nm3002376|          actor|      null|        [Steve Hart]|
|    tt0000574|       4|   nm0170118|        actress|      null|                null|
|    tt0000574|      ??|   nm0846879|       director| 

In [0]:
# PERMISSIVE (default): malformed rows are kept; the value that does not fit
# the schema (the "??" in the integer column `ordering`) is shown as null.
df_permissive = spark.read.csv(error_file_path, header=True, schema=schema, mode="PERMISSIVE")
df_permissive.show()

# DROPMALFORMED: rows that do not match the schema are skipped.
df_dropmalformed = spark.read.csv(error_file_path, header=True, schema=schema, mode="DROPMALFORMED")
df_dropmalformed.show()

# FAILFAST: the read stops at the first malformed row. Reading is lazy, so the
# failure is expected to surface when show() triggers the job, not on read.csv().
try:
    df_failfast = spark.read.csv(error_file_path, header=True, schema=schema, mode="FAILFAST")
    df_failfast.show()
except Exception as e:
    # Report what actually failed instead of hiding it: an unexpected problem
    # (wrong path, missing variable) must not look like the expected parse error.
    # Spark/Py4J messages can span many lines, so only the first one is printed.
    error_lines = str(e).strip().splitlines()
    error_summary = error_lines[0] if error_lines else "(no message)"
    print(f"Error!!! FAILFAST read failed - {type(e).__name__}: {error_summary}")

+-------------+--------+------------+---------------+----------+--------------------+
|imdb_title_id|ordering|imdb_name_id|       category|       job|          characters|
+-------------+--------+------------+---------------+----------+--------------------+
|    tt0000009|       1|   nm0063086|        actress|      null|[Miss Geraldine H...|
|    tt0000009|       2|   nm0183823|          actor|      null|      [Mr. Hamilton]|
|    tt0000009|       3|   nm1309758|          actor|      null|[Chauncey Depew -...|
|    tt0000009|       4|   nm0085156|       director|      null|                null|
|    tt0000574|       1|   nm0846887|        actress|      null|        [Kate Kelly]|
|    tt0000574|       2|   nm0846894|          actor|      null|     [School Master]|
|    tt0000574|       3|   nm3002376|          actor|      null|        [Steve Hart]|
|    tt0000574|       4|   nm0170118|        actress|      null|                null|
|    tt0000574|    null|   nm0846879|       director| 

Zadanie 4: DataFrameWriter

In [0]:
# Obtain the Spark session (an already-active one is returned as-is).
spark = (
    SparkSession.builder
    .appName("DataFrameWriter")
    .getOrCreate()
)

# Source CSV; this time the column types are inferred by Spark.
file_path = "dbfs:/FileStore/tables/Files/actors.csv"
dataFrame = spark.read.options(header=True, inferSchema=True).csv(file_path)

# Output locations, named once so the write and the read-back cannot drift.
# NOTE(review): these use "Tables" while the input path uses "tables" -
# confirm the different casing is intended.
parquet_path = "dbfs:/FileStore/Tables/Files/parquet_file.parquet"
json_path = "dbfs:/FileStore/Tables/Files/json_file.json"

# WRITE: save the same data as Parquet and as JSON, replacing earlier output.
dataFrame.write.mode("overwrite").parquet(parquet_path)
dataFrame.write.mode("overwrite").json(json_path)

# READ: load both copies back and display them.
df_parquet = spark.read.parquet(parquet_path)
df_parquet.show()

df_json = spark.read.json(json_path)
df_json.show()

+-------------+--------+------------+-------------------+--------+------------+
|imdb_title_id|ordering|imdb_name_id|           category|     job|  characters|
+-------------+--------+------------+-------------------+--------+------------+
|    tt3249124|       4|   nm6005417|              actor|    null|       [Tom]|
|    tt3249124|       5|   nm1871431|           director|    null|        null|
|    tt3249124|       6|   nm1862032|           producer|producer|        null|
|    tt3249124|       7|   nm4261282|           composer|    null|        null|
|    tt3249124|       8|   nm1677303|production_designer|    null|        null|
|    tt3249158|       1|   nm2946712|              actor|    null| [Sebastian]|
|    tt3249158|       2|   nm2578315|            actress|    null|   [Miranda]|
|    tt3249158|       3|   nm7033017|              actor|    null|   [Jacinto]|
|    tt3249158|       4|   nm7033021|              actor|    null|      [Rosa]|
|    tt3249158|       5|   nm1848095|   