In [2]:
# findspark locates the local Spark installation so that the pyspark imports
# in the following cells resolve; this cell has to run before them.
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Run against the "local" master under the application name "FileFormats".
# Extra properties can be added with config.set("property", "value").
config = SparkConf().setMaster("local").setAppName("FileFormats")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that belongs to the session above.
sc = spark.sparkContext

In [4]:
# Load the movies CSV with an explicit schema instead of inferSchema.
# Schema inference makes Spark read the data an extra time just to guess the
# column types, which slows loading down. The types declared here are the ones
# inference produced for this data set (movieId: integer, title: string,
# genres: string), so the resulting DataFrame is unchanged.
movieDf = (
    spark.read.format("csv")
    .option("header", True)
    .schema("movieId INT, title STRING, genres STRING")
    .load("hdfs://localhost:9000/movies")
)

In [6]:
# Number of partitions of the RDD behind movieDf (the recorded run returned 1).
movieDf.rdd.getNumPartitions()

1

In [5]:
# Load the ratings CSV with an explicit schema instead of inferSchema.
# Schema inference costs an extra read of the data; the types declared here
# are the ones inference produced for this data set (userId: integer,
# movieId: integer, rating: double, timestamp: integer).
# `timestamp` is back-quoted because it is also the name of a SQL type.
ratingDf = (
    spark.read.format("csv")
    .option("header", True)
    .schema("userId INT, movieId INT, rating DOUBLE, `timestamp` INT")
    .load("hdfs://localhost:9000/ratings")
)

In [7]:
# Save the movies DataFrame as JSON, replacing any output already at the target path.
movieDf.write.json("hdfs://localhost:9000/movies-json", mode="overwrite")


In [8]:
# Save the ratings DataFrame as JSON, replacing any output already at the target path.
ratingDf.write.json("hdfs://localhost:9000/ratings-json", mode="overwrite")

In [9]:
# Read the movies JSON output back into a DataFrame.
# The JSON source always derives the schema from the data when none is given;
# "inferSchema" is a CSV option that this reader does not use, so it is not set.
# As the printed schema shows, the columns come back in alphabetical order and
# movieId is read as long.
movieJsonDf = (
    spark.read.format("json")
    .load("hdfs://localhost:9000/movies-json")
)

movieJsonDf.printSchema()
movieJsonDf.show(5)

root
 |-- genres: string (nullable = true)
 |-- movieId: long (nullable = true)
 |-- title: string (nullable = true)

+--------------------+-------+--------------------+
|              genres|movieId|               title|
+--------------------+-------+--------------------+
|Adventure|Animati...|      1|    Toy Story (1995)|
|Adventure|Childre...|      2|      Jumanji (1995)|
|      Comedy|Romance|      3|Grumpier Old Men ...|
|Comedy|Drama|Romance|      4|Waiting to Exhale...|
|              Comedy|      5|Father of the Bri...|
+--------------------+-------+--------------------+
only showing top 5 rows



In [10]:
# Read the ratings JSON output back into a DataFrame.
# The JSON source always derives the schema from the data when none is given;
# "inferSchema" is a CSV option that this reader does not use, so it is not set.
# As the printed schema shows, the columns come back in alphabetical order and
# the integral columns (movieId, timestamp, userId) are read as long.
ratingJsonDf = (
    spark.read.format("json")
    .load("hdfs://localhost:9000/ratings-json")
)

ratingJsonDf.printSchema()
ratingJsonDf.show(5)

root
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- userId: long (nullable = true)

+-------+------+---------+------+
|movieId|rating|timestamp|userId|
+-------+------+---------+------+
|      1|   4.0|964982703|     1|
|      3|   4.0|964981247|     1|
|      6|   4.0|964982224|     1|
|     47|   5.0|964983815|     1|
|     50|   5.0|964982931|     1|
+-------+------+---------+------+
only showing top 5 rows



In [11]:
# Parquet
# Persist both DataFrames in Parquet format, replacing any earlier output.

movieDf.write.parquet("hdfs://localhost:9000/movies-parquet", mode="overwrite")

ratingDf.write.parquet("hdfs://localhost:9000/ratings-parquet", mode="overwrite")

In [13]:
# Load the movies Parquet output into a DataFrame.
# Parquet files carry their own schema, so no schema inference is requested.
movieParquetDf = spark.read.parquet("hdfs://localhost:9000/movies-parquet")

movieParquetDf.printSchema()
movieParquetDf.show(5)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [14]:
# Load the ratings Parquet output into a DataFrame.
# Parquet files carry their own schema, so no schema inference is requested.
ratingParquetDf = spark.read.parquet("hdfs://localhost:9000/ratings-parquet")

ratingParquetDf.printSchema()
ratingParquetDf.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [12]:
# ORC: optimized columnar format (https://orc.apache.org)
# Persist both DataFrames in ORC format, replacing any earlier output.
movieDf.write.orc("hdfs://localhost:9000/movies-orc", mode="overwrite")

ratingDf.write.orc("hdfs://localhost:9000/ratings-orc", mode="overwrite")

In [15]:
# Load the ratings ORC output into a DataFrame.
# ORC files carry their own schema, so no schema inference is requested.
ratingOrcDf = spark.read.orc("hdfs://localhost:9000/ratings-orc")

ratingOrcDf.printSchema()
ratingOrcDf.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [16]:
# Load the movies ORC output into a DataFrame.
# ORC files carry their own schema, so no schema inference is requested.
moviesOrcDf = spark.read.orc("hdfs://localhost:9000/movies-orc")

moviesOrcDf.printSchema()
moviesOrcDf.show(5)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

