In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Build (or reuse) the local session through the builder so this cell can be
# re-run in a live kernel: constructing SparkContext('local') a second time
# raises, because only one SparkContext may be active at a time.
spark = SparkSession.builder.master('local').getOrCreate()
# Expose the underlying context under the name `sc`.
sc = spark.sparkContext

24/06/03 12:27:32 WARN Utils: Your hostname, Arbnors-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.11.6.58 instead (on interface en0)
24/06/03 12:27:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/03 12:27:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# getOrCreate() hands back the already-running session when one exists; in that
# case only runtime SQL options can still be changed, so core settings passed
# here are not applied (Spark logs a warning saying so).
# spark.driver.host / spark.driver.port are intentionally left at their
# defaults: set them only with a real hostname and a numeric port.
spark = SparkSession.builder \
    .appName("Working with Datatypes and Joins") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .getOrCreate()

24/06/03 12:27:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Load the 2010 flight summary CSV: the first line is the header, column types
# are inferred from the data, and FAILFAST aborts the read on a malformed record.
csvFile = (
    spark.read.format("csv")
    .options(header="true", mode="FAILFAST", inferSchema="true")
    .load("data/flight-data/csv/2010-summary.csv")
)

                                                                                

In [4]:
# Preview the first ten rows of the CSV-backed DataFrame.
csvFile.show(n=10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [5]:
# Write the DataFrame back out as comma-delimited CSV, replacing any previous
# output at the target path.
csvFile.write.save(
    "data/flight-data/csv/testCSV.csv",
    format="csv",
    mode="overwrite",
    delimiter=",",
)

In [6]:
# Parquet files embed their own schema, so the CSV-only "header" and
# "inferSchema" options do not apply here and are omitted.
parquetFile = spark.read.format("parquet")\
    .load("data/flight-data/parquet/2010-summary.parquet")

In [7]:
# Preview the first two rows of the Parquet-backed DataFrame.
parquetFile.show(n=2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



In [10]:
from pyspark.sql.functions import count, col

# Count the rows for each DEST_COUNTRY_NAME, project that count next to its
# doubled value, and order the groups from most to fewest rows.
# `result` holds the DataFrame itself; show() only prints and returns None,
# so it is called separately instead of being assigned.
result = (
    parquetFile.groupBy("DEST_COUNTRY_NAME")
    .agg(count("*").alias("count"))
    .selectExpr("count * 2 as doubled_count", "count")
    .sort(col("count"), ascending=False)
)
result.show(5)

+-------------+-----+
|doubled_count|count|
+-------------+-----+
|          262|  131|
|            2|    1|
|            2|    1|
|            2|    1|
|            2|    1|
+-------------+-----+
only showing top 5 rows



In [11]:
# Redistribute the rows across five partitions before writing, so the output
# directory receives one CSV part file per partition.
# mode("overwrite") keeps the cell re-runnable; the default save mode raises
# when the target path already exists.
csvFile.repartition(5).write.format("csv")\
    .mode("overwrite")\
    .save("data/flight-data/csv/partitionData/multiple.csv")

                                                                                

In [26]:
# Take at most ten rows and store them as Parquet, split into one
# sub-directory per ORIGIN_COUNTRY_NAME value; existing output is replaced.
(
    csvFile.limit(10)
    .write.format("parquet")
    .partitionBy("ORIGIN_COUNTRY_NAME")
    .mode("overwrite")
    .save("data/flight-data/csv/partitionData/CountryPartition.parquet")
)

                                                                                

In [33]:
# Read the Grenada partition by pointing at its directory rather than at a
# specific part file: part-file names contain a generated id that changes every
# time the partitioned dataset is rewritten, so a hard-coded file name breaks on
# re-run. Loading the partition directory directly yields only the columns
# stored in the files (the partition column lives in the directory name).
# The CSV-only "header" / "inferSchema" options are not needed for Parquet.
countryData = spark.read.format("parquet")\
    .load("data/flight-data/csv/partitionData/CountryPartition.parquet/ORIGIN_COUNTRY_NAME=Grenada")

In [34]:
# Display up to five rows read from the single-country partition.
countryData.show(n=5)

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|   54|
+-----------------+-----+



In [35]:
# Rule of thumb for choosing the number of buckets: divide the total data size
# by 128 MB, the default HDFS block size.

numberOfBuckets = 5
columnToBucket = "count"

# Bucketed output must be saved as a table. mode("overwrite") replaces an
# existing "bucketedTable" so the cell can be re-run; without it the default
# save mode raises once the table exists.
csvFile.write.format("parquet")\
    .bucketBy(numberOfBuckets, columnToBucket)\
    .mode("overwrite")\
    .saveAsTable("bucketedTable")

                                                                                