In [1]:
from pyspark.sql import SparkSession
 
spark = SparkSession.builder \
    .appName("GKSOrderToParttitions") \
    .master("yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 09:50:19 INFO SparkEnv: Registering MapOutputTracker
25/02/06 09:50:19 INFO SparkEnv: Registering BlockManagerMaster
25/02/06 09:50:19 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/06 09:50:19 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
from pyspark.sql.functions import col, year, month, dayofmonth


In [None]:

# GCS Bucket and File Paths
input_path = "gs://tpch-source/orders/"  # Replace with your actual bucket path
output_path = "gs://tpch-source/orders-partittions/"


In [5]:

# Read the CSV file from GCS (ensure header is considered)
df = spark.read.option("header", "true").option("inferSchema", "true").csv(input_path)

df.printSchema()
df.show(2)



root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: date (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)



                                                                                

In [6]:
# Convert o_orderdate to DateType (assuming it's in 'yyyy-MM-dd' format)
df = df.withColumn("o_orderdate", col("o_orderdate").cast("date"))
df.printSchema()
df.show(2)

root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: date (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|o_orderkey|o_custkey|o_orderstatus|o_totalprice|o_orderdate|o_orderpriority|        o_clerk|o_shippriority|           o_comment|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|  18728327|   420898|            F|   130172.09| 1994-01-20|       1-URGENT|Clerk#000000726|             0| final packages afte|
|  18728352|    18241|            O|    61088.33| 1997-08-12|         2-HIGH|Clerk#00

In [7]:

# Add year, month, and day columns for partitioning
df = df.withColumn("year", year(col("o_orderdate"))) \
       .withColumn("month", month(col("o_orderdate"))) \
       .withColumn("day", dayofmonth(col("o_orderdate")))

df.printSchema()
df.show(2)

root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: date (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+----+-----+---+
|o_orderkey|o_custkey|o_orderstatus|o_totalprice|o_orderdate|o_orderpriority|        o_clerk|o_shippriority|           o_comment|year|month|day|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+----+-----+---+
|  18728327|   420898|            F|   130172.09| 1994-01-2

In [8]:

# Sort data by o_orderdate in ascending order
df_sorted = df.orderBy(col("o_orderdate").asc())
df.printSchema()
df.show(2)

root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: date (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+----+-----+---+
|o_orderkey|o_custkey|o_orderstatus|o_totalprice|o_orderdate|o_orderpriority|        o_clerk|o_shippriority|           o_comment|year|month|day|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+----+-----+---+
|  18728327|   420898|            F|   130172.09| 1994-01-2

In [9]:

# Write the partitioned output as CSV with headers
df_sorted.write \
    .option("header", "true") \
    .mode("overwrite") \
    .partitionBy("year", "month", "day") \
    .csv(output_path)

print(f"Partitioned CSV data written to {output_path}")

                                                                                

Partitioned CSV data written to gs://tpch-source/orders-partittions/


In [None]:
 spark.stop()