We will:

✅ Store loan data in multiple formats (CSV, Parquet, Avro, ORC)

✅ Compare storage size & query performance

In [1]:
# Data directory used by the cells below: /home/labuser/Documents/Level2_Day2/

In [3]:
import time

from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook, registered under the
# application name "FileFormats to Store".
spark = (
    SparkSession.builder
    .appName("FileFormats to Store")
    .getOrCreate()
)



In [4]:
# Load the raw loans CSV: the first row supplies the column names and the
# column types are inferred from the data rather than read as plain strings.
df_loan = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/labuser/Documents/Level2_Day2/loans.csv")
)
df_loan.show()

+-------+-----------+-----------+-------------+---------+-----------+
|loan_id|customer_id|loan_amount|interest_rate|loan_term|loan_status|
+-------+-----------+-----------+-------------+---------+-----------+
| LOAN_1|   CUST_623|   356880.6|         2.68|       12|   Rejected|
| LOAN_2|  CUST_3299|  408851.88|         7.34|       12|    Pending|
| LOAN_3|  CUST_2356|  454599.98|         9.19|       36|   Approved|
| LOAN_4|  CUST_3598|  228835.23|         8.57|       36|   Rejected|
| LOAN_5|   CUST_305|   73602.87|         5.63|       36|   Rejected|
| LOAN_6|  CUST_9492|   89225.37|         3.33|       48|    Pending|
| LOAN_7|  CUST_7978|   46994.41|         9.47|       12|   Approved|
| LOAN_8|  CUST_9382|  197149.52|         4.97|       12|   Approved|
| LOAN_9|  CUST_4788|  102803.98|         2.82|       36|    Pending|
|LOAN_10|  CUST_6463|  162353.48|         3.73|       60|    Pending|
|LOAN_11|  CUST_5059|  225898.53|         2.67|       24|   Approved|
|LOAN_12|  CUST_1783

In [5]:
# Convert to Optimized Format

# 1. Convert to Parquet
# The writer's default save mode raises an error when the target path already
# exists, so re-running this cell would fail. Overwrite mode keeps the cell
# repeatable under Restart & Run All.
df_loan.write.mode("overwrite").parquet("/home/labuser/Documents/Level2_Day2/loans_parquet")

In [None]:
# 2. Convert to AVRO
# NOTE(review): the "avro" data source ships as a separate spark-avro module;
# this cell presumably needs that package available to the Spark session
# before it can run — confirm against the cluster/session configuration.
# Overwrite mode keeps the cell repeatable once the output directory exists.
df_loan.write.format("avro").mode("overwrite").save("/home/labuser/Documents/Level2_Day2/loans_avro")

In [6]:
# 3. Convert to ORC
# The writer's default save mode raises an error when the target path already
# exists, so re-running this cell would fail. Overwrite mode keeps the cell
# repeatable under Restart & Run All.
df_loan.write.mode("overwrite").orc("/home/labuser/Documents/Level2_Day2/loans_orc")

In [7]:
# Compare Performance 
# Compare Storage Size:

!du -sh /home/labuser/Documents/Level2_Day2/loans_*

872K	/home/labuser/Documents/Level2_Day2/loans_orc
780K	/home/labuser/Documents/Level2_Day2/loans_parquet


In [8]:
df_parquet_read = spark.read.parquet("/home/labuser/Documents/Level2_Day2/loans_parquet")
df_orc_read = spark.read.orc("/home/labuser/Documents/Level2_Day2/loans_orc")

# Compare the Query Performance
# Run the identical filter against each format and report the wall-clock time,
# so the comparison yields a number rather than just two result tables.
# show() only displays the first rows of the result, so treat the timings as a
# rough indication rather than a full-scan benchmark.
for format_name, df_read in (("Parquet", df_parquet_read), ("ORC", df_orc_read)):
    started_at = time.perf_counter()
    df_read.filter("loan_amount > 100000").show()
    elapsed_seconds = time.perf_counter() - started_at
    print(f"{format_name}: filter + show took {elapsed_seconds:.3f} s")

+-------+-----------+-----------+-------------+---------+-----------+
|loan_id|customer_id|loan_amount|interest_rate|loan_term|loan_status|
+-------+-----------+-----------+-------------+---------+-----------+
| LOAN_1|   CUST_623|   356880.6|         2.68|       12|   Rejected|
| LOAN_2|  CUST_3299|  408851.88|         7.34|       12|    Pending|
| LOAN_3|  CUST_2356|  454599.98|         9.19|       36|   Approved|
| LOAN_4|  CUST_3598|  228835.23|         8.57|       36|   Rejected|
| LOAN_8|  CUST_9382|  197149.52|         4.97|       12|   Approved|
| LOAN_9|  CUST_4788|  102803.98|         2.82|       36|    Pending|
|LOAN_10|  CUST_6463|  162353.48|         3.73|       60|    Pending|
|LOAN_11|  CUST_5059|  225898.53|         2.67|       24|   Approved|
|LOAN_12|  CUST_1783|  267359.05|        10.08|       60|   Rejected|
|LOAN_15|  CUST_7143|  184285.11|         4.06|       48|    Pending|
|LOAN_16|  CUST_9330|  393331.45|         5.67|       60|    Pending|
|LOAN_17|  CUST_5238