In [17]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *


In [18]:
# Start the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of creating a second one.
spark = (
    SparkSession.builder
    .appName("MultiSchemaFileParsing")
    .getOrCreate()
)

ConnectionRefusedError: [Errno 61] Connection refused

In [4]:
# Read the file as raw text (one string per line) and split each line on
# commas. The header (H), detail (D) and trailer (T) lines have different
# field counts, so the result is kept as a single array column, "fields".
df = (
    spark.read.text("data/input/data_with_ht.csv")
    .select(F.split(F.col("value"), ",").alias("fields"))
)

df.show(truncate=False)

+-------------------------------+
|fields                         |
+-------------------------------+
|[H, data_with_ht.csv, 20260130]|
|[D, Rahul, 35, M, 50000]       |
|[D, Anita, 28, F, 60000]       |
|[D, Sunil, 40, F, 70000]       |
|[D, Preeti, 32, F, 75000]      |
|[D, Ajay, 46, M, 80000]        |
|[T, 5, 180, 3, 2, 325000]      |
+-------------------------------+



In [7]:
# Positional layout of each record type, keyed by the tag stored in field 0.
# Each value is an ordered list of (column name, Spark data type) pairs; a
# pair's position in the list is the index of the field it describes.
schema_map = {
    # Header record: H,<file name>,<date>
    "H": [
        ("rec_type", StringType()),
        ("file_name", StringType()),
        ("date", StringType()),
    ],
    # Detail record: D,<name>,<age>,<gender>,<salary>
    "D": [
        ("rec_type", StringType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("gender", StringType()),
        ("salary", DoubleType()),
    ],
    # Trailer record: T,<records>,<age sum>,<males>,<females>,<salary sum>
    "T": [
        ("rec_type", StringType()),
        ("total_records", IntegerType()),
        ("total_age", IntegerType()),
        ("total_males", IntegerType()),
        ("total_females", IntegerType()),
        ("total_salary", DoubleType()),
    ],
}

In [10]:
def parse_by_type(df, rec_type, schema_def):
    """Keep the rows of one record type and project them into typed columns.

    Args:
        df: DataFrame with an array column named "fields" whose element 0
            holds the record-type tag.
        rec_type: Tag to keep; rows are selected where ``fields[0]`` equals it.
        schema_def: Ordered sequence of ``(column_name, data_type)`` pairs.
            The pair at position ``i`` names and casts ``fields[i]``.

    Returns:
        A DataFrame containing only the matching rows, with one column per
        entry of ``schema_def``, in the same order.
    """
    fields = F.col("fields")

    # Field i of the split line becomes the i-th output column.
    typed_columns = [
        fields[position].cast(data_type).alias(column_name)
        for position, (column_name, data_type) in enumerate(schema_def)
    ]

    return df.filter(fields[0] == rec_type).select(*typed_columns)
    


In [11]:
# Build one typed DataFrame per record type, in the order header, detail, trailer.
header_df, data_df, trailer_df = (
    parse_by_type(df, tag, schema_map[tag]) for tag in ("H", "D", "T")
)

In [12]:
# Display the parsed header records, then the parsed trailer records, without
# truncating long cell values.
header_df.show(truncate=False)

trailer_df.show(truncate=False)

+--------+----------------+--------+
|rec_type|file_name       |date    |
+--------+----------------+--------+
|H       |data_with_ht.csv|20260130|
+--------+----------------+--------+

+--------+-------------+---------+-----------+-------------+------------+
|rec_type|total_records|total_age|total_males|total_females|total_salary|
+--------+-------------+---------+-----------+-------------+------------+
|T       |5            |180      |3          |2            |325000.0    |
+--------+-------------+---------+-----------+-------------+------------+



In [13]:
# Display the parsed detail records without truncating long cell values.
data_df.show(truncate=False)

+--------+------+---+------+-------+
|rec_type|name  |age|gender|salary |
+--------+------+---+------+-------+
|D       |Rahul |35 |M     |50000.0|
|D       |Anita |28 |F     |60000.0|
|D       |Sunil |40 |F     |70000.0|
|D       |Preeti|32 |F     |75000.0|
|D       |Ajay  |46 |M     |80000.0|
+--------+------+---+------+-------+



In [16]:
# Print the logical and physical query plans for the detail DataFrame.
# NOTE(review): this needs a live Spark session — the recorded run of this
# cell failed with ConnectionRefusedError; re-run the session cell first.
data_df.explain(extended=True)

ConnectionRefusedError: [Errno 61] Connection refused