In [5]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
# Obtain the SparkSession used by every later cell, registered under the
# application name "FileValidation".
spark = (
    SparkSession.builder
    .appName("FileValidation")
    .getOrCreate()
)

In [6]:
# Typed layouts for the three record kinds, built one field at a time.
# Every field is declared nullable.
# NOTE(review): none of these three schemas is referenced by the cells
# visible in this notebook — confirm whether they are still needed.

# Header record layout.
schema_header = (
    StructType()
    .add("RecordType", StringType(), True)
    .add("FileDate", StringType(), True)
    .add("FileVersion", StringType(), True)
)

# Trailer record layout: record count plus control totals.
schema_trailer = (
    StructType()
    .add("RecordType", StringType(), True)
    .add("TotalRecords", IntegerType(), True)
    .add("TotalAge", IntegerType(), True)
    .add("TotalMales", IntegerType(), True)
    .add("TotalFemales", IntegerType(), True)
    .add("TotalSalary", IntegerType(), True)
)

# Data (detail) record layout.
schema_data = (
    StructType()
    .add("RecordType", StringType(), True)
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Gender", StringType(), True)
    .add("Salary", IntegerType(), True)
)

In [11]:
# Read the input file one line per row, then split each line on commas into
# a single array column named "fields" (the raw "value" column is dropped).
df = spark.read.text("data/input/data_with_ht.csv").select(
    F.split(F.col("value"), ",").alias("fields")
)

df.show(truncate=False)

+-------------------------------+
|fields                         |
+-------------------------------+
|[H, data_with_ht.csv, 20260130]|
|[D, Rahul, 35, M, 50000]       |
|[D, Anita, 28, F, 60000]       |
|[D, Sunil, 40, F, 70000]       |
|[D, Preeti, 32, F, 75000]      |
|[D, Ajay, 46, M, 80000]        |
|[T, 5, 180, 3, 2, 325000]      |
+-------------------------------+



In [29]:
# Column layout per record type. The key is the record-type code found in the
# first field of a line; the value is an ordered list of (column name, type)
# pairs, where the pair at position i describes fields[i].
schema_map = {
    # Header record
    "H": [("rec_type", StringType()), ("file_name", StringType()), ("date", StringType())],
    # Data (detail) record
    "D": [
        ("rec_type", StringType()), ("name", StringType()), ("age", IntegerType()),
        ("gender", StringType()), ("salary", DoubleType()),
    ],
    # Trailer record
    "T": [
        ("rec_type", StringType()), ("total_records", IntegerType()),
        ("total_age", IntegerType()), ("total_males", IntegerType()),
        ("total_females", IntegerType()), ("total_salary", DoubleType()),
    ],
}

In [31]:
def parse_by_type(df, rec_type, schema_def):
    """Keep the rows of one record type and project them into typed columns.

    Args:
        df: DataFrame holding an array column named "fields".
        rec_type: Value compared against fields[0] to select rows.
        schema_def: Ordered (column name, data type) pairs; the pair at
            position i is applied to fields[i].

    Returns:
        A DataFrame with one column per schema_def entry, each cast to its
        data type and aliased to its column name.
    """
    fields = F.col("fields")
    matching_rows = df.filter(fields[0] == rec_type)

    select_expr = [
        fields[position].cast(data_type).alias(column_name)
        for position, (column_name, data_type) in enumerate(schema_def)
    ]

    # Echo the generated projection so the casts can be inspected in the cell output.
    print(select_expr)
    return matching_rows.select(*select_expr)
    


In [32]:
# Parse the header, data and trailer records in that order, each with the
# layout registered for its record-type code in schema_map.
header_df, data_df, trailer_df = (
    parse_by_type(df, code, schema_map[code]) for code in ("H", "D", "T")
)

[Column<'CAST(fields[0] AS STRING) AS rec_type'>, Column<'CAST(fields[1] AS STRING) AS file_name'>, Column<'CAST(fields[2] AS STRING) AS date'>]
[Column<'CAST(fields[0] AS STRING) AS rec_type'>, Column<'CAST(fields[1] AS STRING) AS name'>, Column<'CAST(fields[2] AS INT) AS age'>, Column<'CAST(fields[3] AS STRING) AS gender'>, Column<'CAST(fields[4] AS DOUBLE) AS salary'>]
[Column<'CAST(fields[0] AS STRING) AS rec_type'>, Column<'CAST(fields[1] AS INT) AS total_records'>, Column<'CAST(fields[2] AS INT) AS total_age'>, Column<'CAST(fields[3] AS INT) AS total_males'>, Column<'CAST(fields[4] AS INT) AS total_females'>, Column<'CAST(fields[5] AS DOUBLE) AS total_salary'>]


In [20]:
# Display the parsed header record without truncating column values.
header_df.show(truncate=False)

# Display the parsed trailer record without truncating column values.
trailer_df.show(truncate=False)

+--------+----------------+--------+
|rec_type|file_name       |date    |
+--------+----------------+--------+
|H       |data_with_ht.csv|20260130|
+--------+----------------+--------+

+--------+-------------+---------+-----------+-------------+------------+
|rec_type|total_records|total_age|total_males|total_females|total_salary|
+--------+-------------+---------+-----------+-------------+------------+
|T       |5            |180      |3          |2            |325000.0    |
+--------+-------------+---------+-----------+-------------+------------+



In [28]:
# Display the parsed data ("D") records without truncating column values.
data_df.show(truncate=False)

+--------+------+---+------+-------+
|rec_type|name  |age|gender|salary |
+--------+------+---+------+-------+
|D       |Rahul |35 |M     |50000.0|
|D       |Anita |28 |F     |60000.0|
|D       |Sunil |40 |F     |70000.0|
|D       |Preeti|32 |F     |75000.0|
|D       |Ajay  |46 |M     |80000.0|
+--------+------+---+------+-------+



In [42]:
def df_to_str_list(df, null_value=""):
    """Render every row of a DataFrame as one comma-joined string.

    Null column values are written as ``null_value`` (an empty field by
    default) instead of the literal text "None" that ``str(None)`` would
    produce, so a failed cast or missing field does not leak Python's
    ``None`` spelling into the output file.

    Args:
        df: Object exposing ``.rdd.map(...).collect()`` (a Spark DataFrame).
        null_value: Text emitted for a column whose value is ``None``.
            Pass ``"None"`` to get the previous behaviour.

    Returns:
        A list with one string per row, in the order ``collect()`` returns
        them. Values are joined with "," and are not quoted or escaped.
    """
    def _format_row(row):
        return ",".join(null_value if x is None else str(x) for x in row)

    return df.rdd.map(_format_row).collect()

# Flatten the three frames into one list of text lines: header first, then
# the data rows, then the trailer.
lines = [
    text
    for frame in (header_df, data_df, trailer_df)
    for text in df_to_str_list(frame)
]

# Write one line per record, each terminated by a newline.
with open("data/output/employee_file.txt", "w") as f:
    f.writelines(text + "\n" for text in lines)

In [41]:
# Inspect the assembled output lines (header, data rows, trailer).
print(lines)

['H,data_with_ht.csv,20260130', 'D,Rahul,35,M,50000.0', 'D,Anita,28,F,60000.0', 'D,Sunil,40,F,70000.0', 'D,Preeti,32,F,75000.0', 'D,Ajay,46,M,80000.0', 'T,5,180,3,2,325000.0']


In [39]:
# Write header_df as CSV without a column-name row to data/output/employee.csv,
# overwriting whatever already exists at that path.
# NOTE(review): only header_df is written here; data_df and trailer_df are not
# included — confirm this is intended and not a leftover experiment.
header_df.write.mode("overwrite").csv("data/output/employee.csv", header=False)