In [16]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

def read_custom_text_file(filepath):
    """Load a pipe-delimited ASCII table from a text file into a DataFrame.

    The expected layout is the one produced by ``DataFrame.show()``:
    border lines starting with ``+`` and content lines whose cells are
    wrapped in ``|`` characters, e.g. ``|  1| one,two|  one|``.

    Processing steps:
      * border lines (first non-blank character is ``+``) are skipped;
      * lines whose cells are all blank are skipped;
      * the text before the first ``|`` and after the last ``|`` is dropped,
        and every remaining cell is whitespace-stripped;
      * the first remaining row supplies the column names;
      * every row equal to that header row is removed from the data, so a
        header repeated further down the file is dropped as well.

    Args:
        filepath: Path of the text file, passed to ``SparkContext.textFile``.

    Returns:
        A DataFrame whose columns are named after the header row and whose
        rows are built from the stripped string cells.

    Raises:
        ValueError: If the file contains no usable table rows.
    """
    # Create or get SparkSession
    spark = SparkSession.builder.appName("customTextFileReader").getOrCreate()

    def is_content_line(line):
        # Strip first so indented borders and whitespace-only lines are
        # recognised instead of leaking through as bogus rows.
        stripped = line.strip()
        if not stripped or stripped.startswith('+'):
            return False
        # Keep the line only if at least one cell has visible content.
        return any(cell.strip() for cell in stripped.split('|'))

    def parse_cells(line):
        # [1:-1] discards the fragments outside the outer pipes.
        return [cell.strip() for cell in line.split('|')[1:-1]]

    # Read the file as an RDD of lines and keep only parsed content rows
    rdd_clean = spark.sparkContext.textFile(filepath).filter(is_content_line).map(parse_cells)

    # take(1) returns an empty list on an empty RDD, which lets us raise an
    # error that names the file instead of a bare "RDD is empty".
    first_rows = rdd_clean.take(1)
    if not first_rows:
        raise ValueError(f"No table rows found in {filepath!r}")
    headers = first_rows[0]

    # Remove the header row (and any repeat of it) from the data
    rdd_data = rdd_clean.filter(lambda row: row != headers)

    # Convert RDD to DataFrame using headers for column names
    return spark.createDataFrame(rdd_data, schema=headers)

# Test: parse the sample table file with read_custom_text_file and display
# the resulting DataFrame.
filepath = "test_sample2.txt"
df = read_custom_text_file(filepath)
df.show()

+----+------------------+------+
|idv2|            Col1v2|Col2v2|
+----+------------------+------+
|   1|     one,two,three|   one|
|   2|     four,one,five|   six|
|   3|seven,nine,one,two| eight|
|   4|    two,three,five|  five|
|   5|      six,five,one| seven|
+----+------------------+------+

