#### Consider an input text file holding a single pipe-delimited row, as shown below. How would you "apply a line break at every 5th occurrence of the pipe delimiter" and display the result as shown below?
input text:
Name|Branch|Per|Tech|phone|Naresh|B.Tech|65%|Bigdata,Devops|9980528846|Nikky|M.Tech|95%|FullStackDev|+890916|Milky|BE|75%|.net|98909543|Miraj|Degree|55%|RDO|989091689|Madhu|NTech|45%|FrontendDev|99990916|

In [32]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName('demo')
    .getOrCreate()
)

# Load the input file with no header/schema options; the whole pipe-delimited
# row arrives as one string column that Spark names `_c0`.
# NOTE(review): read.csv separates on "," by default, so a value such as
# "Bigdata,Devops" would spill into a second column unless the field is quoted
# in the file -- confirm against the actual contents of branch.csv.
df = spark.read.csv("branch.csv")

# Inspect what was loaded: schema first, then the untruncated row.
df.printSchema()
df.show(truncate=False)

root
 |-- _c0: string (nullable = true)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0                                                                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Naresh|B.Tech|65%|Bigdata,Devops|9980528846|Nikky|M.Tech|95%|FullStackDev|+890916|Milky|BE|75%|.net|98909543|Miraj|Degree|55%|RDO|989091689|Madhu|NTech|45%|FrontendDev|99990916|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [33]:
from pyspark.sql.functions import regexp_replace

# Tag every record boundary: the pattern matches five consecutive
# "<anything, as short as possible> followed by a pipe" groups, and the
# replacement re-emits the whole match ($0) with an "=" appended. The result
# is an "=" right after every 5th pipe, which a later step can split on.
df1 = df.withColumn(
    "col",
    regexp_replace("_c0", "(.*?\\|){5}", "$0="),
)

# Show only the marked-up column, untruncated.
df1.select("col").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|col                                                                                                                                                                                 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Naresh|B.Tech|65%|Bigdata,Devops|9980528846|=Nikky|M.Tech|95%|FullStackDev|+890916|=Milky|BE|75%|.net|98909543|=Miraj|Degree|55%|RDO|989091689|=Madhu|NTech|45%|FrontendDev|99990916|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [34]:
from pyspark.sql.functions import split, explode

# Break the marked-up string into one row per record.
#   * split() cuts at every "|=" marker (the 5th pipe plus the "=" tag), so the
#     pipe that closed each record is consumed together with the marker.
#   * explode() turns the resulting array into one row per element.
df2 = (
    df1
    # Raw string: the regex needs a literal backslash before the pipe, and
    # '\|' inside a normal string literal is an invalid escape sequence that
    # triggers a warning on current Python versions.
    .withColumn('col_explode', explode(split('col', r'\|=')))
    .select('col_explode')
    # If the input line ends with the pipe that closes the last record, the
    # marked string ends in "|=" and split() returns a trailing empty string.
    # Drop it so no blank record reaches the later five-column conversion.
    .filter("col_explode != ''")
)
df2.show(truncate=False)

+-------------------------------------------+
|col_explode                                |
+-------------------------------------------+
|Naresh|B.Tech|65%|Bigdata,Devops|9980528846|
|Nikky|M.Tech|95%|FullStackDev|+890916      |
|Milky|BE|75%|.net|98909543                 |
|Miraj|Degree|55%|RDO|989091689             |
|Madhu|NTech|45%|FrontendDev|99990916       |
+-------------------------------------------+



In [35]:
# Drop down to the RDD API and split each record string on the pipe delimiter,
# yielding one list of field values per record.
rdd_df = (
    df2.select('col_explode')
    .rdd
    .map(lambda row: row['col_explode'].split("|"))
)

# Convert the lists back to a DataFrame, naming the five fields, and display it.
rdd_df.toDF(["Name","Branch","Per","Tech","phone"]).show()

+------+------+---+--------------+----------+
|  Name|Branch|Per|          Tech|     phone|
+------+------+---+--------------+----------+
|Naresh|B.Tech|65%|Bigdata,Devops|9980528846|
| Nikky|M.Tech|95%|  FullStackDev|   +890916|
| Milky|    BE|75%|          .net|  98909543|
| Miraj|Degree|55%|           RDO| 989091689|
| Madhu| NTech|45%|   FrontendDev|  99990916|
+------+------+---+--------------+----------+

