In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active SparkSession if there is one; otherwise start a new one
# registered under the application name "JsonStreamProcessing".
spark = (
    SparkSession.builder
    .appName("JsonStreamProcessing")
    .getOrCreate()
)

In [21]:
import pyspark.sql.functions as F

In [24]:
# Name of the column that receives the raw text of any line Spark could not
# parse. Kept in one place so the schema, the reader option and the filters
# below cannot drift apart.
CORRUPT_RECORD_COL = "_corrupt_record"

# Explicit schema for the JSON-lines input. Besides the three data fields it
# declares the corrupt-record column as a nullable string; PERMISSIVE mode
# only keeps the malformed text when such a column is present in the schema.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("city", StringType(), True),
    StructField(CORRUPT_RECORD_COL, StringType(), True),
])

# JSON-lines file to load (a single file, read as a batch).
path = "/home/user/Documents/spark/interview_questions/file1.json"

# PERMISSIVE mode never fails the read: a line that does not match the schema
# yields a row whose data fields are NULL and whose corrupt-record column
# holds the original text of that line.
#
# The parsed result is cached because
#   * two separate actions (the show() calls below) would otherwise re-read
#     and re-parse the file, and
#   * Spark refuses queries on a raw JSON source that reference only the
#     corrupt-record column (for example a later `corrupted_records.count()`);
#     caching the parsed frame is the documented way around that restriction.
df = (
    spark.read
    .format("json")
    .schema(schema)
    .option("mode", "PERMISSIVE")
    .option("columnNameOfCorruptRecord", CORRUPT_RECORD_COL)
    .load(path)
    .cache()
)

# Rows whose corrupt-record column is populated are the lines that failed to parse.
corrupted_records = df.filter(F.col(CORRUPT_RECORD_COL).isNotNull())
corrupted_records.show()

# Everything else parsed cleanly; the bookkeeping column is no longer needed.
good_records = (
    df.filter(F.col(CORRUPT_RECORD_COL).isNull())
    .drop(CORRUPT_RECORD_COL)
)
good_records.show()

+----+----+----+--------------------+
|  id|name|city|     _corrupt_record|
+----+----+----+--------------------+
|NULL|NULL|NULL|{"id":test,"name"...|
|NULL|NULL|NULL|               NULL |
+----+----+----+--------------------+

+---+------------+--------+
| id|        name|    city|
+---+------------+--------+
|  1| Subhas Hati|New York|
|  2|  Rupa Rudra|New York|
|  4|Monica Rudra|New York|
+---+------------+--------+



In [16]:
# Print the parsed DataFrame (at most the default 20 rows) without shortening
# long values, so the full text held in _corrupt_record stays readable.
df.show(n=20, truncate=False)

+----+------------+--------+---------------------------------------------------------+
|id  |name        |city    |_corrupt_record                                          |
+----+------------+--------+---------------------------------------------------------+
|1   |Subhas Hati |New York|NULL                                                     |
|2   |Rupa Rudra  |New York|NULL                                                     |
|NULL|NULL        |NULL    |{"id":test,"name":"Debangshu Hati","city":"Jersey City"} |
|4   |Monica Rudra|New York|NULL                                                     |
|NULL|NULL        |NULL    |NULL                                                     |
+----+------------+--------+---------------------------------------------------------+



In [17]:
!cat file1.json

{"id":1,"name":"Subhas Hati","city":"New York"} 
{"id":2,"name":"Rupa Rudra","city":"New York"} 
{"id":test,"name":"Debangshu Hati","city":"Jersey City"} 
{"id":4,"name":"Monica Rudra","city":"New York", "extra":10} 
NULL 



