In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import (
    StructType, StructField,
    StringType, DoubleType, IntegerType, DateType
)


In [2]:
# Obtain the SparkSession used by the rest of the notebook; getOrCreate() returns
# the already-active session when one exists instead of starting a second one.
spark = SparkSession.builder.getOrCreate()


25/05/26 23:27:07 WARN Utils: Your hostname, Lydias-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.68.50 instead (on interface en0)
25/05/26 23:27:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/26 23:27:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/26 23:27:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Define schema for each record:

# Layout of the JSON text held in personal_detail.address: four nullable strings.
address_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("street", "city", "state", "zip")
])

# Layout of the JSON text held in the personal_detail column. Every field is read
# as a nullable string except dob, which is read as a date. The address field is
# kept as a string here because it is itself JSON text parsed with address_schema.
personal_detail_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "person_name",
            "gender",
            "address",
            "lat",
            "long",
            "city_pop",
            "job",
        )
    ]
    + [StructField("dob", DateType(), True)]
)

# Top-level layout of one transaction record. Every field is read as a nullable
# string; personal_detail holds JSON text that is parsed in a later step.
transaction_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "amt",
        "category",
        "cc_bic",
        "cc_num",
        "is_fraud",
        "merch_eff_time",
        "merch_last_update_time",
        "merch_lat",
        "merch_long",
        "merch_zipcode",
        "merchant",
        "personal_detail",
        "trans_date_trans_time",
        "trans_num",
    )
])

# Backward-compatible alias for the original, misspelled name, which the cell
# that reads the JSON file still references.
tranasction_schema = transaction_schema

def parse_json_field(df, column_name, schema, new_column_name=None):
    """
    Decode a column of JSON text into a structured column via ``from_json``.

    Args:
        df: DataFrame that holds the JSON text column.
        column_name: Name of the column whose values are JSON text.
        schema: Schema handed to ``from_json`` to interpret the JSON.
        new_column_name: Column that receives the parsed value. When omitted,
            the result is stored under ``column_name`` itself.

    Returns:
        The DataFrame returned by ``df.withColumn`` with the parsed column set.
    """
    # Fall back to the source column name only when no target was given at all.
    target_name = column_name if new_column_name is None else new_column_name
    parsed_value = from_json(col(column_name), schema)
    return df.withColumn(target_name, parsed_value)


In [4]:
# Load the transaction JSON with the explicit schema instead of inferring one.
raw = spark.read.json(
    "data_fixtures/cc_sample_transaction.json",
    schema=tranasction_schema,
)

raw.show(5)

+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+
|   amt|     category|     cc_bic|          cc_num|is_fraud|  merch_eff_time|merch_last_update_time|         merch_lat| merch_long|merch_zipcode|            merchant|     personal_detail|trans_date_trans_time|           trans_num|
+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+
|  4.97|     misc_net|CITIUS33CHI|2703186189652095|       0|1325376018798532|         1325376018666|         36.011293| -82.048315|        28705|fraud_Rippin, Kub...|{"person_name":"J...|  2019-01-01 00:00:18|0b242abb623afc578...|
|107.23|  grocery_pos|   ADMDUS41|    630423337322|       0|1325376044867960

In [5]:
# The nested JSON fields arrive as escaped strings, so they have to be parsed
# into structs before their members can be reached with dot notation.
parsed_personal_detail = parse_json_field(
    df=raw,
    column_name="personal_detail",
    schema=personal_detail_schema,
)
# The address is JSON text inside the parsed personal_detail struct; its parsed
# form is stored in a separate top-level "address" column.
parsed_address = parse_json_field(
    df=parsed_personal_detail,
    column_name="personal_detail.address",
    schema=address_schema,
    new_column_name="address",
)
parsed_address.show(5)


+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|   amt|     category|     cc_bic|          cc_num|is_fraud|  merch_eff_time|merch_last_update_time|         merch_lat| merch_long|merch_zipcode|            merchant|     personal_detail|trans_date_trans_time|           trans_num|             address|
+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|  4.97|     misc_net|CITIUS33CHI|2703186189652095|       0|1325376018798532|         1325376018666|         36.011293| -82.048315|        28705|fraud_Rippin, Kub...|{Jennifer,Banks,e...|  2019-01-01 00:00:18|0b242abb623afc578...|{561 Perry Cov

In [6]:
# Flatten the parsed structs into top-level columns. lat, long, city_pop and dob
# are cast as they are selected; every other column is passed through unchanged.
clean = parsed_address.select(
    "trans_date_trans_time",
    "cc_num",
    "merchant",
    "category",
    "amt",
    "personal_detail.person_name",
    "personal_detail.gender",
    "address.street",
    "address.city",
    "address.state",
    "address.zip",
    col("personal_detail.lat").cast(DoubleType()).alias("lat"),
    col("personal_detail.long").cast(DoubleType()).alias("long"),
    col("personal_detail.city_pop").cast(IntegerType()).alias("city_pop"),
    "personal_detail.job",
    col("personal_detail.dob").cast(DateType()).alias("dob"),
    "trans_num",
    "merch_lat",
    "merch_long",
    "is_fraud",
    "merch_zipcode",
    "merch_last_update_time",
    "merch_eff_time",
    "cc_bic",
)

clean.show(5)


+---------------------+----------------+--------------------+-------------+------+--------------------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+------------------+-----------+--------+-------------+----------------------+----------------+-----------+
|trans_date_trans_time|          cc_num|            merchant|     category|   amt|         person_name|gender|              street|          city|state|  zip|    lat|     long|city_pop|                 job|       dob|           trans_num|         merch_lat| merch_long|is_fraud|merch_zipcode|merch_last_update_time|  merch_eff_time|     cc_bic|
+---------------------+----------------+--------------------+-------------+------+--------------------+------+--------------------+--------------+-----+-----+-------+---------+--------+--------------------+----------+--------------------+------------------+-----------+--------+-------------+------------------

In [7]:
# Write as Parquet file
PARQUET_PATH = "data_fixtures/cc_sample_transaction.parquet"
clean.write.mode("overwrite").option("compression", "snappy").parquet(PARQUET_PATH)

# Count rows from the file that was just written instead of calling
# clean.count(): `clean` is not cached, so counting it would re-read and
# re-parse the whole JSON source a second time. Reading the output back also
# confirms that the written Parquet data is actually readable.
written_row_count = spark.read.parquet(PARQUET_PATH).count()

print("Successfully converted JSON to Parquet format!")
print(f"Number of rows: {written_row_count}")
print("Schema:")
clean.printSchema()

25/05/26 23:27:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/05/26 23:27:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/05/26 23:27:11 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/05/26 23:27:17 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/05/26 23:27:17 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

Successfully converted JSON to Parquet format!


[Stage 4:>                                                        (0 + 10) / 10]

Number of rows: 1296675
Schema:
root
 |-- trans_date_trans_time: string (nullable = true)
 |-- cc_num: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- person_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- merch_lat: string (nullable = true)
 |-- merch_long: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- merch_zipcode: string (nullable = true)
 |-- merch_last_update_time: string (nullable = true)
 |-- merch_eff_time: string (nullable = true)
 |-- cc_bic: string (nullable = true)



                                                                                