Download the Kaggle dataset and store it in this repo

In [1]:
import kagglehub
import shutil
import os

# Fetch the latest version of the dataset; kagglehub returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("jinquan/cc-sample-data")
print("Path to dataset files:", path)

# Destination inside this repository for a copy of the dataset files.
target_path = "./datasets/cc-sample-data"
os.makedirs(target_path, exist_ok=True)

# Copy only regular files from the top level of the download directory;
# sub-directories are skipped.
candidate_paths = (os.path.join(path, entry) for entry in os.listdir(path))
for source_file in filter(os.path.isfile, candidate_paths):
    shutil.copy(source_file, target_path)

print("Files moved to:", target_path)

Path to dataset files: C:\Users\MSI\.cache\kagglehub\datasets\jinquan\cc-sample-data\versions\1
Files moved to: ./datasets/cc-sample-data


Setting Up Spark

In [2]:
import pyspark
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession

# Configure a session named "CreditCardFraud" that runs against the local
# master, then obtain it (getOrCreate hands back an existing session if any).
session_builder = SparkSession.builder.appName("CreditCardFraud").master("local[*]")
spark = session_builder.getOrCreate()


Dynamically locate the downloaded JSON file and store its path in a variable

In [3]:
# Collect the JSON files in the dataset directory. os.listdir() returns entries
# in arbitrary order, so the matches are sorted to make the choice of the
# "first" file deterministic across runs and platforms.
json_files = sorted(f for f in os.listdir(target_path) if f.endswith(".json"))
if not json_files:
    raise FileNotFoundError("No JSON file found in downloaded dataset")

json_path = os.path.join(target_path, json_files[0])  # Alphabetically first JSON file
print("Reading JSON from:", json_path)

Reading JSON from: ./datasets/cc-sample-data\cc_sample_transaction.json


Read the JSON file using Spark

In [4]:
from pyspark.sql.functions import col

# Load the JSON file at json_path into a Spark DataFrame, then inspect the
# inferred schema and the first five rows.
df_raw = spark.read.json(path=json_path)
df_raw.printSchema()
df_raw.show(n=5)


root
 |-- Unnamed: 0: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- category: string (nullable = true)
 |-- cc_bic: string (nullable = true)
 |-- cc_num: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- merch_eff_time: string (nullable = true)
 |-- merch_last_update_time: string (nullable = true)
 |-- merch_lat: string (nullable = true)
 |-- merch_long: string (nullable = true)
 |-- merch_zipcode: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- personal_detail: string (nullable = true)
 |-- trans_date_trans_time: string (nullable = true)
 |-- trans_num: string (nullable = true)

+----------+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+--------------------+--------------------+---------------------+--------------------+
|Unnamed: 0|   amt|     category|     cc_bic|          cc_num|is_fraud|  merch_eff_time|merch_last_update_time

In [9]:
from pyspark.sql.functions import col

# (column name, target type) pairs in output order; None keeps the column
# exactly as it was read (no cast applied).
column_types = [
    ("Unnamed: 0", "int"),
    ("amt", "double"),
    ("category", None),
    ("cc_bic", None),
    ("cc_num", "long"),
    ("is_fraud", "int"),
    ("merch_eff_time", "bigint"),
    ("merch_last_update_time", "bigint"),
    ("merch_lat", "double"),
    ("merch_long", "double"),
    ("merch_zipcode", None),
    ("merchant", None),
    ("personal_detail", None),
    ("trans_date_trans_time", None),
    ("trans_num", None),
]

# Project the raw frame onto the typed columns listed above.
df = df_raw.select(
    *[
        col(name) if target_type is None else col(name).cast(target_type)
        for name, target_type in column_types
    ]
)


In [14]:
# df_raw.select("personal_detail").show(truncate=False)
# After viewing a sample of the dataset, I need to redefine my schema to flatten the JSON columns: personal_detail and its nested address.

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType

# Schema of the address object nested inside personal_detail.
address_schema = (
    StructType()
    .add("street", StringType())
    .add("city", StringType())
    .add("state", StringType())
    .add("zip", StringType())
)

# Schema of the personal_detail JSON string. "address" is kept as a string
# here because it is itself stringified JSON; it is parsed separately below.
personal_schema = (
    StructType()
    .add("person_name", StringType())
    .add("dob", StringType())
    .add("gender", StringType())
    .add("job", StringType())
    .add("address", StringType())
    .add("lat", StringType())
    .add("long", StringType())
    .add("city_pop", StringType())
)

# Fields promoted to top-level columns, in output order.
personal_fields = ["person_name", "dob", "gender", "job", "lat", "long", "city_pop"]
address_fields = ["street", "city", "state", "zip"]

# Guard so this cell can be re-run safely: once df has been flattened,
# personal_detail no longer exists and there is nothing left to parse.
if "personal_detail" in df.columns:
    # First parse the personal_detail JSON, then the stringified address inside it.
    personal_json = from_json(col("personal_detail"), personal_schema)
    address_json = from_json(personal_json["address"], address_schema)

    # Build the flattened frame with a single projection instead of calling
    # withColumn once per field (each call adds another projection to the plan).
    # Resulting order: existing columns (minus personal_detail), personal
    # fields, then address fields.
    df = df.select(
        *[col(name) for name in df.columns if name != "personal_detail"],
        *[personal_json[name].alias(name) for name in personal_fields],
        *[address_json[name].alias(name) for name in address_fields],
    )


Successfully flattened the DataFrame (yay!)

In [15]:
# Preview the flattened DataFrame using show()'s default settings, spelled out:
# up to 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+----------+------+-------------+-----------+-------------------+--------+----------------+----------------------+------------------+------------------+-------------+--------------------+---------------------+--------------------+--------------------+----------+------+--------------------+-------+------------------+--------+--------------------+--------------------+-----+-----+
|Unnamed: 0|   amt|     category|     cc_bic|             cc_num|is_fraud|  merch_eff_time|merch_last_update_time|         merch_lat|        merch_long|merch_zipcode|            merchant|trans_date_trans_time|           trans_num|         person_name|       dob|gender|                 job|    lat|              long|city_pop|              street|                city|state|  zip|
+----------+------+-------------+-----------+-------------------+--------+----------------+----------------------+------------------+------------------+-------------+--------------------+---------------------+--------------------+--------

Let's move on to the data cleaning part

In [None]:
# Handling PII Data


Testing Grounds

In [1]:
# Sanity check: show the JAVA_HOME value visible to this kernel
# (prints None when the variable is not set).
import os

java_home = os.environ.get("JAVA_HOME")
print("JAVA_HOME:", java_home)


JAVA_HOME: C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot\
