1.1 

For the “Credit Card System” dataset, create a Python and PySpark SQL program that reads/extracts the following JSON files (cdw_sapp_branch.json, cdw_sapp_credit.json, and cdw_sapp_customer.json) according to the specifications in the mapping document.

In [20]:
import os

import pyspark
from pyspark.sql import SparkSession

# Folder that holds the three source JSON files. The default is the original
# local location; set the CREDIT_CARD_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "CREDIT_CARD_DATA_DIR",
    r"C:\Users\malik.alston\Desktop\Data\Credit Card Dataset Overview",
)

spark = SparkSession.builder.appName('Credit Card Data Loader').getOrCreate()


def read_json_dataset(file_name):
    """Read one JSON file from DATA_DIR into a Spark DataFrame.

    Args:
        file_name: Name of the JSON file inside DATA_DIR.

    Returns:
        The DataFrame produced by spark.read.json with the "multiline"
        option enabled (a single JSON record may span several lines).
    """
    return spark.read.option("multiline", "true").json(os.path.join(DATA_DIR, file_name))


df_branch = read_json_dataset("cdw_sapp_branch.json")
df_cc = read_json_dataset("cdw_sapp_credit.json")
df_customer = read_json_dataset("cdw_sapp_customer.json")

# Preview the customer data; df_branch.show() / df_cc.show() work the same way.
df_customer.show()


+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|          CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME|
+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|   656|4210653310061055|     Natchez|United States| AHooper@example.com|   1237818|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         Wm|123456100|Main Street North|
|   829|4210653310102868|Wethersfield|United States| EHolman@example.com|   1238933|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    Brendan|123453023|    Redwood Drive|
|   683|4210653310116272|     Huntley|United States| WDunham@exam

Transform Data

In [25]:
from pyspark.sql.functions import initcap, lower, concat_ws

# Normalize customer name casing and add a combined address column.
#   FIRST_NAME / LAST_NAME -> initcap
#   MIDDLE_NAME            -> lower case
#   ADDRESS                -> STREET_NAME and APT_NO joined with ","
for name_column in ("FIRST_NAME", "LAST_NAME"):
    df_customer = df_customer.withColumn(name_column, initcap(name_column))

df_customer = (
    df_customer
    .withColumn("MIDDLE_NAME", lower("MIDDLE_NAME"))
    .withColumn("ADDRESS", concat_ws(",", "STREET_NAME", "APT_NO"))
)

df_customer.show()

+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+--------------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|          CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME|             ADDRESS|
+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+--------------------+
|   656|4210653310061055|     Natchez|United States| AHooper@example.com|   1237818|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         wm|123456100|Main Street North|Main Street North...|
|   829|4210653310102868|Wethersfield|United States| EHolman@example.com|   1238933|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    brendan|123453023|

In [None]:
from pyspark.sql.functions import format_string, col, lpad, lit

# Cast the phone number to text and left-pad it with "0" to a width of 7.
phone_as_text = col("CUST_PHONE").cast("string")
df_customer = df_customer.withColumn("CUST_PHONE_STR", lpad(phone_as_text, 7, "0"))

# Assemble "(XXX)ddd-dddd": the literal "XXX" stands in for the area code,
# followed by characters 1-3 and then characters 4-7 of the padded number.
padded_phone = col("CUST_PHONE_STR")
area_code_placeholder = lit("XXX")
first_three_digits = padded_phone.substr(1, 3)
last_four_digits = padded_phone.substr(4, 4)

df_customer = df_customer.withColumn(
    "PHONE_FORMATTED",
    format_string("(%s)%s-%s", area_code_placeholder, first_three_digits, last_four_digits),
)

# Show the raw and formatted values side by side without truncating them.
df_customer.select("CUST_PHONE", "PHONE_FORMATTED").show(truncate=False)
