Extract Data 1.1 

For the “Credit Card System,” create a Python and PySpark SQL program that reads/extracts the following JSON files according to the specifications found in the mapping document.

In [None]:
# Import necessary standard-library and PySpark modules
import os
from pathlib import Path

import pyspark
from pyspark.sql import SparkSession

# Directory holding the three source JSON files. Override it with the
# CREDIT_CARD_DATA_DIR environment variable; the default is the original
# author's local folder, so existing behavior is unchanged when it is unset.
DATA_DIR = Path(
    os.environ.get(
        "CREDIT_CARD_DATA_DIR",
        r"C:\Users\malik.alston\Desktop\Data\Credit Card Dataset Overview",
    )
)

# Clear the native-library path handed to Hadoop; paired with the
# 'spark.hadoop.disable.native.lib' setting on the session below.
os.environ["HADOOP_OPTS"] = "-Djava.library.path="

# Stop a session left over from a previous run of this cell, if there is one.
# On a fresh kernel 'spark' does not exist yet, so an unconditional
# spark.stop() would raise NameError and break Restart & Run All.
_previous_session = globals().get("spark")
if _previous_session is not None:
    _previous_session.stop()

# Create or retrieve a SparkSession, the entry point for DataFrame and SQL
# functionality in PySpark. 'appName' names the application, which helps when
# monitoring jobs in the Spark UI.
spark = (
    SparkSession.builder
    .appName('Credit Card Data Loader')
    .config("spark.hadoop.disable.native.lib", "true")
    .getOrCreate()
)

# Load the branch, credit-card and customer JSON files into DataFrames.
# 'option("multiline", "true")' tells Spark to handle JSON entries that span
# multiple lines, which is needed when records are pretty-printed.
df_branch = spark.read.option("multiline", "true").json(str(DATA_DIR / "cdw_sapp_branch.json"))
df_cc = spark.read.option("multiline", "true").json(str(DATA_DIR / "cdw_sapp_credit.json"))
df_customer = spark.read.option("multiline", "true").json(str(DATA_DIR / "cdw_sapp_customer.json"))

# Uncomment to visually inspect each DataFrame; show() prints the top 20 rows
# by default in a tabular format.
# df_customer.show()
# df_branch.show()
# df_cc.show()

Transform Data

In [3]:
from pyspark.sql.functions import initcap, lower, concat_ws, format_string, col, lpad, lit, concat, when
# Helpers used below:
#   initcap()  capitalize the first letter of each word    lower()          convert to lowercase
#   concat_ws() concatenate with a separator               format_string()  printf-style formatting
#   col()      reference a column                          lpad()           left-pad a string
#   lit()      literal/constant column                     concat()         concatenate without separator
#   when()     conditional logic


def _masked_phone(column_name):
    """Build the "(XXX)ddd-dddd" expression from the named phone column.

    Characters 1-3 and 4-7 of the source value are kept; the area-code slot is
    the literal text "(XXX)".
    NOTE(review): only the first seven characters are used, so a ten-digit
    source value would lose its last three digits - confirm against the data.
    """
    source = col(column_name)
    return concat(lit("(XXX)"), source.substr(1, 3), lit("-"), source.substr(4, 4))


# NOTE(review): each DataFrame is overwritten under its own name, so running
# this cell a second time applies the phone mask to already-masked values.

# Customer: title-case first/last name, lower-case middle name, build ADDRESS
# as "STREET_NAME,APT_NO", and mask the phone number.
df_customer = (
    df_customer
    .withColumn("FIRST_NAME", initcap("FIRST_NAME"))
    .withColumn("LAST_NAME", initcap("LAST_NAME"))
    .withColumn("MIDDLE_NAME", lower("MIDDLE_NAME"))
    .withColumn("ADDRESS", concat_ws(",", "STREET_NAME", "APT_NO"))
    .withColumn("CUST_PHONE", _masked_phone("CUST_PHONE"))
)

# Branch: mask the phone number and substitute "99999" for a missing ZIP.
branch_zip = col("BRANCH_ZIP")
df_branch = (
    df_branch
    .withColumn("BRANCH_PHONE", _masked_phone("BRANCH_PHONE"))
    .withColumn("BRANCH_ZIP", when(branch_zip.isNull(), lit("99999")).otherwise(branch_zip))
)

# Credit card: TIMEID is YEAR, MONTH, DAY zero-padded to 4/2/2 digits (YYYYMMDD).
date_parts = [col(part) for part in ("YEAR", "MONTH", "DAY")]
df_cc = df_cc.withColumn("TIMEID", format_string("%04d%02d%02d", *date_parts))

# Uncomment to inspect the transformed DataFrames.
#df_customer.show()
#df_branch.show()
#df_cc.show()

Load Data 1.2

In [4]:
import os
from getpass import getpass

mysql_url = "jdbc:mysql://localhost:3306/creditcard_capstone"

# Credentials are never stored in the notebook. The user comes from MYSQL_USER
# (default "root"); the password comes from MYSQL_PASSWORD or, when that is
# unset, from an interactive prompt that does not echo the input.
_mysql_password = os.environ.get("MYSQL_PASSWORD")
if _mysql_password is None:
    _mysql_password = getpass("MySQL password: ")

mysql_config = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": _mysql_password,
    "driver": "com.mysql.cj.jdbc.Driver",
}

# (DataFrame, target table) pairs, written in this order.
_load_plan = (
    (df_customer, "CDW_SAPP_CUSTOMER"),
    (df_branch, "CDW_SAPP_BRANCH"),
    (df_cc, "CDW_SAPP_CREDIT_CARD"),
)

# mode="append" preserves rows already in the target table.
# NOTE(review): if the target tables carry the PRIMARY KEY constraints from
# the notebook's CREATE TABLE statements, re-running this cell against
# already-loaded tables would presumably fail on duplicate keys - confirm.
for _frame, _table_name in _load_plan:
    _frame.write.jdbc(
        url=mysql_url,
        table=_table_name,
        mode="append",
        properties=mysql_config,
    )

print("All DataFrames transferred to MySQL.")

All DataFrames transferred to MySQL.


# Print the schema of each DataFrame, in customer / branch / credit-card order,
# so the column names and types can be compared with the table definitions.
for _frame in (df_customer, df_branch, df_cc):
    _frame.printSchema()

-- Target table for the customer DataFrame, which the notebook appends over
-- JDBC under the table name "CDW_SAPP_CUSTOMER".
-- NOTE(review): the column order (alphabetical, with the derived ADDRESS
-- column last) presumably mirrors the DataFrame schema - confirm against the
-- printSchema() output.
CREATE TABLE CDW_SAPP_CUSTOMER (
    APT_NO VARCHAR(10),
    CREDIT_CARD_NO VARCHAR(20),
    CUST_CITY VARCHAR(50),
    CUST_COUNTRY VARCHAR(50),
    CUST_EMAIL VARCHAR(100),
    -- Text, because the transform step rewrites the phone as "(XXX)ddd-dddd".
    CUST_PHONE VARCHAR(20),
    CUST_STATE VARCHAR(50),
    CUST_ZIP VARCHAR(10),
    FIRST_NAME VARCHAR(50),
    LAST_NAME VARCHAR(50),
    -- NOTE(review): stored as text rather than a date/time type; presumably
    -- the JSON carries it as a string - confirm.
    LAST_UPDATED VARCHAR(50),
    MIDDLE_NAME VARCHAR(50),
    -- Primary key: at most one row per SSN.
    SSN BIGINT PRIMARY KEY,
    STREET_NAME VARCHAR(100),
    -- Derived in the transform step as STREET_NAME and APT_NO joined by ",".
    ADDRESS VARCHAR(200) NOT NULL
);


-- Target table for the branch DataFrame, which the notebook appends over JDBC
-- under the table name "CDW_SAPP_BRANCH".
CREATE TABLE CDW_SAPP_BRANCH (
    BRANCH_CITY VARCHAR(50),
    -- Primary key: at most one row per branch code.
    BRANCH_CODE BIGINT PRIMARY KEY,
    BRANCH_NAME VARCHAR(100),
    -- Text, because the transform step rewrites the phone as "(XXX)ddd-dddd".
    BRANCH_PHONE VARCHAR(20),
    BRANCH_STATE VARCHAR(50),
    BRANCH_STREET VARCHAR(100),
    -- The transform step substitutes "99999" when the source ZIP is null.
    BRANCH_ZIP VARCHAR(10),
    -- NOTE(review): stored as text rather than a date/time type; presumably
    -- the JSON carries it as a string - confirm.
    LAST_UPDATED VARCHAR(50)
);


-- Target table for the credit-card transaction DataFrame, which the notebook
-- appends over JDBC under the table name "CDW_SAPP_CREDIT_CARD".
-- NOTE(review): no FOREIGN KEY constraints are declared; BRANCH_CODE and
-- CUST_SSN presumably refer to CDW_SAPP_BRANCH(BRANCH_CODE) and
-- CDW_SAPP_CUSTOMER(SSN) - confirm before adding any.
CREATE TABLE CDW_SAPP_CREDIT_CARD (
    BRANCH_CODE BIGINT,
    CREDIT_CARD_NO VARCHAR(20),
    CUST_SSN BIGINT,
    DAY INT,
    MONTH INT,
    -- Primary key: at most one row per transaction id.
    TRANSACTION_ID BIGINT PRIMARY KEY,
    TRANSACTION_TYPE VARCHAR(50),
    TRANSACTION_VALUE DOUBLE,
    YEAR INT,
    -- Derived in the transform step from YEAR, MONTH and DAY using the format
    -- "%04d%02d%02d" (zero-padded YYYYMMDD).
    TIMEID VARCHAR(20) NOT NULL
);
