In [None]:
pip install findspark

In [None]:
pip install requests

In [1]:
import json
import requests
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import concat, concat_ws, col, initcap, lower, substring, lit

In [2]:
spark = SparkSession.builder.master("local[*]").appName('CreditCardData').getOrCreate()

In [9]:
from pyspark.sql.types import StructType,StructField, StringType,IntegerType,BooleanType,DoubleType,LongType



In [12]:
cust_df = spark.read.json('Credit Card Dataset\cdw_sapp_custmer.json')
cust_df.printSchema() #print original schema for the Dataframe



cust_df.createOrReplaceTempView('CDW_SAPP_CUSTOMER') #create a temp view of the customer table

new_cust_df = spark.sql('SELECT CONCAT(CDW_SAPP_CUSTOMER.STREET_NAME,",",CDW_SAPP_CUSTOMER.APT_NO) as STREET_NAME_APT_NO, lower(CDW_SAPP_CUSTOMER.MIDDLE_NAME) as middle_name,  concat("(", substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 1, 3), ")",\
               substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 4, 3), "-",\
               substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 7, 4), "000") AS FORMAT_CUST_PHONE FROM CDW_SAPP_CUSTOMER ')

new_cust_df.show(100) #query concatenates the street name and apt no, lower case for middle name, and reformats the customer phone

#cust_df.show(100)
type(cust_df.dtypes)




root
 |-- APT_NO: string (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_CITY: string (nullable = true)
 |-- CUST_COUNTRY: string (nullable = true)
 |-- CUST_EMAIL: string (nullable = true)
 |-- CUST_PHONE: long (nullable = true)
 |-- CUST_STATE: string (nullable = true)
 |-- CUST_ZIP: string (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- LAST_UPDATED: string (nullable = true)
 |-- MIDDLE_NAME: string (nullable = true)
 |-- SSN: long (nullable = true)
 |-- STREET_NAME: string (nullable = true)

+--------------------+-----------+-----------------+
|  STREET_NAME_APT_NO|middle_name|FORMAT_CUST_PHONE|
+--------------------+-----------+-----------------+
|Main Street North...|         wm|    (123)781-8000|
|   Redwood Drive,829|    brendan|    (123)893-3000|
|12th Street East,683|   ezequiel|    (124)301-8000|
|Country Club Road...|      trina|    (124)321-5000|
|  Madison Street,301|        may|    (124)2

list

In [13]:
schema = StructType([
    StructField('APT_NO', StringType(), True),
    StructField('CREDIT_CARD_NO',StringType(),True),
    StructField('CUST_CITY',StringType(),True),
    StructField('CUST_COUNTRY',StringType(),True),
    StructField('CUST_EMAIL',StringType(),True),
    StructField('CUST_PHONE',StringType(),True),
    StructField('CUST_STATE',StringType(),True),
    StructField('CUST_ZIP',StringType(),True),
    StructField('FIRST_NAME', StringType(),True),
    StructField('LAST_NAME',StringType(), True),
    StructField('LAST_UPDATED', StringType(),True),
    StructField('MIDDLE_NAME', StringType(),True),
    StructField('SSN', LongType(), True),
    StructField('STREET_NAME',StringType(),True),
])

df_with_schema = spark.read.schema(schema).json('Credit Card Dataset\cdw_sapp_custmer.json')
df_with_schema.createOrReplaceTempView('CDW_SAPP_CUSTOMER') #create a temp view of the customer table

new_df_with_schema = spark.sql('SELECT *,CONCAT(CDW_SAPP_CUSTOMER.STREET_NAME,", ",CDW_SAPP_CUSTOMER.APT_NO) as Full_address, lower(CDW_SAPP_CUSTOMER.MIDDLE_NAME) as middle_name,  concat("(", substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 1, 3), ")",\
                substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 4, 3), "-",\
                substring(CDW_SAPP_CUSTOMER.CUST_PHONE, 7, 4), "000") AS FORMAT_CUST_PHONE FROM CDW_SAPP_CUSTOMER ')
new_df_with_schema.show(100) #query concatenates the street name and apt no, lower case for middle name, and reformats the customer phone

#df_with_schema.printSchema()
#df_with_schema.show(100)

+------+----------------+-----------------+-------------+--------------------+----------+----------+--------+----------+----------+--------------------+-----------+---------+-----------------+--------------------+-----------+-----------------+
|APT_NO|  CREDIT_CARD_NO|        CUST_CITY| CUST_COUNTRY|          CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME| LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME|        Full_address|middle_name|FORMAT_CUST_PHONE|
+------+----------------+-----------------+-------------+--------------------+----------+----------+--------+----------+----------+--------------------+-----------+---------+-----------------+--------------------+-----------+-----------------+
|   656|4210653310061055|          Natchez|United States| AHooper@example.com|   1237818|        MS|   39120|      Alec|    Hooper|2018-04-21T12:49:...|         Wm|123456100|Main Street North|Main Street North...|         wm|    (123)781-8000|
|   829|4210653310102868

In [6]:
cc_df = spark.read.json('Credit Card Dataset\cdw_sapp_credit.json')

cc_df.printSchema()
cc_df.createOrReplaceTempView('CDW_SAPP_CREDIT_CARD')
date = spark.sql('SELECT *, CONCAT(DAY,"-",MONTH,"-",YEAR) AS DAY_MONTH_YEAR FROM CDW_SAPP_CREDIT_CARD')
#date.show()
updated_cc_df = date.select('CREDIT_CARD_NO','DAY_MONTH_YEAR','CUST_SSN','BRANCH_CODE','TRANSACTION_TYPE','TRANSACTION_VALUE','TRANSACTION_ID')
updated_cc_df.printSchema()
updated_cc_df.show()
type(updated_cc_df)

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)

root
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- DAY_MONTH_YEAR: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- BRANCH_CODE: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)

+----------------+--------------+---------+-----------+----------------+-----------------+--------------+
|  CREDIT_CARD_NO|DAY_MONTH_YEAR| CUST_SSN|BRANCH_CODE|TRANSACTION_TYPE|TRANSACTION_VALUE|TRANSACTION_ID|
+----------------+--------------+---------+-----------+----------------+-----------------+------

pyspark.sql.dataframe.DataFrame

In [8]:

branch_df = spark.read.json('Credit Card Dataset\cdw_sapp_branch.json')
branch_df.createOrReplaceTempView('CDW_SAPP_BRANCH')


spark.sql('select *, concat("(", substring(CDW_SAPP_BRANCH.BRANCH_PHONE, 1, 3), ")",\
               substring(CDW_SAPP_BRANCH.BRANCH_PHONE, 4, 3), "-",\
               substring(CDW_SAPP_BRANCH.BRANCH_PHONE, 7, 4)) AS FORMATTED_BRANCH_PHONE,\
        CASE WHEN CDW_SAPP_BRANCH.BRANCH_ZIP IS NULL THEN "99999" ELSE CDW_SAPP_BRANCH.BRANCH_ZIP END AS BRANCH_ZIP \
        from CDW_SAPP_BRANCH').show()

branch_df.printSchema()





+-----------------+-----------+------------+------------+------------+-------------------+----------+--------------------+----------------------+----------+
|      BRANCH_CITY|BRANCH_CODE| BRANCH_NAME|BRANCH_PHONE|BRANCH_STATE|      BRANCH_STREET|BRANCH_ZIP|        LAST_UPDATED|FORMATTED_BRANCH_PHONE|BRANCH_ZIP|
+-----------------+-----------+------------+------------+------------+-------------------+----------+--------------------+----------------------+----------+
|        Lakeville|          1|Example Bank|  1234565276|          MN|       Bridle Court|     55044|2018-04-18T16:51:...|         (123)456-5276|     55044|
|          Huntley|          2|Example Bank|  1234618993|          IL|  Washington Street|     60142|2018-04-18T16:51:...|         (123)461-8993|     60142|
|SouthRichmondHill|          3|Example Bank|  1234985926|          NY|      Warren Street|     11419|2018-04-18T16:51:...|         (123)498-5926|     11419|
|       Middleburg|          4|Example Bank|  1234663064| 

In [9]:
cust_df.write.format("jdbc") \
.mode("overwrite") \
.option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
.option("dbtable", "creditcard_capstone.CDW_SAPP_CUSTOMER") \
.option("user", "root") \
.option("password", "password") \
.save()

In [22]:
branch_df.write.format("jdbc") \
.mode("append") \
.option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
.option("dbtable", "creditcard_capstone.CDW_SAPP_BRANCH") \
.option("user", "root") \
.option("password", "password") \
.save()

In [10]:
cc_df.write.format("jdbc") \
.mode("overwrite") \
.option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
.option("dbtable", "creditcard_capstone.CDW_SAPP_CREDIT_CARD") \
.option("user", "root") \
.option("password", "password") \
.save()

In [56]:
spark.stop()