# Extraction

In [1]:
import requests
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
import mysql.connector as msql
from mysql.connector import Error
# This file contains our login information for the MySQL server
import credentials as C

In [2]:
# store API url
url = 'https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json'

# assign the headers- not always necessary, but something we have to do with the GitHub API
headers = {'Accept': 'application/vnd.github.v3+json'}

# assign the requests method
r = requests.get(url, headers=headers)

# print a status update for the requests command
print(f"Status code: {r.status_code}")

# store API response to variable
api_results = r.json()

spark = SparkSession.builder.appName("Loan Data Load").getOrCreate()

# Convert the API response to an RDD
json_rdd = spark.sparkContext.parallelize([api_results])

# Read the RDD as a DataFrame using spark.read.json()
loan_data_df = spark.read.json(json_rdd)

print("Dataframe succefully created")

Status code: 200
Dataframe succefully created


In [8]:
loan_data_df.printSchema()
loan_data_df.show(10)

root
 |-- Application_ID: string (nullable = true)
 |-- Application_Status: string (nullable = true)
 |-- Credit_History: long (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Self_Employed: string (nullable = true)

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    G

Lets see what the unique values are in each of the dataframe columns

In [10]:
for column in loan_data_df.columns:
    # skip the first column since that column is all unique
    if column == 'Application_ID':
        continue
    unique_values = loan_data_df.select(column).distinct().collect()
    print(f"Column: {column}")
    for row in unique_values:
        print(row[column])
    print()

Unique values in column 'Application_Status':
Y
N

Unique values in column 'Credit_History':
0
1

Unique values in column 'Dependents':
0
1
3+
2

Unique values in column 'Education':
Not Graduate
Graduate

Unique values in column 'Gender':
Female
Male

Unique values in column 'Income':
low
high
medium

Unique values in column 'Married':
No
Yes

Unique values in column 'Property_Area':
Urban
Semiurban
Rural

Unique values in column 'Self_Employed':
No
Yes



Let's create the sql query to create the table.

In [34]:
jdbc_url = f"jdbc:mysql://{C.host_name}:3306/creditcard_capstone"
table_name = "cdw_sapp_loan_application"
connection_properties = {
    "user": C.user_name,
    "password": C.password,
    "driver": "com.mysql.jdbc.Driver"}

In [50]:
column_types = [
    "ID CHAR(8) NOT NULL",
    "Gender VARCHAR(6)",
    "Married VARCHAR(3)",
    "Dependents VARCHAR(2)",
    "Education VARCHAR(12)",
    "Self_Employed VARCHAR(3)",
    "Credit_History VARCHAR(1)",
    "Property_Area VARCHAR(10)",
    "Income VARCHAR(8)",
    "Application_Status VARCHAR(1)"]

In [52]:
# Define the primary key constraint separately
primary_key_constraint = "PRIMARY KEY (ID)"

# Combine the column types and primary key constraint
create_table_query = f"({', '.join(column_types)}, {primary_key_constraint})"

In [54]:
# Write the DataFrame to the MySQL database
loan_data_df.write.mode("overwrite")\
    .option("dropTableIfExists", "true")\
    .jdbc(url=jdbc_url, table=table_name, properties=connection_properties)

In [6]:
mysql_properties = {
    "url": "jdbc:mysql://{host}:{port}/{database}".format(
        host=C.host_name,
        port=C.port,
        database="creditcard_capstone"
    ),
    "driver": "com.mysql.jdbc.Driver",
    "user": C.user_name,
    "password": C.password,
    "dbtable": "cdw_sapp_loan_application"
}

try:
    # Write the PySpark DataFrame to MySQL
    loan_data_df.write \
        .mode("overwrite") \
        .format("jdbc") \
        .options(**mysql_properties) \
        .save()

    print("Loan Application data fully loaded")

except Exception as e:
    print('Error while connecting to MySQL:', e)

Loan Application data fully loaded


In [3]:
loan_table = (
"CREATE TABLE IF NOT EXISTS `cdw_sapp_loan_application` ("
"  `ID` CHAR(8) NOT NULL,"
"  `Gender` VARCHAR(6) NULL,"
"  `Married` VARCHAR(3) NULL,"
"  `Dependents` VARCHAR(2) NULL,"
"  `Education` VARCHAR(12) NULL,"
"  `Self_Employed` VARCHAR(3) NULL," 
"  `Credit_History` VARCHAR(1) NULL,"
"  `Property_Area` VARCHAR(10) NULL,"
"  `Income` VARCHAR(8) NULL,"
"  `Application_Status` VARCHAR(1) NULL,"
"  PRIMARY KEY (`ID`))"
"ENGINE = InnoDB  ")

In [4]:
# Connect to MySQL
conn = msql.connect(host = C.host_name, database = 'creditcard_capstone',
                        user = C.user_name, password = C.password)

# Create the table
cursor = conn.cursor()
cursor.execute(loan_table)

In [5]:
loan_data_df.write.jdbc(
    "jdbc:mysql://localhost:3306/creditcard_capstone", "cdw_sapp_loan_application",
    "mypassword", mode="overwrite")

TypeError: DataFrameWriter.jdbc() got multiple values for argument 'mode'

Let's connect to the database.

In [None]:
try:
    conn = msql.connect(host = C.host_name, database = 'creditcard_capstone',
                        user = C.user_name, password = C.password)
    if conn.is_connected():
        cursor = conn.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("You're connected to database: ", record)
except Error as e:
    print('Error while connecting to MySQL',e)

You're connected to database:  ('creditcard_capstone',)


In [None]:
loan_table = (
"CREATE TABLE IF NOT EXISTS `cdw_sapp_loan_application` ("
"  `ID` CHAR(8) NOT NULL,"
"  `Gender` VARCHAR(6) NULL,"
"  `Married` VARCHAR(3) NULL,"
"  `Dependents` VARCHAR(2) NULL,"
"  `Education` VARCHAR(12) NULL,"
"  `Self_Employed` VARCHAR(3) NULL," 
"  `Credit_History` VARCHAR(1) NULL,"
"  `Property_Area` VARCHAR(10) NULL,"
"  `Income` VARCHAR(8) NULL,"
"  `Application_Status` VARCHAR(1) NULL,"
"  PRIMARY KEY (`ID`))"
"ENGINE = InnoDB  ")

In [14]:
try:
    if conn.is_connected():
        #lets add the loan into the db
        cursor.execute('DROP TABLE IF EXISTS cdw_sapp_loan_application;')
        print('Creating cdw_sapp_loan_application table....')
        cursor.execute(loan_table)
        print("cdw_sapp_branch table is created....")
        #loop through the data frame
        for i,row in loan_data_df.iterrows():
            #here %S represents each object in the row, one object per column
            sql = "INSERT INTO creditcard_capstone.cdw_sapp_loan_application\
                  VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, tuple(row))
            print(f"{i+1} Loan Application Records inserted")
            # the connection is not auto committed by default so we must commit to save our changes
            conn.commit()
        print("Loan Application data fulled loaded")
except Error as e:
    print('Error while connecting to MySQL',e)

Creating cdw_sapp_loan_application table....
cdw_sapp_branch table is created....


AttributeError: 'DataFrame' object has no attribute 'iterrows'