# Extraction

In [1]:
import json

import requests
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
import mysql.connector as msql
from mysql.connector import Error

# This file contains our login information for the MySQL server
import credentials as C

In [9]:
# Raw-file URL of the loan dataset
url = 'https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json'

# GitHub-API style Accept header; the request below returned 200 with it
headers = {'Accept': 'application/vnd.github.v3+json'}

# Bounded timeout so a stalled download cannot hang the notebook
r = requests.get(url, headers=headers, timeout=30)

# print a status update for the requests command
print(f"Status code: {r.status_code}")

# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page
r.raise_for_status()

# store the decoded API response
api_results = r.json()

spark = SparkSession.builder.appName("Loan Data Load").getOrCreate()

# spark.read.json expects an RDD of JSON *strings*. Passing the parsed Python
# objects only works while their repr happens to look like JSON (it breaks on
# None/True/False), so serialize every record back to real JSON text first.
records = api_results if isinstance(api_results, list) else [api_results]
json_rdd = spark.sparkContext.parallelize([json.dumps(record) for record in records])

# Let Spark infer the schema from the JSON records
loan_data_df = spark.read.json(json_rdd)

print("Dataframe succefully created")

Status code: 200
Dataframe succefully created


In [10]:
# Inspect the inferred column types, then preview the first ten records
loan_data_df.printSchema()
loan_data_df.show(n=10)

root
 |-- Application_ID: string (nullable = true)
 |-- Application_Status: string (nullable = true)
 |-- Credit_History: long (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Self_Employed: string (nullable = true)

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    G

Let's see what the unique values are in each of the DataFrame columns.

In [10]:
# Application_ID is unique on every row, so listing its distinct values adds nothing
columns_to_profile = [name for name in loan_data_df.columns if name != 'Application_ID']

for name in columns_to_profile:
    # One Spark job per column; the distinct sets are small enough to collect
    distinct_rows = loan_data_df.select(name).distinct().collect()
    print(f"Column: {name}")
    for distinct_row in distinct_rows:
        print(distinct_row[name])
    # Blank line between columns
    print()

Unique values in column 'Application_Status':
Y
N

Unique values in column 'Credit_History':
0
1

Unique values in column 'Dependents':
0
1
3+
2

Unique values in column 'Education':
Not Graduate
Graduate

Unique values in column 'Gender':
Female
Male

Unique values in column 'Income':
low
high
medium

Unique values in column 'Married':
No
Yes

Unique values in column 'Property_Area':
Urban
Semiurban
Rural

Unique values in column 'Self_Employed':
No
Yes



Let's write the SQL statement that creates the table.

In [3]:
# (column name, SQL type) pairs in the order the table declares them
_LOAN_TABLE_COLUMNS = (
    ("ID", "CHAR(8) NOT NULL"),
    ("Gender", "VARCHAR(6) NULL"),
    ("Married", "VARCHAR(3) NULL"),
    ("Dependents", "VARCHAR(2) NULL"),
    ("Education", "VARCHAR(12) NULL"),
    ("Self_Employed", "VARCHAR(3) NULL"),
    ("Credit_History", "VARCHAR(1) NULL"),
    ("Property_Area", "VARCHAR(10) NULL"),
    ("Income", "VARCHAR(8) NULL"),
    ("Application_Status", "VARCHAR(1) NULL"),
)

# CREATE TABLE statement for the loan application table, keyed on ID
loan_table = (
    "CREATE TABLE IF NOT EXISTS `cdw_sapp_loan_application` ("
    + "".join(f"  `{col_name}` {col_type}," for col_name, col_type in _LOAN_TABLE_COLUMNS)
    + "  PRIMARY KEY (`ID`))"
    + "ENGINE = InnoDB  "
)

In [5]:
# Connect to MySQL
conn = msql.connect(host = C.host_name, database = 'creditcard_capstone',
                        user = C.user_name, password = C.password)

# Create the table; always release the cursor and the connection, even when a
# statement fails, so re-running this cell does not pile up open connections.
try:
    cursor = conn.cursor()
    try:
        # Drop any previous copy so the load below starts from an empty table
        cursor.execute('DROP TABLE IF EXISTS cdw_sapp_loan_application;')
        cursor.execute(loan_table)
    finally:
        cursor.close()
finally:
    conn.close()

In [None]:
# Connection settings used by every insert task
mysql_config = {
    'host': C.host_name,
    'database': 'creditcard_capstone',
    'user': C.user_name,
    'password': C.password
}

# Create a Spark session (returns the already-running session if there is one)
spark = SparkSession.builder.getOrCreate()

# Positional insert: the values must follow the table's column order
INSERT_QUERY = "INSERT INTO cdw_sapp_loan_application VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"


def _row_values(row):
    """Return the row's fields as a tuple ordered like the target table's columns."""
    return (row.Application_ID, row.Gender, row.Married,
            row.Dependents, row.Education, row.Self_Employed,
            row.Credit_History, row.Property_Area, row.Income,
            row.Application_Status)


def insert_partition(rows):
    """Insert all rows of one partition using a single connection and one commit.

    Parameters
    ----------
    rows : iterable of Row
        Rows exposing the ten loan attributes read by ``_row_values``.

    The cursor and the connection are closed even when the insert raises;
    in that case ``commit`` is never reached for this batch.
    """
    # Materialize first: the partition iterator can only be consumed once, and
    # an empty partition should not open a connection at all.
    batch = [_row_values(row) for row in rows]
    if not batch:
        return

    cnx = msql.connect(**mysql_config)
    try:
        cursor = cnx.cursor()
        try:
            cursor.executemany(INSERT_QUERY, batch)
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()


# Define a function to insert rows into MySQL
def insert_row(row):
    """Insert a single row into MySQL (kept for callers that insert row by row)."""
    insert_partition([row])


# Write the DataFrame partition by partition: one connection per partition
# instead of one connection (and one commit) per row.
loan_data_df.foreachPartition(insert_partition)

Let's connect to the database.

In [None]:
# Open a connection to the capstone schema and report which database is active.
# Any connector error (while connecting or while querying) is printed, not raised.
try:
    conn = msql.connect(
        host=C.host_name,
        user=C.user_name,
        password=C.password,
        database='creditcard_capstone',
    )
    if conn.is_connected():
        cursor = conn.cursor()
        # Ask the server for the currently selected database
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("You're connected to database: ", record)
except Error as err:
    print('Error while connecting to MySQL', err)

You're connected to database:  ('creditcard_capstone',)
