In [7]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib
import seaborn as sns
import findspark
import json
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

Using matplotlib backend: TkAgg


In [8]:
#assign url variable to .json link
url = 'https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json'

#try to get the specified url, if successful print the status code
try:
    response = requests.get(url)
    print(f'Status code: {response.status_code}')
    data = response.json()
except requests.exceptions.RequestException as e:
    print(f'Error ocurred during API request: {e}')
    
#if data does not return NoneType and response status is 200
if data is not None and response.status_code == 200:
    print('Loan data retrieved successfuly!')
    print(f'Total loan applications: {len(data)}')



Status code: 200
Loan data retrieved successfuly!
Total loan applications: 511


In [9]:
#create a Spark Session builder Obect
spark = SparkSession.builder.appName('LoanApplicationApp').getOrCreate()

In [10]:
#Read json file into loan_df variable  
loan_df = spark.read.json('loan_data.json')
loan_df.show()

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    Graduate|  Male|medium|    Yes|        Rural|           No|
|      LP001005|                 Y|             1|         0|    Graduate|  Male|   low|    Yes|        Urban|          Yes|
|      LP001006|                 Y|             1|         0|Not Graduate|  Male|   low|    Yes|        Urban|           No|
|      LP001008|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|


In [8]:
#write the content of the loan_df dataframe into the credtcard_capstone table in MySql, Creating a table name CDW_SAPP_loan_application
loan_df.write.format("jdbc") \
.mode("append") \
.option("url", "jdbc:mysql://localhost:3306/creditcard_capstone") \
.option("dbtable", "creditcard_capstone.CDW_SAPP_loan_application") \
.option("user", "root") \
.option("password", "password") \
.save()

In [13]:
# Load the loan application data into a Pandas DataFrame
loan_data = pd.read_json("loan_data.json")

#select te rows in the df where the value in self-employed is equal to yes
self_employed_df = loan_data[loan_data['Self_Employed'] == 'Yes']
self_employed_df


Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
2,LP001005,Male,Yes,0,Graduate,Yes,1,Urban,low,Y
5,LP001011,Male,Yes,2,Graduate,Yes,1,Urban,medium,Y
20,LP001066,Male,Yes,0,Graduate,Yes,1,Semiurban,medium,Y
25,LP001097,Male,No,1,Graduate,Yes,1,Rural,medium,N
36,LP001136,Male,Yes,0,Not Graduate,Yes,1,Urban,medium,Y
...,...,...,...,...,...,...,...,...,...,...
492,LP002931,Male,Yes,2,Graduate,Yes,1,Semiurban,medium,N
494,LP002938,Male,Yes,0,Graduate,Yes,1,Urban,high,Y
496,LP002941,Male,Yes,2,Not Graduate,Yes,1,Rural,medium,N
497,LP002945,Male,Yes,0,Graduate,Yes,1,Rural,medium,Y


In [14]:
#returns a series contain the count of unique values in the application status column in the self-employed df
approval_percentage = self_employed_df['Application_Status'].value_counts(normalize=True) * 100
approval_percentage

Application_Status
Y    65.714286
N    34.285714
Name: proportion, dtype: float64

In [9]:
#plot the application status in a bar chart
sns.barplot(x=approval_percentage.index, y=approval_percentage.values)
plt.xlabel('Loan Approval')
plt.ylabel('Percentage')
plt.title('Fig-5.1 Percentage of Applications Approved for Self-Employed Applications')
plt.show()

In [15]:
#Selects only married male applicants who were rejected
rejected_df = loan_data[(loan_data['Married'] == 'Yes') & (loan_data['Gender'] == 'Male') & (loan_data['Application_Status'] == 'N')]
rejected_df

Unnamed: 0,Application_ID,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Income,Application_Status
1,LP001003,Male,Yes,1,Graduate,No,1,Rural,medium,N
7,LP001014,Male,Yes,3+,Graduate,No,0,Semiurban,low,N
9,LP001020,Male,Yes,1,Graduate,No,1,Semiurban,high,N
16,LP001038,Male,Yes,0,Not Graduate,No,1,Rural,medium,N
17,LP001043,Male,Yes,0,Not Graduate,No,0,Urban,medium,N
...,...,...,...,...,...,...,...,...,...,...
487,LP002912,Male,Yes,1,Graduate,No,1,Rural,medium,N
490,LP002926,Male,Yes,2,Graduate,Yes,0,Semiurban,low,N
492,LP002931,Male,Yes,2,Graduate,Yes,1,Semiurban,medium,N
496,LP002941,Male,Yes,2,Not Graduate,Yes,1,Rural,medium,N


In [16]:
#Calculate the percentage of rejected applications among married male applicants
rejection_percentage = (rejected_df.shape[0] / loan_data[loan_data['Married'] == 'Yes'].shape[0])*100
rejection_percentage

26.283987915407852

In [12]:
#pie chart slices
labels = ['Rejected','Approved']
#creates a list of sizes for the two pie chart slices. Rejection percentage vairable = 26.28, 73.71
sizes = [rejection_percentage, 100 - rejection_percentage]

plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Fig-5.2 Percentage of Rejection for Married Applicants')
plt.axis('equal')
plt.show()

In [13]:
cc_data = pd.read_json('Credit Card Dataset\cdw_sapp_credit.json', lines=True)
cc_data

Unnamed: 0,TRANSACTION_ID,DAY,MONTH,YEAR,CREDIT_CARD_NO,CUST_SSN,BRANCH_CODE,TRANSACTION_TYPE,TRANSACTION_VALUE
0,1,14,2,2018,4210653349028689,123459988,114,Education,78.90
1,2,20,3,2018,4210653349028689,123459988,35,Entertainment,14.24
2,3,8,7,2018,4210653349028689,123459988,160,Grocery,56.70
3,4,19,4,2018,4210653349028689,123459988,114,Entertainment,59.73
4,5,10,10,2018,4210653349028689,123459988,93,Gas,3.59
...,...,...,...,...,...,...,...,...,...
46689,46690,12,9,2018,4210653344660822,123451007,49,Gas,66.20
46690,46691,5,2,2018,4210653344660822,123451007,168,Grocery,100.13
46691,46692,16,12,2018,4210653344660822,123451007,104,Test,35.83
46692,46693,15,1,2018,4210653344660822,123451007,32,Entertainment,90.99


In [14]:
#group data by month and calculate the total transaction amount for each month
monthly_data = cc_data.groupby(cc_data['MONTH'])['TRANSACTION_VALUE'].sum()
monthly_data

MONTH
1     196568.87
2     201086.67
3     196488.59
4     194203.25
5     201310.26
6     195468.74
7     201199.35
8     196453.41
9     196069.44
10    202583.89
11    200549.36
12    201251.08
Name: TRANSACTION_VALUE, dtype: float64

In [15]:
#sort monthly data in descending order ad select the top 3 months
top_three_months = monthly_data.nlargest(3)
top_three_months

MONTH
10    202583.89
5     201310.26
12    201251.08
Name: TRANSACTION_VALUE, dtype: float64

In [24]:
top_three_months.plot(kind='bar')
plt.xlabel('Month')
plt.ylabel('Total Transaction Amount')
plt.title('Fig 5.3 Top Three Months with Largest Transaction Data')
plt.xticks(rotation=0)
plt.show()

In [27]:
#Filter dataframe to nclude only healthcare transactions
healthcare_df = cc_data[cc_data['TRANSACTION_TYPE'] == 'Healthcare']
healthcare_df

Unnamed: 0,TRANSACTION_ID,DAY,MONTH,YEAR,CREDIT_CARD_NO,CUST_SSN,BRANCH_CODE,TRANSACTION_TYPE,TRANSACTION_VALUE
27,28,6,8,2018,4210653349028689,123459988,43,Healthcare,98.78
32,33,20,10,2018,4210653349028689,123459988,117,Healthcare,24.37
43,44,13,11,2018,4210653349028689,123459988,36,Healthcare,79.91
44,45,8,9,2018,4210653349028689,123459988,124,Healthcare,20.81
48,49,26,8,2018,4210653349028689,123459988,69,Healthcare,26.34
...,...,...,...,...,...,...,...,...,...
46633,46634,17,2,2018,4210653375981317,123451012,66,Healthcare,10.03
46675,46676,18,10,2018,4210653344660822,123451007,2,Healthcare,70.01
46680,46681,27,10,2018,4210653344660822,123451007,38,Healthcare,72.78
46686,46687,22,12,2018,4210653344660822,123451007,63,Healthcare,5.56


In [28]:
#group data by branch and calculate the total dollar value of heathcare trnsactons for each branch
branch_total = healthcare_df.groupby('BRANCH_CODE')["TRANSACTION_VALUE"].sum()
branch_total

BRANCH_CODE
1      2920.15
2      3284.31
3      3353.55
4      2436.80
5      2736.69
        ...   
175    3341.57
178    2957.41
180    2344.08
188    2869.18
192    1960.35
Name: TRANSACTION_VALUE, Length: 114, dtype: float64

In [42]:
#Find the branch with the highest total dolar value
# sorted_branches = branch_total.sort_values(ascending=False)
# highest_branch = sorted_branches.index[0]
# highest_branch

#idxmax() returns the same results as the above syntax
highest_branch = branch_total.idxmax()
highest_branch

#Finds the highest value for the highest branch
max_value = branch_total.loc[highest_branch]
max_value

#Finds the largest branch and value
highest_amount = branch_total.nlargest(1)
highest_amount

BRANCH_CODE
25    4370.18
Name: TRANSACTION_VALUE, dtype: float64

In [38]:
highest_amount.plot(kind='bar')
plt.xlabel('Branch')
plt.ylabel('Total Dollar Amount')
plt.title('Fig 5.4 Healthcare Transactons by Branch')
plt.xticks(rotation=45)
plt.show()