In [1]:
import json
import os

import findspark
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

%matplotlib

# findspark.init() has to run before the pyspark imports below.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

Using matplotlib backend: <object object at 0x00000159717E1880>


In [2]:
# Download the loan application dataset (a JSON document) from GitHub.
url = 'https://raw.githubusercontent.com/platformps/LoanDataset/main/loan_data.json'

# Start from a known state: if the request fails, the check below must see None
# instead of raising NameError or reusing a stale value from an earlier run.
data = None

try:
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    print(f'Status code: {response.status_code}')
    # Turn 4xx/5xx replies into an exception instead of parsing an error page.
    response.raise_for_status()
    data = response.json()
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a non-JSON body on requests versions whose JSON decode
    # error is not a RequestException subclass.
    print(f'Error ocurred during API request: {e}')


if data is not None:
    print('Loan data retrieved successfuly!')
    print(f'Total loan applications: {len(data)}')



Status code: 200
Loan data retrieved successfuly!
Total loan applications: 511


In [6]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('LoanApplicationApp')
    .getOrCreate()
)

In [7]:
# Read the local loan application JSON file into a Spark DataFrame and print
# its first rows as a text table.
loan_df = spark.read.format('json').load('loan_data.json')
loan_df.show()

+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|Application_ID|Application_Status|Credit_History|Dependents|   Education|Gender|Income|Married|Property_Area|Self_Employed|
+--------------+------------------+--------------+----------+------------+------+------+-------+-------------+-------------+
|      LP001002|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|
|      LP001003|                 N|             1|         1|    Graduate|  Male|medium|    Yes|        Rural|           No|
|      LP001005|                 Y|             1|         0|    Graduate|  Male|   low|    Yes|        Urban|          Yes|
|      LP001006|                 Y|             1|         0|Not Graduate|  Male|   low|    Yes|        Urban|           No|
|      LP001008|                 Y|             1|         0|    Graduate|  Male|medium|     No|        Urban|           No|


In [8]:
# Write the loan application DataFrame to MySQL over JDBC, using save mode
# "overwrite" on the target table.
# Credentials come from the environment so they are not committed with the
# notebook: MYSQL_USER falls back to "root", MYSQL_PASSWORD has no fallback.
mysql_user = os.environ.get("MYSQL_USER", "root")
mysql_password = os.environ.get("MYSQL_PASSWORD")
if mysql_password is None:
    raise RuntimeError(
        "Set the MYSQL_PASSWORD environment variable before writing to MySQL"
    )

(
    loan_df.write.format("jdbc")
    .mode("overwrite")
    .option("url", "jdbc:mysql://localhost:3306/creditcard_capstone")
    .option("dbtable", "creditcard_capstone.CDW_SAPP_loan_application")
    .option("user", mysql_user)
    .option("password", mysql_password)
    .save()
)

In [8]:
# Read the loan applications into a pandas DataFrame, then keep only the rows
# whose Self_Employed value is 'Yes'.
loan_data = pd.read_json("loan_data.json")

self_employed_df = loan_data.loc[loan_data['Self_Employed'].eq('Yes')]



In [9]:
# Share of each Application_Status value among the self-employed applicants,
# expressed as a percentage.
approval_percentage = (
    self_employed_df['Application_Status']
    .value_counts(normalize=True)
    .mul(100)
)

In [10]:
# Bar chart with one bar per Application_Status value; the bar height is that
# status's percentage among self-employed applicants.
ax = sns.barplot(x=approval_percentage.index, y=approval_percentage.values)
ax.set_xlabel('Loan Approval')
ax.set_ylabel('Percentage')
ax.set_title('Fig-5.1 Percentage of Applications Approved for Self-Employed Applications')
plt.show()

In [12]:
# Select only the married male applicants whose application was rejected
# (Application_Status == 'N').
rejected_df = loan_data.loc[
    loan_data['Married'].eq('Yes')
    & loan_data['Gender'].eq('Male')
    & loan_data['Application_Status'].eq('N')
]

In [13]:
# Percentage of rejected applications among married male applicants.
# rejected_df holds married AND male AND rejected rows, so the denominator must
# be the same population (married AND male). Dividing by all married applicants
# would mix in female applicants and understate the rate.
married_male_count = loan_data[
    (loan_data['Married'] == 'Yes') & (loan_data['Gender'] == 'Male')
].shape[0]

# Guard against an empty group so the cell does not raise ZeroDivisionError.
rejection_percentage = (
    (rejected_df.shape[0] / married_male_count) * 100 if married_male_count else 0.0
)

In [14]:
# Pie chart of the rejection share versus the remainder (100 - rejection share).
# rejection_percentage is derived from rejected_df, which is restricted to
# married *male* applicants, so the title names that group explicitly.
labels = ['Rejected','Approved']
sizes = [rejection_percentage, 100 - rejection_percentage]

plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Fig-5.2 Percentage of Rejection for Married Male Applicants')
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

In [18]:
# Load the credit card transactions (JSON Lines: one record per line).
# A forward slash is used as the path separator: the original backslash formed
# the invalid escape sequence '\c' (a warning on current Python versions) and
# only worked as a separator on Windows.
cc_data = pd.read_json('Credit Card Dataset/cdw_sapp_credit.json', lines=True)
cc_data

Unnamed: 0,TRANSACTION_ID,DAY,MONTH,YEAR,CREDIT_CARD_NO,CUST_SSN,BRANCH_CODE,TRANSACTION_TYPE,TRANSACTION_VALUE
0,1,14,2,2018,4210653349028689,123459988,114,Education,78.90
1,2,20,3,2018,4210653349028689,123459988,35,Entertainment,14.24
2,3,8,7,2018,4210653349028689,123459988,160,Grocery,56.70
3,4,19,4,2018,4210653349028689,123459988,114,Entertainment,59.73
4,5,10,10,2018,4210653349028689,123459988,93,Gas,3.59
...,...,...,...,...,...,...,...,...,...
46689,46690,12,9,2018,4210653344660822,123451007,49,Gas,66.20
46690,46691,5,2,2018,4210653344660822,123451007,168,Grocery,100.13
46691,46692,16,12,2018,4210653344660822,123451007,104,Test,35.83
46692,46693,15,1,2018,4210653344660822,123451007,32,Entertainment,90.99


In [23]:
# Total TRANSACTION_VALUE per MONTH: group the transactions by month and sum.
monthly_data = cc_data.groupby('MONTH')['TRANSACTION_VALUE'].sum()
monthly_data

MONTH
1     196568.87
2     201086.67
3     196488.59
4     194203.25
5     201310.26
6     195468.74
7     201199.35
8     196453.41
9     196069.44
10    202583.89
11    200549.36
12    201251.08
Name: TRANSACTION_VALUE, dtype: float64

In [28]:
# Keep the three months with the largest totals, ordered from largest to smallest.
top_three_months = monthly_data.nlargest(n=3)
top_three_months

MONTH
10    202583.89
5     201310.26
12    201251.08
Name: TRANSACTION_VALUE, dtype: float64

In [None]:
# Bar chart of the three months with the highest total transaction value.
ax = top_three_months.plot.bar()
ax.set_xlabel('Month')
ax.set_ylabel('Total Transaction Amount')
ax.set_title('Fig 5.3 Top Three Months with Largest Transaction Data')
plt.show()