In [0]:
spark

In [0]:
%sql
use catalog `ng_db`; select * from `default`.`loan` limit 100;

Customer_ID,Age,Gender,Occupation,Marital Status,Family Size,Income,Expenditure,Use Frequency,Loan Category,Loan Amount,Overdue,Debt Record,Returned Cheque,Dishonour of Bill
IB14001,30,MALE,BANK MANAGER,SINGLE,4,50000.0,22199.0,6,HOUSING,1000000,5,42898,6,9
IB14008,44,MALE,PROFESSOR,MARRIED,6,51000.0,19999.0,4,SHOPPING,50000,3,33999,1,5
IB14012,30,FEMALE,DENTIST,SINGLE,3,58450.0,27675.0,5,TRAVELLING,75000,6,20876,3,1
IB14018,29,MALE,TEACHER,MARRIED,5,45767.0,12787.0,3,GOLD LOAN,600000,7,11000,0,4
IB14022,34,MALE,POLICE,SINGLE,4,43521.0,11999.0,3,AUTOMOBILE,200000,2,43898,1,2
IB14024,55,FEMALE,NURSE,MARRIED,6,34999.0,19888.0,4,AUTOMOBILE,47787,1,50000,0,3
IB14025,39,FEMALE,TEACHER,MARRIED,6,46619.0,18675.0,4,HOUSING,1209867,8,29999,6,8
IB14027,51,MALE,SYSTEM MANAGER,MARRIED,3,49999.0,19111.0,5,RESTAURANTS,60676,8,13000,2,5
IB14029,24,FEMALE,TEACHER,SINGLE,3,45008.0,17454.0,4,AUTOMOBILE,399435,9,51987,4,7
IB14031,37,FEMALE,SOFTWARE ENGINEER,MARRIED,5,55999.0,23999.0,5,AUTOMOBILE,60999,2,0,5,3


In [0]:
df = spark.table('ng_db.default.loan')
df.show()

+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|           

In [0]:
from pyspark.sql.types import IntegerType,StringType

def get_digits(s):
    return ''.join([i for i in s if i.isdigit()])

get_digits_udf = udf(get_digits, StringType())

In [0]:
from pyspark.sql.functions import col, udf

df = df.withColumn('Loan Amount', get_digits_udf(col('Loan Amount')).cast(IntegerType()))\
    .withColumnRenamed(' Debt Record','Debt Record')\
    .withColumn('Debt Record', get_digits_udf(col('Debt Record')).cast(IntegerType()))\
    .withColumnRenamed(' Returned Cheque','Returned Cheque')\
    .withColumnRenamed(' Dishonour of Bill','Dishonor of Bill')

In [0]:
# Number of loans in each category
from pyspark.sql import functions as f

df.groupBy("Loan Category").agg(f.count("Loan Category").alias("count")).orderBy(
    "count", ascending=False
).show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|         GOLD LOAN|   77|
|           HOUSING|   67|
|        AUTOMOBILE|   60|
|        TRAVELLING|   53|
|       RESTAURANTS|   41|
|COMPUTER SOFTWARES|   35|
|          SHOPPING|   35|
|          BUSINESS|   24|
|  EDUCATIONAL LOAN|   20|
|        RESTAURANT|   20|
|           DINNING|   14|
|       ELECTRONICS|   14|
|   HOME APPLIANCES|   14|
|       AGRICULTURE|   12|
|       BOOK STORES|    7|
|          BUILDING|    7|
+------------------+-----+



In [0]:
# b. Number of people who have taken more than 1 lakh loan
# df.where('"Loan Amount" > "1,00,000"').show()
# df.filter(df['Loan Amount']> '1,00,000').agg(f.count('Loan Amount').alias('count of loan_amount_more_than_1lac')).show()
df.filter(df["Loan Amount"] > 100000).count()

450

In [0]:
# c. Number of people with income greater than 60000 rupees
df.filter(df['Income'] > 60000).count()

198

In [0]:
# d. Number of people with 2 or more returned cheques and income less than 50000
df.filter((df['Returned Cheque'] > 2) & (df['Income'] < 50000)).count()

117

In [0]:
# e. Number of people with 2 or more returned cheques and are single
df.filter((df['Returned Cheque'] > 2) & (df['Marital Status']=='SINGLE')).count()

90

In [0]:
# f. Number of people with expenditure over 50000 a month
df.filter(df['Expenditure'] > 50000).count()

6