In [100]:
import findspark

findspark.find()
findspark.init()


In [101]:
# importing sparksession
from pyspark.sql import SparkSession

# starting the spark session
spark = SparkSession.builder.master("local[*]").appName("Banking").getOrCreate()
spark


In [102]:
# load data
df = spark.read.csv("./BankTransactionDataset/Loan.csv", inferSchema=True, header=True)
df.show(10)


+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|       Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-----------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|     BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|        PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|          DENTIST|        SINGLE|          3| 

In [103]:
df.dtypes

[('Customer_ID', 'string'),
 ('Age', 'int'),
 ('Gender', 'string'),
 ('Occupation', 'string'),
 ('Marital Status', 'string'),
 ('Family Size', 'int'),
 ('Income', 'int'),
 ('Expenditure', 'int'),
 ('Use Frequency', 'int'),
 ('Loan Category', 'string'),
 ('Loan Amount', 'string'),
 ('Overdue', 'int'),
 (' Debt Record', 'string'),
 (' Returned Cheque', 'int'),
 (' Dishonour of Bill', 'int')]

In [104]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType,StringType

def get_digits(s):
    return ''.join([i for i in s if i.isdigit()])

get_digits_udf = udf(get_digits, StringType())

In [105]:
df = df.withColumn('Loan Amount', get_digits_udf(col('Loan Amount')).cast(IntegerType()))\
    .withColumnRenamed(' Debt Record','Debt Record')\
    .withColumn('Debt Record', get_digits_udf(col('Debt Record')).cast(IntegerType()))\
    .withColumnRenamed(' Returned Cheque','Returned Cheque')\
    .withColumnRenamed(' Dishonour of Bill','Dishonor of Bill')


In [106]:
df.show()

+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+-----------+---------------+----------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue|Debt Record|Returned Cheque|Dishonor of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+-----------+---------------+----------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING|    1000000|      5|      42898|              6|               9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|      50000|      3|      33999|              1|               5|
|    IB14012| 30|FEMALE|            DENTIST|        SIN

In [107]:
df.dtypes

[('Customer_ID', 'string'),
 ('Age', 'int'),
 ('Gender', 'string'),
 ('Occupation', 'string'),
 ('Marital Status', 'string'),
 ('Family Size', 'int'),
 ('Income', 'int'),
 ('Expenditure', 'int'),
 ('Use Frequency', 'int'),
 ('Loan Category', 'string'),
 ('Loan Amount', 'int'),
 ('Overdue', 'int'),
 ('Debt Record', 'int'),
 ('Returned Cheque', 'int'),
 ('Dishonor of Bill', 'int')]

In [108]:
# Number of loans in each category
from pyspark.sql import functions as f

df.groupBy("Loan Category").agg(f.count("Loan Category").alias("count")).orderBy(
    "count", ascending=False
).show()


+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|         GOLD LOAN|   77|
|           HOUSING|   67|
|        AUTOMOBILE|   60|
|        TRAVELLING|   53|
|       RESTAURANTS|   41|
|COMPUTER SOFTWARES|   35|
|          SHOPPING|   35|
|          BUSINESS|   24|
|  EDUCATIONAL LOAN|   20|
|        RESTAURANT|   20|
|           DINNING|   14|
|       ELECTRONICS|   14|
|   HOME APPLIANCES|   14|
|       AGRICULTURE|   12|
|       BOOK STORES|    7|
|          BUILDING|    7|
+------------------+-----+



In [109]:
# b. Number of people who have taken more than 1 lakh loan
# df.where('"Loan Amount" > "1,00,000"').show()
# df.filter(df['Loan Amount']> '1,00,000').agg(f.count('Loan Amount').alias('count of loan_amount_more_than_1lac')).show()
df.filter(df["Loan Amount"] > 100000).count()

450

In [110]:
# c. Number of people with income greater than 60000 rupees
df.filter(df['Income'] > 60000).count()

198

In [111]:
# d. Number of people with 2 or more returned cheques and income less than 50000
df.filter((df['Returned Cheque'] > 2) & (df['Income'] < 50000)).count()

117

In [116]:
# e. Number of people with 2 or more returned cheques and are single
df.filter((df['Returned Cheque'] > 2) & (df['Marital Status']=='SINGLE')).count()

90

In [117]:
# f. Number of people with expenditure over 50000 a month
df.filter(df['Expenditure'] > 50000).count()

6