In [0]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import *

# LOAN DATASET #

In [0]:
dbutils.fs.ls("dbfs:/Volumes/data_engineering/default/data/loan.csv")

[FileInfo(path='dbfs:/Volumes/data_engineering/default/data/loan.csv', name='loan.csv', size=43967, modificationTime=1725654157000)]

In [0]:
#read in loan dataset
loan_df = spark.read.csv("dbfs:/Volumes/data_engineering/default/data/loan.csv", header=True, inferSchema=True)

In [0]:
# Print the schema of the loan dataset
loan_df.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)



In [0]:
loan_df.columns

['Customer_ID',
 'Age',
 'Gender',
 'Occupation',
 'Marital Status',
 'Family Size',
 'Income',
 'Expenditure',
 'Use Frequency',
 'Loan Category',
 'Loan Amount',
 'Overdue',
 ' Debt Record',
 ' Returned Cheque',
 ' Dishonour of Bill']

In [0]:
# Print the first 5 lines of the loan dataset
display(loan_df.show(5))

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [0]:
# Print the number of columns in the loan dataset
num_rows = loan_df.count()
num_columns = len(loan_df.columns)
print(f"Shape of DataFrame: ({num_rows}, {num_columns})")
print(f"Number of columns: {num_columns}")

Shape of DataFrame: (500, 15)
Number of columns: 15


In [0]:
# Print the number of rows in the loan dataset
print(f"Number of rows: {num_rows}")

Number of rows: 500


In [0]:
# Print the count of distinct records in the loan dataset
print(f"Number of distinct records: {loan_df.distinct().count()}")

Number of distinct records: 500


In [0]:
# find the number of loans in each category
print(loan_df.groupBy("Loan Category").count().show())

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+

None


In [0]:
# find the number of people who have taken more than 1 lakh loan
# Clean the 'Loan Amount' column by removing non-numeric characters and converting it to an integer
from pyspark.sql import functions as F

loan_df = loan_df.withColumn("Loan Amount", F.regexp_replace(F.col("Loan Amount"), "[^0-9]", "").cast("integer"))

# Filter for people who have taken a loan amount greater than 1 lakh (100,000)
count_more_than_one_lakh = loan_df.filter(loan_df['Loan Amount'] > 100000).count()
print(count_more_than_one_lakh)

450


In [0]:
# find the number of people with income greater than 60000 rupees
print(loan_df.filter(loan_df['Income'] > 60000).count())

198


In [0]:
# fidn the number of people with 2 or more returned cheques and income less than 50000
print(loan_df.filter((loan_df['Income'] < 50000) & (loan_df[' Returned Cheque'] >= 2)).count())

137


In [0]:
# find the number of people with 2 or more returned cheques and are single
print(loan_df.filter((loan_df['Marital Status'] == 'SINGLE') & (loan_df[' Returned Cheque'] >= 2)).count())

111


In [0]:
# find the  number of people with expenditure over 50000 a month 
print(loan_df.filter((loan_df['Expenditure'] > 5000)).count())

481


# CREDIT CARD DATASET #

In [0]:
# Load the credit card dataset
credit_df = spark.read.csv("dbfs:/Volumes/data_engineering/default/data/credit card.csv", header=True, inferSchema=True)

In [0]:
# Print the schema of the credit card dataset
credit_df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [0]:
# Print the number of columns in the credit card dataset
credit_df_num_columns = len(credit_df.columns)
print(f"Number of columns: {credit_df_num_columns}")


Number of columns: 13


In [0]:
# Print the number of rows in the credit card dataset
credit_df_num_rows = (credit_df.count())
print(f"Number of rows: {credit_df_num_rows}")

Number of rows: 10000


In [0]:
# Print the number of distinct records in the credit card dataset
credit_df_num_distinct = credit_df.distinct().count()
print(f"Number of distinct records: {credit_df_num_distinct}")

Number of distinct records: 10000


In [0]:
# Print the first 5 rows in the credit card dataset
print(credit_df.show(5))

+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId| Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+--------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        1|  15634602|Hargrave|        619|   France|Female| 42|     2|      0.0|            1|             1|      101348.88|     1|
|        2|  15647311|    Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|             1|      112542.58|     0|
|        3|  15619304|    Onio|        502|   France|Female| 42|     8| 159660.8|            3|             0|      113931.57|     1|
|        4|  15701354|    Boni|        699|   France|Female| 39|     1|      0.0|            2|             0|       93826.63|     0|
|        5|  15737888|Mitchell|        850|    Spain|Female| 4

In [0]:
# Find the number of members who are eligible for a credit card
# Define the eligibility criteria

eligible_customers = credit_df.filter(
    (credit_df['CreditScore'] >= 650) &     # Minimum credit score of 650
    (credit_df['Age']  >= 18) &              # At least 18 years old
    (credit_df['Balance']  > 1000) &         # Minimum balance above 1000
    (credit_df['NumOfProducts']  >= 1) &     # At least 1 product with the bank
    (credit_df['IsActiveMember'] == 1)      # Must be an active member
)
print(eligible_customers.count())

1713


In [0]:
# Find the number of members who are  elgible and active in the bank
credit_df.filter('IsActiveMember == 1').count()

5151

In [0]:
# Find the credit card users in Spain 
credit_df.filter('Geography == "Spain"').count()

2477

In [0]:
# Find the credit card users with Estimated Salary greater than 100000 and have exited the card
credit_df.filter((credit_df['EstimatedSalary'] > 100000) & (credit_df['Exited'] == 1)).count()

1044

In [0]:
# Find the credit card users with Estimated Salary less than 100000 and have more than 1 product
credit_df.filter((credit_df['EstimatedSalary'] < 100000) & (credit_df['NumOfProducts'] > 1)).count()

2432

# TRANSACTION DATASET #

In [0]:
# Load the transacton dataset
transaction_df = spark.read.csv("dbfs:/Volumes/data_engineering/default/data/txn.csv", header=True, inferSchema=True)

In [0]:
# Print the schema of the transacton dataset
transaction_df.printSchema()

root
 |-- Account No: string (nullable = true)
 |-- TRANSACTION DETAILS: string (nullable = true)
 |-- VALUE DATE: string (nullable = true)
 |--  WITHDRAWAL AMT : double (nullable = true)
 |--  DEPOSIT AMT : double (nullable = true)
 |-- BALANCE AMT: double (nullable = true)



In [0]:
transaction_df.columns

['Account No',
 'TRANSACTION DETAILS',
 'VALUE DATE',
 ' WITHDRAWAL AMT ',
 ' DEPOSIT AMT ',
 'BALANCE AMT']

In [0]:
#COUNT OF TRANSACTION ON EVERY ACCOUNT
transaction_df.groupBy("Account No").count().show()

+-------------+-----+
|   Account No|count|
+-------------+-----+
|409000438611'| 4588|
|     1196711'|10536|
|     1196428'|48779|
|409000493210'| 6014|
|409000611074'| 1093|
|409000425051'|  802|
|409000405747'|   51|
|409000493201'| 1044|
|409000438620'|13454|
|409000362497'|29840|
+-------------+-----+



In [0]:
# Find the Maximum withdrawal amount for each account
transaction_df.groupby('Account No').max(' WITHDRAWAL AMT ').show()

+-------------+---------------------+
|   Account No|max( WITHDRAWAL AMT )|
+-------------+---------------------+
|409000438611'|                2.4E8|
|     1196711'|        4.594475464E8|
|     1196428'|                1.5E8|
|409000493210'|                1.5E7|
|409000611074'|             912000.0|
|409000425051'|               3.54E8|
|409000405747'|                1.7E8|
|409000493201'|            2500000.0|
|409000438620'|                4.0E8|
|409000362497'|        1.413662392E8|
+-------------+---------------------+



In [0]:
#MINIMUM WITHDRAWAL AMOUNT OF AN ACCOUNT
transaction_df.groupby('Account No').min(' WITHDRAWAL AMT ').orderBy('min( WITHDRAWAL AMT )').show(1)

+-------------+---------------------+
|   Account No|min( WITHDRAWAL AMT )|
+-------------+---------------------+
|409000493210'|                 0.01|
+-------------+---------------------+
only showing top 1 row



In [0]:
#MAXIMUM DEPOSIT AMOUNT OF AN ACCOUNT
transaction_df.groupby('Account No').max(' DEPOSIT AMT ').orderBy('max( DEPOSIT AMT )').show(1)

+-------------+------------------+
|   Account No|max( DEPOSIT AMT )|
+-------------+------------------+
|409000493201'|         1000000.0|
+-------------+------------------+
only showing top 1 row



In [0]:
#MINIMUM DEPOSIT AMOUNT OF AN ACCOUNT
transaction_df.groupby('Account No').min(' DEPOSIT AMT ').orderBy('min( DEPOSIT AMT )').show(1)

+-------------+------------------+
|   Account No|min( DEPOSIT AMT )|
+-------------+------------------+
|409000493210'|              0.01|
+-------------+------------------+
only showing top 1 row



In [0]:
#sum of balance in every bank account
transaction_df.groupby('Account No').sum('BALANCE AMT').orderBy('sum(BALANCE AMT)').show()

+-------------+--------------------+
|   Account No|    sum(BALANCE AMT)|
+-------------+--------------------+
|     1196428'| -8.1418498130721E13|
|409000362497'| -5.2860004792808E13|
|     1196711'|-1.60476498101275E13|
|409000438620'|-7.12291867951358...|
|409000493210'|-3.27584952132095...|
|409000438611'|-2.49486577068339...|
|409000405747'|-2.43108047067000...|
|409000425051'|-3.77211841164998...|
|409000493201'|1.0420831829499985E9|
|409000611074'|       1.615533622E9|
+-------------+--------------------+



In [0]:
#Number of transaction on each date
transaction_df.groupby('VALUE DATE').count().orderBy('count').show()

+----------+-----+
|VALUE DATE|count|
+----------+-----+
| 22-Jul-17|    1|
| 23-Apr-16|    1|
| 13-Sep-16|    1|
| 29-Jan-17|    1|
|  9-Oct-16|    1|
| 12-Jul-15|    1|
|  3-Jun-18|    1|
| 16-Aug-15|    1|
| 13-Oct-18|    1|
| 13-Sep-15|    1|
| 18-Nov-18|    1|
| 11-Oct-16|    1|
| 14-Nov-15|    1|
| 17-Jul-16|    1|
| 15-Aug-15|    1|
|  3-Jul-16|    1|
|  3-Apr-15|    1|
| 25-Dec-18|    1|
|  9-Feb-19|    1|
| 27-Feb-16|    1|
+----------+-----+
only showing top 20 rows



In [0]:
#List of customers with withdrawal amount more than 1 lakh
transaction_df.groupby('Account No').sum(' WITHDRAWAL AMT ').filter(col('sum( WITHDRAWAL AMT )') > 100000).show()

+-------------+---------------------+
|   Account No|sum( WITHDRAWAL AMT )|
+-------------+---------------------+
|409000438611'|  4.705551279080001E9|
|     1196711'| 4.692584040299003E10|
|     1196428'| 6.848282907086004E10|
|409000493210'| 1.0060492432000004E8|
|409000611074'|         1.45397419E8|
|409000425051'|        3.845102355E8|
|409000405747'|        4.203178862E8|
|409000493201'|  9.537793091999999E7|
|409000438620'| 1.719608322354001...|
|409000362497'| 1.019351199124101...|
+-------------+---------------------+

