In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression


# Create the Spark session for this notebook (getOrCreate() hands back an
# already-running session when there is one).
spark = (
    SparkSession.builder
    .appName('Loan prediction classification')
    .getOrCreate()
)

# Read the training CSV: the first row is treated as the header and column
# types are inferred from the data.
data_path = 'data/loan-prediction-train.csv'
loan_data = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)


24/11/20 23:50:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Display data schema: column names, the types produced by inferSchema when
# the CSV was loaded, and nullability.
loan_data.printSchema()

root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)



In [3]:
# Preview the first rows of the raw dataset as loaded from the CSV.
loan_data.show()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [4]:
# Split the rows into a training part (weight 0.8) and a held-out test part
# (weight 0.2); a fixed seed is passed to the splitter.
train_data, test_data = loan_data.randomSplit(weights=[0.8, 0.2], seed=42)


In [14]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

# Build the full preprocessing + classification pipeline.
# Stage order: imputer -> (indexer, one-hot encoder) per categorical column
# -> label indexer -> feature assembler -> decision tree classifier.
pipeline_stages = []

# Impute numerical columns
imputer_columns = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
imputed_columns = [f'{name}_imputed' for name in imputer_columns]
# Replace null values with column median
imputer = Imputer(
    inputCols=imputer_columns,
    outputCols=imputed_columns,
    strategy='median'
)
pipeline_stages.append(imputer)

# Transform categorical columns to one-hot encoded vectors
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# NOTE: the loop variable is deliberately NOT named `col`. A top-level `for`
# target stays bound after the loop, so `for col in ...` would overwrite the
# `col` function imported from pyspark.sql.functions in the first cell and
# break every later `col(...)` call in the notebook.
for cat_col in categorical_columns:
    # handleInvalid='keep' so rows whose value the fitted indexer cannot map
    # are kept instead of raising (see the StringIndexer documentation).
    string_indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + '_index', handleInvalid='keep')

    # dropLast=False keeps a vector slot for every index, including the last.
    encoder = OneHotEncoder(inputCol=string_indexer.getOutputCol(), outputCol=cat_col + '_vec', dropLast=False)

    pipeline_stages += [string_indexer, encoder]

# Index the target label; this is the column the classifier and the
# evaluator read as the label.
pipeline_stages.append(StringIndexer(inputCol='Loan_Status', outputCol='Loan_Status_index'))


# Get a list of all features: encoded categoricals, imputed numerics, then the
# two income columns that are used as-is.
feature_cols = [f'{cat_col}_vec' for cat_col in categorical_columns] + \
    imputed_columns + \
    ['ApplicantIncome', 'CoapplicantIncome']

print('Feature columns: ' + str(feature_cols))

# Create feature vectors
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
pipeline_stages.append(assembler)

# Create a classifier
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='Loan_Status_index')
pipeline_stages.append(dtc)

# Create the pipeline
pipeline = Pipeline(stages=pipeline_stages)

Feature columns: ['Gender_vec', 'Married_vec', 'Dependents_vec', 'Education_vec', 'Self_Employed_vec', 'Property_Area_vec', 'LoanAmount_imputed', 'Loan_Amount_Term_imputed', 'Credit_History_imputed', 'ApplicantIncome', 'CoapplicantIncome']


In [15]:
# Show the raw dataset again; building the pipeline above does not modify
# loan_data, so this matches the earlier preview.
loan_data.show()

+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
| Loan_ID|Gender|Married|Dependents|   Education|Self_Employed|ApplicantIncome|CoapplicantIncome|LoanAmount|Loan_Amount_Term|Credit_History|Property_Area|Loan_Status|
+--------+------+-------+----------+------------+-------------+---------------+-----------------+----------+----------------+--------------+-------------+-----------+
|LP001002|  Male|     No|         0|    Graduate|           No|           5849|              0.0|      NULL|             360|             1|        Urban|          Y|
|LP001003|  Male|    Yes|         1|    Graduate|           No|           4583|           1508.0|       128|             360|             1|        Rural|          N|
|LP001005|  Male|    Yes|         0|    Graduate|          Yes|           3000|              0.0|        66|             360|             1|        Urban|          Y

In [16]:
# Train the model: fit every pipeline stage (imputer, indexers, encoders,
# assembler, decision tree) on the training split only.
pipeline_model = pipeline.fit(train_data)

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Run the fitted pipeline on the held-out split to obtain predictions
predictions = pipeline_model.transform(test_data)

# Measure accuracy against the indexed label and report it as a percentage
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Loan_Status_index',
)
accuracy = 100 * evaluator.evaluate(predictions)
print(f'Accuracy = {accuracy:.2f}%')


Accuracy = 76.29%
