## Initial Setup

Installing PySpark and mounting Google Drive

In [1]:
!pip install pyspark

from google.colab import drive
drive.mount('/content/drive')

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 66.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=3ac865126ed853d87ed7537a9d9825c68a562737acc56d1021d741d0a4e4325f
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
#spark modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from time import time

#PySpark ML modules
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

#Numpy and Matplotlib modules to plot curves
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('loan_default_prediction_DE_OW')
    .getOrCreate()
)

In [5]:
# Explicit schema for the input file: one (column name, Spark type) pair per
# column, listed in the order the fields are declared. Each type class is
# instantiated when the StructField is built.
schema = StructType(
    [
        StructField(field_name, field_type())
        for field_name, field_type in (
            ("FDICCert", IntegerType),
            ("Snapshotdate", StringType),
            ("PeriodDate", StringType),
            ("Period", StringType),
            ("foliolossrate", DoubleType),
            ("TotalAssets", DoubleType),
            ("folioloan", DoubleType),
            ("State", StringType),
            ("Quarter_Period", IntegerType),
            ("Year_Period", IntegerType),
            ("MacroMergeKey", StringType),
            ("foliolossrateLag1", DoubleType),
            ("foliolossrateLag2", DoubleType),
            ("foliolossrateLag3", DoubleType),
            ("foliolossrateLag4", DoubleType),
            ("unemployment", DoubleType),
            ("unemployment_lag1", DoubleType),
            ("unemployment_lag6", DoubleType),
            ("unemployment_lag8", DoubleType),
            ("unemployment_lag2growth", DoubleType),
            ("house_prices_all_change", DoubleType),
            ("house_purchase_prices_growth", DoubleType),
            ("house_purchase_prices", DoubleType),
            ("house_prices_all", DoubleType),
            ("CommercialPriceNat", DoubleType),
            ("CommercialPriceNat_lag8", DoubleType),
            ("nominal_gdp_lag8", DoubleType),
            ("nominal_personalincome_lag5change", DoubleType),
            ("real_disposableincome_lag3change", DoubleType),
            ("real_gdp", DoubleType),
            ("RepDate", StringType),
            ("MovingAverage", DoubleType),
            ("Target", IntegerType),
            ("P1", IntegerType),
            ("P10", IntegerType),
            ("P11", IntegerType),
            ("P12", IntegerType),
            ("P2", IntegerType),
            ("P3", IntegerType),
            ("P4", IntegerType),
            ("P5", IntegerType),
            ("P6", IntegerType),
            ("P7", IntegerType),
            ("P8", IntegerType),
            ("P9", IntegerType),
        )
    ]
)

In [6]:
# Load the tab-separated data file from the mounted Drive. The file has a
# header row, and column types come from the explicit `schema` rather than
# from inference.
input_df = (
    spark.read
    .schema(schema)
    .option('sep', '\t')
    .option('header', True)
    .csv('/content/drive/MyDrive/DE OW Case Study/data.txt')
)

In [7]:
# Columns that are string-indexed and one-hot encoded by the pipeline stages.
categoricalColumns = [
    'FDICCert',
    'Snapshotdate',
    'PeriodDate',
    'Period',
    'State',
    'Quarter_Period',
    'Year_Period',
    'MacroMergeKey',
    'RepDate',
]

# Every remaining input column except the 'Target' label.
# The loop variable is deliberately not called `col`, which would read like
# pyspark.sql.functions.col (star-imported in the imports cell).
numericCols = [
    column_name
    for column_name in input_df.columns
    if not (column_name in categoricalColumns or column_name == 'Target')
]

In [8]:
# Ordered list of pipeline stages, consumed later by Pipeline(stages=stages).
stages = []

# For each categorical column: index its values into '<col>Index', then
# one-hot encode that index into '<col>classVec'.
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=f"{categoricalCol}Index",
    )
    encoder = OneHotEncoder(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[f"{categoricalCol}classVec"],
    )
    stages.extend([stringIndexer, encoder])

# Index the 'Target' column into the 'label' column.
label_stringIdx = StringIndexer(inputCol='Target', outputCol='label')
stages.append(label_stringIdx)

# Assemble the encoded categorical vectors, followed by the numeric columns,
# into a single 'features' vector column.
assemblerInputs = [f"{name}classVec" for name in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [9]:
# Location of the persisted decision-tree model on the mounted Drive.
path = "/content/drive/MyDrive/DE OW Case Study/dtModel"

# Obtain the model's reader and load the saved model from `path`.
saved_dtModel = DecisionTreeClassificationModel.read().load(path)

In [10]:
# Fit the preprocessing stages on the full input DataFrame.
# NOTE(review): the stages are re-fit here rather than loaded from disk;
# presumably the resulting 'features' layout has to match what the saved
# model was trained on -- confirm.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(input_df)

# Transform only 20 rows of the input and keep just the 'features' column.
# TODO: add a validation dataset for evaluation.
selectedCols = ['features']
new_df = pipelineModel.transform(input_df.limit(20)).select(selectedCols)

# Score the rows with the loaded decision-tree model and print the result
# without truncating cell contents.
new_pred = saved_dtModel.transform(new_df)
new_pred.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------+----------+
|features                                                                                                                                                                                                                                                                                                                                                                                              |rawPrediction|probability|prediction|
+-------------------------------------------------------------------------------------------------------------------------------------------