In [1]:
import pyspark
import yaml
import numpy as np
import pandas as pd
import warnings

# Notebook-wide setup: silence every warning and lift pandas' row/column
# truncation so previewed frames are rendered in full.
warnings.filterwarnings(action='ignore')
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
def init_spark():
    """Build (or fetch the already running) SparkSession for this notebook.

    The session uses the ``local`` master and the application name
    "Credit Score Card". The Spark version, the application id and the web
    UI URL are printed before the session is handed back.

    Returns:
        The SparkSession obtained from ``getOrCreate()``.
    """
    # NOTE(review): executor-level settings are presumably not honoured the
    # same way under a "local" master -- confirm they have the intended effect.
    session_settings = (
        ("spark.executor.memory", "8G"),
        ("spark.executor.instances", "1"),
        ("spark.executor.cores", "4"),
        ("spark.default.parallelism", 400),
        ("spark.executor.memoryOverhead", "2G"),
    )

    builder = pyspark.sql.SparkSession.builder
    builder = builder.master("local")
    builder = builder.appName("Credit Score Card")
    for setting_name, setting_value in session_settings:
        builder = builder.config(setting_name, setting_value)
    spark = builder.getOrCreate()

    # Echo the identifiers needed to locate this session in the Spark UI.
    context = spark.sparkContext
    for attribute in ("version", "applicationId", "uiWebUrl"):
        print(getattr(context, attribute))
    return spark

def load_config(path):
    """Load run parameters from a YAML file.

    Args:
        path: Location of the YAML configuration file.

    Returns:
        The parsed YAML document (a dict for a mapping-style config). An
        empty file yields an empty dict instead of ``None`` so callers can
        always index or iterate the result.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Be explicit about the encoding so parsing does not depend on the
    # platform's default locale.
    with open(path, 'r', encoding='utf-8') as stream:
        # NOTE(review): FullLoader can build more than plain data types; if
        # the config may come from an untrusted source, prefer yaml.safe_load.
        params = yaml.load(stream, Loader=yaml.FullLoader)
    # yaml.load returns None for an empty document.
    return {} if params is None else params

def read_dataset(spark, data_path, file_format='csv'):
    """Read a dataset from ``data_path`` through the session's reader.

    Args:
        spark: Session object exposing a ``read`` attribute.
        data_path: Location handed to the reader's ``load``.
        file_format: Source format name passed to ``format`` (default 'csv').

    Returns:
        Whatever the reader's ``load(data_path)`` returns.
    """
    reader = spark.read.format(file_format)
    # The header / inferSchema options are set regardless of the format.
    for option_name, option_value in (("header", True), ("inferSchema", True)):
        reader = reader.option(option_name, option_value)
    return reader.load(data_path)

In [3]:
# Create (or reuse) the Spark session; init_spark() also prints the Spark
# version, the application id and the web UI URL.
spark = init_spark()

22/06/09 04:29:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


3.1.2
local-1654748948259
http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/


In [4]:
# Load the run parameters from the dev YAML config; the relative path is
# resolved against the notebook's working directory.
params = load_config('../conf/default_estimation_spark_lgbm_dev.yaml')

In [5]:
# Read the dataset stored at params['eval_out_path'] in parquet format.
eval_test_dataset = read_dataset(spark, params['eval_out_path'], file_format='parquet')

                                                                                

In [6]:
# Preview only the first 10 rows, converted to a pandas DataFrame for display.
eval_test_dataset.limit(10).toPandas()

                                                                                

Unnamed: 0,isDefault,features,rawPrediction,probability,prediction
0,0,"[1000.0, 3.0, 6.97, 30.87, 1.0, 2.0, 123318.0,...","[2.337995530153171, -2.337995530153171]","[0.911975306477623, 0.088024693522377]",0.0
1,0,"[1000.0, 3.0, 15.99, 35.16, 3.0, 14.0, 215340....","[-0.10965281459904996, 0.10965281459904996]","[0.4726142307997735, 0.5273857692002265]",1.0
2,0,"[1000.0, 3.0, 18.24, 36.28, 4.0, 19.0, 64536.0...","[0.32907556139404964, -0.32907556139404964]","[0.5815344297028524, 0.41846557029714754]",0.0
3,1,"[1500.0, 3.0, 11.49, 49.46, 2.0, 9.0, 212874.0...","[0.8718348674582469, -0.8718348674582469]","[0.7051273522619355, 0.2948726477380646]",0.0
4,0,"[1500.0, 3.0, 13.49, 50.9, 3.0, 11.0, 89676.0,...","[-0.1605600898253861, 0.1605600898253861]","[0.45994598843603185, 0.5400540115639682]",1.0
5,1,"[1500.0, 3.0, 16.99, 53.48, 4.0, 17.0, 6600.0,...","[-0.4225961586637862, 0.4225961586637862]","[0.39589567937414427, 0.6041043206258557]",1.0
6,0,"[1600.0, 3.0, 10.64, 52.11, 2.0, 8.0, 203091.0...","[0.9316509455109415, -0.9316509455109415]","[0.7174101064763352, 0.2825898935236648]",0.0
7,0,"[1600.0, 3.0, 13.53, 54.32, 2.0, 9.0, 65718.0,...","[1.2519325976063975, -1.2519325976063975]","[0.777634223779675, 0.22236577622032502]",0.0
8,0,"[1800.0, 3.0, 12.73, 60.42, 2.0, 9.0, 61357.0,...","[2.783424036088894, -2.783424036088894]","[0.9417734897923757, 0.05822651020762423]",0.0
9,0,"[1800.0, 3.0, 19.99, 66.89, 5.0, 20.0, 234119....","[0.2507570877638907, -0.2507570877638907]","[0.5623628372077925, 0.4376371627922075]",0.0


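Assuming the machine learning model gives a reasonable estimate of the loan default probability $p$, we can define the credit score as follows. Here $\text{PDO}$ is the number of points by which the score drops when the odds double, and $\text{S}_0$ is the score assigned at the reference odds $\text{odds}_0$: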

\begin{aligned}
\text{Score} &= A-B \ln(\text{odds})= A-B \ln\bigg( \frac{p}{1-p} \bigg) \\
B &= \frac{\text{PDO}}{{\ln2}} \\
A &= \text{S}_0 + B \ln(\text{odds}_0) \\
\end{aligned}

In [7]:
## Scorecard scaling constants
# PDO   : points by which the score drops when the odds double.
# S0    : score assigned at the reference odds `odds0`.
# odds0 : reference odds at which the score equals S0.
PDO, S0, odds0 = 20, 600, 1/2

# With Score = A - B*ln(odds): doubling the odds lowers the score by
# B*ln(2) = PDO, and Score(odds0) == S0 gives A = S0 + B*ln(odds0).
B = PDO / np.log(2)
A = S0 + np.log(odds0) * B
print("PDO=%d, B=%.2f, A=%.2f" % (PDO, B, A))

PDO=20, B=28.85, A=580.00


In [8]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf, col

def compute_credit_score(v, offset=None, factor=None):
    """Convert a two-class probability vector into a credit score.

    The score is ``offset - factor * ln(v[1] / v[0])``, i.e. the log-odds of
    the second entry against the first, scaled and shifted.

    Args:
        v: Indexable with at least two numeric entries, as found in the
            ``probability`` column (``v[1]`` is used as the numerator of
            the odds and ``v[0]`` as the denominator).
        offset: Score offset; defaults to the notebook-level constant ``A``.
        factor: Score factor; defaults to the notebook-level constant ``B``.

    Returns:
        The score as a Python float, or the sentinel ``-1.0`` when no finite
        score can be computed (missing/short/non-numeric vector, or a
        probability that is zero, negative or NaN).
    """
    # Fall back to the notebook-level constants only when not overridden,
    # so the function can also be used (and tested) without those globals.
    offset = A if offset is None else offset
    factor = B if factor is None else factor

    try:
        denominator = float(v[0])
        numerator = float(v[1])
    except (TypeError, ValueError, IndexError):
        return -1.0

    # np.log never raises: a zero numerator would give an infinite score and
    # a zero denominator would raise ZeroDivisionError and fail the job.
    # Written as a negated conjunction so that NaN inputs are rejected too.
    if not (numerator > 0.0 and denominator > 0.0):
        return -1.0

    score = float(offset - factor * np.log(numerator / denominator))
    # Guards against overflow of the ratio for extreme inputs.
    if not np.isfinite(score):
        return -1.0
    return score

# Expose the scorer as a Spark UDF that returns doubles.
credit_score_udf = udf(compute_credit_score, returnType=DoubleType())

# Append the score, computed from the `probability` column, as a new column.
result = eval_test_dataset.withColumn(
    'credit_score',
    credit_score_udf(col("probability")),
)

In [9]:
# Preview the first 10 rows, now including the credit_score column.
result.limit(10).toPandas()

                                                                                

Unnamed: 0,isDefault,features,rawPrediction,probability,prediction,credit_score
0,0,"[1000.0, 3.0, 6.97, 30.87, 1.0, 2.0, 123318.0,...","[2.337995530153171, -2.337995530153171]","[0.911975306477623, 0.088024693522377]",0.0,647.460291
1,0,"[1000.0, 3.0, 15.99, 35.16, 3.0, 14.0, 215340....","[-0.10965281459904996, 0.10965281459904996]","[0.4726142307997735, 0.5273857692002265]",1.0,576.836089
2,0,"[1000.0, 3.0, 18.24, 36.28, 4.0, 19.0, 64536.0...","[0.32907556139404964, -0.32907556139404964]","[0.5815344297028524, 0.41846557029714754]",0.0,589.495114
3,1,"[1500.0, 3.0, 11.49, 49.46, 2.0, 9.0, 212874.0...","[0.8718348674582469, -0.8718348674582469]","[0.7051273522619355, 0.2948726477380646]",0.0,605.155837
4,0,"[1500.0, 3.0, 13.49, 50.9, 3.0, 11.0, 89676.0,...","[-0.1605600898253861, 0.1605600898253861]","[0.45994598843603185, 0.5400540115639682]",1.0,575.367215
5,1,"[1500.0, 3.0, 16.99, 53.48, 4.0, 17.0, 6600.0,...","[-0.4225961586637862, 0.4225961586637862]","[0.39589567937414427, 0.6041043206258557]",1.0,567.806452
6,0,"[1600.0, 3.0, 10.64, 52.11, 2.0, 8.0, 203091.0...","[0.9316509455109415, -0.9316509455109415]","[0.7174101064763352, 0.2825898935236648]",0.0,606.881764
7,0,"[1600.0, 3.0, 13.53, 54.32, 2.0, 9.0, 65718.0,...","[1.2519325976063975, -1.2519325976063975]","[0.777634223779675, 0.22236577622032502]",0.0,616.123139
8,0,"[1800.0, 3.0, 12.73, 60.42, 2.0, 9.0, 61357.0,...","[2.783424036088894, -2.783424036088894]","[0.9417734897923757, 0.05822651020762423]",0.0,660.312641
9,0,"[1800.0, 3.0, 19.99, 66.89, 5.0, 20.0, 234119....","[0.2507570877638907, -0.2507570877638907]","[0.5623628372077925, 0.4376371627922075]",0.0,587.23532
