In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
spark = SparkSession.builder \
    .appName("Analysis") \
    .getOrCreate()

In [2]:
table1= (
    spark.read.csv(
        "../dataset/New/final_set2.csv", 
        header=True,
        inferSchema=True
    )
    .withColumn("good", f.when(f.col("90>=day_contract") == 0, 1).otherwise(0))
    .withColumn("bad", f.when(f.col("90>=day_contract") == 1, 1).otherwise(0))
    .select(
        "FAC NO",
        "BRN CD",
        "PROCODE",
        "CUST_PERMRES",
        "GUAR_PERMRES",
        "EXP_CD",
        "DSCR_C",
        "connex_date",
        "DPD",
        "90>=day_contract",
        "good",
        "bad"
    )
# .withColumn("GUAR_PERMRES", f.when((f.col("GUAR_PERMRES").isNull())&(f.col("DSCR_C").isNotNull()), "No Guaranter").otherwise(f.col("GUAR_PERMRES")))
.filter(f.col("90>=day_contract").isNotNull())
.filter(f.col("connex_date").between("2022-04-01","2023-04-01"))
.withColumn("Product",f.when(f.col("PROCODE")==502,"Registered 3w").otherwise("Cash in hand"))
.withColumn("CUST_PERMRES",
            f.when(f.col("CUST_PERMRES")=="With Parents/spouse",
                   "With Parents/Spouse"
                  ).otherwise(f.col("CUST_PERMRES")))
.withColumn("GUAR_PERMRES",
            f.when(f.col("GUAR_PERMRES")=="With Parents/spouse",
                   "With Parents/Spouse"
                  ).otherwise(f.col("GUAR_PERMRES")))
.withColumn("GUAR_PERMRES",(f.when(
    (f.col("GUAR_PERMRES").isNull())
    &(f.col("EXP_CD").isNotNull())
    &(f.col("DSCR_C").isNotNull()),"No Guaranter").otherwise(f.col("GUAR_PERMRES")))
)  
)
table2=(
    spark.read.csv(
        "../dataset/New/detail_report.csv", 
        header=True,
        inferSchema=True
    ).select(
        "ACCOUNT NUMBER",
        "Conex Date",
        "Portfolio",
        "TOTAL DUE",
        "TOTAL COLLECTION",
        "Collection Ratio"
    )
    .withColumnRenamed("ACCOUNT NUMBER","FAC NO")
    .filter(f.col("Conex Date").between("2022-04-01","2023-04-01"))
    )
table3=(
    spark.read.csv(
        "../dataset/New/finalcrib.csv", 
        header=True,
        inferSchema=True
    ).select(
        "Fac",
        "CRIB OUTPUT",
        "CRIB score"
    )
    .filter(f.col("CRIB score")!=0)
    .withColumnRenamed("Fac","FAC NO")
)
table4=(spark.read.csv(
        "../dataset/New/finalcrib.csv", 
        header=True,
        inferSchema=True
    ).select(
        "Fac",
        "CRIB OUTPUT",
        "CRIB score"
    )
.filter(f.col("CRIB score")==0)
    .withColumnRenamed("Fac","FAC NO")
    .withColumn("CRIB score",f.lit("NO HIT"))
       )
    



In [3]:
table3.count()

18277

In [4]:
table4.count()

1115

In [5]:
table5= (table3.union(table4).select("FAC NO","CRIB score"))

In [6]:
table5.count()

19392

In [7]:
table3.show()

+-------------------+-----------+----------+
|             FAC NO|CRIB OUTPUT|CRIB score|
+-------------------+-----------+----------+
|'041600794120050202|    NO CRIB|  -400.000|
|'006100798148050201|    NO CRIB|  -400.000|
|'011500798116050201|    NO CRIB|  -400.000|
|'011500044709050202|    NO CRIB|  -400.000|
|'013800796663050202|    NO CRIB|  -400.000|
|'013800796663050201|    NO CRIB|  -400.000|
|'007000797964050801|    NO CRIB|  -400.000|
|'017500798051050201|    NO CRIB|  -400.000|
|'005000413648050801|        HIT|    50.950|
|'006800796830050202|    NO CRIB|  -400.000|
|'002900711921050801|    NO CRIB|  -400.000|
|'005200797575050202|    NO CRIB|  -400.000|
|'005000622866050802|    NO CRIB|  -400.000|
|'000600798195050201|    NO CRIB|  -400.000|
|'013900797777050201|    NO CRIB|  -400.000|
|'003400798039050202|    NO CRIB|  -400.000|
|'005000797918050201|    NO CRIB|  -400.000|
|'000800798010050201|    NO CRIB|  -400.000|
|'000300798074050201|    NO CRIB|  -400.000|
|'00060079

In [8]:
master_table=(
    table1
    .join(table2,"FAC NO","left")
    .join(table5,"FAC NO","left")
    .filter(f.col("Conex Date").between("2022-04-01","2023-04-01"))
)

In [10]:
master_table.show()

+-------------------+------+-------+--------------------+-------------------+------+---------+-----------+---+----------------+----+---+-------------+----------+---------+---------+----------------+----------------+----------+
|             FAC NO|BRN CD|PROCODE|        CUST_PERMRES|       GUAR_PERMRES|EXP_CD|   DSCR_C|connex_date|DPD|90>=day_contract|good|bad|      Product|Conex Date|Portfolio|TOTAL DUE|TOTAL COLLECTION|Collection Ratio|CRIB score|
+-------------------+------+-------+--------------------+-------------------+------+---------+-----------+---+----------------+----+---+-------------+----------+---------+---------+----------------+----------------+----------+
|'014200183766050801|   142|    508|  With Billing Proof|       No Guaranter| 32.04|3.2785542| 2022-06-29| 11|               0|   1|  0| Cash in hand|2022-06-29|  143,315|  445,127|         513,619|         274.17%|  -400.000|
|'041200813926050803|   412|    508|                Rent|               Rent|  31.2|5.345211

In [11]:
master_table.select("90>=day_contract").distinct().show()

+----------------+
|90>=day_contract|
+----------------+
|               0|
|               1|
+----------------+



In [12]:
master_table.count()

7654

In [13]:
master_table_pd=master_table.toPandas()

In [14]:
# master_table_pd.to_csv("../output/20240126_final_output/master_table3.csv")

## Analysis

In [15]:
master_table_new= (
      master_table
    )


In [16]:
master_table_new.count()

7654

# Taking good/bad as target variable(90 DPD Contracts)

### Product wise

In [17]:
master_table_new_product=(
     master_table_new
)
Product_wise= (
        master_table_new_product
        .groupBy("Product")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good=(
        Product_wise
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad= (
        Product_wise
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

Product_wise=(
    Product_wise
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good+total_bad))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good    
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad   
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good) - (f.col("bad") / total_bad)) *
        f.ln((f.col("good") / total_good) / (f.col("bad") / total_bad))
    )
)
    .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)

# Iv_value_Product_wise= 0.029287+0.058048
Iv_value_Product_wise= 0.031576+0.049028

In [18]:
master_table_new_product.select("Product").distinct().show()

+-------------+
|      Product|
+-------------+
|Registered 3w|
| Cash in hand|
+-------------+



In [19]:
total_bad

1376

In [20]:
total_good

6278

In [21]:
Product_wise.toPandas()

Unnamed: 0,Product,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,Registered 3w,3376,930,0.562582,0.537751,0.675872,3.630108,0.031576,0.784022
1,Cash in hand,2902,446,0.437418,0.462249,0.324128,6.506726,0.049028,0.866786


In [212]:
# Product_wise.toPandas().to_csv("../output/20240126_final_output/product_wise_final.csv")

In [22]:
Iv_value_Product_wise

0.08060400000000001

#### CUST_PERMRES

In [23]:
master_table_new_custer_pre=(
     master_table_new
    .filter(f.col("CUST_PERMRES").isNotNull())
)
CUST_PERMRES_wise= (
       master_table_new_custer_pre

    .groupBy("CUST_PERMRES")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good_CUST_PERMRES_wise=(
       CUST_PERMRES_wise
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad_CUST_PERMRES_wise= (
        CUST_PERMRES_wise
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

CUST_PERMRES_wise=(
    CUST_PERMRES_wise
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good_CUST_PERMRES_wise+total_bad_CUST_PERMRES_wise))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good_CUST_PERMRES_wise    
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad_CUST_PERMRES_wise   
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good_CUST_PERMRES_wise) - (f.col("bad") / total_bad_CUST_PERMRES_wise)) *
        f.ln((f.col("good") / total_good_CUST_PERMRES_wise) / (f.col("bad") / total_bad_CUST_PERMRES_wise))
    )
)
     .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)
# IV_value_CUST_PERMRES= 0.001037 + 0.091448 + 0.002238 + 0.014116
IV_value_CUST_PERMRES=0.008460+0.003860	+0.000013+0.003943

In [24]:
CUST_PERMRES_wise.toPandas().head()

Unnamed: 0,CUST_PERMRES,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,With G S Certificate,403,114,0.072076,0.067663,0.093673,3.535088,0.00846,0.779497
1,With Parents/Spouse,1057,249,0.182072,0.177468,0.204601,4.24498,0.00386,0.809342
2,Rent,165,33,0.027604,0.027703,0.027116,5.0,1.3e-05,0.833333
3,With Billing Proof,4331,821,0.718249,0.727166,0.67461,5.275274,0.003943,0.840644


In [25]:
# CUST_PERMRES_wise.toPandas().to_csv("../output/20240126_final_output/cust_pre_wise_active3w.csv")

In [26]:
IV_value_CUST_PERMRES

0.016276000000000002

### Guarantor PreNises

In [27]:
master_table_new_Guarantor_PreNises=(
     master_table_new
     .filter(f.col("GUAR_PERMRES").isNotNull())
#     .filter((f.col("CUST_PERMRES")=="With G S Certificate")|
#             (f.col("CUST_PERMRES")=="Rent")|
#            (f.col("CUST_PERMRES")=="No Guarantors")|
#             (f.col("CUST_PERMRES")=="With Parents/spouse")|
#             (f.col("CUST_PERMRES")=="With Billing Proof")
            
           )

Guarantor_PreNises_wise= (
       master_table_new_Guarantor_PreNises
#     .filter(~f.col("Guarantor PreNises").isNull())
    .groupBy("GUAR_PERMRES")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good_Guarantor_PreNises_wise=(
       Guarantor_PreNises_wise
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad_Guarantor_PreNises_wise= (
        CUST_PERMRES_wise
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

Guarantor_PreNises_wise=(
    Guarantor_PreNises_wise
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good_Guarantor_PreNises_wise+total_bad_Guarantor_PreNises_wise))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good_Guarantor_PreNises_wise    
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad_Guarantor_PreNises_wise   
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good_Guarantor_PreNises_wise) - (f.col("bad") / total_bad_Guarantor_PreNises_wise)) *
        f.ln((f.col("good") / total_good_Guarantor_PreNises_wise) / (f.col("bad") / total_bad_Guarantor_PreNises_wise))
    )
)
    .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)

# IV_Guarantor_PreNises_wise=4.694247e-08+5.527456e-05+2.444583e-04+1.225676e-03+3.686925e-05
IV_Guarantor_PreNises_wise=	0.003338+0.009574+0.002563+0.000303+0.046931

In [28]:
Guarantor_PreNises_wise.toPandas()

Unnamed: 0,GUAR_PERMRES,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,With G S Certificate,122,36,0.022033,0.02049,0.029581,3.388889,0.003338,0.772152
1,Rent,1039,265,0.181844,0.174505,0.217749,3.920755,0.009574,0.796779
2,With Parents/Spouse,952,220,0.163436,0.159893,0.180772,4.327273,0.002563,0.812287
3,With Billing Proof,2781,583,0.469112,0.467081,0.479047,4.770154,0.000303,0.826694
4,No Guaranter,1060,120,0.164552,0.178032,0.098603,8.833333,0.046931,0.898305


In [29]:
# Guarantor_PreNises_wise.toPandas().to_csv("../output/20240126_final_output/guarantor_pre_wise_active3w.csv")

In [30]:
IV_Guarantor_PreNises_wise

0.062709

### Exposure Category(Score)

In [38]:
master_table_exp=(
    master_table_new.orderBy("EXP_CD")
    .filter(f.col("EXP_CD").isNotNull())
#     .filter(~f.col("Exposure").isNull())
#     .withColumn("Exposure_cat",
#                 f.when(f.col("Exposure").between(0,10),"0-10")
#                 .when(f.col("Exposure").between(11,20),"11-20")
#                 .when(f.col("Exposure").between(21,30),"21-30")
#                 .when(f.col("Exposure").between(31,40),"31-40")
#                 .when(f.col("Exposure").between(41,50),"41-50")
#                 .when(f.col("Exposure").between(51,60),"51-60")
#                 .when(f.col("Exposure").between(61,70),"61-70")
#                 .when(f.col("Exposure").between(71,80),"71-80")
#                 .when(f.col("Exposure")>80,"80>")
#                )
#     .withColumn("Exposure_cat",
#                 f.when(f.col("EXP_CD").between(0,20),"0-20")
#                 .when(f.col("EXP_CD").between(21,40),"21-40")
#                 .when(f.col("EXP_CD").between(41,60),"41-60")
#                 .when(f.col("EXP_CD").between(61,80),"61-80")
#                 .when(f.col("EXP_CD")>=80,"80>")
#                )
    .withColumn("Exposure_cat",
                f.when(f.col("EXP_CD").between(50,60),"50-60")
                .when(f.col("EXP_CD").between(60,70),"60-70")
                .when(f.col("EXP_CD").between(70,80),"70-80")
                .when(f.col("EXP_CD")<50,"below 50")
#                 .when(f.col("EXP_CD").between(61,80),"61-80")
                .when(f.col("EXP_CD")>=80,"80>")
               )
    .filter(f.col("Exposure_cat").isNotNull())
    
)

master_table_new1=master_table_exp.groupBy("Exposure_cat").agg(f.count("FAC NO"))

Exposure_Category_Score_wise= (
        master_table_exp.groupBy("Exposure_cat")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good_Exposure_Category_Score_wise=(
      Exposure_Category_Score_wise
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad_Exposure_Category_Score_wise= (
       Exposure_Category_Score_wise
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

Exposure_Category_Score_wise=(
    Exposure_Category_Score_wise
    .withColumnRenamed("Exposure Category(Score)","Exp")
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good_Exposure_Category_Score_wise+total_bad_Exposure_Category_Score_wise))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good_Exposure_Category_Score_wise    
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad_Exposure_Category_Score_wise  
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good_Exposure_Category_Score_wise) - (f.col("bad") / total_bad_Exposure_Category_Score_wise)) *
        f.ln((f.col("good") / total_good_Exposure_Category_Score_wise) / (f.col("bad") / total_bad_Exposure_Category_Score_wise))
    )
)
    .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)
# IVP_Exp_value=(
# 0.050055+
# 0.030004+
# 0.055117+
# 0.001172+
# 0.053804+
# 0.039878+
# 0.034502+
# 0.007726+
# 0.003169
# )
# IVP_Exp_value=0.063524+0.018908+0.005051+0.048916+0.029908
IVP_EXP_value=0.101762+0.004935+0.033289+0.060101+0.018490

In [39]:
master_table_new1.show()

+------------+-------------+
|Exposure_cat|count(FAC NO)|
+------------+-------------+
|    below 50|         1548|
|       60-70|          988|
|       50-60|          859|
|         80>|         2370|
|       70-80|         1274|
+------------+-------------+



In [40]:
Exposure_Category_Score_wise.toPandas().head(10)

Unnamed: 0,Exposure_cat,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,80>,1764,606,0.336696,0.302107,0.505,2.910891,0.104242,0.744304
1,70-80,1070,204,0.180992,0.183251,0.17,5.245098,0.000995,0.839874
2,60-70,846,142,0.140361,0.144888,0.118333,5.957746,0.005376,0.856275
3,50-60,751,108,0.122034,0.128618,0.09,6.953704,0.013788,0.874272
4,below 50,1408,140,0.219918,0.241137,0.116667,10.057143,0.090371,0.909561


In [37]:
Exposure_Category_Score_wise.toPandas().to_csv("../output/20240126_final_output/exp_cat_wise_active3w_new.csv")

In [35]:
IVP_EXP_value

0.21857700000000002

### CRIB

In [226]:
master_table_new_CRIB = (
    master_table_new
    .filter(~f.col("CRIB score").isin([-400]))  # Filter out -400 values
    .withColumn("CRIB_score_cat",
                f.when(f.col("CRIB score").between(-80, -61), "(-80)-(-61)")
                .when(f.col("CRIB score").between(-60, -41), "(-60)-(-41)")
                .when(f.col("CRIB score").between(-40, -21), "(-40)-(-21)")
                .when(f.col("CRIB score").between(-20, 0), "(-20)-0")
                .when(f.col("CRIB score").between(1, 20), "1-20")
                .when(f.col("CRIB score").between(21, 40), "21-40")
                .when(f.col("CRIB score").between(41, 60), "41-60")
                .when(f.col("CRIB score").between(61, 80), "61-80")
                .when(f.col("CRIB score") > 80, "80>")
             .when(f.col("CRIB score")=="NO HIT","NO HIT")
#                 .otherwise("Unknown")  # Handle other cases
    
               )
    .filter(f.col("CRIB_score_cat").isNotNull())
)
master_table_new_CRIB1=master_table_new_CRIB.groupBy("CRIB_score_cat").agg(f.count("CRIB score"))
CRIB= (
      master_table_new_CRIB.groupBy("CRIB_score_cat")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good_CRIB=(
       CRIB
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad_CRIB= (
        CRIB
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

CRIB=(
    CRIB
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good_CRIB+total_bad_CRIB))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good_CRIB   
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad_CRIB 
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good_CRIB) - (f.col("bad") / total_bad_CRIB)) *
        f.ln((f.col("good") / total_good_CRIB) / (f.col("bad") / total_bad_CRIB))
    )
)
    .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)
# IV_value_DSCR= 0.000052+0.004582+0.0036=66+0.023002+0.000092
IV_value_CRIB=0.062192+0.015977+0.009420+0.007757+0.003961+0.000498+0.000215+0.004002+0.054429+0.001011	

In [227]:
master_table_new_CRIB1.show()

+--------------+-----------------+
|CRIB_score_cat|count(CRIB score)|
+--------------+-----------------+
|       (-20)-0|               72|
|        NO HIT|              243|
|         21-40|              826|
|         61-80|              592|
|   (-80)-(-61)|               18|
|           80>|             1080|
|   (-60)-(-41)|               38|
|          1-20|              146|
|         41-60|              494|
|   (-40)-(-21)|               55|
+--------------+-----------------+



In [228]:
CRIB.toPandas().head(10)

Unnamed: 0,CRIB_score_cat,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,1-20,92,54,0.040965,0.03106,0.089701,1.703704,0.062192,0.630137
1,(-60)-(-41),24,14,0.010662,0.008103,0.023256,1.714286,0.015977,0.631579
2,(-40)-(-21),39,16,0.015432,0.013167,0.026578,2.4375,0.00942,0.709091
3,(-20)-0,53,19,0.020202,0.017893,0.031561,2.789474,0.007757,0.736111
4,21-40,671,155,0.231762,0.226536,0.257475,4.329032,0.003961,0.812349
5,NO HIT,199,44,0.068182,0.067184,0.07309,4.522727,0.000498,0.81893
6,61-80,489,103,0.166105,0.165091,0.171096,4.747573,0.000215,0.826014
7,41-60,422,72,0.138608,0.142471,0.119601,5.861111,0.004002,0.854251
8,80>,957,123,0.30303,0.323093,0.204319,7.780488,0.054429,0.886111
9,(-80)-(-61),16,2,0.005051,0.005402,0.003322,8.0,0.001011,0.888889


In [229]:
CRIB.toPandas().to_csv("../output/20240126_final_output/CRIB_cat_wise_active.csv")

In [230]:
IV_value_CRIB

0.15946200000000002

## DCSR

In [231]:
master_table_new_dcsr=(
     master_table_new
.filter(~f.col("DSCR_C").isNull())
    .withColumn("DSCR_cat",
                f.when(f.col("DSCR_C").between(0,5),"0-5")
                .when(f.col("DSCR_C").between(6,11),"6-11")
                .when(f.col("DSCR_C").between(12,17),"12-17")
                .when(f.col("DSCR_C").between(18,23),"18-23")
                .when(f.col("DSCR_C")>23,"23>")
               )
    .filter(f.col("DSCR_cat").isNotNull())
)
master_table_new_dcsr1=master_table_new_dcsr.groupBy("DSCR_cat").agg(f.count("DSCR_C"))
DSCR= (
      master_table_new_dcsr.groupBy("DSCR_cat")
        .agg(
            f.sum("good").alias("good"),
            f.sum("bad").alias("bad")
        )
    )
total_good_DSCR=(
       DSCR
        .agg(f.sum("good").alias("good"))
        .collect()[0]['good']
    )
total_bad_DSCR= (
        DSCR
        .agg(f.sum("bad").alias("bad"))
        .collect()[0]['bad']
    )

DSCR=(
    DSCR
    .withColumn(
        "Total_rate",
        ((f.col("good")+f.col("bad"))/(total_good_DSCR+total_bad_DSCR))
        )
    .withColumn(
        "good_rate",
        f.col("good")/total_good_DSCR   
    )
    .withColumn(
        "bad_rate",
        f.col("bad")/total_bad_DSCR 
    )
    .withColumn(
        "odd",
        f.col("good")/f.col("bad")  
    )
    .withColumn(
    "IV",
    (
        ((f.col("good") / total_good_DSCR) - (f.col("bad") / total_bad_DSCR)) *
        f.ln((f.col("good") / total_good_DSCR) / (f.col("bad") / total_bad_DSCR))
    )
)
    .withColumn(
       "good_rate_of_sub_cat",
    f.col("good") / (f.col("good") + f.col("bad"))
    )
    .orderBy("good_rate_of_sub_cat")
)
# IV_value_DSCR= 0.000052+0.004582+0.0036=66+0.023002+0.000092
IV_value_DSCR= 	0.004085+0.001120+	0.003951+0.019609+0.003596

In [232]:
master_table_new_dcsr1.show()

+--------+-------------+
|DSCR_cat|count(DSCR_C)|
+--------+-------------+
|   12-17|           70|
|     23>|          153|
|     0-5|         5653|
|    6-11|          750|
|   18-23|           39|
+--------+-------------+



In [233]:
DSCR.toPandas().head(10)

Unnamed: 0,DSCR_cat,good,bad,Total_rate,good_rate,bad_rate,odd,IV,good_rate_of_sub_cat
0,0-5,4620,1033,0.848162,0.837867,0.89748,4.47241,0.004097,0.817265
1,12-17,61,9,0.010503,0.011063,0.007819,6.777778,0.001125,0.871429
2,23>,135,18,0.022956,0.024483,0.015639,7.5,0.003965,0.882353
3,6-11,662,88,0.112528,0.120058,0.076455,7.522727,0.019677,0.882667
4,18-23,36,3,0.005851,0.006529,0.002606,12.0,0.003602,0.923077


In [234]:
DSCR.toPandas().to_csv("../output/20240126_final_output/DSCR_cat_wise_active_3w.csv")

In [235]:
IV_value_DSCR

0.032361

IV (Information Value Statistic)	Description of Correlation
<0.01                               No Correlation
0.01 - 0.03                        	Marginal contribution
0.03 - 0.10	                        Adequate predictability
0.10 - 0.30	                        Strong predictor
0.30 - 0.50	                        Very Strong predictor
>=0.5	                            "Suspicious" - Too good to be true


In [53]:
print("IV_value_CUST_PERMRES: ",IV_value_CUST_PERMRES)
print("IV_Guarantor_PreNises_wise:",IV_Guarantor_PreNises_wise)
print("IV_value_DSCR:",IV_value_DSCR)
print("IVP_Exp_value:",IVP_EXP_value)
print("Iv_value_Product_wise:",Iv_value_Product_wise)

IV_value_CUST_PERMRES:  0.01641
IV_Guarantor_PreNises_wise: 0.062701
IV_value_DSCR: 0.032361
IVP_Exp_value: 0.217904
Iv_value_Product_wise: 0.08003299999999999


# Selecting Customer good or bad (On Total Collectiion Ratio)