In [6]:
from pyspark.sql import *
import getpass
# Resolve the current OS user; the warehouse directory below is derived from it.
username = getpass.getuser()

# Configure a Hive-enabled session that runs on YARN, then create it
# (or reuse one that already exists in this process).
session_builder = (
    SparkSession.builder
    .appName('loans_defaulter')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
)
spark = session_builder.getOrCreate()

In [5]:
# The session is intentionally NOT stopped here: every cell below uses `spark`,
# so stopping it at this point breaks a top-to-bottom run of the notebook.
# Call spark.stop() only once all reads and writes have finished.

In [7]:
# DDL-style schema for the defaulters CSV: one string id column followed by
# eight float columns. Adjacent literals are concatenated into a single string.
schema = (
    "member_id string, "
    "delinq_2yrs float,"
    "delinq_amnt float,"
    "pub_rec float, "
    "pub_rec_bankruptcies float, "
    "inq_last_6mths float,"
    "total_rec_late_fee float,"
    "mths_since_last_delinq float, "
    "mths_since_last_record float"
)

In [8]:
# Load the raw defaulters CSV with the explicit schema defined above;
# the first line of each file is treated as a header row.
defaulters_df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/user/itv012667/lendingclub/raw/loans_defaulters_csv")
)

In [9]:
# Preview the first rows (defaults spelled out: 20 rows, long values truncated).
defaulters_df.show(n=20, truncate=True)

+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|           member_id|delinq_2yrs|delinq_amnt|pub_rec|pub_rec_bankruptcies|inq_last_6mths|total_rec_late_fee|mths_since_last_delinq|mths_since_last_record|
+--------------------+-----------+-----------+-------+--------------------+--------------+------------------+----------------------+----------------------+
|9cb79aa7323e81be1...|        2.0|        0.0|    0.0|                 0.0|           0.0|               0.0|                  11.0|                  null|
|0dd2bbc517e3c8f9e...|        0.0|        0.0|    1.0|                 1.0|           3.0|               0.0|                  null|                 115.0|
|458458599d3df3bfc...|        0.0|        0.0|    1.0|                 1.0|           1.0|               0.0|                  null|                  76.0|
|05ea141ec28b5c7f7...|        0.0|        0.0|    0.0|          

In [10]:
# Confirm that the explicit schema was applied: member_id as string, the rest as float.
defaulters_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [11]:
from pyspark.sql.functions import *

# Cast float columns to integer and replace nulls with 0

In [17]:
# Preview only: the result is displayed, not assigned, so defaulters_df is unchanged.
# Casts mths_since_last_delinq to integer and shows nulls in that column as 0.
(
    defaulters_df
    .withColumn(
        "mths_since_last_delinq",
        col("mths_since_last_delinq").cast("integer"),
    )
    .fillna(0, subset=["mths_since_last_delinq"])
)

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2.0,0.0,0.0,0.0,0.0,0.0,11,
0dd2bbc517e3c8f9e...,0.0,0.0,1.0,1.0,3.0,0.0,0,115.0
458458599d3df3bfc...,0.0,0.0,1.0,1.0,1.0,0.0,0,76.0
05ea141ec28b5c7f7...,0.0,0.0,0.0,0.0,0.0,0.0,0,
aac68850fdac09fd0...,1.0,0.0,0.0,0.0,0.0,0.0,21,
3a423e4589e89f429...,0.0,0.0,0.0,0.0,0.0,0.0,0,
f1efcf7dfbfef21be...,0.0,0.0,0.0,0.0,1.0,0.0,0,
c89986155a070db2e...,1.0,0.0,0.0,0.0,1.0,15.0,5,
118dc629b6e134419...,0.0,0.0,0.0,0.0,0.0,0.0,0,
a86fa4b7493708333...,0.0,0.0,0.0,0.0,0.0,0.0,0,


In [18]:
# Cast delinq_2yrs to integer, then replace nulls in that column with 0.
del_nulls = (
    defaulters_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .fillna(0, subset=["delinq_2yrs"])
)

In [14]:
# Display the rows whose delinq_2yrs is 0 (this includes nulls that were filled with 0).
del_nulls.where("delinq_2yrs=0")

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
0dd2bbc517e3c8f9e...,0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0,0.0,0.0,0.0,0.0,0.0,,
3a423e4589e89f429...,0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0,0.0,0.0,0.0,1.0,0.0,,
118dc629b6e134419...,0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0,0.0,0.0,0.0,0.0,0.0,,
6e8d94bf446e97025...,0,0.0,0.0,0.0,0.0,0.0,36.0,
3de585156dc6b73f6...,0,0.0,0.0,0.0,0.0,0.0,,
e88945f86a96f8d71...,0,0.0,0.0,0.0,1.0,0.0,,


# Columns related to delinquency

In [53]:
# Narrow to the delinquency columns, cast mths_since_last_delinq to integer,
# and keep rows where delinq_2yrs > 0 or mths_since_last_delinq > 0.
delinq_df = (
    del_nulls
    .select("member_id", "delinq_2yrs", "delinq_amnt", "mths_since_last_delinq")
    .withColumn(
        "mths_since_last_delinq",
        col("mths_since_last_delinq").cast("integer"),
    )
    .where("delinq_2yrs>0 or mths_since_last_delinq>0")
)

In [56]:
# Number of rows that passed the delinquency filter (triggers a Spark job).
delinq_df.count()

1106163

# Columns related to public records

In [20]:
# Cast the three public-record / enquiry columns from float to integer,
# then replace nulls in those columns with 0 in one fill.
del_nulls = (
    del_nulls
    .withColumn("pub_rec", col("pub_rec").cast("integer"))
    .withColumn("pub_rec_bankruptcies", col("pub_rec_bankruptcies").cast("integer"))
    .withColumn("inq_last_6mths", col("inq_last_6mths").cast("integer"))
    .fillna(0, subset=["pub_rec", "pub_rec_bankruptcies", "inq_last_6mths"])
)

In [21]:
# Keep members with at least one positive public-record, bankruptcy, or
# six-month enquiry count, then project only the id and those three columns.
public_records_df_detail = (
    del_nulls
    .where("pub_rec>0.0 or pub_rec_bankruptcies>0.0 or inq_last_6mths>0.0")
    .select("member_id", "pub_rec", "pub_rec_bankruptcies", "inq_last_6mths")
)

# write data to disk

In [22]:
# Persist the public-records / enquiries subset, replacing any previous output.
# The format is set explicitly so the files match the "_parquet" directory name
# even if the session's default data source (spark.sql.sources.default) is changed.
(
    public_records_df_detail.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv012667/lendingclub/cleaneddata/loans_defaulters_detail_records_enq_parquet")
    .save()
)