In [4]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
# The session runs on YARN with Hive support enabled and an explicit
# warehouse directory.
spark = (
    SparkSession.builder
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", "/user/itv012713/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)



In [5]:
from pyspark.sql.functions import current_timestamp, when, col

## Generating a DataFrame with proper datatypes

In [6]:
# DDL-style schema string for the raw defaulters file: one "name type" entry
# per column, joined with commas. member_id is a string; every other column
# is read as float.
defaulter_schema = ",".join(
    [
        "member_id string",
        "delinq_2yrs float",
        "delinq_amnt float",
        "pub_rec float",
        "pub_rec_bankruptcies float",
        "inq_last_6mths float",
        "total_rec_late_fee float",
        "mths_since_last_delinq float",
        "mths_since_last_record float",
    ]
)

In [19]:
# Read the raw defaulters CSV with the explicit schema declared in
# defaulter_schema; the first line of each file is treated as a header.
loans_def_raw_df = (
    spark.read
    .format("csv")
    .schema(defaulter_schema)
    .option("header", True)
    .load("/public/trendytech/lendingclubproject/raw/loans_defaulters_csv")
)

In [20]:
# Bare expression as the cell's last line: the notebook renders a preview of
# the raw DataFrame as the cell output.
loans_def_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
9cb79aa7323e81be1...,2.0,0.0,0.0,0.0,0.0,0.0,11.0,
0dd2bbc517e3c8f9e...,0.0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0.0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0.0,0.0,0.0,0.0,0.0,0.0,,
aac68850fdac09fd0...,1.0,0.0,0.0,0.0,0.0,0.0,21.0,
3a423e4589e89f429...,0.0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0.0,0.0,0.0,0.0,1.0,0.0,,
c89986155a070db2e...,1.0,0.0,0.0,0.0,1.0,15.0,5.0,
118dc629b6e134419...,0.0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0.0,0.0,0.0,0.0,0.0,0.0,,


In [21]:
# Print the schema tree (column name, type, nullability) to confirm the
# declared types were applied to the loaded data.
loans_def_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: float (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- pub_rec: float (nullable = true)
 |-- pub_rec_bankruptcies: float (nullable = true)
 |-- inq_last_6mths: float (nullable = true)
 |-- total_rec_late_fee: float (nullable = true)
 |-- mths_since_last_delinq: float (nullable = true)
 |-- mths_since_last_record: float (nullable = true)



In [22]:
# Register the raw DataFrame as the temp view "loan_defaultors" so it can be
# queried through spark.sql.
loans_def_raw_df.createOrReplaceTempView("loan_defaultors")

In [23]:
# Distribution of delinq_2yrs in the raw data, most frequent value first.
# The adjacent string literals concatenate into a single SQL statement.
spark.sql(
    "select delinq_2yrs, count(*) as total "
    "from loan_defaultors "
    "group by delinq_2yrs "
    "order by total desc"
).show(40)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|        0.0|1838878|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       null|    261|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       17.0|     30|
|       18.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       23.0|      2|
|       29.0|      2|
|       25.0|      2|
|       30.0|      2|
|       3.44|      2|
|      18.53|      1|
|      26.24|      1|
|      17.17|      1|
|       39.0|      1|
|       1.37|      1|
|       58.0|      1|
|      13.76|      1|
|       9.44|      1|
|       5.52|      1|
+-----------+-------+
only showing top 40 rows



## Converting the delinq_2yrs column to integer and replacing nulls with 0

In [24]:
# Derive the processed DataFrame from the raw one (the raw frame is left
# untouched):
#   1. cast delinq_2yrs to integer - the raw column also holds fractional
#      values (e.g. 3.44), which come out as whole numbers after the cast;
#   2. replace the remaining nulls in delinq_2yrs with 0.
loans_def_processed_df = (
    loans_def_raw_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .fillna(0, subset=["delinq_2yrs"])
)

In [25]:
# Re-register "loan_defaultors" against the processed DataFrame. This replaces
# the view of the same name created earlier from the raw DataFrame, so every
# query from here on sees the cleaned delinq_2yrs column.
loans_def_processed_df.createOrReplaceTempView("loan_defaultors")

In [26]:
# Sanity check: count the rows in the view whose delinq_2yrs is still null
# (the output below shows 0).
spark.sql("select count(*) from loan_defaultors where delinq_2yrs is null")

count(1)
0


In [27]:
# Same distribution query as before, now running against the processed view:
# the values are integers and the null bucket is gone.
spark.sql(
    "select delinq_2yrs, count(*) as total "
    "from loan_defaultors "
    "group by delinq_2yrs "
    "order by total desc"
).show(40)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|          0|1839141|
|          1| 281337|
|          2|  81285|
|          3|  29545|
|          4|  13180|
|          5|   6601|
|          6|   3719|
|          7|   2063|
|          8|   1226|
|          9|    821|
|         10|    558|
|         11|    363|
|         12|    266|
|         13|    167|
|         14|    123|
|         15|     90|
|         16|     56|
|         17|     33|
|         18|     32|
|         19|     24|
|         20|     19|
|         21|     16|
|         22|      7|
|         24|      6|
|         23|      5|
|         26|      4|
|         25|      2|
|         30|      2|
|         29|      2|
|         27|      1|
|         35|      1|
|         28|      1|
|         42|      1|
|         39|      1|
|         58|      1|
|         36|      1|
|         32|      1|
+-----------+-------+



In [28]:
# Preview the rows of the processed view where delinq_2yrs equals 0.
spark.sql("select * from loan_defaultors where delinq_2yrs = 0")

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
0dd2bbc517e3c8f9e...,0,0.0,1.0,1.0,3.0,0.0,,115.0
458458599d3df3bfc...,0,0.0,1.0,1.0,1.0,0.0,,76.0
05ea141ec28b5c7f7...,0,0.0,0.0,0.0,0.0,0.0,,
3a423e4589e89f429...,0,0.0,0.0,0.0,0.0,0.0,,
f1efcf7dfbfef21be...,0,0.0,0.0,0.0,1.0,0.0,,
118dc629b6e134419...,0,0.0,0.0,0.0,0.0,0.0,,
a86fa4b7493708333...,0,0.0,0.0,0.0,0.0,0.0,,
6e8d94bf446e97025...,0,0.0,0.0,0.0,0.0,0.0,36.0,
3de585156dc6b73f6...,0,0.0,0.0,0.0,0.0,0.0,,
e88945f86a96f8d71...,0,0.0,0.0,0.0,1.0,0.0,,


## Dividing the data into two separate datasets: one for delinquencies and the other for public records and inquiries

In [39]:
# Delinquency dataset: keep the members that have either a delinquency count
# or a months-since-last-delinquency value greater than 0. The months column
# is converted to an integer in the projection.
loans_def_deling_df = spark.sql(
    "select member_id, delinq_2yrs, int(mths_since_last_delinq) "
    "from loan_defaultors "
    "where delinq_2yrs > 0 or mths_since_last_delinq > 0"
)

In [40]:
# Bare expression: render a preview of the delinquency dataset as the cell output.
loans_def_deling_df

member_id,delinq_2yrs,mths_since_last_delinq
9cb79aa7323e81be1...,2,11
aac68850fdac09fd0...,1,21
c89986155a070db2e...,1,5
6e8d94bf446e97025...,0,36
42f73fd8a01f1c475...,0,46
1eef79a0e79b72c7a...,1,21
1dd1d1b51473d4993...,0,44
ec1953dba2cfb89ad...,2,13
8241a6bb3a9350fb8...,0,57
cdc94fa1c29a6a70a...,0,44


In [43]:
# Public-records / enquiries dataset: the member_ids that have a public
# record, a public-record bankruptcy, or an inquiry in the last 6 months
# (any of the three columns greater than 0).
loans_def_records_enq_df = spark.sql(
    "select member_id "
    "from loan_defaultors "
    "where pub_rec > 0.0 or pub_rec_bankruptcies > 0.0 or inq_last_6mths > 0"
)

In [44]:
# Bare expression: render a preview of the public-records / enquiries
# member_ids as the cell output.
loans_def_records_enq_df

member_id
0dd2bbc517e3c8f9e...
458458599d3df3bfc...
f1efcf7dfbfef21be...
c89986155a070db2e...
e88945f86a96f8d71...
4e1c30a5dfe9f1e20...
76cbefe31f7834f47...
47d002f59a274c6f2...
09a1c6855801dad88...
56d4375718ad6940d...


## Writing the data back to hdfs in parquet and csv format for further use

In [None]:
# Persist the delinquency dataset as CSV with a header row. "overwrite" mode
# replaces whatever already exists at the target path, so the cell can be
# re-run.
(
    loans_def_deling_df.write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .option("path", "/user/itv012713/lendingclubproject/clean_data/loans_defaultors_deling_csv")
    .save()
)