In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [8]:
import pyspark.sql.functions as F

In [3]:
lending = spark.read.csv('accepted_2007_to_2018Q4.csv.gz', inferSchema=True, header=True)
lending.printSchema()

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- funded_amnt: double (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string 

## Decide on categories for features
loan_amnt, funded_amnt, term, int_rate, grade, emp_length, home_ownership, annual_inc, addr_state?, fico_range_low (there's no point in using both FICO fields, they are close together and highly correlated), what else??

In [4]:
lending.select('loan_status').distinct().show() #shows different loan statuses

+--------------------+
|         loan_status|
+--------------------+
|          Fully Paid|
|             Default|
|                null|
|     In Grace Period|
|Does not meet the...|
|         Charged Off|
|            Oct-2015|
|  Late (31-120 days)|
|             Current|
|Does not meet the...|
|   Late (16-30 days)|
+--------------------+



Response variable is loan_status. We'll need to change to a binary with 'Default', 'Charged Off', and 'Does not meet the credit policy. Status:Charged Off' can be 1 for default and the others can be 0 for not defaulted. Null rows will need to be deleted.

In [5]:
df_train = lending.select(['loan_amnt', 'funded_amnt', 'term', 'int_rate', 'grade', 'emp_length',
                          'home_ownership', 'annual_inc', 'addr_state', 'fico_range_low'])

In [11]:
# for each field, compute missing percentage

df_train.agg(*[
    (1 - F.count(c) / F.count('*')).alias(c + '_miss')
    for c in df_train.columns
]).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      loan_amnt_miss|    funded_amnt_miss|           term_miss|       int_rate_miss|          grade_miss|    emp_length_miss| home_ownership_miss|     annual_inc_miss|     addr_state_miss| fico_range_low_miss|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|1.459724218288993...|1.459724218288993...|1.459724218288993...|1.459724218288993...|1.459724218288993...|0.06499753837415911|1.459724218288993...|1.636660487169816...|1.503958285509199...|1.503958285509199...|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+---------