# Models

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression

In [3]:
# Load the training CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv('train_data.csv')

In [4]:
# Remove four columns from the loaded frame; all remaining columns keep their
# original order. Names that are absent from the frame are simply ignored.
dfs = df.drop('addr_state', 'term', '_c0', 'emp_length_low')

In [5]:
# Preview the first five rows of the trimmed frame.
dfs.show(n=5)

+------------------+---------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+
|loan_status_binary|loan_amnt|int_rate|home_ownership|tot_cur_bal|total_pymnt|annual_inc|fico_range_low|last_pymnt_amnt|grade_CD|grade_EFG|
+------------------+---------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+
|                 0|  11000.0|    7.21|           ANY|    28511.0|    1354.03|   40000.0|         715.0|         340.71|       0|        0|
|                 0|   4000.0|    22.9|           ANY|   108997.0|    1386.67|   40000.0|         665.0|         154.64|       0|        1|
|                 0|  10000.0|   17.97|           ANY|    20320.0|    1435.54|   60000.0|         665.0|         361.38|       1|        0|
|                 0|  13000.0|   10.91|           ANY|    34947.0|  14686.205|   45000.0|         725.0|        7470.61|       0|        0|
|                 0|

In [6]:
# Print the column names, types and nullability of the trimmed frame.
dfs.printSchema()

root
 |-- loan_status_binary: integer (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- fico_range_low: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- grade_CD: integer (nullable = true)
 |-- grade_EFG: integer (nullable = true)



In [18]:
# Count the rows whose tot_cur_bal value is missing.
# TODO: decide how to handle these missing values.
dfs.where(dfs.tot_cur_bal.isNull()).count()

64515

In [26]:
# Random 80/20 split into training and test frames; the fixed seed keeps the
# split repeatable for the same input data and partitioning.
train, test = dfs.randomSplit(weights=[0.8, 0.2], seed=12345)

In [27]:
# Map the home_ownership strings to numeric category indices in homeIndex.
stringIndexer = StringIndexer(
    inputCol="home_ownership",
    outputCol="homeIndex",
)

In [28]:
# One-hot encode the indexed home_ownership category into the homeVec vector.
encoder = OneHotEncoder(
    inputCol="homeIndex",
    outputCol="homeVec",
)

In [29]:
# Numeric columns packed into a single vector column, in this order.
# Despite the variable name, no scaling happens in this step: the assembler
# only concatenates the raw values into 'vector_features'.
numeric_inputs = [
    'loan_amnt',
    'total_pymnt',
    'annual_inc',
    'fico_range_low',
    'last_pymnt_amnt',
    'int_rate',
]
scaled_vectors = VectorAssembler(inputCols=numeric_inputs, outputCol='vector_features')

In [30]:
# Add the assembled 'vector_features' column to the training split.
output = scaled_vectors.transform(dataset=train)

In [31]:
# Preview the assembled training frame (default 20 rows, long cells truncated).
output.show(n=20, truncate=True)

+------------------+---------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+--------------------+
|loan_status_binary|loan_amnt|int_rate|home_ownership|tot_cur_bal|total_pymnt|annual_inc|fico_range_low|last_pymnt_amnt|grade_CD|grade_EFG|     vector_features|
+------------------+---------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+--------------------+
|                 0|    500.0|     8.0|          RENT|       null|   541.2802|    3300.0|         745.0|          26.39|       0|        0|[500.0,541.2802,3...|
|                 0|    500.0|    8.07|          RENT|       null|  565.02515|   18000.0|         720.0|          33.16|       0|        0|[500.0,565.02515,...|
|                 0|    500.0|    8.32|         OTHER|       null|     503.54|  100000.0|         745.0|         503.59|       0|        0|[500.0,503.54,100...|
|                 0|    500.0|    

In [32]:
# Standardise the assembled numeric vector using StandardScaler's default
# settings; the result is written to 'scaled_features'.
scaler = StandardScaler(
    inputCol='vector_features',
    outputCol='scaled_features',
)

In [39]:
# Index the binary loan status into the 'label' column.
# NOTE(review): StringIndexer assigns indices by label frequency by default,
# so the resulting 0.0/1.0 may not line up with the original 0/1 - confirm.
labelIndexer = StringIndexer(
    inputCol='loan_status_binary',
    outputCol='label',
)

In [33]:
# Final feature vector: the scaled numeric block, the one-hot home_ownership
# vector and the two grade indicator columns, concatenated in this order.
feature_inputs = ['scaled_features', 'homeVec', 'grade_CD', 'grade_EFG']
assembler = VectorAssembler(inputCols=feature_inputs, outputCol='features')

In [34]:
# Logistic regression limited to 10 iterations with a regularisation
# parameter of 0.01; feature and label columns are left at their defaults.
lr = LogisticRegression(
    regParam=0.01,
    maxIter=10,
)

In [40]:
# Stages run in list order. The numeric VectorAssembler (scaled_vectors) is
# not one of them, so any frame passed to this pipeline must already contain
# the 'vector_features' column that the scaler stage reads.
pipeline_stages = [
    stringIndexer,
    encoder,
    scaler,
    labelIndexer,
    assembler,
    lr,
]
pipeline = Pipeline(stages=pipeline_stages)

In [41]:
# Fit the pipeline on the training split, which already carries the assembled
# 'vector_features' column.
# NOTE(review): the recorded run of this cell failed with "The input column
# homeIndex should have at least two distinct values" - confirm that
# home_ownership has two or more distinct categories in the data being fitted.
model = pipeline.fit(output)

# The numeric VectorAssembler is not a pipeline stage, so the test split has no
# 'vector_features' column yet. Run it through the same assembler first;
# otherwise the scaler stage cannot find its input column.
test_assembled = scaled_vectors.transform(test)

# Make a prediction
prediction = model.transform(test_assembled)

IllegalArgumentException: 'requirement failed: The input column homeIndex should have at least two distinct values.'