# Models

Matt Thomas, Max McGaw, Liam Mulcahy, Will Carruthers

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
# Import only the feature transformers this notebook uses instead of `*`,
# so it is clear where each name comes from.
from pyspark.ml.feature import (
    OneHotEncoder,
    StandardScaler,
    StringIndexer,
    VectorAssembler,
)

In [3]:
# Load the training CSV: the first row supplies column names and Spark infers the column types.
df = spark.read.csv(
    'train_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Peek at the first two raw rows.
df.show(n=2)

+---+------------------+---------+----------+--------+--------------+-----------+-----------+----------+----------+--------------+---------------+--------+---------+--------------+
|_c0|loan_status_binary|loan_amnt|      term|int_rate|home_ownership|tot_cur_bal|total_pymnt|annual_inc|addr_state|fico_range_low|last_pymnt_amnt|grade_CD|grade_EFG|emp_length_low|
+---+------------------+---------+----------+--------+--------------+-----------+-----------+----------+----------+--------------+---------------+--------+---------+--------------+
|  0|                 0|  11000.0| 36 months|    7.21|           ANY|    28511.0|    1354.03|   40000.0|        PA|         715.0|         340.71|       0|        0|             0|
|  1|                 0|   4000.0| 36 months|    22.9|           ANY|   108997.0|    1386.67|   40000.0|        MI|         665.0|         154.64|       0|        1|             0|
+---+------------------+---------+----------+--------+--------------+-----------+-----------+--

In [5]:
# Keep every column except the ones listed here; the original column order is preserved.
excluded_columns = {'addr_state', '_c0', 'emp_length', 'emp_length_low'}
dfs = df.select([name for name in df.columns if name not in excluded_columns])

In [6]:
# Peek at the first five rows after dropping the excluded columns.
dfs.show(n=5)

+------------------+---------+----------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+
|loan_status_binary|loan_amnt|      term|int_rate|home_ownership|tot_cur_bal|total_pymnt|annual_inc|fico_range_low|last_pymnt_amnt|grade_CD|grade_EFG|
+------------------+---------+----------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+
|                 0|  11000.0| 36 months|    7.21|           ANY|    28511.0|    1354.03|   40000.0|         715.0|         340.71|       0|        0|
|                 0|   4000.0| 36 months|    22.9|           ANY|   108997.0|    1386.67|   40000.0|         665.0|         154.64|       0|        1|
|                 0|  10000.0| 36 months|   17.97|           ANY|    20320.0|    1435.54|   60000.0|         665.0|         361.38|       1|        0|
|                 0|  13000.0| 36 months|   10.91|           ANY|    34947.0|  14686.205|   45

In [7]:
# Confirm the column types that inferSchema assigned before building the pipeline.
dfs.printSchema()

root
 |-- loan_status_binary: integer (nullable = true)
 |-- loan_amnt: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- total_pymnt: double (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- fico_range_low: double (nullable = true)
 |-- last_pymnt_amnt: double (nullable = true)
 |-- grade_CD: integer (nullable = true)
 |-- grade_EFG: integer (nullable = true)



In [8]:
# Count the rows whose tot_cur_bal is missing.
dfs.filter(dfs['tot_cur_bal'].isNull()).count()
# TODO: decide how to handle these nulls (e.g. impute or drop) before tot_cur_bal is used as a feature;
# it is not among the columns assembled into the model inputs in this notebook.

70018

In [9]:
# Seeded 80/20 split into training and held-out test rows.
train, test = dfs.randomSplit(weights=[0.8, 0.2], seed=12345)

In [10]:
# String-valued columns that will be indexed and one-hot encoded.
categories = [
    'home_ownership',
    'term',
]
# Pipeline stages; later cells append to this list.
stages = list()

In [11]:
# For each categorical column, add a StringIndexer (string -> numeric index)
# followed by a OneHotEncoder that reads the indexer's output column.
for category in categories:
    indexer = StringIndexer(inputCol=category, outputCol=f"{category}_Index")
    one_hot = OneHotEncoder(
        inputCol=indexer.getOutputCol(),
        outputCol=f"{category}classVec",
    )
    stages.extend([indexer, one_hot])

In [12]:
# AT added, then disabled: the one-hot vectors are combined by the `assembler` defined further down instead.
#va = VectorAssembler(inputCols=["home_ownershipclassVec", "termclassVec"], outputCol="features")  
#stages += [va]

In [13]:
# Numeric inputs packed into a single vector column so they can be standardized together.
# NOTE(review): total_pymnt and last_pymnt_amnt look like payment figures recorded after
# the loan was issued — confirm they are available at prediction time (possible label leakage).
numeric_columns = [
    'loan_amnt',
    'total_pymnt',
    'annual_inc',
    'fico_range_low',
    'last_pymnt_amnt',
    'int_rate',
]
scaled_vectors = VectorAssembler(inputCols=numeric_columns, outputCol='vector_features')

In [14]:
# AT commented out: the assembler runs as a pipeline stage, so no manual transform is needed here.
#output = scaled_vectors.transform(train)

In [15]:
#output.show()

In [16]:
# Standardize the assembled numeric vector; with no extra arguments StandardScaler
# uses its defaults (scale to unit standard deviation, no mean centering).
scaler = StandardScaler(
    inputCol='vector_features',
    outputCol='scaled_features',
)

In [17]:
# Produce the `label` column from loan_status_binary.
# StringIndexer orders values by descending frequency by default, so label 0.0 is the
# most common class; it equals the raw 0/1 value only while class 0 is the majority.
labelIndexer = StringIndexer(
    inputCol='loan_status_binary',
    outputCol='label',
)

In [18]:
# Final feature vector: the standardized numeric block, the one-hot encoded
# categoricals, and the two integer grade indicator columns.
assembler = VectorAssembler(
    inputCols=[
        'scaled_features',
        'home_ownershipclassVec',
        'termclassVec',
        'grade_EFG',
        'grade_CD',
    ],
    outputCol='features',
)

In [19]:
# Queue the numeric assembler, scaler, label indexer and final assembler, in the order they must run.
stages.extend([scaled_vectors, scaler, labelIndexer, assembler])

In [20]:
# AT commented out: superseded by the assembler and stage list built in the cells above.
#assembler = VectorAssembler(inputCols=['scaled_features', 'home_ownershipclassVec',\
#                                       'grade_CD', 'termclassVec',\
#                                       'grade_EFG'], outputCol='features')
#stages += [scaler, labelIndexer, assembler]

In [21]:
# Logistic regression is the last stage; it is given no column names, so it relies on
# the estimator's default `features` / `label` columns produced by the earlier stages.
lr = LogisticRegression(regParam=0.01, maxIter=10)
stages.append(lr)
# Show the assembled stage list as the cell output.
stages

[StringIndexer_04a394ff8bbb,
 OneHotEncoder_1ee409dcfcf9,
 StringIndexer_dd11e97877e6,
 OneHotEncoder_443f25da0d34,
 VectorAssembler_2b81480e6240,
 StandardScaler_eb5892137e04,
 StringIndexer_64a9c83f9577,
 VectorAssembler_4421b4096779,
 LogisticRegression_96a3061e05a7]

In [22]:
# Chain all queued stages (encoders, assemblers, scaler, label indexer, classifier) into one Pipeline.
pipeline = Pipeline(stages=stages)

In [23]:
# Fit every pipeline stage on the training split only; `model` is the fitted pipeline.
model = pipeline.fit(train)

In [24]:
# Score the held-out split with the fitted pipeline, then preview two scored rows.
prediction = model.transform(test)
prediction.show(n=2)

+------------------+---------+----------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+--------------------+----------------------+----------+-------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|loan_status_binary|loan_amnt|      term|int_rate|home_ownership|tot_cur_bal|total_pymnt|annual_inc|fico_range_low|last_pymnt_amnt|grade_CD|grade_EFG|home_ownership_Index|home_ownershipclassVec|term_Index| termclassVec|     vector_features|     scaled_features|label|            features|       rawPrediction|         probability|prediction|
+------------------+---------+----------+--------+--------------+-----------+-----------+----------+--------------+---------------+--------+---------+--------------------+----------------------+----------+-------------+--------------------+--------------------+-----+--------------------+--------------------+-------

In [25]:
# Cast the ground-truth column to a floating-point type so it lines up with the
# floating-point `prediction` column. The column is taken from `prediction`
# itself rather than from the separate parent frame `df`, which only resolved
# through shared column lineage and is fragile if the frames diverge.
prediction = prediction.withColumn(
    "loan_status_binary",
    prediction["loan_status_binary"].cast('float'),
)

In [32]:
# Count correctly classified test rows.
# The model predicts the indexed `label` column (output of the StringIndexer stage),
# so compare against `label` rather than the raw loan_status_binary: StringIndexer
# numbers values by frequency, and the two only coincide while class 0 is the majority.
matches = prediction.filter(prediction['label'] == prediction['prediction']).count()

In [33]:
# Total number of scored test rows (the denominator for accuracy).
counts = prediction.count()

In [34]:
# Fraction of test rows counted as matches.
# Assumes counts > 0; an empty test split would raise ZeroDivisionError here.
accuracy = matches / counts
print(f"accuracy = {accuracy}")

accuracy = 0.8812233175845823


In [37]:
# Shell command: export every notebook in ../Spark_ML_DS5559 to PDF with nbconvert
# (the log below shows it building the PDFs through xelatex).
!jupyter nbconvert --to pdf '../Spark_ML_DS5559'/*.ipynb

[NbConvertApp] Converting notebook ../Spark_ML_DS5559/Final_Project_DS5559_data.ipynb to pdf
[NbConvertApp] Support files will be in Final_Project_DS5559_data_files/
[NbConvertApp] Making directory ./Final_Project_DS5559_data_files
[NbConvertApp] Writing 80572 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 96996 bytes to ../Spark_ML_DS5559/Final_Project_DS5559_data.pdf
[NbConvertApp] Converting notebook ../Spark_ML_DS5559/Project_Models.ipynb to pdf
[NbConvertApp] Writing 43276 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 43651 bytes to ../Spark_ML_DS5559/Project_Models