Pipeline Tests

In [51]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler, Imputer, VectorSlicer
from pyspark.ml import Pipeline
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import DenseVector
from pyspark.sql import functions as F

In [52]:
import random
import numpy as np
from pyspark.sql import Row
from sklearn import neighbors
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.stat import Statistics

In [53]:
from pyspark.sql import SparkSession

In [54]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [55]:
spark = SparkSession\
    .builder\
    .appName("LC_Model_Pipeline")\
    .getOrCreate()

In [56]:
df = spark.read.option('inferschema','true').csv('data/Data_Exploration.csv', header=True)

In [57]:
df = df.limit(20000)

In [58]:
#We remove categorical features that have too broad a set of values, or are highly imbalanced, or could cause data leakage. 
#We can elaborate and use them for feature extraction later, but they are not needed for a baseline
remove = ['addr_state', 'earliest_cr_line', 'home_ownership', 'initial_list_status', 'issue_d', 'emp_length',
          'loan_status', 'purpose', 'sub_grade', 'term', 'title', 'zip_code', 'application_type']
df = df.drop(*remove)

In [59]:
#Creating list of categorical and numeric features
cat_cols = [item[0] for item in df.dtypes if item[1].startswith('string')]
num_cols = [item[0] for item in df.dtypes if item[1].startswith('in') or item[1].startswith('dou')]

In [60]:
#Count number of nulls for each column:
nulls = df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()

In [61]:
nulls.T[(nulls.T > 0).any(axis=1)].index

Index(['dti', 'int_rate', 'revol_util'], dtype='object')

In [62]:
impute_list = list(nulls.T[(nulls.T > 0).any(axis=1)].index)

In [None]:
#Both attributes are continuous so we will impute

In [64]:
imputer = Imputer(inputCols=impute_list, outputCols=[i+"_imp" for i in impute_list])

In [65]:
model = imputer.fit(df)

In [66]:
df = model.transform(df)

In [67]:
num_cols+=[i+"_imp" for i in impute_list]

In [68]:
num_cols = [i for i in num_cols if i not in impute_list]

In [69]:
num_cols.remove("is_default")

In [70]:
#We will choose these features for our baseline model:
num_att, cat_att = num_cols, cat_cols

In [71]:
#Removing the target feature from pipeline inputs
if "label" in num_att:
    num_att.remove("label")

In [72]:
stages= []

for col in cat_att:

    stringIndexer = StringIndexer(inputCol = col , outputCol = col + '_StringIndex')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[col + '_ClassVect'])
    stages += [stringIndexer, encoder]

#Assembling mixed data type transformations:
assemblerInputs = [c + "_ClassVect" for c in cat_att] + num_att
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stages += [assembler]

In [73]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
stages += [scaler]

In [74]:
#Recreating the Chosen Logistic Regression model
#Just load it from models dir
lr = LogisticRegression(featuresCol='scaledFeatures', labelCol='is_default', maxIter=10, regParam=0.1, elasticNetParam=0.01)
stages += [lr]

In [75]:
#Creating and running the pipeline:
pipeline = Pipeline(stages=stages)

In [76]:
pipelineModel = pipeline.fit(df)

In [77]:
out_df = pipelineModel.transform(df)

In [78]:
#notice we did not split between train and test here - all we're looking for is a dry run of the model pipeline end to end

In [79]:
out_df.drop("features", "rawPrediction", "scaledFeatures", "grade_StringIndex", "grade_ClassVect", "verification_status_StringIndex", "verification_status_ClassVect")\
    .toPandas().to_csv("data/chosen_model/predictions/data_w_predictions.csv", index=False)

In [80]:
out_df

DataFrame[acc_now_delinq: int, acc_open_past_24mths: int, annual_inc: double, avg_cur_bal: int, chargeoff_within_12_mths: int, collections_12_mths_ex_med: int, delinq_2yrs: int, delinq_amnt: int, dti: double, funded_amnt: int, funded_amnt_inv: double, grade: string, inq_last_6mths: int, installment: double, int_rate: double, loan_amnt: int, mo_sin_old_rev_tl_op: int, mo_sin_rcnt_rev_tl_op: int, mo_sin_rcnt_tl: int, mort_acc: int, num_accts_ever_120_pd: int, num_actv_bc_tl: int, num_actv_rev_tl: int, num_bc_sats: int, num_bc_tl: int, num_il_tl: int, num_op_rev_tl: int, num_rev_accts: double, num_rev_tl_bal_gt_0: int, num_sats: int, num_tl_30dpd: int, num_tl_90g_dpd_24m: int, num_tl_op_past_12m: int, open_acc: int, pct_tl_nvr_dlq: double, policy_code: int, pub_rec: int, pub_rec_bankruptcies: int, revol_bal: int, revol_util: double, tax_liens: int, tot_cur_bal: int, tot_hi_cred_lim: int, total_acc: int, total_bal_ex_mort: int, total_bc_limit: int, total_il_high_credit_limit: int, total_re

In [95]:
out_df.take(1)

[Row(acc_now_delinq=0, acc_open_past_24mths=8, annual_inc=65000.0, avg_cur_bal=10086, chargeoff_within_12_mths=0, collections_12_mths_ex_med=0, delinq_2yrs=0, delinq_amnt=0, dti=23.84, funded_amnt=12000, funded_amnt_inv=12000.0, grade='B', inq_last_6mths=0, installment=253.79, int_rate=9.0, loan_amnt=12000, mo_sin_old_rev_tl_op=145, mo_sin_rcnt_rev_tl_op=4, mo_sin_rcnt_tl=1, mort_acc=2, num_accts_ever_120_pd=0, num_actv_bc_tl=2, num_actv_rev_tl=4, num_bc_sats=5, num_bc_tl=10, num_il_tl=13, num_op_rev_tl=11, num_rev_accts=22.0, num_rev_tl_bal_gt_0=4, num_sats=18, num_tl_30dpd=0, num_tl_90g_dpd_24m=0, num_tl_op_past_12m=3, open_acc=18, pct_tl_nvr_dlq=100.0, policy_code=1, pub_rec=0, pub_rec_bankruptcies=0, revol_bal=9786, revol_util=13.4, tax_liens=0, tot_cur_bal=181540, tot_hi_cred_lim=291626, total_acc=37, total_bal_ex_mort=74787, total_bc_limit=49500, total_il_high_credit_limit=105910, total_rev_hi_lim=73200, verification_status='Not Verified', is_default=0, issue_month=12, dti_imp=23

In [96]:
listss = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 8.0, 65000.0, 10086.0, 0.0, 0.0, 0.0, 0.0, 12000.0, 12000.0, 0.0, 253.79, 12000.0, 145.0, 4.0, 1.0, 2.0, 0.0, 2.0, 4.0, 5.0, 10.0, 13.0, 11.0, 22.0, 4.0, 18.0, 0.0, 0.0, 3.0, 18.0, 100.0, 1.0, 0.0, 0.0, 9786.0, 0.0, 181540.0, 291626.0, 37.0, 74787.0, 49500.0, 105910.0, 73200.0, 12.0, 23.84, 9.0, 13.4]

In [101]:
print(listss[:50])

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 8.0, 65000.0, 10086.0, 0.0, 0.0, 0.0, 0.0, 12000.0, 12000.0, 0.0, 253.79, 12000.0, 145.0, 4.0, 1.0, 2.0, 0.0, 2.0, 4.0, 5.0, 10.0, 13.0, 11.0, 22.0, 4.0, 18.0, 0.0, 0.0, 3.0, 18.0, 100.0, 1.0, 0.0, 0.0, 9786.0, 0.0, 181540.0, 291626.0, 37.0, 74787.0, 49500.0]


In [86]:
type(pipelineModel)

pyspark.ml.pipeline.PipelineModel

In [93]:
#pipelineModel.write().overwrite().save("data/chosen_model/pipeline")

In [91]:
pipelineModel.write().overwrite().save("s3a://cdp-sandbox-default-se.s3.us-east-2.amazonaws.com/pdefusco")

Py4JJavaError: An error occurred while calling o4025.save.
: org.apache.hadoop.fs.s3a.AWSClientIOException: doesBucketExist on cdp-sandbox-default-se.s3.us-east-2.amazonaws.com: com.amazonaws.SdkClientException: Unable to execute HTTP request: Certificate for <cdp-sandbox-default-se.s3.us-east-2.amazonaws.com.s3.amazonaws.com> doesn't match any of the subject alternative names: [*.s3.amazonaws.com, s3.amazonaws.com]: Unable to execute HTTP request: Certificate for <cdp-sandbox-default-se.s3.us-east-2.amazonaws.com.s3.amazonaws.com> doesn't match any of the subject alternative names: [*.s3.amazonaws.com, s3.amazonaws.com]
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:204)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:112)
	at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:315)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:407)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:311)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:286)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.verifyBucketExists(S3AFileSystem.java:506)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:387)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3344)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:138)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3404)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3372)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:493)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:699)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:180)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: com.amazonaws.SdkClientException: Unable to execute HTTP request: Certificate for <cdp-sandbox-default-se.s3.us-east-2.amazonaws.com.s3.amazonaws.com> doesn't match any of the subject alternative names: [*.s3.amazonaws.com, s3.amazonaws.com]
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleRetryableException(AmazonHttpClient.java:1175)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1121)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:770)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:744)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:726)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:686)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:668)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:532)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:512)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4920)
	at com.amazonaws.services.s3.AmazonS3Client.getBucketRegionViaHeadRequest(AmazonS3Client.java:5700)
	at com.amazonaws.services.s3.AmazonS3Client.fetchRegionFromCache(AmazonS3Client.java:5673)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4904)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4866)
	at com.amazonaws.services.s3.AmazonS3Client.headBucket(AmazonS3Client.java:1394)
	at com.amazonaws.services.s3.AmazonS3Client.doesBucketExist(AmazonS3Client.java:1333)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$verifyBucketExists$1(S3AFileSystem.java:507)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:110)
	... 25 more
Caused by: javax.net.ssl.SSLPeerUnverifiedException: Certificate for <cdp-sandbox-default-se.s3.us-east-2.amazonaws.com.s3.amazonaws.com> doesn't match any of the subject alternative names: [*.s3.amazonaws.com, s3.amazonaws.com]
	at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.verifyHostname(SSLConnectionSocketFactory.java:467)
	at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:397)
	at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:355)
	at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142)
	at com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:373)
	at sun.reflect.GeneratedMethodAccessor345.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76)
	at com.amazonaws.http.conn.$Proxy45.connect(Unknown Source)
	at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:381)
	at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:237)
	at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
	at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
	at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
	at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
	at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1297)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1113)
	... 41 more
