# Description
Create a logistic regression

# Setup

In [1]:
import pandas as pd
import plotly.express as px
import re
import numpy as np
from IPython.display import Image
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import joblib

import findspark
import pyspark
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql import DataFrame
import pyspark.sql.functions as sql
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.functions import vector_to_array

In [2]:
findspark.init()
sc = pyspark.SparkContext(appName="Test")
spark = SparkSession.builder.master("local[*]") \
                    .appName('test') \
                    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/10 14:10:40 WARN Utils: Your hostname, pop-os-note, resolves to a loopback address: 127.0.0.1; using 192.168.0.4 instead (on interface wlp2s0)
25/06/10 14:10:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/10 14:10:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/10 14:10:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
pd.set_option('display.max_rows', 1000)

In [4]:
input_path = "../data/"
output_path = "../outputs/"

# Data

In [5]:
train_data = spark.read.parquet(f"{output_path}train_df_woe.parquet")
test_data = spark.read.parquet(f"{output_path}test_df_woe.parquet")

                                                                                

# Analysis

In [6]:
selected_cols = \
['funded_amnt_woe',
 'term_woe',
 'sub_grade_woe',
 'home_ownership_woe',
 'annual_inc_woe',
 'verification_status_woe',
 'zip_code_woe',
 'dti_woe',
 'earliest_cr_line_woe',
 'fico_range_low_woe',
 'inq_last_6mths_woe',
 'revol_util_woe']

target_col = "default_flag"

Xtrain = train_data.select(selected_cols).toPandas()
ytrain = train_data.select(target_col).toPandas()[target_col]

                                                                                

## Sklearn

In [7]:
from sklearn.linear_model import LogisticRegression

log_reg_sk= LogisticRegression(penalty=None, C=99999, solver="saga")
log_reg_sk.fit(Xtrain, ytrain)



In [8]:
print(log_reg_sk.intercept_)
print(log_reg_sk.coef_)

[-1.39248857]
[[ 0.6824754   0.5353565   0.6363676   0.88731919  0.65011516  0.22695381
   0.91335388  0.5689844   0.60373196  0.46776623  0.46249377 -0.27359909]]


In [9]:
import joblib
joblib.dump(log_reg_sk, f"{output_path}sk_log_reg.gz")

['../outputs/sk_log_reg.gz']

## Stats Model

In [13]:
import statsmodels.api as sm
Xtrain_cons = sm.add_constant(Xtrain)
log_reg = sm.Logit(ytrain, Xtrain_cons).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.452541
         Iterations 6


0,1,2,3
Dep. Variable:,default_flag,No. Observations:,809048.0
Model:,Logit,Df Residuals:,809035.0
Method:,MLE,Df Model:,12.0
Date:,"Mon, 09 Jun 2025",Pseudo R-squ.:,0.09411
Time:,10:38:23,Log-Likelihood:,-366130.0
converged:,True,LL-Null:,-404160.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.3925,0.003,-463.422,0.000,-1.398,-1.387
funded_amnt_woe,0.6825,0.020,33.474,0.000,0.643,0.722
term_woe,0.5354,0.008,64.282,0.000,0.519,0.552
sub_grade_woe,0.6364,0.006,110.500,0.000,0.625,0.648
home_ownership_woe,0.8873,0.018,50.464,0.000,0.853,0.922
annual_inc_woe,0.6501,0.021,30.764,0.000,0.609,0.691
verification_status_woe,0.2270,0.013,16.931,0.000,0.201,0.253
zip_code_woe,0.9134,0.016,57.019,0.000,0.882,0.945
dti_woe,0.5690,0.011,50.103,0.000,0.547,0.591


## Pyspark ML

In [None]:
selected_cols = \
['funded_amnt_woe',
 'term_woe',
 'sub_grade_woe',
 'home_ownership_woe',
 'annual_inc_woe',
 'verification_status_woe',
 'zip_code_woe',
 'dti_woe',
 'earliest_cr_line_woe',
 'fico_range_low_woe',
 'inq_last_6mths_woe',
 'revol_util_woe']

target_col = "default_flag"

vectoriser = VectorAssembler(inputCols=selected_cols, outputCol="features")
glr = GeneralizedLinearRegression(family="binomial", 
                                  link="logit", 
                                  maxIter=10, 
                                  regParam=0, 
                                  featuresCol="features",
                                  labelCol=target_col)

pipeline = Pipeline(stages=[vectoriser, glr])


model = pipeline.fit(train_data)

25/06/09 10:54:01 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:08 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:14 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:18 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:23 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:27 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
25/06/09 10:54:31 WARN Instrumentation: [872d0bf2] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [30]:
model.stages[1].summary

                                                                                

Coefficients:
             Feature Estimate Std Error   T Value P Value
         (Intercept)  -1.3925    0.0030 -463.4218  0.0000
     funded_amnt_woe   0.6825    0.0204   33.4737  0.0000
            term_woe   0.5354    0.0083   64.2819  0.0000
       sub_grade_woe   0.6364    0.0058  110.4996  0.0000
  home_ownership_woe   0.8873    0.0176   50.4644  0.0000
      annual_inc_woe   0.6501    0.0211   30.7642  0.0000
verification_stat...   0.2270    0.0134   16.9309  0.0000
        zip_code_woe   0.9134    0.0160   57.0190  0.0000
             dti_woe   0.5690    0.0114   50.1032  0.0000
earliest_cr_line_woe   0.6038    0.0202   29.8682  0.0000
  fico_range_low_woe   0.4678    0.0109   42.9222  0.0000
  inq_last_6mths_woe   0.4625    0.0182   25.4253  0.0000
      revol_util_woe  -0.2736    0.0220  -12.4080  0.0000

(Dispersion parameter for binomial family taken to be 1.0000)
    Null deviance: 808325.9463 on 809035 degrees of freedom
Residual deviance: 732254.3396 on 809035 degrees of

In [50]:
model\
    .transform(train_data)\
    .agg(avg(target_col).alias("actual"),
         avg("prediction").alias("pred"))\
    .show()



+-------------------+-------------------+
|             actual|               pred|
+-------------------+-------------------+
|0.19938866420780968|0.19938866420781015|
+-------------------+-------------------+



                                                                                

In [51]:
test_data\
    .transform(model.transform)\
    .agg(avg(target_col).alias("actual"),
         avg("prediction").alias("pred"))\
    .show()



+-------------------+-------------------+
|             actual|               pred|
+-------------------+-------------------+
|0.20043558030687264|0.19942816374955474|
+-------------------+-------------------+



                                                                                

In [None]:
model.stages[1].summary

Coefficients:
             Feature Estimate Std Error   T Value P Value
         (Intercept)  -1.3925    0.0030 -463.4218  0.0000
     funded_amnt_woe   0.6825    0.0204   33.4737  0.0000
            term_woe   0.5354    0.0083   64.2819  0.0000
       sub_grade_woe   0.6364    0.0058  110.4996  0.0000
  home_ownership_woe   0.8873    0.0176   50.4644  0.0000
      annual_inc_woe   0.6501    0.0211   30.7642  0.0000
verification_stat...   0.2270    0.0134   16.9309  0.0000
        zip_code_woe   0.9134    0.0160   57.0190  0.0000
             dti_woe   0.5690    0.0114   50.1032  0.0000
earliest_cr_line_woe   0.6038    0.0202   29.8682  0.0000
  fico_range_low_woe   0.4678    0.0109   42.9222  0.0000
  inq_last_6mths_woe   0.4625    0.0182   25.4253  0.0000
      revol_util_woe  -0.2736    0.0220  -12.4080  0.0000

(Dispersion parameter for binomial family taken to be 1.0000)
    Null deviance: 808325.9463 on 809035 degrees of freedom
Residual deviance: 732254.3396 on 809035 degrees of

In [57]:
model.save(f"{output_path}_log_reg.gz")

                                                                                

In [None]:
pipe = PipelineModel.load(f"{output_path}_log_reg.gz")

In [82]:
pipe.stages[-1].coefficients

DenseVector([0.6825, 0.5354, 0.6364, 0.8873, 0.6501, 0.227, 0.9134, 0.569, 0.6038, 0.4678, 0.4625, -0.2736])

In [None]:
dt = spark.read.parquet(f"{output_path}/_log_reg.gz/stages/1_GeneralizedLinearRegression_bbc52cf66d2b/data")

In [101]:
dt\
.withColumn("features", lit(pipe.stages[0].getInputCols()))\
.withColumn("temp", explode(F.arrays_zip("features", vector_to_array("coefficients").alias("coefficients"))))\
.withColumn("features", F.col("temp.features"))\
.withColumn("coefficients", F.col("temp.coefficients"))\
.drop("temp")\
.groupBy("intercept")\
.pivot("features")\
.agg(sum("coefficients"))\
.show()

[Stage 140:>                                                        (0 + 1) / 1]

+-------------------+------------------+------------------+--------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+-----------------------+------------------+
|          intercept|    annual_inc_woe|           dti_woe|earliest_cr_line_woe|fico_range_low_woe|   funded_amnt_woe|home_ownership_woe| inq_last_6mths_woe|     revol_util_woe|     sub_grade_woe|         term_woe|verification_status_woe|      zip_code_woe|
+-------------------+------------------+------------------+--------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+-----------------------+------------------+
|-1.3924913836320794|0.6500697919534217|0.5690049329222368|  0.6037555702965285|0.4677702711714637|0.6824781085590917|0.8872963663697023|0.46248972965391955|-0.2735607855371191|0.6363798884908634|0.535351288968826|    0.226971

                                                                                

In [81]:
pipe.stages[0].getInputCols()

['funded_amnt_woe',
 'term_woe',
 'sub_grade_woe',
 'home_ownership_woe',
 'annual_inc_woe',
 'verification_status_woe',
 'zip_code_woe',
 'dti_woe',
 'earliest_cr_line_woe',
 'fico_range_low_woe',
 'inq_last_6mths_woe',
 'revol_util_woe']

In [104]:
model.stages[1].getParam("link")

Param(parent='GeneralizedLinearRegression_bbc52cf66d2b', name='link', doc='The name of link function which provides the relationship between the linear predictor and the mean of the distribution function. Supported options: identity, log, inverse, logit, probit, cloglog and sqrt.')

## Pyspark ML Pipe

In [None]:
from pyspark.ml.param.shared import s