In [1]:
# librerie

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StringType
from pyspark.sql import Row

from pyspark.ml.linalg import Vector as MLVector, Vectors as MLVectors
from pyspark.ml.feature import *

from pyspark.ml.classification import *
from pyspark.ml.regression import *

from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder
from pyspark.ml import Pipeline, PipelineModel

from pyspark.ml.evaluation import *


#####################################   mllib   #####################################

from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

In [2]:
# sessione

sc = SparkContext(appName="DDAM_Project", master="local[*]")
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("DDAM_Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
sdf = spark.read.parquet("hdfs://kddrtserver11.isti.cnr.it:9000/user/hpsa04/bank_loan_status_dataset")

columns = sdf.schema.names
columns_categorical = [col.name for col in sdf.schema.fields if isinstance(col.dataType, StringType)]
columns_numerical = [col for col in columns if col not in columns_categorical]

sdf.printSchema()

root
 |-- Annual_Income: long (nullable = true)
 |-- Bankruptcies: long (nullable = true)
 |-- Credit_Score: long (nullable = true)
 |-- Current_Credit_Balance: long (nullable = true)
 |-- Current_Loan_Amount: long (nullable = true)
 |-- Debt_Income_Rate: double (nullable = true)
 |-- Home_Ownership: string (nullable = true)
 |-- Installment_Rate: double (nullable = true)
 |-- Loan_Status: string (nullable = true)
 |-- Maximum_Open_Credit: long (nullable = true)
 |-- Monthly_Debt: double (nullable = true)
 |-- Months_since_last_delinquent: string (nullable = true)
 |-- Number_of_Credit_Problems: long (nullable = true)
 |-- Number_of_Open_Accounts: long (nullable = true)
 |-- Purpose: string (nullable = true)
 |-- Tax_Liens: long (nullable = true)
 |-- Term: string (nullable = true)
 |-- Years_in_current_job: double (nullable = true)
 |-- Years_of_Credit_History: double (nullable = true)
 |-- cluster_label: long (nullable = true)



# Classification on "Loan_Status"

# - Numeric Only Classifiers

Data Preparation

In [4]:
data=sdf[columns_numerical+['Loan_Status']]


assembler = VectorAssembler(
    inputCols=columns_numerical,
    outputCol="features")
output = assembler.transform(data)


indexer = StringIndexer(inputCol="Loan_Status", outputCol="label")
indexed = indexer.fit(output).transform(output)

standardizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='features',
                              outputCol='std_features')

sonar = indexed.select(['features', 'Loan_Status','label']) #sdf with one column being the vectorized features, another column 

# Prepare training and test set
training, test = sonar.randomSplit([0.7, 0.3])

###  Logistic Regression

In [5]:
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[ lr])

#This grid will have 6 x 2 x 3 = 36 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.elasticNetParam, [0.0,0.2,0.4,0.6,0.8,1]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.family, ['auto', 'binomial', 'multinomial' ]) \
    .build()


crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),  # areaUnderROC" (default)
                          numFolds=3,
                          parallelism=10) 

cvModel = crossval.fit(training)

prediction = cvModel.transform(test)  # Make predictions on test. cvModel uses the best model found

In [6]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % (accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedPrecision")

precision = evaluator.evaluate(prediction)
print("Precision = %g" % (precision))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedRecall")

recall = evaluator.evaluate(prediction)
print("Recall = %g" % (recall))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="f1")

f1 = evaluator.evaluate(prediction)
print("F1 = %g" % (f1))

Test Error = 0.302937
Accuracy = 0.697063
Precision = 0.625775
Recall = 0.697063
F1 = 0.578406


###  Support Vector Machines

In [7]:
lsvc = LinearSVC(maxIter=10, regParam=0.1)
pipeline = Pipeline(stages=[ standardizer,lsvc])

# this grid will have 5 x 2 x 5 = 50 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder()\
    .addGrid(lsvc.aggregationDepth, [2,3,4,7,10]) \
    .addGrid(lsvc.fitIntercept, [True, False]) \
    .addGrid(lsvc.regParam,[0.0,0.2,0.4,0.7,0.9]) \
    .build()


crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          parallelism=10)


cvModel = crossval.fit(training)

prediction = cvModel.transform(test)

https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.html
Only one metric can be chosen in the multiclassclassificationevaluator.

In [8]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % (accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedPrecision")

precision = evaluator.evaluate(prediction)
print("Precision = %g" % (precision))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedRecall")

recall = evaluator.evaluate(prediction)
print("Recall = %g" % (recall))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="f1")

f1 = evaluator.evaluate(prediction)
print("F1 = %g" % (f1))

Test Error = 0.302394
Accuracy = 0.697606
Precision = 0.486654
Recall = 0.697606
F1 = 0.573342


# - Classifiers considering mixed categorical and numeric features

Data Preparation for mixed input type classifiers

In [9]:
edf=sdf #encoded dataframe

#encoding


for col in columns_categorical:
    indexer = StringIndexer(inputCol=col, outputCol=col+'_Encoded')
    edf = indexer.fit(edf).transform(edf)


columnse = edf.schema.names #names of all columns in the encoded dataset
columns_categoricale = [col.name for col in edf.schema.fields if isinstance(col.dataType, StringType)]
columns_numericale = [col for col in columnse if (col not in columns_categoricale and col!= 'Loan_Status_Encoded') ]


data=edf[columns_numericale+['Loan_Status']]

# vectorize the numerical and freshly encoded categorical features
assembler = VectorAssembler(
    inputCols=columns_numericale, 
    outputCol="features", handleInvalid = "skip") #handleInvalid = "skip" is a current way to deal with missing values.
output = assembler.transform(data)


labelIndexer = StringIndexer(inputCol="Loan_Status", outputCol="label").fit(output)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Split the data into training and test sets (30% held out for testing)
(training, test) = output.randomSplit([0.7, 0.3], seed=0)

In [10]:
output.first

<bound method DataFrame.first of DataFrame[Annual_Income: bigint, Bankruptcies: bigint, Credit_Score: bigint, Current_Credit_Balance: bigint, Current_Loan_Amount: bigint, Debt_Income_Rate: double, Installment_Rate: double, Maximum_Open_Credit: bigint, Monthly_Debt: double, Number_of_Credit_Problems: bigint, Number_of_Open_Accounts: bigint, Tax_Liens: bigint, Years_in_current_job: double, Years_of_Credit_History: double, cluster_label: bigint, Home_Ownership_Encoded: double, Months_since_last_delinquent_Encoded: double, Purpose_Encoded: double, Term_Encoded: double, Loan_Status: string, features: vector]>

### Decision Tree

In [11]:
ldt = DecisionTreeClassifier()

In [12]:
pipeline = Pipeline(stages=[labelIndexer,  ldt, labelConverter]) #featureIndexer,

#This grid will have 6 x 6 x 2 x 2 = 144 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder()\
        .addGrid(ldt.maxBins, [112,150,180,200,300,500] )\
        .addGrid(ldt.maxDepth, [2,6,10,15,20,30] )\
        .addGrid(ldt.cacheNodeIds, [True, False]) \
        .addGrid(ldt.impurity, ['entropy', 'gini']) \
        .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          parallelism=10)  

cvModel = crossval.fit(training)

prediction = cvModel.transform(test)

In [13]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % (accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedPrecision")

precision = evaluator.evaluate(prediction)
print("Precision = %g" % (precision))
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedRecall")

recall = evaluator.evaluate(prediction)
print("Recall = %g" % (recall))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="f1")

f1 = evaluator.evaluate(prediction)
print("F1 = %g" % (f1))

Test Error = 0.400841
Accuracy = 0.599159
Precision = 0.605296
Recall = 0.599159
F1 = 0.602075


### Random Forest

In [14]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
#feature 11 has 112 categorical values and  therefore maxbins needs 
#to be at least as large as the max number of cat values

In [15]:
pipeline = Pipeline(stages=[labelIndexer,  rf, labelConverter]) #featureIndexer,

#This grid will have 6 x 2 x 4 x 5 = 180 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder()\
    .addGrid(rf.numTrees, [5,10,15,20,30])\
    .addGrid(rf.maxDepth, [2,10,15])  \
    .build()

#  .addGrid(rf.featureSubsetStrategy, ['1','5','6','10','14','17']) 
#    .addGrid(rf.impurity, ['entropy','gini'])  \
#    .addGrid(rf.maxBins, [112,200,300,400])  \

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          parallelism=10)  

cvModel = crossval.fit(training)

prediction = cvModel.transform(test)
selected = prediction.select("label", "features", "probability", "prediction")

In [16]:
selected.show(5)

+-----+--------------------+--------------------+----------+
|label|            features|         probability|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[1859.0,0.0,688.0...|[0.48694014628216...|       1.0|
|  0.0|[2143.0,0.0,745.0...|[0.81457781425442...|       0.0|
|  1.0|[2188.0,0.0,728.0...|[0.61733384301568...|       0.0|
|  0.0|[2297.0,1.0,697.0...|[0.60014927148771...|       0.0|
|  1.0|[2599.0,0.0,738.0...|[0.65021602604699...|       0.0|
+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [17]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % (accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedPrecision")

precision = evaluator.evaluate(prediction)
print("Precision = %g" % (precision))
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedRecall")

recall = evaluator.evaluate(prediction)
print("Recall = %g" % (recall))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="f1")

f1 = evaluator.evaluate(prediction)
print("F1 = %g" % (f1))

Test Error = 0.299031
Accuracy = 0.700969
Precision = 0.669123
Recall = 0.700969
F1 = 0.60747


###  Naive Bayes Classifier

In [18]:
nb = NaiveBayes(smoothing=1.0, modelType='multinomial')

In [19]:
pipeline = Pipeline(stages=[labelIndexer,  nb, labelConverter]) #featureIndexer,


paramGrid = ParamGridBuilder()\
    .addGrid(nb.smoothing,[0.4,0.7,0.9,1,1.1,1.2,1.3,1.5]) \
    .build()


crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          parallelism=10) 

cvModel = crossval.fit(training)

prediction = cvModel.transform(test)

In [20]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test Error = %g" % (1.0 - accuracy))
print("Accuracy = %g" % (accuracy))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedPrecision")

precision = evaluator.evaluate(prediction)
print("Precision = %g" % (precision))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="weightedRecall")

recall = evaluator.evaluate(prediction)
print("Recall = %g" % (recall))

evaluator = MulticlassClassificationEvaluator(labelCol="label"
                                              , predictionCol="prediction"
                                              , metricName="f1")

f1 = evaluator.evaluate(prediction)
print("F1 = %g" % (f1))

Test Error = 0.561186
Accuracy = 0.438814
Precision = 0.618969
Recall = 0.438814
F1 = 0.430101


--------------------

# - Classification with PCA:

iterating over k, fitting the logistic regression model, and measuring the results.

Here is where each set of k principal components is plugged into the Logistic Regression model to see how well the metrics come out relative to the unreduced dataset. Even though the results are impressive (we could cut the number of dimensions by half, namely from 14 down to 7, and lose only a bit over 1% in accuracy), the idea here was to demonstratte the use of PCA in the Apache Spark paradigm rather than actually reducing the dataset.

In [8]:
numd=columns_numerical+ ['Loan_Status'] #numerical columns+dependent variable (Loan Status)

datas=sdf[numd] #spark numerical + loan status

datas = datas.withColumnRenamed("Loan_Status","label")

#vectorizing the numerical columns
assembler = VectorAssembler(
    inputCols=columns_numerical,
    outputCol="features")
output = assembler.transform(datas)

#normalization
standardizer = StandardScaler(withMean=True, withStd=True,
                              inputCol='features',
                              outputCol='std_features')
model = standardizer.fit(output)
output = model.transform(output)

indexer = StringIndexer(inputCol="label", outputCol="label_idx")
indexed = indexer.fit(output).transform(output)

sonar = indexed.select(['std_features', 'label', 'label_idx']) #extract only features of interest

In [21]:
for k in range (1,(len(columns_numerical))):
    pca = PCA(k=k, inputCol="std_features", outputCol="pca")
    model = pca.fit(sonar)
    transformed = model.transform(sonar)

    #data=transformed.select('pca', 'label_idx')

    data = transformed.rdd.map(lambda x: LabeledPoint(x[2], MLLibVectors.fromML(x[3]))) 

    train, test = data.randomSplit([0.7, 0.3])

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(train, numClasses=2)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    binary= BinaryClassificationMetrics(predictionAndLabels)
    # Overall statistics
    print('k number of components=',k)
    
    accuracy=metrics.accuracy 
    cm=metrics.confusionMatrix()
    aupr=binary.areaUnderPR
    auc=binary.areaUnderROC
    print("Summary Stats")
    print("Area under precision recall curve = %s" % aupr)
    print("Area under roc curve = %s" % auc)
    print('Accuracy = ', accuracy)
    
    #precision, recall and f1 are all depracated since 2.2.2. version of Spark Apache. Only accuracy is available. 
    print(cm)

    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    print('\n')

k number of components= 1
Summary Stats
Area under precision recall curve = 0.31068258695100054
Area under roc curve = 0.5147377356450528
Accuracy =  0.5326195899772209
DenseMatrix([[8591., 6762.],
             [3497., 3100.]])
Class 0.0 precision = 0.710704831237591
Class 0.0 recall = 0.5595649058815867
Class 0.0 F1 Measure = 0.6261433621223716
Class 1.0 precision = 0.314337862502535
Class 1.0 recall = 0.469910565408519
Class 1.0 F1 Measure = 0.3766936022844644
Weighted recall = 0.5326195899772209
Weighted precision = 0.5915780479690186
Weighted F(1) Score = 0.5511720607259856
Weighted F(0.5) Score = 0.5727994887250181
Weighted false positive rate = 0.5031441186871153


k number of components= 2
Summary Stats
Area under precision recall curve = 0.31853811117098574
Area under roc curve = 0.51524314870289
Accuracy =  0.5336223595609261
DenseMatrix([[8450., 6557.],
             [3555., 3120.]])
Class 0.0 precision = 0.7038733860891295
Class 0.0 recall = 0.5630705670687013
Class 0.0 F1 Me

Summary Stats
Area under precision recall curve = 0.36386067864656957
Area under roc curve = 0.587792470061391
Accuracy =  0.5852024070021882
DenseMatrix([[8908., 6417.],
             [2682., 3929.]])
Class 0.0 precision = 0.7685936151855047
Class 0.0 recall = 0.5812724306688417
Class 0.0 F1 Measure = 0.661935723574215
Class 1.0 precision = 0.37976029383336557
Class 1.0 recall = 0.5943125094539404
Class 1.0 F1 Measure = 0.46340744235419
Weighted recall = 0.5852024070021882
Weighted precision = 0.6514082993823048
Weighted F(1) Score = 0.6021039189085702
Weighted F(0.5) Score = 0.6278024024066586
Weighted false positive rate = 0.40961746687940603


k number of components= 14
Summary Stats
Area under precision recall curve = 0.36622127643412605
Area under roc curve = 0.5837297357319509
Accuracy =  0.5871868321738735
DenseMatrix([[9055., 6225.],
             [2854., 3859.]])
Class 0.0 precision = 0.7603493156436308
Class 0.0 recall = 0.5926047120418848
Class 0.0 F1 Measure = 0.666078193387

As can be observed below, and as was observed previously with PCA, the LR with MLlib didn't yield impressive results in terms of accuracy.

In [23]:
sonar=sonar.rdd

In [24]:
data = sonar.map(lambda x: LabeledPoint(x[2], MLLibVectors.fromML(x[0]))) 
#converting a ml vector to mll vectr

train, test = data.randomSplit([0.7, 0.3])

model = LogisticRegressionWithLBFGS.train(train)

y_yhat = test.map(lambda x: (x.label, model.predict(x.features)))
err = y_yhat.filter(lambda x: x[0] != x[1]).count() / float(test.count())
print("Error = " + str(err), 'Accuracy = ', 1-err )

Error = 0.41671639050810116 Accuracy =  0.5832836094918988


-----------------

# Regression on "Credit_Score"

In [4]:
training, test = sdf.randomSplit([0.7, 0.3])

training.rdd.first()

Row(Annual_Income=1859, Bankruptcies=0, Credit_Score=688, Current_Credit_Balance=1436, Current_Loan_Amount=684, Debt_Income_Rate=27.401, Home_Ownership='Own Home', Installment_Rate=6.206, Loan_Status='Fully Paid', Maximum_Open_Credit=2739, Monthly_Debt=42.4487, Months_since_last_delinquent='Never committed', Number_of_Credit_Problems=0, Number_of_Open_Accounts=3, Purpose='Other', Tax_Liens=0, Term='Short Term', Years_in_current_job=0.0, Years_of_Credit_History=21.3, cluster_label=1)

In [5]:
perc_Loan_Status = test.groupby('Loan_Status').count().collect()
perc_Charged_Off = 100 * perc_Loan_Status[1]['count']/(perc_Loan_Status[0]['count'] + perc_Loan_Status[1]['count'])
perc_Fully_Paid = 100 * perc_Loan_Status[0]['count']/(perc_Loan_Status[0]['count'] + perc_Loan_Status[1]['count'])

print('test set "Charged_Off" percentage over total:', perc_Charged_Off)
print('test set "Fully_Paid" percentage over total:', perc_Fully_Paid)

test set "Charged_Off" percentage over total: 30.72883368344141
test set "Fully_Paid" percentage over total: 69.27116631655859


In [6]:
def get_ml_data_preparation_stages(spark_df, model_type, target_col, columns_toExclude, encode=True, scale=False):
    
    columns_categorical = [col.name for col in spark_df.schema.fields if isinstance(col.dataType, StringType)]
    
    columns_toExclude += columns_categorical + [target_col]
    
    ml_features = [col for col in spark_df.columns if col not in columns_toExclude]
    
    stages = []
    
    if encode:
        for col in columns_categorical:
            encoder = StringIndexer(inputCol = col, outputCol = col + "_encoded", stringOrderType = 'alphabetDesc')
            stages += [encoder]
            ml_features += [encoder.getOutputCol()]
            
    assembler = VectorAssembler(inputCols = ml_features, outputCol = "features")
    stages += [assembler]
    
    if scale:
        scaler = StandardScaler(inputCol = "features", outputCol = "scaledFeatures", withStd=True, withMean=False)
        stages += [scaler]
    
    if model_type == 'classification':
        label_encoder = StringIndexer(inputCol = target_col, outputCol = "label")
        stages += [label_encoder]
    
    return stages

### Linear Regression

In [7]:
LR_DP_stages = get_ml_data_preparation_stages(spark_df = sdf, model_type = 'regression', target_col = 'Credit_Score',
                                              columns_toExclude = ['Loan_Status'], encode = False, scale = True)
LR_DP_stages

[VectorAssembler_b8f3a25cea7a, StandardScaler_fdb29b59fa19]

In [8]:
LR_DP_pipeline = Pipeline(stages = LR_DP_stages).fit(sdf)

training_prepared = LR_DP_pipeline.transform(training)
test_prepared = LR_DP_pipeline.transform(test)

print('training:')
print(training.rdd.first())
print('\ntraining_prepared:')
print(training_prepared.rdd.first())
print('\ntest:')
print(test.rdd.first())
print('\ntest_prepared:')
print(test_prepared.rdd.first())

training:
Row(Annual_Income=1859, Bankruptcies=0, Credit_Score=688, Current_Credit_Balance=1436, Current_Loan_Amount=684, Debt_Income_Rate=27.401, Home_Ownership='Own Home', Installment_Rate=6.206, Loan_Status='Fully Paid', Maximum_Open_Credit=2739, Monthly_Debt=42.4487, Months_since_last_delinquent='Never committed', Number_of_Credit_Problems=0, Number_of_Open_Accounts=3, Purpose='Other', Tax_Liens=0, Term='Short Term', Years_in_current_job=0.0, Years_of_Credit_History=21.3, cluster_label=1)

training_prepared:
Row(Annual_Income=1859, Bankruptcies=0, Credit_Score=688, Current_Credit_Balance=1436, Current_Loan_Amount=684, Debt_Income_Rate=27.401, Home_Ownership='Own Home', Installment_Rate=6.206, Loan_Status='Fully Paid', Maximum_Open_Credit=2739, Monthly_Debt=42.4487, Months_since_last_delinquent='Never committed', Number_of_Credit_Problems=0, Number_of_Open_Accounts=3, Purpose='Other', Tax_Liens=0, Term='Short Term', Years_in_current_job=0.0, Years_of_Credit_History=21.3, cluster_lab

In [9]:
print(LinearRegression().explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
regParam: regularization parameter (>= 0). (default: 0.0)
solver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)
standardization: whether to standardize the training features before fitting the model.

In [10]:
training_prepared = training_prepared.withColumnRenamed('Credit_Score', 'label')

features_len = len(training_prepared.rdd.first()['features'])

pca = PCA(inputCol="scaledFeatures", outputCol="pcaFeatures")
linear_regression = LinearRegression(featuresCol="pcaFeatures", labelCol="label", predictionCol="prediction")

stages = [pca, linear_regression]

paramGrid = ParamGridBuilder()\
        .addGrid(pca.k, [1, 2, features_len-3, features_len-1, features_len])\
        .addGrid(linear_regression.loss, ['squaredError', 'huber'])\
        .addGrid(linear_regression.regParam, [0, 0.5, 1])\
        .build()

crossval = CrossValidator(estimator = Pipeline(stages = stages),
                          estimatorParamMaps = paramGrid,
                          evaluator = RegressionEvaluator(),
                          numFolds = 3,
                          parallelism = 10)

Best_LR_pipeline = crossval.fit(training_prepared)

print('Best Pipeline:')
print(Best_LR_pipeline.bestModel.stages)
print('\nBest PCA Parameters:')
print(Best_LR_pipeline.bestModel.stages[0].extractParamMap())
print('\nBest Linear Regression Parameters:')
print(Best_LR_pipeline.bestModel.stages[1].extractParamMap())

Best Pipeline:
[PCA_36e756fa74f7, LinearRegression_5fa949d9048d]

Best PCA Parameters:
{Param(parent='PCA_36e756fa74f7', name='outputCol', doc='output column name'): 'pcaFeatures', Param(parent='PCA_36e756fa74f7', name='inputCol', doc='input column name'): 'scaledFeatures', Param(parent='PCA_36e756fa74f7', name='k', doc='the number of principal components (> 0)'): 13}

Best Linear Regression Parameters:
{Param(parent='LinearRegression_5fa949d9048d', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2, Param(parent='LinearRegression_5fa949d9048d', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LinearRegression_5fa949d9048d', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'): 1.35, Param(parent='LinearRegression_5fa949d9048d', name='featuresCol', doc='features column name'): 'pcaFeatures'

In [11]:
print('Intercetta della Regressione Lienare:')
print(Best_LR_pipeline.bestModel.stages[1].intercept)
print('\nCoefficienti della Regressione Lienare:')
print(Best_LR_pipeline.bestModel.stages[1].coefficients)

Intercetta della Regressione Lienare:
719.2276872313762

Coefficienti della Regressione Lienare:
[1.3210834297056928,0.5679753198064444,-1.1562795588198562,0.6816839237518202,4.1164309344350185,-2.063111474211884,1.48121538271208,1.872291683709596,1.060521199869502,1.3999631013954168,1.249060127279574,6.241891611698334,1.4343659516247973]


In [12]:
prediction = Best_LR_pipeline.transform(test_prepared)

r2 = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="r2").evaluate(prediction)

mse = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="mse").evaluate(prediction)

rmse = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="rmse").evaluate(prediction)

mae = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="mae").evaluate(prediction)

print("Determination Coefficient (R^2) on test data = %g" % r2)
print("Mean Squared Error (MSE) on test data = %g" % mse)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

Determination Coefficient (R^2) on test data = 0.0666471
Mean Squared Error (MSE) on test data = 647.419
Root Mean Squared Error (RMSE) on test data = 25.4444
Mean Absolute Error (MAE) on test data = 18.9679


### Random Forest Regressor

In [13]:
RF_DP_stages = get_ml_data_preparation_stages(spark_df = sdf, model_type = 'regression', target_col = 'Credit_Score',
                                              columns_toExclude = ['Loan_Status'], encode = True, scale = False)
RF_DP_stages

[StringIndexer_32abee704304,
 StringIndexer_730f95e41f00,
 StringIndexer_26514bdf8c98,
 StringIndexer_0ed720463535,
 StringIndexer_afe68a4427b7,
 VectorAssembler_e2747e071125]

In [14]:
RF_DP_pipeline = Pipeline(stages = RF_DP_stages).fit(sdf)

# RF_DP_pipeline.save('RF_DP_pipeline')
# RF_DP_pipeline = PipelineModel.load('RF_DP_pipeline')

training_prepared = RF_DP_pipeline.transform(training)
test_prepared = RF_DP_pipeline.transform(test)

print('training:')
print(training.rdd.first())
print('\ntraining_prepared:')
print(training_prepared.rdd.first())
print('\ntest:')
print(test.rdd.first())
print('\ntest_prepared:')
print(test_prepared.rdd.first())

training:
Row(Annual_Income=1859, Bankruptcies=0, Credit_Score=688, Current_Credit_Balance=1436, Current_Loan_Amount=684, Debt_Income_Rate=27.401, Home_Ownership='Own Home', Installment_Rate=6.206, Loan_Status='Fully Paid', Maximum_Open_Credit=2739, Monthly_Debt=42.4487, Months_since_last_delinquent='Never committed', Number_of_Credit_Problems=0, Number_of_Open_Accounts=3, Purpose='Other', Tax_Liens=0, Term='Short Term', Years_in_current_job=0.0, Years_of_Credit_History=21.3, cluster_label=1)

training_prepared:
Row(Annual_Income=1859, Bankruptcies=0, Credit_Score=688, Current_Credit_Balance=1436, Current_Loan_Amount=684, Debt_Income_Rate=27.401, Home_Ownership='Own Home', Installment_Rate=6.206, Loan_Status='Fully Paid', Maximum_Open_Credit=2739, Monthly_Debt=42.4487, Months_since_last_delinquent='Never committed', Number_of_Credit_Problems=0, Number_of_Open_Accounts=3, Purpose='Other', Tax_Liens=0, Term='Short Term', Years_in_current_job=0.0, Years_of_Credit_History=21.3, cluster_lab

In [15]:
print(RandomForestRegressor().explainParams())

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 

In [16]:
training_prepared = training_prepared.withColumnRenamed('Credit_Score', 'label')

features_len = len(training_prepared.rdd.first()['features'])

pca = PCA(inputCol="features", outputCol="pcaFeatures")
random_forest = RandomForestRegressor(featuresCol="pcaFeatures", labelCol="label", predictionCol="prediction")

stages = [pca, random_forest]

paramGrid = ParamGridBuilder()\
        .addGrid(pca.k, [1, 2, features_len-3, features_len-1, features_len] )\
        .addGrid(random_forest.numTrees, [5, 10, 20, 30])\
        .addGrid(random_forest.maxDepth, [1, 3, 5, 10])\
        .build()

crossval = CrossValidator(estimator = Pipeline(stages = stages),
                          estimatorParamMaps = paramGrid,
                          evaluator = RegressionEvaluator(),
                          numFolds=3,
                          parallelism=10)

Best_RF_Pipeline = crossval.fit(training_prepared)

print('Best Pipeline:')
print(Best_RF_Pipeline.bestModel.stages)
print('\nBest PCA Parameters:')
print(Best_RF_Pipeline.bestModel.stages[0].extractParamMap())
print('\nBest Random Forest Parameters:')
print(Best_RF_Pipeline.bestModel.stages[1].extractParamMap())

Best Pipeline:
[PCA_2e237e96c506, RandomForestRegressionModel (uid=RandomForestRegressor_5feca2eb162c) with 30 trees]

Best PCA Parameters:
{Param(parent='PCA_2e237e96c506', name='outputCol', doc='output column name'): 'pcaFeatures', Param(parent='PCA_2e237e96c506', name='inputCol', doc='input column name'): 'features', Param(parent='PCA_2e237e96c506', name='k', doc='the number of principal components (> 0)'): 18}

Best Random Forest Parameters:
{Param(parent='RandomForestRegressor_5feca2eb162c', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False, Param(parent='RandomForestRegressor_5feca2eb162c', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint di

In [17]:
print('Best Random Forest Tree:')
print(Best_RF_Pipeline.bestModel.stages[1].toDebugString)

Best Random Forest Tree:


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
prediction = Best_RF_Pipeline.transform(test_prepared)

r2 = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="r2").evaluate(prediction)

mse = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="mse").evaluate(prediction)

rmse = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="rmse").evaluate(prediction)

mae = RegressionEvaluator(labelCol="Credit_Score", predictionCol="prediction", metricName="mae").evaluate(prediction)

print("Determination Coefficient (R^2) on test data = %g" % r2)
print("Mean Squared Error (MSE) on test data = %g" % mse)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

Determination Coefficient (R^2) on test data = 0.286025
Mean Squared Error (MSE) on test data = 495.248
Root Mean Squared Error (RMSE) on test data = 22.2542
Mean Absolute Error (MAE) on test data = 16.7105


In [19]:
# Best_RF_Pipeline.save('Best_Regression_Model')

### Features Importance for Regression

Fittiamo nuovamente il migior modello di regressione ottenuto, ma senza eseguire la PCA, in modo da poter ottenere l'importanza delle Features originali. La feature importance della Random Forest precedente fornirebbe solo l'importanza delle Componenti Principali, che non Ã¨ quindi rilevante.

In [20]:
random_forest = RandomForestRegressor(featuresCol="features",
                                      labelCol="label",
                                      predictionCol="prediction")
Best_Random_Forest_Regressor = random_forest.fit(training_prepared)

In [21]:
features_importance = list(zip(RF_DP_stages[-1].getInputCols(), Best_Random_Forest_Regressor.featureImportances))
features_importance.sort(key = lambda x: x[1], reverse = True)
features_importance

[('Term_encoded', 0.6400198774247271),
 ('Maximum_Open_Credit', 0.09780219059924301),
 ('Current_Loan_Amount', 0.08768526434166263),
 ('Purpose_encoded', 0.05953197356751779),
 ('Loan_Status_encoded', 0.023931316491823756),
 ('Years_of_Credit_History', 0.023096143221317107),
 ('Debt_Income_Rate', 0.017426426159552366),
 ('Months_since_last_delinquent_encoded', 0.017277096930852543),
 ('Home_Ownership_encoded', 0.006584514953259562),
 ('Annual_Income', 0.006578341665088518),
 ('Installment_Rate', 0.0051843573315337255),
 ('Current_Credit_Balance', 0.005087746905857592),
 ('Monthly_Debt', 0.0041511295552908575),
 ('Number_of_Open_Accounts', 0.0034396579369030196),
 ('Number_of_Credit_Problems', 0.0015007655616440393),
 ('Years_in_current_job', 0.0003131999664563091),
 ('Tax_Liens', 0.0002123704187862363),
 ('Bankruptcies', 0.0001776269684840113),
 ('cluster_label', 0.0)]

---------------------------------------------

# Pattern Mining