In [148]:
from pyspark.sql import SparkSession

# Attach to the running Spark session if there is one; otherwise start a
# new session with default settings.
sessionBuilder = SparkSession.builder
spark = sessionBuilder.getOrCreate()

In [None]:
# Build (or fetch) a session named "MyApp" that is configured to pull the
# MMLSpark package from the given Maven repository.
# NOTE(review): a session is already created earlier in this notebook, so
# getOrCreate() may hand back that existing session, in which case the
# spark.jars.packages setting presumably has no effect — confirm, and consider
# running this cell first on a fresh kernel.
import pyspark
spark = pyspark.sql.SparkSession.builder.appName("MyApp") \
            .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc3") \
            .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
            .getOrCreate()

# Imported only after the session is built; presumably relies on the package
# configured above being available — TODO confirm.
import mmlspark

In [149]:
# Load the loan dataset: the first line holds column names and column types
# are inferred from the data.
dfRaw = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('loan_data.csv')
)

# Viewing data

In [None]:
# Preview the raw data (20 rows, long values truncated — the defaults, spelled out).
dfRaw.show(n=20, truncate=True)

In [150]:
# Replace every '.' in the column names with '_' and rebuild the frame with
# the cleaned names.
renamedColumns = [name.replace('.', '_') for name in dfRaw.columns]
df = dfRaw.toDF(*renamedColumns)

In [None]:
# List the column names after renaming.
df.schema.names

In [None]:
# Print the column names and their inferred types as a tree.
df.printSchema()

In [None]:
# Show a single record, one field per line, to inspect every column at once.
df.show(n=1, truncate=True, vertical=True)

In [None]:
# Look at the label column on its own.
labelColumn = df.select('credit_policy')
labelColumn.show()

Count the rows for each label value

In [None]:
# Number of rows for each value of the label column.
labelCounts = df.groupBy('credit_policy').count()
labelCounts.show()

# Feature transformer

Filter Rows with NULL Values

In [None]:
# Two equivalent ways to list rows whose label is missing.

# 1) SQL expression string
df.where('credit_policy is Null').show()

# 2) Column API
from pyspark.sql.functions import col
labelIsMissing = col('credit_policy').isNull()
df.where(labelIsMissing).show()

StringIndexer encodes a string column of labels into a column of label indices.

In [151]:
from pyspark.ml.feature import StringIndexer

# Learn the string -> index mapping for 'purpose', then append the encoded
# values as the new column 'purposeIndex'.
indexer = StringIndexer(inputCol="purpose", outputCol="purposeIndex")
indexerModel = indexer.fit(df)
df = indexerModel.transform(df)

### Generate files

In [167]:
# Export every column except the raw string column 'purpose'.
# Filtering builds a fresh list and does not raise if the column is absent.
columnsOut = [name for name in df.columns if name != 'purpose']

dfOut = df.select(columnsOut)

# 80/20 split. A fixed seed keeps the split the same across runs, so the
# exported train/test files can be regenerated.
[trainOut, testOut] = dfOut.randomSplit([0.8, 0.2], seed=42)

In [168]:
# Feature columns = exported columns minus the label.
# Build a new list instead of aliasing columnsOut: removing in place would
# also strip 'credit_policy' from columnsOut and raise ValueError when this
# cell is executed a second time.
xColumnsOut = [name for name in columnsOut if name != 'credit_policy']

# Write the feature columns of each split as CSV with a header row,
# replacing any previous output.
trainOut.select(xColumnsOut).write.csv('xTrain.csv', header=True, mode='overwrite')
testOut.select(xColumnsOut).write.csv('xTest.csv', header=True, mode='overwrite')

In [169]:
# Write the label column of each split as CSV with a header row, replacing
# any previous output.
yTrainOut = trainOut.select('credit_policy')
yTestOut = testOut.select('credit_policy')

yTrainOut.write.mode('overwrite').option('header', True).csv('yTrain.csv')
yTestOut.write.mode('overwrite').option('header', True).csv('yTest.csv')

One-hot encoding maps a categorical feature, represented as a label index, to a binary vector with at most a single one-value indicating the presence of a specific feature value from among the set of all feature values.

In [None]:
from pyspark.ml.feature import OneHotEncoder

# Turn the category index 'purposeIndex' into the one-hot vector 'purposeVec'.
encoder = OneHotEncoder(inputCol="purposeIndex", outputCol="purposeVec")

# In Spark 3.x OneHotEncoder is an Estimator and must be fitted before it can
# transform; in Spark 2.x it is a plain Transformer without fit().
# Handle both so the cell runs on either version.
encoderModel = encoder.fit(df) if hasattr(encoder, "fit") else encoder
df = encoderModel.transform(df)

VectorAssembler is a transformer that combines a given list of columns into a single vector column.

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single feature vector: the numeric columns plus
# the one-hot encoded purpose.
featureColumns = [
    'int_rate',
    'installment',
    'log_annual_inc',
    'dti',
    'fico',
    'days_with_cr_line',
    'revol_bal',
    'revol_util',
    'inq_last_6mths',
    'delinq_2yrs',
    'pub_rec',
    'not_fully_paid',
    'purposeVec',
]

assembler = VectorAssembler(inputCols=featureColumns, outputCol="rawFeatures")

# Append the assembled vector as the column 'rawFeatures'.
df = assembler.transform(df)

Scaling features

In [None]:
from pyspark.ml.feature import MinMaxScaler

# Fit the scaler on 'rawFeatures', then append the rescaled vector as
# 'scaledFeatures'.
scaler = MinMaxScaler(inputCol="rawFeatures", outputCol="scaledFeatures")
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

# Training

### Create dataframe for training

In [None]:
import pyspark.sql.functions as F

# Keep only the target and the scaled feature vector, renamed to the
# 'label' / 'features' column names used by the estimators below.
dfTraining = df.select(
    F.col('credit_policy').alias('label'),
    F.col('scaledFeatures').alias('features'),
)

Split the data into training and test sets

In [None]:
# 80/20 train/test split. A fixed seed keeps the split the same across runs,
# so the metrics reported later are reproducible.
# Three-way alternative with a validation set:
#[train, valid, test] = dfTraining.randomSplit([0.7,0.1,0.2])
[train, test] = dfTraining.randomSplit([0.8, 0.2], seed=42)

### Count label

In [None]:
# Label distribution in the training split.
trainLabelCounts = train.groupBy('label').count()
trainLabelCounts.show()

In [None]:
# Label distribution in the test split.
testLabelCounts = test.groupBy('label').count()
testLabelCounts.show()

### Training model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Estimator with its constructor-level settings.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Per-fit overrides: these replace maxIter / regParam from the constructor
# and set the decision threshold for this fit only.
paramMap = {
    lr.maxIter: 20,
    lr.regParam: 0.1,
    lr.threshold: 0.55,
}

# A plain lr.fit(train) would use the constructor values instead.
model = lr.fit(train, params=paramMap)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with 10 trees, reading the 'features' / 'label' columns.
rfc = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=10,
)

# Replaces whatever `model` held before with the fitted forest.
model = rfc.fit(train)

In [None]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest

# The RDD-based mllib API trains on LabeledPoint records, so convert each
# (label, features) row of the training frame.
trainRdd = train.rdd.map(lambda row: LabeledPoint(row['label'], row['features'].toArray()))

# Keep the mllib model under its own name. It exposes predict() on RDDs and has
# no DataFrame transform(), so assigning it to `model` would break the
# evaluation cells, which call model.transform(test).
modelMllib = RandomForest.trainClassifier(trainRdd, numClasses=2, categoricalFeaturesInfo={}, numTrees=3)

# Evaluation

In [None]:
# Apply the most recently fitted `model` to the held-out test split.
prediction = model.transform(test)

In [None]:
# Keep only the true label and the predicted label for evaluation.
evaluationColumns = ["label", "prediction"]
prediction = prediction.select(evaluationColumns)

In [None]:
# Preview true vs. predicted labels (20 rows — the defaults, spelled out).
prediction.show(n=20, truncate=True)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Evaluator that compares the 'prediction' column against the 'label' column.
evaluatorMulti = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
)

In [None]:
# Display the evaluator's metricName parameter object.
evaluatorMulti.metricName

In [None]:
# Score the predictions with the metric overridden to F1 for this call only.
metricOverride = {evaluatorMulti.metricName: "f1"}
f1Score = evaluatorMulti.evaluate(prediction, metricOverride)
print(f1Score)