In [1]:
import findspark
findspark.init()
import pyspark
import random
from pyspark.sql import SparkSession
import pyspark.sql
from pyspark.sql.functions import *
sc = pyspark.SparkContext(appName="bank")

In [2]:
spark = SparkSession(sc)
bank = spark.read.csv(path='bank.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True, 
                        inferSchema=True)

In [3]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

df = bank.select('age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit')
cols = df.columns

categoricalColumns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol = 'deposit', outputCol = 'label')
stages += [label_stringIdx]
numericCols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)

### Logistic Regression

In [4]:
from pyspark.ml.classification import LogisticRegression
train, test = df.randomSplit([0.7, 0.3])
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

In [5]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions_lr = lrModel.transform(test)
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions_lr)

0.8786206847886866

### Decision Tree

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label')
dtModel = dt.fit(train)

In [7]:
predictions_dt = dtModel.transform(test)
evaluator.evaluate(predictions_dt)

0.6852198334335932

### Random forest

In [8]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)

In [9]:
predictions_rf = rfModel.transform(test)
evaluator.evaluate(predictions_rf)

0.8747424853711946

### Factorization Machine

In [10]:
from pyspark.ml.classification import FMClassifier
fm=FMClassifier( featuresCol='features',labelCol='label')
fmModel = fm.fit(train)

In [11]:
predictions_fm = fmModel.transform(test)
evaluator.evaluate(predictions_fm)

0.6890260575343868

### Gradient boosted trees

In [12]:
from pyspark.ml.classification import GBTClassifier
gbt=GBTClassifier( featuresCol='features',labelCol='label')
gbtModel = gbt.fit(train)

In [13]:
predictions_gbt = gbtModel.transform(test)
evaluator.evaluate(predictions_gbt)

0.882896865369111