In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.context import SQLContext

In [2]:
spark = SparkSession.builder.appName('spark-sql').master('local').getOrCreate()
sqlContext = SQLContext(spark)
filepath = 'data/bank-additional-full.csv'
df = sqlContext.read.load(filepath, format='com.databricks.spark.csv', header='true', inferSchema='true', sep=';')
df = df.toDF(*(c.replace('.', '_') for c in df.columns))

In [3]:
df.show(n=5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| serv

Example of the first row

In [4]:
df.show(1, vertical = True)

-RECORD 0---------------------
 age            | 56          
 job            | housemaid   
 marital        | married     
 education      | basic.4y    
 default        | no          
 housing        | no          
 loan           | no          
 contact        | telephone   
 month          | may         
 day_of_week    | mon         
 duration       | 261         
 campaign       | 1           
 pdays          | 999         
 previous       | 0           
 poutcome       | nonexistent 
 emp_var_rate   | 1.1         
 cons_price_idx | 93.994      
 cons_conf_idx  | -36.4       
 euribor3m      | 4.857       
 nr_employed    | 5191.0      
 y              | no          
only showing top 1 row



In [5]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [6]:
x_numeric = [
    'age',
    'campaign',
    'previous',
    'emp_var_rate',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor3m',
    'nr_employed'
]

TARGET_COLUMN = 'y'
TARGET_COLUMN_ENC = 'y_encoded'
FEATURES_COLUMN = 'features'
PREDICTION_COLUMN = 'prediction'

indexer = StringIndexer(inputCol=TARGET_COLUMN, outputCol=TARGET_COLUMN_ENC) 
df = indexer.fit(df).transform(df) 

vector_assembler = VectorAssembler(inputCols = x_numeric, outputCol = FEATURES_COLUMN)
v_df = vector_assembler.transform(df)
v_df = v_df.select([FEATURES_COLUMN, TARGET_COLUMN_ENC])

In [7]:
v_df.show(3)

+--------------------+---------+
|            features|y_encoded|
+--------------------+---------+
|[56.0,1.0,0.0,1.1...|      0.0|
|[57.0,1.0,0.0,1.1...|      0.0|
|[37.0,1.0,0.0,1.1...|      0.0|
+--------------------+---------+
only showing top 3 rows



In [8]:
(training_data, test_data) = v_df.randomSplit([0.8,0.2])
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

Training Dataset Count: 32973
Test Dataset Count: 8215


#### Model training

In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [10]:
evaluator = BinaryClassificationEvaluator(labelCol = TARGET_COLUMN_ENC, metricName = 'areaUnderPR')

### Logistic Regression

In [11]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol = FEATURES_COLUMN, 
    labelCol=TARGET_COLUMN_ENC, 
    predictionCol=PREDICTION_COLUMN
)
lr_model = lr.fit(training_data)
lr_predict = lr_model.transform(test_data)

In [12]:
lr_predict.show(3)

+--------------------+---------+--------------------+--------------------+----------+
|            features|y_encoded|       rawPrediction|         probability|prediction|
+--------------------+---------+--------------------+--------------------+----------+
|[18.0,1.0,0.0,-1....|      0.0|[2.04060588770253...|[0.88499494891790...|       0.0|
|[18.0,1.0,2.0,-1....|      0.0|[1.98930752859724...|[0.87966985814996...|       0.0|
|[18.0,1.0,2.0,-1....|      0.0|[1.98933633920729...|[0.87967290774269...|       0.0|
+--------------------+---------+--------------------+--------------------+----------+
only showing top 3 rows



In [13]:
evaluator.evaluate(lr_predict)

0.3647630251313429

### Decision Tree

In [14]:
from pyspark.ml.classification import DecisionTreeClassifier

tree = DecisionTreeClassifier(
    featuresCol = FEATURES_COLUMN, 
    labelCol=TARGET_COLUMN_ENC, 
    predictionCol=PREDICTION_COLUMN,
    maxDepth=5
)
tree_model = tree.fit(training_data)
tree_predict = tree_model.transform(test_data)

In [15]:
evaluator.evaluate(tree_predict)

0.07760461087470466

### Random Forest

In [16]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    featuresCol = FEATURES_COLUMN, 
    labelCol=TARGET_COLUMN_ENC, 
    predictionCol=PREDICTION_COLUMN,
    maxDepth=5
)
rf_model = rf.fit(training_data)
rf_predict = rf_model.transform(test_data)

In [17]:
evaluator.evaluate(rf_predict)

0.4031946011102858

### Gradient Boosted Tree

In [18]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(
    featuresCol = FEATURES_COLUMN, 
    labelCol=TARGET_COLUMN_ENC, 
    predictionCol=PREDICTION_COLUMN
)
gbt_model = gbt.fit(training_data)
gbt_predict = gbt_model.transform(test_data)

In [19]:
evaluator.evaluate(gbt_predict)

0.39728658421880136