# BANK MARKETING

<br><br>
Membros:
- Anderson
- Caio Viera
- Pedro Correia



#### Inicializando sessão do Spark

In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
# Display the SparkContext to confirm the session is live. `sc` is never created
# in this notebook, so it is assumed to be injected by the PySpark kernel.
sc

''

#### Lendo os dados do HDFS

In [3]:
# Load the bank-marketing CSV from HDFS: ';'-separated, with a header row, and
# with column types inferred by Spark. `spark` is never created in this notebook,
# so it is assumed to be the session injected by the PySpark kernel.
bank_csv_path = "hdfs://elephant:8020/user/labdata/bank.csv"
data = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("inferSchema", True)
    .csv(bank_csv_path)
)

In [4]:
# Replace every '.' in the column names with '_'. The original name is wrapped in
# back-quotes inside the expression so that a dot is read as part of the name.
renamed_exprs = [
    "`{}` as {}".format(name, name.replace('.', '_'))
    for name in data.columns
]
data = data.selectExpr(*renamed_exprs)

In [5]:
# Print the column names and inferred types after the rename.
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp_var_rate: double (nullable = true)
 |-- cons_price_idx: double (nullable = true)
 |-- cons_conf_idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr_employed: double (nullable = true)
 |-- y: string (nullable = true)



#### Preparação dos Dados

In [6]:
# Preview the first five rows, transposed so that each column reads as one line.
# Limit on the Spark side first: only five rows are sent to the driver, instead of
# converting the entire dataset to pandas just to call .head() on it.
data.limit(5).toPandas().T

Unnamed: 0,0,1,2,3,4
age,56,57,37,40,56
job,housemaid,services,services,admin.,services
marital,married,married,married,married,married
education,basic.4y,high.school,high.school,basic.6y,high.school
default,no,unknown,no,no,no
housing,no,no,yes,no,no
loan,no,no,no,no,yes
contact,telephone,telephone,telephone,telephone,telephone
month,may,may,may,may,may
day_of_week,mon,mon,mon,mon,mon


In [7]:
# String-typed predictors: each one is indexed and one-hot encoded further down.
categoricalColumns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Numeric predictors that go into the feature vector as-is (after a cast to double).
# NOTE(review): `age`, `duration` and `campaign` appear in the schema but in neither
# list, so they never reach the feature vector - confirm this is intentional.
numericColumns = [
    'pdays', 'previous',
    'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
]

In [8]:
# Check the cardinality of the categorical columns.
from pyspark.sql.functions import countDistinct

# Compute every distinct count in a single aggregation (one Spark job) instead of
# launching one job per column; the aliases let the result row be read by name.
distinct_counts = data.agg(
    *[countDistinct(name).alias(name) for name in categoricalColumns]
).first()

print('Distinct Categories:')
for categoricalCol in categoricalColumns:
    print('  - {:<12} {}'.format(categoricalCol, distinct_counts[categoricalCol]))

Distinct Categories:
  - job          12
  - marital      4
  - education    8
  - default      3
  - housing      3
  - loan         3
  - contact      2
  - month        10
  - day_of_week  5
  - poutcome     3


Nenhuma das variáveis categóricas apresenta um caso grave de cardinalidade excessiva.

In [9]:
# Start building the feature-engineering pipeline.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# The following cells append to this list with `+=`. Re-run this cell before
# re-running any of them, otherwise duplicate stages accumulate in the list.
stages = []

In [10]:
# Categorical predictors: for each column, add a StringIndexer (category name ->
# numeric index in `<col>_index`) followed by a OneHotEncoder (index -> dummy
# vector in `<col>_class_vec`).
for categoricalCol in categoricalColumns:
    indexed_col = categoricalCol + '_index'
    stages.extend([
        StringIndexer(inputCol=categoricalCol, outputCol=indexed_col),
        OneHotEncoder(inputCol=indexed_col, outputCol=categoricalCol + '_class_vec'),
    ])

In [11]:
# Index the response variable `y` into the numeric `label` column that the
# classifiers below expect. No ordering option is set, so StringIndexer's default
# ordering decides which of 'no'/'yes' becomes 0.0 and which becomes 1.0.
indexer = StringIndexer(inputCol='y', outputCol='label')
stages.append(indexer)

In [12]:
# Cast the numeric predictors to double in one projection; every other column is
# passed through untouched and the column order is preserved.
data = data.select(*[
    data[name].cast('double').alias(name) if name in numericColumns else data[name]
    for name in data.columns
])

In [13]:
# Assemble the one-hot vectors and the numeric columns into the single vector
# column ("features") that the Spark ML estimators take as input.
encoded_cols = ['{}_class_vec'.format(name) for name in categoricalColumns]
assembler_inputs = encoded_cols + numericColumns
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

stages.append(assembler)

In [14]:
# Chain all stages and fit them. The fit uses the full dataset, i.e. it happens
# before the train/test split made further down.
# NOTE(review): the indexers therefore see test rows too; fit on the training
# split only if strict separation is required.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)

In [15]:
# Apply the fitted pipeline and keep only the two columns the models consume.
data_model = pipelineModel.transform(data).select("label", "features")

In [16]:
# 70/30 train/test split with a fixed seed.
(trainingData, testData) = data_model.randomSplit([0.7, 0.3], seed=420)

# Both splits are reused by every model fitted and evaluated below. Cache them so
# the CSV read and the feature pipeline are not recomputed for each
# fit/transform/evaluate call; the counts that follow materialise the cache.
trainingData.cache()
testData.cache()

print('Observações para treino: {}'.format(trainingData.count()))
print('Observações para teste:  {}'.format(testData.count()))

Observações para treino: 28877
Observações para teste:  12311


#### Modelagem

In [17]:
# Check the class balance of the response variable: count rows per value of `y`
# through Spark SQL, then turn the counts into relative frequencies.
data.createOrReplaceTempView('data')
label_count = spark.sql("""
SELECT \
    y, count(*) as freq \
FROM \
    data \
GROUP BY \
    y \
""")

total_rows = data.count()
label_count.withColumn('freq', label_count.freq / total_rows).show()

+---+-------------------+
|  y|               freq|
+---+-------------------+
| no| 0.8873458288821987|
|yes|0.11265417111780131|
+---+-------------------+



Percebemos que o resultado positivo `yes`, que buscamos prever, é relativamente raro (cerca de 11% das observações).

1. Regressão Logística

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [33]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled features, capped at 20 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20,
)

In [40]:
# Fit the logistic regression on the training split.
lrModel = lr.fit(trainingData)

In [41]:
# Score the held-out test split with the fitted logistic regression.
predictions = lrModel.transform(testData)

In [42]:
# Evaluate the test predictions. No metricName is passed, so the evaluator's
# default metric is used; the value is printed as areaUnderROC.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
auc = evaluator.evaluate(predictions)
print(f'areaUnderROC = {auc:.4f}')

areaUnderROC = 0.7919


2. Decision Tree

In [84]:
from pyspark.ml.classification import DecisionTreeClassifier

In [87]:
# Single decision tree, allowed to grow to depth 30 with up to 50 bins per feature.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=30, maxBins=50)

In [88]:
# Fit the decision tree on the training split.
dtModel = dt.fit(trainingData)

In [89]:
# Score both splits; the training-split predictions are only used to compare
# train and test accuracy.
predictions_dt = dtModel.transform(testData)
predictions_dt_train = dtModel.transform(trainingData)

In [120]:
# Evaluate the decision tree: accuracy on the test and training splits, plus the
# area under the ROC curve on the test split.
# Context for reading the accuracy: about 88.7% of the rows are 'no' (see the
# class-frequency table above), so always predicting 'no' already scores roughly
# that much.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction",
    metricName="accuracy"
)

# No metricName is passed here, so the evaluator's default metric is used; its
# value is printed below as areaUnderROC.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction"
)


accuracy_dt = evaluator_accuracy.evaluate(predictions_dt)
accuracy_dt_train = evaluator_accuracy.evaluate(predictions_dt_train)
# A large gap between the two accuracies below points to overfitting.
print(f'Accuracy:         {accuracy_dt:.4f}')
print(f'Accuracy (TRAIN): {accuracy_dt_train:.4f}')
auc_dt = evaluator_auc.evaluate(predictions_dt)
print(f'areaUnderROC:     {auc_dt:.4f}')

Accuracy:         0.8643
Accuracy (TRAIN): 0.9689
areaUnderROC:     0.6326


3. RandomForest

In [77]:
from pyspark.ml.classification import RandomForestClassifier

In [127]:
# Random forest: 20 trees of depth up to 30, seeded for reproducibility.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=30,
    numTrees=20,
    seed=420,
)

In [128]:
# Fit the random forest on the training split.
rfModel = rf.fit(trainingData)

In [129]:
# Score both splits; the training-split predictions are only used to compare
# train and test accuracy.
predictions_rf = rfModel.transform(testData)
predictions_rf_train = rfModel.transform(trainingData)

In [130]:
# Evaluate the random forest: accuracy on the test and training splits, plus the
# area under the ROC curve on the test split.
# The two evaluators are rebuilt here with the same settings as in the previous
# model section, so this cell does not depend on that section having been run.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction",
    metricName="accuracy"
)

evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction"
)


accuracy_rf = evaluator_accuracy.evaluate(predictions_rf)
accuracy_rf_train = evaluator_accuracy.evaluate(predictions_rf_train)
print(f'Accuracy:         {accuracy_rf:.4f}')
print(f'Accuracy (TRAIN): {accuracy_rf_train:.4f}')
auc_rf = evaluator_auc.evaluate(predictions_rf)
print(f'areaUnderROC:     {auc_rf:.4f}')

Accuracy:         0.8952
Accuracy (TRAIN): 0.9457
areaUnderROC:     0.7820


4. Gradient Boosting Machine

In [136]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted trees with shallow (depth 5) base learners, seeded for
# reproducibility.
gbt = GBTClassifier(featuresCol="features", labelCol="label", maxDepth=5, seed=420)

In [137]:
# Fit the gradient-boosted trees on the training split.
gbtModel = gbt.fit(trainingData)

In [140]:
# Score both splits; the training-split predictions are only used to compare
# train and test accuracy.
predictions_gbt = gbtModel.transform(testData)
predictions_gbt_train = gbtModel.transform(trainingData)

In [141]:
# Evaluate the gradient-boosted trees: accuracy on the test and training splits,
# plus the area under the ROC curve on the test split.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction",
    metricName="accuracy"
)

evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction"
)


accuracy_gbt = evaluator_accuracy.evaluate(predictions_gbt)
accuracy_gbt_train = evaluator_accuracy.evaluate(predictions_gbt_train)
print(f'Accuracy:         {accuracy_gbt:.4f}')
print(f'Accuracy (TRAIN): {accuracy_gbt_train:.4f}')
auc_gbt = evaluator_auc.evaluate(predictions_gbt)
print(f'areaUnderROC:     {auc_gbt:.4f}')

Accuracy:         0.9011
Accuracy (TRAIN): 0.9065
areaUnderROC:     0.8014


In [176]:
# Accuracy, precision and recall of the GBT test predictions, computed from the
# confusion-matrix counts with label/prediction = 1 as the positive class.
predictions_gbt.select('label', 'prediction').createOrReplaceTempView('predictions')

# Bind the resulting DataFrame and display it in a separate step: `.show()`
# returns None, so assigning its result would leave nothing to reuse.
metrics_gbt = spark.sql("""
SELECT
    round((tp+tn)/(tp+tn+fp+fn), 4) as accuracy,
    round(tp/(tp+fp), 4) as precision,
    round(tp/(tp+fn), 4) as recall
FROM (
    SELECT
        sum(tn) as tn,
        sum(tp) as tp,
        sum(fn) as fn,
        sum(fp) as fp
    FROM (
        SELECT
            case when label = 0 and prediction = 0 then 1 else 0 end as tn,
            case when label = 1 and prediction = 1 then 1 else 0 end as tp,
            case when label = 1 and prediction = 0 then 1 else 0 end as fn,
            case when label = 0 and prediction = 1 then 1 else 0 end as fp
        FROM
            predictions
    )
)
""")
metrics_gbt.show()

+--------+---------+------+
|accuracy|precision|recall|
+--------+---------+------+
|  0.9011|   0.6598|0.2737|
+--------+---------+------+



5. SVM

In [177]:
from pyspark.ml.classification import LinearSVC

In [178]:
# Linear support-vector classifier; only the column names are set explicitly.
svm = LinearSVC(labelCol="label", featuresCol="features")

In [179]:
# Fit the linear SVM on the training split.
svmModel = svm.fit(trainingData)

In [180]:
# Score both splits; the training-split predictions are only used to compare
# train and test accuracy.
predictions_svm = svmModel.transform(testData)
predictions_svm_train = svmModel.transform(trainingData)

In [181]:
# Evaluate the linear SVM: accuracy on the test and training splits, plus the
# area under the ROC curve on the test split.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction",
    metricName="accuracy"
)

evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction"
)


accuracy_svm = evaluator_accuracy.evaluate(predictions_svm)
accuracy_svm_train = evaluator_accuracy.evaluate(predictions_svm_train)
print(f'Accuracy:         {accuracy_svm:.4f}')
print(f'Accuracy (TRAIN): {accuracy_svm_train:.4f}')
auc_svm = evaluator_auc.evaluate(predictions_svm)
print(f'areaUnderROC:     {auc_svm:.4f}')

Accuracy:         0.8863
Accuracy (TRAIN): 0.8880
areaUnderROC:     0.6682


In [182]:
# Accuracy, precision and recall of the SVM test predictions, computed from the
# confusion-matrix counts with label/prediction = 1 as the positive class.
predictions_svm.select('label', 'prediction').createOrReplaceTempView('predictions')

# Bind the resulting DataFrame and display it in a separate step: `.show()`
# returns None, so assigning its result would leave nothing to reuse.
metrics_svm = spark.sql("""
SELECT
    round((tp+tn)/(tp+tn+fp+fn), 4) as accuracy,
    round(tp/(tp+fp), 4) as precision,
    round(tp/(tp+fn), 4) as recall
FROM (
    SELECT
        sum(tn) as tn,
        sum(tp) as tp,
        sum(fn) as fn,
        sum(fp) as fp
    FROM (
        SELECT
            case when label = 0 and prediction = 0 then 1 else 0 end as tn,
            case when label = 1 and prediction = 1 then 1 else 0 end as tp,
            case when label = 1 and prediction = 0 then 1 else 0 end as fn,
            case when label = 0 and prediction = 1 then 1 else 0 end as fp
        FROM
            predictions
    )
)
""")
metrics_svm.show()

+--------+---------+------+
|accuracy|precision|recall|
+--------+---------+------+
|  0.8863|   0.5096| 0.057|
+--------+---------+------+



6. Rede Neural

In [183]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [186]:
# Size the network from the data: the input layer width is read from the ML
# attribute metadata attached to the "features" column, followed by three hidden
# layers of 100 units and a 2-unit output layer.
features_meta = trainingData.schema["features"].metadata
attrs = features_meta["ml_attr"]["num_attrs"]
layers = [attrs] + [100] * 3 + [2]

In [190]:
# Multilayer perceptron using the layer sizes defined above, a tight convergence
# tolerance and a fixed seed.
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=layers,
    seed=420,
    tol=1e-7,
)

In [191]:
%%time
# Fit the multilayer perceptron on the training split; %%time reports the cost.
mlpModel = mlp.fit(trainingData)

CPU times: user 19.3 ms, sys: 5.7 ms, total: 25 ms
Wall time: 1min 23s


In [192]:
# Score both splits; the training-split predictions are only used to compare
# train and test accuracy.
predictions_mlp = mlpModel.transform(testData)
predictions_mlp_train = mlpModel.transform(trainingData)

In [194]:
# Evaluate the multilayer perceptron: accuracy on the test and training splits.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction",
    metricName="accuracy"
)

# Built for parity with the other model sections, but not used in this cell: the
# AUC computation at the bottom is commented out.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", 
    rawPredictionCol="rawPrediction"
)


accuracy_mlp = evaluator_accuracy.evaluate(predictions_mlp)
accuracy_mlp_train = evaluator_accuracy.evaluate(predictions_mlp_train)
print(f'Accuracy:         {accuracy_mlp:.4f}')
print(f'Accuracy (TRAIN): {accuracy_mlp_train:.4f}')
# AUC is left disabled for this model.
# NOTE(review): presumably the MLP output has no `rawPrediction` column in the
# Spark version used here - confirm before re-enabling the two lines below.
# auc_mlp = evaluator_auc.evaluate(predictions_mlp)
# print(f'areaUnderROC:     {auc_mlp:.4f}')

Accuracy:         0.8981
Accuracy (TRAIN): 0.8973


In [195]:
# Accuracy, precision and recall of the MLP test predictions, computed from the
# confusion-matrix counts with label/prediction = 1 as the positive class.
predictions_mlp.select('label', 'prediction').createOrReplaceTempView('predictions')

# Bind the resulting DataFrame and display it in a separate step: `.show()`
# returns None, so assigning its result would leave nothing to reuse.
metrics_mlp = spark.sql("""
SELECT
    round((tp+tn)/(tp+tn+fp+fn), 4) as accuracy,
    round(tp/(tp+fp), 4) as precision,
    round(tp/(tp+fn), 4) as recall
FROM (
    SELECT
        sum(tn) as tn,
        sum(tp) as tp,
        sum(fn) as fn,
        sum(fp) as fp
    FROM (
        SELECT
            case when label = 0 and prediction = 0 then 1 else 0 end as tn,
            case when label = 1 and prediction = 1 then 1 else 0 end as tp,
            case when label = 1 and prediction = 0 then 1 else 0 end as fn,
            case when label = 0 and prediction = 1 then 1 else 0 end as fp
        FROM
            predictions
    )
)
""")
metrics_mlp.show()

+--------+---------+------+
|accuracy|precision|recall|
+--------+---------+------+
|  0.8981|   0.6616|0.2174|
+--------+---------+------+



#### Conclusão

O `GBTClassifier` apresentou o maior potencial (maior acurácia e maior areaUnderROC no conjunto de teste), apesar de ter um recall ainda muito baixo. Na sequência, vamos buscar melhores hiperparâmetros por validação cruzada.

In [19]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [21]:
from pyspark.ml.classification import GBTClassifier

# Base estimator for the hyper-parameter search: only the column names and the
# seed are fixed here; the searched values come from the parameter grid.
gbt = GBTClassifier(featuresCol="features", labelCol="label", seed=420)

In [22]:
# Wider search grid: tree depth x subsampling rate, at 60 boosting iterations.
# NOTE: the next cell rebinds `paramGrid`, so this grid only reaches the
# CrossValidator if that cell is skipped.
# The subsampling rates are written as float literals (1.0, not 1) so they match
# the parameter's double type without relying on implicit conversion.
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 15, 20, 30]) \
    .addGrid(gbt.subsamplingRate, [.75, 1.0]) \
    .addGrid(gbt.maxIter, [60]) \
    .build()

In [23]:
# Search over the number of boosting iterations only. This rebinds `paramGrid`,
# replacing the grid built in the previous cell, so this is the grid the
# CrossValidator below receives.
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [20, 40, 60]) \
    .build()

In [24]:
# 3-fold cross-validation of the GBT estimator over `paramGrid`, scored with a
# BinaryClassificationEvaluator left at its default settings.
# The seed fixes the fold assignment so the search is reproducible, in line with
# the seed=420 used for the split and the models elsewhere in this notebook.
crossval = CrossValidator(estimator = gbt,
                          estimatorParamMaps = paramGrid,
                          evaluator = BinaryClassificationEvaluator(),
                          numFolds = 3,
                          seed = 420)

In [25]:
%%time
# Run the cross-validated search on the training split; %%time reports the cost.
gbtModel_cv = crossval.fit(trainingData)

CPU times: user 151 ms, sys: 44.8 ms, total: 196 ms
Wall time: 3min 19s


In [225]:
# Average cross-validation metric, one entry per parameter combination in the grid.
gbtModel_cv.avgMetrics

[0.7954570117058624, 0.7962791204553268, 0.7955932855441463]

In [226]:
# Read maxIter from the selected model through the private `_java_obj` handle.
# NOTE(review): the recorded output (60) does not match the recorded avgMetrics,
# whose highest value is the second entry of the [20, 40, 60] grid; the outputs
# appear to come from different runs - re-run from a fresh kernel to confirm.
gbtModel_cv.bestModel._java_obj.getMaxIter()

60

In [None]:
# Read subsamplingRate from the selected model through the private `_java_obj`
# handle. The grid last bound to `paramGrid` varies only maxIter, so this value
# was not searched over. This cell has no recorded execution.
gbtModel_cv.bestModel._java_obj.getSubsamplingRate()