# BANK MARKETING

<br><br>
Membros:
- Anderson
- Caio Viera
- Pedro Correia



#### Inicializando sessão do Spark

In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Bind `spark` explicitly so the notebook does not depend on a kernel that
# pre-injects it. getOrCreate() returns the already-running session when one
# exists, so this is a no-op inside a pyspark-launched kernel.
spark = SparkSession.builder.getOrCreate()

#### Lendo os dados do HDFS

In [2]:
# Load the bank marketing CSV from HDFS: the first row is the header, fields
# are separated by ";" and column types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("inferSchema", True)
    .csv("hdfs://elephant:8020/user/labdata/bank.csv")
)

In [3]:
# Replace "." with "_" in every column name so later cells can reference the
# columns without backtick quoting. toDF renames positionally, which avoids
# building SQL alias strings that break on names needing quotes.
data = data.toDF(*[name.replace('.', '_') for name in data.columns])

#### Preparação dos Dados

In [4]:
# String-valued input columns; each one is indexed and one-hot encoded by the
# pipeline stages built in the following cells.
categoricalColumns = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Numeric input columns; they are cast to double and fed to the assembler
# as-is. Names use "_" because the dots in the raw header were replaced above.
numericColumns = [
    'pdays', 'previous',
    'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
    'euribor3m', 'nr_employed',
]

Nenhuma das variáveis categóricas apresenta um caso grave de cardinalidade excessiva.

In [5]:
# Pipeline building blocks.
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    OneHotEncoder,
    StringIndexer,
    VectorAssembler,
)

# Ordered list of pipeline stages; the following cells append to it.
stages = []

In [6]:
# Categorical features: two stages per column, appended in order.
# NOTE: this cell appends to `stages`, so running it twice duplicates stages.
for column_name in categoricalColumns:
    index_column = column_name + '_index'
    # Map category names to numeric indices (0 .. n_cats-1).
    stages.append(StringIndexer(inputCol=column_name, outputCol=index_column))
    # Turn the index into a dummy (one-hot) vector column.
    stages.append(
        OneHotEncoder(inputCol=index_column, outputCol=column_name + '_class_vec')
    )

In [7]:
# Index the response variable: string column 'y' becomes numeric column 'label'.
label_indexer = StringIndexer(inputCol='y', outputCol='label')
stages.append(label_indexer)

In [8]:
# Cast every numeric feature column to double, keeping name and position.
double_casts = {name: data[name].cast('double') for name in numericColumns}
for name, as_double in double_casts.items():
    data = data.withColumn(name, as_double)

In [9]:
# VectorAssembler packs the one-hot vectors and the numeric columns into the
# single "features" vector column that the Spark ML estimators consume.
encoded_columns = ['{}_class_vec'.format(name) for name in categoricalColumns]
assembler_inputs = encoded_columns + numericColumns

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
stages.append(assembler)

In [10]:
# Assemble the collected stages into one Pipeline and fit it on `data`
# (the full dataset; the train/test split happens afterwards).
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(data)

In [11]:
# Apply the fitted pipeline and keep only the two columns the models need.
data_model = pipelineModel.transform(data).select("label", "features")

In [12]:
# 80/20 random split with a fixed seed.
trainingData, testData = data_model.randomSplit([0.8, 0.2], seed=420)

# Report the size of each split.
for message, split in (
    ('Observações para treino: {}', trainingData),
    ('Observações para teste:  {}', testData),
):
    print(message.format(split.count()))

Observações para treino: 32988
Observações para teste:  8200


#### Modelagem

4. Gradient Boosting Machine

In [26]:
from pyspark.ml.classification import GBTClassifier

# Baseline gradient-boosted trees classifier: 60 iterations of depth-2 trees,
# seeded for repeatable training.
gbt_params = dict(
    labelCol="label",
    featuresCol="features",
    maxDepth=2,
    maxIter=60,
    seed=420,
)
gbt = GBTClassifier(**gbt_params)

In [27]:
# Train the baseline GBT classifier on the training split.
gbtModel = gbt.fit(trainingData)

In [28]:
# Score both splits with the fitted model; the training-set predictions are
# kept so train and test accuracy can be compared.
predictions_gbt, predictions_gbt_train = (
    gbtModel.transform(split) for split in (testData, trainingData)
)

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [30]:
# Accuracy is computed from the hard predictions.
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

# No metricName is given, so the evaluator's default metric is used
# (reported below as areaUnderROC).
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="rawPrediction"
)

# Test and train accuracy side by side, then the test-set AUC.
accuracy_gbt = evaluator_accuracy.evaluate(predictions_gbt)
accuracy_gbt_train = evaluator_accuracy.evaluate(predictions_gbt_train)
print('Accuracy:         {:.4f}'.format(accuracy_gbt))
print('Accuracy (TRAIN): {:.4f}'.format(accuracy_gbt_train))

auc_gbt = evaluator_auc.evaluate(predictions_gbt)
print('areaUnderROC:     {:.4f}'.format(auc_gbt))

Accuracy:         0.9001
Accuracy (TRAIN): 0.9014
areaUnderROC:     0.8007


In [31]:
# Expose the test-set predictions to Spark SQL.
predictions_gbt.select('label', 'prediction').createOrReplaceTempView('predictions')

# Derive accuracy, precision and recall from the confusion-matrix counts.
# The innermost query flags each row as tn/tp/fn/fp, the middle query sums
# the flags, and the outer query computes the ratios.
# The DataFrame is kept in a variable; show() returns None, so chaining it
# onto the assignment would discard the result.
metrics_gbt = spark.sql("""
SELECT
    round((tp+tn)/(tp+tn+fp+fn), 4) as accuracy,
    round(tp/(tp+fp), 4) as precision,
    round(tp/(tp+fn), 4) as recall
FROM (
    SELECT
        sum(tn) as tn,
        sum(tp) as tp,
        sum(fn) as fn,
        sum(fp) as fp
    FROM (
        SELECT
            case when label = 0 and prediction = 0 then 1 else 0 end as tn,
            case when label = 1 and prediction = 1 then 1 else 0 end as tp,
            case when label = 1 and prediction = 0 then 1 else 0 end as fn,
            case when label = 0 and prediction = 1 then 1 else 0 end as fp
        FROM
            predictions
    )
)
""")
metrics_gbt.show()

+--------+---------+------+
|accuracy|precision|recall|
+--------+---------+------+
|  0.9001|   0.6887|0.2332|
+--------+---------+------+



#### Conclusão

O `GBTClassifier` apresentou maior potencial, apesar de ainda ter um recall muito baixo. Na sequência, vamos buscar melhores hiperparâmetros.

In [33]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [48]:
from pyspark.ml.classification import GBTClassifier

# Base estimator for the hyper-parameter search. maxDepth is deliberately not
# set here; the parameter grids below supply the values being searched.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=60, seed=420)

In [49]:
# Candidate tree depths for the search.
# NOTE(review): the next cell assigns `paramGrid` again, so on a top-to-bottom
# run this grid is replaced before the cross-validator is built.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5, 15])
    .build()
)

In [43]:
# Candidate iteration counts for the search.
# NOTE(review): this reassigns `paramGrid`, replacing the maxDepth grid built
# in the previous cell.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [30, 60, 200])
    .build()
)

In [50]:
# 3-fold cross-validation over `paramGrid`, scored with a default-configured
# BinaryClassificationEvaluator. The seed fixes the fold assignment so repeated
# runs compare the same folds, matching the seed used for the split and model.
crossval = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=3,
    seed=420,
)

In [51]:
%%time
# Run the cross-validated search on the training split; the cell is timed
# because it fits one model per grid entry per fold.
gbtModel_cv = crossval.fit(trainingData)

KeyboardInterrupt: 

In [46]:
# Average cross-validation metric for each candidate in the parameter grid
# (one value per grid entry).
gbtModel_cv.avgMetrics

[0.798126861187815, 0.7988190510799111, 0.7920124220238502]

In [47]:
# maxIter of the best model, read through the private `_java_obj` handle.
# NOTE(review): relies on a private attribute; check whether the installed
# PySpark version exposes a public getter on the model instead.
gbtModel_cv.bestModel._java_obj.getMaxIter()

60

In [None]:
# Subsampling rate of the best model, read through the private `_java_obj`
# handle. NOTE(review): same private-attribute caveat as the maxIter lookup.
gbtModel_cv.bestModel._java_obj.getSubsamplingRate()