Como o problema de negócio consiste em classificar um cliente em uma categoria binária, será implementado um modelo de regressão logística.

In [78]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Columns used as model inputs; they are packed into one vector column
# named 'features'.
training_features = ['V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']

# Raw columns removed from the evaluation splits once 'features' is built:
# 'Time' followed by 'V1' .. 'V28'.
_raw_columns = ['Time'] + ['V' + str(i) for i in range(1, 29)]

# Fixed seed so the test/validate split, and any metric computed on it, can
# be reproduced between runs (given the same input data and partitioning).
SPLIT_SEED = 42


def _assemble_features(df, columns_to_drop):
    """Return *df* prepared for the classifier.

    The columns listed in ``training_features`` are assembled into a single
    'features' vector column, ``columns_to_drop`` are removed, and the
    'Class' column is renamed to 'label'.
    """
    assembler = VectorAssembler(
        inputCols=training_features,
        outputCol='features',
    )
    return (
        assembler.transform(df)
        .drop(*columns_to_drop)
        .withColumnRenamed('Class', 'label')
    )


spark = SparkSession.builder.getOrCreate()
db_cc_training = spark.read.parquet('spark-warehouse/db_cc_training')

# Only the assembled input columns are dropped from the training set.
train = _assemble_features(db_cc_training, training_features)

# 70 % / 30 % split of the 'db_cc_norm' dataset, seeded for repeatability.
test, validate = spark.read.parquet('spark-warehouse/db_cc_norm').randomSplit(
    [0.7, 0.3],
    seed=SPLIT_SEED,
)

test = _assemble_features(test, _raw_columns)
validate = _assemble_features(validate, _raw_columns)

In [69]:
from pyspark.ml.classification import LogisticRegression

# Hyperparameters of the logistic regression estimator.
ITERATIONS = 100
REGPARAM = 0.03
ELASTICNETPARAM = 0.8

# Keyword arguments forwarded unchanged to the LogisticRegression constructor.
_logit_reg_params = {
    'maxIter': ITERATIONS,
    'regParam': REGPARAM,
    'elasticNetParam': ELASTICNETPARAM,
}

logit_reg = LogisticRegression(**_logit_reg_params)

In [70]:
# Fit the configured estimator on the assembled training DataFrame.
logit_reg_model = logit_reg.fit(dataset=train)

In [86]:
# Score the test split with the fitted model.
test_prediction = logit_reg_model.transform(dataset=test)

In [87]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test-split predictions; 'areaUnderROC' is the evaluator's
# default metric, spelled out here so the reported number is unambiguous.
_test_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
_test_evaluator.evaluate(test_prediction)

                                                                                

0.9762263300038645

In [84]:
# Score the validation split with the fitted model.
validate_prediction = logit_reg_model.transform(dataset=validate)

# Same metric as for the test split ('areaUnderROC' is the evaluator default).
_validate_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
_validate_evaluator.evaluate(validate_prediction)

                                                                                

0.9780937322180784

In [90]:
# Persist the fitted model to the 'antifraud_logit_model' path; save(path)
# is the documented shortcut for write().save(path).
logit_reg_model.write().save('antifraud_logit_model')

                                                                                