# In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType

def main():
    """Train, save and evaluate a logistic-regression pipeline on ``dat/full.csv``.

    The CSV is read with ``;`` as delimiter, a header row and schema
    inference. The ``marital`` and ``education`` columns are indexed and
    one-hot encoded, combined with every integer/double column (except the
    label column ``y``) into a feature vector, scaled by standard deviation,
    and fed to a logistic regression whose label is the indexed ``y`` column.

    Side effects: prints sample predictions and the area under the ROC curve
    for the training and validation splits, and overwrites the saved
    pipeline model at ``bin/model``. The Spark session is always stopped,
    even when a step fails.
    """
    label_source_col = "y"
    split_seed = 42

    spark = SparkSession.builder.appName("BankMarketing").master("local[*]").getOrCreate()
    try:
        spark.sparkContext.setLogLevel("WARN")

        df = spark.read.options(delimiter=";", header="true", inferSchema="true").csv("dat/full.csv")

        # Numeric columns are used as features directly. The label column is
        # excluded by name so that it can never leak into the feature vector
        # should schema inference type it as a number.
        numeric_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, (IntegerType, DoubleType))
            and field.name != label_source_col
        ]

        marital_indexer = StringIndexer(inputCol="marital", outputCol="mIdx")
        marital_encoder = OneHotEncoder(inputCol="mIdx", outputCol="mVector", dropLast=False)
        education_indexer = StringIndexer(inputCol="education", outputCol="eIdx")
        education_encoder = OneHotEncoder(inputCol="eIdx", outputCol="eVector", dropLast=False)

        assembler = VectorAssembler(inputCols=["mVector", "eVector"] + numeric_cols, outputCol="features")
        label_indexer = StringIndexer(inputCol=label_source_col, outputCol="label")
        # withMean=False: features are only divided by their standard
        # deviation, not centered.
        scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
        regression = LogisticRegression(featuresCol="scaledFeatures", labelCol="label")

        pipeline = Pipeline(stages=[marital_indexer, marital_encoder, education_indexer,
                                    education_encoder, assembler, label_indexer, scaler, regression])

        # A fixed seed keeps the 80/20 split stable across runs on the same
        # input, so the saved model and the reported metrics are comparable.
        training, validation = df.randomSplit([0.8, 0.2], seed=split_seed)

        pipeline_model = pipeline.fit(training)

        training_predictions = pipeline_model.transform(training)
        training_predictions.select(
            "marital", "education", "features", "label", "prediction"
        ).show(truncate=False)
        pipeline_model.write().overwrite().save("bin/model")
        training_predictions.select("rawPrediction", "prediction", "label").show()

        validation_predictions = pipeline_model.transform(validation)

        # Columns and metric are spelled out so they visibly match the
        # pipeline's output columns and the labels printed below.
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC"
        )
        area_under_roc = evaluator.evaluate(training_predictions)
        area_under_roc_validation = evaluator.evaluate(validation_predictions)

        print(f"areaUnderROC (training) = {area_under_roc}")
        print(f"areaUnderROC (validation) = {area_under_roc_validation}")
    finally:
        # Release the session even if reading, fitting or saving failed.
        spark.stop()

# Run the training job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
