In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.sql.types import IntegerType

In [23]:
def vector_assembler() -> VectorAssembler:
    """Create the assembler that packs the model inputs into one vector column.

    Returns:
        A ``VectorAssembler`` that reads the nine columns listed below (in
        this order) and writes them to a column named ``features``.
    """
    # ``sex_index`` and ``married_index`` are derived columns produced by
    # ``prepare_data``; the remaining names are read as-is from the input.
    personal_cols = ['age', 'sex_index', 'married_index', 'salary']
    credit_cols = [
        'successfully_credit_completed',
        'credit_completed_amount',
        'active_credits',
        'active_credits_amount',
        'credit_amount',
    ]
    return VectorAssembler(inputCols=personal_cols + credit_cols, outputCol="features")

In [24]:
def model_params(rf):
    """Build the hyperparameter grid searched for the given classifier.

    Args:
        rf: Estimator exposing ``maxDepth`` and ``maxBins`` params.

    Returns:
        The result of ``ParamGridBuilder.build()`` for the cross product of
        ``maxDepth`` in {2, 3, 4, 5} and ``maxBins`` in {2, 3, 4}.
    """
    grid = ParamGridBuilder()
    grid = grid.addGrid(rf.maxDepth, [2, 3, 4, 5])
    grid = grid.addGrid(rf.maxBins, [2, 3, 4])
    return grid.build()

In [25]:
def prepare_data(df: DataFrame, assembler) -> DataFrame:
    """Add the encoded columns the model needs and assemble the feature vector.

    Args:
        df: Input frame. It must contain a ``married`` column that can be cast
            to integer, a ``sex`` column, and every other column the
            assembler reads.
        assembler: Transformer whose ``transform`` appends the feature vector
            (for example the one returned by ``vector_assembler``).

    Returns:
        The input frame with ``married_index``, ``sex_index`` and the
        assembler's output column added.
    """
    df = df.withColumn('married_index', df.married.cast(IntegerType()))

    # The indexer is fitted on whatever frame is passed in, and this function
    # is called separately for the train and the test split. With the default
    # frequency-based ordering the label -> index mapping depends on which
    # value is most common in *that* frame, so the two splits could end up
    # with opposite encodings. Alphabetical ordering makes the mapping depend
    # only on the label values themselves.
    # NOTE(review): this still assumes both splits contain the same set of
    # ``sex`` values; confirm, or fit once on train and reuse the fitted model.
    sex_index = StringIndexer(
        inputCol='sex',
        outputCol="sex_index",
        stringOrderType="alphabetDesc",
    )
    df = sex_index.fit(df).transform(df)

    return assembler.transform(df)

In [26]:
def build_random_forest() -> RandomForestClassifier:
    """Create the untrained random-forest classifier.

    Returns:
        A ``RandomForestClassifier`` that reads its inputs from ``features``
        and its label from ``is_credit_closed``.
    """
    return RandomForestClassifier(featuresCol='features', labelCol='is_credit_closed')

In [27]:
def build_evaluator() -> MulticlassClassificationEvaluator:
    """Create the evaluator used for model selection and final scoring.

    Returns:
        A ``MulticlassClassificationEvaluator`` computing ``accuracy`` from
        the ``prediction`` column against the ``is_credit_closed`` label.
    """
    return MulticlassClassificationEvaluator(
        metricName='accuracy',
        labelCol='is_credit_closed',
        predictionCol='prediction',
    )

In [28]:
def build_tvs(rand_forest, evaluator, model_params) -> TrainValidationSplit:
    """Wire estimator, grid and evaluator into a train/validation search.

    Args:
        rand_forest: Estimator to tune.
        evaluator: Metric used to compare the candidate models.
        model_params: Parameter maps to try. Inside this function the name
            shadows the module-level ``model_params`` function.

    Returns:
        A ``TrainValidationSplit`` configured with ``trainRatio=0.8``.
    """
    settings = dict(
        estimator=rand_forest,
        estimatorParamMaps=model_params,
        evaluator=evaluator,
        trainRatio=0.8,
    )
    return TrainValidationSplit(**settings)

In [29]:
def train_model(train_df, test_df) -> "tuple[RandomForestClassificationModel, float]":
    """Tune a random forest on the training split and score it on the test split.

    Both frames go through ``prepare_data`` with the same assembler. The
    parameter grid from ``model_params`` is then searched with the
    train/validation split from ``build_tvs``, and the best model is evaluated
    on the prepared test frame.

    Args:
        train_df: Raw training frame.
        test_df: Raw test frame.

    Returns:
        ``(best_model, accuracy)``: the best model found by the search and
        the evaluator's score for its predictions on the test frame.

    Side effects:
        Prints the accuracy and the best model's ``maxDepth`` / ``maxBins``.
    """
    assembler = vector_assembler()
    train_pdf = prepare_data(train_df, assembler)
    test_pdf = prepare_data(test_df, assembler)
    rf = build_random_forest()
    evaluator = build_evaluator()
    tvs = build_tvs(rf, evaluator, model_params(rf))
    models = tvs.fit(train_pdf)
    best = models.bestModel
    predictions = best.transform(test_pdf)
    accuracy = evaluator.evaluate(predictions)
    print(f"Accuracy: {accuracy}")
    # The selected hyperparameters are read through the underscore-prefixed
    # ``_java_obj`` attribute of the best model.
    print(f'Model maxDepth: {best._java_obj.getMaxDepth()}')
    print(f'Model maxBins: {best._java_obj.getMaxBins()}')
    return best, accuracy

# Script entry point: start (or reuse) a Spark session, read both parquet
# splits from the working directory and run the full training pipeline.
# NOTE(review): a notebook kernel also executes cells with
# __name__ == "__main__", so running this cell in Jupyter triggers a complete
# training run in addition to defining train_model -- confirm that is intended.
if __name__ == "__main__":
    spark = SparkSession.builder.appName('PySparkMLJob').getOrCreate()
    train_df = spark.read.parquet("train.parquet")
    test_df = spark.read.parquet("test.parquet")
    train_model(train_df, test_df)

In [30]:
# Step-by-step walk-through of train_model: start (or reuse) the Spark session
# and load both splits. Column encoding is handled by prepare_data, so no
# manual casts are needed here.
spark = SparkSession.builder.appName('PySparkMLJob').getOrCreate()
train_df = spark.read.parquet("train.parquet")
test_df = spark.read.parquet("test.parquet")

In [31]:
# Build one assembler and run the same preparation on both splits.
# Despite the `_pdf` suffix these are the DataFrames returned by prepare_data.
assembler = vector_assembler()
train_pdf = prepare_data(train_df, assembler)
test_pdf = prepare_data(test_df, assembler)

                                                                                

In [33]:
# Sanity check: confirm the derived columns exist and compare the raw
# `married` / `sex` values with their encoded counterparts.
train_pdf.printSchema()
train_pdf.select('age','married','married_index','sex','sex_index').show()

root
 |-- client_id: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- married: boolean (nullable = true)
 |-- salary: integer (nullable = true)
 |-- successfully_credit_completed: integer (nullable = true)
 |-- credit_completed_amount: integer (nullable = true)
 |-- active_credits: integer (nullable = true)
 |-- active_credits_amount: integer (nullable = true)
 |-- credit_amount: integer (nullable = true)
 |-- is_credit_closed: integer (nullable = true)
 |-- married_index: integer (nullable = true)
 |-- sex_index: double (nullable = false)
 |-- features: vector (nullable = true)

+---+-------+-------------+----+---------+
|age|married|married_index| sex|sex_index|
+---+-------+-------------+----+---------+
| 29|  false|            0|male|      0.0|
| 36|   true|            1|male|      0.0|
| 37|  false|            0|male|      0.0|
| 53|  false|            0|male|      0.0|
| 43|  false|            0|male|      0.0|
| 54|   true|    

In [34]:
# Create the classifier, the accuracy evaluator and the train/validation
# search over the grid produced by model_params.
rf = build_random_forest()
evaluator = build_evaluator()
tvs = build_tvs(rf, evaluator, model_params(rf))

In [35]:
# Disabled debugging aid: fits the bare classifier directly, without the
# parameter search performed by `tvs`.
# rf.fit(train_pdf)

In [36]:
# Run the train/validation search over the parameter grid on the prepared
# training frame.
models = tvs.fit(train_pdf)

                                                                                

In [37]:
# Keep the model the search selected as best.
best = models.bestModel

In [38]:
# Apply the selected model to the prepared test frame.
predictions = best.transform(test_pdf)

In [40]:
# Score the test predictions with the accuracy evaluator and report the
# hyperparameters of the selected model.
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")
# The params are read through the underscore-prefixed `_java_obj` attribute.
print(f'Model maxDepth: {best._java_obj.getMaxDepth()}')
print(f'Model maxBins: {best._java_obj.getMaxBins()}')

[Stage 208:>                                                        (0 + 1) / 1]

Accuracy: 0.7990726429675425
Model maxDepth: 5
Model maxBins: 4


                                                                                