In [19]:
import numpy
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import col
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

 读取数据

In [20]:
# 初始化Spark会话
spark = SparkSession.builder.appName("Classifier").getOrCreate()

# 读取CSV文件
train_data = spark.read.csv("file:///home/gyy/train_data/part-00000.csv", header=True, inferSchema=True)
test_data = spark.read.csv("file:///home/gyy/test_data/part-00000.csv", header=True, inferSchema=True)
# 选择相关特征和目标列
feature_columns = [
    "CNT_CHILDREN_value", "REGION_POPULATION_RELATIVE_value", "FLAG_EMP_PHONE_value",
    "FLAG_WORK_PHONE_value", "FLAG_CONT_MOBILE_value", "FLAG_PHONE_value", "FLAG_EMAIL_value",
    "REGION_RATING_CLIENT_value", "REGION_RATING_CLIENT_W_CITY_value", "REG_REGION_NOT_WORK_REGION_value",
    "LIVE_REGION_NOT_WORK_REGION_value", "REG_CITY_NOT_LIVE_CITY_value", "REG_CITY_NOT_WORK_CITY_value",
    "LIVE_CITY_NOT_WORK_CITY_value", "HOUR_APPR_PROCESS_START_value", "WEEKDAY_INDEX_value"
]

In [21]:
# 特征向量
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

Random Forest

In [12]:
# 初始化模型
rf = RandomForestClassifier(featuresCol="features", labelCol="TARGET")

# 训练模型
rf_model = rf.fit(train_data)

# 测试集上进行预测
rf_predictions = rf_model.transform(test_data)

In [14]:
# 初始化评估器
evaluator = MulticlassClassificationEvaluator(labelCol="TARGET", predictionCol="prediction")

# 评估模型的准确率
rf_accuracy = evaluator.evaluate(rf_predictions, {evaluator.metricName: "accuracy"})
print(f"Random Forest Accuracy: {rf_accuracy}")

# 评估模型的F1分数
rf_f1 = evaluator.evaluate(rf_predictions, {evaluator.metricName: "f1"})
print(f"Random Forest F1 Score: {rf_f1}")

# 评估模型的召回率
rf_recall = evaluator.evaluate(rf_predictions, {evaluator.metricName: "weightedRecall"})
print(f"Random Forest Recall: {rf_recall}")

Random Forest Accuracy: 0.9228509648729142
Random Forest F1 Score: 0.8858241423023199
Random Forest Recall: 0.9228509648729142


关闭Spark会话

In [18]:
# 关闭Spark会话
spark.stop()