In [14]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col, when

In [15]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("GamingBehavior").getOrCreate()

df = spark.read.csv("data/train/*", header=True, inferSchema=True)

Tiền xử lí

In [16]:
df_result = df.select(
    col("Age"),
    col("Gender"),
    col("Location"),
    col("GameGenre"),
    col("InGamePurchases"),
    col("SessionsPerWeek"),
    col("AvgSessionDurationMinutes"),
    col("PlayerLevel"),
    col("AchievementsUnlocked"),
    col("EngagementLevel"),
    when(col("GameDifficulty") == "Hard", "true").otherwise("false").alias("IsStressed"),
    when(col("GameDifficulty") == "Easy", 1)
        .when(col("GameDifficulty") == "Medium", 4)
        .when(col("GameDifficulty") == "Hard", 8).alias("GameDifficultyQuantified"),
)

In [17]:
ADDICTION_CUTOFF = 1280
NUMBER_OF_FOLDS = 10
RANDOM_STATE = 42
TARGET_VARIABLE = "EngagementLevel"
TEST_SIZE = 0.15


In [18]:
numerical_columns = [f.name for f in df_result.schema.fields if f.dataType.typeName() in ['integer', 'double', 'long', 'float']]
categorical_columns = [c for c in df_result.columns if c not in numerical_columns + ["EngagementLevel"]]

In [19]:
indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep") for c in categorical_columns]
encoder = OneHotEncoder(inputCols=[c+"_idx" for c in categorical_columns],
                        outputCols=[c+"_enc" for c in categorical_columns])
feature_cols = numerical_columns + [c+"_enc" for c in categorical_columns]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
label_indexer = StringIndexer(inputCol="EngagementLevel", outputCol="label")

rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=RANDOM_STATE)

pipeline = Pipeline(stages = indexers + [encoder, label_indexer, assembler, rf])


train

In [20]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [100]) \
    .addGrid(rf.maxDepth, [8]) \
    .addGrid(rf.featureSubsetStrategy, ["sqrt"]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy"),
                          numFolds=3,
                          parallelism=4,
                          seed=RANDOM_STATE)

In [21]:
cvModel = crossval.fit(df_result)
cvModel.write().overwrite().save("c:/Python/BTLBD/models/cv_pipeline_model")

In [22]:
print("Best CV-Accuracy:", max(cvModel.avgMetrics))

Best CV-Accuracy: 0.855705424536854


Test

In [23]:

cvModel_loaded = CrossValidatorModel.load("c:\\Python\\BTLBD\\models\\cv_pipeline_model")


In [24]:
test_df = spark.read.csv("data/test/*", header=True, inferSchema=True)
test_processed = test_df.select(
    col("PlayerID"),
    col("Age"),
    col("Age"),
    col("Gender"),
    col("Location"),
    col("GameGenre"),
    col("InGamePurchases"),
    col("SessionsPerWeek"),
    col("AvgSessionDurationMinutes"),
    col("PlayerLevel"),
    col("AchievementsUnlocked"),
    when(col("GameDifficulty") == "Hard", "true").otherwise("false").alias("IsStressed"),
    when(col("GameDifficulty") == "Easy", 1)
        .when(col("GameDifficulty") == "Medium", 4)
        .when(col("GameDifficulty") == "Hard", 8).alias("GameDifficultyQuantified"),
)

In [25]:
predictions = cvModel_loaded.transform(test_processed)


In [29]:
results = predictions.select(
    col("PlayerID"),
    col("prediction").cast("int").alias("PredictedEngagement")
)

results.coalesce(1).write.mode("overwrite").option("header", "true").csv("data/results")

print("Predictions saved to data/results/")

Predictions saved to data/results/


In [30]:
results.show(20, truncate=False)

+--------+-------------------+
|PlayerID|PredictedEngagement|
+--------+-------------------+
|9000    |0                  |
|9001    |0                  |
|9002    |1                  |
|9003    |0                  |
|9004    |0                  |
|9005    |2                  |
|9006    |2                  |
|9007    |0                  |
|9008    |0                  |
|9009    |1                  |
|9010    |0                  |
|9011    |1                  |
|9012    |2                  |
|9013    |2                  |
|9014    |0                  |
|9015    |1                  |
|9016    |1                  |
|9017    |2                  |
|9018    |0                  |
|9019    |2                  |
+--------+-------------------+
only showing top 20 rows

