In [2]:
import os
import sys
# Point PySpark's worker and driver Python at the interpreter running this kernel,
# so both sides use the same Python executable.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import col, when, expr

In [3]:
from pyspark.sql import SparkSession

# Reuse an already-running Spark session if there is one; otherwise start a new one
# named "GamingBehavior".
spark = (
    SparkSession.builder
    .appName("GamingBehavior")
    .getOrCreate()
)

# Read every file under data/train/ as CSV; the first row is the header and
# column types are inferred by Spark.
df = spark.read.csv(
    "data/train/*",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first 20 training rows as a list of Row objects.
df.take(20)

[Row(PlayerID=9000, Age=43, Gender='Male', Location='Other', GameGenre='Strategy', PlayTimeHours=16.271118760553215, InGamePurchases=0, GameDifficulty='Medium', SessionsPerWeek=6, AvgSessionDurationMinutes=108, PlayerLevel=79, AchievementsUnlocked=25, EngagementLevel='Medium'),
 Row(PlayerID=9001, Age=29, Gender='Female', Location='USA', GameGenre='Strategy', PlayTimeHours=5.525961380570566, InGamePurchases=0, GameDifficulty='Medium', SessionsPerWeek=5, AvgSessionDurationMinutes=144, PlayerLevel=11, AchievementsUnlocked=10, EngagementLevel='Medium'),
 Row(PlayerID=9002, Age=22, Gender='Female', Location='USA', GameGenre='Sports', PlayTimeHours=8.223755243499511, InGamePurchases=0, GameDifficulty='Easy', SessionsPerWeek=16, AvgSessionDurationMinutes=142, PlayerLevel=35, AchievementsUnlocked=41, EngagementLevel='High'),
 Row(PlayerID=9003, Age=35, Gender='Male', Location='USA', GameGenre='Action', PlayTimeHours=5.265351277318268, InGamePurchases=1, GameDifficulty='Easy', SessionsPerWeek=

Tiền xử lí

In [5]:
# Name of the label column the classifier is trained to predict.
TARGET_VARIABLE: str = "EngagementLevel"

# Seed shared by the random forest and the cross-validator.
RANDOM_STATE: int = 42

# Threshold compared against AvgSessionDurationMinutes * SessionsPerWeek;
# rows above it are flagged as "isAddicted".
ADDICTION_CUTOFF: int = 1280

In [6]:
# Feature engineering for the training frame. The column order is kept exactly as
# listed because later cells derive the feature list from the schema order.
passthrough_names = [
    "Age",
    "Gender",
    "Location",
    "GameGenre",
    "InGamePurchases",
    "SessionsPerWeek",
    "AvgSessionDurationMinutes",
    "PlayerLevel",
    "AchievementsUnlocked",
    "EngagementLevel",
]
difficulty = col("GameDifficulty")

# String flag ("true"/"false") that is set only for the "Hard" difficulty.
is_stressed = when(difficulty == "Hard", "true").otherwise("false").alias("IsStressed")

# Numeric encoding of difficulty (Easy=1, Medium=4, Hard=8). There is no otherwise()
# branch, so any other difficulty value becomes null.
difficulty_quantified = (
    when(difficulty == "Easy", 1)
    .when(difficulty == "Medium", 4)
    .when(difficulty == "Hard", 8)
    .alias("GameDifficultyQuantified")
)

# Average session length (minutes) times sessions per week, compared to the cutoff.
weekly_play_minutes = col("AvgSessionDurationMinutes") * col("SessionsPerWeek")
is_addicted = when(weekly_play_minutes > ADDICTION_CUTOFF, 1).otherwise(0)

df_result = df.select(
    *[col(name) for name in passthrough_names],
    is_stressed,
    difficulty_quantified,
).withColumn("isAddicted", is_addicted)

In [7]:
# Columns whose Spark type name is one of these are treated as numeric features.
numeric_type_names = {'integer', 'double', 'long', 'float'}
numerical_columns = [
    field.name
    for field in df_result.schema.fields
    if field.dataType.typeName() in numeric_type_names
]

# Everything that is neither numeric nor the target is treated as categorical.
non_categorical = set(numerical_columns) | {TARGET_VARIABLE}
categorical_columns = [name for name in df_result.columns if name not in non_categorical]

In [8]:
# Derived column names: <col>_idx after string indexing, <col>_enc after one-hot encoding.
indexed_names = [name + "_idx" for name in categorical_columns]
encoded_names = [name + "_enc" for name in categorical_columns]

# One StringIndexer per categorical column; handleInvalid="keep" assigns unseen or
# invalid values their own extra index instead of failing.
indexers = [
    StringIndexer(inputCol=source, outputCol=target, handleInvalid="keep")
    for source, target in zip(categorical_columns, indexed_names)
]
encoder = OneHotEncoder(inputCols=indexed_names, outputCols=encoded_names)

# Feature vector = numeric columns followed by the one-hot encoded categoricals.
feature_cols = numerical_columns + encoded_names
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# The string target is indexed into the numeric "label" column used by the classifier.
label_indexer = StringIndexer(inputCol=TARGET_VARIABLE, outputCol="label")

rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=RANDOM_STATE)

pipeline = Pipeline(stages=[*indexers, encoder, label_indexer, assembler, rf])


Train

In [9]:
# Hyper-parameter grid: 2 tree counts x 2 max depths x 1 feature-subset strategy,
# i.e. 4 candidate configurations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [100, 200])
    .addGrid(rf.maxDepth, [8, 10])
    .addGrid(rf.featureSubsetStrategy, ["sqrt"])
    .build()
)

# Candidates are ranked by accuracy on the indexed "label" column.
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

# 3-fold cross-validation over the whole pipeline, fitting up to 4 models in parallel.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=3,
    parallelism=4,
    seed=RANDOM_STATE,
)

In [10]:
# Run the cross-validated grid search on the engineered training frame.
cvModel = crossval.fit(df_result)

# Persist the fitted model, replacing any previous save at this location.
# NOTE(review): this is an absolute, machine-specific path; confirm it is valid
# wherever the notebook is run.
model_writer = cvModel.write().overwrite()
model_writer.save("c:/Python/BTLBD/models/cv_pipeline_model")

In [None]:
# avgMetrics holds one cross-validated average per parameter combination;
# report the highest one.
best_cv_accuracy = max(cvModel.avgMetrics)
print("Best CV-Accuracy:", best_cv_accuracy)

Best CV-Accuracy: 0.8815155505354973


Test

In [13]:

# Reload the persisted cross-validation model from disk.
# NOTE(review): absolute, machine-specific path; it is presumably the same location
# the training cell saves to, written with backslashes — confirm.
saved_model_path = "c:\\Python\\BTLBD\\models\\cv_pipeline_model"
cvModel_loaded = CrossValidatorModel.load(saved_model_path)


In [17]:
# Load the test files and apply the same feature engineering as the training frame.
# PlayerID is carried along so predictions can be reported per player; the target
# column (EngagementLevel) is not selected here.
test_df = spark.read.csv("data/test/*", header=True, inferSchema=True)

# Each input column is selected exactly once: duplicate column names would make
# by-name lookups in the fitted pipeline ambiguous.
test_processed = test_df.select(
    col("PlayerID"),
    col("Age"),
    col("Gender"),
    col("Location"),
    col("GameGenre"),
    col("InGamePurchases"),
    col("SessionsPerWeek"),
    col("AvgSessionDurationMinutes"),
    col("PlayerLevel"),
    col("AchievementsUnlocked"),
    # String flag ("true"/"false") that is set only for the "Hard" difficulty.
    when(col("GameDifficulty") == "Hard", "true").otherwise("false").alias("IsStressed"),
    # Numeric encoding of difficulty; values other than Easy/Medium/Hard become null
    # because there is no otherwise() branch.
    when(col("GameDifficulty") == "Easy", 1)
        .when(col("GameDifficulty") == "Medium", 4)
        .when(col("GameDifficulty") == "Hard", 8).alias("GameDifficultyQuantified"),
)

# Flag players whose weekly play time (session minutes x sessions per week)
# exceeds the configured cutoff.
test_processed = test_processed.withColumn(
    "isAddicted",
    when((col("AvgSessionDurationMinutes") * col("SessionsPerWeek")) > ADDICTION_CUTOFF, 1).otherwise(0)
)

In [18]:
# Score the engineered test frame with the reloaded cross-validation model.
# The result includes a `prediction` column, which the next cell selects.
predictions = cvModel_loaded.transform(test_processed)


In [19]:
# Keep only the player id and the predicted class, cast from double to int.
# NOTE(review): `prediction` is the numeric class index from the pipeline's label
# indexer, not the original EngagementLevel string — confirm consumers expect indices.
predicted_engagement = col("prediction").cast("int").alias("PredictedEngagement")
results = predictions.select(col("PlayerID"), predicted_engagement)

# coalesce(1) collapses the frame to a single partition before writing; any existing
# output at the target directory is overwritten and a header row is included.
csv_writer = results.coalesce(1).write.mode("overwrite").option("header", "true")
csv_writer.csv("data/results")

print("Predictions saved to data/results/")

Predictions saved to data/results/


In [None]:
# Display the first 20 predictions without truncating cell contents.
results.show(20, truncate=False)

+--------+-------------------+
|PlayerID|PredictedEngagement|
+--------+-------------------+
|9000    |0                  |
|9001    |0                  |
|9002    |1                  |
|9003    |0                  |
|9004    |0                  |
|9005    |2                  |
|9006    |2                  |
|9007    |0                  |
|9008    |0                  |
|9009    |1                  |
|9010    |0                  |
|9011    |1                  |
|9012    |2                  |
|9013    |2                  |
|9014    |0                  |
|9015    |1                  |
|9016    |1                  |
|9017    |2                  |
|9018    |0                  |
|9019    |2                  |
+--------+-------------------+
only showing top 20 rows

